#3328. PYXFIB
所求为 $$\sum_{i = 0}^{\lfloor \frac{n}{k} \rfloor} C_{n}^{i \times k} \times F_{i \times k} = \sum_{i = 0}^{n} C_n^{i} \times F_i \times [i \equiv 0 \pmod k]$$ 对于 $[i \equiv 0 \pmod k]$,单位根反演有 $$[i \equiv 0 \pmod k] = \frac{1}{k} \sum_{j = 0}^{k - 1} w_k^{ij}$$ 代入得 $$\frac{1}{k} \sum_{i = 0}^{n} C_n^{i} \times F_i \sum_{j = 0}^{k - 1} w_k^{ij}$$
由于 $F_i$ 为斐波那契数列的第 $i$ 项,构造矩阵 $A = \begin{bmatrix} 1 & 1 \\ 1 & 0 \end{bmatrix}$,则有 $A^i[0][0]$ 为 $F_i$;由二项式定理 $\sum\limits_{i = 0}^{n} C_n^i \times A^i = (I + A)^n$,其 $[0][0]$ 项即为 $\sum\limits_{i = 0}^{n} C_n^i \times F_i$。
把 $F_i$ 换成 $A^i$ 并交换求和顺序(注意 $A^i \times w_k^{ij} = (A \times w_k^j)^i$),得 $$\frac{1}{k} \sum_{i = 0}^{n} \sum_{j = 0}^{k - 1} C_{n}^i \times A^i \times w_{k}^{ij} = \frac{1}{k} \sum_{j = 0}^{k - 1} (A \times w_k^j + I)^n$$ 答案取该矩阵的 $[0][0]$ 项。
#include <bits/stdc++.h>
#pragma GCC optimize("Ofast,no-stack-protector,unroll-loops,fast-math")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4.1,sse4.2,avx,avx2,popcnt,tune=native")

// NOTE: a preprocessor directive and a `//` comment both extend to the end of
// the physical line, so no code may share a line with either of them.

using namespace std;

// Program input, read once in main(): upper bound n, residue count k and the
// modulus mod.
// NOTE(review): the arithmetic below (quick_pow(k, mod - 2), (mod - 1) / k)
// presumably expects mod to be prime with k dividing mod - 1 -- confirm
// against the problem statement.
int n, k, mod;

// Returns a - b, adding mod when the plain difference would be negative.
int dec(int a, int b) {
    return a >= b ? a - b : a + mod - b;
}

// Returns a + b, subtracting mod once when the sum reaches mod.
int add(int a, int b) {
    return a + b >= mod ? a + b - mod : a + b;
}

// min_25-style sieve over the distinct values of n / l.
namespace min_25 {

const int N = 1e6 + 10;

// a[1..m]   : distinct values of n / l, in decreasing order.
// id1 / id2 : index of a value v inside a[], keyed by v (v <= T) or by n / v.
// prime[]   : primes up to T (1-based); sum1[j] = j (number of primes so far).
// g1[i]     : number of primes <= a[i] once pre() has finished.
// s[], sum[]: per-root tables rebuilt by init().
// wn        : the value assigned to every prime; set by the caller before init().
int a[N], id1[N], id2[N], prime[N], g1[N], s[N], sum1[N], sum[N], m, cnt, T, wn;
bool st[N];

// Maps a value x of the form n / l to its position in a[].
inline int ID(int x) {
    return x <= T ? id1[x] : id2[n / x];
}

// One-time precomputation that depends only on n: primes up to sqrt(n), the
// block values a[], and the prime counts g1[].
void pre() {
    T = sqrt(n + 0.5);

    // Linear sieve of the primes up to T.
    for (int i = 2; i <= T; i++) {
        if (!st[i]) {
            prime[++cnt] = i;
            sum1[cnt] = sum1[cnt - 1] + 1;
        }
        for (int j = 1; j <= cnt && 1ll * i * prime[j] <= T; j++) {
            st[i * prime[j]] = 1;
            if (i % prime[j] == 0) {
                break;
            }
        }
    }

    // Enumerate the distinct quotients n / l and index them.
    for (int l = 1, r; l <= n; l = r + 1) {
        r = n / (n / l);
        a[++m] = n / l;
        if (a[m] <= T) {
            id1[a[m]] = m;
        } else {
            id2[n / a[m]] = m;
        }
        g1[m] = a[m] - 1;  // count of integers in [2, a[m]] before sieving
    }

    // Strike out composites prime by prime.
    for (int j = 1; j <= cnt && 1ll * prime[j] * prime[j] <= n; j++) {
        for (int i = 1; i <= m && 1ll * prime[j] * prime[j] <= a[i]; i++) {
            g1[i] = dec(g1[i], dec(g1[ID(a[i] / prime[j])], sum1[j - 1]));
        }
    }
}

// Rebuilds s[] for the current wn. The recurrence is the second phase of a
// min_25 sieve for the completely multiplicative function f(p^e) = wn^e:
// afterwards s[i] holds the sum of f(x) over 2 <= x <= a[i], modulo mod.
void init() {
    for (int i = 1; i <= cnt; i++) {
        sum[i] = add(sum[i - 1], wn);  // sum of f over the first i primes
    }
    for (int i = 1; i <= m; i++) {
        s[i] = 1ll * g1[i] * wn % mod;  // contribution of primes only
    }
    for (int j = cnt; j >= 1; j--) {
        for (int i = 1; i <= m && 1ll * prime[j] * prime[j] <= a[i]; i++) {
            // cur = prime[j]^e and w = wn^e for e = 1, 2, ...
            for (int cur = prime[j], w = wn; 1ll * cur * prime[j] <= a[i];
                 cur *= prime[j], w = 1ll * w * wn % mod) {
                s[i] = add(s[i], add(1ll * dec(s[ID(a[i] / cur)], sum[j]) * w % mod,
                                     1ll * wn * w % mod));
            }
        }
    }
}

// Prefix value for 1..n (n must be 0 or one of the values n_global / l):
// s[] plus 1 for x = 1. The result is not reduced, so it may equal mod.
int solve(int n) {
    return n ? s[ID(n)] + 1 : 0;
}

}  // namespace min_25

// Returns a^n modulo mod by binary exponentiation (n >= 0).
int quick_pow(int a, int n) {
    int ans = 1;
    while (n) {
        if (n & 1) {
            ans = 1ll * ans * a % mod;
        }
        a = 1ll * a * a % mod;
        n >>= 1;
    }
    return ans;
}

// Returns the smallest i coprime to mod such that i^((mod - 1) / q) != 1 for
// every prime factor q of mod - 1 (a primitive root when mod is prime).
int get_g() {
    if (mod == 2) {
        return 1;
    }

    // Collect the distinct prime factors of mod - 1.
    int cur = mod - 1;
    vector<int> facts;
    for (int i = 2; 1ll * i * i <= cur; i++) {
        if (cur % i == 0) {
            facts.push_back(i);
            while (cur % i == 0) {
                cur /= i;
            }
        }
    }
    if (cur != 1) {
        facts.push_back(cur);
    }

    for (int i = 1;; i++) {
        if (__gcd(i, mod) != 1) {
            continue;
        }
        int flag = 1;
        for (int &it : facts) {
            if (quick_pow(i, (mod - 1) / it) == 1) {
                flag = 0;
                break;
            }
        }
        if (flag) {
            return i;
        }
    }
}

// Reads "n k mod" from stdin and prints k space-separated values: for each
// residue r in [0, k), (1 / k) * sum_j w^(-j * r) * res_j, where res_j is the
// block sum of floor(n / x) * f_j(x) with f_j(p^e) = (w^j)^e and w a k-th root
// of unity modulo mod.
int main() {
    // freopen("in.txt", "r", stdin);
    // freopen("out.txt", "w", stdout);

    // Bail out on malformed input; k <= 0 would divide by zero below.
    if (scanf("%d %d %d", &n, &k, &mod) != 3 || k <= 0) {
        return 0;
    }

    // w[i] = wn^i for i in [0, k]. Index k is read below (when j * r % k == 0),
    // so the table needs k + 1 entries; sizing it from k avoids the overflow a
    // fixed 20-element array has once k >= 20.
    const int wn = quick_pow(get_g(), (mod - 1) / k);
    vector<int> w(k + 1);
    w[0] = 1;
    for (int i = 1; i <= k; i++) {
        w[i] = 1ll * w[i - 1] * wn % mod;
    }

    vector<int> ans(k, 0);
    min_25::pre();
    for (int j = 0; j < k; j++) {
        min_25::wn = w[j];
        min_25::init();

        // res = sum over blocks [l, r] of (n / l) * (prefix(r) - prefix(l - 1)).
        int res = 0;
        for (int l = 1, r; l <= n; l = r + 1) {
            r = n / (n / l);
            int cur = dec(min_25::solve(r), min_25::solve(l - 1));
            res = add(res, 1ll * (n / l) * cur % mod);
        }

        // Root-of-unity filter: w[k - x] is the inverse of w[x].
        for (int r = 0; r < k; r++) {
            ans[r] = add(ans[r], 1ll * w[k - (1ll * j * r) % k] * res % mod);
        }
    }

    int inv_k = quick_pow(k, mod - 2);
    for (int i = 0; i < k; i++) {
        printf("%lld ", 1ll * ans[i] * inv_k % mod);
    }
    return 0;
}