100+40+0=140。暴力没写满……

简单模拟

很久很久以前，有一个 \(1\sim n\) 的排列 \(a\)，还有一个长度为 \(q\) 的，每个元素在 \(1\) 到 \(n\) 之间的序列 \(b_0,\dots,b_{q-1}\)。

作为一道简单模拟题，你需要模拟 \(m\) 次操作，第 \(i\)（\(1\) 到 \(m\)）次操作你会在 \(a\) 中找到值为 \(b_{(i-1)\bmod q}\) 的元素，并把它与第一个元素交换。

你只需要输出 \(m\) 次操作之后的序列即可。

\(1\leq n,q\leq 200000，1\leq m\leq 10^{18}\)。

题解

看位置对应的元素没有头绪，但是看元素所在的位置却有迹可循。

分析元素所在位置的变换，因为每次交换的是 \(b_{(i-1)\bmod q}\) 和第一个元素的位置，所以

当 \((i-1)\bmod q=0\) 时
1. \(i=1\)，交换的是元素 \(b_0\) 和元素 \(a_1\) 的位置。
2. \(i>1\)，交换的是元素 \(b_0\) 和元素 \(b_{q-1}\) 的位置。
当 \((i-1)\bmod q>0\)，交换的是 \(b_{(i-1)\bmod q}\) 和 \(b_{(i-2)\bmod q}\) 的位置。

把操作 \(q\) 次看成一个周期，我们可以

暴力模拟特殊的第一个周期。
寻找普通的中间周期中元素位置变化的实质，即我们要找到每 \(q\) 次操作对应的置换。然后用置换的知识快速计算中间周期的结果。
暴力末尾不足一个周期的操作。

每 \(q\) 次操作对应的置换可以通过如下代码求得：

b[0]=b[q];
for(int i=1;i<=n;++i) per[i]=i;
for(int i=1;i<=q;++i) swap(per[b[i]],per[b[i-1]]);

那么per[i]表示的是元素 \(i\) 最终与元素per[i]交换位置了。即per[i]所在位置上现在应该填 \(i\)。

如何快速计算中间周期的结果呢？我们可以把每个轮换找出来，然后利用它的循环节进行操作。

时间复杂度 \(O(q)\)。

AC程序：

CO int N=200000+10;
int b[N],per[N];
int cir[N],len;
bool vis[N];
int ans[N],pos[N];

int main(){
    freopen("sim.in","r",stdin),freopen("sim.out","w",stdout);
    int n=read<int>(),q=read<int>();
    LL m=read<LL>();
    
    pos[b[0]=read<int>()]=1;
    for(int i=2;i<=n;++i) pos[read<int>()]=i;
    for(int i=1;i<=q;++i) read(b[i]);
    
    if(m<=q){
        for(int i=1;i<=m;++i) swap(pos[b[i-1]],pos[b[i]]);
        
        for(int i=1;i<=n;++i) ans[pos[i]]=i;
        for(int i=1;i<=n;++i) printf("%d%c",ans[i]," \n"[i==n]);
        return 0;
    }
    m-=q;
    for(int i=1;i<=q;++i) swap(pos[b[i-1]],pos[b[i]]);
    
    b[0]=b[q];
    for(int i=1;i<=n;++i) per[i]=i;
    for(int i=1;i<=q;++i) swap(per[b[i]],per[b[i-1]]);
    for(int i=1;i<=n;++i)if(!vis[i]){
        vis[i]=1,len=0,cir[len++]=i;
        for(int j=per[i];j!=i;j=per[j]) vis[j]=1,cir[len++]=j;
        int rest=m/q%len;
        for(int j=0;j<len;++j) per[cir[(j+rest)%len]]=cir[j];
    }
    for(int i=1;i<=n;++i) ans[pos[i]]=per[i];
    
    for(int i=1;i<=n;++i) pos[ans[i]]=i;
    for(int i=1;i<=m%q;++i) swap(pos[b[i]],pos[b[i-1]]);
    
    for(int i=1;i<=n;++i) ans[pos[i]]=i;
    for(int i=1;i<=n;++i) printf("%d%c",ans[i]," \n"[i==n]);
    return 0;
}

暴力程序：

CO int N=200000+10;
int pos[N],b[N],ans[N];

int main(){
    int n=read<int>(),q=read<int>();
    LL m=read<LL>();
    
    pos[b[0]=read<int>()]=1;
    for(int i=2;i<=n;++i) pos[read<int>()]=i;
    for(int i=1;i<=q;++i) read(b[i]);
    
    if(m<=q){
        for(int i=1;i<=m;++i) swap(pos[b[i-1]],pos[b[i]]);
        
        for(int i=1;i<=n;++i) ans[pos[i]]=i;
        for(int i=1;i<=n;++i) printf("%d%c",ans[i]," \n"[i==n]);
        return 0;
    }
    m-=q;
    for(int i=1;i<=q;++i) swap(pos[b[i-1]],pos[b[i]]);
    
    b[0]=b[q];
    for(int i=q+1;i<=m;++i) b[i]=b[i-q];
    for(int i=1;i<=m;++i) swap(pos[b[i]],pos[b[i-1]]);
    
    for(int i=1;i<=n;++i) ans[pos[i]]=i;
    for(int i=1;i<=n;++i) printf("%d%c",ans[i]," \n"[i==n]);
    return 0;
}

std给的是倍增。现在想来确实倍增代码实现更简单，考场上我看到这题的数据范围应该优先去写倍增的。

简单构造

一次歌唱比赛中，一位歌手刚刚结束表演，评委正在打分。一共有 \(n\) 位评委，他们每人可以打 \(1\) 分或 \(0\) 分，第 \(i\) 位评委希望歌手的得分为 \(v_i\)。

评委们有特殊的控分技巧，他们会按一个顺序依次评分，第一个评分的评委会不管三七二十一打 \(0\) 分。对于接下来的评委，假设前面 \(a\) 位评委评分总和为 \(b\)，评委会认为这位歌手期望得分为 \(\frac{b}{a}n\)，如果这个得分低于他所希望的得分，他会打 \(1\) 分，否则他会打 \(0\) 分。

作为最大的黑幕——裁判，你对这一切心知肚明。你希望选手的得分为 \(p~(0\leq p\leq n)\)，为此你可以调换评委们的评分顺序。你需要输出一个 \(1\sim n\) 的排列，第 \(i\) 个位置表示第 \(i\) 个评分的裁判的编号，让选手的得分最接近 p。如果有多种，你只需要输出任意一种。

Subtask 1，10pts，\(1\leq n\leq 10\)。
Subtask 2，20pts，\(1\leq n\leq 100\)。
Subtask 3，30pts，\(1\leq n\leq 1000\)。
Subtask 4，10pts，\(1\leq n\leq 10^5，p=0\)。
Subtask 5，30pts，\(1\leq n\leq 10^5\)。

暴力

我当时认为裁判要努力让期望得分最大的 \(p\) 个评委打 \(1\) 分，让剩余 \(n-p\) 个评委打 \(0\) 分。

然后思考了一下 \(p=0\) 和 \(p=n\) 的情况，发现那 \(p\) 个期望得分大的评委应该从小到大排序，\(n-p\) 个小的评委应该从大到小排序。

设计状态 \(f(i,j,k)\) 表示大评委用了 \(i\) 个，小评委用了 \(j\) 个，得了 \(k\) 分的可行性。

暴力转移：

namespace T1{
    CO int N=100+10;
    pair<int,int> v[N],a[N],b[N];
    int f[N][N][N];
    pair<int,int> g[N][N][N];

    int main(int n,int p){
        for(int i=1;i<=n;++i) read(v[i].first),v[i].second=i;
        sort(v+1,v+n+1);
        for(int i=1;i<=p;++i) a[i]=v[n-p+i];
        for(int i=1;i<=n-p;++i) b[i]=v[n-p+1-i];
        f[1][0][0]=1,g[1][0][0]=make_pair(1,0);
        f[0][1][0]=1,g[0][1][0]=make_pair(2,0);
        for(int i=0;i<=p;++i)for(int j=0;j<=n-p;++j)if(i+j>=1)
            for(int k=0;k<=i+j;++k)if(f[i][j][k]){
                if(i<p){
                    int d=(LD)k/(i+j)*n<a[i+1].first;
                    f[i+1][j][k+d]=1,g[i+1][j][k+d]=make_pair(1,d);
                }
                if(j<n-p){
                    int d=(LD)k/(i+j)*n<b[j+1].first;
                    f[i][j+1][k+d]=1,g[i][j+1][k+d]=make_pair(2,d);
                }
            }
        int delta=0;
        while(!f[p][n-p][p+delta] and !f[p][n-p][p-delta]) ++delta;
        int i=p,j=n-p,k=f[p][n-p][p+delta]?p+delta:p-delta;
        vector<int> sol;
        while(i or j){
            if(g[i][j][k].first==1) sol.push_back(a[i].second),k-=g[i][j][k].second,--i;
            else sol.push_back(b[j].second),k-=g[i][j][k].second,--j;
        }
        for(;sol.size();sol.pop_back()) printf("%d%c",sol.back()," \n"[sol.size()==1]);
    }
}

然后我吸收别人CSP的经验，想着把决策打出来找规律。但是本身这个DP就是我臆想的，所以规律就是一个尽量调整……

现在想来我似乎可以将大小评委人数上下微调，说不定能过。

CSAcademy Voting

Let's make a key observation first: if we have two adjacent judges \(i\) and \(i+1\) with assessments \(x\) and \(y\), with \(x<y\), swapping them will either yield the same total score for the singer, or decrease it by one.

To prove this, let's denote as \(s\) the score as it is before the \(i\)th judge. Now, let's consider some cases:

their votes are \(1\) and \(1\): this means \(x\cdot (i-1) > n\cdot s\) and \(y\cdot i > n\cdot (s + 1)\). Since \(y \le n\), we get \(y\cdot (i-1) > ns\). At least one of these judges still puts \(1\).
votes are \(1\) and \(0\): this means \(x\cdot (i-1) > n\cdot s\) and \(y\cdot i \le n\cdot (s + 1)\). Since \(y > x\), we get \(x\cdot i \le n\cdot (s + 1)\). That is, if the first judge puts \(1\), the second judge puts \(0\). Anyway, at least one of them puts \(0\).
votes are \(0\) and \(1\): \(x\cdot (i-1) \le n\cdot s\) and \(y\cdot i > n\cdot s\). Since \(x \le n\), we get \(x\cdot i \le n\cdot (s + 1)\). As in the previous case, at least one of these judges puts \(0\).
votes are \(0\) and \(0\). \(x\cdot (i-1) \le n\cdot s\) and \(y\cdot i \le n\cdot s\). Obviously \(y\cdot (i-1) \le n\cdot s\). Since \(y > x\), we get \(x\cdot i \le n\cdot s\). Both judges still puts \(0\).

Now let's prove that the overall score either doesn't change or decreases by one. All judges before \(i\) vote the same way. If the sum of votes of swapped judges doesn't change, overall result also remains the same. Otherwise we find the first judge after \(i+1\) who changes his vote. If he doesn't exist, the result decreases by one. If he exists, he changes his vote from \(0\) to \(1\). All judges after him don't change their votes. The result doesn't change.

Let's see how we can use this observation to solve the problem. Focusing it towards extreme cases yields the following new observations:

the highest total score possible is obtained when judges are in increasing order of their assessment (call this \(M\))
the lowest total score possible is obtained when judges are in decreasing order of their assessment (call this \(m\))
any number between \(m\) and \(M\) can be obtained. (let's call this statement \(Q\))

We will investigate the third of these statements a bit further. Let's say the array of assessments of the judges is \(A_1, A_2\ ...\ A_n\), in increasing order. We will build a list \(\mathcal{P}\) of permutations of \(A\), say with \(k\) elements, respecting the following conditions:

\(\mathcal{P}_1\ = A_1\ A_2\ \ \ \ \ ...\ A_n\)
\(\mathcal{P}_k\ = A_n\ A_{n-1}\ ...\ A_1\)
For any \(i < k\), we can transform permutation \(\mathcal{P}_i\) into permutation \(\mathcal{P}_{i+1}\) by swapping a single pair of adjacent elements, \(\mathcal{P}_i[j]\) and \(\mathcal{P}_i[j+1]\) that initially were in increasing order (\(\mathcal{P}_i[j] < \mathcal{P}_i[j+1]\)).

An example of such a list, for \(A=[1, 2, 3]\) is:

\[ \mathcal{P}_1\ = 1\ 2\ 3\\ \mathcal{P}_2\ = 1\ 3\ 2\\ \mathcal{P}_3\ = 3\ 1\ 2\\ \mathcal{P}_4\ = 3\ 2\ 1 \]

The special property of such a list is that the score of the singer for permutation \(\mathcal{P}_i\) will be either the same or one more than the score for \(\mathcal{P}_{i+1}\) (this is due to the observation above). The statement \(Q\) is proved by this: as the score for \(\mathcal{P}_1\) is \(M\), the score for \(\mathcal{P}_k\) is mm and the score decreases by at most \(1\) at every step, every number between \(m\) and \(M\) is achieved by at least one of the permutations in \(\mathcal{P}\). Also, if we denote by \(S_i\) the score for permutation \(\mathcal{P}_i\), \(S\) will be sorted in decreasing order.

If \(X\) is less than \(m\), the answer is \(\mathcal{P}_k\). If \(X\) is greater than \(M\), the answer is \(\mathcal{P}_1\). If \(X\) is between \(m\) and \(M\), we can binary search for a position in \(\mathcal{P}\) that yields exactly \(X\). Beware: you need to be able to compute only the permutations for indexes found through the binary search. You cannot pre-compute all permutations of \(\mathcal{P}\). So all that is left is to find a good way to construct the list \(\mathcal{P}\) that allows this. An easy example is continually taking the last element and moving it to the front:

\[ \mathcal{P}_1\ = A_1\ A_2\ ...\ A_{n-2}\ A_{n-1}\ A_n\\ \mathcal{P}_2\ = A_1\ A_2\ ...\ A_{n-2}\ A_n\ A_{n-1}\\ \mathcal{P}_3\ = A_1\ A_2\ ...\ A_n\ A_{n-2}\ A_{n-1}\\ \vdots\\ \mathcal{P}_{n-1}\ = A_n\ A_1\ A_2\ ...\ A_{n-2}\ A_{n-1}\\ \mathcal{P}_{n}\ = A_n\ A_1\ A_2\ ...\ A_{n-1}\ A_{n-2}\\ \vdots\\ \mathcal{P}_{2n-2}\ = A_n\ A_{n-1}\ A_1\ ...\ A_{n-3}\ A_{n-2}\\ \vdots\\ \mathcal{P}_{k}\ = A_n\ A_{n-1}\ ...\ A_{2}\ A_{1} \]
参考std的程序实现，构造的调整是不断地把第一个元素放到后面。

时间复杂度 \(O(n \log n)\)。

CO int N=100000+10;
int a[N],test[N],nw[N];

int calc(int n,int a[]){
    int sum=0;
    for(int i=2;i<=n;++i)
        if((LD)sum/(i-1)*n<a[i]) ++sum;
    return sum;
}

int main(){
    int n=read<int>(),p=read<int>();
    for(int i=1;i<=n;++i) read(a[i]);
    sort(a+1,a+n+1);
    if(p>=calc(n,a)){
        for(int i=1;i<=n;++i) printf("%d%c",a[i]," \n"[i==n]);
        return 0;
    }
    int l=0,r=n;
    while(l<r){
        int mid=(l+r+1)>>1;
        for(int i=1;i<=n-mid;++i) test[i]=a[i+mid];
        for(int i=1;i<=mid;++i) test[i+n-mid]=a[mid-i+1];
        if(calc(n,test)>p) l=mid;
        else r=mid-1;
    }
    for(int i=1;i<=n-l;++i) nw[i]=a[i+l];
    for(int i=1;i<=l;++i) nw[i+n-l]=a[l-i+1];
    l=0,r=n-r-1;
    while(l<r){
        int mid=(l+r+1)>>1;
        copy(nw+1,nw+n+1,a+1);
        for(int i=1;i<=mid;++i) swap(a[i],a[i+1]);
        if(calc(n,test)>=p) l=mid;
        else r=mid-1;
    }
    copy(nw+1,nw+n+1,a+1);
    for(int i=1;i<=l;++i) swap(a[i],a[i+1]);
    for(int i=1;i<=n;++i) printf("%d%c",a[i]," \n"[i==n]);
    return 0;
}

简单计数

有一个 \(n\) 个点 \(m\) 条边的有向图，每条边有一个 \(1\) 到 \(m\) 的颜色。

对于每条欧拉回路，定义和谐值为相邻的同色边（一条欧拉回路共有 \(m\) 对相邻边）对数，问每条欧拉回路的和谐值之和，模 \(998244353\) 输出。

两条欧拉回路不同当且仅当存在边对 \((e_1,e_2)\)，在一条欧拉回路中 \(e_2\) 紧随 \(e_1\)，在另一条中 \(e_2\) 不紧随 \(e_1\)。

保证给定的图每个点出度至少为 \(1\)，没有自环，对于两个点 \(a\) 和 \(b\)，不存在两条 \(a\) 到 \(b\) 的边，且存在欧拉回路。

Subtask 1，10pts，\(2\leq n\leq 4\)。
Subtask 2，10pts，\(2\leq n\leq 10\) 且 \(m\leq 16\)。
Subtask 3，10pts，\(2\leq n\leq 10\)。
Subtask 4，20pts，\(2\leq n\leq 23\)。
Subtask 5，20pts，\(2\leq n\leq 40\)。
Subtask 6，10pts，\(2\leq n\leq 100\)。
Subtask 7，20pts，\(2\leq n\leq 300\)。

题解

BEST theorem

一个有向弱连通图如果存在欧拉回路，那么设第 \(i\) 个点的度为 \(deg_i\)（因为存在欧拉回路，入度和出度相等），那么欧拉回路的个数为
\[ T(s)\prod_i(deg_i-1)! \]

其中 \(T(s)\) 为以 \(s\) 为根的外向（内向）生成树个数，\(s\) 为任意一个点。

可以证明图中若存在欧拉回路则以任意一个点为根的内向、外向生成树个数都相等。

为了保持题解的完整性，这里给出BEST theorem的大致证明。

我们选择 \(s\) 点出发的任意一条边作为欧拉回路的起始边，考虑这个欧拉回路除 \(s\) 号点外每个点的最后一条出边，如果这些出边不构成一棵朝向 \(s\) 点的内向生成树的话，那么就形成了一些有向环，那么这个欧拉回路就会在那里绕一圈然后出不去，这显然不可能。

我们考虑证如果我们先决定了一棵生成树，再对于每个点决定了剩下的出边的排列顺序，那么这一定是一条合法的欧拉回路。如果它不是合法的话，那么我们肯定会卡在某个点用完了出边。

如果这个点不是 \(s\) 号点的话，那么进入这个点的边数比出这个点的边数多，这显然不可能。

如果卡在 \(s\) 号点的话，假设有一条边还没有用过，那么这个点的最后一条出边也没有用过，那么这条边指向的点的最后一条出边也没有用过，以此类推。最后我们可以推到这条边指向的点为 \(s\) 号点，那么 \(s\) 也有出边没有用过，所以没有卡住。

所以一条欧拉回路与一棵朝向 \(s\) 的生成树和每个点不在树中的出边任意的一个排列一一对应，所以个数即为

\[ T(s)deg_s!\prod_{i\neq s}(deg_i-1)! \]

由于欧拉回路本质上是个环，一个环会被从起点出发的每条边算上一次，所以方案数要除以 \(deg_s\)。所以个数为

\[ T(s)\prod_i(deg_i-1)! \]

这个定理对于有自环和重边的图同样好用。

可以发现，这个定理的成立与生成树的计数无关，所以有：

It is a property of Eulerian graphs that t_v(G) = t_w(G) for every two vertices v and w in a connected Eulerian graph G.

以点 \(s\) 为根的外向生成树个数是一个经典问题。

对于有向图的边 \((u,v)\)，M[u][v]--,M[v][v]++，然后把矩阵 \(M\) 去掉第 \(s\) 行第 \(s\) 列求行列式，行列式的值即为朝向点a的有向生成树个数。

套路运用

回到原问题，我们考虑枚举每对相邻的同色边 \(u\rightarrow v\rightarrow w\)，至多有 \(O(n^3)\) 对这样的边，计数它们在欧拉回路中相邻的方案数。

事实上这个方案数十分容易计算，我们在图中删去 \(u\rightarrow v,v\rightarrow w\)，添上 \(u\rightarrow w\) 的边，那么它们在欧拉回路中相邻的方案显然与这个新图中的欧拉回路一一对应。

需要注意的是如果 \(deg_v=1\)，新图中 \(v\) 没有连边，这时候我们可以发现 \(u\rightarrow v,v\rightarrow w\) 在欧拉回路中必须相邻，那么只要将原图的欧拉回路个数计入答案即可。

以下假设 \(deg_v>1\)，那么我们考虑取 \(s\) 点为 \(v\) 点，删掉了 \(u\rightarrow v,v\rightarrow w\)，加上了 \(u\rightarrow w\)，那么只有 \(v\) 的度减了 \(1\)，所以可以 \(O(1)\) 算右边的连乘式。

那么我们就是要计数以 \(v\) 点为根的外向生成树个数。直接暴力求行列式是 \(O(n^6)\) 的。

至此的解题思路都是套路，已经能得50分了。毕竟时限有6秒。

设原图对应的矩阵为 \(M\)，那么我们删掉了 \(u\rightarrow v,v\rightarrow w\)，加上了 \(u\rightarrow w\)，即 M[u][v]++,M[v][v]--,M[v][w]++,M[w][w]--,M[u][w]--,M[w][w]++。

M[w][w]加一减一抵消之后实际效果就是M[u][v]++,M[v][v]--,M[v][w]++,M[u][w]--。

因为去掉了第 \(v\) 行第 \(v\) 列，那么实际对于行列式的影响只有 M[u][w]--。

众所周知行列式可以如下计算：
\[ \left|\begin{matrix} a & b & c\\ d & e-1 & f\\ g & h & i \end{matrix}\right| =\left|\begin{matrix} a & b & c\\ d & e & f\\ g & h & i \end{matrix}\right| +\left|\begin{matrix} a & b & c\\ 0 & -1 & 0\\ g & h & i \end{matrix}\right| \]

由矩阵树定理，前面那个行列式对于任意 \(n-1\) 阶余子式都是相等的。所以我们需要找到一种快速计算后面那个行列式的办法。

线代小课堂

graph1

子式是选出某些行和列，余子式是删去某些行和列。

代数余子式是余子式乘上 \(-1\) 的行列编号和次方。

graph2

详细写一下过程：

\[ \left|\begin{matrix} a & b & c\\ u & v & w\\ x & y & z \end{matrix}\right| =|a|(-1)^{1+1}\left|\begin{matrix} v & w\\ y & z \end{matrix}\right| +|b|(-1)^{1+2}\left|\begin{matrix} u & w\\ x & z \end{matrix}\right| +|c|(-1)^{1+3}\left|\begin{matrix} u & v\\ x & y \end{matrix}\right|\\ =a\left|\begin{matrix} v & w\\ y & z \end{matrix}\right| -b\left|\begin{matrix} u & w\\ x & z \end{matrix}\right| +c\left|\begin{matrix} u & v\\ x & y \end{matrix}\right| \]

有个这个定理，那么

\[ \left|\begin{matrix} a & b & c\\ 0 & -1 & 0\\ g & h & i \end{matrix}\right| =0(-1)^{2+1}\left|\begin{matrix} b & c\\ h & i \end{matrix}\right| +(-1)(-1)^{2+2}\left|\begin{matrix} a & c\\ g & i \end{matrix}\right| +0(-1)^{2+3}\left|\begin{matrix} a & b\\ g & h \end{matrix}\right|\\ =-\left|\begin{matrix} a & c\\ g & i \end{matrix}\right| \]
注意到只有选到 \(-1\) 的那一行一列时，后面的式子才有值。并且因为选出的行列编号相等，所以符号是 \(1\)。

因此我们的任务就是快速地算出余子式。

那么如何求代数余子式呢？

graph3

若 \(\det A\neq 0\)，则 \(A\) 具有逆矩阵，且 \(A\) 的逆矩阵就是 \(\frac{1}{\det A}\text{adj}~A\)。

所以我们发现对于一个行列式非 \(0\) 的矩阵 \(A\)，求出逆矩阵 \(A^{-1}\) 之后，第 \(i\) 行第 \(j\) 列的代数余子式就是 \(A^{-1}_{j,i}\times \det A\)。

所以M[u][w]减一实际上的效果就是减去了 \(M^{-1}_{w,u}\times \det M\)。这样就可以得到一个\(O(n^4)\) 的做法。

已经可以拿到80分了，考场上推到这一步就该开始打代码了。

CO int N=300+10;
int e[N][N],deg[N];
int fac[N],pre[N],suf[N];
int a[N][N],b[N][N],ib[N][N];

int deter(int n,int a[N][N]){
    int ans=1;
    for(int i=1;i<=n;++i){  
        int p=i;
        for(int j=i;j<=n;++j)
            if(a[j][i]) {p=j;break;}
        if(p!=i) swap(a[p],a[i]),ans=mod-ans;
        ans=mul(ans,a[i][i]);
        int inv=fpow(a[i][i],mod-2);
        for(int j=i+1;j<=n;++j){
            int coef=mul(mod-a[j][i],inv);
            for(int k=i;k<=n;++k) a[j][k]=add(a[j][k],mul(coef,a[i][k]));
        }
    }
    return ans;
}
void inver(int n,int a[N][N],int b[N][N]){
    for(int i=1;i<=n;++i) fill(b[i]+1,b[i]+n+1,0),b[i][i]=1;
    for(int i=1;i<=n;++i){
        int p=i;
        for(int j=i;j<=n;++j)
            if(a[j][i]) {p=j;break;}
        if(p!=i) swap(a[p],a[i]),swap(b[p],b[i]);
        int inv=fpow(a[i][i],mod-2);
        for(int j=i;j<=n;++j) a[i][j]=mul(a[i][j],inv);
        for(int j=1;j<=n;++j) b[i][j]=mul(b[i][j],inv);
        for(int j=1;j<=n;++j)if(j!=i){
            int coef=mod-a[j][i];
            for(int k=i;k<=n;++k) a[j][k]=add(a[j][k],mul(coef,a[i][k]));
            for(int k=1;k<=n;++k) b[j][k]=add(b[j][k],mul(coef,b[i][k]));
        }
    }
}
int main(){
//  freopen("count.in","r",stdin),freopen("count.out","w",stdout);
    int n=read<int>();
    for(int m=read<int>();m--;){
        int c=read<int>(),u=read<int>(),v=read<int>();
        e[u][v]=c,++deg[v];
        a[u][v]=add(a[u][v],mod-1),++a[v][v];
    }
    
    fac[0]=1;
    for(int i=1;i<=n;++i) fac[i]=mul(fac[i-1],i);
    pre[0]=1;
    for(int i=1;i<=n;++i) pre[i]=mul(pre[i-1],fac[deg[i]-1]);
    suf[n+1]=1;
    for(int i=n;i>=1;--i) suf[i]=mul(suf[i+1],fac[deg[i]-1]);
    for(int i=1;i<n;++i) copy(a[i]+1,a[i]+n,b[i]+1);
    int det=deter(n-1,b),all=mul(det,pre[n]);
    
    int ans=0;
    for(int v=1;v<=n;++v){
        if(deg[v]==1){
            for(int u=1;u<=n;++u)if(e[u][v])
                for(int w=1;w<=n;++w)if(e[v][w]==e[u][v])
                    ans=add(ans,all);
            continue;
        }
        int rhs=mul(pre[v-1],mul(fac[deg[v]-2],suf[v+1]));
        for(int i=1;i<=n;++i)if(i!=v)
            for(int j=1;j<=n;++j)if(j!=v)
                b[i-(i>v)][j-(j>v)]=a[i][j];
        inver(n-1,b,ib);
        for(int u=1;u<=n;++u)if(e[u][v])
            for(int w=1;w<=n;++w)if(e[v][w]==e[u][v]){
                int sum=add(det,mod-mul(det,ib[w-(w>v)][u-(u>v)])); // determinant all equal
                ans=add(ans,mul(sum,rhs));
            }
    }
    printf("%d\n",ans);
    return 0;
}

这个做法瓶颈在于对于每个 \(v\)，在去掉第 \(v\) 行第 \(v\) 列之后求逆矩阵。

简单优化

考虑高斯消元求逆矩阵的过程，如果我们消去了除了 \(v\) 以外所有的元，提取出逆矩阵去掉第 \(v\) 行第 \(v\) 列之后的部分，那么效果是一样的。

基于这点我们可以分治消元，例如solve(l,r)表示当前还没有消去 l~r 的元，对于mid+1~r这些行的所有自由元，将其消去后调用solve(l,mid)，对于l~mid这些行的所有自由元，消去后调用solve(mid+1,r)。

时间复杂度 \(O(n^3\log n)\)。由于常数不是很大，这个做法可以通过。

但是有个问题，例如消mid+1~r的元的时候，你是不能把l~mid的行跟它们交换的。

所以这就不一定消得出来，出题人刻意卡是能卡掉的。

我在读入的时候加了个随机化，然后就可以过了。

CO int N=300+10;

struct matrix {int n,v[N][N];};

int deter(matrix a){
    int ans=1;
    for(int i=1;i<=a.n;++i){
        int p=i;
        for(int j=i;j<=a.n;++j)
            if(a.v[j][i]) {p=j;break;}
        if(p!=i) swap(a.v[p],a.v[i]),ans=mod-ans;
        ans=mul(ans,a.v[i][i]);
        int inv=fpow(a.v[i][i],mod-2);
        for(int j=i+1;j<=a.n;++j){
            int coef=mul(mod-a.v[j][i],inv);
            for(int k=i;k<=a.n;++k)
                a.v[j][k]=add(a.v[j][k],mul(coef,a.v[i][k]));
        }
    }
    return ans;
}

int n;
int e[N][N],deg[N];
int fac[N],pre[N],suf[N];
int det,ans;

void solve(int l,int r,CO matrix&a,CO matrix&b){
    if(l==r){
        if(deg[l]==1) return;
        int rhs=mul(pre[l-1],mul(fac[deg[l]-2],suf[l+1]));
        for(int u=1;u<=n;++u)if(e[u][l])
            for(int w=1;w<=n;++w)if(e[l][w]==e[u][l]){
                int sum=add(det,mod-mul(det,b.v[w][u]));
                ans=add(ans,mul(sum,rhs));
            }
        return;
    }
    int mid=(l+r)>>1;
    matrix la=a,lb=b;
    for(int i=mid+1;i<=r;++i){
        int p=i;
        for(int j=i;j<=r;++j)
            if(la.v[j][i]) {p=j;break;}
        if(p!=i) swap(la.v[p],la.v[i]),swap(lb.v[p],lb.v[i]);
        assert(la.v[i][i]);
        int inv=fpow(la.v[i][i],mod-2);
        for(int j=l;j<=r;++j) la.v[i][j]=mul(la.v[i][j],inv);
        for(int j=1;j<=n;++j) lb.v[i][j]=mul(lb.v[i][j],inv);
        for(int j=1;j<=n;++j)if(j!=i){
            int coef=mod-la.v[j][i];
            for(int k=l;k<=r;++k)
                la.v[j][k]=add(la.v[j][k],mul(coef,la.v[i][k]));
            for(int k=1;k<=n;++k)
                lb.v[j][k]=add(lb.v[j][k],mul(coef,lb.v[i][k]));
        }
    }
    solve(l,mid,la,lb);
    matrix ra=a,rb=b;
    for(int i=l;i<=mid;++i){
        int p=i;
        for(int j=i;j<=mid;++j)
            if(ra.v[j][i]) {p=j;break;}
        if(p!=i) swap(ra.v[p],ra.v[i]),swap(rb.v[p],rb.v[i]);
        assert(ra.v[i][i]);
        int inv=fpow(ra.v[i][i],mod-2);
        for(int j=l;j<=r;++j) ra.v[i][j]=mul(ra.v[i][j],inv);
        for(int j=1;j<=n;++j) rb.v[i][j]=mul(rb.v[i][j],inv);
        for(int j=1;j<=n;++j)if(j!=i){
            int coef=mod-ra.v[j][i];
            for(int k=l;k<=r;++k)
                ra.v[j][k]=add(ra.v[j][k],mul(coef,ra.v[i][k]));
            for(int k=1;k<=n;++k)
                rb.v[j][k]=add(rb.v[j][k],mul(coef,rb.v[i][k]));
        }
    }
    solve(mid+1,r,ra,rb);
}

int per[N];

int main(){
    freopen("count.in","r",stdin),freopen("count.out","w",stdout);
    srand(20030506);
    read(n);
    for(int i=1;i<=n;++i) per[i]=i;
    random_shuffle(per+1,per+n+1); // edit 1
    matrix a=(matrix){n};
    for(int i=1;i<=n;++i) fill(a.v[i]+1,a.v[i]+n+1,0);
    for(int m=read<int>();m--;){
        int c=read<int>(),u=per[read<int>()],v=per[read<int>()];
        e[u][v]=c,++deg[v];
        a.v[u][v]=add(a.v[u][v],mod-1),++a.v[v][v];
    }
    
    fac[0]=1;
    for(int i=1;i<=n;++i) fac[i]=mul(fac[i-1],i);
    pre[0]=1;
    for(int i=1;i<=n;++i) pre[i]=mul(pre[i-1],fac[deg[i]-1]);
    suf[n+1]=1;
    for(int i=n;i>=1;--i) suf[i]=mul(suf[i+1],fac[deg[i]-1]);
    
    a.n=n-1,det=deter(a);
    int all=mul(det,pre[n]);
    for(int v=1;v<=n;++v)if(deg[v]==1)
        for(int u=1;u<=n;++u)if(e[u][v])
            for(int w=1;w<=n;++w)if(e[v][w]==e[u][v])
                ans=add(ans,all);
    
    a.n=n;
    matrix b=(matrix){n};
    for(int i=1;i<=n;++i) fill(b.v[i]+1,b.v[i]+n+1,0),b.v[i][i]=1;
    solve(1,n,a,b);
    printf("%d\n",ans);
    return 0;
}

test20191210 钟子谦