I am solving this problem and get a TLE although i do an O(nlogn^2) solution with time limit = 2s. Is this tle caused by ordered_set swap or is something in my code wrong???
My code
#pragma GCC optimize("Ofast")
#pragma GCC optimize("unroll-loops")
#include<bits/stdc++.h>
#include <ext/pb_ds/assoc_container.hpp>
#include <ext/pb_ds/tree_policy.hpp>
using namespace std;
using namespace __gnu_pbds;
// #define int long long
#define fi first
#define se second
#define siz(x) (int)(x.size())
#define all(x) x.begin(), x.end()
#define debug_arr(x,len) for(int _=1; _<=len; _++) cout<<x[_]<<" "; cout<<'\n';
#define debug(x) cout<<'\n'<<#x<<": "<<x<<'\n';
#define ordered_set tree<int, null_type,less<int>, rb_tree_tag,tree_order_statistics_node_update>
const int maxN = 2e5+5;
int n, a[maxN], ans[maxN];
vector<int>adj[maxN];
ordered_set s[maxN];
void dfs(int u, int v)
{
for(auto i: adj[u])
{
if(i == v) continue;
dfs(i,u);
if(siz(s[u]) < siz(s[i])) swap(s[u], s[i]);
for(auto j: s[i]) s[u].insert(j);
}
ans[u] = siz(s[u]) - s[u].order_of_key(a[u]+1);
}
void solve()
{
}
int32_t main()
{
ios_base::sync_with_stdio(0); cin.tie(0);
cin>>n;
for(int i=1; i<=n; i+=1) cin>>a[i];
for(int i=2; i<=n; i+=1)
{
int x; cin>>x;
adj[x].push_back(i);
adj[i].push_back(x);
}
for(int i=1; i<=n; i+=1) s[i].insert(a[i]);
dfs(1, 0);
for(int i=1; i<=n; i+=1) cout<<ans[i]<<'\n';
}



