This is my hand-unroll of the well known (?) 64-ary tree, or, "bitset" tree used for predecessor problems. I tried to squeeze a pretty rudimentary implementation of 200A - Cinema in $$$O(k \sqrt{k} \log_W(n))$$$ using my usual bitset tree template, but it fell pretty short (1.7s). Rewriting made it go down to just 0.6s, which is seen in these two submissions: 376029723, 376055924.
Codestruct PredProblem
{
static constexpr int B = 64;
uint64_t tree[32];
int treeID;
PredProblem()
{
for (auto &i : tree)
i = -1;
treeID = -1;
};
void erase(const int i)
{
tree[i / B] &= ~(1ULL << (i % B));
if (!tree[i / B])
treeID &= ~(1 << (i / B));
};
bool get(const int i)
{
return tree[i / B] & (1ULL << (i % B));
};
int next(const int i)
{
uint64_t a = tree[i / B] & -(1ULL << (i % B));
if (!a)
{
int nb = __builtin_ctz(treeID & -(1 << (i / B + 1)));
return nb * B + __builtin_ctzll(tree[nb]);
}
else
return i / B * B + __builtin_ctzll(a);
};
int prev(const int i)
{
uint64_t a = tree[i / B] & ((i % B == 63 ? 0 : (1ULL << (i % B + 1))) - 1);
if (!a)
{
int x = treeID & ((1 << (i / B)) - 1);
if (!x)
return -1;
int nb = __lg(x);
return nb * B + __lg(tree[nb]);
}
else
return i / B * B + __lg(a);
};
};
This can probably beat the DSU implementation suggested (technically not in TC, since they treat $$$O(\alpha(n))$$$ as $$$O(1)$$$). At least it uses much lower memory.