Anyways, I have indeed done the swap to cpp and it is so fast! In java, I had my normal b-tree solution, but I had to go through the hell of writing the ceil function this time for it. Unfortunately, despite all my optimizations, I was unable to ac all the test cases on traffic on cses. I really pulled out all the stops. Anyways, literally just transcribing it to cpp almost line for line allowed it to comfortably slide under the 1 second bounds. Anyways, I digress. Since then, I've been on a cpp high and for some inexplicable reason, my stupid ahh thought it would be a good idea to turn the general k-killer josephus problem into O(n). At first I was going to start domestically abusing the CPU and force it to reveal the n-th bit of a 200000 bit string, but unfortunately, I think cses is all out of x200000 CPUs. I instead had to settle for a 3 layer prefix sum lookup table solution which unfortunately scales badly, but is quite good for n <= 2e5. All of those static casts are quite necessary!!!! I don't like the yellow squigglies that clang gives me! Unfortunately for me, I somehow as a god at fucking up in the most unnoticeable way and was actually doing my bitshifts inverted! I spent a sad 3 hours debugging that. But that's all in the past now, and it is the most glorious code you've ever laid your eyes upon.
Anyways, the macro and var names are obviously the greatest you've ever seen. Tell me how I should optimize this more! I think it might be possible to parallelize it with vectorization and bitshifts for a 4x speed up on the initial construction of the prefix sums. Obviously, I could probably directly set the prefix sums instead of using a few methods to build it up, but where's the fun in that?
#include <iostream>
using namespace std;
#define tinyuwublock 16
#define numtinyuwus 16384
#define uwusperowo 256
#define numowos 64
#define owospermegauwu 16
#define nummegauwus 4
uint16_t bitvector[numtinyuwus];
uint16_t uwucounts[numtinyuwus];
uint32_t owoprefix[numowos+1];
uint32_t megauwuprefix[nummegauwus+1];
uint8_t lut[65536][16];
void buildluts() {
for (int i = 0; i < 65536; ++i) {
int c = 0;
for (int b = 0; b < 16; ++b) {
if (i & (1 << b)) {
lut[i][c++] = b;
}
}
for (int b = c; b < 16; ++b) {
lut[i][b] = 255;
}
}
}
void repairprefix() {
for (int i = 0; i < numtinyuwus; ++i) {
uwucounts[i] = __builtin_popcount(bitvector[i]);
}
owoprefix[0] = 0;
for (int i = 0; i < numowos; ++i) {
uint32_t sum = 0;
int start = i * uwusperowo;
for (int j = 0; j < uwusperowo; ++j) {
sum += uwucounts[start + j];
}
owoprefix[i+1] = owoprefix[i] + sum;
}
megauwuprefix[0] = 0;
for (int i = 0; i < nummegauwus; ++i) {
uint32_t s = 0;
int st = i * owospermegauwu;
for (int j = 0; j < owospermegauwu; ++j) {
s+= owoprefix[st + j + 1] - owoprefix[j + st];
}
megauwuprefix[i+1] = megauwuprefix[i] + s;
}
}
int select1(uint32_t n) {
int muwu = 0;
while (megauwuprefix[muwu + 1] < n) ++muwu;
n -= megauwuprefix[muwu];
int owo = muwu * owospermegauwu;
while (owoprefix[owo + 1] - megauwuprefix[muwu] < n) ++owo;
n -= (owoprefix[owo] - megauwuprefix[muwu]);
int base = owo * uwusperowo;
int i = 0;
while (uwucounts[base + i] < n) {
n -= uwucounts[base + i];
++i;
}
int idx = base + i;
return idx * tinyuwublock + lut[bitvector[idx]][n - 1];
}
void flip(int n) {
int uwu = n/tinyuwublock;
int off = n%tinyuwublock;
int q = ((bitvector[uwu] >> off) & 1) ? -1 : 1;
bitvector[uwu] ^= (1 << off);
uwucounts[uwu] += q;
int owo = uwu / uwusperowo;
for (int i = owo+1; i <= numowos; ++i) {
if (q == -1) owoprefix[i]-=1;
else owoprefix[i] += 1;
}
int muwu = owo / owospermegauwu;
for (int i = muwu+1; i <= nummegauwus; ++i) {
if (q == -1) megauwuprefix[i] -= 1;
else megauwuprefix[i] += 1;
}
}
int main() {
cin.tie(nullptr);
ios::sync_with_stdio(false);
int n;
cin >> n;
int k;
cin >> k;
++k;
int full_blocks = n / tinyuwublock;
for (int i = 0; i < full_blocks; ++i) bitvector[i] = 0xFFFF;
if (n % tinyuwublock) bitvector[full_blocks] = (1u << (n % tinyuwublock)) - 1;
uint32_t pos = 0;
int alive = n;
buildluts();
repairprefix();
while (alive != 0) {
pos = (static_cast<uint32_t>(pos) + static_cast<uint32_t>(k) - 1u) % static_cast<uint32_t>(alive);
int idx = select1(static_cast<uint32_t>(pos + 1u));
flip(idx);
--alive;
cout << idx +1 << ' ';
}
}









wow this is some funny naming conventions