Why is My Code Running Slowly on Codeforces Compared to AtCoder and My Computer?

#	User	Rating
1	tourist	3985
2	jiangly	3814
3	jqdai0815	3682
4	Benq	3529
5	orzdevinwang	3526
6	ksun48	3517
7	Radewoosh	3410
8	hos.lyric	3399
9	ecnerwala	3392
9	Um_nik	3392

#	User	Contrib.
1	cry	169
2	maomao90	162
2	Um_nik	162
4	atcoder_official	161
5	djm03178	158
6	-is-this-fft-	157
7	adamant	155
8	awoo	154
8	Dominater069	154
10	luogu_official	150

I was solving the problem 364E - Empty Rectangles. The time complexity of my solution is $$$O(n^2k \log n)$$$, where $$$n \leq 2500$$$ and $$$k \leq 6$$$. The time limit for this problem is 12 seconds.

However, my submission 295732996 runs significantly slower on Codeforces compared to AtCoder and my local machine. (btw, I AC this problem using C++17 (GCC 7-32), 11155 ms, a little faster)

Comparisons

To investigate, I embedded a data generator that creates random matrices with uniformly distributed 0s and 1s, then ran the code on different platforms:

On Codeforces: 12437ms
On AtCoder: 4260ms
On My Local Machine (i3-6100, g++ 14.2.1, Arch Linux): 4596ms

The Code

Here is the implementation with the embedded data generator:

#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>

using u8 = uint8_t;
using i64 = long long;
using i128 = __int128_t;
using u32 = unsigned int;
using u64 = unsigned long long;
using u128 = __uint128_t;

template <typename T, int N>
struct StaticVector
{
  int p;
  std::array<T, N> v;
  
  StaticVector() : p(0) {}

  T &operator[](int x) { return v[x]; }
  const T &operator[](int x) const { return v[x]; }

  int size() const { return p; }

  bool full() const { return p == (int)v.size(); }

  void push_back(const T &t) 
  { 
    if (full()) return ;
    v[p++] = t; 
  }
};

template <typename T, int K, typename Cmp>
StaticVector<T, K> merge(const StaticVector<T, K> &a, const StaticVector<T, K> &b, Cmp &&cmp)
{
  StaticVector<T, K> c;
  int i = 0, j = 0;
  while (!c.full() && j < b.size()) {
    if (cmp(a[i], b[j])) c.push_back(a[i++]);
    else c.push_back(b[j++]);
  }
  while (!c.full()) c.push_back(a[i++]);
  return c;
}

std::vector<std::vector<u8>> transpose(std::vector<std::vector<u8>> a)
{
  int n = a.size();
  int m = a[0].size();
  std::vector<std::vector<u8>> b(m, std::vector<u8>(n));
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < m; j++) {
      b[j][i] = a[i][j];
    }
  }
  return b;
}

template <int K>
i64 solve_details(std::vector<std::vector<u8>> a)
{
  if (a.size() < a[0].size()) a = transpose(std::move(a));

  if (a.size() == 1 && a[0].size() == 1) {
    return a[0][0] == K;
  } else {
    int mid = a.size() / 2;
    int n = a[0].size();

    static std::vector<StaticVector<int, K + 2>> f;
    static std::vector<StaticVector<int, K + 2>> g;

    f.assign(n, {});
    g.assign(n, {});

    for (int i = mid - 1; i >= 0; i--) {
      for (int j = 0; j < n; j++) {
        if (a[i][j] == 1) f[j].push_back(i);
      }
    }

    for (int i = mid; i < (int)a.size(); i++) {
      for (int j = 0; j < n; j++) {
        if (a[i][j] == 1) g[j].push_back(i + 1);
      }
    }

    i64 ans = 0;

    for (int l = 0; l < n; l++) {
      StaticVector<int, K + 2> fp, gp;
      fp.push_back(mid);
      gp.push_back(mid);
      for (int t = 0; t <= K; t++) {
        fp.push_back(-1);
        gp.push_back(a.size() + 1);
      }
      for (int r = l + 1; r <= n; r++) {
        fp = merge(std::move(fp), f[r - 1], std::greater<int>{});
        gp = merge(std::move(gp), g[r - 1], std::less<int>{});
        for (int i = 0; i <= K; i++) {
          int uc = fp[i] - fp[i + 1];
          int dc = gp[K - i + 1] - gp[K - i];
          if (fp[i] == mid) uc--;
          if (gp[K - i] == mid) dc--;
          ans += uc * dc;
        }
      }
    }

    ans += solve_details<K>(std::vector(a.begin(), a.begin() + mid));
    ans += solve_details<K>(std::vector(a.begin() + mid, a.end()));

    return ans;
  }
}

template <int K>
i64 solve(std::vector<std::vector<u8>> a, int k)
{
  if (K < k) return solve<std::min(K + 1, 6)>(std::move(a), k);
  return solve_details<K>(std::move(a));
}

int main()
{
  std::cin.tie(nullptr);
  std::ios::sync_with_stdio(false);

  int n, m, k;
  std::cin >> n >> m >> k;

  std::vector a(n, std::vector<u8>(m));

  std::mt19937 mt;

  for (int i = 0; i < n; i++) {
    for (int j = 0; j < m; j++) {
      a[i][j] = mt() & 1;
    }
  }

  std::cout << solve<0>(a, k) << std::endl;
}

Why does my code run so much slower on Codeforces? Is this due to differences in the hardware, compiler, or some other factor? Any insights would be appreciated.

EarthMessenger's blog

Comparisons

The Code