I was solving the problem 364E - Empty Rectangles. The time complexity of my solution is $$$O(n^2k \log n)$$$, where $$$n \leq 2500$$$ and $$$k \leq 6$$$. The time limit for this problem is 12 seconds.
However, my submission 295732996 runs significantly slower on Codeforces compared to AtCoder and my local machine. (btw, I AC this problem using C++17 (GCC 7-32)
, 11155 ms, a little faster)
Comparisons
To investigate, I embedded a data generator that creates random matrices with uniformly distributed 0s and 1s, then ran the code on different platforms:
On Codeforces: 12437ms
On AtCoder: 4260ms
On My Local Machine (i3-6100, g++ 14.2.1, Arch Linux): 4596ms
The Code
Here is the implementation with the embedded data generator:
#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using u8 = uint8_t;
using i64 = long long;
using i128 = __int128_t;
using u32 = unsigned int;
using u64 = unsigned long long;
using u128 = __uint128_t;
template <typename T, int N>
struct StaticVector
{
int p;
std::array<T, N> v;
StaticVector() : p(0) {}
T &operator[](int x) { return v[x]; }
const T &operator[](int x) const { return v[x]; }
int size() const { return p; }
bool full() const { return p == (int)v.size(); }
void push_back(const T &t)
{
if (full()) return ;
v[p++] = t;
}
};
template <typename T, int K, typename Cmp>
StaticVector<T, K> merge(const StaticVector<T, K> &a, const StaticVector<T, K> &b, Cmp &&cmp)
{
StaticVector<T, K> c;
int i = 0, j = 0;
while (!c.full() && j < b.size()) {
if (cmp(a[i], b[j])) c.push_back(a[i++]);
else c.push_back(b[j++]);
}
while (!c.full()) c.push_back(a[i++]);
return c;
}
std::vector<std::vector<u8>> transpose(std::vector<std::vector<u8>> a)
{
int n = a.size();
int m = a[0].size();
std::vector<std::vector<u8>> b(m, std::vector<u8>(n));
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
b[j][i] = a[i][j];
}
}
return b;
}
template <int K>
i64 solve_details(std::vector<std::vector<u8>> a)
{
if (a.size() < a[0].size()) a = transpose(std::move(a));
if (a.size() == 1 && a[0].size() == 1) {
return a[0][0] == K;
} else {
int mid = a.size() / 2;
int n = a[0].size();
static std::vector<StaticVector<int, K + 2>> f;
static std::vector<StaticVector<int, K + 2>> g;
f.assign(n, {});
g.assign(n, {});
for (int i = mid - 1; i >= 0; i--) {
for (int j = 0; j < n; j++) {
if (a[i][j] == 1) f[j].push_back(i);
}
}
for (int i = mid; i < (int)a.size(); i++) {
for (int j = 0; j < n; j++) {
if (a[i][j] == 1) g[j].push_back(i + 1);
}
}
i64 ans = 0;
for (int l = 0; l < n; l++) {
StaticVector<int, K + 2> fp, gp;
fp.push_back(mid);
gp.push_back(mid);
for (int t = 0; t <= K; t++) {
fp.push_back(-1);
gp.push_back(a.size() + 1);
}
for (int r = l + 1; r <= n; r++) {
fp = merge(std::move(fp), f[r - 1], std::greater<int>{});
gp = merge(std::move(gp), g[r - 1], std::less<int>{});
for (int i = 0; i <= K; i++) {
int uc = fp[i] - fp[i + 1];
int dc = gp[K - i + 1] - gp[K - i];
if (fp[i] == mid) uc--;
if (gp[K - i] == mid) dc--;
ans += uc * dc;
}
}
}
ans += solve_details<K>(std::vector(a.begin(), a.begin() + mid));
ans += solve_details<K>(std::vector(a.begin() + mid, a.end()));
return ans;
}
}
template <int K>
i64 solve(std::vector<std::vector<u8>> a, int k)
{
if (K < k) return solve<std::min(K + 1, 6)>(std::move(a), k);
return solve_details<K>(std::move(a));
}
int main()
{
std::cin.tie(nullptr);
std::ios::sync_with_stdio(false);
int n, m, k;
std::cin >> n >> m >> k;
std::vector a(n, std::vector<u8>(m));
std::mt19937 mt;
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
a[i][j] = mt() & 1;
}
}
std::cout << solve<0>(a, k) << std::endl;
}
Why does my code run so much slower on Codeforces? Is this due to differences in the hardware, compiler, or some other factor? Any insights would be appreciated.