I was solving the problem 364E - Empty Rectangles. The time complexity of my solution is $$$O(n^2k \log n)$$$, where $$$n \leq 2500$$$ and $$$k \leq 6$$$. The time limit for this problem is 12 seconds.
However, my submission 295732996 runs significantly slower on Codeforces compared to AtCoder and my local machine. (btw, I AC this problem using C++17 (GCC 7-32)
, 11155 ms, a little faster)
Comparisons
To investigate, I embedded a data generator that creates random matrices with uniformly distributed 0s and 1s, then ran the code on different platforms:
On Codeforces: 12437ms
On AtCoder: 4260ms
On My Local Machine (i3-6100, g++ 14.2.1, Arch Linux): 4596ms
The Code
Here is the implementation with the embedded data generator:
#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
using u8 = uint8_t;
using i64 = long long;
using i128 = __int128_t;
using u32 = unsigned int;
using u64 = unsigned long long;
using u128 = __uint128_t;
template <typename T, int N>
struct StaticVector
{
int p;
std::array<T, N> v;
StaticVector() : p(0) {}
T &operator[](int x) { return v[x]; }
const T &operator[](int x) const { return v[x]; }
int size() const { return p; }
bool full() const { return p == (int)v.size(); }
void push_back(const T &t)
{
if (full()) return ;
v[p++] = t;
}
};
template <typename T, int K, typename Cmp>
StaticVector<T, K> merge(const StaticVector<T, K> &a, const StaticVector<T, K> &b, Cmp &&cmp)
{
StaticVector<T, K> c;
int i = 0, j = 0;
while (!c.full() && j < b.size()) {
if (cmp(a[i], b[j])) c.push_back(a[i++]);
else c.push_back(b[j++]);
}
while (!c.full()) c.push_back(a[i++]);
return c;
}
std::vector<std::vector<u8>> transpose(std::vector<std::vector<u8>> a)
{
int n = a.size();
int m = a[0].size();
std::vector<std::vector<u8>> b(m, std::vector<u8>(n));
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
b[j][i] = a[i][j];
}
}
return b;
}
template <int K>
i64 solve_details(std::vector<std::vector<u8>> a)
{
if (a.size() < a[0].size()) a = transpose(std::move(a));
if (a.size() == 1 && a[0].size() == 1) {
return a[0][0] == K;
} else {
int mid = a.size() / 2;
int n = a[0].size();
static std::vector<StaticVector<int, K + 2>> f;
static std::vector<StaticVector<int, K + 2>> g;
f.assign(n, {});
g.assign(n, {});
for (int i = mid - 1; i >= 0; i--) {
for (int j = 0; j < n; j++) {
if (a[i][j] == 1) f[j].push_back(i);
}
}
for (int i = mid; i < (int)a.size(); i++) {
for (int j = 0; j < n; j++) {
if (a[i][j] == 1) g[j].push_back(i + 1);
}
}
i64 ans = 0;
for (int l = 0; l < n; l++) {
StaticVector<int, K + 2> fp, gp;
fp.push_back(mid);
gp.push_back(mid);
for (int t = 0; t <= K; t++) {
fp.push_back(-1);
gp.push_back(a.size() + 1);
}
for (int r = l + 1; r <= n; r++) {
fp = merge(std::move(fp), f[r - 1], std::greater<int>{});
gp = merge(std::move(gp), g[r - 1], std::less<int>{});
for (int i = 0; i <= K; i++) {
int uc = fp[i] - fp[i + 1];
int dc = gp[K - i + 1] - gp[K - i];
if (fp[i] == mid) uc--;
if (gp[K - i] == mid) dc--;
ans += uc * dc;
}
}
}
ans += solve_details<K>(std::vector(a.begin(), a.begin() + mid));
ans += solve_details<K>(std::vector(a.begin() + mid, a.end()));
return ans;
}
}
template <int K>
i64 solve(std::vector<std::vector<u8>> a, int k)
{
if (K < k) return solve<std::min(K + 1, 6)>(std::move(a), k);
return solve_details<K>(std::move(a));
}
int main()
{
std::cin.tie(nullptr);
std::ios::sync_with_stdio(false);
int n, m, k;
std::cin >> n >> m >> k;
std::vector a(n, std::vector<u8>(m));
std::mt19937 mt;
for (int i = 0; i < n; i++) {
for (int j = 0; j < m; j++) {
a[i][j] = mt() & 1;
}
}
std::cout << solve<0>(a, k) << std::endl;
}
Why does my code run so much slower on Codeforces? Is this due to differences in the hardware, compiler, or some other factor? Any insights would be appreciated.
I meet the same trouble on some problems. Actually my PC is old and not fast. But in some problems my computer runs even faster than CF. In my impression, CF judge runs fast. Is the judge slower than before?