I was stuck on this CSES question : Corner Subgrid Count for nearly 2 months, when I had already got the approach at the first view only. (It's not that I have been trying daily since the last 2 months, I got TLE and I left... bruhh — )
And I just figured out that I was not using the correct compile-time optimization (bruh-, what the hell was that). I was trying with Ofast and O3 optimizations but just do popcnt optimization and you are done. My approach (as obvious) was just break each row into bits of size 64, do pairwise-AND and tally the number of bits set in the pairwise-ANDs. Finally, add (count*(count — 1))/2 to the ans.
But I am amazed to see that how powerful a tool pragma actually is and how it can make your life easy :
TLE-ed Code#include <bits/stdc++.h>
using namespace std;
#pragma GCC optimize("Ofast")
#define int long long int
#define ull unsigned long long
//std::mt19937_64 rng(std::chrono::steady_clock::now().time_since_epoch().count());
constexpr int V = 1E9;
constexpr int MAXN=1e9;
const int mod=998244353;
void solve()
{
int n;
cin >> n ;
vector<string> v(n);
for(int i=0; i<n; i++) cin>>v[i];
int m = v[0].size();
int B = (m + 63) / 64;
vector<vector<unsigned long long>> vals(n, vector<unsigned long long>(B, 0ULL));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
if (v[i][j] == '1') {
int b = j >> 6; // j / 64
int off = j & 63; // j % 64
vals[i][b] |= (1ULL << off);
}
}
}
// for(auto x: vals)
// {
// for(auto y : x) cout<<y<<" ";
// cout<<endl;
// }
int ans = 0;
for (int i = 0; i < n; ++i) {
const ull *Ai = vals[i].data();
for (int j = i + 1; j < n; ++j) {
const ull *Aj = vals[j].data();
int cnt = 0;
for (int z = 0; z < B; ++z) {
cnt += __builtin_popcountll(Ai[z] & Aj[z]);
}
if (cnt >= 2) ans += 1LL * cnt * (cnt - 1) / 2;
}
}
cout<<ans<<endl;
}
int32_t main() {
std::ios::sync_with_stdio(false);
std::cin.tie(nullptr);
//auto begin = std::chrono::high_resolution_clock::now();
//freopen("piggyback.in", "r", stdin);
//freopen("piggyback.out", "w", stdout);
int t=1;
//cin >> t;
while (t--)
{
solve();
}
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
// cerr << "Time measured: " << elapsed.count() * 1e-9 << " seconds.\n";
return 0;
}
Accepted Code#include <bits/stdc++.h>
using namespace std;
#pragma GCC target("popcnt")
#define int long long int
#define ull unsigned long long
//std::mt19937_64 rng(std::chrono::steady_clock::now().time_since_epoch().count());
constexpr int V = 1E9;
constexpr int MAXN=1e9;
const int mod=998244353;
void solve()
{
int n;
cin >> n ;
vector<string> v(n);
for(int i=0; i<n; i++) cin>>v[i];
int m = v[0].size();
int B = (m + 63) / 64;
vector<vector<unsigned long long>> vals(n, vector<unsigned long long>(B, 0ULL));
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
if (v[i][j] == '1') {
int b = j >> 6; // j / 64
int off = j & 63; // j % 64
vals[i][b] |= (1ULL << off);
}
}
}
// for(auto x: vals)
// {
// for(auto y : x) cout<<y<<" ";
// cout<<endl;
// }
int ans = 0;
for (int i = 0; i < n; ++i) {
const ull *Ai = vals[i].data();
for (int j = i + 1; j < n; ++j) {
const ull *Aj = vals[j].data();
int cnt = 0;
for (int z = 0; z < B; ++z) {
cnt += __builtin_popcountll(Ai[z] & Aj[z]);
}
if (cnt >= 2) ans += 1LL * cnt * (cnt - 1) / 2;
}
}
cout<<ans<<endl;
}
int32_t main() {
std::ios::sync_with_stdio(false);
std::cin.tie(nullptr);
//auto begin = std::chrono::high_resolution_clock::now();
//freopen("piggyback.in", "r", stdin);
//freopen("piggyback.out", "w", stdout);
int t=1;
//cin >> t;
while (t--)
{
solve();
}
// auto end = std::chrono::high_resolution_clock::now();
// auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
// cerr << "Time measured: " << elapsed.count() * 1e-9 << " seconds.\n";
return 0;
}
If any of you got it Accepted without Pragma, I would love to hear that too
Полный текст и комментарии »