forked from WojciechMula/sse-popcount
-
Notifications
You must be signed in to change notification settings - Fork 0
/
popcnt-bit-parallel-scalar.cpp
98 lines (68 loc) · 2.89 KB
/
popcnt-bit-parallel-scalar.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
std::uint64_t popcnt_parallel_64bit_naive(const uint8_t* data, const size_t n) {
uint64_t result = 0;
size_t i = 0;
#define ITER { \
const uint64_t t1 = *reinterpret_cast<const uint64_t*>(data + i); \
const uint64_t t2 = (t1 & 0x5555555555555555llu) + ((t1 >> 1) & 0x5555555555555555llu); \
const uint64_t t3 = (t2 & 0x3333333333333333llu) + ((t2 >> 2) & 0x3333333333333333llu); \
const uint64_t t4 = (t3 & 0x0f0f0f0f0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0f0f0f0f0fllu); \
const uint64_t t5 = (t4 & 0x00ff00ff00ff00ffllu) + ((t4 >> 8) & 0x00ff00ff00ff00ffllu); \
const uint64_t t6 = (t5 & 0x0000ffff0000ffffllu) + ((t5 >> 16) & 0x0000ffff0000ffffllu); \
const uint64_t t7 = (t6 & 0x00000000ffffffffllu) + ((t6 >> 32) & 0x00000000ffffffffllu); \
result += t7; \
i += 8; \
}
while (i + 4*8 <= n) {
ITER ITER ITER ITER
}
#undef ITER
for (/**/; i < n; i++) {
result += lookup8bit[data[i]];
}
return result;
}
std::uint64_t popcnt_parallel_64bit_optimized(const uint8_t* data, const size_t n) {
uint64_t result = 0;
size_t i = 0;
while (i + 4*8 <= n) {
uint64_t partial = 0; // packed_byte
#define ITER { \
const uint64_t t1 = *reinterpret_cast<const uint64_t*>(data + i); \
const uint64_t t2 = (t1 & 0x5555555555555555llu) + ((t1 >> 1) & 0x5555555555555555llu); \
const uint64_t t3 = (t2 & 0x3333333333333333llu) + ((t2 >> 2) & 0x3333333333333333llu); \
const uint64_t t4 = (t3 & 0x0f0f0f0f0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0f0f0f0f0fllu); \
partial += t4; \
i += 8; \
}
ITER ITER ITER ITER
#undef ITER
const uint64_t t5 = (partial & 0x00ff00ff00ff00ffllu) + ((partial >> 8) & 0x00ff00ff00ff00ffllu);
const uint64_t t6 = (t5 & 0x0000ffff0000ffffllu) + ((t5 >> 16) & 0x0000ffff0000ffffllu);
const uint64_t t7 = (t6 & 0x00000000ffffffffllu) + ((t6 >> 32) & 0x00000000ffffffffllu);
result += t7;
}
for (/**/; i < n; i++) {
result += lookup8bit[data[i]];
}
return result;
}
// popcnt_mul from popcnt-harley-seal.cpp
std::uint64_t popcnt_parallel_64bit_mul(const uint8_t* data, const size_t n) {
uint64_t result = 0;
size_t i = 0;
for (/**/; i < n; i += 8) {
const uint64_t m1 = UINT64_C(0x5555555555555555);
const uint64_t m2 = UINT64_C(0x3333333333333333);
const uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F);
const uint64_t h01 = UINT64_C(0x0101010101010101);
uint64_t x = *reinterpret_cast<const uint64_t*>(data + i);
x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
x = (x + (x >> 4)) & m4;
result += (x * h01) >> 56;
}
for (/**/; i < n; i++) {
result += lookup8bit[data[i]];
}
return result;
}