#include <m4ri/m4ri_config.h>

#include <assert.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

/*
 * Compute m[i] ^= t[0][i] ^ t[1][i] ^ ... ^ t[N-1][i] for 0 <= i < wide.
 *
 * This template is instantiated for a fixed number of source rows N
 * (1 <= N <= 8). word, wi_t and __M4RI_TEMPLATE_NAME() come from
 * m4ri/misc.h; N is expected to be a macro set by the including file.
 */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);

#if __M4RI_HAVE_SSE2
  /* If the target row is not 16-byte aligned, peel off one leading word with
     plain word-wise XOR so that the SSE2 loops below operate on aligned data. */
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16))) {
    switch (N) {
    case 8:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
      break;
    case 7:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++;
      break;
    case 6:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++;
      break;
    case 5:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++;
      break;
    case 4:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++;
      break;
    case 3:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++;
      break;
    case 2:
      *m++ ^= *t[0]++ ^ *t[1]++;
      break;
    case 1:
      *m++ ^= *t[0]++;
      break;
    }
    wide--;
  }
  /* Vector view of the target row and of each of the N source rows.
     The switch falls through intentionally: case k initialises t__[N - k]. */
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  switch (N) {
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  }
  __m128i xmm0, xmm1, xmm2, xmm3;

  wi_t i = 0;

  /* Main loop: 4 x 128-bit words (eight 64-bit words) per iteration.
     The switch falls through from case N down to case 1, so each of the
     N source rows is XORed into the four accumulators exactly once. */
  for (; i + 4 <= (wide >> 1); i += 4) {
    xmm0 = m__[0];
    xmm1 = m__[1];
    xmm2 = m__[2];
    xmm3 = m__[3];
    switch (N) {
    case 8:
      xmm0 = _mm_xor_si128(xmm0, t__[7][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[7][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[7][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[7][3]);
      t__[7] += 4;
    case 7:
      xmm0 = _mm_xor_si128(xmm0, t__[6][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[6][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[6][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[6][3]);
      t__[6] += 4;
    case 6:
      xmm0 = _mm_xor_si128(xmm0, t__[5][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[5][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[5][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[5][3]);
      t__[5] += 4;
    case 5:
      xmm0 = _mm_xor_si128(xmm0, t__[4][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[4][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[4][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[4][3]);
      t__[4] += 4;
    case 4:
      xmm0 = _mm_xor_si128(xmm0, t__[3][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[3][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[3][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[3][3]);
      t__[3] += 4;
    case 3:
      xmm0 = _mm_xor_si128(xmm0, t__[2][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[2][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[2][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[2][3]);
      t__[2] += 4;
    case 2:
      xmm0 = _mm_xor_si128(xmm0, t__[1][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[1][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[1][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[1][3]);
      t__[1] += 4;
    case 1:
      xmm0 = _mm_xor_si128(xmm0, t__[0][0]);
      xmm1 = _mm_xor_si128(xmm1, t__[0][1]);
      xmm2 = _mm_xor_si128(xmm2, t__[0][2]);
      xmm3 = _mm_xor_si128(xmm3, t__[0][3]);
      t__[0] += 4;
    }
    m__[0] = xmm0;
    m__[1] = xmm1;
    m__[2] = xmm2;
    m__[3] = xmm3;
    m__ += 4;
  }

  /* Remaining 128-bit words, one per iteration. */
  for (; i < (wide >> 1); i++) {
    switch (N) {
    case 8:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++);
      xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm2 = _mm_xor_si128(xmm2, xmm3);
      xmm0 = _mm_xor_si128(xmm0, xmm2);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 7:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
      xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[6]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 6:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
      xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 5:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 4:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 3:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm1 = _mm_xor_si128(*m__, *t__[2]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      break;
    case 2:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 1:
      xmm0 = _mm_xor_si128(*m__, *t__[0]++);
      break;
    }
    *m__++ = xmm0;
  }
  /* If wide is odd there is one trailing 64-bit word left; convert the
     pointers back to word pointers and finish it with plain XOR. */
  if (__M4RI_UNLIKELY(wide & 0x1)) {
    m = (word *)m__;
    switch (N) {
    case 8: t[N - 8] = (word *)t__[N - 8];
    case 7: t[N - 7] = (word *)t__[N - 7];
    case 6: t[N - 6] = (word *)t__[N - 6];
    case 5: t[N - 5] = (word *)t__[N - 5];
    case 4: t[N - 4] = (word *)t__[N - 4];
    case 3: t[N - 3] = (word *)t__[N - 3];
    case 2: t[N - 2] = (word *)t__[N - 2];
    case 1: t[N - 1] = (word *)t__[N - 1];
    }
    switch (N) {
    case 8:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
      break;
    case 7:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++;
      break;
    case 6:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++;
      break;
    case 5:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++;
      break;
    case 4:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++;
      break;
    case 3:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++;
      break;
    case 2:
      *m++ ^= *t[0]++ ^ *t[1]++;
      break;
    case 1:
      *m++ ^= *t[0]++;
      break;
    }
  }
  return;

#else /* no SSE2: plain word-wise fallback */
  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++;
      break;
    case 7:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++;
      break;
    case 6:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++;
      break;
    case 5:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++;
      break;
    case 4:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++;
      break;
    case 3:
      *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++;
      break;
    case 2:
      *m++ ^= *t[0]++ ^ *t[1]++;
      break;
    case 1:
      *m++ ^= *t[0]++;
      break;
    }
  }
  return;
#endif /* __M4RI_HAVE_SSE2 */
}
/*
 * Referenced definitions from m4ri/misc.h:
 *   __M4RI_ALIGNMENT(addr, n)  alignment of addr w.r.t. n; for example the address 17 is 1-aligned w.r.t. 16
 *   __M4RI_UNLIKELY(cond)      macro to help with branch prediction
 *   wi_t                       int64_t, type of word indexes
 *   word                       uint64_t, the typical packed data structure to represent packed bits
 */
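/*
 * Usage sketch (an assumption, not part of this listing): the template is
 * instantiated by defining N before including the file once per desired N.
 * The generated name depends on how __M4RI_TEMPLATE_NAME is defined in
 * m4ri/misc.h; it is assumed here to append the value of N, and the file
 * name "xor_template.h" is likewise assumed.
 *
 *   #define N 2
 *   #include "xor_template.h"   // assumed to yield _mzd_combine_2(m, t, wide)
 *   #undef N
 *
 *   // XOR two source rows (each `wide` words long) into the target row m:
 *   //   word const *srcs[2] = {row1, row2};
 *   //   _mzd_combine_2(m, srcs, wide);
 */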