1 /* MIT (BSD) license - see LICENSE file for details */
2 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
3 * Copyright (C) 2013, 2015 Mark Adler
4 * Version 1.3 31 Dec 2015 Mark Adler
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the author be held liable for any damages
10 arising from the use of this software.
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
16 1. The origin of this software must not be misrepresented; you must not
17 claim that you wrote the original software. If you use this software
18 in a product, an acknowledgment in the product documentation would be
19 appreciated but is not required.
20 2. Altered source versions must be plainly marked as such, and must not be
21 misrepresented as being the original software.
22 3. This notice may not be removed or altered from any source distribution.
25 madler@alumni.caltech.edu
28 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
29 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
30 version is provided as a fall-back, as well as for speed comparisons. */
33 1.0 10 Feb 2013 First version
34 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
35 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
36 Load entire input into memory (test code)
37 Argument gives number of times to repeat (test code)
38 Argument < 0 forces software implementation (test code)
39 1.3 31 Dec 2015 Check for Intel architecture using compiler macro
40 Support big-endian processors in software calculation
41 Add header for external use
45 #include <ccan/compiler/compiler.h>
48 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len);
50 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
51 #define POLY 0x82f63b78
55 /* Hardware CRC-32C for Intel and compatible processors. */
57 /* Multiply a matrix times a vector over the Galois field of two elements,
58 GF(2). Each element is a bit in an unsigned integer. mat must have at
59 least as many entries as the power of two for most significant one bit in
61 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
72 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
74 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
76 for (n = 0; n < 32; n++)
77 square[n] = gf2_matrix_times(mat, mat[n]);
80 /* Construct an operator to apply len zeros to a crc. len must be a power of
81 two. If len is not a power of two, then the result is the same as for the
82 largest power of two less than len. The result for len == 0 is the same as
83 for len == 1. A version of this routine could be easily written for any
84 len, but that is not needed for this application. */
85 static void crc32c_zeros_op(uint32_t *even, size_t len) {
86 uint32_t odd[32]; /* odd-power-of-two zeros operator */
88 /* put operator for one zero bit in odd */
89 odd[0] = POLY; /* CRC-32C polynomial */
92 for (n = 1; n < 32; n++) {
97 /* put operator for two zero bits in even */
98 gf2_matrix_square(even, odd);
100 /* put operator for four zero bits in odd */
101 gf2_matrix_square(odd, even);
103 /* first square will put the operator for one zero byte (eight zero bits),
104 in even -- next square puts operator for two zero bytes in odd, and so
105 on, until len has been rotated down to zero */
107 gf2_matrix_square(even, odd);
111 gf2_matrix_square(odd, even);
115 /* answer ended up in odd -- copy to even */
116 for (n = 0; n < 32; n++)
120 /* Take a length and build four lookup tables for applying the zeros operator
121 for that length, byte-by-byte on the operand. */
122 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
125 crc32c_zeros_op(op, len);
127 for (n = 0; n < 256; n++) {
128 zeros[0][n] = gf2_matrix_times(op, n);
129 zeros[1][n] = gf2_matrix_times(op, n << 8);
130 zeros[2][n] = gf2_matrix_times(op, n << 16);
131 zeros[3][n] = gf2_matrix_times(op, n << 24);
135 /* Apply the zeros operator table to crc. */
136 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
137 return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
138 zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
141 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
142 both be powers of two. The associated string constants must be set
143 accordingly, for use in constructing the assembler instructions. */
145 #define LONGx1 "8192"
146 #define LONGx2 "16384"
148 #define SHORTx1 "256"
149 #define SHORTx2 "512"
151 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
152 static bool crc32c_once_hw;
153 static uint32_t crc32c_long[4][256];
154 static uint32_t crc32c_short[4][256];
156 /* Initialize tables for shifting crcs. */
157 static void crc32c_init_hw(void) {
158 crc32c_once_hw = true;
159 crc32c_zeros(crc32c_long, LONG);
160 crc32c_zeros(crc32c_short, SHORT);
163 /* Compute CRC-32C using the Intel hardware instruction. */
164 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
165 /* populate shift tables the first time through */
169 /* pre-process the crc */
171 uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
173 /* compute the crc for up to seven leading bytes to bring the data pointer
174 to an eight-byte boundary */
175 unsigned char const *next = buf;
176 while (len && ((uintptr_t)next & 7) != 0) {
177 __asm__("crc32b\t" "(%1), %0"
179 : "r"(next), "0"(crc0));
184 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
185 instructions, each on LONG bytes -- this is optimized for the Nehalem,
186 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
187 throughput of one crc per cycle, but a latency of three cycles */
188 while (len >= LONG*3) {
191 unsigned char const * const end = next + LONG;
193 __asm__("crc32q\t" "(%3), %0\n\t"
194 "crc32q\t" LONGx1 "(%3), %1\n\t"
195 "crc32q\t" LONGx2 "(%3), %2"
196 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
197 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
199 } while (next < end);
200 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
201 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
206 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
207 than a LONG*3 block */
208 while (len >= SHORT*3) {
211 unsigned char const * const end = next + SHORT;
213 __asm__("crc32q\t" "(%3), %0\n\t"
214 "crc32q\t" SHORTx1 "(%3), %1\n\t"
215 "crc32q\t" SHORTx2 "(%3), %2"
216 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
217 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
219 } while (next < end);
220 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
221 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
226 /* compute the crc on the remaining eight-byte units less than a SHORT*3
229 unsigned char const * const end = next + (len - (len & 7));
231 __asm__("crc32q\t" "(%1), %0"
233 : "r"(next), "0"(crc0));
239 /* compute the crc for up to seven trailing bytes */
241 __asm__("crc32b\t" "(%1), %0"
243 : "r"(next), "0"(crc0));
248 /* return a post-processed crc */
252 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
253 version. Otherwise, use the software version. */
254 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
256 return cpu_supports("sse4.2") ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
259 #else /* !__x86_64__ */
261 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
262 return crc32c_sw(crc, buf, len);
267 /* Construct table for software CRC-32C little-endian calculation. */
268 static bool crc32c_once_little;
269 static uint32_t crc32c_table_little[8][256];
270 static void crc32c_init_sw_little(void) {
272 for (n = 0; n < 256; n++) {
274 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
275 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
276 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
277 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
278 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
279 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
280 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
281 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
282 crc32c_table_little[0][n] = crc;
284 for (n = 0; n < 256; n++) {
285 uint32_t crc = crc32c_table_little[0][n];
287 for (k = 1; k < 8; k++) {
288 crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
289 crc32c_table_little[k][n] = crc;
292 crc32c_once_little = true;
295 /* Compute a CRC-32C in software assuming a little-endian architecture,
296 constructing the required table if that hasn't already been done. */
297 static uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
298 unsigned char const *next = buf;
300 if (!crc32c_once_little)
301 crc32c_init_sw_little();
303 while (len && ((uintptr_t)next & 7) != 0) {
304 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
310 crcw ^= *(uint64_t const *)next;
311 crcw = crc32c_table_little[7][crcw & 0xff] ^
312 crc32c_table_little[6][(crcw >> 8) & 0xff] ^
313 crc32c_table_little[5][(crcw >> 16) & 0xff] ^
314 crc32c_table_little[4][(crcw >> 24) & 0xff] ^
315 crc32c_table_little[3][(crcw >> 32) & 0xff] ^
316 crc32c_table_little[2][(crcw >> 40) & 0xff] ^
317 crc32c_table_little[1][(crcw >> 48) & 0xff] ^
318 crc32c_table_little[0][crcw >> 56];
325 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
331 /* Swap the bytes in a uint64_t. (Only for big-endian.) */
332 #if defined(__has_builtin) || (defined(__GNUC__) && \
333 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
334 # define swap __builtin_bswap64
336 static inline uint64_t swap(uint64_t x) {
337 x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
338 x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
339 return (x << 32) | (x >> 32);
343 /* Construct tables for software CRC-32C big-endian calculation. */
344 static bool crc32c_once_big;
345 static uint32_t crc32c_table_big_byte[256];
346 static uint64_t crc32c_table_big[8][256];
347 static void crc32c_init_sw_big(void) {
349 for (n = 0; n < 256; n++) {
351 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
352 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
353 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
354 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
355 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
356 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
357 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
358 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
359 crc32c_table_big_byte[n] = crc;
361 for (n = 0; n < 256; n++) {
362 uint32_t crc = crc32c_table_big_byte[n];
363 crc32c_table_big[0][n] = swap(crc);
365 for (k = 1; k < 8; k++) {
366 crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
367 crc32c_table_big[k][n] = swap(crc);
370 crc32c_once_big = true;
373 /* Compute a CRC-32C in software assuming a big-endian architecture,
374 constructing the required tables if that hasn't already been done. */
375 static uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
376 unsigned char const *next = buf;
378 if (!crc32c_once_big)
379 crc32c_init_sw_big();
381 while (len && ((uintptr_t)next & 7) != 0) {
382 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
386 uint64_t crcw = swap(crc);
388 crcw ^= *(uint64_t const *)next;
389 crcw = crc32c_table_big[0][crcw & 0xff] ^
390 crc32c_table_big[1][(crcw >> 8) & 0xff] ^
391 crc32c_table_big[2][(crcw >> 16) & 0xff] ^
392 crc32c_table_big[3][(crcw >> 24) & 0xff] ^
393 crc32c_table_big[4][(crcw >> 32) & 0xff] ^
394 crc32c_table_big[5][(crcw >> 40) & 0xff] ^
395 crc32c_table_big[6][(crcw >> 48) & 0xff] ^
396 crc32c_table_big[7][(crcw >> 56)];
403 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
409 /* Table-driven software CRC-32C. This is about 15 times slower than using the
410 hardware instructions. Determine the endianess of the processor and proceed
411 accordingly. Ideally the endianess will be determined at compile time, in
412 which case the unused functions and tables for the other endianess will be
413 removed by the optimizer. If not, then the proper routines and tables will
414 be used, even if the endianess is changed mid-stream. (Yes, there are
415 processors that permit that -- go figure.) */
416 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
417 static int const little = 1;
418 if (*(char const *)&little)
419 return crc32c_sw_little(crc, buf, len);
421 return crc32c_sw_big(crc, buf, len);