1 /* MIT (BSD) license - see LICENSE file for details */
2 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
3 * Copyright (C) 2013, 2015 Mark Adler
4 * Version 1.3 31 Dec 2015 Mark Adler
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the author be held liable for any damages
10 arising from the use of this software.
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
16 1. The origin of this software must not be misrepresented; you must not
17 claim that you wrote the original software. If you use this software
18 in a product, an acknowledgment in the product documentation would be
19 appreciated but is not required.
20 2. Altered source versions must be plainly marked as such, and must not be
21 misrepresented as being the original software.
22 3. This notice may not be removed or altered from any source distribution.
25 madler@alumni.caltech.edu
28 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
29 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
30 version is provided as a fall-back, as well as for speed comparisons. */
33 1.0 10 Feb 2013 First version
34 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
35 1.2 1 Nov 2015 Add const qualifier to avoid compiler warning
36 Load entire input into memory (test code)
37 Argument gives number of times to repeat (test code)
38 Argument < 0 forces software implementation (test code)
39 1.3 31 Dec 2015 Check for Intel architecture using compiler macro
40 Support big-endian processors in software calculation
41 Add header for external use
45 #include <ccan/compiler/compiler.h>
48 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len);
50 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
51 #define POLY 0x82f63b78
55 /* Hardware CRC-32C for Intel and compatible processors. */
57 /* Multiply a matrix times a vector over the Galois field of two elements,
58 GF(2). Each element is a bit in an unsigned integer. mat must have at
59 least as many entries as the power of two for most significant one bit in
61 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
72 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
74 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
75 for (unsigned n = 0; n < 32; n++)
76 square[n] = gf2_matrix_times(mat, mat[n]);
79 /* Construct an operator to apply len zeros to a crc. len must be a power of
80 two. If len is not a power of two, then the result is the same as for the
81 largest power of two less than len. The result for len == 0 is the same as
82 for len == 1. A version of this routine could be easily written for any
83 len, but that is not needed for this application. */
84 static void crc32c_zeros_op(uint32_t *even, size_t len) {
85 uint32_t odd[32]; /* odd-power-of-two zeros operator */
87 /* put operator for one zero bit in odd */
88 odd[0] = POLY; /* CRC-32C polynomial */
90 for (unsigned n = 1; n < 32; n++) {
95 /* put operator for two zero bits in even */
96 gf2_matrix_square(even, odd);
98 /* put operator for four zero bits in odd */
99 gf2_matrix_square(odd, even);
101 /* first square will put the operator for one zero byte (eight zero bits),
102 in even -- next square puts operator for two zero bytes in odd, and so
103 on, until len has been rotated down to zero */
105 gf2_matrix_square(even, odd);
109 gf2_matrix_square(odd, even);
113 /* answer ended up in odd -- copy to even */
114 for (unsigned n = 0; n < 32; n++)
118 /* Take a length and build four lookup tables for applying the zeros operator
119 for that length, byte-by-byte on the operand. */
120 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
123 crc32c_zeros_op(op, len);
124 for (unsigned n = 0; n < 256; n++) {
125 zeros[0][n] = gf2_matrix_times(op, n);
126 zeros[1][n] = gf2_matrix_times(op, n << 8);
127 zeros[2][n] = gf2_matrix_times(op, n << 16);
128 zeros[3][n] = gf2_matrix_times(op, n << 24);
132 /* Apply the zeros operator table to crc. */
133 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
134 return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
135 zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
138 /* Block sizes for three-way parallel crc computation. LONG and SHORT must
139 both be powers of two. The associated string constants must be set
140 accordingly, for use in constructing the assembler instructions. */
142 #define LONGx1 "8192"
143 #define LONGx2 "16384"
145 #define SHORTx1 "256"
146 #define SHORTx2 "512"
148 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
149 static bool crc32c_once_hw;
150 static uint32_t crc32c_long[4][256];
151 static uint32_t crc32c_short[4][256];
153 /* Initialize tables for shifting crcs. */
154 static void crc32c_init_hw(void) {
155 crc32c_once_hw = true;
156 crc32c_zeros(crc32c_long, LONG);
157 crc32c_zeros(crc32c_short, SHORT);
160 /* Compute CRC-32C using the Intel hardware instruction. */
161 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
162 /* populate shift tables the first time through */
166 /* pre-process the crc */
168 uint64_t crc0 = crc; /* 64-bits for crc32q instruction */
170 /* compute the crc for up to seven leading bytes to bring the data pointer
171 to an eight-byte boundary */
172 unsigned char const *next = buf;
173 while (len && ((uintptr_t)next & 7) != 0) {
174 __asm__("crc32b\t" "(%1), %0"
176 : "r"(next), "0"(crc0));
181 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
182 instructions, each on LONG bytes -- this is optimized for the Nehalem,
183 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
184 throughput of one crc per cycle, but a latency of three cycles */
185 while (len >= LONG*3) {
188 unsigned char const * const end = next + LONG;
190 __asm__("crc32q\t" "(%3), %0\n\t"
191 "crc32q\t" LONGx1 "(%3), %1\n\t"
192 "crc32q\t" LONGx2 "(%3), %2"
193 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
194 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
196 } while (next < end);
197 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
198 crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
203 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
204 than a LONG*3 block */
205 while (len >= SHORT*3) {
208 unsigned char const * const end = next + SHORT;
210 __asm__("crc32q\t" "(%3), %0\n\t"
211 "crc32q\t" SHORTx1 "(%3), %1\n\t"
212 "crc32q\t" SHORTx2 "(%3), %2"
213 : "=r"(crc0), "=r"(crc1), "=r"(crc2)
214 : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
216 } while (next < end);
217 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
218 crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
223 /* compute the crc on the remaining eight-byte units less than a SHORT*3
226 unsigned char const * const end = next + (len - (len & 7));
228 __asm__("crc32q\t" "(%1), %0"
230 : "r"(next), "0"(crc0));
236 /* compute the crc for up to seven trailing bytes */
238 __asm__("crc32b\t" "(%1), %0"
240 : "r"(next), "0"(crc0));
245 /* return a post-processed crc */
249 /* Compute a CRC-32C. If the crc32 instruction is available, use the hardware
250 version. Otherwise, use the software version. */
251 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
253 return cpu_supports("sse4.2") ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
256 #else /* !__x86_64__ */
258 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
259 return crc32c_sw(crc, buf, len);
264 /* Construct table for software CRC-32C little-endian calculation. */
265 static bool crc32c_once_little;
266 static uint32_t crc32c_table_little[8][256];
267 static void crc32c_init_sw_little(void) {
268 for (unsigned n = 0; n < 256; n++) {
270 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
271 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
272 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
273 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
274 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
275 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
276 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
277 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
278 crc32c_table_little[0][n] = crc;
280 for (unsigned n = 0; n < 256; n++) {
281 uint32_t crc = crc32c_table_little[0][n];
282 for (unsigned k = 1; k < 8; k++) {
283 crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
284 crc32c_table_little[k][n] = crc;
287 crc32c_once_little = true;
290 /* Compute a CRC-32C in software assuming a little-endian architecture,
291 constructing the required table if that hasn't already been done. */
292 static uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
293 unsigned char const *next = buf;
295 if (!crc32c_once_little)
296 crc32c_init_sw_little();
298 while (len && ((uintptr_t)next & 7) != 0) {
299 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
305 crcw ^= *(uint64_t const *)next;
306 crcw = crc32c_table_little[7][crcw & 0xff] ^
307 crc32c_table_little[6][(crcw >> 8) & 0xff] ^
308 crc32c_table_little[5][(crcw >> 16) & 0xff] ^
309 crc32c_table_little[4][(crcw >> 24) & 0xff] ^
310 crc32c_table_little[3][(crcw >> 32) & 0xff] ^
311 crc32c_table_little[2][(crcw >> 40) & 0xff] ^
312 crc32c_table_little[1][(crcw >> 48) & 0xff] ^
313 crc32c_table_little[0][crcw >> 56];
320 crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
326 /* Swap the bytes in a uint64_t. (Only for big-endian.) */
327 #if defined(__has_builtin) || (defined(__GNUC__) && \
328 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
329 # define swap __builtin_bswap64
331 static inline uint64_t swap(uint64_t x) {
332 x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
333 x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
334 return (x << 32) | (x >> 32);
338 /* Construct tables for software CRC-32C big-endian calculation. */
339 static bool crc32c_once_big;
340 static uint32_t crc32c_table_big_byte[256];
341 static uint64_t crc32c_table_big[8][256];
342 static void crc32c_init_sw_big(void) {
343 for (unsigned n = 0; n < 256; n++) {
345 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
346 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
347 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
348 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
349 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
350 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
351 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
352 crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
353 crc32c_table_big_byte[n] = crc;
355 for (unsigned n = 0; n < 256; n++) {
356 uint32_t crc = crc32c_table_big_byte[n];
357 crc32c_table_big[0][n] = swap(crc);
358 for (unsigned k = 1; k < 8; k++) {
359 crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
360 crc32c_table_big[k][n] = swap(crc);
363 crc32c_once_big = true;
366 /* Compute a CRC-32C in software assuming a big-endian architecture,
367 constructing the required tables if that hasn't already been done. */
368 static uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
369 unsigned char const *next = buf;
371 if (!crc32c_once_big)
372 crc32c_init_sw_big();
374 while (len && ((uintptr_t)next & 7) != 0) {
375 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
379 uint64_t crcw = swap(crc);
381 crcw ^= *(uint64_t const *)next;
382 crcw = crc32c_table_big[0][crcw & 0xff] ^
383 crc32c_table_big[1][(crcw >> 8) & 0xff] ^
384 crc32c_table_big[2][(crcw >> 16) & 0xff] ^
385 crc32c_table_big[3][(crcw >> 24) & 0xff] ^
386 crc32c_table_big[4][(crcw >> 32) & 0xff] ^
387 crc32c_table_big[5][(crcw >> 40) & 0xff] ^
388 crc32c_table_big[6][(crcw >> 48) & 0xff] ^
389 crc32c_table_big[7][(crcw >> 56)];
396 crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
402 /* Table-driven software CRC-32C. This is about 15 times slower than using the
403 hardware instructions. Determine the endianess of the processor and proceed
404 accordingly. Ideally the endianess will be determined at compile time, in
405 which case the unused functions and tables for the other endianess will be
406 removed by the optimizer. If not, then the proper routines and tables will
407 be used, even if the endianess is changed mid-stream. (Yes, there are
408 processors that permit that -- go figure.) */
409 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
410 static int const little = 1;
411 if (*(char const *)&little)
412 return crc32c_sw_little(crc, buf, len);
414 return crc32c_sw_big(crc, buf, len);