git.ozlabs.org Git - ccan/blob - ccan/crc32c/crc32c.c

   1 /* MIT (BSD) license - see LICENSE file for details */
   2 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
   3  * Copyright (C) 2013, 2015 Mark Adler
   4  * Version 1.3  31 Dec 2015  Mark Adler
   5  */
   6
   7 /*
   8   This software is provided 'as-is', without any express or implied
   9   warranty.  In no event will the author be held liable for any damages
  10   arising from the use of this software.
  11
  12   Permission is granted to anyone to use this software for any purpose,
  13   including commercial applications, and to alter it and redistribute it
  14   freely, subject to the following restrictions:
  15
  16   1. The origin of this software must not be misrepresented; you must not
  17      claim that you wrote the original software. If you use this software
  18      in a product, an acknowledgment in the product documentation would be
  19      appreciated but is not required.
  20   2. Altered source versions must be plainly marked as such, and must not be
  21      misrepresented as being the original software.
  22   3. This notice may not be removed or altered from any source distribution.
  23
  24   Mark Adler
  25   madler@alumni.caltech.edu
  26  */
  27
  28 /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
  29    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
  30    version is provided as a fall-back, as well as for speed comparisons. */
  31
  32 /* Version history:
  33    1.0  10 Feb 2013  First version
  34    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
  35    1.2   1 Nov 2015  Add const qualifier to avoid compiler warning
  36                      Load entire input into memory (test code)
  37                      Argument gives number of times to repeat (test code)
  38                      Argument < 0 forces software implementation (test code)
  39    1.3  31 Dec 2015  Check for Intel architecture using compiler macro
  40                      Support big-endian processors in software calculation
  41                      Add header for external use
  42  */
  43
  44 #include "crc32c.h"
  45 #include <ccan/compiler/compiler.h>
  46 #include <stdbool.h>
  47
  48 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len);
  49
  50 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
  51 #define POLY 0x82f63b78
  52
  53 #ifdef __x86_64__
  54
  55 /* Hardware CRC-32C for Intel and compatible processors. */
  56
  57 /* Multiply a matrix times a vector over the Galois field of two elements,
  58    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
  59    least as many entries as the power of two for most significant one bit in
  60    vec. */
  61 static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec) {
  62     uint32_t sum = 0;
  63     while (vec) {
  64         if (vec & 1)
  65             sum ^= *mat;
  66         vec >>= 1;
  67         mat++;
  68     }
  69     return sum;
  70 }
  71
  72 /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
  73    rows. */
  74 static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat) {
  75     unsigned n;
  76     for (n = 0; n < 32; n++)
  77         square[n] = gf2_matrix_times(mat, mat[n]);
  78 }
  79
  80 /* Construct an operator to apply len zeros to a crc.  len must be a power of
  81    two.  If len is not a power of two, then the result is the same as for the
  82    largest power of two less than len.  The result for len == 0 is the same as
  83    for len == 1.  A version of this routine could be easily written for any
  84    len, but that is not needed for this application. */
  85 static void crc32c_zeros_op(uint32_t *even, size_t len) {
  86     uint32_t odd[32];       /* odd-power-of-two zeros operator */
  87
  88     /* put operator for one zero bit in odd */
  89     odd[0] = POLY;              /* CRC-32C polynomial */
  90     uint32_t row = 1;
  91     unsigned n;
  92     for (n = 1; n < 32; n++) {
  93         odd[n] = row;
  94         row <<= 1;
  95     }
  96
  97     /* put operator for two zero bits in even */
  98     gf2_matrix_square(even, odd);
  99
 100     /* put operator for four zero bits in odd */
 101     gf2_matrix_square(odd, even);
 102
 103     /* first square will put the operator for one zero byte (eight zero bits),
 104        in even -- next square puts operator for two zero bytes in odd, and so
 105        on, until len has been rotated down to zero */
 106     do {
 107         gf2_matrix_square(even, odd);
 108         len >>= 1;
 109         if (len == 0)
 110             return;
 111         gf2_matrix_square(odd, even);
 112         len >>= 1;
 113     } while (len);
 114
 115     /* answer ended up in odd -- copy to even */
 116     for (n = 0; n < 32; n++)
 117         even[n] = odd[n];
 118 }
 119
 120 /* Take a length and build four lookup tables for applying the zeros operator
 121    for that length, byte-by-byte on the operand. */
 122 static void crc32c_zeros(uint32_t zeros[][256], size_t len) {
 123     uint32_t op[32];
 124
 125     crc32c_zeros_op(op, len);
 126     unsigned n;
 127     for (n = 0; n < 256; n++) {
 128         zeros[0][n] = gf2_matrix_times(op, n);
 129         zeros[1][n] = gf2_matrix_times(op, n << 8);
 130         zeros[2][n] = gf2_matrix_times(op, n << 16);
 131         zeros[3][n] = gf2_matrix_times(op, n << 24);
 132     }
 133 }
 134
 135 /* Apply the zeros operator table to crc. */
 136 static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc) {
 137     return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
 138            zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
 139 }
 140
 141 /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
 142    both be powers of two.  The associated string constants must be set
 143    accordingly, for use in constructing the assembler instructions. */
 144 #define LONG 8192
 145 #define LONGx1 "8192"
 146 #define LONGx2 "16384"
 147 #define SHORT 256
 148 #define SHORTx1 "256"
 149 #define SHORTx2 "512"
 150
 151 /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
 152 static bool crc32c_once_hw;
 153 static uint32_t crc32c_long[4][256];
 154 static uint32_t crc32c_short[4][256];
 155
 156 /* Initialize tables for shifting crcs. */
 157 static void crc32c_init_hw(void) {
 158     crc32c_once_hw = true;
 159     crc32c_zeros(crc32c_long, LONG);
 160     crc32c_zeros(crc32c_short, SHORT);
 161 }
 162
 163 /* Compute CRC-32C using the Intel hardware instruction. */
 164 static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
 165     /* populate shift tables the first time through */
 166     if (!crc32c_once_hw)
 167         crc32c_init_hw();
 168
 169     /* pre-process the crc */
 170     crc = ~crc;
 171     uint64_t crc0 = crc;            /* 64-bits for crc32q instruction */
 172
 173     /* compute the crc for up to seven leading bytes to bring the data pointer
 174        to an eight-byte boundary */
 175     unsigned char const *next = buf;
 176     while (len && ((uintptr_t)next & 7) != 0) {
 177         __asm__("crc32b\t" "(%1), %0"
 178                 : "=r"(crc0)
 179                 : "r"(next), "0"(crc0));
 180         next++;
 181         len--;
 182     }
 183
 184     /* compute the crc on sets of LONG*3 bytes, executing three independent crc
 185        instructions, each on LONG bytes -- this is optimized for the Nehalem,
 186        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
 187        throughput of one crc per cycle, but a latency of three cycles */
 188     while (len >= LONG*3) {
 189         uint64_t crc1 = 0;
 190         uint64_t crc2 = 0;
 191         unsigned char const * const end = next + LONG;
 192         do {
 193             __asm__("crc32q\t" "(%3), %0\n\t"
 194                     "crc32q\t" LONGx1 "(%3), %1\n\t"
 195                     "crc32q\t" LONGx2 "(%3), %2"
 196                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
 197                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
 198             next += 8;
 199         } while (next < end);
 200         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
 201         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
 202         next += LONG*2;
 203         len -= LONG*3;
 204     }
 205
 206     /* do the same thing, but now on SHORT*3 blocks for the remaining data less
 207        than a LONG*3 block */
 208     while (len >= SHORT*3) {
 209         uint64_t crc1 = 0;
 210         uint64_t crc2 = 0;
 211         unsigned char const * const end = next + SHORT;
 212         do {
 213             __asm__("crc32q\t" "(%3), %0\n\t"
 214                     "crc32q\t" SHORTx1 "(%3), %1\n\t"
 215                     "crc32q\t" SHORTx2 "(%3), %2"
 216                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
 217                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
 218             next += 8;
 219         } while (next < end);
 220         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
 221         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
 222         next += SHORT*2;
 223         len -= SHORT*3;
 224     }
 225
 226     /* compute the crc on the remaining eight-byte units less than a SHORT*3
 227        block */
 228     {
 229         unsigned char const * const end = next + (len - (len & 7));
 230         while (next < end) {
 231             __asm__("crc32q\t" "(%1), %0"
 232                     : "=r"(crc0)
 233                     : "r"(next), "0"(crc0));
 234             next += 8;
 235         }
 236         len &= 7;
 237     }
 238
 239     /* compute the crc for up to seven trailing bytes */
 240     while (len) {
 241         __asm__("crc32b\t" "(%1), %0"
 242                 : "=r"(crc0)
 243                 : "r"(next), "0"(crc0));
 244         next++;
 245         len--;
 246     }
 247
 248     /* return a post-processed crc */
 249     return ~crc0;
 250 }
 251
 252 /* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
 253    version.  Otherwise, use the software version. */
 254 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
 255
 256     return cpu_supports("sse4.2") ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
 257 }
 258
 259 #else /* !__x86_64__ */
 260
 261 uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
 262     return crc32c_sw(crc, buf, len);
 263 }
 264
 265 #endif
 266
 267 /* Construct table for software CRC-32C little-endian calculation. */
 268 static bool crc32c_once_little;
 269 static uint32_t crc32c_table_little[8][256];
 270 static void crc32c_init_sw_little(void) {
 271     unsigned n;
 272     for (n = 0; n < 256; n++) {
 273         uint32_t crc = n;
 274         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 275         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 276         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 277         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 278         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 279         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 280         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 281         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 282         crc32c_table_little[0][n] = crc;
 283     }
 284     for (n = 0; n < 256; n++) {
 285         uint32_t crc = crc32c_table_little[0][n];
 286         unsigned k;
 287         for (k = 1; k < 8; k++) {
 288             crc = crc32c_table_little[0][crc & 0xff] ^ (crc >> 8);
 289             crc32c_table_little[k][n] = crc;
 290         }
 291     }
 292     crc32c_once_little = true;
 293 }
 294
 295 /* Compute a CRC-32C in software assuming a little-endian architecture,
 296    constructing the required table if that hasn't already been done. */
 297 static uint32_t crc32c_sw_little(uint32_t crc, void const *buf, size_t len) {
 298     unsigned char const *next = buf;
 299
 300     if (!crc32c_once_little)
 301         crc32c_init_sw_little();
 302     crc = ~crc;
 303     while (len && ((uintptr_t)next & 7) != 0) {
 304         crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
 305         len--;
 306     }
 307     if (len >= 8) {
 308         uint64_t crcw = crc;
 309         do {
 310             crcw ^= *(uint64_t const *)next;
 311             crcw = crc32c_table_little[7][crcw & 0xff] ^
 312                    crc32c_table_little[6][(crcw >> 8) & 0xff] ^
 313                    crc32c_table_little[5][(crcw >> 16) & 0xff] ^
 314                    crc32c_table_little[4][(crcw >> 24) & 0xff] ^
 315                    crc32c_table_little[3][(crcw >> 32) & 0xff] ^
 316                    crc32c_table_little[2][(crcw >> 40) & 0xff] ^
 317                    crc32c_table_little[1][(crcw >> 48) & 0xff] ^
 318                    crc32c_table_little[0][crcw >> 56];
 319             next += 8;
 320             len -= 8;
 321         } while (len >= 8);
 322         crc = crcw;
 323     }
 324     while (len) {
 325         crc = crc32c_table_little[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
 326         len--;
 327     }
 328     return ~crc;
 329 }
 330
 331 /* Swap the bytes in a uint64_t.  (Only for big-endian.) */
 332 #if defined(__has_builtin) || (defined(__GNUC__) && \
 333     (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)))
 334 #  define swap __builtin_bswap64
 335 #else
 336 static inline uint64_t swap(uint64_t x) {
 337     x = ((x << 8) & 0xff00ff00ff00ff00) | ((x >> 8) & 0xff00ff00ff00ff);
 338     x = ((x << 16) & 0xffff0000ffff0000) | ((x >> 16) & 0xffff0000ffff);
 339     return (x << 32) | (x >> 32);
 340 }
 341 #endif
 342
 343 /* Construct tables for software CRC-32C big-endian calculation. */
 344 static bool crc32c_once_big;
 345 static uint32_t crc32c_table_big_byte[256];
 346 static uint64_t crc32c_table_big[8][256];
 347 static void crc32c_init_sw_big(void) {
 348     unsigned n;
 349     for (n = 0; n < 256; n++) {
 350         uint32_t crc = n;
 351         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 352         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 353         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 354         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 355         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 356         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 357         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 358         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
 359         crc32c_table_big_byte[n] = crc;
 360     }
 361     for (n = 0; n < 256; n++) {
 362         uint32_t crc = crc32c_table_big_byte[n];
 363         crc32c_table_big[0][n] = swap(crc);
 364         unsigned k;
 365         for (k = 1; k < 8; k++) {
 366             crc = crc32c_table_big_byte[crc & 0xff] ^ (crc >> 8);
 367             crc32c_table_big[k][n] = swap(crc);
 368         }
 369     }
 370     crc32c_once_big = true;
 371 }
 372
 373 /* Compute a CRC-32C in software assuming a big-endian architecture,
 374    constructing the required tables if that hasn't already been done. */
 375 static uint32_t crc32c_sw_big(uint32_t crc, void const *buf, size_t len) {
 376     unsigned char const *next = buf;
 377
 378     if (!crc32c_once_big)
 379         crc32c_init_sw_big();
 380     crc = ~crc;
 381     while (len && ((uintptr_t)next & 7) != 0) {
 382         crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
 383         len--;
 384     }
 385     if (len >= 8) {
 386         uint64_t crcw = swap(crc);
 387         do {
 388             crcw ^= *(uint64_t const *)next;
 389             crcw = crc32c_table_big[0][crcw & 0xff] ^
 390                    crc32c_table_big[1][(crcw >> 8) & 0xff] ^
 391                    crc32c_table_big[2][(crcw >> 16) & 0xff] ^
 392                    crc32c_table_big[3][(crcw >> 24) & 0xff] ^
 393                    crc32c_table_big[4][(crcw >> 32) & 0xff] ^
 394                    crc32c_table_big[5][(crcw >> 40) & 0xff] ^
 395                    crc32c_table_big[6][(crcw >> 48) & 0xff] ^
 396                    crc32c_table_big[7][(crcw >> 56)];
 397             next += 8;
 398             len -= 8;
 399         } while (len >= 8);
 400         crc = swap(crcw);
 401     }
 402     while (len) {
 403         crc = crc32c_table_big_byte[(crc ^ *next++) & 0xff] ^ (crc >> 8);
 404         len--;
 405     }
 406     return ~crc;
 407 }
 408
 409 /* Table-driven software CRC-32C.  This is about 15 times slower than using the
 410    hardware instructions.  Determine the endianess of the processor and proceed
 411    accordingly.  Ideally the endianess will be determined at compile time, in
 412    which case the unused functions and tables for the other endianess will be
 413    removed by the optimizer.  If not, then the proper routines and tables will
 414    be used, even if the endianess is changed mid-stream.  (Yes, there are
 415    processors that permit that -- go figure.) */
 416 static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
 417     static int const little = 1;
 418     if (*(char const *)&little)
 419         return crc32c_sw_little(crc, buf, len);
 420     else
 421         return crc32c_sw_big(crc, buf, len);
 422 }