github.com/grailbio/base@v0.0.11/compress/libdeflate/crc32.c

github.com/grailbio/base@v0.0.11/compress/libdeflate/crc32.c (about)

     1  /*
     2   * crc32.c - CRC-32 checksum algorithm for the gzip format
     3   *
     4   * Copyright 2016 Eric Biggers
     5   *
     6   * Permission is hereby granted, free of charge, to any person
     7   * obtaining a copy of this software and associated documentation
     8   * files (the "Software"), to deal in the Software without
     9   * restriction, including without limitation the rights to use,
    10   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    11   * copies of the Software, and to permit persons to whom the
    12   * Software is furnished to do so, subject to the following
    13   * conditions:
    14   *
    15   * The above copyright notice and this permission notice shall be
    16   * included in all copies or substantial portions of the Software.
    17   *
    18   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    19   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    20   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    21   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    22   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    23   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    24   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    25   * OTHER DEALINGS IN THE SOFTWARE.
    26   */
    27  
    28  /*
    29   * High-level description of CRC
    30   * =============================
    31   *
    32   * Consider a bit sequence 'bits[1...len]'.  Interpret 'bits' as the "message"
    33   * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
    34   * where the coefficient of 'x^i' is 'bits[len - i]'.  Then, compute:
    35   *
    36   *			R(x) = M(x)*x^n mod G(x)
    37   *
    38   * where G(x) is a selected "generator" polynomial of degree 'n'.  The remainder
    39   * R(x) is a polynomial of max degree 'n - 1'.  The CRC of 'bits' is R(x)
    40   * interpreted as a bitstring of length 'n'.
    41   *
    42   * CRC used in gzip
    43   * ================
    44   *
    45   * In the gzip format (RFC 1952):
    46   *
    47   *	- The bitstring to checksum is formed from the bytes of the uncompressed
    48   *	  data by concatenating the bits from the bytes in order, proceeding
    49   *	  from the low-order bit to the high-order bit within each byte.
    50   *
    51   *	- The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
    52   *	  x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
    53   *	  Consequently, the CRC length is 32 bits ("CRC-32").
    54   *
    55   *	- The highest order 32 coefficients of M(x)*x^n are inverted.
    56   *
    57   *	- All 32 coefficients of R(x) are inverted.
    58   *
    59   * The two inversions cause added leading and trailing zero bits to affect the
    60   * resulting CRC, whereas with a regular CRC such bits would have no effect on
    61   * the CRC.
    62   *
    63   * Computation and optimizations
    64   * =============================
    65   *
    66   * We can compute R(x) through "long division", maintaining only 32 bits of
    67   * state at any given time.  Multiplication by 'x' can be implemented as
    68   * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
    69   * highest order bit represents the coefficient of x^0), and both addition and
    70   * subtraction can be implemented as bitwise exclusive OR (since we are working
    71   * in GF(2)).  Here is an unoptimized implementation:
    72   *
    73   *	static u32 crc32_gzip(const u8 *buffer, size_t size)
    74   *	{
    75   *		u32 remainder = 0;
    76   *		const u32 divisor = 0xEDB88320;
    77   *
    78   *		for (size_t i = 0; i < size * 8 + 32; i++) {
    79   *			int bit;
    80   *			u32 multiple;
    81   *
    82   *			if (i < size * 8)
    83   *				bit = (buffer[i / 8] >> (i % 8)) & 1;
    84   *			else
    85   *				bit = 0; // one of the 32 appended 0 bits
    86   *
    87   *			if (i < 32) // the first 32 bits are inverted
    88   *				bit ^= 1;
    89   *
    90   *			if (remainder & 1)
    91   *				multiple = divisor;
    92   *			else
    93   *				multiple = 0;
    94   *
    95   *			remainder >>= 1;
    96   *			remainder |= (u32)bit << 31;
    97   *			remainder ^= multiple;
    98   *		}
    99   *
   100   *		return ~remainder;
   101   *	}
   102   *
   103   * In this implementation, the 32-bit integer 'remainder' maintains the
   104   * remainder of the currently processed portion of the message (with 32 zero
   105   * bits appended) when divided by the generator polynomial.  'remainder' is the
   106   * representation of R(x), and 'divisor' is the representation of G(x) excluding
   107   * the x^32 coefficient.  For each bit to process, we multiply R(x) by 'x^1',
   108   * then add 'x^0' if the new bit is a 1.  If this causes R(x) to gain a nonzero
   109   * x^32 term, then we subtract G(x) from R(x).
   110   *
   111   * We can speed this up by taking advantage of the fact that XOR is commutative
   112   * and associative, so the order in which we combine the inputs into 'remainder'
   113   * is unimportant.  And since each message bit we add doesn't affect the choice
   114   * of 'multiple' until 32 bits later, we need not actually add each message bit
   115   * until that point:
   116   *
   117   *	static u32 crc32_gzip(const u8 *buffer, size_t size)
   118   *	{
   119   *		u32 remainder = ~0;
   120   *		const u32 divisor = 0xEDB88320;
   121   *
   122   *		for (size_t i = 0; i < size * 8; i++) {
   123   *			int bit;
   124   *			u32 multiple;
   125   *
   126   *			bit = (buffer[i / 8] >> (i % 8)) & 1;
   127   *			remainder ^= bit;
   128   *			if (remainder & 1)
   129   *				multiple = divisor;
   130   *			else
   131   *				multiple = 0;
   132   *			remainder >>= 1;
   133   *			remainder ^= multiple;
   134   *		}
   135   *
   136   *		return ~remainder;
   137   *	}
   138   *
   139   * With the above implementation we get the effect of 32 appended 0 bits for
   140   * free; they never affect the choice of a divisor, nor would they change the
   141   * value of 'remainder' if they were to be actually XOR'ed in.  And by starting
   142   * with a remainder of all 1 bits, we get the effect of complementing the first
   143   * 32 message bits.
   144   *
   145   * The next optimization is to process the input in multi-bit units.  Suppose
   146   * that we insert the next 'n' message bits into the remainder.  Then we get an
   147   * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
   148   * bits is the amount by which the low 32 bits of the remainder will change as a
   149   * result of cancelling out those 'n' bits.  Taking n=8 (one byte) and
   150   * precomputing a table containing the CRC of each possible byte, we get
   151   * crc32_slice1() defined below.
   152   *
   153   * As a further optimization, we could increase the multi-bit unit size to 16.
   154   * However, that is inefficient because the table size explodes from 256 entries
   155   * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
   156   * fit in L1 cache on typical processors.
   157   *
   158   * However, we can actually process 4 bytes at a time using 4 different tables
   159   * with 256 entries each.  Logically, we form a 64-bit intermediate remainder
   160   * and cancel out the high 32 bits in 8-bit chunks.  Bits 32-39 are cancelled
   161   * out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the
   162   * CRC of those bits with 8 zero bits appended, and so on.  This method is
   163   * implemented in crc32_slice4(), defined below.
   164   *
   165   * In crc32_slice8(), this method is extended to 8 bytes at a time.  The
   166   * intermediate remainder (which we never actually store explicitly) is 96 bits.
   167   *
   168   * On CPUs that support fast carryless multiplication, CRCs can be computed even
   169   * more quickly via "folding".  See e.g. the x86 PCLMUL implementation.
   170   */
   171  
   172  #include "lib_common.h"
   173  #include "libdeflate.h"
   174  
   175  typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
   176  
   177  /* Include architecture-specific implementations if available */
   178  #undef CRC32_SLICE1
   179  #undef CRC32_SLICE4
   180  #undef CRC32_SLICE8
   181  #undef DEFAULT_IMPL
   182  #undef DISPATCH
   183  #if defined(__arm__) || defined(__aarch64__)
   184  #  include "arm/crc32_impl.h"
   185  #elif defined(__i386__) || defined(__x86_64__)
   186  #  include "crc32_impl.h"
   187  #endif
   188  
   189  /*
   190   * Define a generic implementation (crc32_slice8()) if needed.  crc32_slice1()
   191   * may also be needed as a fallback for architecture-specific implementations.
   192   */
   193  
   194  #ifndef DEFAULT_IMPL
   195  #  define CRC32_SLICE8	1
   196  #  define DEFAULT_IMPL	crc32_slice8
   197  #endif
   198  
   199  #if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
   200  #include "crc32_table.h"
   201  static forceinline u32
   202  crc32_update_byte(u32 remainder, u8 next_byte)
   203  {
   204  	return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
   205  }
   206  #endif
   207  
   208  #ifdef CRC32_SLICE1
   209  static u32
   210  crc32_slice1(u32 remainder, const u8 *buffer, size_t size)
   211  {
   212  	size_t i;
   213  
   214  	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
   215  
   216  	for (i = 0; i < size; i++)
   217  		remainder = crc32_update_byte(remainder, buffer[i]);
   218  	return remainder;
   219  }
   220  #endif /* CRC32_SLICE1 */
   221  
   222  #ifdef CRC32_SLICE4
   223  static u32
   224  crc32_slice4(u32 remainder, const u8 *buffer, size_t size)
   225  {
   226  	const u8 *p = buffer;
   227  	const u8 *end = buffer + size;
   228  	const u8 *end32;
   229  
   230  	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
   231  
   232  	for (; ((uintptr_t)p & 3) && p != end; p++)
   233  		remainder = crc32_update_byte(remainder, *p);
   234  
   235  	end32 = p + ((end - p) & ~3);
   236  	for (; p != end32; p += 4) {
   237  		u32 v = le32_bswap(*(const u32 *)p);
   238  		remainder =
   239  		    crc32_table[0x300 + (u8)((remainder ^ v) >>  0)] ^
   240  		    crc32_table[0x200 + (u8)((remainder ^ v) >>  8)] ^
   241  		    crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
   242  		    crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
   243  	}
   244  
   245  	for (; p != end; p++)
   246  		remainder = crc32_update_byte(remainder, *p);
   247  
   248  	return remainder;
   249  }
   250  #endif /* CRC32_SLICE4 */
   251  
   252  #ifdef CRC32_SLICE8
   253  static u32
   254  crc32_slice8(u32 remainder, const u8 *buffer, size_t size)
   255  {
   256  	const u8 *p = buffer;
   257  	const u8 *end = buffer + size;
   258  	const u8 *end64;
   259  
   260  	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
   261  
   262  	for (; ((uintptr_t)p & 7) && p != end; p++)
   263  		remainder = crc32_update_byte(remainder, *p);
   264  
   265  	end64 = p + ((end - p) & ~7);
   266  	for (; p != end64; p += 8) {
   267  		u32 v1 = le32_bswap(*(const u32 *)(p + 0));
   268  		u32 v2 = le32_bswap(*(const u32 *)(p + 4));
   269  		remainder =
   270  		    crc32_table[0x700 + (u8)((remainder ^ v1) >>  0)] ^
   271  		    crc32_table[0x600 + (u8)((remainder ^ v1) >>  8)] ^
   272  		    crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
   273  		    crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
   274  		    crc32_table[0x300 + (u8)(v2 >>  0)] ^
   275  		    crc32_table[0x200 + (u8)(v2 >>  8)] ^
   276  		    crc32_table[0x100 + (u8)(v2 >> 16)] ^
   277  		    crc32_table[0x000 + (u8)(v2 >> 24)];
   278  	}
   279  
   280  	for (; p != end; p++)
   281  		remainder = crc32_update_byte(remainder, *p);
   282  
   283  	return remainder;
   284  }
   285  #endif /* CRC32_SLICE8 */
   286  
   287  #ifdef DISPATCH
   288  static u32 dispatch(u32, const u8 *, size_t);
   289  
   290  static volatile crc32_func_t crc32_impl = dispatch;
   291  
   292  /* Choose the fastest implementation at runtime */
   293  static u32 dispatch(u32 remainder, const u8 *buffer, size_t size)
   294  {
   295  	crc32_func_t f = arch_select_crc32_func();
   296  
   297  	if (f == NULL)
   298  		f = DEFAULT_IMPL;
   299  
   300  	crc32_impl = f;
   301  	return crc32_impl(remainder, buffer, size);
   302  }
   303  #else
   304  #  define crc32_impl DEFAULT_IMPL /* only one implementation, use it */
   305  #endif
   306  
   307  LIBDEFLATEAPI u32
   308  libdeflate_crc32(u32 remainder, const void *buffer, size_t size)
   309  {
   310  	if (buffer == NULL) /* return initial value */
   311  		return 0;
   312  	return ~crc32_impl(~remainder, buffer, size);
   313  }