github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/crc32_pclmul_template.h

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/crc32_pclmul_template.h (about)

     1  // NOLINT(build/header_guard)
     2  /*
     3   * x86/crc32_pclmul_template.h
     4   *
     5   * Copyright 2016 Eric Biggers
     6   *
     7   * Permission is hereby granted, free of charge, to any person
     8   * obtaining a copy of this software and associated documentation
     9   * files (the "Software"), to deal in the Software without
    10   * restriction, including without limitation the rights to use,
    11   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    12   * copies of the Software, and to permit persons to whom the
    13   * Software is furnished to do so, subject to the following
    14   * conditions:
    15   *
    16   * The above copyright notice and this permission notice shall be
    17   * included in all copies or substantial portions of the Software.
    18   *
    19   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    20   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    21   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    22   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    23   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    24   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    25   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    26   * OTHER DEALINGS IN THE SOFTWARE.
    27   */
    28  
    29  #include <wmmintrin.h>
    30  
    31  /*
    32   * CRC-32 folding with PCLMULQDQ.
    33   *
    34   * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
    35   * producing an abbreviated message which is congruent the original message
    36   * modulo the generator polynomial G(x).
    37   *
    38   * Folding each 512 bits is implemented as eight 64-bit folds, each of which
    39   * uses one carryless multiplication instruction.  It's expected that CPUs may
    40   * be able to execute some of these multiplications in parallel.
    41   *
    42   * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
    43   * be 95 bits from a constant distance D later in the message.  The relevant
    44   * portion of the message can be written as:
    45   *
    46   *	M(x) = A(x)*x^D + B(x)
    47   *
    48   * ... where + and * represent addition and multiplication, respectively, of
    49   * polynomials over GF(2).  Note that when implemented on a computer, these
    50   * operations are equivalent to XOR and carryless multiplication, respectively.
    51   *
    52   * For the purpose of CRC calculation, only the remainder modulo the generator
    53   * polynomial G(x) matters:
    54   *
    55   *	M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
    56   *
    57   * Since the modulo operation can be applied anywhere in a sequence of additions
    58   * and multiplications without affecting the result, this is equivalent to:
    59   *
    60   *	M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
    61   *
    62   * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
    63   * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
    64   * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
    65   * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
    66   * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
    67   * B(x)'.  This is the basic fold operation with 64 bits.
    68   *
    69   * Note that the carryless multiplication instruction PCLMULQDQ actually takes
    70   * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
    71   * 128-bit XMM register.  This works fine, but care must be taken to account for
    72   * "bit endianness".  With the CRC version implemented here, bits are always
    73   * ordered such that the lowest-order bit represents the coefficient of highest
    74   * power of x and the highest-order bit represents the coefficient of the lowest
    75   * power of x.  This is backwards from the more intuitive order.  Still,
    76   * carryless multiplication works essentially the same either way.  It just must
    77   * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
    78   * of a 128-bit XMM register into 128-bits of later data held in another XMM
    79   * register, we'll really be XOR-ing the product into the mathematically higher
    80   * degree end of those later bits, not the lower degree end as may be expected.
    81   *
    82   * So given that caveat and the fact that we process 512 bits per iteration, the
    83   * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
    84   *
    85   *	D = (512 + 95) - 64	 for the higher-degree half of each 128 bits,
    86   *				 i.e. the lower order bits in the XMM register
    87   *
    88   *	D = (512 + 95) - 128	 for the lower-degree half of each 128 bits,
    89   *				 i.e. the higher order bits in the XMM register
    90   *
    91   * The required 'x^D mod G(x)' values were precomputed.
    92   *
    93   * When <= 512 bits remain in the message, we finish up by folding across
    94   * smaller distances.  This works similarly; the distance D is just different,
    95   * so different constant multipliers must be used.  Finally, once the remaining
    96   * message is just 64 bits, it is is reduced to the CRC-32 using Barrett
    97   * reduction (explained later).
    98   *
    99   * For more information see the original paper from Intel:
   100   *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
   101   *	December 2009
   102   *	http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
   103   */
   104  static u32 ATTRIBUTES
   105  FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
   106  {
   107  	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
   108  	const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
   109  	const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
   110  	const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
   111  	const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
   112  	const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
   113  	const __v2di barrett_reduction_constants =
   114  			(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
   115  
   116  	const __m128i * const end = p + nr_segs;
   117  	const __m128i * const end512 = p + (nr_segs & ~3);
   118  	__m128i x0, x1, x2, x3;
   119  
   120  	/*
   121  	 * Account for the current 'remainder', i.e. the CRC of the part of the
   122  	 * message already processed.  Explanation: rewrite the message
   123  	 * polynomial M(x) in terms of the first part A(x), the second part
   124  	 * B(x), and the length of the second part in bits |B(x)| >= 32:
   125  	 *
   126  	 *	M(x) = A(x)*x^|B(x)| + B(x)
   127  	 *
   128  	 * Then the CRC of M(x) is:
   129  	 *
   130  	 *	CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
   131  	 *	          = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
   132  	 *	          = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
   133  	 *
   134  	 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
   135  	 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
   136  	 *
   137  	 * So the CRC of the full message is the CRC of the second part of the
   138  	 * message where the first 32 bits of the second part of the message
   139  	 * have been XOR'ed with the CRC of the first part of the message.
   140  	 */
   141  	x0 = *p++;
   142  	x0 ^= (__m128i)(__v4si){ remainder };
   143  
   144  	if (p > end512) /* only 128, 256, or 384 bits of input? */
   145  		goto _128_bits_at_a_time;
   146  	x1 = *p++;
   147  	x2 = *p++;
   148  	x3 = *p++;
   149  
   150  	/* Fold 512 bits at a time */
   151  	for (; p != end512; p += 4) {
   152  		__m128i y0, y1, y2, y3;
   153  
   154  		y0 = p[0];
   155  		y1 = p[1];
   156  		y2 = p[2];
   157  		y3 = p[3];
   158  
   159  		/*
   160  		 * Note: the immediate constant for PCLMULQDQ specifies which
   161  		 * 64-bit halves of the 128-bit vectors to multiply:
   162  		 *
   163  		 * 0x00 means low halves (higher degree polynomial terms for us)
   164  		 * 0x11 means high halves (lower degree polynomial terms for us)
   165  		 */
   166  		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
   167  		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
   168  		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
   169  		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
   170  		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
   171  		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
   172  		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
   173  		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
   174  
   175  		x0 = y0;
   176  		x1 = y1;
   177  		x2 = y2;
   178  		x3 = y3;
   179  	}
   180  
   181  	/* Fold 512 bits => 128 bits */
   182  	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
   183  	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
   184  	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
   185  	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
   186  	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
   187  	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
   188  	x0 = x3;
   189  
   190  _128_bits_at_a_time:
   191  	while (p != end) {
   192  		/* Fold 128 bits into next 128 bits */
   193  		x1 = *p++;
   194  		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
   195  		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
   196  		x0 = x1;
   197  	}
   198  
   199  	/* Now there are just 128 bits left, stored in 'x0'. */
   200  
   201  	/*
   202  	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
   203  	 * which is equivalent to multiplying by x^32.  This is needed because
   204  	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
   205  	 */
   206  	x0 = _mm_srli_si128(x0, 8) ^
   207  	     _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
   208  
   209  	/* Fold 96 => 64 bits */
   210  	x0 = _mm_srli_si128(x0, 4) ^
   211  	     _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
   212  
   213          /*
   214  	 * Finally, reduce 64 => 32 bits using Barrett reduction.
   215  	 *
   216  	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
   217  	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
   218  	 *
   219  	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
   220  	 *	     = (A(x)*x^32) mod G(x) + B(x)
   221  	 *
   222  	 * Then, by the Division Algorithm there exists a unique q(x) such that:
   223  	 *
   224  	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
   225  	 *
   226  	 * Since the left-hand side is of maximum degree 31, the right-hand side
   227  	 * must be too.  This implies that we can apply 'mod x^32' to the
   228  	 * right-hand side without changing its value:
   229  	 *
   230  	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
   231  	 *
   232  	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
   233  	 *
   234  	 * We also know that:
   235  	 *
   236  	 *	              / A(x)*x^32 \
   237  	 *	q(x) = floor (  ---------  )
   238  	 *	              \    G(x)   /
   239  	 *
   240  	 * To compute this efficiently, we can multiply the top and bottom by
   241  	 * x^32 and move the division by G(x) to the top:
   242  	 *
   243  	 *	              / A(x) * floor(x^64 / G(x)) \
   244  	 *	q(x) = floor (  -------------------------  )
   245  	 *	              \           x^32            /
   246  	 *
   247  	 * Note that floor(x^64 / G(x)) is a constant.
   248  	 *
   249  	 * So finally we have:
   250  	 *
   251  	 *	                          / A(x) * floor(x^64 / G(x)) \
   252  	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
   253  	 *	                          \           x^32            /
   254  	 */
   255  	x1 = x0;
   256  	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
   257  	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
   258  	return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
   259  }
   260  
   261  #define IMPL_ALIGNMENT		16
   262  #define IMPL_SEGMENT_SIZE	16
   263  #include "crc32_vec_template.h"