github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/adler32_impl.h

#ifndef GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_
#define GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_
/*
 * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "cpu_features.h"
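
/*
 * For reference, both vectorized chunk functions below accelerate the scalar
 * Adler-32 recurrence, a minimal sketch of which (using the standard divisor
 * 65521 and leaving the periodic modular reductions to the caller) is:
 *
 *	u32 s1 = adler & 0xFFFF;
 *	u32 s2 = adler >> 16;
 *	while (size--) {
 *		s1 += *buffer++;	// running sum of bytes
 *		s2 += s1;		// running sum of the s1 values
 *	}
 *	return ((s2 % 65521) << 16) | (s1 % 65521);
 *
 * Each chunk function consumes whole IMPL_SEGMENT_SIZE-byte segments and adds
 * its partial sums into *s1 and *s2; "adler32_vec_template.h" wraps it with
 * the alignment, chunk-size, and modular-reduction handling.
 */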

/* AVX2 implementation */
#undef DISPATCH_AVX2
#if !defined(DEFAULT_IMPL) &&	\
	(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED &&	\
			       COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
#  define FUNCNAME		adler32_avx2
#  define FUNCNAME_CHUNK	adler32_avx2_chunk
#  define IMPL_ALIGNMENT	32
#  define IMPL_SEGMENT_SIZE	32
#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
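/*
 * Note: unlike the SSE2 version below, this implementation carries only
 * 32-bit accumulators across iterations (its 16-bit sums are per-iteration
 * temporaries), so the generic MAX_CHUNK_SIZE bound is the only limit needed.
 */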
#  ifdef __AVX2__
#    define ATTRIBUTES
#    define DEFAULT_IMPL	adler32_avx2
#  else
#    define ATTRIBUTES		__attribute__((target("avx2")))
#    define DISPATCH		1
#    define DISPATCH_AVX2	1
#  endif
#  include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
	const __m256i zeroes = _mm256_setzero_si256();

	/* Position weights for s2: byte 0 of each 32-byte segment is
	 * weighted 32, the last byte is weighted 1 */
	const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
						24, 23, 22, 21, 20, 19, 18, 17,
						16, 15, 14, 13, 12, 11, 10, 9,
						8,  7,  6,  5,  4,  3,  2,  1 };
	const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);

	/* 32-bit accumulators: byte sums (s1), per-iteration snapshots of s1
	 * (shifted left by 5, i.e. multiplied by 32, at the end), and
	 * position-weighted byte sums (s2) */
	__v8si v_s1 = (__v8si)zeroes;
	__v8si v_s1_sums = (__v8si)zeroes;
	__v8si v_s2 = (__v8si)zeroes;

	do {
		/* Load the next 32 bytes */
		__m256i bytes = *p++;

		/* Weight each byte by its position within the segment and add
		 * adjacent pairs, giving sixteen 16-bit sums */
		__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
						bytes, (__m256i)multipliers);

		/* Accumulate the previous s1 counters; logically this is
		 * v_s2 += v_s1 * 32, but the shift is done after the loop */
		v_s1_sums += v_s1;

		/* s1 update: add the bytes horizontally, 8 bytes per sum */
		v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);

		/* s2 update: widen the 16-bit weighted sums to 32 bits */
		v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
	} while (p != end);

	/* Horizontal reduction: hadd works within each 128-bit half, so the
	 * per-half totals end up in elements 0 and 4 */
	v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
	v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
	*s1 += (u32)v_s1[0] + (u32)v_s1[4];

	v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
	v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
	v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
	*s2 += (u32)v_s2[0] + (u32)v_s2[4];
}
#  include "adler32_vec_template.h"
#endif /* AVX2 implementation */

/* SSE2 implementation */
#undef DISPATCH_SSE2
#if !defined(DEFAULT_IMPL) &&	\
	(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED &&	\
			       COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
#  define FUNCNAME		adler32_sse2
#  define FUNCNAME_CHUNK	adler32_sse2_chunk
#  define IMPL_ALIGNMENT	16
#  define IMPL_SEGMENT_SIZE	32
/*
 * The 16-bit precision byte counters must not be allowed to undergo *signed*
 * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
 * would behave incorrectly.
 */
#  define IMPL_MAX_CHUNK_SIZE	(32 * (0x7FFF / 0xFF))
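/*
 * Worked out: each of the 32 byte counters gains at most 0xFF per 32-byte
 * segment, so after 0x7FFF / 0xFF = 128 segments a counter holds at most
 * 128 * 0xFF = 32640 < 0x8000.  Hence the limit of 32 * 128 = 4096 bytes
 * per chunk.
 */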
#  ifdef __SSE2__
#    define ATTRIBUTES
#    define DEFAULT_IMPL	adler32_sse2
#  else
#    define ATTRIBUTES		__attribute__((target("sse2")))
#    define DISPATCH		1
#    define DISPATCH_SSE2	1
#  endif
#  include <emmintrin.h>
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
	const __m128i zeroes = _mm_setzero_si128();

	/* s1 counters: 32-bit, sum of bytes */
	__v4si v_s1 = (__v4si)zeroes;

	/* s2 counters: 32-bit, sum of s1 values */
	__v4si v_s2 = (__v4si)zeroes;

	/*
	 * Thirty-two 16-bit counters for byte sums.  Each accumulates the bytes
	 * that eventually need to be multiplied by a number 32...1 for addition
	 * into s2.
	 */
	__v8hi v_byte_sums_a = (__v8hi)zeroes;
	__v8hi v_byte_sums_b = (__v8hi)zeroes;
	__v8hi v_byte_sums_c = (__v8hi)zeroes;
	__v8hi v_byte_sums_d = (__v8hi)zeroes;

	do {
		/* Load the next 32 bytes */
		const __m128i bytes1 = *p++;
		const __m128i bytes2 = *p++;

		/*
		 * Accumulate the previous s1 counters into the s2 counters.
		 * Logically, this really should be v_s2 += v_s1 * 32, but we
		 * can do the multiplication (or left shift) later.
		 */
		v_s2 += v_s1;

		/*
		 * s1 update: use "Packed Sum of Absolute Differences" to add
		 * the bytes horizontally with 8 bytes per sum.  Then add the
		 * sums to the s1 counters.
		 */
		v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
		v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);

		/*
		 * Also accumulate the bytes into 32 separate counters that have
		 * 16-bit precision.
		 */
		v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
		v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
		v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
		v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);

	} while (p != end);

	/* Finish calculating the s2 counters */
	v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
				       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
				       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
				       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
				       (__m128i)(__v8hi){ 8,  7,  6,  5,  4,  3,  2,  1 });

	/*
	 * Now accumulate what we computed into the real s1 and s2.  The two
	 * shuffle-and-add steps fold the high 32-bit lanes onto lane 0, which
	 * then holds the total of all four lanes.
	 */
	v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
	v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
	*s1 += _mm_cvtsi128_si32((__m128i)v_s1);

	v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
	v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
	*s2 += _mm_cvtsi128_si32((__m128i)v_s2);
}
#  include "adler32_vec_template.h"
#endif /* SSE2 implementation */

#ifdef DISPATCH
static inline adler32_func_t
arch_select_adler32_func(void)
{
	u32 features = get_cpu_features();

#ifdef DISPATCH_AVX2
	if (features & X86_CPU_FEATURE_AVX2)
		return adler32_avx2;
#endif
#ifdef DISPATCH_SSE2
	if (features & X86_CPU_FEATURE_SSE2)
		return adler32_sse2;
#endif
	return NULL;
}
#endif /* DISPATCH */
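
/*
 * The features are checked from most to least capable, so the fastest
 * supported implementation wins.  A typical caller caches the result in a
 * function pointer, e.g. (a hypothetical sketch; the real wiring, including
 * the portable fallback's name, lives in the file that includes this header):
 *
 *	static adler32_func_t adler32_impl;
 *	...
 *	if (adler32_impl == NULL) {
 *		adler32_impl = arch_select_adler32_func();
 *		if (adler32_impl == NULL)
 *			adler32_impl = adler32_generic;	// assumed fallback name
 *	}
 *	return adler32_impl(adler, buffer, size);
 */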

#endif  // GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_