github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/adler32_impl.h

#ifndef GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_
#define GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_
/*
 * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "cpu_features.h"

/* AVX2 implementation */
#undef DISPATCH_AVX2
#if !defined(DEFAULT_IMPL) && \
        (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
                               COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
#  define FUNCNAME              adler32_avx2
#  define FUNCNAME_CHUNK        adler32_avx2_chunk
#  define IMPL_ALIGNMENT        32
#  define IMPL_SEGMENT_SIZE     32
#  define IMPL_MAX_CHUNK_SIZE   MAX_CHUNK_SIZE
#  ifdef __AVX2__
#    define ATTRIBUTES
#    define DEFAULT_IMPL        adler32_avx2
#  else
#    define ATTRIBUTES          __attribute__((target("avx2")))
#    define DISPATCH            1
#    define DISPATCH_AVX2       1
#  endif
#  include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
        const __m256i zeroes = _mm256_setzero_si256();
        const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
                                                24, 23, 22, 21, 20, 19, 18, 17,
                                                16, 15, 14, 13, 12, 11, 10, 9,
                                                8,  7,  6,  5,  4,  3,  2,  1 };
        const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
        __v8si v_s1 = (__v8si)zeroes;
        __v8si v_s1_sums = (__v8si)zeroes;
        __v8si v_s2 = (__v8si)zeroes;

        do {
                /* Load the next 32 bytes. */
                __m256i bytes = *p++;

                /*
                 * Multiply each byte by the number of times (32...1) it still
                 * needs to be added to s2 within this segment, adding adjacent
                 * products into sixteen 16-bit sums.
                 */
                __v16hi sums = (__v16hi)_mm256_maddubs_epi16(
                                        bytes, (__m256i)multipliers);

                /*
                 * Remember the previous s1 counters; each previous s1 value
                 * must be added to s2 once per byte of the segment, i.e.
                 * multiplied by 32, which is deferred to a left shift at the
                 * end.
                 */
                v_s1_sums += v_s1;

                /* Add the bytes horizontally (8 per sum) into the s1 counters. */
                v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);

                /*
                 * Add adjacent pairs of the 16-bit sums into 32-bit values and
                 * accumulate them into the s2 counters.
                 */
                v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
        } while (p != end);

        /*
         * Reduce the eight 32-bit s1 counters to one value.  Note that
         * _mm256_hadd_epi32 operates within 128-bit lanes, hence the final
         * addition of elements 0 and 4.
         */
        v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
        v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
        *s1 += (u32)v_s1[0] + (u32)v_s1[4];

        /* Add the deferred s1 * 32 contribution, then reduce the s2 counters. */
        v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
        v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
        v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
        *s2 += (u32)v_s2[0] + (u32)v_s2[4];
}
#  include "adler32_vec_template.h"
#endif /* AVX2 implementation */
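/*
 * For orientation (added commentary, not part of the original libdeflate
 * source): each *_chunk() function in this header is a vectorized form of the
 * scalar Adler-32 recurrence, accumulated over one chunk of
 * IMPL_SEGMENT_SIZE-byte segments without any modulo reductions; the outer
 * loop, tail handling, and the modulo-65521 reductions are presumably supplied
 * by the code that includes adler32_vec_template.h.  A minimal scalar sketch
 * of what a chunk accumulates (adler32_scalar_chunk is a hypothetical name
 * used only for illustration):
 *
 *      static void
 *      adler32_scalar_chunk(const u8 *bytes, size_t len, u32 *s1, u32 *s2)
 *      {
 *              size_t i;
 *
 *              for (i = 0; i < len; i++) {
 *                      *s1 += bytes[i];        // sum of bytes
 *                      *s2 += *s1;             // sum of running s1 values
 *              }
 *      }
 *
 * Expanding that loop over a 32-byte segment shows why the vector code weights
 * each byte by 32, 31, ..., 1 and adds the pre-segment s1 to s2 32 times:
 * s2 grows by 32*s1_old + 32*b[0] + 31*b[1] + ... + 1*b[31].
 */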
/* SSE2 implementation */
#undef DISPATCH_SSE2
#if !defined(DEFAULT_IMPL) && \
        (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \
                               COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
#  define FUNCNAME              adler32_sse2
#  define FUNCNAME_CHUNK        adler32_sse2_chunk
#  define IMPL_ALIGNMENT        16
#  define IMPL_SEGMENT_SIZE     32
/*
 * The 16-bit precision byte counters must not be allowed to undergo *signed*
 * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
 * would behave incorrectly.
 */
#  define IMPL_MAX_CHUNK_SIZE   (32 * (0x7FFF / 0xFF))
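/*
 * To make the bound above concrete (added commentary, not in the original
 * libdeflate source): each of the thirty-two 16-bit byte counters gains at
 * most 0xFF (255) per 32-byte segment, so it stays within the signed 16-bit
 * maximum 0x7FFF (32767) for at most 0x7FFF / 0xFF = 128 segments.  The chunk
 * size is therefore capped at 128 * 32 = 4096 bytes.
 */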
#  ifdef __SSE2__
#    define ATTRIBUTES
#    define DEFAULT_IMPL        adler32_sse2
#  else
#    define ATTRIBUTES          __attribute__((target("sse2")))
#    define DISPATCH            1
#    define DISPATCH_SSE2       1
#  endif
#  include <emmintrin.h>
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
        const __m128i zeroes = _mm_setzero_si128();

        /* s1 counters: 32-bit, sum of bytes */
        __v4si v_s1 = (__v4si)zeroes;

        /* s2 counters: 32-bit, sum of s1 values */
        __v4si v_s2 = (__v4si)zeroes;

        /*
         * Thirty-two 16-bit counters for byte sums.  Each accumulates the
         * bytes that eventually need to be multiplied by a number 32...1
         * for addition into s2.
         */
        __v8hi v_byte_sums_a = (__v8hi)zeroes;
        __v8hi v_byte_sums_b = (__v8hi)zeroes;
        __v8hi v_byte_sums_c = (__v8hi)zeroes;
        __v8hi v_byte_sums_d = (__v8hi)zeroes;

        do {
                /* Load the next 32 bytes. */
                const __m128i bytes1 = *p++;
                const __m128i bytes2 = *p++;

                /*
                 * Accumulate the previous s1 counters into the s2 counters.
                 * Logically, this really should be v_s2 += v_s1 * 32, but we
                 * can do the multiplication (or left shift) later.
                 */
                v_s2 += v_s1;

                /*
                 * s1 update: use "Packed Sum of Absolute Differences" to add
                 * the bytes horizontally with 8 bytes per sum.  Then add the
                 * sums to the s1 counters.
                 */
                v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
                v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);

                /*
                 * Also accumulate the bytes into 32 separate counters that
                 * have 16-bit precision.
                 */
                v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
                v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
                v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
                v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);

        } while (p != end);

        /* Finish calculating the s2 counters */
        v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
        v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
                        (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
        v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
                        (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
        v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
                        (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
        v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
                        (__m128i)(__v8hi){ 8,  7,  6,  5,  4,  3,  2,  1 });

        /* Now accumulate what we computed into the real s1 and s2 */
        v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
        v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
        *s1 += _mm_cvtsi128_si32((__m128i)v_s1);

        v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
        v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
        *s2 += _mm_cvtsi128_si32((__m128i)v_s2);
}
#  include "adler32_vec_template.h"
#endif /* SSE2 implementation */

#ifdef DISPATCH
static inline adler32_func_t
arch_select_adler32_func(void)
{
        u32 features = get_cpu_features();

#ifdef DISPATCH_AVX2
        if (features & X86_CPU_FEATURE_AVX2)
                return adler32_avx2;
#endif
#ifdef DISPATCH_SSE2
        if (features & X86_CPU_FEATURE_SSE2)
                return adler32_sse2;
#endif
        return NULL;
}
#endif /* DISPATCH */

#endif // GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ADLER32_IMPL_H_
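/*
 * Usage sketch (added commentary, not in the original libdeflate source; the
 * actual wiring lives in the generic Adler-32 code that includes this header):
 * when DISPATCH is defined, a caller would typically resolve the function
 * pointer once via arch_select_adler32_func() and cache it, falling back to a
 * portable implementation when it returns NULL.  Assuming adler32_func_t is
 * u32 (*)(u32 adler, const u8 *buffer, size_t len) and a generic fallback
 * named adler32_generic exists (both are assumptions for illustration only):
 *
 *      static adler32_func_t adler32_impl;     // cached after first call
 *
 *      u32 dispatched_adler32(u32 adler, const u8 *buffer, size_t len)
 *      {
 *              if (adler32_impl == NULL) {
 *                      adler32_impl = arch_select_adler32_func();
 *                      if (adler32_impl == NULL)
 *                              adler32_impl = adler32_generic;
 *              }
 *              return adler32_impl(adler, buffer, len);
 *      }
 */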