github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/crc32.c (about) 1 /* 2 * crc32.c - CRC-32 checksum algorithm for the gzip format 3 * 4 * Copyright 2016 Eric Biggers 5 * 6 * Permission is hereby granted, free of charge, to any person 7 * obtaining a copy of this software and associated documentation 8 * files (the "Software"), to deal in the Software without 9 * restriction, including without limitation the rights to use, 10 * copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the 12 * Software is furnished to do so, subject to the following 13 * conditions: 14 * 15 * The above copyright notice and this permission notice shall be 16 * included in all copies or substantial portions of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 20 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 21 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 22 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 23 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 * OTHER DEALINGS IN THE SOFTWARE. 26 */ 27 28 /* 29 * High-level description of CRC 30 * ============================= 31 * 32 * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message" 33 * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), 34 * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: 35 * 36 * R(x) = M(x)*x^n mod G(x) 37 * 38 * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder 39 * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) 40 * interpreted as a bitstring of length 'n'. 
41 * 42 * CRC used in gzip 43 * ================ 44 * 45 * In the gzip format (RFC 1952): 46 * 47 * - The bitstring to checksum is formed from the bytes of the uncompressed 48 * data by concatenating the bits from the bytes in order, proceeding 49 * from the low-order bit to the high-order bit within each byte. 50 * 51 * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + 52 * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. 53 * Consequently, the CRC length is 32 bits ("CRC-32"). 54 * 55 * - The highest order 32 coefficients of M(x)*x^n are inverted. 56 * 57 * - All 32 coefficients of R(x) are inverted. 58 * 59 * The two inversions cause added leading and trailing zero bits to affect the 60 * resulting CRC, whereas with a regular CRC such bits would have no effect on 61 * the CRC. 62 * 63 * Computation and optimizations 64 * ============================= 65 * 66 * We can compute R(x) through "long division", maintaining only 32 bits of 67 * state at any given time. Multiplication by 'x' can be implemented as 68 * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the 69 * highest order bit represents the coefficient of x^0), and both addition and 70 * subtraction can be implemented as bitwise exclusive OR (since we are working 71 * in GF(2)). 
Here is an unoptimized implementation: 72 * 73 * static u32 crc32_gzip(const u8 *buffer, size_t size) 74 * { 75 * u32 remainder = 0; 76 * const u32 divisor = 0xEDB88320; 77 * 78 * for (size_t i = 0; i < size * 8 + 32; i++) { 79 * int bit; 80 * u32 multiple; 81 * 82 * if (i < size * 8) 83 * bit = (buffer[i / 8] >> (i % 8)) & 1; 84 * else 85 * bit = 0; // one of the 32 appended 0 bits 86 * 87 * if (i < 32) // the first 32 bits are inverted 88 * bit ^= 1; 89 * 90 * if (remainder & 1) 91 * multiple = divisor; 92 * else 93 * multiple = 0; 94 * 95 * remainder >>= 1; 96 * remainder |= (u32)bit << 31; 97 * remainder ^= multiple; 98 * } 99 * 100 * return ~remainder; 101 * } 102 * 103 * In this implementation, the 32-bit integer 'remainder' maintains the 104 * remainder of the currently processed portion of the message (with 32 zero 105 * bits appended) when divided by the generator polynomial. 'remainder' is the 106 * representation of R(x), and 'divisor' is the representation of G(x) excluding 107 * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1', 108 * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero 109 * x^32 term, then we subtract G(x) from R(x). 110 * 111 * We can speed this up by taking advantage of the fact that XOR is commutative 112 * and associative, so the order in which we combine the inputs into 'remainder' 113 * is unimportant. 
And since each message bit we add doesn't affect the choice 114 * of 'multiple' until 32 bits later, we need not actually add each message bit 115 * until that point: 116 * 117 * static u32 crc32_gzip(const u8 *buffer, size_t size) 118 * { 119 * u32 remainder = ~0; 120 * const u32 divisor = 0xEDB88320; 121 * 122 * for (size_t i = 0; i < size * 8; i++) { 123 * int bit; 124 * u32 multiple; 125 * 126 * bit = (buffer[i / 8] >> (i % 8)) & 1; 127 * remainder ^= bit; 128 * if (remainder & 1) 129 * multiple = divisor; 130 * else 131 * multiple = 0; 132 * remainder >>= 1; 133 * remainder ^= multiple; 134 * } 135 * 136 * return ~remainder; 137 * } 138 * 139 * With the above implementation we get the effect of 32 appended 0 bits for 140 * free; they never affect the choice of a divisor, nor would they change the 141 * value of 'remainder' if they were to be actually XOR'ed in. And by starting 142 * with a remainder of all 1 bits, we get the effect of complementing the first 143 * 32 message bits. 144 * 145 * The next optimization is to process the input in multi-bit units. Suppose 146 * that we insert the next 'n' message bits into the remainder. Then we get an 147 * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n' 148 * bits is the amount by which the low 32 bits of the remainder will change as a 149 * result of cancelling out those 'n' bits. Taking n=8 (one byte) and 150 * precomputing a table containing the CRC of each possible byte, we get 151 * crc32_slice1() defined below. 152 * 153 * As a further optimization, we could increase the multi-bit unit size to 16. 154 * However, that is inefficient because the table size explodes from 256 entries 155 * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't 156 * fit in L1 cache on typical processors. 157 * 158 * However, we can actually process 4 bytes at a time using 4 different tables 159 * with 256 entries each. 
Logically, we form a 64-bit intermediate remainder 160 * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled 161 * out by the CRC of those bits, whereas bits 40-47 are cancelled out by the 162 * CRC of those bits with 8 zero bits appended, and so on. This method is 163 * implemented in crc32_slice4(), defined below. 164 * 165 * In crc32_slice8(), this method is extended to 8 bytes at a time. The 166 * intermediate remainder (which we never actually store explicitly) is 96 bits. 167 * 168 * On CPUs that support fast carryless multiplication, CRCs can be computed even 169 * more quickly via "folding". See e.g. the x86 PCLMUL implementation. 170 */ 171 172 #include "lib_common.h" 173 #include "libdeflate.h" 174 175 typedef u32 (*crc32_func_t)(u32, const u8 *, size_t); 176 177 /* Include architecture-specific implementations if available */ 178 #undef CRC32_SLICE1 179 #undef CRC32_SLICE4 180 #undef CRC32_SLICE8 181 #undef DEFAULT_IMPL 182 #undef DISPATCH 183 #if defined(__arm__) || defined(__aarch64__) 184 # include "arm/crc32_impl.h" 185 #elif defined(__i386__) || defined(__x86_64__) 186 # include "crc32_impl.h" 187 #endif 188 189 /* 190 * Define a generic implementation (crc32_slice8()) if needed. crc32_slice1() 191 * may also be needed as a fallback for architecture-specific implementations. 
192 */ 193 194 #ifndef DEFAULT_IMPL 195 # define CRC32_SLICE8 1 196 # define DEFAULT_IMPL crc32_slice8 197 #endif 198 199 #if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8) 200 #include "crc32_table.h" 201 static forceinline u32 202 crc32_update_byte(u32 remainder, u8 next_byte) 203 { 204 return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte]; 205 } 206 #endif 207 208 #ifdef CRC32_SLICE1 209 static u32 210 crc32_slice1(u32 remainder, const u8 *buffer, size_t size) 211 { 212 size_t i; 213 214 STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100); 215 216 for (i = 0; i < size; i++) 217 remainder = crc32_update_byte(remainder, buffer[i]); 218 return remainder; 219 } 220 #endif /* CRC32_SLICE1 */ 221 222 #ifdef CRC32_SLICE4 223 static u32 224 crc32_slice4(u32 remainder, const u8 *buffer, size_t size) 225 { 226 const u8 *p = buffer; 227 const u8 *end = buffer + size; 228 const u8 *end32; 229 230 STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400); 231 232 for (; ((uintptr_t)p & 3) && p != end; p++) 233 remainder = crc32_update_byte(remainder, *p); 234 235 end32 = p + ((end - p) & ~3); 236 for (; p != end32; p += 4) { 237 u32 v = le32_bswap(*(const u32 *)p); 238 remainder = 239 crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^ 240 crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^ 241 crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^ 242 crc32_table[0x000 + (u8)((remainder ^ v) >> 24)]; 243 } 244 245 for (; p != end; p++) 246 remainder = crc32_update_byte(remainder, *p); 247 248 return remainder; 249 } 250 #endif /* CRC32_SLICE4 */ 251 252 #ifdef CRC32_SLICE8 253 static u32 254 crc32_slice8(u32 remainder, const u8 *buffer, size_t size) 255 { 256 const u8 *p = buffer; 257 const u8 *end = buffer + size; 258 const u8 *end64; 259 260 STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800); 261 262 for (; ((uintptr_t)p & 7) && p != end; p++) 263 remainder = crc32_update_byte(remainder, *p); 264 265 end64 = p + ((end - p) & ~7); 266 for (; p != end64; p += 8) { 
267 u32 v1 = le32_bswap(*(const u32 *)(p + 0)); 268 u32 v2 = le32_bswap(*(const u32 *)(p + 4)); 269 remainder = 270 crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^ 271 crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^ 272 crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^ 273 crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^ 274 crc32_table[0x300 + (u8)(v2 >> 0)] ^ 275 crc32_table[0x200 + (u8)(v2 >> 8)] ^ 276 crc32_table[0x100 + (u8)(v2 >> 16)] ^ 277 crc32_table[0x000 + (u8)(v2 >> 24)]; 278 } 279 280 for (; p != end; p++) 281 remainder = crc32_update_byte(remainder, *p); 282 283 return remainder; 284 } 285 #endif /* CRC32_SLICE8 */ 286 287 #ifdef DISPATCH 288 static u32 dispatch(u32, const u8 *, size_t); 289 290 static volatile crc32_func_t crc32_impl = dispatch; 291 292 /* Choose the fastest implementation at runtime */ 293 static u32 dispatch(u32 remainder, const u8 *buffer, size_t size) 294 { 295 crc32_func_t f = arch_select_crc32_func(); 296 297 if (f == NULL) 298 f = DEFAULT_IMPL; 299 300 crc32_impl = f; 301 return crc32_impl(remainder, buffer, size); 302 } 303 #else 304 # define crc32_impl DEFAULT_IMPL /* only one implementation, use it */ 305 #endif 306 307 LIBDEFLATEAPI u32 308 libdeflate_crc32(u32 remainder, const void *buffer, size_t size) 309 { 310 if (buffer == NULL) /* return initial value */ 311 return 0; 312 return ~crc32_impl(~remainder, buffer, size); 313 }