github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/adler32_vec_template.h (about) 1 // NOLINT(build/header_guard) 2 /* 3 * adler32_vec_template.h - template for vectorized Adler-32 implementations 4 * 5 * Copyright 2016 Eric Biggers 6 * 7 * Permission is hereby granted, free of charge, to any person 8 * obtaining a copy of this software and associated documentation 9 * files (the "Software"), to deal in the Software without 10 * restriction, including without limitation the rights to use, 11 * copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following 14 * conditions: 15 * 16 * The above copyright notice and this permission notice shall be 17 * included in all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 21 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 23 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 24 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 25 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 26 * OTHER DEALINGS IN THE SOFTWARE. 27 */ 28 29 /* 30 * This file contains a template for vectorized Adler-32 implementations. 31 * 32 * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 33 * implementation looks something like this: 34 * 35 * do { 36 * s1 += *p; 37 * s2 += s1; 38 * } while (++p != chunk_end); 39 * 40 * For vectorized calculation of s1, we only need to sum the input bytes. They 41 * can be accumulated into multiple counters which are eventually summed 42 * together. 43 * 44 * For vectorized calculation of s2, the basic idea is that for each iteration 45 * that processes N bytes, we can perform the following vectorizable 46 * calculation: 47 * 48 * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N 49 * 50 * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N 51 * separate counters, then do the multiplications by N...1 just once at the end 52 * rather than once per iteration. 53 * 54 * Also, we must account for how previous bytes will affect s2 by doing the 55 * following at beginning of each iteration: 56 * 57 * s2 += s1 * N 58 * 59 * Furthermore, like s1, "s2" can actually be multiple counters which are 60 * eventually summed together. 61 */ 62 63 static u32 ATTRIBUTES 64 FUNCNAME(u32 adler, const u8 *p, size_t size) 65 { 66 u32 s1 = adler & 0xFFFF; 67 u32 s2 = adler >> 16; 68 const u8 * const end = p + size; 69 const u8 *vend; 70 const size_t max_chunk_size = 71 MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) - 72 (MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) % 73 IMPL_SEGMENT_SIZE); 74 75 /* Process a byte at a time until the needed alignment is reached */ 76 if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { 77 do { 78 s1 += *p++; 79 s2 += s1; 80 } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); 81 s1 %= DIVISOR; 82 s2 %= DIVISOR; 83 } 84 85 /* 86 * Process "chunks" of bytes using vector instructions. Chunk sizes are 87 * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never 88 * overflow before being reduced modulo DIVISOR. For vector processing, 89 * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and 90 * may be further limited to IMPL_MAX_CHUNK_SIZE. 91 */ 92 STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0); 93 vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE); 94 while (p != vend) { 95 size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size); 96 97 s2 += s1 * chunk_size; 98 99 FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size), 100 &s1, &s2); 101 102 p += chunk_size; 103 s1 %= DIVISOR; 104 s2 %= DIVISOR; 105 } 106 107 /* Process any remaining bytes */ 108 if (p != end) { 109 do { 110 s1 += *p++; 111 s2 += s1; 112 } while (p != end); 113 s1 %= DIVISOR; 114 s2 %= DIVISOR; 115 } 116 117 return (s2 << 16) | s1; 118 } 119 120 #undef FUNCNAME 121 #undef FUNCNAME_CHUNK 122 #undef ATTRIBUTES 123 #undef IMPL_ALIGNMENT 124 #undef IMPL_SEGMENT_SIZE 125 #undef IMPL_MAX_CHUNK_SIZE