github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/adler32_vec_template.h

github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/adler32_vec_template.h (about)

     1  // NOLINT(build/header_guard)
     2  /*
     3   * adler32_vec_template.h - template for vectorized Adler-32 implementations
     4   *
     5   * Copyright 2016 Eric Biggers
     6   *
     7   * Permission is hereby granted, free of charge, to any person
     8   * obtaining a copy of this software and associated documentation
     9   * files (the "Software"), to deal in the Software without
    10   * restriction, including without limitation the rights to use,
    11   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    12   * copies of the Software, and to permit persons to whom the
    13   * Software is furnished to do so, subject to the following
    14   * conditions:
    15   *
    16   * The above copyright notice and this permission notice shall be
    17   * included in all copies or substantial portions of the Software.
    18   *
    19   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    20   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    21   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    22   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    23   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    24   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    25   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    26   * OTHER DEALINGS IN THE SOFTWARE.
    27   */
    28  
    29  /*
    30   * This file contains a template for vectorized Adler-32 implementations.
    31   *
    32   * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
    33   * implementation looks something like this:
    34   *
    35   *	do {
    36   *		s1 += *p;
    37   *		s2 += s1;
    38   *	} while (++p != chunk_end);
    39   *
    40   * For vectorized calculation of s1, we only need to sum the input bytes.  They
    41   * can be accumulated into multiple counters which are eventually summed
    42   * together.
    43   *
    44   * For vectorized calculation of s2, the basic idea is that for each iteration
    45   * that processes N bytes, we can perform the following vectorizable
    46   * calculation:
    47   *
    48   *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
    49   *
    50   * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
    51   * separate counters, then do the multiplications by N...1 just once at the end
    52   * rather than once per iteration.
    53   *
    54   * Also, we must account for how previous bytes will affect s2 by doing the
    55   * following at the beginning of each iteration:
    56   *
    57   *	s2 += s1 * N
    58   *
    59   * Furthermore, like s1, "s2" can actually be multiple counters which are
    60   * eventually summed together.
    61   */
    62  
    63  static u32 ATTRIBUTES
    64  FUNCNAME(u32 adler, const u8 *p, size_t size)
    65  {
    66  	u32 s1 = adler & 0xFFFF;
    67  	u32 s2 = adler >> 16;
    68  	const u8 * const end = p + size;
    69  	const u8 *vend;
    70  	const size_t max_chunk_size =
    71  		MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
    72  		(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
    73  		 IMPL_SEGMENT_SIZE);
    74  
    75  	/* Process a byte at a time until the needed alignment is reached */
    76  	if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
    77  		do {
    78  			s1 += *p++;
    79  			s2 += s1;
    80  		} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
    81  		s1 %= DIVISOR;
    82  		s2 %= DIVISOR;
    83  	}
    84  
    85  	/*
    86  	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
    87  	 * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
    88  	 * overflow before being reduced modulo DIVISOR.  For vector processing,
    89  	 * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
    90  	 * may be further limited to IMPL_MAX_CHUNK_SIZE.
    91  	 */
    92  	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
    93  	vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
    94  	while (p != vend) {
    95  		size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
    96  
    97  		s2 += s1 * chunk_size;
    98  
    99  		FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
   100  			       &s1, &s2);
   101  
   102  		p += chunk_size;
   103  		s1 %= DIVISOR;
   104  		s2 %= DIVISOR;
   105  	}
   106  
   107  	/* Process any remaining bytes */
   108  	if (p != end) {
   109  		do {
   110  			s1 += *p++;
   111  			s2 += s1;
   112  		} while (p != end);
   113  		s1 %= DIVISOR;
   114  		s2 %= DIVISOR;
   115  	}
   116  
   117  	return (s2 << 16) | s1;
   118  }
   119  
   120  #undef FUNCNAME
   121  #undef FUNCNAME_CHUNK
   122  #undef ATTRIBUTES
   123  #undef IMPL_ALIGNMENT
   124  #undef IMPL_SEGMENT_SIZE
   125  #undef IMPL_MAX_CHUNK_SIZE