github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/arm/adler32_impl.h

#ifndef GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ARM_ADLER32_IMPL_H_
#define GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ARM_ADLER32_IMPL_H_
/*
 * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "cpu_features.h"

/* NEON implementation */
#undef DISPATCH_NEON
#if !defined(DEFAULT_IMPL) &&	\
	(defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED &&	\
				 COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS))
#  define FUNCNAME		adler32_neon
#  define FUNCNAME_CHUNK	adler32_neon_chunk
#  define IMPL_ALIGNMENT	16
#  define IMPL_SEGMENT_SIZE	32
/* Prevent unsigned overflow of the 16-bit precision byte counters */
#  define IMPL_MAX_CHUNK_SIZE	(32 * (0xFFFF / 0xFF))
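/*
 * Each u16 lane of the byte-sum accumulators below gains at most 0xFF per
 * 32-byte iteration, so the lanes are guaranteed not to wrap for
 * 0xFFFF / 0xFF = 257 iterations, i.e. for chunks of up to
 * 257 * 32 = 8224 bytes.
 */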
#  ifdef __ARM_NEON
#    define ATTRIBUTES
#    define DEFAULT_IMPL	adler32_neon
#  else
#    ifdef __arm__
#      define ATTRIBUTES	__attribute__((target("fpu=neon")))
#    else
#      define ATTRIBUTES	__attribute__((target("+simd")))
#    endif
#    define DISPATCH		1
#    define DISPATCH_NEON	1
#  endif
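/*
 * Two build modes result from the block above: when the compiler already
 * targets NEON (__ARM_NEON), adler32_neon is compiled with no extra
 * attributes and serves directly as DEFAULT_IMPL; otherwise it is compiled
 * with a per-function target attribute ("fpu=neon" on 32-bit ARM, "+simd"
 * on AArch64) and is chosen at runtime through arch_select_adler32_func()
 * defined at the bottom of this file.
 */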
#  include <arm_neon.h>
static forceinline ATTRIBUTES void
adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
		   u32 *s1, u32 *s2)
{
	uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
	uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
	uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
	uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
	uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
	uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };

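	/*
	 * Adler-32 tracks s1 = 1 + (sum of bytes) and s2 = (sum of the
	 * successive s1 values).  Each 32-byte iteration below accumulates
	 * the byte sums into v_s1, folds the pre-iteration v_s1 into v_s2
	 * (scaled by 32 after the loop), and records per-position byte
	 * totals in v_byte_sums_{a,b,c,d} so that the position-dependent
	 * part of s2 can be recovered by the weighted reduction at the end.
	 */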
	do {
		const uint8x16_t bytes1 = *p++;
		const uint8x16_t bytes2 = *p++;
		uint16x8_t tmp;

		v_s2 += v_s1;

		/* Vector Pairwise Add Long (u8 => u16) */
		tmp = vpaddlq_u8(bytes1);

		/* Vector Pairwise Add and Accumulate Long (u8 => u16) */
		tmp = vpadalq_u8(tmp, bytes2);

		/* Vector Pairwise Add and Accumulate Long (u16 => u32) */
		v_s1 = vpadalq_u16(v_s1, tmp);

		/* Vector Add Wide (u8 => u16) */
		v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
		v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
		v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
		v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));

	} while (p != end);

	/* Vector Saturating Shift Left (u32): scale by the segment size, 32 */
	v_s2 = vqshlq_n_u32(v_s2, 5);

	/* Vector Multiply Accumulate Long (u16 => u32) */
	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),  (uint16x4_t) { 32, 31, 30, 29 });
	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),  (uint16x4_t) { 24, 23, 22, 21 });
	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),  (uint16x4_t) { 16, 15, 14, 13 });
	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10,  9 });
	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d),  (uint16x4_t) {  8,  7,  6,  5 });
	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) {  4,  3,  2,  1 });
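	/*
	 * The descending weights reflect how many of the per-segment s1
	 * updates each byte participates in: the byte at offset j of a
	 * 32-byte segment is counted by the remaining (32 - j) updates.
	 * The shift by 5 above supplies the factor of 32 applied to each
	 * segment's starting s1 sum.
	 */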

	/* Fold the four lanes of each accumulator into the scalar sums */
	*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
	*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
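/*
 * For intuition, a scalar sketch of what one adler32_neon_chunk() call adds
 * (the helper name is illustrative, not part of this file).  Because v_s1
 * and v_s2 start at zero, the chunk contributes only the byte-dependent
 * terms; the caller accounts for the incoming s1/s2 and the modular
 * reductions:
 *
 *	static void adler32_scalar_chunk(const u8 *p, const u8 * const end,
 *					 u32 *s1, u32 *s2)
 *	{
 *		u32 sum = 0;
 *
 *		do {
 *			sum += *p++;	// running sum of the chunk's bytes
 *			*s2 += sum;	// byte i ends up counted (n - i) times
 *		} while (p != end);
 *		*s1 += sum;
 *	}
 */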
#  include "../adler32_vec_template.h"
#endif /* NEON implementation */

#ifdef DISPATCH
static inline adler32_func_t
arch_select_adler32_func(void)
{
	u32 features = get_cpu_features();

#ifdef DISPATCH_NEON
	if (features & ARM_CPU_FEATURE_NEON)
		return adler32_neon;
#endif
	return NULL;
}
#endif /* DISPATCH */
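/*
 * arch_select_adler32_func() exists for the generic dispatch layer.  A
 * minimal sketch of such a caller, assuming a scalar fallback named
 * adler32_generic and a cached function pointer adler32_impl (neither is
 * defined in this file):
 *
 *	static u32 dispatch(u32 adler, const void *buffer, size_t size)
 *	{
 *		adler32_func_t f = arch_select_adler32_func();
 *
 *		if (f == NULL)
 *			f = adler32_generic;	// assumed fallback
 *		adler32_impl = f;		// assumed cached pointer
 *		return f(adler, buffer, size);
 *	}
 */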

#endif  // GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_ARM_ADLER32_IMPL_H_