// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Origin: github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/hash/crc32/crc32_s390x.s

#include "textflag.h"

// Vector register range containing CRC-32 constants.
// The six registers V9..V14 are filled by a single VLM from the constant
// pool, so the order here must match the layout of the DATA tables below.

#define CONST_PERM_LE2BE        V9
#define CONST_R2R1              V10
#define CONST_R4R3              V11
#define CONST_R5                V12
#define CONST_RU_POLY           V13
#define CONST_CRC_POLY          V14


// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// One doubleword of a constant can be 0 to prevent that half from
// contributing to the result, or can be set to 1 to perform an XOR without
// the need for a separate VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//            IEEE: P'(x) = 0x0edb88320
//      Castagnoli: P'(x) = 0x082f63b78
//
// Each table defines 96 bytes (six 16-byte vectors); that is exactly what
// vectorizedBody loads with VLM. The GLOBL size of 144 is larger than the
// data defined here.


// IEEE polynomial constants
DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
DATA    ·crclecons+48(SB)/8, $0x0000000000000000
DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
DATA    ·crclecons+64(SB)/8, $0x0000000000000000
DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
DATA    ·crclecons+80(SB)/8, $0x0000000000000000
DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1

GLOBL    ·crclecons(SB),RODATA, $144

// Castagnoli polynomial constants
DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
DATA    ·crcclecons+48(SB)/8, $0x0000000000000000
DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
DATA    ·crcclecons+64(SB)/8, $0x0000000000000000
DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
DATA    ·crcclecons+80(SB)/8, $0x0000000000000000
DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1

GLOBL   ·crcclecons(SB),RODATA, $144

// func hasVectorFacility() bool
//
// Reports whether the vector facility can be used. It stores the facility
// list into a 24-byte scratch area on the stack with STFLE, tests the 0x40
// bit of byte 16 of that list, and then executes a vector load/extract pair
// and checks that the value written is read back.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
	MOVD    $x-24(SP), R1
	XC      $24, 0(R1), 0(R1) // clear the storage
	MOVD    $2, R0            // R0 is the number of double words stored -1
	WORD    $0xB2B01000       // STFLE 0(R1)
	XOR     R0, R0            // reset the value of R0
	MOVBZ   z-8(SP), R1       // byte 16 of the stored facility list
	AND     $0x40, R1
	BEQ     novector
vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB   $0, $0xF, V16     // write 0xF into byte element 0 of V16
	VLGVB   $0, V16, R1       // and read it back
	CMPBNE  R1, $0xF, novector
	MOVB    $1, ret+0(FP)     // have vx
	RET
novector:
	MOVB    $0, ret+0(FP)     // no vx
	RET


// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
//      R3:    Input buffer pointer, performance might be improved if the
//             buffer is on a doubleword boundary.
//      R4:    Length of the buffer, must be 64 bytes or greater. The body
//             faults deliberately if it is smaller. Data is consumed in
//             64-byte and then 16-byte chunks; any trailing bytes beyond
//             the last full 16-byte chunk are not processed.
//
// Register usage:
//
//      R5:     CRC-32 constant pool base pointer.
//      V0:     Initial CRC value and intermediate constants and results.
//      V1..V4: Data for CRC computation.
//      V5..V8: Next data chunks that are fetched from the input buffer.
//
//      V9..V14: CRC-32 constants.

// func vectorizedIEEE(crc uint32, p []byte) uint32
//
// Loads the arguments and the IEEE constant pool, then branches to the shared
// body. The body stores the result into this function's ret slot at 32(FP).
TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
	MOVD    p+8(FP), R3       // data pointer
	MOVD    p_len+16(FP), R4  // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD    $·crclecons(SB), R5
	BR      vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
//
// Same as vectorizedIEEE but selects the Castagnoli constant pool.
TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
	MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
	MOVD    p+8(FP), R3       // data pointer
	MOVD    p_len+16(FP), R4  // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD    $·crcclecons(SB), R5
	BR      vectorizedBody<>(SB)

// vectorizedBody is the shared implementation reached by BR from the two
// entry points above. On entry R2 = crc, R3 = &p[0], R4 = len(p) and
// R5 = constant pool base. It has no frame of its own ($0), so ret+32(FP)
// addresses the result slot of the entry point that branched here.
TEXT vectorizedBody<>(SB),NOSPLIT,$0
	XOR     $0xffffffff, R2 // NOTW R2
	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY

	// Load the initial CRC value into the rightmost word of V0
	VZERO   V0
	VLVGF   $3, R2, V0

	// Crash if the input size is less than 64 bytes. Without this check the
	// unconditional 64-byte load below would read past the end of the
	// buffer and the remaining-length counter would go negative.
	CMP     R4, $64
	BLT     crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM     0(R3), V1, V4    // 64-bytes into V1..V4

	// Reflect the data if the CRC operation is in the bit-reflected domain
	VPERM   V1, V1, CONST_PERM_LE2BE, V1
	VPERM   V2, V2, CONST_PERM_LE2BE, V2
	VPERM   V3, V3, CONST_PERM_LE2BE, V3
	VPERM   V4, V4, CONST_PERM_LE2BE, V4

	VX      V0, V1, V1        // V1 ^= CRC
	ADD     $64, R3           // BUF = BUF + 64
	ADD     $(-64), R4        // LEN = LEN - 64

	// Check remaining buffer size and jump to proper folding method
	CMP     R4, $64
	BLT     less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM     0(R3), V5, V8
	VPERM   V5, V5, CONST_PERM_LE2BE, V5
	VPERM   V6, V6, CONST_PERM_LE2BE, V6
	VPERM   V7, V7, CONST_PERM_LE2BE, V7
	VPERM   V8, V8, CONST_PERM_LE2BE, V8


	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in CONST_R2R1. The intermediate result is
	// then folded (accumulated) with the next data chunk in V5 and
	// stored in V1. Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG  CONST_R2R1, V1, V5, V1
	VGFMAG  CONST_R2R1, V2, V6, V2
	VGFMAG  CONST_R2R1, V3, V7, V3
	VGFMAG  CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD     $64, R3                  // BUF = BUF + 64
	ADD     $(-64), R4               // LEN = LEN - 64

	CMP     R4, $64
	BGE     fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG  CONST_R4R3, V1, V2, V1
	VGFMAG  CONST_R4R3, V1, V3, V1
	VGFMAG  CONST_R4R3, V1, V4, V1

	// Check whether at least one more full 16-byte chunk remains to fold
	CMP     R4, $16
	BLT     final_fold

fold_16bytes_loop:
	VL      0(R3), V2               // Load next data chunk
	VPERM   V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD     $16, R3
	ADD     $-16, R4

	// Process remaining data chunks
	CMP     R4, $16
	BGE     fold_16bytes_loop

final_fold:
	// V9 (CONST_PERM_LE2BE) is no longer needed as a permute mask from here
	// on and is reused to hold the byte-shift amount for VSRLB, which is
	// taken from byte element 7: 0x40 shifts by 8 bytes, 0x20 by 4 bytes.
	VLEIB   $7, $0x40, V9

	// Build the multiplier in V0: shift CONST_R4R3 right by 8 bytes so that
	// its leftmost doubleword (R4) lands in the rightmost doubleword, then
	// set the leftmost doubleword to 1.
	VSRLB   V9, CONST_R4R3, V0
	VLEIG   $0, $1, V0

	// V1 = (leftmost doubleword of V1 * 1) XOR (rightmost doubleword * R4)
	VGFMG   V0, V1, V1

	VLEIB   $7, $0x20, V9        // Shift by words
	VSRLB   V9, V1, V2           // Store remaining bits in V2
	VUPLLF  V1, V1               // Split rightmost doubleword
	VGFMAG  CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2


	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u. The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: The vector unpack instruction (VUPLLF) is used to spread words
	// into doublewords before each multiplication. The leftmost doubleword
	// of CONST_RU_POLY and of CONST_CRC_POLY is zero (see the constant
	// tables), so that half of each GF(2) product is zero and does not
	// contribute to the intermediate results.


	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF  V1, V2
	VGFMG   CONST_RU_POLY, V2, V2


	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
	// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in
	// V1. The final result is in word element 2 of V2.

	VUPLLF  V2, V2
	VGFMAG  CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF   $2, V2, R2
	XOR     $0xffffffff, R2 // NOTW R2
	MOVWZ   R2, ret+32(FP)
	RET

crash:
	// Input size is less than 64 bytes: fault deliberately by storing
	// through R0. R0 is expected to hold zero here (hasVectorFacility
	// restores it to zero after using it), so this is a store to address 0.
	MOVD    $0, (R0)