github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/hash/crc32/crc32_s390x.s

github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/hash/crc32/crc32_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // Vector register range containing CRC-32 constants
     8  
     9  #define CONST_PERM_LE2BE        V9
    10  #define CONST_R2R1              V10
    11  #define CONST_R4R3              V11
    12  #define CONST_R5                V12
    13  #define CONST_RU_POLY           V13
    14  #define CONST_CRC_POLY          V14
    15  
    16  
    17  // The CRC-32 constant block contains reduction constants to fold and
    18  // process particular chunks of the input data stream in parallel.
    19  //
    20  // Note that the constant definitions below are extended in order to compute
    21  // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
    22  // The rightmost doubleword can be 0 to prevent contribution to the result or
    23  // can be multiplied by 1 to perform an XOR without the need for a separate
    24  // VECTOR EXCLUSIVE OR instruction.
    25  //
    26  // The polynomials used are bit-reflected:
    27  //
    28  //            IEEE: P'(x) = 0x0edb88320
    29  //      Castagnoli: P'(x) = 0x082f63b78
    30  
    31  
    32  // IEEE polynomial constants: six 16-byte entries that vectorizedBody<> loads with a single VLM into V9..V14
    33  DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask (loaded into CONST_PERM_LE2BE)
    34  DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
    35  DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2 (loaded into CONST_R2R1)
    36  DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
    37  DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4 (loaded into CONST_R4R3)
    38  DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
    39  DATA    ·crclecons+48(SB)/8, $0x0000000000000000       // zero doubleword (first half of CONST_R5)
    40  DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
    41  DATA    ·crclecons+64(SB)/8, $0x0000000000000000       // zero doubleword (first half of CONST_RU_POLY)
    42  DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
    43  DATA    ·crclecons+80(SB)/8, $0x0000000000000000       // zero doubleword (first half of CONST_CRC_POLY)
    44  DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
    45  
    46  GLOBL    ·crclecons(SB),RODATA, $144 // NOTE(review): only the first 96 bytes are initialized above and loaded by VLM; confirm the 144-byte size is intended
    47  
    48  // Castagnoli polynomial constants: same layout as the IEEE table, loaded by vectorizedBody<> into V9..V14
    49  DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask (loaded into CONST_PERM_LE2BE)
    50  DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
    51  DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2 (loaded into CONST_R2R1)
    52  DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
    53  DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4 (loaded into CONST_R4R3)
    54  DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
    55  DATA    ·crcclecons+48(SB)/8, $0x0000000000000000      // zero doubleword (first half of CONST_R5)
    56  DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
    57  DATA    ·crcclecons+64(SB)/8, $0x0000000000000000      // zero doubleword (first half of CONST_RU_POLY)
    58  DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
    59  DATA    ·crcclecons+80(SB)/8, $0x0000000000000000      // zero doubleword (first half of CONST_CRC_POLY)
    60  DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
    61  
    62  GLOBL   ·crcclecons(SB),RODATA, $144 // NOTE(review): only the first 96 bytes are initialized above and loaded by VLM; confirm the 144-byte size is intended
    63  // hasVectorFacility returns true only if the STFLE facility list has the tested bit set AND a vector instruction round-trips a value.
    64  // func hasVectorFacility() bool
    65  TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
    66  	MOVD    $x-24(SP), R1     // R1 = address of the 24-byte scratch area in this frame
    67  	XC      $24, 0(R1), 0(R1) // clear the storage
    68  	MOVD    $2, R0            // R0 is the number of double words stored -1
    69  	WORD    $0xB2B01000       // STFLE 0(R1)
    70  	XOR     R0, R0            // reset the value of R0 (presumably because the Go toolchain expects R0 == 0 -- TODO confirm)
    71  	MOVBZ   z-8(SP), R1       // first byte of the third stored doubleword (facility bits 128..135)
    72  	AND     $0x40, R1         // keep only bit 0x40 of that byte (facility bit 129; vector facility per z/Architecture -- verify)
    73  	BEQ     novector          // bit clear: report false without touching any vector register
    74  vectorinstalled:                  // reached by fall-through only; nothing in this function branches here
    75  	// check if the vector instruction has been enabled
    76  	VLEIB   $0, $0xF, V16     // write 0xF into byte element 0 of V16
    77  	VLGVB   $0, V16, R1       // read byte element 0 of V16 back into R1
    78  	CMPBNE  R1, $0xF, novector // value did not round-trip: treat as no vector support
    79  	MOVB    $1, ret+0(FP) // have vx
    80  	RET
    81  novector:
    82  	MOVB    $0, ret+0(FP) // no vx
    83  	RET
    84  
    85  
    86  // The CRC-32 function(s) use these calling conventions:
    87  //
    88  // Parameters:
    89  //
    90  //      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
    91  //      R3:    Input buffer pointer, performance might be improved if the
    92  //             buffer is on a doubleword boundary.
    93  //      R4:    Length of the buffer, must be 64 bytes or greater.
    94  //
    95  // Register usage:
    96  //
    97  //      R5:     CRC-32 constant pool base pointer.
    98  //      V0:     Initial CRC value and intermediate constants and results.
    99  //      V1..V4: Data for CRC computation.
   100  //      V5..V8: Next data chunks that are fetched from the input buffer.
   101  //
   102  //      V9..V14: CRC-32 constants.
   103  
   104  // func vectorizedIEEE(crc uint32, p []byte) uint32
   105  TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
   106  	MOVD    $·crclecons(SB), R5  // R5 = base of the IEEE constant table
   107  	MOVD    p_len+16(FP), R4     // R4 = len(p)
   108  	MOVD    p+8(FP), R3          // R3 = pointer to the first byte of p
   109  	MOVWZ   crc+0(FP), R2        // R2 = incoming CRC, zero-extended
   110  
   111  	BR      vectorizedBody<>(SB) // tail-jump; the shared body writes the result and returns
   112  
   113  // func vectorizedCastagnoli(crc uint32, p []byte) uint32
   114  TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
   115  	// Pick the Castagnoli constant table, then place the Go arguments in R2..R4 for the shared body.
   116  	MOVD    $·crcclecons(SB), R5 // R5 = base of the Castagnoli constant table
   117  	MOVD    p_len+16(FP), R4     // R4 = len(p)
   118  	MOVD    p+8(FP), R3          // R3 = pointer to the first byte of p
   119  	MOVWZ   crc+0(FP), R2        // R2 = incoming CRC, zero-extended
   120  
   121  	BR      vectorizedBody<>(SB) // tail-jump; the shared body writes the result and returns
   122  // vectorizedBody<> is the shared CRC-32 folding routine; it is entered by BR with R2=crc, R3=buffer, R4=length, R5=constant table.
   123  TEXT vectorizedBody<>(SB),NOSPLIT,$0 // the first 64 bytes are loaded before any length check, so callers must pass at least 64 bytes
   124  	XOR     $0xffffffff, R2 // NOTW R2 (the result is inverted again at done)
   125  	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load the 96-byte constant table into V9..V14
   126  
   127  	// Load the initial CRC value into the rightmost word of V0
   128  	VZERO   V0
   129  	VLVGF   $3, R2, V0
   130  
   131  	// Load a 64-byte data chunk and XOR with CRC
   132  	VLM     0(R3), V1, V4    // 64-bytes into V1..V4
   133  
   134  	// Permute the bytes of each 16-byte chunk with the LE-to-BE mask (the polynomials used are bit-reflected)
   135  	VPERM   V1, V1, CONST_PERM_LE2BE, V1
   136  	VPERM   V2, V2, CONST_PERM_LE2BE, V2
   137  	VPERM   V3, V3, CONST_PERM_LE2BE, V3
   138  	VPERM   V4, V4, CONST_PERM_LE2BE, V4
   139  
   140  	VX      V0, V1, V1     // V1 ^= CRC
   141  	ADD     $64, R3        // BUF = BUF + 64
   142  	ADD     $(-64), R4     // LEN = LEN - 64
   143  
   144  	// Check remaining buffer size and jump to proper folding method
   145  	CMP     R4, $64
   146  	BLT     less_than_64bytes
   147  
   148  fold_64bytes_loop:
   149  	// Load the next 64-byte data chunk into V5 to V8
   150  	VLM     0(R3), V5, V8
   151  	VPERM   V5, V5, CONST_PERM_LE2BE, V5
   152  	VPERM   V6, V6, CONST_PERM_LE2BE, V6
   153  	VPERM   V7, V7, CONST_PERM_LE2BE, V7
   154  	VPERM   V8, V8, CONST_PERM_LE2BE, V8
   155  
   156  
   157  	// Perform a GF(2) multiplication of the doublewords in V1 with
   158  	// the R2/R1 reduction constants in CONST_R2R1.  The intermediate
   159  	// result is then folded (accumulated) with the next data chunk in
   160  	// V5 and stored in V1.  Repeat this step for the register contents
   161  	// in V2, V3, and V4 respectively.
   162  
   163  	VGFMAG  CONST_R2R1, V1, V5, V1
   164  	VGFMAG  CONST_R2R1, V2, V6, V2
   165  	VGFMAG  CONST_R2R1, V3, V7, V3
   166  	VGFMAG  CONST_R2R1, V4, V8 ,V4
   167  
   168  	// Adjust buffer pointer and length for next loop
   169  	ADD     $64, R3                  // BUF = BUF + 64
   170  	ADD     $(-64), R4               // LEN = LEN - 64
   171  
   172  	CMP     R4, $64
   173  	BGE     fold_64bytes_loop
   174  
   175  less_than_64bytes:
   176  	// Fold V1 to V4 into a single 128-bit value in V1 (using the R4/R3 constants)
   177  	VGFMAG  CONST_R4R3, V1, V2, V1
   178  	VGFMAG  CONST_R4R3, V1, V3, V1
   179  	VGFMAG  CONST_R4R3, V1, V4, V1
   180  
   181  	// Skip the 16-byte folding loop when fewer than 16 bytes remain
   182  	CMP R4, $16
   183  	BLT final_fold
   184  
   185  fold_16bytes_loop:
   186  	VL      0(R3), V2               // Load next data chunk
   187  	VPERM   V2, V2, CONST_PERM_LE2BE, V2
   188  
   189  	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
   190  
   191  	// Adjust buffer pointer and size for folding next data chunk
   192  	ADD     $16, R3
   193  	ADD     $-16, R4
   194  
   195  	// Process remaining data chunks
   196  	CMP     R4 ,$16
   197  	BGE     fold_16bytes_loop
   198  	// The buffer is not read past this point: a tail of fewer than 16 bytes is left unprocessed by this routine.
   199  final_fold:
   200  	VLEIB   $7, $0x40, V9         // V9 (the permute mask, no longer needed) now holds the shift amount; 0x40 selects 8 bytes
   201  	VSRLB   V9, CONST_R4R3, V0    // V0 = CONST_R4R3 shifted right by bytes: R4 moves into the rightmost doubleword
   202  	VLEIG   $0, $1, V0            // doubleword element 0 of V0 = 1
   203  	// Fold 128 -> 64 bits: the doublewords of V1 are multiplied by {1, R4} and the two products are XORed together.
   204  	VGFMG   V0, V1, V1
   205  
   206  	VLEIB   $7, $0x20, V9         // Shift by words
   207  	VSRLB   V9, V1, V2            // Store remaining bits in V2
   208  	VUPLLF  V1, V1                // Split rightmost doubleword
   209  	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
   210  
   211  
   212  	// The input values to the Barrett reduction are the degree-63 polynomial
   213  	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
   214  	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
   215  	// P(x).
   216  	//
   217  	// The Barrett reduction algorithm is defined as:
   218  	//
   219  	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
   220  	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
   221  	//    3. C(x)  = R(x) XOR T2(x) mod x^32
   222  	//
   223  	// Note: the vector unpack instruction (VUPLLF) spreads the words being
   224  	// multiplied into doubleword elements.  The first doubleword of
   225  	// CONST_RU_POLY and of CONST_CRC_POLY is zero in the constant tables, so
   226  	// that product does not contribute to the intermediate results.
   227  
   228  
   229  	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
   230  	VUPLLF  V1, V2
   231  	VGFMG   CONST_RU_POLY, V2, V2
   232  
   233  
   234  	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
   235  	// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in V1.
   236  	// The final result is in word element 2 of V2.
   237  
   238  	VUPLLF  V2 , V2
   239  	VGFMAG  CONST_CRC_POLY, V2, V1, V2
   240  
   241  done:
   242  	VLGVF   $2, V2, R2      // extract word element 2 of V2 into R2
   243  	XOR     $0xffffffff, R2 // NOTW R2
   244  	MOVWZ   R2, ret + 32(FP) // store the result at offset 32 of the argument frame (the entry points BR here with a $0 frame)
   245  	RET