// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Condition code masks. They are passed as the mask operand of the
// compare-and-branch (CGIJ, CIJ) and load-on-condition (LOCGR)
// instructions below.
#define EQ 8
#define NE 7

// General purpose register assignments.
//
// R_ZERO  constant zero (scalar path only)
// R_VAL   byte most recently loaded from the input (scalar path only)
// R_TMP   address, then value, of the "has vector facility" flag;
//         reused as the 0/1 increment in the scalar loop
// R_PTR   address of the next input byte(s) to examine
// R_LEN   number of input bytes; the vector path overwrites it with
//         (length mod 16) - 1
// R_CHAR  byte value being counted
// R_RET   address at which the result is stored
// R_ITER  number of whole 16-byte chunks (length / 16)
// R_CNT   running match count (scalar path only)
// R_MPTR  address of countbytemask
#define R_ZERO R0
#define R_VAL  R1
#define R_TMP  R2
#define R_PTR  R3
#define R_LEN  R4
#define R_CHAR R5
#define R_RET  R6
#define R_ITER R7
#define R_CNT  R8
#define R_MPTR R9

// Vector register assignments.
//
// V_ZERO  all zero bits; the second operand of the VSUM instructions
// V_CHAR  the byte being counted, replicated into all 16 byte elements
// V_MASK  0x01 in every byte element that is allowed to contribute
// V_VAL   the current input chunk, then its match count
// V_CNT   128-bit running total of matches
#define V_ZERO V0
#define V_CHAR V1
#define V_MASK V2
#define V_VAL  V3
#define V_CNT  V4

// Mask for trailing bytes in the vector implementation: 16 bytes of
// 0x01. It is loaded with the same 'load with length' index as the
// trailing input bytes so that only those byte elements keep a 0x01.
GLOBL countbytemask<>(SB), RODATA, $16
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
DATA countbytemask<>+8(SB)/8, $0x0101010101010101

// func Count(b []byte, c byte) int
//
// Count loads the data pointer and length of b, the zero-extended
// byte c and the address of the result slot into the registers that
// countbytebody expects, then branches (not calls) to it, so
// countbytebody's RET returns directly to Count's caller.
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-40
	LMG   b+0(FP), R_PTR, R_LEN // R_PTR = data pointer, R_LEN = length
	MOVBZ c+24(FP), R_CHAR
	MOVD  $ret+32(FP), R_RET    // address of the result, not its value
	BR    countbytebody<>(SB)

// func CountString(s string, c byte) int
//
// CountString is the string counterpart of Count: identical register
// setup, with argument offsets matching the two-word string header.
TEXT ·CountString(SB), NOSPLIT|NOFRAME, $0-32
	LMG   s+0(FP), R_PTR, R_LEN // R_PTR = data pointer, R_LEN = length
	MOVBZ c+16(FP), R_CHAR
	MOVD  $ret+24(FP), R_RET    // address of the result, not its value
	BR    countbytebody<>(SB)

// countbytebody counts the bytes equal to R_CHAR in the R_LEN bytes
// starting at R_PTR and stores the count as a 64-bit value at the
// address held in R_RET. It processes 16 bytes at a time when the
// vector facility flag is set and one byte at a time otherwise.
//
// input:
// R_PTR  = address of array of bytes
// R_LEN  = number of bytes in array
// R_CHAR = byte value to count (zero extended to register width)
// R_RET  = address of return value
//
// R_PTR and R_LEN are modified, and the remaining registers named in
// the assignment tables above are used as scratch.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD  $internal∕cpu·S390X+const_offsetS390xHasVX(SB), R_TMP
	MOVD  $countbytemask<>(SB), R_MPTR
	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

	// Start of vector code (have vector facility).
	//
	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
	// vector 'load with length' (VLL). It will be in the range [-1,14],
	// where -1 means there are no trailing bytes after the full chunks.
	// Also replicate c across a 16-byte vector and initialize V_ZERO.
	ANDW  $0xf, R_LEN
	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
	ADDW  $-1, R_LEN
	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

	// Jump to loop if we have more than 15 bytes to process.
	CGIJ $NE, R_ITER, $0, vxchunks

	// Fewer than 16 bytes in total. The length is known to be non-zero
	// here, so R_LEN is in the range [0,14].
	//
	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	// Loading the mask with the same index leaves 0x01 only in the byte
	// elements that received input; this relies on VLL zeroing the
	// elements it does not load, which would otherwise count as matches
	// when c is 0.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// After masking, each byte element is either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Sum the per-byte results into a 128-bit integer value. There is
	// nothing to accumulate with, so the sum goes straight into V_CNT.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}

	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

vxchunks:
	// At least one full 16-byte chunk. Every byte element of a full
	// chunk is valid input, so the mask can be generated rather than
	// loaded from memory.
	//
	// Load 0x01 into every byte element in the 16-byte mask vector.
	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
	VZERO  V_CNT      // initial uint128 count of 0

vxloop:
	// Load input bytes in 16-byte chunks.
	VL (R_PTR), V_VAL

	// Compare each byte in input chunk against byte to be counted.
	// After masking, each byte element is either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Increment input string address.
	MOVD $16(R_PTR), R_PTR

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

	// Repeat until all 16-byte chunks are done.
	BRCTG R_ITER, vxloop

	// Skip to end if there are no trailing bytes, i.e. the length was
	// a multiple of 16. The comparison is 32-bit because R_LEN was
	// computed with 32-bit arithmetic above.
	CIJ $EQ, R_LEN, $-1, vxret

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	// This overwrites the all-ones mask used by the loop with one that
	// covers only the trailing bytes.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// After masking, each byte element is either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL
	VN    V_MASK, V_VAL, V_VAL

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

vxret:
	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

novx:
	// Start of non-vector code (the vector facility not available).
	//
	// R_LEN still holds the full, non-zero byte count here: the chunk
	// count was written to R_ITER and the zero-length case has already
	// branched to ret0.
	//
	// Initialise counter and constant zero.
	MOVD $0, R_CNT
	MOVD $0, R_ZERO

loop:
	// Read 1-byte from input and compare.
	// Note: avoid putting LOCGR in critical path.
	// R_TMP is preset to 1 and only replaced by 0 on a mismatch.
	MOVBZ (R_PTR), R_VAL
	MOVD  $1, R_TMP
	MOVD  $1(R_PTR), R_PTR
	CMPW  R_VAL, R_CHAR
	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
	ADD   R_TMP, R_CNT       // accumulate 64-bit result

	// Repeat until all bytes have been checked; R_LEN is the loop counter.
	BRCTG R_LEN, loop

ret:
	// The scalar loop falls through to here; store its count.
	MOVD R_CNT, (R_RET)
	RET

ret0:
	// Zero-length input: the count is 0.
	MOVD $0, (R_RET)
	RET