// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && s390x

#include "textflag.h"

// condition code masks
#define EQ 8
#define NE 7

// register assignments
#define R_ZERO R0 // constant zero (scalar path only)
#define R_VAL  R1 // byte most recently loaded from the input (scalar path)
#define R_TMP  R2 // scratch: address/value of the VX flag, then 0/1 match flag
#define R_PTR  R3 // current input address
#define R_LEN  R4 // input length; becomes (len mod 16) - 1 in the vector path
#define R_CHAR R5 // byte value being counted
#define R_RET  R6 // address of the caller's result slot
#define R_ITER R7 // number of full 16-byte chunks left to process
#define R_CNT  R8 // running count (scalar path)
#define R_MPTR R9 // address of countbytemask<>

// vector register assignments
#define V_ZERO V0 // all-zero vector, used as the addend for VSUMB/VSUMQF
#define V_CHAR V1 // c replicated into all 16 byte lanes
#define V_MASK V2 // 0x01 in every lane that takes part in the count
#define V_VAL  V3 // input chunk / per-lane match results
#define V_CNT  V4 // 128-bit accumulated count

// mask for trailing bytes in vector implementation
GLOBL countbytemask<>(SB), RODATA, $16
DATA countbytemask<>+0(SB)/8, $0x0101010101010101
DATA countbytemask<>+8(SB)/8, $0x0101010101010101

// func CountSlice(b []byte, c byte) int
//
// Loads the slice pointer and length, the byte to count and the address
// of the result slot into the registers expected by countbytebody<>,
// then tail-branches to it.
TEXT ·CountSlice(SB), NOSPLIT|NOFRAME, $0-40
	LMG   b+0(FP), R_PTR, R_LEN // R_PTR = data pointer, R_LEN = len(b)
	MOVBZ c+24(FP), R_CHAR      // zero-extending byte load
	MOVD  $ret+32(FP), R_RET    // address of the result, not its value
	BR    countbytebody<>(SB)

// func Count(s string, c byte) int
//
// Same as CountSlice, but for the string argument layout (pointer and
// length only, so c and ret sit 8 bytes lower in the frame).
TEXT ·Count(SB), NOSPLIT|NOFRAME, $0-32
	LMG   s+0(FP), R_PTR, R_LEN // R_PTR = data pointer, R_LEN = len(s)
	MOVBZ c+16(FP), R_CHAR      // zero-extending byte load
	MOVD  $ret+24(FP), R_RET    // address of the result, not its value
	BR    countbytebody<>(SB)

// countbytebody counts the bytes equal to R_CHAR in the R_LEN bytes
// starting at R_PTR and stores the 64-bit count at (R_RET).
//
// input:
// R_PTR  = address of array of bytes
// R_LEN  = number of bytes in array
// R_CHAR = byte value to count (zero-extended to register width)
// R_RET  = address of return value
//
// Registers written: R0-R2, R7-R9 and V0-V4, plus R_PTR and R_LEN.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	// Take the ADDRESS of the vector-facility flag (note the $ prefix);
	// the flag byte itself is read by the MOVBZ (R_TMP) below. Without
	// the $ the flag's contents would be loaded and then dereferenced
	// as if they were a pointer.
	// NOTE(review): assumes ·hasVX is a bool variable declared elsewhere
	// in this package — confirm against its Go declaration.
	MOVD  $·hasVX(SB), R_TMP
	MOVD  $countbytemask<>(SB), R_MPTR
	CGIJ  $EQ, R_LEN, $0, ret0 // return if length is 0.
	SRD   $4, R_LEN, R_ITER    // R_ITER is the number of 16-byte chunks
	MOVBZ (R_TMP), R_TMP       // load bool indicating support for vector facility
	CGIJ  $EQ, R_TMP, $0, novx // jump to scalar code if the vector facility is not available

	// Start of vector code (have vector facility).
	//
	// Set R_LEN to be the length mod 16 minus 1 to use as an index for
	// vector 'load with length' (VLL). It will be in the range [-1,14].
	// Also replicate c across a 16-byte vector and initialize V_ZERO.
	ANDW  $0xf, R_LEN
	VLVGB $0, R_CHAR, V_CHAR // V_CHAR = [16]byte{c, 0, ..., 0, 0}
	VZERO V_ZERO             // V_ZERO = [1]uint128{0}
	ADDW  $-1, R_LEN
	VREPB $0, V_CHAR, V_CHAR // V_CHAR = [16]byte{c, c, ..., c, c}

	// Jump to loop if we have more than 15 bytes to process.
	CGIJ $NE, R_ITER, $0, vxchunks

	// Fewer than 16 bytes in total (and length != 0, checked above).
	//
	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_CNT // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}

	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

vxchunks:
	// Load 0x01 into every byte element in the 16-byte mask vector.
	VREPIB $1, V_MASK // V_MASK = [16]byte{1, 1, ..., 1, 1}
	VZERO  V_CNT      // initial uint128 count of 0

vxloop:
	// Load input bytes in 16-byte chunks.
	VL (R_PTR), V_VAL

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Increment input string address.
	MOVD $16(R_PTR), R_PTR

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

	// Repeat until all 16-byte chunks are done.
	BRCTG R_ITER, vxloop

	// Skip to end if there are no trailing bytes.
	// (R_LEN == -1 means the length was a multiple of 16.)
	CIJ $EQ, R_LEN, $-1, vxret

	// Load 1-15 bytes and corresponding mask.
	// Note: only the low 32-bits of R_LEN are used for the index.
	VLL R_LEN, (R_PTR), V_VAL
	VLL R_LEN, (R_MPTR), V_MASK

	// Compare each byte in input chunk against byte to be counted.
	// Each byte element will be set to either 0 (no match) or 1 (match).
	VCEQB V_CHAR, V_VAL, V_VAL // each byte will be either 0xff or 0x00
	VN    V_MASK, V_VAL, V_VAL // mask out most significant 7 bits

	// Accumulate matched byte count in 128-bit integer value.
	VSUMB  V_VAL, V_ZERO, V_VAL // [16]byte{x0, x1, ..., x14, x15} → [4]uint32{x0+x1+x2+x3, ..., x12+x13+x14+x15}
	VSUMQF V_VAL, V_ZERO, V_VAL // [4]uint32{x0, x1, x2, x3} → [1]uint128{x0+x1+x2+x3}
	VAQ    V_VAL, V_CNT, V_CNT  // accumulate

vxret:
	// Return rightmost (lowest) 64-bit part of accumulator.
	VSTEG $1, V_CNT, (R_RET)
	RET

novx:
	// Start of non-vector code (the vector facility not available).
	// R_LEN is still the full, non-zero byte count here.
	//
	// Initialise counter and constant zero.
	MOVD $0, R_CNT
	MOVD $0, R_ZERO

loop:
	// Read 1-byte from input and compare.
	// Note: avoid putting LOCGR in critical path.
	MOVBZ (R_PTR), R_VAL
	MOVD  $1, R_TMP
	MOVD  $1(R_PTR), R_PTR
	CMPW  R_VAL, R_CHAR
	LOCGR $NE, R_ZERO, R_TMP // select 0 if no match (1 if there is a match)
	ADD   R_TMP, R_CNT       // accumulate 64-bit result

	// Repeat until all bytes have been checked.
	BRCTG R_LEN, loop

ret:
	MOVD R_CNT, (R_RET)
	RET

ret0:
	MOVD $0, (R_RET)
	RET