// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Path: github.com/bir3/gocompiler@v0.9.2202/src/internal/bytealg/count_ppc64x.s

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"

// Count entry point.
//   R3 = pointer to the bytes
//   R4 = length
//   R6 = byte to count
// Puts the byte where the shared body expects it (R5 and V1) and
// branches there; the shared body performs the return.
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	MOVD	R6, R5			// shared body takes the byte in R5
	MTVRD	R6, V1			// copy the byte into the vector register
	VSPLTB	$7, V1, V1		// broadcast the byte across V1
	BR	countbytebody<>(SB)

// CountString entry point.
//   R3 = pointer to the bytes
//   R4 = length
//   R5 = byte to count (already in the register the shared body uses)
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	MTVRD	R5, V1			// copy the byte into the vector register
	VSPLTB	$7, V1, V1		// broadcast the byte across V1
	BR	countbytebody<>(SB)

// Shared counting body.
// On entry:
//   R3: address of the data
//   R4: length of the data
//   R5: byte to count
//   V1: byte to count, replicated across the vector
// On exit:
//   R3: number of matching bytes
//
// R18 carries the running total. The vector loop and the scalar tail
// accumulate popcounts of 0xFF/0x00 match masks, so the total is 8x the
// real count until it is shifted right by 3. The power10 tail counts
// matches directly, so that path rescales R18 before entering its tail.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18			// running total

#ifndef GOPPC64_power10
	// The scalar tail compares with CMPB, so spread the byte over all of R5.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5
#endif

	CMPU	R4, $32
	BLT	rem_lt32		// under 32 bytes: skip the vector loop

	XXLXOR	V4, V4, V4		// accumulator for the first 16B of each iteration
	XXLXOR	V5, V5, V5		// accumulator for the second 16B of each iteration
	MOVD	$16, R21		// index used for the second 16B load
	SRD	$5, R4, R20		// iteration count = length / 32
	MOVD	R20, CTR

	PCALIGN	$16
vec_loop:
	// Each pass handles 32 bytes as two independent 16-byte halves.
	LXVD2X	(R0)(R3), V0
	LXVD2X	(R21)(R3), V2
	VCMPEQUB	V0, V1, V0	// matching bytes become 0xFF, others 0
	VCMPEQUB	V2, V1, V2
	VPOPCNTD	V0, V0		// set bits per doubleword = 8 per matching byte
	VPOPCNTD	V2, V2
	VADDUDM	V0, V4, V4
	VADDUDM	V2, V5, V5
	ADD	$32, R3, R3
	BDNZ	vec_loop

	VADDUDM	V4, V5, V5		// combine the two accumulators
	MFVSRD	V5, R18			// first doubleword of the sum
	VSLDOI	$8, V5, V5, V5		// bring the other doubleword into position
	MFVSRD	V5, R21
	ADD	R21, R18		// R18 = 8x the matches found by the loop
	ANDCC	$31, R4, R4		// R4 = bytes left after the loop
	BEQ	scale_ret		// nothing left: rescale and return

#ifdef GOPPC64_power10
	SRD	$3, R18, R18		// power10 tail adds real counts, so rescale first

rem_lt32:				// 0 - 31 bytes remain
	CMP	R4, $16
	BLE	p10_le16
	LXV	0(R3), V0
	VCMPEQUB	V0, V1, V0
	VCNTMBB	V0, $1, R14		// sum bit 0 of every compare-result byte
	SRD	$56, R14, R14		// VCNTMBB leaves its result shifted; undo that
	ADD	R14, R18
	ADD	$16, R3
	ANDCC	$15, R4, R4

p10_le16:				// at most 16 bytes remain
	SLD	$56, R4, R6		// shift the length into the position LXVLL is given
	LXVLL	R3, R6, V0
	VCMPEQUB	V0, V1, V0
	VCLRRB	V0, R4, V0		// drop matches from the 16-R4 bytes not being compared
	VCNTMBB	V0, $1, R14		// sum bit 0 of every compare-result byte
	SRD	$56, R14, R14		// VCNTMBB leaves its result shifted; undo that
	ADD	R14, R18, R3		// final count goes straight to the result register
	RET

#else
rem_lt32:				// 0 - 31 bytes remain
	CMP	R4, $16
	BLT	rem_lt16
	MOVD	0(R3), R12		// first doubleword
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18
	MOVD	8(R3), R14		// second doubleword
	CMPB	R14, R5, R14
	POPCNTD	R14, R14
	ADD	R14, R18
	ADD	$16, R3
	ADD	$-16, R4

rem_lt16:				// 0 - 15 bytes remain
	CMP	R4, $8
	BLT	rem_lt8
	MOVD	0(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18
	ADD	$8, R3
	ADD	$-8, R4

rem_lt8:				// 0 - 7 bytes remain
	CMP	R4, $4
	BLT	rem_lt4
	MOVWZ	0(R3), R12
	CMPB	R12, R5, R12
	SLD	$32, R12, R12		// discard matches from bytes that were not loaded
	POPCNTD	R12, R12
	ADD	R12, R18
	ADD	$4, R3
	ADD	$-4, R4

rem_lt4:				// 0 - 3 bytes remain
	CMP	R4, $2
	BLT	rem_lt2
	MOVHZ	0(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12		// discard matches from bytes that were not loaded
	POPCNTD	R12, R12
	ADD	R12, R18
	ADD	$2, R3
	ADD	$-2, R4

rem_lt2:				// 0 - 1 bytes remain
	CMP	R4, $1
	BLT	scale_ret
	MOVBZ	0(R3), R12
	CMPB	R12, R5, R12
	ANDCC	$0x8, R12, R12		// a match contributes 8, in line with the 8x scaling
	ADD	R12, R18
#endif

scale_ret:				// no tail bytes left to examine
	SRD	$3, R18, R3		// total is 8x too large; divide by 8 for the result
	RET