github.com/primecitizens/pcz/std@v0.2.1/core/mem/equal_ppc64x.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095

// TODO: At writing, ISEL and BC do not support CR bit type arguments,
// define them here for readability.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0

// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC 4, CR6LT, (LR)
#define BEQLR BC 12, CR0EQ, (LR)

// Equal(a, b unsafe.Pointer, size uintptr) bool
TEXT ·Equal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// R3 = a
	// R4 = b
	// R5 = size
	BR	memeqbody<>(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
	// R3 = a
	// R4 = b
	CMP	R3, R4
	BEQ	eq
	MOVD	8(R11), R5	// compiler stores size at offset 8 in the closure
	BR	memeqbody<>(SB)
eq:
	MOVD	$1, R3
	RET

// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3, R8		// Move s1 into R8
	ADD	R5, R3, R9	// &s1[len(s1)]
	ADD	R5, R4, R10	// &s2[len(s2)]
	MOVD	$1, R11
	CMP	R5, $16		// Use GPR checks for len <= 16
	BLE	check0_16
	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
	BLE	check17_32	// Do a pair of overlapping VSR compares
	CMP	R5, $64
	BLE	check33_64	// Hybrid check + overlap compare.

setup64:
	SRD	$6, R5, R6	// number of 64 byte chunks to compare
	MOVD	R6, CTR
	MOVD	$16, R14	// index for VSX loads and stores
	MOVD	$32, R15
	MOVD	$48, R16
	ANDCC	$0x3F, R5, R5	// len%64==0?

	PCALIGN	$32
loop64:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2	// compare, setting CR6
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$64, R8		// bump up to next 64
	ADD	$64, R4
	BDNZ	loop64

	ISEL	$CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
	BEQLR				// return if no tail.
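
	// The tail is 1..63 bytes. Rather than branch on the exact count,
	// back R8/R4 up to &sX[len-64] and repeat the same four 16-byte
	// compares; bytes overlapping the region already verified by loop64
	// simply compare equal a second time. For example, with len == 100,
	// loop64 covered [0,64) and this block covers [36,100).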
	ADD	$-64, R9, R8	// &s1[len(s1)-64]
	ADD	$-64, R10, R4	// &s2[len(s2)-64]
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R11, R0, R3
	RET

check33_64:
	// Bytes 0-15
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$16, R8
	ADD	$16, R4

	// Bytes 16-31
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6

	// A little tricky, but point R4,R8 to &sX[len-32],
	// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
	ADD	$-32, R9, R8
	ADD	$-32, R10, R4
	// Fallthrough

check17_32:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R11, R0, R5

	// Load sX[len(sX)-16:len(sX)] and compare.
	ADD	$-16, R9
	ADD	$-16, R10
	LXVD2X	(R9+R0), V0
	LXVD2X	(R10+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R5, R0, R3
	RET

check0_16:
	CMP	R5, $8
	BLT	check0_7
	// Load sX[0:7] and compare.
	MOVD	(R8), R6
	MOVD	(R4), R7
	CMP	R6, R7
	ISEL	$CR0EQ, R11, R0, R5
	// Load sX[len(sX)-8:len(sX)] and compare.
	MOVD	-8(R9), R6
	MOVD	-8(R10), R7
	CMP	R6, R7
	ISEL	$CR0EQ, R5, R0, R3
	RET

check0_7:
	CMP	R5, $0
	MOVD	$1, R3
	BEQLR		// return if len == 0

	// Check < 8B loads with a single compare, but select the load address
	// such that it cannot cross a page boundary. Load a few bytes from the
	// lower address if that does not cross the lower page. Or, load a few
	// extra bytes from the higher addresses. And align those values
	// consistently in register as either address may have differing
	// alignment requirements.
	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
	ANDCC	$PAGE_OFFSET, R4, R9
	SUBC	R5, $8, R12		// 8-len
	SLD	$3, R12, R14		// (8-len)*8
	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
	CMPU	R9, R12, CR0
	SUB	R12, R8, R6		// compute lower load address
	SUB	R12, R4, R9
	ISEL	$CR1LT, R8, R6, R8	// R8 = (&s1 & PAGE_OFFSET) < 8-len ? R8 (&s1) : R6 (&s1 - (8-len))
	ISEL	$CR0LT, R4, R9, R4	// Similar for s2
	MOVD	(R8), R15
	MOVD	(R4), R16
	SLD	R14, R15, R7
	SLD	R14, R16, R17
	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
	SRD	R14, R17, R17
	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
	SRD	R14, R16, R9
#ifdef GOARCH_ppc64le
	ISEL	$CR1LT, R7, R6, R8	// Choose the correct len bytes to compare based on alignment
	ISEL	$CR0LT, R17, R9, R4
#else
	ISEL	$CR1LT, R6, R7, R8
	ISEL	$CR0LT, R9, R17, R4
#endif
	CMP	R4, R8
	ISEL	$CR0EQ, R11, R0, R3
	RET
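
// Worked example of the check0_7 masking (illustrative; assumes ppc64le
// and a hypothetical len == 3, so R12 == 8-len == 5 and R14 == 40 shift
// bits):
//   - If &s1 sits fewer than 5 bytes into its page (say page offset 1),
//     the 8-byte load must use &s1 itself: s1[0..2] land in the low 3
//     bytes with 5 garbage bytes above them, and the SLD/SRD pair by
//     R14 bits clears that garbage.
//   - Otherwise the load uses &s1 - 5: s1[0..2] land in the high 3
//     bytes, and a single SRD by R14 bits shifts them down zero-extended.
// Either way the valid bytes end up zero-extended in a GPR, and with s2
// normalized the same way a single CMP decides equality.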