// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The parentheses in the constraint below are required: && binds tighter
// than ||, so "pcz && ppc64 || ppc64le" would also select this file for
// ppc64le builds that do not set the pcz tag.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func Clear(ptr unsafe.Pointer, n uintptr)
//
// Clear zeroes the n bytes starting at ptr.
//
// Arguments arrive in registers: R3 = ptr, R4 = n. Nothing is returned.
// All scalar stores use R0 as the source, so the routine relies on R0
// holding zero on entry (the Go ppc64 register convention).
//
// Strategy, chosen by n:
//   - n < 8:          byte-at-a-time loop (zerotail).
//   - 8 <= n < 512:   64-byte groups of four 16-byte vector stores, then
//                     32, 16 and 8 byte steps, then the byte loop.
//   - n >= 512:       bring ptr up to 128-byte alignment (1/2/4/8 byte
//                     scalar stores followed by 16-byte vector stores),
//                     clear with DCBZ 512 bytes per iteration, then 128
//                     bytes per iteration, and finish any remainder below
//                     128 bytes on the small-size path.
//
// Each DCBZ is treated as clearing 128 bytes, which is why the pointer is
// first aligned to 128.
//
// Registers written: R3-R11, R14-R17, CTR, CR0, CR1 and VS32.
TEXT ·Clear<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
	// R3 = ptr
	// R4 = n

	// Determine if there are doublewords to clear
check:
	ANDCC $7, R4, R5  // R5: leftover bytes to clear
	SRD   $3, R4, R6  // R6: double words to clear
	CMP   R6, $0, CR1 // CR1[EQ] set if no double words

	BC    12, 6, nozerolarge // only single bytes (branch if CR1[EQ])
	CMP   R4, $512
	BLT   under512           // special case for < 512
	ANDCC $127, R3, R8       // check for 128 alignment of address
	BEQ   zero512setup       // already aligned: go straight to the DCBZ setup

	ANDCC $7, R3, R15
	BEQ   zero512xsetup // at least 8 byte aligned

	// zero bytes up to 8 byte alignment
	// Each step tests one low address bit and, if set, stores that many
	// bytes so the bit is clear afterwards.

	ANDCC $1, R3, R15 // check for byte alignment
	BEQ   byte2
	MOVB  R0, 0(R3)   // zero 1 byte
	ADD   $1, R3      // bump ptr by 1
	ADD   $-1, R4

byte2:
	ANDCC $2, R3, R15 // check for 2 byte alignment
	BEQ   byte4
	MOVH  R0, 0(R3)   // zero 2 bytes
	ADD   $2, R3      // bump ptr by 2
	ADD   $-2, R4

byte4:
	ANDCC $4, R3, R15   // check for 4 byte alignment
	BEQ   zero512xsetup
	MOVW  R0, 0(R3)     // zero 4 bytes
	ADD   $4, R3        // bump ptr by 4
	ADD   $-4, R4
	BR    zero512xsetup // ptr should now be 8 byte aligned

under512:
	// R6 holds n/8 here, so R7 = n/64. The BEQ below tests the result of
	// this SRDCC; the XXLXOR in between only writes VS32.
	SRDCC  $3, R6, R7       // 64 byte chunks?
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
	BEQ    lt64gt8

	// Prepare to clear 64 bytes at a time.
	// Entered with R7 = number of 64 byte chunks and VS32 already zeroed.

zero64setup:
	DCBTST (R3)    // prepare data cache
	MOVD   R7, CTR // number of 64 byte chunks
	MOVD   $16, R8 // index registers for the 2nd, 3rd and 4th store
	MOVD   $32, R16
	MOVD   $48, R17

zero64:
	STXVD2X VS32, (R3+R0) // store 16 bytes
	STXVD2X VS32, (R3+R8)
	STXVD2X VS32, (R3+R16)
	STXVD2X VS32, (R3+R17)
	ADD     $64, R3
	ADD     $-64, R4
	BDNZ    zero64        // dec ctr, br zero64 if ctr not 0
	SRDCC   $3, R4, R6    // remaining doublewords
	BEQ     nozerolarge

	// Fewer than 64 bytes remain: peel off 32, 16 and 8 bytes in turn.
lt64gt8:
	CMP     R4, $32
	BLT     lt32gt8
	MOVD    $16, R8
	STXVD2X VS32, (R3+R0)
	STXVD2X VS32, (R3+R8)
	ADD     $-32, R4
	ADD     $32, R3
lt32gt8:
	CMP     R4, $16
	BLT     lt16gt8
	STXVD2X VS32, (R3+R0)
	ADD     $16, R3
	ADD     $-16, R4
lt16gt8:
	CMP  R4, $8
	BLT  nozerolarge
	MOVD R0, 0(R3)
	ADD  $8, R3
	ADD  $-8, R4

nozerolarge:
	ANDCC $7, R4, R5 // any remaining bytes
	BC    4, 1, LR   // ble lr: return when no tail bytes are left

zerotail:
	MOVD R5, CTR // set up to clear tail bytes

zerotailloop:
	MOVB R0, 0(R3)    // clear single bytes
	ADD  $1, R3
	BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
	RET

	// Large (>= 512 byte) path, ptr is 8 byte aligned on entry.
zero512xsetup: // 512 chunk with extra needed
	ANDCC $8, R3, R11    // 8 byte alignment?
	BEQ   zero512setup16
	MOVD  R0, 0(R3)      // clear 8 bytes
	ADD   $8, R3         // update ptr to next 8
	ADD   $-8, R4        // dec count by 8

	// ptr is now 16 byte aligned, so the distance to the next 128 byte
	// boundary is a whole number of 16 byte stores.
zero512setup16:
	ANDCC $127, R3, R14 // < 128 byte alignment
	BEQ   zero512setup  // handle 128 byte alignment
	MOVD  $128, R15
	SUB   R14, R15, R14 // find increment to 128 alignment (R14 = 128 - R14)
	SRD   $4, R14, R15  // number of 16 byte chunks

zero512presetup:
	MOVD   R15, CTR         // loop counter of 16 bytes
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)

zero512preloop: // clear up to 128 alignment
	STXVD2X VS32, (R3+R0) // clear 16 bytes
	ADD     $16, R3       // update ptr
	ADD     $-16, R4      // dec count
	BDNZ    zero512preloop

zero512setup: // setup for dcbz loop
	CMP     R4, $512   // check if at least 512
	BLT     remain
	SRD     $9, R4, R8 // loop count for 512 chunks
	MOVD    R8, CTR    // set up counter
	MOVD    $128, R9   // index regs for 128 bytes
	MOVD    $256, R10
	MOVD    $384, R11
	PCALIGN $32

zero512:
	DCBZ  (R3+R0)  // clear first chunk
	DCBZ  (R3+R9)  // clear second chunk
	DCBZ  (R3+R10) // clear third chunk
	DCBZ  (R3+R11) // clear fourth chunk
	ADD   $512, R3
	BDNZ  zero512
	// R4 is not decremented inside the loop; reduce it once here to the
	// byte count left after all whole 512 byte chunks.
	ANDCC $511, R4

remain:
	CMP  R4, $128 // check if 128 byte chunks left
	BLT  smaller
	DCBZ (R3+R0)  // clear 128
	ADD  $128, R3
	ADD  $-128, R4
	BR   remain

smaller:
	ANDCC  $127, R4, R7 // find leftovers
	BEQ    done
	// The BLT below tests this CMP; the XXLXOR in between only writes
	// VS32, which both branch targets expect to be zero.
	CMP    R7, $64      // more than 64, do 64 at a time
	XXLXOR VS32, VS32, VS32
	BLT    lt64gt8      // less than 64
	SRD    $6, R7, R7   // set up counter for 64
	BR     zero64setup

done:
	RET