// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && arm64

#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func Clear(ptr unsafe.Pointer, n uintptr)
// Also called from assembly in sys_windows_arm64.s without g (but using Go stack convention).
//
// Clear stores n zero bytes starting at ptr.
//
// Registers:
//   R0  in: ptr; advanced as the destination cursor
//   R1  in: n;   reused as the remaining byte count
//   R3, R4, R5, R6, R7, R9: scratch
//   Condition flags are modified. No stack frame is used ($0-16).
//
// Strategy:
//   n <  16   two (possibly overlapping) stores anchored at each end of
//             the buffer, or a byte plus a halfword when n < 4
//   n == 16   a single 16-byte STP
//   n >  16   one unaligned STP to reach 16-byte alignment, then
//             - fewer than 64 bytes left:  tail63
//             - 64..127 bytes left, or ZVA unusable: 64-byte STP loop
//             - otherwise: DC ZVA one block at a time, STPs for head/tail
//
// block_size<> caches the DC ZVA probe result:
//   0         not probed yet
//   bit 31=1  ZVA must not be used (prohibited, or block smaller than 64)
//   other     ZVA block size in bytes (a multiple of 64)
TEXT ·Clear<ABIInternal>(SB),NOSPLIT,$0-16
	CMP	$16, R1
	// If n is equal to 16 bytes, use zero_exact_16 to zero
	BEQ	zero_exact_16

	// If n is greater than 16 bytes, use zero_by_16 to zero
	BHI	zero_by_16

	// n is less than 16 bytes
	// R7 = one past the last byte, so stores can be anchored at the end.
	ADD	R1, R0, R7
	TBZ	$3, R1, less_than_8
	// 8 <= n < 16: first 8 and last 8 bytes (they may overlap).
	MOVD	ZR, (R0)
	MOVD	ZR, -8(R7)
	RET

less_than_8:
	TBZ	$2, R1, less_than_4
	// 4 <= n < 8: first 4 and last 4 bytes (they may overlap).
	MOVW	ZR, (R0)
	MOVW	ZR, -4(R7)
	RET

less_than_4:
	CBZ	R1, ending
	// n is 1, 2 or 3: first byte, plus the last two bytes when n >= 2.
	MOVB	ZR, (R0)
	TBZ	$1, R1, ending
	MOVH	ZR, -2(R7)

ending:
	RET

zero_exact_16:
	// n is exactly 16 bytes
	STP	(ZR, ZR), (R0)
	RET

zero_by_16:
	// n greater than 16 bytes, check if the start address is aligned
	// R4 = number of bytes up to the next 16-byte boundary (0 if aligned).
	NEG	R0, R4
	ANDS	$15, R4, R4
	// Try zeroing using zva if the start address is aligned with 16
	BEQ	try_zva

	// Non-aligned store
	STP	(ZR, ZR), (R0)
	// Make the destination aligned
	SUB	R4, R1, R1
	ADD	R4, R0, R0
	B	try_zva

tail_maybe_long:
	// Reached after the ZVA loop; a large block size can leave >= 64 bytes.
	CMP	$64, R1
	BHS	no_zva

tail63:
	// Only bits 0-5 of R1 are used from here on.
	// Bits 4-5 select how many whole 16-byte chunks (0..3) to store by
	// falling through the STP.P ladder below.
	ANDS	$48, R1, R3
	BEQ	last16
	CMPW	$32, R3
	BEQ	last48
	BLT	last32
	STP.P	(ZR, ZR), 16(R0)
last48:
	STP.P	(ZR, ZR), 16(R0)
last32:
	STP.P	(ZR, ZR), 16(R0)
	// The last store length is at most 16, so it is safe to use
	// stp to write last 16 bytes
last16:
	ANDS	$15, R1, R1
	CBZ	R1, last_end
	// Store the 16 bytes that end exactly at the end of the buffer.
	ADD	R1, R0, R0
	STP	(ZR, ZR), -16(R0)
last_end:
	RET

no_zva:
	// Requires R1 >= 64. R0 is biased by -16 so that the pre-indexed
	// STP.W can advance it by 64 per iteration; R1 is biased by -64 so
	// the loop continues while the biased count is still >= 0.
	SUB	$16, R0, R0
	SUB	$64, R1, R1

loop_64:
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	STP	(ZR, ZR), 48(R0)
	STP.W	(ZR, ZR), 64(R0)
	SUBS	$64, R1, R1
	BGE	loop_64
	// R1 is negative here, but its low 6 bits equal the bytes left over.
	ANDS	$63, R1, ZR
	// Undo the -16 bias on R0 (ADD leaves the flags from ANDS intact).
	ADD	$16, R0, R0
	BNE	tail63
	RET

try_zva:
	// Try using the ZVA feature to zero entire cache lines
	// It is not meaningful to use ZVA if the block size is less than 64,
	// so make sure that n is greater than or equal to 64
	CMP	$63, R1
	BLE	tail63

	CMP	$128, R1
	// Ensure n is at least 128 bytes, so that there is enough to copy after
	// alignment.
	BLT	no_zva
	// Check if ZVA is allowed from user code, and if so get the block size
	MOVW	block_size<>(SB), R5
	TBNZ	$31, R5, no_zva
	CBNZ	R5, zero_by_line
	// DCZID_EL0 bit assignments
	// [63:5] Reserved
	// [4]    DZP, if bit set DC ZVA instruction is prohibited, else permitted
	// [3:0]  log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
	MRS	DCZID_EL0, R3
	TBZ	$4, R3, init

zva_unusable:
	// ZVA not available, or its block size is below 64 bytes.
	// Cache a negative value so later calls go straight to no_zva.
	MOVW	$~0, R5
	MOVW	R5, block_size<>(SB)
	B	no_zva

init:
	// R5 = block size in bytes = 4 << DCZID_EL0[3:0]
	MOVW	$4, R9
	ANDW	$15, R3, R5
	LSLW	R5, R9, R5

	ANDS	$63, R5, R9
	// Block size is less than 64.
	// Decide this BEFORE caching: a cached small size would be non-zero
	// and non-negative, so later calls would jump to zero_by_line and
	// bypass this check.
	BNE	zva_unusable
	MOVW	R5, block_size<>(SB)

zero_by_line:
	// R5 = ZVA block size, R1 = bytes remaining, R0 is 16-byte aligned.
	CMP	R5, R1
	// Not enough memory to reach alignment
	BLO	no_zva
	// R6 = block size - 1 (alignment mask), R4 = bytes up to the next
	// block boundary.
	SUB	$1, R5, R6
	NEG	R0, R4
	ANDS	R6, R4, R4
	// Already aligned
	BEQ	aligned

	// check there is enough to copy after alignment
	SUB	R4, R1, R3

	// Check that the remaining length to ZVA after alignment
	// is greater than 64.
	// If R3 >= 64 the CCMP compares R3 with the block size; otherwise it
	// forces flags that make BLT taken. Net effect: continue only when
	// R3 >= 64 and R3 >= block size.
	CMP	$64, R3
	CCMP	GE, R3, R5, $10  // condition code GE, NZCV=0b1010
	BLT	no_zva

	// We now have at least 64 bytes to zero, update n
	MOVD	R3, R1

loop_zva_prolog:
	// Zero 64 bytes per iteration until the block boundary has been
	// passed. This may run up to 64 bytes beyond the boundary, which is
	// in bounds because at least 64 bytes remain after it.
	STP	(ZR, ZR), (R0)
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	SUBS	$64, R4, R4
	STP	(ZR, ZR), 48(R0)
	ADD	$64, R0, R0
	BGE	loop_zva_prolog

	// R4 is negative here: step R0 back onto the block boundary.
	ADD	R4, R0, R0

aligned:
	// Bias R1 by one block so the loop runs while the SUBS does not borrow.
	SUB	R5, R1, R1

loop_zva:
	WORD	$0xd50b7420 // DC ZVA, R0
	ADD	R5, R0, R0
	SUBS	R5, R1, R1
	BHS	loop_zva
	// R1 has wrapped; masking with (block size - 1) recovers the bytes
	// that are still left to zero.
	ANDS	R6, R1, R1
	BNE	tail_maybe_long
	RET

GLOBL block_size<>(SB), NOPTR, $8