github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_arm64.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && arm64

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func Move(to, from unsafe.Pointer, n uintptr)
TEXT ·Move<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes.
	CMP	$16, R2
	BLE	copy16

	// Large copies.
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy more than 128 bytes.
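// Overlap is detected by computing dst-src as an unsigned value: when it is
// smaller than the count, a forward copy would overwrite source bytes that
// have not been read yet, so the copy runs backward from the end instead
// (copy_long_backward). For copies of at least 1024 bytes, either the loads
// or the stores are made 16-byte aligned, chosen via arm64UseAlignedLoads
// from CPU feature detection; for shorter large copies the realignment
// offset computed below is zero, so src and dst are left as-is.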
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)   // Load A
	AND	$15, R7, R14       // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3        // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)   // Load B
	STP	(R12, R13), (R0)   // Store A
	LDP	32(R1), (R8, R9)   // Load C
	LDP	48(R1), (R10, R11) // Load D
	LDP.W	64(R1), (R12, R13) // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end.
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)   // Store B
	LDP	16(R1), (R6, R7)   // Load B (next iteration)
	STP	(R8, R9), 32(R3)   // Store C
	LDP	32(R1), (R8, R9)   // Load C
	STP	(R10, R11), 48(R3) // Store D
	LDP	48(R1), (R10, R11) // Load D
	STP.W	(R12, R13), 64(R3) // Store E
	LDP.W	64(R1), (R12, R13) // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15) // Load F
	STP	(R6, R7), 16(R3)    // Store B
	LDP	-48(R4), (R6, R7)   // Load G
	STP	(R8, R9), 32(R3)    // Store C
	LDP	-32(R4), (R8, R9)   // Load H
	STP	(R10, R11), 48(R3)  // Store D
	LDP	-16(R4), (R10, R11) // Load I
	STP	(R12, R13), 64(R3)  // Store E
	STP	(R14, R15), -64(R5) // Store F
	STP	(R6, R7), -48(R5)   // Store G
	STP	(R8, R9), -32(R5)   // Store H
	STP	(R10, R11), -16(R5) // Store I
	RET

// Large backward copy for overlapping copies.
// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
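
// The Go-level sketch below is illustrative only: it is not part of this
// package, the helper name "move" is hypothetical, and it omits the
// size-class splits, the 16-byte realignment, and the software pipelining.
// It shows only the direction decision that backward_check implements.
//
//	func move(dst, src unsafe.Pointer, n uintptr) {
//		if n == 0 || dst == src {
//			return
//		}
//		d := unsafe.Slice((*byte)(dst), n)
//		s := unsafe.Slice((*byte)(src), n)
//		if uintptr(dst)-uintptr(src) < n {
//			// dst lies inside [src, src+n): copy backward so no source
//			// byte is overwritten before it has been read.
//			for i := n; i > 0; i-- {
//				d[i-1] = s[i-1]
//			}
//			return
//		}
//		// Safe to copy forward.
//		for i := uintptr(0); i < n; i++ {
//			d[i] = s[i]
//		}
//	}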