github.com/ice-blockchain/go/src@v0.0.0-20240403114104-1564d284e521/runtime/memmove_arm64.s

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

	// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
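	// For copies of at least 1024 bytes, runtime·arm64UseAlignedLoads selects
	// whether the load side (src) or the store side (dst) is 16-byte aligned
	// before entering the 64-byte software pipelined loop. If the regions
	// overlap such that dst-src < count (unsigned), the copy is done backward
	// from the end instead (copy_long_backward).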
copy_long:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)     // Load  A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load  B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load  C
	LDP	48(R1), (R10, R11)   // Load  D
	LDP.W	64(R1), (R12, R13)   // Load  E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)     // Store B
	LDP	16(R1), (R6, R7)     // Load  B (next iteration)
	STP	(R8, R9), 32(R3)     // Store C
	LDP	32(R1), (R8, R9)     // Load  C
	STP	(R10, R11), 48(R3)   // Store D
	LDP	48(R1), (R10, R11)   // Load  D
	STP.W	(R12, R13), 64(R3)   // Store E
	LDP.W	64(R1), (R12, R13)   // Load  E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load  F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load  G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load  H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load  I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
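	// Note on the backward path above: it mirrors the forward copy. The last
	// 16 bytes are copied first, srcend (R4) or dstend (R5) is realigned down
	// to 16 bytes, the pipelined loop then walks backward in 64-byte steps,
	// and the final (lowest) 64 bytes are copied from the start of the buffer,
	// so no byte-granular tail loop is needed.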