// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R0
	MOVD	from+8(FP), R1
	MOVD	n+16(FP), R2
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4   // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy more than 128 bytes.
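// For copies below 1024 bytes, R7 and R8 stay zero, so the realignment
// offset R14 computed below is zero and no pointer adjustment happens. For
// larger copies, the realignment offset comes from whichever stream should
// be aligned: e.g. when runtime·arm64UseAlignedLoads is 0, R7 holds dst, so
// R14 = dst&15; src, dst and count are all adjusted by R14, the unaligned
// head is covered by the initial 16-byte Load A/Store A pair, and every STP
// in loop64 then lands on a 16-byte-aligned address.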
copy_long:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)     // Load A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load C
	LDP	48(R1), (R10, R11)   // Load D
	LDP.W	64(R1), (R12, R13)   // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)     // Store B
	LDP	16(R1), (R6, R7)     // Load B (next iteration)
	STP	(R8, R9), 32(R3)     // Store C
	LDP	32(R1), (R8, R9)     // Load C
	STP	(R10, R11), 48(R3)   // Store D
	LDP	48(R1), (R10, R11)   // Load D
	STP.W	(R12, R13), 64(R3)   // Store E
	LDP.W	64(R1), (R12, R13)   // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
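// Note on the backward path: backward_check branches to copy_long_backward
// only when 0 < dst-src < count (unsigned), i.e. when the destination starts
// inside the source range and forward stores would overwrite source bytes
// that have not been read yet. The backward loop walks down from srcend and
// dstend, and copy64_from_start mirrors copy64_from_end by finishing with
// the first 64 bytes of the buffer, so no special tail handling is needed
// at the low end.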