// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Inferno's libkern/memmove-arm.s
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-arm.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build pcz && arm

#include "textflag.h"

// Register roles
// --------------
// TS / TE are the destination start / end pointers.
//
// TE or TS are spilled to the stack during bulk register moves.
// (TS is R0 and TE is R8, and the multi-register loads below use R0-R7 when
// copying backward and R1-R8 when copying forward, so exactly one of the two
// pointers is overwritten in each direction.)
#define TS	R0
#define TE	R8

// Warning: the linker will use R11 to synthesize certain instructions. Please
// take care and double check with objdump.
#define FROM	R11	/* source pointer (end pointer when copying backward) */
#define N	R12	/* byte count; only read during setup */
#define TMP	R12	/* N and TMP don't overlap */
#define TMP1	R5	/* word scratch for the 4-byte loops (TMP holds the loop bound there) */

// Parameters of the unaligned-source paths. RSHIFT/LSHIFT are the two
// complementary bit shifts used to stitch one output word out of two adjacent
// aligned source words; OFFSET is the byte correction applied to FROM when the
// unaligned loop hands over to the byte tail.
#define RSHIFT	R5
#define LSHIFT	R6
#define OFFSET	R7

// Backward unaligned copy: BRn are the source words read, BWn the words
// written. The aliases overlap on purpose: BWn is the same register as
// BR(n+1) for n = 0..2, so the loop body must build BW3 first and BW0 last.
#define BR0	R0	/* shared with TS */
#define BW0	R1
#define BR1	R1
#define BW1	R2
#define BR2	R2
#define BW2	R3
#define BR3	R3
#define BW3	R4

// Forward unaligned copy: FRn are the source words read, FWn the words
// written. FW(n+1) is the same register as FRn for n = 0..2, so the loop body
// must build FW0 first and FW3 last.
#define FW0	R1
#define FR0	R2
#define FW1	R2
#define FR1	R3
#define FW2	R3
#define FR2	R4
#define FW3	R4
#define FR3	R8	/* shared with TE */

// See memmove Go doc for important implementation constraints.

// func Move(to, from unsafe.Pointer, n uintptr)
//
// Move copies n bytes from `from` to `to`. The copy direction is chosen from
// the pointer order so that overlapping ranges are handled: when to <= from
// (unsigned) bytes are copied from low to high addresses, otherwise from high
// to low.
//
// Strategy in either direction (when n >= 4):
//   1. copy single bytes until the destination pointer is 4-byte aligned;
//   2. if the source is then also 4-byte aligned, copy 32-byte blocks with
//      multi-register load/store, then single words;
//      otherwise read aligned source words and merge neighbouring words with
//      shifts, 16 bytes per iteration;
//   3. copy whatever is left one byte at a time.
//
// Frame $4-12: 12 bytes of arguments and one 4-byte local slot at -4(SP). The
// slot is named savedts in the backward paths and savedte in the forward
// paths; it holds whichever of TS/TE is clobbered by the bulk loops.
TEXT ·Move(SB), NOSPLIT, $4-12
_memmove:
	MOVW	to+0(FP), TS
	MOVW	from+4(FP), FROM
	MOVW	n+8(FP), N

	ADD	N, TS, TE	/* to end pointer */

	CMP	FROM, TS
	BLS	_forward	/* to <= from (unsigned): copy low -> high */

// ---- backward copy: TE and FROM walk down from the ends; done when TE == TS ----
_back:
	ADD	N, FROM		/* from end pointer */
	CMP	$4, N		/* need at least 4 bytes to copy */
	BLT	_b1tail

_b4align:				/* align destination on 4 */
	AND.S	$3, TE, TMP
	BEQ	_b4aligned

	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
	B	_b4align

_b4aligned:				/* is source now aligned? */
	AND.S	$3, FROM, TMP	/* TMP = FROM & 3; _bunaligned reads this value */
	BNE	_bunaligned

	ADD	$31, TS, TMP	/* do 32-byte chunks if possible */
	MOVW	TS, savedts-4(SP)	/* R0 (TS) is part of the R0-R7 block below */
_b32loop:
	CMP	TMP, TE
	BLS	_b4tail		/* TE <= TS+31: fewer than 32 bytes left */

	MOVM.DB.W (FROM), [R0-R7]	/* decrement-before load with write back */
	MOVM.DB.W [R0-R7], (TE)	/* decrement-before store with write back */
	B	_b32loop

_b4tail:				/* do remaining words if possible */
	MOVW	savedts-4(SP), TS	/* restore TS clobbered by the 32-byte loop */
	ADD	$3, TS, TMP
_b4loop:
	CMP	TMP, TE
	BLS	_b1tail		/* TE <= TS+3: fewer than 4 bytes left */

	MOVW.W	-4(FROM), TMP1	/* pre-indexed */
	MOVW.W	TMP1, -4(TE)	/* pre-indexed */
	B	_b4loop

_b1tail:				/* remaining bytes */
	CMP	TE, TS
	BEQ	_return

	MOVBU.W	-1(FROM), TMP	/* pre-indexed */
	MOVBU.W	TMP, -1(TE)	/* pre-indexed */
	B	_b1tail

// ---- forward copy: TS and FROM walk up from the starts; done when TS == TE ----
_forward:
	CMP	$4, N		/* need at least 4 bytes to copy */
	BLT	_f1tail

_f4align:				/* align destination on 4 */
	AND.S	$3, TS, TMP
	BEQ	_f4aligned

	MOVBU.P	1(FROM), TMP	/* implicit write back */
	MOVBU.P	TMP, 1(TS)	/* implicit write back */
	B	_f4align

_f4aligned:				/* is source now aligned? */
	AND.S	$3, FROM, TMP	/* TMP = FROM & 3; _funaligned reads this value */
	BNE	_funaligned

	SUB	$31, TE, TMP	/* do 32-byte chunks if possible */
	MOVW	TE, savedte-4(SP)	/* R8 (TE) is part of the R1-R8 block below */
_f32loop:
	CMP	TMP, TS
	BHS	_f4tail		/* TS >= TE-31: fewer than 32 bytes left */

	MOVM.IA.W (FROM), [R1-R8]	/* increment-after load with write back */
	MOVM.IA.W [R1-R8], (TS)	/* increment-after store with write back */
	B	_f32loop

_f4tail:
	MOVW	savedte-4(SP), TE	/* restore TE clobbered by the 32-byte loop */
	SUB	$3, TE, TMP	/* do remaining words if possible */
_f4loop:
	CMP	TMP, TS
	BHS	_f1tail		/* TS >= TE-3: fewer than 4 bytes left */

	MOVW.P	4(FROM), TMP1	/* implicit write back */
	MOVW.P	TMP1, 4(TS)	/* implicit write back */
	B	_f4loop

_f1tail:				/* remaining bytes */
	CMP	TS, TE
	BEQ	_return

	MOVBU.P	1(FROM), TMP	/* implicit write back */
	MOVBU.P	TMP, 1(TS)	/* implicit write back */
	B	_f1tail

_return:
	// Reload the `to` argument into R0 before returning. The Go signature
	// declares no result. NOTE(review): presumably kept from the original
	// memmove, which returns the destination; confirm before removing.
	MOVW	to+0(FP), R0
	RET

// ---- backward copy, destination aligned, source not aligned ----
// On entry TMP = FROM & 3 (1, 2 or 3). The three conditional groups below all
// depend on the flags of the single CMP; the MOVWs in between do not set flags.
// The shift pairs match little-endian byte order.
_bunaligned:
	CMP	$2, TMP		/* is TMP < 2 ? */

	MOVW.LT	$8, RSHIFT	/* (R(n)<<24)|(R(n-1)>>8) */
	MOVW.LT	$24, LSHIFT
	MOVW.LT	$1, OFFSET

	MOVW.EQ	$16, RSHIFT	/* (R(n)<<16)|(R(n-1)>>16) */
	MOVW.EQ	$16, LSHIFT
	MOVW.EQ	$2, OFFSET

	MOVW.GT	$24, RSHIFT	/* (R(n)<<8)|(R(n-1)>>24) */
	MOVW.GT	$8, LSHIFT
	MOVW.GT	$3, OFFSET

	ADD	$16, TS, TMP	/* do 16-byte chunks if possible */
	CMP	TMP, TE
	BLS	_b1tail		/* TE <= TS+16: FROM is still untouched, go byte-wise */

	BIC	$3, FROM	/* align source */
	MOVW	TS, savedts-4(SP)	/* BR0 is R0, i.e. TS */
	MOVW	(FROM), BR0	/* prime first block register */

_bu16loop:
	CMP	TMP, TE
	BLS	_bu1tail

	// BR0 still holds the lowest word of the previous block; its shifted
	// value starts the highest output word before BR0-BR3 are reloaded.
	MOVW	BR0<<LSHIFT, BW3
	MOVM.DB.W (FROM), [BR0-BR3]
	ORR	BR3>>RSHIFT, BW3

	// From here each BWn overwrites the register of BR(n+1) right after that
	// source word has been consumed; do not reorder.
	MOVW	BR3<<LSHIFT, BW2
	ORR	BR2>>RSHIFT, BW2

	MOVW	BR2<<LSHIFT, BW1
	ORR	BR1>>RSHIFT, BW1

	MOVW	BR1<<LSHIFT, BW0
	ORR	BR0>>RSHIFT, BW0

	MOVM.DB.W [BW0-BW3], (TE)
	B	_bu16loop

_bu1tail:
	MOVW	savedts-4(SP), TS	/* restore TS clobbered via BR0 */
	ADD	OFFSET, FROM	/* undo the BIC: back to the unaligned byte position */
	B	_b1tail

// ---- forward copy, destination aligned, source not aligned ----
// On entry TMP = FROM & 3 (1, 2 or 3). The three conditional groups below all
// depend on the flags of the single CMP; the MOVWs in between do not set flags.
// The shift pairs match little-endian byte order.
_funaligned:
	CMP	$2, TMP

	MOVW.LT	$8, RSHIFT	/* (R(n+1)<<24)|(R(n)>>8) */
	MOVW.LT	$24, LSHIFT
	MOVW.LT	$3, OFFSET

	MOVW.EQ	$16, RSHIFT	/* (R(n+1)<<16)|(R(n)>>16) */
	MOVW.EQ	$16, LSHIFT
	MOVW.EQ	$2, OFFSET

	MOVW.GT	$24, RSHIFT	/* (R(n+1)<<8)|(R(n)>>24) */
	MOVW.GT	$8, LSHIFT
	MOVW.GT	$1, OFFSET

	SUB	$16, TE, TMP	/* do 16-byte chunks if possible */
	CMP	TMP, TS
	BHS	_f1tail		/* TS >= TE-16: FROM is still untouched, go byte-wise */

	BIC	$3, FROM	/* align source */
	MOVW	TE, savedte-4(SP)	/* FR3 is R8, i.e. TE */
	MOVW.P	4(FROM), FR3	/* prime last block register, implicit write back */

_fu16loop:
	CMP	TMP, TS
	BHS	_fu1tail

	// FR3 still holds the highest word of the previous block; its shifted
	// value starts the lowest output word before FR0-FR3 are reloaded.
	MOVW	FR3>>RSHIFT, FW0
	MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3]
	ORR	FR0<<LSHIFT, FW0

	// From here each FW(n+1) overwrites the register of FRn right after that
	// source word has been consumed; do not reorder.
	MOVW	FR0>>RSHIFT, FW1
	ORR	FR1<<LSHIFT, FW1

	MOVW	FR1>>RSHIFT, FW2
	ORR	FR2<<LSHIFT, FW2

	MOVW	FR2>>RSHIFT, FW3
	ORR	FR3<<LSHIFT, FW3

	MOVM.IA.W [FW0,FW1,FW2,FW3], (TS)
	B	_fu16loop

_fu1tail:
	MOVW	savedte-4(SP), TE	/* restore TE clobbered via FR3 */
	SUB	OFFSET, FROM	/* FROM is one primed word ahead; step back to the unaligned byte position */
	B	_f1tail