// Inferno's libkern/memmove-arm.s
// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-arm.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "../../cmd/ld/textflag.h"

// ---------------------------------------------------------------------
// Register aliases (the number is the ARM register index used by R(x)).
// ---------------------------------------------------------------------

// Destination start (TS) and destination end (TE). Whichever of the two
// collides with a bulk multi-register move is spilled to the stack first
// and reloaded afterwards.
TS = 0
TE = 8

// Warning: the linker will use R11 to synthesize certain instructions.
// FROM lives in R11, so take care and double check with objdump.
FROM = 11
N = 12
TMP = 12	// same register as N; N is dead before TMP is first written
TMP1 = 5

// Parameters of the shifted (source-misaligned) copy loops.
RSHIFT = 5
LSHIFT = 6
OFFSET = 7

// Backward shifted loop: BRn are the words read, BWn the words written.
BR0 = 0		// same register as TS
BW0 = 1
BR1 = 1
BW1 = 2
BR2 = 2
BW2 = 3
BR3 = 3
BW3 = 4

// Forward shifted loop: FRn are the words read, FWn the words written.
FW0 = 1
FR0 = 2
FW1 = 2
FR1 = 3
FW2 = 3
FR2 = 4
FW3 = 4
FR3 = 8		// same register as TE

// runtime·memmove(to, from, n)
//
// Arguments are read from to+0(FP), from+4(FP) and n+8(FP); the frame
// holds one 4-byte slot used to spill TS or TE. On exit R0 is reloaded
// with the original `to` argument.
//
// Direction is chosen by comparing the two pointers: when `to` is at or
// below `from` the copy walks upward (TS and FROM advance towards TE),
// otherwise it walks downward from the end (TE and FROM retreat towards
// TS). Each direction has three tiers: 32-byte multi-register moves when
// both pointers are word aligned, 16-byte shift-and-merge moves when only
// the destination is, and single words / single bytes for what is left.
TEXT runtime·memmove(SB), NOSPLIT, $4-12
_memmove:
	MOVW	to+0(FP), R(TS)
	MOVW	from+4(FP), R(FROM)
	MOVW	n+8(FP), R(N)

	ADD	R(N), R(TS), R(TE)		// TE = to + n

	CMP	R(FROM), R(TS)
	BLS	_fw				// to <= from: ascending copy

// ---------------------------------------------------------------------
// Descending copy: TE and FROM are the moving cursors, TS is the stop.
// ---------------------------------------------------------------------
_bk:
	ADD	R(N), R(FROM)			// FROM = from + n
	CMP	$4, R(N)
	BLT	_bk_bytes			// fewer than 4 bytes: byte loop only

_bk_dstalign:					// byte-copy until TE is a multiple of 4
	AND.S	$3, R(TE), R(TMP)
	BEQ	_bk_srccheck

	MOVBU.W	-1(R(FROM)), R(TMP)		// pre-decrement load
	MOVBU.W	R(TMP), -1(R(TE))		// pre-decrement store
	B	_bk_dstalign

_bk_srccheck:					// TMP = FROM & 3; non-zero needs shifting
	AND.S	$3, R(FROM), R(TMP)
	BNE	_bk_shift

	MOVW	R(TS), savedts-4(SP)		// R0 is overwritten by the block move
	ADD	$31, R(TS), R(TMP)		// loop while TE > TS + 31
_bk_blk32:
	CMP	R(TMP), R(TE)
	BLS	_bk_words

	MOVM.DB.W	(R(FROM)), [R0-R7]
	MOVM.DB.W	[R0-R7], (R(TE))
	B	_bk_blk32

_bk_words:					// whole words that are still left
	MOVW	savedts-4(SP), R(TS)
	ADD	$3, R(TS), R(TMP)		// loop while TE > TS + 3
_bk_word:
	CMP	R(TMP), R(TE)
	BLS	_bk_bytes

	MOVW.W	-4(R(FROM)), R(TMP1)		// pre-decrement load
	MOVW.W	R(TMP1), -4(R(TE))		// pre-decrement store
	B	_bk_word

_bk_bytes:					// trailing bytes, one at a time
	CMP	R(TE), R(TS)
	BEQ	_done

	MOVBU.W	-1(R(FROM)), R(TMP)		// pre-decrement load
	MOVBU.W	R(TMP), -1(R(TE))		// pre-decrement store
	B	_bk_bytes

// ---------------------------------------------------------------------
// Ascending copy: TS and FROM are the moving cursors, TE is the stop.
// ---------------------------------------------------------------------
_fw:
	CMP	$4, R(N)
	BLT	_fw_bytes			// fewer than 4 bytes: byte loop only

_fw_dstalign:					// byte-copy until TS is a multiple of 4
	AND.S	$3, R(TS), R(TMP)
	BEQ	_fw_srccheck

	MOVBU.P	1(R(FROM)), R(TMP)		// post-increment load
	MOVBU.P	R(TMP), 1(R(TS))		// post-increment store
	B	_fw_dstalign

_fw_srccheck:					// TMP = FROM & 3; non-zero needs shifting
	AND.S	$3, R(FROM), R(TMP)
	BNE	_fw_shift

	MOVW	R(TE), savedte-4(SP)		// R8 is overwritten by the block move
	SUB	$31, R(TE), R(TMP)		// loop while TS < TE - 31
_fw_blk32:
	CMP	R(TMP), R(TS)
	BHS	_fw_words

	MOVM.IA.W	(R(FROM)), [R1-R8]
	MOVM.IA.W	[R1-R8], (R(TS))
	B	_fw_blk32

_fw_words:					// whole words that are still left
	MOVW	savedte-4(SP), R(TE)
	SUB	$3, R(TE), R(TMP)		// loop while TS < TE - 3
_fw_word:
	CMP	R(TMP), R(TS)
	BHS	_fw_bytes

	MOVW.P	4(R(FROM)), R(TMP1)		// post-increment load
	MOVW.P	R(TMP1), 4(R(TS))		// post-increment store
	B	_fw_word

_fw_bytes:					// trailing bytes, one at a time
	CMP	R(TS), R(TE)
	BEQ	_done

	MOVBU.P	1(R(FROM)), R(TMP)		// post-increment load
	MOVBU.P	R(TMP), 1(R(TS))		// post-increment store
	B	_fw_bytes

_done:
	MOVW	to+0(FP), R0			// R0 = original destination pointer
	RET

// ---------------------------------------------------------------------
// Descending copy, destination aligned but source not.
// On entry TMP = FROM & 3 (1, 2 or 3). The source is rounded down to a
// word boundary and each output word is merged from two adjacent input
// words; OFFSET is the amount added back to FROM when the loop ends.
// ---------------------------------------------------------------------
_bk_shift:
	CMP	$2, R(TMP)			// selects the LT / EQ / GT group below

	MOVW.LT	$8, R(RSHIFT)			// (R(n)<<24)|(R(n-1)>>8)
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$1, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)			// (R(n)<<16)|(R(n-1)>>16)
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)			// (R(n)<<8)|(R(n-1)>>24)
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$3, R(OFFSET)

	ADD	$16, R(TS), R(TMP)		// 16-byte chunks need TE > TS + 16
	CMP	R(TMP), R(TE)
	BLS	_bk_bytes

	MOVW	R(TS), savedts-4(SP)		// BR0 shares R0 with TS
	BIC	$3, R(FROM)			// round source down to a word boundary
	MOVW	(R(FROM)), R(BR0)		// prime the first block register

_bk_shift16:
	CMP	R(TMP), R(TE)
	BLS	_bk_shiftend

	MOVW	R(BR0)<<R(LSHIFT), R(BW3)
	MOVM.DB.W	(R(FROM)), [R(BR0)-R(BR3)]
	ORR	R(BR3)>>R(RSHIFT), R(BW3)

	MOVW	R(BR3)<<R(LSHIFT), R(BW2)
	ORR	R(BR2)>>R(RSHIFT), R(BW2)

	MOVW	R(BR2)<<R(LSHIFT), R(BW1)
	ORR	R(BR1)>>R(RSHIFT), R(BW1)

	MOVW	R(BR1)<<R(LSHIFT), R(BW0)
	ORR	R(BR0)>>R(RSHIFT), R(BW0)

	MOVM.DB.W	[R(BW0)-R(BW3)], (R(TE))
	B	_bk_shift16

_bk_shiftend:
	MOVW	savedts-4(SP), R(TS)
	ADD	R(OFFSET), R(FROM)		// undo the rounding applied by BIC
	B	_bk_bytes

// ---------------------------------------------------------------------
// Ascending copy, destination aligned but source not.
// On entry TMP = FROM & 3 (1, 2 or 3). The source is rounded down to a
// word boundary, one word is pre-loaded, and each output word is merged
// from two adjacent input words; OFFSET is the amount subtracted from
// FROM when the loop ends.
// ---------------------------------------------------------------------
_fw_shift:
	CMP	$2, R(TMP)			// selects the LT / EQ / GT group below

	MOVW.LT	$8, R(RSHIFT)			// (R(n+1)<<24)|(R(n)>>8)
	MOVW.LT	$24, R(LSHIFT)
	MOVW.LT	$3, R(OFFSET)

	MOVW.EQ	$16, R(RSHIFT)			// (R(n+1)<<16)|(R(n)>>16)
	MOVW.EQ	$16, R(LSHIFT)
	MOVW.EQ	$2, R(OFFSET)

	MOVW.GT	$24, R(RSHIFT)			// (R(n+1)<<8)|(R(n)>>24)
	MOVW.GT	$8, R(LSHIFT)
	MOVW.GT	$1, R(OFFSET)

	SUB	$16, R(TE), R(TMP)		// 16-byte chunks need TS < TE - 16
	CMP	R(TMP), R(TS)
	BHS	_fw_bytes

	BIC	$3, R(FROM)			// round source down to a word boundary
	MOVW	R(TE), savedte-4(SP)		// FR3 shares R8 with TE
	MOVW.P	4(R(FROM)), R(FR3)		// prime the last block register, post-increment

_fw_shift16:
	CMP	R(TMP), R(TS)
	BHS	_fw_shiftend

	MOVW	R(FR3)>>R(RSHIFT), R(FW0)
	MOVM.IA.W	(R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)]
	ORR	R(FR0)<<R(LSHIFT), R(FW0)

	MOVW	R(FR0)>>R(RSHIFT), R(FW1)
	ORR	R(FR1)<<R(LSHIFT), R(FW1)

	MOVW	R(FR1)>>R(RSHIFT), R(FW2)
	ORR	R(FR2)<<R(LSHIFT), R(FW2)

	MOVW	R(FR2)>>R(RSHIFT), R(FW3)
	ORR	R(FR3)<<R(LSHIFT), R(FW3)

	MOVM.IA.W	[R(FW0),R(FW1),R(FW2),R(FW3)], (R(TS))
	B	_fw_shift16

_fw_shiftend:
	MOVW	savedte-4(SP), R(TE)
	SUB	R(OFFSET), R(FROM)		// step back to the first uncopied source byte
	B	_fw_bytes