// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
// Copies n bytes from "from" to "to". The regions may overlap;
// the backward-copy path below handles the overlapping case.
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	// DI = destination, SI = source, BX = byte count.
	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// Dispatch on size: each small case handles a range [lo, hi] by loading
	// both ends of the region into registers before storing (the overlapping
	// loads/stores cover every length in the range with no loop).
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	/*
	 * check and set for backwards
	 */
	// If the source starts at or below the destination, the regions may
	// overlap in a way that requires copying backwards.
	CMPQ	SI, DI
	JLS	back

	/*
	 * forward copy loop
	 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	// Bit 9 of CPUID leaf 7 EBX advertises ERMSB (enhanced REP MOVSB/STOSB).
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	// ERMSB REP MOVSB is only preferable when src and dst are both
	// 8-byte aligned; otherwise fall back to 8-byte moves.
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	// CX = BX/8 quadwords via REP MOVSQ; the remaining BX&7 bytes are
	// finished by jumping back to the small-size dispatch at tail.
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
	/*
	 * check overlap
	 */
	// If the source region ends at or before the destination start, the
	// regions don't actually overlap and the forward path is safe (and
	// faster than a backward copy).
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward

	/*
	 * whole thing backwards has
	 * adjusted addresses
	 */
	// Point SI/DI one past the end of each region and set the direction
	// flag so string instructions walk downward through memory.
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

	/*
	 * copy
	 */
	// CX = BX/8 quadwords to move backwards; BX = trailing byte count.
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	// With DF set, MOVSQ reads/writes at SI/DI then decrements by 8, so
	// pre-bias the pointers to the last full quadword of each region.
	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	// Restore the direction flag (required by the ABI) and rewind SI/DI to
	// the start of the remaining BX-byte head, then let tail finish it.
	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

// The move_* cases below read both ends of the source into registers first,
// then store; this makes them correct even for overlapping regions.
move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	// Copy a full 256-byte chunk per iteration using all 16 SSE registers.
	// Loads complete before stores within each chunk, but chunks proceed
	// front-to-back, so this path is only used for forward-safe copies
	// (see the comment at tail). Falls through to tail for the <256 left.
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail