github.com/panjjo/go@v0.0.0-20161104043856-d62b31386338/src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
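	// For example, n == 100 dispatches to move_65through128 below, which issues
	// all eight 16-byte loads (covering bytes 0..63 and 36..99) before the first
	// store, so overlapping source and destination regions are handled correctly.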
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	TESTB	$1, runtime·useRepMovs(SB)
	JZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
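	// (A single MOVQ guarantees that a concurrently running goroutine or the
	// garbage collector never observes a half-written pointer, which splitting
	// the copy into two 4-byte stores could expose.)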
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it uses forward copying.
	// The second one is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
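	// (0x100000 is 1MB; copies at least that large presumably won't stay
	// cache-resident anyway, so the non-temporal path below bypasses the cache.)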
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// It can be important to satisfy the processor's pipeline requirements for
	// small sizes, as the cost of copying the unaligned regions is comparable
	// to the cost of the main loop, so the code is slightly messy here.
	// There is a cleaner implementation of this algorithm for bigger sizes,
	// where the cost of copying the unaligned parts is negligible.
	// You can see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly.
	// We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value to account for the unaligned part that
	// will be copied separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// It's time for the unaligned head part.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// This is forward copying for big regions.
	// It uses non-temporal move instructions.
	// Details of this algorithm are in the comments for the small-size version above.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch values were chosen empirically.
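	// (0x1C0 and 0x280 are 448 and 640 bytes, i.e. 3.5 and 5 iterations of this
	// 128-byte loop ahead of the loads issued below.)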
	// The approach to prefetch usage is as in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is about the same as forward copying.
	// First we load the unaligned tail, at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
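
// Illustration only (not part of the original file): at the Go level, overlapping
// slice copies are ultimately served by runtime·memmove, so both copy directions
// handled above are observable from ordinary code, e.g.:
//
//	b := []byte("abcdefgh")
//	copy(b[2:], b[:6]) // overlapping; dst starts above src, so memmove copies backward
//	// b is now []byte("ababcdef")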