// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9
// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
#ifdef GOEXPERIMENT_regabiargs
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX
#else
	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX
#endif

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
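	//
	// The branch ladder below dispatches on size; as Go-style pseudocode
	// (an illustrative sketch only, not part of the build):
	//
	//	switch {
	//	case n == 0:
	//		return
	//	case n <= 2:
	//		goto move_1or2
	//	case n < 4:
	//		goto move_3
	//	case n == 4:
	//		goto move_4
	//	case n < 8:
	//		goto move_5through7
	//	case n == 8:
	//		goto move_8
	//	case n <= 16:
	//		goto move_9through16
	//	// ... and so on, doubling up to 256, then the large-copy paths.
	//	}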
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
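	// If the 8 bytes were written as two 4-byte halves instead, a
	// concurrent garbage-collector scan of the destination could observe
	// a torn, invalid pointer; a single MOVQ keeps the pointer-sized
	// write indivisible.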
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
#ifdef GOEXPERIMENT_regabig
	// X15 must be zero on return
	PXOR	X15, X15
#endif
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm:
	// the first, for non-overlapping memory regions, copies forward;
	// the second, for overlapping regions, copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// A non-temporal copy would be better for big sizes.
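	// 0x100000 bytes is 1MiB; copies at least that large are assumed not
	// to fit in the caches, so the non-temporal stores below (VMOVNTDQ),
	// which bypass the cache, avoid evicting useful data. In Go terms the
	// test is:
	//
	//	if n >= 1<<20 {
	//		goto gobble_big_data_fwd
	//	}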
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// It can be important to satisfy the processor's pipeline requirements
	// for small sizes, as the cost of copying the unaligned parts is
	// comparable to the cost of the main loop, so the code is slightly
	// tangled here. There is a cleaner implementation of this algorithm
	// for bigger sizes, where the cost of copying the unaligned parts is
	// negligible; see the gobble_big_data_fwd label below.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly.
	// We will use negative offsets here.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value, now that the unaligned part is
	// prepared for copying.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in place after the main body is copied.
	// Now for the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal mov
	// instructions. The details of this algorithm are commented above
	// for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
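	// With the loop's 0x80-byte (128-byte) stride, 0x1C0 (448) prefetches
	// 3.5 iterations ahead and 0x280 (640) prefetches 5 iterations ahead.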
	// The approach to prefetch usage is as in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data available in a
	// timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as forward copying.
	// First we load the unaligned tail, at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
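
// For reference, Go's built-in copy bottoms out in this memmove, so the
// overlap handling above is what makes overlapping slice copies safe. A
// small illustrative example (hypothetical test code, not part of this
// file's build):
//
//	b := []byte("abcdefgh")
//	copy(b[2:], b[:6]) // dst overlaps src from above: needs a backward copy
//	// b is now []byte("ababcdef")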