// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
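	// Dispatch on size, smallest classes first. For example, with BX = 100
	// the chain below falls through the 2/4/8/16/32/64 checks and takes the
	// JBE at CMPQ BX, $128, landing in move_65through128.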
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

	/*
	 * check and set for backwards
	 */
	CMPQ	SI, DI
	JLS	back

	/*
	 * forward copy loop
	 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
	/*
	 * check overlap
	 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
	/*
	 * whole thing backwards has
	 * adjusted addresses
	 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

	/*
	 * copy
	 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
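	// On amd64 an aligned 8-byte MOVQ store is a single atomic write, so a
	// pointer-sized word is never observed half-written by the garbage collector.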
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
	PXOR	X15, X15
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	// X15 must be zero on return
	PXOR	X15, X15
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it uses forward copying.
	// The second one is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// A non-temporal copy is better for big sizes.
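	// 0x100000 is 1 MiB. Below that, the regular AVX loop keeps the data in
	// the cache; at or above it, the non-temporal path avoids evicting cache
	// lines with data that is unlikely to be reused.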
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of the body (128 bytes per iteration)
	// 4. Put the head into its new place
	// 5. Put the tail into its new place
	// For small sizes it can be important to keep the processor's pipeline
	// busy, since the cost of copying the unaligned parts is comparable with
	// the cost of the main loop, so the code below is somewhat interleaved.
	// A cleaner implementation of the same algorithm, for bigger sizes where
	// the cost of copying the unaligned parts is negligible, follows the
	// gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we go back slightly and use negative offsets from it.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy count, since the unaligned parts are copied separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be stored in place after the main body is copied.
	// Now handle the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body:
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal mov instructions.
	// The details of the algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
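	// (0x1C0 and 0x280 are 448 and 640 bytes ahead of SI, i.e. 3.5 and 5
	// iterations of this 128-byte loop.)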
	// The prefetch usage follows the approach in 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying works much like the forward case.
	// First we load the unaligned tail from the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET