github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
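	// The compare/branch chain below dispatches on n (in BX): the sizes
	// 0, 1-2, 3, 4, 5-7, 8, 9-16, 17-32, 33-64, 65-128, and 129-256 each
	// have a dedicated straightline block; larger sizes fall through to
	// the AVX or REP-based paths.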
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
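	// A single MOVQ of an aligned 8-byte word is atomic on amd64, so a
	// pointer value is never observed half-written (see the memmove doc
	// constraints referenced above).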
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it uses forward copying.
	// The second one is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
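	// (0x100000 bytes = 1 MiB; sizes at or above this threshold take the
	// non-temporal gobble_big_data_fwd path.)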
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it can be important to satisfy the processor's pipeline
	// requirements, since the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the code below is slightly
	// interleaved. There is a cleaner implementation of this algorithm for
	// bigger sizes, where the cost of copying the unaligned parts is
	// negligible; see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly; we use negative offsets from here on.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned head has been accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in place after the main body is copied.
	// It's time for the unaligned head part.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// This is the forward copy for big regions.
	// It uses non-temporal mov instructions.
	// Details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
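	// (0x1C0 and 0x280 are 448 and 640 bytes ahead of the current read position.)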
	// Approach for prefetch usage as in 7.6.6 of [1]
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need SFENCE here so the copied data becomes visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET