github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_amd64.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build pcz && amd64 && !plan9

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func Move(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT ·Move<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
	MOVQ	AX, DI
	MOVQ	BX, SI
	MOVQ	CX, BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
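	//
	// As a rough sketch in Go-like pseudocode (illustrative only; load8 and
	// store8 are hypothetical 8-byte helpers, and the same idea applies at
	// every fixed size k used below):
	//
	//	head := load8(from)         // first 8 bytes
	//	tail := load8(from + n - 8) // last 8 bytes; may overlap head
	//	store8(to, head)
	//	store8(to + n - 8, tail)
	//
	// Both loads complete before either store, which is what makes these
	// fixed-size cases safe for overlapping regions in either direction.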
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, ·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	·hasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
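	// (As we read the memmove Go doc constraints referenced at the top of this
	// file, a pointer-aligned, pointer-sized word must be updated with a single
	// 8-byte store so the garbage collector never observes a half-written
	// pointer. One MOVQ provides that; covering n=8 with two overlapping 4-byte
	// moves, as move_5through7 does for smaller sizes, would not.)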
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	// X15 must be zero on return
	PXOR	X15, X15
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	// X15 must be zero on return
	PXOR	X15, X15
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm:
	// one for non-overlapping memory regions (it copies forward)
	// and one for overlapping regions (it copies backward).
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// A non-temporal copy would be better for big sizes.
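	// 0x100000 is 1MB. Above this size the data presumably no longer fits in
	// cache, so the gobble_big_data_fwd path below uses non-temporal stores
	// (VMOVNTDQ), which bypass the cache instead of evicting useful lines.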
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put head on the new place
	// 5. Put the tail on the new place
	// For small sizes it is important to satisfy the processor's pipeline
	// requirements, as the cost of copying the unaligned parts is comparable
	// with the cost of the main loop, so the code below is slightly interleaved.
	// A cleaner implementation of the same algorithm, for bigger sizes where
	// the cost of the unaligned parts is negligible, follows the
	// gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so the tail is read with negative offsets.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy count, since the unaligned head is handled separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be stored in place after the main body is copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copying of the body, 128 bytes per iteration.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of the algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch values were chosen empirically.
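	// (0x1C0 and 0x280 are 448 and 640 bytes ahead of SI, i.e. roughly 3.5
	// and 5 iterations of this 128-byte loop.)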
	// Approach for prefetch usage as in 9.5.6 of [1]
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is about the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ Y0, -0x20(DI)
	VMOVNTDQ Y1, -0x40(DI)
	VMOVNTDQ Y2, -0x60(DI)
	VMOVNTDQ Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET