github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_amd64.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//go:build pcz && amd64 && !plan9

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

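// A minimal Go-level sketch of the overlap handling implemented below; it is
// illustrative only, not the actual algorithm, which dispatches on size,
// alignment, and CPU features. The helper name "move" is hypothetical.
//
//	func move(to, from unsafe.Pointer, n uintptr) {
//		if uintptr(to) <= uintptr(from) {
//			// Forward copy never overwrites source bytes that are still unread.
//			for i := uintptr(0); i < n; i++ {
//				*(*byte)(unsafe.Add(to, i)) = *(*byte)(unsafe.Add(from, i))
//			}
//		} else {
//			// to > from: the regions may overlap, so copy backward.
//			for i := n; i > 0; i-- {
//				*(*byte)(unsafe.Add(to, i-1)) = *(*byte)(unsafe.Add(from, i-1))
//			}
//		}
//	}
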
// func Move(to, from unsafe.Pointer, n uintptr)
// ABIInternal for performance.
TEXT ·Move<ABIInternal>(SB), NOSPLIT, $0-24
	// AX = to
	// BX = from
	// CX = n
	MOVQ AX, DI
	MOVQ BX, SI
	MOVQ CX, BX
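	// From here on: DI = to, SI = from, BX = n; CX is reused as scratch and as the REP count.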

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ BX, BX
	JEQ move_0
	CMPQ BX, $2
	JBE move_1or2
	CMPQ BX, $4
	JB move_3
	JBE move_4
	CMPQ BX, $8
	JB move_5through7
	JE move_8
	CMPQ BX, $16
	JBE move_9through16
	CMPQ BX, $32
	JBE move_17through32
	CMPQ BX, $64
	JBE move_33through64
	CMPQ BX, $128
	JBE move_65through128
	CMPQ BX, $256
	JBE move_129through256

	TESTB $1, ·useAVXmemmove(SB)
	JNZ avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ SI, DI
	JLS back
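	// SI <= DI (unsigned): a forward copy could overwrite source bytes that
	// have not been read yet if the regions overlap, so check for overlap at back.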

/*
 * forward copy loop
 */
forward:
	CMPQ BX, $2048
	JLS move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB ·hasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE fwdBy8

	// Check alignment
	MOVL SI, AX
	ORL DI, AX
	TESTL $7, AX
	JEQ fwdBy8
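	// Source or destination is not 8-byte aligned; with ERMS, byte-wise REP MOVSB is the better choice.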

	// Do 1 byte at a time
	MOVQ BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ BX, CX
	SHRQ $3, CX
	ANDQ $7, BX
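	// CX = number of 8-byte words to move with REP MOVSQ; BX = 0..7 leftover bytes finished at tail.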
	REP;	MOVSQ
	JMP tail

back:
/*
 * check overlap
 */
	MOVQ SI, CX
	ADDQ BX, CX
	CMPQ CX, DI
	JLS forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ BX, DI
	ADDQ BX, SI
	STD
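	// The direction flag is now set, so the REP MOVSQ below decrements SI/DI after every move.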

/*
 * copy
 */
	MOVQ BX, CX
	SHRQ $3, CX
	ANDQ $7, BX

	SUBQ $8, DI
	SUBQ $8, SI
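	// SI/DI pointed one past the end of the regions; step back 8 so the first qword moved is the last qword of each region.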
	REP;	MOVSQ

	CLD
	ADDQ $8, DI
	ADDQ $8, SI
	SUBQ BX, DI
	SUBQ BX, SI
	JMP tail

move_1or2:
	MOVB (SI), AX
	MOVB -1(SI)(BX*1), CX
	MOVB AX, (DI)
	MOVB CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL (SI), AX
	MOVL AX, (DI)
	RET
move_3:
	MOVW (SI), AX
	MOVB 2(SI), CX
	MOVW AX, (DI)
	MOVB CX, 2(DI)
	RET
move_5through7:
	MOVL (SI), AX
	MOVL -4(SI)(BX*1), CX
	MOVL AX, (DI)
	MOVL CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ (SI), AX
	MOVQ AX, (DI)
	RET
move_9through16:
	MOVQ (SI), AX
	MOVQ -8(SI)(BX*1), CX
	MOVQ AX, (DI)
	MOVQ CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(BX*1), X1
	MOVOU X0, (DI)
	MOVOU X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(BX*1), X2
	MOVOU -16(SI)(BX*1), X3
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, -32(DI)(BX*1)
	MOVOU X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU 32(SI), X2
	MOVOU 48(SI), X3
	MOVOU -64(SI)(BX*1), X4
	MOVOU -48(SI)(BX*1), X5
	MOVOU -32(SI)(BX*1), X6
	MOVOU -16(SI)(BX*1), X7
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, -64(DI)(BX*1)
	MOVOU X5, -48(DI)(BX*1)
	MOVOU X6, -32(DI)(BX*1)
	MOVOU X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU 32(SI), X2
	MOVOU 48(SI), X3
	MOVOU 64(SI), X4
	MOVOU 80(SI), X5
	MOVOU 96(SI), X6
	MOVOU 112(SI), X7
	MOVOU -128(SI)(BX*1), X8
	MOVOU -112(SI)(BX*1), X9
	MOVOU -96(SI)(BX*1), X10
	MOVOU -80(SI)(BX*1), X11
	MOVOU -64(SI)(BX*1), X12
	MOVOU -48(SI)(BX*1), X13
	MOVOU -32(SI)(BX*1), X14
	MOVOU -16(SI)(BX*1), X15
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, 64(DI)
	MOVOU X5, 80(DI)
	MOVOU X6, 96(DI)
	MOVOU X7, 112(DI)
	MOVOU X8, -128(DI)(BX*1)
	MOVOU X9, -112(DI)(BX*1)
	MOVOU X10, -96(DI)(BX*1)
	MOVOU X11, -80(DI)(BX*1)
	MOVOU X12, -64(DI)(BX*1)
	MOVOU X13, -48(DI)(BX*1)
	MOVOU X14, -32(DI)(BX*1)
	MOVOU X15, -16(DI)(BX*1)
	// X15 must be zero on return
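	// (X15 is the designated zero register in Go's internal ABI on amd64.)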
	PXOR X15, X15
	RET
move_256through2048:
	SUBQ $256, BX
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU 32(SI), X2
	MOVOU 48(SI), X3
	MOVOU 64(SI), X4
	MOVOU 80(SI), X5
	MOVOU 96(SI), X6
	MOVOU 112(SI), X7
	MOVOU 128(SI), X8
	MOVOU 144(SI), X9
	MOVOU 160(SI), X10
	MOVOU 176(SI), X11
	MOVOU 192(SI), X12
	MOVOU 208(SI), X13
	MOVOU 224(SI), X14
	MOVOU 240(SI), X15
	MOVOU X0, (DI)
	MOVOU X1, 16(DI)
	MOVOU X2, 32(DI)
	MOVOU X3, 48(DI)
	MOVOU X4, 64(DI)
	MOVOU X5, 80(DI)
	MOVOU X6, 96(DI)
	MOVOU X7, 112(DI)
	MOVOU X8, 128(DI)
	MOVOU X9, 144(DI)
	MOVOU X10, 160(DI)
	MOVOU X11, 176(DI)
	MOVOU X12, 192(DI)
	MOVOU X13, 208(DI)
	MOVOU X14, 224(DI)
	MOVOU X15, 240(DI)
	CMPQ BX, $256
	LEAQ 256(SI), SI
	LEAQ 256(DI), DI
	JGE move_256through2048
	// X15 must be zero on return
	PXOR X15, X15
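	// Fewer than 256 bytes remain; finish them via the small-size dispatch at tail.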
	JMP tail

avxUnaligned:
	// There are two implementations of the move algorithm:
	// the first is for non-overlapping memory regions and uses forward copying;
	// the second is for overlapping regions and uses backward copying.
	MOVQ DI, CX
	SUBQ SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC copy_backward
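	// The unsigned compare also handles DI < SI: the subtraction wraps around to a
	// huge value, so the forward path is taken, which is safe when the destination
	// starts below the source.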

	// Non-temporal copy would be better for big sizes.
	CMPQ BX, $0x100000
	JAE gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32  bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put head on the new place
	// 5. Put the tail on the new place
	// It can be important to satisfy the processor's pipeline requirements for
	// small sizes, as the cost of copying the unaligned parts is comparable
	// with the cost of the main loop, so the code below is slightly interleaved.
	// There is a cleaner implementation of this algorithm for bigger sizes,
	// where the cost of copying the unaligned parts is negligible;
	// see it after the gobble_big_data_fwd label.
	LEAQ (SI)(BX*1), CX
	MOVQ DI, R10
	// CX points to the end of the buffer, so we step back slightly and use negative offsets from here on.
	MOVOU -0x80(CX), X5
	MOVOU -0x70(CX), X6
	MOVQ $0x80, AX
	// Align destination address
	ANDQ $-32, DI
	ADDQ $32, DI
	// Continue tail saving.
	MOVOU -0x60(CX), X7
	MOVOU -0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ DI, R11
	SUBQ R10, R11
	// Continue tail saving.
	MOVOU -0x40(CX), X9
	MOVOU -0x30(CX), X10
	// Adjust the bytes-to-copy value, since the unaligned head is handled separately.
	SUBQ R11, BX
	// Continue tail saving.
	MOVOU -0x20(CX), X11
	MOVOU -0x10(CX), X12
	// The tail will be stored in place after the main body has been copied.
	// Now load the unaligned head.
	VMOVDQU (SI), Y4
	// Adjust the source address to point past the head.
	ADDQ R11, SI
	SUBQ AX, BX
	// Destination-aligned copying of the body (128 bytes per iteration):
gobble_128_loop:
	VMOVDQU (SI), Y0
	VMOVDQU 0x20(SI), Y1
	VMOVDQU 0x40(SI), Y2
	VMOVDQU 0x60(SI), Y3
	ADDQ AX, SI
	VMOVDQA Y0, (DI)
	VMOVDQA Y1, 0x20(DI)
	VMOVDQA Y2, 0x40(DI)
	VMOVDQA Y3, 0x60(DI)
	ADDQ AX, DI
	SUBQ AX, BX
	JA gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ AX, BX
	ADDQ DI, BX
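	// BX now holds the address one past the end of the destination; the saved 128-byte tail is stored relative to it below.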
	VMOVDQU Y4, (R10)
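	// VZEROUPPER clears the upper halves of the YMM registers to avoid AVX-SSE
	// transition penalties in the 128-bit MOVOU stores that follow.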
	VZEROUPPER
	MOVOU X5, -0x80(BX)
	MOVOU X6, -0x70(BX)
	MOVOU X7, -0x60(BX)
	MOVOU X8, -0x50(BX)
	MOVOU X9, -0x40(BX)
	MOVOU X10, -0x30(BX)
	MOVOU X11, -0x20(BX)
	MOVOU X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions, using non-temporal move instructions.
	// The details of this algorithm are commented above for small sizes.
	LEAQ (SI)(BX*1), CX
	MOVOU -0x80(SI)(BX*1), X5
	MOVOU -0x70(CX), X6
	MOVOU -0x60(CX), X7
	MOVOU -0x50(CX), X8
	MOVOU -0x40(CX), X9
	MOVOU -0x30(CX), X10
	MOVOU -0x20(CX), X11
	MOVOU -0x10(CX), X12
	VMOVDQU (SI), Y4
	MOVQ DI, R8
	ANDQ $-32, DI
	ADDQ $32, DI
	MOVQ DI, R10
	SUBQ R8, R10
	SUBQ R10, BX
	ADDQ R10, SI
	LEAQ (DI)(BX*1), CX
	SUBQ $0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically; the prefetch usage follows section 9.5.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU (SI), Y0
	VMOVDQU 0x20(SI), Y1
	VMOVDQU 0x40(SI), Y2
	VMOVDQU 0x60(SI), Y3
	ADDQ $0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ $0x80, DI
	SUBQ $0x80, BX
	JA gobble_mem_fwd_loop
	// Non-temporal stores don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data globally visible in time.
	SFENCE
	VMOVDQU Y4, (R8)
	VZEROUPPER
	MOVOU X5, -0x80(CX)
	MOVOU X6, -0x70(CX)
	MOVOU X7, -0x60(CX)
	MOVOU X8, -0x50(CX)
	MOVOU X9, -0x40(CX)
	MOVOU X10, -0x30(CX)
	MOVOU X11, -0x20(CX)
	MOVOU X12, -0x10(CX)
	RET

copy_backward:
	MOVQ DI, AX
	// Backward copying is much the same as the forward case.
	// First we load the unaligned 128 bytes at the beginning of the region; they are stored last.
	MOVOU (SI), X5
	MOVOU 0x10(SI), X6
	ADDQ BX, DI
	MOVOU 0x20(SI), X7
	MOVOU 0x30(SI), X8
	LEAQ -0x20(DI), R10
	MOVQ DI, R11
	MOVOU 0x40(SI), X9
	MOVOU 0x50(SI), X10
	ANDQ $0x1F, R11
	MOVOU 0x60(SI), X11
	MOVOU 0x70(SI), X12
	XORQ R11, DI
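	// R11 holds the low 5 bits of DI, so this XOR rounds the destination end down to a 32-byte boundary.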
	// Point SI to the end of the region
	ADDQ BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU -0x20(SI), Y4
	SUBQ R11, SI
	SUBQ R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ BX, $0x100000
	JA gobble_big_data_bwd
	SUBQ $0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU -0x20(SI), Y0
	VMOVDQU -0x40(SI), Y1
	VMOVDQU -0x60(SI), Y2
	VMOVDQU -0x80(SI), Y3
	SUBQ $0x80, SI
	VMOVDQA Y0, -0x20(DI)
	VMOVDQA Y1, -0x40(DI)
	VMOVDQA Y2, -0x60(DI)
	VMOVDQA Y3, -0x80(DI)
	SUBQ $0x80, DI
	SUBQ $0x80, BX
	JA gobble_mem_bwd_loop
	// Let's store unaligned data
	VMOVDQU Y4, (R10)
	VZEROUPPER
	MOVOU X5, (AX)
	MOVOU X6, 0x10(AX)
	MOVOU X7, 0x20(AX)
	MOVOU X8, 0x30(AX)
	MOVOU X9, 0x40(AX)
	MOVOU X10, 0x50(AX)
	MOVOU X11, 0x60(AX)
	MOVOU X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ $0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU -0x20(SI), Y0
	VMOVDQU -0x40(SI), Y1
	VMOVDQU -0x60(SI), Y2
	VMOVDQU -0x80(SI), Y3
	SUBQ $0x80, SI
	VMOVNTDQ Y0, -0x20(DI)
	VMOVNTDQ Y1, -0x40(DI)
	VMOVNTDQ Y2, -0x60(DI)
	VMOVNTDQ Y3, -0x80(DI)
	SUBQ $0x80, DI
	SUBQ $0x80, BX
	JA gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU Y4, (R10)
	VZEROUPPER
	MOVOU X5, (AX)
	MOVOU X6, 0x10(AX)
	MOVOU X7, 0x20(AX)
	MOVOU X8, 0x30(AX)
	MOVOU X9, 0x40(AX)
	MOVOU X10, 0x50(AX)
	MOVOU X11, 0x60(AX)
	MOVOU X12, 0x70(AX)
	RET