github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
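	// The compare/branch chain below dispatches on n (in BX): the sizes
	// 0, 1-2, 3, 4, 5-7, 8, 9-16, 17-32, 33-64, 65-128, and 129-256 each
	// have a dedicated straightline block; larger sizes fall through to
	// the AVX or REP-based paths.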
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
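	// A single MOVQ of an aligned 8-byte word is atomic on amd64, so a
	// pointer value is never observed half-written (see the memmove doc
	// constraints referenced above).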
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it uses forward copying.
	// The second one is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
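	// (0x100000 bytes = 1 MiB; sizes at or above this threshold take the
	// non-temporal gobble_big_data_fwd path.)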
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// For small sizes it can be important to satisfy the processor's pipeline
	// requirements, since the cost of copying the unaligned head and tail is
	// comparable with the cost of the main loop, so the code below is slightly
	// interleaved. There is a cleaner implementation of this algorithm for
	// bigger sizes, where the cost of copying the unaligned parts is
	// negligible; see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly; we use negative offsets from here on.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value now that the unaligned head has been accounted for.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in place after the main body is copied.
	// It's time for the unaligned head part.
	VMOVDQU	(SI), Y4
	// Adjust source address to point past head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// This is the forward copy for big regions.
	// It uses non-temporal mov instructions.
	// Details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA	0x1C0(SI)
	PREFETCHNTA	0x280(SI)
	// Prefetch values were chosen empirically.
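	// (0x1C0 and 0x280 are 448 and 640 bytes ahead of the current read position.)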
	// Approach for prefetch usage as in 7.6.6 of [1]
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need SFENCE here so the copied data becomes visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we load the unaligned tail at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA	-0x1C0(SI)
	PREFETCHNTA	-0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET