github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "go_asm.h"
#include "textflag.h"

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

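	// Register use throughout: DI = to, SI = from, BX = n (byte count).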
	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	//
	// BSR+branch table make almost all memmove/memclr benchmarks worse. Not worth doing.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JB	move_3
	JBE	move_4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256

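	// More than 256 bytes: use the AVX path when the runtime has enabled it.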
	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * if the source is at or below the destination, a forward copy
 * could overwrite source bytes that have not been read yet, so
 * check whether we have to go backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * the regions overlap: copy the whole thing backwards,
 * working from the adjusted (end) addresses; STD makes
 * the string moves descend
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

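	// Point DI and SI at the last quadword of each region.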
	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

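	// Restore the forward direction and rewind DI/SI to the start of the
	// regions; tail copies the remaining BX (< 8) bytes.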
	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_4:
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
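	// Copy forward in 256-byte blocks; the remaining BX (< 256) bytes
	// are finished by the tail code.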
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
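	// The comparison is done here, before the LEAQs (which leave the
	// flags untouched), so its result is still valid for the JGE below.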
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two variants of the copy algorithm: one for non-overlapping
	// memory regions, which copies forward, and one for overlapping regions,
	// which copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// CX now holds the (unsigned) distance between the destination and the source.
	CMPQ	CX, BX
	// If the distance is smaller than the region length, the regions overlap.
	JC	copy_backward

	// A non-temporal copy is better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copy of the body (128 bytes per iteration)
	// 4. Store the head in its new place
	// 5. Store the tail in its new place
	// For small sizes it matters that the processor's pipeline is kept busy,
	// since the cost of copying the unaligned head and tail is comparable to
	// the cost of the main loop, so the steps below are interleaved.
	// A cleaner implementation of the same algorithm, where the cost of the
	// unaligned parts is negligible, follows the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so the tail is addressed with negative offsets.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue saving the tail.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue saving the tail.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Reduce the byte count, since the unaligned head is copied separately.
	SUBQ	R11, BX
	// Continue saving the tail.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail is stored after the main body has been copied.
	// Now save the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Destination-aligned copy of the body:
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copy for big regions, using non-temporal move instructions.
	// The algorithm is the same as the small-size path commented above.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
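	// Bias the count by 128: the final 128 bytes were already saved into
	// X5..X12 above and are stored after the loop.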
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically.
	// Prefetch usage follows the approach in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// https://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA		gobble_mem_fwd_loop
	// Non-temporal stores don't follow the normal cache-coherency rules,
	// so an SFENCE is needed to make the copied data visible in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying mirrors the forward case.
	// First save the unaligned 128 bytes at the beginning of the region
	// (they are stored last, after the backward loop).
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
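	// Align DI (which now points to the end of the destination) down to a
	// 32-byte boundary; R11 holds the low bits that are cleared.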
	XORQ	R11, DI
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned 32-byte chunk at the end into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA		gobble_big_data_bwd
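	// Bias the count by 128: the first 128 bytes were already saved into
	// X5..X12 above and are stored after the loop.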
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA		gobble_mem_bwd_loop
	// Let's store unaligned data
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
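	// Non-temporal variant of the backward loop for very large copies;
	// the SFENCE below orders the non-temporal stores before the final
	// unaligned stores.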
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET