github.com/sanprasirt/go@v0.0.0-20170607001320-a027466e4b6d/src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX
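	// Register use follows the MOVS string-instruction convention:
	// DI = destination, SI = source, BX = byte count (copied into CX
	// where REP MOVSB/MOVSQ needs it).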

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straight-line code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 and the smaller cases work whether or not the
	// source and destination regions overlap, because they load all data
	// into registers before writing any of it back. move_256through2048,
	// on the other hand, can be used only when the regions don't overlap
	// or the copy direction is forward (destination below source).
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch
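	// The chain above is a linear dispatch on n; roughly, in Go-like
	// pseudocode (illustrative only):
	//	switch {
	//	case n == 0:   return
	//	case n <= 2:   move_1or2
	//	case n <= 4:   move_3or4
	//	case n <= 7:   move_5through7
	//	case n == 8:   move_8
	//	case n <= 16:  move_9through16
	//	// ... doubling up to ...
	//	case n <= 256: move_129through256
	//	}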

	TESTB	$1, runtime·useAVXmemmove(SB)
	JNZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	CMPB	runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
	JNE	fwdBy8

	// Check alignment
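	// AX = SI|DI, so the TESTL below checks both pointers at once: use
	// REP MOVSB only when at least one of them is not 8-byte aligned
	// (ERMS makes byte-granular copies fast even when misaligned).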
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
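	// SHRQ leaves the qword count in CX for REP MOVSQ; ANDQ keeps the
	// 0..7 leftover bytes in BX for the tail dispatch.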
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD
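	// With the direction flag set, MOVSQ walks downward through both
	// buffers; CLD below restores the flag (Go code assumes DF=0).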

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

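	// The backward qword loop copied everything but the low BX (0..7)
	// bytes of the region; rewind SI/DI to its start and let the forward
	// tail code finish (the small cases are overlap-safe).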
	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
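	// Load both the first and the last byte before storing either, so
	// overlapping regions are handled; for n==1 the two bytes coincide.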
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	CMPQ	BX, $4
	JB	move_3
	MOVL	(SI), AX
	MOVL	AX, (DI)
	RET
move_3:
	MOVW	(SI), AX
	MOVB	2(SI), CX
	MOVW	AX, (DI)
	MOVB	CX, 2(DI)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
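	// Two possibly overlapping 8-byte moves cover any length in 9..16;
	// middle bytes may be written twice, which is harmless because both
	// loads happen before either store. The wider cases below use the
	// same first/last trick with 16-byte MOVOUs.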
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
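	// BX is pre-decremented by 256, so the CMPQ below tests whether a
	// full 256-byte block remains. LEAQ does not modify flags, so the
	// CMPQ result survives the pointer advances and feeds JGE directly.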
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm:
	// one for non-overlapping memory regions, which copies forward,
	// and one for overlapping regions, which copies backward.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward
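	// CX = DI-SI is computed modulo 2^64, so when DI < SI it is a huge
	// unsigned value and the comparison falls through: forward copying
	// is safe whenever the destination starts below the source.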

	// Non-temporal copy would be better for big sizes.
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copy of the body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// Handling the unaligned parts efficiently matters for small sizes,
	// since their cost is comparable with that of the main loop, so the
	// setup code below is slightly interleaved. A cleaner implementation
	// of the same algorithm, where the cost of the unaligned parts is
	// negligible, follows the gobble_big_data_fwd label.
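	// Illustrative example (hypothetical addresses): with DI = 0x1005,
	// the aligned destination is (0x1005 & -32) + 32 = 0x1020, so the
	// R11 = 0x1b head bytes are covered by the unaligned Y4 store.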
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly. We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align destination address
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value, since the unaligned head is handled separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body copy.
	// Now for the unaligned head.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned copy of the body follows.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
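	// BX now points one past the end of the destination; the 128-byte
	// tail saved in X5..X12 is stored relative to it below.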
	VMOVDQU	Y4, (R10)
	VZEROUPPER
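	// VZEROUPPER avoids AVX-to-SSE transition penalties on the MOVOU
	// stores below and in the SSE code we return to.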
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// Forward copying for big regions.
	// It uses non-temporal mov instructions.
	// Details of this algorithm are commented above for small sizes.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// The prefetch distances were chosen empirically.
	// The approach to prefetch usage follows section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ Y0, (DI)
	VMOVNTDQ Y1, 0x20(DI)
	VMOVNTDQ Y2, 0x40(DI)
	VMOVNTDQ Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT stores don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data visible in time.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is much the same as the forward one.
	// First we save the unaligned 128 bytes at the start of the region;
	// they are stored last.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
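	// R11 = DI & 0x1F, so the XOR clears the low five bits: DI is now
	// rounded down to a 32-byte boundary.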
	// Point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Store the unaligned parts.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET