github.com/panjjo/go@v0.0.0-20161104043856-d62b31386338/src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
//
// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved.
// Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back. move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
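	// For example, n == 100 dispatches to move_65through128 below, which issues
	// all eight 16-byte loads (covering bytes 0..63 and 36..99) before the first
	// store, so overlapping source and destination regions are handled correctly.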
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

	TESTB	$1, runtime·useRepMovs(SB)
	JZ	avxUnaligned

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048

	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
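	// (A single MOVQ guarantees that a concurrently running goroutine or the
	// garbage collector never observes a half-written pointer, which splitting
	// the copy into two 4-byte stores could expose.)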
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail

avxUnaligned:
	// There are two implementations of the move algorithm.
	// The first one is for non-overlapping memory regions; it uses forward copying.
	// The second one is for overlapping regions; it uses backward copying.
	MOVQ	DI, CX
	SUBQ	SI, CX
	// Now CX contains the distance between SRC and DEST.
	CMPQ	CX, BX
	// If the distance is less than the region length, the regions overlap.
	JC	copy_backward

	// Non-temporal copy would be better for big sizes.
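	// (0x100000 is 1MB; copies at least that large presumably won't stay
	// cache-resident anyway, so the non-temporal path below bypasses the cache.)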
	CMPQ	BX, $0x100000
	JAE	gobble_big_data_fwd

	// Memory layout on the source side
	// SI                                       CX
	// |<---------BX before correction--------->|
	// |       |<--BX corrected-->|             |
	// |       |                  |<--- AX  --->|
	// |<-R11->|                  |<-128 bytes->|
	// +----------------------------------------+
	// | Head  | Body             | Tail        |
	// +-------+------------------+-------------+
	// ^       ^                  ^
	// |       |                  |
	// Save head into Y4          Save tail into X5..X12
	//         |
	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
	// Algorithm:
	// 1. Unaligned save of the tail's 128 bytes
	// 2. Unaligned save of the head's 32 bytes
	// 3. Destination-aligned copying of body (128 bytes per iteration)
	// 4. Put the head in its new place
	// 5. Put the tail in its new place
	// It can be important to satisfy the processor's pipeline requirements for
	// small sizes, as the cost of copying the unaligned regions is comparable
	// to the cost of the main loop, so the code is slightly messy here.
	// There is a cleaner implementation of this algorithm for bigger sizes,
	// where the cost of copying the unaligned parts is negligible.
	// You can see it after the gobble_big_data_fwd label.
	LEAQ	(SI)(BX*1), CX
	MOVQ	DI, R10
	// CX points to the end of the buffer, so we need to go back slightly.
	// We will use negative offsets there.
	MOVOU	-0x80(CX), X5
	MOVOU	-0x70(CX), X6
	MOVQ	$0x80, AX
	// Align the destination address.
	ANDQ	$-32, DI
	ADDQ	$32, DI
	// Continue tail saving.
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	// Make R11 the delta between the aligned and unaligned destination addresses.
	MOVQ	DI, R11
	SUBQ	R10, R11
	// Continue tail saving.
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	// Adjust the bytes-to-copy value to account for the unaligned part that
	// will be copied separately.
	SUBQ	R11, BX
	// Continue tail saving.
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	// The tail will be put in its place after the main body is copied.
	// It's time for the unaligned head part.
	VMOVDQU	(SI), Y4
	// Adjust the source address to point past the head.
	ADDQ	R11, SI
	SUBQ	AX, BX
	// Aligned memory copying here.
gobble_128_loop:
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	AX, SI
	VMOVDQA	Y0, (DI)
	VMOVDQA	Y1, 0x20(DI)
	VMOVDQA	Y2, 0x40(DI)
	VMOVDQA	Y3, 0x60(DI)
	ADDQ	AX, DI
	SUBQ	AX, BX
	JA	gobble_128_loop
	// Now we can store the unaligned parts.
	ADDQ	AX, BX
	ADDQ	DI, BX
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, -0x80(BX)
	MOVOU	X6, -0x70(BX)
	MOVOU	X7, -0x60(BX)
	MOVOU	X8, -0x50(BX)
	MOVOU	X9, -0x40(BX)
	MOVOU	X10, -0x30(BX)
	MOVOU	X11, -0x20(BX)
	MOVOU	X12, -0x10(BX)
	RET

gobble_big_data_fwd:
	// This is forward copying for big regions.
	// It uses non-temporal move instructions.
	// Details of this algorithm are in the comments for the small-size version above.
	LEAQ	(SI)(BX*1), CX
	MOVOU	-0x80(SI)(BX*1), X5
	MOVOU	-0x70(CX), X6
	MOVOU	-0x60(CX), X7
	MOVOU	-0x50(CX), X8
	MOVOU	-0x40(CX), X9
	MOVOU	-0x30(CX), X10
	MOVOU	-0x20(CX), X11
	MOVOU	-0x10(CX), X12
	VMOVDQU	(SI), Y4
	MOVQ	DI, R8
	ANDQ	$-32, DI
	ADDQ	$32, DI
	MOVQ	DI, R10
	SUBQ	R8, R10
	SUBQ	R10, BX
	ADDQ	R10, SI
	LEAQ	(DI)(BX*1), CX
	SUBQ	$0x80, BX
gobble_mem_fwd_loop:
	PREFETCHNTA 0x1C0(SI)
	PREFETCHNTA 0x280(SI)
	// Prefetch values were chosen empirically.
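	// (0x1C0 and 0x280 are 448 and 640 bytes, i.e. 3.5 and 5 iterations of this
	// 128-byte loop ahead of the loads issued below.)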
	// The approach to prefetch usage is as in section 7.6.6 of [1].
	// [1] 64-ia-32-architectures-optimization-manual.pdf
	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
	VMOVDQU	(SI), Y0
	VMOVDQU	0x20(SI), Y1
	VMOVDQU	0x40(SI), Y2
	VMOVDQU	0x60(SI), Y3
	ADDQ	$0x80, SI
	VMOVNTDQ	Y0, (DI)
	VMOVNTDQ	Y1, 0x20(DI)
	VMOVNTDQ	Y2, 0x40(DI)
	VMOVNTDQ	Y3, 0x60(DI)
	ADDQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_fwd_loop
	// NT instructions don't follow the normal cache-coherency rules.
	// We need an SFENCE here to make the copied data available in a timely manner.
	SFENCE
	VMOVDQU	Y4, (R8)
	VZEROUPPER
	MOVOU	X5, -0x80(CX)
	MOVOU	X6, -0x70(CX)
	MOVOU	X7, -0x60(CX)
	MOVOU	X8, -0x50(CX)
	MOVOU	X9, -0x40(CX)
	MOVOU	X10, -0x30(CX)
	MOVOU	X11, -0x20(CX)
	MOVOU	X12, -0x10(CX)
	RET

copy_backward:
	MOVQ	DI, AX
	// Backward copying is about the same as forward copying.
	// First we load the unaligned tail, at the beginning of the region.
	MOVOU	(SI), X5
	MOVOU	0x10(SI), X6
	ADDQ	BX, DI
	MOVOU	0x20(SI), X7
	MOVOU	0x30(SI), X8
	LEAQ	-0x20(DI), R10
	MOVQ	DI, R11
	MOVOU	0x40(SI), X9
	MOVOU	0x50(SI), X10
	ANDQ	$0x1F, R11
	MOVOU	0x60(SI), X11
	MOVOU	0x70(SI), X12
	XORQ	R11, DI
	// Let's point SI to the end of the region
	ADDQ	BX, SI
	// and load the unaligned head into Y4.
	VMOVDQU	-0x20(SI), Y4
	SUBQ	R11, SI
	SUBQ	R11, BX
	// If there is enough data for non-temporal moves, go to the special loop.
	CMPQ	BX, $0x100000
	JA	gobble_big_data_bwd
	SUBQ	$0x80, BX
gobble_mem_bwd_loop:
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVDQA	Y0, -0x20(DI)
	VMOVDQA	Y1, -0x40(DI)
	VMOVDQA	Y2, -0x60(DI)
	VMOVDQA	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_mem_bwd_loop
	// Let's store the unaligned data.
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET

gobble_big_data_bwd:
	SUBQ	$0x80, BX
gobble_big_mem_bwd_loop:
	PREFETCHNTA -0x1C0(SI)
	PREFETCHNTA -0x280(SI)
	VMOVDQU	-0x20(SI), Y0
	VMOVDQU	-0x40(SI), Y1
	VMOVDQU	-0x60(SI), Y2
	VMOVDQU	-0x80(SI), Y3
	SUBQ	$0x80, SI
	VMOVNTDQ	Y0, -0x20(DI)
	VMOVNTDQ	Y1, -0x40(DI)
	VMOVNTDQ	Y2, -0x60(DI)
	VMOVNTDQ	Y3, -0x80(DI)
	SUBQ	$0x80, DI
	SUBQ	$0x80, BX
	JA	gobble_big_mem_bwd_loop
	SFENCE
	VMOVDQU	Y4, (R10)
	VZEROUPPER
	MOVOU	X5, (AX)
	MOVOU	X6, 0x10(AX)
	MOVOU	X7, 0x20(AX)
	MOVOU	X8, 0x30(AX)
	MOVOU	X9, 0x40(AX)
	MOVOU	X10, 0x50(AX)
	MOVOU	X11, 0x60(AX)
	MOVOU	X12, 0x70(AX)
	RET
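
// Illustration only (not part of the original file): at the Go level, overlapping
// slice copies are ultimately served by runtime·memmove, so both copy directions
// handled above are observable from ordinary code, e.g.:
//
//	b := []byte("abcdefgh")
//	copy(b[2:], b[:6]) // overlapping; dst starts above src, so memmove copies backward
//	// b is now []byte("ababcdef")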