github.com/4ad/go@v0.0.0-20161219182952-69a12818b605/src/runtime/memmove_amd64.s

// Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
// http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
//
//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
//         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
//         Portions Copyright 2009 The Go Authors. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// +build !plan9

#include "textflag.h"

// void runtime·memmove(void*, void*, uintptr)
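// (The Go-side prototype is func memmove(to, from unsafe.Pointer, n uintptr);
// like C memmove, it must handle overlapping to/from regions correctly.)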
TEXT runtime·memmove(SB), NOSPLIT, $0-24

	MOVQ	to+0(FP), DI
	MOVQ	from+8(FP), SI
	MOVQ	n+16(FP), BX
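	// Register use throughout: DI = to, SI = from, BX = byte count.
	// BX shrinks as blocks are copied and any remainder is re-dispatched
	// through tail below.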

	// REP instructions have a high startup cost, so we handle small sizes
	// with some straightline code. The REP MOVSQ instruction is really fast
	// for large sizes. The cutover is approximately 2K.
tail:
	// move_129through256 or smaller work whether or not the source and the
	// destination memory regions overlap because they load all data into
	// registers before writing it back.  move_256through2048 on the other
	// hand can be used only when the memory regions don't overlap or the copy
	// direction is forward.
	TESTQ	BX, BX
	JEQ	move_0
	CMPQ	BX, $2
	JBE	move_1or2
	CMPQ	BX, $4
	JBE	move_3or4
	CMPQ	BX, $8
	JB	move_5through7
	JE	move_8
	CMPQ	BX, $16
	JBE	move_9through16
	CMPQ	BX, $32
	JBE	move_17through32
	CMPQ	BX, $64
	JBE	move_33through64
	CMPQ	BX, $128
	JBE	move_65through128
	CMPQ	BX, $256
	JBE	move_129through256
	// TODO: use branch table and BSR to make this just a single dispatch

/*
 * check and set for backwards
 */
	CMPQ	SI, DI
	JLS	back
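	// from <= to (unsigned compare): a forward copy might overwrite source
	// bytes before they are read, so take the backward path, which first
	// checks whether the regions actually overlap.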

/*
 * forward copy loop
 */
forward:
	CMPQ	BX, $2048
	JLS	move_256through2048
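	// For copies up to 2048 bytes the unrolled SSE loop at
	// move_256through2048 beats the REP startup cost; beyond that, fall
	// through to the REP MOVS variants.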

	// If REP MOVSB isn't fast, don't use it
	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
	JEQ	fwdBy8

	// Check alignment
	MOVL	SI, AX
	ORL	DI, AX
	TESTL	$7, AX
	JEQ	fwdBy8
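	// Both pointers 8-byte aligned: take the MOVSQ path even with ERMS.
	// Otherwise fall through to REP MOVSB, which ERMS makes fast regardless
	// of alignment.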

	// Do 1 byte at a time
	MOVQ	BX, CX
	REP;	MOVSB
	RET

fwdBy8:
	// Do 8 bytes at a time
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX
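	// CX = number of 8-byte words to move; BX = 0..7 leftover bytes,
	// finished off by jumping back to tail.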
	REP;	MOVSQ
	JMP	tail

back:
/*
 * check overlap
 */
	MOVQ	SI, CX
	ADDQ	BX, CX
	CMPQ	CX, DI
	JLS	forward
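	// from+n <= to (unsigned): the regions do not actually overlap, so the
	// faster forward loop is safe after all.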

/*
 * whole thing backwards has
 * adjusted addresses
 */
	ADDQ	BX, DI
	ADDQ	BX, SI
	STD
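	// STD sets the direction flag so REP MOVSQ walks SI/DI downward;
	// CLD below restores the usual upward direction before returning.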

/*
 * copy
 */
	MOVQ	BX, CX
	SHRQ	$3, CX
	ANDQ	$7, BX

	SUBQ	$8, DI
	SUBQ	$8, SI
	REP;	MOVSQ

	CLD
	ADDQ	$8, DI
	ADDQ	$8, SI
	SUBQ	BX, DI
	SUBQ	BX, SI
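	// SI and DI now point at the start of their regions again; the remaining
	// 0..7 low bytes are copied by tail, whose small cases are overlap-safe.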
	JMP	tail

move_1or2:
	MOVB	(SI), AX
	MOVB	-1(SI)(BX*1), CX
	MOVB	AX, (DI)
	MOVB	CX, -1(DI)(BX*1)
	RET
move_0:
	RET
move_3or4:
	MOVW	(SI), AX
	MOVW	-2(SI)(BX*1), CX
	MOVW	AX, (DI)
	MOVW	CX, -2(DI)(BX*1)
	RET
move_5through7:
	MOVL	(SI), AX
	MOVL	-4(SI)(BX*1), CX
	MOVL	AX, (DI)
	MOVL	CX, -4(DI)(BX*1)
	RET
move_8:
	// We need a separate case for 8 to make sure we write pointers atomically.
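	// (A pointer written as two halves could be observed mid-update by the
	// garbage collector or another goroutine.)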
	MOVQ	(SI), AX
	MOVQ	AX, (DI)
	RET
move_9through16:
	MOVQ	(SI), AX
	MOVQ	-8(SI)(BX*1), CX
	MOVQ	AX, (DI)
	MOVQ	CX, -8(DI)(BX*1)
	RET
move_17through32:
	MOVOU	(SI), X0
	MOVOU	-16(SI)(BX*1), X1
	MOVOU	X0, (DI)
	MOVOU	X1, -16(DI)(BX*1)
	RET
move_33through64:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	-32(SI)(BX*1), X2
	MOVOU	-16(SI)(BX*1), X3
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, -32(DI)(BX*1)
	MOVOU	X3, -16(DI)(BX*1)
	RET
move_65through128:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	-64(SI)(BX*1), X4
	MOVOU	-48(SI)(BX*1), X5
	MOVOU	-32(SI)(BX*1), X6
	MOVOU	-16(SI)(BX*1), X7
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, -64(DI)(BX*1)
	MOVOU	X5, -48(DI)(BX*1)
	MOVOU	X6, -32(DI)(BX*1)
	MOVOU	X7, -16(DI)(BX*1)
	RET
move_129through256:
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	-128(SI)(BX*1), X8
	MOVOU	-112(SI)(BX*1), X9
	MOVOU	-96(SI)(BX*1), X10
	MOVOU	-80(SI)(BX*1), X11
	MOVOU	-64(SI)(BX*1), X12
	MOVOU	-48(SI)(BX*1), X13
	MOVOU	-32(SI)(BX*1), X14
	MOVOU	-16(SI)(BX*1), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, -128(DI)(BX*1)
	MOVOU	X9, -112(DI)(BX*1)
	MOVOU	X10, -96(DI)(BX*1)
	MOVOU	X11, -80(DI)(BX*1)
	MOVOU	X12, -64(DI)(BX*1)
	MOVOU	X13, -48(DI)(BX*1)
	MOVOU	X14, -32(DI)(BX*1)
	MOVOU	X15, -16(DI)(BX*1)
	RET
move_256through2048:
	SUBQ	$256, BX
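	// Copy 256 bytes per iteration using all sixteen X registers; keep
	// looping while at least 256 bytes remain, then finish the rest via tail.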
	MOVOU	(SI), X0
	MOVOU	16(SI), X1
	MOVOU	32(SI), X2
	MOVOU	48(SI), X3
	MOVOU	64(SI), X4
	MOVOU	80(SI), X5
	MOVOU	96(SI), X6
	MOVOU	112(SI), X7
	MOVOU	128(SI), X8
	MOVOU	144(SI), X9
	MOVOU	160(SI), X10
	MOVOU	176(SI), X11
	MOVOU	192(SI), X12
	MOVOU	208(SI), X13
	MOVOU	224(SI), X14
	MOVOU	240(SI), X15
	MOVOU	X0, (DI)
	MOVOU	X1, 16(DI)
	MOVOU	X2, 32(DI)
	MOVOU	X3, 48(DI)
	MOVOU	X4, 64(DI)
	MOVOU	X5, 80(DI)
	MOVOU	X6, 96(DI)
	MOVOU	X7, 112(DI)
	MOVOU	X8, 128(DI)
	MOVOU	X9, 144(DI)
	MOVOU	X10, 160(DI)
	MOVOU	X11, 176(DI)
	MOVOU	X12, 192(DI)
	MOVOU	X13, 208(DI)
	MOVOU	X14, 224(DI)
	MOVOU	X15, 240(DI)
	CMPQ	BX, $256
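	// LEAQ does not modify the flags, so the CMPQ result above is still what
	// JGE tests after both pointers are advanced.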
	LEAQ	256(SI), SI
	LEAQ	256(DI), DI
	JGE	move_256through2048
	JMP	tail