rsc.io/go@v0.0.0-20150416155037-e040fd465409/src/runtime/memmove_amd64.s (about)

     1  // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
     2  // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
     3  //
     4  //         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
     5  //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6  //         Portions Copyright 2009 The Go Authors. All rights reserved.
     7  //
     8  // Permission is hereby granted, free of charge, to any person obtaining a copy
     9  // of this software and associated documentation files (the "Software"), to deal
    10  // in the Software without restriction, including without limitation the rights
    11  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12  // copies of the Software, and to permit persons to whom the Software is
    13  // furnished to do so, subject to the following conditions:
    14  //
    15  // The above copyright notice and this permission notice shall be included in
    16  // all copies or substantial portions of the Software.
    17  //
    18  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24  // THE SOFTWARE.
    25  
    26  // +build !plan9
    27  
    28  #include "textflag.h"
    29  
    30  // void runtime·memmove(void*, void*, uintptr)
    31  //
    32  // memmove copies n bytes from "from" to "to".  The two regions may overlap.
    33  //
    34  // Frame $0-24: no local stack space, 24 bytes of arguments:
    35  //	to+0(FP)	destination address
    36  //	from+8(FP)	source address
    37  //	n+16(FP)	number of bytes to copy
    38  //
    39  // Register roles:
    40  //	DI	destination cursor
    41  //	SI	source cursor
    42  //	BX	bytes still to be copied
    43  //	CX	word count for REP MOVSQ; otherwise scratch
    44  //	AX, CX, X0-X15	data held between its load and its store
    45  //
    46  // A copy of exactly 8 bytes is performed by move_8 with one MOVQ load and
    47  // one MOVQ store.  A pointer-sized word must never be written as two
    48  // halves: a concurrent reader such as the garbage collector could
    49  // otherwise observe a pointer that is only partly updated.
    50  TEXT runtime·memmove(SB), NOSPLIT, $0-24
    51  
    52  	MOVQ	to+0(FP), DI
    53  	MOVQ	from+8(FP), SI
    54  	MOVQ	n+16(FP), BX
    55  
    56  	// REP instructions have a high startup cost, so we handle small sizes
    57  	// with some straightline code.  The REP MOVSQ instruction is really fast
    58  	// for large sizes.  The cutover is approximately 2K.
    59  tail:
    60  	// move_129through256 or smaller work whether or not the source and the
    61  	// destination memory regions overlap because they load all data into
    62  	// registers before writing it back.  move_256through2048 on the other
    63  	// hand can be used only when the memory regions don't overlap or the copy
    64  	// direction is forward.
    65  	//
    66  	// The bulk paths below jump back here with DI, SI and BX describing what
    67  	// is left to copy, so this dispatch also finishes their remainders.
    68  	TESTQ	BX, BX
    69  	JEQ	move_0
    70  	CMPQ	BX, $2
    71  	JBE	move_1or2
    72  	CMPQ	BX, $4
    73  	JBE	move_3or4
    74  	CMPQ	BX, $8
    75  	JB	move_5through7
    76  	JEQ	move_8		// exactly one 8-byte word: copy it indivisibly
    77  	CMPQ	BX, $16
    78  	JBE	move_9through16
    79  	CMPQ	BX, $32
    80  	JBE	move_17through32
    81  	CMPQ	BX, $64
    82  	JBE	move_33through64
    83  	CMPQ	BX, $128
    84  	JBE	move_65through128
    85  	CMPQ	BX, $256
    86  	JBE	move_129through256
    87  	// TODO: use branch table and BSR to make this just a single dispatch
    88  
    89  /*
    90   * check and set for backwards
    91   */
    92  	CMPQ	SI, DI
    93  	JLS	back		// from <= to (unsigned): must check for overlap first
    94  
    95  /*
    96   * forward copy loop
    97   */
    98  forward:
    99  	CMPQ	BX, $2048
   100  	JLS	move_256through2048
   101  
   102  	MOVQ	BX, CX
   103  	SHRQ	$3, CX		// CX = number of whole 8-byte words
   104  	ANDQ	$7, BX		// BX = 0..7 trailing bytes, finished by tail
   105  	REP;	MOVSQ
   106  	JMP	tail
   107  
   108  back:
   109  /*
   110   * check overlap
   111   */
   112  	MOVQ	SI, CX
   113  	ADDQ	BX, CX
   114  	CMPQ	CX, DI
   115  	JLS	forward		// from+n <= to: no overlap, copy forward
   116  
   117  /*
   118   * whole thing backwards has
   119   * adjusted addresses
   120   */
   121  	ADDQ	BX, DI		// DI = one past the end of the destination
   122  	ADDQ	BX, SI		// SI = one past the end of the source
   123  	STD			// string instructions now step downward
   124  
   125  /*
   126   * copy
   127   */
   128  	MOVQ	BX, CX
   129  	SHRQ	$3, CX		// CX = number of whole 8-byte words
   130  	ANDQ	$7, BX		// BX = 0..7 leading bytes, finished by tail
   131  
   132  	SUBQ	$8, DI		// point at the last 8-byte word of each region
   133  	SUBQ	$8, SI
   134  	REP;	MOVSQ
   135  
   136  	CLD			// restore the forward direction
   137  	ADDQ	$8, DI		// undo the -8 bias applied before the copy
   138  	ADDQ	$8, SI
   139  	SUBQ	BX, DI		// step back to the start of the region, where
   140  	SUBQ	BX, SI		// the BX uncopied bytes are
   141  	JMP	tail
   142  
   143  // Each fixed-size case below loads the leading and the trailing chunks of
   144  // the region (addressed from the start and from start+BX respectively)
   145  // into registers and only then stores them, so overlap is harmless.
   146  move_1or2:
   147  	MOVB	(SI), AX
   148  	MOVB	-1(SI)(BX*1), CX
   149  	MOVB	AX, (DI)
   150  	MOVB	CX, -1(DI)(BX*1)
   151  	RET
   152  move_0:
   153  	RET
   154  move_3or4:
   155  	MOVW	(SI), AX
   156  	MOVW	-2(SI)(BX*1), CX
   157  	MOVW	AX, (DI)
   158  	MOVW	CX, -2(DI)(BX*1)
   159  	RET
   160  move_5through7:
   161  	MOVL	(SI), AX
   162  	MOVL	-4(SI)(BX*1), CX
   163  	MOVL	AX, (DI)
   164  	MOVL	CX, -4(DI)(BX*1)
   165  	RET
   166  move_8:
   167  	// We need a separate case for 8 to make sure we write pointers atomically.
   168  	MOVQ	(SI), AX
   169  	MOVQ	AX, (DI)
   170  	RET
   171  move_9through16:
   172  	MOVQ	(SI), AX
   173  	MOVQ	-8(SI)(BX*1), CX
   174  	MOVQ	AX, (DI)
   175  	MOVQ	CX, -8(DI)(BX*1)
   176  	RET
   177  move_17through32:
   178  	MOVOU	(SI), X0
   179  	MOVOU	-16(SI)(BX*1), X1
   180  	MOVOU	X0, (DI)
   181  	MOVOU	X1, -16(DI)(BX*1)
   182  	RET
   183  move_33through64:
   184  	MOVOU	(SI), X0
   185  	MOVOU	16(SI), X1
   186  	MOVOU	-32(SI)(BX*1), X2
   187  	MOVOU	-16(SI)(BX*1), X3
   188  	MOVOU	X0, (DI)
   189  	MOVOU	X1, 16(DI)
   190  	MOVOU	X2, -32(DI)(BX*1)
   191  	MOVOU	X3, -16(DI)(BX*1)
   192  	RET
   193  move_65through128:
   194  	MOVOU	(SI), X0
   195  	MOVOU	16(SI), X1
   196  	MOVOU	32(SI), X2
   197  	MOVOU	48(SI), X3
   198  	MOVOU	-64(SI)(BX*1), X4
   199  	MOVOU	-48(SI)(BX*1), X5
   200  	MOVOU	-32(SI)(BX*1), X6
   201  	MOVOU	-16(SI)(BX*1), X7
   202  	MOVOU	X0, (DI)
   203  	MOVOU	X1, 16(DI)
   204  	MOVOU	X2, 32(DI)
   205  	MOVOU	X3, 48(DI)
   206  	MOVOU	X4, -64(DI)(BX*1)
   207  	MOVOU	X5, -48(DI)(BX*1)
   208  	MOVOU	X6, -32(DI)(BX*1)
   209  	MOVOU	X7, -16(DI)(BX*1)
   210  	RET
   211  move_129through256:
   212  	MOVOU	(SI), X0
   213  	MOVOU	16(SI), X1
   214  	MOVOU	32(SI), X2
   215  	MOVOU	48(SI), X3
   216  	MOVOU	64(SI), X4
   217  	MOVOU	80(SI), X5
   218  	MOVOU	96(SI), X6
   219  	MOVOU	112(SI), X7
   220  	MOVOU	-128(SI)(BX*1), X8
   221  	MOVOU	-112(SI)(BX*1), X9
   222  	MOVOU	-96(SI)(BX*1), X10
   223  	MOVOU	-80(SI)(BX*1), X11
   224  	MOVOU	-64(SI)(BX*1), X12
   225  	MOVOU	-48(SI)(BX*1), X13
   226  	MOVOU	-32(SI)(BX*1), X14
   227  	MOVOU	-16(SI)(BX*1), X15
   228  	MOVOU	X0, (DI)
   229  	MOVOU	X1, 16(DI)
   230  	MOVOU	X2, 32(DI)
   231  	MOVOU	X3, 48(DI)
   232  	MOVOU	X4, 64(DI)
   233  	MOVOU	X5, 80(DI)
   234  	MOVOU	X6, 96(DI)
   235  	MOVOU	X7, 112(DI)
   236  	MOVOU	X8, -128(DI)(BX*1)
   237  	MOVOU	X9, -112(DI)(BX*1)
   238  	MOVOU	X10, -96(DI)(BX*1)
   239  	MOVOU	X11, -80(DI)(BX*1)
   240  	MOVOU	X12, -64(DI)(BX*1)
   241  	MOVOU	X13, -48(DI)(BX*1)
   242  	MOVOU	X14, -32(DI)(BX*1)
   243  	MOVOU	X15, -16(DI)(BX*1)
   244  	RET
   245  move_256through2048:
   246  	// Copy 256 bytes per pass through X0-X15, in ascending address order.
   247  	// Reached only from forward; whatever is left afterwards goes to tail.
   248  	SUBQ	$256, BX
   249  	MOVOU	(SI), X0
   250  	MOVOU	16(SI), X1
   251  	MOVOU	32(SI), X2
   252  	MOVOU	48(SI), X3
   253  	MOVOU	64(SI), X4
   254  	MOVOU	80(SI), X5
   255  	MOVOU	96(SI), X6
   256  	MOVOU	112(SI), X7
   257  	MOVOU	128(SI), X8
   258  	MOVOU	144(SI), X9
   259  	MOVOU	160(SI), X10
   260  	MOVOU	176(SI), X11
   261  	MOVOU	192(SI), X12
   262  	MOVOU	208(SI), X13
   263  	MOVOU	224(SI), X14
   264  	MOVOU	240(SI), X15
   265  	MOVOU	X0, (DI)
   266  	MOVOU	X1, 16(DI)
   267  	MOVOU	X2, 32(DI)
   268  	MOVOU	X3, 48(DI)
   269  	MOVOU	X4, 64(DI)
   270  	MOVOU	X5, 80(DI)
   271  	MOVOU	X6, 96(DI)
   272  	MOVOU	X7, 112(DI)
   273  	MOVOU	X8, 128(DI)
   274  	MOVOU	X9, 144(DI)
   275  	MOVOU	X10, 160(DI)
   276  	MOVOU	X11, 176(DI)
   277  	MOVOU	X12, 192(DI)
   278  	MOVOU	X13, 208(DI)
   279  	MOVOU	X14, 224(DI)
   280  	MOVOU	X15, 240(DI)
   281  	CMPQ	BX, $256	// at least one more full 256-byte pass left?
   282  	LEAQ	256(SI), SI	// LEAQ advances the cursors without disturbing
   283  	LEAQ	256(DI), DI	// the flags that CMPQ just set for JGE
   284  	JGE	move_256through2048
   285  	JMP	tail