github.com/fjballest/golang@v0.0.0-20151209143359-e4c5fe594ca8/src/runtime/memmove_amd64.s (about)

     1  // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
     2  // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
     3  //
     4  //         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
     5  //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
     6  //         Portions Copyright 2009 The Go Authors. All rights reserved.
     7  //
     8  // Permission is hereby granted, free of charge, to any person obtaining a copy
     9  // of this software and associated documentation files (the "Software"), to deal
    10  // in the Software without restriction, including without limitation the rights
    11  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    12  // copies of the Software, and to permit persons to whom the Software is
    13  // furnished to do so, subject to the following conditions:
    14  //
    15  // The above copyright notice and this permission notice shall be included in
    16  // all copies or substantial portions of the Software.
    17  //
    18  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    19  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    20  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    21  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    22  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    23  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    24  // THE SOFTWARE.
    25  
    26  // +build !plan9
    27  
    28  #include "textflag.h"
    29  
    30  // void runtime·memmove(void*, void*, uintptr)
        //
        // Copies n bytes from "from" to "to".  The two regions may overlap; the
        // code below picks a copy strategy that reads every source byte before
        // it can be overwritten.
        //
        // Arguments (frame $0-24: no locals, 24 bytes of arguments):
        //	to+0(FP)	destination pointer
        //	from+8(FP)	source pointer
        //	n+16(FP)	byte count
        //
        // Register roles:
        //	DI	destination cursor
        //	SI	source cursor
        //	BX	bytes still to copy
        //	CX	REP MOVSQ count, otherwise scratch / data
        //	AX, X0-X15	data in flight
    31  TEXT runtime·memmove(SB), NOSPLIT, $0-24
    32  
    33  	MOVQ	to+0(FP), DI	// DI = to
    34  	MOVQ	from+8(FP), SI	// SI = from
    35  	MOVQ	n+16(FP), BX	// BX = n
    36  
    37  	// REP instructions have a high startup cost, so we handle small sizes
    38  	// with some straightline code.  The REP MOVSQ instruction is really fast
    39  	// for large sizes.  The cutover is approximately 2K.
    40  tail:
        	// Size dispatch on BX.  This label is also re-entered by the bulk copy
        	// paths below to finish whatever remainder they leave in BX.
        	//
    41  	// move_129through256 or smaller work whether or not the source and the
    42  	// destination memory regions overlap because they load all data into
    43  	// registers before writing it back.  move_256through2048 on the other
    44  	// hand can be used only when the memory regions don't overlap or the copy
    45  	// direction is forward.
    46  	TESTQ	BX, BX
    47  	JEQ	move_0
    48  	CMPQ	BX, $2
    49  	JBE	move_1or2
    50  	CMPQ	BX, $4
    51  	JBE	move_3or4
    52  	CMPQ	BX, $8
    53  	JB	move_5through7
    54  	JE	move_8	// reuses the flags of the CMPQ above: n == 8
    55  	CMPQ	BX, $16
    56  	JBE	move_9through16
    57  	CMPQ	BX, $32
    58  	JBE	move_17through32
    59  	CMPQ	BX, $64
    60  	JBE	move_33through64
    61  	CMPQ	BX, $128
    62  	JBE	move_65through128
    63  	CMPQ	BX, $256
    64  	JBE	move_129through256
    65  	// TODO: use branch table and BSR to make this just a single dispatch
    66  
        	// From here on BX > 256.
    67  /*
    68   * check and set for backwards
    69   */
    70  	CMPQ	SI, DI
    71  	JLS	back	// from <= to (unsigned): a backwards copy may be needed
    72  
    73  /*
    74   * forward copy loop
    75   */
    76  forward:
    77  	CMPQ	BX, $2048
    78  	JLS	move_256through2048	// n <= 2048: use the 256-byte block loop
    79  
    80  	MOVQ	BX, CX
    81  	SHRQ	$3, CX	// CX = number of whole quadwords (n / 8)
    82  	ANDQ	$7, BX	// BX = leftover bytes (n % 8)
    83  	REP;	MOVSQ	// copy CX quadwords; presumably DF is clear on entry, so SI/DI ascend
    84  	JMP	tail	// copy the leftover 0-7 bytes
    85  
    86  back:
    87  /*
    88   * check overlap
    89   */
    90  	MOVQ	SI, CX
    91  	ADDQ	BX, CX	// CX = from + n, one past the end of the source
    92  	CMPQ	CX, DI
    93  	JLS	forward	// source ends at or before "to": regions are disjoint
    94  	
    95  /*
    96   * the regions overlap with from <= to: copy the
    97   * whole thing backwards, starting from the end addresses
    98   */
    99  	ADDQ	BX, DI	// DI = one past the end of the destination
   100  	ADDQ	BX, SI	// SI = one past the end of the source
   101  	STD	// set DF: string instructions now decrement SI/DI
   102  
   103  /*
   104   * copy
   105   */
   106  	MOVQ	BX, CX
   107  	SHRQ	$3, CX	// CX = number of whole quadwords (n / 8)
   108  	ANDQ	$7, BX	// BX = leftover bytes (n % 8) at the low end
   109  
   110  	SUBQ	$8, DI	// point at the last quadword of each region
   111  	SUBQ	$8, SI
   112  	REP;	MOVSQ	// copy CX quadwords from high addresses to low
   113  
   114  	CLD	// clear DF again before any other code runs
   115  	ADDQ	$8, DI	// undo the -8 bias applied before the copy ...
   116  	ADDQ	$8, SI
   117  	SUBQ	BX, DI	// ... and step back over the leftover bytes, so SI/DI
   118  	SUBQ	BX, SI	// hold the original from/to with BX (0-7) bytes left
   119  	JMP	tail	// copy the leftover low bytes
   120  
        // Straight-line moves for n <= 256.  Each one loads a chunk anchored at
        // the first byte and a chunk ending at the last byte (the two chunks may
        // overlap each other), and performs every load before the first store.
   121  move_1or2:
        	// n == 1: both loads read the same byte.
   122  	MOVB	(SI), AX
   123  	MOVB	-1(SI)(BX*1), CX
   124  	MOVB	AX, (DI)
   125  	MOVB	CX, -1(DI)(BX*1)
   126  	RET
   127  move_0:
   128  	RET
   129  move_3or4:
        	// Two 16-bit words: first two bytes and last two bytes.
   130  	MOVW	(SI), AX
   131  	MOVW	-2(SI)(BX*1), CX
   132  	MOVW	AX, (DI)
   133  	MOVW	CX, -2(DI)(BX*1)
   134  	RET
   135  move_5through7:
        	// Two 32-bit words: first four bytes and last four bytes.
   136  	MOVL	(SI), AX
   137  	MOVL	-4(SI)(BX*1), CX
   138  	MOVL	AX, (DI)
   139  	MOVL	CX, -4(DI)(BX*1)
   140  	RET
   141  move_8:
   142  	// We need a separate case for 8 to make sure we write pointers atomically.
   143  	MOVQ	(SI), AX
   144  	MOVQ	AX, (DI)
   145  	RET
   146  move_9through16:
        	// Two quadwords: first eight bytes and last eight bytes.
   147  	MOVQ	(SI), AX
   148  	MOVQ	-8(SI)(BX*1), CX
   149  	MOVQ	AX, (DI)
   150  	MOVQ	CX, -8(DI)(BX*1)
   151  	RET
   152  move_17through32:
        	// Two unaligned 16-byte moves: first 16 and last 16 bytes.
   153  	MOVOU	(SI), X0
   154  	MOVOU	-16(SI)(BX*1), X1
   155  	MOVOU	X0, (DI)
   156  	MOVOU	X1, -16(DI)(BX*1)
   157  	RET
   158  move_33through64:
        	// First 32 bytes (X0-X1) and last 32 bytes (X2-X3).
   159  	MOVOU	(SI), X0
   160  	MOVOU	16(SI), X1
   161  	MOVOU	-32(SI)(BX*1), X2
   162  	MOVOU	-16(SI)(BX*1), X3
   163  	MOVOU	X0, (DI)
   164  	MOVOU	X1, 16(DI)
   165  	MOVOU	X2, -32(DI)(BX*1)
   166  	MOVOU	X3, -16(DI)(BX*1)
   167  	RET
   168  move_65through128:
        	// First 64 bytes (X0-X3) and last 64 bytes (X4-X7).
   169  	MOVOU	(SI), X0
   170  	MOVOU	16(SI), X1
   171  	MOVOU	32(SI), X2
   172  	MOVOU	48(SI), X3
   173  	MOVOU	-64(SI)(BX*1), X4
   174  	MOVOU	-48(SI)(BX*1), X5
   175  	MOVOU	-32(SI)(BX*1), X6
   176  	MOVOU	-16(SI)(BX*1), X7
   177  	MOVOU	X0, (DI)
   178  	MOVOU	X1, 16(DI)
   179  	MOVOU	X2, 32(DI)
   180  	MOVOU	X3, 48(DI)
   181  	MOVOU	X4, -64(DI)(BX*1)
   182  	MOVOU	X5, -48(DI)(BX*1)
   183  	MOVOU	X6, -32(DI)(BX*1)
   184  	MOVOU	X7, -16(DI)(BX*1)
   185  	RET
   186  move_129through256:
        	// First 128 bytes (X0-X7) and last 128 bytes (X8-X15).
   187  	MOVOU	(SI), X0
   188  	MOVOU	16(SI), X1
   189  	MOVOU	32(SI), X2
   190  	MOVOU	48(SI), X3
   191  	MOVOU	64(SI), X4
   192  	MOVOU	80(SI), X5
   193  	MOVOU	96(SI), X6
   194  	MOVOU	112(SI), X7
   195  	MOVOU	-128(SI)(BX*1), X8
   196  	MOVOU	-112(SI)(BX*1), X9
   197  	MOVOU	-96(SI)(BX*1), X10
   198  	MOVOU	-80(SI)(BX*1), X11
   199  	MOVOU	-64(SI)(BX*1), X12
   200  	MOVOU	-48(SI)(BX*1), X13
   201  	MOVOU	-32(SI)(BX*1), X14
   202  	MOVOU	-16(SI)(BX*1), X15
   203  	MOVOU	X0, (DI)
   204  	MOVOU	X1, 16(DI)
   205  	MOVOU	X2, 32(DI)
   206  	MOVOU	X3, 48(DI)
   207  	MOVOU	X4, 64(DI)
   208  	MOVOU	X5, 80(DI)
   209  	MOVOU	X6, 96(DI)
   210  	MOVOU	X7, 112(DI)
   211  	MOVOU	X8, -128(DI)(BX*1)
   212  	MOVOU	X9, -112(DI)(BX*1)
   213  	MOVOU	X10, -96(DI)(BX*1)
   214  	MOVOU	X11, -80(DI)(BX*1)
   215  	MOVOU	X12, -64(DI)(BX*1)
   216  	MOVOU	X13, -48(DI)(BX*1)
   217  	MOVOU	X14, -32(DI)(BX*1)
   218  	MOVOU	X15, -16(DI)(BX*1)
   219  	RET
   220  move_256through2048:
        	// Entered from "forward" with 256 < BX <= 2048, and from its own loop
        	// branch with BX >= 256.  Each pass copies one 256-byte block in
        	// ascending address order, loading the whole block into X0-X15 before
        	// storing any of it.
   221  	SUBQ	$256, BX	// account for this block up front
   222  	MOVOU	(SI), X0
   223  	MOVOU	16(SI), X1
   224  	MOVOU	32(SI), X2
   225  	MOVOU	48(SI), X3
   226  	MOVOU	64(SI), X4
   227  	MOVOU	80(SI), X5
   228  	MOVOU	96(SI), X6
   229  	MOVOU	112(SI), X7
   230  	MOVOU	128(SI), X8
   231  	MOVOU	144(SI), X9
   232  	MOVOU	160(SI), X10
   233  	MOVOU	176(SI), X11
   234  	MOVOU	192(SI), X12
   235  	MOVOU	208(SI), X13
   236  	MOVOU	224(SI), X14
   237  	MOVOU	240(SI), X15
   238  	MOVOU	X0, (DI)
   239  	MOVOU	X1, 16(DI)
   240  	MOVOU	X2, 32(DI)
   241  	MOVOU	X3, 48(DI)
   242  	MOVOU	X4, 64(DI)
   243  	MOVOU	X5, 80(DI)
   244  	MOVOU	X6, 96(DI)
   245  	MOVOU	X7, 112(DI)
   246  	MOVOU	X8, 128(DI)
   247  	MOVOU	X9, 144(DI)
   248  	MOVOU	X10, 160(DI)
   249  	MOVOU	X11, 176(DI)
   250  	MOVOU	X12, 192(DI)
   251  	MOVOU	X13, 208(DI)
   252  	MOVOU	X14, 224(DI)
   253  	MOVOU	X15, 240(DI)
   254  	CMPQ	BX, $256	// is another full block left?
   255  	LEAQ	256(SI), SI	// advance both cursors with LEAQ, which leaves the
   256  	LEAQ	256(DI), DI	// flags from the CMPQ above intact for the branch
   257  	JGE	move_256through2048	// signed test is safe: BX is small and non-negative here
   258  	JMP	tail	// fewer than 256 bytes remain