github.com/x04/go/src@v0.0.0-20200202162449-3d481ceb3525/runtime/memmove_arm64.s (about)

     1  // Copyright 2014 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // See memmove Go doc for important implementation constraints.
     8  
     9  // func memmove(to, from unsafe.Pointer, n uintptr)
        //
        // Copies n bytes from `from` to `to`. The copy direction is picked from the
        // relative order of the two pointers, and the small-copy path issues all of
        // its loads before any store, so overlapping ranges are copied correctly.
        //
        // Dispatch:
        //   n == 0        return immediately
        //   1 <= n <= 16  copy16: at most two (possibly overlapping) loads, then stores
        //   n > 16        forward copy when from >= to, backward copy when from < to
        //
        // Register roles once past the size check:
        //   R3  destination cursor        R4  source cursor
        //   R5  n                         R6  n & 31 (bytes outside the 32-byte loops)
        //   R7  n &^ 31 (bytes moved by the 32-byte loops)
        //   R9  end-of-range pointer used to terminate the byte/backward loops
        //   R8, R10, R11, R12  data scratch (R12 also holds n & 7 on the backward path)
        //
        // Addressing suffixes: .P is post-index (access, then advance the base
        // register); .W is pre-index (adjust the base register, then access).
    10  TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
    11  	MOVD	to+0(FP), R3	// R3 = destination pointer
    12  	MOVD	from+8(FP), R4	// R4 = source pointer
    13  	MOVD	n+16(FP), R5	// R5 = byte count
    14  	CBNZ	R5, check	// only continue when n != 0
    15  	RET
    16  
    17  check:
    18  	CMP	$16, R5
    19  	BLE	copy16	// n <= 16. NOTE(review): BLE is a signed test on an unsigned count; confirm n < 1<<63 always holds.
    20  
    21  	AND	$~31, R5, R7	// R7 is N&~31
    22  	SUB	R7, R5, R6	// R6 is N&31
    23  
    24  	CMP	R3, R4
    25  	BLT	backward	// from < to. NOTE(review): signed test on addresses; same as unsigned only while both are < 1<<63 - confirm.
    26  
    27  	// Copying forward proceeds by copying R7 bytes in 32-byte chunks (R7/32
    28  	// iterations), then R6 <= 31 tail bytes. R3 and R4 are advanced as we copy.
    29  
    30  	// (There may be implementations of armv8 where copying by bytes until
    31  	// at least one of source or dest is word aligned is a worthwhile
    32  	// optimization, but on the one tested so far (xgene) it did not
    33  	// make a significant difference.)
    34  
    35  	CBZ	R7, noforwardlarge	// Skip the 32-byte loop when n < 32.
    36  
        	// NOTE(review): R9 is not read by forwardlargeloop (which counts R7 down to
        	// zero) and is rewritten in forwardtail before its next use, so this ADD
        	// appears to be dead.
    37  	ADD	R3, R7, R9	// R9 points just past where we copy by word
    38  
    39  forwardlargeloop:
    40  	// Copy 32 bytes at a time.
    41  	LDP.P	32(R4), (R8, R10)	// load bytes 0..15 of the chunk, then R4 += 32
    42  	STP.P	(R8, R10), 32(R3)	// store bytes 0..15 of the chunk, then R3 += 32
    43  	LDP	-16(R4), (R11, R12)	// load bytes 16..31 of the chunk (R4 already advanced)
    44  	STP	(R11, R12), -16(R3)	// store bytes 16..31 of the chunk
    45  	SUB 	$32, R7, R7	// R7 = bytes still to be moved by this loop
    46  	CBNZ	R7, forwardlargeloop
    47  
    48  noforwardlarge:
    49  	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
    50  	RET
    51  
    52  forwardtail:
    53  	// There are R6 <= 31 bytes remaining to copy.
    54  	// This is large enough to still contain pointers,
    55  	// which must be copied atomically.
    56  	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
        	// Each TBZ ..., 3(PC) below jumps over the two instructions that follow it.
    57  	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
    58  	LDP.P	16(R4), (R8, R10)
    59  	STP.P	(R8, R10), 16(R3)
    60  
    61  	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
    62  	MOVD.P	8(R4), R8
    63  	MOVD.P	R8, 8(R3)
    64  
    65  	AND	$7, R6	// R6 = number of single bytes left (0..7)
    66  	CBNZ	R6, 2(PC)	// jump over the RET when bytes remain
    67  	RET
    68  
    69  	ADD	R3, R6, R9	// R9 points just past the destination memory
    70  
    71  forwardtailloop:
        	// Copy one byte per iteration until R3 reaches R9.
    72  	MOVBU.P 1(R4), R8
    73  	MOVBU.P	R8, 1(R3)
    74  	CMP	R3, R9
    75  	BNE	forwardtailloop
    76  	RET
    77  
    78  	// Small copies: 1..16 bytes.
        	// Each size class reads the first and the last k bytes of the range (k = 8,
        	// 4 or 2); the two accesses overlap whenever n < 2k. Both loads are issued
        	// before either store.
    79  copy16:
    80  	ADD	R4, R5, R8	// R8 points just past the last source byte
    81  	ADD	R3, R5, R9	// R9 points just past the last destination byte
    82  	CMP	$8, R5
    83  	BLT	copy7	// n < 8
    84  	MOVD	(R4), R6	// first 8 source bytes
    85  	MOVD	-8(R8), R7	// last 8 source bytes
    86  	MOVD	R6, (R3)
    87  	MOVD	R7, -8(R9)
    88  	RET
    89  
    90  copy7:
        	// n < 8 here; bit 2 of n set means 4 <= n <= 7.
    91  	TBZ	$2, R5, copy3
    92  	MOVWU	(R4), R6	// first 4 source bytes
    93  	MOVWU	-4(R8), R7	// last 4 source bytes
    94  	MOVW	R6, (R3)
    95  	MOVW	R7, -4(R9)
    96  	RET
    97  
    98  copy3:
        	// 1 <= n <= 3 here; bit 1 of n set means n is 2 or 3.
    99  	TBZ	$1, R5, copy1
   100  	MOVHU	(R4), R6	// first 2 source bytes
   101  	MOVHU	-2(R8), R7	// last 2 source bytes
   102  	MOVH	R6, (R3)
   103  	MOVH	R7, -2(R9)
   104  	RET
   105  
   106  copy1:
        	// n == 1: a single byte.
   107  	MOVBU	(R4), R6
   108  	MOVB	R6, (R3)
   109  	RET
   110  
   111  backward:
   112  	// Copying backwards first copies R6 <= 31 tail bytes, then R7 bytes in
   113  	// 32-byte chunks. R3 and R4 are advanced to the end of the destination/source
   114  	// buffers respectively and moved back as we copy.
   115  
   116  	ADD	R4, R5, R4	// R4 points just past the last source byte
   117  	ADD	R3, R5, R3	// R3 points just past the last destination byte
   118  
   119  	CBZ	R6, nobackwardtail	// Skip all tail handling when n is a multiple of 32.
   120  
   121  	AND	$7, R6, R12	// R12 = number of single bytes in the tail (0..7)
   122  	CBZ	R12, backwardtaillarge
   123  
   124  	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
   125  backwardtailloop:
   126  	// Copy sub-pointer-size tail.
   127  	MOVBU.W	-1(R4), R8	// R4 -= 1, then load one byte
   128  	MOVBU.W	R8, -1(R3)	// R3 -= 1, then store one byte
   129  	CMP	R9, R3
   130  	BNE	backwardtailloop
   131  
   132  backwardtaillarge:
   133  	// Do 8/16-byte write if possible.
   134  	// See comment at forwardtail.
   135  	TBZ	$3, R6, 3(PC)	// copy 8 bytes if R6&8 != 0
   136  	MOVD.W	-8(R4), R8
   137  	MOVD.W	R8, -8(R3)
   138  
   139  	TBZ	$4, R6, 3(PC)	// copy 16 bytes if R6&16 != 0
   140  	LDP.W	-16(R4), (R8, R10)
   141  	STP.W	(R8, R10), -16(R3)
   142  
   143  nobackwardtail:
   144  	CBNZ     R7, backwardlarge	// Is there a 32-byte-chunk portion (R7 != 0) left to copy?
   145  	RET
   146  
   147  backwardlarge:
   148  	SUB	R7, R3, R9	// R9 points at the lowest destination byte
   149  
   150  backwardlargeloop:
        	// Copy 32 bytes per iteration, upper half first, until R3 reaches R9.
   151  	LDP	-16(R4), (R8, R10)	// load the upper 16 bytes of the chunk
   152  	STP	(R8, R10), -16(R3)	// store the upper 16 bytes of the chunk
   153  	LDP.W	-32(R4), (R11, R12)	// R4 -= 32, then load the lower 16 bytes
   154  	STP.W	(R11, R12), -32(R3)	// R3 -= 32, then store the lower 16 bytes
   155  	CMP	R9, R3
   156  	BNE	backwardlargeloop
   157  	RET