// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	MOVD	to+0(FP), R0
	MOVD	from+8(FP), R1
	MOVD	n+16(FP), R2
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4   // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy more than 128 bytes.
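// For copies below 1024 bytes, R7 and R8 stay zero, so the realignment
// offset R14 computed below is zero and no pointer adjustment happens. For
// larger copies, the realignment offset comes from whichever stream should
// be aligned: e.g. when runtime·arm64UseAlignedLoads is 0, R7 holds dst, so
// R14 = dst&15; src, dst and count are all adjusted by R14, the unaligned
// head is covered by the initial 16-byte Load A/Store A pair, and every STP
// in loop64 then lands on a 16-byte-aligned address.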
copy_long:
	ADD	R1, R2, R4   // R4 points just past the last source byte
	ADD	R0, R2, R5   // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)     // Load A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load C
	LDP	48(R1), (R10, R11)   // Load D
	LDP.W	64(R1), (R12, R13)   // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)     // Store B
	LDP	16(R1), (R6, R7)     // Load B (next iteration)
	STP	(R8, R9), 32(R3)     // Store C
	LDP	32(R1), (R8, R9)     // Load C
	STP	(R10, R11), 48(R3)   // Store D
	LDP	48(R1), (R10, R11)   // Load D
	STP.W	(R12, R13), 64(R3)   // Store E
	LDP.W	64(R1), (R12, R13)   // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
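// Note on the backward path: backward_check branches to copy_long_backward
// only when 0 < dst-src < count (unsigned), i.e. when the destination starts
// inside the source range and forward stores would overwrite source bytes
// that have not been read yet. The backward loop walks down from srcend and
// dstend, and copy64_from_start mirrors copy64_from_end by finishing with
// the first 64 bytes of the buffer, so no special tail handling is needed
// at the low end.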