github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_arm64.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && arm64

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func Move(to, from unsafe.Pointer, n uintptr)
TEXT ·Move<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes.
	CMP	$16, R2
	BLE	copy16

	// Large copies.
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

// Copy more than 128 bytes.
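// Overlap is detected by computing dst-src as an unsigned value: when it is
// smaller than the count, a forward copy would overwrite source bytes that
// have not been read yet, so the copy runs backward from the end instead
// (copy_long_backward). For copies of at least 1024 bytes, either the loads
// or the stores are made 16-byte aligned, chosen via arm64UseAlignedLoads
// from CPU feature detection; for shorter large copies the realignment
// offset computed below is zero, so src and dst are left as-is.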
copy_long:
	ADD	R1, R2, R4 // R4 points just past the last source byte
	ADD	R0, R2, R5 // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.

backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
	LDP	(R1), (R12, R13)   // Load A
	AND	$15, R7, R14       // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3        // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)   // Load B
	STP	(R12, R13), (R0)   // Store A
	LDP	32(R1), (R8, R9)   // Load C
	LDP	48(R1), (R10, R11) // Load D
	LDP.W	64(R1), (R12, R13) // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end.
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)   // Store B
	LDP	16(R1), (R6, R7)   // Load B (next iteration)
	STP	(R8, R9), 32(R3)   // Store C
	LDP	32(R1), (R8, R9)   // Load C
	STP	(R10, R11), 48(R3) // Store D
	LDP	48(R1), (R10, R11) // Load D
	STP.W	(R12, R13), 64(R3) // Store E
	LDP.W	64(R1), (R12, R13) // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15) // Load F
	STP	(R6, R7), 16(R3)    // Store B
	LDP	-48(R4), (R6, R7)   // Load G
	STP	(R8, R9), 32(R3)    // Store C
	LDP	-32(R4), (R8, R9)   // Load H
	STP	(R10, R11), 48(R3)  // Store D
	LDP	-16(R4), (R10, R11) // Load I
	STP	(R12, R13), 64(R3)  // Store E
	STP	(R14, R15), -64(R5) // Store F
	STP	(R6, R7), -48(R5)   // Store G
	STP	(R8, R9), -32(R5)   // Store H
	STP	(R10, R11), -16(R5) // Store I
	RET

// Large backward copy for overlapping copies.
// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
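
// The Go-level sketch below is illustrative only: it is not part of this
// package, the helper name "move" is hypothetical, and it omits the
// size-class splits, the 16-byte realignment, and the software pipelining.
// It shows only the direction decision that backward_check implements.
//
//	func move(dst, src unsafe.Pointer, n uintptr) {
//		if n == 0 || dst == src {
//			return
//		}
//		d := unsafe.Slice((*byte)(dst), n)
//		s := unsafe.Slice((*byte)(src), n)
//		if uintptr(dst)-uintptr(src) < n {
//			// dst lies inside [src, src+n): copy backward so no source
//			// byte is overwritten before it has been read.
//			for i := n; i > 0; i-- {
//				d[i-1] = s[i-1]
//			}
//			return
//		}
//		// Safe to copy forward.
//		for i := uintptr(0); i < n; i++ {
//			d[i] = s[i]
//		}
//	}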