github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_ppc64x.s (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright 2023 The Prime Citizens
     3  //
     4  // Copyright 2014 The Go Authors. All rights reserved.
     5  // Use of this source code is governed by a BSD-style
     6  // license that can be found in the LICENSE file.
     7  
     8  //go:build pcz && (ppc64 || ppc64le)
     9  
    10  #include "textflag.h"
    11  
    12  // See memmove Go doc for important implementation constraints.
    13  
    14  // func Move(to, from unsafe.Pointer, n uintptr)
    15  
    16  // target address
    17  #define TGT R3
    18  // source address
    19  #define SRC R4
    20  // length to move
    21  #define LEN R5
    22  // number of doublewords
    23  #define DWORDS R6
    24  // number of bytes < 8
    25  #define BYTES R7
    26  // const 16 used as index
    27  #define IDX16 R8
    28  // temp used for copies, etc.
    29  #define TMP R9
    30  // number of 32 byte chunks (DWORDS >> 2, backward copy loop count)
    31  #define QWORDS R10
    32  // IDX32/IDX48: consts 32 and 48 used as indexes; OCTWORDS: number of 64 byte chunks (DWORDS >> 3)
    33  #define IDX32 R14
    34  #define IDX48 R15
    35  #define OCTWORDS R16
    36  
    37  	TEXT ·Move<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
    38  	// R3 = TGT = to
    39  	// R4 = SRC = from
    40  	// R5 = LEN = n
    41  
    42  	// Determine if there are doublewords to
    43  	// copy so a more efficient move can be done
    44  check:
    45  	ANDCC $7, LEN, BYTES // BYTES (R7) = LEN & 7: tail bytes to copy; CR0[EQ] set if none
    46  	SRD $3, LEN, DWORDS // DWORDS (R6) = LEN >> 3: double words to copy
    47  	MOVFL CR0, CR3 // save CR0 from ANDCC in CR3; tested at checkbytes
    48  	CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
    49  
    50  	// Determine overlap by subtracting dest - src and comparing against the
    51  	// length.  This catches the cases where src and dest are in different types
    52  	// of storage such as stack and static to avoid doing backward move when not
    53  	// necessary.
    54  
    55  	SUB SRC, TGT, TMP // dest - src
    56  	CMPU TMP, LEN, CR2 // unsigned (dest - src) < len?
    57  	BC 12, 8, backward // BLT CR2 backward
    58  
    59  	// Copying forward if no overlap.
    60  
    61  	BC 12, 6, checkbytes // BEQ CR1, checkbytes
    62  	SRDCC $3, DWORDS, OCTWORDS // OCTWORDS = DWORDS >> 3: 64 byte chunks; CR0[EQ] set if none
    63  	MOVD $16, IDX16
    64  	BEQ lt64gt8 // < 64 bytes
    65  
    66  	// Prepare for moves of 64 bytes at a time.
    67  
    68  forward64setup:
    69  	DCBTST (TGT)			// prepare data cache
    70  	DCBT (SRC)
    71  	MOVD OCTWORDS, CTR // Number of 64 byte chunks
    72  	MOVD $32, IDX32
    73  	MOVD $48, IDX48
    74  	PCALIGN $32
    75  
    76  forward64:
    77  	LXVD2X (R0)(SRC), VS32 // load 64 bytes
    78  	LXVD2X (IDX16)(SRC), VS33
    79  	LXVD2X (IDX32)(SRC), VS34
    80  	LXVD2X (IDX48)(SRC), VS35
    81  	ADD $64, SRC
    82  	STXVD2X VS32, (R0)(TGT)		// store 64 bytes
    83  	STXVD2X VS33, (IDX16)(TGT)
    84  	STXVD2X VS34, (IDX32)(TGT)
    85  	STXVD2X VS35, (IDX48)(TGT)
    86  	ADD $64,TGT // bump up for next set
    87  	BC 16, 0, forward64 // bdnz: decrement CTR, continue while CTR != 0
    88  	ANDCC $7, DWORDS // remaining doublewords
    89  	BEQ checkbytes // only bytes remain
    90  
    91  lt64gt8:
    92  	CMP DWORDS, $4
    93  	BLT lt32gt8
    94  	LXVD2X (R0)(SRC), VS32
    95  	LXVD2X (IDX16)(SRC), VS33
    96  	ADD $-4, DWORDS
    97  	STXVD2X VS32, (R0)(TGT)
    98  	STXVD2X VS33, (IDX16)(TGT)
    99  	ADD $32, SRC
   100  	ADD $32, TGT
   101  
   102  lt32gt8:
   103          // At this point >= 8 and < 32
   104  	// Move 16 bytes if possible
   105  	CMP DWORDS, $2
   106  	BLT lt16
   107  	LXVD2X (R0)(SRC), VS32
   108  	ADD $-2, DWORDS
   109  	STXVD2X VS32, (R0)(TGT)
   110  	ADD $16, SRC
   111  	ADD $16, TGT
   112  
   113  lt16:	// Move 8 bytes if possible
   114  	CMP DWORDS, $1
   115  	BLT checkbytes
   116  	MOVD 0(SRC), TMP
   117  	ADD $8, SRC
   118  	MOVD TMP, 0(TGT)
   119  	ADD $8, TGT
   120  checkbytes:
   121  	BC 12, 14, LR // beqlr cr3: return if BYTES == 0 (CR3 saved from ANDCC $7, LEN)
   122  lt8:	// Move word if possible
   123  	CMP BYTES, $4
   124  	BLT lt4
   125  	MOVWZ 0(SRC), TMP
   126  	ADD $-4, BYTES
   127  	MOVW TMP, 0(TGT)
   128  	ADD $4, SRC
   129  	ADD $4, TGT
   130  lt4:	// Move halfword if possible
   131  	CMP BYTES, $2
   132  	BLT lt2
   133  	MOVHZ 0(SRC), TMP
   134  	ADD $-2, BYTES
   135  	MOVH TMP, 0(TGT)
   136  	ADD $2, SRC
   137  	ADD $2, TGT
   138  lt2:	// Move last byte if 1 left
   139  	CMP BYTES, $1
   140  	BC 12, 0, LR // bltlr: return if BYTES < 1
   141  	MOVBZ 0(SRC), TMP
   142  	MOVBZ TMP, 0(TGT)
   143  	RET
   144  
   145  backward:
   146  	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
   147  	// R3 and R4 are advanced to the end of the destination/source buffers
   148  	// respectively and moved back as we copy.
   149  
   150  	ADD LEN, SRC, SRC // end of source
   151  	ADD TGT, LEN, TGT // end of dest
   152  
   153  	BEQ nobackwardtail // CR0[EQ] still from ANDCC $7, LEN at check: no tail bytes
   154  
   155  	MOVD BYTES, CTR // bytes to move
   156  
   157  backwardtailloop:
   158  	MOVBZ 	-1(SRC), TMP // load the last byte not yet copied
   159  	SUB $1,SRC
   160  	MOVBZ 	TMP, -1(TGT)
   161  	SUB $1,TGT
   162  	BDNZ backwardtailloop
   163  
   164  nobackwardtail:
   165  	BC 4, 5, LR // blelr cr1, return if DWORDS == 0
   166  	SRDCC $2,DWORDS,QWORDS // Compute number of 32B blocks and compare to 0
   167  	BNE backward32setup // If QWORDS != 0, start the 32B copy loop.
   168  
   169  backward24:
   170  	// DWORDS is a value between 1-3.
   171  	CMP DWORDS, $2
   172  
   173  	MOVD 	-8(SRC), TMP
   174  	MOVD 	TMP, -8(TGT)
   175  	BC 12, 0, LR // bltlr, return if DWORDS == 1
   176  
   177  	MOVD 	-16(SRC), TMP
   178  	MOVD 	TMP, -16(TGT)
   179  	BC 12, 2, LR // beqlr, return if DWORDS == 2
   180  
   181  	MOVD 	-24(SRC), TMP
   182  	MOVD 	TMP, -24(TGT)
   183  	RET
   184  
   185  backward32setup:
   186  	ANDCC   $3,DWORDS // Compute remaining DWORDS and compare to 0
   187  	MOVD QWORDS, CTR // set up loop ctr
   188  	MOVD $16, IDX16 // index of the second 16 byte vector; loop moves 32 bytes at a time
   189  
   190  backward32loop:
   191  	SUB $32, TGT
   192  	SUB $32, SRC
   193  	LXVD2X (R0)(SRC), VS32 // load 16x2 bytes
   194  	LXVD2X (IDX16)(SRC), VS33
   195  	STXVD2X VS32, (R0)(TGT)		// store 16x2 bytes
   196  	STXVD2X VS33, (IDX16)(TGT)
   197  	BDNZ backward32loop
   198  	BC 12, 2, LR // beqlr, return if no DWORDS remain (CR0 from ANDCC $3 in backward32setup)
   199  	BR backward24
   198  	BC 12, 2, LR // beqlr, return if DWORDS == 0
   199  	BR backward24