// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && (ppc64 || ppc64le)

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func Move(to, from unsafe.Pointer, n uintptr)
//
// Move copies n bytes from `from` to `to`.
//
// Direction: the copy runs backward (from the end of both buffers toward
// their start) when the unsigned difference to - from is smaller than n,
// and forward otherwise.
//
// Condition register fields that stay live across the routine:
//   CR0: result of n & 7 (consumed by the BEQ at the top of `backward`;
//        overwritten later by SRDCC/ANDCC/CMP on both paths)
//   CR1: DWORDS compared with 0
//   CR2: unsigned compare of (to - from) with n
//   CR3: copy of the initial CR0, i.e. EQ is set when there is no byte tail

// target address
#define TGT R3
// source address
#define SRC R4
// length to move
#define LEN R5
// number of doublewords
#define DWORDS R6
// number of bytes < 8
#define BYTES R7
// const 16 used as index
#define IDX16 R8
// temp used for copies, etc.
#define TMP R9
// number of 32 byte chunks (backward copy)
#define QWORDS R10
// index values
#define IDX32 R14
#define IDX48 R15
// number of 64 byte chunks (forward copy)
#define OCTWORDS R16

TEXT ·Move<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	// R3 = TGT = to
	// R4 = SRC = from
	// R5 = LEN = n

	// Determine if there are doublewords to
	// copy so a more efficient move can be done
check:
	ANDCC	$7, LEN, BYTES		// R7: bytes to copy
	SRD	$3, LEN, DWORDS		// R6: double words to copy
	MOVFL	CR0, CR3		// save CR from ANDCC
	CMP	DWORDS, $0, CR1		// CR1[EQ] set if no double words to copy

	// Determine overlap by subtracting dest - src and comparing against the
	// length. This catches the cases where src and dest are in different types
	// of storage such as stack and static to avoid doing backward move when not
	// necessary.

	SUB	SRC, TGT, TMP		// dest - src
	CMPU	TMP, LEN, CR2		// < len?
	BC	12, 8, backward		// BLT CR2 backward

	// Copying forward if no overlap.

	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
	SRDCC	$3, DWORDS, OCTWORDS	// 64 byte chunks?
	MOVD	$16, IDX16
	BEQ	lt64gt8			// < 64 bytes

	// Prepare for moves of 64 bytes at a time.

forward64setup:
	DCBTST	(TGT)			// prepare data cache
	DCBT	(SRC)
	MOVD	OCTWORDS, CTR		// Number of 64 byte chunks
	MOVD	$32, IDX32
	MOVD	$48, IDX48
	PCALIGN	$32

forward64:
	LXVD2X	(R0)(SRC), VS32		// load 64 bytes
	LXVD2X	(IDX16)(SRC), VS33
	LXVD2X	(IDX32)(SRC), VS34
	LXVD2X	(IDX48)(SRC), VS35
	ADD	$64, SRC
	STXVD2X	VS32, (R0)(TGT)		// store 64 bytes
	STXVD2X	VS33, (IDX16)(TGT)
	STXVD2X	VS34, (IDX32)(TGT)
	STXVD2X	VS35, (IDX48)(TGT)
	ADD	$64, TGT		// bump up for next set
	BDNZ	forward64		// continue while CTR != 0 (same as BC 16, 0)
	ANDCC	$7, DWORDS		// remaining doublewords
	BEQ	checkbytes		// only bytes remain

lt64gt8:
	// Fewer than 8 doublewords remain; move 32 bytes if possible.
	CMP	DWORDS, $4
	BLT	lt32gt8
	LXVD2X	(R0)(SRC), VS32
	LXVD2X	(IDX16)(SRC), VS33
	ADD	$-4, DWORDS
	STXVD2X	VS32, (R0)(TGT)
	STXVD2X	VS33, (IDX16)(TGT)
	ADD	$32, SRC
	ADD	$32, TGT

lt32gt8:
	// At this point >= 8 and < 32
	// Move 16 bytes if possible
	CMP	DWORDS, $2
	BLT	lt16
	LXVD2X	(R0)(SRC), VS32
	ADD	$-2, DWORDS
	STXVD2X	VS32, (R0)(TGT)
	ADD	$16, SRC
	ADD	$16, TGT

lt16:	// Move 8 bytes if possible
	CMP	DWORDS, $1
	BLT	checkbytes
	MOVD	0(SRC), TMP
	ADD	$8, SRC
	MOVD	TMP, 0(TGT)
	ADD	$8, TGT
checkbytes:
	// CR3 still holds the result of the initial ANDCC $7, LEN.
	BC	12, 14, LR		// BEQ lr (CR3[EQ]): no byte tail, done
lt8:	// Move word if possible
	CMP	BYTES, $4
	BLT	lt4
	MOVWZ	0(SRC), TMP
	ADD	$-4, BYTES
	MOVW	TMP, 0(TGT)
	ADD	$4, SRC
	ADD	$4, TGT
lt4:	// Move halfword if possible
	CMP	BYTES, $2
	BLT	lt2
	MOVHZ	0(SRC), TMP
	ADD	$-2, BYTES
	MOVH	TMP, 0(TGT)
	ADD	$2, SRC
	ADD	$2, TGT
lt2:	// Move last byte if 1 left
	CMP	BYTES, $1
	BC	12, 0, LR		// bltlr, return if no byte is left
	MOVBZ	0(SRC), TMP
	MOVBZ	TMP, 0(TGT)
	RET

backward:
	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	LEN, SRC, SRC		// end of source
	ADD	TGT, LEN, TGT		// end of dest

	// CR0 has not been written since the initial ANDCC $7, LEN:
	// EQ means there is no byte tail to move.
	BEQ	nobackwardtail

	MOVD	BYTES, CTR		// bytes to move

backwardtailloop:
	MOVBZ	-1(SRC), TMP		// point to last byte
	SUB	$1, SRC
	MOVBZ	TMP, -1(TGT)
	SUB	$1, TGT
	BDNZ	backwardtailloop

nobackwardtail:
	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
	SRDCC	$2, DWORDS, QWORDS	// Compute number of 32B blocks and compare to 0
	BNE	backward32setup		// If QWORDS != 0, start the 32B copy loop.

backward24:
	// DWORDS is a value between 1-3.
	// The compare result in CR0 is consumed by both conditional returns
	// below; the MOVD loads/stores in between do not modify it.
	CMP	DWORDS, $2

	MOVD	-8(SRC), TMP
	MOVD	TMP, -8(TGT)
	BC	12, 0, LR		// bltlr, return if DWORDS == 1

	MOVD	-16(SRC), TMP
	MOVD	TMP, -16(TGT)
	BC	12, 2, LR		// beqlr, return if DWORDS == 2

	MOVD	-24(SRC), TMP
	MOVD	TMP, -24(TGT)
	RET

backward32setup:
	ANDCC	$3, DWORDS		// Compute remaining DWORDS and compare to 0
	MOVD	QWORDS, CTR		// set up loop ctr
	MOVD	$16, IDX16		// 32 bytes at a time
	PCALIGN	$32			// align the loop like forward64

backward32loop:
	SUB	$32, TGT
	SUB	$32, SRC
	LXVD2X	(R0)(SRC), VS32		// load 16x2 bytes
	LXVD2X	(IDX16)(SRC), VS33
	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
	STXVD2X	VS33, (IDX16)(TGT)
	BDNZ	backward32loop
	// CR0 still holds the ANDCC result from backward32setup.
	BC	12, 2, LR		// beqlr, return if DWORDS == 0
	BR	backward24