github.com/primecitizens/pcz/std@v0.2.1/core/mem/move_riscv64.s

// SPDX-License-Identifier: Apache-2.0
// Copyright 2023 The Prime Citizens
//
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build pcz && riscv64

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// void memmove(void*, void*, uintptr)
TEXT ·Move<ABIInternal>(SB),NOSPLIT,$-0-24
	// X10 = to
	// X11 = from
	// X12 = n
	BEQ	X10, X11, done
	BEQZ	X12, done

	// If the destination is ahead of the source, start at the end of the
	// buffer and go backward.
	BGTU	X10, X11, backward

	// If less than 8 bytes, do single byte copies.
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND	$3, X10, X5
	AND	$3, X11, X6
	BNE	X5, X6, f_loop8_unaligned_check
	BEQZ	X5, f_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	SUB	X5, X12, X12
f_align:
	ADD	$-1, X5
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	BNEZ	X5, f_align

f_loop_check:
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
	MOV	$64, X9
	BLT	X12, X9, f_loop32_check
f_loop64:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$64, X10
	ADD	$64, X11
	ADD	$-64, X12
	BGE	X12, X9, f_loop64
	BEQZ	X12, done

f_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
f_loop32:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$32, X10
	ADD	$32, X11
	ADD	$-32, X12
	BGE	X12, X9, f_loop32
	BEQZ	X12, done

f_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
f_loop16:
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$16, X10
	ADD	$16, X11
	ADD	$-16, X12
	BGE	X12, X9, f_loop16
	BEQZ	X12, done

f_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8:
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	BGE	X12, X9, f_loop8
	BEQZ	X12, done
	JMP	f_loop4_check

f_loop8_unaligned_check:
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8_unaligned:
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	BGE	X12, X9, f_loop8_unaligned

f_loop4_check:
	MOV	$4, X9
	BLT	X12, X9, f_loop1
f_loop4:
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$4, X10
	ADD	$4, X11
	ADD	$-4, X12
	BGE	X12, X9, f_loop4

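	// Fewer than 4 bytes remain; copy them one at a time.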
f_loop1:
	BEQZ	X12, done
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	ADD	$-1, X12
	JMP	f_loop1

backward:
	ADD	X10, X12, X10
	ADD	X11, X12, X11

	// If less than 8 bytes, do single byte copies.
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check

	// Check alignment - if alignment differs we have to do one byte at a time.
	AND	$3, X10, X5
	AND	$3, X11, X6
	BNE	X5, X6, b_loop8_unaligned_check
	BEQZ	X5, b_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	SUB	X5, X12, X12
b_align:
	ADD	$-1, X5
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	BNEZ	X5, b_align

b_loop_check:
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
	MOV	$64, X9
	BLT	X12, X9, b_loop32_check
b_loop64:
	ADD	$-64, X10
	ADD	$-64, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$-64, X12
	BGE	X12, X9, b_loop64
	BEQZ	X12, done

b_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
b_loop32:
	ADD	$-32, X10
	ADD	$-32, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$-32, X12
	BGE	X12, X9, b_loop32
	BEQZ	X12, done

b_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
b_loop16:
	ADD	$-16, X10
	ADD	$-16, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$-16, X12
	BGE	X12, X9, b_loop16
	BEQZ	X12, done

b_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8:
	ADD	$-8, X10
	ADD	$-8, X11
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$-8, X12
	BGE	X12, X9, b_loop8
	BEQZ	X12, done
	JMP	b_loop4_check

b_loop8_unaligned_check:
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8_unaligned:
	ADD	$-8, X10
	ADD	$-8, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$-8, X12
	BGE	X12, X9, b_loop8_unaligned

b_loop4_check:
	MOV	$4, X9
	BLT	X12, X9, b_loop1
b_loop4:
	ADD	$-4, X10
	ADD	$-4, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$-4, X12
	BGE	X12, X9, b_loop4

b_loop1:
	BEQZ	X12, done
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$-1, X12
	JMP	b_loop1

done:
	RET
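
// A plausible Go-side declaration for this routine, matching the
// "void memmove(void*, void*, uintptr)" prototype noted above (the exact
// spelling of the declaration in the pcz core/mem package may differ):
//
//	//go:noescape
//	func Move(to, from unsafe.Pointer, n uintptr)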