// Copyright 2021 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"

// func crc32Int64BatchHash(data *uint64, hashes *uint64, length int)
// Requires: CRC32
//
// For each of the `length` 64-bit words at `data`, writes one 64-bit word to
// `hashes`: the input word with its low 32 bits replaced by the CRC32C
// accumulator obtained by feeding the word into a seed of all ones.
//
// Register roles:
//   R0      read cursor into data   (advances 8 bytes per element)
//   R1      write cursor into hashes (advances 8 bytes per element)
//   R2      elements still to process
//   R3-R10  CRC accumulators, R11-R17/R19 input words (R18 is skipped)
TEXT ·crc32Int64BatchHash(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD hashes+8(FP), R1
	MOVD length+16(FP), R2

block8:
	// Take the unrolled path only while at least 8 elements remain.
	SUBS $8, R2
	BLT  remainder

	// Copy 8 input words verbatim to the output first; only the low half of
	// each output word is overwritten below, so the high half stays as input.
	VLD1 (R0), [V0.B16, V1.B16, V2.B16, V3.B16]
	VST1 [V0.B16, V1.B16, V2.B16, V3.B16], (R1)

	// Fetch the same 8 words into general registers, advancing R0 by 64.
	LDP.P 16(R0), (R11, R12)
	LDP.P 16(R0), (R13, R14)
	LDP.P 16(R0), (R15, R16)
	LDP.P 16(R0), (R17, R19)

	// Seed every accumulator with all ones.
	MOVD $-1, R3
	MOVD $-1, R4
	MOVD $-1, R5
	MOVD $-1, R6
	MOVD $-1, R7
	MOVD $-1, R8
	MOVD $-1, R9
	MOVD $-1, R10

	// Eight independent CRC32C updates, one per input word.
	CRC32CX R11, R3
	CRC32CX R12, R4
	CRC32CX R13, R5
	CRC32CX R14, R6
	CRC32CX R15, R7
	CRC32CX R16, R8
	CRC32CX R17, R9
	CRC32CX R19, R10

	// 32-bit stores with an 8-byte post-increment: low half of each output.
	MOVW.P R3, 8(R1)
	MOVW.P R4, 8(R1)
	MOVW.P R5, 8(R1)
	MOVW.P R6, 8(R1)
	MOVW.P R7, 8(R1)
	MOVW.P R8, 8(R1)
	MOVW.P R9, 8(R1)
	MOVW.P R10, 8(R1)

	JMP block8

remainder:
	// Undo the speculative subtraction; R2 is now the leftover count.
	ADDS $8, R2
	BEQ  finished

single:
	// Same transformation as above, one element at a time.
	MOVD.P  8(R0), R4
	MOVD    R4, (R1)
	MOVD    $-1, R3
	CRC32CX R4, R3
	MOVW.P  R3, 8(R1)

	SUBS $1, R2
	BNE  single

finished:
	RET

////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////

// Pi<>: 128 bytes of read-only constants (presumably hexadecimal digits of
// pi, judging by the name and leading value). Not referenced by any routine
// in this file.
DATA Pi<>+0x00(SB)/8, $0x3243f6a8885a308d
DATA Pi<>+0x08(SB)/8, $0x313198a2e0370734
DATA Pi<>+0x10(SB)/8, $0x4a4093822299f31d
DATA Pi<>+0x18(SB)/8, $0x0082efa98ec4e6c8
DATA Pi<>+0x20(SB)/8, $0x9452821e638d0137
DATA Pi<>+0x28(SB)/8, $0x7be5466cf34e90c6
DATA Pi<>+0x30(SB)/8, $0xcc0ac29b7c97c50d
DATA Pi<>+0x38(SB)/8, $0xd3f84d5b5b547091
DATA Pi<>+0x40(SB)/8, $0x79216d5d98979fb1
DATA Pi<>+0x48(SB)/8, $0xbd1310ba698dfb5a
DATA Pi<>+0x50(SB)/8, $0xc2ffd72dbd01adfb
DATA Pi<>+0x58(SB)/8, $0x7b8e1afed6a267e9
DATA Pi<>+0x60(SB)/8, $0x6ba7c9045f12c7f9
DATA Pi<>+0x68(SB)/8, $0x924a19947b3916cf
DATA Pi<>+0x70(SB)/8, $0x70801f2e2858efc1
DATA Pi<>+0x78(SB)/8, $0x6636920d871574e6
GLOBL Pi<>(SB), (NOPTR+RODATA), $0x80

// CryptedPi<>: 128 bytes of read-only constants loaded into V0-V7 by every
// AES routine below as the initial hash state / round keys.
// NOTE(review): presumably derived from Pi<> by an encryption step - the
// derivation is not shown in this file.
DATA CryptedPi<>+0x00(SB)/8, $0x822233b93c11087c
DATA CryptedPi<>+0x08(SB)/8, $0xd2b32f4adde873da
DATA CryptedPi<>+0x10(SB)/8, $0xae9c2fc7dd17bcdb
DATA CryptedPi<>+0x18(SB)/8, $0x859110441a1569fc
DATA CryptedPi<>+0x20(SB)/8, $0x47087d794fffb5c9
DATA CryptedPi<>+0x28(SB)/8, $0xb7b6c8f565414445
DATA CryptedPi<>+0x30(SB)/8, $0xfd260edabb308f8d
DATA CryptedPi<>+0x38(SB)/8, $0x3ddefc67bc565a13
DATA CryptedPi<>+0x40(SB)/8, $0xe4c1d50223544f10
DATA CryptedPi<>+0x48(SB)/8, $0xaf40e05725c3192b
DATA CryptedPi<>+0x50(SB)/8, $0x281d8ab9a16382e9
DATA CryptedPi<>+0x58(SB)/8, $0xddc10c903b63a6cf
DATA CryptedPi<>+0x60(SB)/8, $0x852d3ad603e8df72
DATA CryptedPi<>+0x68(SB)/8, $0xa6642b57d1011deb
DATA CryptedPi<>+0x70(SB)/8, $0x5063d25a1cb7b6b9
DATA CryptedPi<>+0x78(SB)/8, $0xb2623e6241e8e46e
GLOBL CryptedPi<>(SB), (NOPTR+RODATA), $0x80

// func aesBytesBatchGenHashStates(data *[]byte, states *[3]uint64, length int)
// Requires: AES
//
// For each of the `length` byte slices at `data`, writes a 24-byte state to
// `states`: word 0 is a 64-bit value folded from one 128-bit lane and XORed
// with the slice length; words 1-2 are a second 128-bit lane.
// Returns immediately when length <= 0.
//
// Register roles:
//   R0  cursor over the slice headers (24 bytes each)
//   R1  cursor over the output states (24 bytes each)
//   R2  slices still to process
//   R4  read cursor inside the current slice
//   R5  end-relative limit (end-64 in innerLoop, end-16 from tail onwards)
//   R6  length of the current slice
//   V0-V7   initial state from CryptedPi<>, reloaded into V8-V15 per slice
//   V8-V11  lanes of the first hash, V12-V15 lanes of the second hash
//   V31     all-zero round key
TEXT ·aesBytesBatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	// The main loop is bottom-tested, so an empty batch must be rejected
	// here; otherwise it would process one out-of-bounds element and then
	// keep running with a wrapped counter.
	CMP $0, R2
	BLE exit

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

loop:
	// Load the first two words of the slice header (data pointer, length)
	// and step to the next header.
	LDP.P 24(R0), (R4, R5)
	MOVD  R5, R6

	// R5 = end of slice - 64.
	ADD R4, R5
	SUB $0x40, R5

	// Fresh copy of the initial state for this slice.
	VMOV V0.B16, V8.B16
	VMOV V1.B16, V9.B16
	VMOV V2.B16, V10.B16
	VMOV V3.B16, V11.B16
	VMOV V4.B16, V12.B16
	VMOV V5.B16, V13.B16
	VMOV V6.B16, V14.B16
	VMOV V7.B16, V15.B16

innerLoop:
	// Consume 64-byte blocks while more than 64 bytes remain.
	CMP R4, R5
	BLE tail

	VLD1.P 0x40(R4), [V16.B16, V17.B16, V18.B16, V19.B16]

	// Each lane: one AES round with a zero key, then XOR in 16 input bytes.
	// The same input block feeds lane k of both hashes.
	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V16.B16, V8.B16, V8.B16

	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V16.B16, V12.B16, V12.B16

	AESE  V31.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V17.B16, V9.B16, V9.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V17.B16, V13.B16, V13.B16

	AESE  V31.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V18.B16, V10.B16, V10.B16

	AESE  V31.B16, V14.B16
	AESMC V14.B16, V14.B16
	VEOR  V18.B16, V14.B16, V14.B16

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V19.B16, V11.B16, V11.B16

	AESE  V31.B16, V15.B16
	AESMC V15.B16, V15.B16
	VEOR  V19.B16, V15.B16, V15.B16

	JMP innerLoop

tail:
	// R5 = end of slice - 16. Up to three more 16-byte blocks are absorbed
	// into lanes 0..2 while more than 16 bytes remain.
	ADD $0x30, R5
	CMP R4, R5
	BLE done

	VLD1.P 0x10(R4), [V16.B16]

	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V16.B16, V8.B16, V8.B16

	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V16.B16, V12.B16, V12.B16

	CMP R4, R5
	BLE done

	VLD1.P 0x10(R4), [V17.B16]

	AESE  V31.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V17.B16, V9.B16, V9.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V17.B16, V13.B16, V13.B16

	CMP R4, R5
	BLE done

	VLD1 (R4), [V18.B16]

	AESE  V31.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V18.B16, V10.B16, V10.B16

	AESE  V31.B16, V14.B16
	AESMC V14.B16, V14.B16
	VEOR  V18.B16, V14.B16, V14.B16

done:
	// The last 16 bytes of the slice always go into lane 3; they may overlap
	// bytes already absorbed above.
	// NOTE(review): for slices shorter than 16 bytes this load starts before
	// the slice's data pointer - confirm callers guarantee that is readable.
	VLD1 (R5), [V19.B16]

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V19.B16, V11.B16, V11.B16

	AESE  V31.B16, V15.B16
	AESMC V15.B16, V15.B16
	VEOR  V19.B16, V15.B16, V15.B16

	// Merge lanes V8-V11 of the first hash into V11.
	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V8.B16

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16

	AESE  V10.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V8.B16, V11.B16, V9.B16

	AESE  V8.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V9.B16, V11.B16, V10.B16

	AESE  V9.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V10.B16, V11.B16, V8.B16

	AESE  V10.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V8.B16, V11.B16, V11.B16

	// Merge lanes V12-V15 of the second hash into V12.
	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V14.B16, V13.B16, V13.B16

	AESE  V15.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V13.B16, V12.B16, V12.B16

	// Fold V11 to 64 bits and mix in the slice length.
	VMOV V11.D[0], R7
	VMOV V11.D[1], R8
	EOR  R8, R7
	EOR  R6, R7

	// Emit the 24-byte state: folded word, then the 128-bit second hash.
	MOVD.P R7, 8(R1)
	VST1.P [V12.B16], 16(R1)

	SUBS $1, R2
	BNE  loop

exit:
	RET

// func aesInt192BatchGenHashStates(data *[3]uint64, states *[3]uint64, length int)
// Requires: AES
//
// For each of the `length` 24-byte keys at `data`, writes a 24-byte state to
// `states`: word 0 is a 128-bit lane folded to 64 bits, words 1-2 are a
// second 128-bit lane. Returns immediately when length <= 0.
//
// R0 = input cursor, R1 = output cursor, R2 = keys remaining.
// V0-V7 hold per-call constants derived from CryptedPi<>; V31 is zero.
TEXT ·aesInt192BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	// The loop below is bottom-tested; reject an empty batch up front.
	CMP $0, R2
	BLE done

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	// Loop-invariant preprocessing of the constants.
	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16
	VEOR  V2.B16, V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16
	VEOR  V7.B16, V6.B16, V6.B16

loop:
	// V8 = key bytes 0..15, V9 = key bytes 8..23 (the two loads overlap);
	// R0 advances by 24 in total.
	VLD1   (R0), [V8.B16]
	ADD    $0x08, R0
	VLD1.P 0x10(R0), [V9.B16]

	VEOR V0.B16, V8.B16, V10.B16
	VEOR V5.B16, V9.B16, V11.B16

	// First hash, accumulated in V9.
	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V3.B16, V9.B16, V10.B16

	AESE  V3.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V10.B16, V9.B16, V12.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V12.B16, V9.B16, V9.B16

	// Fold the first hash to 64 bits.
	VMOV V9.D[0], R4
	VMOV V9.D[1], R5
	EOR  R5, R4

	// Second hash, accumulated in V8.
	AESE  V4.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V11.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V6.B16, V8.B16, V8.B16

	// Emit the 24-byte state.
	MOVD.P R4, 0x08(R1)
	VST1.P [V8.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET

// func aesInt256BatchGenHashStates(data *[4]uint64, states *[3]uint64, length int)
// Requires: AES
//
// For each of the `length` 32-byte keys at `data`, writes a 24-byte state to
// `states`: word 0 is a 128-bit lane folded to 64 bits, words 1-2 are a
// second 128-bit lane. Returns immediately when length <= 0.
//
// R0 = input cursor, R1 = output cursor, R2 = keys remaining.
// V0-V7 hold per-call constants derived from CryptedPi<>; V31 is zero.
TEXT ·aesInt256BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	// The loop below is bottom-tested; reject an empty batch up front.
	CMP $0, R2
	BLE done

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	// Loop-invariant preprocessing of the constants.
	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16
	VEOR  V2.B16, V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16
	VEOR  V7.B16, V6.B16, V6.B16

loop:
	// V8 = key bytes 0..15, V9 = key bytes 16..31; R0 advances by 32.
	VLD1.P 0x20(R0), [V8.B16, V9.B16]

	VEOR V0.B16, V8.B16, V10.B16
	VEOR V5.B16, V9.B16, V11.B16

	// First hash, accumulated in V9.
	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V3.B16, V9.B16, V10.B16

	AESE  V3.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V10.B16, V9.B16, V12.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V12.B16, V9.B16, V9.B16

	// Fold the first hash to 64 bits.
	VMOV V9.D[0], R4
	VMOV V9.D[1], R5
	EOR  R5, R4

	// Second hash, accumulated in V8.
	AESE  V4.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V11.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V6.B16, V8.B16, V8.B16

	// Emit the 24-byte state.
	MOVD.P R4, 0x08(R1)
	VST1.P [V8.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET

// func aesInt320BatchGenHashStates(data *[5]uint64, states *[3]uint64, length int)
// Requires: AES
//
// For each of the `length` 40-byte keys at `data`, writes a 24-byte state to
// `states`: word 0 is a 128-bit lane folded to 64 bits, words 1-2 are a
// second 128-bit lane. Returns immediately when length <= 0.
//
// R0 = input cursor, R1 = output cursor, R2 = keys remaining.
// V0-V7 hold per-call constants derived from CryptedPi<>; V31 is zero.
TEXT ·aesInt320BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	// The loop below is bottom-tested; reject an empty batch up front.
	CMP $0, R2
	BLE done

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	// Loop-invariant preprocessing of the constants (V2 and V7 are used
	// as loaded).
	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16

loop:
	// V8 = key bytes 0..15, V9 = bytes 16..31, V10 = bytes 24..39 (V9 and
	// V10 overlap); R0 advances by 40 in total.
	VLD1   (R0), [V8.B16, V9.B16]
	ADD    $0x18, R0
	VLD1.P 0x10(R0), [V10.B16]

	VEOR V4.B16, V8.B16, V11.B16
	VEOR V5.B16, V9.B16, V12.B16

	VEOR V3.B16, V10.B16, V13.B16

	// First hash, accumulated in V8.
	AESE  V0.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V2.B16, V9.B16, V9.B16

	AESE  V13.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V13.B16

	AESE  V9.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V13.B16, V8.B16, V9.B16

	AESE  V13.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V8.B16

	// Fold the first hash to 64 bits.
	VMOV V8.D[0], R4
	VMOV V8.D[1], R5
	EOR  R5, R4

	// Second hash, accumulated in V11.
	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16

	AESE  V6.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V7.B16, V10.B16, V10.B16

	AESE  V12.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V10.B16, V11.B16, V11.B16

	// Emit the 24-byte state.
	MOVD.P R4, 0x08(R1)
	VST1.P [V11.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET