// Copyright 2021 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"

// func crc32Int64BatchHash(data *uint64, hashes *uint64, length int)
// Requires: CRC32
TEXT ·crc32Int64BatchHash(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD hashes+8(FP), R1
	MOVD length+16(FP), R2

loop:
	SUBS $8, R2
	BLT  tail

	MOVD $-1, R3
	MOVD $-1, R4
	MOVD $-1, R5
	MOVD $-1, R6
	MOVD $-1, R7
	MOVD $-1, R8
	MOVD $-1, R9
	MOVD $-1, R10

	LDP.P 16(R0), (R11, R12)
	LDP.P 16(R0), (R13, R14)
	LDP.P 16(R0), (R15, R16)
	LDP.P 16(R0), (R17, R19)

	CRC32CX R11, R3
	CRC32CX R12, R4
	CRC32CX R13, R5
	CRC32CX R14, R6
	CRC32CX R15, R7
	CRC32CX R16, R8
	CRC32CX R17, R9
	CRC32CX R19, R10

	STP.P (R3, R4), 16(R1)
	STP.P (R5, R6), 16(R1)
	STP.P (R7, R8), 16(R1)
	STP.P (R9, R10), 16(R1)

	JMP loop

tail:
	ADDS $8, R2
	BEQ  done

tailLoop:
	MOVD    $-1, R3
	MOVD.P  8(R0), R4
	CRC32CX R4, R3
	MOVD.P  R3, 8(R1)

	SUBS $1, R2
	BNE  tailLoop

done:
	RET

// func crc32Int64CellBatchHash(data *uint64, hashes *uint64, length int)
// Requires: CRC32
TEXT ·crc32Int64CellBatchHash(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD hashes+8(FP), R1
	MOVD length+16(FP), R2

loop:
	SUBS $8, R2
	BLT  tail

	MOVD $-1, R3
	MOVD $-1, R4
	MOVD $-1, R5
	MOVD $-1, R6
	MOVD $-1, R7
	MOVD $-1, R8
	MOVD $-1, R9
	MOVD $-1, R10

	MOVD.P 16(R0), R11
	MOVD.P 16(R0), R12
	MOVD.P 16(R0), R13
	MOVD.P 16(R0), R14
	MOVD.P 16(R0), R15
	MOVD.P 16(R0), R16
	MOVD.P 16(R0), R17
	MOVD.P 16(R0), R19

	CRC32CX R11, R3
	CRC32CX R12, R4
	CRC32CX R13, R5
	CRC32CX R14, R6
	CRC32CX R15, R7
	CRC32CX R16, R8
	CRC32CX R17, R9
	CRC32CX R19, R10

	STP.P (R3, R4), 16(R1)
	STP.P (R5, R6), 16(R1)
	STP.P (R7, R8), 16(R1)
	STP.P (R9, R10), 16(R1)

	JMP loop

tail:
	ADDS $8, R2
	BEQ  done

tailLoop:
	MOVD    $-1, R4
	MOVD.P  16(R0), R3
	CRC32CX R3, R4
	MOVD.P  R4, 8(R1)

	SUBS $1, R2
	BNE  tailLoop

done:
	RET

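// The two routines above differ only in input stride: crc32Int64BatchHash
// reads 64-bit keys back to back, while crc32Int64CellBatchHash reads the
// first 8 bytes of each 16-byte cell. Both seed every lane with an all-ones
// accumulator (MOVD $-1) and fold a single key into it with CRC32CX; the
// 8-wide main loop is simply an unrolled copy of the scalar tail loop.
//
// A rough pure-Go sketch of the per-key behaviour (illustrative only, not
// part of this package): crc32c64 is a hypothetical helper that mirrors the
// CRC32CX instruction by feeding the 8 little-endian bytes of v into a raw
// CRC32-C accumulator, which amounts to undoing the pre/post inversion that
// hash/crc32 applies. It uses encoding/binary and hash/crc32.
//
//	func crc32c64(seed uint64, v uint64) uint64 {
//		var b [8]byte
//		binary.LittleEndian.PutUint64(b[:], v)
//		tab := crc32.MakeTable(crc32.Castagnoli)
//		return uint64(^crc32.Update(^uint32(seed), tab, b[:]))
//	}
//
//	func crc32Int64BatchHashRef(keys, hashes []uint64) {
//		for i, v := range keys {
//			hashes[i] = crc32c64(^uint64(0), v) // seed = all ones, as in MOVD $-1, R3
//		}
//	}
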
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////

DATA Pi<>+0x00(SB)/8, $0x3243f6a8885a308d
DATA Pi<>+0x08(SB)/8, $0x313198a2e0370734
DATA Pi<>+0x10(SB)/8, $0x4a4093822299f31d
DATA Pi<>+0x18(SB)/8, $0x0082efa98ec4e6c8
DATA Pi<>+0x20(SB)/8, $0x9452821e638d0137
DATA Pi<>+0x28(SB)/8, $0x7be5466cf34e90c6
DATA Pi<>+0x30(SB)/8, $0xcc0ac29b7c97c50d
DATA Pi<>+0x38(SB)/8, $0xd3f84d5b5b547091
DATA Pi<>+0x40(SB)/8, $0x79216d5d98979fb1
DATA Pi<>+0x48(SB)/8, $0xbd1310ba698dfb5a
DATA Pi<>+0x50(SB)/8, $0xc2ffd72dbd01adfb
DATA Pi<>+0x58(SB)/8, $0x7b8e1afed6a267e9
DATA Pi<>+0x60(SB)/8, $0x6ba7c9045f12c7f9
DATA Pi<>+0x68(SB)/8, $0x924a19947b3916cf
DATA Pi<>+0x70(SB)/8, $0x70801f2e2858efc1
DATA Pi<>+0x78(SB)/8, $0x6636920d871574e6
GLOBL Pi<>(SB), (NOPTR+RODATA), $0x80

DATA CryptedPi<>+0x00(SB)/8, $0x822233b93c11087c
DATA CryptedPi<>+0x08(SB)/8, $0xd2b32f4adde873da
DATA CryptedPi<>+0x10(SB)/8, $0xae9c2fc7dd17bcdb
DATA CryptedPi<>+0x18(SB)/8, $0x859110441a1569fc
DATA CryptedPi<>+0x20(SB)/8, $0x47087d794fffb5c9
DATA CryptedPi<>+0x28(SB)/8, $0xb7b6c8f565414445
DATA CryptedPi<>+0x30(SB)/8, $0xfd260edabb308f8d
DATA CryptedPi<>+0x38(SB)/8, $0x3ddefc67bc565a13
DATA CryptedPi<>+0x40(SB)/8, $0xe4c1d50223544f10
DATA CryptedPi<>+0x48(SB)/8, $0xaf40e05725c3192b
DATA CryptedPi<>+0x50(SB)/8, $0x281d8ab9a16382e9
DATA CryptedPi<>+0x58(SB)/8, $0xddc10c903b63a6cf
DATA CryptedPi<>+0x60(SB)/8, $0x852d3ad603e8df72
DATA CryptedPi<>+0x68(SB)/8, $0xa6642b57d1011deb
DATA CryptedPi<>+0x70(SB)/8, $0x5063d25a1cb7b6b9
DATA CryptedPi<>+0x78(SB)/8, $0xb2623e6241e8e46e
GLOBL CryptedPi<>(SB), (NOPTR+RODATA), $0x80

// func aesBytesBatchGenHashStates(data *[]byte, states *[3]uint64, length int)
// Requires: AES
TEXT ·aesBytesBatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

loop:
	LDP.P 24(R0), (R4, R5)
	MOVD  R5, R6

	ADD R4, R5
	SUB $0x40, R5

	VMOV V0.B16, V8.B16
	VMOV V1.B16, V9.B16
	VMOV V2.B16, V10.B16
	VMOV V3.B16, V11.B16
	VMOV V4.B16, V12.B16
	VMOV V5.B16, V13.B16
	VMOV V6.B16, V14.B16
	VMOV V7.B16, V15.B16

innerLoop:
	CMP R4, R5
	BLE tail

	VLD1.P 0x40(R4), [V16.B16, V17.B16, V18.B16, V19.B16]

	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V16.B16, V8.B16, V8.B16

	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V16.B16, V12.B16, V12.B16

	AESE  V31.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V17.B16, V9.B16, V9.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V17.B16, V13.B16, V13.B16

	AESE  V31.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V18.B16, V10.B16, V10.B16

	AESE  V31.B16, V14.B16
	AESMC V14.B16, V14.B16
	VEOR  V18.B16, V14.B16, V14.B16

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V19.B16, V11.B16, V11.B16

	AESE  V31.B16, V15.B16
	AESMC V15.B16, V15.B16
	VEOR  V19.B16, V15.B16, V15.B16

	JMP innerLoop

tail:
	ADD $0x30, R5
	CMP R4, R5
	BLE done

	VLD1.P 0x10(R4), [V16.B16]

	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V16.B16, V8.B16, V8.B16

	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V16.B16, V12.B16, V12.B16

	CMP R4, R5
	BLE done

	VLD1.P 0x10(R4), [V17.B16]

	AESE  V31.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V17.B16, V9.B16, V9.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V17.B16, V13.B16, V13.B16

	CMP R4, R5
	BLE done

	VLD1 (R4), [V18.B16]

	AESE  V31.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V18.B16, V10.B16, V10.B16

	AESE  V31.B16, V14.B16
	AESMC V14.B16, V14.B16
	VEOR  V18.B16, V14.B16, V14.B16

done:
	VLD1 (R5), [V19.B16]

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V19.B16, V11.B16, V11.B16

	AESE  V31.B16, V15.B16
	AESMC V15.B16, V15.B16
	VEOR  V19.B16, V15.B16, V15.B16

	AESE  V31.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V8.B16

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16

	AESE  V10.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V8.B16, V11.B16, V9.B16

	AESE  V8.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V9.B16, V11.B16, V10.B16

	AESE  V9.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V10.B16, V11.B16, V8.B16

	AESE  V10.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V8.B16, V11.B16, V11.B16

	AESE  V31.B16, V12.B16
	AESMC V12.B16, V12.B16

	AESE  V31.B16, V13.B16
	AESMC V13.B16, V13.B16
	VEOR  V14.B16, V13.B16, V13.B16

	AESE  V15.B16, V12.B16
	AESMC V12.B16, V12.B16
	VEOR  V13.B16, V12.B16, V12.B16

	VMOV V11.D[0], R7
	VMOV V11.D[1], R8
	EOR  R8, R7
	EOR  R6, R7

	MOVD.P R7, 8(R1)
	VST1.P [V12.B16], 16(R1)

	SUBS $1, R2
	BNE  loop

	RET

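// Note on the AES-based routines in this file: each one loads the 128-byte
// CryptedPi<> block (presumably the Pi<> constants above passed through an
// AES-based scrambling step; Pi<> itself is not referenced by any code in
// this file) into V0-V7 and uses it as the initial hash state and round
// material. aesBytesBatchGenHashStates above consumes `length` slice headers
// from `data`: for each slice it mixes 64 bytes per inner-loop iteration with
// AESE/AESMC + VEOR, handles up to three trailing 16-byte blocks, and always
// folds in the final 16 bytes of the input (so callers appear to be expected
// to supply at least 16 readable bytes per slice). It then reduces the eight
// lanes to a 24-byte state: one 8-byte word (the two halves of a lane XORed
// together and XORed with the byte length saved in R6) followed by one
// 16-byte vector.
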
// func aesInt192BatchGenHashStates(data *[3]uint64, states *[3]uint64, length int)
// Requires: AES
TEXT ·aesInt192BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	VMOV V0.B16, V30.B16

	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16
	VEOR  V2.B16, V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16
	VEOR  V7.B16, V6.B16, V6.B16

loop:
	VLD1   (R0), [V8.B16]
	ADD    $0x08, R0
	VLD1.P 0x10(R0), [V9.B16]

	VEOR V0.B16, V8.B16, V10.B16
	VEOR V5.B16, V9.B16, V11.B16

	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V3.B16, V9.B16, V10.B16

	AESE  V3.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V10.B16, V9.B16, V12.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V12.B16, V9.B16, V9.B16

	VMOV V9.D[0], R4
	VMOV V9.D[1], R5
	EOR  R5, R4

	AESE  V4.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V11.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V6.B16, V8.B16, V8.B16

	MOVD.P R4, 0x08(R1)
	VST1.P [V8.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET

// func aesInt256BatchGenHashStates(data *[4]uint64, states *[3]uint64, length int)
// Requires: AES
TEXT ·aesInt256BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	VMOV V0.B16, V30.B16

	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16
	VEOR  V2.B16, V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16
	VEOR  V7.B16, V6.B16, V6.B16

loop:
	VLD1.P 0x20(R0), [V8.B16, V9.B16]

	VEOR V0.B16, V8.B16, V10.B16
	VEOR V5.B16, V9.B16, V11.B16

	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V3.B16, V9.B16, V10.B16

	AESE  V3.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V10.B16, V9.B16, V12.B16

	AESE  V10.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V12.B16, V9.B16, V9.B16

	VMOV V9.D[0], R4
	VMOV V9.D[1], R5
	EOR  R5, R4

	AESE  V4.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V11.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V6.B16, V8.B16, V8.B16

	MOVD.P R4, 0x08(R1)
	VST1.P [V8.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET

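// aesInt192BatchGenHashStates and aesInt256BatchGenHashStates above, and
// aesInt320BatchGenHashStates below, are fixed-width specialisations of the
// same scheme: each iteration consumes exactly 24, 32 or 40 bytes of input
// per element (the odd widths use overlapping 16-byte loads, e.g. VLD1 (R0)
// followed by ADD $0x08 or $0x18 and another VLD1.P), mixes the words with a
// handful of AESE/AESMC rounds against the pre-mixed CryptedPi state, and
// writes one 24-byte state per element.
//
// A hypothetical Go-side view of one output element, matching the
// MOVD.P-then-VST1.P store pattern (illustrative only; the real signature
// uses *[3]uint64):
//
//	type hashState struct {
//		scalar uint64    // 8 bytes written with MOVD.P (XOR of two vector halves)
//		vector [2]uint64 // 16 bytes written with VST1.P
//	}
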
// func aesInt320BatchGenHashStates(data *[5]uint64, states *[3]uint64, length int)
// Requires: AES
TEXT ·aesInt320BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVD data+0(FP), R0
	MOVD states+8(FP), R1
	MOVD length+16(FP), R2

	MOVD   $CryptedPi<>(SB), R3
	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
	VLD1   (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
	VEOR   V31.B16, V31.B16, V31.B16

	AESE  V31.B16, V0.B16
	AESMC V0.B16, V0.B16

	AESE  V31.B16, V1.B16
	AESMC V1.B16, V1.B16

	AESE  V31.B16, V3.B16
	AESMC V3.B16, V3.B16

	AESE  V31.B16, V4.B16
	AESMC V4.B16, V4.B16

	AESE  V31.B16, V5.B16
	AESMC V5.B16, V5.B16

	AESE  V31.B16, V6.B16
	AESMC V6.B16, V6.B16

loop:
	VLD1   (R0), [V8.B16, V9.B16]
	ADD    $0x18, R0
	VLD1.P 0x10(R0), [V10.B16]

	VEOR V4.B16, V8.B16, V11.B16
	VEOR V5.B16, V9.B16, V12.B16

	VEOR V3.B16, V10.B16, V13.B16

	AESE  V0.B16, V8.B16
	AESMC V8.B16, V8.B16

	AESE  V1.B16, V9.B16
	AESMC V9.B16, V9.B16
	VEOR  V2.B16, V9.B16, V9.B16

	AESE  V13.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V13.B16

	AESE  V9.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V13.B16, V8.B16, V9.B16

	AESE  V13.B16, V8.B16
	AESMC V8.B16, V8.B16
	VEOR  V9.B16, V8.B16, V8.B16

	VMOV V8.D[0], R4
	VMOV V8.D[1], R5
	EOR  R5, R4

	AESE  V31.B16, V11.B16
	AESMC V11.B16, V11.B16

	AESE  V6.B16, V10.B16
	AESMC V10.B16, V10.B16
	VEOR  V7.B16, V10.B16, V10.B16

	AESE  V12.B16, V11.B16
	AESMC V11.B16, V11.B16
	VEOR  V10.B16, V11.B16, V11.B16

	MOVD.P R4, 0x08(R1)
	VST1.P [V11.B16], 0x10(R1)

	SUBS $1, R2
	BNE  loop

done:
	RET
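
// Usage sketch (illustrative only; it assumes the Go-side declarations match
// the `// func ...` signatures above and that the CPU provides the required
// CRC32/AES features):
//
//	keys := []uint64{1, 2, 3, 4}
//	hashes := make([]uint64, len(keys))
//	crc32Int64BatchHash(&keys[0], &hashes[0], len(keys))
//
//	vals := make([][3]uint64, len(keys))   // 192-bit inputs
//	states := make([][3]uint64, len(vals)) // one 24-byte state per input
//	aesInt192BatchGenHashStates(&vals[0], &states[0], len(vals))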