// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "funcdata.h"
#include "textflag.h"

// func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
// hash function using AES hardware instructions
//
// Loads the data pointer, the size and the address of the return slot,
// then tail-jumps to aeshashbody. The seed h is not loaded here; the
// body reads it itself from h+8(FP).
TEXT ·aeshash(SB),NOSPLIT,$0-32
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX
	JMP	aeshashbody<>(SB)

// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
//
// p points at a two-word string header: data pointer at offset 0,
// length at offset 8. The length is read first because the second
// load overwrites AX.
TEXT ·aeshashstr(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to string struct
	MOVQ	8(AX), CX	// length of string
	MOVQ	(AX), AX	// string data
	LEAQ	ret+16(FP), DX
	JMP	aeshashbody<>(SB)

// aeshashbody is the shared tail of aeshash and aeshashstr. It is entered
// with JMP rather than CALL, so FP still addresses the caller's arguments;
// both callers keep the seed h at offset 8, which is what makes h+8(FP)
// valid here even though this function declares a $0-0 frame.
//
// AX: data
// CX: length
// DX: address to put return value
//
// The low 64 bits of the final state are stored to (DX).
TEXT aeshashbody<>(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	h+8(FP), X0	// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0	// 16 bits of length
	PSHUFHW	$0, X0, X0	// repeat length 4 times total
	MOVO	X0, X1	// save unscrambled seed
	PXOR	·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0	// scramble seed

	// Dispatch on length: 0, 1-15, 16, 17-32, 33-64, 65-128, 129+.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	// AX+16 has bits 4-11 all clear exactly when the low 12 bits of the
	// data address are 0xff0-0xfff, i.e. the data starts in the last
	// 16 bytes before a 4096-byte boundary.
	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX	// 2*len, so that (CX*8) steps over 16-byte table entries
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1	// clear the bytes past the end of the data
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	// AX is data+16 here, so -32(AX)(CX*1) is data+len-16.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX	// 2*len, so that (CX*8) steps over 16-byte table entries
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	// (first 16 and last 16 bytes; they overlap when len < 32)
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// first 32 and last 32 bytes; they overlap when len < 64
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	PXOR	·aeskeysched+64(SB), X4
	PXOR	·aeskeysched+80(SB), X5
	PXOR	·aeskeysched+96(SB), X6
	PXOR	·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data
	// (first 64 and last 64 bytes; they overlap when len < 128)
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	PXOR	·aeskeysched+64(SB), X4
	PXOR	·aeskeysched+80(SB), X5
	PXOR	·aeskeysched+96(SB), X6
	PXOR	·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	// CX = (len-1)/128, which is at least 1 because len >= 129 here,
	// so the DECQ/JNE loop below always runs at least once.
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	// (X0-X7 are reused as scratch; the seeds were already folded in above)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine the eight lanes into X8
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
//
// Hashes the 4 bytes at p. The seed fills the low 64 bits of X0 and the
// data is inserted as dword 2; three AESENC rounds keyed from aeskeysched
// follow, and the low 64 bits of the result are returned.
TEXT ·aeshash32(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRD	$2, (AX), X0	// data
	AESENC	·aeskeysched+0(SB), X0
	AESENC	·aeskeysched+16(SB), X0
	AESENC	·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
//
// Hashes the 8 bytes at p. The seed fills the low 64 bits of X0 and the
// data is inserted as qword 1; three AESENC rounds keyed from aeskeysched
// follow, and the low 64 bits of the result are returned.
TEXT ·aeshash64(SB),NOSPLIT,$0-24
	MOVQ	p+0(FP), AX	// ptr to data
	MOVQ	h+8(FP), X0	// seed
	PINSRQ	$1, (AX), X0	// data
	AESENC	·aeskeysched+0(SB), X0
	AESENC	·aeskeysched+16(SB), X0
	AESENC	·aeskeysched+32(SB), X0
	MOVQ	X0, ret+16(FP)
	RET

// simple mask to get rid of
// data in the high part of the register.
// The table holds 16 entries of 16 bytes each. Entry n keeps the low n
// bytes of a 16-byte value and clears the rest. aeshashbody indexes it
// with 2*len scaled by 8, i.e. at byte offset 16*len, for len 1-15;
// entry 0 is not reached because zero length branches to aes0 first.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
// Each 16-byte entry is one PSHUFB control vector: entry n selects source
// bytes 16-n through 15 into destination bytes 0 through n-1, and every
// remaining control byte is 0xff, which makes PSHUFB write zero there.
// aeshashbody only indexes entries 1-15 (zero length branches to aes0
// before the table is used).
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256