#include "textflag.h"

// hash function using AES hardware instructions
//
// Syntax: Go assembler (Plan 9 operand order: source first, destination last).
// Target: amd64.
//
// aeshash hashes s bytes starting at p, mixed with the 64-bit seed h, and
// stores the low 64 bits of the final XMM state in the result slot.
//
// Frame ($0-32: no locals, 32 bytes of arguments + result):
//	p+0(FP)    pointer to the data
//	h+8(FP)    64-bit seed
//	s+16(FP)   length of the data in bytes
//	ret+24(FP) 64-bit result
// NOTE(review): presumably declared on the Go side as
//	func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
// -- confirm against the package's Go stub.
//
// Register roles:
//	AX       data pointer (reused as table base on the 1..15-byte path)
//	CX       length (later a table index or the 128-byte block counter)
//	DX       address of the result slot
//	X0       scrambled seed
//	X1..X7   further seeds / scratch
//	X8..X15  hash state for inputs longer than 64 bytes
//
// NOTE(review): this routine executes AESENC and PSHUFB unconditionally; it
// assumes the caller has verified CPU support -- confirm at the call site.
TEXT ·aeshash(SB),NOSPLIT,$0-32
	// nolint
	MOVQ	p+0(FP), AX	// ptr to data
	// nolint
	MOVQ	s+16(FP), CX	// size
	LEAQ	ret+24(FP), DX

	// Disabled alternative entry point, kept for reference:
	//TEXT ·aeshashstr(SB),NOSPLIT,$0-24
	//	MOVQ	p+0(FP), AX	// ptr to string/slice struct
	//	MOVQ	8(AX), CX	// length of data
	//	MOVQ	(AX), AX	// data
	//	LEAQ	ret+16(FP), DX

	// Fill an SSE register with our seeds.
	// nolint
	MOVQ	h+8(FP), X0	// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0	// 16 bits of length
	PSHUFHW	$0, X0, X0	// repeat length 4 times total
	MOVO	X0, X1		// save unscrambled seed
	AESENC	X0, X0		// scramble seed

	// Dispatch on length (unsigned compares).
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	// AX = p+16. If bits 4..11 of p+16 are all zero, a 16-byte load
	// starting at p could run into the next page.
	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX			// CX*2 scaled by 8 below = 16 bytes per table entry
	MOVQ	$masks<>(SB), AX
	// PAND with a memory operand faults unless the operand is 16-byte
	// aligned. Load the mask with MOVOU (X2 is free on this path) so that
	// correctness does not depend on where the linker places masks<>.
	MOVOU	(AX)(CX*8), X2
	PAND	X2, X1			// keep only the low `length` bytes
final1:
	PXOR	X0, X1	// xor data with seed
	AESENC	X1, X1	// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	// AX still holds p+16 here, so -32(AX)(CX*1) = p + length - 16.
	MOVOU	-32(AX)(CX*1), X1
	ADDQ	CX, CX			// 16 bytes per table entry, as above
	MOVQ	$shifts<>(SB), AX
	// Same alignment concern as PAND above: fetch the shuffle control
	// with an unaligned load, then shuffle register-to-register.
	MOVOU	(AX)(CX*8), X2
	PSHUFB	X2, X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	AESENC	X1, X1

	// load data to be hashed: first 16 and last 16 bytes (may overlap)
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	// NOTE(review): X2 and X3 are copies of X1 and receive the same
	// AESENC, so X1, X2 and X3 end up equal. Left as is: changing it
	// would change every hash value produced for this length class.
	MOVO	X1, X2
	MOVO	X1, X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// load data: first 32 and last 32 bytes (may overlap)
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	// xor with seed
	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	// scramble 3 times
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// combine results
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, (DX)
	RET

aes65to128:
	// make 7 more starting seeds
	// NOTE(review): as in aes33to64, X1..X7 all end up equal.
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data: first 64 and last 64 bytes (may overlap)
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

aes129plus:
	// make 7 more starting seeds
	// NOTE(review): as in aes33to64, X1..X7 all end up equal.
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks
	// CX = (length-1)/128, which is >= 1 because length >= 129 here.
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	// (X0..X7 are no longer needed as seeds and are reused for data.)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	MOVQ	X8, (DX)
	RET

// simple mask to get rid of data in the high part of the register.
// 16 entries of 16 bytes each; entry n keeps the low n bytes.
DATA masks<>+0x00(SB)/8, $0x0000000000000000
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256