// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "funcdata.h"
#include "textflag.h"

// hash function using AES hardware instructions
//
// NOTE(review): these routines use AESENC, PSHUFB and PINSRD unconditionally.
// No CPU feature check is visible in this file; presumably the Go side
// verifies support before calling - confirm.
// NOTE(review): PXOR/AESENC take ·aeskeysched (defined elsewhere) as a memory
// operand, which for legacy SSE encodings must be 16-byte aligned, and at
// least 64 bytes of it are read (offsets 0, 16, 32, 48) - confirm.

// aeshash hashes s bytes starting at p.
// Frame layout ($0-16): p+0(FP) data pointer, h+4(FP) seed, s+8(FP) size,
// ret+12(FP) result. Only p, s and the address of ret are loaded here; the
// seed is read by aeshashbody, which is entered with JMP from this zero-size
// frame so that FP still addresses these arguments.
TEXT ·aeshash(SB),NOSPLIT,$0-16
	MOVL	p+0(FP), AX	// ptr to data
	MOVL	s+8(FP), BX	// size
	LEAL	ret+12(FP), DX
	JMP	aeshashbody<>(SB)

// aeshashstr hashes the string whose header p points to.
// Frame layout ($0-12): p+0(FP) pointer to string object, h+4(FP) seed,
// ret+8(FP) result. The data pointer is read from offset 0 of the object and
// the length from offset 4.
TEXT ·aeshashstr(SB),NOSPLIT,$0-12
	MOVL	p+0(FP), AX	// ptr to string object
	MOVL	4(AX), BX	// length of string
	MOVL	(AX), AX	// string data
	LEAL	ret+8(FP), DX
	JMP	aeshashbody<>(SB)

// AX: data
// BX: length
// DX: address to put return value
//
// The seed is taken from h+4(FP) of the routine that jumped here.
// Every path stores the low 32 bits of its final XMM state to (DX).
// AX, BX, X0-X7 and the flags are overwritten.
TEXT aeshashbody<>(SB),NOSPLIT,$0-0
	MOVL	h+4(FP), X0	// 32 bits of per-table hash seed
	PINSRW	$4, BX, X0	// 16 bits of length
	PSHUFHW	$0, X0, X0	// replace size with its low 2 bytes repeated 4 times
	MOVO	X0, X1	// save unscrambled seed
	PXOR	·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0	// scramble seed

	// dispatch on length (unsigned compares)
	CMPL	BX, $16
	JB	aes0to15
	JE	aes16
	CMPL	BX, $32
	JBE	aes17to32
	CMPL	BX, $64
	JBE	aes33to64
	JMP	aes65plus

aes0to15:
	TESTL	BX, BX
	JE	aes0

	// AX = data+16. If bits 4-11 of that are all zero, the data pointer's
	// low 12 bits are in 0xff0-0xfff, i.e. a 16-byte load starting at data
	// could run past the end of a 4 KiB page.
	ADDL	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDL	BX, BX	// BX = 2*length, so (BX*8) below indexes 16-byte entries
	PAND	masks<>(SB)(BX*8), X1	// keep only the low `length` bytes

final1:
	AESENC	X0, X1	// scramble input, xor in seed
	AESENC	X1, X1	// scramble combo 2 times
	AESENC	X1, X1
	MOVL	X1, (DX)
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	// (AX was already advanced by 16, so -32(AX)(BX*1) is data+length-16.)
	MOVOU	-32(AX)(BX*1), X1
	ADDL	BX, BX	// BX = 2*length, so (BX*8) below indexes 16-byte entries
	PSHUFB	shifts<>(SB)(BX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVL	X0, (DX)
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed
	// (first 16 bytes and last 16 bytes; they overlap when length < 32)
	MOVOU	(AX), X2
	MOVOU	-16(AX)(BX*1), X3

	// scramble 3 times
	AESENC	X0, X2
	AESENC	X1, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVL	X2, (DX)
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// load first 32 and last 32 bytes (they overlap when length < 64)
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(BX*1), X6
	MOVOU	-16(AX)(BX*1), X7

	// scramble each lane with its own seed
	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// combine results
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVL	X4, (DX)
	RET

aes65plus:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// start with last (possibly overlapping) block
	MOVOU	-64(AX)(BX*1), X4
	MOVOU	-48(AX)(BX*1), X5
	MOVOU	-32(AX)(BX*1), X6
	MOVOU	-16(AX)(BX*1), X7

	// scramble state once
	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	// compute number of remaining 64-byte blocks
	// BX = (length-1)/64, which is >= 1 on this path (length > 64)
	DECL	BX
	SHRL	$6, BX

aesloop:
	// scramble state, xor in a block
	// (X0-X3 are reused for data here; the seeds are no longer needed)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X4
	AESENC	X1, X5
	AESENC	X2, X6
	AESENC	X3, X7

	// scramble state
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	ADDL	$64, AX
	DECL	BX	// JNE below tests the flags set by this DECL
	JNE	aesloop

	// 2 more scrambles to finish
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// combine results
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVL	X4, (DX)
	RET

// aeshash32 hashes the 4 bytes at p.
// Frame layout ($0-12): p+0(FP) data pointer, h+4(FP) seed, ret+8(FP) result.
// The seed goes in dword lane 0 and the data in lane 1 of X0, which is then
// run through three AESENC rounds keyed from ·aeskeysched; the low 32 bits
// are returned.
TEXT ·aeshash32(SB),NOSPLIT,$0-12
	MOVL	p+0(FP), AX	// ptr to data
	MOVL	h+4(FP), X0	// seed
	PINSRD	$1, (AX), X0	// data
	AESENC	·aeskeysched+0(SB), X0
	AESENC	·aeskeysched+16(SB), X0
	AESENC	·aeskeysched+32(SB), X0
	MOVL	X0, ret+8(FP)
	RET

// aeshash64 hashes the 8 bytes at p.
// Frame layout ($0-12): p+0(FP) data pointer, h+4(FP) seed, ret+8(FP) result.
// The data fills the low 64 bits of X0 and the seed goes in dword lane 2;
// X0 is then run through three AESENC rounds keyed from ·aeskeysched and the
// low 32 bits are returned.
TEXT ·aeshash64(SB),NOSPLIT,$0-12
	MOVL	p+0(FP), AX	// ptr to data
	MOVQ	(AX), X0	// data
	PINSRD	$2, h+4(FP), X0	// seed
	AESENC	·aeskeysched+0(SB), X0
	AESENC	·aeskeysched+16(SB), X0
	AESENC	·aeskeysched+32(SB), X0
	MOVL	X0, ret+8(FP)
	RET

// simple mask to get rid of data in the high part of the register.
// masks<> holds 16 entries of 16 bytes each. Entry n (at byte offset n*16,
// n = 0..15) has its low n bytes set to 0xff and the remaining bytes zero.
// The aes0to15 path indexes it with (2*length)*8 and uses it as a PAND mask.
// NOTE(review): PAND and PSHUFB with a memory operand require 16-byte
// alignment; nothing in this file aligns or checks masks<> / shifts<> -
// confirm the linker placement or add a startup check.

// n = 0
DATA masks<>+0x00(SB)/4, $0x00000000
DATA masks<>+0x04(SB)/4, $0x00000000
DATA masks<>+0x08(SB)/4, $0x00000000
DATA masks<>+0x0c(SB)/4, $0x00000000

// n = 1
DATA masks<>+0x10(SB)/4, $0x000000ff
DATA masks<>+0x14(SB)/4, $0x00000000
DATA masks<>+0x18(SB)/4, $0x00000000
DATA masks<>+0x1c(SB)/4, $0x00000000

// n = 2
DATA masks<>+0x20(SB)/4, $0x0000ffff
DATA masks<>+0x24(SB)/4, $0x00000000
DATA masks<>+0x28(SB)/4, $0x00000000
DATA masks<>+0x2c(SB)/4, $0x00000000

// n = 3
DATA masks<>+0x30(SB)/4, $0x00ffffff
DATA masks<>+0x34(SB)/4, $0x00000000
DATA masks<>+0x38(SB)/4, $0x00000000
DATA masks<>+0x3c(SB)/4, $0x00000000

// n = 4
DATA masks<>+0x40(SB)/4, $0xffffffff
DATA masks<>+0x44(SB)/4, $0x00000000
DATA masks<>+0x48(SB)/4, $0x00000000
DATA masks<>+0x4c(SB)/4, $0x00000000

// n = 5
DATA masks<>+0x50(SB)/4, $0xffffffff
DATA masks<>+0x54(SB)/4, $0x000000ff
DATA masks<>+0x58(SB)/4, $0x00000000
DATA masks<>+0x5c(SB)/4, $0x00000000

// n = 6
DATA masks<>+0x60(SB)/4, $0xffffffff
DATA masks<>+0x64(SB)/4, $0x0000ffff
DATA masks<>+0x68(SB)/4, $0x00000000
DATA masks<>+0x6c(SB)/4, $0x00000000

// n = 7
DATA masks<>+0x70(SB)/4, $0xffffffff
DATA masks<>+0x74(SB)/4, $0x00ffffff
DATA masks<>+0x78(SB)/4, $0x00000000
DATA masks<>+0x7c(SB)/4, $0x00000000

// n = 8
DATA masks<>+0x80(SB)/4, $0xffffffff
DATA masks<>+0x84(SB)/4, $0xffffffff
DATA masks<>+0x88(SB)/4, $0x00000000
DATA masks<>+0x8c(SB)/4, $0x00000000

// n = 9
DATA masks<>+0x90(SB)/4, $0xffffffff
DATA masks<>+0x94(SB)/4, $0xffffffff
DATA masks<>+0x98(SB)/4, $0x000000ff
DATA masks<>+0x9c(SB)/4, $0x00000000

// n = 10
DATA masks<>+0xa0(SB)/4, $0xffffffff
DATA masks<>+0xa4(SB)/4, $0xffffffff
DATA masks<>+0xa8(SB)/4, $0x0000ffff
DATA masks<>+0xac(SB)/4, $0x00000000

// n = 11
DATA masks<>+0xb0(SB)/4, $0xffffffff
DATA masks<>+0xb4(SB)/4, $0xffffffff
DATA masks<>+0xb8(SB)/4, $0x00ffffff
DATA masks<>+0xbc(SB)/4, $0x00000000

// n = 12
DATA masks<>+0xc0(SB)/4, $0xffffffff
DATA masks<>+0xc4(SB)/4, $0xffffffff
DATA masks<>+0xc8(SB)/4, $0xffffffff
DATA masks<>+0xcc(SB)/4, $0x00000000

// n = 13
DATA masks<>+0xd0(SB)/4, $0xffffffff
DATA masks<>+0xd4(SB)/4, $0xffffffff
DATA masks<>+0xd8(SB)/4, $0xffffffff
DATA masks<>+0xdc(SB)/4, $0x000000ff

// n = 14
DATA masks<>+0xe0(SB)/4, $0xffffffff
DATA masks<>+0xe4(SB)/4, $0xffffffff
DATA masks<>+0xe8(SB)/4, $0xffffffff
DATA masks<>+0xec(SB)/4, $0x0000ffff

// n = 15
DATA masks<>+0xf0(SB)/4, $0xffffffff
DATA masks<>+0xf4(SB)/4, $0xffffffff
DATA masks<>+0xf8(SB)/4, $0xffffffff
DATA masks<>+0xfc(SB)/4, $0x00ffffff

GLOBL masks<>(SB),RODATA,$256

// these are arguments to pshufb. They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
//
// Entry n (at byte offset n*16) selects source bytes 16-n..15 into result
// bytes 0..n-1; the remaining control bytes are 0xff, which PSHUFB turns
// into zero bytes. Entry 0 is all zeros; the endofpage path that uses this
// table is only reached after the zero-length case has branched to aes0.

// n = 0
DATA shifts<>+0x00(SB)/4, $0x00000000
DATA shifts<>+0x04(SB)/4, $0x00000000
DATA shifts<>+0x08(SB)/4, $0x00000000
DATA shifts<>+0x0c(SB)/4, $0x00000000

// n = 1
DATA shifts<>+0x10(SB)/4, $0xffffff0f
DATA shifts<>+0x14(SB)/4, $0xffffffff
DATA shifts<>+0x18(SB)/4, $0xffffffff
DATA shifts<>+0x1c(SB)/4, $0xffffffff

// n = 2
DATA shifts<>+0x20(SB)/4, $0xffff0f0e
DATA shifts<>+0x24(SB)/4, $0xffffffff
DATA shifts<>+0x28(SB)/4, $0xffffffff
DATA shifts<>+0x2c(SB)/4, $0xffffffff

// n = 3
DATA shifts<>+0x30(SB)/4, $0xff0f0e0d
DATA shifts<>+0x34(SB)/4, $0xffffffff
DATA shifts<>+0x38(SB)/4, $0xffffffff
DATA shifts<>+0x3c(SB)/4, $0xffffffff

// n = 4
DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c
DATA shifts<>+0x44(SB)/4, $0xffffffff
DATA shifts<>+0x48(SB)/4, $0xffffffff
DATA shifts<>+0x4c(SB)/4, $0xffffffff

// n = 5
DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b
DATA shifts<>+0x54(SB)/4, $0xffffff0f
DATA shifts<>+0x58(SB)/4, $0xffffffff
DATA shifts<>+0x5c(SB)/4, $0xffffffff

// n = 6
DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a
DATA shifts<>+0x64(SB)/4, $0xffff0f0e
DATA shifts<>+0x68(SB)/4, $0xffffffff
DATA shifts<>+0x6c(SB)/4, $0xffffffff

// n = 7
DATA shifts<>+0x70(SB)/4, $0x0c0b0a09
DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
DATA shifts<>+0x78(SB)/4, $0xffffffff
DATA shifts<>+0x7c(SB)/4, $0xffffffff

// n = 8
DATA shifts<>+0x80(SB)/4, $0x0b0a0908
DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
DATA shifts<>+0x88(SB)/4, $0xffffffff
DATA shifts<>+0x8c(SB)/4, $0xffffffff

// n = 9
DATA shifts<>+0x90(SB)/4, $0x0a090807
DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
DATA shifts<>+0x98(SB)/4, $0xffffff0f
DATA shifts<>+0x9c(SB)/4, $0xffffffff

// n = 10
DATA shifts<>+0xa0(SB)/4, $0x09080706
DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
DATA shifts<>+0xac(SB)/4, $0xffffffff

// n = 11
DATA shifts<>+0xb0(SB)/4, $0x08070605
DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
DATA shifts<>+0xbc(SB)/4, $0xffffffff

// n = 12
DATA shifts<>+0xc0(SB)/4, $0x07060504
DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
DATA shifts<>+0xcc(SB)/4, $0xffffffff

// n = 13
DATA shifts<>+0xd0(SB)/4, $0x06050403
DATA shifts<>+0xd4(SB)/4, $0x0a090807
DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
DATA shifts<>+0xdc(SB)/4, $0xffffff0f

// n = 14
DATA shifts<>+0xe0(SB)/4, $0x05040302
DATA shifts<>+0xe4(SB)/4, $0x09080706
DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
DATA shifts<>+0xec(SB)/4, $0xffff0f0e

// n = 15
DATA shifts<>+0xf0(SB)/4, $0x04030201
DATA shifts<>+0xf4(SB)/4, $0x08070605
DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d

GLOBL shifts<>(SB),RODATA,$256