github.com/arr-ai/hash@v0.8.0/asm_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "funcdata.h"
     7  #include "textflag.h"
     8  
     9  // func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
    10  // hash function using AES hardware instructions: hashes the s bytes at p, seeded with h. Loads the register arguments and tail-jumps to aeshashbody.
    11  TEXT ·aeshash(SB),NOSPLIT,$0-32
    12  	MOVQ	p+0(FP), AX	// ptr to data
    13  	MOVQ	s+16(FP), CX	// size
    14  	LEAQ	ret+24(FP), DX	// DX = address of the result slot; aeshashbody stores the hash through it
    15  	JMP	aeshashbody<>(SB)	// JMP, not CALL: aeshashbody reads the seed h from 8(FP) and executes the RET
    16  
    17  // func aeshashstr(p unsafe.Pointer, h uintptr) uintptr -- hashes the bytes of the string whose header p points to (data pointer at offset 0, length at offset 8), seeded with h.
    18  TEXT ·aeshashstr(SB),NOSPLIT,$0-24
    19  	MOVQ	p+0(FP), AX	// ptr to string struct
    20  	MOVQ	8(AX), CX	// length of string
    21  	MOVQ	(AX), AX	// string data
    22  	LEAQ	ret+16(FP), DX	// DX = address of the result slot; aeshashbody stores the hash through it
    23  	JMP	aeshashbody<>(SB)	// JMP, not CALL: aeshashbody reads the seed h from 8(FP) and executes the RET
    24  
    25  // Shared body of aeshash and aeshashstr, which both JMP here. AX: data
    26  // CX: length in bytes
    27  // DX: address to put return value. The seed h is read from 8(FP); each path stores the low 64 bits of its final state to (DX).
    28  TEXT aeshashbody<>(SB),NOSPLIT,$0-0
    29  	// Fill an SSE register with our seeds.
    30  	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed (upper 64 bits of X0 cleared)
    31  	PINSRW	$4, CX, X0			// 16 bits of length, placed in word 4 (bits 64-79)
    32  	PSHUFHW $0, X0, X0			// repeat length 4 times total (word 4 copied to words 4-7)
    33  	MOVO	X0, X1				// save unscrambled seed; the multi-lane paths derive their extra seeds from it
    34  	PXOR	·aeskeysched(SB), X0	// xor in per-process seed
    35  	AESENC	X0, X0				// scramble seed
    36  
    37  	CMPQ	CX, $16	// dispatch on length; all branches below are unsigned
    38  	JB	aes0to15
    39  	JE	aes16
    40  	CMPQ	CX, $32
    41  	JBE	aes17to32
    42  	CMPQ	CX, $64
    43  	JBE	aes33to64
    44  	CMPQ	CX, $128
    45  	JBE	aes65to128
    46  	JMP	aes129plus
    47  
    48  aes0to15:
    49  	TESTQ	CX, CX
    50  	JE	aes0	// empty input: no data to load
    51  
    52  	ADDQ	$16, AX	// AX = data+16
    53  	TESTW	$0xff0, AX	// bits 4-11 of data+16 all zero <=> data lies in the last 16 bytes of a 4096-byte-aligned region
    54  	JE	endofpage
    55  
    56  	// 16 bytes loaded at this address won't cross
    57  	// a page boundary, so we can load it directly.
    58  	MOVOU	-16(AX), X1	// load 16 bytes at the original data pointer (AX was advanced by 16 above)
    59  	ADDQ	CX, CX	// CX = 2*len, so (CX*8) below is a byte offset of 16*len
    60  	MOVQ	$masks<>(SB), AX
    61  	PAND	(AX)(CX*8), X1	// keep the low len bytes, zero the bytes past the end of the data
    62  final1:	// common finish for lengths 1-16: X0 = scrambled seed, X1 = data (zero-padded to 16 bytes)
    63  	PXOR	X0, X1	// xor data with seed
    64  	AESENC	X1, X1	// scramble combo 3 times
    65  	AESENC	X1, X1
    66  	AESENC	X1, X1
    67  	MOVQ	X1, (DX)	// result = low 64 bits of the state
    68  	RET
    69  
    70  endofpage:
    71  	// address ends in 1111 1111 xxxx (binary). Might be up against
    72  	// a page boundary, so load ending at last byte.
    73  	// Then shift bytes down using pshufb.
    74  	MOVOU	-32(AX)(CX*1), X1	// AX is data+16 here, so this loads the 16 bytes ending at data+len
    75  	ADDQ	CX, CX	// CX = 2*len, so (CX*8) below is a byte offset of 16*len
    76  	MOVQ	$shifts<>(SB), AX
    77  	PSHUFB	(AX)(CX*8), X1	// move the top len bytes down to the bottom and zero the rest
    78  	JMP	final1
    79  
    80  aes0:
    81  	// Return scrambled input seed (one more AESENC round on X0)
    82  	AESENC	X0, X0
    83  	MOVQ	X0, (DX)
    84  	RET
    85  
    86  aes16:
    87  	MOVOU	(AX), X1	// exactly one 16-byte block; no masking needed
    88  	JMP	final1
    89  
    90  aes17to32:
    91  	// make second starting seed
    92  	PXOR	·aeskeysched+16(SB), X1
    93  	AESENC	X1, X1
    94  
    95  	// load data to be hashed
    96  	MOVOU	(AX), X2	// first 16 bytes
    97  	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes (overlaps the first block when len < 32)
    98  
    99  	// xor with seed
   100  	PXOR	X0, X2
   101  	PXOR	X1, X3
   102  
   103  	// scramble 3 times
   104  	AESENC	X2, X2
   105  	AESENC	X3, X3
   106  	AESENC	X2, X2
   107  	AESENC	X3, X3
   108  	AESENC	X2, X2
   109  	AESENC	X3, X3
   110  
   111  	// combine results
   112  	PXOR	X3, X2
   113  	MOVQ	X2, (DX)	// result = low 64 bits of the combined state
   114  	RET
   115  
   116  aes33to64:
   117  	// make 3 more starting seeds
   118  	MOVO	X1, X2
   119  	MOVO	X1, X3
   120  	PXOR	·aeskeysched+16(SB), X1
   121  	PXOR	·aeskeysched+32(SB), X2
   122  	PXOR	·aeskeysched+48(SB), X3
   123  	AESENC	X1, X1
   124  	AESENC	X2, X2
   125  	AESENC	X3, X3
   126  
   127  	MOVOU	(AX), X4	// first 32 bytes -> X4, X5
   128  	MOVOU	16(AX), X5
   129  	MOVOU	-32(AX)(CX*1), X6	// last 32 bytes -> X6, X7 (overlap the first 32 when len < 64)
   130  	MOVOU	-16(AX)(CX*1), X7
   131  
   132  	PXOR	X0, X4	// xor each data lane with its own seed
   133  	PXOR	X1, X5
   134  	PXOR	X2, X6
   135  	PXOR	X3, X7
   136  
   137  	AESENC	X4, X4	// scramble all four lanes 3 times
   138  	AESENC	X5, X5
   139  	AESENC	X6, X6
   140  	AESENC	X7, X7
   141  
   142  	AESENC	X4, X4
   143  	AESENC	X5, X5
   144  	AESENC	X6, X6
   145  	AESENC	X7, X7
   146  
   147  	AESENC	X4, X4
   148  	AESENC	X5, X5
   149  	AESENC	X6, X6
   150  	AESENC	X7, X7
   151  
   152  	PXOR	X6, X4	// fold the four lanes into X4
   153  	PXOR	X7, X5
   154  	PXOR	X5, X4
   155  	MOVQ	X4, (DX)	// result = low 64 bits of the combined state
   156  	RET
   157  
   158  aes65to128:
   159  	// make 7 more starting seeds
   160  	MOVO	X1, X2
   161  	MOVO	X1, X3
   162  	MOVO	X1, X4
   163  	MOVO	X1, X5
   164  	MOVO	X1, X6
   165  	MOVO	X1, X7
   166  	PXOR	·aeskeysched+16(SB), X1
   167  	PXOR	·aeskeysched+32(SB), X2
   168  	PXOR	·aeskeysched+48(SB), X3
   169  	PXOR	·aeskeysched+64(SB), X4
   170  	PXOR	·aeskeysched+80(SB), X5
   171  	PXOR	·aeskeysched+96(SB), X6
   172  	PXOR	·aeskeysched+112(SB), X7
   173  	AESENC	X1, X1
   174  	AESENC	X2, X2
   175  	AESENC	X3, X3
   176  	AESENC	X4, X4
   177  	AESENC	X5, X5
   178  	AESENC	X6, X6
   179  	AESENC	X7, X7
   180  
   181  	// load data
   182  	MOVOU	(AX), X8	// first 64 bytes -> X8-X11
   183  	MOVOU	16(AX), X9
   184  	MOVOU	32(AX), X10
   185  	MOVOU	48(AX), X11
   186  	MOVOU	-64(AX)(CX*1), X12	// last 64 bytes -> X12-X15 (overlap the first 64 when len < 128)
   187  	MOVOU	-48(AX)(CX*1), X13
   188  	MOVOU	-32(AX)(CX*1), X14
   189  	MOVOU	-16(AX)(CX*1), X15
   190  
   191  	// xor with seed
   192  	PXOR	X0, X8
   193  	PXOR	X1, X9
   194  	PXOR	X2, X10
   195  	PXOR	X3, X11
   196  	PXOR	X4, X12
   197  	PXOR	X5, X13
   198  	PXOR	X6, X14
   199  	PXOR	X7, X15
   200  
   201  	// scramble 3 times
   202  	AESENC	X8, X8
   203  	AESENC	X9, X9
   204  	AESENC	X10, X10
   205  	AESENC	X11, X11
   206  	AESENC	X12, X12
   207  	AESENC	X13, X13
   208  	AESENC	X14, X14
   209  	AESENC	X15, X15
   210  
   211  	AESENC	X8, X8
   212  	AESENC	X9, X9
   213  	AESENC	X10, X10
   214  	AESENC	X11, X11
   215  	AESENC	X12, X12
   216  	AESENC	X13, X13
   217  	AESENC	X14, X14
   218  	AESENC	X15, X15
   219  
   220  	AESENC	X8, X8
   221  	AESENC	X9, X9
   222  	AESENC	X10, X10
   223  	AESENC	X11, X11
   224  	AESENC	X12, X12
   225  	AESENC	X13, X13
   226  	AESENC	X14, X14
   227  	AESENC	X15, X15
   228  
   229  	// combine results
   230  	PXOR	X12, X8
   231  	PXOR	X13, X9
   232  	PXOR	X14, X10
   233  	PXOR	X15, X11
   234  	PXOR	X10, X8
   235  	PXOR	X11, X9
   236  	PXOR	X9, X8
   237  	MOVQ	X8, (DX)	// result = low 64 bits of the combined state
   238  	RET
   239  
   240  aes129plus:
   241  	// make 7 more starting seeds
   242  	MOVO	X1, X2
   243  	MOVO	X1, X3
   244  	MOVO	X1, X4
   245  	MOVO	X1, X5
   246  	MOVO	X1, X6
   247  	MOVO	X1, X7
   248  	PXOR	·aeskeysched+16(SB), X1
   249  	PXOR	·aeskeysched+32(SB), X2
   250  	PXOR	·aeskeysched+48(SB), X3
   251  	PXOR	·aeskeysched+64(SB), X4
   252  	PXOR	·aeskeysched+80(SB), X5
   253  	PXOR	·aeskeysched+96(SB), X6
   254  	PXOR	·aeskeysched+112(SB), X7
   255  	AESENC	X1, X1
   256  	AESENC	X2, X2
   257  	AESENC	X3, X3
   258  	AESENC	X4, X4
   259  	AESENC	X5, X5
   260  	AESENC	X6, X6
   261  	AESENC	X7, X7
   262  
   263  	// start with last (possibly overlapping) block
   264  	MOVOU	-128(AX)(CX*1), X8	// last 128 bytes of the input -> X8-X15
   265  	MOVOU	-112(AX)(CX*1), X9
   266  	MOVOU	-96(AX)(CX*1), X10
   267  	MOVOU	-80(AX)(CX*1), X11
   268  	MOVOU	-64(AX)(CX*1), X12
   269  	MOVOU	-48(AX)(CX*1), X13
   270  	MOVOU	-32(AX)(CX*1), X14
   271  	MOVOU	-16(AX)(CX*1), X15
   272  
   273  	// xor in seed
   274  	PXOR	X0, X8
   275  	PXOR	X1, X9
   276  	PXOR	X2, X10
   277  	PXOR	X3, X11
   278  	PXOR	X4, X12
   279  	PXOR	X5, X13
   280  	PXOR	X6, X14
   281  	PXOR	X7, X15
   282  
   283  	// compute number of remaining 128-byte blocks (read from the front; together with the last block above they cover every input byte)
   284  	DECQ	CX
   285  	SHRQ	$7, CX	// CX = (len-1)/128, which is >= 1 because len >= 129 on this path
   286  
   287  aesloop:
   288  	// scramble state
   289  	AESENC	X8, X8
   290  	AESENC	X9, X9
   291  	AESENC	X10, X10
   292  	AESENC	X11, X11
   293  	AESENC	X12, X12
   294  	AESENC	X13, X13
   295  	AESENC	X14, X14
   296  	AESENC	X15, X15
   297  
   298  	// scramble state, xor in a block (AESENC xors its source operand in as the round key)
   299  	MOVOU	(AX), X0	// X0-X7 are reused as data temporaries; the seeds were already xored into X8-X15
   300  	MOVOU	16(AX), X1
   301  	MOVOU	32(AX), X2
   302  	MOVOU	48(AX), X3
   303  	AESENC	X0, X8
   304  	AESENC	X1, X9
   305  	AESENC	X2, X10
   306  	AESENC	X3, X11
   307  	MOVOU	64(AX), X4
   308  	MOVOU	80(AX), X5
   309  	MOVOU	96(AX), X6
   310  	MOVOU	112(AX), X7
   311  	AESENC	X4, X12
   312  	AESENC	X5, X13
   313  	AESENC	X6, X14
   314  	AESENC	X7, X15
   315  
   316  	ADDQ	$128, AX	// advance to the next 128-byte block
   317  	DECQ	CX
   318  	JNE	aesloop	// repeat until the block count reaches zero
   319  
   320  	// 3 more scrambles to finish
   321  	AESENC	X8, X8
   322  	AESENC	X9, X9
   323  	AESENC	X10, X10
   324  	AESENC	X11, X11
   325  	AESENC	X12, X12
   326  	AESENC	X13, X13
   327  	AESENC	X14, X14
   328  	AESENC	X15, X15
   329  	AESENC	X8, X8
   330  	AESENC	X9, X9
   331  	AESENC	X10, X10
   332  	AESENC	X11, X11
   333  	AESENC	X12, X12
   334  	AESENC	X13, X13
   335  	AESENC	X14, X14
   336  	AESENC	X15, X15
   337  	AESENC	X8, X8
   338  	AESENC	X9, X9
   339  	AESENC	X10, X10
   340  	AESENC	X11, X11
   341  	AESENC	X12, X12
   342  	AESENC	X13, X13
   343  	AESENC	X14, X14
   344  	AESENC	X15, X15
   345  
   346  	PXOR	X12, X8	// fold the eight lanes into X8
   347  	PXOR	X13, X9
   348  	PXOR	X14, X10
   349  	PXOR	X15, X11
   350  	PXOR	X10, X8
   351  	PXOR	X11, X9
   352  	PXOR	X9, X8
   353  	MOVQ	X8, (DX)	// result = low 64 bits of the combined state
   354  	RET
   355  
   356  // func aeshash32(p unsafe.Pointer, h uintptr) uintptr -- hashes the 4 bytes at p with seed h using three AESENC rounds keyed from aeskeysched.
   357  TEXT ·aeshash32(SB),NOSPLIT,$0-24
   358  	MOVQ	p+0(FP), AX	// ptr to data
   359  	MOVQ	h+8(FP), X0	// seed in the low 64 bits; upper 64 bits cleared
   360  	PINSRD	$2, (AX), X0	// data: 4 bytes inserted as dword 2 (bits 64-95)
   361  	AESENC	·aeskeysched+0(SB), X0	// three rounds, each keyed with the next 16 bytes of aeskeysched
   362  	AESENC	·aeskeysched+16(SB), X0
   363  	AESENC	·aeskeysched+32(SB), X0
   364  	MOVQ	X0, ret+16(FP)	// return the low 64 bits of the state
   365  	RET
   366  
   367  // func aeshash64(p unsafe.Pointer, h uintptr) uintptr -- hashes the 8 bytes at p with seed h using three AESENC rounds keyed from aeskeysched.
   368  TEXT ·aeshash64(SB),NOSPLIT,$0-24
   369  	MOVQ	p+0(FP), AX	// ptr to data
   370  	MOVQ	h+8(FP), X0	// seed in the low 64 bits; upper 64 bits cleared
   371  	PINSRQ	$1, (AX), X0	// data: 8 bytes inserted as qword 1 (bits 64-127)
   372  	AESENC	·aeskeysched+0(SB), X0	// three rounds, each keyed with the next 16 bytes of aeskeysched
   373  	AESENC	·aeskeysched+16(SB), X0
   374  	AESENC	·aeskeysched+32(SB), X0
   375  	MOVQ	X0, ret+16(FP)	// return the low 64 bits of the state
   376  	RET
   377  
   378  // simple mask to get rid of data in the high part of the register. Entry n (16 bytes at offset 16*n, n = 0..15) has its low n bytes set to 0xff and the rest zero; aeshashbody ANDs entry len with a 16-byte load to keep only the first len bytes.
   379  DATA masks<>+0x00(SB)/8, $0x0000000000000000	// n=0: keep nothing
   380  DATA masks<>+0x08(SB)/8, $0x0000000000000000
   381  DATA masks<>+0x10(SB)/8, $0x00000000000000ff	// n=1
   382  DATA masks<>+0x18(SB)/8, $0x0000000000000000
   383  DATA masks<>+0x20(SB)/8, $0x000000000000ffff	// n=2
   384  DATA masks<>+0x28(SB)/8, $0x0000000000000000
   385  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff	// n=3
   386  DATA masks<>+0x38(SB)/8, $0x0000000000000000
   387  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff	// n=4
   388  DATA masks<>+0x48(SB)/8, $0x0000000000000000
   389  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff	// n=5
   390  DATA masks<>+0x58(SB)/8, $0x0000000000000000
   391  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff	// n=6
   392  DATA masks<>+0x68(SB)/8, $0x0000000000000000
   393  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff	// n=7
   394  DATA masks<>+0x78(SB)/8, $0x0000000000000000
   395  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff	// n=8: whole low qword
   396  DATA masks<>+0x88(SB)/8, $0x0000000000000000
   397  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff	// n=9
   398  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
   399  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff	// n=10
   400  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
   401  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff	// n=11
   402  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
   403  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff	// n=12
   404  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
   405  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff	// n=13
   406  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
   407  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff	// n=14
   408  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
   409  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff	// n=15
   410  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
   411  GLOBL masks<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only
   412  
   413  // these are arguments to pshufb. They move data down from
   414  // the high bytes of the register to the low bytes of the register.
   415  // index is how many bytes to move: entry n is the 16 bytes at offset 16*n. Control bytes of 0xff (high bit set) make pshufb write zero to that result byte.
   416  DATA shifts<>+0x00(SB)/8, $0x0000000000000000	// n=0 (aeshashbody branches to aes0 for length 0 before indexing this table)
   417  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
   418  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f	// n=1: result byte 0 = source byte 15
   419  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
   420  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e	// n=2: result bytes 0-1 = source bytes 14-15
   421  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
   422  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d	// n=3
   423  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
   424  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c	// n=4
   425  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
   426  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b	// n=5
   427  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
   428  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a	// n=6
   429  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
   430  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09	// n=7
   431  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
   432  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908	// n=8: high qword moved to low qword
   433  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
   434  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807	// n=9
   435  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
   436  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706	// n=10
   437  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
   438  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605	// n=11
   439  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
   440  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504	// n=12
   441  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
   442  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403	// n=13
   443  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
   444  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302	// n=14
   445  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
   446  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201	// n=15: source bytes 1-15 moved down by one
   447  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
   448  GLOBL shifts<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only