github.com/insolar/vanilla@v0.0.0-20201023172447-248fdf805322/aeshash/hash_amd64.s (about)

     1  #include "textflag.h"
     2  
     3  // aeshash hashes the s bytes at p, seeded with h, using AES hardware instructions.
        //
        // Stack layout, as used by the FP references below (frame $0, 32 bytes of
        // arguments plus result):
        //   p+0(FP)    pointer to the data
        //   h+8(FP)    64-bit seed
        //   s+16(FP)   length of the data in bytes
        //   ret+24(FP) result slot
        // The Go prototype is declared outside this file; presumably
        // func aeshash(p unsafe.Pointer, h, s uintptr) uintptr -- TODO confirm.
        //
        // Register roles after the prologue:
        //   AX = data pointer, CX = length, DX = address of the result slot,
        //   X0 = scrambled seed block, X1 = unscrambled seed block.
        // Every exit path stores the low 64 bits of its final XMM state through DX.
        //
        // AESENC and PSHUFB are used unconditionally; this routine contains no CPU
        // feature check.
        //
        // NOTE(review): every additional lane seed below is produced by the same
        // self-keyed AESENC applied to a copy of the unscrambled seed, so it equals
        // X0. Lanes are combined with PXOR, hence for len > 16 lanes that process
        // identical 16-byte blocks cancel each other (e.g. a 32-byte input made of
        // two equal halves stores 0, whatever the seed). Confirm whether identical
        // lane seeds are intended.
     4  
     5  TEXT ·aeshash(SB),NOSPLIT,$0-32
     6      // nolint
     7  	MOVQ	p+0(FP), AX	// AX = p: ptr to data
     8  	// nolint
     9  	MOVQ	s+16(FP), CX	// CX = s: size in bytes
    10  	LEAQ	ret+24(FP), DX	// DX = address of the result slot
    11  
        // Commented-out alternative prologue (aeshashstr): it would take a pointer
        // to a string/slice header and load the data pointer and length from it.
    12  //TEXT ·aeshashstr(SB),NOSPLIT,$0-24
    13  //	MOVQ	p+0(FP), AX	// ptr to string/slice struct
    14  //	MOVQ	8(AX), CX	// length of data
    15  //	MOVQ	(AX), AX	// data
    16  //	LEAQ	ret+16(FP), DX
    17  
    18  	// Build the seed block in X0: low 64 bits = seed h, high 64 bits = four
        	// copies of the low 16 bits of the length.
    19  	// nolint
    20  	MOVQ	h+8(FP), X0		// low 64 bits = seed h
    21  	PINSRW	$4, CX, X0			// word 4 = low 16 bits of length
    22  	PSHUFHW $0, X0, X0			// copy word 4 into words 4..7 (length 4 times total)
    23  	MOVO	X0, X1				// X1 = unscrambled seed block
    24  	AESENC	X0, X0				// X0 = seed block after one AES round keyed with itself
    25  
        	// Dispatch on the length (JB/JBE are unsigned comparisons).
    26  	CMPQ	CX, $16
    27  	JB	aes0to15	// len < 16
    28  	JE	aes16		// len == 16
    29  	CMPQ	CX, $32
    30  	JBE	aes17to32	// 17 <= len <= 32
    31  	CMPQ	CX, $64
    32  	JBE	aes33to64	// 33 <= len <= 64
    33  	CMPQ	CX, $128
    34  	JBE	aes65to128	// 65 <= len <= 128
    35  	JMP	aes129plus	// len >= 129
    36  
    37  aes0to15:
    38  	TESTQ	CX, CX
    39  	JE	aes0		// empty input
    40  
    41  	ADDQ	$16, AX		// AX = p + 16
    42  	TESTW	$0xff0, AX	// bits 4..11 of p+16 are all zero iff p & 0xff0 == 0xff0
    43  	JE	endofpage	// p lies in the last 16 bytes of a 4 KiB-aligned block
    44  
    45  	// p & 0xff0 != 0xff0, so the 16 bytes at p cannot cross a 4 KiB (page)
    46  	// boundary: load them directly and mask off the bytes beyond the length.
    47  	MOVOU	-16(AX), X1	// 16 bytes starting at p (AX is p+16 here)
    48  	ADDQ	CX, CX		// CX = 2*len, so CX*8 below is 16*len
    49  	MOVQ	$masks<>(SB), AX
    50  	PAND	(AX)(CX*8), X1	// masks entry len: keep the low len bytes, zero the rest
    51  final1:
        	// Shared tail for len 1..16: X1 holds the data block (zero padded when len < 16).
    52  	PXOR	X0, X1	// xor data with seed
    53  	AESENC	X1, X1	// scramble combo 3 times
    54  	AESENC	X1, X1
    55  	AESENC	X1, X1
    56  	MOVQ	X1, (DX)	// result = low 64 bits of X1
    57  	RET
    58  
    59  endofpage:
    60  	// p & 0xff0 == 0xff0: a 16-byte load starting at p might run up against
    61  	// a page boundary, so load the 16 bytes ending at the last data byte.
    62  	// Then shift the data bytes down to the low end using pshufb.
    63  	MOVOU	-32(AX)(CX*1), X1	// AX is p+16, so this reads [p+len-16, p+len)
    64  	ADDQ	CX, CX			// CX = 2*len, so CX*8 below is 16*len
    65  	MOVQ	$shifts<>(SB), AX
    66  	PSHUFB	(AX)(CX*8), X1		// shifts entry len: top len bytes -> low bytes, rest zeroed
    67  	JMP	final1
    68  
    69  aes0:
    70  	// Empty input: scramble the seed block once more and return its low 64 bits.
    71  	AESENC	X0, X0
    72  	MOVQ	X0, (DX)
    73  	RET
    74  
    75  aes16:
    76  	MOVOU	(AX), X1	// exactly one 16-byte block
    77  	JMP	final1
    78  
    79  aes17to32:
    80  	// make second starting seed
    81  	AESENC	X1, X1	// same round on the same input as X0, so X1 == X0 here
    82  
    83  	// load data to be hashed
    84  	MOVOU	(AX), X2		// first 16 bytes
    85  	MOVOU	-16(AX)(CX*1), X3	// last 16 bytes (overlaps the first block when len < 32)
    86  
    87  	// xor with seed
    88  	PXOR	X0, X2
    89  	PXOR	X1, X3
    90  
    91  	// scramble 3 times
    92  	AESENC	X2, X2
    93  	AESENC	X3, X3
    94  	AESENC	X2, X2
    95  	AESENC	X3, X3
    96  	AESENC	X2, X2
    97  	AESENC	X3, X3
    98  
    99  	// combine results
   100  	PXOR	X3, X2
   101  	MOVQ	X2, (DX)	// result = low 64 bits of the combined state
   102  	RET
   103  
   104  aes33to64:
   105  	// make 3 more starting seeds
        	// (X1..X3 end up equal: identical round on identical copies of X1)
   106  	MOVO	X1, X2
   107  	MOVO	X1, X3
   108  	AESENC	X1, X1
   109  	AESENC	X2, X2
   110  	AESENC	X3, X3
   111  
        	// load the first 32 and the last 32 bytes (they overlap when len < 64)
   112  	MOVOU	(AX), X4
   113  	MOVOU	16(AX), X5
   114  	MOVOU	-32(AX)(CX*1), X6
   115  	MOVOU	-16(AX)(CX*1), X7
   116  
        	// xor each lane with its seed
   117  	PXOR	X0, X4
   118  	PXOR	X1, X5
   119  	PXOR	X2, X6
   120  	PXOR	X3, X7
   121  
        	// scramble every lane 3 times
   122  	AESENC	X4, X4
   123  	AESENC	X5, X5
   124  	AESENC	X6, X6
   125  	AESENC	X7, X7
   126  
   127  	AESENC	X4, X4
   128  	AESENC	X5, X5
   129  	AESENC	X6, X6
   130  	AESENC	X7, X7
   131  
   132  	AESENC	X4, X4
   133  	AESENC	X5, X5
   134  	AESENC	X6, X6
   135  	AESENC	X7, X7
   136  
        	// fold the four lanes into X4
   137  	PXOR	X6, X4
   138  	PXOR	X7, X5
   139  	PXOR	X5, X4
   140  	MOVQ	X4, (DX)	// result = low 64 bits of the combined state
   141  	RET
   142  
   143  aes65to128:
   144  	// make 7 more starting seeds
        	// (X1..X7 end up equal: identical round on identical copies of X1)
   145  	MOVO	X1, X2
   146  	MOVO	X1, X3
   147  	MOVO	X1, X4
   148  	MOVO	X1, X5
   149  	MOVO	X1, X6
   150  	MOVO	X1, X7
   151  	AESENC	X1, X1
   152  	AESENC	X2, X2
   153  	AESENC	X3, X3
   154  	AESENC	X4, X4
   155  	AESENC	X5, X5
   156  	AESENC	X6, X6
   157  	AESENC	X7, X7
   158  
   159  	// load data: first 64 and last 64 bytes (they overlap when len < 128)
   160  	MOVOU	(AX), X8
   161  	MOVOU	16(AX), X9
   162  	MOVOU	32(AX), X10
   163  	MOVOU	48(AX), X11
   164  	MOVOU	-64(AX)(CX*1), X12
   165  	MOVOU	-48(AX)(CX*1), X13
   166  	MOVOU	-32(AX)(CX*1), X14
   167  	MOVOU	-16(AX)(CX*1), X15
   168  
   169  	// xor with seed
   170  	PXOR	X0, X8
   171  	PXOR	X1, X9
   172  	PXOR	X2, X10
   173  	PXOR	X3, X11
   174  	PXOR	X4, X12
   175  	PXOR	X5, X13
   176  	PXOR	X6, X14
   177  	PXOR	X7, X15
   178  
   179  	// scramble 3 times
   180  	AESENC	X8, X8
   181  	AESENC	X9, X9
   182  	AESENC	X10, X10
   183  	AESENC	X11, X11
   184  	AESENC	X12, X12
   185  	AESENC	X13, X13
   186  	AESENC	X14, X14
   187  	AESENC	X15, X15
   188  
   189  	AESENC	X8, X8
   190  	AESENC	X9, X9
   191  	AESENC	X10, X10
   192  	AESENC	X11, X11
   193  	AESENC	X12, X12
   194  	AESENC	X13, X13
   195  	AESENC	X14, X14
   196  	AESENC	X15, X15
   197  
   198  	AESENC	X8, X8
   199  	AESENC	X9, X9
   200  	AESENC	X10, X10
   201  	AESENC	X11, X11
   202  	AESENC	X12, X12
   203  	AESENC	X13, X13
   204  	AESENC	X14, X14
   205  	AESENC	X15, X15
   206  
   207  	// combine results: fold the eight lanes into X8
   208  	PXOR	X12, X8
   209  	PXOR	X13, X9
   210  	PXOR	X14, X10
   211  	PXOR	X15, X11
   212  	PXOR	X10, X8
   213  	PXOR	X11, X9
   214  	PXOR	X9, X8
   215  	MOVQ	X8, (DX)	// result = low 64 bits of the combined state
   216  	RET
   217  
   218  aes129plus:
   219  	// make 7 more starting seeds
        	// (X1..X7 end up equal: identical round on identical copies of X1)
   220  	MOVO	X1, X2
   221  	MOVO	X1, X3
   222  	MOVO	X1, X4
   223  	MOVO	X1, X5
   224  	MOVO	X1, X6
   225  	MOVO	X1, X7
   226  	AESENC	X1, X1
   227  	AESENC	X2, X2
   228  	AESENC	X3, X3
   229  	AESENC	X4, X4
   230  	AESENC	X5, X5
   231  	AESENC	X6, X6
   232  	AESENC	X7, X7
   233  
   234  	// start with the last 128 bytes (they may overlap blocks read by the loop)
   235  	MOVOU	-128(AX)(CX*1), X8
   236  	MOVOU	-112(AX)(CX*1), X9
   237  	MOVOU	-96(AX)(CX*1), X10
   238  	MOVOU	-80(AX)(CX*1), X11
   239  	MOVOU	-64(AX)(CX*1), X12
   240  	MOVOU	-48(AX)(CX*1), X13
   241  	MOVOU	-32(AX)(CX*1), X14
   242  	MOVOU	-16(AX)(CX*1), X15
   243  
   244  	// xor in seed
   245  	PXOR	X0, X8
   246  	PXOR	X1, X9
   247  	PXOR	X2, X10
   248  	PXOR	X3, X11
   249  	PXOR	X4, X12
   250  	PXOR	X5, X13
   251  	PXOR	X6, X14
   252  	PXOR	X7, X15
   253  
   254  	// compute number of 128-byte blocks to read from the front
   255  	DECQ	CX
   256  	SHRQ	$7, CX		// CX = (len-1)/128, at least 1 because len >= 129
   257  
   258  aesloop:
   259  	// scramble state
   260  	AESENC	X8, X8
   261  	AESENC	X9, X9
   262  	AESENC	X10, X10
   263  	AESENC	X11, X11
   264  	AESENC	X12, X12
   265  	AESENC	X13, X13
   266  	AESENC	X14, X14
   267  	AESENC	X15, X15
   268  
   269  	// scramble state, xor in a block: the 128 data bytes at AX are loaded
        	// into X0..X7 (the seeds are no longer needed) and used as the AESENC
        	// source operand for lanes X8..X15
   270  	MOVOU	(AX), X0
   271  	MOVOU	16(AX), X1
   272  	MOVOU	32(AX), X2
   273  	MOVOU	48(AX), X3
   274  	AESENC	X0, X8
   275  	AESENC	X1, X9
   276  	AESENC	X2, X10
   277  	AESENC	X3, X11
   278  	MOVOU	64(AX), X4
   279  	MOVOU	80(AX), X5
   280  	MOVOU	96(AX), X6
   281  	MOVOU	112(AX), X7
   282  	AESENC	X4, X12
   283  	AESENC	X5, X13
   284  	AESENC	X6, X14
   285  	AESENC	X7, X15
   286  
   287  	ADDQ	$128, AX	// advance to the next 128-byte block
   288  	DECQ	CX
   289  	JNE	aesloop		// repeat until CX blocks have been consumed
   290  
   291  	// 3 more scrambles to finish
   292  	AESENC	X8, X8
   293  	AESENC	X9, X9
   294  	AESENC	X10, X10
   295  	AESENC	X11, X11
   296  	AESENC	X12, X12
   297  	AESENC	X13, X13
   298  	AESENC	X14, X14
   299  	AESENC	X15, X15
   300  	AESENC	X8, X8
   301  	AESENC	X9, X9
   302  	AESENC	X10, X10
   303  	AESENC	X11, X11
   304  	AESENC	X12, X12
   305  	AESENC	X13, X13
   306  	AESENC	X14, X14
   307  	AESENC	X15, X15
   308  	AESENC	X8, X8
   309  	AESENC	X9, X9
   310  	AESENC	X10, X10
   311  	AESENC	X11, X11
   312  	AESENC	X12, X12
   313  	AESENC	X13, X13
   314  	AESENC	X14, X14
   315  	AESENC	X15, X15
   316  
        	// fold the eight lanes into X8
   317  	PXOR	X12, X8
   318  	PXOR	X13, X9
   319  	PXOR	X14, X10
   320  	PXOR	X15, X11
   321  	PXOR	X10, X8
   322  	PXOR	X11, X9
   323  	PXOR	X9, X8
   324  	MOVQ	X8, (DX)	// result = low 64 bits of the combined state
   325  	RET
   326  
   327  // masks<>: byte masks that clear the high part of a 16-byte register.
        // 16 entries of 16 bytes each (two 8-byte DATA words, low word first).
        // Entry i, at offset 16*i, has its low i bytes set to 0xff and the
        // remaining bytes zero, so and-ing with entry i keeps the first i bytes.
        // aeshash indexes it with the data length (1..15) as the PAND operand.
        // NOTE(review): the table is read as a PAND memory operand, which
        // requires a 16-byte aligned address; the alignment is not stated here
        // and is left to the linker -- confirm it is guaranteed.
   328  DATA masks<>+0x00(SB)/8, $0x0000000000000000	// entry 0: keep 0 bytes
   329  DATA masks<>+0x08(SB)/8, $0x0000000000000000
   330  DATA masks<>+0x10(SB)/8, $0x00000000000000ff	// entry 1: keep 1 byte
   331  DATA masks<>+0x18(SB)/8, $0x0000000000000000
   332  DATA masks<>+0x20(SB)/8, $0x000000000000ffff	// entry 2
   333  DATA masks<>+0x28(SB)/8, $0x0000000000000000
   334  DATA masks<>+0x30(SB)/8, $0x0000000000ffffff	// entry 3
   335  DATA masks<>+0x38(SB)/8, $0x0000000000000000
   336  DATA masks<>+0x40(SB)/8, $0x00000000ffffffff	// entry 4
   337  DATA masks<>+0x48(SB)/8, $0x0000000000000000
   338  DATA masks<>+0x50(SB)/8, $0x000000ffffffffff	// entry 5
   339  DATA masks<>+0x58(SB)/8, $0x0000000000000000
   340  DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff	// entry 6
   341  DATA masks<>+0x68(SB)/8, $0x0000000000000000
   342  DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff	// entry 7
   343  DATA masks<>+0x78(SB)/8, $0x0000000000000000
   344  DATA masks<>+0x80(SB)/8, $0xffffffffffffffff	// entry 8: keep the whole low word
   345  DATA masks<>+0x88(SB)/8, $0x0000000000000000
   346  DATA masks<>+0x90(SB)/8, $0xffffffffffffffff	// entry 9
   347  DATA masks<>+0x98(SB)/8, $0x00000000000000ff
   348  DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff	// entry 10
   349  DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
   350  DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff	// entry 11
   351  DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
   352  DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff	// entry 12
   353  DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
   354  DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff	// entry 13
   355  DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
   356  DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff	// entry 14
   357  DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
   358  DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff	// entry 15: keep 15 bytes
   359  DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
   360  GLOBL masks<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only
   361  
   362  // shifts<>: control operands for pshufb. They move data down from
   363  // the high bytes of the register to the low bytes of the register.
   364  // The index is how many bytes to move: entry i, at offset 16*i, holds the
        // source indexes 16-i..15 in its low i control bytes and 0xff in the rest,
        // so the top i bytes land in bytes 0..i-1 and every other byte is zeroed
        // (a control byte with its top bit set produces zero).
        // 16 entries of 16 bytes each (two 8-byte DATA words, low word first).
        // NOTE(review): the table is read as a PSHUFB memory operand, which
        // requires a 16-byte aligned address; the alignment is not stated here
        // and is left to the linker -- confirm it is guaranteed.
   365  DATA shifts<>+0x00(SB)/8, $0x0000000000000000	// entry 0: all-zero control bytes
   366  DATA shifts<>+0x08(SB)/8, $0x0000000000000000
   367  DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f	// entry 1: byte 15 -> byte 0
   368  DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
   369  DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e	// entry 2: bytes 14..15 -> 0..1
   370  DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
   371  DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d	// entry 3
   372  DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
   373  DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c	// entry 4
   374  DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
   375  DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b	// entry 5
   376  DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
   377  DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a	// entry 6
   378  DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
   379  DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09	// entry 7
   380  DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
   381  DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908	// entry 8: high word -> low word
   382  DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
   383  DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807	// entry 9
   384  DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
   385  DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706	// entry 10
   386  DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
   387  DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605	// entry 11
   388  DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
   389  DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504	// entry 12
   390  DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
   391  DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403	// entry 13
   392  DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
   393  DATA shifts<>+0xe0(SB)/8, $0x0908070605040302	// entry 14
   394  DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
   395  DATA shifts<>+0xf0(SB)/8, $0x0807060504030201	// entry 15: bytes 1..15 -> 0..14
   396  DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
   397  GLOBL shifts<>(SB),RODATA,$256	// 16 entries * 16 bytes, read-only