github.com/arr-ai/hash@v0.8.0/asm_386.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "funcdata.h"
     7  #include "textflag.h"
     8  
     9  // hash function using AES hardware instructions: hashes s bytes at p with the seed at h+4(FP); a 32-bit result is stored in ret.
    10  TEXT ·aeshash(SB),NOSPLIT,$0-16
    11  	MOVL	p+0(FP), AX	// AX = ptr to data
    12  	MOVL	s+8(FP), BX	// BX = size in bytes
    13  	LEAL	ret+12(FP), DX	// DX = address of the result slot; the body stores the hash through it
    14  	JMP	aeshashbody<>(SB)	// tail jump: the body reads the seed from h+4(FP) and executes RET itself
    15  
    16  TEXT ·aeshashstr(SB),NOSPLIT,$0-12	// hashes the string whose header p points to; seed at h+4(FP), 32-bit result stored in ret
    17  	MOVL	p+0(FP), AX	// ptr to string object
    18  	MOVL	4(AX), BX	// length of string (second word of the string object)
    19  	MOVL	(AX), AX	// string data (first word); overwrites AX, which is why the length is loaded first
    20  	LEAL	ret+8(FP), DX	// DX = address of the result slot
    21  	JMP	aeshashbody<>(SB)	// tail jump into the shared body, which stores to (DX) and returns
    22  
    23  // AX: pointer to the data to hash
    24  // BX: length of the data in bytes
    25  // DX: address to put return value (written with a 32-bit store)
    26  TEXT aeshashbody<>(SB),NOSPLIT,$0-0	// shared body; reached by JMP from the entry points above, so h+4(FP) is their seed argument
    27  	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
    28  	PINSRW	$4, BX, X0	            // low 16 bits of length go into word 4 of X0
    29  	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times (fills the high 4 words)
    30  	MOVO	X0, X1                      // save unscrambled seed; the longer paths derive extra seeds from it
    31  	PXOR	·aeskeysched(SB), X0 // xor in per-process seed
    32  	AESENC	X0, X0                      // scramble seed
    33  
    34  	CMPL	BX, $16	// dispatch on length (unsigned compares)
    35  	JB	aes0to15	// length < 16
    36  	JE	aes16	// length == 16
    37  	CMPL	BX, $32
    38  	JBE	aes17to32	// 17 <= length <= 32
    39  	CMPL	BX, $64
    40  	JBE	aes33to64	// 33 <= length <= 64
    41  	JMP	aes65plus	// length >= 65
    42  
    43  aes0to15:
    44  	TESTL	BX, BX
    45  	JE	aes0	// empty input: nothing to load
    46  
    47  	ADDL	$16, AX	// AX = data+16; its bits 4-11 are all zero exactly when bits 4-11 of data were all ones
    48  	TESTW	$0xff0, AX
    49  	JE	endofpage	// data lies in the last 16 bytes before a 4 KiB boundary; a 16-byte load from it might cross
    50  
    51  	// 16 bytes loaded at this address won't cross
    52  	// a page boundary, so we can load it directly.
    53  	MOVOU	-16(AX), X1	// AX was advanced by 16 above, so this reads from the original data pointer
    54  	ADDL	BX, BX	// BX = 2*length, so that BX*8 below steps through 16-byte table entries
    55  	PAND	masks<>(SB)(BX*8), X1	// keep only the low 'length' bytes, zero the rest
    56  
    57  final1:
    58  	AESENC	X0, X1  // scramble input, xor in seed
    59  	AESENC	X1, X1  // scramble combo 2 times
    60  	AESENC	X1, X1
    61  	MOVL	X1, (DX)	// the low 32 bits of the state are the result
    62  	RET
    63  
    64  endofpage:
    65  	// bits 4-11 of the data address are all ones. Might be up against
    66  	// a page boundary, so load ending at last byte.
    67  	// Then shift bytes down using pshufb.
    68  	MOVOU	-32(AX)(BX*1), X1	// AX is data+16 here, so this reads the 16 bytes ending at data+length-1
    69  	ADDL	BX, BX	// BX = 2*length, so that BX*8 below steps through 16-byte table entries
    70  	PSHUFB	shifts<>(SB)(BX*8), X1	// move the top 'length' bytes down to the bottom; 0xff selectors zero the rest
    71  	JMP	final1
    72  
    73  aes0:
    74  	// Return scrambled input seed
    75  	AESENC	X0, X0
    76  	MOVL	X0, (DX)	// the low 32 bits of the state are the result
    77  	RET
    78  
    79  aes16:
    80  	MOVOU	(AX), X1	// exactly one full 16-byte block, so no masking is needed
    81  	JMP	final1
    82  
    83  aes17to32:
    84  	// make second starting seed
    85  	PXOR	·aeskeysched+16(SB), X1	// X1 still holds the unscrambled seed saved above
    86  	AESENC	X1, X1
    87  
    88  	// load data to be hashed
    89  	MOVOU	(AX), X2	// first 16 bytes
    90  	MOVOU	-16(AX)(BX*1), X3	// last 16 bytes; overlaps the first load when length < 32
    91  
    92  	// scramble 3 times
    93  	AESENC	X0, X2
    94  	AESENC	X1, X3
    95  	AESENC	X2, X2
    96  	AESENC	X3, X3
    97  	AESENC	X2, X2
    98  	AESENC	X3, X3
    99  
   100  	// combine results
   101  	PXOR	X3, X2
   102  	MOVL	X2, (DX)	// the low 32 bits of the combined state are the result
   103  	RET
   104  
   105  aes33to64:
   106  	// make 3 more starting seeds
   107  	MOVO	X1, X2	// X1 still holds the unscrambled seed saved above
   108  	MOVO	X1, X3
   109  	PXOR	·aeskeysched+16(SB), X1
   110  	PXOR	·aeskeysched+32(SB), X2
   111  	PXOR	·aeskeysched+48(SB), X3
   112  	AESENC	X1, X1
   113  	AESENC	X2, X2
   114  	AESENC	X3, X3
   115  
   116  	MOVOU	(AX), X4	// first 32 bytes go into X4, X5
   117  	MOVOU	16(AX), X5
   118  	MOVOU	-32(AX)(BX*1), X6	// last 32 bytes go into X6, X7; they overlap the first 32 when length < 64
   119  	MOVOU	-16(AX)(BX*1), X7
   120  
   121  	AESENC	X0, X4	// round 1: each lane is mixed with its own seed
   122  	AESENC	X1, X5
   123  	AESENC	X2, X6
   124  	AESENC	X3, X7
   125  
   126  	AESENC	X4, X4	// round 2: each lane scrambles itself
   127  	AESENC	X5, X5
   128  	AESENC	X6, X6
   129  	AESENC	X7, X7
   130  
   131  	AESENC	X4, X4	// round 3
   132  	AESENC	X5, X5
   133  	AESENC	X6, X6
   134  	AESENC	X7, X7
   135  
   136  	PXOR	X6, X4	// fold the four lanes into X4
   137  	PXOR	X7, X5
   138  	PXOR	X5, X4
   139  	MOVL	X4, (DX)	// the low 32 bits of the combined state are the result
   140  	RET
   141  
   142  aes65plus:
   143  	// make 3 more starting seeds
   144  	MOVO	X1, X2	// X1 still holds the unscrambled seed saved above
   145  	MOVO	X1, X3
   146  	PXOR	·aeskeysched+16(SB), X1
   147  	PXOR	·aeskeysched+32(SB), X2
   148  	PXOR	·aeskeysched+48(SB), X3
   149  	AESENC	X1, X1
   150  	AESENC	X2, X2
   151  	AESENC	X3, X3
   152  
   153  	// start with last (possibly overlapping) block
   154  	MOVOU	-64(AX)(BX*1), X4	// the final 64 bytes of the input become the initial state X4-X7
   155  	MOVOU	-48(AX)(BX*1), X5
   156  	MOVOU	-32(AX)(BX*1), X6
   157  	MOVOU	-16(AX)(BX*1), X7
   158  
   159  	// scramble state once
   160  	AESENC	X0, X4
   161  	AESENC	X1, X5
   162  	AESENC	X2, X6
   163  	AESENC	X3, X7
   164  
   165  	// compute number of remaining 64-byte blocks
   166  	DECL	BX
   167  	SHRL	$6, BX	// BX = (length-1)/64, at least 1 here because length >= 65
   168  
   169  aesloop:
   170  	// scramble state, xor in a block
   171  	MOVOU	(AX), X0	// X0-X3 are reused for data; the seeds they held were consumed above
   172  	MOVOU	16(AX), X1
   173  	MOVOU	32(AX), X2
   174  	MOVOU	48(AX), X3
   175  	AESENC	X0, X4
   176  	AESENC	X1, X5
   177  	AESENC	X2, X6
   178  	AESENC	X3, X7
   179  
   180  	// scramble state
   181  	AESENC	X4, X4
   182  	AESENC	X5, X5
   183  	AESENC	X6, X6
   184  	AESENC	X7, X7
   185  
   186  	ADDL	$64, AX	// advance to the next 64-byte block
   187  	DECL	BX
   188  	JNE	aesloop	// DECL set ZF when the block count reached zero
   189  
   190  	// 2 more scrambles to finish
   191  	AESENC	X4, X4
   192  	AESENC	X5, X5
   193  	AESENC	X6, X6
   194  	AESENC	X7, X7
   195  
   196  	AESENC	X4, X4
   197  	AESENC	X5, X5
   198  	AESENC	X6, X6
   199  	AESENC	X7, X7
   200  
   201  	PXOR	X6, X4	// fold the four lanes into X4
   202  	PXOR	X7, X5
   203  	PXOR	X5, X4
   204  	MOVL	X4, (DX)	// the low 32 bits of the combined state are the result
   205  	RET
   206  
   207  TEXT ·aeshash32(SB),NOSPLIT,$0-12	// hashes the 4 bytes at p together with the seed h; 32-bit result stored in ret
   208  	MOVL	p+0(FP), AX	// ptr to data
   209  	MOVL	h+4(FP), X0	// seed goes into dword 0 of X0
   210  	PINSRD	$1, (AX), X0	// data: the 4 bytes at p go into dword 1
   211  	AESENC	·aeskeysched+0(SB), X0	// three AES rounds, keyed by consecutive 16-byte chunks of aeskeysched
   212  	AESENC	·aeskeysched+16(SB), X0
   213  	AESENC	·aeskeysched+32(SB), X0
   214  	MOVL	X0, ret+8(FP)	// the low 32 bits of the state are the result
   215  	RET
   216  
   217  TEXT ·aeshash64(SB),NOSPLIT,$0-12	// hashes the 8 bytes at p together with the seed h; 32-bit result stored in ret
   218  	MOVL	p+0(FP), AX	// ptr to data
   219  	MOVQ	(AX), X0	// data: the 8 bytes at p go into the low qword of X0
   220  	PINSRD	$2, h+4(FP), X0	// seed goes into dword 2, directly above the data
   221  	AESENC	·aeskeysched+0(SB), X0	// three AES rounds, keyed by consecutive 16-byte chunks of aeskeysched
   222  	AESENC	·aeskeysched+16(SB), X0
   223  	AESENC	·aeskeysched+32(SB), X0
   224  	MOVL	X0, ret+8(FP)	// the low 32 bits of the state are the result
   225  	RET
   226  
   227  // simple mask to get rid of data in the high part of the register: 16 entries of 16 bytes; entry n (at offset n*16) keeps the low n bytes under PAND.
   228  DATA masks<>+0x00(SB)/4, $0x00000000	// entry 0: keeps nothing (zero length is handled at aes0 before the table lookup in the code shown here)
   229  DATA masks<>+0x04(SB)/4, $0x00000000
   230  DATA masks<>+0x08(SB)/4, $0x00000000
   231  DATA masks<>+0x0c(SB)/4, $0x00000000
   232  
   233  DATA masks<>+0x10(SB)/4, $0x000000ff	// entry 1: keep low 1 byte
   234  DATA masks<>+0x14(SB)/4, $0x00000000
   235  DATA masks<>+0x18(SB)/4, $0x00000000
   236  DATA masks<>+0x1c(SB)/4, $0x00000000
   237  
   238  DATA masks<>+0x20(SB)/4, $0x0000ffff	// entry 2: keep low 2 bytes
   239  DATA masks<>+0x24(SB)/4, $0x00000000
   240  DATA masks<>+0x28(SB)/4, $0x00000000
   241  DATA masks<>+0x2c(SB)/4, $0x00000000
   242  
   243  DATA masks<>+0x30(SB)/4, $0x00ffffff	// entry 3: keep low 3 bytes
   244  DATA masks<>+0x34(SB)/4, $0x00000000
   245  DATA masks<>+0x38(SB)/4, $0x00000000
   246  DATA masks<>+0x3c(SB)/4, $0x00000000
   247  
   248  DATA masks<>+0x40(SB)/4, $0xffffffff	// entry 4: keep low 4 bytes
   249  DATA masks<>+0x44(SB)/4, $0x00000000
   250  DATA masks<>+0x48(SB)/4, $0x00000000
   251  DATA masks<>+0x4c(SB)/4, $0x00000000
   252  
   253  DATA masks<>+0x50(SB)/4, $0xffffffff	// entry 5: keep low 5 bytes
   254  DATA masks<>+0x54(SB)/4, $0x000000ff
   255  DATA masks<>+0x58(SB)/4, $0x00000000
   256  DATA masks<>+0x5c(SB)/4, $0x00000000
   257  
   258  DATA masks<>+0x60(SB)/4, $0xffffffff	// entry 6: keep low 6 bytes
   259  DATA masks<>+0x64(SB)/4, $0x0000ffff
   260  DATA masks<>+0x68(SB)/4, $0x00000000
   261  DATA masks<>+0x6c(SB)/4, $0x00000000
   262  
   263  DATA masks<>+0x70(SB)/4, $0xffffffff	// entry 7: keep low 7 bytes
   264  DATA masks<>+0x74(SB)/4, $0x00ffffff
   265  DATA masks<>+0x78(SB)/4, $0x00000000
   266  DATA masks<>+0x7c(SB)/4, $0x00000000
   267  
   268  DATA masks<>+0x80(SB)/4, $0xffffffff	// entry 8: keep low 8 bytes
   269  DATA masks<>+0x84(SB)/4, $0xffffffff
   270  DATA masks<>+0x88(SB)/4, $0x00000000
   271  DATA masks<>+0x8c(SB)/4, $0x00000000
   272  
   273  DATA masks<>+0x90(SB)/4, $0xffffffff	// entry 9: keep low 9 bytes
   274  DATA masks<>+0x94(SB)/4, $0xffffffff
   275  DATA masks<>+0x98(SB)/4, $0x000000ff
   276  DATA masks<>+0x9c(SB)/4, $0x00000000
   277  
   278  DATA masks<>+0xa0(SB)/4, $0xffffffff	// entry 10: keep low 10 bytes
   279  DATA masks<>+0xa4(SB)/4, $0xffffffff
   280  DATA masks<>+0xa8(SB)/4, $0x0000ffff
   281  DATA masks<>+0xac(SB)/4, $0x00000000
   282  
   283  DATA masks<>+0xb0(SB)/4, $0xffffffff	// entry 11: keep low 11 bytes
   284  DATA masks<>+0xb4(SB)/4, $0xffffffff
   285  DATA masks<>+0xb8(SB)/4, $0x00ffffff
   286  DATA masks<>+0xbc(SB)/4, $0x00000000
   287  
   288  DATA masks<>+0xc0(SB)/4, $0xffffffff	// entry 12: keep low 12 bytes
   289  DATA masks<>+0xc4(SB)/4, $0xffffffff
   290  DATA masks<>+0xc8(SB)/4, $0xffffffff
   291  DATA masks<>+0xcc(SB)/4, $0x00000000
   292  
   293  DATA masks<>+0xd0(SB)/4, $0xffffffff	// entry 13: keep low 13 bytes
   294  DATA masks<>+0xd4(SB)/4, $0xffffffff
   295  DATA masks<>+0xd8(SB)/4, $0xffffffff
   296  DATA masks<>+0xdc(SB)/4, $0x000000ff
   297  
   298  DATA masks<>+0xe0(SB)/4, $0xffffffff	// entry 14: keep low 14 bytes
   299  DATA masks<>+0xe4(SB)/4, $0xffffffff
   300  DATA masks<>+0xe8(SB)/4, $0xffffffff
   301  DATA masks<>+0xec(SB)/4, $0x0000ffff
   302  
   303  DATA masks<>+0xf0(SB)/4, $0xffffffff	// entry 15: keep low 15 bytes
   304  DATA masks<>+0xf4(SB)/4, $0xffffffff
   305  DATA masks<>+0xf8(SB)/4, $0xffffffff
   306  DATA masks<>+0xfc(SB)/4, $0x00ffffff
   307  
   308  GLOBL masks<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only, file-local symbol
   309  
   310  // these are arguments to pshufb. They move data down from
   311  // the high bytes of the register to the low bytes of the register.
   312  // index is how many bytes to move: entry n (at offset n*16) sends source bytes 16-n..15 to bytes 0..n-1; 0xff selectors zero a byte.
   313  DATA shifts<>+0x00(SB)/4, $0x00000000	// entry 0: placeholder (zero length is handled at aes0 before the table lookup in the code shown here)
   314  DATA shifts<>+0x04(SB)/4, $0x00000000
   315  DATA shifts<>+0x08(SB)/4, $0x00000000
   316  DATA shifts<>+0x0c(SB)/4, $0x00000000
   317  
   318  DATA shifts<>+0x10(SB)/4, $0xffffff0f	// entry 1: src byte 15 -> dst byte 0
   319  DATA shifts<>+0x14(SB)/4, $0xffffffff
   320  DATA shifts<>+0x18(SB)/4, $0xffffffff
   321  DATA shifts<>+0x1c(SB)/4, $0xffffffff
   322  
   323  DATA shifts<>+0x20(SB)/4, $0xffff0f0e	// entry 2: src bytes 14-15 -> dst bytes 0-1
   324  DATA shifts<>+0x24(SB)/4, $0xffffffff
   325  DATA shifts<>+0x28(SB)/4, $0xffffffff
   326  DATA shifts<>+0x2c(SB)/4, $0xffffffff
   327  
   328  DATA shifts<>+0x30(SB)/4, $0xff0f0e0d	// entry 3: src bytes 13-15 -> dst bytes 0-2
   329  DATA shifts<>+0x34(SB)/4, $0xffffffff
   330  DATA shifts<>+0x38(SB)/4, $0xffffffff
   331  DATA shifts<>+0x3c(SB)/4, $0xffffffff
   332  
   333  DATA shifts<>+0x40(SB)/4, $0x0f0e0d0c	// entry 4: src bytes 12-15 -> dst bytes 0-3
   334  DATA shifts<>+0x44(SB)/4, $0xffffffff
   335  DATA shifts<>+0x48(SB)/4, $0xffffffff
   336  DATA shifts<>+0x4c(SB)/4, $0xffffffff
   337  
   338  DATA shifts<>+0x50(SB)/4, $0x0e0d0c0b	// entry 5: src bytes 11-15 -> dst bytes 0-4
   339  DATA shifts<>+0x54(SB)/4, $0xffffff0f
   340  DATA shifts<>+0x58(SB)/4, $0xffffffff
   341  DATA shifts<>+0x5c(SB)/4, $0xffffffff
   342  
   343  DATA shifts<>+0x60(SB)/4, $0x0d0c0b0a	// entry 6: src bytes 10-15 -> dst bytes 0-5
   344  DATA shifts<>+0x64(SB)/4, $0xffff0f0e
   345  DATA shifts<>+0x68(SB)/4, $0xffffffff
   346  DATA shifts<>+0x6c(SB)/4, $0xffffffff
   347  
   348  DATA shifts<>+0x70(SB)/4, $0x0c0b0a09	// entry 7: src bytes 9-15 -> dst bytes 0-6
   349  DATA shifts<>+0x74(SB)/4, $0xff0f0e0d
   350  DATA shifts<>+0x78(SB)/4, $0xffffffff
   351  DATA shifts<>+0x7c(SB)/4, $0xffffffff
   352  
   353  DATA shifts<>+0x80(SB)/4, $0x0b0a0908	// entry 8: src bytes 8-15 -> dst bytes 0-7
   354  DATA shifts<>+0x84(SB)/4, $0x0f0e0d0c
   355  DATA shifts<>+0x88(SB)/4, $0xffffffff
   356  DATA shifts<>+0x8c(SB)/4, $0xffffffff
   357  
   358  DATA shifts<>+0x90(SB)/4, $0x0a090807	// entry 9: src bytes 7-15 -> dst bytes 0-8
   359  DATA shifts<>+0x94(SB)/4, $0x0e0d0c0b
   360  DATA shifts<>+0x98(SB)/4, $0xffffff0f
   361  DATA shifts<>+0x9c(SB)/4, $0xffffffff
   362  
   363  DATA shifts<>+0xa0(SB)/4, $0x09080706	// entry 10: src bytes 6-15 -> dst bytes 0-9
   364  DATA shifts<>+0xa4(SB)/4, $0x0d0c0b0a
   365  DATA shifts<>+0xa8(SB)/4, $0xffff0f0e
   366  DATA shifts<>+0xac(SB)/4, $0xffffffff
   367  
   368  DATA shifts<>+0xb0(SB)/4, $0x08070605	// entry 11: src bytes 5-15 -> dst bytes 0-10
   369  DATA shifts<>+0xb4(SB)/4, $0x0c0b0a09
   370  DATA shifts<>+0xb8(SB)/4, $0xff0f0e0d
   371  DATA shifts<>+0xbc(SB)/4, $0xffffffff
   372  
   373  DATA shifts<>+0xc0(SB)/4, $0x07060504	// entry 12: src bytes 4-15 -> dst bytes 0-11
   374  DATA shifts<>+0xc4(SB)/4, $0x0b0a0908
   375  DATA shifts<>+0xc8(SB)/4, $0x0f0e0d0c
   376  DATA shifts<>+0xcc(SB)/4, $0xffffffff
   377  
   378  DATA shifts<>+0xd0(SB)/4, $0x06050403	// entry 13: src bytes 3-15 -> dst bytes 0-12
   379  DATA shifts<>+0xd4(SB)/4, $0x0a090807
   380  DATA shifts<>+0xd8(SB)/4, $0x0e0d0c0b
   381  DATA shifts<>+0xdc(SB)/4, $0xffffff0f
   382  
   383  DATA shifts<>+0xe0(SB)/4, $0x05040302	// entry 14: src bytes 2-15 -> dst bytes 0-13
   384  DATA shifts<>+0xe4(SB)/4, $0x09080706
   385  DATA shifts<>+0xe8(SB)/4, $0x0d0c0b0a
   386  DATA shifts<>+0xec(SB)/4, $0xffff0f0e
   387  
   388  DATA shifts<>+0xf0(SB)/4, $0x04030201	// entry 15: src bytes 1-15 -> dst bytes 0-14
   389  DATA shifts<>+0xf4(SB)/4, $0x08070605
   390  DATA shifts<>+0xf8(SB)/4, $0x0c0b0a09
   391  DATA shifts<>+0xfc(SB)/4, $0xff0f0e0d
   392  
   393  GLOBL shifts<>(SB),RODATA,$256	// 16 entries x 16 bytes, read-only, file-local symbol