github.com/matrixorigin/matrixone@v0.7.0/pkg/container/hashtable/hash_arm64.s (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include "textflag.h"
    16  
    17  // func crc32Int64BatchHash(data *uint64, hashes *uint64, length int)
    18  // Requires: CRC32
    19  TEXT ·crc32Int64BatchHash(SB), NOSPLIT, $0-24
    20  	MOVD data+0(FP), R0
    21  	MOVD hashes+8(FP), R1
    22  	MOVD length+16(FP), R2
    23  
    24  loop:
    25  	SUBS $8, R2
    26  	BLT  tail
    27  
    28  	MOVD $-1, R3
    29  	MOVD $-1, R4
    30  	MOVD $-1, R5
    31  	MOVD $-1, R6
    32  	MOVD $-1, R7
    33  	MOVD $-1, R8
    34  	MOVD $-1, R9
    35  	MOVD $-1, R10
    36  
    37  	LDP.P 16(R0), (R11, R12)
    38  	LDP.P 16(R0), (R13, R14)
    39  	LDP.P 16(R0), (R15, R16)
    40  	LDP.P 16(R0), (R17, R19)
    41  
    42  	CRC32CX R11, R3
    43  	CRC32CX R12, R4
    44  	CRC32CX R13, R5
    45  	CRC32CX R14, R6
    46  	CRC32CX R15, R7
    47  	CRC32CX R16, R8
    48  	CRC32CX R17, R9
    49  	CRC32CX R19, R10
    50  
    51  	STP.P (R3, R4), 16(R1)
    52  	STP.P (R5, R6), 16(R1)
    53  	STP.P (R7, R8), 16(R1)
    54  	STP.P (R9, R10), 16(R1)
    55  
    56  	JMP loop
    57  
    58  tail:
    59  	ADDS $8, R2
    60  	BEQ  done
    61  
    62  tailLoop:
    63  	MOVD    $-1, R3
    64  	MOVD.P  8(R0), R4
    65  	CRC32CX R4, R3
    66  	MOVD.P  R3, 8(R1)
    67  
    68  	SUBS $1, R2
    69  	BNE  tailLoop
    70  
    71  done:
    72  	RET
    73  
    74  // func crc32Int64CellBatchHash(data *uint64, hashes *uint64, length int)
    75  // Requires: CRC32
    76  TEXT ·crc32Int64CellBatchHash(SB), NOSPLIT, $0-24
    77  	MOVD data+0(FP), R0
    78  	MOVD hashes+8(FP), R1
    79  	MOVD length+16(FP), R2
    80  
    81  loop:
    82  	SUBS $8, R2
    83  	BLT  tail
    84  
    85  	MOVD $-1, R3
    86  	MOVD $-1, R4
    87  	MOVD $-1, R5
    88  	MOVD $-1, R6
    89  	MOVD $-1, R7
    90  	MOVD $-1, R8
    91  	MOVD $-1, R9
    92  	MOVD $-1, R10
    93  
    94  	MOVD.P 16(R0), R11
    95  	MOVD.P 16(R0), R12
    96  	MOVD.P 16(R0), R13
    97  	MOVD.P 16(R0), R14
    98  	MOVD.P 16(R0), R15
    99  	MOVD.P 16(R0), R16
   100  	MOVD.P 16(R0), R17
   101  	MOVD.P 16(R0), R19
   102  
   103  	CRC32CX R11, R3
   104  	CRC32CX R12, R4
   105  	CRC32CX R13, R5
   106  	CRC32CX R14, R6
   107  	CRC32CX R15, R7
   108  	CRC32CX R16, R8
   109  	CRC32CX R17, R9
   110  	CRC32CX R19, R10
   111  
   112  	STP.P (R3, R4), 16(R1)
   113  	STP.P (R5, R6), 16(R1)
   114  	STP.P (R7, R8), 16(R1)
   115  	STP.P (R9, R10), 16(R1)
   116  
   117  	JMP loop
   118  
   119  tail:
   120  	ADDS $8, R2
   121  	BEQ  done
   122  
   123  tailLoop:
   124  	MOVD    $-1, R4
   125  	MOVD.P  16(R0), R3
   126  	CRC32CX R3, R4
   127  	MOVD.P  R4, 8(R1)
   128  
   129  	SUBS $1, R2
   130  	BNE  tailLoop
   131  
   132  done:
   133  	RET
   134  
   135  ////////////////////////////////////////////////////////////////
   136  ////////////////////////////////////////////////////////////////
   137  ////////////////////////////////////////////////////////////////
   138  ////////////////////////////////////////////////////////////////
   139  
// Pi<> is a file-local, read-only, pointer-free table of sixteen 64-bit
// words (0x80 bytes). The words appear to be the leading hexadecimal digits
// of pi (3.243f6a88...), split into 8-byte pieces.
// NOTE(review): no routine in this part of the file references Pi<>; only
// CryptedPi<> is loaded. Confirm whether it is used elsewhere or is kept
// purely as the documented origin of CryptedPi<>.
DATA Pi<>+0x00(SB)/8, $0x3243f6a8885a308d
DATA Pi<>+0x08(SB)/8, $0x313198a2e0370734
DATA Pi<>+0x10(SB)/8, $0x4a4093822299f31d
DATA Pi<>+0x18(SB)/8, $0x0082efa98ec4e6c8
DATA Pi<>+0x20(SB)/8, $0x9452821e638d0137
DATA Pi<>+0x28(SB)/8, $0x7be5466cf34e90c6
DATA Pi<>+0x30(SB)/8, $0xcc0ac29b7c97c50d
DATA Pi<>+0x38(SB)/8, $0xd3f84d5b5b547091
DATA Pi<>+0x40(SB)/8, $0x79216d5d98979fb1
DATA Pi<>+0x48(SB)/8, $0xbd1310ba698dfb5a
DATA Pi<>+0x50(SB)/8, $0xc2ffd72dbd01adfb
DATA Pi<>+0x58(SB)/8, $0x7b8e1afed6a267e9
DATA Pi<>+0x60(SB)/8, $0x6ba7c9045f12c7f9
DATA Pi<>+0x68(SB)/8, $0x924a19947b3916cf
DATA Pi<>+0x70(SB)/8, $0x70801f2e2858efc1
DATA Pi<>+0x78(SB)/8, $0x6636920d871574e6
GLOBL Pi<>(SB), (NOPTR+RODATA), $0x80
   157  
// CryptedPi<> is a file-local, read-only, pointer-free table of sixteen
// 64-bit words (0x80 bytes). The AES hash-state routines in this file load
// it as eight 128-bit vectors (V0-V7) and use them as their initial state
// and mixing constants.
// NOTE(review): judging by the name, these are presumably the Pi<> words
// after an encryption pass; the derivation is not shown here - confirm.
DATA CryptedPi<>+0x00(SB)/8, $0x822233b93c11087c
DATA CryptedPi<>+0x08(SB)/8, $0xd2b32f4adde873da
DATA CryptedPi<>+0x10(SB)/8, $0xae9c2fc7dd17bcdb
DATA CryptedPi<>+0x18(SB)/8, $0x859110441a1569fc
DATA CryptedPi<>+0x20(SB)/8, $0x47087d794fffb5c9
DATA CryptedPi<>+0x28(SB)/8, $0xb7b6c8f565414445
DATA CryptedPi<>+0x30(SB)/8, $0xfd260edabb308f8d
DATA CryptedPi<>+0x38(SB)/8, $0x3ddefc67bc565a13
DATA CryptedPi<>+0x40(SB)/8, $0xe4c1d50223544f10
DATA CryptedPi<>+0x48(SB)/8, $0xaf40e05725c3192b
DATA CryptedPi<>+0x50(SB)/8, $0x281d8ab9a16382e9
DATA CryptedPi<>+0x58(SB)/8, $0xddc10c903b63a6cf
DATA CryptedPi<>+0x60(SB)/8, $0x852d3ad603e8df72
DATA CryptedPi<>+0x68(SB)/8, $0xa6642b57d1011deb
DATA CryptedPi<>+0x70(SB)/8, $0x5063d25a1cb7b6b9
DATA CryptedPi<>+0x78(SB)/8, $0xb2623e6241e8e46e
GLOBL CryptedPi<>(SB), (NOPTR+RODATA), $0x80
   175  
   176  // func aesBytesBatchGenHashStates(data *[]byte, states *[3]uint64, length int)
   177  // Requires: AES
   178  TEXT ·aesBytesBatchGenHashStates(SB), NOSPLIT, $0-24
   179  	MOVD data+0(FP), R0
   180  	MOVD states+8(FP), R1
   181  	MOVD length+16(FP), R2
   182  
   183  	MOVD $CryptedPi<>(SB), R3
   184  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   185  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   186  	VEOR V31.B16, V31.B16, V31.B16
   187  
   188  loop:
   189  	LDP.P 24(R0), (R4, R5)
   190  	MOVD  R5, R6
   191  
   192  	ADD R4, R5
   193  	SUB $0x40, R5
   194  
   195  	VMOV V0.B16, V8.B16
   196  	VMOV V1.B16, V9.B16
   197  	VMOV V2.B16, V10.B16
   198  	VMOV V3.B16, V11.B16
   199  	VMOV V4.B16, V12.B16
   200  	VMOV V5.B16, V13.B16
   201  	VMOV V6.B16, V14.B16
   202  	VMOV V7.B16, V15.B16
   203  
   204  innerLoop:
   205  	CMP R4, R5
   206  	BLE tail
   207  
   208  	VLD1.P 0x40(R4), [V16.B16, V17.B16, V18.B16, V19.B16]
   209  
   210  	AESE  V31.B16, V8.B16
   211  	AESMC V8.B16, V8.B16
   212  	VEOR  V16.B16, V8.B16, V8.B16
   213  
   214  	AESE  V31.B16, V12.B16
   215  	AESMC V12.B16, V12.B16
   216  	VEOR  V16.B16, V12.B16, V12.B16
   217  
   218  	AESE  V31.B16, V9.B16
   219  	AESMC V9.B16, V9.B16
   220  	VEOR  V17.B16, V9.B16, V9.B16
   221  
   222  	AESE  V31.B16, V13.B16
   223  	AESMC V13.B16, V13.B16
   224  	VEOR  V17.B16, V13.B16, V13.B16
   225  
   226  	AESE  V31.B16, V10.B16
   227  	AESMC V10.B16, V10.B16
   228  	VEOR  V18.B16, V10.B16, V10.B16
   229  
   230  	AESE  V31.B16, V14.B16
   231  	AESMC V14.B16, V14.B16
   232  	VEOR  V18.B16, V14.B16, V14.B16
   233  
   234  	AESE  V31.B16, V11.B16
   235  	AESMC V11.B16, V11.B16
   236  	VEOR  V19.B16, V11.B16, V11.B16
   237  
   238  	AESE  V31.B16, V15.B16
   239  	AESMC V15.B16, V15.B16
   240  	VEOR  V19.B16, V15.B16, V15.B16
   241  
   242  	JMP innerLoop
   243  
   244  tail:
   245  	ADD $0x30, R5
   246  	CMP R4, R5
   247  	BLE done
   248  
   249  	VLD1.P 0x10(R4), [V16.B16]
   250  
   251  	AESE  V31.B16, V8.B16
   252  	AESMC V8.B16, V8.B16
   253  	VEOR  V16.B16, V8.B16, V8.B16
   254  
   255  	AESE  V31.B16, V12.B16
   256  	AESMC V12.B16, V12.B16
   257  	VEOR  V16.B16, V12.B16, V12.B16
   258  
   259  	CMP R4, R5
   260  	BLE done
   261  
   262  	VLD1.P 0x10(R4), [V17.B16]
   263  
   264  	AESE  V31.B16, V9.B16
   265  	AESMC V9.B16, V9.B16
   266  	VEOR  V17.B16, V9.B16, V9.B16
   267  
   268  	AESE  V31.B16, V13.B16
   269  	AESMC V13.B16, V13.B16
   270  	VEOR  V17.B16, V13.B16, V13.B16
   271  
   272  	CMP R4, R5
   273  	BLE done
   274  
   275  	VLD1 (R4), [V18.B16]
   276  
   277  	AESE  V31.B16, V10.B16
   278  	AESMC V10.B16, V10.B16
   279  	VEOR  V18.B16, V10.B16, V10.B16
   280  
   281  	AESE  V31.B16, V14.B16
   282  	AESMC V14.B16, V14.B16
   283  	VEOR  V18.B16, V14.B16, V14.B16
   284  
   285  done:
   286  	VLD1  (R5), [V19.B16]
   287  
   288  	AESE  V31.B16, V11.B16
   289  	AESMC V11.B16, V11.B16
   290  	VEOR  V19.B16, V11.B16, V11.B16
   291  
   292  	AESE  V31.B16, V15.B16
   293  	AESMC V15.B16, V15.B16
   294  	VEOR  V19.B16, V15.B16, V15.B16
   295  
   296  	AESE  V31.B16, V8.B16
   297  	AESMC V8.B16, V8.B16
   298  	VEOR  V9.B16, V8.B16, V8.B16
   299  
   300  	AESE  V31.B16, V11.B16
   301  	AESMC V11.B16, V11.B16
   302  
   303  	AESE  V10.B16, V11.B16
   304  	AESMC V11.B16, V11.B16
   305  	VEOR  V8.B16, V11.B16, V9.B16
   306  
   307  	AESE  V8.B16, V11.B16
   308  	AESMC V11.B16, V11.B16
   309  	VEOR  V9.B16, V11.B16, V10.B16
   310  
   311  	AESE  V9.B16, V11.B16
   312  	AESMC V11.B16, V11.B16
   313  	VEOR  V10.B16, V11.B16, V8.B16
   314  
   315  	AESE  V10.B16, V11.B16
   316  	AESMC V11.B16, V11.B16
   317  	VEOR  V8.B16, V11.B16, V11.B16
   318  
   319  	AESE  V31.B16, V12.B16
   320  	AESMC V12.B16, V12.B16
   321  
   322  	AESE  V31.B16, V13.B16
   323  	AESMC V13.B16, V13.B16
   324  	VEOR  V14.B16, V13.B16, V13.B16
   325  
   326  	AESE  V15.B16, V12.B16
   327  	AESMC V12.B16, V12.B16
   328  	VEOR  V13.B16, V12.B16, V12.B16
   329  
   330  	VMOV V11.D[0], R7
   331  	VMOV V11.D[1], R8
   332  	EOR  R8, R7
   333  	EOR  R6, R7
   334  
   335  	MOVD.P R7, 8(R1)
   336  	VST1.P [V12.B16], 16(R1)
   337  
   338  	SUBS $1, R2
   339  	BNE  loop
   340  
   341  	RET
   342  
   343  // func aesInt192BatchGenHashStates(data *[3]uint64, states *[3]uint64, length int)
   344  // Requires: AES
   345  TEXT ·aesInt192BatchGenHashStates(SB), NOSPLIT, $0-24
   346  	MOVD data+0(FP), R0
   347  	MOVD states+8(FP), R1
   348  	MOVD length+16(FP), R2
   349  
   350  	MOVD $CryptedPi<>(SB), R3
   351  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   352  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   353  	VEOR V31.B16, V31.B16, V31.B16
   354  
   355  	VMOV V0.B16, V30.B16
   356  
   357  	AESE  V31.B16, V0.B16
   358  	AESMC V0.B16, V0.B16
   359  
   360  	AESE  V31.B16, V1.B16
   361  	AESMC V1.B16, V1.B16
   362  
   363  	AESE  V31.B16, V3.B16
   364  	AESMC V3.B16, V3.B16
   365  	VEOR  V2.B16, V3.B16, V3.B16
   366  
   367  	AESE  V31.B16, V4.B16
   368  	AESMC V4.B16, V4.B16
   369  
   370  	AESE  V31.B16, V5.B16
   371  	AESMC V5.B16, V5.B16
   372  
   373  	AESE  V31.B16, V6.B16
   374  	AESMC V6.B16, V6.B16
   375  	VEOR  V7.B16, V6.B16, V6.B16
   376  
   377  loop:
   378  	VLD1   (R0), [V8.B16]
   379  	ADD    $0x08, R0
   380  	VLD1.P 0x10(R0), [V9.B16]
   381  
   382  	VEOR V0.B16, V8.B16, V10.B16
   383  	VEOR V5.B16, V9.B16, V11.B16
   384  
   385  	AESE  V1.B16, V9.B16
   386  	AESMC V9.B16, V9.B16
   387  
   388  	AESE  V10.B16, V9.B16
   389  	AESMC V9.B16, V9.B16
   390  	VEOR  V3.B16, V9.B16, V10.B16
   391  
   392  	AESE  V3.B16, V9.B16
   393  	AESMC V9.B16, V9.B16
   394  	VEOR  V10.B16, V9.B16, V12.B16
   395  
   396  	AESE  V10.B16, V9.B16
   397  	AESMC V9.B16, V9.B16
   398  	VEOR  V12.B16, V9.B16, V9.B16
   399  
   400  	VMOV  V9.D[0], R4
   401  	VMOV  V9.D[1], R5
   402  	EOR   R5, R4
   403  
   404  	AESE  V4.B16, V8.B16
   405  	AESMC V8.B16, V8.B16
   406  
   407  	AESE  V11.B16, V8.B16
   408  	AESMC V8.B16, V8.B16
   409  	VEOR  V6.B16, V8.B16, V8.B16
   410  
   411  	MOVD.P R4, 0x08(R1)
   412  	VST1.P [V8.B16], 0x10(R1)
   413  
   414  	SUBS $1, R2
   415  	BNE  loop
   416  
   417  done:
   418  	RET
   419  
   420  // func aesInt256BatchGenHashStates(data *[4]uint64, states *[3]uint64, length int)
   421  // Requires: AES
   422  TEXT ·aesInt256BatchGenHashStates(SB), NOSPLIT, $0-24
   423  	MOVD data+0(FP), R0
   424  	MOVD states+8(FP), R1
   425  	MOVD length+16(FP), R2
   426  
   427  	MOVD $CryptedPi<>(SB), R3
   428  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   429  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   430  	VEOR V31.B16, V31.B16, V31.B16
   431  
   432  	VMOV V0.B16, V30.B16
   433  
   434  	AESE  V31.B16, V0.B16
   435  	AESMC V0.B16, V0.B16
   436  
   437  	AESE  V31.B16, V1.B16
   438  	AESMC V1.B16, V1.B16
   439  
   440  	AESE  V31.B16, V3.B16
   441  	AESMC V3.B16, V3.B16
   442  	VEOR  V2.B16, V3.B16, V3.B16
   443  
   444  	AESE  V31.B16, V4.B16
   445  	AESMC V4.B16, V4.B16
   446  
   447  	AESE  V31.B16, V5.B16
   448  	AESMC V5.B16, V5.B16
   449  
   450  	AESE  V31.B16, V6.B16
   451  	AESMC V6.B16, V6.B16
   452  	VEOR  V7.B16, V6.B16, V6.B16
   453  
   454  loop:
   455  	VLD1.P 0x20(R0), [V8.B16, V9.B16]
   456  
   457  	VEOR V0.B16, V8.B16, V10.B16
   458  	VEOR V5.B16, V9.B16, V11.B16
   459  
   460  	AESE  V1.B16, V9.B16
   461  	AESMC V9.B16, V9.B16
   462  
   463  	AESE  V10.B16, V9.B16
   464  	AESMC V9.B16, V9.B16
   465  	VEOR  V3.B16, V9.B16, V10.B16
   466  
   467  	AESE  V3.B16, V9.B16
   468  	AESMC V9.B16, V9.B16
   469  	VEOR  V10.B16, V9.B16, V12.B16
   470  
   471  	AESE  V10.B16, V9.B16
   472  	AESMC V9.B16, V9.B16
   473  	VEOR  V12.B16, V9.B16, V9.B16
   474  
   475  	VMOV  V9.D[0], R4
   476  	VMOV  V9.D[1], R5
   477  	EOR   R5, R4
   478  
   479  	AESE  V4.B16, V8.B16
   480  	AESMC V8.B16, V8.B16
   481  
   482  	AESE  V11.B16, V8.B16
   483  	AESMC V8.B16, V8.B16
   484  	VEOR  V6.B16, V8.B16, V8.B16
   485  
   486  	MOVD.P R4, 0x08(R1)
   487  	VST1.P [V8.B16], 0x10(R1)
   488  
   489  	SUBS $1, R2
   490  	BNE  loop
   491  
   492  done:
   493  	RET
   494  
   495  // func aesInt320BatchGenHashStates(data *[5]uint64, states *[3]uint64, length int)
   496  // Requires: AES
   497  TEXT ·aesInt320BatchGenHashStates(SB), NOSPLIT, $0-24
   498  	MOVD data+0(FP), R0
   499  	MOVD states+8(FP), R1
   500  	MOVD length+16(FP), R2
   501  
   502  	MOVD $CryptedPi<>(SB), R3
   503  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   504  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   505  	VEOR V31.B16, V31.B16, V31.B16
   506  
   507  	AESE  V31.B16, V0.B16
   508  	AESMC V0.B16, V0.B16
   509  
   510  	AESE  V31.B16, V1.B16
   511  	AESMC V1.B16, V1.B16
   512  
   513  	AESE  V31.B16, V3.B16
   514  	AESMC V3.B16, V3.B16
   515  
   516  	AESE  V31.B16, V4.B16
   517  	AESMC V4.B16, V4.B16
   518  
   519  	AESE  V31.B16, V5.B16
   520  	AESMC V5.B16, V5.B16
   521  
   522  	AESE  V31.B16, V6.B16
   523  	AESMC V6.B16, V6.B16
   524  
   525  loop:
   526  	VLD1 (R0), [V8.B16, V9.B16]
   527  	ADD  $0x18, R0
   528  	VLD1.P 0x10(R0), [V10.B16]
   529  
   530  	VEOR V4.B16, V8.B16, V11.B16
   531  	VEOR V5.B16, V9.B16, V12.B16
   532  
   533  	VEOR V3.B16, V10.B16, V13.B16
   534  
   535  	AESE  V0.B16, V8.B16
   536  	AESMC V8.B16, V8.B16
   537  
   538  	AESE  V1.B16, V9.B16
   539  	AESMC V9.B16, V9.B16
   540  	VEOR  V2.B16, V9.B16, V9.B16
   541  
   542  	AESE  V13.B16, V8.B16
   543  	AESMC V8.B16, V8.B16
   544  	VEOR  V9.B16, V8.B16, V13.B16
   545  
   546  	AESE  V9.B16, V8.B16
   547  	AESMC V8.B16, V8.B16
   548  	VEOR  V13.B16, V8.B16, V9.B16
   549  
   550  	AESE  V13.B16, V8.B16
   551  	AESMC V8.B16, V8.B16
   552  	VEOR  V9.B16, V8.B16, V8.B16
   553  
   554  	VMOV  V8.D[0], R4
   555  	VMOV  V8.D[1], R5
   556  	EOR   R5, R4
   557  
   558  	AESE  V31.B16, V11.B16
   559  	AESMC V11.B16, V11.B16
   560  
   561  	AESE  V6.B16, V10.B16
   562  	AESMC V10.B16, V10.B16
   563  	VEOR  V7.B16, V10.B16, V10.B16
   564  
   565  	AESE  V12.B16, V11.B16
   566  	AESMC V11.B16, V11.B16
   567  	VEOR  V10.B16, V11.B16, V11.B16
   568  
   569  	MOVD.P R4, 0x08(R1)
   570  	VST1.P [V11.B16], 0x10(R1)
   571  
   572  	SUBS $1, R2
   573  	BNE  loop
   574  
   575  done:
   576  	RET