github.com/matrixorigin/matrixone@v1.2.0/pkg/container/hashtable/hash_arm64.s (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include "textflag.h"
    16  
    17  // func crc32Int64BatchHash(data *uint64, hashes *uint64, length int)
    18  // Requires: CRC32
    19  TEXT ·crc32Int64BatchHash(SB), NOSPLIT, $0-24
    20  	MOVD data+0(FP), R0
    21  	MOVD hashes+8(FP), R1
    22  	MOVD length+16(FP), R2
    23  
    24  loop:
    25  	SUBS $8, R2
    26  	BLT  tail
    27  
    28  	VLD1 (R0), [V0.B16, V1.B16, V2.B16, V3.B16]
    29  	VST1 [V0.B16, V1.B16, V2.B16, V3.B16], (R1)
    30  
    31  	MOVD $-1, R3
    32  	MOVD $-1, R4
    33  	MOVD $-1, R5
    34  	MOVD $-1, R6
    35  	MOVD $-1, R7
    36  	MOVD $-1, R8
    37  	MOVD $-1, R9
    38  	MOVD $-1, R10
    39  
    40  	LDP.P 16(R0), (R11, R12)
    41  	LDP.P 16(R0), (R13, R14)
    42  	LDP.P 16(R0), (R15, R16)
    43  	LDP.P 16(R0), (R17, R19)
    44  
    45  	CRC32CX R11, R3
    46  	CRC32CX R12, R4
    47  	CRC32CX R13, R5
    48  	CRC32CX R14, R6
    49  	CRC32CX R15, R7
    50  	CRC32CX R16, R8
    51  	CRC32CX R17, R9
    52  	CRC32CX R19, R10
    53  
    54  	MOVW.P R3, 8(R1)
    55  	MOVW.P R4, 8(R1)
    56  	MOVW.P R5, 8(R1)
    57  	MOVW.P R6, 8(R1)
    58  	MOVW.P R7, 8(R1)
    59  	MOVW.P R8, 8(R1)
    60  	MOVW.P R9, 8(R1)
    61  	MOVW.P R10, 8(R1)
    62  
    63  	JMP loop
    64  
    65  tail:
    66  	ADDS $8, R2
    67  	BEQ  done
    68  
    69  tailLoop:
    70  	MOVD    $-1, R3
    71  	MOVD    (R0), R5
    72  	MOVD.P  8(R0), R4
    73  	CRC32CX R4, R3
    74  	MOVD    R5, (R1)
    75  	MOVW.P  R3, 8(R1)
    76  
    77  	SUBS $1, R2
    78  	BNE  tailLoop
    79  
    80  done:
    81  	RET
    82  
    83  ////////////////////////////////////////////////////////////////
    84  ////////////////////////////////////////////////////////////////
    85  ////////////////////////////////////////////////////////////////
    86  ////////////////////////////////////////////////////////////////
    87  
    88  DATA Pi<>+0x00(SB)/8, $0x3243f6a8885a308d
    89  DATA Pi<>+0x08(SB)/8, $0x313198a2e0370734
    90  DATA Pi<>+0x10(SB)/8, $0x4a4093822299f31d
    91  DATA Pi<>+0x18(SB)/8, $0x0082efa98ec4e6c8
    92  DATA Pi<>+0x20(SB)/8, $0x9452821e638d0137
    93  DATA Pi<>+0x28(SB)/8, $0x7be5466cf34e90c6
    94  DATA Pi<>+0x30(SB)/8, $0xcc0ac29b7c97c50d
    95  DATA Pi<>+0x38(SB)/8, $0xd3f84d5b5b547091
    96  DATA Pi<>+0x40(SB)/8, $0x79216d5d98979fb1
    97  DATA Pi<>+0x48(SB)/8, $0xbd1310ba698dfb5a
    98  DATA Pi<>+0x50(SB)/8, $0xc2ffd72dbd01adfb
    99  DATA Pi<>+0x58(SB)/8, $0x7b8e1afed6a267e9
   100  DATA Pi<>+0x60(SB)/8, $0x6ba7c9045f12c7f9
   101  DATA Pi<>+0x68(SB)/8, $0x924a19947b3916cf
   102  DATA Pi<>+0x70(SB)/8, $0x70801f2e2858efc1
   103  DATA Pi<>+0x78(SB)/8, $0x6636920d871574e6
   104  GLOBL Pi<>(SB), (NOPTR+RODATA), $0x80
   105  
   106  DATA CryptedPi<>+0x00(SB)/8, $0x822233b93c11087c
   107  DATA CryptedPi<>+0x08(SB)/8, $0xd2b32f4adde873da
   108  DATA CryptedPi<>+0x10(SB)/8, $0xae9c2fc7dd17bcdb
   109  DATA CryptedPi<>+0x18(SB)/8, $0x859110441a1569fc
   110  DATA CryptedPi<>+0x20(SB)/8, $0x47087d794fffb5c9
   111  DATA CryptedPi<>+0x28(SB)/8, $0xb7b6c8f565414445
   112  DATA CryptedPi<>+0x30(SB)/8, $0xfd260edabb308f8d
   113  DATA CryptedPi<>+0x38(SB)/8, $0x3ddefc67bc565a13
   114  DATA CryptedPi<>+0x40(SB)/8, $0xe4c1d50223544f10
   115  DATA CryptedPi<>+0x48(SB)/8, $0xaf40e05725c3192b
   116  DATA CryptedPi<>+0x50(SB)/8, $0x281d8ab9a16382e9
   117  DATA CryptedPi<>+0x58(SB)/8, $0xddc10c903b63a6cf
   118  DATA CryptedPi<>+0x60(SB)/8, $0x852d3ad603e8df72
   119  DATA CryptedPi<>+0x68(SB)/8, $0xa6642b57d1011deb
   120  DATA CryptedPi<>+0x70(SB)/8, $0x5063d25a1cb7b6b9
   121  DATA CryptedPi<>+0x78(SB)/8, $0xb2623e6241e8e46e
   122  GLOBL CryptedPi<>(SB), (NOPTR+RODATA), $0x80
   123  
   124  // func aesBytesBatchGenHashStates(data *[]byte, states *[3]uint64, length int)
   125  // Requires: AES
   126  TEXT ·aesBytesBatchGenHashStates(SB), NOSPLIT, $0-24
   127  	MOVD data+0(FP), R0
   128  	MOVD states+8(FP), R1
   129  	MOVD length+16(FP), R2
   130  
   131  	MOVD $CryptedPi<>(SB), R3
   132  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   133  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   134  	VEOR V31.B16, V31.B16, V31.B16
   135  
   136  loop:
   137  	LDP.P 24(R0), (R4, R5)
   138  	MOVD  R5, R6
   139  
   140  	ADD R4, R5
   141  	SUB $0x40, R5
   142  
   143  	VMOV V0.B16, V8.B16
   144  	VMOV V1.B16, V9.B16
   145  	VMOV V2.B16, V10.B16
   146  	VMOV V3.B16, V11.B16
   147  	VMOV V4.B16, V12.B16
   148  	VMOV V5.B16, V13.B16
   149  	VMOV V6.B16, V14.B16
   150  	VMOV V7.B16, V15.B16
   151  
   152  innerLoop:
   153  	CMP R4, R5
   154  	BLE tail
   155  
   156  	VLD1.P 0x40(R4), [V16.B16, V17.B16, V18.B16, V19.B16]
   157  
   158  	AESE  V31.B16, V8.B16
   159  	AESMC V8.B16, V8.B16
   160  	VEOR  V16.B16, V8.B16, V8.B16
   161  
   162  	AESE  V31.B16, V12.B16
   163  	AESMC V12.B16, V12.B16
   164  	VEOR  V16.B16, V12.B16, V12.B16
   165  
   166  	AESE  V31.B16, V9.B16
   167  	AESMC V9.B16, V9.B16
   168  	VEOR  V17.B16, V9.B16, V9.B16
   169  
   170  	AESE  V31.B16, V13.B16
   171  	AESMC V13.B16, V13.B16
   172  	VEOR  V17.B16, V13.B16, V13.B16
   173  
   174  	AESE  V31.B16, V10.B16
   175  	AESMC V10.B16, V10.B16
   176  	VEOR  V18.B16, V10.B16, V10.B16
   177  
   178  	AESE  V31.B16, V14.B16
   179  	AESMC V14.B16, V14.B16
   180  	VEOR  V18.B16, V14.B16, V14.B16
   181  
   182  	AESE  V31.B16, V11.B16
   183  	AESMC V11.B16, V11.B16
   184  	VEOR  V19.B16, V11.B16, V11.B16
   185  
   186  	AESE  V31.B16, V15.B16
   187  	AESMC V15.B16, V15.B16
   188  	VEOR  V19.B16, V15.B16, V15.B16
   189  
   190  	JMP innerLoop
   191  
   192  tail:
   193  	ADD $0x30, R5
   194  	CMP R4, R5
   195  	BLE done
   196  
   197  	VLD1.P 0x10(R4), [V16.B16]
   198  
   199  	AESE  V31.B16, V8.B16
   200  	AESMC V8.B16, V8.B16
   201  	VEOR  V16.B16, V8.B16, V8.B16
   202  
   203  	AESE  V31.B16, V12.B16
   204  	AESMC V12.B16, V12.B16
   205  	VEOR  V16.B16, V12.B16, V12.B16
   206  
   207  	CMP R4, R5
   208  	BLE done
   209  
   210  	VLD1.P 0x10(R4), [V17.B16]
   211  
   212  	AESE  V31.B16, V9.B16
   213  	AESMC V9.B16, V9.B16
   214  	VEOR  V17.B16, V9.B16, V9.B16
   215  
   216  	AESE  V31.B16, V13.B16
   217  	AESMC V13.B16, V13.B16
   218  	VEOR  V17.B16, V13.B16, V13.B16
   219  
   220  	CMP R4, R5
   221  	BLE done
   222  
   223  	VLD1 (R4), [V18.B16]
   224  
   225  	AESE  V31.B16, V10.B16
   226  	AESMC V10.B16, V10.B16
   227  	VEOR  V18.B16, V10.B16, V10.B16
   228  
   229  	AESE  V31.B16, V14.B16
   230  	AESMC V14.B16, V14.B16
   231  	VEOR  V18.B16, V14.B16, V14.B16
   232  
   233  done:
   234  	VLD1  (R5), [V19.B16]
   235  
   236  	AESE  V31.B16, V11.B16
   237  	AESMC V11.B16, V11.B16
   238  	VEOR  V19.B16, V11.B16, V11.B16
   239  
   240  	AESE  V31.B16, V15.B16
   241  	AESMC V15.B16, V15.B16
   242  	VEOR  V19.B16, V15.B16, V15.B16
   243  
   244  	AESE  V31.B16, V8.B16
   245  	AESMC V8.B16, V8.B16
   246  	VEOR  V9.B16, V8.B16, V8.B16
   247  
   248  	AESE  V31.B16, V11.B16
   249  	AESMC V11.B16, V11.B16
   250  
   251  	AESE  V10.B16, V11.B16
   252  	AESMC V11.B16, V11.B16
   253  	VEOR  V8.B16, V11.B16, V9.B16
   254  
   255  	AESE  V8.B16, V11.B16
   256  	AESMC V11.B16, V11.B16
   257  	VEOR  V9.B16, V11.B16, V10.B16
   258  
   259  	AESE  V9.B16, V11.B16
   260  	AESMC V11.B16, V11.B16
   261  	VEOR  V10.B16, V11.B16, V8.B16
   262  
   263  	AESE  V10.B16, V11.B16
   264  	AESMC V11.B16, V11.B16
   265  	VEOR  V8.B16, V11.B16, V11.B16
   266  
   267  	AESE  V31.B16, V12.B16
   268  	AESMC V12.B16, V12.B16
   269  
   270  	AESE  V31.B16, V13.B16
   271  	AESMC V13.B16, V13.B16
   272  	VEOR  V14.B16, V13.B16, V13.B16
   273  
   274  	AESE  V15.B16, V12.B16
   275  	AESMC V12.B16, V12.B16
   276  	VEOR  V13.B16, V12.B16, V12.B16
   277  
   278  	VMOV V11.D[0], R7
   279  	VMOV V11.D[1], R8
   280  	EOR  R8, R7
   281  	EOR  R6, R7
   282  
   283  	MOVD.P R7, 8(R1)
   284  	VST1.P [V12.B16], 16(R1)
   285  
   286  	SUBS $1, R2
   287  	BNE  loop
   288  
   289  	RET
   290  
   291  // func aesInt192BatchGenHashStates(data *[3]uint64, states *[3]uint64, length int)
   292  // Requires: AES
   293  TEXT ·aesInt192BatchGenHashStates(SB), NOSPLIT, $0-24
   294  	MOVD data+0(FP), R0
   295  	MOVD states+8(FP), R1
   296  	MOVD length+16(FP), R2
   297  
   298  	MOVD $CryptedPi<>(SB), R3
   299  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   300  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   301  	VEOR V31.B16, V31.B16, V31.B16
   302  
   303  	VMOV V0.B16, V30.B16
   304  
   305  	AESE  V31.B16, V0.B16
   306  	AESMC V0.B16, V0.B16
   307  
   308  	AESE  V31.B16, V1.B16
   309  	AESMC V1.B16, V1.B16
   310  
   311  	AESE  V31.B16, V3.B16
   312  	AESMC V3.B16, V3.B16
   313  	VEOR  V2.B16, V3.B16, V3.B16
   314  
   315  	AESE  V31.B16, V4.B16
   316  	AESMC V4.B16, V4.B16
   317  
   318  	AESE  V31.B16, V5.B16
   319  	AESMC V5.B16, V5.B16
   320  
   321  	AESE  V31.B16, V6.B16
   322  	AESMC V6.B16, V6.B16
   323  	VEOR  V7.B16, V6.B16, V6.B16
   324  
   325  loop:
   326  	VLD1   (R0), [V8.B16]
   327  	ADD    $0x08, R0
   328  	VLD1.P 0x10(R0), [V9.B16]
   329  
   330  	VEOR V0.B16, V8.B16, V10.B16
   331  	VEOR V5.B16, V9.B16, V11.B16
   332  
   333  	AESE  V1.B16, V9.B16
   334  	AESMC V9.B16, V9.B16
   335  
   336  	AESE  V10.B16, V9.B16
   337  	AESMC V9.B16, V9.B16
   338  	VEOR  V3.B16, V9.B16, V10.B16
   339  
   340  	AESE  V3.B16, V9.B16
   341  	AESMC V9.B16, V9.B16
   342  	VEOR  V10.B16, V9.B16, V12.B16
   343  
   344  	AESE  V10.B16, V9.B16
   345  	AESMC V9.B16, V9.B16
   346  	VEOR  V12.B16, V9.B16, V9.B16
   347  
   348  	VMOV  V9.D[0], R4
   349  	VMOV  V9.D[1], R5
   350  	EOR   R5, R4
   351  
   352  	AESE  V4.B16, V8.B16
   353  	AESMC V8.B16, V8.B16
   354  
   355  	AESE  V11.B16, V8.B16
   356  	AESMC V8.B16, V8.B16
   357  	VEOR  V6.B16, V8.B16, V8.B16
   358  
   359  	MOVD.P R4, 0x08(R1)
   360  	VST1.P [V8.B16], 0x10(R1)
   361  
   362  	SUBS $1, R2
   363  	BNE  loop
   364  
   365  done:
   366  	RET
   367  
   368  // func aesInt256BatchGenHashStates(data *[4]uint64, states *[3]uint64, length int)
   369  // Requires: AES
   370  TEXT ·aesInt256BatchGenHashStates(SB), NOSPLIT, $0-24
   371  	MOVD data+0(FP), R0
   372  	MOVD states+8(FP), R1
   373  	MOVD length+16(FP), R2
   374  
   375  	MOVD $CryptedPi<>(SB), R3
   376  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   377  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   378  	VEOR V31.B16, V31.B16, V31.B16
   379  
   380  	VMOV V0.B16, V30.B16
   381  
   382  	AESE  V31.B16, V0.B16
   383  	AESMC V0.B16, V0.B16
   384  
   385  	AESE  V31.B16, V1.B16
   386  	AESMC V1.B16, V1.B16
   387  
   388  	AESE  V31.B16, V3.B16
   389  	AESMC V3.B16, V3.B16
   390  	VEOR  V2.B16, V3.B16, V3.B16
   391  
   392  	AESE  V31.B16, V4.B16
   393  	AESMC V4.B16, V4.B16
   394  
   395  	AESE  V31.B16, V5.B16
   396  	AESMC V5.B16, V5.B16
   397  
   398  	AESE  V31.B16, V6.B16
   399  	AESMC V6.B16, V6.B16
   400  	VEOR  V7.B16, V6.B16, V6.B16
   401  
   402  loop:
   403  	VLD1.P 0x20(R0), [V8.B16, V9.B16]
   404  
   405  	VEOR V0.B16, V8.B16, V10.B16
   406  	VEOR V5.B16, V9.B16, V11.B16
   407  
   408  	AESE  V1.B16, V9.B16
   409  	AESMC V9.B16, V9.B16
   410  
   411  	AESE  V10.B16, V9.B16
   412  	AESMC V9.B16, V9.B16
   413  	VEOR  V3.B16, V9.B16, V10.B16
   414  
   415  	AESE  V3.B16, V9.B16
   416  	AESMC V9.B16, V9.B16
   417  	VEOR  V10.B16, V9.B16, V12.B16
   418  
   419  	AESE  V10.B16, V9.B16
   420  	AESMC V9.B16, V9.B16
   421  	VEOR  V12.B16, V9.B16, V9.B16
   422  
   423  	VMOV  V9.D[0], R4
   424  	VMOV  V9.D[1], R5
   425  	EOR   R5, R4
   426  
   427  	AESE  V4.B16, V8.B16
   428  	AESMC V8.B16, V8.B16
   429  
   430  	AESE  V11.B16, V8.B16
   431  	AESMC V8.B16, V8.B16
   432  	VEOR  V6.B16, V8.B16, V8.B16
   433  
   434  	MOVD.P R4, 0x08(R1)
   435  	VST1.P [V8.B16], 0x10(R1)
   436  
   437  	SUBS $1, R2
   438  	BNE  loop
   439  
   440  done:
   441  	RET
   442  
   443  // func aesInt320BatchGenHashStates(data *[5]uint64, states *[3]uint64, length int)
   444  // Requires: AES
   445  TEXT ·aesInt320BatchGenHashStates(SB), NOSPLIT, $0-24
   446  	MOVD data+0(FP), R0
   447  	MOVD states+8(FP), R1
   448  	MOVD length+16(FP), R2
   449  
   450  	MOVD $CryptedPi<>(SB), R3
   451  	VLD1.P 64(R3), [V0.B16, V1.B16, V2.B16, V3.B16]
   452  	VLD1 (R3), [V4.B16, V5.B16, V6.B16, V7.B16]
   453  	VEOR V31.B16, V31.B16, V31.B16
   454  
   455  	AESE  V31.B16, V0.B16
   456  	AESMC V0.B16, V0.B16
   457  
   458  	AESE  V31.B16, V1.B16
   459  	AESMC V1.B16, V1.B16
   460  
   461  	AESE  V31.B16, V3.B16
   462  	AESMC V3.B16, V3.B16
   463  
   464  	AESE  V31.B16, V4.B16
   465  	AESMC V4.B16, V4.B16
   466  
   467  	AESE  V31.B16, V5.B16
   468  	AESMC V5.B16, V5.B16
   469  
   470  	AESE  V31.B16, V6.B16
   471  	AESMC V6.B16, V6.B16
   472  
   473  loop:
   474  	VLD1 (R0), [V8.B16, V9.B16]
   475  	ADD  $0x18, R0
   476  	VLD1.P 0x10(R0), [V10.B16]
   477  
   478  	VEOR V4.B16, V8.B16, V11.B16
   479  	VEOR V5.B16, V9.B16, V12.B16
   480  
   481  	VEOR V3.B16, V10.B16, V13.B16
   482  
   483  	AESE  V0.B16, V8.B16
   484  	AESMC V8.B16, V8.B16
   485  
   486  	AESE  V1.B16, V9.B16
   487  	AESMC V9.B16, V9.B16
   488  	VEOR  V2.B16, V9.B16, V9.B16
   489  
   490  	AESE  V13.B16, V8.B16
   491  	AESMC V8.B16, V8.B16
   492  	VEOR  V9.B16, V8.B16, V13.B16
   493  
   494  	AESE  V9.B16, V8.B16
   495  	AESMC V8.B16, V8.B16
   496  	VEOR  V13.B16, V8.B16, V9.B16
   497  
   498  	AESE  V13.B16, V8.B16
   499  	AESMC V8.B16, V8.B16
   500  	VEOR  V9.B16, V8.B16, V8.B16
   501  
   502  	VMOV  V8.D[0], R4
   503  	VMOV  V8.D[1], R5
   504  	EOR   R5, R4
   505  
   506  	AESE  V31.B16, V11.B16
   507  	AESMC V11.B16, V11.B16
   508  
   509  	AESE  V6.B16, V10.B16
   510  	AESMC V10.B16, V10.B16
   511  	VEOR  V7.B16, V10.B16, V10.B16
   512  
   513  	AESE  V12.B16, V11.B16
   514  	AESMC V11.B16, V11.B16
   515  	VEOR  V10.B16, V11.B16, V11.B16
   516  
   517  	MOVD.P R4, 0x08(R1)
   518  	VST1.P [V11.B16], 0x10(R1)
   519  
   520  	SUBS $1, R2
   521  	BNE  loop
   522  
   523  done:
   524  	RET