github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_arm64.s (about)

     1  // Copyright 2024 Sun Yimin. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  #include "sm3_const_asm.s"
     9  
    10  #define a V0 // a..h: the eight working variables, one 32-bit lane per message
    11  #define b V2
    12  #define c V4
    13  #define d V6
    14  #define e V1 // a/e, b/f, c/g, d/h sit in adjacent registers: blockMultBy4 moves each pair with one two-register VLD1/VST1
    15  #define f V3
    16  #define g V5
    17  #define h V7
    18  
    19  #define tmp1 V8 // tmp1..tmp4: scratch for TRANSPOSE_MATRIX; tmp1 also receives the broadcast round constant
    20  #define tmp2 V9
    21  #define tmp3 V10 // V10 and V11 are additionally used under their raw names by the round macros
    22  #define tmp4 V11
    23  
    24  #define aSave V24 // aSave..hSave: state at the start of the current block, XORed back in after the last round
    25  #define bSave V25
    26  #define cSave V26
    27  #define dSave V27
    28  #define eSave V28
    29  #define fSave V29
    30  #define gSave V30
    31  #define hSave V31
    32  
    33  // TRANSPOSE_MATRIX transposes the 4x4 matrix of 32-bit lanes held in
    34  // row0..row3; mix0..mix3 are scratch registers and are overwritten.
    35  // Lanes are listed from high to low. Input:
    36  //   rowN = rowN.S3, rowN.S2, rowN.S1, rowN.S0   (N = 0..3)
    37  // Output:
    38  //   row0 = row3.S0, row2.S0, row1.S0, row0.S0
    39  //   row1 = row3.S1, row2.S1, row1.S1, row0.S1
    40  //   row2 = row3.S2, row2.S2, row1.S2, row0.S2
    41  //   row3 = row3.S3, row2.S3, row1.S3, row0.S3
    42  // First the 32-bit lanes of each row pair are interleaved, then the 64-bit halves.
    43  #define TRANSPOSE_MATRIX(row0, row1, row2, row3, mix0, mix1, mix2, mix3) \
    44  	VZIP1 row1.S4, row0.S4, mix0.S4 \
    45  	VZIP2 row1.S4, row0.S4, mix2.S4 \
    46  	VZIP1 row3.S4, row2.S4, mix1.S4 \
    47  	VZIP2 row3.S4, row2.S4, mix3.S4 \
    48  	VZIP1 mix1.D2, mix0.D2, row0.D2 \
    49  	VZIP2 mix1.D2, mix0.D2, row1.D2 \
    50  	VZIP1 mix3.D2, mix2.D2, row2.D2 \
    51  	VZIP2 mix3.D2, mix2.D2, row3.D2 \
    52  
    53  // PROLD: dst = src <<< rot, a left rotation of every 32-bit lane. dst is written before src is read again, so they must be different registers.
    54  #define PROLD(src, dst, rot) \
    55  	VSHL $(rot), src.S4, dst.S4 \
    56  	VSRI $(32-rot), src.S4, dst.S4 \
    57  
    58  #define loadWordByIndex(dst, idx) \ // dst = the 16 bytes at wordStart + 16*idx; clobbers R20; wordStart must be #defined where the macro is expanded
    59  	ADD $(16*(idx)), wordStart, R20 \
    60  	VLD1 (R20), [dst.S4] \
    61  
    62  #define prepare4Words \ // reads 16 bytes from each of srcPtr1..srcPtr4 and appends 64 bytes at wordPtr; clobbers V12-V15 and tmp1-tmp4
    63  	VLD1.P 16(srcPtr1), [V12.B16] \ // one 16-byte chunk per message, each source pointer advanced by 16
    64  	VLD1.P 16(srcPtr2), [V13.B16] \
    65  	VLD1.P 16(srcPtr3), [V14.B16] \
    66  	VLD1.P 16(srcPtr4), [V15.B16] \
    67  	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4) \ // afterwards each register holds the same word position of all four messages
    68  	VREV32 V12.B16, V12.B16 \ // reverse the byte order inside every 32-bit word
    69  	VREV32 V13.B16, V13.B16 \
    70  	VREV32 V14.B16, V14.B16 \
    71  	VREV32 V15.B16, V15.B16 \
    72  	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr) // four 16-byte slots written, wordPtr advanced by 64
    73  
    74  #define LOAD_T(value, dst) \ // broadcasts the 32-bit constant value into all four lanes of dst; clobbers R20
    75  	MOVW $value, R20 \
    76  	VDUP R20, dst.S4 \
    77  
    78  #define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \ // one round using the XOR forms of the boolean functions; Wt and Wt+4 are read from the buffer; updates b, d, f, h in place; clobbers tmp1, V10-V14, R20
    79  	PROLD(a, V12, 12)                \ // V12 = a <<< 12
    80  	VMOV V12.B16, V13.B16            \ // V13 = a <<< 12, kept for SS2
    81  	LOAD_T(const, tmp1)              \ // tmp1 = round constant in every lane
    82  	VADD tmp1.S4, V12.S4, V12.S4     \ // V12 = (a <<< 12) + T
    83  	VADD e.S4, V12.S4, V12.S4        \ // V12 = (a <<< 12) + T + e
    84  	PROLD(V12, V14, 7)               \ // V14 = SS1
    85  	VEOR V14.B16, V13.B16, V12.B16   \ // V12 = SS2 = SS1 XOR (a <<< 12)
    86  	VEOR a.B16, b.B16, V13.B16       \ // V13 = a XOR b
    87  	VEOR c.B16, V13.B16, V13.B16     \ // V13 = a XOR b XOR c
    88  	VADD V13.S4, d.S4, V13.S4        \ // V13 = (a XOR b XOR c) + d
    89  	loadWordByIndex(V10, index)      \ // V10 = Wt
    90  	loadWordByIndex(V11, index+4)    \ // V11 = Wt+4
    91  	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = Wt XOR Wt+4
    92  	VADD V11.S4, V13.S4, V13.S4      \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
    93  	VADD V13.S4, V12.S4, V13.S4      \ // V13 = TT1 (previous sum + SS2)
    94  	VADD h.S4, V10.S4, V10.S4        \ // V10 = Wt + h
    95  	VADD V14.S4, V10.S4, V10.S4      \ // V10 = Wt + h + SS1
    96  	VEOR e.B16, f.B16, V11.B16       \ // V11 = e XOR f
    97  	VEOR g.B16, V11.B16, V11.B16     \ // V11 = e XOR f XOR g
    98  	VADD V11.S4, V10.S4, V10.S4      \ // V10 = TT2 = (e XOR f XOR g) + Wt + h + SS1
    99  	VMOV b.B16, V11.B16              \ // copy b first: PROLD needs distinct source and destination
   100  	PROLD(V11, b, 9)                 \ // b = b <<< 9
   101  	VMOV V13.B16, h.B16              \ // h = TT1; the callers rotate the argument list so this register is passed as a in the next round
   102  	VMOV f.B16, V11.B16              \ // copy f for the same reason as b above
   103  	PROLD(V11, f, 19)                \ // f = f <<< 19
   104  	PROLD(V10, V11, 9)               \ // V11 = TT2 <<< 9
   105  	PROLD(V11, V12, 8)               \ // V12 = TT2 <<< 17 (9 + 8)
   106  	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = TT2 XOR (TT2 <<< 9)
   107  	VEOR V11.B16, V12.B16, d.B16     \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
   108  
   109  #define MESSAGE_SCHEDULE(index) \ // derives Wj (j = index+4) from five earlier buffer words, stores it at wordPtr and leaves it in V11; clobbers V10-V12, R20
   110  	loadWordByIndex(V10, index+1)    \ // V10 = Wj-3
   111  	PROLD(V10, V11, 15)              \ // V11 = Wj-3 <<< 15
   112  	loadWordByIndex(V10, index-12)   \ // V10 = Wj-16
   113  	VEOR V10.B16, V11.B16, V10.B16   \ // V10 = Wj-16 XOR (Wj-3 <<< 15)
   114  	loadWordByIndex(V11, index-5)    \ // V11 = Wj-9
   115  	VEOR V10.B16, V11.B16, V10.B16   \ // V10 = X = Wj-16 XOR Wj-9 XOR (Wj-3 <<< 15)
   116  	PROLD(V10, V11, 15)              \ // V11 = X <<< 15
   117  	PROLD(V11, V12, 8)               \ // V12 = X <<< 23 (15 + 8)
   118  	VEOR V11.B16, V10.B16, V10.B16   \ // V10 = X XOR (X <<< 15)
   119  	VEOR V12.B16, V10.B16, V10.B16   \ // V10 = P1(X) = X XOR (X <<< 15) XOR (X <<< 23)
   120  	loadWordByIndex(V11, index-9)    \ // V11 = Wj-13
   121  	PROLD(V11, V12, 7)               \ // V12 = Wj-13 <<< 7
   122  	VEOR V12.B16, V10.B16, V10.B16   \ // V10 = P1(X) XOR (Wj-13 <<< 7)
   123  	loadWordByIndex(V11, index-2)    \ // V11 = Wj-6
   124  	VEOR V11.B16, V10.B16, V11.B16   \ // V11 = Wj
   125  	VST1.P [V11.S4], 16(wordPtr)     \ // store Wj and advance wordPtr by 16; NOTE(review): relies on wordPtr already pointing at slot index+4
   126  
   127  #define ROUND_12_15(j, tj, a, b, c, d, e, f, g, h) \ // extends the schedule by word j+4, then runs the same round body as ROUND_00_11
   128  	MESSAGE_SCHEDULE(j) \
   129  	ROUND_00_11(j, tj, a, b, c, d, e, f, g, h) \
   130  
   131  #define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \ // one round using the majority/choose forms of the boolean functions; also extends the schedule by one word; updates b, d, f, h in place; clobbers tmp1, V10-V14, R20
   132  	MESSAGE_SCHEDULE(index)          \ // V11 = Wt+4 from here on; it must stay untouched until it is XORed with Wt below
   133  	PROLD(a, V12, 12)                \ // V12 = a <<< 12
   134  	VMOV V12.B16, V13.B16            \ // V13 = a <<< 12, kept for SS2
   135  	LOAD_T(const, tmp1)              \ // tmp1 = round constant in every lane
   136  	VADD tmp1.S4, V12.S4, V12.S4     \ // V12 = (a <<< 12) + T
   137  	VADD e.S4, V12.S4, V12.S4        \ // V12 = (a <<< 12) + T + e
   138  	PROLD(V12, V14, 7)               \ // V14 = SS1
   139  	VEOR V14.B16, V13.B16, V12.B16   \ // V12 = SS2 = SS1 XOR (a <<< 12)
   140  	VORR a.B16, b.B16, V10.B16       \ // V10 = a OR b
   141  	VAND a.B16, b.B16, V13.B16       \ // V13 = a AND b
   142  	VAND c.B16, V10.B16, V10.B16     \ // V10 = (a OR b) AND c
   143  	VORR V13.B16, V10.B16, V13.B16   \ // V13 = (a AND b) OR ((a OR b) AND c), i.e. (a AND b) OR (a AND c) OR (b AND c)
   144  	VADD V13.S4, d.S4, V13.S4        \ // V13 = ((a AND b) OR (a AND c) OR (b AND c)) + d
   145  	loadWordByIndex(V10, index)      \ // V10 = Wt
   146  	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = Wt XOR Wt+4
   147  	VADD V13.S4, V11.S4, V13.S4      \ // V13 = ((a AND b) OR (a AND c) OR (b AND c)) + d + (Wt XOR Wt+4)
   148  	VADD V13.S4, V12.S4, V13.S4      \ // V13 = TT1 (previous sum + SS2)
   149  	VADD h.S4, V10.S4, V10.S4        \ // V10 = Wt + h
   150  	VADD V14.S4, V10.S4, V10.S4      \ // V10 = Wt + h + SS1
   151  	VEOR f.B16, g.B16, V11.B16       \ // V11 = f XOR g
   152  	VAND V11.B16, e.B16, V11.B16     \ // V11 = (f XOR g) AND e
   153  	VEOR g.B16, V11.B16, V11.B16     \ // V11 = ((f XOR g) AND e) XOR g: picks f where e is 1 and g where e is 0
   154  	VADD V10.S4, V11.S4, V10.S4      \ // V10 = TT2
   155  	VMOV b.B16, V11.B16              \ // copy b first: PROLD needs distinct source and destination
   156  	PROLD(V11, b, 9)                 \ // b = b <<< 9
   157  	VMOV V13.B16, h.B16              \ // h = TT1; the callers rotate the argument list so this register is passed as a in the next round
   158  	VMOV f.B16, V11.B16              \ // copy f for the same reason as b above
   159  	PROLD(V11, f, 19)                \ // f = f <<< 19
   160  	PROLD(V10, V11, 9)               \ // V11 = TT2 <<< 9
   161  	PROLD(V11, V12, 8)               \ // V12 = TT2 <<< 17 (9 + 8)
   162  	VEOR V10.B16, V11.B16, V11.B16   \ // V11 = TT2 XOR (TT2 <<< 9)
   163  	VEOR V11.B16, V12.B16, d.B16     \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
   164  
   165  // func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
        //
        // Runs the block function over `blocks` consecutive 64-byte blocks of four
        // independent messages at once, one message per 32-bit vector lane.
        //   dig    - points to four digest-state pointers; every state (8 words) is
        //            read on entry and overwritten on return
        //   p      - points to four message pointers; 64 bytes per block are read
        //            from each message
        //   buffer - scratch area for the interleaved message schedule; 68 slots of
        //            16 bytes (1088 bytes) are written per block
        //   blocks - number of blocks per message; when blocks <= 0 the function
        //            returns immediately and leaves the digests untouched
        // Clobbers R0, R1, R3-R10, R20, V0-V15 and V24-V31.
   166  TEXT ·blockMultBy4(SB), NOSPLIT, $0
   167  #define digPtr R0 // walks the four digest pointers while the state is loaded
   168  #define srcPtrPtr R1 // walks the four message pointers
   169  #define blockCount R3 // blocks still to process
   170  #define digSave R4 // copy of dig, used to write the state back at the end
   171  #define wordStart R5 // base of the message-schedule buffer
   172  #define srcPtr1 R6 // read cursors of the four messages
   173  #define srcPtr2 R7
   174  #define srcPtr3 R8
   175  #define srcPtr4 R9
   176  #define wordPtr R10 // write cursor into the message-schedule buffer
   177  	MOVD	dig+0(FP), digPtr
   178  	MOVD	p+8(FP), srcPtrPtr
   179  	MOVD	buffer+16(FP), wordStart
   180  	MOVD	blocks+24(FP), blockCount
   181  
        	// The loop below is bottom-tested (SUB then CBNZ), so a count of zero or a
        	// negative count would wrap around and run far past the caller's buffers.
        	CMP	$1, blockCount
        	BLT	done
        
   182  	// load state: each two-register VLD1 reads the 8 words of one digest,
        	// words 0..3 into the first register and words 4..7 into the second
   183  	MOVD digPtr, digSave
   184  	MOVD.P 8(digPtr), R20
   185  	VLD1 (R20), [a.S4, e.S4]
   186  	MOVD.P 8(digPtr), R20
   187  	VLD1 (R20), [b.S4, f.S4]
   188  	MOVD.P 8(digPtr), R20
   189  	VLD1 (R20), [c.S4, g.S4]
   190  	MOVD (digPtr), R20
   191  	VLD1 (R20), [d.S4, h.S4]
   192  
   193  	// transpose state: afterwards a holds word 0 of all four digests, b word 1,
        	// ... and h word 7
   194  	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
   195  	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
   196  
   197  	MOVD.P 8(srcPtrPtr), srcPtr1
   198  	MOVD.P 8(srcPtrPtr), srcPtr2
   199  	MOVD.P 8(srcPtrPtr), srcPtr3
   200  	MOVD (srcPtrPtr), srcPtr4
   201  
   202  loop:
   203  	// save state
   204  	VMOV a.B16, aSave.B16
   205  	VMOV b.B16, bSave.B16
   206  	VMOV c.B16, cSave.B16
   207  	VMOV d.B16, dSave.B16
   208  	VMOV e.B16, eSave.B16
   209  	VMOV f.B16, fSave.B16
   210  	VMOV g.B16, gSave.B16
   211  	VMOV h.B16, hSave.B16
   212  
   213  	// reset wordPtr
   214  	MOVD wordStart, wordPtr
   215  
   216  	// load message block: 4 x 16 bytes per message fill schedule slots 0..15
   217  	prepare4Words
   218  	prepare4Words
   219  	prepare4Words
   220  	prepare4Words
   221  
        	// Each round rotates the argument list by one position instead of moving
        	// registers; after every 8 rounds the names line up again.
   222  	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
   223  	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
   224  	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
   225  	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
   226  	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
   227  	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
   228  	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
   229  	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
   230  	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
   231  	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
   232  	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
   233  	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)
   234  
        	// From round 12 on, word index+4 is not in the buffer yet and is computed first.
   235  	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
   236  	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
   237  	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
   238  	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)
   239  
   240  	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
   241  	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
   242  	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
   243  	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
   244  	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
   245  	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
   246  	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
   247  	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
   248  	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
   249  	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
   250  	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
   251  	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
   252  	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
   253  	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
   254  	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
   255  	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
   256  	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
   257  	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
   258  	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
   259  	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
   260  	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
   261  	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
   262  	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
   263  	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
   264  	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
   265  	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
   266  	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
   267  	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
   268  	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
   269  	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
   270  	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
   271  	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
        	// NOTE(review): rounds 48..63 pass T16..T31 again; presumably the constants in
        	// sm3_const_asm.s repeat with period 32 - confirm against that file.
   272  	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
   273  	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
   274  	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
   275  	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
   276  	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
   277  	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
   278  	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
   279  	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
   280  	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
   281  	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
   282  	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
   283  	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
   284  	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
   285  	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
   286  	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
   287  	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
   288  
        	// fold the state saved at the top of the loop back in (XOR)
   289  	VEOR a.B16, aSave.B16, a.B16
   290  	VEOR b.B16, bSave.B16, b.B16
   291  	VEOR c.B16, cSave.B16, c.B16
   292  	VEOR d.B16, dSave.B16, d.B16
   293  	VEOR e.B16, eSave.B16, e.B16
   294  	VEOR f.B16, fSave.B16, f.B16
   295  	VEOR g.B16, gSave.B16, g.B16
   296  	VEOR h.B16, hSave.B16, h.B16
   297  
   298  	SUB $1, blockCount
   299  	CBNZ blockCount, loop
   300  
   301  	// transpose state back: a/e again hold digest 0, b/f digest 1, and so on
   302  	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
   303  	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)
   304  
   305  	MOVD.P 8(digSave), R20
   306  	VST1 [a.S4, e.S4], (R20)
   307  	MOVD.P 8(digSave), R20
   308  	VST1 [b.S4, f.S4], (R20)
   309  	MOVD.P 8(digSave), R20
   310  	VST1 [c.S4, g.S4], (R20)
   311  	MOVD (digSave), R20
   312  	VST1 [d.S4, h.S4], (R20)
   313  
        done:
   314  	RET
   315  
   316  #undef a // drop the interleaved a..h mapping used by blockMultBy4
   317  #undef b
   318  #undef c
   319  #undef d
   320  #undef e
   321  #undef f
   322  #undef g
   323  #undef h
   324  #undef digPtr // copyResultsBy4 defines its own digPtr
   325  
   326  #define a V0 // a..h now name V0..V7 in order: copyResultsBy4 moves a..d and e..h with four-register VLD1/VST1
   327  #define b V1
   328  #define c V2
   329  #define d V3
   330  #define e V4
   331  #define f V5
   332  #define g V6
   333  #define h V7
   334  // func copyResultsBy4(dig *uint32, dst *byte)
        //
        // Copies 128 bytes (32 words) from dig to dst, reversing the byte order of
        // every 32-bit word on the way. All 128 bytes are read before the first
        // byte is written. Clobbers R0, R1 and V0-V7.
   335  TEXT ·copyResultsBy4(SB), NOSPLIT, $0
   336  #define digPtr R0 // source cursor
   337  #define dstPtr R1 // destination cursor
   338  	MOVD	dst+8(FP), dstPtr
   339  	MOVD	dig+0(FP), digPtr
   340  
   341  	// fetch all 32 words: a..d = words 0..15, e..h = words 16..31
   342  	VLD1.P 64(digPtr), [a.S4, b.S4, c.S4, d.S4]
   343  	VLD1 (digPtr), [e.S4, f.S4, g.S4, h.S4]
   344  
        	// byte-swap every word; the eight registers are independent of each other
   345  	VREV32 h.B16, h.B16
   346  	VREV32 g.B16, g.B16
   347  	VREV32 f.B16, f.B16
   348  	VREV32 e.B16, e.B16
   349  	VREV32 d.B16, d.B16
   350  	VREV32 c.B16, c.B16
   351  	VREV32 b.B16, b.B16
   352  	VREV32 a.B16, a.B16
   353  
        	// write the result in the original word order
   354  	VST1.P [a.B16, b.B16, c.B16, d.B16], 64(dstPtr)
   355  	VST1 [e.B16, f.B16, g.B16, h.B16], (dstPtr)
   356  
   357  	RET