github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_s390x.s

github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_s390x.s (about)

     1  // Copyright 2024 Sun Yimin. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  DATA mask<>+0x00(SB)/8, $0x0001020310111213 // selector 0 (loaded into M0): word 0 of the 1st source, word 0 of the 2nd, word 1 of the 1st, word 1 of the 2nd
    10  DATA mask<>+0x08(SB)/8, $0x0405060714151617 // each byte is a VPERM index: 0x00-0x0f address the first source vector, 0x10-0x1f the second
    11  DATA mask<>+0x10(SB)/8, $0x08090a0b18191a1b // selector 1 (M1): same interleave as selector 0, but for 32-bit words 2 and 3
    12  DATA mask<>+0x18(SB)/8, $0x0c0d0e0f1c1d1e1f
    13  DATA mask<>+0x20(SB)/8, $0x0001020304050607 // selector 2 (M2): bytes 0-7 of the first source followed by bytes 0-7 of the second
    14  DATA mask<>+0x28(SB)/8, $0x1011121314151617
    15  DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f // selector 3 (M3): bytes 8-15 of the first source followed by bytes 8-15 of the second
    16  DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
    17  GLOBL mask<>(SB), RODATA, $64 // 64 read-only bytes = four 16-byte selectors; blockMultBy4 loads them into M0-M3 for TRANSPOSE_MATRIX
    18  
    19  #define a V0 // a-h: the eight SM3 working variables; after TRANSPOSE_MATRIX, lane i of each register belongs to input i
    20  #define e V1 // a/e, b/f, c/g and d/h are adjacent registers so a two-register VLM/VSTM moves one 32-byte digest
    21  #define b V2
    22  #define f V3
    23  #define c V4
    24  #define g V5
    25  #define d V6
    26  #define h V7
    27  #define M0 V8 // M0-M3: VPERM selectors read from mask<>; consecutive registers so a single VLM fills all four
    28  #define M1 V9
    29  #define M2 V10
    30  #define M3 V11
    31  #define TMP0 V12 // TMP0-TMP4: scratch registers overwritten by the round and message-schedule macros
    32  #define TMP1 V13
    33  #define TMP2 V14
    34  #define TMP3 V15
    35  #define TMP4 V16 // V16 is also a load target in prepare4Words; blockMultBy4 runs that before the rounds of each block
    36  #define aSave V24 // aSave-hSave: copies of a-h taken at the top of each block and XORed back in after round 63
    37  #define bSave V25
    38  #define cSave V26
    39  #define dSave V27
    40  #define eSave V28
    41  #define fSave V29
    42  #define gSave V30
    43  #define hSave V31
    44  
    45  #define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \ // in-place 4x4 transpose of the 32-bit words in T0-T3 (T[k] = word k); expects M0-M3 to hold the mask<> selectors; overwrites TMP0-TMP3
    46  	VPERM T0, T1, M0, TMP0; \ // TMP0 = T0[0], T1[0], T0[1], T1[1]
    47  	VPERM T2, T3, M0, TMP1; \ // TMP1 = T2[0], T3[0], T2[1], T3[1]
    48  	VPERM T0, T1, M1, TMP2; \ // TMP2 = T0[2], T1[2], T0[3], T1[3]
    49  	VPERM T2, T3, M1, TMP3; \ // TMP3 = T2[2], T3[2], T2[3], T3[3]
    50  	VPERM TMP0, TMP1, M2, T0; \ // T0 = word 0 of each original row
    51  	VPERM TMP0, TMP1, M3, T1; \ // T1 = word 1 of each original row
    52  	VPERM TMP2, TMP3, M2, T2; \ // T2 = word 2 of each original row
    53  	VPERM TMP2, TMP3, M3, T3 // T3 = word 3 of each original row
    54  
    55  // r = s <<< n: rotate each 32-bit element of vector s left by n bits and write the result to r (s is left unchanged).
    56  #define PROLD(s, r, n) \
    57  	VERLLF $n, s, r
    58  
    59  #define loadWordByIndex(W, i) \ // W = the 16-byte entry number i of the buffer at statePtr (byte offset 16*i)
    60  	VL (16*(i))(statePtr), W
    61  
    62  // one word is 16 bytes: the same 32-bit message word of all four inputs, one input per lane.
    63  #define prepare4Words \ // read 16 bytes from each of the four inputs, transpose them into four words and append those at wordPtr; overwrites V16-V19 and TMP0-TMP3
    64  	VL (srcPtr1)(srcPtrPtr*1), V16; \ // srcPtrPtr is used as a byte offset added to each input base pointer
    65  	VL (srcPtr2)(srcPtrPtr*1), V17; \
    66  	VL (srcPtr3)(srcPtrPtr*1), V18; \
    67  	VL (srcPtr4)(srcPtrPtr*1), V19; \
    68  	TRANSPOSE_MATRIX(V16, V17, V18, V19, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3); \ // V16+k now holds word k of all four inputs
    69  	VSTM V16, V19, (wordPtr); \ // store the four transposed words (64 bytes)
    70  	LAY 16(srcPtrPtr), srcPtrPtr; \ // advance the shared input offset by the 16 bytes just consumed
    71  	ADD $64, wordPtr // move the buffer cursor past the four words just stored
    72  
    73  #define ROUND_00_11(index, a, b, c, d, e, f, g, h) \ // one round with the XOR forms of FF/GG (also reused by ROUND_12_15); reads words index and index+4 from the buffer; writes b, d, f, h; overwrites TMP0-TMP4
    74  	PROLD(a, TMP0, 12)               \ // TMP0 = a <<< 12
    75  	VLR TMP0, TMP1                   \ // TMP1 = a <<< 12, kept for SS2
    76  	VLREPF (index*4)(kPtr), TMP2     \ // TMP2 = 32-bit constant at kPtr + 4*index in every lane. It seems that the VREPIF instruction is not supported yet.
    77  	VAF TMP2, TMP0, TMP0             \ // TMP0 = (a <<< 12) + constant
    78  	VAF e, TMP0, TMP0                \ // TMP0 = (a <<< 12) + constant + e
    79  	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
    80  	VX TMP2, TMP1, TMP0              \ // TMP0 = SS2
    81  	VX a, b, TMP1                    \ // TMP1 = a XOR b
    82  	VX c, TMP1, TMP1                 \ // TMP1 = a XOR b XOR c
    83  	VAF TMP1, d, TMP1                \ // TMP1 = (a XOR b XOR c) + d
    84  	loadWordByIndex(TMP3, index)     \ // TMP3 = Wt
    85  	loadWordByIndex(TMP4, index+4)   \ // TMP4 = Wt+4
    86  	VX TMP3, TMP4, TMP4              \ // TMP4 = Wt XOR Wt+4
    87  	VAF TMP4, TMP1, TMP1             \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
    88  	VAF TMP1, TMP0, TMP1             \ // TMP1 = TT1
    89  	VAF h, TMP3, TMP3                \ // TMP3 = Wt + h
    90  	VAF TMP3, TMP2, TMP3             \ // Wt + h + SS1
    91  	VX e, f, TMP4                    \ // TMP4 = e XOR f
    92  	VX g, TMP4, TMP4                 \ // TMP4 = e XOR f XOR g
    93  	VAF TMP4, TMP3, TMP3             \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
    94  	VLR b, TMP4                      \ // copy b so the rotate below does not read its own destination
    95  	PROLD(TMP4, b, 9)                \ // b = b <<< 9
    96  	VLR TMP1, h                      \ // h = TT1
    97  	VLR f, TMP4                      \ // copy f for the same reason as b above
    98  	PROLD(TMP4, f, 19)               \ // f = f <<< 19
    99  	PROLD(TMP3, TMP4, 9)             \ // TMP4 = TT2 <<< 9
   100  	PROLD(TMP4, TMP0, 8)             \ // TMP0 = TT2 <<< 17
   101  	VX TMP3, TMP4, TMP4              \ // TMP4 = TT2 XOR (TT2 <<< 9)
   102  	VX TMP4, TMP0, d                 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
   103  
   104  #define MESSAGE_SCHEDULE(index) \ // computes message word Wj with j = index+4, stores it at wordPtr and leaves it in TMP1; overwrites TMP0-TMP2; blockMultBy4 keeps wordPtr at buffer slot index+4
   105  	loadWordByIndex(TMP0, index+1)    \ // Wj-3
   106  	PROLD(TMP0, TMP1, 15)             \ // TMP1 = Wj-3 <<< 15
   107  	loadWordByIndex(TMP0, index-12)   \ // Wj-16
   108  	VX TMP0, TMP1, TMP0               \ // TMP0 = Wj-16 XOR (Wj-3 <<< 15)
   109  	loadWordByIndex(TMP1, index-5)    \ // Wj-9
   110  	VX TMP0, TMP1, TMP0               \ // TMP0 = X = Wj-16 XOR Wj-9 XOR (Wj-3 <<< 15)
   111  	PROLD(TMP0, TMP1, 15)             \ // TMP1 = X <<< 15
   112  	PROLD(TMP1, TMP2, 8)              \ // TMP2 = X <<< 23
   113  	VX TMP1, TMP0, TMP0               \ // TMP0 = X XOR (X <<< 15)
   114  	VX TMP2, TMP0, TMP0               \ // P1
   115  	loadWordByIndex(TMP1, index-9)    \ // Wj-13
   116  	PROLD(TMP1, TMP2, 7)              \ // TMP2 = Wj-13 <<< 7
   117  	VX TMP2, TMP0, TMP0               \ // TMP0 = P1(X) XOR (Wj-13 <<< 7)
   118  	loadWordByIndex(TMP1, index-2)    \ // Wj-6
   119  	VX TMP1, TMP0, TMP1               \ // TMP1 = Wj
   120  	VST TMP1, (wordPtr)               \ // append Wj to the buffer
   121  	ADD $16, wordPtr                  \ // advance the buffer cursor by one 16-byte word
   122  
   123  #define ROUND_12_15(index, a, b, c, d, e, f, g, h) \ // rounds whose word index+4 is not in the buffer yet: expand it first, then run the same round body as ROUND_00_11
   124  	MESSAGE_SCHEDULE(index)                        \ // stores word index+4, which the round below reads back from the buffer
   125  	ROUND_00_11(index, a, b, c, d, e, f, g, h)     \
   126  
   127  #define ROUND_16_63(index, a, b, c, d, e, f, g, h) \ // one round with the majority / choose forms of FF/GG; expands word index+4 first; writes b, d, f, h; overwrites TMP0-TMP4
   128  	MESSAGE_SCHEDULE(index)          \ // TMP1 is Wt+4 now; do not overwrite it until it has been consumed below
   129  	PROLD(a, TMP0, 12)               \ // TMP0 = a <<< 12
   130  	VLR TMP0, TMP4                   \ // TMP4 = a <<< 12, kept for SS2
   131  	VLREPF (index*4)(kPtr), TMP2     \ // TMP2 = 32-bit constant at kPtr + 4*index in every lane. It seems that the VREPIF instruction is not supported yet.
   132  	VAF TMP2, TMP0, TMP0             \ // TMP0 = (a <<< 12) + constant
   133  	VAF e, TMP0, TMP0                \ // TMP0 = (a <<< 12) + constant + e
   134  	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
   135  	VX TMP2, TMP4, TMP0              \ // TMP0 = SS2
   136  	VO a, b, TMP3                    \ // TMP3 = a OR b
   137  	VN a, b, TMP4                    \ // TMP4 = a AND b
   138  	VN c, TMP3, TMP3                 \ // TMP3 = c AND (a OR b)
   139  	VO TMP4, TMP3, TMP4              \ // (a AND b) OR (a AND c) OR (b AND c)
   140  	VAF TMP4, d, TMP4                \ // (a AND b) OR (a AND c) OR (b AND c) + d
   141  	loadWordByIndex(TMP3, index)     \ // Wj
   142  	VX TMP3, TMP1, TMP1              \ // Wj XOR Wj+4
   143  	VAF TMP4, TMP1, TMP4             \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
   144  	VAF TMP4, TMP0, TMP4             \ // TT1
   145  	VAF h, TMP3, TMP3                \ // Wt + h
   146  	VAF TMP2, TMP3, TMP3             \ // Wt + h + SS1
   147  	VX f, g, TMP1                    \ // TMP1 = f XOR g
   148  	VN TMP1, e, TMP1                 \ // TMP1 = (f XOR g) AND e
   149  	VX g, TMP1, TMP1                 \ // (f XOR g) AND e XOR g
   150  	VAF TMP3, TMP1, TMP3             \ // TT2
   151  	VLR b, TMP1                      \ // copy b so the rotate below does not read its own destination
   152  	PROLD(TMP1, b, 9)                \ // b = b <<< 9
   153  	VLR TMP4, h                      \ // h = TT1
   154  	VLR f, TMP1                      \ // copy f for the same reason as b above
   155  	PROLD(TMP1, f, 19)               \ // f = f <<< 19
   156  	PROLD(TMP3, TMP1, 9)             \ // TMP1 = TT2 <<< 9
   157  	PROLD(TMP1, TMP0, 8)             \ // TMP0 = TT2 <<< 17
   158  	VX TMP3, TMP1, TMP1              \ // TMP1 = TT2 XOR (TT2 <<< 9)
   159  	VX TMP1, TMP0, d                 \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
   160  
   161  // func copyResultsBy4(dig *uint32, dst *byte) -- copies the 128 bytes starting at dig to dst, unchanged, through V0-V7.
   162  TEXT ·copyResultsBy4(SB),NOSPLIT,$0
   163  #define copySrc R3
   164  #define copyDst R4
   165  	MOVD	dst+8(FP), copyDst	// where the 128 bytes are written
   166  	MOVD	dig+0(FP), copySrc	// where the 128 bytes are read from
   167  
   168  	VLM (copySrc), V0, V7	// eight 16-byte vectors in
   169  	VSTM V0, V7, (copyDst)	// the same eight vectors out
   170  
   171  	RET
   172  #undef copyDst
   173  #undef copySrc
   174  
   175  // Used general purpose registers R1-R11. NOTE(review): the Go assembler guide lists R10 and R11 as reserved for assembler temporaries on s390x; wordPtr and digPtr live there -- confirm no instruction below needs that scratch.
   176  // blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int): runs the SM3 compression over `blocks` 64-byte blocks of four inputs at once. dig and p each point to four pointers (digests / inputs); buffer receives the 68 expanded 16-byte words (1088 bytes) of the current block. Returns at once, leaving the digests untouched, when blocks <= 0.
   177  TEXT ·blockMultBy4(SB), NOSPLIT, $0
   178  #define digPtr R11 // dig: table of the four digest pointers
   179  #define srcPtrPtr R1 // p on entry; reused as the shared byte offset into the inputs once the four pointers are loaded
   180  #define statePtr R2 // buffer: base of the expanded message words
   181  #define kPtr R3 // address of the round-constant table ·_K
   182  #define blockCount R5 // blocks still to be processed
   183  #define srcPtr1 R6 // srcPtr1-srcPtr4: base pointers of the four inputs
   184  #define srcPtr2 R7
   185  #define srcPtr3 R8
   186  #define srcPtr4 R9
   187  #define wordPtr R10 // write cursor into the word buffer
   188  	MOVD	dig+0(FP), digPtr
   189  	MOVD	p+8(FP), srcPtrPtr
   190  	MOVD	buffer+16(FP), statePtr
   191  	MOVD	blocks+24(FP), blockCount
   192  	CMPBLE blockCount, $0, done // the loop tests its counter only after a whole block, so reject blocks <= 0 before touching any memory
   193  	// load state: digest i goes into the i-th register pair (a/e, b/f, c/g, d/h)
   194  	MOVD 0(digPtr), R4
   195  	VLM (R4), a, e
   196  	MOVD 8(digPtr), R4
   197  	VLM (R4), b, f
   198  	MOVD 16(digPtr), R4
   199  	VLM (R4), c, g
   200  	MOVD 24(digPtr), R4
   201  	VLM (R4), d, h
   202  
   203  	MOVD $mask<>+0x00(SB), R4
   204  	VLM (R4), M0, M3 // the four VPERM selectors used by TRANSPOSE_MATRIX
   205  
   206  	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) // a-d: one working variable each, one lane per digest
   207  	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) // e-h likewise
   208  
   209  	MOVD (srcPtrPtr), srcPtr1
   210  	MOVD 8(srcPtrPtr), srcPtr2
   211  	MOVD 16(srcPtrPtr), srcPtr3
   212  	MOVD 24(srcPtrPtr), srcPtr4
   213  	MOVD $0, srcPtrPtr // from here on srcPtrPtr is the byte offset shared by all four inputs
   214  
   215  	MOVD $·_K+0(SB), kPtr
   216  
   217  loop:
   218  	// save state
   219  	VLR a, aSave
   220  	VLR b, bSave
   221  	VLR c, cSave
   222  	VLR d, dSave
   223  	VLR e, eSave
   224  	VLR f, fSave
   225  	VLR g, gSave
   226  	VLR h, hSave
   227  
   228  	// reset wordPtr
   229  	MOVD statePtr, wordPtr
   230  
   231  	// load message block: 4 x 4 words = buffer slots 0-15; wordPtr ends at slot 16
   232  	prepare4Words
   233  	prepare4Words
   234  	prepare4Words
   235  	prepare4Words
   236  
   237  	ROUND_00_11(0, a, b, c, d, e, f, g, h)
   238  	ROUND_00_11(1, h, a, b, c, d, e, f, g)
   239  	ROUND_00_11(2, g, h, a, b, c, d, e, f)
   240  	ROUND_00_11(3, f, g, h, a, b, c, d, e)
   241  	ROUND_00_11(4, e, f, g, h, a, b, c, d)
   242  	ROUND_00_11(5, d, e, f, g, h, a, b, c)
   243  	ROUND_00_11(6, c, d, e, f, g, h, a, b)
   244  	ROUND_00_11(7, b, c, d, e, f, g, h, a)
   245  	ROUND_00_11(8, a, b, c, d, e, f, g, h)
   246  	ROUND_00_11(9, h, a, b, c, d, e, f, g)
   247  	ROUND_00_11(10, g, h, a, b, c, d, e, f)
   248  	ROUND_00_11(11, f, g, h, a, b, c, d, e)
   249  
   250  	ROUND_12_15(12, e, f, g, h, a, b, c, d)
   251  	ROUND_12_15(13, d, e, f, g, h, a, b, c)
   252  	ROUND_12_15(14, c, d, e, f, g, h, a, b)
   253  	ROUND_12_15(15, b, c, d, e, f, g, h, a)
   254  
   255  	ROUND_16_63(16, a, b, c, d, e, f, g, h)
   256  	ROUND_16_63(17, h, a, b, c, d, e, f, g)
   257  	ROUND_16_63(18, g, h, a, b, c, d, e, f)
   258  	ROUND_16_63(19, f, g, h, a, b, c, d, e)
   259  	ROUND_16_63(20, e, f, g, h, a, b, c, d)
   260  	ROUND_16_63(21, d, e, f, g, h, a, b, c)
   261  	ROUND_16_63(22, c, d, e, f, g, h, a, b)
   262  	ROUND_16_63(23, b, c, d, e, f, g, h, a)
   263  	ROUND_16_63(24, a, b, c, d, e, f, g, h)
   264  	ROUND_16_63(25, h, a, b, c, d, e, f, g)
   265  	ROUND_16_63(26, g, h, a, b, c, d, e, f)
   266  	ROUND_16_63(27, f, g, h, a, b, c, d, e)
   267  	ROUND_16_63(28, e, f, g, h, a, b, c, d)
   268  	ROUND_16_63(29, d, e, f, g, h, a, b, c)
   269  	ROUND_16_63(30, c, d, e, f, g, h, a, b)
   270  	ROUND_16_63(31, b, c, d, e, f, g, h, a)
   271  	ROUND_16_63(32, a, b, c, d, e, f, g, h)
   272  	ROUND_16_63(33, h, a, b, c, d, e, f, g)
   273  	ROUND_16_63(34, g, h, a, b, c, d, e, f)
   274  	ROUND_16_63(35, f, g, h, a, b, c, d, e)
   275  	ROUND_16_63(36, e, f, g, h, a, b, c, d)
   276  	ROUND_16_63(37, d, e, f, g, h, a, b, c)
   277  	ROUND_16_63(38, c, d, e, f, g, h, a, b)
   278  	ROUND_16_63(39, b, c, d, e, f, g, h, a)
   279  	ROUND_16_63(40, a, b, c, d, e, f, g, h)
   280  	ROUND_16_63(41, h, a, b, c, d, e, f, g)
   281  	ROUND_16_63(42, g, h, a, b, c, d, e, f)
   282  	ROUND_16_63(43, f, g, h, a, b, c, d, e)
   283  	ROUND_16_63(44, e, f, g, h, a, b, c, d)
   284  	ROUND_16_63(45, d, e, f, g, h, a, b, c)
   285  	ROUND_16_63(46, c, d, e, f, g, h, a, b)
   286  	ROUND_16_63(47, b, c, d, e, f, g, h, a)
   287  	ROUND_16_63(48, a, b, c, d, e, f, g, h)
   288  	ROUND_16_63(49, h, a, b, c, d, e, f, g)
   289  	ROUND_16_63(50, g, h, a, b, c, d, e, f)
   290  	ROUND_16_63(51, f, g, h, a, b, c, d, e)
   291  	ROUND_16_63(52, e, f, g, h, a, b, c, d)
   292  	ROUND_16_63(53, d, e, f, g, h, a, b, c)
   293  	ROUND_16_63(54, c, d, e, f, g, h, a, b)
   294  	ROUND_16_63(55, b, c, d, e, f, g, h, a)
   295  	ROUND_16_63(56, a, b, c, d, e, f, g, h)
   296  	ROUND_16_63(57, h, a, b, c, d, e, f, g)
   297  	ROUND_16_63(58, g, h, a, b, c, d, e, f)
   298  	ROUND_16_63(59, f, g, h, a, b, c, d, e)
   299  	ROUND_16_63(60, e, f, g, h, a, b, c, d)
   300  	ROUND_16_63(61, d, e, f, g, h, a, b, c)
   301  	ROUND_16_63(62, c, d, e, f, g, h, a, b)
   302  	ROUND_16_63(63, b, c, d, e, f, g, h, a)
   303  
   304  	VX a, aSave, a // fold the saved input state back in (XOR), variable by variable
   305  	VX b, bSave, b
   306  	VX c, cSave, c
   307  	VX d, dSave, d
   308  	VX e, eSave, e
   309  	VX f, fSave, f
   310  	VX g, gSave, g
   311  	VX h, hSave, h
   312  
   313  	SUB $1, blockCount
   314  	CMPBGT blockCount, $0, loop // srcPtrPtr already points at the next 64-byte block of every input
   315  
   316  	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) // back to one digest per register pair
   317  	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
   318  
   319  	MOVD 	0(digPtr), R4
   320  	VSTM a, e, (R4)
   321  	MOVD 	8(digPtr), R4
   322  	VSTM b, f, (R4)
   323  	MOVD 	16(digPtr), R4
   324  	VSTM c, g, (R4)
   325  	MOVD 	24(digPtr), R4
   326  	VSTM d, h, (R4)
   327  done:
   328  	RET