github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_s390x.s (about)

     1  // Copyright 2024 Sun Yimin. All rights reserved.
     2  // Use of this source code is governed by a MIT-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  #include "sm3_const_asm.s"
     9  
    10  #define a R1 // a..h: the eight working state words; they occupy R1..R8 so LMY/STMY can move them as one range
    11  #define b R2
    12  #define c R3
    13  #define d R4
    14  #define e R5
    15  #define f R6
    16  #define g R7
    17  #define h R8
    18  // Function-level registers of block: state pointer, input cursor, byte count, end-of-input address.
    19  #define CTX	R9
    20  #define INP	R10
    21  #define LEN	R11
    22  #define END R12
    23  // Scratch registers of the round macros. They reuse R9, R11 and R12, so every round destroys CTX, LEN and END.
    24  #define y0 R9 // same register as CTX
    25  #define y1 R11 // same register as LEN
    26  #define y2 R12 // same register as END
    27  // Sliding window of 16 message words, four 32-bit words per vector register.
    28  #define XWORD0 V0
    29  #define XWORD1 V1
    30  #define XWORD2 V2
    31  #define XWORD3 V3
    32  // Vector temporaries overwritten by MESSAGE_SCHEDULE.
    33  #define XTMP0 V4
    34  #define XTMP1 V5
    35  #define XTMP2 V6
    36  #define XTMP3 V7
    37  #define XTMP4 V8
    38  // XFER receives the XOR of two neighbouring XWORD vectors before it is stored to the stack.
    39  #define XFER  V9
    40  
    41  #define SS12(wa, we, tj, s1, s2) \
    42  	RLL     $12, wa, s2;                       \ // s2 = wa <<< 12
    43  	ADD     $tj, we, s1;                       \ // s1 = we + tj
    44  	ADD     s2, s1;                            \ // s1 = (wa <<< 12) + we + tj
    45  	RLL     $7, s1, s1;                        \ // s1 = SS1 = s1 <<< 7
    46  	XOR     s1, s2, s2                           // s2 = SS2 = SS1 ^ (wa <<< 12)
    47  
    48  #define P0(x, scratch, res) \
    49  	RLL     $17, x, res;                         \ // res = x <<< 17
    50  	RLL     $9, x, scratch;                      \ // scratch = x <<< 9
    51  	XOR     x, res, res;                         \ // res = x ^ (x <<< 17)
    52  	XOR     scratch, res, res                      // res = x ^ (x <<< 9) ^ (x <<< 17)
    53  
    54  // DO_ROUND_N_0 performs one round for rounds [0 - 16), where FF = a ^ b ^ c and GG = e ^ f ^ g.
    55  // addr1 for w, addr2 for w'. On exit: h = tt1, d = P0(tt2), b = b <<< 9, f = f <<< 19; y0, y1 and y2 are clobbered.
    56  #define DO_ROUND_N_0(addr1, addr2, const, a, b, c, d, e, f, g, h) \
    57  	;                                            \ // SS12 leaves SS1 in y2 and SS2 in y0
    58  	SS12(a, e, const, y2, y0);                   \
    59  	MOVWZ addr1, y1;                             \ // y1 = W
    60  	ADD   y1, y2;                                \ // y2 = SS1 + W
    61  	ADD   h, y2;                                 \ // y2 = h + SS1 + W
    62  	MOVWZ addr2, y1;                             \ // y1 = W'
    63  	ADD   y1, y0;                                \ // y0 = SS2 + W'
    64  	ADD   d, y0;                                 \ // y0 = d + SS2 + W'
    65  	;                                            \ // the incoming h and d are already folded into y2 and y0, so both registers can be reused
    66  	XOR     a, b, h;                             \ // h = a ^ b
    67  	XOR     c, h;                                \ // h = FF(a, b, c) = a ^ b ^ c
    68  	ADD     y0, h;                               \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    69  	;                                            \
    70  	XOR      e, f, y1;                           \ // y1 = e ^ f
    71  	XOR      g, y1;                              \ // y1 = GG(e, f, g) = e ^ f ^ g
    72  	ADD      y1, y2;                             \ // y2 = GG(e, f, g) + (incoming h) + SS1 + W = tt2
    73  	;                                            \
    74  	RLL    $9, b;                                \ // b = b <<< 9
    75  	RLL    $19, f;                               \ // f = f <<< 19
    76  	;                                            \ // d = P0(tt2); y0 is only a scratch register here
    77  	P0(y2, y0, d)
    78  
    79  // DO_ROUND_N_1 performs one round for rounds [16 - 64), where FF is the bitwise majority of a, b, c and GG selects f or g by e.
    80  // addr1 for w, addr2 for w'. On exit: h = tt1, d = P0(tt2), b = b <<< 9, f = f <<< 19; y0, y1 and y2 are clobbered.
    81  #define DO_ROUND_N_1(addr1, addr2, const, a, b, c, d, e, f, g, h) \
    82  	;                                            \ // SS12 leaves SS1 in y2 and SS2 in y0
    83  	SS12(a, e, const, y2, y0);                   \
    84  	MOVWZ addr1, y1;                             \ // y1 = W
    85  	ADD     y1, y2;                              \ // y2 = SS1 + W
    86  	ADD     h, y2;                               \ // y2 = h + SS1 + W
    87  	MOVWZ addr2, y1;                             \ // y1 = W'
    88  	ADD     y1, y0;                              \ // y0 = SS2 + W'
    89  	ADD     d, y0;                               \ // y0 = d + SS2 + W'
    90  	;                                            \ // the incoming h and d are already folded into y2 and y0, so both registers can be reused
    91  	OR      a, b, y1;                            \ // y1 = a OR b
    92  	AND     a, b, h;                             \ // h = a AND b
    93  	AND     c, y1;                               \ // y1 = (a OR b) AND c
    94  	OR      y1, h;                               \ // h = (a AND b) OR (a AND c) OR (b AND c)
    95  	ADD     y0, h;                               \ // h = FF(a, b, c) + d + SS2 + W' = tt1
    96  	;                                            \
    97  	XOR     f, g, y1;                            \ // y1 = f ^ g
    98  	AND     e, y1;                               \ // y1 = (f ^ g) AND e
    99  	XOR     g, y1;                               \ // y1 = GG2(e, f, g): f where e has a 1 bit, g where e has a 0 bit
   100  	ADD     y1, y2;                              \ // y2 = GG2(e, f, g) + (incoming h) + SS1 + W = tt2
   101  	;                                            \
   102  	RLL     $9, b;                               \ // b = b <<< 9
   103  	RLL     $19, f;                              \ // f = f <<< 19
   104  	;                                            \ // d = P0(tt2); y0 is only a scratch register here
   105  	P0(y2, y0, d)
   106  
   107  // dst = src <<< bits, applied to every 32-bit element of the vector
   108  #define PROLD(src, dst, bits) \
   109  	VERLLF $bits, src, dst
   110  
   111  #define MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) \ // args hold w0..w15 in order (they shadow the file-level aliases); the first arg receives the next 4 words; clobbers XTMP0-XTMP4
   112  	VSLDB $12, XWORD0, XWORD1, XTMP0;  \ // XTMP0 = W[-13] = {w3, w4, w5, w6}
   113  	PROLD(XTMP0, XTMP1, 7);            \ // XTMP1 = W[-13] rol 7
   114  	VSLDB $8, XWORD2, XWORD3, XTMP0;   \ // XTMP0 = W[-6] = {w10, w11, w12, w13}
   115  	VX XTMP0, XTMP1, XTMP0;            \ // XTMP0 = W[-6] xor (W[-13] rol 7)
   116  	; \ // Prepare P1 parameters (first pass: the last lane of W[-3] is not known yet)
   117  	VSLDB $12, XWORD1, XWORD2, XTMP1;  \ // XTMP1 = W[-9] = {w7, w8, w9, w10}
   118  	VX XTMP1, XWORD0, XTMP1;           \ // XTMP1 = W[-9] xor W[-16]
   119  	VSLDB $4, XWORD3, XWORD2, XTMP3;   \ // XTMP3 = {w13, w14, w15, w8}: W[-3] with w8 as a placeholder in lane 3
   120  	PROLD(XTMP3, XTMP2, 15);           \ // XTMP2 = W[-3] rol 15
   121  	VX XTMP1, XTMP2, XTMP2;            \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15), lane 3 not valid
   122  	; \ // P1(x) = x xor (x rol 15) xor (x rol 23)
   123  	PROLD(XTMP2, XTMP4, 15);           \ // XTMP4 = XTMP2 rol 15
   124  	PROLD(XTMP4, XTMP3, 8);            \ // XTMP3 = XTMP2 rol 23
   125  	VX XTMP2, XTMP4, XTMP4;            \ // XTMP4 = XTMP2 xor (XTMP2 rol 15)
   126  	VX XTMP4, XTMP3, XTMP4;            \ // XTMP4 = P1(XTMP2), lane 3 not valid
   127  	; \ // First-pass message schedule result
   128  	VX XTMP4, XTMP0, XTMP2;            \ // XTMP2 = {w[0], w[1], w[2], invalid}; only w[0] is consumed below
   129  	; \ // Prepare P1 parameters again, now with the real last lane
   130  	VSLDB $4, XWORD3, XTMP2, XTMP3;    \ // XTMP3 = W[-3] = {w13, w14, w15, new w[0]}
   131  	PROLD(XTMP3, XTMP4, 15);           \ // XTMP4 = W[-3] rol 15
   132  	VX XTMP1, XTMP4, XTMP4;		       \ // XTMP4 = W[-9] ^ W[-16] ^ (W[-3] rol 15), all four lanes valid
   133  	; \ // P1
   134  	PROLD(XTMP4, XTMP3, 15);           \ // XTMP3 = XTMP4 rol 15
   135  	PROLD(XTMP3, XTMP1, 8);            \ // XTMP1 = XTMP4 rol 23
   136  	VX XTMP4, XTMP3, XTMP3;            \ // XTMP3 = XTMP4 xor (XTMP4 rol 15)
   137  	VX XTMP3, XTMP1, XTMP1;            \ // XTMP1 = P1(XTMP4)
   138  	; \ // 4 words message schedule result
   139  	VX XTMP1, XTMP0, XWORD0;           \ // XWORD0 = {w[0], w[1], w[2], w[3]}, replacing the four oldest words
   140  
   141  // block folds every complete 64-byte chunk of p into the eight 32-bit state words at dig; a trailing partial chunk is ignored.
   142  // Frame: tmp-72 = 4 words W', tmp-56 = 4 words W, tmp-40 = saved a..h, tmp-8 = saved END. tmp-xx(SP) usage follows
   143  // https://github.com/golang/go/blob/master/src/crypto/md5/md5block_s390x.s
   144  // func block(dig *digest, p []byte)
   145  TEXT ·block(SB),NOSPLIT,$72-32
   146  	MOVD	dig+0(FP), CTX
   147  	MOVD	p+8(FP), INP
   148  	MOVD	p_len+16(FP), LEN
   149  	AND	$-64, LEN // round the length down to whole 64-byte chunks
   150  	LAY	(INP)(LEN*1), END // END = INP + LEN
   151  	// Load the state before the empty-input check: the end path stores a..h back unconditionally.
   152  	LMY 0(CTX), a, h
   153  	CMPBEQ	INP, END, end // no complete chunk: the state is written back unchanged
   154  	MOVD END, tmp-8(SP) // backup END (its register doubles as y2)
   155  
   156  loop:
   157  	STMY	a, h, tmp-40(SP) // backup state
   158  	VLM (INP), XWORD0, XWORD3
   159  
   160  schedule_compress: // for w0 - w47
   161  	// Do 4 rounds and scheduling
   162  	VST XWORD0, tmp-56(SP) // W for the next four rounds
   163  	VX  XWORD0, XWORD1, XFER // W' = W xor the four words that follow it
   164  	VST XFER, tmp-72(SP)
   165  	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T0, a, b, c, d, e, f, g, h)
   166  	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T1, h, a, b, c, d, e, f, g)
   167  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   168  	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T2, g, h, a, b, c, d, e, f)
   169  	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T3, f, g, h, a, b, c, d, e)
   170  
   171  	// Do 4 rounds and scheduling
   172  	VST XWORD1, tmp-56(SP)
   173  	VX XWORD1, XWORD2, XFER
   174  	VST XFER, tmp-72(SP)
   175  	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T4, e, f, g, h, a, b, c, d)
   176  	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T5, d, e, f, g, h, a, b, c)
   177  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   178  	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T6, c, d, e, f, g, h, a, b)
   179  	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T7, b, c, d, e, f, g, h, a)
   180  
   181  	// Do 4 rounds and scheduling
   182  	VST XWORD2, tmp-56(SP)
   183  	VX XWORD2, XWORD3, XFER
   184  	VST XFER, tmp-72(SP)
   185  	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T8, a, b, c, d, e, f, g, h)
   186  	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T9, h, a, b, c, d, e, f, g)
   187  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   188  	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T10, g, h, a, b, c, d, e, f)
   189  	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T11, f, g, h, a, b, c, d, e)
   190  
   191  	// Do 4 rounds and scheduling
   192  	VST XWORD3, tmp-56(SP)
   193  	VX XWORD3, XWORD0, XFER
   194  	VST XFER, tmp-72(SP)
   195  	DO_ROUND_N_0(tmp-56(SP), tmp-72(SP), T12, e, f, g, h, a, b, c, d)
   196  	DO_ROUND_N_0(tmp-52(SP), tmp-68(SP), T13, d, e, f, g, h, a, b, c)
   197  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   198  	DO_ROUND_N_0(tmp-48(SP), tmp-64(SP), T14, c, d, e, f, g, h, a, b)
   199  	DO_ROUND_N_0(tmp-44(SP), tmp-60(SP), T15, b, c, d, e, f, g, h, a)
   200  
   201  	// Do 4 rounds and scheduling
   202  	VST XWORD0, tmp-56(SP)
   203  	VX XWORD0, XWORD1, XFER
   204  	VST XFER, tmp-72(SP)
   205  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T16, a, b, c, d, e, f, g, h)
   206  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T17, h, a, b, c, d, e, f, g)
   207  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   208  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T18, g, h, a, b, c, d, e, f)
   209  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T19, f, g, h, a, b, c, d, e)
   210  
   211  	// Do 4 rounds and scheduling
   212  	VST XWORD1, tmp-56(SP)
   213  	VX XWORD1, XWORD2, XFER
   214  	VST XFER, tmp-72(SP)
   215  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T20, e, f, g, h, a, b, c, d)
   216  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T21, d, e, f, g, h, a, b, c)
   217  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   218  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T22, c, d, e, f, g, h, a, b)
   219  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T23, b, c, d, e, f, g, h, a)
   220  	
   221  	// Do 4 rounds and scheduling
   222  	VST XWORD2, tmp-56(SP)
   223  	VX XWORD2, XWORD3, XFER
   224  	VST XFER, tmp-72(SP)
   225  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T24, a, b, c, d, e, f, g, h)
   226  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T25, h, a, b, c, d, e, f, g)
   227  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   228  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T26, g, h, a, b, c, d, e, f)
   229  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T27, f, g, h, a, b, c, d, e)
   230  		
   231  	// Do 4 rounds and scheduling
   232  	VST XWORD3, tmp-56(SP)
   233  	VX XWORD3, XWORD0, XFER
   234  	VST XFER, tmp-72(SP)
   235  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T28, e, f, g, h, a, b, c, d)
   236  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T29, d, e, f, g, h, a, b, c)
   237  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   238  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T30, c, d, e, f, g, h, a, b)
   239  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T31, b, c, d, e, f, g, h, a)
   240  
   241  	// Do 4 rounds and scheduling
   242  	VST XWORD0, tmp-56(SP)
   243  	VX XWORD0, XWORD1, XFER
   244  	VST XFER, tmp-72(SP)
   245  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T32, a, b, c, d, e, f, g, h)
   246  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T33, h, a, b, c, d, e, f, g)
   247  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3)
   248  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T34, g, h, a, b, c, d, e, f)
   249  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T35, f, g, h, a, b, c, d, e)
   250  
   251  	// Do 4 rounds and scheduling
   252  	VST XWORD1, tmp-56(SP)
   253  	VX XWORD1, XWORD2, XFER
   254  	VST XFER, tmp-72(SP)
   255  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T36, e, f, g, h, a, b, c, d)
   256  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T37, d, e, f, g, h, a, b, c)
   257  	MESSAGE_SCHEDULE(XWORD1, XWORD2, XWORD3, XWORD0)
   258  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T38, c, d, e, f, g, h, a, b)
   259  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T39, b, c, d, e, f, g, h, a)
   260  
   261  	// Do 4 rounds and scheduling
   262  	VST XWORD2, tmp-56(SP)
   263  	VX XWORD2, XWORD3, XFER
   264  	VST XFER, tmp-72(SP)
   265  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T40, a, b, c, d, e, f, g, h)
   266  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T41, h, a, b, c, d, e, f, g)
   267  	MESSAGE_SCHEDULE(XWORD2, XWORD3, XWORD0, XWORD1)
   268  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T42, g, h, a, b, c, d, e, f)
   269  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T43, f, g, h, a, b, c, d, e)
   270  
   271  	// Do 4 rounds and scheduling
   272  	VST XWORD3, tmp-56(SP)
   273  	VX XWORD3, XWORD0, XFER
   274  	VST XFER, tmp-72(SP)
   275  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T44, e, f, g, h, a, b, c, d)
   276  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T45, d, e, f, g, h, a, b, c)
   277  	MESSAGE_SCHEDULE(XWORD3, XWORD0, XWORD1, XWORD2)
   278  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T46, c, d, e, f, g, h, a, b)
   279  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T47, b, c, d, e, f, g, h, a)
   280  
   281  	// Last 16 rounds: w48 - w63 are already in XWORD0-XWORD3; a single extra schedule supplies w64 - w67 for the final W'
   282  	// Do 4 rounds
   283  	VST XWORD0, tmp-56(SP)
   284  	VX XWORD0, XWORD1, XFER
   285  	VST XFER, tmp-72(SP)
   286  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T48, a, b, c, d, e, f, g, h)
   287  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T49, h, a, b, c, d, e, f, g)
   288  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T50, g, h, a, b, c, d, e, f)
   289  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T51, f, g, h, a, b, c, d, e)
   290  
   291  	VST XWORD1, tmp-56(SP)
   292  	VX XWORD1, XWORD2, XFER
   293  	VST XFER, tmp-72(SP)
   294  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T52, e, f, g, h, a, b, c, d)
   295  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T53, d, e, f, g, h, a, b, c)
   296  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T54, c, d, e, f, g, h, a, b)
   297  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T55, b, c, d, e, f, g, h, a)
   298  
   299  	VST XWORD2, tmp-56(SP)
   300  	VX XWORD2, XWORD3, XFER
   301  	VST XFER, tmp-72(SP)
   302  	MESSAGE_SCHEDULE(XWORD0, XWORD1, XWORD2, XWORD3) // XWORD0 = w64 - w67, used only for W' of rounds 60 - 63
   303  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T56, a, b, c, d, e, f, g, h)
   304  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T57, h, a, b, c, d, e, f, g)
   305  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T58, g, h, a, b, c, d, e, f)
   306  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T59, f, g, h, a, b, c, d, e)
   307  
   308  	VST XWORD3, tmp-56(SP)
   309  	VX XWORD3, XWORD0, XFER
   310  	VST XFER, tmp-72(SP)
   311  	DO_ROUND_N_1(tmp-56(SP), tmp-72(SP), T60, e, f, g, h, a, b, c, d)
   312  	DO_ROUND_N_1(tmp-52(SP), tmp-68(SP), T61, d, e, f, g, h, a, b, c)
   313  	DO_ROUND_N_1(tmp-48(SP), tmp-64(SP), T62, c, d, e, f, g, h, a, b)
   314  	DO_ROUND_N_1(tmp-44(SP), tmp-60(SP), T63, b, c, d, e, f, g, h, a)
   315  	// XOR the state saved at the top of the loop into the new a..h; END is free to act as scratch until it is restored
   316  	MOVWZ tmp-40(SP), END
   317  	XOR END, a
   318  	MOVWZ tmp-36(SP), END
   319  	XOR END, b
   320  	MOVWZ tmp-32(SP), END
   321  	XOR END, c
   322  	MOVWZ tmp-28(SP), END
   323  	XOR END, d
   324  	MOVWZ tmp-24(SP), END
   325  	XOR END, e
   326  	MOVWZ tmp-20(SP), END
   327  	XOR END, f
   328  	MOVWZ tmp-16(SP), END
   329  	XOR END, g
   330  	MOVWZ tmp-12(SP), END
   331  	XOR END, h
   332  
   333  	LA	64(INP), INP // advance to the next chunk
   334  	MOVD tmp-8(SP), END // restore END, which the rounds and the XOR sequence overwrote
   335  	CMPBLT	INP, END, loop
   336  
   337  end:
   338  	MOVD	dig+0(FP), CTX // CTX shares its register with y0, so reload it before storing
   339  	STMY	a, h, 0(CTX)
   340  	RET