github.com/emmansun/gmsm@v0.29.1/sm3/sm3blocks_ppc64x.s

// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

#include "textflag.h"
#include "sm3_const_asm.s"

#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7
#define M0 V8
#define M1 V9
#define M2 V10
#define M3 V11
#define TMP0 V12
#define TMP1 V13
#define TMP2 V14
#define TMP3 V15
#define TMP4 V16
#define TMP5 V17
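
// a..h each hold one SM3 state word for four independent blocks, one block
// per 32-bit vector lane; M0-M3 hold the permute masks for TRANSPOSE_MATRIX.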

// Permute vector used for byte swapping (little-endian builds only)
#define ESPERMW  V31 // Endian swapping permute into BE

#define R_x08 R15
#define R_x10 R16
#define R_x18 R17
#define R_x20 R18
#define R_x30 R19
// R_x30 is only live while the permute masks are loaded, so it can share R19 with R_TMP.
#define R_TMP R19

DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·mask+0x08(SB)/8, $0x0302010007060504
DATA ·mask+0x10(SB)/8, $0x0001020310111213 // permute masks for matrix transpose
DATA ·mask+0x18(SB)/8, $0x0405060714151617
DATA ·mask+0x20(SB)/8, $0x08090a0b18191a1b
DATA ·mask+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA ·mask+0x30(SB)/8, $0x0001020304050607
DATA ·mask+0x38(SB)/8, $0x1011121314151617
DATA ·mask+0x40(SB)/8, $0x08090a0b0c0d0e0f
DATA ·mask+0x48(SB)/8, $0x18191a1b1c1d1e1f

GLOBL ·mask(SB), RODATA, $80

#ifdef GOARCH_ppc64le
#define NEEDS_PERMW

#define PPC64X_STXVD2X(VS,RA,RB) \
	VPERM	VS, VS, ESPERMW, TMP5 \ // byte swap per word
	STXVD2X	TMP5, (RA+RB)

#define PPC64X_LXVW4X(RA,RB,VT) \
	LXVW4X	(RA+RB), VT \
	VPERM	VT, VT, ESPERMW, VT

#else
#define PPC64X_STXVD2X(VS,RA,RB) STXVD2X	VS, (RA+RB)
#define PPC64X_LXVW4X(RA,RB,VT)  LXVW4X	(RA+RB), VT
#endif // defined(GOARCH_ppc64le)

// r = s <<< n
// Due to VSPLTISW's limitation, n MUST be in [0, 31]
#define PROLD(s, r, n) \
	VSPLTISW $n, TMP5 \
	VRLW	s, TMP5, r
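
// For example, PROLD(a, TMP0, 12) expands to
//	VSPLTISW $12, TMP5
//	VRLW	a, TMP5, TMP0
// which rotates every 32-bit lane of a left by 12 into TMP0.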

// load the four lanes of message word W(i) from the schedule buffer
#define loadWordByIndex(W, i) \
	MOVD $(16*(i)), R_TMP \
	LXVW4X (R_TMP)(statePtr), W

// each message word occupies 16 bytes of the buffer: 4 lanes x 4 bytes
#define prepare4Words \
	PPC64X_LXVW4X(srcPtr1, srcPtrPtr, V16); \
	PPC64X_LXVW4X(srcPtr2, srcPtrPtr, V17); \
	PPC64X_LXVW4X(srcPtr3, srcPtrPtr, V18); \
	PPC64X_LXVW4X(srcPtr4, srcPtrPtr, V19); \
	TRANSPOSE_MATRIX(V16, V17, V18, V19); \
	ADD $16, srcPtrPtr;     \
	STXVW4X V16, (wordPtr); \
	ADD $16, wordPtr;       \
	STXVW4X V17, (wordPtr); \
	ADD $16, wordPtr;       \
	STXVW4X V18, (wordPtr); \
	ADD $16, wordPtr;       \
	STXVW4X V19, (wordPtr); \
	ADD $16, wordPtr
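
// After four prepare4Words invocations the buffer holds W(0)..W(15) for all
// four blocks, transposed so that each W(i) is one 16-byte vector with one
// 32-bit lane per block.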

#define TRANSPOSE_MATRIX(T0, T1, T2, T3) \
	VPERM T0, T1, M0, TMP0; \
	VPERM T2, T3, M0, TMP1; \
	VPERM T0, T1, M1, TMP2; \
	VPERM T2, T3, M1, TMP3; \
	VPERM TMP0, TMP1, M2, T0; \
	VPERM TMP0, TMP1, M3, T1; \
	VPERM TMP2, TMP3, M2, T2; \
	VPERM TMP2, TMP3, M3, T3
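
// Viewing T0..T3 as the rows of a 4x4 matrix of 32-bit words, the first four
// VPERMs (masks M0/M1) interleave matching words of the row pairs, and the
// last four (masks M2/M3) regroup the doublewords, leaving the columns in
// T0..T3.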

// Load constant T. How to simplify it?
// Solution 1: a big constant table like sha256block_ppc64x.s
// Solution 2: 2 constants T, rotate-shift left one bit every time
// Solution 1 performs better but uses more memory.
// (The index argument is currently unused.)
#define LOAD_T(index, const, target) \
	MOVD $const, R_TMP                \
	MTVSRWZ R_TMP, target             \
	VSPLTW $1, target, target
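
// MTVSRWZ leaves the 32-bit constant in word element 1 of the target VSR, and
// VSPLTW $1 broadcasts that element to all four lanes.

// One SM3 compression round for round index j in [0, 16):
//	SS1 = ((a <<< 12) + e + T(j)) <<< 7
//	SS2 = SS1 XOR (a <<< 12)
//	TT1 = (a XOR b XOR c) + d + SS2 + (W(j) XOR W(j+4))
//	TT2 = (e XOR f XOR g) + h + SS1 + W(j)
//	d = c; c = b <<< 9; b = a; a = TT1
//	h = g; g = f <<< 19; f = e; e = P0(TT2)
// with P0(X) = X XOR (X <<< 9) XOR (X <<< 17). The macro updates the
// registers in place; the callers rotate the register names instead of
// moving values.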

#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, TMP0, 12)               \
	VOR TMP0, TMP0, TMP1             \
	LOAD_T(index, const, TMP2)       \
	VADDUWM TMP2, TMP0, TMP0         \
	VADDUWM e, TMP0, TMP0            \
	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
	VXOR TMP2, TMP1, TMP0            \ // TMP0 = SS2
	VXOR a, b, TMP1                  \
	VXOR c, TMP1, TMP1               \
	VADDUWM TMP1, d, TMP1            \ // TMP1 = (a XOR b XOR c) + d
	loadWordByIndex(TMP3, index)     \
	loadWordByIndex(TMP4, index+4)   \
	VXOR TMP3, TMP4, TMP4            \
	VADDUWM TMP4, TMP1, TMP1         \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADDUWM TMP1, TMP0, TMP1         \ // TMP1 = TT1
	VADDUWM h, TMP3, TMP3            \
	VADDUWM TMP3, TMP2, TMP3         \ // Wt + h + SS1
	VXOR e, f, TMP4                  \
	VXOR g, TMP4, TMP4               \
	VADDUWM TMP4, TMP3, TMP3         \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VOR b, b, TMP4                   \
	PROLD(TMP4, b, 9)                \ // b = b <<< 9
	VOR TMP1, TMP1, h                \ // h = TT1
	PROLD(f, f, 19)                  \ // f = f <<< 19
	PROLD(TMP3, TMP4, 9)             \ // TMP4 = TT2 <<< 9
	PROLD(TMP4, TMP0, 8)             \ // TMP0 = TT2 <<< 17
	VXOR TMP3, TMP4, TMP4            \ // TMP4 = TT2 XOR (TT2 <<< 9)
	VXOR TMP4, TMP0, d               // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
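
// SM3 message expansion:
//	W(j) = P1(W(j-16) XOR W(j-9) XOR (W(j-3) <<< 15)) XOR (W(j-13) <<< 7) XOR W(j-6)
// where P1(X) = X XOR (X <<< 15) XOR (X <<< 23). MESSAGE_SCHEDULE(index)
// computes and stores W(index+4), so W(j-3) sits at buffer slot index+1,
// W(j-16) at slot index-12, and so on.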

#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(TMP0, index+1)    \ // Wj-3
	PROLD(TMP0, TMP1, 15)             \
	loadWordByIndex(TMP0, index-12)   \ // Wj-16
	VXOR TMP0, TMP1, TMP0             \
	loadWordByIndex(TMP1, index-5)    \ // Wj-9
	VXOR TMP0, TMP1, TMP0             \
	PROLD(TMP0, TMP1, 15)             \
	PROLD(TMP1, TMP2, 8)              \
	VXOR TMP1, TMP0, TMP0             \
	VXOR TMP2, TMP0, TMP0             \ // P1
	loadWordByIndex(TMP1, index-9)    \ // Wj-13
	PROLD(TMP1, TMP2, 7)              \
	VXOR TMP2, TMP0, TMP0             \
	loadWordByIndex(TMP1, index-2)    \ // Wj-6
	VXOR TMP1, TMP0, TMP1             \ // Wj
	STXVW4X TMP1, (wordPtr)           \
	ADD $16, wordPtr

#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                               \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)
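
// From round 16 on, the boolean functions change:
//	FF(a, b, c) = (a AND b) OR (a AND c) OR (b AND c)    (majority)
//	GG(e, f, g) = (e AND f) OR (NOT e AND g) = ((f XOR g) AND e) XOR g
// everything else matches ROUND_00_11.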

#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)          \ // TMP1 is Wt+4 now, please do not clobber it
	PROLD(a, TMP0, 12)               \
	VOR TMP0, TMP0, TMP4             \
	LOAD_T(index, const, TMP2)       \
	VADDUWM TMP2, TMP0, TMP0         \
	VADDUWM e, TMP0, TMP0            \
	PROLD(TMP0, TMP2, 7)             \ // TMP2 = SS1
	VXOR TMP2, TMP4, TMP0            \ // TMP0 = SS2
	VOR a, b, TMP3                   \
	VAND a, b, TMP4                  \
	VAND c, TMP3, TMP3               \
	VOR TMP4, TMP3, TMP4             \ // (a AND b) OR (a AND c) OR (b AND c)
	VADDUWM TMP4, d, TMP4            \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(TMP3, index)     \ // Wt
	VXOR TMP3, TMP1, TMP1            \ // Wt XOR Wt+4
	VADDUWM TMP4, TMP1, TMP4         \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADDUWM TMP4, TMP0, TMP4         \ // TT1
	VADDUWM h, TMP3, TMP3            \ // Wt + h
	VADDUWM TMP2, TMP3, TMP3         \ // Wt + h + SS1
	VXOR f, g, TMP1                  \
	VAND TMP1, e, TMP1               \
	VXOR g, TMP1, TMP1               \ // ((f XOR g) AND e) XOR g
	VADDUWM TMP3, TMP1, TMP3         \ // TT2
	VOR b, b, TMP1                   \
	PROLD(TMP1, b, 9)                \ // b = b <<< 9
	VOR TMP4, TMP4, h                \ // h = TT1
	PROLD(f, f, 19)                  \ // f = f <<< 19
	PROLD(TMP3, TMP1, 9)             \ // TMP1 = TT2 <<< 9
	PROLD(TMP1, TMP0, 8)             \ // TMP0 = TT2 <<< 17
	VXOR TMP3, TMP1, TMP1            \ // TMP1 = TT2 XOR (TT2 <<< 9)
	VXOR TMP1, TMP0, d               // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)
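
// blockMultBy4 hashes four independent blocks in parallel, one per 32-bit
// vector lane: dig points at four state pointers, p at four input pointers,
// and buffer is scratch space for the expanded message schedule
// (W(0)..W(67), 16 bytes per word).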

// Used general purpose registers R4-R12, R15-R19.
// func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
TEXT ·blockMultBy4(SB), NOSPLIT, $0
	MOVD	$8, R_x08
	MOVD	$16, R_x10
	MOVD	$24, R_x18
	MOVD	$32, R_x20
	MOVD	$48, R_x30
#ifdef NEEDS_PERMW
	MOVD	$·mask(SB), R4
	LVX	(R4), ESPERMW
	ADD	$0x10, R4
#else
	MOVD	$·mask+0x10(SB), R4
#endif
	LXVD2X 	(R0)(R4), M0
	LXVD2X 	(R_x10)(R4), M1
	LXVD2X 	(R_x20)(R4), M2
	LXVD2X 	(R_x30)(R4), M3
#define digPtr R11
#define srcPtrPtr R5
#define statePtr R4
#define blockCount R6
#define srcPtr1 R7
#define srcPtr2 R8
#define srcPtr3 R9
#define srcPtr4 R10
#define wordPtr R12
	MOVD	dig+0(FP), digPtr
	MOVD	p+8(FP), srcPtrPtr
	MOVD	buffer+16(FP), statePtr
	MOVD	blocks+24(FP), blockCount

	// load state
	MOVD (R0)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), a
	LXVW4X (R_x10)(R_TMP), e
	MOVD (R_x08)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), b
	LXVW4X (R_x10)(R_TMP), f
	MOVD (R_x10)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), c
	LXVW4X (R_x10)(R_TMP), g
	MOVD (R_x18)(digPtr), R_TMP
	LXVW4X (R0)(R_TMP), d
	LXVW4X (R_x10)(R_TMP), h

	TRANSPOSE_MATRIX(a, b, c, d)
	TRANSPOSE_MATRIX(e, f, g, h)
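
	// After the transposes each of a..h holds one state word for all four
	// digests, e.g. a = (A0, A1, A2, A3).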

	MOVD (R0)(srcPtrPtr), srcPtr1
	MOVD (R_x08)(srcPtrPtr), srcPtr2
	MOVD (R_x10)(srcPtrPtr), srcPtr3
	MOVD (R_x18)(srcPtrPtr), srcPtr4
	MOVD $0, srcPtrPtr // srcPtrPtr now tracks the byte offset into the four streams

	MOVD blockCount, CTR

loop:
	// Offload state to VSR24-31 (aka FPR24-31) for this block's feed-forward
	XXLOR	V0, V0, VS24
	XXLOR	V1, V1, VS25
	XXLOR	V2, V2, VS26
	XXLOR	V3, V3, VS27
	XXLOR	V4, V4, VS28
	XXLOR	V5, V5, VS29
	XXLOR	V6, V6, VS30
	XXLOR	V7, V7, VS31

	// reset wordPtr
	MOVD statePtr, wordPtr

	// load and transpose one 64-byte message block from each of the four streams
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// The round constant T(j) <<< (j mod 32) repeats with period 32, so
	// rounds 48-63 reuse T16-T31.
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)
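
	// SM3 feed-forward: V(i+1) = ABCDEFGH XOR V(i), so the saved state is
	// XORed back rather than added as in SHA-2.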

	XXLXOR	V0, VS24, V0
	XXLXOR	V1, VS25, V1
	XXLXOR	V2, VS26, V2
	XXLXOR	V3, VS27, V3
	XXLXOR	V4, VS28, V4
	XXLXOR	V5, VS29, V5
	XXLXOR	V6, VS30, V6
	XXLXOR	V7, VS31, V7

	BDNZ	loop

end:
	// transpose back to the per-digest layout
	TRANSPOSE_MATRIX(a, b, c, d)
	TRANSPOSE_MATRIX(e, f, g, h)

	// save state
	MOVD (R0)(digPtr), R_TMP
	STXVW4X a, (R0)(R_TMP)
	STXVW4X e, (R_x10)(R_TMP)
	MOVD (R_x08)(digPtr), R_TMP
	STXVW4X b, (R0)(R_TMP)
	STXVW4X f, (R_x10)(R_TMP)
	MOVD (R_x10)(digPtr), R_TMP
	STXVW4X c, (R0)(R_TMP)
	STXVW4X g, (R_x10)(R_TMP)
	MOVD (R_x18)(digPtr), R_TMP
	STXVW4X d, (R0)(R_TMP)
	STXVW4X h, (R_x10)(R_TMP)

	RET
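
// copyResultsBy4 copies the four contiguous 32-byte states at dig to dst,
// storing each 32-bit word big-endian (the byte swap is only needed on
// little-endian targets).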

// Used general purpose registers R4-R6, R8-R9, R16-R19.
// func copyResultsBy4(dig *uint32, dst *byte)
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
	MOVD	dig+0(FP), R6
	MOVD	dst+8(FP), R4

#ifdef NEEDS_PERMW
	MOVD	$·mask+0x00(SB), R5
	LVX	(R5), ESPERMW
#endif
	MOVD	$16, R5
	MOVD	$32, R16
	MOVD	$48, R17
	MOVD	$64, R18
	MOVD	$80, R19
	MOVD	$96, R8
	MOVD	$112, R9

	LXVD2X 	(R0)(R6), V0
	PPC64X_STXVD2X(V0, R0, R4)

	LXVD2X 	(R5)(R6), V0
	PPC64X_STXVD2X(V0, R5, R4)

	LXVD2X 	(R16)(R6), V0
	PPC64X_STXVD2X(V0, R16, R4)

	LXVD2X 	(R17)(R6), V0
	PPC64X_STXVD2X(V0, R17, R4)

	LXVD2X 	(R18)(R6), V0
	PPC64X_STXVD2X(V0, R18, R4)

	LXVD2X 	(R19)(R6), V0
	PPC64X_STXVD2X(V0, R19, R4)

	LXVD2X 	(R8)(R6), V0
	PPC64X_STXVD2X(V0, R8, R4)

	LXVD2X 	(R9)(R6), V0
	PPC64X_STXVD2X(V0, R9, R4)

	RET