// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// 4-way parallel SM3 compression for s390x (Go assembler syntax, vector
// facility). Four independent message streams are processed at once: every
// vector register holds one 32-bit working variable (or one schedule word)
// for each of the four streams, one per 32-bit lane.

// VPERM selector table, loaded into M0..M3 by blockMultBy4.
// The first two entries interleave 32-bit words of two source vectors, the
// last two pick 64-bit halves of two source vectors. Applied in sequence
// (see TRANSPOSE_MATRIX) they transpose a 4x4 matrix of 32-bit words.
DATA mask<>+0x00(SB)/8, $0x0001020310111213
DATA mask<>+0x08(SB)/8, $0x0405060714151617
DATA mask<>+0x10(SB)/8, $0x08090a0b18191a1b
DATA mask<>+0x18(SB)/8, $0x0c0d0e0f1c1d1e1f
DATA mask<>+0x20(SB)/8, $0x0001020304050607
DATA mask<>+0x28(SB)/8, $0x1011121314151617
DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
GLOBL mask<>(SB), RODATA, $64

// Working variables. The numbering is interleaved (a=V0, e=V1, b=V2, ...)
// so that a single two-register VLM/VSTM moves the 32-byte digest of one
// stream into/out of a register pair (a,e), (b,f), (c,g) or (d,h).
#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7

// VPERM selectors (contents of mask<>).
#define M0 V8
#define M1 V9
#define M2 V10
#define M3 V11

// Scratch registers. TMP4 is V16, which prepare4Words also uses directly.
#define TMP0 V12
#define TMP1 V13
#define TMP2 V14
#define TMP3 V15
#define TMP4 V16

// Copy of the chaining value taken at the top of every block iteration.
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31

// TRANSPOSE_MATRIX transposes, in place, the 4x4 matrix of 32-bit words whose
// rows are T0..T3. M0..M3 must hold the selectors from mask<>.
// Clobbers TMP0..TMP3.
#define TRANSPOSE_MATRIX(T0, T1, T2, T3, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3) \
	VPERM T0, T1, M0, TMP0; \
	VPERM T2, T3, M0, TMP1; \
	VPERM T0, T1, M1, TMP2; \
	VPERM T2, T3, M1, TMP3; \
	VPERM TMP0, TMP1, M2, T0; \
	VPERM TMP0, TMP1, M3, T1; \
	VPERM TMP2, TMP3, M2, T2; \
	VPERM TMP2, TMP3, M3, T3

// r = s <<< n (rotate every 32-bit lane left by n bits)
#define PROLD(s, r, n) \
	VERLLF $n, s, r

// loadWordByIndex loads schedule slot i (16 bytes) from the buffer into W.
#define loadWordByIndex(W, i) \
	VL (16*(i))(statePtr), W

// prepare4Words reads the next 16 bytes of each of the four source streams,
// transposes them so that each vector holds the same message word of all four
// streams, and appends the four resulting slots to the schedule buffer.
// One schedule slot is 16 bytes. Advances srcPtrPtr (byte offset into the
// sources) by 16 and wordPtr by 64. Clobbers V16..V19 and TMP0..TMP3.
#define prepare4Words \
	VL (srcPtr1)(srcPtrPtr*1), V16; \
	VL (srcPtr2)(srcPtrPtr*1), V17; \
	VL (srcPtr3)(srcPtrPtr*1), V18; \
	VL (srcPtr4)(srcPtrPtr*1), V19; \
	TRANSPOSE_MATRIX(V16, V17, V18, V19, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3); \
	VSTM V16, V19, (wordPtr); \
	LAY 16(srcPtrPtr), srcPtrPtr; \
	ADD $64, wordPtr

// ROUND_00_11 performs one SM3 round using the XOR forms of the boolean
// functions. It reads schedule slots index and index+4, which must already be
// in the buffer. Instead of moving values between registers, the round updates
// b, d, f and h in place and the caller rotates the argument list by one
// position for the next round. Clobbers TMP0..TMP4.
#define ROUND_00_11(index, a, b, c, d, e, f, g, h) \
	PROLD(a, TMP0, 12) \
	VLR TMP0, TMP1 \
	VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet.
	VAF TMP2, TMP0, TMP0 \
	VAF e, TMP0, TMP0 \
	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
	VX TMP2, TMP1, TMP0 \ // TMP0 = SS2
	VX a, b, TMP1 \
	VX c, TMP1, TMP1 \
	VAF TMP1, d, TMP1 \ // TMP1 = (a XOR b XOR c) + d
	loadWordByIndex(TMP3, index) \
	loadWordByIndex(TMP4, index+4) \
	VX TMP3, TMP4, TMP4 \
	VAF TMP4, TMP1, TMP1 \ // TMP1 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VAF TMP1, TMP0, TMP1 \ // TMP1 = TT1
	VAF h, TMP3, TMP3 \
	VAF TMP3, TMP2, TMP3 \ // Wt + h + SS1
	VX e, f, TMP4 \
	VX g, TMP4, TMP4 \
	VAF TMP4, TMP3, TMP3 \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VLR b, TMP4 \
	PROLD(TMP4, b, 9) \ // b = b <<< 9
	VLR TMP1, h \ // h = TT1
	VLR f, TMP4 \
	PROLD(TMP4, f, 19) \ // f = f <<< 19
	PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
	PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
	VX TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
	VX TMP4, TMP0, d // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// MESSAGE_SCHEDULE expands one schedule word Wj with j = index+4 from slots
// index-12, index-9, index-5, index-2 and index+1, stores it at wordPtr and
// advances wordPtr by one slot. wordPtr is expected to point at slot index+4.
// On exit TMP1 holds the new word. Clobbers TMP0..TMP2.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(TMP0, index+1) \ // Wj-3
	PROLD(TMP0, TMP1, 15) \
	loadWordByIndex(TMP0, index-12) \ // Wj-16
	VX TMP0, TMP1, TMP0 \
	loadWordByIndex(TMP1, index-5) \ // Wj-9
	VX TMP0, TMP1, TMP0 \
	PROLD(TMP0, TMP1, 15) \
	PROLD(TMP1, TMP2, 8) \
	VX TMP1, TMP0, TMP0 \
	VX TMP2, TMP0, TMP0 \ // P1
	loadWordByIndex(TMP1, index-9) \ // Wj-13
	PROLD(TMP1, TMP2, 7) \
	VX TMP2, TMP0, TMP0 \
	loadWordByIndex(TMP1, index-2) \ // Wj-6
	VX TMP1, TMP0, TMP1 \
	VST TMP1, (wordPtr) \
	ADD $16, wordPtr

// ROUND_12_15 first produces schedule slot index+4, then runs a round with the
// XOR boolean functions.
#define ROUND_12_15(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \
	ROUND_00_11(index, a, b, c, d, e, f, g, h)

// ROUND_16_63 produces schedule slot index+4 and runs a round with the
// majority / choose boolean functions. Same in-place update and argument
// rotation convention as ROUND_00_11. Clobbers TMP0..TMP4.
#define ROUND_16_63(index, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index) \ // TMP1 holds Wt+4 from here on; do not clobber it before it is consumed
	PROLD(a, TMP0, 12) \
	VLR TMP0, TMP4 \
	VLREPF (index*4)(kPtr), TMP2 \ // It seems that the VREPIF instruction is not supported yet.
	VAF TMP2, TMP0, TMP0 \
	VAF e, TMP0, TMP0 \
	PROLD(TMP0, TMP2, 7) \ // TMP2 = SS1
	VX TMP2, TMP4, TMP0 \ // TMP0 = SS2
	VO a, b, TMP3 \
	VN a, b, TMP4 \
	VN c, TMP3, TMP3 \
	VO TMP4, TMP3, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c)
	VAF TMP4, d, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(TMP3, index) \ // Wj
	VX TMP3, TMP1, TMP1 \ // Wj XOR Wj+4
	VAF TMP4, TMP1, TMP4 \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VAF TMP4, TMP0, TMP4 \ // TT1
	VAF h, TMP3, TMP3 \ // Wt + h
	VAF TMP2, TMP3, TMP3 \ // Wt + h + SS1
	VX f, g, TMP1 \
	VN TMP1, e, TMP1 \
	VX g, TMP1, TMP1 \ // (f XOR g) AND e XOR g
	VAF TMP3, TMP1, TMP3 \ // TT2
	VLR b, TMP1 \
	PROLD(TMP1, b, 9) \ // b = b <<< 9
	VLR TMP4, h \ // h = TT1
	VLR f, TMP1 \
	PROLD(TMP1, f, 19) \ // f = f <<< 19
	PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9
	PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17
	VX TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
	VX TMP1, TMP0, d // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// func copyResultsBy4(dig *uint32, dst *byte)
//
// Copies 128 bytes (eight 16-byte vectors) from dig to dst.
// Clobbers R3, R4 and V0..V7.
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#define digPtr R3
#define dstPtr R4
	MOVD dig+0(FP), digPtr
	MOVD dst+8(FP), dstPtr

	VLM (digPtr), V0, V7
	VSTM V0, V7, (dstPtr)

	RET
#undef digPtr
#undef dstPtr

// func blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the SM3 compression function over `blocks` consecutive 64-byte blocks
// of four message streams in parallel.
//
//   dig     points to four pointers, each to an 8-word chaining value that is
//           read on entry and overwritten with the result on exit.
//   p       points to four source pointers; 64*blocks bytes are read from each.
//   buffer  scratch space for the expanded message schedule. Slots 0..67 of
//           16 bytes each are written, so it must hold at least 1088 bytes.
//   blocks  number of blocks to process; a value <= 0 returns immediately
//           without touching dig, p or buffer.
//
// Round constants are read as 32-bit words from the table ·_K, which is
// defined outside this file (presumably the 64 pre-rotated SM3 constants;
// verify against its definition).
//
// Used general purpose registers R1-R11.
// NOTE(review): the Go assembler guide lists R10 and R11 as reserved assembler
// temporaries on s390x, and this routine keeps wordPtr and digPtr in them.
// Confirm that no instruction added here needs assembler-generated scratch
// registers (e.g. through an out-of-range displacement).
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R11
#define srcPtrPtr R1
#define statePtr R2
#define kPtr R3
#define blockCount R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD dig+0(FP), digPtr
	MOVD p+8(FP), srcPtrPtr
	MOVD buffer+16(FP), statePtr
	MOVD blocks+24(FP), blockCount

	// The block loop below tests its counter only after the body has run, so
	// without this guard a non-positive count would still consume one block
	// from every source.
	CMPBLE blockCount, $0, done

	// load state: one digest (8 words) per register pair
	MOVD 0(digPtr), R4
	VLM (R4), a, e
	MOVD 8(digPtr), R4
	VLM (R4), b, f
	MOVD 16(digPtr), R4
	VLM (R4), c, g
	MOVD 24(digPtr), R4
	VLM (R4), d, h

	MOVD $mask<>+0x00(SB), R4
	VLM (R4), M0, M3

	// Switch from "one digest per register pair" to "one working variable of
	// all four digests per register".
	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)

	// From here on srcPtrPtr is reused as the running byte offset into the
	// four sources; it keeps growing across blocks.
	MOVD (srcPtrPtr), srcPtr1
	MOVD 8(srcPtrPtr), srcPtr2
	MOVD 16(srcPtrPtr), srcPtr3
	MOVD 24(srcPtrPtr), srcPtr4
	MOVD $0, srcPtrPtr

	MOVD $·_K+0(SB), kPtr

loop:
	// save state
	VLR a, aSave
	VLR b, bSave
	VLR c, cSave
	VLR d, dSave
	VLR e, eSave
	VLR f, fSave
	VLR g, gSave
	VLR h, hSave

	// reset wordPtr
	MOVD statePtr, wordPtr

	// load message block: 4 x 4 words fill schedule slots 0..15
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, b, c, d, e, f, g, h, a)
	ROUND_16_63(48, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, b, c, d, e, f, g, h, a)

	// Feed-forward: XOR the round output with the chaining value saved at the
	// top of the loop. After 64 rounds the rotating argument list is back in
	// its starting position, so a..h line up with aSave..hSave.
	VX a, aSave, a
	VX b, bSave, b
	VX c, cSave, c
	VX d, dSave, d
	VX e, eSave, e
	VX f, fSave, f
	VX g, gSave, g
	VX h, hSave, h

	SUB $1, blockCount
	CMPBGT blockCount, $0, loop

	// Back to "one digest per register pair" before storing.
	TRANSPOSE_MATRIX(a, b, c, d, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)
	TRANSPOSE_MATRIX(e, f, g, h, M0, M1, M2, M3, TMP0, TMP1, TMP2, TMP3)

	MOVD 0(digPtr), R4
	VSTM a, e, (R4)
	MOVD 8(digPtr), R4
	VSTM b, f, (R4)
	MOVD 16(digPtr), R4
	VSTM c, g, (R4)
	MOVD 24(digPtr), R4
	VSTM d, h, (R4)

done:
	RET