// Copyright 2024 Sun Yimin. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
#include "sm3_const_asm.s"

// Working variables of blockMultBy4. After the state has been transposed,
// each vector register holds one SM3 state word for four independent
// digests, one digest per 32-bit lane.
// The numbering is interleaved (a=V0, e=V1, b=V2, f=V3, ...) so that every
// register pair named in the VLD1/VST1 lists of blockMultBy4
// ([a, e], [b, f], [c, g], [d, h]) is a pair of consecutive registers.
#define a V0
#define e V1
#define b V2
#define f V3
#define c V4
#define g V5
#define d V6
#define h V7

// Scratch registers. The round and message-schedule macros below also use
// V10-V15 by number; V10 and V11 are the same registers as tmp3 and tmp4.
#define tmp1 V8
#define tmp2 V9
#define tmp3 V10
#define tmp4 V11

// Copy of the working variables taken at the top of every block iteration in
// blockMultBy4 and XORed back into them after the 64 rounds.
#define aSave V24
#define bSave V25
#define cSave V26
#define dSave V27
#define eSave V28
#define fSave V29
#define gSave V30
#define hSave V31

// Transposes, in place, the 4x4 matrix of 32-bit words held in t0..t3.
// RTMP0..RTMP3 are clobbered.
//
// input: from high to low
// t0 = t0.S3, t0.S2, t0.S1, t0.S0
// t1 = t1.S3, t1.S2, t1.S1, t1.S0
// t2 = t2.S3, t2.S2, t2.S1, t2.S0
// t3 = t3.S3, t3.S2, t3.S1, t3.S0
// output: from high to low
// t0 = t3.S0, t2.S0, t1.S0, t0.S0
// t1 = t3.S1, t2.S1, t1.S1, t0.S1
// t2 = t3.S2, t2.S2, t1.S2, t0.S2
// t3 = t3.S3, t2.S3, t1.S3, t0.S3
#define TRANSPOSE_MATRIX(t0, t1, t2, t3, RTMP0, RTMP1, RTMP2, RTMP3) \
	VZIP1 t1.S4, t0.S4, RTMP0.S4               \
	VZIP1 t3.S4, t2.S4, RTMP1.S4               \
	VZIP2 t1.S4, t0.S4, RTMP2.S4               \
	VZIP2 t3.S4, t2.S4, RTMP3.S4               \
	VZIP1 RTMP1.D2, RTMP0.D2, t0.D2            \
	VZIP2 RTMP1.D2, RTMP0.D2, t1.D2            \
	VZIP1 RTMP3.D2, RTMP2.D2, t2.D2            \
	VZIP2 RTMP3.D2, RTMP2.D2, t3.D2            \

// r = s <<< n, applied to each of the four 32-bit lanes.
// s and r must be different registers: VSHL overwrites r before VSRI reads s
// again to insert the bits that were shifted out.
#define PROLD(s, r, n) \
	VSHL $(n), s.S4, r.S4    \
	VSRI $(32-n), s.S4, r.S4 \

// Loads the 16-byte slot i of the expanded-message buffer (address
// wordStart + 16*i) into W. Clobbers R20.
#define loadWordByIndex(W, i) \
	ADD $(16*(i)), wordStart, R20 \
	VLD1 (R20), [W.S4]            \

// Reads 16 bytes from each of the four source pointers (advancing each by 16),
// transposes the 4x4 word matrix so that each vector holds the same word
// position of all four inputs, reverses the bytes of every 32-bit word, and
// appends the resulting 64 bytes at wordPtr (advancing it by 64).
// Clobbers V12-V15 and tmp1-tmp4.
#define prepare4Words \
	VLD1.P 16(srcPtr1), [V12.B16] \
	VLD1.P 16(srcPtr2), [V13.B16] \
	VLD1.P 16(srcPtr3), [V14.B16] \
	VLD1.P 16(srcPtr4), [V15.B16] \
	TRANSPOSE_MATRIX(V12, V13, V14, V15, tmp1, tmp2, tmp3, tmp4); \
	VREV32 V12.B16, V12.B16; \
	VREV32 V13.B16, V13.B16; \
	VREV32 V14.B16, V14.B16; \
	VREV32 V15.B16, V15.B16; \
	VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(wordPtr)

// Broadcasts the 32-bit constant `const` into all four lanes of T.
// Clobbers R20. The T0, T1, ... symbols passed by the callers are not defined
// in this file; presumably they come from sm3_const_asm.s.
#define LOAD_T(const, T) \
	MOVW $const, R20 \
	VDUP R20, T.S4   \

// One compression round using the XOR forms of the boolean functions
// (a XOR b XOR c and e XOR f XOR g). It reads W[index] and W[index+4] from
// the expanded-message buffer, so both slots must already have been written.
//
// On exit: h = TT1, d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17), b = b <<< 9,
// f = f <<< 19. The remaining working variables are untouched; the caller
// performs the per-round renaming by rotating the argument list.
// Clobbers tmp1 (V8), V10-V14 and R20.
#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
	PROLD(a, V12, 12)                          \ // V12 = a <<< 12
	VMOV V12.B16, V13.B16                      \ // V13 = a <<< 12, kept for SS2
	LOAD_T(const, tmp1)                        \
	VADD tmp1.S4, V12.S4, V12.S4               \
	VADD e.S4, V12.S4, V12.S4                  \ // V12 = (a <<< 12) + T + e
	PROLD(V12, V14, 7)                         \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16             \ // V12 = SS2
	VEOR a.B16, b.B16, V13.B16                 \
	VEOR c.B16, V13.B16, V13.B16               \
	VADD V13.S4, d.S4, V13.S4                  \ // V13 = (a XOR b XOR c) + d
	loadWordByIndex(V10, index)                \
	loadWordByIndex(V11, index+4)              \
	VEOR V10.B16, V11.B16, V11.B16             \
	VADD V11.S4, V13.S4, V13.S4                \ // V13 = (a XOR b XOR c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4                \ // TT1
	VADD h.S4, V10.S4, V10.S4                  \
	VADD V14.S4, V10.S4, V10.S4                \ // Wt + h + SS1
	VEOR e.B16, f.B16, V11.B16                 \
	VEOR g.B16, V11.B16, V11.B16               \
	VADD V11.S4, V10.S4, V10.S4                \ // TT2 = (e XOR f XOR g) + Wt + h + SS1
	VMOV b.B16, V11.B16                        \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, b, 9)                           \ // b = b <<< 9
	VMOV V13.B16, h.B16                        \ // h = TT1
	VMOV f.B16, V11.B16                        \
	PROLD(V11, f, 19)                          \ // f = f <<< 19
	PROLD(V10, V11, 9)                         \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)                         \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16             \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16               \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// Computes expanded-message word Wj with j = index+4 from slots j-16, j-13,
// j-9, j-6 and j-3 of the buffer:
//   Wj = P1(Wj-16 XOR Wj-9 XOR (Wj-3 <<< 15)) XOR (Wj-13 <<< 7) XOR Wj-6
// where P1(x) = x XOR (x <<< 15) XOR (x <<< 23).
// The result is left in V11 and stored through wordPtr (advancing it by 16),
// so wordPtr must address slot index+4 when the macro is invoked.
// Clobbers V10-V12 and R20.
#define MESSAGE_SCHEDULE(index) \
	loadWordByIndex(V10, index+1)              \ // Wj-3
	PROLD(V10, V11, 15)                        \
	loadWordByIndex(V10, index-12)             \ // Wj-16
	VEOR V10.B16, V11.B16, V10.B16             \
	loadWordByIndex(V11, index-5)              \ // Wj-9
	VEOR V10.B16, V11.B16, V10.B16             \
	PROLD(V10, V11, 15)                        \
	PROLD(V11, V12, 8)                         \ // V12 = x <<< 23
	VEOR V11.B16, V10.B16, V10.B16             \
	VEOR V12.B16, V10.B16, V10.B16             \ // P1
	loadWordByIndex(V11, index-9)              \ // Wj-13
	PROLD(V11, V12, 7)                         \
	VEOR V12.B16, V10.B16, V10.B16             \
	loadWordByIndex(V11, index-2)              \ // Wj-6
	VEOR V11.B16, V10.B16, V11.B16             \
	VST1.P [V11.S4], 16(wordPtr)               \

// Rounds 12..15: W[index+4] is not part of the input block, so it is
// generated (and stored to the buffer) before running the XOR-form round,
// which then reloads it from the buffer.
#define ROUND_12_15(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                               \
	ROUND_00_11(index, const, a, b, c, d, e, f, g, h)     \

// One compression round using the majority / choose forms of the boolean
// functions: (a AND b) OR (a AND c) OR (b AND c) and ((f XOR g) AND e) XOR g.
// W[index+4] is generated first and consumed straight from V11.
//
// On exit: h = TT1, d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17), b = b <<< 9,
// f = f <<< 19; the caller renames the registers by rotating the argument list.
// Clobbers tmp1 (V8), V10-V14 and R20; advances wordPtr by 16.
#define ROUND_16_63(index, const, a, b, c, d, e, f, g, h) \
	MESSAGE_SCHEDULE(index)                    \ // V11 = Wt+4; keep it live until the XOR with Wt below
	PROLD(a, V12, 12)                          \ // V12 = a <<< 12
	VMOV V12.B16, V13.B16                      \ // V13 = a <<< 12, kept for SS2
	LOAD_T(const, tmp1)                        \
	VADD tmp1.S4, V12.S4, V12.S4               \
	VADD e.S4, V12.S4, V12.S4                  \ // V12 = (a <<< 12) + T + e
	PROLD(V12, V14, 7)                         \ // V14 = SS1
	VEOR V14.B16, V13.B16, V12.B16             \ // V12 = SS2
	VORR a.B16, b.B16, V10.B16                 \
	VAND a.B16, b.B16, V13.B16                 \
	VAND c.B16, V10.B16, V10.B16               \
	VORR V13.B16, V10.B16, V13.B16             \ // (a AND b) OR (a AND c) OR (b AND c)
	VADD V13.S4, d.S4, V13.S4                  \ // (a AND b) OR (a AND c) OR (b AND c) + d
	loadWordByIndex(V10, index)                \ // Wj
	VEOR V10.B16, V11.B16, V11.B16             \ // Wj XOR Wj+4
	VADD V13.S4, V11.S4, V13.S4                \ // (a AND b) OR (a AND c) OR (b AND c) + d + (Wt XOR Wt+4)
	VADD V13.S4, V12.S4, V13.S4                \ // TT1
	VADD h.S4, V10.S4, V10.S4                  \ // Wt + h
	VADD V14.S4, V10.S4, V10.S4                \ // Wt + h + SS1
	VEOR f.B16, g.B16, V11.B16                 \
	VAND V11.B16, e.B16, V11.B16               \
	VEOR g.B16, V11.B16, V11.B16               \ // (f XOR g) AND e XOR g
	VADD V10.S4, V11.S4, V10.S4                \ // TT2
	VMOV b.B16, V11.B16                        \ // copy first: PROLD needs distinct source and destination
	PROLD(V11, b, 9)                           \ // b = b <<< 9
	VMOV V13.B16, h.B16                        \ // h = TT1
	VMOV f.B16, V11.B16                        \
	PROLD(V11, f, 19)                          \ // f = f <<< 19
	PROLD(V10, V11, 9)                         \ // V11 = TT2 <<< 9
	PROLD(V11, V12, 8)                         \ // V12 = TT2 <<< 17
	VEOR V10.B16, V11.B16, V11.B16             \ // V11 = TT2 XOR (TT2 <<< 9)
	VEOR V11.B16, V12.B16, d.B16               \ // d = TT2 XOR (TT2 <<< 9) XOR (TT2 <<< 17)

// blockMultBy4(dig **[8]uint32, p **byte, buffer *byte, blocks int)
//
// Runs the SM3 compression function over `blocks` consecutive 64-byte blocks
// of four independent inputs in parallel, one input per 32-bit vector lane.
//
//   dig    - address of four consecutive pointers, each to an 8-word (32-byte)
//            digest state; the states are read on entry and overwritten with
//            the updated values on return.
//   p      - address of four consecutive pointers to the input data; each
//            input is read 64 bytes per block. The advanced pointers are kept
//            in registers only and are not written back.
//   buffer - scratch area for the expanded message words. Each block iteration
//            writes 4*64 bytes via prepare4Words plus 16 bytes in each of
//            rounds 12..63, i.e. 68*16 = 1088 bytes starting at buffer.
//   blocks - number of 64-byte blocks to process per input. A value <= 0
//            returns immediately and leaves the digests untouched (previously
//            0 made the decrement-then-test loop counter wrap around).
//
// Registers used: R0, R1, R3-R10, R20, V0-V15, V24-V31.
TEXT ·blockMultBy4(SB), NOSPLIT, $0
#define digPtr R0
#define srcPtrPtr R1
#define blockCount R3
#define digSave R4
#define wordStart R5
#define srcPtr1 R6
#define srcPtr2 R7
#define srcPtr3 R8
#define srcPtr4 R9
#define wordPtr R10
	MOVD	dig+0(FP), digPtr
	MOVD	p+8(FP), srcPtrPtr
	MOVD	buffer+16(FP), wordStart
	MOVD	blocks+24(FP), blockCount

	// The loop below decrements before testing, so it must never be entered
	// with a non-positive count.
	CMP	$0, blockCount
	BLE	done

	// load state: [a, e] <- digest 0, [b, f] <- digest 1,
	// [c, g] <- digest 2, [d, h] <- digest 3
	MOVD	digPtr, digSave
	MOVD.P	8(digPtr), R20
	VLD1	(R20), [a.S4, e.S4]
	MOVD.P	8(digPtr), R20
	VLD1	(R20), [b.S4, f.S4]
	MOVD.P	8(digPtr), R20
	VLD1	(R20), [c.S4, g.S4]
	MOVD	(digPtr), R20
	VLD1	(R20), [d.S4, h.S4]

	// transpose state: afterwards each register holds one state word of all
	// four digests
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	MOVD.P	8(srcPtrPtr), srcPtr1
	MOVD.P	8(srcPtrPtr), srcPtr2
	MOVD.P	8(srcPtrPtr), srcPtr3
	MOVD	(srcPtrPtr), srcPtr4

loop:
	// save state
	VMOV	a.B16, aSave.B16
	VMOV	b.B16, bSave.B16
	VMOV	c.B16, cSave.B16
	VMOV	d.B16, dSave.B16
	VMOV	e.B16, eSave.B16
	VMOV	f.B16, fSave.B16
	VMOV	g.B16, gSave.B16
	VMOV	h.B16, hSave.B16

	// reset wordPtr
	MOVD	wordStart, wordPtr

	// load message block: four calls fill slots 0..15 of the buffer and leave
	// wordPtr at slot 16, where the first MESSAGE_SCHEDULE (round 12) stores
	prepare4Words
	prepare4Words
	prepare4Words
	prepare4Words

	ROUND_00_11(0, T0, a, b, c, d, e, f, g, h)
	ROUND_00_11(1, T1, h, a, b, c, d, e, f, g)
	ROUND_00_11(2, T2, g, h, a, b, c, d, e, f)
	ROUND_00_11(3, T3, f, g, h, a, b, c, d, e)
	ROUND_00_11(4, T4, e, f, g, h, a, b, c, d)
	ROUND_00_11(5, T5, d, e, f, g, h, a, b, c)
	ROUND_00_11(6, T6, c, d, e, f, g, h, a, b)
	ROUND_00_11(7, T7, b, c, d, e, f, g, h, a)
	ROUND_00_11(8, T8, a, b, c, d, e, f, g, h)
	ROUND_00_11(9, T9, h, a, b, c, d, e, f, g)
	ROUND_00_11(10, T10, g, h, a, b, c, d, e, f)
	ROUND_00_11(11, T11, f, g, h, a, b, c, d, e)

	ROUND_12_15(12, T12, e, f, g, h, a, b, c, d)
	ROUND_12_15(13, T13, d, e, f, g, h, a, b, c)
	ROUND_12_15(14, T14, c, d, e, f, g, h, a, b)
	ROUND_12_15(15, T15, b, c, d, e, f, g, h, a)

	ROUND_16_63(16, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(17, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(18, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(19, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(20, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(21, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(22, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(23, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(24, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(25, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(26, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(27, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(28, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(29, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(30, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(31, T31, b, c, d, e, f, g, h, a)
	ROUND_16_63(32, T32, a, b, c, d, e, f, g, h)
	ROUND_16_63(33, T33, h, a, b, c, d, e, f, g)
	ROUND_16_63(34, T34, g, h, a, b, c, d, e, f)
	ROUND_16_63(35, T35, f, g, h, a, b, c, d, e)
	ROUND_16_63(36, T36, e, f, g, h, a, b, c, d)
	ROUND_16_63(37, T37, d, e, f, g, h, a, b, c)
	ROUND_16_63(38, T38, c, d, e, f, g, h, a, b)
	ROUND_16_63(39, T39, b, c, d, e, f, g, h, a)
	ROUND_16_63(40, T40, a, b, c, d, e, f, g, h)
	ROUND_16_63(41, T41, h, a, b, c, d, e, f, g)
	ROUND_16_63(42, T42, g, h, a, b, c, d, e, f)
	ROUND_16_63(43, T43, f, g, h, a, b, c, d, e)
	ROUND_16_63(44, T44, e, f, g, h, a, b, c, d)
	ROUND_16_63(45, T45, d, e, f, g, h, a, b, c)
	ROUND_16_63(46, T46, c, d, e, f, g, h, a, b)
	ROUND_16_63(47, T47, b, c, d, e, f, g, h, a)
	// Rounds 48..63 are passed T16..T31 again.
	// NOTE(review): presumably the per-round constants repeat with period 32
	// from round 16 on; confirm against sm3_const_asm.s.
	ROUND_16_63(48, T16, a, b, c, d, e, f, g, h)
	ROUND_16_63(49, T17, h, a, b, c, d, e, f, g)
	ROUND_16_63(50, T18, g, h, a, b, c, d, e, f)
	ROUND_16_63(51, T19, f, g, h, a, b, c, d, e)
	ROUND_16_63(52, T20, e, f, g, h, a, b, c, d)
	ROUND_16_63(53, T21, d, e, f, g, h, a, b, c)
	ROUND_16_63(54, T22, c, d, e, f, g, h, a, b)
	ROUND_16_63(55, T23, b, c, d, e, f, g, h, a)
	ROUND_16_63(56, T24, a, b, c, d, e, f, g, h)
	ROUND_16_63(57, T25, h, a, b, c, d, e, f, g)
	ROUND_16_63(58, T26, g, h, a, b, c, d, e, f)
	ROUND_16_63(59, T27, f, g, h, a, b, c, d, e)
	ROUND_16_63(60, T28, e, f, g, h, a, b, c, d)
	ROUND_16_63(61, T29, d, e, f, g, h, a, b, c)
	ROUND_16_63(62, T30, c, d, e, f, g, h, a, b)
	ROUND_16_63(63, T31, b, c, d, e, f, g, h, a)

	// feed-forward: XOR the state saved at the top of the iteration back in
	VEOR	a.B16, aSave.B16, a.B16
	VEOR	b.B16, bSave.B16, b.B16
	VEOR	c.B16, cSave.B16, c.B16
	VEOR	d.B16, dSave.B16, d.B16
	VEOR	e.B16, eSave.B16, e.B16
	VEOR	f.B16, fSave.B16, f.B16
	VEOR	g.B16, gSave.B16, g.B16
	VEOR	h.B16, hSave.B16, h.B16

	SUB	$1, blockCount
	CBNZ	blockCount, loop

	// transpose state back to one digest per register pair
	TRANSPOSE_MATRIX(a, b, c, d, tmp1, tmp2, tmp3, tmp4)
	TRANSPOSE_MATRIX(e, f, g, h, tmp1, tmp2, tmp3, tmp4)

	// store state through the four digest pointers (digSave is the original
	// value of dig, since digPtr was advanced while loading)
	MOVD.P	8(digSave), R20
	VST1	[a.S4, e.S4], (R20)
	MOVD.P	8(digSave), R20
	VST1	[b.S4, f.S4], (R20)
	MOVD.P	8(digSave), R20
	VST1	[c.S4, g.S4], (R20)
	MOVD	(digSave), R20
	VST1	[d.S4, h.S4], (R20)

done:
	RET

#undef digPtr
#undef a
#undef b
#undef c
#undef d
#undef e
#undef f
#undef g
#undef h

// copyResultsBy4 uses eight consecutive registers so that they can be named
// in four-register VLD1/VST1 lists.
#define a V0
#define b V1
#define c V2
#define d V3
#define e V4
#define f V5
#define g V6
#define h V7
// func copyResultsBy4(dig *uint32, dst *byte)
//
// Copies 128 bytes (32 words of 32 bits) from dig to dst, reversing the byte
// order inside every 32-bit word.
TEXT ·copyResultsBy4(SB),NOSPLIT,$0
#define digPtr R0
#define dstPtr R1
	MOVD	dig+0(FP), digPtr
	MOVD	dst+8(FP), dstPtr

	// load state
	VLD1.P	64(digPtr), [a.S4, b.S4, c.S4, d.S4]
	VLD1	(digPtr), [e.S4, f.S4, g.S4, h.S4]

	// byte-swap each 32-bit word
	VREV32	a.B16, a.B16
	VREV32	b.B16, b.B16
	VREV32	c.B16, c.B16
	VREV32	d.B16, d.B16
	VREV32	e.B16, e.B16
	VREV32	f.B16, f.B16
	VREV32	g.B16, g.B16
	VREV32	h.B16, h.B16

	VST1.P	[a.B16, b.B16, c.B16, d.B16], 64(dstPtr)
	VST1	[e.B16, f.B16, g.B16, h.B16], (dstPtr)

	RET