//go:build !purego

#include "textflag.h"

#include "sm3_const_asm.s"

// xorm(P1, P2): P1 is a memory operand, P2 is a register.
// The register is XORed into the memory word, then the updated memory word
// is loaded back into the register, so both end up holding (mem XOR reg).
#define xorm(P1, P2) \
	XORL	P2, P1; \
	MOVL	P1, P2

// Registers holding the eight 32-bit working variables a..h.
#define a R8
#define b R9
#define c R10
#define d R11
#define e R12
#define f R13
#define g R14
#define h DI

// MSGSCHEDULE0(index): W[index] = M[index], used for words 0..3.
// Loads the 32-bit word at (index*4)(SI), byte-swaps it and stores it at
// (index*4)(BP). AX is overwritten.
#define MSGSCHEDULE0(index) \
	MOVL	(index*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, (index*4)(BP)

// MSGSCHEDULE01(index): W[index+4] = M[index+4], for 0 <= index <= 11.
// Same load / byte-swap / store as MSGSCHEDULE0, four words ahead.
// The word is also left in AX for the TT1 macro that follows in the round.
#define MSGSCHEDULE01(index) \
	MOVL	((index+4)*4)(SI), AX; \
	BSWAPL	AX; \
	MOVL	AX, ((index+4)*4)(BP)

// MSGSCHEDULE1(index): message expansion, for 12 <= index <= 63.
//   x          = W[index-12] XOR W[index-5] XOR ROTL(15, W[index+1])
//   P1(x)      = x XOR ROTL(15, x) XOR ROTL(23, x)
//   W[index+4] = P1(x) XOR ROTL(7, W[index-9]) XOR W[index-2]
// ROTL(23, x) is produced by rotating the ROTL(15, x) copy by a further 8 bits.
// The new word is stored at ((index+4)*4)(BP) and also left in AX.
// BX is used as scratch.
#define MSGSCHEDULE1(index) \
	MOVL	((index+1)*4)(BP), AX; \
	ROLL	$15, AX; \
	MOVL	((index-12)*4)(BP), BX; \
	XORL	BX, AX; \
	MOVL	((index-5)*4)(BP), BX; \
	XORL	BX, AX; \
	MOVL	AX, BX; \
	ROLL	$15, BX; \
	XORL	BX, AX; \
	ROLL	$8, BX; \
	XORL	BX, AX; \
	MOVL	((index-9)*4)(BP), BX; \
	ROLL	$7, BX; \
	XORL	BX, AX; \
	MOVL	((index-2)*4)(BP), BX; \
	XORL	BX, AX; \
	MOVL	AX, ((index+4)*4)(BP)

// SM3SS1(const, a, e): compute ss1 into BX.
//   BX = ROTL(7, ROTL(12, a) + e + const)
// const is added as an immediate; presumably it is the round constant already
// rotated by the round index (the T* values come from sm3_const_asm.s, which
// is not visible here).
#define SM3SS1(const, a, e) \
	MOVL	a, BX; \
	ROLL	$12, BX; \
	ADDL	e, BX; \
	ADDL	$const, BX; \
	ROLL	$7, BX

// SM3TT10(index, a, b, c, d): compute tt1 into CX using the XOR form.
//   tt1 = (a XOR b XOR c) + d + (ROTL(12, a) XOR ss1) + (W[index] XOR W[index+4])
// On entry BX must hold ss1 and AX must hold W[index+4].
// Steps: DX accumulates (a XOR b XOR c) + d, AX becomes W[index] XOR W[index+4]
// and is added to DX, CX becomes ROTL(12, a) XOR ss1, and DX is then added to CX.
// AX and DX are overwritten; BX (ss1) is left untouched.
#define SM3TT10(index, a, b, c, d) \
	MOVL	b, DX; \
	XORL	a, DX; \
	XORL	c, DX; \
	ADDL	d, DX; \
	MOVL	((index)*4)(BP), CX; \
	XORL	CX, AX; \
	ADDL	AX, DX; \
	MOVL	a, CX; \
	ROLL	$12, CX; \
	XORL	BX, CX; \
	ADDL	DX, CX
// SM3TT20(index, e, f, g, h): compute tt2 into BX using the XOR form.
//   tt2 = (e XOR f XOR g) + h + ss1 + W[index]
// On entry BX must hold ss1. DX first accumulates W[index] + h + ss1, then BX
// is reused for (e XOR f XOR g) and DX is added to it.
// DX is overwritten; ss1 is no longer available afterwards.
#define SM3TT20(index, e, f, g, h) \
	MOVL	((index)*4)(BP), DX; \
	ADDL	h, DX; \
	ADDL	BX, DX; \
	MOVL	e, BX; \
	XORL	f, BX; \
	XORL	g, BX; \
	ADDL	DX, BX

// SM3TT11(index, a, b, c, d): compute tt1 into CX using the majority form.
//   tt1 = ((a AND b) OR (a AND c) OR (b AND c)) + d
//         + (ROTL(12, a) XOR ss1) + (W[index] XOR W[index+4])
// The majority term is evaluated as ((a OR b) AND c) OR (a AND b):
// DX holds (a OR b) AND c, CX holds a AND b, and the two are ORed into DX.
// On entry BX must hold ss1 and AX must hold W[index+4].
// AX and DX are overwritten; BX (ss1) is left untouched.
#define SM3TT11(index, a, b, c, d) \
	MOVL	a, DX; \
	ORL	b, DX; \
	MOVL	a, CX; \
	ANDL	b, CX; \
	ANDL	c, DX; \
	ORL	CX, DX; \
	ADDL	d, DX; \
	MOVL	a, CX; \
	ROLL	$12, CX; \
	XORL	BX, CX; \
	ADDL	DX, CX; \
	MOVL	((index)*4)(BP), DX; \
	XORL	DX, AX; \
	ADDL	AX, CX

// SM3TT21(index, e, f, g, h): compute tt2 into BX using the choose form.
//   tt2 = ((e AND f) OR (NOT(e) AND g)) + h + ss1 + W[index]
// The choose term is evaluated as ((f XOR g) AND e) XOR g.
// On entry BX must hold ss1. DX accumulates W[index] + h + ss1 and is
// overwritten; ss1 is no longer available afterwards.
#define SM3TT21(index, e, f, g, h) \
	MOVL	((index)*4)(BP), DX; \
	ADDL	h, DX; \
	ADDL	BX, DX; \
	MOVL	f, BX; \
	XORL	g, BX; \
	ANDL	e, BX; \
	XORL	g, BX; \
	ADDL	DX, BX

// COPYRESULT(b, d, f, h): finish a round, given tt1 in CX and tt2 in BX.
//   b = ROTL(9, b)
//   h = tt1
//   f = ROTL(19, f)
//   d = tt2 XOR ROTL(9, tt2) XOR ROTL(17, tt2)
// No values are moved between the other registers: the round invocations pass
// the register names rotated by one position each round, so the register
// written through h is read as "a" by the next round and the register written
// through d is read as "e". BX and CX are overwritten.
#define COPYRESULT(b, d, f, h) \
	ROLL	$9, b; \
	MOVL	CX, h; \
	ROLL	$19, f; \
	MOVL	BX, CX; \
	ROLL	$9, CX; \
	XORL	BX, CX; \
	ROLL	$17, BX; \
	XORL	BX, CX; \
	MOVL	CX, d

// SM3ROUND0: one round that takes W[index+4] straight from the message block
// (MSGSCHEDULE01) and uses the XOR forms of tt1 and tt2.
// Invoked for rounds 0..11 in blockAMD64.
#define SM3ROUND0(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE01(index); \
	SM3SS1(const, a, e); \
	SM3TT10(index, a, b, c, d); \
	SM3TT20(index, e, f, g, h); \
	COPYRESULT(b, d, f, h)

// SM3ROUND1: one round that derives W[index+4] by message expansion
// (MSGSCHEDULE1) and uses the XOR forms of tt1 and tt2.
// Invoked for rounds 12..15 in blockAMD64.
#define SM3ROUND1(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SM3SS1(const, a, e); \
	SM3TT10(index, a, b, c, d); \
	SM3TT20(index, e, f, g, h); \
	COPYRESULT(b, d, f, h)

// SM3ROUND2: one round that derives W[index+4] by message expansion
// (MSGSCHEDULE1) and uses the majority form of tt1 (SM3TT11) and the choose
// form of tt2 (SM3TT21). Invoked for rounds 16..63 in blockAMD64.
#define SM3ROUND2(index, const, a, b, c, d, e, f, g, h) \
	MSGSCHEDULE1(index); \
	SM3SS1(const, a, e); \
	SM3TT11(index, a, b, c, d); \
	SM3TT21(index, e, f, g, h); \
	COPYRESULT(b, d, f, h)

// blockAMD64 runs the SM3 compression function over every complete 64-byte
// block of the input and folds the result into the digest state.
//
// Arguments (32 bytes), as referenced through FP:
//   dig+0(FP)     pointer to the state; eight 32-bit words are read from it
//                 and written back to it
//   p_base+8(FP)  pointer to the message bytes
//   p_len+16(FP)  message length in bytes; rounded down to a multiple of 64
// NOTE(review): the Go-side declaration is not visible in this file; the
// parameter names above are taken from the FP references.
//
// Stack frame (288 bytes):
//   0..271(SP)  message schedule W[0..67], addressed through BP
//   272(SP)     address one past the last complete block
//
// If the input holds no complete block the function returns without touching
// the state.
TEXT ·blockAMD64(SB), 0, $288-32
	MOVQ	p_base+8(FP), SI
	MOVQ	p_len+16(FP), DX
	SHRQ	$6, DX			// drop any trailing partial block:
	SHLQ	$6, DX			// DX = len rounded down to a multiple of 64

	LEAQ	(SI)(DX*1), DI		// DI = end of the last complete block
	MOVQ	DI, 272(SP)		// saved on the stack because DI is reused as h
	CMPQ	SI, DI
	JEQ	end

	MOVQ	dig+0(FP), BP
	MOVL	(0*4)(BP), a		// a = H0
	MOVL	(1*4)(BP), b		// b = H1
	MOVL	(2*4)(BP), c		// c = H2
	MOVL	(3*4)(BP), d		// d = H3
	MOVL	(4*4)(BP), e		// e = H4
	MOVL	(5*4)(BP), f		// f = H5
	MOVL	(6*4)(BP), g		// g = H6
	MOVL	(7*4)(BP), h		// h = H7

loop:
	MOVQ	SP, BP			// BP = base of the W array for the macros

	// W[0..3]; each round below supplies W[index+4] itself.
	MSGSCHEDULE0(0)
	MSGSCHEDULE0(1)
	MSGSCHEDULE0(2)
	MSGSCHEDULE0(3)

	// Rounds 0..11: W[index+4] is loaded from the message block.
	// The register arguments shift by one position per round, which stands in
	// for moving the working variables between registers.
	SM3ROUND0(0, T0, a, b, c, d, e, f, g, h)
	SM3ROUND0(1, T1, h, a, b, c, d, e, f, g)
	SM3ROUND0(2, T2, g, h, a, b, c, d, e, f)
	SM3ROUND0(3, T3, f, g, h, a, b, c, d, e)
	SM3ROUND0(4, T4, e, f, g, h, a, b, c, d)
	SM3ROUND0(5, T5, d, e, f, g, h, a, b, c)
	SM3ROUND0(6, T6, c, d, e, f, g, h, a, b)
	SM3ROUND0(7, T7, b, c, d, e, f, g, h, a)
	SM3ROUND0(8, T8, a, b, c, d, e, f, g, h)
	SM3ROUND0(9, T9, h, a, b, c, d, e, f, g)
	SM3ROUND0(10, T10, g, h, a, b, c, d, e, f)
	SM3ROUND0(11, T11, f, g, h, a, b, c, d, e)

	// Rounds 12..15: W[index+4] is computed by message expansion.
	SM3ROUND1(12, T12, e, f, g, h, a, b, c, d)
	SM3ROUND1(13, T13, d, e, f, g, h, a, b, c)
	SM3ROUND1(14, T14, c, d, e, f, g, h, a, b)
	SM3ROUND1(15, T15, b, c, d, e, f, g, h, a)

	// Rounds 16..63: message expansion plus the majority / choose functions.
	SM3ROUND2(16, T16, a, b, c, d, e, f, g, h)
	SM3ROUND2(17, T17, h, a, b, c, d, e, f, g)
	SM3ROUND2(18, T18, g, h, a, b, c, d, e, f)
	SM3ROUND2(19, T19, f, g, h, a, b, c, d, e)
	SM3ROUND2(20, T20, e, f, g, h, a, b, c, d)
	SM3ROUND2(21, T21, d, e, f, g, h, a, b, c)
	SM3ROUND2(22, T22, c, d, e, f, g, h, a, b)
	SM3ROUND2(23, T23, b, c, d, e, f, g, h, a)
	SM3ROUND2(24, T24, a, b, c, d, e, f, g, h)
	SM3ROUND2(25, T25, h, a, b, c, d, e, f, g)
	SM3ROUND2(26, T26, g, h, a, b, c, d, e, f)
	SM3ROUND2(27, T27, f, g, h, a, b, c, d, e)
	SM3ROUND2(28, T28, e, f, g, h, a, b, c, d)
	SM3ROUND2(29, T29, d, e, f, g, h, a, b, c)
	SM3ROUND2(30, T30, c, d, e, f, g, h, a, b)
	SM3ROUND2(31, T31, b, c, d, e, f, g, h, a)
	SM3ROUND2(32, T32, a, b, c, d, e, f, g, h)
	SM3ROUND2(33, T33, h, a, b, c, d, e, f, g)
	SM3ROUND2(34, T34, g, h, a, b, c, d, e, f)
	SM3ROUND2(35, T35, f, g, h, a, b, c, d, e)
	SM3ROUND2(36, T36, e, f, g, h, a, b, c, d)
	SM3ROUND2(37, T37, d, e, f, g, h, a, b, c)
	SM3ROUND2(38, T38, c, d, e, f, g, h, a, b)
	SM3ROUND2(39, T39, b, c, d, e, f, g, h, a)
	SM3ROUND2(40, T40, a, b, c, d, e, f, g, h)
	SM3ROUND2(41, T41, h, a, b, c, d, e, f, g)
	SM3ROUND2(42, T42, g, h, a, b, c, d, e, f)
	SM3ROUND2(43, T43, f, g, h, a, b, c, d, e)
	SM3ROUND2(44, T44, e, f, g, h, a, b, c, d)
	SM3ROUND2(45, T45, d, e, f, g, h, a, b, c)
	SM3ROUND2(46, T46, c, d, e, f, g, h, a, b)
	SM3ROUND2(47, T47, b, c, d, e, f, g, h, a)
	SM3ROUND2(48, T48, a, b, c, d, e, f, g, h)
	SM3ROUND2(49, T49, h, a, b, c, d, e, f, g)
	SM3ROUND2(50, T50, g, h, a, b, c, d, e, f)
	SM3ROUND2(51, T51, f, g, h, a, b, c, d, e)
	SM3ROUND2(52, T52, e, f, g, h, a, b, c, d)
	SM3ROUND2(53, T53, d, e, f, g, h, a, b, c)
	SM3ROUND2(54, T54, c, d, e, f, g, h, a, b)
	SM3ROUND2(55, T55, b, c, d, e, f, g, h, a)
	SM3ROUND2(56, T56, a, b, c, d, e, f, g, h)
	SM3ROUND2(57, T57, h, a, b, c, d, e, f, g)
	SM3ROUND2(58, T58, g, h, a, b, c, d, e, f)
	SM3ROUND2(59, T59, f, g, h, a, b, c, d, e)
	SM3ROUND2(60, T60, e, f, g, h, a, b, c, d)
	SM3ROUND2(61, T61, d, e, f, g, h, a, b, c)
	SM3ROUND2(62, T62, c, d, e, f, g, h, a, b)
	SM3ROUND2(63, T63, b, c, d, e, f, g, h, a)

	// Reload the state pointer (BP was pointing at the W array) under the same
	// argument name used for the initial load at offset 0.
	MOVQ	dig+0(FP), BP

	// state[i] ^= working variable; the register keeps the new state word so
	// the next block starts from the updated state without reloading it.
	xorm(  0(BP), a)
	xorm(  4(BP), b)
	xorm(  8(BP), c)
	xorm( 12(BP), d)
	xorm( 16(BP), e)
	xorm( 20(BP), f)
	xorm( 24(BP), g)
	xorm( 28(BP), h)

	ADDQ	$64, SI			// advance to the next 64-byte block
	CMPQ	SI, 272(SP)
	JB	loop

end:
	RET