github.com/emmansun/gmsm@v0.29.1/sm3/sm3block_arm64.s

//go:build !purego

#include "textflag.h"

#include "sm3_const_asm.s"

#define XWORD0 V0
#define XWORD1 V1
#define XWORD2 V2
#define XWORD3 V3

#define XTMP0 V4
#define XTMP1 V5
#define XTMP2 V6
#define XTMP3 V7
#define XTMP4 V8

#define Wt V9

#define a R0
#define b R1
#define c R2
#define d R3
#define e R4
#define f R5
#define g R6
#define h R7

#define y0 R8
#define y1 R9
#define y2 R10

#define NUM_BYTES R11
#define INP R12
#define CTX R13 // Beginning of digest in memory (a, b, c, ..., h)

#define a1 R15
#define b1 R16
#define c1 R19
#define d1 R20
#define e1 R21
#define f1 R22
#define g1 R23
#define h1 R24

// For rounds [0 - 16)
#define ROUND_AND_SCHED_N_0_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VEXT $12, XWORD1.B16, XWORD0.B16, XTMP0.B16; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}, Vm = XWORD1, Vn = XWORD0
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[0], y1; \
	VSHL $7, XTMP0.S4, XTMP1.S4; \
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[0], y1; \
	VSRI $25, XTMP0.S4, XTMP1.S4; \ // XTMP1 = W[-13] rol 7
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	EORW a, b, h; \
	VEXT $8, XWORD3.B16, XWORD2.B16, XTMP0.B16; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	EORW c, h; \
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW e, f, y1; \
	VEOR XTMP1.B16, XTMP0.B16, XTMP0.B16; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	EORW g, y1; \
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW $23, b; \
	VEXT $12, XWORD2.B16, XWORD1.B16, XTMP1.B16; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}, Vm = XWORD2, Vn = XWORD1
	RORW $13, f; \
	; \
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	VEOR XWORD0.B16, XTMP1.B16, XTMP1.B16; \ // XTMP1 = W[-9] ^ W[-16]
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEXT $4, XWORD2.B16, XWORD3.B16, XTMP3.B16; \ // XTMP3 = W[-3] {w11,w15,w14,w13}

#define ROUND_AND_SCHED_N_0_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VSHL $15, XTMP3.S4, XTMP2.S4; \
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[1], y1; \
	VSRI $17, XTMP3.S4, XTMP2.S4; \ // XTMP2 = W[-3] rol 15 {xxBA}
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[1], y1; \
	VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxBA}
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	EORW a, b, h; \
	VSHL $15, XTMP2.S4, XTMP4.S4; \
	EORW c, h; \
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW e, f, y1; \
	VSRI $17, XTMP2.S4, XTMP4.S4; \ // XTMP4 = XTMP2 rol 15 {xxBA}
	EORW g, y1; \
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW $23, b; \
	VSHL $8, XTMP4.S4, XTMP3.S4; \
	RORW $13, f; \
	; \
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	VSRI $24, XTMP4.S4, XTMP3.S4; \ // XTMP3 = XTMP2 rol 23 {xxBA}
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEOR XTMP2.B16, XTMP4.B16, XTMP4.B16; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})

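// Reference for the schedule interleaved through the ROUND_AND_SCHED_N_*_0..3 macros
// (a sketch of the SM3 message expansion; not extra code emitted at runtime):
//   W[i]  = P1(W[i-16] ^ W[i-9] ^ (W[i-3] <<< 15)) ^ (W[i-13] <<< 7) ^ W[i-6]
//   P1(x) = x ^ (x <<< 15) ^ (x <<< 23)
// Each group of four round macros expands the next four words {W[3], W[2], W[1], W[0]}
// into the register passed as XWORD0.
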
#define ROUND_AND_SCHED_N_0_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VEOR XTMP4.B16, XTMP3.B16, XTMP4.B16; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[2], y1; \
	VEOR XTMP4.B16, XTMP0.B16, XTMP2.B16; \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[2], y1; \
	VEXT $4, XTMP2.B16, XWORD3.B16, XTMP3.B16; \ // XTMP3 = W[-3] {W[0],w15,w14,w13}, Vm = XTMP2, Vn = XWORD3
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	EORW a, b, h; \
	VSHL $15, XTMP3.S4, XTMP4.S4; \
	EORW c, h; \
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW e, f, y1; \
	VSRI $17, XTMP3.S4, XTMP4.S4; \ // XTMP4 = W[-3] rol 15 {DCBA}
	EORW g, y1; \
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW $23, b; \
	RORW $13, f; \
	VEOR XTMP1.B16, XTMP4.B16, XTMP4.B16; \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VSHL $15, XTMP4.S4, XTMP3.S4; \

#define ROUND_AND_SCHED_N_0_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	RORW $25, y1, y2; \ // y2 = SS1
	VSRI $17, XTMP4.S4, XTMP3.S4; \ // XTMP3 = XTMP4 rol 15 {DCBA}
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[3], y1; \
	ADDW y1, y2; \ // y2 = SS1 + W
	VSHL $8, XTMP3.S4, XTMP1.S4; \
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[3], y1; \
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	VSRI $24, XTMP3.S4, XTMP1.S4; \ // XTMP1 = XTMP4 rol 23 {DCBA}
	EORW a, b, h; \
	EORW c, h; \
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW e, f, y1; \
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCBA})
	EORW g, y1; \
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW $23, b; \
	RORW $13, f; \
	VEOR XTMP3.B16, XTMP1.B16, XTMP1.B16; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEOR XTMP1.B16, XTMP0.B16, XWORD0.B16; \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [16 - 64)
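// From round 16 on, the boolean functions change (reference, per the SM3 spec):
//   FF(a, b, c) = (a AND b) OR (a AND c) OR (b AND c), computed below as (a AND b) OR ((a OR b) AND c)
//   GG(e, f, g) = (e AND f) OR (NOT(e) AND g),         computed below as ((f XOR g) AND e) XOR g
// Rounds [0 - 16) use plain XOR for both, as in the ROUND_AND_SCHED_N_0_* macros above.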
#define ROUND_AND_SCHED_N_1_0(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VEXT $12, XWORD1.B16, XWORD0.B16, XTMP0.B16; \ // XTMP0 = W[-13] = {w6,w5,w4,w3}, Vm = XWORD1, Vn = XWORD0
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[0], y1; \
	VSHL $7, XTMP0.S4, XTMP1.S4; \
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[0], y1; \
	VSRI $25, XTMP0.S4, XTMP1.S4; \ // XTMP1 = W[-13] rol 7
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	ORRW a, b, y1; \
	VEXT $8, XWORD3.B16, XWORD2.B16, XTMP0.B16; \ // XTMP0 = W[-6] = {w13,w12,w11,w10}
	ANDW a, b, h; \
	ANDW c, y1; \
	ORRW y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VEOR XTMP1.B16, XTMP0.B16, XTMP0.B16; \ // XTMP0 = W[-6] ^ (W[-13] rol 7)
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW f, g, y1; \
	ANDW e, y1; \
	VEXT $12, XWORD2.B16, XWORD1.B16, XTMP1.B16; \ // XTMP1 = W[-9] = {w10,w9,w8,w7}, Vm = XWORD2, Vn = XWORD1
	EORW g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW $23, b; \
	RORW $13, f; \
	VEOR XWORD0.B16, XTMP1.B16, XTMP1.B16; \ // XTMP1 = W[-9] ^ W[-16]
	; \
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEXT $4, XWORD2.B16, XWORD3.B16, XTMP3.B16; \ // XTMP3 = W[-3] {w11,w15,w14,w13}

#define ROUND_AND_SCHED_N_1_1(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VSHL $15, XTMP3.S4, XTMP2.S4; \
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[1], y1; \
	VSRI $17, XTMP3.S4, XTMP2.S4; \ // XTMP2 = W[-3] rol 15 {xxBA}
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[1], y1; \
	VEOR XTMP1.B16, XTMP2.B16, XTMP2.B16; \ // XTMP2 = W[-9] ^ W[-16] ^ (W[-3] rol 15) {xxBA}
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	ORRW a, b, y1; \
	VSHL $15, XTMP2.S4, XTMP4.S4; \
	ANDW a, b, h; \
	ANDW c, y1; \
	ORRW y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	VSRI $17, XTMP2.S4, XTMP4.S4; \ // XTMP4 = XTMP2 rol 15 {xxBA}
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW f, g, y1; \
	ANDW e, y1; \
	EORW g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	VSHL $8, XTMP4.S4, XTMP3.S4; \
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	; \
	RORW $23, b; \
	RORW $13, f; \
	; \
	RORW $23, y2, y0; \
	VSRI $24, XTMP4.S4, XTMP3.S4; \ // XTMP3 = XTMP2 rol 23 {xxBA}
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEOR XTMP2.B16, XTMP4.B16, XTMP4.B16; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA})

#define ROUND_AND_SCHED_N_1_2(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	VEOR XTMP4.B16, XTMP3.B16, XTMP4.B16; \ // XTMP4 = XTMP2 XOR (XTMP2 rol 15 {xxBA}) XOR (XTMP2 rol 23 {xxBA})
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[2], y1; \
	VEOR XTMP4.B16, XTMP0.B16, XTMP2.B16; \ // XTMP2 = {..., ..., W[1], W[0]}
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[2], y1; \
	ADDW y1, y0; \ // y0 = SS2 + W'
	VEXT $4, XTMP2.B16, XWORD3.B16, XTMP3.B16; \ // XTMP3 = W[-3] {W[0],w15,w14,w13}, Vm = XTMP2, Vn = XWORD3
	ADDW d, y0; \ // y0 = d + SS2 + W'
	ORRW a, b, y1; \
	ANDW a, b, h; \
	ANDW c, y1; \
	VSHL $15, XTMP3.S4, XTMP4.S4; \
	ORRW y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW f, g, y1; \
	ANDW e, y1; \
	VSRI $17, XTMP3.S4, XTMP4.S4; \ // XTMP4 = W[-3] rol 15 {DCBA}
	EORW g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW $23, b; \
	RORW $13, f; \
	VEOR XTMP1.B16, XTMP4.B16, XTMP4.B16; \ // XTMP4 = W[-9] XOR W[-16] XOR (W[-3] rol 15) {DCBA}
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VSHL $15, XTMP4.S4, XTMP3.S4; \

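// Reference for the scalar tail shared by every round macro in this file (not extra generated code):
// the closing RORW/EORW sequence writes P0(tt2) = tt2 ^ (tt2 <<< 9) ^ (tt2 <<< 17) into the register
// passed as d, and tt1 stays in the register passed as h; RORW $23, b and RORW $13, f are the in-place
// b <<< 9 and f <<< 19 rotations of the SM3 state update.
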
#define ROUND_AND_SCHED_N_1_3(disp, const, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	RORW $25, y1, y2; \ // y2 = SS1
	VSRI $17, XTMP4.S4, XTMP3.S4; \ // XTMP3 = XTMP4 rol 15 {DCBA}
	EORW y2, y0; \ // y0 = SS2
	VMOV XWORD0.S[3], y1; \
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[3], y1; \
	VSHL $8, XTMP3.S4, XTMP1.S4; \
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	ORRW a, b, y1; \
	ANDW a, b, h; \
	ANDW c, y1; \
	VSRI $24, XTMP3.S4, XTMP1.S4; \ // XTMP1 = XTMP4 rol 23 {DCBA}
	ORRW y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	EORW f, g, y1; \
	ANDW e, y1; \
	VEOR XTMP3.B16, XTMP4.B16, XTMP3.B16; \ // XTMP3 = XTMP4 XOR (XTMP4 rol 15 {DCBA})
	EORW g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW y1, y2; \ // y2 = GG(e, f, g) + h + SS1 + W = tt2
	RORW $23, b; \
	RORW $13, f; \
	VEOR XTMP3.B16, XTMP1.B16, XTMP1.B16; \ // XTMP1 = XTMP4 XOR (XTMP4 rol 15 {DCBA}) XOR (XTMP4 rol 23 {DCBA})
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)
	VEOR XTMP1.B16, XTMP0.B16, XWORD0.B16; \ // XWORD0 = {W[3], W[2], W[1], W[0]}

// For rounds [16 - 64)
#define DO_ROUND_N_1(disp, idx, const, a, b, c, d, e, f, g, h, W, Wt) \
	RORW $20, a, y0; \ // y0 = a <<< 12
	ADDW $const, e, y1; \
	ADDW y0, y1; \ // y1 = a <<< 12 + e + T
	RORW $25, y1, y2; \ // y2 = SS1
	EORW y2, y0; \ // y0 = SS2
	VMOV W.S[idx], y1; \
	ADDW y1, y2; \ // y2 = SS1 + W
	ADDW h, y2; \ // y2 = h + SS1 + W
	VMOV Wt.S[idx], y1; \
	ADDW y1, y0; \ // y0 = SS2 + W'
	ADDW d, y0; \ // y0 = d + SS2 + W'
	; \
	ORRW a, b, y1; \
	ANDW a, b, h; \
	ANDW c, y1; \
	ORRW y1, h; \ // h = (a AND b) OR (a AND c) OR (b AND c)
	ADDW y0, h; \ // h = FF(a, b, c) + d + SS2 + W' = tt1
	; \
	EORW f, g, y1; \
	ANDW e, y1; \
	EORW g, y1; \ // y1 = GG2(e, f, g) = (e AND f) OR (NOT(e) AND g)
	ADDW y1, y2; \ // y2 = GG2(e, f, g) + h + SS1 + W = tt2
	; \
	RORW $23, b; \
	RORW $13, f; \
	; \
	RORW $23, y2, y0; \
	RORW $15, y2, d; \
	EORW y0, d; \
	EORW y2, d; \ // d = P(tt2)

// func blockARM64(dig *digest, p []byte)
TEXT ·blockARM64(SB), NOSPLIT, $0
	MOVD dig+0(FP), CTX
	MOVD p_base+8(FP), INP
	MOVD p_len+16(FP), NUM_BYTES

	AND $~63, NUM_BYTES
	CBZ NUM_BYTES, end

	LDPW (0*8)(CTX), (a, b)
	LDPW (1*8)(CTX), (c, d)
	LDPW (2*8)(CTX), (e, f)
	LDPW (3*8)(CTX), (g, h)

loop:
	MOVW a, a1
	MOVW b, b1
	MOVW c, c1
	MOVW d, d1
	MOVW e, e1
	MOVW f, f1
	MOVW g, g1
	MOVW h, h1

	VLD1.P 64(INP), [XWORD0.B16, XWORD1.B16, XWORD2.B16, XWORD3.B16]
	VREV32 XWORD0.B16, XWORD0.B16
	VREV32 XWORD1.B16, XWORD1.B16
	VREV32 XWORD2.B16, XWORD2.B16
	VREV32 XWORD3.B16, XWORD3.B16

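	// Reference for one compression round as computed by the macros above (a sketch;
	// Tj is the per-round constant passed as $const, W and W' come from the vector registers):
	//   SS1 = ((a <<< 12) + e + Tj) <<< 7
	//   SS2 = SS1 XOR (a <<< 12)
	//   TT1 = FF(a, b, c) + d + SS2 + W'
	//   TT2 = GG(e, f, g) + h + SS1 + W
	//   then: d = c, c = b <<< 9, b = a, a = TT1, h = g, g = f <<< 19, f = e, e = P0(TT2)
	// The macros keep the state in place instead of shuffling registers: TT1 lands in the
	// register passed as h, P0(TT2) in the register passed as d, and the call sites below
	// rotate the (a, b, c, d, e, f, g, h) argument list by one register per round.
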
schedule_compress: // for w0 - w47
	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T0, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T1, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T2, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T3, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T4, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T5, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T6, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T7, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T8, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T9, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T10, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T11, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_0_0(0*16, T12, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_1(0*16, T13, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_2(0*16, T14, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_0_3(0*16, T15, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T16, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T17, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T18, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T19, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T20, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T21, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T22, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T23, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T24, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T25, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T26, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T27, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T28, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T29, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T30, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T31, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

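	// Note on the scheduling pattern (reference): each VEOR loads Wt with the four W' values
	// for its group (W'[j] = W[j] XOR W[j+4]), and each group of four ROUND_AND_SCHED macros
	// overwrites its oldest message-word register with the next four expanded words, so the
	// XWORD0..XWORD3 arguments rotate by one position every four rounds.
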
	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T32, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T33, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T34, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T35, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T36, e, f, g, h, a, b, c, d, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T37, d, e, f, g, h, a, b, c, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T38, c, d, e, f, g, h, a, b, XWORD1, XWORD2, XWORD3, XWORD0, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T39, b, c, d, e, f, g, h, a, XWORD1, XWORD2, XWORD3, XWORD0, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T40, a, b, c, d, e, f, g, h, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T41, h, a, b, c, d, e, f, g, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T42, g, h, a, b, c, d, e, f, XWORD2, XWORD3, XWORD0, XWORD1, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T43, f, g, h, a, b, c, d, e, XWORD2, XWORD3, XWORD0, XWORD1, Wt)

	// Do 4 rounds and scheduling
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T44, e, f, g, h, a, b, c, d, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T45, d, e, f, g, h, a, b, c, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T46, c, d, e, f, g, h, a, b, XWORD3, XWORD0, XWORD1, XWORD2, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T47, b, c, d, e, f, g, h, a, XWORD3, XWORD0, XWORD1, XWORD2, Wt)

	// w48 - w63 processed with only 4 rounds scheduling (last 16 rounds)
	// Do 4 rounds and scheduling
	VEOR XWORD0.B16, XWORD1.B16, Wt.B16
	ROUND_AND_SCHED_N_1_0(0*16, T48, a, b, c, d, e, f, g, h, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_1(0*16, T49, h, a, b, c, d, e, f, g, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_2(0*16, T50, g, h, a, b, c, d, e, f, XWORD0, XWORD1, XWORD2, XWORD3, Wt)
	ROUND_AND_SCHED_N_1_3(0*16, T51, f, g, h, a, b, c, d, e, XWORD0, XWORD1, XWORD2, XWORD3, Wt)

	// w52 - w63 processed with no scheduling (last 12 rounds)
	// Do 4 rounds
	VEOR XWORD1.B16, XWORD2.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T52, e, f, g, h, a, b, c, d, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 1, T53, d, e, f, g, h, a, b, c, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 2, T54, c, d, e, f, g, h, a, b, XWORD1, Wt)
	DO_ROUND_N_1(0*16, 3, T55, b, c, d, e, f, g, h, a, XWORD1, Wt)

	// Do 4 rounds
	VEOR XWORD2.B16, XWORD3.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T56, a, b, c, d, e, f, g, h, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 1, T57, h, a, b, c, d, e, f, g, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 2, T58, g, h, a, b, c, d, e, f, XWORD2, Wt)
	DO_ROUND_N_1(0*16, 3, T59, f, g, h, a, b, c, d, e, XWORD2, Wt)

	// Do 4 rounds
	VEOR XWORD3.B16, XWORD0.B16, Wt.B16
	DO_ROUND_N_1(0*16, 0, T60, e, f, g, h, a, b, c, d, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 1, T61, d, e, f, g, h, a, b, c, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 2, T62, c, d, e, f, g, h, a, b, XWORD3, Wt)
	DO_ROUND_N_1(0*16, 3, T63, b, c, d, e, f, g, h, a, XWORD3, Wt)

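	// Feed-forward (SM3 uses XOR rather than addition): V_{i+1} = (a, ..., h) XOR V_i,
	// where a1 - h1 still hold the digest words saved at the top of the loop.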
	EORW a1, a // H0 = a XOR H0
	EORW b1, b // H1 = b XOR H1
	EORW c1, c // H2 = c XOR H2
	EORW d1, d // H3 = d XOR H3
	EORW e1, e // H4 = e XOR H4
	EORW f1, f // H5 = f XOR H5
	EORW g1, g // H6 = g XOR H6
	EORW h1, h // H7 = h XOR H7

	SUB $64, NUM_BYTES, NUM_BYTES
	CBNZ NUM_BYTES, loop

	STPW (a, b), (0*8)(CTX)
	STPW (c, d), (1*8)(CTX)
	STPW (e, f), (2*8)(CTX)
	STPW (g, h), (3*8)(CTX)

end:
	RET