// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// General-purpose register assignments.
#define CTX	R3  // *digest (points at the 8-word H state)
#define INP	R4  // current input pointer
#define END	R5  // end of input (INP + LEN rounded down to 64-byte blocks)
#define TBL	R6  // base of the ·kcon constant table
#define IDX	R7  // running byte offset into ·kcon
#define LEN	R9  // input length in bytes
#define TEMP	R12

#define HEX00	R0  // always-zero register, used as a 0 index
#define HEX10	R10 // constant 0x10, used as a 16-byte index

// V0-V7 are A-H
// V8-V23 are used for the message schedule
#define KI	V24 // current round constant K[t] (4 copies)
#define FUNC	V25 // scratch for Ch/Maj
#define S0	V26 // BIGSIGMA0(a)
#define S1	V27 // BIGSIGMA1(e)
#define s0	V28 // SIGMA0 for the message schedule
#define s1	V29 // SIGMA1 for the message schedule
#define LEMASK	V31 // Permutation control register for little endian

// 4 copies of each Kt, to fill all 4 words of a vector register
DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
DATA ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
DATA ·kcon+0x010(SB)/8, $0x7137449171374491
DATA ·kcon+0x018(SB)/8, $0x7137449171374491
DATA ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
DATA ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
DATA ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
DATA ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
DATA ·kcon+0x050(SB)/8, $0x59f111f159f111f1
DATA ·kcon+0x058(SB)/8, $0x59f111f159f111f1
DATA ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
DATA ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
DATA ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
DATA ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
DATA ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
DATA ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
DATA ·kcon+0x090(SB)/8, $0x12835b0112835b01
DATA ·kcon+0x098(SB)/8, $0x12835b0112835b01
DATA ·kcon+0x0A0(SB)/8, $0x243185be243185be
DATA ·kcon+0x0A8(SB)/8, $0x243185be243185be
DATA ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
DATA ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
DATA ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
DATA ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
DATA ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
DATA ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
DATA ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
DATA ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
DATA ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
DATA ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
DATA ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
DATA ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
DATA ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
DATA ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
DATA ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
DATA ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
DATA ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
DATA ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
DATA ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
DATA ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
DATA ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
DATA ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
DATA ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA ·kcon+0x170(SB)/8, $0x76f988da76f988da
DATA ·kcon+0x178(SB)/8, $0x76f988da76f988da
DATA ·kcon+0x180(SB)/8, $0x983e5152983e5152
DATA ·kcon+0x188(SB)/8, $0x983e5152983e5152
DATA ·kcon+0x190(SB)/8, $0xa831c66da831c66d
DATA ·kcon+0x198(SB)/8, $0xa831c66da831c66d
DATA ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
DATA ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
DATA ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
DATA ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
DATA ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
DATA ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
DATA ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
DATA ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
DATA ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
DATA ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
DATA ·kcon+0x1F0(SB)/8, $0x1429296714292967
DATA ·kcon+0x1F8(SB)/8, $0x1429296714292967
DATA ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
DATA ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
DATA ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
DATA ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
DATA ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA ·kcon+0x230(SB)/8, $0x53380d1353380d13
DATA ·kcon+0x238(SB)/8, $0x53380d1353380d13
DATA ·kcon+0x240(SB)/8, $0x650a7354650a7354
DATA ·kcon+0x248(SB)/8, $0x650a7354650a7354
DATA ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
DATA ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
DATA ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
DATA ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
DATA ·kcon+0x270(SB)/8, $0x92722c8592722c85
DATA ·kcon+0x278(SB)/8, $0x92722c8592722c85
DATA ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
DATA ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
DATA ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
DATA ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
DATA ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
DATA ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
DATA ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
DATA ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
DATA ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
DATA ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
DATA ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
DATA ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
DATA ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
DATA ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
DATA ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
DATA ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
DATA ·kcon+0x310(SB)/8, $0x1e376c081e376c08
DATA ·kcon+0x318(SB)/8, $0x1e376c081e376c08
DATA ·kcon+0x320(SB)/8, $0x2748774c2748774c
DATA ·kcon+0x328(SB)/8, $0x2748774c2748774c
DATA ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
DATA ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
DATA ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
DATA ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
DATA ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
DATA ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
DATA ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
DATA ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
DATA ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
DATA ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
DATA ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
DATA ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
DATA ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
DATA ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
DATA ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
DATA ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
DATA ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
DATA ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
DATA ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
DATA ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
DATA ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
DATA ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
DATA ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
DATA ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
// Zero terminator: loading this as KI after the last round adds nothing.
DATA ·kcon+0x400(SB)/8, $0x0000000000000000
DATA ·kcon+0x408(SB)/8, $0x0000000000000000

#ifdef GOARCH_ppc64le
DATA ·kcon+0x410(SB)/8, $0x1011121310111213	// permutation control vectors
DATA ·kcon+0x418(SB)/8, $0x1011121300010203
DATA ·kcon+0x420(SB)/8, $0x1011121310111213
DATA ·kcon+0x428(SB)/8, $0x0405060700010203
DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b
DATA ·kcon+0x438(SB)/8, $0x0405060700010203
#else
DATA ·kcon+0x410(SB)/8, $0x1011121300010203
DATA ·kcon+0x418(SB)/8, $0x1011121310111213	// permutation control vectors
DATA ·kcon+0x420(SB)/8, $0x0405060700010203
DATA ·kcon+0x428(SB)/8, $0x1011121310111213
DATA ·kcon+0x430(SB)/8, $0x0001020304050607
DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213
#endif

GLOBL ·kcon(SB), RODATA, $1088

// One round for t <= 15: message word xi is used as-is (Wt = Mt).
// Computes T1/T2 and the rotation of the working variables, and loads
// the next round constant into KI (advancing IDX by 16).
#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
	VSEL		g, f, e, FUNC; \ // Ch(e,f,g)
	VSHASIGMAW	$15, e, $1, S1; \ // BIGSIGMA1(e)
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \ // BIGSIGMA0(a)
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \ // Maj(a,b,c)
	VADDUWM		KI, g, g; \ // pre-add K[t+1] into next round's h
	VADDUWM		h, d, d; \ // e = d + T1
	VADDUWM		FUNC, S0, S0; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUWM		S0, h, h // a = T1 + T2

// One round for t >= 16: same as SHA256ROUND0, interleaved with the
// message-schedule update xj += s0(xj_1) + s1(xj_14) + xj_9.
#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
	VSHASIGMAW	$0, xj_1, $0, s0; \ // SIGMA0(W[t-15])
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VSHASIGMAW	$15, xj_14, $0, s1; \ // SIGMA1(W[t-2])
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		xj_9, xj, xj; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	VADDUWM		s0, xj, xj; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUWM		S0, h, h; \
	VADDUWM		s1, xj, xj

// On little endian, message words must be byte-swapped into big-endian
// word order with a VPERM through LEMASK; on big endian this is a no-op.
#ifdef GOARCH_ppc64le
#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
#else
#define VPERMLE(va,vb,vc,vt)
#endif

// func block(dig *digest, p []byte)
TEXT ·block(SB),0,$0-32
	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN

	SRD	$6, LEN		// round LEN down to a multiple of 64
	SLD	$6, LEN
	ADD	INP, LEN, END

	CMP	INP, END	// nothing to do for an empty (or short) input
	BEQ	end

	MOVD	$·kcon(SB), TBL
	MOVWZ	$0x10, HEX10
	MOVWZ	$8, IDX

#ifdef GOARCH_ppc64le
	// Build LEMASK, the byte-reversal permutation for LE loads.
	LVSL	(IDX)(R0), LEMASK
	VSPLTISB	$0x0F, KI
	VXOR	KI, LEMASK, LEMASK
#endif

	// Load the 8-word digest state H0..H7.
	LXVW4X	(CTX)(HEX00), VS32	// v0 = vs32
	LXVW4X	(CTX)(HEX10), VS36	// v4 = vs36

	// unpack the input values into vector registers: V0..V7 each hold
	// one working variable a..h (replicated by rotation of the state).
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$8, V0, V0, V2
	VSLDOI	$12, V0, V0, V3
	VSLDOI	$4, V4, V4, V5
	VSLDOI	$8, V4, V4, V6
	VSLDOI	$12, V4, V4, V7

loop:
	LVX	(TBL)(HEX00), KI	// K[0]
	MOVWZ	$16, IDX

	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
	ADD	$16, INP

	// Offload to VSR24-31 (aka FPR24-31): save a..h so the per-block
	// feed-forward addition can be done after the 64 rounds.
	XXLOR	V0, V0, VS24
	XXLOR	V1, V1, VS25
	XXLOR	V2, V2, VS26
	XXLOR	V3, V3, VS27
	XXLOR	V4, V4, VS28
	XXLOR	V5, V5, VS29
	XXLOR	V6, V6, VS30
	XXLOR	V7, V7, VS31

	VADDUWM	KI, V7, V7	// h+K[i]
	LVX	(TBL)(IDX), KI
	ADD	$16, IDX

	// Rounds 0-15: message words come straight from the input block.
	// Each 16-byte load yields 4 words; VSLDOI shifts the next word
	// into position for the following round.
	VPERMLE(V8, V8, LEMASK, V8)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
	VSLDOI	$4, V8, V8, V9
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
	VSLDOI	$4, V9, V9, V10
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
	ADD	$16, INP, INP
	VSLDOI	$4, V10, V10, V11
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
	VPERMLE(V12, V12, LEMASK, V12)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
	VSLDOI	$4, V12, V12, V13
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
	VSLDOI	$4, V13, V13, V14
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
	ADD	$16, INP, INP
	VSLDOI	$4, V14, V14, V15
	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
	VPERMLE(V16, V16, LEMASK, V16)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
	VSLDOI	$4, V16, V16, V17
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
	VSLDOI	$4, V17, V17, V18
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
	VSLDOI	$4, V18, V18, V19
	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
	ADD	$16, INP, INP
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
	VPERMLE(V20, V20, LEMASK, V20)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
	VSLDOI	$4, V20, V20, V21
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
	VSLDOI	$4, V21, V21, V22
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
	VSLDOI	$4, V22, V22, V23
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	// Rounds 16-63: 3 iterations of 16 rounds each, with the message
	// schedule computed on the fly in V8-V23.
	MOVWZ	$3, TEMP
	MOVWZ	TEMP, CTR

L16_xx:
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	BC	0x10, 0, L16_xx		// bdnz

	// Feed-forward: add the saved H0..H7 (VS24-31) into a..h.
	XXLOR	VS24, VS24, V10

	XXLOR	VS25, VS25, V11
	VADDUWM	V10, V0, V0
	XXLOR	VS26, VS26, V12
	VADDUWM	V11, V1, V1
	XXLOR	VS27, VS27, V13
	VADDUWM	V12, V2, V2
	XXLOR	VS28, VS28, V14
	VADDUWM	V13, V3, V3
	XXLOR	VS29, VS29, V15
	VADDUWM	V14, V4, V4
	XXLOR	VS30, VS30, V16
	VADDUWM	V15, V5, V5
	XXLOR	VS31, VS31, V17
	VADDUWM	V16, V6, V6
	VADDUWM	V17, V7, V7

	CMPU	INP, END
	BLT	loop

	// Repack the per-variable vectors V0-V7 back into two 4-word state
	// vectors using the permutation control vectors at ·kcon+0x410.
	LVX	(TBL)(IDX), V8
	ADD	$16, IDX
	VPERM	V0, V1, KI, V0
	LVX	(TBL)(IDX), V9
	VPERM	V4, V5, KI, V4
	VPERM	V0, V2, V8, V0
	VPERM	V4, V6, V8, V4
	VPERM	V0, V3, V9, V0
	VPERM	V4, V7, V9, V4
	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
	STXVD2X	VS36, (CTX+HEX10)	// v4 = vs36

end:
	RET