// Source: github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/sha256/sha256block_ppc64le.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is a derived work from OpenSSL of SHA-2 using assembly optimizations. The
// original code was written by Andy Polyakov <appro@openssl.org> and it's dual
// licensed under OpenSSL and CRYPTOGAMS licenses depending on where you obtain
// it. For further details see http://www.openssl.org/~appro/cryptogams/.

#include "textflag.h"

// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7

// General purpose register roles, as used by ·block below:
//   CTX      pointer to the digest state (loaded from dig+0(FP))
//   INP      current read position in p; advanced by 16 after every vector load
//   END      INP + (len(p) rounded down to a multiple of 64); loop bound
//   TBL      address of the ·kcon table
//   IDX      byte offset into ·kcon of the next entry to fetch
//   CNT      set to zero on entry; not read anywhere in this file
//   LEN      byte length of p, rounded down to a multiple of 64
//   OFFLOAD  copy of R1; base address of the 128-byte state spill area
//   TEMP     scratch used to load the loop count into CTR
#define CTX	R3
#define INP	R4
#define END	R5
#define TBL	R6
#define IDX	R7
#define CNT	R8
#define LEN	R9
#define OFFLOAD	R11
#define TEMP	R12

// Index registers holding the byte offsets 0x00..0x70 for indexed vector
// loads/stores. HEX00 is R0, which this code only ever reads and relies on
// being zero (it is also used directly as the zero index in "(INP)(R0)").
#define HEX00	R0
#define HEX10	R10
#define HEX20	R25
#define HEX30	R26
#define HEX40	R27
#define HEX50	R28
#define HEX60	R29
#define HEX70	R31

// V0-V7 are A-H
// V8-V23 are used for the message schedule
//
// KI holds the round constant fetched one round ahead (see the macros).
// FUNC is scratch for Ch and Maj. S0/S1 receive the VSHASIGMAW results computed
// from the working variables a and e; s0/s1 receive the ones computed from
// message-schedule words.
#define KI	V24
#define FUNC	V25
#define S0	V26
#define S1	V27
#define s0	V28
#define s1	V29
#define LEMASK	V31	// Permutation control register for little endian

// Layout of ·kcon (1088 = 0x440 bytes):
//   0x000-0x3FF  the 64 round constants K0..K63, 16 bytes each
//   0x400-0x40F  one all-zero entry; the rounds prefetch one constant ahead,
//                so this is what gets fetched once K63 has been consumed
//   0x410-0x43F  three permutation control vectors used after the last block
//                to gather V0..V3 and V4..V7 back into V0 and V4
//
// 4 copies of each Kt, to fill all 4 words of a vector register
DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
DATA  ·kcon+0x010(SB)/8, $0x7137449171374491
DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1
DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01
DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be
DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da
DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152
DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d
DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967
DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13
DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354
DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85
DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08
DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c
DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
DATA  ·kcon+0x400(SB)/8, $0x0000000000000000
DATA  ·kcon+0x408(SB)/8, $0x0000000000000000
DATA  ·kcon+0x410(SB)/8, $0x1011121310111213	// permutation control vectors
DATA  ·kcon+0x418(SB)/8, $0x1011121300010203
DATA  ·kcon+0x420(SB)/8, $0x1011121310111213
DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b
DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
GLOBL ·kcon(SB), RODATA, $1088

// SHA256ROUND0 performs one compression round for rounds whose message word
// is already available in xi (no schedule expansion).
//
// The working variables are never moved between registers: each successive
// invocation passes V0-V7 rotated by one position, so the register that was
// g in this round is h in the next one.
//
// On entry h already has this round's K added to it. The macro:
//   - adds xi, Ch(e,f,g) (VSEL into FUNC) and S1 to h, giving T1;
//   - adds the constant currently in KI to g, i.e. pre-adds the next round's
//     K to the register that will be h in the next round;
//   - adds T1 to d (the next round's e);
//   - forms Maj(a,b,c) in FUNC via VXOR/VSEL, adds it to S0, and adds that
//     sum (T2) to h, so h ends up holding T1+T2 (the next round's a);
//   - fetches the following constant from (TBL)(IDX) into KI and advances
//     IDX by 16.
//
// S1 and S0 come from VSHASIGMAW with the $1 selector applied to e and a
// respectively — presumably BIGSIGMA1(e) and BIGSIGMA0(a); confirm the
// immediate encodings against the Power ISA.
#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUWM		S0, h, h

// SHA256ROUND1 performs the same round computation as SHA256ROUND0 on
// (a..h, xi) and, interleaved with it, updates one message-schedule word in
// place:
//
//   xj = xj + xj_9 + s0 + s1
//
// where s0 and s1 are VSHASIGMAW results (with the $0 selector) computed from
// xj_1 and xj_14 — presumably the small sigma0/sigma1 of the schedule formula
// in the file header; confirm the immediate encodings against the Power ISA.
// The callers always pass, relative to xj in the 16-register ring V8-V23, the
// registers 1, 9 and 14 positions further on as xj_1, xj_9 and xj_14.
#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
	VSHASIGMAW	$0, xj_1, $0, s0; \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VSHASIGMAW	$15, xj_14, $0, s1; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		xj_9, xj, xj; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	VADDUWM		s0, xj, xj; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUWM		S0, h, h; \
	VADDUWM		s1, xj, xj

// block folds every complete 64-byte chunk of p into the hash state that dig
// points to. Trailing bytes beyond the last multiple of 64 are ignored, and
// the function returns without touching the state when there is no complete
// chunk.
//
// The frame reserves 128 bytes, which is the size of the V0-V7 spill written
// at the top of each loop iteration.
//
// func block(dig *digest, p []byte)
TEXT ·block(SB),0,$128-32
	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN

	// Round the length down to a multiple of 64 (one SHA-256 block).
	SRD	$6, LEN
	SLD	$6, LEN

	ADD	INP, LEN, END

	// Nothing to do if there is not even one complete block.
	CMP	INP, END
	BEQ	end

	MOVD	$·kcon(SB), TBL
	// NOTE(review): OFFLOAD is R1 itself, so the spill below covers
	// 0(R1)..127(R1). Confirm that this range does not overlap anything the
	// toolchain keeps at the bottom of the frame, and that R1 is 16-byte
	// aligned as the STVX/LVX accesses presumably require.
	MOVD	R1, OFFLOAD

	// CNT is zeroed here but not read again in this function.
	MOVD	R0, CNT
	MOVWZ	$0x10, HEX10
	MOVWZ	$0x20, HEX20
	MOVWZ	$0x30, HEX30
	MOVWZ	$0x40, HEX40
	MOVWZ	$0x50, HEX50
	MOVWZ	$0x60, HEX60
	MOVWZ	$0x70, HEX70

	// Build LEMASK: a shift-control vector derived from the constant
	// address 8, XORed bytewise with 0x0F. It is applied with VPERM to each
	// 16-byte chunk of input after loading.
	MOVWZ	$8, IDX
	LVSL	(IDX)(R0), LEMASK
	VSPLTISB	$0x0F, KI
	VXOR	KI, LEMASK, LEMASK

	// Load the 32-byte state: first 16 bytes into V0, next 16 into V4.
	LXVW4X	(CTX)(HEX00), VS32	// v0 = vs32
	LXVW4X	(CTX)(HEX10), VS36	// v4 = vs36

	// unpack the input values into vector registers
	// (V1-V3 and V5-V7 are copies of V0 and V4 rotated by 4, 8 and 12 bytes)
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$8, V0, V0, V2
	VSLDOI	$12, V0, V0, V3
	VSLDOI	$4, V4, V4, V5
	VSLDOI	$8, V4, V4, V6
	VSLDOI	$12, V4, V4, V7

// One iteration of this loop consumes one 64-byte block.
loop:
	// Restart the constant stream: K0 into KI, IDX at the second entry.
	LVX	(TBL)(HEX00), KI
	MOVWZ	$16, IDX

	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
	ADD	$16, INP

	// Save the incoming state so it can be added back after the 64 rounds.
	STVX	V0, (OFFLOAD+HEX00)
	STVX	V1, (OFFLOAD+HEX10)
	STVX	V2, (OFFLOAD+HEX20)
	STVX	V3, (OFFLOAD+HEX30)
	STVX	V4, (OFFLOAD+HEX40)
	STVX	V5, (OFFLOAD+HEX50)
	STVX	V6, (OFFLOAD+HEX60)
	STVX	V7, (OFFLOAD+HEX70)

	// The round macros expect h to already include the round constant and
	// KI to hold the next one.
	VADDUWM	KI, V7, V7	// h+K[i]
	LVX	(TBL)(IDX), KI
	ADD	$16, IDX

	// Rounds 0-14. Each 16-byte load supplies four message words: the
	// loaded vector is permuted with LEMASK, and the next three words are
	// produced by rotating the previous register by 4 bytes.
	VPERM	V8, V8, LEMASK, V8
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
	VSLDOI	$4, V8, V8, V9
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
	VSLDOI	$4, V9, V9, V10
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
	ADD	$16, INP, INP
	VSLDOI	$4, V10, V10, V11
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
	VPERM	V12, V12, LEMASK, V12
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
	VSLDOI	$4, V12, V12, V13
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
	VSLDOI	$4, V13, V13, V14
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
	ADD	$16, INP, INP
	VSLDOI	$4, V14, V14, V15
	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
	VPERM	V16, V16, LEMASK, V16
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
	VSLDOI	$4, V16, V16, V17
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
	VSLDOI	$4, V17, V17, V18
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
	VSLDOI	$4, V18, V18, V19
	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
	ADD	$16, INP, INP
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
	VPERM	V20, V20, LEMASK, V20
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
	VSLDOI	$4, V20, V20, V21
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
	VSLDOI	$4, V21, V21, V22
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
	VSLDOI	$4, V22, V22, V23
	// Round 15: consumes V23 and starts the schedule expansion by updating V8.
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	// Rounds 16-63: three passes over a 16-round body, counted in CTR.
	MOVWZ	$3, TEMP
	MOVWZ	TEMP, CTR

L16_xx:
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	BC	0x10, 0, L16_xx		// bdnz

	// Add the state saved at the top of the loop back into V0-V7. The
	// message-schedule registers V10-V17 are free to use as temporaries
	// here; loads and adds are interleaved.
	LVX	(OFFLOAD)(HEX00), V10

	LVX	(OFFLOAD)(HEX10), V11
	VADDUWM	V10, V0, V0
	LVX	(OFFLOAD)(HEX20), V12
	VADDUWM	V11, V1, V1
	LVX	(OFFLOAD)(HEX30), V13
	VADDUWM	V12, V2, V2
	LVX	(OFFLOAD)(HEX40), V14
	VADDUWM	V13, V3, V3
	LVX	(OFFLOAD)(HEX50), V15
	VADDUWM	V14, V4, V4
	LVX	(OFFLOAD)(HEX60), V16
	VADDUWM	V15, V5, V5
	LVX	(OFFLOAD)(HEX70), V17
	VADDUWM	V16, V6, V6
	VADDUWM	V17, V7, V7

	// More complete blocks to process?
	CMPU	INP, END
	BLT	loop

	// Gather V0-V3 into V0 and V4-V7 into V4 for the store. The last round
	// left the table entry at offset 0x410 in KI and IDX at 0x420, so KI, V8
	// and V9 are the three permutation control vectors that end ·kcon.
	LVX	(TBL)(IDX), V8
	ADD	$16, IDX
	VPERM	V0, V1, KI, V0
	LVX	(TBL)(IDX), V9
	VPERM	V4, V5, KI, V4
	VPERM	V0, V2, V8, V0
	VPERM	V4, V6, V8, V4
	VPERM	V0, V3, V9, V0
	VPERM	V4, V7, V9, V4
	// Write the updated 32-byte state back through CTX.
	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
	STXVD2X	VS36, (CTX+HEX10)	// v4 = vs36

end:
	RET
