github.com/psiphon-labs/psiphon-tunnel-core@v2.0.28+incompatible/psiphon/common/crypto/Yawning/chacha20/chacha20_amd64.py

#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to chacha20, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#
# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
# The least terrible option is to use a Python code generator, so that's
# what I did.
#
# Code based on Ted Krovetz's vec128 C implementation, with corrections
# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
# output pointers.
#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

x = Argument(ptr(uint32_t))
inp = Argument(ptr(const_uint8_t))
outp = Argument(ptr(uint8_t))
nrBlocks = Argument(ptr(size_t))

#
# SSE2 helper functions. A temporary register is explicitly passed in because
# the main fast loop uses every single register (and even spills), so manual
# control is needed.
#
# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
# in the C code, but the C code has the luxury of an optimizer reordering
# everything, while this does not.
#

def ROTW16_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 16)
    PSRLD(d, 16)
    PXOR(d, tmp)

def ROTW12_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 12)
    PSRLD(b, 20)
    PXOR(b, tmp)

def ROTW8_sse2(tmp, d):
    MOVDQA(tmp, d)
    PSLLD(tmp, 8)
    PSRLD(d, 24)
    PXOR(d, tmp)

def ROTW7_sse2(tmp, b):
    MOVDQA(tmp, b)
    PSLLD(tmp, 7)
    PSRLD(b, 25)
    PXOR(b, tmp)

def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
    MOVDQU(tmp, [inp+d])
    PXOR(tmp, v0)
    MOVDQU([outp+d], tmp)
    MOVDQU(tmp, [inp+d+16])
    PXOR(tmp, v1)
    MOVDQU([outp+d+16], tmp)
    MOVDQU(tmp, [inp+d+32])
    PXOR(tmp, v2)
    MOVDQU([outp+d+32], tmp)
    MOVDQU(tmp, [inp+d+48])
    PXOR(tmp, v3)
    MOVDQU([outp+d+48], tmp)

# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will
# process 4/2/1 blocks at a time.
with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 0x20)

    # Build the counter increment vector on the stack, and allocate the
    # scratch space.
    xmm_v0 = XMMRegister()
    PXOR(xmm_v0, xmm_v0)
    SUB(registers.rsp, 16+16)
    MOVDQA([registers.rsp], xmm_v0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp], reg_tmp)
    mem_one = [registers.rsp]     # (Stack) Counter increment vector
    mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
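    # Scratch layout note (added for clarity; offsets are relative to the
    # adjusted rsp set up above):
    #   mem_one  = [rsp+ 0 .. rsp+15]: {1, 0} as 64-bit lanes, so PADDQ with it
    #                                  bumps the 64-bit block counter in the low qword.
    #   mem_tmp0 = [rsp+16 .. rsp+31]: spill slot for xmm_tmp inside the round loops.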

    mem_s0 = [reg_x]    # (Memory) Cipher state [0..3]
    mem_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
    mem_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
    mem_s3 = [reg_x+48] # (Memory) Cipher state [12..15]

    # xmm_v0 allocated above...
    xmm_v1 = XMMRegister()
    xmm_v2 = XMMRegister()
    xmm_v3 = XMMRegister()

    xmm_v4 = XMMRegister()
    xmm_v5 = XMMRegister()
    xmm_v6 = XMMRegister()
    xmm_v7 = XMMRegister()

    xmm_v8 = XMMRegister()
    xmm_v9 = XMMRegister()
    xmm_v10 = XMMRegister()
    xmm_v11 = XMMRegister()

    xmm_v12 = XMMRegister()
    xmm_v13 = XMMRegister()
    xmm_v14 = XMMRegister()
    xmm_v15 = XMMRegister()

    xmm_tmp = xmm_v12

    #
    # 4 blocks at a time.
    #

    reg_rounds = GeneralPurposeRegister64()

    vector_loop4 = Loop()
    SUB(reg_blocks, 4)
    JB(vector_loop4.end)
    with vector_loop4:
        MOVDQU(xmm_v0, mem_s0)
        MOVDQU(xmm_v1, mem_s1)
        MOVDQU(xmm_v2, mem_s2)
        MOVDQU(xmm_v3, mem_s3)

        MOVDQA(xmm_v4, xmm_v0)
        MOVDQA(xmm_v5, xmm_v1)
        MOVDQA(xmm_v6, xmm_v2)
        MOVDQA(xmm_v7, xmm_v3)
        PADDQ(xmm_v7, mem_one)

        MOVDQA(xmm_v8, xmm_v0)
        MOVDQA(xmm_v9, xmm_v1)
        MOVDQA(xmm_v10, xmm_v2)
        MOVDQA(xmm_v11, xmm_v7)
        PADDQ(xmm_v11, mem_one)

        MOVDQA(xmm_v12, xmm_v0)
        MOVDQA(xmm_v13, xmm_v1)
        MOVDQA(xmm_v14, xmm_v2)
        MOVDQA(xmm_v15, xmm_v11)
        PADDQ(xmm_v15, mem_one)

        MOV(reg_rounds, 20)
        rounds_loop4 = Loop()
        with rounds_loop4:
            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)

            MOVDQA(mem_tmp0, xmm_tmp) # Save

            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)

            MOVDQA(mem_tmp0, xmm_tmp) # Save

            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x39)
            PSHUFD(xmm_v5, xmm_v5, 0x39)
            PSHUFD(xmm_v9, xmm_v9, 0x39)
            PSHUFD(xmm_v13, xmm_v13, 0x39)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x93)
            PSHUFD(xmm_v7, xmm_v7, 0x93)
            PSHUFD(xmm_v11, xmm_v11, 0x93)
            PSHUFD(xmm_v15, xmm_v15, 0x93)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            # a += b; d ^= a; d = ROTW16(d);
            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)

            MOVDQA(mem_tmp0, xmm_tmp) # Save

            ROTW16_sse2(xmm_tmp, xmm_v3)
            ROTW16_sse2(xmm_tmp, xmm_v7)
            ROTW16_sse2(xmm_tmp, xmm_v11)
            ROTW16_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW12_sse2(xmm_tmp, xmm_v1)
            ROTW12_sse2(xmm_tmp, xmm_v5)
            ROTW12_sse2(xmm_tmp, xmm_v9)
            ROTW12_sse2(xmm_tmp, xmm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            PADDD(xmm_v0, xmm_v1)
            PADDD(xmm_v4, xmm_v5)
            PADDD(xmm_v8, xmm_v9)
            PADDD(xmm_v12, xmm_v13)
            PXOR(xmm_v3, xmm_v0)
            PXOR(xmm_v7, xmm_v4)
            PXOR(xmm_v11, xmm_v8)
            PXOR(xmm_v15, xmm_v12)

            MOVDQA(mem_tmp0, xmm_tmp) # Save

            ROTW8_sse2(xmm_tmp, xmm_v3)
            ROTW8_sse2(xmm_tmp, xmm_v7)
            ROTW8_sse2(xmm_tmp, xmm_v11)
            ROTW8_sse2(xmm_tmp, xmm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            PADDD(xmm_v2, xmm_v3)
            PADDD(xmm_v6, xmm_v7)
            PADDD(xmm_v10, xmm_v11)
            PADDD(xmm_v14, xmm_v15)
            PXOR(xmm_v1, xmm_v2)
            PXOR(xmm_v5, xmm_v6)
            PXOR(xmm_v9, xmm_v10)
            PXOR(xmm_v13, xmm_v14)
            ROTW7_sse2(xmm_tmp, xmm_v1)
            ROTW7_sse2(xmm_tmp, xmm_v5)
            ROTW7_sse2(xmm_tmp, xmm_v9)
            ROTW7_sse2(xmm_tmp, xmm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            PSHUFD(xmm_v1, xmm_v1, 0x93)
            PSHUFD(xmm_v5, xmm_v5, 0x93)
            PSHUFD(xmm_v9, xmm_v9, 0x93)
            PSHUFD(xmm_v13, xmm_v13, 0x93)
            PSHUFD(xmm_v2, xmm_v2, 0x4e)
            PSHUFD(xmm_v6, xmm_v6, 0x4e)
            PSHUFD(xmm_v10, xmm_v10, 0x4e)
            PSHUFD(xmm_v14, xmm_v14, 0x4e)
            PSHUFD(xmm_v3, xmm_v3, 0x39)
            PSHUFD(xmm_v7, xmm_v7, 0x39)
            PSHUFD(xmm_v11, xmm_v11, 0x39)
            PSHUFD(xmm_v15, xmm_v15, 0x39)

            MOVDQA(xmm_tmp, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop4.begin)

        MOVDQA(mem_tmp0, xmm_tmp)

        PADDD(xmm_v0, mem_s0)
        PADDD(xmm_v1, mem_s1)
        PADDD(xmm_v2, mem_s2)
        PADDD(xmm_v3, mem_s3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
        MOVDQU(xmm_v3, mem_s3)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v4, mem_s0)
        PADDD(xmm_v5, mem_s1)
        PADDD(xmm_v6, mem_s2)
        PADDD(xmm_v7, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
        PADDQ(xmm_v3, mem_one)

        PADDD(xmm_v8, mem_s0)
        PADDD(xmm_v9, mem_s1)
        PADDD(xmm_v10, mem_s2)
        PADDD(xmm_v11, xmm_v3)
        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
        PADDQ(xmm_v3, mem_one)

        MOVDQA(xmm_tmp, mem_tmp0)

        PADDD(xmm_v12, mem_s0)
        PADDD(xmm_v13, mem_s1)
        PADDD(xmm_v14, mem_s2)
        PADDD(xmm_v15, xmm_v3)
        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
        PADDQ(xmm_v3, mem_one)

        MOVDQU(mem_s3, xmm_v3)

        ADD(reg_inp, 4 * 64)
        ADD(reg_outp, 4 * 64)

        SUB(reg_blocks, 4)
        JAE(vector_loop4.begin)

    ADD(reg_blocks, 4)
    out = Label()
    JZ(out)

    # Past this point, we no longer need to use every single register to hold
    # the in-progress state.

    xmm_s0 = xmm_v8
    xmm_s1 = xmm_v9
    xmm_s2 = xmm_v10
    xmm_s3 = xmm_v11
    xmm_one = xmm_v13
    MOVDQU(xmm_s0, mem_s0)
    MOVDQU(xmm_s1, mem_s1)
    MOVDQU(xmm_s2, mem_s2)
    MOVDQU(xmm_s3, mem_s3)
    MOVDQA(xmm_one, mem_one)

    #
    # 2 blocks at a time.
    #

    process_1_block = Label()
    SUB(reg_blocks, 2)
    JB(process_1_block) # < 2 blocks remaining.

    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)

    MOVDQA(xmm_v4, xmm_v0)
    MOVDQA(xmm_v5, xmm_v1)
    MOVDQA(xmm_v6, xmm_v2)
    MOVDQA(xmm_v7, xmm_v3)
    PADDQ(xmm_v7, xmm_one)

    MOV(reg_rounds, 20)
    rounds_loop2 = Loop()
    with rounds_loop2:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v5, xmm_v5, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)
        PSHUFD(xmm_v7, xmm_v7, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW16_sse2(xmm_tmp, xmm_v3)
        ROTW16_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW12_sse2(xmm_tmp, xmm_v1)
        ROTW12_sse2(xmm_tmp, xmm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PADDD(xmm_v4, xmm_v5)
        PXOR(xmm_v3, xmm_v0)
        PXOR(xmm_v7, xmm_v4)
        ROTW8_sse2(xmm_tmp, xmm_v3)
        ROTW8_sse2(xmm_tmp, xmm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PADDD(xmm_v6, xmm_v7)
        PXOR(xmm_v1, xmm_v2)
        PXOR(xmm_v5, xmm_v6)
        ROTW7_sse2(xmm_tmp, xmm_v1)
        ROTW7_sse2(xmm_tmp, xmm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v5, xmm_v5, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v6, xmm_v6, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)
        PSHUFD(xmm_v7, xmm_v7, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop2.begin)

    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    PADDD(xmm_v4, xmm_s0)
    PADDD(xmm_v5, xmm_s1)
    PADDD(xmm_v6, xmm_s2)
    PADDD(xmm_v7, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
    PADDQ(xmm_s3, xmm_one)

    ADD(reg_inp, 2 * 64)
    ADD(reg_outp, 2 * 64)
    SUB(reg_blocks, 2)

    LABEL(process_1_block)
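    # Bookkeeping note (added for clarity): the SUB above may have taken
    # reg_blocks negative when fewer than 2 blocks remained, so adding 2 back
    # recovers the true remainder (0 or 1) before the single-block path.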
    ADD(reg_blocks, 2)
    out_serial = Label()
    JZ(out_serial)

    #
    # 1 block at a time. Only executed once, because if there were more than
    # 1, the parallel code would have processed it already.
    #

    MOVDQA(xmm_v0, xmm_s0)
    MOVDQA(xmm_v1, xmm_s1)
    MOVDQA(xmm_v2, xmm_s2)
    MOVDQA(xmm_v3, xmm_s3)

    MOV(reg_rounds, 20)
    rounds_loop1 = Loop()
    with rounds_loop1:
        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x39)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW16_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW12(b);
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW12_sse2(xmm_tmp, xmm_v1)

        # a += b; d ^= a; d = ROTW8(d);
        PADDD(xmm_v0, xmm_v1)
        PXOR(xmm_v3, xmm_v0)
        ROTW8_sse2(xmm_tmp, xmm_v3)

        # c += d; b ^= c; b = ROTW7(b)
        PADDD(xmm_v2, xmm_v3)
        PXOR(xmm_v1, xmm_v2)
        ROTW7_sse2(xmm_tmp, xmm_v1)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        PSHUFD(xmm_v1, xmm_v1, 0x93)
        PSHUFD(xmm_v2, xmm_v2, 0x4e)
        PSHUFD(xmm_v3, xmm_v3, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop1.begin)

    PADDD(xmm_v0, xmm_s0)
    PADDD(xmm_v1, xmm_s1)
    PADDD(xmm_v2, xmm_s2)
    PADDD(xmm_v3, xmm_s3)
    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
    PADDQ(xmm_s3, xmm_one)

    LABEL(out_serial)

    # Write back the updated counter. Stopping at 2^70 bytes is the user's
    # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks
    # because the counter is incremented in memory while looping.)
    MOVDQU(mem_s3, xmm_s3)

    LABEL(out)

    # Paranoia, cleanse the scratch space.
    PXOR(xmm_v0, xmm_v0)
    MOVDQA(mem_tmp0, xmm_v0)

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit,
# and more helpers are used to increase readability for destructive operations.
#
# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
#
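
# Reference sketch (added for documentation only, never called by the
# generator): the scalar ChaCha quarter round that the SSE2/AVX2 helpers in
# this file implement lane-wise on 32-bit words.
def _reference_quarter_round(a, b, c, d):
    def rotl32(v, n):
        return ((v << n) | (v >> (32 - n))) & 0xffffffff
    a = (a + b) & 0xffffffff; d = rotl32(d ^ a, 16)
    c = (c + d) & 0xffffffff; b = rotl32(b ^ c, 12)
    a = (a + b) & 0xffffffff; d = rotl32(d ^ a, 8)
    c = (c + d) & 0xffffffff; b = rotl32(b ^ c, 7)
    return a, b, c, d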

def ADD_avx2(dst, src):
    VPADDD(dst, dst, src)

def XOR_avx2(dst, src):
    VPXOR(dst, dst, src)

def ROTW16_avx2(tmp, d):
    VPSLLD(tmp, d, 16)
    VPSRLD(d, d, 16)
    XOR_avx2(d, tmp)

def ROTW12_avx2(tmp, b):
    VPSLLD(tmp, b, 12)
    VPSRLD(b, b, 20)
    XOR_avx2(b, tmp)

def ROTW8_avx2(tmp, d):
    VPSLLD(tmp, d, 8)
    VPSRLD(d, d, 24)
    XOR_avx2(d, tmp)

def ROTW7_avx2(tmp, b):
    VPSLLD(tmp, b, 7)
    VPSRLD(b, b, 25)
    XOR_avx2(b, tmp)

def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
    VPERM2I128(tmp, v0, v1, 0x20)
    VPXOR(tmp, tmp, [inp+d])
    VMOVDQU([outp+d], tmp)

    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
    VPERM2I128(tmp, v2, v3, 0x20)
    VPXOR(tmp, tmp, [inp+d+32])
    VMOVDQU([outp+d+32], tmp)

    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
    VPERM2I128(tmp, v0, v1, 0x31)
    VPXOR(tmp, tmp, [inp+d+64])
    VMOVDQU([outp+d+64], tmp)

    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
    VPERM2I128(tmp, v2, v3, 0x31)
    VPXOR(tmp, tmp, [inp+d+96])
    VMOVDQU([outp+d+96], tmp)

# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, and will process
# 8/4/2 blocks at a time.
with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
    reg_x = GeneralPurposeRegister64()
    reg_inp = GeneralPurposeRegister64()
    reg_outp = GeneralPurposeRegister64()
    reg_blocks = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_inp, inp)
    LOAD.ARGUMENT(reg_outp, outp)
    LOAD.ARGUMENT(reg_blocks, nrBlocks)

    # Align the stack to a 32 byte boundary.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 0x20)

    x_s0 = [reg_x]    # (Memory) Cipher state [0..3]
    x_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
    x_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
    x_s3 = [reg_x+48] # (Memory) Cipher state [12..15]

    ymm_v0 = YMMRegister()
    ymm_v1 = YMMRegister()
    ymm_v2 = YMMRegister()
    ymm_v3 = YMMRegister()

    ymm_v4 = YMMRegister()
    ymm_v5 = YMMRegister()
    ymm_v6 = YMMRegister()
    ymm_v7 = YMMRegister()

    ymm_v8 = YMMRegister()
    ymm_v9 = YMMRegister()
    ymm_v10 = YMMRegister()
    ymm_v11 = YMMRegister()

    ymm_v12 = YMMRegister()
    ymm_v13 = YMMRegister()
    ymm_v14 = YMMRegister()
    ymm_v15 = YMMRegister()

    ymm_tmp0 = ymm_v12

    # Allocate the necessary stack space for the counter vector and two ymm
    # registers that we will spill.
    SUB(registers.rsp, 96)
    mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space.
    mem_s3 = [registers.rsp+32]   # (Stack) Working copy of s3. (8x)
    mem_inc = [registers.rsp]     # (Stack) Counter increment vector.

    # Increment the counter for one side of the state vector.
    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
    VMOVDQU(mem_inc, ymm_tmp0)
    reg_tmp = GeneralPurposeRegister32()
    MOV(reg_tmp, 0x00000001)
    MOV([registers.rsp+16], reg_tmp)
    VBROADCASTI128(ymm_v3, x_s3)
    VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
    VMOVDQA(mem_s3, ymm_v3)

    # As we process 2xN blocks at a time, the counter increment for both
    # sides of the state vector is 2.
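    # Layout note (added for clarity): every ymm register holds two independent
    # 128-bit ChaCha states side by side. The low lane of mem_s3 keeps the
    # original counter and the high lane was just bumped by 1 above, so each
    # vector step consumes two consecutive blocks and the per-lane counter
    # increment below is 2.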
    MOV(reg_tmp, 0x00000002)
    MOV([registers.rsp], reg_tmp)
    MOV([registers.rsp+16], reg_tmp)

    out_write_even = Label()
    out_write_odd = Label()

    #
    # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's
    # a decent gain despite all the pain...
    #

    reg_rounds = GeneralPurposeRegister64()

    vector_loop8 = Loop()
    SUB(reg_blocks, 8)
    JB(vector_loop8.end)
    with vector_loop8:
        VBROADCASTI128(ymm_v0, x_s0)
        VBROADCASTI128(ymm_v1, x_s1)
        VBROADCASTI128(ymm_v2, x_s2)
        VMOVDQA(ymm_v3, mem_s3)

        VMOVDQA(ymm_v4, ymm_v0)
        VMOVDQA(ymm_v5, ymm_v1)
        VMOVDQA(ymm_v6, ymm_v2)
        VPADDQ(ymm_v7, ymm_v3, mem_inc)

        VMOVDQA(ymm_v8, ymm_v0)
        VMOVDQA(ymm_v9, ymm_v1)
        VMOVDQA(ymm_v10, ymm_v2)
        VPADDQ(ymm_v11, ymm_v7, mem_inc)

        VMOVDQA(ymm_v12, ymm_v0)
        VMOVDQA(ymm_v13, ymm_v1)
        VMOVDQA(ymm_v14, ymm_v2)
        VPADDQ(ymm_v15, ymm_v11, mem_inc)

        MOV(reg_rounds, 20)
        rounds_loop8 = Loop()
        with rounds_loop8:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)

            VMOVDQA(mem_tmp0, ymm_tmp0) # Save

            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)

            VMOVDQA(mem_tmp0, ymm_tmp0) # Save

            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v5, ymm_v5, 0x39)
            VPSHUFD(ymm_v9, ymm_v9, 0x39)
            VPSHUFD(ymm_v13, ymm_v13, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)
            VPSHUFD(ymm_v7, ymm_v7, 0x93)
            VPSHUFD(ymm_v11, ymm_v11, 0x93)
            VPSHUFD(ymm_v15, ymm_v15, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)

            VMOVDQA(mem_tmp0, ymm_tmp0) # Save

            ROTW16_avx2(ymm_tmp0, ymm_v3)
            ROTW16_avx2(ymm_tmp0, ymm_v7)
            ROTW16_avx2(ymm_tmp0, ymm_v11)
            ROTW16_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW12_avx2(ymm_tmp0, ymm_v1)
            ROTW12_avx2(ymm_tmp0, ymm_v5)
            ROTW12_avx2(ymm_tmp0, ymm_v9)
            ROTW12_avx2(ymm_tmp0, ymm_v13)

            # a += b; d ^= a; d = ROTW8(d);
            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            ADD_avx2(ymm_v0, ymm_v1)
            ADD_avx2(ymm_v4, ymm_v5)
            ADD_avx2(ymm_v8, ymm_v9)
            ADD_avx2(ymm_v12, ymm_v13)
            XOR_avx2(ymm_v3, ymm_v0)
            XOR_avx2(ymm_v7, ymm_v4)
            XOR_avx2(ymm_v11, ymm_v8)
            XOR_avx2(ymm_v15, ymm_v12)

            VMOVDQA(mem_tmp0, ymm_tmp0) # Save

            ROTW8_avx2(ymm_tmp0, ymm_v3)
            ROTW8_avx2(ymm_tmp0, ymm_v7)
            ROTW8_avx2(ymm_tmp0, ymm_v11)
            ROTW8_avx2(ymm_tmp0, ymm_v15)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            ADD_avx2(ymm_v6, ymm_v7)
            ADD_avx2(ymm_v10, ymm_v11)
            ADD_avx2(ymm_v14, ymm_v15)
            XOR_avx2(ymm_v1, ymm_v2)
            XOR_avx2(ymm_v5, ymm_v6)
            XOR_avx2(ymm_v9, ymm_v10)
            XOR_avx2(ymm_v13, ymm_v14)
            ROTW7_avx2(ymm_tmp0, ymm_v1)
            ROTW7_avx2(ymm_tmp0, ymm_v5)
            ROTW7_avx2(ymm_tmp0, ymm_v9)
            ROTW7_avx2(ymm_tmp0, ymm_v13)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v5, ymm_v5, 0x93)
            VPSHUFD(ymm_v9, ymm_v9, 0x93)
            VPSHUFD(ymm_v13, ymm_v13, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)
            VPSHUFD(ymm_v7, ymm_v7, 0x39)
            VPSHUFD(ymm_v11, ymm_v11, 0x39)
            VPSHUFD(ymm_v15, ymm_v15, 0x39)

            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore

            SUB(reg_rounds, 2)
            JNZ(rounds_loop8.begin)

        # ymm_v12 is in mem_tmp0 and is current....

        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
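        # Finalization note (added for clarity): ymm_tmp0 aliases ymm_v12, so
        # the fourth block pair's finalized "a" row is computed into the spill
        # slot first (ADD into ymm_tmp0, stored back to mem_tmp0) and reloaded
        # into ymm_v12 just before the last WriteXor below.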
        VBROADCASTI128(ymm_tmp0, x_s0)
        ADD_avx2(ymm_v0, ymm_tmp0)
        ADD_avx2(ymm_v4, ymm_tmp0)
        ADD_avx2(ymm_v8, ymm_tmp0)
        ADD_avx2(ymm_tmp0, mem_tmp0)
        VMOVDQA(mem_tmp0, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s1)
        ADD_avx2(ymm_v1, ymm_tmp0)
        ADD_avx2(ymm_v5, ymm_tmp0)
        ADD_avx2(ymm_v9, ymm_tmp0)
        ADD_avx2(ymm_v13, ymm_tmp0)

        VBROADCASTI128(ymm_tmp0, x_s2)
        ADD_avx2(ymm_v2, ymm_tmp0)
        ADD_avx2(ymm_v6, ymm_tmp0)
        ADD_avx2(ymm_v10, ymm_tmp0)
        ADD_avx2(ymm_v14, ymm_tmp0)

        ADD_avx2(ymm_v3, mem_s3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
        VMOVDQA(ymm_v3, mem_s3)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v7, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
        ADD_avx2(ymm_v3, mem_inc)

        ADD_avx2(ymm_v11, ymm_v3)
        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(ymm_v12, mem_tmp0)
        ADD_avx2(ymm_v15, ymm_v3)
        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
        ADD_avx2(ymm_v3, mem_inc)

        VMOVDQA(mem_s3, ymm_v3)

        ADD(reg_inp, 8 * 64)
        ADD(reg_outp, 8 * 64)

        SUB(reg_blocks, 8)
        JAE(vector_loop8.begin)

    # ymm_v3 contains a current copy of mem_s3, either from when it was built
    # or because the loop updates it. Copy this before we mess with the block
    # counter in case we need to write it back and return.
    ymm_s3 = ymm_v11
    VMOVDQA(ymm_s3, ymm_v3)

    ADD(reg_blocks, 8)
    JZ(out_write_even)

    # We now actually can do everything in registers.
    ymm_s0 = ymm_v8
    VBROADCASTI128(ymm_s0, x_s0)
    ymm_s1 = ymm_v9
    VBROADCASTI128(ymm_s1, x_s1)
    ymm_s2 = ymm_v10
    VBROADCASTI128(ymm_s2, x_s2)
    ymm_inc = ymm_v14
    VMOVDQA(ymm_inc, mem_inc)

    #
    # 4 blocks at a time.
    #

    process_2_blocks = Label()
    SUB(reg_blocks, 4)
    JB(process_2_blocks) # < 4 blocks remaining.

    VMOVDQA(ymm_v0, ymm_s0)
    VMOVDQA(ymm_v1, ymm_s1)
    VMOVDQA(ymm_v2, ymm_s2)
    VMOVDQA(ymm_v3, ymm_s3)

    VMOVDQA(ymm_v4, ymm_v0)
    VMOVDQA(ymm_v5, ymm_v1)
    VMOVDQA(ymm_v6, ymm_v2)
    VPADDQ(ymm_v7, ymm_v3, ymm_inc)

    MOV(reg_rounds, 20)
    rounds_loop4 = Loop()
    with rounds_loop4:
        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x39)
        VPSHUFD(ymm_v5, ymm_v5, 0x39)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x93)
        VPSHUFD(ymm_v7, ymm_v7, 0x93)

        # a += b; d ^= a; d = ROTW16(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW16_avx2(ymm_tmp0, ymm_v3)
        ROTW16_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW12(b);
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW12_avx2(ymm_tmp0, ymm_v1)
        ROTW12_avx2(ymm_tmp0, ymm_v5)

        # a += b; d ^= a; d = ROTW8(d);
        ADD_avx2(ymm_v0, ymm_v1)
        ADD_avx2(ymm_v4, ymm_v5)
        XOR_avx2(ymm_v3, ymm_v0)
        XOR_avx2(ymm_v7, ymm_v4)
        ROTW8_avx2(ymm_tmp0, ymm_v3)
        ROTW8_avx2(ymm_tmp0, ymm_v7)

        # c += d; b ^= c; b = ROTW7(b)
        ADD_avx2(ymm_v2, ymm_v3)
        ADD_avx2(ymm_v6, ymm_v7)
        XOR_avx2(ymm_v1, ymm_v2)
        XOR_avx2(ymm_v5, ymm_v6)
        ROTW7_avx2(ymm_tmp0, ymm_v1)
        ROTW7_avx2(ymm_tmp0, ymm_v5)

        # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
        VPSHUFD(ymm_v1, ymm_v1, 0x93)
        VPSHUFD(ymm_v5, ymm_v5, 0x93)
        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
        VPSHUFD(ymm_v3, ymm_v3, 0x39)
        VPSHUFD(ymm_v7, ymm_v7, 0x39)

        SUB(reg_rounds, 2)
        JNZ(rounds_loop4.begin)

    ADD_avx2(ymm_v0, ymm_s0)
    ADD_avx2(ymm_v1, ymm_s1)
    ADD_avx2(ymm_v2, ymm_s2)
    ADD_avx2(ymm_v3, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
    ADD_avx2(ymm_s3, ymm_inc)

    ADD_avx2(ymm_v4, ymm_s0)
    ADD_avx2(ymm_v5, ymm_s1)
    ADD_avx2(ymm_v6, ymm_s2)
    ADD_avx2(ymm_v7, ymm_s3)
    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
    ADD_avx2(ymm_s3, ymm_inc)

    ADD(reg_inp, 4 * 64)
    ADD(reg_outp, 4 * 64)
    SUB(reg_blocks, 4)

    LABEL(process_2_blocks)
    ADD(reg_blocks, 4)
    JZ(out_write_even) # 0 blocks left.

    #
    # 2/1 blocks at a time. The two codepaths are unified because
    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
    # much.
    #
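    # Handling note (added for clarity): each pass below computes two blocks,
    # one per 128-bit lane. With a single block left, only the first 64 bytes
    # of keystream are written, and out_write_odd later swaps the lanes of
    # ymm_s3 so the low lane holds the correct next counter on write-back.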

    vector_loop2 = Loop()
    with vector_loop2:
        VMOVDQA(ymm_v0, ymm_s0)
        VMOVDQA(ymm_v1, ymm_s1)
        VMOVDQA(ymm_v2, ymm_s2)
        VMOVDQA(ymm_v3, ymm_s3)

        MOV(reg_rounds, 20)
        rounds_loop2 = Loop()
        with rounds_loop2:
            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x39)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x93)

            # a += b; d ^= a; d = ROTW16(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW16_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW12(b);
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW12_avx2(ymm_tmp0, ymm_v1)

            # a += b; d ^= a; d = ROTW8(d);
            ADD_avx2(ymm_v0, ymm_v1)
            XOR_avx2(ymm_v3, ymm_v0)
            ROTW8_avx2(ymm_tmp0, ymm_v3)

            # c += d; b ^= c; b = ROTW7(b)
            ADD_avx2(ymm_v2, ymm_v3)
            XOR_avx2(ymm_v1, ymm_v2)
            ROTW7_avx2(ymm_tmp0, ymm_v1)

            # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
            VPSHUFD(ymm_v1, ymm_v1, 0x93)
            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
            VPSHUFD(ymm_v3, ymm_v3, 0x39)

            SUB(reg_rounds, 2)
            JNZ(rounds_loop2.begin)

        ADD_avx2(ymm_v0, ymm_s0)
        ADD_avx2(ymm_v1, ymm_s1)
        ADD_avx2(ymm_v2, ymm_s2)
        ADD_avx2(ymm_v3, ymm_s3)

        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
        VMOVDQU([reg_outp], ymm_tmp0)

        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
        VMOVDQU([reg_outp+32], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_odd)

        ADD_avx2(ymm_s3, ymm_inc)

        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
        VMOVDQU([reg_outp+64], ymm_tmp0)

        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
        VMOVDQU([reg_outp+96], ymm_tmp0)

        SUB(reg_blocks, 1)
        JZ(out_write_even)

        ADD(reg_inp, 2 * 64)
        ADD(reg_outp, 2 * 64)
        JMP(vector_loop2.begin)

    LABEL(out_write_odd)
    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.

    LABEL(out_write_even)
    VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_s3.

    # Paranoia, cleanse the scratch space.
    VPXOR(ymm_v0, ymm_v0, ymm_v0)
    VMOVDQA(mem_tmp0, ymm_v0)
    VMOVDQA(mem_s3, ymm_v0)

    # Clear all YMM (and XMM) registers.
    VZEROALL()

    # Remove our stack allocation.
    MOV(registers.rsp, reg_sp_save)

    RETURN()

#
# CPUID
#

cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAmd64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()

#
# XGETBV (ECX = 0)
#

xcrVec = Argument(ptr(uint32_t))

with Function("xgetbv0Amd64", (xcrVec,)):
    reg_vec = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_vec, xcrVec)

    XOR(registers.ecx, registers.ecx)

    XGETBV()

    MOV([reg_vec], registers.eax)
    MOV([reg_vec+4], registers.edx)

    RETURN()
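
# Hedged sketch (added for documentation, not emitted into the assembly): one
# way a caller could combine the cpuidAmd64/xgetbv0Amd64 results to choose
# between blocksAmd64SSE2 and blocksAmd64AVX2. The helper name and argument
# layout here are illustrative only; the Go side of the package performs the
# actual dispatch.
def _avx2_usable(cpuid7_ebx, xcr0_low):
    has_avx2 = bool(cpuid7_ebx & (1 << 5))   # CPUID.(EAX=7,ECX=0):EBX bit 5
    ymm_state_ok = (xcr0_low & 0x6) == 0x6   # XCR0 bits 1 (SSE) and 2 (AVX) enabled
    return has_avx2 and ymm_state_ok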