#!/usr/bin/env python3
#
# To the extent possible under law, Yawning Angel has waived all copyright
# and related or neighboring rights to aez, using the Creative
# Commons "CC0" public domain dedication. See LICENSE or
# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.

#
# Dependencies: https://github.com/Maratyszcza/PeachPy
#
# python3 -m peachpy.x86_64 -mabi=goasm -S -o aez_amd64.s aez_amd64.py
#

from peachpy import *
from peachpy.x86_64 import *

cpuidParams = Argument(ptr(uint32_t))

with Function("cpuidAMD64", (cpuidParams,)):
    reg_params = registers.r15
    LOAD.ARGUMENT(reg_params, cpuidParams)

    MOV(registers.eax, [reg_params])
    MOV(registers.ecx, [reg_params+8])

    CPUID()

    MOV([reg_params], registers.eax)
    MOV([reg_params+4], registers.ebx)
    MOV([reg_params+8], registers.ecx)
    MOV([reg_params+12], registers.edx)

    RETURN()

with Function("resetAMD64SSE2", ()):
    PXOR(registers.xmm0, registers.xmm0)
    PXOR(registers.xmm1, registers.xmm1)
    PXOR(registers.xmm2, registers.xmm2)
    PXOR(registers.xmm3, registers.xmm3)
    PXOR(registers.xmm4, registers.xmm4)
    PXOR(registers.xmm5, registers.xmm5)
    PXOR(registers.xmm6, registers.xmm6)
    PXOR(registers.xmm7, registers.xmm7)
    PXOR(registers.xmm8, registers.xmm8)
    PXOR(registers.xmm9, registers.xmm9)
    PXOR(registers.xmm10, registers.xmm10)
    PXOR(registers.xmm11, registers.xmm11)
    PXOR(registers.xmm12, registers.xmm12)
    PXOR(registers.xmm13, registers.xmm13)
    PXOR(registers.xmm14, registers.xmm14)
    PXOR(registers.xmm15, registers.xmm15)
    RETURN()

a = Argument(ptr(const_uint8_t))
b = Argument(ptr(const_uint8_t))
c = Argument(ptr(const_uint8_t))
d = Argument(ptr(const_uint8_t))
dst = Argument(ptr(uint8_t))

with Function("xorBytes1x16AMD64SSE2", (a, b, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()

    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])

    PXOR(xmm_a, xmm_b)

    MOVDQU([reg_dst], xmm_a)

    RETURN()

with Function("xorBytes4x16AMD64SSE2", (a, b, c, d, dst)):
    reg_a = GeneralPurposeRegister64()
    reg_b = GeneralPurposeRegister64()
    reg_c = GeneralPurposeRegister64()
    reg_d = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_a, a)
    LOAD.ARGUMENT(reg_b, b)
    LOAD.ARGUMENT(reg_c, c)
    LOAD.ARGUMENT(reg_d, d)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_a = XMMRegister()
    xmm_b = XMMRegister()
    xmm_c = XMMRegister()
    xmm_d = XMMRegister()

    MOVDQU(xmm_a, [reg_a])
    MOVDQU(xmm_b, [reg_b])
    MOVDQU(xmm_c, [reg_c])
    MOVDQU(xmm_d, [reg_d])

    PXOR(xmm_a, xmm_b)
    PXOR(xmm_c, xmm_d)
    PXOR(xmm_a, xmm_c)

    MOVDQU([reg_dst], xmm_a)

    RETURN()
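
# Reference model (illustrative only, never called during code generation):
# the two stubs above are plain bytewise XORs of 16-byte blocks.
def _ref_xor_bytes_1x16(a, b):
    return bytes(x ^ y for x, y in zip(a, b))

def _ref_xor_bytes_4x16(a, b, c, d):
    return bytes(w ^ x ^ y ^ z for w, x, y, z in zip(a, b, c, d))
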
#
# AES-NI helper functions.
#
def aesenc4x1(o, j, i, l, z):
    AESENC(o, j)
    AESENC(o, i)
    AESENC(o, l)
    AESENC(o, z)

def aesenc4x2(o0, o1, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o0, z)
    AESENC(o1, z)

def aesenc4x4(o0, o1, o2, o3, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)

def aesenc4x8(o0, o1, o2, o3, o4, o5, o6, o7, j, i, l, z):
    AESENC(o0, j)
    AESENC(o1, j)
    AESENC(o2, j)
    AESENC(o3, j)
    AESENC(o4, j)
    AESENC(o5, j)
    AESENC(o6, j)
    AESENC(o7, j)
    AESENC(o0, i)
    AESENC(o1, i)
    AESENC(o2, i)
    AESENC(o3, i)
    AESENC(o4, i)
    AESENC(o5, i)
    AESENC(o6, i)
    AESENC(o7, i)
    AESENC(o0, l)
    AESENC(o1, l)
    AESENC(o2, l)
    AESENC(o3, l)
    AESENC(o4, l)
    AESENC(o5, l)
    AESENC(o6, l)
    AESENC(o7, l)
    AESENC(o0, z)
    AESENC(o1, z)
    AESENC(o2, z)
    AESENC(o3, z)
    AESENC(o4, z)
    AESENC(o5, z)
    AESENC(o6, z)
    AESENC(o7, z)

#
# Sigh. PeachPy has "interesting" ideas of definitions for certain things,
# so just use the `zen` uarch, because it supports everything.
#

j = Argument(ptr(const_uint8_t))
i = Argument(ptr(const_uint8_t))
l = Argument(ptr(const_uint8_t))
k = Argument(ptr(const_uint8_t))
src = Argument(ptr(uint8_t))

with Function("aezAES4AMD64AESNI", (j, i, l, k, src, dst), target=uarch.zen):
    reg_j = GeneralPurposeRegister64()
    reg_i = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_i, i)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_j = XMMRegister()
    xmm_i = XMMRegister()
    xmm_l = XMMRegister()
    xmm_zero = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQA(xmm_j, [reg_j])
    MOVDQA(xmm_i, [reg_i])
    MOVDQA(xmm_l, [reg_l])

    PXOR(xmm_state, xmm_j)
    PXOR(xmm_i, xmm_l)
    PXOR(xmm_state, xmm_i)
    PXOR(xmm_zero, xmm_zero)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    aesenc4x1(xmm_state, xmm_j, xmm_i, xmm_l, xmm_zero)

    MOVDQU([reg_dst], xmm_state)

    RETURN()

with Function("aezAES10AMD64AESNI", (l, k, src, dst), target=uarch.zen):
    reg_l = GeneralPurposeRegister64()
    reg_k = GeneralPurposeRegister64()
    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_k, k)
    LOAD.ARGUMENT(reg_src, src)
    LOAD.ARGUMENT(reg_dst, dst)

    xmm_state = XMMRegister()
    xmm_i = XMMRegister()
    xmm_j = XMMRegister()
    xmm_l = XMMRegister()

    MOVDQU(xmm_state, [reg_src])
    MOVDQU(xmm_l, [reg_l])

    PXOR(xmm_state, xmm_l)

    MOVDQA(xmm_i, [reg_k])
    MOVDQA(xmm_j, [reg_k+16])
    MOVDQA(xmm_l, [reg_k+32])

    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)
    AESENC(xmm_state, xmm_j)
    AESENC(xmm_state, xmm_l)
    AESENC(xmm_state, xmm_i)

    MOVDQU([reg_dst], xmm_state)

    RETURN()
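
# Round structure of the two stubs above, in plain Python over 128-bit
# integers (a reference sketch only; `aes_round(blk, rk)` is a
# caller-supplied stand-in for one AESENC step, and keys = (I, J, L)):
def _ref_aes4(blk, j, i, l, keys, aes_round):
    I, J, L = keys
    blk ^= j ^ i ^ l                  # input whitening, as in the stub
    for rk in (J, I, L, 0):           # four rounds, final round key is zero
        blk = aes_round(blk, rk)
    return blk

def _ref_aes10(blk, l, keys, aes_round):
    I, J, L = keys
    blk ^= l                          # input whitening
    for rk in (I, J, L) * 3 + (I,):   # ten rounds, keys cycling I, J, L
        blk = aes_round(blk, rk)
    return blk
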
def doubleBlock(blk, tmp0, tmp1, c):
    MOVDQA(tmp0, [c])
    PSHUFB(blk, tmp0)
    MOVDQA(tmp1, blk)
    PSRAD(tmp1, 31)
    PAND(tmp1, [c+16])
    PSHUFD(tmp1, tmp1, 0x93)
    PSLLD(blk, 1)
    PXOR(blk, tmp1)
    PSHUFB(blk, tmp0)
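
# Reference model (illustrative only; based on the AEZ specification's
# doubling, i.e. multiplication by 2 in GF(2^128)): shift the big-endian
# block left one bit and fold the carry back in with 0x87. The SSE2
# sequence above appears to do the same thing lane-wise, using the
# constant table for the byte-order shuffle and the carry mask.
def _ref_double_block(blk):
    v = int.from_bytes(blk, "big")
    v = ((v << 1) & ((1 << 128) - 1)) ^ (0x87 if v >> 127 else 0)
    return v.to_bytes(16, "big")
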
x = Argument(ptr(uint8_t))
consts = Argument(ptr(const_uint8_t))
sz = Argument(ptr(size_t))

with Function("aezCorePass1AMD64AESNI", (src, dst, x, i, l, k, consts, sz), target=uarch.zen):
    # This would be better as a port of the aesni pass_one() routine,
    # however that requires storing some intermediaries in reversed
    # form.

    reg_src = GeneralPurposeRegister64()
    reg_dst = GeneralPurposeRegister64()
    reg_x = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_src, src)      # src pointer
    LOAD.ARGUMENT(reg_dst, dst)      # dst pointer
    LOAD.ARGUMENT(reg_x, x)
    LOAD.ARGUMENT(reg_l, l)          # e.L[]
    LOAD.ARGUMENT(reg_bytes, sz)     # bytes remaining
    MOV(reg_idx, 1)                  # Index into e.L[]

    xmm_j = XMMRegister()            # AESENC Round key J
    xmm_i = XMMRegister()            # AESENC Round key I
    xmm_l = XMMRegister()            # AESENC Round key L
    xmm_x = XMMRegister()            # Checksum X
    xmm_iDbl = XMMRegister()         # e.I[1]
    xmm_tmp0 = XMMRegister()
    xmm_tmp1 = XMMRegister()
    xmm_zero = XMMRegister()         # [16]byte{0x00}

    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    MOVDQU(xmm_x, [reg_x])

    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])

    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])

    LOAD.ARGUMENT(reg_tmp, consts)   # doubleBlock constants

    PXOR(xmm_zero, xmm_zero)

    # Process 16 * 16 bytes at a time in a loop.
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
    with vector_loop256:
        # TODO: Make better use of registers, optimize scheduling.

        # o0 = aes4(o0 ^ J ^ I ^ L[1], keys)  // E(1,1)
        # o1 = aes4(o1 ^ J ^ I ^ L[2], keys)  // E(1,2)
        # o2 = aes4(o2 ^ J ^ I ^ L[3], keys)  // E(1,3)
        # o3 = aes4(o3 ^ J ^ I ^ L[4], keys)  // E(1,4)
        # o4 = aes4(o4 ^ J ^ I ^ L[5], keys)  // E(1,5)
        # o5 = aes4(o5 ^ J ^ I ^ L[6], keys)  // E(1,6)
        # o6 = aes4(o6 ^ J ^ I ^ L[7], keys)  // E(1,7)
        # o7 = aes4(o7 ^ J ^ I ^ L[0], keys)  // E(1,0)
        MOVDQU(xmm_o0, [reg_src+16])
        MOVDQU(xmm_o1, [reg_src+48])
        MOVDQU(xmm_o2, [reg_src+80])
        MOVDQU(xmm_o3, [reg_src+112])
        MOVDQU(xmm_o4, [reg_src+144])
        MOVDQU(xmm_o5, [reg_src+176])
        MOVDQU(xmm_o6, [reg_src+208])
        MOVDQU(xmm_o7, [reg_src+240])
        MOVDQA(xmm_tmp0, xmm_j)      # tmp = j ^ iDbl
        PXOR(xmm_tmp0, xmm_iDbl)
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp0)
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp0)
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp0)
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp0)
        PXOR(xmm_o0, [reg_l+16])     # L[1]
        PXOR(xmm_o1, [reg_l+32])     # L[2]
        PXOR(xmm_o2, [reg_l+48])     # L[3]
        PXOR(xmm_o3, [reg_l+64])     # L[4]
        PXOR(xmm_o4, [reg_l+80])     # L[5]
        PXOR(xmm_o5, [reg_l+96])     # L[6]
        PXOR(xmm_o6, [reg_l+112])    # L[7]
        PXOR(xmm_o7, [reg_l])        # L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[   :] = in[   :] ^ o0
        # dst[ 32:] = in[ 32:] ^ o1
        # dst[ 64:] = in[ 64:] ^ o2
        # dst[ 96:] = in[ 96:] ^ o3
        # dst[128:] = in[128:] ^ o4
        # dst[160:] = in[160:] ^ o5
        # dst[192:] = in[192:] ^ o6
        # dst[224:] = in[224:] ^ o7
        MOVDQU(xmm_tmp0, [reg_src])
        MOVDQU(xmm_tmp1, [reg_src+32])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+64])
        MOVDQU(xmm_tmp1, [reg_src+96])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+128])
        MOVDQU(xmm_tmp1, [reg_src+160])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+192])
        MOVDQU(xmm_tmp1, [reg_src+224])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # o0 = aes4(o0 ^ I, keys)  // E(0,0)
        # o1 = aes4(o1 ^ I, keys)  // E(0,0)
        # o2 = aes4(o2 ^ I, keys)  // E(0,0)
        # o3 = aes4(o3 ^ I, keys)  // E(0,0)
        # o4 = aes4(o4 ^ I, keys)  // E(0,0)
        # o5 = aes4(o5 ^ I, keys)  // E(0,0)
        # o6 = aes4(o6 ^ I, keys)  // E(0,0)
        # o7 = aes4(o7 ^ I, keys)  // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst[ 16:] = o0 ^ in[ 16:]
        # dst[ 48:] = o1 ^ in[ 48:]
        # dst[ 80:] = o2 ^ in[ 80:]
        # dst[112:] = o3 ^ in[112:]
        # dst[144:] = o4 ^ in[144:]
        # dst[176:] = o5 ^ in[176:]
        # dst[208:] = o6 ^ in[208:]
        # dst[240:] = o7 ^ in[240:]
        MOVDQU(xmm_tmp0, [reg_src+16])
        MOVDQU(xmm_tmp1, [reg_src+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+80])
        MOVDQU(xmm_tmp1, [reg_src+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+144])
        MOVDQU(xmm_tmp1, [reg_src+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQU(xmm_tmp0, [reg_src+208])
        MOVDQU(xmm_tmp1, [reg_src+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # X ^= o0 ^ o1 ^ ... ^ o7
        PXOR(xmm_x, xmm_o0)
        PXOR(xmm_x, xmm_o1)
        PXOR(xmm_x, xmm_o2)
        PXOR(xmm_x, xmm_o3)
        PXOR(xmm_x, xmm_o4)
        PXOR(xmm_x, xmm_o5)
        PXOR(xmm_x, xmm_o6)
        PXOR(xmm_x, xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)

        # Update book keeping.
        ADD(reg_src, 256)
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)

    ADD(reg_bytes, 256)
    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_src_l0 = xmm_tmp0
    xmm_src_l1 = xmm_tmp1
    xmm_src_r0 = xmm_o4    # Change these at your peril (tmp0 used in 8 * 16 path)
    xmm_src_r1 = xmm_o5
    xmm_src_r2 = xmm_o6
    xmm_src_r3 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(o0 ^ J ^ I ^ L[1], keys)  // E(1,1)
    # o1 = aes4(o1 ^ J ^ I ^ L[2], keys)  // E(1,2)
    # o2 = aes4(o2 ^ J ^ I ^ L[3], keys)  // E(1,3)
    # o3 = aes4(o3 ^ J ^ I ^ L[4], keys)  // E(1,4)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQU(xmm_src_r2, [reg_src+80])
    MOVDQU(xmm_src_r3, [reg_src+112])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    MOVDQU(xmm_o2, xmm_src_r2)
    MOVDQU(xmm_o3, xmm_src_r3)
    MOVDQA(xmm_tmp0, xmm_j)      # tmp0 (src_l0) = j ^ iDbl
    PXOR(xmm_tmp0, xmm_iDbl)
    PXOR(xmm_o0, xmm_tmp0)
    PXOR(xmm_o1, xmm_tmp0)
    PXOR(xmm_o2, xmm_tmp0)
    PXOR(xmm_o3, xmm_tmp0)
    PXOR(xmm_o0, [reg_l+16])     # L[1]
    PXOR(xmm_o1, [reg_l+32])     # L[2]
    PXOR(xmm_o2, [reg_l+48])     # L[3]
    PXOR(xmm_o3, [reg_l+64])     # L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[  :] = in[  :] ^ o0
    # dst[32:] = in[32:] ^ o1
    # dst[64:] = in[64:] ^ o2
    # dst[96:] = in[96:] ^ o3
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU(xmm_src_l0, [reg_src+64])
    MOVDQU(xmm_src_l1, [reg_src+96])
    PXOR(xmm_o2, xmm_src_l0)
    PXOR(xmm_o3, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # o0 = aes4(o0 ^ I, keys)  // E(0,0)
    # o1 = aes4(o1 ^ I, keys)  // E(0,0)
    # o2 = aes4(o2 ^ I, keys)  // E(0,0)
    # o3 = aes4(o3 ^ I, keys)  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[ 16:] = o0 ^ in[ 16:]
    # dst[ 48:] = o1 ^ in[ 48:]
    # dst[ 80:] = o2 ^ in[ 80:]
    # dst[112:] = o3 ^ in[112:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    PXOR(xmm_o2, xmm_src_r2)
    PXOR(xmm_o3, xmm_src_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # X ^= o0 ^ o1 ^ o2 ^ o3
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)
    PXOR(xmm_x, xmm_o2)
    PXOR(xmm_x, xmm_o3)
    # Update book keeping.
    ADD(reg_src, 128)
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)
    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #

    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l)     # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[(i+0)%8], keys)  // E(1,i)
    # o1 = aes4(o1 ^ J ^ I ^ L[(i+1)%8], keys)  // E(1,i+1)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQU(xmm_src_r1, [reg_src+48])
    MOVDQA(xmm_o0, xmm_src_r0)
    MOVDQA(xmm_o1, xmm_src_r1)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o1, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_l_offset])     # L[i]
    PXOR(xmm_o1, [reg_l_offset+16])  # L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[  :] = in[  :] ^ o0
    # dst[32:] = in[32:] ^ o1
    MOVDQU(xmm_src_l0, [reg_src])
    MOVDQU(xmm_src_l1, [reg_src+32])
    PXOR(xmm_o0, xmm_src_l0)
    PXOR(xmm_o1, xmm_src_l1)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)

    # o0 = aes4(o0 ^ I, keys)  // E(0,0)
    # o1 = aes4(o1 ^ I, keys)  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    # dst[48:] = o1 ^ in[48:]
    PXOR(xmm_o0, xmm_src_r0)
    PXOR(xmm_o1, xmm_src_r1)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)

    # X ^= o0 ^ o1
    PXOR(xmm_x, xmm_o0)
    PXOR(xmm_x, xmm_o1)

    # Update book keeping.
    ADD(reg_src, 64)
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)
    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table. This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)          # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(o0 ^ J ^ I ^ L[i%8], keys)  // E(1,i)
    MOVDQU(xmm_src_r0, [reg_src+16])
    MOVDQA(xmm_o0, xmm_src_r0)
    PXOR(xmm_o0, xmm_j)
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[:] = in[:] ^ o0
    MOVDQU(xmm_src_l0, [reg_src])
    PXOR(xmm_o0, xmm_src_l0)
    MOVDQU([reg_dst], xmm_o0)

    # o0 = aes4(o0 ^ I, keys)  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst[16:] = o0 ^ in[16:]
    PXOR(xmm_o0, xmm_src_r0)
    MOVDQU([reg_dst+16], xmm_o0)

    # X ^= o0
    PXOR(xmm_x, xmm_o0)

    LABEL(out)

    # Write back X.
    MOVDQU([reg_x], xmm_x)

    RETURN()
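
# What each (in_l, in_r) 32-byte column above computes, as plain Python over
# 128-bit integers (a reference sketch following the function's in-line
# comments; `aes4(x)` stands in for the 4-round primitive, I2 for the
# running doubled e.I[1], and L_i for L[i%8]):
def _ref_pass1_pair(in_l, in_r, J, I2, L_i, I, aes4):
    dst_l = in_l ^ aes4(in_r ^ J ^ I2 ^ L_i)  # E(1,i)
    dst_r = in_r ^ aes4(dst_l ^ I)            # E(0,0)
    return dst_l, dst_r                       # caller accumulates X ^= dst_r
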
y = Argument(ptr(uint8_t))
s = Argument(ptr(const_uint8_t))

with Function("aezCorePass2AMD64AESNI", (dst, y, s, j, i, l, k, consts, sz), target=uarch.zen):
    reg_dst = GeneralPurposeRegister64()
    reg_y = GeneralPurposeRegister64()
    reg_s = GeneralPurposeRegister64()
    reg_j = GeneralPurposeRegister64()
    reg_l = GeneralPurposeRegister64()
    reg_tmp = GeneralPurposeRegister64()
    reg_bytes = GeneralPurposeRegister64()
    reg_idx = GeneralPurposeRegister64()
    reg_sp_save = GeneralPurposeRegister64()

    LOAD.ARGUMENT(reg_dst, dst)      # dst pointer
    LOAD.ARGUMENT(reg_y, y)
    LOAD.ARGUMENT(reg_j, j)
    LOAD.ARGUMENT(reg_l, l)
    LOAD.ARGUMENT(reg_bytes, sz)     # bytes remaining
    MOV(reg_idx, 1)                  # Index into e.L[]

    xmm_j = XMMRegister()            # AESENC Round key J
    xmm_i = XMMRegister()            # AESENC Round key I
    xmm_l = XMMRegister()            # AESENC Round key L
    xmm_s = XMMRegister()            # S
    xmm_y = XMMRegister()            # Checksum Y
    xmm_iDbl = XMMRegister()         # e.I[1]
    xmm_zero = XMMRegister()         # [16]byte{0x00}
    xmm_tmp0 = XMMRegister()

    xmm_o0 = XMMRegister()
    xmm_o1 = XMMRegister()
    xmm_o2 = XMMRegister()
    xmm_o3 = XMMRegister()
    xmm_o4 = XMMRegister()
    xmm_o5 = XMMRegister()
    xmm_o6 = XMMRegister()
    xmm_o7 = XMMRegister()

    LOAD.ARGUMENT(reg_tmp, k)
    MOVDQU(xmm_i, [reg_tmp])
    MOVDQU(xmm_j, [reg_tmp+16])
    MOVDQU(xmm_l, [reg_tmp+32])

    MOVDQU(xmm_y, [reg_y])

    LOAD.ARGUMENT(reg_tmp, i)
    MOVDQU(xmm_iDbl, [reg_tmp])

    LOAD.ARGUMENT(reg_tmp, consts)

    PXOR(xmm_zero, xmm_zero)

    LOAD.ARGUMENT(reg_s, s)
    MOVDQU(xmm_s, [reg_s])
    PXOR(xmm_s, [reg_j+16])          # S ^= J[1] (Once per call, in theory)

    # Save the stack pointer, align stack to 32 bytes, and allocate
    # 256 bytes of scratch space.
    MOV(reg_sp_save, registers.rsp)
    AND(registers.rsp, 0xffffffffffffffe0)
    SUB(registers.rsp, 256)

    # Name strategic offsets.
    mem_dst_l0 = [registers.rsp]
    mem_dst_r0 = [registers.rsp+16]
    mem_dst_l1 = [registers.rsp+32]
    mem_dst_r1 = [registers.rsp+48]
    mem_dst_l2 = [registers.rsp+64]
    mem_dst_r2 = [registers.rsp+80]
    mem_dst_l3 = [registers.rsp+96]
    mem_dst_r3 = [registers.rsp+112]
    mem_dst_l4 = [registers.rsp+128]
    mem_dst_r4 = [registers.rsp+144]
    mem_dst_l5 = [registers.rsp+160]
    mem_dst_r5 = [registers.rsp+176]
    mem_dst_l6 = [registers.rsp+192]
    mem_dst_r6 = [registers.rsp+208]
    mem_dst_l7 = [registers.rsp+224]
    mem_dst_r7 = [registers.rsp+240]

    #
    # Process 16 * 16 bytes at a time in a loop.
    #
    vector_loop256 = Loop()
    SUB(reg_bytes, 256)
    JB(vector_loop256.end)
    with vector_loop256:
        # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys)  // E(1,1)
        # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys)  // E(1,2)
        # ...
        # o6 = aes4(J[1] ^ I ^ L[7] ^ S[:], keys)  // E(1,7)
        # o7 = aes4(J[1] ^ I ^ L[0] ^ S[:], keys)  // E(1,0)
        MOVDQA(xmm_o0, xmm_s)
        PXOR(xmm_o0, xmm_iDbl)       # o0 = s ^ I
        MOVDQA(xmm_o1, xmm_o0)       # o1 = o0
        MOVDQA(xmm_o2, xmm_o0)       # o2 = o0
        MOVDQA(xmm_o3, xmm_o0)       # o3 = o0
        MOVDQA(xmm_o4, xmm_o0)       # o4 = o0
        MOVDQA(xmm_o5, xmm_o0)       # o5 = o0
        MOVDQA(xmm_o6, xmm_o0)       # o6 = o0
        MOVDQA(xmm_o7, xmm_o0)       # o7 = o0
        PXOR(xmm_o0, [reg_l+16])     # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])     # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])     # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])     # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])     # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])     # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112])    # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])        # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # TODO: Figure out how the fuck to remove some of these loads/stores.
        xmm_tmp1 = xmm_s             # Use as scratch till the end of loop body.

        # dst_l0 ^= o0, ... dst_l7 ^= o7
        # Y ^= dst_l0 ^ ... ^ dst_l7
        MOVDQU(xmm_tmp0, [reg_dst])
        MOVDQU(xmm_tmp1, [reg_dst+32])
        PXOR(xmm_tmp0, xmm_o0)
        PXOR(xmm_tmp1, xmm_o1)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l0, xmm_tmp0)
        MOVDQA(mem_dst_l1, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+64])
        MOVDQU(xmm_tmp1, [reg_dst+96])
        PXOR(xmm_tmp0, xmm_o2)
        PXOR(xmm_tmp1, xmm_o3)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l2, xmm_tmp0)
        MOVDQA(mem_dst_l3, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+128])
        MOVDQU(xmm_tmp1, [reg_dst+160])
        PXOR(xmm_tmp0, xmm_o4)
        PXOR(xmm_tmp1, xmm_o5)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l4, xmm_tmp0)
        MOVDQA(mem_dst_l5, xmm_tmp1)

        MOVDQU(xmm_tmp0, [reg_dst+192])
        MOVDQU(xmm_tmp1, [reg_dst+224])
        PXOR(xmm_tmp0, xmm_o6)
        PXOR(xmm_tmp1, xmm_o7)
        PXOR(xmm_y, xmm_tmp0)
        PXOR(xmm_y, xmm_tmp1)
        MOVDQA(mem_dst_l6, xmm_tmp0)
        MOVDQA(mem_dst_l7, xmm_tmp1)

        # o0 ^= dst_r0, ... o7 ^= dst_r7
        # dst_r0 = o0, ... dst_r7 = o7
        MOVDQU(xmm_tmp0, [reg_dst+16])
        MOVDQU(xmm_tmp1, [reg_dst+48])
        PXOR(xmm_o0, xmm_tmp0)
        PXOR(xmm_o1, xmm_tmp1)
        MOVDQA(mem_dst_r0, xmm_o0)
        MOVDQA(mem_dst_r1, xmm_o1)

        MOVDQU(xmm_tmp0, [reg_dst+80])
        MOVDQU(xmm_tmp1, [reg_dst+112])
        PXOR(xmm_o2, xmm_tmp0)
        PXOR(xmm_o3, xmm_tmp1)
        MOVDQA(mem_dst_r2, xmm_o2)
        MOVDQA(mem_dst_r3, xmm_o3)

        MOVDQU(xmm_tmp0, [reg_dst+144])
        MOVDQU(xmm_tmp1, [reg_dst+176])
        PXOR(xmm_o4, xmm_tmp0)
        PXOR(xmm_o5, xmm_tmp1)
        MOVDQA(mem_dst_r4, xmm_o4)
        MOVDQA(mem_dst_r5, xmm_o5)

        MOVDQU(xmm_tmp0, [reg_dst+208])
        MOVDQU(xmm_tmp1, [reg_dst+240])
        PXOR(xmm_o6, xmm_tmp0)
        PXOR(xmm_o7, xmm_tmp1)
        MOVDQA(mem_dst_r6, xmm_o6)
        MOVDQA(mem_dst_r7, xmm_o7)

        # o0 = aes4(o0 ^ I[0])  // E(0,0)
        # ...
        # o7 = aes4(o7 ^ I[0])  // E(0,0)
        PXOR(xmm_o0, xmm_i)
        PXOR(xmm_o1, xmm_i)
        PXOR(xmm_o2, xmm_i)
        PXOR(xmm_o3, xmm_i)
        PXOR(xmm_o4, xmm_i)
        PXOR(xmm_o5, xmm_i)
        PXOR(xmm_o6, xmm_i)
        PXOR(xmm_o7, xmm_i)
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # o0 ^= dst_l0, ... o7 ^= dst_l7
        # dst_l0 = o0, ... dst_l7 = o7
        #
        # nb: Stored into the right hand blocks of dst[], because we are
        # done with the left hand side.
        PXOR(xmm_o0, mem_dst_l0)
        PXOR(xmm_o1, mem_dst_l1)
        PXOR(xmm_o2, mem_dst_l2)
        PXOR(xmm_o3, mem_dst_l3)
        PXOR(xmm_o4, mem_dst_l4)
        PXOR(xmm_o5, mem_dst_l5)
        PXOR(xmm_o6, mem_dst_l6)
        PXOR(xmm_o7, mem_dst_l7)
        MOVDQU([reg_dst+16], xmm_o0)
        MOVDQU([reg_dst+48], xmm_o1)
        MOVDQU([reg_dst+80], xmm_o2)
        MOVDQU([reg_dst+112], xmm_o3)
        MOVDQU([reg_dst+144], xmm_o4)
        MOVDQU([reg_dst+176], xmm_o5)
        MOVDQU([reg_dst+208], xmm_o6)
        MOVDQU([reg_dst+240], xmm_o7)

        # o0 = aes4(o0 ^ J[0] ^ I ^ L[1])  // E(1,1)
        # o1 = aes4(o1 ^ J[0] ^ I ^ L[2])  // E(1,2)
        # ...
        # o6 = aes4(o6 ^ J[0] ^ I ^ L[7])  // E(1,7)
        # o7 = aes4(o7 ^ J[0] ^ I ^ L[0])  // E(1,0)
        MOVDQA(xmm_tmp0, [reg_j])
        PXOR(xmm_tmp0, xmm_iDbl)     # tmp = J[0] ^ I
        PXOR(xmm_o0, xmm_tmp0)       # o0 ^= tmp
        PXOR(xmm_o1, xmm_tmp0)       # o1 ^= tmp
        PXOR(xmm_o2, xmm_tmp0)       # o2 ^= tmp
        PXOR(xmm_o3, xmm_tmp0)       # o3 ^= tmp
        PXOR(xmm_o4, xmm_tmp0)       # o4 ^= tmp
        PXOR(xmm_o5, xmm_tmp0)       # o5 ^= tmp
        PXOR(xmm_o6, xmm_tmp0)       # o6 ^= tmp
        PXOR(xmm_o7, xmm_tmp0)       # o7 ^= tmp
        PXOR(xmm_o0, [reg_l+16])     # o0 ^= L[1]
        PXOR(xmm_o1, [reg_l+32])     # o1 ^= L[2]
        PXOR(xmm_o2, [reg_l+48])     # o2 ^= L[3]
        PXOR(xmm_o3, [reg_l+64])     # o3 ^= L[4]
        PXOR(xmm_o4, [reg_l+80])     # o4 ^= L[5]
        PXOR(xmm_o5, [reg_l+96])     # o5 ^= L[6]
        PXOR(xmm_o6, [reg_l+112])    # o6 ^= L[7]
        PXOR(xmm_o7, [reg_l])        # o7 ^= L[0]
        aesenc4x8(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_o4, xmm_o5, xmm_o6, xmm_o7, xmm_j, xmm_i, xmm_l, xmm_zero)

        # dst_r0 ^= o0, ... dst_r7 ^= o7
        # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l7, dst_r7 = dst_r7, dst_l7
        #
        # nb: dst_l0 ... dst_l7 already written after the previous aesenc4x8
        # call.
        PXOR(xmm_o0, mem_dst_r0)
        PXOR(xmm_o1, mem_dst_r1)
        PXOR(xmm_o2, mem_dst_r2)
        PXOR(xmm_o3, mem_dst_r3)
        PXOR(xmm_o4, mem_dst_r4)
        PXOR(xmm_o5, mem_dst_r5)
        PXOR(xmm_o6, mem_dst_r6)
        PXOR(xmm_o7, mem_dst_r7)
        MOVDQU([reg_dst], xmm_o0)
        MOVDQU([reg_dst+32], xmm_o1)
        MOVDQU([reg_dst+64], xmm_o2)
        MOVDQU([reg_dst+96], xmm_o3)
        MOVDQU([reg_dst+128], xmm_o4)
        MOVDQU([reg_dst+160], xmm_o5)
        MOVDQU([reg_dst+192], xmm_o6)
        MOVDQU([reg_dst+224], xmm_o7)

        # doubleBlock(I)
        doubleBlock(xmm_iDbl, xmm_tmp0, xmm_tmp1, reg_tmp)

        MOVDQU(xmm_s, [reg_s])
        PXOR(xmm_s, [reg_j+16])      # Re-derive since it was used as scratch space.

        # Update book keeping.
        ADD(reg_dst, 256)
        SUB(reg_bytes, 256)
        JAE(vector_loop256.begin)

    # Purge the scratch space that we are done with.
    MOVDQA(mem_dst_r0, xmm_zero)
    MOVDQA(mem_dst_r1, xmm_zero)
    MOVDQA(mem_dst_r2, xmm_zero)
    MOVDQA(mem_dst_r3, xmm_zero)
    MOVDQA(mem_dst_l4, xmm_zero)
    MOVDQA(mem_dst_r4, xmm_zero)
    MOVDQA(mem_dst_l5, xmm_zero)
    MOVDQA(mem_dst_r5, xmm_zero)
    MOVDQA(mem_dst_l6, xmm_zero)
    MOVDQA(mem_dst_r6, xmm_zero)
    MOVDQA(mem_dst_l7, xmm_zero)
    MOVDQA(mem_dst_r7, xmm_zero)
    ADD(reg_bytes, 256)
    process_64bytes = Label()
    SUB(reg_bytes, 128)
    JB(process_64bytes)

    # Can I haz registers?
    xmm_dst_l0 = xmm_o4
    xmm_dst_r0 = xmm_o5
    xmm_dst_l1 = xmm_o6
    xmm_dst_r1 = xmm_o7

    #
    # Process 8 * 16 bytes.
    #

    # o0 = aes4(J[1] ^ I ^ L[1] ^ S[:], keys)  // E(1,1)
    # o1 = aes4(J[1] ^ I ^ L[2] ^ S[:], keys)  // E(1,2)
    # o2 = aes4(J[1] ^ I ^ L[3] ^ S[:], keys)  // E(1,3)
    # o3 = aes4(J[1] ^ I ^ L[4] ^ S[:], keys)  // E(1,4)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)           # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)           # o1 = o0
    MOVDQA(xmm_o2, xmm_o0)           # o2 = o0
    MOVDQA(xmm_o3, xmm_o0)           # o3 = o0
    PXOR(xmm_o0, [reg_l+16])         # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32])         # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48])         # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64])         # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load the left halves of the dsts into registers.
    xmm_dst_l2 = xmm_dst_r0
    xmm_dst_l3 = xmm_dst_r1
    MOVDQU(xmm_dst_l0, [reg_dst])        # dst_l0 = dst[:]
    MOVDQU(xmm_dst_l1, [reg_dst+32])     # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_l2, [reg_dst+64])     # dst_l2 = dst[64:]
    MOVDQU(xmm_dst_l3, [reg_dst+96])     # dst_l3 = dst[96:]

    # dst_l0 ^= o0, ... dst_l3 ^= o3
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)
    PXOR(xmm_dst_l2, xmm_o2)
    PXOR(xmm_dst_l3, xmm_o3)

    # Y ^= dst_l0 ^ ... ^ dst_l3
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)
    PXOR(xmm_y, xmm_dst_l2)
    PXOR(xmm_y, xmm_dst_l3)

    # Store the altered left halves.
    MOVDQA(mem_dst_l0, xmm_dst_l0)
    MOVDQA(mem_dst_l1, xmm_dst_l1)
    MOVDQA(mem_dst_l2, xmm_dst_l2)
    MOVDQA(mem_dst_l3, xmm_dst_l3)

    # Load the right halves of dst into registers.
    xmm_dst_r2 = xmm_dst_l0
    xmm_dst_r3 = xmm_dst_l1
    MOVDQU(xmm_dst_r0, [reg_dst+16])     # dst_r0 = dst[ 16:]
    MOVDQU(xmm_dst_r1, [reg_dst+48])     # dst_r1 = dst[ 48:]
    MOVDQU(xmm_dst_r2, [reg_dst+80])     # dst_r2 = dst[ 80:]
    MOVDQU(xmm_dst_r3, [reg_dst+112])    # dst_r3 = dst[112:]

    # o0 ^= dst_r0, ... o3 ^= dst_r3
    # dst_r0 = o0, ... dst_r3 = o3
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)
    MOVDQA(xmm_dst_r2, xmm_o2)
    MOVDQA(xmm_dst_r3, xmm_o3)

    # o0 = aes4(o0 ^ I[0])  // E(0,0)
    # ...
    # o3 = aes4(o3 ^ I[0])  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    PXOR(xmm_o2, xmm_i)
    PXOR(xmm_o3, xmm_i)
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, ... o3 ^= dst_l3
    # dst_l0 = o0, ... dst_l3 = o3
    #
    # nb: Stored into the right hand blocks of dst[], because we are
    # done with the left hand side.
    PXOR(xmm_o0, mem_dst_l0)
    PXOR(xmm_o1, mem_dst_l1)
    PXOR(xmm_o2, mem_dst_l2)
    PXOR(xmm_o3, mem_dst_l3)
    MOVDQU([reg_dst+16], xmm_o0)
    MOVDQU([reg_dst+48], xmm_o1)
    MOVDQU([reg_dst+80], xmm_o2)
    MOVDQU([reg_dst+112], xmm_o3)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[1])  // E(1,1)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[2])  // E(1,2)
    # o2 = aes4(o2 ^ J[0] ^ I ^ L[3])  // E(1,3)
    # o3 = aes4(o3 ^ J[0] ^ I ^ L[4])  // E(1,4)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o2, [reg_j])
    PXOR(xmm_o3, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o2, xmm_iDbl)
    PXOR(xmm_o3, xmm_iDbl)
    PXOR(xmm_o0, [reg_l+16])         # o0 ^= L[1]
    PXOR(xmm_o1, [reg_l+32])         # o1 ^= L[2]
    PXOR(xmm_o2, [reg_l+48])         # o2 ^= L[3]
    PXOR(xmm_o3, [reg_l+64])         # o3 ^= L[4]
    aesenc4x4(xmm_o0, xmm_o1, xmm_o2, xmm_o3, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0, ... dst_r3 ^= o3
    # dst_l0, dst_r0 = dst_r0, dst_l0 ... dst_l3, dst_r3 = dst_r3, dst_l3
    #
    # nb: dst_l0 ... dst_l3 already written after the previous aesenc4x4
    # call.
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    PXOR(xmm_o2, xmm_dst_r2)
    PXOR(xmm_o3, xmm_dst_r3)
    MOVDQU([reg_dst], xmm_o0)
    MOVDQU([reg_dst+32], xmm_o1)
    MOVDQU([reg_dst+64], xmm_o2)
    MOVDQU([reg_dst+96], xmm_o3)

    # Update book keeping.
    ADD(reg_dst, 128)
    ADD(reg_idx, 4)
    SUB(reg_bytes, 128)

    LABEL(process_64bytes)
    ADD(reg_bytes, 128)
    process_32bytes = Label()
    SUB(reg_bytes, 64)
    JB(process_32bytes)

    #
    # Process 4 * 16 bytes.
    #
    # (Scratch space unused past this point, working set fits into registers.)
    #

    reg_l_offset = reg_tmp
    MOV(reg_l_offset, reg_idx)
    SHL(reg_l_offset, 4)
    ADD(reg_l_offset, reg_l)         # reg_l_offset = reg_l + reg_idx*16 (L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[(i+0)%8] ^ S[:], keys)  // E(1,i)
    # o1 = aes4(J[1] ^ I ^ L[(i+1)%8] ^ S[:], keys)  // E(1,i+1)
    MOVDQA(xmm_o0, xmm_s)
    PXOR(xmm_o0, xmm_iDbl)           # o0 = s ^ I
    MOVDQA(xmm_o1, xmm_o0)           # o1 = o0
    PXOR(xmm_o0, [reg_l_offset])     # o0 ^= L[i]
    PXOR(xmm_o1, [reg_l_offset+16])  # o1 ^= L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])        # dst_l0 = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16])     # dst_r0 = dst[16:]
    MOVDQU(xmm_dst_l1, [reg_dst+32])     # dst_l1 = dst[32:]
    MOVDQU(xmm_dst_r1, [reg_dst+48])     # dst_r1 = dst[48:]

    # dst_l0 ^= o0, dst_l1 ^= o1
    PXOR(xmm_dst_l0, xmm_o0)
    PXOR(xmm_dst_l1, xmm_o1)

    # Y ^= dst_l0 ^ dst_l1
    PXOR(xmm_y, xmm_dst_l0)
    PXOR(xmm_y, xmm_dst_l1)

    # o0 ^= dst_r0, o1 ^= dst_r1
    # dst_r0 = o0, dst_r1 = o1
    PXOR(xmm_o0, xmm_dst_r0)
    PXOR(xmm_o1, xmm_dst_r1)
    MOVDQA(xmm_dst_r0, xmm_o0)
    MOVDQA(xmm_dst_r1, xmm_o1)

    # o0 = aes4(o0 ^ I[0])  // E(0,0)
    # o1 = aes4(o1 ^ I[0])  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    PXOR(xmm_o1, xmm_i)
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l0, o1 ^= dst_l1
    # dst_l0 = o0, dst_l1 = o1
    PXOR(xmm_o0, xmm_dst_l0)
    PXOR(xmm_o1, xmm_dst_l1)
    MOVDQA(xmm_dst_l0, xmm_o0)
    MOVDQA(xmm_dst_l1, xmm_o1)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[(i+0)%8])  // E(1,i)
    # o1 = aes4(o1 ^ J[0] ^ I ^ L[(i+1)%8])  // E(1,i+1)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o1, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o1, xmm_iDbl)
    PXOR(xmm_o0, [reg_tmp])          # L[i]
    PXOR(xmm_o1, [reg_tmp+16])       # L[i+1]
    aesenc4x2(xmm_o0, xmm_o1, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r0 ^= o0
    # dst_r1 ^= o1
    PXOR(xmm_dst_r0, xmm_o0)
    PXOR(xmm_dst_r1, xmm_o1)

    # dst_l0, dst_r0 = dst_r0, dst_l0 .. dst_l1, dst_r1 = dst_r1, dst_l1
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)
    MOVDQU([reg_dst+32], xmm_dst_r1)
    MOVDQU([reg_dst+48], xmm_dst_l1)

    # Update book keeping.
    ADD(reg_dst, 64)
    ADD(reg_idx, 2)
    SUB(reg_bytes, 64)

    LABEL(process_32bytes)
    ADD(reg_bytes, 64)
    out = Label()
    SUB(reg_bytes, 32)
    JB(out)

    #
    # Process 2 * 16 bytes
    #

    # Pick the final L from the table. This is the only time
    # where wrapping needs to happen based on the index.
    AND(reg_idx, 7)
    SHL(reg_idx, 4)
    ADD(reg_l, reg_idx)              # reg_l += reg_idx (&L[i%8])

    # o0 = aes4(J[1] ^ I ^ L[i%8] ^ S[:], keys)  // E(1,i)
    MOVDQA(xmm_o0, xmm_s)            # o0 = s
    PXOR(xmm_o0, xmm_iDbl)           # o0 ^= I
    PXOR(xmm_o0, [reg_l])            # o0 ^= L[i%8]
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # Load dst into registers.
    MOVDQU(xmm_dst_l0, [reg_dst])        # dst_l = dst[:]
    MOVDQU(xmm_dst_r0, [reg_dst+16])     # dst_r = dst[16:]

    # dst_l ^= o0
    PXOR(xmm_dst_l0, xmm_o0)

    # Y ^= dst_l
    PXOR(xmm_y, xmm_dst_l0)

    # o0 ^= dst_r (o0 becomes the new dst_r)
    PXOR(xmm_o0, xmm_dst_r0)
    MOVDQA(xmm_dst_r0, xmm_o0)

    # o0 = aes4(o0 ^ I[0])  // E(0,0)
    PXOR(xmm_o0, xmm_i)
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # o0 ^= dst_l (o0 becomes the new dst_l)
    PXOR(xmm_o0, xmm_dst_l0)
    MOVDQA(xmm_dst_l0, xmm_o0)

    # o0 = aes4(o0 ^ J[0] ^ I ^ L[i%8])  // E(1,i)
    PXOR(xmm_o0, [reg_j])
    PXOR(xmm_o0, xmm_iDbl)
    PXOR(xmm_o0, [reg_l])
    aesenc4x1(xmm_o0, xmm_j, xmm_i, xmm_l, xmm_zero)

    # dst_r ^= o0
    PXOR(xmm_dst_r0, xmm_o0)

    # dst_l, dst_r = dst_r, dst_l
    MOVDQU([reg_dst], xmm_dst_r0)
    MOVDQU([reg_dst+16], xmm_dst_l0)

    LABEL(out)

    # Write back Y.
    MOVDQU([reg_y], xmm_y)

    # Paranoia, cleanse the scratch space. Most of it is purged
    # at the end of the 16x16 loop, but the 8x16 case uses these 4.
    MOVDQA(mem_dst_l0, xmm_zero)
    MOVDQA(mem_dst_l1, xmm_zero)
    MOVDQA(mem_dst_l2, xmm_zero)
    MOVDQA(mem_dst_l3, xmm_zero)

    # Restore the stack pointer.
    MOV(registers.rsp, reg_sp_save)

    RETURN()
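
# Per (dst_l, dst_r) 32-byte column, the pass-2 function above computes the
# following (a reference sketch reconstructed from the in-line comments;
# `aes4(x)` stands in for the 4-round primitive, I2 for the running doubled
# e.I[1], and L_i for L[i%8]):
def _ref_pass2_pair(dst_l, dst_r, S, J0, J1, I2, L_i, I, aes4):
    o = aes4(S ^ J1 ^ I2 ^ L_i)              # E(1,i) applied to S
    dst_l ^= o                               # Y ^= dst_l at this point
    dst_r ^= o
    dst_l ^= aes4(dst_r ^ I)                 # E(0,0)
    dst_r ^= aes4(dst_l ^ J0 ^ I2 ^ L_i)     # E(1,i)
    return dst_r, dst_l                      # halves are swapped on output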