github.com/verovm/record-replay@v1.9.7/crypto/blake2b/blake2bAVX2_amd64.s

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403
DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32

DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302
DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32

DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16

DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16

#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39
#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93
#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e
#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93
#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39

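// ROUND_AVX2 performs one full BLAKE2b round on the state matrix held in
// Y0..Y3 (one row of four 64-bit words per register). m0..m3 supply the
// message words already permuted for this round, t is a scratch register,
// and c40/c48 are the byte-shuffle masks loaded from ·AVX2_c40/·AVX2_c48.
// The G-function rotations are implemented as: rotate-32 via VPSHUFD $-79,
// rotate-24 via VPSHUFB c40, rotate-16 via VPSHUFB c48, and rotate-63 via
// add-to-self, shift and XOR. The byte-encoded VPERMQ macros above rotate
// the rows so the diagonal step can reuse the column code, then restore them.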
#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \
	VPADDQ m0, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m1, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y1_Y1; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y3_Y3; \
	VPADDQ m2, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFD $-79, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPSHUFB c40, Y1, Y1; \
	VPADDQ m3, Y0, Y0; \
	VPADDQ Y1, Y0, Y0; \
	VPXOR Y0, Y3, Y3; \
	VPSHUFB c48, Y3, Y3; \
	VPADDQ Y3, Y2, Y2; \
	VPXOR Y2, Y1, Y1; \
	VPADDQ Y1, Y1, t; \
	VPSRLQ $63, Y1, Y1; \
	VPXOR t, Y1, Y1; \
	VPERMQ_0x39_Y3_Y3; \
	VPERMQ_0x4E_Y2_Y2; \
	VPERMQ_0x93_Y1_Y1

#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E
#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26
#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E
#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36
#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E

#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n
#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n
#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n
#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n
#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n

#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01
#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01
#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01
#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01
#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01

#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01
#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01

#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8
#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01

// load msg: Y12 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y12, Y12

// load msg: Y13 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \
	VMOVQ_SI_X13(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X13(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y13, Y13

// load msg: Y14 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \
	VMOVQ_SI_X14(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X14(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y14, Y14

// load msg: Y15 = (i0, i1, i2, i3)
// i0, i1, i2, i3 must not be 0
#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \
	VMOVQ_SI_X15(i0*8); \
	VMOVQ_SI_X11(i2*8); \
	VPINSRQ_1_SI_X15(i1*8); \
	VPINSRQ_1_SI_X11(i3*8); \
	VINSERTI128 $1, X11, Y15, Y15

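// Each LOAD_MSG_AVX2_<i0_..._i15>() macro below gathers all sixteen message
// words from the block at SI into Y12..Y15 in the order spelled out by its
// name (one round of the BLAKE2b message schedule). The generic Y12..Y15
// loaders above are used where possible; word index 0 needs the dedicated
// *_0 byte-encoded forms, adjacent word pairs are fetched with a single
// VMOVDQU, and reversed pairs with VPSHUFD $0x4E.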
#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X11(6*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \
	LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \
	LOAD_MSG_AVX2_Y15(9, 11, 13, 15)

#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \
	LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \
	LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \
	VMOVQ_SI_X11(11*8); \
	VPSHUFD $0x4E, 0*8(SI), X14; \
	VPINSRQ_1_SI_X11(5*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(12, 2, 7, 3)

#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \
	VMOVQ_SI_X11(5*8); \
	VMOVDQU 11*8(SI), X12; \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	VMOVQ_SI_X13(8*8); \
	VMOVQ_SI_X11(2*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X11(13*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \
	LOAD_MSG_AVX2_Y15(14, 6, 1, 4)

#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \
	LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \
	LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \
	LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \
	VMOVQ_SI_X15(6*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X15(10*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \
	LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X11(4*8); \
	VPINSRQ_1_SI_X13(7*8); \
	VPINSRQ_1_SI_X11(15*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \
	LOAD_MSG_AVX2_Y15(1, 12, 8, 13)

#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X11_0; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X11(8*8); \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \
	LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \
	LOAD_MSG_AVX2_Y15(13, 5, 14, 9)

#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \
	LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \
	LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \
	VMOVQ_SI_X14_0; \
	VPSHUFD $0x4E, 8*8(SI), X11; \
	VPINSRQ_1_SI_X14(6*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	LOAD_MSG_AVX2_Y15(7, 3, 2, 11)

#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \
	LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \
	LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \
	LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \
	VMOVQ_SI_X15_0; \
	VMOVQ_SI_X11(6*8); \
	VPINSRQ_1_SI_X15(4*8); \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \
	VMOVQ_SI_X12(6*8); \
	VMOVQ_SI_X11(11*8); \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y12, Y12; \
	LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \
	VMOVQ_SI_X11(1*8); \
	VMOVDQU 12*8(SI), X14; \
	VPINSRQ_1_SI_X11(10*8); \
	VINSERTI128 $1, X11, Y14, Y14; \
	VMOVQ_SI_X15(2*8); \
	VMOVDQU 4*8(SI), X11; \
	VPINSRQ_1_SI_X15(7*8); \
	VINSERTI128 $1, X11, Y15, Y15

#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \
	LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \
	VMOVQ_SI_X13(2*8); \
	VPSHUFD $0x4E, 5*8(SI), X11; \
	VPINSRQ_1_SI_X13(4*8); \
	VINSERTI128 $1, X11, Y13, Y13; \
	LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \
	VMOVQ_SI_X15(11*8); \
	VMOVQ_SI_X11(12*8); \
	VPINSRQ_1_SI_X15(14*8); \
	VPINSRQ_1_SI_X11_0; \
	VINSERTI128 $1, X11, Y15, Y15

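// fAVX2 computes the BLAKE2b compression function F over one 128-byte block
// using AVX2. The state matrix lives in Y0..Y3: Y0/Y1 start as the hash words
// from h (also kept in Y8/Y9 for the final feed-forward), Y2 as ·AVX2_iv0,
// and Y3 as ·AVX2_iv1 XORed with the counter (c0, c1) and flag words staged
// at 0(SP) on a 32-byte realigned stack (the original SP is saved in DX).
// Y4/Y5 hold the rotate-24/rotate-16 shuffle masks and Y10 is scratch.
// BX carries the caller-supplied round count: it is decremented before every
// round and the loop exits to done once it underflows, with the ten message
// schedules repeating cyclically. done folds the working state back into the
// original hash words (h ^= v[0..7] ^ v[8..15]) and stores the result.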
// func fAVX2(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)
TEXT ·fAVX2(SB), 4, $64-48 // frame size = 32 + 32 byte alignment
	MOVQ h+0(FP), AX
	MOVQ m+8(FP), SI
	MOVQ c0+16(FP), R8
	MOVQ c1+24(FP), R9
	MOVQ flag+32(FP), CX
	MOVQ rounds+40(FP), BX

	MOVQ SP, DX
	MOVQ SP, R10
	ADDQ $31, R10
	ANDQ $~31, R10
	MOVQ R10, SP

	MOVQ CX, 16(SP)
	XORQ CX, CX
	MOVQ CX, 24(SP)

	VMOVDQU ·AVX2_c40<>(SB), Y4
	VMOVDQU ·AVX2_c48<>(SB), Y5

	VMOVDQU 0(AX), Y8
	VMOVDQU 32(AX), Y9
	VMOVDQU ·AVX2_iv0<>(SB), Y6
	VMOVDQU ·AVX2_iv1<>(SB), Y7

	MOVQ R8, 0(SP)
	MOVQ R9, 8(SP)

	VMOVDQA Y8, Y0
	VMOVDQA Y9, Y1
	VMOVDQA Y6, Y2
	VPXOR 0(SP), Y7, Y3

loop:
	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0()
	ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5)

	JMP loop

done:
	VPXOR Y0, Y8, Y8
	VPXOR Y1, Y9, Y9
	VPXOR Y2, Y8, Y8
	VPXOR Y3, Y9, Y9

	VMOVDQU Y8, 0(AX)
	VMOVDQU Y9, 32(AX)
	VZEROUPPER

	MOVQ DX, SP
	RET

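// The remainder of the file is the 128-bit AVX variant used when AVX2 is not
// available. Here the sixteen state words are spread across X0..X7 (two words
// per register): HALF_ROUND_AVX applies the complete G function to all four
// columns (or diagonals) in parallel, and SHUFFLE_AVX / SHUFFLE_AVX_INV
// re-arrange the registers between the two halves of a round so the diagonal
// step can reuse the same code. As with the AVX2 half, forms the assembler
// presumably could not encode directly are emitted as raw BYTE sequences.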
#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA
#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB
#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF
#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD
#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE

#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF
#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7
#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7
#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF
#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF

#define SHUFFLE_AVX() \
	VMOVDQA X6, X13; \
	VMOVDQA X2, X14; \
	VMOVDQA X4, X6; \
	VPUNPCKLQDQ_X13_X13_X15; \
	VMOVDQA X5, X4; \
	VMOVDQA X6, X5; \
	VPUNPCKHQDQ_X15_X7_X6; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X13_X7; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VPUNPCKHQDQ_X15_X2_X2; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X3_X3; \

#define SHUFFLE_AVX_INV() \
	VMOVDQA X2, X13; \
	VMOVDQA X4, X14; \
	VPUNPCKLQDQ_X2_X2_X15; \
	VMOVDQA X5, X4; \
	VPUNPCKHQDQ_X15_X3_X2; \
	VMOVDQA X14, X5; \
	VPUNPCKLQDQ_X3_X3_X15; \
	VMOVDQA X6, X14; \
	VPUNPCKHQDQ_X15_X13_X3; \
	VPUNPCKLQDQ_X7_X7_X15; \
	VPUNPCKHQDQ_X15_X6_X6; \
	VPUNPCKLQDQ_X14_X14_X15; \
	VPUNPCKHQDQ_X15_X7_X7; \

#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	VPADDQ m0, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m1, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFD $-79, v6, v6; \
	VPSHUFD $-79, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPSHUFB c40, v2, v2; \
	VPSHUFB c40, v3, v3; \
	VPADDQ m2, v0, v0; \
	VPADDQ v2, v0, v0; \
	VPADDQ m3, v1, v1; \
	VPADDQ v3, v1, v1; \
	VPXOR v0, v6, v6; \
	VPXOR v1, v7, v7; \
	VPSHUFB c48, v6, v6; \
	VPSHUFB c48, v7, v7; \
	VPADDQ v6, v4, v4; \
	VPADDQ v7, v5, v5; \
	VPXOR v4, v2, v2; \
	VPXOR v5, v3, v3; \
	VPADDQ v2, v2, t0; \
	VPSRLQ $63, v2, v2; \
	VPXOR t0, v2, v2; \
	VPADDQ v3, v3, t0; \
	VPSRLQ $63, v3, v3; \
	VPXOR t0, v3, v3

// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7)
// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0
#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \
	VMOVQ_SI_X12(i0*8); \
	VMOVQ_SI_X13(i2*8); \
	VMOVQ_SI_X14(i4*8); \
	VMOVQ_SI_X15(i6*8); \
	VPINSRQ_1_SI_X12(i1*8); \
	VPINSRQ_1_SI_X13(i3*8); \
	VPINSRQ_1_SI_X14(i5*8); \
	VPINSRQ_1_SI_X15(i7*8)

// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7)
#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \
	VMOVQ_SI_X12_0; \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(1*8); \
	VMOVQ_SI_X15(5*8); \
	VPINSRQ_1_SI_X12(2*8); \
	VPINSRQ_1_SI_X13(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(7*8)

// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3)
#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \
	VPSHUFD $0x4E, 0*8(SI), X12; \
	VMOVQ_SI_X13(11*8); \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(7*8); \
	VPINSRQ_1_SI_X13(5*8); \
	VPINSRQ_1_SI_X14(2*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13)
#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \
	VMOVDQU 11*8(SI), X12; \
	VMOVQ_SI_X13(5*8); \
	VMOVQ_SI_X14(8*8); \
	VMOVQ_SI_X15(2*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14_0; \
	VPINSRQ_1_SI_X15(13*8)

// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8)
#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13(4*8); \
	VMOVQ_SI_X14(6*8); \
	VMOVQ_SI_X15_0; \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(15*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15)
#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \
	VMOVQ_SI_X12(9*8); \
	VMOVQ_SI_X13(2*8); \
	VMOVQ_SI_X14_0; \
	VMOVQ_SI_X15(4*8); \
	VPINSRQ_1_SI_X12(5*8); \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VPINSRQ_1_SI_X15(15*8)

// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3)
#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \
	VMOVQ_SI_X12(2*8); \
	VMOVQ_SI_X13_0; \
	VMOVQ_SI_X14(12*8); \
	VMOVQ_SI_X15(11*8); \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X13(8*8); \
	VPINSRQ_1_SI_X14(10*8); \
	VPINSRQ_1_SI_X15(3*8)

// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11)
#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \
	MOVQ 0*8(SI), X12; \
	VPSHUFD $0x4E, 8*8(SI), X13; \
	MOVQ 7*8(SI), X14; \
	MOVQ 2*8(SI), X15; \
	VPINSRQ_1_SI_X12(6*8); \
	VPINSRQ_1_SI_X14(3*8); \
	VPINSRQ_1_SI_X15(11*8)

// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8)
#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \
	MOVQ 6*8(SI), X12; \
	MOVQ 11*8(SI), X13; \
	MOVQ 15*8(SI), X14; \
	MOVQ 3*8(SI), X15; \
	VPINSRQ_1_SI_X12(14*8); \
	VPINSRQ_1_SI_X13_0; \
	VPINSRQ_1_SI_X14(9*8); \
	VPINSRQ_1_SI_X15(8*8)

// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10)
#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \
	MOVQ 5*8(SI), X12; \
	MOVQ 8*8(SI), X13; \
	MOVQ 0*8(SI), X14; \
	MOVQ 6*8(SI), X15; \
	VPINSRQ_1_SI_X12(15*8); \
	VPINSRQ_1_SI_X13(2*8); \
	VPINSRQ_1_SI_X14(4*8); \
	VPINSRQ_1_SI_X15(10*8)

// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5)
#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \
	VMOVDQU 12*8(SI), X12; \
	MOVQ 1*8(SI), X13; \
	MOVQ 2*8(SI), X14; \
	VPINSRQ_1_SI_X13(10*8); \
	VPINSRQ_1_SI_X14(7*8); \
	VMOVDQU 4*8(SI), X15

// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0)
#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \
	MOVQ 15*8(SI), X12; \
	MOVQ 3*8(SI), X13; \
	MOVQ 11*8(SI), X14; \
	MOVQ 12*8(SI), X15; \
	VPINSRQ_1_SI_X12(9*8); \
	VPINSRQ_1_SI_X13(13*8); \
	VPINSRQ_1_SI_X14(14*8); \
	VPINSRQ_1_SI_X15_0

// func fAVX(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)
TEXT ·fAVX(SB), 4, $24-48 // frame size = 8 + 16 byte alignment
	MOVQ h+0(FP), AX
	MOVQ m+8(FP), SI
	MOVQ c0+16(FP), R8
	MOVQ c1+24(FP), R9
	MOVQ flag+32(FP), CX
	MOVQ rounds+40(FP), BX

	MOVQ SP, BP
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10
	MOVQ R10, SP

	VMOVDQU ·AVX_c40<>(SB), X0
	VMOVDQU ·AVX_c48<>(SB), X1
	VMOVDQA X0, X8
	VMOVDQA X1, X9

	VMOVDQU ·AVX_iv3<>(SB), X0
	VMOVDQA X0, 0(SP)
	XORQ CX, 0(SP) // 0(SP) = ·AVX_iv3 ^ (CX || 0)

	VMOVDQU 0(AX), X10
	VMOVDQU 16(AX), X11
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	VMOVQ_R8_X15
	VPINSRQ_1_R9_X15

	VMOVDQA X10, X0
	VMOVDQA X11, X1
	VMOVDQU ·AVX_iv0<>(SB), X4
	VMOVDQU ·AVX_iv1<>(SB), X5
	VMOVDQU ·AVX_iv2<>(SB), X6

	VPXOR X15, X6, X6
	VMOVDQA 0(SP), X7

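// As in fAVX2, BX counts down the rounds requested by the caller: each
// iteration below executes one full round (two HALF_ROUND_AVX calls with a
// shuffle in between), and the ten message schedules repeat until the
// counter underflows.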
loop:
	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX_0_2_4_6_1_3_5_7()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_1_0_11_5_12_2_7_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX_11_12_5_15_8_0_2_13()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_2_5_4_15_6_10_0_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX_9_5_2_10_0_7_4_15()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX_2_6_0_8_12_10_11_3()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_0_6_9_8_7_3_2_11()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_5_15_8_2_0_4_6_10()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX_6_14_11_0_15_9_3_8()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_12_13_1_10_2_7_4_5()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	SUBQ $1, BX; JCS done
	LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX()
	LOAD_MSG_AVX_15_9_3_13_11_14_12_0()
	HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9)
	SHUFFLE_AVX_INV()

	JMP loop

done:
	VMOVDQU 32(AX), X14
	VMOVDQU 48(AX), X15
	VPXOR X0, X10, X10
	VPXOR X1, X11, X11
	VPXOR X2, X14, X14
	VPXOR X3, X15, X15
	VPXOR X4, X10, X10
	VPXOR X5, X11, X11
	VPXOR X6, X14, X2
	VPXOR X7, X15, X3
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	VMOVDQU X10, 0(AX)
	VMOVDQU X11, 16(AX)
	VZEROUPPER

	MOVQ BP, SP
	RET
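// The Go side is expected to reach these routines through stub declarations
// along the following lines (a sketch of the companion _amd64.go file; the
// exact file name, build tags and //go:noescape pragmas are assumptions):
//
//	//go:noescape
//	func fAVX2(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)
//
//	//go:noescape
//	func fAVX(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)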