// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Origin: github.com/bcskill/bcschain/v3@v3.4.9-beta2/crypto/blake2b/blake2b_amd64.s

// +build amd64,!gccgo,!appengine

#include "textflag.h"

// iv0..iv3 hold eight 64-bit constants, two per 16-byte symbol. They seed
// the lower half of the working state in fSSE4 (X4..X7).
DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908
DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b
GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16

DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b
DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1
GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16

DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1
DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f
GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16

DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b
DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179
GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16

// c40 is a PSHUFB mask: within each 64-bit lane, result byte i is taken from
// source byte (i+3) mod 8, i.e. every lane is rotated left by 40 bits.
DATA ·c40<>+0x00(SB)/8, $0x0201000706050403
DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b
GLOBL ·c40<>(SB), (NOPTR+RODATA), $16

// c48 is a PSHUFB mask: within each 64-bit lane, result byte i is taken from
// source byte (i+2) mod 8, i.e. every lane is rotated left by 48 bits.
DATA ·c48<>+0x00(SB)/8, $0x0100070605040302
DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a
GLOBL ·c48<>(SB), (NOPTR+RODATA), $16

// SHUFFLE rearranges the 64-bit lanes of three register pairs between the
// first and second half of a round. Writing each pair as four lanes:
//   (v2,v3): [b0 b1 b2 b3] -> [b1 b2 b3 b0]
//   (v4,v5): the two registers are swapped
//   (v6,v7): [d0 d1 d2 d3] -> [d3 d0 d1 d2]
// t1 and t2 are scratch; only the high lane of t2 is ever consumed.
#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v6, t1; \
	PUNPCKLQDQ v6, t2; \
	PUNPCKHQDQ v7, v6; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ v7, t2; \
	MOVO       t1, v7; \
	MOVO       v2, t1; \
	PUNPCKHQDQ t2, v7; \
	PUNPCKLQDQ v3, t2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v3

// SHUFFLE_INV undoes SHUFFLE:
//   (v2,v3): [b0 b1 b2 b3] -> [b3 b0 b1 b2]
//   (v4,v5): the two registers are swapped
//   (v6,v7): [d0 d1 d2 d3] -> [d1 d2 d3 d0]
// t1 and t2 are scratch.
#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \
	MOVO       v4, t1; \
	MOVO       v5, v4; \
	MOVO       t1, v5; \
	MOVO       v2, t1; \
	PUNPCKLQDQ v2, t2; \
	PUNPCKHQDQ v3, v2; \
	PUNPCKHQDQ t2, v2; \
	PUNPCKLQDQ v3, t2; \
	MOVO       t1, v3; \
	MOVO       v6, t1; \
	PUNPCKHQDQ t2, v3; \
	PUNPCKLQDQ v7, t2; \
	PUNPCKHQDQ t2, v6; \
	PUNPCKLQDQ t1, t2; \
	PUNPCKHQDQ t2, v7

// HALF_ROUND applies the add/xor/rotate mixing step to four 64-bit columns at
// once (two lanes in each of the paired registers). Per lane:
//   a += m; a += b; d ^= a; d = rotl(d, 32)   (PSHUFD $0xB1 swaps 32-bit halves)
//   c += d;         b ^= c; b = rotl(b, 40)   (PSHUFB c40)
//   a += m'; a += b; d ^= a; d = rotl(d, 48)  (PSHUFB c48)
//   c += d;         b ^= c; b = rotl(b, 1)    ((b+b) ^ (b>>63))
// where a=(v0,v1), b=(v2,v3), c=(v4,v5), d=(v6,v7), m=(m0,m1), m'=(m2,m3).
// t0 is scratch and may alias m3, since m3 is dead by the time t0 is written.
#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \
	PADDQ  m0, v0;        \
	PADDQ  m1, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFD $0xB1, v6, v6; \
	PSHUFD $0xB1, v7, v7; \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	PSHUFB c40, v2;       \
	PSHUFB c40, v3;       \
	PADDQ  m2, v0;        \
	PADDQ  m3, v1;        \
	PADDQ  v2, v0;        \
	PADDQ  v3, v1;        \
	PXOR   v0, v6;        \
	PXOR   v1, v7;        \
	PSHUFB c48, v6;       \
	PSHUFB c48, v7;       \
	PADDQ  v6, v4;        \
	PADDQ  v7, v5;        \
	PXOR   v4, v2;        \
	PXOR   v5, v3;        \
	MOVOU  v2, t0;        \
	PADDQ  v2, t0;        \
	PSRLQ  $63, v2;       \
	PXOR   t0, v2;        \
	MOVOU  v3, t0;        \
	PADDQ  v3, t0;        \
	PSRLQ  $63, v3;       \
	PXOR   t0, v3

// LOAD_MSG gathers eight 64-bit message words from the block at SI into four
// registers: m0 = (m[i0], m[i1]), m1 = (m[i2], m[i3]), m2 = (m[i4], m[i5]),
// m3 = (m[i6], m[i7]), low lane first.
#define LOAD_MSG(m0, m1, m2, m3, i0, i1, i2, i3, i4, i5, i6, i7) \
	MOVQ   i0*8(SI), m0;     \
	PINSRQ $1, i1*8(SI), m0; \
	MOVQ   i2*8(SI), m1;     \
	PINSRQ $1, i3*8(SI), m1; \
	MOVQ   i4*8(SI), m2;     \
	PINSRQ $1, i5*8(SI), m2; \
	MOVQ   i6*8(SI), m3;     \
	PINSRQ $1, i7*8(SI), m3

// func fSSE4(h *[8]uint64, m *[16]uint64, c0, c1 uint64, flag uint64, rounds uint64)
//
// fSSE4 runs `rounds` rounds of mixing over the state h with message block m
// and XORs the result back into h in place. The message-word schedule below
// has ten entries and repeats from the start when rounds exceeds ten; with
// rounds == 0 no mixing is done and only the final XOR fold is applied.
//
// Register use:
//   AX = h, SI = m, R8 = c0, R9 = c1, CX = flag, BX = remaining rounds
//   R10 = 16-byte-aligned scratch slot inside the local frame
//   X0..X7 = working state, X8..X11 = message words / temporaries
//   X12, X15 = saved copy of h[0..3], X13 = c40 mask, X14 = c48 mask
//
// The frame is 24 bytes: a 16-byte scratch slot plus 8 bytes of slack so the
// slot can be rounded up to a 16-byte boundary (required by the MOVO accesses).
// The aligned address is kept in R10; the hardware SP and BP are never
// modified, so stack unwinding and frame-pointer walking stay valid for the
// whole function.
TEXT ·fSSE4(SB), NOSPLIT, $24-48
	MOVQ h+0(FP), AX
	MOVQ m+8(FP), SI
	MOVQ c0+16(FP), R8
	MOVQ c1+24(FP), R9
	MOVQ flag+32(FP), CX
	MOVQ rounds+40(FP), BX

	// R10 = SP rounded up to the next multiple of 16.
	MOVQ SP, R10
	ADDQ $15, R10
	ANDQ $~15, R10

	MOVOU ·iv3<>(SB), X0
	MOVO  X0, 0(R10)
	XORQ  CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0)

	MOVOU ·c40<>(SB), X13
	MOVOU ·c48<>(SB), X14

	// Keep h[0..3] in X12/X15 for the final fold.
	MOVOU 0(AX), X12
	MOVOU 16(AX), X15

	// X8 = (c0, c1)
	MOVQ   R8, X8
	PINSRQ $1, R9, X8

	// Working state: X0..X3 = h[0..7], X4..X7 = iv with counter and flag mixed in.
	MOVO  X12, X0
	MOVO  X15, X1
	MOVOU 32(AX), X2
	MOVOU 48(AX), X3
	MOVOU ·iv0<>(SB), X4
	MOVOU ·iv1<>(SB), X5
	MOVOU ·iv2<>(SB), X6

	PXOR X8, X6
	MOVO 0(R10), X7

loop:
	// Each round first decrements BX and leaves via the borrow flag once the
	// requested number of rounds has been performed.

	// schedule entry 0
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 0, 2, 4, 6, 1, 3, 5, 7)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 8, 10, 12, 14, 9, 11, 13, 15)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 1
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 14, 4, 9, 13, 10, 8, 15, 6)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 1, 0, 11, 5, 12, 2, 7, 3)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 2
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 11, 12, 5, 15, 8, 0, 2, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 10, 3, 7, 9, 14, 6, 1, 4)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 3
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 7, 3, 13, 11, 9, 1, 12, 14)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 2, 5, 4, 15, 6, 10, 0, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 4
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 9, 5, 2, 10, 0, 7, 4, 15)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 14, 11, 6, 3, 1, 12, 8, 13)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 5
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 2, 6, 0, 8, 12, 10, 11, 3)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 4, 7, 15, 1, 13, 5, 14, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 6
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 12, 1, 14, 4, 5, 15, 13, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 0, 6, 9, 8, 7, 3, 2, 11)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 7
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 13, 7, 12, 3, 11, 14, 1, 9)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 5, 15, 8, 2, 0, 4, 6, 10)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 8
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 6, 14, 11, 0, 15, 9, 3, 8)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 12, 13, 1, 10, 2, 7, 4, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// schedule entry 9
	SUBQ $1, BX; JCS done
	LOAD_MSG(X8, X9, X10, X11, 10, 8, 7, 1, 2, 4, 6, 5)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9)
	LOAD_MSG(X8, X9, X10, X11, 15, 9, 3, 13, 11, 14, 12, 0)
	HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14)
	SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9)

	// More than ten rounds requested: start the schedule over.
	JMP loop

done:
	// Fold the working state back into h:
	//   h[0..1] ^= X0 ^ X4, h[2..3] ^= X1 ^ X5,
	//   h[4..5] ^= X2 ^ X6, h[6..7] ^= X3 ^ X7.
	MOVOU 32(AX), X10
	MOVOU 48(AX), X11
	PXOR  X0, X12
	PXOR  X1, X15
	PXOR  X2, X10
	PXOR  X3, X11
	PXOR  X4, X12
	PXOR  X5, X15
	PXOR  X6, X10
	PXOR  X7, X11
	MOVOU X10, 32(AX)
	MOVOU X11, 48(AX)

	MOVOU X12, 0(AX)
	MOVOU X15, 16(AX)

	RET