// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x,go1.11,!gccgo,!appengine

#include "textflag.h"

// Implementation of Poly1305 using the vector facility (vx).
//
// Data layout used throughout this file:
//   - A 130-bit value is held as five 26-bit limbs, one limb per vector
//     register (x_0 is the least significant limb, x_4 the most).
//   - Every vector register is treated as two 64-bit lanes (doubleword 0
//     and doubleword 1), so two values are processed side by side.
//   - Registers named R5_n / R5SAVE_n hold 5 times limb n of the key; they
//     are used for the limbs that wrap around modulo 2**130-5.

// constants
#define MOD26 V0 // 26-bit limb mask in each doubleword
#define EX0   V1 // VPERM pattern used by EXPAND for limbs 0 and 1
#define EX1   V2 // VPERM pattern used by EXPAND for limbs 2 and 3
#define EX2   V3 // VPERM pattern used by EXPAND for limb 4

// temporaries (clobbered by MULTIPLY, REDUCE and by poly1305vx itself)
#define T_0 V4
#define T_1 V5
#define T_2 V6
#define T_3 V7
#define T_4 V8

// key (r)
#define R_0  V9
#define R_1  V10
#define R_2  V11
#define R_3  V12
#define R_4  V13
#define R5_1 V14
#define R5_2 V15
#define R5_3 V16
#define R5_4 V17

// Copy of the limbs of r kept in general purpose registers so that r can
// be re-inserted into the R_n registers when the final block(s) are handled.
#define RSAVE_0 R5
#define RSAVE_1 R6
#define RSAVE_2 R7
#define RSAVE_3 R8
#define RSAVE_4 R9

// Copy of 5*r (limbs 1..4), kept for the same purpose.
#define R5SAVE_1 V28
#define R5SAVE_2 V29
#define R5SAVE_3 V30
#define R5SAVE_4 V31

// message block
#define F_0 V18
#define F_1 V19
#define F_2 V20
#define F_3 V21
#define F_4 V22

// accumulator
#define H_0 V23
#define H_1 V24
#define H_2 V25
#define H_3 V26
#define H_4 V27

// Mask ANDed with the first 16 bytes of the key to obtain r.
GLOBL ·keyMask<>(SB), RODATA, $16
DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f

// VPERM pattern that reverses the 16 bytes of a vector.
GLOBL ·bswapMask<>(SB), RODATA, $16
DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100

// The four 16-byte constants below are loaded into MOD26, EX0, EX1 and EX2
// (V0-V3) with a single VLM, so their order must match those registers.
GLOBL ·constants<>(SB), RODATA, $64
// MOD26
DATA ·constants<>+0(SB)/8, $0x3ffffff
DATA ·constants<>+8(SB)/8, $0x3ffffff
// EX0
DATA ·constants<>+16(SB)/8, $0x0006050403020100
DATA ·constants<>+24(SB)/8, $0x1016151413121110
// EX1
DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
// EX2
DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d

// h = (f*g) % (2**130-5) [partial reduction]
//
// Inputs:  f0..f4   limbs of f
//          g0..g4   limbs of g
//          g51..g54 5*g1 .. 5*g4
// Outputs: h0..h4   unreduced limb sums, per lane:
//            h0 = f0*g0 + f1*g54 + f2*g53 + f3*g52 + f4*g51
//            h1 = f0*g1 + f1*g0  + f2*g54 + f3*g53 + f4*g52
//            h2 = f0*g2 + f1*g1  + f2*g0  + f3*g54 + f4*g53
//            h3 = f0*g3 + f1*g2  + f2*g1  + f3*g0  + f4*g54
//            h4 = f0*g4 + f1*g3  + f2*g2  + f3*g1  + f4*g0
// Clobbers: T_0..T_4.
// The h registers are written before all f/g inputs have been read, so they
// must not alias any f or g argument.
#define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
	VMLOF  f0, g0, h0        \
	VMLOF  f0, g1, h1        \
	VMLOF  f0, g2, h2        \
	VMLOF  f0, g3, h3        \
	VMLOF  f0, g4, h4        \
	VMLOF  f1, g54, T_0      \
	VMLOF  f1, g0, T_1       \
	VMLOF  f1, g1, T_2       \
	VMLOF  f1, g2, T_3       \
	VMLOF  f1, g3, T_4       \
	VMALOF f2, g53, h0, h0   \
	VMALOF f2, g54, h1, h1   \
	VMALOF f2, g0, h2, h2    \
	VMALOF f2, g1, h3, h3    \
	VMALOF f2, g2, h4, h4    \
	VMALOF f3, g52, T_0, T_0 \
	VMALOF f3, g53, T_1, T_1 \
	VMALOF f3, g54, T_2, T_2 \
	VMALOF f3, g0, T_3, T_3  \
	VMALOF f3, g1, T_4, T_4  \
	VMALOF f4, g51, h0, h0   \
	VMALOF f4, g52, h1, h1   \
	VMALOF f4, g53, h2, h2   \
	VMALOF f4, g54, h3, h3   \
	VMALOF f4, g0, h4, h4    \
	VAG    T_0, h0, h0       \
	VAG    T_1, h1, h1       \
	VAG    T_2, h2, h2       \
	VAG    T_3, h3, h3       \
	VAG    T_4, h4, h4

// carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
//
// Each step moves the bits above bit 25 of a limb into the next limb and
// masks the source limb with MOD26. The carry out of h4 is multiplied by 5
// (computed as (c<<2)+c) before it is added to h0.
// Requires MOD26 to be loaded. Clobbers T_0..T_4.
#define REDUCE(h0, h1, h2, h3, h4) \
	VESRLG $26, h0, T_0  \
	VESRLG $26, h3, T_1  \
	VN     MOD26, h0, h0 \
	VN     MOD26, h3, h3 \
	VAG    T_0, h1, h1   \
	VAG    T_1, h4, h4   \
	VESRLG $26, h1, T_2  \
	VESRLG $26, h4, T_3  \
	VN     MOD26, h1, h1 \
	VN     MOD26, h4, h4 \
	VESLG  $2, T_3, T_4  \
	VAG    T_3, T_4, T_4 \
	VAG    T_2, h2, h2   \
	VAG    T_4, h0, h0   \
	VESRLG $26, h2, T_0  \
	VESRLG $26, h0, T_1  \
	VN     MOD26, h2, h2 \
	VN     MOD26, h0, h0 \
	VAG    T_0, h3, h3   \
	VAG    T_1, h1, h1   \
	VESRLG $26, h3, T_2  \
	VN     MOD26, h3, h3 \
	VAG    T_2, h4, h4

// expand in0 into d[0] and in1 into d[1]
//
// Splits two 16-byte inputs into five limbs each: in0 goes to doubleword 0
// and in1 to doubleword 1 of d0..d4. d0..d3 are masked with MOD26; d4 is
// masked with the byte mask generated by VGBM $0x0707.
// Requires MOD26, EX0, EX1 and EX2 to be loaded.
// d1 is used as a scratch register before it receives its final value, and
// in0/in1 are read after d4 and d0 have been written, so the d registers
// must not alias in0 or in1.
#define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
	VGBM   $0x0707, d1       \ // d1=tmp
	VPERM  in0, in1, EX2, d4 \
	VPERM  in0, in1, EX0, d0 \
	VPERM  in0, in1, EX1, d2 \
	VN     d1, d4, d4        \
	VESRLG $26, d0, d1       \
	VESRLG $30, d2, d3       \
	VESRLG $4, d2, d2        \
	VN     MOD26, d0, d0     \
	VN     MOD26, d1, d1     \
	VN     MOD26, d2, d2     \
	VN     MOD26, d3, d3

// pack h4:h0 into h1:h0 (no carry)
//
// Limbs are merged with shifts and ORs only, so every limb must already fit
// in 26 bits. On exit h0 holds the low part and h1 the high part of the
// packed value; h2 and h3 are overwritten with intermediate values.
#define PACK(h0, h1, h2, h3, h4) \
	VESLG $26, h1, h1  \
	VESLG $26, h3, h3  \
	VO    h0, h1, h0   \
	VO    h2, h3, h2   \
	VESLG $4, h2, h2   \
	VLEIB $7, $48, h1  \
	VSLB  h1, h2, h2   \
	VO    h0, h2, h0   \
	VLEIB $7, $104, h1 \
	VSLB  h1, h4, h3   \
	VO    h3, h0, h0   \
	VLEIB $7, $24, h1  \
	VSRLB h1, h4, h1

// if h > 2**130-5 then h -= 2**130-5
//
// h1:h0 is the packed value produced by PACK. The macro computes h0+5 and
// the carries out of h0+5 and h1-4, turns the final carry into an all-zeros
// or all-ones mask, and uses that mask to choose between h0 and h0+5 with
// AND / AND-with-complement / OR, i.e. without branching.
// The result is left in h0. Clobbers t0, t1 and t2.
#define MOD(h0, h1, t0, t1, t2) \
	VZERO t0          \
	VLEIG $1, $5, t0  \
	VACCQ h0, t0, t1  \
	VAQ   h0, t0, t0  \
	VONE  t2          \
	VLEIG $1, $-4, t2 \
	VAQ   t2, t1, t1  \
	VACCQ h1, t1, t1  \
	VONE  t2          \
	VAQ   t2, t1, t1  \
	VN    h0, t1, t2  \
	VNC   t0, t1, t1  \
	VO    t1, t2, h0

// func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
//
// Computes the Poly1305 tag of the mlen bytes at m under the 32-byte key
// and stores the 16-byte result at out. Bytes 0-15 of key are masked with
// keyMask to form r; bytes 16-31 are added to the accumulator at the end.
//
// Register use:
//   R1 = out, R2 = m (advanced as input is consumed), R3 = bytes remaining,
//   R4 = key, R5-R9 = RSAVE_0..RSAVE_4 (R5 and R6 also serve as temporary
//   table addresses), R0 = padding byte value on the tail paths.
//   V0-V31 are all used (see the #defines at the top of this file).
//
// NOTE(review): the tail paths (b1/b2) leave R0 set to 1 on return, whereas
// hasVectorFacility below explicitly resets R0 - confirm whether callers
// rely on R0 being zero.
TEXT ·poly1305vx(SB), $0-32
	// This code processes up to 2 blocks (32 bytes) per iteration
	// using the algorithm described in:
	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key

	// load MOD26, EX0, EX1 and EX2
	MOVD $·constants<>(SB), R5
	VLM  (R5), MOD26, EX2

	// setup r (the same masked key is expanded into both lanes)
	VL   (R4), T_0
	MOVD $·keyMask<>(SB), R6
	VL   (R6), T_1
	VN   T_0, T_1, T_0
	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)

	// setup r*5
	VLEIG $0, $5, T_0
	VLEIG $1, $5, T_0

	// store r (for final block)
	VMLOF T_0, R_1, R5SAVE_1
	VMLOF T_0, R_2, R5SAVE_2
	VMLOF T_0, R_3, R5SAVE_3
	VMLOF T_0, R_4, R5SAVE_4
	VLGVG $0, R_0, RSAVE_0
	VLGVG $0, R_1, RSAVE_1
	VLGVG $0, R_2, RSAVE_2
	VLGVG $0, R_3, RSAVE_3
	VLGVG $0, R_4, RSAVE_4

	// skip r**2 calculation (at most one block of input)
	CMPBLE R3, $16, skip

	// calculate r**2
	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)
	VLEIG $0, $5, T_0
	VLEIG $1, $5, T_0
	VMLOF T_0, H_1, R5_1
	VMLOF T_0, H_2, R5_2
	VMLOF T_0, H_3, R5_3
	VMLOF T_0, H_4, R5_4
	VLR   H_0, R_0
	VLR   H_1, R_1
	VLR   H_2, R_2
	VLR   H_3, R_3
	VLR   H_4, R_4

	// initialize h
	VZERO H_0
	VZERO H_1
	VZERO H_2
	VZERO H_3
	VZERO H_4

loop:
	// With 32 bytes or fewer left, switch to the tail handling at b2.
	// Otherwise consume two full blocks, one per lane.
	CMPBLE R3, $32, b2
	VLM    (R2), T_0, T_1
	SUB    $32, R3
	MOVD   $32(R2), R2
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)

	// set the 2**128 bit of both blocks (bit 24 of limb 4 in each lane)
	VLEIB $4, $1, F_4
	VLEIB $12, $1, F_4

multiply:
	// h = (h + f) * R, where R is [r**2, r**2] in the main loop and is
	// replaced by [r**2, r] or [r, 1] before the tail paths jump here.
	VAG H_0, F_0, F_0
	VAG H_1, F_1, F_1
	VAG H_2, F_2, F_2
	VAG H_3, F_3, F_3
	VAG H_4, F_4, F_4
	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
	REDUCE(H_0, H_1, H_2, H_3, H_4)
	CMPBNE R3, $0, loop

finish:
	// sum vectors (add the two lanes of every limb together)
	VZERO  T_0
	VSUMQG H_0, T_0, H_0
	VSUMQG H_1, T_0, H_1
	VSUMQG H_2, T_0, H_2
	VSUMQG H_3, T_0, H_3
	VSUMQG H_4, T_0, H_4

	// h may be >= 2*(2**130-5) so we need to reduce it again
	REDUCE(H_0, H_1, H_2, H_3, H_4)

	// carry h1->h4
	VESRLG $26, H_1, T_1
	VN     MOD26, H_1, H_1
	VAQ    T_1, H_2, H_2
	VESRLG $26, H_2, T_2
	VN     MOD26, H_2, H_2
	VAQ    T_2, H_3, H_3
	VESRLG $26, H_3, T_3
	VN     MOD26, H_3, H_3
	VAQ    T_3, H_4, H_4

	// h is now < 2*(2**130-5)
	// pack h into h1 (hi) and h0 (lo)
	PACK(H_0, H_1, H_2, H_3, H_4)

	// if h > 2**130-5 then h -= 2**130-5
	MOD(H_0, H_1, T_0, T_1, T_2)

	// h += s (s is the second half of the key, at key+16)
	MOVD  $·bswapMask<>(SB), R5
	VL    (R5), T_1
	VL    16(R4), T_0
	VPERM T_0, T_0, T_1, T_0 // reverse bytes (to big)
	VAQ   T_0, H_0, H_0
	VPERM H_0, H_0, T_1, H_0 // reverse bytes (to little)
	VST   H_0, (R1)

	RET

b2:
	// 16 bytes or fewer left: handle them as a single block at b1.
	CMPBLE R3, $16, b1

	// 2 blocks remaining
	// The first block is full; the second holds mlen-16 bytes (1..16).
	SUB    $17, R3             // R3 = index of the last byte of block 2
	VL     (R2), T_0
	VLL    R3, 16(R2), T_1
	ADD    $1, R3              // R3 = number of bytes in block 2
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)      // full block: no padding byte inside T_1
	VLVGB  R3, R0, T_1         // partial block: insert 0x01 after the data
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
	CMPBNE R3, $16, 2(PC)      // partial block: padding already inserted
	VLEIB  $12, $1, F_4        // full block 2: set its 2**128 bit
	VLEIB  $4, $1, F_4         // block 1 is always full: set its 2**128 bit

	// setup [r²,r]
	VLVGG $1, RSAVE_0, R_0
	VLVGG $1, RSAVE_1, R_1
	VLVGG $1, RSAVE_2, R_2
	VLVGG $1, RSAVE_3, R_3
	VLVGG $1, RSAVE_4, R_4
	VPDI  $0, R5_1, R5SAVE_1, R5_1
	VPDI  $0, R5_2, R5SAVE_2, R5_2
	VPDI  $0, R5_3, R5SAVE_3, R5_3
	VPDI  $0, R5_4, R5SAVE_4, R5_4

	// R3 = 0 makes the CMPBNE after multiply fall through to finish.
	MOVD $0, R3
	BR   multiply

skip:
	// Reached when mlen <= 16: r**2 was not computed, so only h needs to
	// be initialized here. An empty message goes straight to finish.
	VZERO H_0
	VZERO H_1
	VZERO H_2
	VZERO H_3
	VZERO H_4

	CMPBEQ R3, $0, finish

b1:
	// 1 block remaining
	// The block holds 1..16 bytes and goes into lane 0; lane 1 is zero.
	SUB    $1, R3              // R3 = index of the last byte of the block
	VLL    R3, (R2), T_0
	ADD    $1, R3              // R3 = number of bytes in the block
	MOVBZ  $1, R0
	CMPBEQ R3, $16, 2(PC)      // full block: no padding byte inside T_0
	VLVGB  R3, R0, T_0         // partial block: insert 0x01 after the data
	VZERO  T_1
	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
	CMPBNE R3, $16, 2(PC)      // partial block: padding already inserted
	VLEIB  $4, $1, F_4         // full block: set its 2**128 bit
	VLEIG  $1, $1, R_0         // lane 1 multiplier = 1
	VZERO  R_1
	VZERO  R_2
	VZERO  R_3
	VZERO  R_4
	VZERO  R5_1
	VZERO  R5_2
	VZERO  R5_3
	VZERO  R5_4

	// setup [r, 1]
	VLVGG $0, RSAVE_0, R_0
	VLVGG $0, RSAVE_1, R_1
	VLVGG $0, RSAVE_2, R_2
	VLVGG $0, RSAVE_3, R_3
	VLVGG $0, RSAVE_4, R_4
	VPDI  $0, R5SAVE_1, R5_1, R5_1
	VPDI  $0, R5SAVE_2, R5_2, R5_2
	VPDI  $0, R5SAVE_3, R5_3, R5_3
	VPDI  $0, R5SAVE_4, R5_4, R5_4

	// R3 = 0 makes the CMPBNE after multiply fall through to finish.
	MOVD $0, R3
	BR   multiply

// hasVectorFacility takes no arguments and stores a 1-byte result
// (presumably a Go bool - confirm against the Go declaration): 1 when the
// vector facility can be used, 0 otherwise.
//
// It stores three doublewords of facility bits into a 24-byte stack buffer
// with STFLE, tests bit 0x40 of the first byte of the third doubleword
// (facility bit 129), and then executes a vector instruction to check that
// the facility has actually been enabled. Uses R0 (reset before returning),
// R1 and V16.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1
	XC    $24, 0(R1), 0(R1) // clear the storage
	MOVD  $2, R0            // R0 is the number of double words stored -1
	WORD  $0xB2B01000       // STFLE 0(R1)
	XOR   R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND   $0x40, R1
	BEQ   novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16
	VLGVB  $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB   $1, ret+0(FP) // have vx
	RET

novector:
	MOVB $0, ret+0(FP) // no vx
	RET