github.com/icodeface/tls@v0.0.0-20230910023335-34df9250cd12/internal/x/crypto/poly1305/sum_s390x.s (about)

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build s390x,go1.11,!gccgo,!appengine
     6  
     7  #include "textflag.h"
     8  
     9  // Implementation of Poly1305 using the vector facility (vx).
    10  
    11  // constants
        // V0-V3 are filled from ·constants<> by a single VLM at the top of poly1305vx.
    12  #define MOD26 V0
    13  #define EX0   V1
    14  #define EX1   V2
    15  #define EX2   V3
    16  
    17  // temporaries
        // T_0..T_4 are overwritten implicitly by the MULTIPLY and REDUCE macros, so
        // nothing may be kept in them across an invocation of either macro.
    18  #define T_0 V4
    19  #define T_1 V5
    20  #define T_2 V6
    21  #define T_3 V7
    22  #define T_4 V8
    23  
    24  // key (r)
        // R_0..R_4 hold the five limbs of the current multiplier, one value per
        // doubleword lane; R5_1..R5_4 hold 5x limbs 1..4 (computed with VMLOF by 5).
        // RSAVE_0..RSAVE_4 are general purpose registers (R5-R9) that keep
        // doubleword 0 of each limb of r, and R5SAVE_1..R5SAVE_4 keep 5*r, so the
        // tail code (b2/b1) can rebuild an [r², r] or [r, 1] multiplier.
        // NOTE: R5 and R6 are also used as scratch address registers in poly1305vx
        // before RSAVE_0/RSAVE_1 are written, and R5 again after the tail code.
    25  #define R_0  V9
    26  #define R_1  V10
    27  #define R_2  V11
    28  #define R_3  V12
    29  #define R_4  V13
    30  #define R5_1 V14
    31  #define R5_2 V15
    32  #define R5_3 V16
    33  #define R5_4 V17
    34  #define RSAVE_0 R5
    35  #define RSAVE_1 R6
    36  #define RSAVE_2 R7
    37  #define RSAVE_3 R8
    38  #define RSAVE_4 R9
    39  #define R5SAVE_1 V28
    40  #define R5SAVE_2 V29
    41  #define R5SAVE_3 V30
    42  #define R5SAVE_4 V31
    43  
    44  // message block
        // F_0..F_4 receive the five limbs produced by EXPAND for up to two blocks
        // (one block per doubleword lane).
    45  #define F_0 V18
    46  #define F_1 V19
    47  #define F_2 V20
    48  #define F_3 V21
    49  #define F_4 V22
    50  
    51  // accumulator
        // H_0..H_4 hold the five limbs of the running value h, one partial
        // accumulator per doubleword lane until the lanes are summed at finish.
    52  #define H_0 V23
    53  #define H_1 V24
    54  #define H_2 V25
    55  #define H_3 V26
    56  #define H_4 V27
    57  
    58  GLOBL ·keyMask<>(SB), RODATA, $16
        // keyMask is ANDed (VN) with the first 16 bytes of the key in poly1305vx
        // before those bytes are expanded into the limbs of r.
    59  DATA ·keyMask<>+0(SB)/8, $0xffffff0ffcffff0f
    60  DATA ·keyMask<>+8(SB)/8, $0xfcffff0ffcffff0f
    61  
    62  GLOBL ·bswapMask<>(SB), RODATA, $16
        // bswapMask is a VPERM selector listing byte indices 15 down to 0; it is
        // used in poly1305vx to reverse the byte order of a 16-byte vector.
    63  DATA ·bswapMask<>+0(SB)/8, $0x0f0e0d0c0b0a0908
    64  DATA ·bswapMask<>+8(SB)/8, $0x0706050403020100
    65  
    66  GLOBL ·constants<>(SB), RODATA, $64
        // Four 16-byte vectors loaded in order into MOD26, EX0, EX1 and EX2.
    67  // MOD26
        // 2**26-1 in each doubleword: masks a value down to one 26-bit limb.
    68  DATA ·constants<>+0(SB)/8, $0x3ffffff
    69  DATA ·constants<>+8(SB)/8, $0x3ffffff
        // EX0..EX2 are the VPERM selectors used by EXPAND. In each one the second
        // doubleword equals the first with 0x10 added to every byte index, so
        // doubleword 0 is gathered from in0 and doubleword 1 from in1.
    70  // EX0
        // bytes 6..0 of the input (feeds limbs 0 and 1)
    71  DATA ·constants<>+16(SB)/8, $0x0006050403020100
    72  DATA ·constants<>+24(SB)/8, $0x1016151413121110
    73  // EX1
        // bytes 12..6 of the input (feeds limbs 2 and 3)
    74  DATA ·constants<>+32(SB)/8, $0x060c0b0a09080706
    75  DATA ·constants<>+40(SB)/8, $0x161c1b1a19181716
    76  // EX2
        // bytes 15..13 of the input in the low three bytes (feeds limb 4)
    77  DATA ·constants<>+48(SB)/8, $0x0d0d0d0d0d0f0e0d
    78  DATA ·constants<>+56(SB)/8, $0x1d1d1d1d1d1f1e1d
    79  
    80  // h = (f*g) % (2**130-5) [partial reduction]
        //
        // f0..f4 and g0..g4 are the limbs of the two factors; g51..g54 must hold
        // 5*g1..5*g4. Per lane the macro computes
        //   h0 = f0*g0 + f1*g54 + f2*g53 + f3*g52 + f4*g51
        //   h1 = f0*g1 + f1*g0  + f2*g54 + f3*g53 + f4*g52
        //   h2 = f0*g2 + f1*g1  + f2*g0  + f3*g54 + f4*g53
        //   h3 = f0*g3 + f1*g2  + f2*g1  + f3*g0  + f4*g54
        //   h4 = f0*g4 + f1*g3  + f2*g2  + f3*g1  + f4*g0
        // The f0/f2/f4 terms accumulate directly in h0..h4 and the f1/f3 terms in
        // T_0..T_4; the two halves are added at the end. No carries are propagated
        // here (see REDUCE).
        // Clobbers T_0..T_4. h0..h4 are written before the inputs are last read,
        // so the outputs must not alias any f or g register.
    81  #define MULTIPLY(f0, f1, f2, f3, f4, g0, g1, g2, g3, g4, g51, g52, g53, g54, h0, h1, h2, h3, h4) \
    82  	VMLOF  f0, g0, h0        \
    83  	VMLOF  f0, g1, h1        \
    84  	VMLOF  f0, g2, h2        \
    85  	VMLOF  f0, g3, h3        \
    86  	VMLOF  f0, g4, h4        \
    87  	VMLOF  f1, g54, T_0      \
    88  	VMLOF  f1, g0, T_1       \
    89  	VMLOF  f1, g1, T_2       \
    90  	VMLOF  f1, g2, T_3       \
    91  	VMLOF  f1, g3, T_4       \
    92  	VMALOF f2, g53, h0, h0   \
    93  	VMALOF f2, g54, h1, h1   \
    94  	VMALOF f2, g0, h2, h2    \
    95  	VMALOF f2, g1, h3, h3    \
    96  	VMALOF f2, g2, h4, h4    \
    97  	VMALOF f3, g52, T_0, T_0 \
    98  	VMALOF f3, g53, T_1, T_1 \
    99  	VMALOF f3, g54, T_2, T_2 \
   100  	VMALOF f3, g0, T_3, T_3  \
   101  	VMALOF f3, g1, T_4, T_4  \
   102  	VMALOF f4, g51, h0, h0   \
   103  	VMALOF f4, g52, h1, h1   \
   104  	VMALOF f4, g53, h2, h2   \
   105  	VMALOF f4, g54, h3, h3   \
   106  	VMALOF f4, g0, h4, h4    \
   107  	VAG    T_0, h0, h0       \
   108  	VAG    T_1, h1, h1       \
   109  	VAG    T_2, h2, h2       \
   110  	VAG    T_3, h3, h3       \
   111  	VAG    T_4, h4, h4
   112  
   113  // carry h0->h1 h3->h4, h1->h2 h4->h0, h0->h1 h2->h3, h3->h4
        //
        // Partial carry propagation over the five limbs, per doubleword lane. Each
        // step moves the bits above bit 25 of one limb into the next limb and masks
        // the source limb with MOD26. The carry out of h4 wraps around into h0
        // multiplied by 5. Two independent carry chains are interleaved.
        // Operates in place on h0..h4 and clobbers T_0..T_4.
   114  #define REDUCE(h0, h1, h2, h3, h4) \
   115  	VESRLG $26, h0, T_0  \
   116  	VESRLG $26, h3, T_1  \
   117  	VN     MOD26, h0, h0 \
   118  	VN     MOD26, h3, h3 \
   119  	VAG    T_0, h1, h1   \
   120  	VAG    T_1, h4, h4   \
   121  	VESRLG $26, h1, T_2  \
   122  	VESRLG $26, h4, T_3  \
   123  	VN     MOD26, h1, h1 \
   124  	VN     MOD26, h4, h4 \
   125  	VESLG  $2, T_3, T_4  \ // T_4 = 4 * (carry out of h4)
   126  	VAG    T_3, T_4, T_4 \ // T_4 = 5 * (carry out of h4)
   127  	VAG    T_2, h2, h2   \
   128  	VAG    T_4, h0, h0   \
   129  	VESRLG $26, h2, T_0  \
   130  	VESRLG $26, h0, T_1  \
   131  	VN     MOD26, h2, h2 \
   132  	VN     MOD26, h0, h0 \
   133  	VAG    T_0, h3, h3   \
   134  	VAG    T_1, h1, h1   \
   135  	VESRLG $26, h3, T_2  \
   136  	VN     MOD26, h3, h3 \
   137  	VAG    T_2, h4, h4
   138  
   139  // expand in0 into d[0] and in1 into d[1]
        //
        // Splits two 16-byte little-endian values into five limbs each; in0 goes to
        // doubleword 0 and in1 to doubleword 1 of every output register:
        //   d0 = bits   0..25     d1 = bits  26..51     d2 = bits 52..77
        //   d3 = bits  78..103    d4 = bits 104..127 (24 bits)
        // The 2**128 pad bit is NOT set here; callers insert it into d4 afterwards.
        // Requires MOD26, EX0, EX1 and EX2 to be loaded. d0, d1 and d4 are written
        // before the inputs are last read, so in0/in1 must not alias them.
   140  #define EXPAND(in0, in1, d0, d1, d2, d3, d4) \
   141  	VGBM   $0x0707, d1       \ // d1=tmp (mask keeping the low 3 bytes of each doubleword)
   142  	VPERM  in0, in1, EX2, d4 \
   143  	VPERM  in0, in1, EX0, d0 \
   144  	VPERM  in0, in1, EX1, d2 \
   145  	VN     d1, d4, d4        \
   146  	VESRLG $26, d0, d1       \
   147  	VESRLG $30, d2, d3       \
   148  	VESRLG $4, d2, d2        \
   149  	VN     MOD26, d0, d0     \
   150  	VN     MOD26, d1, d1     \
   151  	VN     MOD26, d2, d2     \
   152  	VN     MOD26, d3, d3
   153  
   154  // pack h4:h0 into h1:h0 (no carry)
        //
        // Recombines five limbs into one value: h0 | h1<<26 | h2<<52 | h3<<78 |
        // h4<<104. On exit h0 holds the low 128 bits and h1 holds the bits of h4
        // that were shifted out above bit 127 (h4 >> 24).
        // The limbs are ORed together, so each must already be carried down to its
        // 26-bit width ("no carry"). h1 doubles as the holder of the byte-shift
        // counts (48, 104 and 24 bits) for VSLB/VSRLB. h2 and h3 are clobbered.
   155  #define PACK(h0, h1, h2, h3, h4) \
   156  	VESLG $26, h1, h1  \
   157  	VESLG $26, h3, h3  \
   158  	VO    h0, h1, h0   \
   159  	VO    h2, h3, h2   \
   160  	VESLG $4, h2, h2   \
   161  	VLEIB $7, $48, h1  \
   162  	VSLB  h1, h2, h2   \
   163  	VO    h0, h2, h0   \
   164  	VLEIB $7, $104, h1 \
   165  	VSLB  h1, h4, h3   \
   166  	VO    h3, h0, h0   \
   167  	VLEIB $7, $24, h1  \
   168  	VSRLB h1, h4, h1
   169  
   170  // if h >= 2**130-5 then h -= 2**130-5
        //
        // h0 holds the low 128 bits of h and h1 the bits from 2**128 upwards (as
        // produced by PACK). t0 = h0 + 5 is computed together with its carry; the
        // carry is then propagated through h1 + (-4) to detect h + 5 >= 2**130.
        // The final three instructions select, without branching, t0 when that
        // carry occurred and the original h0 otherwise.
        // Only the low 128 bits are produced (in h0); h1 is left unchanged.
        // t0, t1 and t2 are scratch.
   171  #define MOD(h0, h1, t0, t1, t2) \
   172  	VZERO t0          \
   173  	VLEIG $1, $5, t0  \
   174  	VACCQ h0, t0, t1  \
   175  	VAQ   h0, t0, t0  \
   176  	VONE  t2          \
   177  	VLEIG $1, $-4, t2 \
   178  	VAQ   t2, t1, t1  \
   179  	VACCQ h1, t1, t1  \
   180  	VONE  t2          \
   181  	VAQ   t2, t1, t1  \
   182  	VN    h0, t1, t2  \
   183  	VNC   t0, t1, t1  \
   184  	VO    t1, t2, h0
   185  
   186  // func poly1305vx(out *[16]byte, m *byte, mlen uint64, key *[32]byte)
        //
        // Computes the Poly1305 tag of the mlen bytes at m and stores the 16-byte
        // result at out.
        //   key[0:16]  is r; it is masked with ·keyMask<> before use.
        //   key[16:32] is s; it is added to the accumulator just before the store.
        // Every vector register holds one limb for two lanes (doubleword 0 and
        // doubleword 1). The main loop consumes 32 bytes per iteration, one block
        // per lane, multiplying both lanes by r². The last one or two blocks are
        // handled at b2/b1 with an [r², r] or [r, 1] multiplier, and the lanes are
        // summed at finish.
   187  TEXT ·poly1305vx(SB), $0-32
   188  	// This code processes up to 2 blocks (32 bytes) per iteration
   189  	// using the algorithm described in:
   190  	// NEON crypto, Daniel J. Bernstein & Peter Schwabe
   191  	// https://cryptojedi.org/papers/neoncrypto-20120320.pdf
   192  	LMG out+0(FP), R1, R4 // R1=out, R2=m, R3=mlen, R4=key
   193  
   194  	// load MOD26, EX0, EX1 and EX2
   195  	MOVD $·constants<>(SB), R5
   196  	VLM  (R5), MOD26, EX2
   197  
   198  	// setup r
        	// (mask the first 16 key bytes, then expand them into limbs in both lanes)
   199  	VL   (R4), T_0
   200  	MOVD $·keyMask<>(SB), R6
   201  	VL   (R6), T_1
   202  	VN   T_0, T_1, T_0
   203  	EXPAND(T_0, T_0, R_0, R_1, R_2, R_3, R_4)
   204  
   205  	// setup r*5
        	// (T_0 = 5 in both doublewords)
   206  	VLEIG $0, $5, T_0
   207  	VLEIG $1, $5, T_0
   208  
   209  	// store r (for final block)
        	// 5*r limbs go to R5SAVE_1..4; doubleword 0 of each r limb goes to the
        	// general purpose registers RSAVE_0..4 (this overwrites R5 and R6).
   210  	VMLOF T_0, R_1, R5SAVE_1
   211  	VMLOF T_0, R_2, R5SAVE_2
   212  	VMLOF T_0, R_3, R5SAVE_3
   213  	VMLOF T_0, R_4, R5SAVE_4
   214  	VLGVG $0, R_0, RSAVE_0
   215  	VLGVG $0, R_1, RSAVE_1
   216  	VLGVG $0, R_2, RSAVE_2
   217  	VLGVG $0, R_3, RSAVE_3
   218  	VLGVG $0, R_4, RSAVE_4
   219  
   220  	// skip r**2 calculation
        	// (taken when mlen <= 16: at most one block, handled at skip/b1)
   221  	CMPBLE R3, $16, skip
   222  
   223  	// calculate r**2
        	// H = r*r (reduced); then R5_n = 5*H_n and R_n = H_n, so both lanes of the
        	// multiplier hold r².
   224  	MULTIPLY(R_0, R_1, R_2, R_3, R_4, R_0, R_1, R_2, R_3, R_4, R5SAVE_1, R5SAVE_2, R5SAVE_3, R5SAVE_4, H_0, H_1, H_2, H_3, H_4)
   225  	REDUCE(H_0, H_1, H_2, H_3, H_4)
   226  	VLEIG $0, $5, T_0
   227  	VLEIG $1, $5, T_0
   228  	VMLOF T_0, H_1, R5_1
   229  	VMLOF T_0, H_2, R5_2
   230  	VMLOF T_0, H_3, R5_3
   231  	VMLOF T_0, H_4, R5_4
   232  	VLR   H_0, R_0
   233  	VLR   H_1, R_1
   234  	VLR   H_2, R_2
   235  	VLR   H_3, R_3
   236  	VLR   H_4, R_4
   237  
   238  	// initialize h
   239  	VZERO H_0
   240  	VZERO H_1
   241  	VZERO H_2
   242  	VZERO H_3
   243  	VZERO H_4
   244  
   245  loop:
        	// With 32 or fewer bytes left, go to the tail code. Otherwise load two
        	// full blocks (one per lane), advance m and decrease the remaining length.
   246  	CMPBLE R3, $32, b2
   247  	VLM    (R2), T_0, T_1
   248  	SUB    $32, R3
   249  	MOVD   $32(R2), R2
   250  	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
        	// set the 2**128 pad bit (bit 24 of limb 4) in both lanes
   251  	VLEIB  $4, $1, F_4
   252  	VLEIB  $12, $1, F_4
   253  
   254  multiply:
        	// h = (h + f) * multiplier in each lane, followed by a partial reduction.
        	// Also entered from b2 and b1 with R3 = 0 so that the branch below falls
        	// through to finish.
   255  	VAG    H_0, F_0, F_0
   256  	VAG    H_1, F_1, F_1
   257  	VAG    H_2, F_2, F_2
   258  	VAG    H_3, F_3, F_3
   259  	VAG    H_4, F_4, F_4
   260  	MULTIPLY(F_0, F_1, F_2, F_3, F_4, R_0, R_1, R_2, R_3, R_4, R5_1, R5_2, R5_3, R5_4, H_0, H_1, H_2, H_3, H_4)
   261  	REDUCE(H_0, H_1, H_2, H_3, H_4)
   262  	CMPBNE R3, $0, loop
   263  
   264  finish:
   265  	// sum vectors
        	// (adds the two lanes of each limb into a single 128-bit value)
   266  	VZERO  T_0
   267  	VSUMQG H_0, T_0, H_0
   268  	VSUMQG H_1, T_0, H_1
   269  	VSUMQG H_2, T_0, H_2
   270  	VSUMQG H_3, T_0, H_3
   271  	VSUMQG H_4, T_0, H_4
   272  
   273  	// h may be >= 2*(2**130-5) so we need to reduce it again
   274  	REDUCE(H_0, H_1, H_2, H_3, H_4)
   275  
   276  	// carry h1->h4
   277  	VESRLG $26, H_1, T_1
   278  	VN     MOD26, H_1, H_1
   279  	VAQ    T_1, H_2, H_2
   280  	VESRLG $26, H_2, T_2
   281  	VN     MOD26, H_2, H_2
   282  	VAQ    T_2, H_3, H_3
   283  	VESRLG $26, H_3, T_3
   284  	VN     MOD26, H_3, H_3
   285  	VAQ    T_3, H_4, H_4
   286  
   287  	// h is now < 2*(2**130-5)
   288  	// pack h into h1 (hi) and h0 (lo)
   289  	PACK(H_0, H_1, H_2, H_3, H_4)
   290  
   291  	// if h >= 2**130-5 then h -= 2**130-5
   292  	MOD(H_0, H_1, T_0, T_1, T_2)
   293  
   294  	// h += s
        	// s = key[16:32] is byte-reversed before the 128-bit add and the sum is
        	// byte-reversed again before being stored to out. R5 is free for reuse
        	// here because RSAVE_0 is no longer needed.
   295  	MOVD  $·bswapMask<>(SB), R5
   296  	VL    (R5), T_1
   297  	VL    16(R4), T_0
   298  	VPERM T_0, T_0, T_1, T_0    // reverse bytes (to big)
   299  	VAQ   T_0, H_0, H_0
   300  	VPERM H_0, H_0, T_1, H_0    // reverse bytes (to little)
   301  	VST   H_0, (R1)
   302  
   303  	RET
   304  
   305  b2:
        	// reached with 1..32 bytes remaining; 16 or fewer are handled at b1
   306  	CMPBLE R3, $16, b1
   307  
   308  	// 2 blocks remaining
        	// The first block is full. VLL takes the index of the last byte to load,
        	// hence R3 = mlen-17; afterwards R3 = length of the second block (1..16).
        	// A short second block gets a single 1 byte inserted right after its data.
   309  	SUB    $17, R3
   310  	VL     (R2), T_0
   311  	VLL    R3, 16(R2), T_1
   312  	ADD    $1, R3
   313  	MOVBZ  $1, R0
   314  	CMPBEQ R3, $16, 2(PC)
   315  	VLVGB  R3, R0, T_1
   316  	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
        	// the 2**128 bit is set in lane 1 only when the second block is a full
        	// 16 bytes; lane 0 (the full first block) always gets it
   317  	CMPBNE R3, $16, 2(PC)
   318  	VLEIB  $12, $1, F_4
   319  	VLEIB  $4, $1, F_4
   320  
   321  	// setup [r²,r]
        	// (lane 0 keeps r²; lane 1 is replaced by the saved r and 5*r)
   322  	VLVGG $1, RSAVE_0, R_0
   323  	VLVGG $1, RSAVE_1, R_1
   324  	VLVGG $1, RSAVE_2, R_2
   325  	VLVGG $1, RSAVE_3, R_3
   326  	VLVGG $1, RSAVE_4, R_4
   327  	VPDI  $0, R5_1, R5SAVE_1, R5_1
   328  	VPDI  $0, R5_2, R5SAVE_2, R5_2
   329  	VPDI  $0, R5_3, R5SAVE_3, R5_3
   330  	VPDI  $0, R5_4, R5SAVE_4, R5_4
   331  
        	// R3 = 0 makes the branch at the end of multiply fall through to finish
   332  	MOVD $0, R3
   333  	BR   multiply
   334  
   335  skip:
        	// mlen <= 16: r² was never computed. Zero the accumulator; an empty
        	// message goes straight to finish, anything else falls through to b1.
   336  	VZERO H_0
   337  	VZERO H_1
   338  	VZERO H_2
   339  	VZERO H_3
   340  	VZERO H_4
   341  
   342  	CMPBEQ R3, $0, finish
   343  
   344  b1:
   345  	// 1 block remaining
        	// R3 = 1..16 bytes. VLL takes the index of the last byte to load, hence the
        	// SUB/ADD pair. A short block gets a single 1 byte inserted right after
        	// its data; lane 1 of the message is zero.
   346  	SUB    $1, R3
   347  	VLL    R3, (R2), T_0
   348  	ADD    $1, R3
   349  	MOVBZ  $1, R0
   350  	CMPBEQ R3, $16, 2(PC)
   351  	VLVGB  R3, R0, T_0
   352  	VZERO  T_1
   353  	EXPAND(T_0, T_1, F_0, F_1, F_2, F_3, F_4)
        	// the 2**128 bit is set in lane 0 only when the block is a full 16 bytes
   354  	CMPBNE R3, $16, 2(PC)
   355  	VLEIB  $4, $1, F_4
        	// lane 1 of the multiplier becomes 1 (limb 0 = 1, every other limb and
        	// every 5x limb = 0)
   356  	VLEIG  $1, $1, R_0
   357  	VZERO  R_1
   358  	VZERO  R_2
   359  	VZERO  R_3
   360  	VZERO  R_4
   361  	VZERO  R5_1
   362  	VZERO  R5_2
   363  	VZERO  R5_3
   364  	VZERO  R5_4
   365  
   366  	// setup [r, 1]
        	// (lane 0 receives the saved r and 5*r)
   367  	VLVGG $0, RSAVE_0, R_0
   368  	VLVGG $0, RSAVE_1, R_1
   369  	VLVGG $0, RSAVE_2, R_2
   370  	VLVGG $0, RSAVE_3, R_3
   371  	VLVGG $0, RSAVE_4, R_4
   372  	VPDI  $0, R5SAVE_1, R5_1, R5_1
   373  	VPDI  $0, R5SAVE_2, R5_2, R5_2
   374  	VPDI  $0, R5SAVE_3, R5_3, R5_3
   375  	VPDI  $0, R5SAVE_4, R5_4, R5_4
   376  
        	// R3 = 0 makes the branch at the end of multiply fall through to finish
   377  	MOVD $0, R3
   378  	BR   multiply
   379  
   380  TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
        	// Reports through the single result byte whether the vector facility can
        	// be used: 1 if so, 0 otherwise.
        	// NOTE(review): presumably declared in Go as func hasVectorFacility() bool
        	// - confirm against the Go stub.
        	//
        	// The 24-byte frame is the buffer for the facility list written by STFLE.
   381  	MOVD  $x-24(SP), R1
   382  	XC    $24, 0(R1), 0(R1) // clear the storage
   383  	MOVD  $2, R0            // R0 is the number of double words stored -1
   384  	WORD  $0xB2B01000       // STFLE 0(R1)
   385  	XOR   R0, R0            // reset the value of R0
        	// z-8(SP) is the byte at offset 16 of the buffer (facility bits 128-135);
        	// the 0x40 mask selects bit 129
   386  	MOVBZ z-8(SP), R1
   387  	AND   $0x40, R1
   388  	BEQ   novector
   389  
   390  vectorinstalled:
   391  	// check if the vector instruction has been enabled
        	// (write 0xF into byte 0 of V16, read it back and compare)
   392  	VLEIB  $0, $0xF, V16
   393  	VLGVB  $0, V16, R1
   394  	CMPBNE R1, $0xF, novector
   395  	MOVB   $1, ret+0(FP)      // have vx
   396  	RET
   397  
   398  novector:
   399  	MOVB $0, ret+0(FP) // no vx
   400  	RET