github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_amd64.s

     1  // This file contains a constant-time, 64-bit assembly implementation of
     2  // the SM2 P-256 curve. The optimizations performed here are described in detail in:
     3  // S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
     4  //                            256-bit primes"
     5  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     6  // https://eprint.iacr.org/2013/816.pdf
     7  // https://github.com/emmansun/gmsm/wiki/SM2-WWMM-(2)
     8  //go:build !(purego || plugin)
     9  
    10  #include "textflag.h"
    11  #include "p256_macros_amd64.s"
    12  #define t1 R15
    13  
    14  /* ---------------------------------------*/
    15  // func p256Sqr(res, in *p256Element, n int)
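        // Performs n back-to-back squarings in the Montgomery domain (res = in^(2^n) mod p).
        // n must be >= 1, since the loop is a do-while; an ADX/BMI2 round is used when the CPU supports it.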
    16  TEXT ·p256Sqr(SB),NOSPLIT,$0
    17  	MOVQ res+0(FP), res_ptr
    18  	MOVQ in+8(FP), x_ptr
    19  	MOVQ n+16(FP), BX
    20  	CMPB ·supportBMI2+0(SB), $0x01
    21  	JEQ  sqrBMI2
    22  
    23  sqrLoop:
    24  	p256SqrRound(t1)
    25  	DECQ BX                              
    26  	JNE  sqrLoop
    27  	RET
    28  	
    29  sqrBMI2:
    30  	p256SqrRoundAdx(t1)
    31  	DECQ BX
    32  	JNE  sqrBMI2
    33  	RET
    34  
    35  /* ---------------------------------------*/
    36  // func p256OrdSqr(res, in *p256OrdElement, n int)
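        // Same repeated-squaring loop as p256Sqr, but the reduction is modulo the SM2 group
        // order (scalar field) rather than the field prime. n must be >= 1.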
    37  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
    38  	MOVQ res+0(FP), res_ptr
    39  	MOVQ in+8(FP), x_ptr
    40  	MOVQ n+16(FP), BX
    41  
    42  	CMPB ·supportBMI2+0(SB), $0x01
    43  	JEQ  ordSqrLoopBMI2
    44  
    45  ordSqrLoop:
    46  	p256OrdSqrRound(t1)
    47  	DECQ BX
    48  	JNE ordSqrLoop
    49  
    50  	RET
    51  
    52  ordSqrLoopBMI2:
    53  	p256OrdSqrRoundAdx(t1)
    54  	DECQ BX
    55  	JNE ordSqrLoopBMI2
    56  
    57  	RET
    58  	
    59  /* ---------------------------------------*/
    60  #undef res_ptr
    61  #undef x_ptr
    62  #undef y_ptr
    63  
    64  #undef acc0
    65  #undef acc1
    66  #undef acc2
    67  #undef acc3
    68  #undef acc4
    69  #undef acc5
    70  #undef t0
    71  #undef t1
    72  /* ---------------------------------------*/
    73  #define mul0 AX
    74  #define mul1 DX
    75  #define acc0 BX
    76  #define acc1 CX
    77  #define acc2 R8
    78  #define acc3 R9
    79  #define acc4 R10
    80  #define acc5 R11
    81  #define acc6 R12
    82  #define acc7 R13
    83  #define t0 R14
    84  #define t1 R15
    85  #define t2 DI
    86  #define t3 SI
    87  #define hlp BP
    88  
    89  /* ---------------------------------------*/
    90  // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] * [t3, t2, t1, t0]
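        // Operands and result are in the Montgomery domain. The second operand in t0..t3 is
        // only read; acc0..acc3, mul0/mul1 and hlp are clobbered. A MULX/ADX variant is
        // selected at run time via ·supportBMI2.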
    91  TEXT sm2P256MulInternal(SB),NOSPLIT,$8
    92  	CMPB ·supportBMI2+0(SB), $0x01
    93  	JEQ  internalMulBMI2
    94  
    95  	MOVQ acc4, mul0
    96  	MULQ t0
    97  	MOVQ mul0, acc0
    98  	MOVQ mul1, acc1
    99  
   100  	MOVQ acc4, mul0
   101  	MULQ t1
   102  	ADDQ mul0, acc1
   103  	ADCQ $0, mul1
   104  	MOVQ mul1, acc2
   105  
   106  	MOVQ acc4, mul0
   107  	MULQ t2
   108  	ADDQ mul0, acc2
   109  	ADCQ $0, mul1
   110  	MOVQ mul1, acc3
   111  
   112  	MOVQ acc4, mul0
   113  	MULQ t3
   114  	ADDQ mul0, acc3
   115  	ADCQ $0, mul1
   116  	MOVQ mul1, acc4
   117  
   118  	MOVQ acc5, mul0
   119  	MULQ t0
   120  	ADDQ mul0, acc1
   121  	ADCQ $0, mul1
   122  	MOVQ mul1, hlp
   123  
   124  	MOVQ acc5, mul0
   125  	MULQ t1
   126  	ADDQ hlp, acc2
   127  	ADCQ $0, mul1
   128  	ADDQ mul0, acc2
   129  	ADCQ $0, mul1
   130  	MOVQ mul1, hlp
   131  
   132  	MOVQ acc5, mul0
   133  	MULQ t2
   134  	ADDQ hlp, acc3
   135  	ADCQ $0, mul1
   136  	ADDQ mul0, acc3
   137  	ADCQ $0, mul1
   138  	MOVQ mul1, hlp
   139  
   140  	MOVQ acc5, mul0
   141  	MULQ t3
   142  	ADDQ hlp, acc4
   143  	ADCQ $0, mul1
   144  	ADDQ mul0, acc4
   145  	ADCQ $0, mul1
   146  	MOVQ mul1, acc5
   147  
   148  	MOVQ acc6, mul0
   149  	MULQ t0
   150  	ADDQ mul0, acc2
   151  	ADCQ $0, mul1
   152  	MOVQ mul1, hlp
   153  
   154  	MOVQ acc6, mul0
   155  	MULQ t1
   156  	ADDQ hlp, acc3
   157  	ADCQ $0, mul1
   158  	ADDQ mul0, acc3
   159  	ADCQ $0, mul1
   160  	MOVQ mul1, hlp
   161  
   162  	MOVQ acc6, mul0
   163  	MULQ t2
   164  	ADDQ hlp, acc4
   165  	ADCQ $0, mul1
   166  	ADDQ mul0, acc4
   167  	ADCQ $0, mul1
   168  	MOVQ mul1, hlp
   169  
   170  	MOVQ acc6, mul0
   171  	MULQ t3
   172  	ADDQ hlp, acc5
   173  	ADCQ $0, mul1
   174  	ADDQ mul0, acc5
   175  	ADCQ $0, mul1
   176  	MOVQ mul1, acc6
   177  
   178  	MOVQ acc7, mul0
   179  	MULQ t0
   180  	ADDQ mul0, acc3
   181  	ADCQ $0, mul1
   182  	MOVQ mul1, hlp
   183  
   184  	MOVQ acc7, mul0
   185  	MULQ t1
   186  	ADDQ hlp, acc4
   187  	ADCQ $0, mul1
   188  	ADDQ mul0, acc4
   189  	ADCQ $0, mul1
   190  	MOVQ mul1, hlp
   191  
   192  	MOVQ acc7, mul0
   193  	MULQ t2
   194  	ADDQ hlp, acc5
   195  	ADCQ $0, mul1
   196  	ADDQ mul0, acc5
   197  	ADCQ $0, mul1
   198  	MOVQ mul1, hlp
   199  
   200  	MOVQ acc7, mul0
   201  	MULQ t3
   202  	ADDQ hlp, acc6
   203  	ADCQ $0, mul1
   204  	ADDQ mul0, acc6
   205  	ADCQ $0, mul1
   206  	MOVQ mul1, acc7
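        	// The full 512-bit product is now in [acc7, ..., acc1, acc0] (acc0 least significant);
        	// sm2P256MulReductionInline (from p256_macros_amd64.s) performs the Montgomery
        	// reduction of the low half, after which the high half is added in below.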
   207  	sm2P256MulReductionInline
   208  	
   209  	MOVQ $0, BP
   210  	// Add bits [511:256] of the result
   211  	ADCQ acc0, acc4
   212  	ADCQ acc1, acc5
   213  	ADCQ acc2, acc6
   214  	ADCQ acc3, acc7
   215  	ADCQ $0, hlp
   216  	// Copy result
   217  	MOVQ acc4, acc0
   218  	MOVQ acc5, acc1
   219  	MOVQ acc6, acc2
   220  	MOVQ acc7, acc3
   221  	// Subtract p256
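        	// p = 0xFFFFFFFEFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFF00000000 FFFFFFFFFFFFFFFF (SM2 prime):
        	// limbs 0 and 2 are all ones, hence the $-1 immediates; limbs 1 and 3 come from p256p<>.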
   222  	SUBQ $-1, acc4
   223  	SBBQ p256p<>+0x08(SB), acc5
   224  	SBBQ $-1, acc6
   225  	SBBQ p256p<>+0x018(SB), acc7
   226  	SBBQ $0, hlp
   227  	// If the result of the subtraction is negative, restore the previous result
   228  	CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
   229  	CMOVQCS acc1, acc5
   230  	CMOVQCS acc2, acc6
   231  	CMOVQCS acc3, acc7
   232  
   233  	RET
   234  internalMulBMI2:
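        	// BMI2 path: MULXQ multiplies by mul1 (DX) implicitly and does not modify the flags,
        	// so the partial products can be accumulated with uninterrupted ADDQ/ADCQ chains.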
   235  	MOVQ acc4, mul1
   236  	MULXQ t0, acc0, acc1
   237  
   238  	MULXQ t1, mul0, acc2
   239  	ADDQ mul0, acc1
   240  
   241  	MULXQ t2, mul0, acc3
   242  	ADCQ mul0, acc2
   243  
   244  	MULXQ t3, mul0, acc4
   245  	ADCQ mul0, acc3
   246  	ADCQ $0, acc4
   247  
   248  	MOVQ acc5, mul1
   249  	MULXQ t0, mul0, hlp
   250  	ADDQ mul0, acc1
   251  	ADCQ hlp, acc2
   252  
   253  	MULXQ t1, mul0, hlp
   254  	ADCQ $0, hlp
   255  	ADDQ mul0, acc2
   256  	ADCQ hlp, acc3
   257  
   258  	MULXQ t2, mul0, hlp
   259  	ADCQ $0, hlp
   260  	ADDQ mul0, acc3
   261  	ADCQ hlp, acc4
   262  
   263  	MULXQ t3, mul0, acc5
   264  	ADCQ $0, acc5
   265  	ADDQ mul0, acc4
   266  	ADCQ $0, acc5
   267  
   268  	MOVQ acc6, mul1
   269  	MULXQ t0, mul0, hlp
   270  	ADDQ mul0, acc2
   271  	ADCQ hlp, acc3
   272  
   273  	MULXQ t1, mul0, hlp
   274  	ADCQ $0, hlp
   275  	ADDQ mul0, acc3
   276  	ADCQ hlp, acc4
   277  
   278  	MULXQ t2, mul0, hlp
   279  	ADCQ $0, hlp
   280  	ADDQ mul0, acc4
   281  	ADCQ hlp, acc5
   282  
   283  	MULXQ t3, mul0, acc6
   284  	ADCQ $0, acc6
   285  	ADDQ mul0, acc5
   286  	ADCQ $0, acc6
   287  
   288  	MOVQ acc7, mul1
   289  	MULXQ t0, mul0, hlp
   290  	ADDQ mul0, acc3
   291  	ADCQ hlp, acc4
   292  
   293  	MULXQ t1, mul0, hlp
   294  	ADCQ $0, hlp
   295  	ADDQ mul0, acc4
   296  	ADCQ hlp, acc5
   297  
   298  	MULXQ t2, mul0, hlp
   299  	ADCQ $0, hlp
   300  	ADDQ mul0, acc5
   301  	ADCQ hlp, acc6
   302  
   303  	MULXQ t3, mul0, acc7
   304  	ADCQ $0, acc7
   305  	ADDQ mul0, acc6
   306  	ADCQ $0, acc7
   307  
   308  	sm2P256MulReductionInline
   309  	MOVQ $0, BP
   310  	// Add bits [511:256] of the result
   311  	ADCQ acc0, acc4
   312  	ADCQ acc1, acc5
   313  	ADCQ acc2, acc6
   314  	ADCQ acc3, acc7
   315  	ADCQ $0, hlp
   316  	// Copy result
   317  	MOVQ acc4, acc0
   318  	MOVQ acc5, acc1
   319  	MOVQ acc6, acc2
   320  	MOVQ acc7, acc3
   321  	// Subtract p256
   322  	SUBQ $-1, acc4
   323  	SBBQ p256p<>+0x08(SB), acc5
   324  	SBBQ $-1, acc6
   325  	SBBQ p256p<>+0x018(SB), acc7
   326  	SBBQ $0, hlp
   327  	// If the result of the subtraction is negative, restore the previous result
   328  	CMOVQCS acc0, acc4 // CMOVQCS: Move if below (CF == 1)
   329  	CMOVQCS acc1, acc5
   330  	CMOVQCS acc2, acc6
   331  	CMOVQCS acc3, acc7
   332  
   333  	RET
   334  
   335  /* ---------------------------------------*/
   336  // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4]^2
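        // The squaring body lives in p256_macros_amd64.s; as with sm2P256MulInternal, a
        // MULX/ADX variant is selected at run time via ·supportBMI2.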
   337  TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
   338  	CMPB ·supportBMI2+0(SB), $0x01
   339  	JEQ  internalSqrBMI2
   340  
   341  	p256SqrInternalInline
   342  	RET
   343  
   344  internalSqrBMI2:
   345  	p256SqrInternalInlineAdx
   346  	RET
   347  
   348  /* ---------------------------------------*/
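        // Helper macros: LDacc/LDt load a 256-bit value from a stack slot into acc4..acc7 /
        // t0..t3, ST/STt store those register groups back, and acc2t/t2acc copy between them.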
   349  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
   350  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
   351  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
   352  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
   353  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
   354  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
   355  /* ---------------------------------------*/
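        // Stack frame layout for p256PointAddAffineAsm: 32-byte slots for inputs, outputs and
        // temporaries, followed by the saved result pointer and the sel/zero flags.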
   356  #define x1in(off) (32*0 + off)(SP)
   357  #define y1in(off) (32*1 + off)(SP)
   358  #define z1in(off) (32*2 + off)(SP)
   359  #define x2in(off) (32*3 + off)(SP)
   360  #define y2in(off) (32*4 + off)(SP)
   361  #define xout(off) (32*5 + off)(SP)
   362  #define yout(off) (32*6 + off)(SP)
   363  #define zout(off) (32*7 + off)(SP)
   364  #define s2(off)   (32*8 + off)(SP)
   365  #define z1sqr(off) (32*9 + off)(SP)
   366  #define h(off)	  (32*10 + off)(SP)
   367  #define r(off)	  (32*11 + off)(SP)
   368  #define hsqr(off) (32*12 + off)(SP)
   369  #define rsqr(off) (32*13 + off)(SP)
   370  #define hcub(off) (32*14 + off)(SP)
   371  #define rptr	  (32*15)(SP)
   372  #define sel_save  (32*15 + 8)(SP)
   373  #define zero_save (32*15 + 8 + 4)(SP)
   374  
   375  #define p256PointAddAffineInline() \
   376  	\// Store pointer to result
   377  	MOVQ mul0, rptr                   \
   378  	MOVL t1, sel_save                 \
   379  	MOVL t2, zero_save                \
   380  	\// Negate y2in based on sign
   381  	MOVQ (16*2 + 8*0)(CX), acc4       \
   382  	MOVQ (16*2 + 8*1)(CX), acc5       \
   383  	MOVQ (16*2 + 8*2)(CX), acc6       \
   384  	MOVQ (16*2 + 8*3)(CX), acc7       \
   385  	MOVQ $-1, acc0                    \
   386  	MOVQ p256p<>+0x08(SB), acc1       \
   387  	MOVQ $-1, acc2                    \
   388  	MOVQ p256p<>+0x018(SB), acc3      \
   389  	XORQ mul0, mul0                   \
   390  	\// Speculatively subtract
   391  	SUBQ acc4, acc0                   \
   392  	SBBQ acc5, acc1                   \
   393  	SBBQ acc6, acc2                   \
   394  	SBBQ acc7, acc3                   \
   395  	SBBQ $0, mul0                     \
   396  	MOVQ acc0, t0                     \
   397  	MOVQ acc1, t1                     \
   398  	MOVQ acc2, t2                     \
   399  	MOVQ acc3, t3                     \
   400  	\// Add in case the operand was > p256
   401  	ADDQ $-1, acc0                    \
   402  	ADCQ p256p<>+0x08(SB), acc1       \
   403  	ADCQ $-1, acc2                    \
   404  	ADCQ p256p<>+0x018(SB), acc3      \
   405  	ADCQ $0, mul0                     \ // ZF := 1 if mul0 == 0 after ADC
   406  	CMOVQNE t0, acc0                  \ // CMOVQNE: Move if not equal (ZF == 0)
   407  	CMOVQNE t1, acc1                  \
   408  	CMOVQNE t2, acc2                  \
   409  	CMOVQNE t3, acc3                  \
   410  	\// If condition is 0, keep original value
   411  	TESTQ DX, DX                      \ // ZF := 1 if (DX AND DX == 0)
   412  	CMOVQEQ acc4, acc0                \ // CMOVQEQ: Move if equal (ZF == 1)
   413  	CMOVQEQ acc5, acc1                \
   414  	CMOVQEQ acc6, acc2                \
   415  	CMOVQEQ acc7, acc3                \
   416  	\// Store result
   417  	MOVQ acc0, y2in(8*0)              \
   418  	MOVQ acc1, y2in(8*1)              \
   419  	MOVQ acc2, y2in(8*2)              \
   420  	MOVQ acc3, y2in(8*3)              \
   421  	\// Begin point add
   422  	LDacc (z1in)                      \
   423  	CALL sm2P256SqrInternal(SB)	      \// z1ˆ2
   424  	ST (z1sqr)                        \
   425  	\
   426  	LDt (x2in)                        \
   427  	CALL sm2P256MulInternal(SB)	      \// u2 = x2 * z1ˆ2
   428  	\
   429  	LDt (x1in)                        \
   430  	p256SubInline2          	      \// h = u2 - x1
   431  	ST (h)                            \
   432  	\
   433  	LDt (z1in)                        \
   434  	CALL sm2P256MulInternal(SB)	      \// z3 = h * z1
   435  	ST (zout)                         \
   436  	\
   437  	LDacc (z1sqr)                     \
   438  	CALL sm2P256MulInternal(SB)	      \// z1ˆ3
   439  	\
   440  	LDt (y2in)                        \
   441  	CALL sm2P256MulInternal(SB)	      \// s2 = y2 * z1ˆ3
   442  	ST (s2)                           \
   443  	\
   444  	LDt (y1in)                        \
   445  	p256SubInline2                    \// r = s2 - y1
   446  	ST (r)                            \
   447  	\
   448  	CALL sm2P256SqrInternal(SB)	      \// rsqr = rˆ2
   449  	ST (rsqr)                         \
   450  	\
   451  	LDacc (h)                         \
   452  	CALL sm2P256SqrInternal(SB)	      \// hsqr = hˆ2
   453  	ST (hsqr)                         \
   454  	\
   455  	LDt (h)                           \
   456  	CALL sm2P256MulInternal(SB)	      \// hcub = hˆ3
   457  	ST (hcub)                         \
   458  	\
   459  	LDt (y1in)                        \
   460  	CALL sm2P256MulInternal(SB)	      \// s2 = y1 * hˆ3
   461  	ST (s2)                           \
   462  	\
   463  	LDacc (x1in)                      \
   464  	LDt (hsqr)                        \
   465  	CALL sm2P256MulInternal(SB)	      \// x1 * hˆ2
   466  	ST (h)                            \
   467  	\
   468  	p256MulBy2Inline			      \// x1 * hˆ2 * 2, inline
   469  	LDacc (rsqr)                      \
   470  	p256SubInline2          	      \// rˆ2 - x1 * hˆ2 * 2
   471  	\
   472  	LDt (hcub)                        \
   473  	p256SubInline                     \
   474  	STt (xout)                         \// xout = rˆ2 - 2 * x1 * hˆ2 - h^3
   475  	LDacc (h)                         \
   476  	p256SubInline2                    \
   477  	\
   478  	LDt (r)                           \
   479  	CALL sm2P256MulInternal(SB)       \
   480  	\
   481  	LDt (s2)                          \
   482  	p256SubInline2                    \
   483  	ST (yout)                         \
   484  	\// Load stored values from stack
   485  	MOVQ rptr, AX                     \
   486  
   487  // func p256PointAddAffineAsm(res, in1 *SM2P256Point, in2 *p256AffinePoint, sign, sel, zero int)
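        // sign != 0 negates in2.y modulo p before the addition. sel == 0 makes the routine
        // return in1 unchanged, and zero == 0 makes it return in2 with Z = 1 (the case where
        // in1 is the point at infinity); both choices are made with constant-time SIMD masks.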
   488  TEXT ·p256PointAddAffineAsm(SB),0,$512-48
   489  	// Move input to stack in order to free registers
   490  	MOVQ res+0(FP), AX
   491  	MOVQ in1+8(FP), BX
   492  	MOVQ in2+16(FP), CX
   493  	MOVQ sign+24(FP), DX
   494  	MOVQ sel+32(FP), t1
   495  	MOVQ zero+40(FP), t2
   496  
   497  	CMPB ·supportAVX2+0(SB), $0x01
   498  	JEQ  pointaddaffine_avx2
   499  
   500  	MOVOU (16*0)(BX), X0
   501  	MOVOU (16*1)(BX), X1
   502  	MOVOU (16*2)(BX), X2
   503  	MOVOU (16*3)(BX), X3
   504  	MOVOU (16*4)(BX), X4
   505  	MOVOU (16*5)(BX), X5
   506  
   507  	MOVOU X0, x1in(16*0)
   508  	MOVOU X1, x1in(16*1)
   509  	MOVOU X2, y1in(16*0)
   510  	MOVOU X3, y1in(16*1)
   511  	MOVOU X4, z1in(16*0)
   512  	MOVOU X5, z1in(16*1)
   513  
   514  	MOVOU (16*0)(CX), X0
   515  	MOVOU (16*1)(CX), X1
   516  
   517  	MOVOU X0, x2in(16*0)
   518  	MOVOU X1, x2in(16*1)
   519  	
   520  	p256PointAddAffineInline()
   521  	// The result is not valid if (sel == 0); conditionally select in1 in that case
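        	// Constant-time select: PCMPEQL against zero turns sel/zero into all-ones masks,
        	// then PAND/PANDN/PXOR blend the candidate values without branching.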
   522  	MOVOU xout(16*0), X0
   523  	MOVOU xout(16*1), X1
   524  	MOVOU yout(16*0), X2
   525  	MOVOU yout(16*1), X3
   526  	MOVOU zout(16*0), X4
   527  	MOVOU zout(16*1), X5
   528  
   529  	MOVL sel_save, X6 // sel 
   530  	MOVL zero_save, X7 // zero
   531  
   532  	PXOR X8, X8 // X8's bits are all 0
   533  	PCMPEQL X9, X9 // X9's bits are all 1
   534  
   535  	PSHUFD $0, X6, X6
   536  	PSHUFD $0, X7, X7
   537  
   538  	PCMPEQL X8, X6  // X6's bits are all 1 if sel = 0, else are 0
   539  	PCMPEQL X8, X7  // X7's bits are all 1 if zero = 0, else are 0
   540  
   541  	MOVOU X6, X15
   542  	PANDN X9, X15 // X15 = NOT(X6)
   543  
   544  	MOVOU x1in(16*0), X9
   545  	MOVOU x1in(16*1), X10
   546  	MOVOU y1in(16*0), X11
   547  	MOVOU y1in(16*1), X12
   548  	MOVOU z1in(16*0), X13
   549  	MOVOU z1in(16*1), X14
   550  
   551  	PAND X15, X0
   552  	PAND X15, X1
   553  	PAND X15, X2
   554  	PAND X15, X3
   555  	PAND X15, X4
   556  	PAND X15, X5
   557  
   558  	PAND X6, X9
   559  	PAND X6, X10
   560  	PAND X6, X11
   561  	PAND X6, X12
   562  	PAND X6, X13
   563  	PAND X6, X14
   564  
   565  	PXOR X9, X0
   566  	PXOR X10, X1
   567  	PXOR X11, X2
   568  	PXOR X12, X3
   569  	PXOR X13, X4
   570  	PXOR X14, X5
   571  	// Similarly if zero == 0
   572  	PCMPEQL X9, X9
   573  	MOVOU X7, X15
   574  	PANDN X9, X15 // X15 = NOT(X7)
   575  
   576  	MOVOU x2in(16*0), X9
   577  	MOVOU x2in(16*1), X10
   578  	MOVOU y2in(16*0), X11
   579  	MOVOU y2in(16*1), X12
   580  	MOVOU p256one<>+0x00(SB), X13
   581  	MOVOU p256one<>+0x10(SB), X14
   582  
   583  	PAND X15, X0
   584  	PAND X15, X1
   585  	PAND X15, X2
   586  	PAND X15, X3
   587  	PAND X15, X4
   588  	PAND X15, X5
   589  
   590  	PAND X7, X9
   591  	PAND X7, X10
   592  	PAND X7, X11
   593  	PAND X7, X12
   594  	PAND X7, X13
   595  	PAND X7, X14
   596  
   597  	PXOR X9, X0
   598  	PXOR X10, X1
   599  	PXOR X11, X2
   600  	PXOR X12, X3
   601  	PXOR X13, X4
   602  	PXOR X14, X5
   603  	// Finally output the result
   604  	MOVOU X0, (16*0)(AX)
   605  	MOVOU X1, (16*1)(AX)
   606  	MOVOU X2, (16*2)(AX)
   607  	MOVOU X3, (16*3)(AX)
   608  	MOVOU X4, (16*4)(AX)
   609  	MOVOU X5, (16*5)(AX)
   610  	MOVQ $0, rptr
   611  
   612  	RET
   613  pointaddaffine_avx2:
   614  	VMOVDQU (32*0)(BX), Y0
   615  	VMOVDQU (32*1)(BX), Y1
   616  	VMOVDQU (32*2)(BX), Y2
   617  
   618  	VMOVDQU Y0, x1in(32*0)
   619  	VMOVDQU Y1, y1in(32*0)
   620  	VMOVDQU Y2, z1in(32*0)
   621  
   622  	VMOVDQU (32*0)(CX), Y0
   623  	VMOVDQU Y0, x2in(32*0)
   624  
   625  	p256PointAddAffineInline()
   626  	// The result is not valid if (sel == 0); conditionally select in1 in that case
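        	// Same constant-time blend as the SSE path, on 256-bit lanes with VPANDN/VPAND/VPXOR.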
   627  	VPXOR Y8, Y8, Y8 // Y8's bits are all 0
   628  	VPBROADCASTD sel_save, Y6 // sel
   629  	VPBROADCASTD zero_save, Y7 // zero
   630  
   631  	VPCMPEQD Y8, Y6, Y6 // Y6's bits are all 1 if sel = 0, else are 0
   632  	VPCMPEQD Y8, Y7, Y7 // Y7's bits are all 1 if zero = 0, else are 0
   633  
   634  	VPANDN xout(32*0), Y6, Y0
   635  	VPANDN yout(32*0), Y6, Y1
   636  	VPANDN zout(32*0), Y6, Y2
   637  
   638  	VPAND x1in(32*0), Y6, Y9
   639  	VPAND y1in(32*0), Y6, Y10
   640  	VPAND z1in(32*0), Y6, Y11
   641  
   642  	VPXOR Y9, Y0, Y0
   643  	VPXOR Y10, Y1, Y1
   644  	VPXOR Y11, Y2, Y2
   645  
   646  	// Similarly if zero == 0
   647  	VPANDN Y0, Y7, Y0
   648  	VPANDN Y1, Y7, Y1
   649  	VPANDN Y2, Y7, Y2
   650  
   651  	VPAND x2in(32*0), Y7, Y9
   652  	VPAND y2in(32*0), Y7, Y10
   653  	VPAND p256one<>+0x00(SB), Y7, Y11
   654  
   655  	VPXOR Y9, Y0, Y0
   656  	VPXOR Y10, Y1, Y1
   657  	VPXOR Y11, Y2, Y2
   658  
   659  	// Finally output the result
   660  	VMOVDQU Y0, (32*0)(AX)
   661  	VMOVDQU Y1, (32*1)(AX)
   662  	VMOVDQU Y2, (32*2)(AX)
   663  	MOVQ $0, rptr
   664  
   665  	VZEROUPPER
   666  	RET	
   667  #undef x1in
   668  #undef y1in
   669  #undef z1in
   670  #undef x2in
   671  #undef y2in
   672  #undef xout
   673  #undef yout
   674  #undef zout
   675  #undef s2
   676  #undef z1sqr
   677  #undef h
   678  #undef r
   679  #undef hsqr
   680  #undef rsqr
   681  #undef hcub
   682  #undef rptr
   683  #undef sel_save
   684  #undef zero_save
   685  
   686  /* ---------------------------------------*/
   687  #define x1in(off) (32*0 + off)(SP)
   688  #define y1in(off) (32*1 + off)(SP)
   689  #define z1in(off) (32*2 + off)(SP)
   690  #define x2in(off) (32*3 + off)(SP)
   691  #define y2in(off) (32*4 + off)(SP)
   692  #define z2in(off) (32*5 + off)(SP)
   693  
   694  #define xout(off) (32*6 + off)(SP)
   695  #define yout(off) (32*7 + off)(SP)
   696  #define zout(off) (32*8 + off)(SP)
   697  
   698  #define u1(off)    (32*9 + off)(SP)
   699  #define u2(off)    (32*10 + off)(SP)
   700  #define s1(off)    (32*11 + off)(SP)
   701  #define s2(off)    (32*12 + off)(SP)
   702  #define z1sqr(off) (32*13 + off)(SP)
   703  #define z2sqr(off) (32*14 + off)(SP)
   704  #define h(off)     (32*15 + off)(SP)
   705  #define r(off)     (32*16 + off)(SP)
   706  #define hsqr(off)  (32*17 + off)(SP)
   707  #define rsqr(off)  (32*18 + off)(SP)
   708  #define hcub(off)  (32*19 + off)(SP)
   709  #define rptr       (32*20)(SP)
   710  #define points_eq  (32*20+8)(SP)
   711  
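        // Full Jacobian point addition (add-2007-bl) of the two points staged on the stack;
        // the sum goes to xout/yout/zout and points_eq records whether the inputs were equal.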
   712  #define p256PointAddInline() \
   713  	\// Begin point add
   714  	LDacc (z2in)                 \
   715  	CALL sm2P256SqrInternal(SB)	 \// z2ˆ2
   716  	ST (z2sqr)                   \
   717  	LDt (z2in)                   \
   718  	CALL sm2P256MulInternal(SB)	 \// z2ˆ3
   719  	LDt (y1in)                   \
   720  	CALL sm2P256MulInternal(SB)	 \// s1 = z2ˆ3*y1
   721  	ST (s1)                      \
   722  	\
   723  	LDacc (z1in)                 \ 
   724  	CALL sm2P256SqrInternal(SB)	 \// z1ˆ2
   725  	ST (z1sqr)                   \
   726  	LDt (z1in)                   \
   727  	CALL sm2P256MulInternal(SB)	 \// z1ˆ3
   728  	LDt (y2in)                   \
   729  	CALL sm2P256MulInternal(SB)	 \// s2 = z1ˆ3*y2
   730  	ST (s2)                      \ 
   731  	\
   732  	LDt (s1)                     \
   733  	p256SubInline2          	 \// r = s2 - s1
   734  	ST (r)                       \
   735  	p256IsZeroInline             \
   736  	MOVQ AX, points_eq           \
   737  	\
   738  	LDacc (z2sqr)                \
   739  	LDt (x1in)                   \
   740  	CALL sm2P256MulInternal(SB)	 \// u1 = x1 * z2ˆ2
   741  	ST (u1)                      \
   742  	LDacc (z1sqr)                \
   743  	LDt (x2in)                   \ 
   744  	CALL sm2P256MulInternal(SB)	 \// u2 = x2 * z1ˆ2
   745  	ST (u2)                      \
   746  	\
   747  	LDt (u1)                     \ 
   748  	p256SubInline2          	 \// h = u2 - u1
   749  	ST (h)                       \
   750  	p256IsZeroInline             \
   751  	ANDQ points_eq, AX           \
   752  	MOVQ AX, points_eq           \
   753  	\
   754  	LDacc (r)                    \
   755  	CALL sm2P256SqrInternal(SB)	 \// rsqr = rˆ2
   756  	ST (rsqr)                    \
   757  	\
   758  	LDacc (h)                    \
   759  	CALL sm2P256SqrInternal(SB)	 \// hsqr = hˆ2
   760  	ST (hsqr)                    \
   761  	\
   762  	LDt (h)                      \
   763  	CALL sm2P256MulInternal(SB)	 \// hcub = hˆ3
   764  	ST (hcub)                    \
   765  	\
   766  	LDt (s1)                     \
   767  	CALL sm2P256MulInternal(SB)  \
   768  	ST (s2)                      \
   769  	\
   770  	LDacc (z1in)                 \
   771  	LDt (z2in)                   \
   772  	CALL sm2P256MulInternal(SB)	 \// z1 * z2
   773  	LDt (h)                      \
   774  	CALL sm2P256MulInternal(SB)	 \// z1 * z2 * h
   775  	ST (zout)                    \
   776  	\
   777  	LDacc (hsqr)                 \
   778  	LDt (u1)                     \
   779  	CALL sm2P256MulInternal(SB)	 \// hˆ2 * u1
   780  	ST (u2)                      \
   781  	\
   782  	p256MulBy2Inline	         \// u1 * hˆ2 * 2, inline
   783  	LDacc (rsqr)                 \
   784  	p256SubInline2          	 \// rˆ2 - u1 * hˆ2 * 2
   785  	\
   786  	LDt (hcub)                   \
   787  	p256SubInline                \
   788  	STt (xout)                   \
   789  	LDacc (u2)                   \
   790  	p256SubInline2               \
   791  	\
   792  	LDt (r)                      \
   793  	CALL sm2P256MulInternal(SB)  \
   794  	\
   795  	LDt (s2)                     \
   796  	p256SubInline2               \
   797  	ST (yout)                    \
   798  
   799  // func p256PointAddAsm(res, in1, in2 *SM2P256Point) int
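        // The return value is 1 when in1 and in2 represent the same point (r == 0 and h == 0);
        // the Jacobian sum computed here is then degenerate and the caller must double instead.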
   800  TEXT ·p256PointAddAsm(SB),0,$680-32
   801  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
   802  	// Move input to stack in order to free registers
   803  	MOVQ res+0(FP), AX
   804  	MOVQ in1+8(FP), BX
   805  	MOVQ in2+16(FP), CX
   806  
   807  	CMPB ·supportAVX2+0(SB), $0x01
   808  	JEQ  pointadd_avx2
   809  
   810  	MOVOU (16*0)(BX), X0
   811  	MOVOU (16*1)(BX), X1
   812  	MOVOU (16*2)(BX), X2
   813  	MOVOU (16*3)(BX), X3
   814  	MOVOU (16*4)(BX), X4
   815  	MOVOU (16*5)(BX), X5
   816  
   817  	MOVOU X0, x1in(16*0)
   818  	MOVOU X1, x1in(16*1)
   819  	MOVOU X2, y1in(16*0)
   820  	MOVOU X3, y1in(16*1)
   821  	MOVOU X4, z1in(16*0)
   822  	MOVOU X5, z1in(16*1)
   823  
   824  	MOVOU (16*0)(CX), X0
   825  	MOVOU (16*1)(CX), X1
   826  	MOVOU (16*2)(CX), X2
   827  	MOVOU (16*3)(CX), X3
   828  	MOVOU (16*4)(CX), X4
   829  	MOVOU (16*5)(CX), X5
   830  
   831  	MOVOU X0, x2in(16*0)
   832  	MOVOU X1, x2in(16*1)
   833  	MOVOU X2, y2in(16*0)
   834  	MOVOU X3, y2in(16*1)
   835  	MOVOU X4, z2in(16*0)
   836  	MOVOU X5, z2in(16*1)
   837  	// Store pointer to result
   838  	MOVQ AX, rptr
   839  	p256PointAddInline()
   840  
   841  	MOVOU xout(16*0), X0
   842  	MOVOU xout(16*1), X1
   843  	MOVOU yout(16*0), X2
   844  	MOVOU yout(16*1), X3
   845  	MOVOU zout(16*0), X4
   846  	MOVOU zout(16*1), X5
   847  	// Finally output the result
   848  	MOVQ rptr, AX
   849  	MOVQ $0, rptr
   850  	MOVOU X0, (16*0)(AX)
   851  	MOVOU X1, (16*1)(AX)
   852  	MOVOU X2, (16*2)(AX)
   853  	MOVOU X3, (16*3)(AX)
   854  	MOVOU X4, (16*4)(AX)
   855  	MOVOU X5, (16*5)(AX)
   856  
   857  	MOVQ points_eq, AX
   858  	MOVQ AX, ret+24(FP)
   859  
   860  	RET
   861  pointadd_avx2:
   862  	VMOVDQU (32*0)(BX), Y0
   863  	VMOVDQU (32*1)(BX), Y1
   864  	VMOVDQU (32*2)(BX), Y2
   865  
   866  	VMOVDQU Y0, x1in(32*0)
   867  	VMOVDQU Y1, y1in(32*0)
   868  	VMOVDQU Y2, z1in(32*0)
   869  
   870  	VMOVDQU (32*0)(CX), Y0
   871  	VMOVDQU (32*1)(CX), Y1
   872  	VMOVDQU (32*2)(CX), Y2
   873  
   874  	VMOVDQU Y0, x2in(32*0)
   875  	VMOVDQU Y1, y2in(32*0)
   876  	VMOVDQU Y2, z2in(32*0)
   877  
   878  	// Store pointer to result
   879  	MOVQ AX, rptr
   880  	p256PointAddInline()
   881  
   882  	VMOVDQU xout(32*0), Y0
   883  	VMOVDQU yout(32*0), Y1
   884  	VMOVDQU zout(32*0), Y2
   885  	// Finally output the result
   886  	MOVQ rptr, AX
   887  	MOVQ $0, rptr
   888  	VMOVDQU Y0, (32*0)(AX)
   889  	VMOVDQU Y1, (32*1)(AX)
   890  	VMOVDQU Y2, (32*2)(AX)
   891  
   892  	MOVQ points_eq, AX
   893  	MOVQ AX, ret+24(FP)
   894  
   895  	VZEROUPPER
   896  	RET
   897  
   898  #undef x1in
   899  #undef y1in
   900  #undef z1in
   901  #undef x2in
   902  #undef y2in
   903  #undef z2in
   904  #undef xout
   905  #undef yout
   906  #undef zout
   907  #undef s1
   908  #undef s2
   909  #undef u1
   910  #undef u2
   911  #undef z1sqr
   912  #undef z2sqr
   913  #undef h
   914  #undef r
   915  #undef hsqr
   916  #undef rsqr
   917  #undef hcub
   918  #undef rptr
   919  /* ---------------------------------------*/
   920  #define x(off) (32*0 + off)(SP)
   921  #define y(off) (32*1 + off)(SP)
   922  #define z(off) (32*2 + off)(SP)
   923  
   924  #define s(off)	(32*3 + off)(SP)
   925  #define m(off)	(32*4 + off)(SP)
   926  #define zsqr(off) (32*5 + off)(SP)
   927  #define tmp(off)  (32*6 + off)(SP)
   928  #define rptr	  (32*7)(SP)
   929  
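        // calZ computes Z3 = 2 * Y1 * Z1 into t0..t3; it also saves ZZ = Z1^2 (zsqr) and
        // X1 + ZZ (m) on the stack for calX.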
   930  #define calZ() \
   931  	LDacc (z)                               \
   932  	CALL sm2P256SqrInternal(SB)             \
   933  	ST (zsqr)                               \  // ZZ = Z1^2
   934  	\
   935  	LDt (x)                                 \
   936  	p256AddInline                           \
   937  	STt (m)                                 \  // M = ZZ + X1
   938  	\
   939  	LDacc (z)                               \
   940  	LDt (y)                                 \
   941  	CALL sm2P256MulInternal(SB)             \ // Z1 * Y1
   942  	p256MulBy2Inline                        \ // Z3 = 2(Z1 * Y1) = (Y1 + Z1)^2 - Y1^2 - Z1^2
   943  
   944  #define calX() \
   945  	LDacc (x)                               \
   946  	LDt (zsqr)                              \
   947  	p256SubInline2                          \ // X1 - ZZ
   948  	LDt (m)                                 \
   949  	CALL sm2P256MulInternal(SB)             \ // M = (X1 - ZZ) * (X1 + ZZ) = X1^2 - ZZ^2
   950  	ST (m)                                  \
   951  	\// Multiply by 3
   952  	p256TripleInline                        \
   953  	STt (m)                                 \  // M = 3 * (X1^2 - ZZ^2)
   954  	\////////////////////////
   955  	LDacc (y)                               \
   956  	p256MulBy2Inline2                       \
   957  	CALL sm2P256SqrInternal(SB)             \ // 4 * YY = (2*Y1)^2
   958  	ST (s)                                  \ // S = 4 * YY
   959  	CALL sm2P256SqrInternal(SB)             \ // (4 * YY)^2 = 16 * YYYY
   960  	\// Divide by 2
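        	\// An odd value is first made even by adding p (p is odd, so the sum is even);
        	\// an even value is restored below, then the shift halves it mod p.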
   961  	XORQ mul0, mul0                         \
   962  	MOVQ acc4, t0                           \
   963  	MOVQ acc5, t1                           \  
   964  	MOVQ acc6, t2                           \
   965  	MOVQ acc7, t3                           \
   966  	\ // [mul0, acc7, acc6, acc5, acc4] := [acc7, acc6, acc5, acc4] + P
   967  	ADDQ $-1, acc4                          \
   968  	ADCQ p256p<>+0x08(SB), acc5             \
   969  	ADCQ $-1, acc6                          \
   970  	ADCQ p256p<>+0x018(SB), acc7            \
   971  	ADCQ $0, mul0                           \
   972  	TESTQ $1, t0                            \ // ZF := 1 if (t0 AND 1 == 0)
   973  	\ // CMOVQEQ: Move if equal (ZF == 1)
   974  	CMOVQEQ t0, acc4                        \ // acc4 := t0 if (ZF == 1)
   975  	CMOVQEQ t1, acc5                        \ // acc5 := t1 if (ZF == 1)
   976  	CMOVQEQ t2, acc6                        \ // acc6 := t2 if (ZF == 1)
   977  	CMOVQEQ t3, acc7                        \ // acc7 := t3 if (ZF == 1)
   978  	ANDQ t0, mul0                           \ // mul0 := mul0 AND t0 (clears the carry when t0 is even, else keeps it as 0 or 1)
   979  	\ // Divide even by 2 
   980  	SHRQ $1, acc5, acc4                     \ // acc4 := acc4 >> 1 | acc5 << 63
   981  	SHRQ $1, acc6, acc5                     \ // acc5 := acc5 >> 1 | acc6 << 63
   982  	SHRQ $1, acc7, acc6                     \ // acc6 := acc6 >> 1 | acc7 << 63
   983  	SHRQ $1, mul0, acc7                     \ // acc7 := acc7 >> 1 | mul0 << 63
   984  	ST (y)                                  \ // y = 8 * YYYY (the 8*YYYY term used for Y3 in calY)
   985  	\/////////////////////////
   986  	LDacc (x)                               \
   987  	LDt (s)                                 \
   988  	CALL sm2P256MulInternal(SB)             \ // X1 * 4 * YY
   989  	ST (s)                                  \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
   990  	p256MulBy2Inline                        \
   991  	STt (tmp)                               \ // tmp = 2*S = 8 * X1 * YY
   992  	\
   993  	LDacc (m)                               \
   994  	CALL sm2P256SqrInternal(SB)             \ // M^2 = (3 * (X1^2 - ZZ^2))^2
   995  	LDt (tmp)                               \
   996  	p256SubInline2                          \ // X3 = M^2 - 2*S
   997  
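        // calY computes Y3 = M * (S - X3) - 8 * YYYY; X3 is still in acc4..acc7 from calX
        // and acc2t moves it into t0..t3.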
   998  #define calY() \
   999  	acc2t                                   \
  1000  	LDacc (s)                               \ // S = 4 * X1 * YY = 2 * ((X1+YY)^2 - XX - YYYY)
  1001  	p256SubInline2                          \ // S - X3 
  1002  	\
  1003  	LDt (m)                                 \
  1004  	CALL sm2P256MulInternal(SB)             \ // M * (S - X3)
  1005  	\
  1006  	LDt (y)                                 \
  1007  	p256SubInline2                          \ // Y3 = M * (S - X3) - 8 * YYYY
  1008  
  1009  #define lastP256PointDouble() \
  1010  	\ // See https://hyperelliptic.org/EFD/g1p/data/shortw/jacobian-3/doubling/dbl-2007-bl
  1011  	calZ()                            \
  1012  	MOVQ rptr, AX                     \
  1013  	\// Store z
  1014  	MOVQ t0, (16*4 + 8*0)(AX)         \
  1015  	MOVQ t1, (16*4 + 8*1)(AX)         \
  1016  	MOVQ t2, (16*4 + 8*2)(AX)         \
  1017  	MOVQ t3, (16*4 + 8*3)(AX)         \
  1018  	\
  1019  	calX()                            \
  1020  	MOVQ rptr, AX                     \
  1021  	\// Store x
  1022  	MOVQ acc4, (16*0 + 8*0)(AX)       \
  1023  	MOVQ acc5, (16*0 + 8*1)(AX)       \
  1024  	MOVQ acc6, (16*0 + 8*2)(AX)       \
  1025  	MOVQ acc7, (16*0 + 8*3)(AX)       \
  1026  	\
  1027  	calY()                            \
  1028  	MOVQ rptr, AX                     \ 
  1029  	\// Store y
  1030  	MOVQ acc4, (16*2 + 8*0)(AX)       \  
  1031  	MOVQ acc5, (16*2 + 8*1)(AX)       \ 
  1032  	MOVQ acc6, (16*2 + 8*2)(AX)       \
  1033  	MOVQ acc7, (16*2 + 8*3)(AX)       \
  1034  	\///////////////////////
  1035  	MOVQ $0, rptr                     \
  1036  
  1037  // func p256PointDoubleAsm(res, in *SM2P256Point)
  1038  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-16
  1039  	// Move input to stack in order to free registers
  1040  	MOVQ res+0(FP), AX
  1041  	MOVQ in+8(FP), BX
  1042  
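        	// p256PointDoubleInit (defined in p256_macros_amd64.s) presumably copies the
        	// input point at BX into the x/y/z stack slots.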
  1043  	p256PointDoubleInit()
  1044  	// Store pointer to result
  1045  	MOVQ AX, rptr
  1046  	// Begin point double
  1047  	lastP256PointDouble()
  1048  
  1049  	RET
  1050  
  1051  #define storeTmpX() \
  1052  	MOVQ acc4, x(8*0) \
  1053  	MOVQ acc5, x(8*1) \
  1054  	MOVQ acc6, x(8*2) \
  1055  	MOVQ acc7, x(8*3) \
  1056  
  1057  #define storeTmpY() \
  1058  	MOVQ acc4, y(8*0) \
  1059  	MOVQ acc5, y(8*1) \
  1060  	MOVQ acc6, y(8*2) \
  1061  	MOVQ acc7, y(8*3) \
  1062  
  1063  #define storeTmpZ() \
  1064  	MOVQ t0, z(8*0) \
  1065  	MOVQ t1, z(8*1) \
  1066  	MOVQ t2, z(8*2) \
  1067  	MOVQ t3, z(8*3) \
  1068  
  1069  #define p256PointDoubleRound() \
  1070  	calZ()                  \
  1071  	storeTmpZ()             \ 
  1072  	calX()                  \
  1073  	storeTmpX()             \
  1074  	calY()                  \
  1075  	storeTmpY()             \
  1076  
  1077  // func p256PointDouble6TimesAsm(res, in *SM2P256Point)
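        // Six successive doublings: five rounds keep the working point in the x/y/z stack
        // slots (storeTmpX/Y/Z), and only the final round writes the result through rptr.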
  1078  TEXT ·p256PointDouble6TimesAsm(SB),NOSPLIT,$256-16
  1079  	// Move input to stack in order to free registers
  1080  	MOVQ res+0(FP), AX
  1081  	MOVQ in+8(FP), BX
  1082  
  1083  	p256PointDoubleInit()
  1084  	// Store pointer to result
  1085  	MOVQ AX, rptr
  1086  
  1087  	// point double rounds 1-5
  1088  	p256PointDoubleRound()
  1089  	p256PointDoubleRound()
  1090  	p256PointDoubleRound()
  1091  	p256PointDoubleRound()
  1092  	p256PointDoubleRound()
  1093  
  1094  	// last point double round
  1095  	lastP256PointDouble()
  1096  
  1097  	RET
  1098  /* ---------------------------------------*/