github.com/hellobchain/newcryptosm@v0.0.0-20221019060107-edb949a317e9/sm2/sm2_asm_amd64.s (about)

     1  
     2  #include "textflag.h"
     3  
        // Register aliases used by the exported routines below; they are
        // #undef'd and redefined before the *Internal helpers further down.
        // res_ptr/x_ptr/y_ptr normally hold the slice base pointers loaded from
        // the argument frame, but several routines reuse x_ptr/y_ptr as plain
        // data registers once the pointer is no longer needed.
     4  #define res_ptr DI
     5  #define x_ptr SI
     6  #define y_ptr CX
     7  
        // acc0..acc5: 64-bit limbs of the value being accumulated.
        // t0, t1: scratch (multiplier limb, carry word, saved copies).
     8  #define acc0 R8
     9  #define acc1 R9
    10  #define acc2 R10
    11  #define acc3 R11
    12  #define acc4 R12
    13  #define acc5 R13
    14  #define t0 R14
    15  #define t1 R15
    16  
    17  // SM2 field prime p, most significant 32-bit word first:
    18  //   fffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff ffffffff
        // Limbs 0 and 2 of p are all ones and are encoded as the immediate $-1 in
        // the code; sm2const0 is limb 1 and sm2const1 is limb 3.
        // sm2ord is the 256-bit modulus used by sm2OrdMul/sm2OrdSqr (little-endian
        // limbs); sm2ordK0 is the per-limb multiplier those routines use to derive
        // the reduction factor (presumably -1/sm2ord mod 2^64; confirm against the
        // Go side). sm2one equals 2^256 - p, i.e. the value 1 in the 2^256-scaled
        // representation.
    19  DATA sm2const0<>+0(SB)/8, $0xffffffff00000000
    20  DATA sm2const1<>+0(SB)/8, $0xfffffffeffffffff
    21  DATA sm2ordK0<>+0(SB)/8, $0x327f9e8872350975
    22  DATA sm2ord<>+0(SB)/8, $0x53bbf40939d54123
    23  DATA sm2ord<>+8(SB)/8, $0x7203df6b21c6052b
    24  DATA sm2ord<>+16(SB)/8, $0xffffffffffffffff
    25  DATA sm2ord<>+24(SB)/8, $0xfffffffeffffffff
    26  DATA sm2one<>+0(SB)/8, $0x0000000000000001
    27  DATA sm2one<>+8(SB)/8, $0x00000000ffffffff
    28  DATA sm2one<>+16(SB)/8, $0x0000000000000000
    29  DATA sm2one<>+24(SB)/8, $0x0000000100000000
        // All tables are read-only (RODATA comes from textflag.h).
    30  GLOBL sm2const0<>(SB), RODATA, $8
    31  GLOBL sm2const1<>(SB), RODATA, $8
    32  GLOBL sm2ordK0<>(SB), RODATA, $8
    33  GLOBL sm2ord<>(SB), RODATA, $32
    34  GLOBL sm2one<>(SB), RODATA, $32
    35  
    36  /* ---------------------------------------*/
    37  // func sm2LittleToBig(res []byte, in []uint64)
        // Tail-jumps into sm2BigToLittle, which reads the same frame offsets
        // (res base at +0, in base at +24). That routine byte-swaps four 64-bit
        // words and reverses their order, which is its own inverse, so one body
        // serves both directions.
    38  TEXT ·sm2LittleToBig(SB),NOSPLIT,$0
    39  	JMP ·sm2BigToLittle(SB)
    40  /* ---------------------------------------*/
    41  // func sm2BigToLittle(res []uint64, in []byte)
        // Converts 32 bytes at in into four 64-bit words at res: every word is
        // byte-swapped and the word order is reversed (in word 0 -> res word 3,
        // ..., in word 3 -> res word 0). Only the slice base pointers are read;
        // lengths are not checked. All four loads complete before the first
        // store, so res and in may refer to the same memory.
    42  TEXT ·sm2BigToLittle(SB),NOSPLIT,$0
    43  	MOVQ in+24(FP), x_ptr
    44  	MOVQ res+0(FP), res_ptr
    45  
    46  	MOVQ (8*3)(x_ptr), acc3
    47  	BSWAPQ acc3
    48  	MOVQ (8*2)(x_ptr), acc2
    49  	BSWAPQ acc2
    50  
    51  	MOVQ (8*1)(x_ptr), acc1
    52  	BSWAPQ acc1
    53  	MOVQ (8*0)(x_ptr), acc0
    54  	BSWAPQ acc0
    55  
    56  	MOVQ acc0, (8*3)(res_ptr)
    57  	MOVQ acc1, (8*2)(res_ptr)
    58  	MOVQ acc2, (8*1)(res_ptr)
    59  	MOVQ acc3, (8*0)(res_ptr)
    60  
    61  	RET
    62  /* ---------------------------------------*/
    63  // func sm2MovCond(res, a, b []uint64, cond int)
    64  // If cond == 0 res=b, else res=a
        // Branch-free select over 96 bytes (six 16-byte lanes). Both a and b are
        // always read in full and blended with a mask, so the memory access
        // pattern does not depend on cond.
        // Only the low 32 bits of cond take part in the zero test: PSHUFD $0
        // broadcasts the lowest dword before the compare.
    65  TEXT ·sm2MovCond(SB),NOSPLIT,$0
    66  	MOVQ res+0(FP), res_ptr
    67  	MOVQ a+24(FP), x_ptr
    68  	MOVQ b+48(FP), y_ptr
    69  	MOVQ cond+72(FP), X12
    70  
        	// X12 = all ones in every dword if cond == 0, else all zeros.
    71  	PXOR X13, X13
    72  	PSHUFD $0, X12, X12
    73  	PCMPEQL X13, X12
    74  
        	// X0..X5 = ^mask & a[lane]  (PANDN inverts its destination operand).
    75  	MOVOU X12, X0
    76  	MOVOU (16*0)(x_ptr), X6
    77  	PANDN X6, X0
    78  	MOVOU X12, X1
    79  	MOVOU (16*1)(x_ptr), X7
    80  	PANDN X7, X1
    81  	MOVOU X12, X2
    82  	MOVOU (16*2)(x_ptr), X8
    83  	PANDN X8, X2
    84  	MOVOU X12, X3
    85  	MOVOU (16*3)(x_ptr), X9
    86  	PANDN X9, X3
    87  	MOVOU X12, X4
    88  	MOVOU (16*4)(x_ptr), X10
    89  	PANDN X10, X4
    90  	MOVOU X12, X5
    91  	MOVOU (16*5)(x_ptr), X11
    92  	PANDN X11, X5
    93  
        	// X6..X11 = mask & b[lane].
    94  	MOVOU (16*0)(y_ptr), X6
    95  	MOVOU (16*1)(y_ptr), X7
    96  	MOVOU (16*2)(y_ptr), X8
    97  	MOVOU (16*3)(y_ptr), X9
    98  	MOVOU (16*4)(y_ptr), X10
    99  	MOVOU (16*5)(y_ptr), X11
   100  
   101  	PAND X12, X6
   102  	PAND X12, X7
   103  	PAND X12, X8
   104  	PAND X12, X9
   105  	PAND X12, X10
   106  	PAND X12, X11
   107  
        	// The two masked halves are disjoint, so XOR merges them.
   108  	PXOR X6, X0
   109  	PXOR X7, X1
   110  	PXOR X8, X2
   111  	PXOR X9, X3
   112  	PXOR X10, X4
   113  	PXOR X11, X5
   114  
        	// Stores happen only after every load, so res may alias a or b.
   115  	MOVOU X0, (16*0)(res_ptr)
   116  	MOVOU X1, (16*1)(res_ptr)
   117  	MOVOU X2, (16*2)(res_ptr)
   118  	MOVOU X3, (16*3)(res_ptr)
   119  	MOVOU X4, (16*4)(res_ptr)
   120  	MOVOU X5, (16*5)(res_ptr)
   121  
   122  	RET
   123  /* ---------------------------------------*/
   124  // func sm2NegCond(val []uint64, cond int)
        // In place: if cond != 0, val = p - val (four 64-bit limbs, p being the
        // prime whose limbs are $-1, sm2const0, $-1, sm2const1); if cond == 0,
        // val is rewritten unchanged. The subtraction is always performed and
        // the result picked with CMOV, so there is no branch on cond.
        // The difference is not reduced further: val == 0 with cond != 0 yields p.
   125  TEXT ·sm2NegCond(SB),NOSPLIT,$0
   126  	MOVQ val+0(FP), res_ptr
   127  	MOVQ cond+24(FP), t0
   128  	// acc = poly
   129  	MOVQ $-1, acc0
   130  	MOVQ sm2const0<>(SB), acc1
   131  	MOVQ $-1, acc2
   132  	MOVQ sm2const1<>(SB), acc3
   133  	// Load the original value
        	// (x_ptr and y_ptr serve as plain data registers here.)
   134  	MOVQ (8*0)(res_ptr), acc5
   135  	MOVQ (8*1)(res_ptr), x_ptr
   136  	MOVQ (8*2)(res_ptr), y_ptr
   137  	MOVQ (8*3)(res_ptr), t1
   138  	// Speculatively subtract
   139  	SUBQ acc5, acc0
   140  	SBBQ x_ptr, acc1
   141  	SBBQ y_ptr, acc2
   142  	SBBQ t1, acc3
   143  	// If condition is 0, keep original value
   144  	TESTQ t0, t0
   145  	CMOVQEQ acc5, acc0
   146  	CMOVQEQ x_ptr, acc1
   147  	CMOVQEQ y_ptr, acc2
   148  	CMOVQEQ t1, acc3
   149  	// Store result
   150  	MOVQ acc0, (8*0)(res_ptr)
   151  	MOVQ acc1, (8*1)(res_ptr)
   152  	MOVQ acc2, (8*2)(res_ptr)
   153  	MOVQ acc3, (8*3)(res_ptr)
   154  
   155  	RET
   156  /* ---------------------------------------*/
   157  // func sm2Sqr(res, in []uint64, n int)
        // Repeated modular squaring of a 4-limb value modulo the SM2 prime p.
        // Each pass computes the 512-bit square of the current value, folds the
        // low 256 bits away with four word-wise reduction steps (so the result
        // carries an extra factor 2^-256, as in sm2Mul), adds the high 256 bits
        // and conditionally subtracts p. After the first pass the input pointer
        // is redirected to res, so n passes square the value n times in place.
        // The loop counter is tested after the body: n must be >= 1.
        // Registers: BX = remaining passes; y_ptr and, late in the pass, x_ptr
        // are used as data registers for the two top product limbs.
   158  TEXT ·sm2Sqr(SB),NOSPLIT,$0
   159  	MOVQ res+0(FP), res_ptr
   160  	MOVQ in+24(FP), x_ptr
   161  	MOVQ n+48(FP), BX
   162  
   163  sqrLoop:
   164  
        	// Off-diagonal products x[i]*x[j], i < j, accumulated in
        	// acc1..acc5, y_ptr.
   165  	// y[1:] * y[0]
   166  	MOVQ (8*0)(x_ptr), t0
   167  
   168  	MOVQ (8*1)(x_ptr), AX
   169  	MULQ t0
   170  	MOVQ AX, acc1
   171  	MOVQ DX, acc2
   172  
   173  	MOVQ (8*2)(x_ptr), AX
   174  	MULQ t0
   175  	ADDQ AX, acc2
   176  	ADCQ $0, DX
   177  	MOVQ DX, acc3
   178  
   179  	MOVQ (8*3)(x_ptr), AX
   180  	MULQ t0
   181  	ADDQ AX, acc3
   182  	ADCQ $0, DX
   183  	MOVQ DX, acc4
   184  	// y[2:] * y[1]
   185  	MOVQ (8*1)(x_ptr), t0
   186  
   187  	MOVQ (8*2)(x_ptr), AX
   188  	MULQ t0
   189  	ADDQ AX, acc3
   190  	ADCQ $0, DX
   191  	MOVQ DX, t1
   192  
   193  	MOVQ (8*3)(x_ptr), AX
   194  	MULQ t0
   195  	ADDQ t1, acc4
   196  	ADCQ $0, DX
   197  	ADDQ AX, acc4
   198  	ADCQ $0, DX
   199  	MOVQ DX, acc5
   200  	// y[3] * y[2]
   201  	MOVQ (8*2)(x_ptr), t0
   202  
   203  	MOVQ (8*3)(x_ptr), AX
   204  	MULQ t0
   205  	ADDQ AX, acc5
   206  	ADCQ $0, DX
   207  	MOVQ DX, y_ptr
   208  	XORQ t1, t1
   209  	// *2
        	// Each off-diagonal product occurs twice in the square; t1 takes
        	// the bit shifted out of the top.
   210  	ADDQ acc1, acc1
   211  	ADCQ acc2, acc2
   212  	ADCQ acc3, acc3
   213  	ADCQ acc4, acc4
   214  	ADCQ acc5, acc5
   215  	ADCQ y_ptr, y_ptr
   216  	ADCQ $0, t1
   217  	// Missing products
        	// Diagonal terms x[i]*x[i] complete the 512-bit square in
        	// acc0..acc5, y_ptr, t1.
   218  	MOVQ (8*0)(x_ptr), AX
   219  	MULQ AX
   220  	MOVQ AX, acc0
   221  	MOVQ DX, t0
   222  
   223  	MOVQ (8*1)(x_ptr), AX
   224  	MULQ AX
   225  	ADDQ t0, acc1
   226  	ADCQ AX, acc2
   227  	ADCQ $0, DX
   228  	MOVQ DX, t0
   229  
   230  	MOVQ (8*2)(x_ptr), AX
   231  	MULQ AX
   232  	ADDQ t0, acc3
   233  	ADCQ AX, acc4
   234  	ADCQ $0, DX
   235  	MOVQ DX, t0
   236  
   237  	MOVQ (8*3)(x_ptr), AX
   238  	MULQ AX
   239  	ADDQ t0, acc5
   240  	ADCQ AX, y_ptr
   241  	ADCQ DX, t1
        	// The input is fully consumed; x_ptr now carries the top limb.
   242  	MOVQ t1, x_ptr
        	// Reduction steps: with m = lowest live limb, the remaining three
        	// limbs plus a new top limb become
        	//   rest + m + m*2^192 - m*2^32 - m*2^160   (mod 2^256),
        	// i.e. (value + m*p) / 2^64. The additions run first, then the
        	// subtractions; a carry out of the top in the first chain is
        	// cancelled by the borrow out of the second.
   243  	// First reduction step
   244  	MOVQ acc0, AX
   245  	MOVQ acc0, t1
   246  	ADDQ acc0, acc1
   247  	ADCQ $0, acc2
   248  	ADCQ $0, acc3
   249  	ADCQ $0, AX
   250  	SHLQ $32, t1
   251  	SHRQ $32, acc0
   252  	SUBQ t1, acc1
   253  	SBBQ acc0, acc2
   254  	SBBQ t1, acc3
   255  	SBBQ acc0, AX
   256  	MOVQ AX, acc0
   257  
   258  	// Second reduction step
   259  	MOVQ acc1, AX
   260  	MOVQ acc1, t1
   261  	ADDQ acc1, acc2
   262  	ADCQ $0, acc3
   263  	ADCQ $0, acc0
   264  	ADCQ $0, AX
   265  	SHLQ $32, t1
   266  	SHRQ $32, acc1
   267  	SUBQ t1, acc2
   268  	SBBQ acc1, acc3
   269  	SBBQ t1, acc0
   270  	SBBQ acc1, AX
   271  	MOVQ AX, acc1
   272  
   273  	// Third reduction step
   274  	MOVQ acc2, AX
   275  	MOVQ acc2, t1
   276  	ADDQ acc2, acc3
   277  	ADCQ $0, acc0
   278  	ADCQ $0, acc1
   279  	ADCQ $0, AX
   280  	SHLQ $32, t1
   281  	SHRQ $32, acc2
   282  	SUBQ t1, acc3
   283  	SBBQ acc2, acc0
   284  	SBBQ t1, acc1
   285  	SBBQ acc2, AX
   286  	MOVQ AX, acc2
   287  	
   288  	// Last reduction step
   289  	XORQ t0, t0
   290  	MOVQ acc3, AX
   291  	MOVQ acc3, t1
   292  	ADDQ acc3, acc0
   293  	ADCQ $0, acc1
   294  	ADCQ $0, acc2
   295  	ADCQ $0, AX
   296  	SHLQ $32, t1
   297  	SHRQ $32, acc3
   298  	SUBQ t1, acc0
   299  	SBBQ acc3, acc1
   300  	SBBQ t1, acc2
   301  	SBBQ acc3, AX
   302  	MOVQ AX, acc3
   303  	
   304  	// Add bits [511:256] of the sqr result
        	// The chain must start with ADDQ: CF at this point is the borrow
        	// out of the SBBQ chain above, which is set whenever the preceding
        	// ADCQ chain wrapped past the top limb. It only balances that lost
        	// carry and must not be added into the sum.
   305  	ADDQ acc4, acc0
   306  	ADCQ acc5, acc1
   307  	ADCQ y_ptr, acc2
   308  	ADCQ x_ptr, acc3
   309  	ADCQ $0, t0
   310  
        	// Keep the unreduced sum for the conditional select below.
   311  	MOVQ acc0, acc4
   312  	MOVQ acc1, acc5
   313  	MOVQ acc2, y_ptr
   314  	MOVQ acc3, t1
   315  	// Subtract sm2-p
   316  	SUBQ $-1, acc0
   317  	SBBQ sm2const0<>(SB) ,acc1
   318  	SBBQ $-1, acc2
   319  	SBBQ sm2const1<>(SB), acc3
   320  	SBBQ $0, t0
   321  
        	// Borrow means the sum was already below p: restore it.
   322  	CMOVQCS acc4, acc0
   323  	CMOVQCS acc5, acc1
   324  	CMOVQCS y_ptr, acc2
   325  	CMOVQCS t1, acc3
   326  
   327  	MOVQ acc0, (8*0)(res_ptr)
   328  	MOVQ acc1, (8*1)(res_ptr)
   329  	MOVQ acc2, (8*2)(res_ptr)
   330  	MOVQ acc3, (8*3)(res_ptr)
        	// Subsequent passes square the value just written.
   331  	MOVQ res_ptr, x_ptr
   332  	DECQ BX
   333  	JNE  sqrLoop
   334  
   335  	RET
   336  /* ---------------------------------------*/
   337  // func sm2Mul(res, in1, in2 []uint64)
        // Modular multiplication of two 4-limb values modulo the SM2 prime p with
        // interleaved word-wise reduction: after each row x*y[i] the lowest live
        // limb m is eliminated by adding m*p and dropping one limb, so the stored
        // result is in1*in2*2^-256 mod p. A single conditional subtraction of p
        // follows.
        // The accumulator rotates through acc0..acc5; a register is cleared with
        // XORQ right after its limb has been eliminated so it can take the next
        // top carry. in1 is re-read from memory for every row; res is written
        // only at the end.
   338  TEXT ·sm2Mul(SB),NOSPLIT,$0
   339  	MOVQ res+0(FP), res_ptr
   340  	MOVQ in1+24(FP), x_ptr
   341  	MOVQ in2+48(FP), y_ptr
   342  	// x * y[0]
   343  	MOVQ (8*0)(y_ptr), t0
   344  
   345  	MOVQ (8*0)(x_ptr), AX
   346  	MULQ t0
   347  	MOVQ AX, acc0
   348  	MOVQ DX, acc1
   349  
   350  	MOVQ (8*1)(x_ptr), AX
   351  	MULQ t0
   352  	ADDQ AX, acc1
   353  	ADCQ $0, DX
   354  	MOVQ DX, acc2
   355  
   356  	MOVQ (8*2)(x_ptr), AX
   357  	MULQ t0
   358  	ADDQ AX, acc2
   359  	ADCQ $0, DX
   360  	MOVQ DX, acc3
   361  
   362  	MOVQ (8*3)(x_ptr), AX
   363  	MULQ t0
   364  	ADDQ AX, acc3
   365  	ADCQ $0, DX
   366  	MOVQ DX, acc4
   367  	XORQ acc5, acc5
   368  	// First reduction step
        	// m = acc0. Add m at the next limb and at the limb three above it,
        	// then subtract m<<32 / m>>32 pairs; together this is (T + m*p)/2^64.
   369  	MOVQ acc0, t1
   370  	ADDQ acc0, acc1
   371  	ADCQ $0, acc2
   372  	ADCQ $0, acc3
   373  	ADCQ acc0, acc4
   374  	ADCQ $0, acc5
   375  	SHLQ $32, t1
   376  	SHRQ $32, acc0
   377  	SUBQ t1, acc1
   378  	SBBQ acc0, acc2
   379  	SBBQ t1, acc3
   380  	SBBQ acc0, acc4
   381  	SBBQ $0, acc5
   382  	XORQ acc0, acc0
   383  	// x * y[1]
   384  	MOVQ (8*1)(y_ptr), t0
   385  
   386  	MOVQ (8*0)(x_ptr), AX
   387  	MULQ t0
   388  	ADDQ AX, acc1
   389  	ADCQ $0, DX
   390  	MOVQ DX, t1
   391  
   392  	MOVQ (8*1)(x_ptr), AX
   393  	MULQ t0
   394  	ADDQ t1, acc2
   395  	ADCQ $0, DX
   396  	ADDQ AX, acc2
   397  	ADCQ $0, DX
   398  	MOVQ DX, t1
   399  
   400  	MOVQ (8*2)(x_ptr), AX
   401  	MULQ t0
   402  	ADDQ t1, acc3
   403  	ADCQ $0, DX
   404  	ADDQ AX, acc3
   405  	ADCQ $0, DX
   406  	MOVQ DX, t1
   407  
   408  	MOVQ (8*3)(x_ptr), AX
   409  	MULQ t0
   410  	ADDQ t1, acc4
   411  	ADCQ $0, DX
   412  	ADDQ AX, acc4
   413  	ADCQ DX, acc5
   414  	ADCQ $0, acc0
   415  	// Second reduction step
   416  	MOVQ acc1, t1
   417  	ADDQ acc1, acc2
   418  	ADCQ $0, acc3
   419  	ADCQ $0, acc4
   420  	ADCQ acc1, acc5
   421  	ADCQ $0, acc0
   422  	SHLQ $32, t1
   423  	SHRQ $32, acc1
   424  	SUBQ t1, acc2
   425  	SBBQ acc1, acc3
   426  	SBBQ t1, acc4
   427  	SBBQ acc1, acc5
   428  	SBBQ $0, acc0
   429  	XORQ acc1, acc1
   430  	// x * y[2]
   431  	MOVQ (8*2)(y_ptr), t0
   432  
   433  	MOVQ (8*0)(x_ptr), AX
   434  	MULQ t0
   435  	ADDQ AX, acc2
   436  	ADCQ $0, DX
   437  	MOVQ DX, t1
   438  
   439  	MOVQ (8*1)(x_ptr), AX
   440  	MULQ t0
   441  	ADDQ t1, acc3
   442  	ADCQ $0, DX
   443  	ADDQ AX, acc3
   444  	ADCQ $0, DX
   445  	MOVQ DX, t1
   446  
   447  	MOVQ (8*2)(x_ptr), AX
   448  	MULQ t0
   449  	ADDQ t1, acc4
   450  	ADCQ $0, DX
   451  	ADDQ AX, acc4
   452  	ADCQ $0, DX
   453  	MOVQ DX, t1
   454  
   455  	MOVQ (8*3)(x_ptr), AX
   456  	MULQ t0
   457  	ADDQ t1, acc5
   458  	ADCQ $0, DX
   459  	ADDQ AX, acc5
   460  	ADCQ DX, acc0
   461  	ADCQ $0, acc1
   462  	// Third reduction step
   463  	MOVQ acc2, t1
   464  	ADDQ acc2, acc3
   465  	ADCQ $0, acc4
   466  	ADCQ $0, acc5
   467  	ADCQ acc2, acc0
   468  	ADCQ $0, acc1
   469  	SHLQ $32, t1
   470  	SHRQ $32, acc2
   471  	SUBQ t1, acc3
   472  	SBBQ acc2, acc4
   473  	SBBQ t1, acc5
   474  	SBBQ acc2, acc0
   475  	SBBQ $0, acc1
   476  	XORQ acc2, acc2
   477  	// x * y[3]
   478  	MOVQ (8*3)(y_ptr), t0
   479  
   480  	MOVQ (8*0)(x_ptr), AX
   481  	MULQ t0
   482  	ADDQ AX, acc3
   483  	ADCQ $0, DX
   484  	MOVQ DX, t1
   485  
   486  	MOVQ (8*1)(x_ptr), AX
   487  	MULQ t0
   488  	ADDQ t1, acc4
   489  	ADCQ $0, DX
   490  	ADDQ AX, acc4
   491  	ADCQ $0, DX
   492  	MOVQ DX, t1
   493  
   494  	MOVQ (8*2)(x_ptr), AX
   495  	MULQ t0
   496  	ADDQ t1, acc5
   497  	ADCQ $0, DX
   498  	ADDQ AX, acc5
   499  	ADCQ $0, DX
   500  	MOVQ DX, t1
   501  
   502  	MOVQ (8*3)(x_ptr), AX
   503  	MULQ t0
   504  	ADDQ t1, acc0
   505  	ADCQ $0, DX
   506  	ADDQ AX, acc0
   507  	ADCQ DX, acc1
   508  	ADCQ $0, acc2
   509  	// Last reduction step
   510  	MOVQ acc3, t1
   511  	ADDQ acc3, acc4
   512  	ADCQ $0, acc5
   513  	ADCQ $0, acc0
   514  	ADCQ acc3, acc1
   515  	ADCQ $0, acc2
   516  	SHLQ $32, t1
   517  	SHRQ $32, acc3
   518  	SUBQ t1, acc4
   519  	SBBQ acc3, acc5
   520  	SBBQ t1, acc0
   521  	SBBQ acc3, acc1
   522  	SBBQ $0, acc2
   523  	// Copy result [255:0]
        	// The result limbs are acc4, acc5, acc0, acc1 with acc2 as overflow
        	// word; x_ptr is free to hold a copy because in1 is no longer read.
   524  	MOVQ acc4, x_ptr
   525  	MOVQ acc5, acc3
   526  	MOVQ acc0, t0
   527  	MOVQ acc1, t1
   528  	// Subtract sm2-p
   529  	SUBQ $-1, acc4
   530  	SBBQ sm2const0<>(SB) ,acc5
   531  	SBBQ $-1, acc0
   532  	SBBQ sm2const1<>(SB), acc1
   533  	SBBQ $0, acc2
   534  
        	// Borrow out of the 5-limb subtraction: value was below p, restore.
   535  	CMOVQCS x_ptr, acc4
   536  	CMOVQCS acc3, acc5
   537  	CMOVQCS t0, acc0
   538  	CMOVQCS t1, acc1
   539  
   540  	MOVQ acc4, (8*0)(res_ptr)
   541  	MOVQ acc5, (8*1)(res_ptr)
   542  	MOVQ acc0, (8*2)(res_ptr)
   543  	MOVQ acc1, (8*3)(res_ptr)
   544  
   545  	RET
   546  /* ---------------------------------------*/
   547  // func sm2FromMont(res, in []uint64)
        // Applies the four word-wise reduction stages of sm2Mul to the 4-limb
        // input without any multiplication, i.e. computes in * 2^-256 mod p,
        // followed by one conditional subtraction of p.
        // Each stage eliminates the lowest live limb m and introduces a new top
        // limb: the register for that top limb is zeroed beforehand and receives
        // +m (with carry) and then -(m>>32) (with borrow).
   548  TEXT ·sm2FromMont(SB),NOSPLIT,$0
   549  	MOVQ res+0(FP), res_ptr
   550  	MOVQ in+24(FP), x_ptr
   551  
   552  	MOVQ (8*0)(x_ptr), acc0
   553  	MOVQ (8*1)(x_ptr), acc1
   554  	MOVQ (8*2)(x_ptr), acc2
   555  	MOVQ (8*3)(x_ptr), acc3
   556  	XORQ acc4, acc4
   557  
   558  	// Only reduce, no multiplications are needed
   559  	// First stage
   560  	MOVQ acc0, t1
   561  	ADDQ acc0, acc1
   562  	ADCQ $0, acc2
   563  	ADCQ $0, acc3
   564  	ADCQ acc0, acc4
   565  	SHLQ $32, t1
   566  	SHRQ $32, acc0
   567  	SUBQ t1, acc1
   568  	SBBQ acc0, acc2
   569  	SBBQ t1, acc3
   570  	SBBQ acc0, acc4
   571  	XORQ acc5, acc5
   572  	// Second stage
   573  	MOVQ acc1, t1
   574  	ADDQ acc1, acc2
   575  	ADCQ $0, acc3
   576  	ADCQ $0, acc4
   577  	ADCQ acc1, acc5
   578  	SHLQ $32, t1
   579  	SHRQ $32, acc1
   580  	SUBQ t1, acc2
   581  	SBBQ acc1, acc3
   582  	SBBQ t1, acc4
   583  	SBBQ acc1, acc5
   584  	XORQ acc0, acc0
   585  	// Third stage
   586  	MOVQ acc2, t1
   587  	ADDQ acc2, acc3
   588  	ADCQ $0, acc4
   589  	ADCQ $0, acc5
   590  	ADCQ acc2, acc0
   591  	SHLQ $32, t1
   592  	SHRQ $32, acc2
   593  	SUBQ t1, acc3
   594  	SBBQ acc2, acc4
   595  	SBBQ t1, acc5
   596  	SBBQ acc2, acc0
   597  	XORQ acc1, acc1
   598  	// Last stage
   599  	MOVQ acc3, t1
   600  	ADDQ acc3, acc4
   601  	ADCQ $0, acc5
   602  	ADCQ $0, acc0
   603  	ADCQ acc3, acc1
   604  	SHLQ $32, t1
   605  	SHRQ $32, acc3
   606  	SUBQ t1, acc4
   607  	SBBQ acc3, acc5
   608  	SBBQ t1, acc0
   609  	SBBQ acc3, acc1
   610  
        	// Result limbs: acc4, acc5, acc0, acc1. Save a copy, subtract p,
        	// and restore the copy if the subtraction borrowed.
   611  	MOVQ acc4, x_ptr
   612  	MOVQ acc5, acc3
   613  	MOVQ acc0, t0
   614  	MOVQ acc1, t1
   615  
   616  	SUBQ $-1, acc4
   617  	SBBQ sm2const0<>(SB), acc5
   618  	SBBQ $-1, acc0
   619  	SBBQ sm2const1<>(SB), acc1
   620  
   621  	CMOVQCS x_ptr, acc4
   622  	CMOVQCS acc3, acc5
   623  	CMOVQCS t0, acc0
   624  	CMOVQCS t1, acc1
   625  
   626  	MOVQ acc4, (8*0)(res_ptr)
   627  	MOVQ acc5, (8*1)(res_ptr)
   628  	MOVQ acc0, (8*2)(res_ptr)
   629  	MOVQ acc1, (8*3)(res_ptr)
   630  
   631  	RET
   632  /* ---------------------------------------*/
   633  // Constant time point access to arbitrary point table.
   634  // Indexed from 1 to 16, with -1 offset
   635  // (index 0 is implicitly point at infinity)
        // The loop always reads all 16 table entries of 96 bytes each and keeps
        // the one whose 1-based position equals idx by masking; for idx == 0 (or
        // any value outside 1..16) nothing matches and point is written as all
        // zeros. Only the low 32 bits of idx are compared (MOVL).
   636  // func sm2Select(point, table []uint64, idx int)
   637  TEXT ·sm2Select(SB),NOSPLIT,$0
   638  	MOVQ idx+48(FP),AX
   639  	MOVQ table+24(FP),DI
   640  	MOVQ point+0(FP),DX
   641  
   642  	PXOR X15, X15	// X15 = 0
   643  	PCMPEQL X14, X14 // X14 = -1
   644  	PSUBL X14, X15   // X15 = 1
        	// X14 = idx broadcast to all four dwords.
   645  	MOVL AX, X14
   646  	PSHUFD $0, X14, X14
   647  
        	// X0..X5 accumulate the selected entry.
   648  	PXOR X0, X0
   649  	PXOR X1, X1
   650  	PXOR X2, X2
   651  	PXOR X3, X3
   652  	PXOR X4, X4
   653  	PXOR X5, X5
        	// AX = number of table entries to scan.
   654  	MOVQ $16, AX
   655  
        	// X13 = position counter, starting at 1.
   656  	MOVOU X15, X13
   657  
   658  loop_select:
   659  
        		// X12 = all ones if the current position equals idx, else zero.
   660  		MOVOU X13, X12
   661  		PADDL X15, X13
   662  		PCMPEQL X14, X12
   663  
   664  		MOVOU (16*0)(DI), X6
   665  		MOVOU (16*1)(DI), X7
   666  		MOVOU (16*2)(DI), X8
   667  		MOVOU (16*3)(DI), X9
   668  		MOVOU (16*4)(DI), X10
   669  		MOVOU (16*5)(DI), X11
   670  		ADDQ $(16*6), DI
   671  
   672  		PAND X12, X6
   673  		PAND X12, X7
   674  		PAND X12, X8
   675  		PAND X12, X9
   676  		PAND X12, X10
   677  		PAND X12, X11
   678  
   679  		PXOR X6, X0
   680  		PXOR X7, X1
   681  		PXOR X8, X2
   682  		PXOR X9, X3
   683  		PXOR X10, X4
   684  		PXOR X11, X5
   685  
   686  		DECQ AX
   687  		JNE loop_select
   688  
   689  	MOVOU X0, (16*0)(DX)
   690  	MOVOU X1, (16*1)(DX)
   691  	MOVOU X2, (16*2)(DX)
   692  	MOVOU X3, (16*3)(DX)
   693  	MOVOU X4, (16*4)(DX)
   694  	MOVOU X5, (16*5)(DX)
   695  
   696  	RET
   697  /* ---------------------------------------*/
   698  // Constant time point access to base point table.
        // Scans 32 entries of 64 bytes each (16 iterations, two entries per
        // iteration) and keeps the entry whose 1-based position equals idx by
        // masking; every entry is read regardless of idx. If idx is 0 or outside
        // 1..32 nothing matches and point is written as 64 zero bytes. Only the
        // low 32 bits of idx are compared (MOVL).
   699  // func sm2SelectBase(point, table []uint64, idx int)
   700  TEXT ·sm2SelectBase(SB),NOSPLIT,$0
   701  	MOVQ idx+48(FP),AX
   702  	MOVQ table+24(FP),DI
   703  	MOVQ point+0(FP),DX
   704  
   705  	PXOR X15, X15	// X15 = 0
   706  	PCMPEQL X14, X14 // X14 = -1
   707  	PSUBL X14, X15   // X15 = 1
        	// X14 = idx broadcast to all four dwords.
   708  	MOVL AX, X14
   709  	PSHUFD $0, X14, X14
   710  
        	// X0..X3 accumulate the selected entry.
   711  	PXOR X0, X0
   712  	PXOR X1, X1
   713  	PXOR X2, X2
   714  	PXOR X3, X3
        	// AX = iteration count; each iteration covers two entries.
   715  	MOVQ $16, AX
   716  
        	// X13 = position counter, starting at 1.
   717  	MOVOU X15, X13
   718  
   719  loop_select_base:
   720  
        		// Mask for the first entry of the pair.
   721  		MOVOU X13, X12
   722  		PADDL X15, X13
   723  		PCMPEQL X14, X12
   724  
   725  		MOVOU (16*0)(DI), X4
   726  		MOVOU (16*1)(DI), X5
   727  		MOVOU (16*2)(DI), X6
   728  		MOVOU (16*3)(DI), X7
   729  
   730  		MOVOU (16*4)(DI), X8
   731  		MOVOU (16*5)(DI), X9
   732  		MOVOU (16*6)(DI), X10
   733  		MOVOU (16*7)(DI), X11
   734  
   735  		ADDQ $(16*8), DI
   736  
   737  		PAND X12, X4
   738  		PAND X12, X5
   739  		PAND X12, X6
   740  		PAND X12, X7
   741  
        		// Mask for the second entry of the pair (next position).
   742  		MOVOU X13, X12
   743  		PADDL X15, X13
   744  		PCMPEQL X14, X12
   745  
   746  		PAND X12, X8
   747  		PAND X12, X9
   748  		PAND X12, X10
   749  		PAND X12, X11
   750  
   751  		PXOR X4, X0
   752  		PXOR X5, X1
   753  		PXOR X6, X2
   754  		PXOR X7, X3
   755  
   756  		PXOR X8, X0
   757  		PXOR X9, X1
   758  		PXOR X10, X2
   759  		PXOR X11, X3
   760  
   761  		DECQ AX
   762  		JNE loop_select_base
   763  
   764  	MOVOU X0, (16*0)(DX)
   765  	MOVOU X1, (16*1)(DX)
   766  	MOVOU X2, (16*2)(DX)
   767  	MOVOU X3, (16*3)(DX)
   768  
   769  	RET
   770  /* ---------------------------------------*/
   771  // func sm2OrdMul(res, in1, in2 []uint64)
        // Modular multiplication of two 4-limb values modulo the 256-bit constant
        // sm2ord, with interleaved word-wise reduction: after each row x*y[i] the
        // factor t0 = (lowest live limb * sm2ordK0) mod 2^64 is computed and
        // t0*sm2ord is added, after which the lowest limb is treated as zero and
        // its register is reused as the new top carry word. The stored result is
        // in1*in2*2^-256 mod sm2ord, after one conditional subtraction.
        // All four limbs of sm2ord are multiplied generically here; carries are
        // propagated through a six-limb accumulator that rotates through
        // acc0..acc5.
   772  TEXT ·sm2OrdMul(SB),NOSPLIT,$0
   773  	MOVQ res+0(FP), res_ptr
   774  	MOVQ in1+24(FP), x_ptr
   775  	MOVQ in2+48(FP), y_ptr
   776  	// x * y[0]
   777  	MOVQ (8*0)(y_ptr), t0
   778  
   779  	MOVQ (8*0)(x_ptr), AX
   780  	MULQ t0
   781  	MOVQ AX, acc0
   782  	MOVQ DX, acc1
   783  
   784  	MOVQ (8*1)(x_ptr), AX
   785  	MULQ t0
   786  	ADDQ AX, acc1
   787  	ADCQ $0, DX
   788  	MOVQ DX, acc2
   789  
   790  	MOVQ (8*2)(x_ptr), AX
   791  	MULQ t0
   792  	ADDQ AX, acc2
   793  	ADCQ $0, DX
   794  	MOVQ DX, acc3
   795  
   796  	MOVQ (8*3)(x_ptr), AX
   797  	MULQ t0
   798  	ADDQ AX, acc3
   799  	ADCQ $0, DX
   800  	MOVQ DX, acc4
   801  	XORQ acc5, acc5
   802  	// First reduction step
        	// t0 = reduction factor; only the low half of the product is kept.
   803  	MOVQ acc0, AX
   804  	MULQ sm2ordK0<>(SB)
   805  	MOVQ AX, t0
   806  
   807  	MOVQ sm2ord<>+0x00(SB), AX
   808  	MULQ t0
        	// acc0 is not read again as a limb after this addition; the code
        	// relies on it being zero when it later receives a carry via ADCQ.
   809  	ADDQ AX, acc0
   810  	ADCQ $0, DX
   811  	MOVQ DX, t1
   812  
   813  	MOVQ sm2ord<>+0x08(SB), AX
   814  	MULQ t0
   815  	ADDQ t1, acc1
   816  	ADCQ $0, DX
   817  	ADDQ AX, acc1
   818  	ADCQ $0, DX
   819  	MOVQ DX, t1
   820  
   821  	MOVQ sm2ord<>+0x10(SB), AX
   822  	MULQ t0
   823  	ADDQ t1, acc2
   824  	ADCQ $0, DX
   825  	ADDQ AX, acc2
   826  	ADCQ $0, DX
   827  	MOVQ DX, t1
   828  
   829  	MOVQ sm2ord<>+0x18(SB), AX
   830  	MULQ t0
   831  	ADDQ t1, acc3
   832  	ADCQ $0, DX
   833  	ADDQ AX, acc3
   834  	ADCQ DX, acc4
   835  	ADCQ $0, acc5
   836  	// x * y[1]
   837  	MOVQ (8*1)(y_ptr), t0
   838  
   839  	MOVQ (8*0)(x_ptr), AX
   840  	MULQ t0
   841  	ADDQ AX, acc1
   842  	ADCQ $0, DX
   843  	MOVQ DX, t1
   844  
   845  	MOVQ (8*1)(x_ptr), AX
   846  	MULQ t0
   847  	ADDQ t1, acc2
   848  	ADCQ $0, DX
   849  	ADDQ AX, acc2
   850  	ADCQ $0, DX
   851  	MOVQ DX, t1
   852  
   853  	MOVQ (8*2)(x_ptr), AX
   854  	MULQ t0
   855  	ADDQ t1, acc3
   856  	ADCQ $0, DX
   857  	ADDQ AX, acc3
   858  	ADCQ $0, DX
   859  	MOVQ DX, t1
   860  
   861  	MOVQ (8*3)(x_ptr), AX
   862  	MULQ t0
   863  	ADDQ t1, acc4
   864  	ADCQ $0, DX
   865  	ADDQ AX, acc4
   866  	ADCQ DX, acc5
   867  	ADCQ $0, acc0
   868  	// Second reduction step
   869  	MOVQ acc1, AX
   870  	MULQ sm2ordK0<>(SB)
   871  	MOVQ AX, t0
   872  
   873  	MOVQ sm2ord<>+0x00(SB), AX
   874  	MULQ t0
   875  	ADDQ AX, acc1
   876  	ADCQ $0, DX
   877  	MOVQ DX, t1
   878  
   879  	MOVQ sm2ord<>+0x08(SB), AX
   880  	MULQ t0
   881  	ADDQ t1, acc2
   882  	ADCQ $0, DX
   883  	ADDQ AX, acc2
   884  	ADCQ $0, DX
   885  	MOVQ DX, t1
   886  
   887  	MOVQ sm2ord<>+0x10(SB), AX
   888  	MULQ t0
   889  	ADDQ t1, acc3
   890  	ADCQ $0, DX
   891  	ADDQ AX, acc3
   892  	ADCQ $0, DX
   893  	MOVQ DX, t1
   894  
   895  	MOVQ sm2ord<>+0x18(SB), AX
   896  	MULQ t0
   897  	ADDQ t1, acc4
   898  	ADCQ $0, DX
   899  	ADDQ AX, acc4
   900  	ADCQ DX, acc5
   901  	ADCQ $0, acc0
   902  	// x * y[2]
   903  	MOVQ (8*2)(y_ptr), t0
   904  
   905  	MOVQ (8*0)(x_ptr), AX
   906  	MULQ t0
   907  	ADDQ AX, acc2
   908  	ADCQ $0, DX
   909  	MOVQ DX, t1
   910  
   911  	MOVQ (8*1)(x_ptr), AX
   912  	MULQ t0
   913  	ADDQ t1, acc3
   914  	ADCQ $0, DX
   915  	ADDQ AX, acc3
   916  	ADCQ $0, DX
   917  	MOVQ DX, t1
   918  
   919  	MOVQ (8*2)(x_ptr), AX
   920  	MULQ t0
   921  	ADDQ t1, acc4
   922  	ADCQ $0, DX
   923  	ADDQ AX, acc4
   924  	ADCQ $0, DX
   925  	MOVQ DX, t1
   926  
   927  	MOVQ (8*3)(x_ptr), AX
   928  	MULQ t0
   929  	ADDQ t1, acc5
   930  	ADCQ $0, DX
   931  	ADDQ AX, acc5
   932  	ADCQ DX, acc0
   933  	ADCQ $0, acc1
   934  	// Third reduction step
   935  	MOVQ acc2, AX
   936  	MULQ sm2ordK0<>(SB)
   937  	MOVQ AX, t0
   938  
   939  	MOVQ sm2ord<>+0x00(SB), AX
   940  	MULQ t0
   941  	ADDQ AX, acc2
   942  	ADCQ $0, DX
   943  	MOVQ DX, t1
   944  
   945  	MOVQ sm2ord<>+0x08(SB), AX
   946  	MULQ t0
   947  	ADDQ t1, acc3
   948  	ADCQ $0, DX
   949  	ADDQ AX, acc3
   950  	ADCQ $0, DX
   951  	MOVQ DX, t1
   952  
   953  	MOVQ sm2ord<>+0x10(SB), AX
   954  	MULQ t0
   955  	ADDQ t1, acc4
   956  	ADCQ $0, DX
   957  	ADDQ AX, acc4
   958  	ADCQ $0, DX
   959  	MOVQ DX, t1
   960  
   961  	MOVQ sm2ord<>+0x18(SB), AX
   962  	MULQ t0
   963  	ADDQ t1, acc5
   964  	ADCQ $0, DX
   965  	ADDQ AX, acc5
   966  	ADCQ DX, acc0
   967  	ADCQ $0, acc1
   968  	// x * y[3]
   969  	MOVQ (8*3)(y_ptr), t0
   970  
   971  	MOVQ (8*0)(x_ptr), AX
   972  	MULQ t0
   973  	ADDQ AX, acc3
   974  	ADCQ $0, DX
   975  	MOVQ DX, t1
   976  
   977  	MOVQ (8*1)(x_ptr), AX
   978  	MULQ t0
   979  	ADDQ t1, acc4
   980  	ADCQ $0, DX
   981  	ADDQ AX, acc4
   982  	ADCQ $0, DX
   983  	MOVQ DX, t1
   984  
   985  	MOVQ (8*2)(x_ptr), AX
   986  	MULQ t0
   987  	ADDQ t1, acc5
   988  	ADCQ $0, DX
   989  	ADDQ AX, acc5
   990  	ADCQ $0, DX
   991  	MOVQ DX, t1
   992  
   993  	MOVQ (8*3)(x_ptr), AX
   994  	MULQ t0
   995  	ADDQ t1, acc0
   996  	ADCQ $0, DX
   997  	ADDQ AX, acc0
   998  	ADCQ DX, acc1
   999  	ADCQ $0, acc2
  1000  	// Last reduction step
  1001  	MOVQ acc3, AX
  1002  	MULQ sm2ordK0<>(SB)
  1003  	MOVQ AX, t0
  1004  
  1005  	MOVQ sm2ord<>+0x00(SB), AX
  1006  	MULQ t0
  1007  	ADDQ AX, acc3
  1008  	ADCQ $0, DX
  1009  	MOVQ DX, t1
  1010  
  1011  	MOVQ sm2ord<>+0x08(SB), AX
  1012  	MULQ t0
  1013  	ADDQ t1, acc4
  1014  	ADCQ $0, DX
  1015  	ADDQ AX, acc4
  1016  	ADCQ $0, DX
  1017  	MOVQ DX, t1
  1018  
  1019  	MOVQ sm2ord<>+0x10(SB), AX
  1020  	MULQ t0
  1021  	ADDQ t1, acc5
  1022  	ADCQ $0, DX
  1023  	ADDQ AX, acc5
  1024  	ADCQ $0, DX
  1025  	MOVQ DX, t1
  1026  
  1027  	MOVQ sm2ord<>+0x18(SB), AX
  1028  	MULQ t0
  1029  	ADDQ t1, acc0
  1030  	ADCQ $0, DX
  1031  	ADDQ AX, acc0
  1032  	ADCQ DX, acc1
  1033  	ADCQ $0, acc2
  1034  	// Copy result [255:0]
        	// Result limbs: acc4, acc5, acc0, acc1, with acc2 as overflow word.
  1035  	MOVQ acc4, x_ptr
  1036  	MOVQ acc5, acc3
  1037  	MOVQ acc0, t0
  1038  	MOVQ acc1, t1
  1039  	// Subtract sm2
  1040  	SUBQ sm2ord<>+0x00(SB), acc4
  1041  	SBBQ sm2ord<>+0x08(SB) ,acc5
  1042  	SBBQ sm2ord<>+0x10(SB), acc0
  1043  	SBBQ sm2ord<>+0x18(SB), acc1
  1044  	SBBQ $0, acc2
  1045  
        	// Borrow: the value was already below sm2ord, restore the copy.
  1046  	CMOVQCS x_ptr, acc4
  1047  	CMOVQCS acc3, acc5
  1048  	CMOVQCS t0, acc0
  1049  	CMOVQCS t1, acc1
  1050  
  1051  	MOVQ acc4, (8*0)(res_ptr)
  1052  	MOVQ acc5, (8*1)(res_ptr)
  1053  	MOVQ acc0, (8*2)(res_ptr)
  1054  	MOVQ acc1, (8*3)(res_ptr)
  1055  
  1056  	RET
  1057  /* ---------------------------------------*/
  1058  // func sm2OrdSqr(res, in []uint64, n int)
        // Repeated modular squaring of a 4-limb value modulo sm2ord. Each pass
        // computes the 512-bit square, folds the low 256 bits away with four
        // word-wise reduction steps, adds the high 256 bits and conditionally
        // subtracts sm2ord; the stored value is x^2 * 2^-256 mod sm2ord. After
        // the first pass the input pointer is redirected to res, so n passes
        // square the value n times in place. The counter is tested after the
        // body: n must be >= 1.
        //
        // Reduction step, with t0 = (lowest live limb * sm2ordK0) mod 2^64:
        // the two low limbs of sm2ord are multiplied by t0 explicitly. The two
        // high limbs are 2^64-1 and 2^64-2^32-1, so their contribution after
        // dropping the lowest limb collapses to
        //   + t0*2^192 - t0*2^160 - t0*2^64
        // which is applied with shifts and subtractions instead of multiplies.
        // Every carry and borrow inside the four live limbs has to be
        // propagated. A carry or borrow leaving the top limb may be dropped,
        // because the step result is always below 2^256 and the arithmetic is
        // therefore exact modulo 2^256.
  1059  TEXT ·sm2OrdSqr(SB),NOSPLIT,$0
  1060  	MOVQ res+0(FP), res_ptr
  1061  	MOVQ in+24(FP), x_ptr
  1062  	MOVQ n+48(FP), BX
  1063  
  1064  ordSqrLoop:
  1065  
  1066  	// y[1:] * y[0]
  1067  	MOVQ (8*0)(x_ptr), t0
  1068  
  1069  	MOVQ (8*1)(x_ptr), AX
  1070  	MULQ t0
  1071  	MOVQ AX, acc1
  1072  	MOVQ DX, acc2
  1073  
  1074  	MOVQ (8*2)(x_ptr), AX
  1075  	MULQ t0
  1076  	ADDQ AX, acc2
  1077  	ADCQ $0, DX
  1078  	MOVQ DX, acc3
  1079  
  1080  	MOVQ (8*3)(x_ptr), AX
  1081  	MULQ t0
  1082  	ADDQ AX, acc3
  1083  	ADCQ $0, DX
  1084  	MOVQ DX, acc4
  1085  	// y[2:] * y[1]
  1086  	MOVQ (8*1)(x_ptr), t0
  1087  
  1088  	MOVQ (8*2)(x_ptr), AX
  1089  	MULQ t0
  1090  	ADDQ AX, acc3
  1091  	ADCQ $0, DX
  1092  	MOVQ DX, t1
  1093  
  1094  	MOVQ (8*3)(x_ptr), AX
  1095  	MULQ t0
  1096  	ADDQ t1, acc4
  1097  	ADCQ $0, DX
  1098  	ADDQ AX, acc4
  1099  	ADCQ $0, DX
  1100  	MOVQ DX, acc5
  1101  	// y[3] * y[2]
  1102  	MOVQ (8*2)(x_ptr), t0
  1103  
  1104  	MOVQ (8*3)(x_ptr), AX
  1105  	MULQ t0
  1106  	ADDQ AX, acc5
  1107  	ADCQ $0, DX
  1108  	MOVQ DX, y_ptr
  1109  	XORQ t1, t1
  1110  	// *2
  1111  	ADDQ acc1, acc1
  1112  	ADCQ acc2, acc2
  1113  	ADCQ acc3, acc3
  1114  	ADCQ acc4, acc4
  1115  	ADCQ acc5, acc5
  1116  	ADCQ y_ptr, y_ptr
  1117  	ADCQ $0, t1
  1118  	// Missing products
  1119  	MOVQ (8*0)(x_ptr), AX
  1120  	MULQ AX
  1121  	MOVQ AX, acc0
  1122  	MOVQ DX, t0
  1123  
  1124  	MOVQ (8*1)(x_ptr), AX
  1125  	MULQ AX
  1126  	ADDQ t0, acc1
  1127  	ADCQ AX, acc2
  1128  	ADCQ $0, DX
  1129  	MOVQ DX, t0
  1130  
  1131  	MOVQ (8*2)(x_ptr), AX
  1132  	MULQ AX
  1133  	ADDQ t0, acc3
  1134  	ADCQ AX, acc4
  1135  	ADCQ $0, DX
  1136  	MOVQ DX, t0
  1137  
  1138  	MOVQ (8*3)(x_ptr), AX
  1139  	MULQ AX
  1140  	ADDQ t0, acc5
  1141  	ADCQ AX, y_ptr
  1142  	ADCQ DX, t1
        	// The input is fully consumed; x_ptr now carries the top limb.
  1143  	MOVQ t1, x_ptr
  1144  	// First reduction step
        	// Live limbs in: acc0, acc1, acc2, acc3; out: acc1, acc2, acc3, acc0.
  1145  	MOVQ acc0, AX
  1146  	MULQ sm2ordK0<>(SB)
  1147  	MOVQ AX, t0
  1148  
  1149  	MOVQ sm2ord<>+0x00(SB), AX
  1150  	MULQ t0
  1151  	ADDQ AX, acc0
  1152  	ADCQ $0, DX
  1153  	MOVQ DX, t1
  1154  
  1155  	MOVQ sm2ord<>+0x08(SB), AX
  1156  	MULQ t0
  1157  	ADDQ t1, acc1
  1158  	ADCQ $0, DX
  1159  	ADDQ AX, acc1
  1160  
  1162  	ADCQ DX, acc2
  1163  	ADCQ $0, acc3
        	// New top limb = t0 + carry out of acc3. MOVQ leaves CF intact.
        	// Without this ADCQ the carry was dropped, which is wrong whenever
        	// acc3 is all ones and a carry arrives from acc2.
        	MOVQ t0, acc0
        	ADCQ $0, acc0
        	// - t0*2^64, borrow rippling up to the top limb
  1164  	SUBQ t0, acc2
  1165  	SBBQ $0, acc3
  1170  	SBBQ $0, acc0
  1166  
  1167  	MOVQ t0, AX
  1168  	MOVQ t0, DX
  1171  	SHLQ $32, AX
  1172  	SHRQ $32, DX
  1173  
        	// - t0*2^160
  1175  	SUBQ AX, acc3
  1176  	SBBQ DX, acc0
  1177  	// Second reduction step
        	// Live limbs in: acc1, acc2, acc3, acc0; out: acc2, acc3, acc0, acc1.
  1178  	MOVQ acc1, AX
  1179  	MULQ sm2ordK0<>(SB)
  1180  	MOVQ AX, t0
  1181  
  1182  	MOVQ sm2ord<>+0x00(SB), AX
  1183  	MULQ t0
  1184  	ADDQ AX, acc1
  1185  	ADCQ $0, DX
  1186  	MOVQ DX, t1
  1187  
  1188  	MOVQ sm2ord<>+0x08(SB), AX
  1189  	MULQ t0
  1190  	ADDQ t1, acc2
  1191  	ADCQ $0, DX
  1192  	ADDQ AX, acc2
  1193  
  1195  	ADCQ DX, acc3
  1196  	ADCQ $0, acc0
        	// New top limb = t0 + carry out of acc0.
        	MOVQ t0, acc1
        	ADCQ $0, acc1
  1197  	SUBQ t0, acc3
  1198  	SBBQ $0, acc0
  1203  	SBBQ $0, acc1
  1199  
  1200  	MOVQ t0, AX
  1201  	MOVQ t0, DX
  1204  	SHLQ $32, AX
  1205  	SHRQ $32, DX
  1206  
  1208  	SUBQ AX, acc0
  1209  	SBBQ DX, acc1
  1210  	// Third reduction step
        	// Live limbs in: acc2, acc3, acc0, acc1; out: acc3, acc0, acc1, acc2.
  1211  	MOVQ acc2, AX
  1212  	MULQ sm2ordK0<>(SB)
  1213  	MOVQ AX, t0
  1214  
  1215  	MOVQ sm2ord<>+0x00(SB), AX
  1216  	MULQ t0
  1217  	ADDQ AX, acc2
  1218  	ADCQ $0, DX
  1219  	MOVQ DX, t1
  1220  
  1221  	MOVQ sm2ord<>+0x08(SB), AX
  1222  	MULQ t0
  1223  	ADDQ t1, acc3
  1224  	ADCQ $0, DX
  1225  	ADDQ AX, acc3
  1226  
  1228  	ADCQ DX, acc0
  1229  	ADCQ $0, acc1
        	// New top limb = t0 + carry out of acc1.
        	MOVQ t0, acc2
        	ADCQ $0, acc2
  1230  	SUBQ t0, acc0
  1231  	SBBQ $0, acc1
  1236  	SBBQ $0, acc2
  1232  
  1233  	MOVQ t0, AX
  1234  	MOVQ t0, DX
  1237  	SHLQ $32, AX
  1238  	SHRQ $32, DX
  1239  
  1241  	SUBQ AX, acc1
  1242  	SBBQ DX, acc2
  1243  	// Last reduction step
        	// Live limbs in: acc3, acc0, acc1, acc2; out: acc0, acc1, acc2, acc3.
  1244  	MOVQ acc3, AX
  1245  	MULQ sm2ordK0<>(SB)
  1246  	MOVQ AX, t0
  1247  
  1248  	MOVQ sm2ord<>+0x00(SB), AX
  1249  	MULQ t0
  1250  	ADDQ AX, acc3
  1251  	ADCQ $0, DX
  1252  	MOVQ DX, t1
  1253  
  1254  	MOVQ sm2ord<>+0x08(SB), AX
  1255  	MULQ t0
  1256  	ADDQ t1, acc0
  1257  	ADCQ $0, DX
  1258  	ADDQ AX, acc0
  1261  
  1263  	ADCQ DX, acc1
  1264  	ADCQ $0, acc2
        	// New top limb = t0 + carry out of acc2.
        	MOVQ t0, acc3
        	ADCQ $0, acc3
  1265  	SUBQ t0, acc1
  1266  	SBBQ $0, acc2
  1271  	SBBQ $0, acc3
  1267  
  1268  	MOVQ t0, AX
  1269  	MOVQ t0, DX
  1272  	SHLQ $32, AX
  1273  	SHRQ $32, DX
  1274  
  1276  	SUBQ AX, acc2
  1277  	SBBQ DX, acc3
  1278  	XORQ t0, t0
  1279  	// Add bits [511:256] of the sqr result
  1280  	ADDQ acc4, acc0
  1281  	ADCQ acc5, acc1
  1282  	ADCQ y_ptr, acc2
  1283  	ADCQ x_ptr, acc3
  1284  	ADCQ $0, t0
  1285  
        	// Keep the unreduced sum for the conditional select below.
  1286  	MOVQ acc0, acc4
  1287  	MOVQ acc1, acc5
  1288  	MOVQ acc2, y_ptr
  1289  	MOVQ acc3, t1
  1290  	// Subtract sm2
  1291  	SUBQ sm2ord<>+0x00(SB), acc0
  1292  	SBBQ sm2ord<>+0x08(SB) ,acc1
  1293  	SBBQ sm2ord<>+0x10(SB), acc2
  1294  	SBBQ sm2ord<>+0x18(SB), acc3
  1295  	SBBQ $0, t0
  1296  
        	// Borrow: the sum was already below sm2ord, restore it.
  1297  	CMOVQCS acc4, acc0
  1298  	CMOVQCS acc5, acc1
  1299  	CMOVQCS y_ptr, acc2
  1300  	CMOVQCS t1, acc3
  1301  
  1302  	MOVQ acc0, (8*0)(res_ptr)
  1303  	MOVQ acc1, (8*1)(res_ptr)
  1304  	MOVQ acc2, (8*2)(res_ptr)
  1305  	MOVQ acc3, (8*3)(res_ptr)
        	// Subsequent passes square the value just written.
  1306  	MOVQ res_ptr, x_ptr
  1307  	DECQ BX
  1308  	JNE ordSqrLoop
  1309  
  1310  	RET
  1311  /* ---------------------------------------*/
        // Drop the alias set used by the exported routines above ...
  1312  #undef res_ptr
  1313  #undef x_ptr
  1314  #undef y_ptr
  1315  
  1316  #undef acc0
  1317  #undef acc1
  1318  #undef acc2
  1319  #undef acc3
  1320  #undef acc4
  1321  #undef acc5
  1322  #undef t0
  1323  #undef t1
  1324  /* ---------------------------------------*/
        // ... and install the set used by the *Internal helpers below, which take
        // their operands in registers instead of reading the argument frame.
        // Note that acc0..acc5, t0 and t1 name different registers than before:
        // mul0/mul1 are the MULQ operand/result pair, acc0..acc7 eight limbs,
        // t0..t3 a second 4-limb operand, hlp a scratch carry word.
  1325  #define mul0 AX
  1326  #define mul1 DX
  1327  #define acc0 BX
  1328  #define acc1 CX
  1329  #define acc2 R8
  1330  #define acc3 R9
  1331  #define acc4 R10
  1332  #define acc5 R11
  1333  #define acc6 R12
  1334  #define acc7 R13
  1335  #define t0 R14
  1336  #define t1 R15
  1337  #define t2 DI
  1338  #define t3 SI
  1339  #define hlp BP
  1340  /* ---------------------------------------*/
  1341  TEXT sm2SubInternal(SB),NOSPLIT | DUPOK,$0	// acc[4..7] = acc[4..7] - t[0..3], with p added back on borrow; clobbers acc0..acc3, mul0, flags
  1342  	XORQ mul0, mul0	// mul0 will record the borrow
  1343  	SUBQ t0, acc4
  1344  	SBBQ t1, acc5
  1345  	SBBQ t2, acc6
  1346  	SBBQ t3, acc7
  1347  	SBBQ $0, mul0	// mul0 = all ones if the subtraction borrowed, else 0
  1348  	// Save the raw difference in acc0..acc3
  1349  	MOVQ acc4, acc0
  1350  	MOVQ acc5, acc1
  1351  	MOVQ acc6, acc2
  1352  	MOVQ acc7, acc3
  1353  	// Speculatively add p = {-1, sm2const0, -1, sm2const1} (least significant limb first)
  1354  	ADDQ $-1, acc4
  1355  	ADCQ sm2const0<>(SB), acc5
  1356  	ADCQ $-1, acc6
  1357  	ADCQ sm2const1<>(SB), acc7
  1358  	ANDQ $1, mul0	// ZF=1 iff there was no borrow
  1359  	// No borrow: restore the raw difference; borrow: keep difference + p
  1360  	CMOVQEQ acc0, acc4
  1361  	CMOVQEQ acc1, acc5
  1362  	CMOVQEQ acc2, acc6
  1363  	CMOVQEQ acc3, acc7
  1364  
  1365  	RET
  1366  /* ---------------------------------------*/
  1367  TEXT sm2MulInternal(SB),NOSPLIT | DUPOK,$0	// acc[4..7] = acc[4..7] * t[0..3] * 2^-256 mod p; t[0..3] is only read; clobbers acc0..acc3, mul0, mul1, hlp, flags
  1368  	MOVQ acc4, mul0	// row 0: acc4 * t[0..3] -> acc0..acc3, top limb goes back into acc4
  1369  	MULQ t0
  1370  	MOVQ mul0, acc0
  1371  	MOVQ mul1, acc1
  1372  
  1373  	MOVQ acc4, mul0
  1374  	MULQ t1
  1375  	ADDQ mul0, acc1
  1376  	ADCQ $0, mul1
  1377  	MOVQ mul1, acc2
  1378  
  1379  	MOVQ acc4, mul0
  1380  	MULQ t2
  1381  	ADDQ mul0, acc2
  1382  	ADCQ $0, mul1
  1383  	MOVQ mul1, acc3
  1384  
  1385  	MOVQ acc4, mul0
  1386  	MULQ t3
  1387  	ADDQ mul0, acc3
  1388  	ADCQ $0, mul1
  1389  	MOVQ mul1, acc4	// acc4 is fully consumed; reuse it as product limb 4
  1390  	// row 1: acc5 * t[0..3] added into acc1..acc4; hlp carries the high half between columns
  1391  	MOVQ acc5, mul0
  1392  	MULQ t0
  1393  	ADDQ mul0, acc1
  1394  	ADCQ $0, mul1
  1395  	MOVQ mul1, hlp
  1396  
  1397  	MOVQ acc5, mul0
  1398  	MULQ t1
  1399  	ADDQ hlp, acc2
  1400  	ADCQ $0, mul1
  1401  	ADDQ mul0, acc2
  1402  	ADCQ $0, mul1
  1403  	MOVQ mul1, hlp
  1404  
  1405  	MOVQ acc5, mul0
  1406  	MULQ t2
  1407  	ADDQ hlp, acc3
  1408  	ADCQ $0, mul1
  1409  	ADDQ mul0, acc3
  1410  	ADCQ $0, mul1
  1411  	MOVQ mul1, hlp
  1412  
  1413  	MOVQ acc5, mul0
  1414  	MULQ t3
  1415  	ADDQ hlp, acc4
  1416  	ADCQ $0, mul1
  1417  	ADDQ mul0, acc4
  1418  	ADCQ $0, mul1
  1419  	MOVQ mul1, acc5	// product limb 5
  1420  	// row 2: acc6 * t[0..3] added into acc2..acc5
  1421  	MOVQ acc6, mul0
  1422  	MULQ t0
  1423  	ADDQ mul0, acc2
  1424  	ADCQ $0, mul1
  1425  	MOVQ mul1, hlp
  1426  
  1427  	MOVQ acc6, mul0
  1428  	MULQ t1
  1429  	ADDQ hlp, acc3
  1430  	ADCQ $0, mul1
  1431  	ADDQ mul0, acc3
  1432  	ADCQ $0, mul1
  1433  	MOVQ mul1, hlp
  1434  
  1435  	MOVQ acc6, mul0
  1436  	MULQ t2
  1437  	ADDQ hlp, acc4
  1438  	ADCQ $0, mul1
  1439  	ADDQ mul0, acc4
  1440  	ADCQ $0, mul1
  1441  	MOVQ mul1, hlp
  1442  
  1443  	MOVQ acc6, mul0
  1444  	MULQ t3
  1445  	ADDQ hlp, acc5
  1446  	ADCQ $0, mul1
  1447  	ADDQ mul0, acc5
  1448  	ADCQ $0, mul1
  1449  	MOVQ mul1, acc6	// product limb 6
  1450  	// row 3: acc7 * t[0..3] added into acc3..acc6
  1451  	MOVQ acc7, mul0
  1452  	MULQ t0
  1453  	ADDQ mul0, acc3
  1454  	ADCQ $0, mul1
  1455  	MOVQ mul1, hlp
  1456  
  1457  	MOVQ acc7, mul0
  1458  	MULQ t1
  1459  	ADDQ hlp, acc4
  1460  	ADCQ $0, mul1
  1461  	ADDQ mul0, acc4
  1462  	ADCQ $0, mul1
  1463  	MOVQ mul1, hlp
  1464  
  1465  	MOVQ acc7, mul0
  1466  	MULQ t2
  1467  	ADDQ hlp, acc5
  1468  	ADCQ $0, mul1
  1469  	ADDQ mul0, acc5
  1470  	ADCQ $0, mul1
  1471  	MOVQ mul1, hlp
  1472  
  1473  	MOVQ acc7, mul0
  1474  	MULQ t3
  1475  	ADDQ hlp, acc6
  1476  	ADCQ $0, mul1
  1477  	ADDQ mul0, acc6
  1478  	ADCQ $0, mul1
  1479  	MOVQ mul1, acc7	// product limb 7; the 512-bit product is [acc0..acc7]
  1480  	// First reduction step: add acc0*p so the lowest limb cancels (lowest limb of p is -1), then drop that limb
  1481  	MOVQ acc0, mul0	// mul0 starts as the +acc0*2^256 term
  1482  	MOVQ acc0, hlp
  1483  	ADDQ acc0, acc1	// +acc0*2^64
  1484  	ADCQ $0, acc2
  1485  	ADCQ $0, acc3
  1486  	ADCQ $0, mul0
  1487  	SHLQ $32, hlp	// hlp = low 64 bits of acc0*2^32
  1488  	SHRQ $32, acc0	// acc0 = bits of acc0*2^32 above 2^64
  1489  	SUBQ hlp, acc1	// -acc0*2^96, spread over this limb and the next
  1490  	SBBQ acc0, acc2
  1491  	SBBQ hlp, acc3	// -acc0*2^224, spread over this limb and the next
  1492  	SBBQ acc0, mul0
  1493  	MOVQ mul0, acc0	// running value is now [acc1, acc2, acc3, acc0]
  1494  	// Second reduction step: same with acc1 as the multiplier
  1495  	MOVQ acc1, mul0
  1496  	MOVQ acc1, hlp
  1497  	ADDQ acc1, acc2
  1498  	ADCQ $0, acc3
  1499  	ADCQ $0, acc0
  1500  	ADCQ $0, mul0
  1501  	SHLQ $32, hlp
  1502  	SHRQ $32, acc1
  1503  	SUBQ hlp, acc2
  1504  	SBBQ acc1, acc3
  1505  	SBBQ hlp, acc0
  1506  	SBBQ acc1, mul0
  1507  	MOVQ mul0, acc1	// running value is now [acc2, acc3, acc0, acc1]
  1508  	// Third reduction step: same with acc2 as the multiplier
  1509  	MOVQ acc2, mul0
  1510  	MOVQ acc2, hlp
  1511  	ADDQ acc2, acc3
  1512  	ADCQ $0, acc0
  1513  	ADCQ $0, acc1
  1514  	ADCQ $0, mul0
  1515  	SHLQ $32, hlp
  1516  	SHRQ $32, acc2
  1517  	SUBQ hlp, acc3
  1518  	SBBQ acc2, acc0
  1519  	SBBQ hlp, acc1
  1520  	SBBQ acc2, mul0
  1521  	MOVQ mul0, acc2	// running value is now [acc3, acc0, acc1, acc2]
  1522  	// Last reduction step: same with acc3 as the multiplier
  1523  	MOVQ acc3, mul0
  1524  	MOVQ acc3, hlp
  1525  	ADDQ acc3, acc0
  1526  	ADCQ $0, acc1
  1527  	ADCQ $0, acc2
  1528  	ADCQ $0, mul0
  1529  	SHLQ $32, hlp
  1530  	SHRQ $32, acc3
  1531  	SUBQ hlp, acc0
  1532  	SBBQ acc3, acc1
  1533  	SBBQ hlp, acc2
  1534  	SBBQ acc3, mul0
  1535  	MOVQ mul0, acc3	// running value is now [acc0, acc1, acc2, acc3]
  1536  	MOVQ $0, BP	// zero hlp (= BP) with MOVQ so the flags survive; the ADCQ below consumes CF from the SBBQ above. NOTE(review): that borrow is expected to be 0 - confirm
  1537  	// Add bits [511:256] of the result
  1538  	ADCQ acc0, acc4
  1539  	ADCQ acc1, acc5
  1540  	ADCQ acc2, acc6
  1541  	ADCQ acc3, acc7
  1542  	ADCQ $0, hlp	// hlp = carry out of the 256-bit addition
  1543  	// Copy result
  1544  	MOVQ acc4, acc0
  1545  	MOVQ acc5, acc1
  1546  	MOVQ acc6, acc2
  1547  	MOVQ acc7, acc3
  1548  	// Subtract p (the SM2 prime)
  1549  	SUBQ $-1, acc4
  1550  	SBBQ sm2const0<>(SB) ,acc5
  1551  	SBBQ $-1, acc6
  1552  	SBBQ sm2const1<>(SB), acc7
  1553  	SBBQ $0, hlp	// CF is set iff the 257-bit value hlp:acc was smaller than p
  1554  	// If the result of the subtraction is negative, restore the previous result
  1555  	CMOVQCS acc0, acc4
  1556  	CMOVQCS acc1, acc5
  1557  	CMOVQCS acc2, acc6
  1558  	CMOVQCS acc3, acc7
  1559  
  1560  	RET
  1561  /* ---------------------------------------*/
  1562  TEXT sm2SqrInternal(SB),NOSPLIT | DUPOK,$0	// acc[4..7] = acc[4..7]^2 * 2^-256 mod p; clobbers acc0..acc3, t0..t3, mul0, mul1, hlp, flags
  1563  	// Cross products acc_i * acc_j (i < j), accumulated into [acc1, acc2, acc3, t0, t1, t2]
  1564  	MOVQ acc4, mul0
  1565  	MULQ acc5
  1566  	MOVQ mul0, acc1
  1567  	MOVQ mul1, acc2
  1568  
  1569  	MOVQ acc4, mul0
  1570  	MULQ acc6
  1571  	ADDQ mul0, acc2
  1572  	ADCQ $0, mul1
  1573  	MOVQ mul1, acc3
  1574  
  1575  	MOVQ acc4, mul0
  1576  	MULQ acc7
  1577  	ADDQ mul0, acc3
  1578  	ADCQ $0, mul1
  1579  	MOVQ mul1, t0
  1580  
  1581  	MOVQ acc5, mul0
  1582  	MULQ acc6
  1583  	ADDQ mul0, acc3
  1584  	ADCQ $0, mul1
  1585  	MOVQ mul1, hlp
  1586  
  1587  	MOVQ acc5, mul0
  1588  	MULQ acc7
  1589  	ADDQ hlp, t0
  1590  	ADCQ $0, mul1
  1591  	ADDQ mul0, t0
  1592  	ADCQ $0, mul1
  1593  	MOVQ mul1, t1
  1594  
  1595  	MOVQ acc6, mul0
  1596  	MULQ acc7
  1597  	ADDQ mul0, t1
  1598  	ADCQ $0, mul1
  1599  	MOVQ mul1, t2
  1600  	XORQ t3, t3	// t3 receives the carry out of the doubling below
  1601  	// *2: every cross product occurs twice in the square
  1602  	ADDQ acc1, acc1
  1603  	ADCQ acc2, acc2
  1604  	ADCQ acc3, acc3
  1605  	ADCQ t0, t0
  1606  	ADCQ t1, t1
  1607  	ADCQ t2, t2
  1608  	ADCQ $0, t3
  1609  	// Missing products: the diagonal squares acc_i * acc_i
  1610  	MOVQ acc4, mul0
  1611  	MULQ mul0
  1612  	MOVQ mul0, acc0
  1613  	MOVQ DX, acc4	// from here on acc4 only holds the pending high half
  1614  
  1615  	MOVQ acc5, mul0
  1616  	MULQ mul0
  1617  	ADDQ acc4, acc1
  1618  	ADCQ mul0, acc2
  1619  	ADCQ $0, DX
  1620  	MOVQ DX, acc4
  1621  
  1622  	MOVQ acc6, mul0
  1623  	MULQ mul0
  1624  	ADDQ acc4, acc3
  1625  	ADCQ mul0, t0
  1626  	ADCQ $0, DX
  1627  	MOVQ DX, acc4
  1628  
  1629  	MOVQ acc7, mul0
  1630  	MULQ mul0
  1631  	ADDQ acc4, t1
  1632  	ADCQ mul0, t2
  1633  	ADCQ DX, t3	// the 512-bit square is [acc0, acc1, acc2, acc3, t0, t1, t2, t3]
  1634  	// First reduction step: add acc0*p so the lowest limb cancels (lowest limb of p is -1), then drop that limb
  1635  	MOVQ acc0, mul0	// mul0 starts as the +acc0*2^256 term
  1636  	MOVQ acc0, hlp
  1637  	ADDQ acc0, acc1	// +acc0*2^64
  1638  	ADCQ $0, acc2
  1639  	ADCQ $0, acc3
  1640  	ADCQ $0, mul0
  1641  	SHLQ $32, hlp	// hlp = low 64 bits of acc0*2^32
  1642  	SHRQ $32, acc0	// acc0 = bits of acc0*2^32 above 2^64
  1643  	SUBQ hlp, acc1	// -acc0*2^96, spread over this limb and the next
  1644  	SBBQ acc0, acc2
  1645  	SBBQ hlp, acc3	// -acc0*2^224, spread over this limb and the next
  1646  	SBBQ acc0, mul0
  1647  	MOVQ mul0, acc0	// running value is now [acc1, acc2, acc3, acc0]
  1648  	// Second reduction step: same with acc1 as the multiplier
  1649  	MOVQ acc1, mul0
  1650  	MOVQ acc1, hlp
  1651  	ADDQ acc1, acc2
  1652  	ADCQ $0, acc3
  1653  	ADCQ $0, acc0
  1654  	ADCQ $0, mul0
  1655  	SHLQ $32, hlp
  1656  	SHRQ $32, acc1
  1657  	SUBQ hlp, acc2
  1658  	SBBQ acc1, acc3
  1659  	SBBQ hlp, acc0
  1660  	SBBQ acc1, mul0
  1661  	MOVQ mul0, acc1	// running value is now [acc2, acc3, acc0, acc1]
  1662  	// Third reduction step: same with acc2 as the multiplier
  1663  	MOVQ acc2, mul0
  1664  	MOVQ acc2, hlp
  1665  	ADDQ acc2, acc3
  1666  	ADCQ $0, acc0
  1667  	ADCQ $0, acc1
  1668  	ADCQ $0, mul0
  1669  	SHLQ $32, hlp
  1670  	SHRQ $32, acc2
  1671  	SUBQ hlp, acc3
  1672  	SBBQ acc2, acc0
  1673  	SBBQ hlp, acc1
  1674  	SBBQ acc2, mul0
  1675  	MOVQ mul0, acc2	// running value is now [acc3, acc0, acc1, acc2]
  1676  	// Last reduction step: same with acc3 as the multiplier
  1677  	MOVQ acc3, mul0
  1678  	MOVQ acc3, hlp
  1679  	ADDQ acc3, acc0
  1680  	ADCQ $0, acc1
  1681  	ADCQ $0, acc2
  1682  	ADCQ $0, mul0
  1683  	SHLQ $32, hlp
  1684  	SHRQ $32, acc3
  1685  	SUBQ hlp, acc0
  1686  	SBBQ acc3, acc1
  1687  	SBBQ hlp, acc2
  1688  	SBBQ acc3, mul0
  1689  	MOVQ mul0, acc3	// running value is now [acc0, acc1, acc2, acc3]
  1690  	MOVQ $0, BP	// zero hlp (= BP) with MOVQ so the flags survive; the ADCQ below consumes CF from the SBBQ above. NOTE(review): that borrow is expected to be 0 - confirm
  1691  	// Add bits [511:256] of the result
  1692  	ADCQ acc0, t0
  1693  	ADCQ acc1, t1
  1694  	ADCQ acc2, t2
  1695  	ADCQ acc3, t3
  1696  	ADCQ $0, hlp	// hlp = carry out of the 256-bit addition
  1697  	// Copy result
  1698  	MOVQ t0, acc4
  1699  	MOVQ t1, acc5
  1700  	MOVQ t2, acc6
  1701  	MOVQ t3, acc7
  1702  	// Subtract p (the SM2 prime)
  1703  	SUBQ $-1, acc4
  1704  	SBBQ sm2const0<>(SB) ,acc5
  1705  	SBBQ $-1, acc6
  1706  	SBBQ sm2const1<>(SB), acc7
  1707  	SBBQ $0, hlp	// CF is set iff the 257-bit value hlp:t was smaller than p
  1708  	// If the result of the subtraction is negative, restore the previous result
  1709  	CMOVQCS t0, acc4
  1710  	CMOVQCS t1, acc5
  1711  	CMOVQCS t2, acc6
  1712  	CMOVQCS t3, acc7
  1713  
  1714  	RET
  1715  /* sm2MulBy2Inline: t[0..3] = 2 * acc[4..7], reduced by one conditional subtraction of p. acc[4..7] is doubled in place (low 256 bits, unreduced); mul0 and the flags are clobbered. */
  1716  #define sm2MulBy2Inline\
  1717  	XORQ mul0, mul0;\
  1718  	ADDQ acc4, acc4;\
  1719  	ADCQ acc5, acc5;\
  1720  	ADCQ acc6, acc6;\
  1721  	ADCQ acc7, acc7;\
  1722  	ADCQ $0, mul0;\
  1723  	MOVQ acc4, t0;\
  1724  	MOVQ acc5, t1;\
  1725  	MOVQ acc6, t2;\
  1726  	MOVQ acc7, t3;\
  1727  	SUBQ $-1, t0;\
  1728  	SBBQ sm2const0<>(SB), t1;\
  1729  	SBBQ $-1, t2;\
  1730  	SBBQ sm2const1<>(SB), t3;\
  1731  	SBBQ $0, mul0;\
  1732  	CMOVQCS acc4, t0;\
  1733  	CMOVQCS acc5, t1;\
  1734  	CMOVQCS acc6, t2;\
  1735  	CMOVQCS acc7, t3;
  1736  /* sm2AddInline: t[0..3] = acc[4..7] + t[0..3], reduced by one conditional subtraction of p. acc[4..7] is left holding the low 256 bits of the unreduced sum; mul0 and the flags are clobbered. */
  1737  #define sm2AddInline \
  1738  	XORQ mul0, mul0;\
  1739  	ADDQ t0, acc4;\
  1740  	ADCQ t1, acc5;\
  1741  	ADCQ t2, acc6;\
  1742  	ADCQ t3, acc7;\
  1743  	ADCQ $0, mul0;\
  1744  	MOVQ acc4, t0;\
  1745  	MOVQ acc5, t1;\
  1746  	MOVQ acc6, t2;\
  1747  	MOVQ acc7, t3;\
  1748  	SUBQ $-1, t0;\
  1749  	SBBQ sm2const0<>(SB), t1;\
  1750  	SBBQ $-1, t2;\
  1751  	SBBQ sm2const1<>(SB), t3;\
  1752  	SBBQ $0, mul0;\
  1753  	CMOVQCS acc4, t0;\
  1754  	CMOVQCS acc5, t1;\
  1755  	CMOVQCS acc6, t2;\
  1756  	CMOVQCS acc7, t3;
  1757  /* Load/store helpers. src/dst is a one-argument offset macro such as x1in. LDacc/LDt load four limbs into acc[4..7]/t[0..3], ST/STt store them back, acc2t/t2acc copy one register set into the other. */
  1758  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1759  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1760  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1761  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1762  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1763  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1764  /* ---------------------------------------*/
  1765  #define x1in(off) (32*0 + off)(SP)
  1766  #define y1in(off) (32*1 + off)(SP)
  1767  #define z1in(off) (32*2 + off)(SP)
  1768  #define x2in(off) (32*3 + off)(SP)
  1769  #define y2in(off) (32*4 + off)(SP)
  1770  #define xout(off) (32*5 + off)(SP)
  1771  #define yout(off) (32*6 + off)(SP)
  1772  #define zout(off) (32*7 + off)(SP)
  1773  #define s2(off)   (32*8 + off)(SP)
  1774  #define z1sqr(off) (32*9 + off)(SP)
  1775  #define h(off)	  (32*10 + off)(SP)
  1776  #define r(off)	  (32*11 + off)(SP)
  1777  #define hsqr(off) (32*12 + off)(SP)
  1778  #define rsqr(off) (32*13 + off)(SP)
  1779  #define hcub(off) (32*14 + off)(SP)
  1780  #define rptr	  (32*15)(SP)
  1781  #define sel_save  (32*15 + 8)(SP)
  1782  #define zero_save (32*15 + 8 + 4)(SP)
  1783  // Frame layout (defines above): fifteen 32-byte field-element slots, then the result pointer and 32-bit copies of sel and zero.
  1784  // func sm2PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
  1785  TEXT ·sm2PointAddAffineAsm(SB),0,$512-96	// res = in1 + in2 with in1 = (x1, y1, z1) and in2 = (x2, y2); y2 is negated when sign != 0; sel == 0 yields in1, zero == 0 yields (x2, y2, sm2one)
  1786  	// Move input to stack in order to free registers
  1787  	MOVQ res+0(FP), AX
  1788  	MOVQ in1+24(FP), BX
  1789  	MOVQ in2+48(FP), CX
  1790  	MOVQ sign+72(FP), DX
  1791  	MOVQ sel+80(FP), t1
  1792  	MOVQ zero+88(FP), t2
  1793  	// Copy x1, y1, z1 (96 bytes) from in1 to the stack
  1794  	MOVOU (16*0)(BX), X0
  1795  	MOVOU (16*1)(BX), X1
  1796  	MOVOU (16*2)(BX), X2
  1797  	MOVOU (16*3)(BX), X3
  1798  	MOVOU (16*4)(BX), X4
  1799  	MOVOU (16*5)(BX), X5
  1800  
  1801  	MOVOU X0, x1in(16*0)
  1802  	MOVOU X1, x1in(16*1)
  1803  	MOVOU X2, y1in(16*0)
  1804  	MOVOU X3, y1in(16*1)
  1805  	MOVOU X4, z1in(16*0)
  1806  	MOVOU X5, z1in(16*1)
  1807  	// Copy x2 (32 bytes) from in2; y2 is handled separately below
  1808  	MOVOU (16*0)(CX), X0
  1809  	MOVOU (16*1)(CX), X1
  1810  
  1811  	MOVOU X0, x2in(16*0)
  1812  	MOVOU X1, x2in(16*1)
  1813  	// Store pointer to result
  1814  	MOVQ mul0, rptr	// mul0 is AX, which still holds res
  1815  	MOVL t1, sel_save	// only the low 32 bits of sel are kept
  1816  	MOVL t2, zero_save	// only the low 32 bits of zero are kept
  1817  	// Negate y2in based on sign
  1818  	MOVQ (16*2 + 8*0)(CX), acc4
  1819  	MOVQ (16*2 + 8*1)(CX), acc5
  1820  	MOVQ (16*2 + 8*2)(CX), acc6
  1821  	MOVQ (16*2 + 8*3)(CX), acc7
  1822  	MOVQ $-1, acc0	// [acc0..acc3] = p
  1823  	MOVQ sm2const0<>(SB), acc1
  1824  	MOVQ $-1, acc2
  1825  	MOVQ sm2const1<>(SB), acc3
  1826  	XORQ mul0, mul0
  1827  	// Speculatively subtract
  1828  	SUBQ acc4, acc0
  1829  	SBBQ acc5, acc1
  1830  	SBBQ acc6, acc2
  1831  	SBBQ acc7, acc3
  1832  	SBBQ $0, mul0	// mul0 = all ones if the subtraction borrowed (y2 > p), else 0
  1833  	MOVQ acc0, t0
  1834  	MOVQ acc1, t1
  1835  	MOVQ acc2, t2
  1836  	MOVQ acc3, t3
  1837  	// Add p back in case the operand was > p. NOTE(review): if the subtraction does not borrow and this addition does not carry, 2p - y2 (>= p) is kept - confirm that is acceptable
  1838  	ADDQ $-1, acc0
  1839  	ADCQ sm2const0<>(SB), acc1
  1840  	ADCQ $-1, acc2
  1841  	ADCQ sm2const1<>(SB), acc3
  1842  	ADCQ $0, mul0
  1843  	CMOVQNE t0, acc0	// mul0 != 0: take the plain difference p - y2 saved in t[0..3]
  1844  	CMOVQNE t1, acc1
  1845  	CMOVQNE t2, acc2
  1846  	CMOVQNE t3, acc3
  1847  	// If sign is 0, keep the original y2
  1848  	TESTQ DX, DX
  1849  	CMOVQEQ acc4, acc0
  1850  	CMOVQEQ acc5, acc1
  1851  	CMOVQEQ acc6, acc2
  1852  	CMOVQEQ acc7, acc3
  1853  	// Store result
  1854  	MOVQ acc0, y2in(8*0)
  1855  	MOVQ acc1, y2in(8*1)
  1856  	MOVQ acc2, y2in(8*2)
  1857  	MOVQ acc3, y2in(8*3)
  1858  	// Begin point add
  1859  	LDacc (z1in)
  1860  	CALL sm2SqrInternal(SB)	// z1^2
  1861  	ST (z1sqr)
  1862  
  1863  	LDt (x2in)
  1864  	CALL sm2MulInternal(SB)	// x2 * z1^2
  1865  
  1866  	LDt (x1in)
  1867  	CALL sm2SubInternal(SB)	// h = u2 - u1
  1868  	ST (h)
  1869  
  1870  	LDt (z1in)
  1871  	CALL sm2MulInternal(SB)	// z3 = h * z1
  1872  	ST (zout)
  1873  
  1874  	LDacc (z1sqr)
  1875  	CALL sm2MulInternal(SB)	// z1^3 (t still holds z1)
  1876  
  1877  	LDt (y2in)
  1878  	CALL sm2MulInternal(SB)	// s2 = y2 * z1^3
  1879  	ST (s2)
  1880  
  1881  	LDt (y1in)
  1882  	CALL sm2SubInternal(SB)	// r = s2 - s1
  1883  	ST (r)
  1884  
  1885  	CALL sm2SqrInternal(SB)	// rsqr = r^2
  1886  	ST (rsqr)
  1887  
  1888  	LDacc (h)
  1889  	CALL sm2SqrInternal(SB)	// hsqr = h^2
  1890  	ST (hsqr)
  1891  
  1892  	LDt (h)
  1893  	CALL sm2MulInternal(SB)	// hcub = h^3
  1894  	ST (hcub)
  1895  
  1896  	LDt (y1in)
  1897  	CALL sm2MulInternal(SB)	// y1 * h^3
  1898  	ST (s2)	// the s2 slot is reused for y1 * h^3
  1899  
  1900  	LDacc (x1in)
  1901  	LDt (hsqr)
  1902  	CALL sm2MulInternal(SB)	// u1 * h^2
  1903  	ST (h)	// the h slot is reused for u1 * h^2
  1904  
  1905  	sm2MulBy2Inline			// u1 * h^2 * 2, inline
  1906  	LDacc (rsqr)
  1907  	CALL sm2SubInternal(SB)	// r^2 - u1 * h^2 * 2
  1908  
  1909  	LDt (hcub)
  1910  	CALL sm2SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
  1911  	ST (xout)
  1912  
  1913  	MOVQ acc4, t0
  1914  	MOVQ acc5, t1
  1915  	MOVQ acc6, t2
  1916  	MOVQ acc7, t3
  1917  	LDacc (h)
  1918  	CALL sm2SubInternal(SB)	// u1 * h^2 - x3
  1919  
  1920  	LDt (r)
  1921  	CALL sm2MulInternal(SB)	// r * (u1 * h^2 - x3)
  1922  
  1923  	LDt (s2)
  1924  	CALL sm2SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - y1 * h^3
  1925  	ST (yout)
  1926  	// Load stored values from stack
  1927  	MOVQ rptr, AX
  1928  	MOVL sel_save, BX
  1929  	MOVL zero_save, CX
  1930  	// The result is not valid if (sel == 0), conditional choose
  1931  	MOVOU xout(16*0), X0
  1932  	MOVOU xout(16*1), X1
  1933  	MOVOU yout(16*0), X2
  1934  	MOVOU yout(16*1), X3
  1935  	MOVOU zout(16*0), X4
  1936  	MOVOU zout(16*1), X5
  1937  
  1938  	MOVL BX, X6
  1939  	MOVL CX, X7
  1940  
  1941  	PXOR X8, X8	// X8 = 0
  1942  	PCMPEQL X9, X9	// X9 = all ones
  1943  
  1944  	PSHUFD $0, X6, X6	// broadcast sel to all four lanes
  1945  	PSHUFD $0, X7, X7	// broadcast zero to all four lanes
  1946  
  1947  	PCMPEQL X8, X6	// X6 = all ones if sel == 0
  1948  	PCMPEQL X8, X7	// X7 = all ones if zero == 0
  1949  
  1950  	MOVOU X6, X15
  1951  	PANDN X9, X15	// X15 = NOT X6
  1952  
  1953  	MOVOU x1in(16*0), X9
  1954  	MOVOU x1in(16*1), X10
  1955  	MOVOU y1in(16*0), X11
  1956  	MOVOU y1in(16*1), X12
  1957  	MOVOU z1in(16*0), X13
  1958  	MOVOU z1in(16*1), X14
  1959  	// Keep the computed sum only when sel != 0 ...
  1960  	PAND X15, X0
  1961  	PAND X15, X1
  1962  	PAND X15, X2
  1963  	PAND X15, X3
  1964  	PAND X15, X4
  1965  	PAND X15, X5
  1966  	// ... and in1 only when sel == 0, then merge the two
  1967  	PAND X6, X9
  1968  	PAND X6, X10
  1969  	PAND X6, X11
  1970  	PAND X6, X12
  1971  	PAND X6, X13
  1972  	PAND X6, X14
  1973  
  1974  	PXOR X9, X0
  1975  	PXOR X10, X1
  1976  	PXOR X11, X2
  1977  	PXOR X12, X3
  1978  	PXOR X13, X4
  1979  	PXOR X14, X5
  1980  	// Similarly, if zero == 0 the result becomes (x2in, y2in, sm2one)
  1981  	PCMPEQL X9, X9
  1982  	MOVOU X7, X15
  1983  	PANDN X9, X15	// X15 = NOT X7
  1984  
  1985  	MOVOU x2in(16*0), X9
  1986  	MOVOU x2in(16*1), X10
  1987  	MOVOU y2in(16*0), X11
  1988  	MOVOU y2in(16*1), X12
  1989  	MOVOU sm2one<>+0x00(SB), X13
  1990  	MOVOU sm2one<>+0x10(SB), X14
  1991  
  1992  	PAND X15, X0
  1993  	PAND X15, X1
  1994  	PAND X15, X2
  1995  	PAND X15, X3
  1996  	PAND X15, X4
  1997  	PAND X15, X5
  1998  
  1999  	PAND X7, X9
  2000  	PAND X7, X10
  2001  	PAND X7, X11
  2002  	PAND X7, X12
  2003  	PAND X7, X13
  2004  	PAND X7, X14
  2005  
  2006  	PXOR X9, X0
  2007  	PXOR X10, X1
  2008  	PXOR X11, X2
  2009  	PXOR X12, X3
  2010  	PXOR X13, X4
  2011  	PXOR X14, X5
  2012  	// Finally output the result
  2013  	MOVOU X0, (16*0)(AX)
  2014  	MOVOU X1, (16*1)(AX)
  2015  	MOVOU X2, (16*2)(AX)
  2016  	MOVOU X3, (16*3)(AX)
  2017  	MOVOU X4, (16*4)(AX)
  2018  	MOVOU X5, (16*5)(AX)
  2019  	MOVQ $0, rptr	// clear the stack copy of the result pointer
  2020  
  2021  	RET
  2022  #undef x1in
  2023  #undef y1in
  2024  #undef z1in
  2025  #undef x2in
  2026  #undef y2in
  2027  #undef xout
  2028  #undef yout
  2029  #undef zout
  2030  #undef s2
  2031  #undef z1sqr
  2032  #undef h
  2033  #undef r
  2034  #undef hsqr
  2035  #undef rsqr
  2036  #undef hcub
  2037  #undef rptr
  2038  #undef sel_save
  2039  #undef zero_save
  2040  
  2041  // sm2IsZero returns 1 in AX if [acc4..acc7] represents zero (all limbs 0, or equal
  2042  // to p) and zero otherwise. It writes to [acc4..acc7], t0 and t1.
  2043  TEXT sm2IsZero(SB),NOSPLIT | DUPOK,$0
  2044  	// AX contains a flag that is set if the input is zero.
  2045  	XORQ AX, AX
  2046  	MOVQ $1, t1
  2047  
  2048  	// Check whether [acc4..acc7] are all zero.
  2049  	MOVQ acc4, t0
  2050  	ORQ acc5, t0
  2051  	ORQ acc6, t0
  2052  	ORQ acc7, t0
  2053  
  2054  	// Set AX to 1 if so. (CMOV of a constant to a register doesn't
  2055  	// appear to be supported in Go. Thus t1 = 1.)
  2056  	CMOVQEQ t1, AX
  2057  
  2058  	// XOR [acc4..acc7] with P and compare with zero again.
  2059  	XORQ $-1, acc4	// each limb becomes 0 exactly when it equals the matching limb of p
  2060  	XORQ sm2const0<>(SB), acc5
  2061  	XORQ $-1, acc6
  2062  	XORQ sm2const1<>(SB), acc7
  2063  	ORQ acc5, acc4
  2064  	ORQ acc6, acc4
  2065  	ORQ acc7, acc4
  2066  
  2067  	// Set AX to 1 if so.
  2068  	CMOVQEQ t1, AX
  2069  	RET
  2070  
  2071  /* ---------------------------------------*/
  2072  #define x1in(off) (32*0 + off)(SP)
  2073  #define y1in(off) (32*1 + off)(SP)
  2074  #define z1in(off) (32*2 + off)(SP)
  2075  #define x2in(off) (32*3 + off)(SP)
  2076  #define y2in(off) (32*4 + off)(SP)
  2077  #define z2in(off) (32*5 + off)(SP)
  2078  
  2079  #define xout(off) (32*6 + off)(SP)
  2080  #define yout(off) (32*7 + off)(SP)
  2081  #define zout(off) (32*8 + off)(SP)
  2082  
  2083  #define u1(off)    (32*9 + off)(SP)
  2084  #define u2(off)    (32*10 + off)(SP)
  2085  #define s1(off)    (32*11 + off)(SP)
  2086  #define s2(off)    (32*12 + off)(SP)
  2087  #define z1sqr(off) (32*13 + off)(SP)
  2088  #define z2sqr(off) (32*14 + off)(SP)
  2089  #define h(off)     (32*15 + off)(SP)
  2090  #define r(off)     (32*16 + off)(SP)
  2091  #define hsqr(off)  (32*17 + off)(SP)
  2092  #define rsqr(off)  (32*18 + off)(SP)
  2093  #define hcub(off)  (32*19 + off)(SP)
  2094  #define rptr       (32*20)(SP)
  2095  #define points_eq  (32*20+8)(SP)
  2096  // Frame layout (defines above): twenty 32-byte field-element slots, then the result pointer and the points_eq flag.
  2097  //func sm2PointAddAsm(res, in1, in2 []uint64) int
  2098  TEXT ·sm2PointAddAsm(SB),0,$680-80	// res = in1 + in2, both given as (x, y, z); the return value is 1 when h and r are both zero, else 0
  2099  	// Move input to stack in order to free registers
  2100  	MOVQ res+0(FP), AX
  2101  	MOVQ in1+24(FP), BX
  2102  	MOVQ in2+48(FP), CX
  2103  
  2104  	MOVOU (16*0)(BX), X0
  2105  	MOVOU (16*1)(BX), X1
  2106  	MOVOU (16*2)(BX), X2
  2107  	MOVOU (16*3)(BX), X3
  2108  	MOVOU (16*4)(BX), X4
  2109  	MOVOU (16*5)(BX), X5
  2110  
  2111  	MOVOU X0, x1in(16*0)
  2112  	MOVOU X1, x1in(16*1)
  2113  	MOVOU X2, y1in(16*0)
  2114  	MOVOU X3, y1in(16*1)
  2115  	MOVOU X4, z1in(16*0)
  2116  	MOVOU X5, z1in(16*1)
  2117  
  2118  	MOVOU (16*0)(CX), X0
  2119  	MOVOU (16*1)(CX), X1
  2120  	MOVOU (16*2)(CX), X2
  2121  	MOVOU (16*3)(CX), X3
  2122  	MOVOU (16*4)(CX), X4
  2123  	MOVOU (16*5)(CX), X5
  2124  
  2125  	MOVOU X0, x2in(16*0)
  2126  	MOVOU X1, x2in(16*1)
  2127  	MOVOU X2, y2in(16*0)
  2128  	MOVOU X3, y2in(16*1)
  2129  	MOVOU X4, z2in(16*0)
  2130  	MOVOU X5, z2in(16*1)
  2131  	// Store pointer to result
  2132  	MOVQ AX, rptr
  2133  	// Begin point add
  2134  	LDacc (z2in)
  2135  	CALL sm2SqrInternal(SB)	// z2^2
  2136  	ST (z2sqr)
  2137  	LDt (z2in)
  2138  	CALL sm2MulInternal(SB)	// z2^3
  2139  	LDt (y1in)
  2140  	CALL sm2MulInternal(SB)	// s1 = z2^3*y1
  2141  	ST (s1)
  2142  
  2143  	LDacc (z1in)
  2144  	CALL sm2SqrInternal(SB)	// z1^2
  2145  	ST (z1sqr)
  2146  	LDt (z1in)
  2147  	CALL sm2MulInternal(SB)	// z1^3
  2148  	LDt (y2in)
  2149  	CALL sm2MulInternal(SB)	// s2 = z1^3*y2
  2150  	ST (s2)
  2151  
  2152  	LDt (s1)
  2153  	CALL sm2SubInternal(SB)	// r = s2 - s1
  2154  	ST (r)
  2155  	CALL sm2IsZero(SB)	// AX = 1 if r is zero; overwrites acc[4..7]
  2156  	MOVQ AX, points_eq
  2157  
  2158  	LDacc (z2sqr)
  2159  	LDt (x1in)
  2160  	CALL sm2MulInternal(SB)	// u1 = x1 * z2^2
  2161  	ST (u1)
  2162  	LDacc (z1sqr)
  2163  	LDt (x2in)
  2164  	CALL sm2MulInternal(SB)	// u2 = x2 * z1^2
  2165  	ST (u2)
  2166  
  2167  	LDt (u1)
  2168  	CALL sm2SubInternal(SB)	// h = u2 - u1
  2169  	ST (h)
  2170  	CALL sm2IsZero(SB)	// AX = 1 if h is zero; overwrites acc[4..7]
  2171  	ANDQ points_eq, AX	// points_eq = (r is zero) AND (h is zero)
  2172  	MOVQ AX, points_eq
  2173  
  2174  	LDacc (r)
  2175  	CALL sm2SqrInternal(SB)	// rsqr = r^2
  2176  	ST (rsqr)
  2177  
  2178  	LDacc (h)
  2179  	CALL sm2SqrInternal(SB)	// hsqr = h^2
  2180  	ST (hsqr)
  2181  
  2182  	LDt (h)
  2183  	CALL sm2MulInternal(SB)	// hcub = h^3
  2184  	ST (hcub)
  2185  
  2186  	LDt (s1)
  2187  	CALL sm2MulInternal(SB)	// s1 * h^3
  2188  	ST (s2)	// the s2 slot is reused for s1 * h^3
  2189  
  2190  	LDacc (z1in)
  2191  	LDt (z2in)
  2192  	CALL sm2MulInternal(SB)	// z1 * z2
  2193  	LDt (h)
  2194  	CALL sm2MulInternal(SB)	// z1 * z2 * h
  2195  	ST (zout)
  2196  
  2197  	LDacc (hsqr)
  2198  	LDt (u1)
  2199  	CALL sm2MulInternal(SB)	// h^2 * u1
  2200  	ST (u2)	// the u2 slot is reused for u1 * h^2
  2201  
  2202  	sm2MulBy2Inline	// u1 * h^2 * 2, inline
  2203  	LDacc (rsqr)
  2204  	CALL sm2SubInternal(SB)	// r^2 - u1 * h^2 * 2
  2205  
  2206  	LDt (hcub)
  2207  	CALL sm2SubInternal(SB)	// x3 = r^2 - u1 * h^2 * 2 - h^3
  2208  	ST (xout)
  2209  
  2210  	MOVQ acc4, t0
  2211  	MOVQ acc5, t1
  2212  	MOVQ acc6, t2
  2213  	MOVQ acc7, t3
  2214  	LDacc (u2)
  2215  	CALL sm2SubInternal(SB)	// u1 * h^2 - x3
  2216  
  2217  	LDt (r)
  2218  	CALL sm2MulInternal(SB)	// r * (u1 * h^2 - x3)
  2219  
  2220  	LDt (s2)
  2221  	CALL sm2SubInternal(SB)	// y3 = r * (u1 * h^2 - x3) - s1 * h^3
  2222  	ST (yout)
  2223  
  2224  	MOVOU xout(16*0), X0
  2225  	MOVOU xout(16*1), X1
  2226  	MOVOU yout(16*0), X2
  2227  	MOVOU yout(16*1), X3
  2228  	MOVOU zout(16*0), X4
  2229  	MOVOU zout(16*1), X5
  2230  	// Finally output the result
  2231  	MOVQ rptr, AX
  2232  	MOVQ $0, rptr	// clear the stack copy of the result pointer
  2233  	MOVOU X0, (16*0)(AX)
  2234  	MOVOU X1, (16*1)(AX)
  2235  	MOVOU X2, (16*2)(AX)
  2236  	MOVOU X3, (16*3)(AX)
  2237  	MOVOU X4, (16*4)(AX)
  2238  	MOVOU X5, (16*5)(AX)
  2239  	// Return the points_eq flag. NOTE(review): presumably the caller must fall back to doubling when it is 1 - confirm against the caller
  2240  	MOVQ points_eq, AX
  2241  	MOVQ AX, ret+72(FP)
  2242  
  2243  	RET
  2244  #undef x1in
  2245  #undef y1in
  2246  #undef z1in
  2247  #undef x2in
  2248  #undef y2in
  2249  #undef z2in
  2250  #undef xout
  2251  #undef yout
  2252  #undef zout
  2253  #undef s1
  2254  #undef s2
  2255  #undef u1
  2256  #undef u2
  2257  #undef z1sqr
  2258  #undef z2sqr
  2259  #undef h
  2260  #undef r
  2261  #undef hsqr
  2262  #undef rsqr
  2263  #undef hcub
  2264  #undef rptr
  2265  /* ---------------------------------------*/
  2266  #define x(off) (32*0 + off)(SP)
  2267  #define y(off) (32*1 + off)(SP)
  2268  #define z(off) (32*2 + off)(SP)
  2269  
  2270  #define s(off)	(32*3 + off)(SP)
  2271  #define m(off)	(32*4 + off)(SP)
  2272  #define zsqr(off) (32*5 + off)(SP)
  2273  #define tmp(off)  (32*6 + off)(SP)
  2274  #define rptr	  (32*7)(SP)
  2275  // Frame layout (defines above): seven 32-byte field-element slots followed by the result pointer.
  2276  //func sm2PointDoubleAsm(res, in []uint64)
  2277  TEXT ·sm2PointDoubleAsm(SB),NOSPLIT,$256-48	// res = 2 * in, with in = (x, y, z); the input is copied to the stack before res is written
  2278  	// Move input to stack in order to free registers
  2279  	MOVQ res+0(FP), AX
  2280  	MOVQ in+24(FP), BX
  2281  
  2282  	MOVOU (16*0)(BX), X0
  2283  	MOVOU (16*1)(BX), X1
  2284  	MOVOU (16*2)(BX), X2
  2285  	MOVOU (16*3)(BX), X3
  2286  	MOVOU (16*4)(BX), X4
  2287  	MOVOU (16*5)(BX), X5
  2288  
  2289  	MOVOU X0, x(16*0)
  2290  	MOVOU X1, x(16*1)
  2291  	MOVOU X2, y(16*0)
  2292  	MOVOU X3, y(16*1)
  2293  	MOVOU X4, z(16*0)
  2294  	MOVOU X5, z(16*1)
  2295  	// Store pointer to result
  2296  	MOVQ AX, rptr
  2297  	// Begin point double
  2298  	LDacc (z)
  2299  	CALL sm2SqrInternal(SB)	// z^2
  2300  	ST (zsqr)
  2301  
  2302  	LDt (x)
  2303  	sm2AddInline	// t = x + z^2
  2304  	STt (m)
  2305  
  2306  	LDacc (z)
  2307  	LDt (y)
  2308  	CALL sm2MulInternal(SB)	// y * z
  2309  	sm2MulBy2Inline	// t = z3 = 2 * y * z
  2310  	MOVQ rptr, AX
  2311  	// Store z
  2312  	MOVQ t0, (16*4 + 8*0)(AX)
  2313  	MOVQ t1, (16*4 + 8*1)(AX)
  2314  	MOVQ t2, (16*4 + 8*2)(AX)
  2315  	MOVQ t3, (16*4 + 8*3)(AX)
  2316  
  2317  	LDacc (x)
  2318  	LDt (zsqr)
  2319  	CALL sm2SubInternal(SB)	// x - z^2
  2320  	LDt (m)
  2321  	CALL sm2MulInternal(SB)	// (x - z^2) * (x + z^2)
  2322  	ST (m)
  2323  	// Multiply by 3
  2324  	sm2MulBy2Inline
  2325  	LDacc (m)
  2326  	sm2AddInline
  2327  	STt (m)	// m = 3 * (x - z^2) * (x + z^2)
  2328  	////////////////////////
  2329  	LDacc (y)
  2330  	sm2MulBy2Inline	// t = 2 * y
  2331  	t2acc
  2332  	CALL sm2SqrInternal(SB)	// 4 * y^2
  2333  	ST (s)
  2334  	CALL sm2SqrInternal(SB)	// 16 * y^4
  2335  	// Divide by 2: add p first when the value is odd so that the shift is exact
  2336  	XORQ mul0, mul0
  2337  	MOVQ acc4, t0
  2338  	MOVQ acc5, t1
  2339  	MOVQ acc6, t2
  2340  	MOVQ acc7, t3
  2341  
  2342  	ADDQ $-1, acc4
  2343  	ADCQ sm2const0<>(SB), acc5
  2344  	ADCQ $-1, acc6
  2345  	ADCQ sm2const1<>(SB), acc7
  2346  	ADCQ $0, mul0	// mul0 = carry out of value + p
  2347  	TESTQ $1, t0	// ZF=1 if the original value is even
  2348  	// Even: keep the original value; odd: keep value + p
  2349  	CMOVQEQ t0, acc4
  2350  	CMOVQEQ t1, acc5
  2351  	CMOVQEQ t2, acc6
  2352  	CMOVQEQ t3, acc7
  2353  	ANDQ t0, mul0	// mul0 is 0 or 1, so this keeps the carry only when bit 0 of t0 is set (odd case)
  2354  	// Shift the 257-bit value [acc4..acc7, mul0] right by one bit
  2355  	SHRQ $1, acc4:acc5
  2356  	SHRQ $1, acc5:acc6
  2357  	SHRQ $1, acc6:acc7
  2358  	SHRQ $1, acc7:mul0
  2359  	ST (y)	// the y slot now holds 8 * y^4
  2360  	/////////////////////////
  2361  	LDacc (x)
  2362  	LDt (s)
  2363  	CALL sm2MulInternal(SB)	// x * 4 * y^2
  2364  	ST (s)
  2365  	sm2MulBy2Inline	// t = 8 * x * y^2
  2366  	STt (tmp)
  2367  
  2368  	LDacc (m)
  2369  	CALL sm2SqrInternal(SB)	// m^2
  2370  	LDt (tmp)
  2371  	CALL sm2SubInternal(SB)	// x3 = m^2 - 8 * x * y^2
  2372  
  2373  	MOVQ rptr, AX
  2374  	// Store x
  2375  	MOVQ acc4, (16*0 + 8*0)(AX)
  2376  	MOVQ acc5, (16*0 + 8*1)(AX)
  2377  	MOVQ acc6, (16*0 + 8*2)(AX)
  2378  	MOVQ acc7, (16*0 + 8*3)(AX)
  2379  
  2380  	acc2t
  2381  	LDacc (s)
  2382  	CALL sm2SubInternal(SB)	// 4 * x * y^2 - x3
  2383  
  2384  	LDt (m)
  2385  	CALL sm2MulInternal(SB)	// m * (4 * x * y^2 - x3)
  2386  
  2387  	LDt (y)
  2388  	CALL sm2SubInternal(SB)	// y3 = m * (4 * x * y^2 - x3) - 8 * y^4
  2389  	MOVQ rptr, AX
  2390  	// Store y
  2391  	MOVQ acc4, (16*2 + 8*0)(AX)
  2392  	MOVQ acc5, (16*2 + 8*1)(AX)
  2393  	MOVQ acc6, (16*2 + 8*2)(AX)
  2394  	MOVQ acc7, (16*2 + 8*3)(AX)
  2395  	///////////////////////
  2396  	MOVQ $0, rptr	// clear the stack copy of the result pointer
  2397  
  2398  	RET
  2399  /* ---------------------------------------*/
  2400