github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/sm2/p256_asm_amd64.s (about)

     1  // This file contains constant-time, 64-bit assembly implementation of
     2  // P256. The optimizations performed here are described in detail in:
     3  // S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
     4  //                          256-bit primes"
     5  // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
     6  // https://eprint.iacr.org/2013/816.pdf
     7  
     8  #include "textflag.h"
     9  
        // Register aliases shared by the exported routines in this part of the file.
        // res_ptr/x_ptr/y_ptr receive the argument pointers; several routines below
        // also reuse x_ptr and y_ptr as plain scratch registers once the pointer they
        // held is no longer needed.
    10  #define res_ptr DI
    11  #define x_ptr SI
    12  #define y_ptr CX
    13  
        // 64-bit accumulator limbs (acc0..acc5) and temporaries (t0, t1) used by the
        // multi-limb arithmetic.
    14  #define acc0 R8
    15  #define acc1 R9
    16  #define acc2 R10
    17  #define acc3 R11
    18  #define acc4 R12
    19  #define acc5 R13
    20  #define t0 R14
    21  #define t1 R15
    22  
        // p256p: the field modulus as four 64-bit words. The word at offset 8*i is
        // applied to limb i of an operand by the routines below.
    23  DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
    24  DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
    25  DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
    26  DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
        // p256ordK0: multiplied with the low accumulator limb in every p256Ord*
        // reduction step; the low word of acc + (acc*K0)*ord[0] is then discarded.
        // Presumably -ord[0]^-1 mod 2^64 — TODO confirm.
    27  DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
        // p256ord: modulus used by p256OrdMul / p256OrdSqr (presumably the group
        // order, per the name), same word layout as p256p.
    28  DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
    29  DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
    30  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    31  DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
        // p256one: equals 2^256 - p256p, i.e. 2^256 mod p.
    32  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    33  DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
    34  DATA p256one<>+0x10(SB)/8, $0x0000000000000000
    35  DATA p256one<>+0x18(SB)/8, $0x0000000100000000
        // All four tables are file-local (<>) and read-only.
    36  GLOBL p256p<>(SB), RODATA, $32
    37  GLOBL p256ordK0<>(SB), RODATA, $8
    38  GLOBL p256ord<>(SB), RODATA, $32
    39  GLOBL p256one<>(SB), RODATA, $32
    40  
    41  /* ---------------------------------------*/
    42  // func p256LittleToBig(res []byte, in []uint64)
        // Shares its body with p256BigToLittle. Both signatures consist of two slice
        // headers, so res is at 0(FP) and in at 24(FP) either way, and the transform
        // performed there (byte-swap each 64-bit word, reverse the word order) is
        // its own inverse.
    43  TEXT ·p256LittleToBig(SB),NOSPLIT,$0
    44  	JMP ·p256BigToLittle(SB)
    45  /* ---------------------------------------*/
    46  // func p256BigToLittle(res []uint64, in []byte)
        // Reads 32 bytes from in as four 64-bit words, byte-swaps each word and
        // writes them to res in reverse word order (32 bytes written).
        // All loads happen before the first store.
    47  TEXT ·p256BigToLittle(SB),NOSPLIT,$0
    48  	MOVQ res+0(FP), res_ptr
    49  	MOVQ in+24(FP), x_ptr
    50  
    51  	MOVQ (8*0)(x_ptr), acc0
    52  	MOVQ (8*1)(x_ptr), acc1
    53  	MOVQ (8*2)(x_ptr), acc2
    54  	MOVQ (8*3)(x_ptr), acc3
    55  
        	// Reverse the byte order inside each word.
    56  	BSWAPQ acc0
    57  	BSWAPQ acc1
    58  	BSWAPQ acc2
    59  	BSWAPQ acc3
    60  
        	// Reverse the word order: input word 3 becomes output word 0, etc.
    61  	MOVQ acc3, (8*0)(res_ptr)
    62  	MOVQ acc2, (8*1)(res_ptr)
    63  	MOVQ acc1, (8*2)(res_ptr)
    64  	MOVQ acc0, (8*3)(res_ptr)
    65  
    66  	RET
    67  /* ---------------------------------------*/
    68  // func p256MovCond(res, a, b []uint64, cond int)
    69  // If cond == 0 res=b, else res=a
        // Branch-free select over 96 bytes (six 16-byte lanes): both a and b are
        // always read in full and res is always written in full.
        // Only the low 32 bits of cond take part in the zero test (dword 0 is
        // broadcast and compared per dword).
    70  TEXT ·p256MovCond(SB),NOSPLIT,$0
    71  	MOVQ res+0(FP), res_ptr
    72  	MOVQ a+24(FP), x_ptr
    73  	MOVQ b+48(FP), y_ptr
    74  	MOVQ cond+72(FP), X12
    75  
        	// X12 = all ones when cond == 0, all zeros otherwise.
    76  	PXOR X13, X13
    77  	PSHUFD $0, X12, X12
    78  	PCMPEQL X13, X12
    79  
        	// X0..X5 = a AND NOT mask, i.e. a when cond != 0 and zero when cond == 0.
    80  	MOVOU X12, X0
    81  	MOVOU (16*0)(x_ptr), X6
    82  	PANDN X6, X0
    83  	MOVOU X12, X1
    84  	MOVOU (16*1)(x_ptr), X7
    85  	PANDN X7, X1
    86  	MOVOU X12, X2
    87  	MOVOU (16*2)(x_ptr), X8
    88  	PANDN X8, X2
    89  	MOVOU X12, X3
    90  	MOVOU (16*3)(x_ptr), X9
    91  	PANDN X9, X3
    92  	MOVOU X12, X4
    93  	MOVOU (16*4)(x_ptr), X10
    94  	PANDN X10, X4
    95  	MOVOU X12, X5
    96  	MOVOU (16*5)(x_ptr), X11
    97  	PANDN X11, X5
    98  
    99  	MOVOU (16*0)(y_ptr), X6
   100  	MOVOU (16*1)(y_ptr), X7
   101  	MOVOU (16*2)(y_ptr), X8
   102  	MOVOU (16*3)(y_ptr), X9
   103  	MOVOU (16*4)(y_ptr), X10
   104  	MOVOU (16*5)(y_ptr), X11
   105  
        	// X6..X11 = b AND mask, i.e. b when cond == 0 and zero otherwise.
   106  	PAND X12, X6
   107  	PAND X12, X7
   108  	PAND X12, X8
   109  	PAND X12, X9
   110  	PAND X12, X10
   111  	PAND X12, X11
   112  
        	// Exactly one of the two masked values is non-zero, so XOR merges them.
   113  	PXOR X6, X0
   114  	PXOR X7, X1
   115  	PXOR X8, X2
   116  	PXOR X9, X3
   117  	PXOR X10, X4
   118  	PXOR X11, X5
   119  
   120  	MOVOU X0, (16*0)(res_ptr)
   121  	MOVOU X1, (16*1)(res_ptr)
   122  	MOVOU X2, (16*2)(res_ptr)
   123  	MOVOU X3, (16*3)(res_ptr)
   124  	MOVOU X4, (16*4)(res_ptr)
   125  	MOVOU X5, (16*5)(res_ptr)
   126  
   127  	RET
   128  /* ---------------------------------------*/
   129  // func p256NegCond(val []uint64, cond int)
        // In place: if cond != 0, val = p256p - val; if cond == 0, val is rewritten
        // unchanged. The subtraction is always performed and the choice is made with
        // conditional moves, so there is no branch on cond.
        // NOTE(review): no reduction follows the subtraction, so val == 0 with
        // cond != 0 stores p itself rather than 0 — confirm callers accept that.
   130  TEXT ·p256NegCond(SB),NOSPLIT,$0
   131  	MOVQ val+0(FP), res_ptr
   132  	MOVQ cond+24(FP), t0
   133  	// acc = poly
   134  	MOVQ $-1, acc0
   135  	MOVQ p256p<>+0x08(SB), acc1
   136  	MOVQ $-1, acc2
   137  	MOVQ p256p<>+0x18(SB), acc3
   138  	// Load the original value
        	// (x_ptr and y_ptr serve as scratch limbs here, not as pointers)
   139  	MOVQ (8*0)(res_ptr), acc5
   140  	MOVQ (8*1)(res_ptr), x_ptr
   141  	MOVQ (8*2)(res_ptr), y_ptr
   142  	MOVQ (8*3)(res_ptr), t1
   143  	// Speculatively subtract
   144  	SUBQ acc5, acc0
   145  	SBBQ x_ptr, acc1
   146  	SBBQ y_ptr, acc2
   147  	SBBQ t1, acc3
   148  	// If condition is 0, keep original value
   149  	TESTQ t0, t0
   150  	CMOVQEQ acc5, acc0
   151  	CMOVQEQ x_ptr, acc1
   152  	CMOVQEQ y_ptr, acc2
   153  	CMOVQEQ t1, acc3
   154  	// Store result
   155  	MOVQ acc0, (8*0)(res_ptr)
   156  	MOVQ acc1, (8*1)(res_ptr)
   157  	MOVQ acc2, (8*2)(res_ptr)
   158  	MOVQ acc3, (8*3)(res_ptr)
   159  
   160  	RET
   161  /* ---------------------------------------*/
   162  // func p256Sqr(res, in []uint64, n int)
        // Performs n successive squarings. Each pass builds the 512-bit square of the
        // four input limbs, eliminates the low four limbs in four reduction steps
        // (each step adds limb*p and drops the now-zero low limb, so a pass yields a
        // value congruent to x^2 * 2^-256 mod p), conditionally subtracts p256p and
        // stores four limbs to res. Passes after the first read their input from res.
        // The counter is only tested after a pass (DECQ/JNE), so n must be >= 1.
        // y_ptr and x_ptr are reused as limbs 6 and 7 of the square inside a pass.
   163  TEXT ·p256Sqr(SB),NOSPLIT,$0
   164  	MOVQ res+0(FP), res_ptr
   165  	MOVQ in+24(FP), x_ptr
   166  	MOVQ n+48(FP), BX
   167  
   168  sqrLoop:
   169  
   170  	// y[1:] * y[0]
   171  	MOVQ (8*0)(x_ptr), t0
   172  
   173  	MOVQ (8*1)(x_ptr), AX
   174  	MULQ t0
   175  	MOVQ AX, acc1
   176  	MOVQ DX, acc2
   177  
   178  	MOVQ (8*2)(x_ptr), AX
   179  	MULQ t0
   180  	ADDQ AX, acc2
   181  	ADCQ $0, DX
   182  	MOVQ DX, acc3
   183  
   184  	MOVQ (8*3)(x_ptr), AX
   185  	MULQ t0
   186  	ADDQ AX, acc3
   187  	ADCQ $0, DX
   188  	MOVQ DX, acc4
   189  	// y[2:] * y[1]
   190  	MOVQ (8*1)(x_ptr), t0
   191  
   192  	MOVQ (8*2)(x_ptr), AX
   193  	MULQ t0
   194  	ADDQ AX, acc3
   195  	ADCQ $0, DX
   196  	MOVQ DX, t1
   197  
   198  	MOVQ (8*3)(x_ptr), AX
   199  	MULQ t0
   200  	ADDQ t1, acc4
   201  	ADCQ $0, DX
   202  	ADDQ AX, acc4
   203  	ADCQ $0, DX
   204  	MOVQ DX, acc5
   205  	// y[3] * y[2]
   206  	MOVQ (8*2)(x_ptr), t0
   207  
   208  	MOVQ (8*3)(x_ptr), AX
   209  	MULQ t0
   210  	ADDQ AX, acc5
   211  	ADCQ $0, DX
   212  	MOVQ DX, y_ptr
   213  	XORQ t1, t1
   214  	// *2
        	// Double the sum of cross products; t1 catches the bit shifted out of y_ptr.
   215  	ADDQ acc1, acc1
   216  	ADCQ acc2, acc2
   217  	ADCQ acc3, acc3
   218  	ADCQ acc4, acc4
   219  	ADCQ acc5, acc5
   220  	ADCQ y_ptr, y_ptr
   221  	ADCQ $0, t1
   222  	// Missing products
        	// Add the diagonal terms y[i]*y[i].
   223  	MOVQ (8*0)(x_ptr), AX
   224  	MULQ AX
   225  	MOVQ AX, acc0
   226  	MOVQ DX, t0
   227  
   228  	MOVQ (8*1)(x_ptr), AX
   229  	MULQ AX
   230  	ADDQ t0, acc1
   231  	ADCQ AX, acc2
   232  	ADCQ $0, DX
   233  	MOVQ DX, t0
   234  
   235  	MOVQ (8*2)(x_ptr), AX
   236  	MULQ AX
   237  	ADDQ t0, acc3
   238  	ADCQ AX, acc4
   239  	ADCQ $0, DX
   240  	MOVQ DX, t0
   241  
   242  	MOVQ (8*3)(x_ptr), AX
   243  	MULQ AX
   244  	ADDQ t0, acc5
   245  	ADCQ AX, y_ptr
   246  	ADCQ DX, t1
        	// Input pointer is no longer needed in this pass; x_ptr now holds limb 7.
   247  	MOVQ t1, x_ptr
   248  	// First reduction step
        	// DX:AX = acc0 * 2^32. acc0 is added one limb up, then recycled as the new
        	// top limb; acc0*2^32 is subtracted at limbs (acc1,acc2) and (acc3,acc0).
   249  	MOVQ acc0, AX
   250  	MOVQ acc0, DX
   251  	SHLQ $32, AX
   252  	SHRQ $32, DX
   253  
   254  	ADDQ acc0, acc1
   255  	ADCQ $0, acc2
   256  	ADCQ $0, acc3
   257  	ADCQ $0, acc0
   258  	
   259  	SUBQ AX, acc1
   260  	SBBQ DX, acc2
   261  	SBBQ AX, acc3
   262  	SBBQ DX, acc0
   263  	// Second reduction step
   264  	MOVQ acc1, AX
   265  	MOVQ acc1, DX
   266  	SHLQ $32, AX
   267  	SHRQ $32, DX
   268  
   269  	ADDQ acc1, acc2
   270  	ADCQ $0, acc3
   271  	ADCQ $0, acc0
   272  	ADCQ $0, acc1
   273  	
   274  	SUBQ AX, acc2
   275  	SBBQ DX, acc3
   276  	SBBQ AX, acc0
   277  	SBBQ DX, acc1
   278  	// Third reduction step
   279  	MOVQ acc2, AX
   280  	MOVQ acc2, DX
   281  	SHLQ $32, AX
   282  	SHRQ $32, DX
   283  
   284  	ADDQ acc2, acc3
   285  	ADCQ $0, acc0
   286  	ADCQ $0, acc1
   287  	ADCQ $0, acc2
   288  	
   289  	SUBQ AX, acc3
   290  	SBBQ DX, acc0
   291  	SBBQ AX, acc1
   292  	SBBQ DX, acc2
   293  	// Last reduction step
   294  	XORQ t0, t0
   295  	MOVQ acc3, AX
   296  	MOVQ acc3, DX
   297  	SHLQ $32, AX
   298  	SHRQ $32, DX
   299  
   300  	ADDQ acc3, acc0
   301  	ADCQ $0, acc1
   302  	ADCQ $0, acc2
   303  	ADCQ $0, acc3
   304  	
   305  	SUBQ AX, acc0
   306  	SBBQ DX, acc1
   307  	SBBQ AX, acc2
   308  	SBBQ DX, acc3
   309  
   310  	// Add bits [511:256] of the sqr result
        	// NOTE(review): the first ADCQ consumes the carry flag left by the SBBQ
        	// chain above (t0 was cleared before that chain, not here). This appears to
        	// rely on the chain never borrowing at this point — confirm. p256OrdSqr
        	// clears CF with XORQ immediately before its equivalent ADCQ.
   311  	ADCQ acc4, acc0
   312  	ADCQ acc5, acc1
   313  	ADCQ y_ptr, acc2
   314  	ADCQ x_ptr, acc3
   315  	ADCQ $0, t0
   316  
        	// Keep the unreduced value so it can be restored if the subtraction borrows.
   317  	MOVQ acc0, acc4
   318  	MOVQ acc1, acc5
   319  	MOVQ acc2, y_ptr
   320  	MOVQ acc3, t1
   321  	// Subtract p256
   322  	SUBQ $-1, acc0
   323  	SBBQ p256p<>+0x08(SB), acc1
   324  	SBBQ $-1, acc2
   325  	SBBQ p256p<>+0x018(SB), acc3
   326  	SBBQ $0, t0
   327  
        	// Borrow out of the 5-limb subtraction means the value was already < p.
   328  	CMOVQCS acc4, acc0
   329  	CMOVQCS acc5, acc1
   330  	CMOVQCS y_ptr, acc2
   331  	CMOVQCS t1, acc3
   332  
   333  	MOVQ acc0, (8*0)(res_ptr)
   334  	MOVQ acc1, (8*1)(res_ptr)
   335  	MOVQ acc2, (8*2)(res_ptr)
   336  	MOVQ acc3, (8*3)(res_ptr)
        	// Next pass squares the value just stored.
   337  	MOVQ res_ptr, x_ptr
   338  	DECQ BX
   339  	JNE  sqrLoop
   340  
   341  	RET
   342  /* ---------------------------------------*/
   343  // func p256Mul(res, in1, in2 []uint64)
        // Multiplies the four limbs of in1 by the four limbs of in2, interleaving one
        // reduction step after each row x * y[i]. Each reduction step adds limb*p and
        // drops the low limb, so the stored result is congruent to
        // in1 * in2 * 2^-256 mod p. A final conditional subtraction of p256p is done
        // with CMOV, without branching. Four limbs are written to res.
        // The six accumulator registers are used as a rotating window: the limb that
        // a reduction step eliminates is zeroed and reused as the new top limb.
   344  TEXT ·p256Mul(SB),NOSPLIT,$0
   345  	MOVQ res+0(FP), res_ptr
   346  	MOVQ in1+24(FP), x_ptr
   347  	MOVQ in2+48(FP), y_ptr
   348  	// x * y[0]
   349  	MOVQ (8*0)(y_ptr), t0
   350  
   351  	MOVQ (8*0)(x_ptr), AX
   352  	MULQ t0
   353  	MOVQ AX, acc0
   354  	MOVQ DX, acc1
   355  
   356  	MOVQ (8*1)(x_ptr), AX
   357  	MULQ t0
   358  	ADDQ AX, acc1
   359  	ADCQ $0, DX
   360  	MOVQ DX, acc2
   361  
   362  	MOVQ (8*2)(x_ptr), AX
   363  	MULQ t0
   364  	ADDQ AX, acc2
   365  	ADCQ $0, DX
   366  	MOVQ DX, acc3
   367  
   368  	MOVQ (8*3)(x_ptr), AX
   369  	MULQ t0
   370  	ADDQ AX, acc3
   371  	ADCQ $0, DX
   372  	MOVQ DX, acc4
   373  	XORQ acc5, acc5
   374  	// First reduction step
        	// DX:AX = acc0 * 2^32. Adds acc0 at limbs 1 and 4, subtracts acc0*2^32 at
        	// limbs (1,2) and (3,4); acc5 absorbs the carry/borrow.
   375  	MOVQ acc0, AX
   376  	MOVQ acc0, DX
   377  	SHLQ $32, AX
   378  	SHRQ $32, DX
   379  
   380  	ADDQ acc0, acc1
   381  	ADCQ $0, acc2
   382  	ADCQ $0, acc3
   383  	ADCQ acc0, acc4
   384  	ADCQ $0, acc5
   385  	
   386  	SUBQ AX, acc1
   387  	SBBQ DX, acc2
   388  	SBBQ AX, acc3
   389  	SBBQ DX, acc4
   390  	SBBQ $0, acc5
        	// acc0 has been eliminated; reuse it as the next top limb.
   391  	XORQ acc0, acc0
   392  
   393  	// x * y[1]
   394  	MOVQ (8*1)(y_ptr), t0
   395  
   396  	MOVQ (8*0)(x_ptr), AX
   397  	MULQ t0
   398  	ADDQ AX, acc1
   399  	ADCQ $0, DX
   400  	MOVQ DX, t1
   401  
   402  	MOVQ (8*1)(x_ptr), AX
   403  	MULQ t0
   404  	ADDQ t1, acc2
   405  	ADCQ $0, DX
   406  	ADDQ AX, acc2
   407  	ADCQ $0, DX
   408  	MOVQ DX, t1
   409  
   410  	MOVQ (8*2)(x_ptr), AX
   411  	MULQ t0
   412  	ADDQ t1, acc3
   413  	ADCQ $0, DX
   414  	ADDQ AX, acc3
   415  	ADCQ $0, DX
   416  	MOVQ DX, t1
   417  
   418  	MOVQ (8*3)(x_ptr), AX
   419  	MULQ t0
   420  	ADDQ t1, acc4
   421  	ADCQ $0, DX
   422  	ADDQ AX, acc4
   423  	ADCQ DX, acc5
   424  	ADCQ $0, acc0
   425  	// Second reduction step
   426  	MOVQ acc1, AX
   427  	MOVQ acc1, DX
   428  	SHLQ $32, AX
   429  	SHRQ $32, DX
   430  
   431  	ADDQ acc1, acc2
   432  	ADCQ $0, acc3
   433  	ADCQ $0, acc4
   434  	ADCQ acc1, acc5
   435  	ADCQ $0, acc0
   436  	
   437  	SUBQ AX, acc2
   438  	SBBQ DX, acc3
   439  	SBBQ AX, acc4
   440  	SBBQ DX, acc5
   441  	SBBQ $0, acc0	
   442  	XORQ acc1, acc1
   443  
   444  	// x * y[2]
   445  	MOVQ (8*2)(y_ptr), t0
   446  
   447  	MOVQ (8*0)(x_ptr), AX
   448  	MULQ t0
   449  	ADDQ AX, acc2
   450  	ADCQ $0, DX
   451  	MOVQ DX, t1
   452  
   453  	MOVQ (8*1)(x_ptr), AX
   454  	MULQ t0
   455  	ADDQ t1, acc3
   456  	ADCQ $0, DX
   457  	ADDQ AX, acc3
   458  	ADCQ $0, DX
   459  	MOVQ DX, t1
   460  
   461  	MOVQ (8*2)(x_ptr), AX
   462  	MULQ t0
   463  	ADDQ t1, acc4
   464  	ADCQ $0, DX
   465  	ADDQ AX, acc4
   466  	ADCQ $0, DX
   467  	MOVQ DX, t1
   468  
   469  	MOVQ (8*3)(x_ptr), AX
   470  	MULQ t0
   471  	ADDQ t1, acc5
   472  	ADCQ $0, DX
   473  	ADDQ AX, acc5
   474  	ADCQ DX, acc0
   475  	ADCQ $0, acc1
   476  	// Third reduction step
   477  	MOVQ acc2, AX
   478  	MOVQ acc2, DX
   479  	SHLQ $32, AX
   480  	SHRQ $32, DX
   481  
   482  	ADDQ acc2, acc3
   483  	ADCQ $0, acc4
   484  	ADCQ $0, acc5
   485  	ADCQ acc2, acc0
   486  	ADCQ $0, acc1
   487  	
   488  	SUBQ AX, acc3
   489  	SBBQ DX, acc4
   490  	SBBQ AX, acc5
   491  	SBBQ DX, acc0
   492  	SBBQ $0, acc1	
   493  	XORQ acc2, acc2
   494  	// x * y[3]
   495  	MOVQ (8*3)(y_ptr), t0
   496  
   497  	MOVQ (8*0)(x_ptr), AX
   498  	MULQ t0
   499  	ADDQ AX, acc3
   500  	ADCQ $0, DX
   501  	MOVQ DX, t1
   502  
   503  	MOVQ (8*1)(x_ptr), AX
   504  	MULQ t0
   505  	ADDQ t1, acc4
   506  	ADCQ $0, DX
   507  	ADDQ AX, acc4
   508  	ADCQ $0, DX
   509  	MOVQ DX, t1
   510  
   511  	MOVQ (8*2)(x_ptr), AX
   512  	MULQ t0
   513  	ADDQ t1, acc5
   514  	ADCQ $0, DX
   515  	ADDQ AX, acc5
   516  	ADCQ $0, DX
   517  	MOVQ DX, t1
   518  
   519  	MOVQ (8*3)(x_ptr), AX
   520  	MULQ t0
   521  	ADDQ t1, acc0
   522  	ADCQ $0, DX
   523  	ADDQ AX, acc0
   524  	ADCQ DX, acc1
   525  	ADCQ $0, acc2
   526  	// Last reduction step
   527  	MOVQ acc3, AX
   528  	MOVQ acc3, DX
   529  	SHLQ $32, AX
   530  	SHRQ $32, DX
   531  
   532  	ADDQ acc3, acc4
   533  	ADCQ $0, acc5
   534  	ADCQ $0, acc0
   535  	ADCQ acc3, acc1
   536  	ADCQ $0, acc2
   537  	
   538  	SUBQ AX, acc4
   539  	SBBQ DX, acc5
   540  	SBBQ AX, acc0
   541  	SBBQ DX, acc1
   542  	SBBQ $0, acc2	
   543  	// Copy result [255:0]
        	// Result limbs are (acc4, acc5, acc0, acc1) with acc2 as the overflow limb.
        	// x_ptr is free now (all loads from in1 are done) and is used as scratch.
   544  	MOVQ acc4, x_ptr
   545  	MOVQ acc5, acc3
   546  	MOVQ acc0, t0
   547  	MOVQ acc1, t1
   548  	// Subtract p256
   549  	SUBQ $-1, acc4
   550  	SBBQ p256p<>+0x08(SB), acc5
   551  	SBBQ $-1, acc0
   552  	SBBQ p256p<>+0x018(SB), acc1
   553  	SBBQ $0, acc2
   554  
        	// On borrow the value was already below p: restore the saved copy.
   555  	CMOVQCS x_ptr, acc4
   556  	CMOVQCS acc3, acc5
   557  	CMOVQCS t0, acc0
   558  	CMOVQCS t1, acc1
   559  
   560  	MOVQ acc4, (8*0)(res_ptr)
   561  	MOVQ acc5, (8*1)(res_ptr)
   562  	MOVQ acc0, (8*2)(res_ptr)
   563  	MOVQ acc1, (8*3)(res_ptr)
   564  
   565  	RET
   566  /* ---------------------------------------*/
   567  // func p256FromMont(res, in []uint64)
        // Applies the four reduction stages of p256Mul to the four limbs of in
        // without any multiplication, i.e. produces a value congruent to
        // in * 2^-256 mod p, then conditionally subtracts p256p and stores four limbs
        // to res.
   568  TEXT ·p256FromMont(SB),NOSPLIT,$0
   569  	MOVQ res+0(FP), res_ptr
   570  	MOVQ in+24(FP), x_ptr
   571  
   572  	MOVQ (8*0)(x_ptr), acc0
   573  	MOVQ (8*1)(x_ptr), acc1
   574  	MOVQ (8*2)(x_ptr), acc2
   575  	MOVQ (8*3)(x_ptr), acc3
   576  	XORQ acc4, acc4
   577  
   578  	// Only reduce, no multiplications are needed
   579  	// First stage
        	// DX:AX = acc0 * 2^32; acc4 becomes the new top limb.
   580  	MOVQ acc0, AX
   581  	MOVQ acc0, DX
   582  	SHLQ $32, AX
   583  	SHRQ $32, DX
   584  
   585  	ADDQ acc0, acc1
   586  	ADCQ $0, acc2
   587  	ADCQ $0, acc3
   588  	ADCQ acc0, acc4
   589  	
   590  	SUBQ AX, acc1
   591  	SBBQ DX, acc2
   592  	SBBQ AX, acc3
   593  	SBBQ DX, acc4
   594  	XORQ acc5, acc5
   595  
   596  	// Second stage
   597  	MOVQ acc1, AX
   598  	MOVQ acc1, DX
   599  	SHLQ $32, AX
   600  	SHRQ $32, DX
   601  
   602  	ADDQ acc1, acc2
   603  	ADCQ $0, acc3
   604  	ADCQ $0, acc4
   605  	ADCQ acc1, acc5
   606  	
   607  	SUBQ AX, acc2
   608  	SBBQ DX, acc3
   609  	SBBQ AX, acc4
   610  	SBBQ DX, acc5
   611  	XORQ acc0, acc0
   612  	// Third stage
   613  	MOVQ acc2, AX
   614  	MOVQ acc2, DX
   615  	SHLQ $32, AX
   616  	SHRQ $32, DX
   617  
   618  	ADDQ acc2, acc3
   619  	ADCQ $0, acc4
   620  	ADCQ $0, acc5
   621  	ADCQ acc2, acc0
   622  	
   623  	SUBQ AX, acc3
   624  	SBBQ DX, acc4
   625  	SBBQ AX, acc5
   626  	SBBQ DX, acc0
   627  	XORQ acc1, acc1
   628  	// Last stage
   629  	MOVQ acc3, AX
   630  	MOVQ acc3, DX
   631  	SHLQ $32, AX
   632  	SHRQ $32, DX
   633  
   634  	ADDQ acc3, acc4
   635  	ADCQ $0, acc5
   636  	ADCQ $0, acc0
   637  	ADCQ acc3, acc1
   638  	
   639  	SUBQ AX, acc4
   640  	SBBQ DX, acc5
   641  	SBBQ AX, acc0
   642  	SBBQ DX, acc1
   643  	
        	// Save (acc4, acc5, acc0, acc1) before the trial subtraction of p.
   644  	MOVQ acc4, x_ptr
   645  	MOVQ acc5, acc3
   646  	MOVQ acc0, t0
   647  	MOVQ acc1, t1
   648  
   649  	SUBQ $-1, acc4
   650  	SBBQ p256p<>+0x08(SB), acc5
   651  	SBBQ $-1, acc0
   652  	SBBQ p256p<>+0x018(SB), acc1
   653  
        	// Borrow means the value was already below p: keep the saved copy.
   654  	CMOVQCS x_ptr, acc4
   655  	CMOVQCS acc3, acc5
   656  	CMOVQCS t0, acc0
   657  	CMOVQCS t1, acc1
   658  
   659  	MOVQ acc4, (8*0)(res_ptr)
   660  	MOVQ acc5, (8*1)(res_ptr)
   661  	MOVQ acc0, (8*2)(res_ptr)
   662  	MOVQ acc1, (8*3)(res_ptr)
   663  
   664  	RET
   665  /* ---------------------------------------*/
   666  // Constant time point access to arbitrary point table.
   667  // Indexed from 1 to 15, with -1 offset
   668  // (index 0 is implicitly point at infinity)
   669  // func p256Select(point, table []uint64, idx int)
        // Walks 16 consecutive 96-byte entries of table, reading every one, and
        // accumulates the entry whose 1-based position equals idx into point
        // (96 bytes written). If no position matches (e.g. idx == 0) point is all
        // zeros. Only the low 32 bits of idx are compared (MOVL).
        // NOTE(review): the header above says 1 to 15, but the loop compares against
        // positions 1 through 16 — confirm the intended table size.
   670  TEXT ·p256Select(SB),NOSPLIT,$0
   671  	MOVQ idx+48(FP),AX
   672  	MOVQ table+24(FP),DI
   673  	MOVQ point+0(FP),DX
   674  
   675  	PXOR X15, X15	// X15 = 0
   676  	PCMPEQL X14, X14 // X14 = -1
   677  	PSUBL X14, X15   // X15 = 1
        	// X14 = idx broadcast to all four dwords.
   678  	MOVL AX, X14
   679  	PSHUFD $0, X14, X14
   680  
        	// X0..X5 accumulate the selected entry.
   681  	PXOR X0, X0
   682  	PXOR X1, X1
   683  	PXOR X2, X2
   684  	PXOR X3, X3
   685  	PXOR X4, X4
   686  	PXOR X5, X5
        	// AX is now the loop counter (16 entries).
   687  	MOVQ $16, AX
   688  
        	// X13 = position of the entry about to be examined, starting at 1.
   689  	MOVOU X15, X13
   690  
   691  loop_select:
   692  
        		// X12 = all ones if this position equals idx, else zero.
   693  		MOVOU X13, X12
   694  		PADDL X15, X13
   695  		PCMPEQL X14, X12
   696  
   697  		MOVOU (16*0)(DI), X6
   698  		MOVOU (16*1)(DI), X7
   699  		MOVOU (16*2)(DI), X8
   700  		MOVOU (16*3)(DI), X9
   701  		MOVOU (16*4)(DI), X10
   702  		MOVOU (16*5)(DI), X11
   703  		ADDQ $(16*6), DI
   704  
   705  		PAND X12, X6
   706  		PAND X12, X7
   707  		PAND X12, X8
   708  		PAND X12, X9
   709  		PAND X12, X10
   710  		PAND X12, X11
   711  
   712  		PXOR X6, X0
   713  		PXOR X7, X1
   714  		PXOR X8, X2
   715  		PXOR X9, X3
   716  		PXOR X10, X4
   717  		PXOR X11, X5
   718  
   719  		DECQ AX
   720  		JNE loop_select
   721  
   722  	MOVOU X0, (16*0)(DX)
   723  	MOVOU X1, (16*1)(DX)
   724  	MOVOU X2, (16*2)(DX)
   725  	MOVOU X3, (16*3)(DX)
   726  	MOVOU X4, (16*4)(DX)
   727  	MOVOU X5, (16*5)(DX)
   728  
   729  	RET
   730  /* ---------------------------------------*/
   731  // Constant time point access to base point table.
   732  // func p256SelectBase(point *[12]uint64, table string, idx int)
        // table is a string, so its data pointer is at 8(FP) and idx follows the
        // string header at 24(FP). The loop runs 16 times and examines two 64-byte
        // entries per pass (positions 1 through 32), reading all of them; the entry
        // whose 1-based position equals idx is accumulated and 64 bytes are written
        // to point. If no position matches, those 64 bytes are zero. Only the low
        // 32 bits of idx are compared (MOVL).
   733  TEXT ·p256SelectBase(SB),NOSPLIT,$0
   734  	MOVQ idx+24(FP),AX
   735  	MOVQ table+8(FP),DI
   736  	MOVQ point+0(FP),DX
   737  
   738  	PXOR X15, X15	// X15 = 0
   739  	PCMPEQL X14, X14 // X14 = -1
   740  	PSUBL X14, X15   // X15 = 1
        	// X14 = idx broadcast to all four dwords.
   741  	MOVL AX, X14
   742  	PSHUFD $0, X14, X14
   743  
        	// X0..X3 accumulate the selected entry.
   744  	PXOR X0, X0
   745  	PXOR X1, X1
   746  	PXOR X2, X2
   747  	PXOR X3, X3
        	// AX is now the loop counter (16 passes, two entries each).
   748  	MOVQ $16, AX
   749  
        	// X13 = position of the entry about to be examined, starting at 1.
   750  	MOVOU X15, X13
   751  
   752  loop_select_base:
   753  
        		// Mask for the first entry of this pass.
   754  		MOVOU X13, X12
   755  		PADDL X15, X13
   756  		PCMPEQL X14, X12
   757  
   758  		MOVOU (16*0)(DI), X4
   759  		MOVOU (16*1)(DI), X5
   760  		MOVOU (16*2)(DI), X6
   761  		MOVOU (16*3)(DI), X7
   762  
   763  		MOVOU (16*4)(DI), X8
   764  		MOVOU (16*5)(DI), X9
   765  		MOVOU (16*6)(DI), X10
   766  		MOVOU (16*7)(DI), X11
   767  
   768  		ADDQ $(16*8), DI
   769  
   770  		PAND X12, X4
   771  		PAND X12, X5
   772  		PAND X12, X6
   773  		PAND X12, X7
   774  
        		// Mask for the second entry of this pass.
   775  		MOVOU X13, X12
   776  		PADDL X15, X13
   777  		PCMPEQL X14, X12
   778  
   779  		PAND X12, X8
   780  		PAND X12, X9
   781  		PAND X12, X10
   782  		PAND X12, X11
   783  
   784  		PXOR X4, X0
   785  		PXOR X5, X1
   786  		PXOR X6, X2
   787  		PXOR X7, X3
   788  
   789  		PXOR X8, X0
   790  		PXOR X9, X1
   791  		PXOR X10, X2
   792  		PXOR X11, X3
   793  
   794  		DECQ AX
   795  		JNE loop_select_base
   796  
   797  	MOVOU X0, (16*0)(DX)
   798  	MOVOU X1, (16*1)(DX)
   799  	MOVOU X2, (16*2)(DX)
   800  	MOVOU X3, (16*3)(DX)
   801  
   802  	RET
   803  /* ---------------------------------------*/
   804  // func p256OrdMul(res, in1, in2 []uint64)
        // Same structure as p256Mul, but reducing modulo p256ord: each row x * y[i]
        // is followed by a reduction step that computes t0 = (low limb * p256ordK0)
        // mod 2^64 and adds t0 * p256ord, after which the low limb is dropped.
        // The two low words of p256ord are multiplied explicitly; the two high words
        // (0xffffffffffffffff and 0xfffffffeffffffff) are applied with an add of t0
        // four limbs up, a subtract of t0 two limbs up and a subtract of t0*2^32
        // three limbs up. A final conditional subtraction of p256ord is done with
        // CMOV. Four limbs are written to res.
   805  TEXT ·p256OrdMul(SB),NOSPLIT,$0
   806  	MOVQ res+0(FP), res_ptr
   807  	MOVQ in1+24(FP), x_ptr
   808  	MOVQ in2+48(FP), y_ptr
   809  	// x * y[0]
   810  	MOVQ (8*0)(y_ptr), t0
   811  
   812  	MOVQ (8*0)(x_ptr), AX
   813  	MULQ t0
   814  	MOVQ AX, acc0
   815  	MOVQ DX, acc1
   816  
   817  	MOVQ (8*1)(x_ptr), AX
   818  	MULQ t0
   819  	ADDQ AX, acc1
   820  	ADCQ $0, DX
   821  	MOVQ DX, acc2
   822  
   823  	MOVQ (8*2)(x_ptr), AX
   824  	MULQ t0
   825  	ADDQ AX, acc2
   826  	ADCQ $0, DX
   827  	MOVQ DX, acc3
   828  
   829  	MOVQ (8*3)(x_ptr), AX
   830  	MULQ t0
   831  	ADDQ AX, acc3
   832  	ADCQ $0, DX
   833  	MOVQ DX, acc4
   834  	XORQ acc5, acc5
   835  	// First reduction step
        	// t0 = (acc0 * K0) mod 2^64
   836  	MOVQ acc0, AX
   837  	MULQ p256ordK0<>(SB)
   838  	MOVQ AX, t0
   839  
        	// acc0 += t0 * ord[0]; only the carry (t1) is used afterwards.
   840  	MOVQ p256ord<>+0x00(SB), AX
   841  	MULQ t0
   842  	ADDQ AX, acc0
   843  	ADCQ $0, DX
   844  	MOVQ DX, t1
   845  
        	// acc1.. += t0 * ord[1] + t1, and t0 is added at acc4 for the high words.
   846  	MOVQ p256ord<>+0x08(SB), AX
   847  	MULQ t0
   848  	ADDQ t1, acc1
   849  	ADCQ $0, DX
   850  	ADDQ AX, acc1
   851  	ADCQ DX, acc2
   852  	ADCQ $0, acc3
   853  	ADCQ t0, acc4
   854  	ADCQ $0, acc5
   855  
        	// DX:AX = t0 * 2^32; subtract t0 at acc2 and t0*2^32 at (acc3, acc4).
   856  	MOVQ t0, AX
   857  	MOVQ t0, DX
   858  	SHLQ $32, AX
   859  	SHRQ $32, DX
   860  		
   861  	SUBQ t0, acc2
   862  	SBBQ AX, acc3
   863  	SBBQ DX, acc4
   864  	SBBQ $0, acc5
   865  	// x * y[1]
   866  	MOVQ (8*1)(y_ptr), t0
   867  
   868  	MOVQ (8*0)(x_ptr), AX
   869  	MULQ t0
   870  	ADDQ AX, acc1
   871  	ADCQ $0, DX
   872  	MOVQ DX, t1
   873  
   874  	MOVQ (8*1)(x_ptr), AX
   875  	MULQ t0
   876  	ADDQ t1, acc2
   877  	ADCQ $0, DX
   878  	ADDQ AX, acc2
   879  	ADCQ $0, DX
   880  	MOVQ DX, t1
   881  
   882  	MOVQ (8*2)(x_ptr), AX
   883  	MULQ t0
   884  	ADDQ t1, acc3
   885  	ADCQ $0, DX
   886  	ADDQ AX, acc3
   887  	ADCQ $0, DX
   888  	MOVQ DX, t1
   889  
   890  	MOVQ (8*3)(x_ptr), AX
   891  	MULQ t0
   892  	ADDQ t1, acc4
   893  	ADCQ $0, DX
   894  	ADDQ AX, acc4
   895  	ADCQ DX, acc5
   896  	ADCQ $0, acc0
   897  	// Second reduction step
   898  	MOVQ acc1, AX
   899  	MULQ p256ordK0<>(SB)
   900  	MOVQ AX, t0
   901  
   902  	MOVQ p256ord<>+0x00(SB), AX
   903  	MULQ t0
   904  	ADDQ AX, acc1
   905  	ADCQ $0, DX
   906  	MOVQ DX, t1
   907  
   908  	MOVQ p256ord<>+0x08(SB), AX
   909  	MULQ t0
   910  	ADDQ t1, acc2
   911  	ADCQ $0, DX
   912  	ADDQ AX, acc2
   913  	ADCQ DX, acc3
   914  	ADCQ $0, acc4
   915  	ADCQ t0, acc5
   916  	ADCQ $0, acc0
   917  
   918  	MOVQ t0, AX
   919  	MOVQ t0, DX
   920  	SHLQ $32, AX
   921  	SHRQ $32, DX
   922  		
   923  	SUBQ t0, acc3
   924  	SBBQ AX, acc4
   925  	SBBQ DX, acc5
   926  	SBBQ $0, acc0
   927  	// x * y[2]
   928  	MOVQ (8*2)(y_ptr), t0
   929  
   930  	MOVQ (8*0)(x_ptr), AX
   931  	MULQ t0
   932  	ADDQ AX, acc2
   933  	ADCQ $0, DX
   934  	MOVQ DX, t1
   935  
   936  	MOVQ (8*1)(x_ptr), AX
   937  	MULQ t0
   938  	ADDQ t1, acc3
   939  	ADCQ $0, DX
   940  	ADDQ AX, acc3
   941  	ADCQ $0, DX
   942  	MOVQ DX, t1
   943  
   944  	MOVQ (8*2)(x_ptr), AX
   945  	MULQ t0
   946  	ADDQ t1, acc4
   947  	ADCQ $0, DX
   948  	ADDQ AX, acc4
   949  	ADCQ $0, DX
   950  	MOVQ DX, t1
   951  
   952  	MOVQ (8*3)(x_ptr), AX
   953  	MULQ t0
   954  	ADDQ t1, acc5
   955  	ADCQ $0, DX
   956  	ADDQ AX, acc5
   957  	ADCQ DX, acc0
   958  	ADCQ $0, acc1
   959  	// Third reduction step
   960  	MOVQ acc2, AX
   961  	MULQ p256ordK0<>(SB)
   962  	MOVQ AX, t0
   963  
   964  	MOVQ p256ord<>+0x00(SB), AX
   965  	MULQ t0
   966  	ADDQ AX, acc2
   967  	ADCQ $0, DX
   968  	MOVQ DX, t1
   969  
   970  	MOVQ p256ord<>+0x08(SB), AX
   971  	MULQ t0
   972  	ADDQ t1, acc3
   973  	ADCQ $0, DX
   974  	ADDQ AX, acc3
   975  	ADCQ DX, acc4
   976  	ADCQ $0, acc5
   977  	ADCQ t0, acc0
   978  	ADCQ $0, acc1
   979  
   980  	MOVQ t0, AX
   981  	MOVQ t0, DX
   982  	SHLQ $32, AX
   983  	SHRQ $32, DX
   984  		
   985  	SUBQ t0, acc4
   986  	SBBQ AX, acc5
   987  	SBBQ DX, acc0
   988  	SBBQ $0, acc1
   989  	// x * y[3]
   990  	MOVQ (8*3)(y_ptr), t0
   991  
   992  	MOVQ (8*0)(x_ptr), AX
   993  	MULQ t0
   994  	ADDQ AX, acc3
   995  	ADCQ $0, DX
   996  	MOVQ DX, t1
   997  
   998  	MOVQ (8*1)(x_ptr), AX
   999  	MULQ t0
  1000  	ADDQ t1, acc4
  1001  	ADCQ $0, DX
  1002  	ADDQ AX, acc4
  1003  	ADCQ $0, DX
  1004  	MOVQ DX, t1
  1005  
  1006  	MOVQ (8*2)(x_ptr), AX
  1007  	MULQ t0
  1008  	ADDQ t1, acc5
  1009  	ADCQ $0, DX
  1010  	ADDQ AX, acc5
  1011  	ADCQ $0, DX
  1012  	MOVQ DX, t1
  1013  
  1014  	MOVQ (8*3)(x_ptr), AX
  1015  	MULQ t0
  1016  	ADDQ t1, acc0
  1017  	ADCQ $0, DX
  1018  	ADDQ AX, acc0
  1019  	ADCQ DX, acc1
  1020  	ADCQ $0, acc2
  1021  	// Last reduction step
  1022  	MOVQ acc3, AX
  1023  	MULQ p256ordK0<>(SB)
  1024  	MOVQ AX, t0
  1025  
  1026  	MOVQ p256ord<>+0x00(SB), AX
  1027  	MULQ t0
  1028  	ADDQ AX, acc3
  1029  	ADCQ $0, DX
  1030  	MOVQ DX, t1
  1031  
  1032  	MOVQ p256ord<>+0x08(SB), AX
  1033  	MULQ t0
  1034  	ADDQ t1, acc4
  1035  	ADCQ $0, DX
  1036  	ADDQ AX, acc4
  1037  	ADCQ DX, acc5
  1038  	ADCQ $0, acc0
  1039  	ADCQ t0, acc1
  1040  	ADCQ $0, acc2
  1041  
  1042  	MOVQ t0, AX
  1043  	MOVQ t0, DX
  1044  	SHLQ $32, AX
  1045  	SHRQ $32, DX
  1046  		
  1047  	SUBQ t0, acc5
  1048  	SBBQ AX, acc0
  1049  	SBBQ DX, acc1
  1050  	SBBQ $0, acc2
  1051  	// Copy result [255:0]
        	// Result limbs are (acc4, acc5, acc0, acc1) with acc2 as the overflow limb.
        	// x_ptr is free now and is used as scratch.
  1052  	MOVQ acc4, x_ptr
  1053  	MOVQ acc5, acc3
  1054  	MOVQ acc0, t0
  1055  	MOVQ acc1, t1
  1056  	// Subtract p256ord (the modulus of this routine, not the field prime)
  1057  	SUBQ p256ord<>+0x00(SB), acc4
  1058  	SBBQ p256ord<>+0x08(SB) ,acc5
  1059  	SBBQ p256ord<>+0x10(SB), acc0
  1060  	SBBQ p256ord<>+0x18(SB), acc1
  1061  	SBBQ $0, acc2
  1062  
        	// On borrow the value was already below the modulus: restore the copy.
  1063  	CMOVQCS x_ptr, acc4
  1064  	CMOVQCS acc3, acc5
  1065  	CMOVQCS t0, acc0
  1066  	CMOVQCS t1, acc1
  1067  
  1068  	MOVQ acc4, (8*0)(res_ptr)
  1069  	MOVQ acc5, (8*1)(res_ptr)
  1070  	MOVQ acc0, (8*2)(res_ptr)
  1071  	MOVQ acc1, (8*3)(res_ptr)
  1072  
  1073  	RET
  1074  /* ---------------------------------------*/
  1075  // func p256OrdSqr(res, in []uint64, n int)
        // Performs n successive squarings modulo p256ord. Each pass builds the
        // 512-bit square exactly as p256Sqr does, runs four reduction steps of the
        // p256OrdMul kind on the low four limbs, adds the high four limbs,
        // conditionally subtracts p256ord and stores four limbs to res. Passes after
        // the first read their input from res.
        // The counter is only tested after a pass (DECQ/JNE), so n must be >= 1.
        // y_ptr and x_ptr are reused as limbs 6 and 7 of the square inside a pass.
  1076  TEXT ·p256OrdSqr(SB),NOSPLIT,$0
  1077  	MOVQ res+0(FP), res_ptr
  1078  	MOVQ in+24(FP), x_ptr
  1079  	MOVQ n+48(FP), BX
  1080  
  1081  ordSqrLoop:
  1082  
  1083  	// y[1:] * y[0]
  1084  	MOVQ (8*0)(x_ptr), t0
  1085  
  1086  	MOVQ (8*1)(x_ptr), AX
  1087  	MULQ t0
  1088  	MOVQ AX, acc1
  1089  	MOVQ DX, acc2
  1090  
  1091  	MOVQ (8*2)(x_ptr), AX
  1092  	MULQ t0
  1093  	ADDQ AX, acc2
  1094  	ADCQ $0, DX
  1095  	MOVQ DX, acc3
  1096  
  1097  	MOVQ (8*3)(x_ptr), AX
  1098  	MULQ t0
  1099  	ADDQ AX, acc3
  1100  	ADCQ $0, DX
  1101  	MOVQ DX, acc4
  1102  	// y[2:] * y[1]
  1103  	MOVQ (8*1)(x_ptr), t0
  1104  
  1105  	MOVQ (8*2)(x_ptr), AX
  1106  	MULQ t0
  1107  	ADDQ AX, acc3
  1108  	ADCQ $0, DX
  1109  	MOVQ DX, t1
  1110  
  1111  	MOVQ (8*3)(x_ptr), AX
  1112  	MULQ t0
  1113  	ADDQ t1, acc4
  1114  	ADCQ $0, DX
  1115  	ADDQ AX, acc4
  1116  	ADCQ $0, DX
  1117  	MOVQ DX, acc5
  1118  	// y[3] * y[2]
  1119  	MOVQ (8*2)(x_ptr), t0
  1120  
  1121  	MOVQ (8*3)(x_ptr), AX
  1122  	MULQ t0
  1123  	ADDQ AX, acc5
  1124  	ADCQ $0, DX
  1125  	MOVQ DX, y_ptr
  1126  	XORQ t1, t1
  1127  	// *2
        	// Double the sum of cross products; t1 catches the bit shifted out of y_ptr.
  1128  	ADDQ acc1, acc1
  1129  	ADCQ acc2, acc2
  1130  	ADCQ acc3, acc3
  1131  	ADCQ acc4, acc4
  1132  	ADCQ acc5, acc5
  1133  	ADCQ y_ptr, y_ptr
  1134  	ADCQ $0, t1
  1135  	// Missing products
        	// Add the diagonal terms y[i]*y[i].
  1136  	MOVQ (8*0)(x_ptr), AX
  1137  	MULQ AX
  1138  	MOVQ AX, acc0
  1139  	MOVQ DX, t0
  1140  
  1141  	MOVQ (8*1)(x_ptr), AX
  1142  	MULQ AX
  1143  	ADDQ t0, acc1
  1144  	ADCQ AX, acc2
  1145  	ADCQ $0, DX
  1146  	MOVQ DX, t0
  1147  
  1148  	MOVQ (8*2)(x_ptr), AX
  1149  	MULQ AX
  1150  	ADDQ t0, acc3
  1151  	ADCQ AX, acc4
  1152  	ADCQ $0, DX
  1153  	MOVQ DX, t0
  1154  
  1155  	MOVQ (8*3)(x_ptr), AX
  1156  	MULQ AX
  1157  	ADDQ t0, acc5
  1158  	ADCQ AX, y_ptr
  1159  	ADCQ DX, t1
        	// Input pointer is no longer needed in this pass; x_ptr now holds limb 7.
  1160  	MOVQ t1, x_ptr
  1161  	// First reduction step
  1162  	MOVQ acc0, AX
  1163  	MULQ p256ordK0<>(SB)
  1164  	MOVQ AX, t0                 // Y = t0 = (k0 * acc0) mod 2^64
  1165  
  1166  	MOVQ p256ord<>+0x00(SB), AX
  1167  	MULQ t0
  1168  	ADDQ AX, acc0               // (carry1, acc0) = acc0 + t0 * ord0
  1169  	ADCQ $0, DX                 // DX = carry1 + H(t0 * ord0)
  1170  	MOVQ DX, t1                 // t1 = carry1 + H(t0 * ord0)
        	// acc0's low word is discarded; acc0 is recycled as the new top limb,
        	// pre-loaded with t0 for the high words of ord.
  1171  	MOVQ t0, acc0
  1172  
  1173  	MOVQ p256ord<>+0x08(SB), AX
  1174  	MULQ t0
  1175  	ADDQ t1, acc1               // (carry2, acc1) = acc1 + t1
  1176  	ADCQ $0, DX                 // DX = carry2 + H(t0*ord1)
  1177  
  1178  	ADDQ AX, acc1               // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
  1179  	ADCQ DX, acc2
  1180  	ADCQ $0, acc3
  1181  	ADCQ $0, acc0
  1182  
        	// DX:AX = t0 * 2^32; subtract t0 at acc2 and t0*2^32 at (acc3, acc0).
  1183  	MOVQ t0, AX
  1184  	MOVQ t0, DX
  1185  	SHLQ $32, AX
  1186  	SHRQ $32, DX
  1187  
  1188  	SUBQ t0, acc2
  1189  	SBBQ AX, acc3
  1190  	SBBQ DX, acc0
  1191  	// Second reduction step
  1192  	MOVQ acc1, AX
  1193  	MULQ p256ordK0<>(SB)
  1194  	MOVQ AX, t0
  1195  
  1196  	MOVQ p256ord<>+0x00(SB), AX
  1197  	MULQ t0
  1198  	ADDQ AX, acc1
  1199  	ADCQ $0, DX
  1200  	MOVQ DX, t1
  1201  	MOVQ t0, acc1
  1202  
  1203  	MOVQ p256ord<>+0x08(SB), AX
  1204  	MULQ t0
  1205  	ADDQ t1, acc2
  1206  	ADCQ $0, DX
  1207  
  1208  	ADDQ AX, acc2
  1209  	ADCQ DX, acc3
  1210  	ADCQ $0, acc0
  1211  	ADCQ $0, acc1
  1212  
  1213  	MOVQ t0, AX
  1214  	MOVQ t0, DX
  1215  	SHLQ $32, AX
  1216  	SHRQ $32, DX
  1217  
  1218  	SUBQ t0, acc3
  1219  	SBBQ AX, acc0
  1220  	SBBQ DX, acc1
  1221  	// Third reduction step
  1222  	MOVQ acc2, AX
  1223  	MULQ p256ordK0<>(SB)
  1224  	MOVQ AX, t0
  1225  
  1226  	MOVQ p256ord<>+0x00(SB), AX
  1227  	MULQ t0
  1228  	ADDQ AX, acc2
  1229  	ADCQ $0, DX
  1230  	MOVQ DX, t1
  1231  	MOVQ t0, acc2
  1232  
  1233  	MOVQ p256ord<>+0x08(SB), AX
  1234  	MULQ t0
  1235  	ADDQ t1, acc3
  1236  	ADCQ $0, DX
  1237  
  1238  	ADDQ AX, acc3
  1239  	ADCQ DX, acc0
  1240  	ADCQ $0, acc1
  1241  	ADCQ $0, acc2
  1242  
  1243  	MOVQ t0, AX
  1244  	MOVQ t0, DX
  1245  	SHLQ $32, AX
  1246  	SHRQ $32, DX
  1247  
  1248  	SUBQ t0, acc0
  1249  	SBBQ AX, acc1
  1250  	SBBQ DX, acc2
  1251  	// Last reduction step
  1252  	MOVQ acc3, AX
  1253  	MULQ p256ordK0<>(SB)
  1254  	MOVQ AX, t0
  1255  
  1256  	MOVQ p256ord<>+0x00(SB), AX
  1257  	MULQ t0
  1258  	ADDQ AX, acc3
  1259  	ADCQ $0, DX
  1260  	MOVQ DX, t1
  1261  	MOVQ t0, acc3
  1262  
  1263  	MOVQ p256ord<>+0x08(SB), AX
  1264  	MULQ t0
  1265  	ADDQ t1, acc0
  1266  	ADCQ $0, DX
  1267  
  1268  	ADDQ AX, acc0
  1269  	ADCQ DX, acc1
  1270  	ADCQ $0, acc2
  1271  	ADCQ $0, acc3
  1272  
  1273  	MOVQ t0, AX
  1274  	MOVQ t0, DX
  1275  	SHLQ $32, AX
  1276  	SHRQ $32, DX
  1277  
  1278  	SUBQ t0, acc1
  1279  	SBBQ AX, acc2
  1280  	SBBQ DX, acc3
  1281  
        	// XORQ zeroes t0 and also clears CF, so the ADCQ chain starts carry-free.
  1282  	XORQ t0, t0
  1283  	// Add bits [511:256] of the sqr result
  1284  	ADCQ acc4, acc0
  1285  	ADCQ acc5, acc1
  1286  	ADCQ y_ptr, acc2
  1287  	ADCQ x_ptr, acc3
  1288  	ADCQ $0, t0
  1289  
        	// Keep the unreduced value so it can be restored if the subtraction borrows.
  1290  	MOVQ acc0, acc4
  1291  	MOVQ acc1, acc5
  1292  	MOVQ acc2, y_ptr
  1293  	MOVQ acc3, t1
  1294  	// Subtract p256ord (the modulus of this routine, not the field prime)
  1295  	SUBQ p256ord<>+0x00(SB), acc0
  1296  	SBBQ p256ord<>+0x08(SB) ,acc1
  1297  	SBBQ p256ord<>+0x10(SB), acc2
  1298  	SBBQ p256ord<>+0x18(SB), acc3
  1299  	SBBQ $0, t0
  1300  
        	// Borrow out of the 5-limb subtraction means the value was already below
        	// the modulus.
  1301  	CMOVQCS acc4, acc0
  1302  	CMOVQCS acc5, acc1
  1303  	CMOVQCS y_ptr, acc2
  1304  	CMOVQCS t1, acc3
  1305  
  1306  	MOVQ acc0, (8*0)(res_ptr)
  1307  	MOVQ acc1, (8*1)(res_ptr)
  1308  	MOVQ acc2, (8*2)(res_ptr)
  1309  	MOVQ acc3, (8*3)(res_ptr)
        	// Next pass squares the value just stored.
  1310  	MOVQ res_ptr, x_ptr
  1311  	DECQ BX
  1312  	JNE ordSqrLoop
  1313  
  1314  	RET
  1315  /* ---------------------------------------*/
        // Drop the register aliases used by the exported routines above so the names
        // can be rebound for the internal helpers that follow.
  1316  #undef res_ptr
  1317  #undef x_ptr
  1318  #undef y_ptr
  1319  
  1320  #undef acc0
  1321  #undef acc1
  1322  #undef acc2
  1323  #undef acc3
  1324  #undef acc4
  1325  #undef acc5
  1326  #undef t0
  1327  #undef t1
  1328  /* ---------------------------------------*/
        // New mapping for the internal helpers. Note that acc0..acc5 now name
        // different registers than above (acc0 = BX, acc1 = CX, acc2 = R8, ...),
        // mul0/mul1 are the implicit MULQ operands AX/DX, and t2/t3 take over DI/SI.
  1329  #define mul0 AX
  1330  #define mul1 DX
  1331  #define acc0 BX
  1332  #define acc1 CX
  1333  #define acc2 R8
  1334  #define acc3 R9
  1335  #define acc4 R10
  1336  #define acc5 R11
  1337  #define acc6 R12
  1338  #define acc7 R13
  1339  #define t0 R14
  1340  #define t1 R15
  1341  #define t2 DI
  1342  #define t3 SI
  1343  #define hlp BP
  1344  /* ---------------------------------------*/
        // sm2P256SubInternal computes [acc4..acc7] - [t0..t3] and, when that
        // subtraction borrows, adds the prime from p256p<> back in. Operands and
        // result are passed in registers; the routine does not touch memory other
        // than reading p256p<>.
        //   in : [acc4..acc7] minuend, [t0..t3] subtrahend (t0..t3 are not modified)
        //   out: [acc4..acc7]
        //   overwrites: mul0, acc0..acc3, flags
  1345  TEXT sm2P256SubInternal(SB),NOSPLIT,$0
  1346  	XORQ mul0, mul0
        	// Raw 256-bit subtraction; the final SBBQ turns mul0 into all ones if
        	// the chain borrowed, and leaves it zero otherwise.
  1347  	SUBQ t0, acc4
  1348  	SBBQ t1, acc5
  1349  	SBBQ t2, acc6
  1350  	SBBQ t3, acc7
  1351  	SBBQ $0, mul0
  1352  
        	// Keep a copy of the raw difference.
  1353  	MOVQ acc4, acc0
  1354  	MOVQ acc5, acc1
  1355  	MOVQ acc6, acc2
  1356  	MOVQ acc7, acc3
  1357  
        	// Speculatively add the prime. Limbs 0 and 2 of p256p<> are all ones,
        	// hence the $-1 immediates; limbs 1 and 3 are read from the table.
  1358  	ADDQ $-1, acc4
  1359  	ADCQ p256p<>+0x08(SB), acc5
  1360  	ADCQ $-1, acc6
  1361  	ADCQ p256p<>+0x018(SB), acc7
        	// ZF is set here exactly when the subtraction did not borrow.
  1362  	ANDQ $1, mul0
  1363  
        	// No borrow: restore the raw difference. Borrow: keep difference + p.
  1364  	CMOVQEQ acc0, acc4
  1365  	CMOVQEQ acc1, acc5
  1366  	CMOVQEQ acc2, acc6
  1367  	CMOVQEQ acc3, acc7
  1368  
  1369  	RET
  1370  /* ---------------------------------------*/
        // sm2P256MulInternal multiplies [acc4..acc7] by [t0..t3] and reduces the
        // 512-bit product with four word-sized reduction steps against the prime
        // in p256p<>, so the result is congruent to a * b * 2^-256 modulo that
        // prime. One conditional subtraction of the prime is applied at the end.
        //   in : [acc4..acc7], [t0..t3] (t0..t3 are only read)
        //   out: [acc4..acc7]
        //   overwrites: mul0, mul1, acc0..acc3, hlp (BP), flags
        // NOTE(review): the 8-byte frame is presumably declared so the assembler
        // preserves BP, which this routine uses as hlp - confirm.
  1371  TEXT sm2P256MulInternal(SB),NOSPLIT,$8
        	// Row 0: acc4 * [t0..t3] -> acc0..acc3, top limb goes back into acc4.
  1372  	MOVQ acc4, mul0
  1373  	MULQ t0
  1374  	MOVQ mul0, acc0
  1375  	MOVQ mul1, acc1
  1376  
  1377  	MOVQ acc4, mul0
  1378  	MULQ t1
  1379  	ADDQ mul0, acc1
  1380  	ADCQ $0, mul1
  1381  	MOVQ mul1, acc2
  1382  
  1383  	MOVQ acc4, mul0
  1384  	MULQ t2
  1385  	ADDQ mul0, acc2
  1386  	ADCQ $0, mul1
  1387  	MOVQ mul1, acc3
  1388  
  1389  	MOVQ acc4, mul0
  1390  	MULQ t3
  1391  	ADDQ mul0, acc3
  1392  	ADCQ $0, mul1
  1393  	MOVQ mul1, acc4
  1394  
        	// Row 1: acc5 * [t0..t3] accumulated into acc1..acc4; hlp carries the
        	// high word between columns, top limb goes back into acc5.
  1395  	MOVQ acc5, mul0
  1396  	MULQ t0
  1397  	ADDQ mul0, acc1
  1398  	ADCQ $0, mul1
  1399  	MOVQ mul1, hlp
  1400  
  1401  	MOVQ acc5, mul0
  1402  	MULQ t1
  1403  	ADDQ hlp, acc2
  1404  	ADCQ $0, mul1
  1405  	ADDQ mul0, acc2
  1406  	ADCQ $0, mul1
  1407  	MOVQ mul1, hlp
  1408  
  1409  	MOVQ acc5, mul0
  1410  	MULQ t2
  1411  	ADDQ hlp, acc3
  1412  	ADCQ $0, mul1
  1413  	ADDQ mul0, acc3
  1414  	ADCQ $0, mul1
  1415  	MOVQ mul1, hlp
  1416  
  1417  	MOVQ acc5, mul0
  1418  	MULQ t3
  1419  	ADDQ hlp, acc4
  1420  	ADCQ $0, mul1
  1421  	ADDQ mul0, acc4
  1422  	ADCQ $0, mul1
  1423  	MOVQ mul1, acc5
  1424  
        	// Row 2: acc6 * [t0..t3] accumulated into acc2..acc5, top limb -> acc6.
  1425  	MOVQ acc6, mul0
  1426  	MULQ t0
  1427  	ADDQ mul0, acc2
  1428  	ADCQ $0, mul1
  1429  	MOVQ mul1, hlp
  1430  
  1431  	MOVQ acc6, mul0
  1432  	MULQ t1
  1433  	ADDQ hlp, acc3
  1434  	ADCQ $0, mul1
  1435  	ADDQ mul0, acc3
  1436  	ADCQ $0, mul1
  1437  	MOVQ mul1, hlp
  1438  
  1439  	MOVQ acc6, mul0
  1440  	MULQ t2
  1441  	ADDQ hlp, acc4
  1442  	ADCQ $0, mul1
  1443  	ADDQ mul0, acc4
  1444  	ADCQ $0, mul1
  1445  	MOVQ mul1, hlp
  1446  
  1447  	MOVQ acc6, mul0
  1448  	MULQ t3
  1449  	ADDQ hlp, acc5
  1450  	ADCQ $0, mul1
  1451  	ADDQ mul0, acc5
  1452  	ADCQ $0, mul1
  1453  	MOVQ mul1, acc6
  1454  
        	// Row 3: acc7 * [t0..t3] accumulated into acc3..acc6, top limb -> acc7.
        	// Afterwards the product is acc0..acc3 (low half), acc4..acc7 (high half).
  1455  	MOVQ acc7, mul0
  1456  	MULQ t0
  1457  	ADDQ mul0, acc3
  1458  	ADCQ $0, mul1
  1459  	MOVQ mul1, hlp
  1460  
  1461  	MOVQ acc7, mul0
  1462  	MULQ t1
  1463  	ADDQ hlp, acc4
  1464  	ADCQ $0, mul1
  1465  	ADDQ mul0, acc4
  1466  	ADCQ $0, mul1
  1467  	MOVQ mul1, hlp
  1468  
  1469  	MOVQ acc7, mul0
  1470  	MULQ t2
  1471  	ADDQ hlp, acc5
  1472  	ADCQ $0, mul1
  1473  	ADDQ mul0, acc5
  1474  	ADCQ $0, mul1
  1475  	MOVQ mul1, hlp
  1476  
  1477  	MOVQ acc7, mul0
  1478  	MULQ t3
  1479  	ADDQ hlp, acc6
  1480  	ADCQ $0, mul1
  1481  	ADDQ mul0, acc6
  1482  	ADCQ $0, mul1
  1483  	MOVQ mul1, acc7
        	// Each reduction step takes the lowest remaining limb x and adds x times
        	// the prime to the running value, which cancels that limb (the lowest
        	// word of p256p<> is all ones). The add chain contributes x at the next
        	// limb and at the new top limb; the subtract chain removes x<<32 and
        	// x>>32 at the positions of the 2^96 and 2^224 terms. The register that
        	// held x is reused as the new top limb.
  1484  	// First reduction step
  1485  	MOVQ acc0, mul0
  1486  	MOVQ acc0, mul1
  1487  	SHLQ $32, mul0
  1488  	SHRQ $32, mul1
  1489  
  1490  	ADDQ acc0, acc1
  1491  	ADCQ $0, acc2
  1492  	ADCQ $0, acc3
  1493  	ADCQ $0, acc0
  1494  	
  1495  	SUBQ mul0, acc1
  1496  	SBBQ mul1, acc2
  1497  	SBBQ mul0, acc3
  1498  	SBBQ mul1, acc0
  1499  	// Second reduction step
  1500  	MOVQ acc1, mul0
  1501  	MOVQ acc1, mul1
  1502  	SHLQ $32, mul0
  1503  	SHRQ $32, mul1
  1504  
  1505  	ADDQ acc1, acc2
  1506  	ADCQ $0, acc3
  1507  	ADCQ $0, acc0
  1508  	ADCQ $0, acc1
  1509  	
  1510  	SUBQ mul0, acc2
  1511  	SBBQ mul1, acc3
  1512  	SBBQ mul0, acc0
  1513  	SBBQ mul1, acc1
  1514  	// Third reduction step
  1515  	MOVQ acc2, mul0
  1516  	MOVQ acc2, mul1
  1517  	SHLQ $32, mul0
  1518  	SHRQ $32, mul1
  1519  
  1520  	ADDQ acc2, acc3
  1521  	ADCQ $0, acc0
  1522  	ADCQ $0, acc1
  1523  	ADCQ $0, acc2
  1524  	
  1525  	SUBQ mul0, acc3
  1526  	SBBQ mul1, acc0
  1527  	SBBQ mul0, acc1
  1528  	SBBQ mul1, acc2
  1529  	// Last reduction step
  1530  	MOVQ acc3, mul0
  1531  	MOVQ acc3, mul1
  1532  	SHLQ $32, mul0
  1533  	SHRQ $32, mul1
  1534  
  1535  	ADDQ acc3, acc0
  1536  	ADCQ $0, acc1
  1537  	ADCQ $0, acc2
  1538  	ADCQ $0, acc3
  1539  	
  1540  	SUBQ mul0, acc0
  1541  	SBBQ mul1, acc1
  1542  	SBBQ mul0, acc2
  1543  	SBBQ mul1, acc3
  1544  	MOVQ $0, hlp
  1545  	// Add bits [511:256] of the result
        	// The chain starts with ADDQ, not ADCQ: MOVQ leaves the flags alone, so
        	// an ADCQ here would consume whatever the SBBQ chain above left in CF.
        	// That flag is not a carry owed to this sum. If the ADCQ $0 on the top
        	// limb wraps to zero, the following SBBQ borrows to bring the limb back
        	// to its exact value and leaves CF = 1, which would make the result off
        	// by one.
  1546  	ADDQ acc0, acc4
  1547  	ADCQ acc1, acc5
  1548  	ADCQ acc2, acc6
  1549  	ADCQ acc3, acc7
  1550  	ADCQ $0, hlp
  1551  	// Copy result
  1552  	MOVQ acc4, acc0
  1553  	MOVQ acc5, acc1
  1554  	MOVQ acc6, acc2
  1555  	MOVQ acc7, acc3
  1556  	// Subtract p256
  1557  	SUBQ $-1, acc4
  1558  	SBBQ p256p<>+0x08(SB), acc5
  1559  	SBBQ $-1, acc6
  1560  	SBBQ p256p<>+0x018(SB), acc7
  1561  	SBBQ $0, hlp
  1562  	// If the result of the subtraction is negative, restore the previous result
  1563  	CMOVQCS acc0, acc4
  1564  	CMOVQCS acc1, acc5
  1565  	CMOVQCS acc2, acc6
  1566  	CMOVQCS acc3, acc7
  1567  
  1568  	RET
  1569  /* ---------------------------------------*/
        // sm2P256SqrInternal squares [acc4..acc7] and reduces the 512-bit square
        // with four word-sized reduction steps against the prime in p256p<>, so
        // the result is congruent to a^2 * 2^-256 modulo that prime. One
        // conditional subtraction of the prime is applied at the end.
        //   in : [acc4..acc7]
        //   out: [acc4..acc7]
        //   overwrites: mul0, mul1, acc0..acc3, t0..t3, hlp (BP), flags
        // NOTE(review): the 8-byte frame is presumably declared so the assembler
        // preserves BP, which this routine uses as hlp - confirm.
  1570  TEXT sm2P256SqrInternal(SB),NOSPLIT,$8
  1571  
        	// Off-diagonal products a[i]*a[j], i < j, accumulated into
        	// acc1, acc2, acc3, t0, t1, t2 (limbs 1..6 of the square).
  1572  	MOVQ acc4, mul0
  1573  	MULQ acc5
  1574  	MOVQ mul0, acc1
  1575  	MOVQ mul1, acc2
  1576  
  1577  	MOVQ acc4, mul0
  1578  	MULQ acc6
  1579  	ADDQ mul0, acc2
  1580  	ADCQ $0, mul1
  1581  	MOVQ mul1, acc3
  1582  
  1583  	MOVQ acc4, mul0
  1584  	MULQ acc7
  1585  	ADDQ mul0, acc3
  1586  	ADCQ $0, mul1
  1587  	MOVQ mul1, t0
  1588  
  1589  	MOVQ acc5, mul0
  1590  	MULQ acc6
  1591  	ADDQ mul0, acc3
  1592  	ADCQ $0, mul1
  1593  	MOVQ mul1, hlp
  1594  
  1595  	MOVQ acc5, mul0
  1596  	MULQ acc7
  1597  	ADDQ hlp, t0
  1598  	ADCQ $0, mul1
  1599  	ADDQ mul0, t0
  1600  	ADCQ $0, mul1
  1601  	MOVQ mul1, t1
  1602  
  1603  	MOVQ acc6, mul0
  1604  	MULQ acc7
  1605  	ADDQ mul0, t1
  1606  	ADCQ $0, mul1
  1607  	MOVQ mul1, t2
  1608  	XORQ t3, t3
        	// Double the off-diagonal sum; the bit shifted out of t2 lands in t3.
  1609  	// *2
  1610  	ADDQ acc1, acc1
  1611  	ADCQ acc2, acc2
  1612  	ADCQ acc3, acc3
  1613  	ADCQ t0, t0
  1614  	ADCQ t1, t1
  1615  	ADCQ t2, t2
  1616  	ADCQ $0, t3
        	// Add the diagonal squares a[i]^2. acc4 is reused to carry the high
        	// word of each square into the next pair of limbs.
  1617  	// Missing products
  1618  	MOVQ acc4, mul0
  1619  	MULQ mul0
  1620  	MOVQ mul0, acc0
  1621  	MOVQ DX, acc4
  1622  
  1623  	MOVQ acc5, mul0
  1624  	MULQ mul0
  1625  	ADDQ acc4, acc1
  1626  	ADCQ mul0, acc2
  1627  	ADCQ $0, DX
  1628  	MOVQ DX, acc4
  1629  
  1630  	MOVQ acc6, mul0
  1631  	MULQ mul0
  1632  	ADDQ acc4, acc3
  1633  	ADCQ mul0, t0
  1634  	ADCQ $0, DX
  1635  	MOVQ DX, acc4
  1636  
  1637  	MOVQ acc7, mul0
  1638  	MULQ mul0
  1639  	ADDQ acc4, t1
  1640  	ADCQ mul0, t2
  1641  	ADCQ DX, t3
        	// The square is now acc0..acc3 (low half), t0..t3 (high half).
        	// Each reduction step takes the lowest remaining limb x and adds x times
        	// the prime to the running value, which cancels that limb (the lowest
        	// word of p256p<> is all ones). The register that held x is reused as
        	// the new top limb.
  1642  	// First reduction step
  1643  	MOVQ acc0, mul0
  1644  	MOVQ acc0, mul1
  1645  	SHLQ $32, mul0
  1646  	SHRQ $32, mul1
  1647  
  1648  	ADDQ acc0, acc1
  1649  	ADCQ $0, acc2
  1650  	ADCQ $0, acc3
  1651  	ADCQ $0, acc0
  1652  	
  1653  	SUBQ mul0, acc1
  1654  	SBBQ mul1, acc2
  1655  	SBBQ mul0, acc3
  1656  	SBBQ mul1, acc0
  1657  	// Second reduction step
  1658  	MOVQ acc1, mul0
  1659  	MOVQ acc1, mul1
  1660  	SHLQ $32, mul0
  1661  	SHRQ $32, mul1
  1662  
  1663  	ADDQ acc1, acc2
  1664  	ADCQ $0, acc3
  1665  	ADCQ $0, acc0
  1666  	ADCQ $0, acc1
  1667  	
  1668  	SUBQ mul0, acc2
  1669  	SBBQ mul1, acc3
  1670  	SBBQ mul0, acc0
  1671  	SBBQ mul1, acc1
  1672  	// Third reduction step
  1673  	MOVQ acc2, mul0
  1674  	MOVQ acc2, mul1
  1675  	SHLQ $32, mul0
  1676  	SHRQ $32, mul1
  1677  
  1678  	ADDQ acc2, acc3
  1679  	ADCQ $0, acc0
  1680  	ADCQ $0, acc1
  1681  	ADCQ $0, acc2
  1682  	
  1683  	SUBQ mul0, acc3
  1684  	SBBQ mul1, acc0
  1685  	SBBQ mul0, acc1
  1686  	SBBQ mul1, acc2
  1687  	// Last reduction step
  1688  	MOVQ acc3, mul0
  1689  	MOVQ acc3, mul1
  1690  	SHLQ $32, mul0
  1691  	SHRQ $32, mul1
  1692  
  1693  	ADDQ acc3, acc0
  1694  	ADCQ $0, acc1
  1695  	ADCQ $0, acc2
  1696  	ADCQ $0, acc3
  1697  	
  1698  	SUBQ mul0, acc0
  1699  	SBBQ mul1, acc1
  1700  	SBBQ mul0, acc2
  1701  	SBBQ mul1, acc3
  1702  	MOVQ $0, hlp
  1703  	// Add bits [511:256] of the result
        	// The chain starts with ADDQ, not ADCQ: MOVQ leaves the flags alone, so
        	// an ADCQ here would consume whatever the SBBQ chain above left in CF.
        	// That flag is not a carry owed to this sum. If the ADCQ $0 on the top
        	// limb wraps to zero, the following SBBQ borrows to bring the limb back
        	// to its exact value and leaves CF = 1, which would make the result off
        	// by one.
  1704  	ADDQ acc0, t0
  1705  	ADCQ acc1, t1
  1706  	ADCQ acc2, t2
  1707  	ADCQ acc3, t3
  1708  	ADCQ $0, hlp
  1709  	// Copy result
  1710  	MOVQ t0, acc4
  1711  	MOVQ t1, acc5
  1712  	MOVQ t2, acc6
  1713  	MOVQ t3, acc7
  1714  	// Subtract p256
  1715  	SUBQ $-1, acc4
  1716  	SBBQ p256p<>+0x08(SB), acc5
  1717  	SBBQ $-1, acc6
  1718  	SBBQ p256p<>+0x018(SB), acc7
  1719  	SBBQ $0, hlp
  1720  	// If the result of the subtraction is negative, restore the previous result
  1721  	CMOVQCS t0, acc4
  1722  	CMOVQCS t1, acc5
  1723  	CMOVQCS t2, acc6
  1724  	CMOVQCS t3, acc7
  1725  
  1726  	RET
  1727  /* ---------------------------------------*/
        // p256MulBy2Inline doubles [acc4..acc7] in place, catching the bit shifted
        // out in mul0, then writes the doubled value minus the prime from p256p<>
        // to [t0..t3]. If that subtraction borrows (CF set after the final SBBQ),
        // [t0..t3] is restored to the doubled value instead.
        //   in : [acc4..acc7]
        //   out: [t0..t3]; [acc4..acc7] is left holding the low 256 bits of 2*a
        //   overwrites: mul0, flags
  1728  #define p256MulBy2Inline\
  1729  	XORQ mul0, mul0;\
  1730  	ADDQ acc4, acc4;\
  1731  	ADCQ acc5, acc5;\
  1732  	ADCQ acc6, acc6;\
  1733  	ADCQ acc7, acc7;\
  1734  	ADCQ $0, mul0;\
  1735  	MOVQ acc4, t0;\
  1736  	MOVQ acc5, t1;\
  1737  	MOVQ acc6, t2;\
  1738  	MOVQ acc7, t3;\
  1739  	SUBQ $-1, t0;\
  1740  	SBBQ p256p<>+0x08(SB), t1;\
  1741  	SBBQ $-1, t2;\
  1742  	SBBQ p256p<>+0x018(SB), t3;\
  1743  	SBBQ $0, mul0;\
  1744  	CMOVQCS acc4, t0;\
  1745  	CMOVQCS acc5, t1;\
  1746  	CMOVQCS acc6, t2;\
  1747  	CMOVQCS acc7, t3;
  1748  /* ---------------------------------------*/
        // p256AddInline adds [t0..t3] into [acc4..acc7], catching the carry out in
        // mul0, then writes the sum minus the prime from p256p<> to [t0..t3]. If
        // that subtraction borrows (CF set after the final SBBQ), [t0..t3] is
        // restored to the plain sum instead.
        //   in : [acc4..acc7], [t0..t3]
        //   out: [t0..t3]; [acc4..acc7] is left holding the low 256 bits of a + b
        //   overwrites: mul0, flags
  1749  #define p256AddInline \
  1750  	XORQ mul0, mul0;\
  1751  	ADDQ t0, acc4;\
  1752  	ADCQ t1, acc5;\
  1753  	ADCQ t2, acc6;\
  1754  	ADCQ t3, acc7;\
  1755  	ADCQ $0, mul0;\
  1756  	MOVQ acc4, t0;\
  1757  	MOVQ acc5, t1;\
  1758  	MOVQ acc6, t2;\
  1759  	MOVQ acc7, t3;\
  1760  	SUBQ $-1, t0;\
  1761  	SBBQ p256p<>+0x08(SB), t1;\
  1762  	SBBQ $-1, t2;\
  1763  	SBBQ p256p<>+0x018(SB), t3;\
  1764  	SBBQ $0, mul0;\
  1765  	CMOVQCS acc4, t0;\
  1766  	CMOVQCS acc5, t1;\
  1767  	CMOVQCS acc6, t2;\
  1768  	CMOVQCS acc7, t3;
  1769  /* ---------------------------------------*/
        // Four-limb move helpers. src/dst name a slot macro that takes a byte
        // offset (such as x1in or zout defined further down); limbs are 8 bytes
        // apart.
        //   LDacc(src): slot -> [acc4..acc7]      LDt(src): slot -> [t0..t3]
        //   ST(dst)   : [acc4..acc7] -> slot      STt(dst): [t0..t3] -> slot
        //   acc2t     : [acc4..acc7] -> [t0..t3]  t2acc   : [t0..t3] -> [acc4..acc7]
  1770  #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
  1771  #define LDt(src)   MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
  1772  #define ST(dst)    MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
  1773  #define STt(dst)   MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
  1774  #define acc2t      MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
  1775  #define t2acc      MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
  1776  /* ---------------------------------------*/
        // Local frame layout for p256PointAddAffineAsm: 32-byte slots addressed
        // off SP. Slots 0..4 hold copies of the inputs, 5..7 the computed output,
        // 8..14 intermediates. Slot 15 holds the saved result pointer (8 bytes)
        // followed by the 32-bit copies of sel and zero.
  1777  #define x1in(off) (32*0 + off)(SP)
  1778  #define y1in(off) (32*1 + off)(SP)
  1779  #define z1in(off) (32*2 + off)(SP)
  1780  #define x2in(off) (32*3 + off)(SP)
  1781  #define y2in(off) (32*4 + off)(SP)
  1782  #define xout(off) (32*5 + off)(SP)
  1783  #define yout(off) (32*6 + off)(SP)
  1784  #define zout(off) (32*7 + off)(SP)
  1785  #define s2(off)   (32*8 + off)(SP)
  1786  #define z1sqr(off) (32*9 + off)(SP)
  1787  #define h(off)	  (32*10 + off)(SP)
  1788  #define r(off)	  (32*11 + off)(SP)
  1789  #define hsqr(off) (32*12 + off)(SP)
  1790  #define rsqr(off) (32*13 + off)(SP)
  1791  #define hcub(off) (32*14 + off)(SP)
  1792  #define rptr	  (32*15)(SP)
  1793  #define sel_save  (32*15 + 8)(SP)
  1794  #define zero_save (32*15 + 8 + 4)(SP)
  1795  
        // p256PointAddAffineAsm reads 96 bytes from in1 (three 32-byte
        // coordinates x1, y1, z1) and 64 bytes from in2 (x2, y2), and writes 96
        // bytes to res.
        //   sign: when non-zero, y2 is replaced by (prime - y2) before the addition.
        //   sel : when its low 32 bits are zero, the output is the copy of in1.
        //   zero: when its low 32 bits are zero, the output is (x2, y2 as stored
        //         after the sign step, p256one<>).
        // Both selections are done with SSE masks, not branches. The zero
        // selection is applied last, so it wins if both sel and zero are zero.
  1796  // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
  1797  TEXT ·p256PointAddAffineAsm(SB),0,$512-96
  1798  	// Move input to stack in order to free registers
  1799  	MOVQ res+0(FP), AX
  1800  	MOVQ in1+24(FP), BX
  1801  	MOVQ in2+48(FP), CX
  1802  	MOVQ sign+72(FP), DX
  1803  	MOVQ sel+80(FP), t1
  1804  	MOVQ zero+88(FP), t2
  1805  
  1806  	MOVOU (16*0)(BX), X0
  1807  	MOVOU (16*1)(BX), X1
  1808  	MOVOU (16*2)(BX), X2
  1809  	MOVOU (16*3)(BX), X3
  1810  	MOVOU (16*4)(BX), X4
  1811  	MOVOU (16*5)(BX), X5
  1812  
  1813  	MOVOU X0, x1in(16*0)
  1814  	MOVOU X1, x1in(16*1)
  1815  	MOVOU X2, y1in(16*0)
  1816  	MOVOU X3, y1in(16*1)
  1817  	MOVOU X4, z1in(16*0)
  1818  	MOVOU X5, z1in(16*1)
  1819  
  1820  	MOVOU (16*0)(CX), X0
  1821  	MOVOU (16*1)(CX), X1
  1822  
  1823  	MOVOU X0, x2in(16*0)
  1824  	MOVOU X1, x2in(16*1)
  1825  	// Store pointer to result
        	// mul0 is AX, which still holds res. Only the low 32 bits of sel and
        	// zero are kept (MOVL).
  1826  	MOVQ mul0, rptr
  1827  	MOVL t1, sel_save
  1828  	MOVL t2, zero_save
  1829  	// Negate y2in based on sign
        	// y2 must be loaded through CX before acc1 is written, because acc1
        	// is an alias for CX.
  1830  	MOVQ (16*2 + 8*0)(CX), acc4
  1831  	MOVQ (16*2 + 8*1)(CX), acc5
  1832  	MOVQ (16*2 + 8*2)(CX), acc6
  1833  	MOVQ (16*2 + 8*3)(CX), acc7
        	// [acc0..acc3] = the prime (limbs 0 and 2 are all ones).
  1834  	MOVQ $-1, acc0
  1835  	MOVQ p256p<>+0x08(SB), acc1
  1836  	MOVQ $-1, acc2
  1837  	MOVQ p256p<>+0x018(SB), acc3
  1838  	XORQ mul0, mul0
  1839  	// Speculatively subtract
  1840  	SUBQ acc4, acc0
  1841  	SBBQ acc5, acc1
  1842  	SBBQ acc6, acc2
  1843  	SBBQ acc7, acc3
  1844  	SBBQ $0, mul0
  1845  	MOVQ acc0, t0
  1846  	MOVQ acc1, t1
  1847  	MOVQ acc2, t2
  1848  	MOVQ acc3, t3
  1849  	// Add in case the operand was > p256
        	// The CMOVQNE group tests the flags of the final ADCQ on mul0: the
        	// value saved before this addition is restored whenever mul0 ends up
        	// non-zero.
  1850  	ADDQ $-1, acc0
  1851  	ADCQ p256p<>+0x08(SB), acc1
  1852  	ADCQ $-1, acc2
  1853  	ADCQ p256p<>+0x018(SB), acc3
  1854  	ADCQ $0, mul0
  1855  	CMOVQNE t0, acc0
  1856  	CMOVQNE t1, acc1
  1857  	CMOVQNE t2, acc2
  1858  	CMOVQNE t3, acc3
  1859  	// If condition is 0, keep original value
  1860  	TESTQ DX, DX
  1861  	CMOVQEQ acc4, acc0
  1862  	CMOVQEQ acc5, acc1
  1863  	CMOVQEQ acc6, acc2
  1864  	CMOVQEQ acc7, acc3
  1865  	// Store result
  1866  	MOVQ acc0, y2in(8*0)
  1867  	MOVQ acc1, y2in(8*1)
  1868  	MOVQ acc2, y2in(8*2)
  1869  	MOVQ acc3, y2in(8*3)
  1870  	// Begin point add
        	// The helpers take operands in [acc4..acc7] and [t0..t3] and return
        	// in [acc4..acc7]; where a step below has no LDacc, it continues from
        	// the previous result.
  1871  	LDacc (z1in)
  1872  	CALL sm2P256SqrInternal(SB)	// z1ˆ2
  1873  	ST (z1sqr)
  1874  
  1875  	LDt (x2in)
  1876  	CALL sm2P256MulInternal(SB)	// x2 * z1ˆ2
  1877  
  1878  	LDt (x1in)
  1879  	CALL sm2P256SubInternal(SB)	// h = u2 - u1
  1880  	ST (h)
  1881  
  1882  	LDt (z1in)
  1883  	CALL sm2P256MulInternal(SB)	// z3 = h * z1
  1884  	ST (zout)
  1885  
        	// [t0..t3] still holds z1 here: the multiply helper only reads it.
  1886  	LDacc (z1sqr)
  1887  	CALL sm2P256MulInternal(SB)	// z1ˆ3
  1888  
  1889  	LDt (y2in)
  1890  	CALL sm2P256MulInternal(SB)	// s2 = y2 * z1ˆ3
  1891  	ST (s2)
  1892  
  1893  	LDt (y1in)
  1894  	CALL sm2P256SubInternal(SB)	// r = s2 - s1
  1895  	ST (r)
  1896  
  1897  	CALL sm2P256SqrInternal(SB)	// rsqr = rˆ2
  1898  	ST (rsqr)
  1899  
  1900  	LDacc (h)
  1901  	CALL sm2P256SqrInternal(SB)	// hsqr = hˆ2
  1902  	ST (hsqr)
  1903  
  1904  	LDt (h)
  1905  	CALL sm2P256MulInternal(SB)	// hcub = hˆ3
  1906  	ST (hcub)
  1907  
        	// The s2 slot is reused for y1 * hˆ3.
  1908  	LDt (y1in)
  1909  	CALL sm2P256MulInternal(SB)	// y1 * hˆ3
  1910  	ST (s2)
  1911  
        	// The h slot is reused for u1 * hˆ2.
  1912  	LDacc (x1in)
  1913  	LDt (hsqr)
  1914  	CALL sm2P256MulInternal(SB)	// u1 * hˆ2
  1915  	ST (h)
  1916  
  1917  	p256MulBy2Inline			// u1 * hˆ2 * 2, inline
  1918  	LDacc (rsqr)
  1919  	CALL sm2P256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  1920  
        	// xout = rˆ2 - 2 * u1 * hˆ2 - hˆ3
  1921  	LDt (hcub)
  1922  	CALL sm2P256SubInternal(SB)
  1923  	ST (xout)
  1924  
        	// yout = (u1 * hˆ2 - xout) * r - y1 * hˆ3
  1925  	MOVQ acc4, t0
  1926  	MOVQ acc5, t1
  1927  	MOVQ acc6, t2
  1928  	MOVQ acc7, t3
  1929  	LDacc (h)
  1930  	CALL sm2P256SubInternal(SB)
  1931  
  1932  	LDt (r)
  1933  	CALL sm2P256MulInternal(SB)
  1934  
  1935  	LDt (s2)
  1936  	CALL sm2P256SubInternal(SB)
  1937  	ST (yout)
  1938  	// Load stored values from stack
  1939  	MOVQ rptr, AX
  1940  	MOVL sel_save, BX
  1941  	MOVL zero_save, CX
  1942  	// The result is not valid if (sel == 0), conditional choose
  1943  	MOVOU xout(16*0), X0
  1944  	MOVOU xout(16*1), X1
  1945  	MOVOU yout(16*0), X2
  1946  	MOVOU yout(16*1), X3
  1947  	MOVOU zout(16*0), X4
  1948  	MOVOU zout(16*1), X5
  1949  
  1950  	MOVL BX, X6
  1951  	MOVL CX, X7
  1952  
        	// X8 = all zeros, X9 = all ones.
  1953  	PXOR X8, X8
  1954  	PCMPEQL X9, X9
  1955  
        	// Broadcast sel and zero to every 32-bit lane, then turn them into
        	// masks: X6 = all ones iff sel == 0, X7 = all ones iff zero == 0.
  1956  	PSHUFD $0, X6, X6
  1957  	PSHUFD $0, X7, X7
  1958  
  1959  	PCMPEQL X8, X6
  1960  	PCMPEQL X8, X7
  1961  
        	// X15 = NOT X6.
  1962  	MOVOU X6, X15
  1963  	PANDN X9, X15
  1964  
  1965  	MOVOU x1in(16*0), X9
  1966  	MOVOU x1in(16*1), X10
  1967  	MOVOU y1in(16*0), X11
  1968  	MOVOU y1in(16*1), X12
  1969  	MOVOU z1in(16*0), X13
  1970  	MOVOU z1in(16*1), X14
  1971  
        	// out = (computed AND NOT mask) XOR (in1 AND mask)
  1972  	PAND X15, X0
  1973  	PAND X15, X1
  1974  	PAND X15, X2
  1975  	PAND X15, X3
  1976  	PAND X15, X4
  1977  	PAND X15, X5
  1978  
  1979  	PAND X6, X9
  1980  	PAND X6, X10
  1981  	PAND X6, X11
  1982  	PAND X6, X12
  1983  	PAND X6, X13
  1984  	PAND X6, X14
  1985  
  1986  	PXOR X9, X0
  1987  	PXOR X10, X1
  1988  	PXOR X11, X2
  1989  	PXOR X12, X3
  1990  	PXOR X13, X4
  1991  	PXOR X14, X5
  1992  	// Similarly if zero == 0
        	// X9 is rebuilt as all ones because it was overwritten above.
  1993  	PCMPEQL X9, X9
  1994  	MOVOU X7, X15
  1995  	PANDN X9, X15
  1996  
  1997  	MOVOU x2in(16*0), X9
  1998  	MOVOU x2in(16*1), X10
  1999  	MOVOU y2in(16*0), X11
  2000  	MOVOU y2in(16*1), X12
  2001  	MOVOU p256one<>+0x00(SB), X13
  2002  	MOVOU p256one<>+0x10(SB), X14
  2003  
  2004  	PAND X15, X0
  2005  	PAND X15, X1
  2006  	PAND X15, X2
  2007  	PAND X15, X3
  2008  	PAND X15, X4
  2009  	PAND X15, X5
  2010  
  2011  	PAND X7, X9
  2012  	PAND X7, X10
  2013  	PAND X7, X11
  2014  	PAND X7, X12
  2015  	PAND X7, X13
  2016  	PAND X7, X14
  2017  
  2018  	PXOR X9, X0
  2019  	PXOR X10, X1
  2020  	PXOR X11, X2
  2021  	PXOR X12, X3
  2022  	PXOR X13, X4
  2023  	PXOR X14, X5
  2024  	// Finally output the result
  2025  	MOVOU X0, (16*0)(AX)
  2026  	MOVOU X1, (16*1)(AX)
  2027  	MOVOU X2, (16*2)(AX)
  2028  	MOVOU X3, (16*3)(AX)
  2029  	MOVOU X4, (16*4)(AX)
  2030  	MOVOU X5, (16*5)(AX)
        	// Clear the saved result pointer from the frame before returning.
  2031  	MOVQ $0, rptr
  2032  
  2033  	RET
  2034  #undef x1in
  2035  #undef y1in
  2036  #undef z1in
  2037  #undef x2in
  2038  #undef y2in
  2039  #undef xout
  2040  #undef yout
  2041  #undef zout
  2042  #undef s2
  2043  #undef z1sqr
  2044  #undef h
  2045  #undef r
  2046  #undef hsqr
  2047  #undef rsqr
  2048  #undef hcub
  2049  #undef rptr
  2050  #undef sel_save
  2051  #undef zero_save
  2052  
  2053  // sm2P256IsZero returns 1 in AX if [acc4..acc7] represents zero and zero
  2054  // otherwise. It writes to [acc4..acc7], t0 and t1.
        // Two encodings count as zero: all four limbs zero, or the four limbs
        // equal to the prime in p256p<>. The check uses conditional moves only,
        // with no branches.
  2055  TEXT sm2P256IsZero(SB),NOSPLIT,$0
  2056  	// AX contains a flag that is set if the input is zero.
  2057  	XORQ AX, AX
  2058  	MOVQ $1, t1
  2059  
  2060  	// Check whether [acc4..acc7] are all zero.
        	// ZF after the last ORQ is set exactly when every limb is zero.
  2061  	MOVQ acc4, t0
  2062  	ORQ acc5, t0
  2063  	ORQ acc6, t0
  2064  	ORQ acc7, t0
  2065  
  2066  	// Set the zero flag if so. (CMOV of a constant to a register doesn't
  2067  	// appear to be supported in Go. Thus t1 = 1.)
  2068  	CMOVQEQ t1, AX
  2069  
  2070  	// XOR [acc4..acc7] with P and compare with zero again.
        	// Limbs 0 and 2 of P are all ones, hence the $-1 immediates. The
        	// limbs are OR-ed into acc4, so the input registers are destroyed.
  2071  	XORQ $-1, acc4
  2072  	XORQ p256p<>+0x08(SB), acc5
  2073  	XORQ $-1, acc6
  2074  	XORQ p256p<>+0x018(SB), acc7
  2075  	ORQ acc5, acc4
  2076  	ORQ acc6, acc4
  2077  	ORQ acc7, acc4
  2078  
  2079  	// Set the zero flag if so.
  2080  	CMOVQEQ t1, AX
  2081  	RET
  2082  
  2083  /* ---------------------------------------*/
        // Local frame layout for p256PointAddAsm: 32-byte slots addressed off SP.
        // Slots 0..5 hold copies of the inputs, 6..8 the computed output, 9..19
        // intermediates. Slot 20 holds the saved result pointer followed by the
        // points_eq flag.
  2084  #define x1in(off) (32*0 + off)(SP)
  2085  #define y1in(off) (32*1 + off)(SP)
  2086  #define z1in(off) (32*2 + off)(SP)
  2087  #define x2in(off) (32*3 + off)(SP)
  2088  #define y2in(off) (32*4 + off)(SP)
  2089  #define z2in(off) (32*5 + off)(SP)
  2090  
  2091  #define xout(off) (32*6 + off)(SP)
  2092  #define yout(off) (32*7 + off)(SP)
  2093  #define zout(off) (32*8 + off)(SP)
  2094  
  2095  #define u1(off)    (32*9 + off)(SP)
  2096  #define u2(off)    (32*10 + off)(SP)
  2097  #define s1(off)    (32*11 + off)(SP)
  2098  #define s2(off)    (32*12 + off)(SP)
  2099  #define z1sqr(off) (32*13 + off)(SP)
  2100  #define z2sqr(off) (32*14 + off)(SP)
  2101  #define h(off)     (32*15 + off)(SP)
  2102  #define r(off)     (32*16 + off)(SP)
  2103  #define hsqr(off)  (32*17 + off)(SP)
  2104  #define rsqr(off)  (32*18 + off)(SP)
  2105  #define hcub(off)  (32*19 + off)(SP)
  2106  #define rptr       (32*20)(SP)
  2107  #define points_eq  (32*20+8)(SP)
  2108  
        // p256PointAddAsm reads 96 bytes from each of in1 and in2 (three 32-byte
        // coordinates x, y, z), writes 96 bytes to res, and returns an int.
        // The return value is 1 when both r = s2 - s1 and h = u2 - u1 are
        // reported as zero by sm2P256IsZero, and 0 otherwise. The computed point
        // is written to res in either case.
  2109  //func p256PointAddAsm(res, in1, in2 []uint64) int
  2110  TEXT ·p256PointAddAsm(SB),0,$680-80
  2111  	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
  2112  	// Move input to stack in order to free registers
  2113  	MOVQ res+0(FP), AX
  2114  	MOVQ in1+24(FP), BX
  2115  	MOVQ in2+48(FP), CX
  2116  
  2117  	MOVOU (16*0)(BX), X0
  2118  	MOVOU (16*1)(BX), X1
  2119  	MOVOU (16*2)(BX), X2
  2120  	MOVOU (16*3)(BX), X3
  2121  	MOVOU (16*4)(BX), X4
  2122  	MOVOU (16*5)(BX), X5
  2123  
  2124  	MOVOU X0, x1in(16*0)
  2125  	MOVOU X1, x1in(16*1)
  2126  	MOVOU X2, y1in(16*0)
  2127  	MOVOU X3, y1in(16*1)
  2128  	MOVOU X4, z1in(16*0)
  2129  	MOVOU X5, z1in(16*1)
  2130  
  2131  	MOVOU (16*0)(CX), X0
  2132  	MOVOU (16*1)(CX), X1
  2133  	MOVOU (16*2)(CX), X2
  2134  	MOVOU (16*3)(CX), X3
  2135  	MOVOU (16*4)(CX), X4
  2136  	MOVOU (16*5)(CX), X5
  2137  
  2138  	MOVOU X0, x2in(16*0)
  2139  	MOVOU X1, x2in(16*1)
  2140  	MOVOU X2, y2in(16*0)
  2141  	MOVOU X3, y2in(16*1)
  2142  	MOVOU X4, z2in(16*0)
  2143  	MOVOU X5, z2in(16*1)
  2144  	// Store pointer to result
  2145  	MOVQ AX, rptr
  2146  	// Begin point add
        	// The helpers take operands in [acc4..acc7] and [t0..t3] and return
        	// in [acc4..acc7]; where a step below has no LDacc, it continues from
        	// the previous result.
  2147  	LDacc (z2in)
  2148  	CALL sm2P256SqrInternal(SB)	// z2ˆ2
  2149  	ST (z2sqr)
  2150  	LDt (z2in)
  2151  	CALL sm2P256MulInternal(SB)	// z2ˆ3
  2152  	LDt (y1in)
  2153  	CALL sm2P256MulInternal(SB)	// s1 = z2ˆ3*y1
  2154  	ST (s1)
  2155  
  2156  	LDacc (z1in)
  2157  	CALL sm2P256SqrInternal(SB)	// z1ˆ2
  2158  	ST (z1sqr)
  2159  	LDt (z1in)
  2160  	CALL sm2P256MulInternal(SB)	// z1ˆ3
  2161  	LDt (y2in)
  2162  	CALL sm2P256MulInternal(SB)	// s2 = z1ˆ3*y2
  2163  	ST (s2)
  2164  
  2165  	LDt (s1)
  2166  	CALL sm2P256SubInternal(SB)	// r = s2 - s1
  2167  	ST (r)
        	// r is stored before the zero test because sm2P256IsZero destroys
        	// [acc4..acc7].
  2168  	CALL sm2P256IsZero(SB)
  2169  	MOVQ AX, points_eq
  2170  
  2171  	LDacc (z2sqr)
  2172  	LDt (x1in)
  2173  	CALL sm2P256MulInternal(SB)	// u1 = x1 * z2ˆ2
  2174  	ST (u1)
  2175  	LDacc (z1sqr)
  2176  	LDt (x2in)
  2177  	CALL sm2P256MulInternal(SB)	// u2 = x2 * z1ˆ2
  2178  	ST (u2)
  2179  
  2180  	LDt (u1)
  2181  	CALL sm2P256SubInternal(SB)	// h = u2 - u1
  2182  	ST (h)
        	// points_eq = (r is zero) AND (h is zero)
  2183  	CALL sm2P256IsZero(SB)
  2184  	ANDQ points_eq, AX
  2185  	MOVQ AX, points_eq
  2186  
  2187  	LDacc (r)
  2188  	CALL sm2P256SqrInternal(SB)	// rsqr = rˆ2
  2189  	ST (rsqr)
  2190  
  2191  	LDacc (h)
  2192  	CALL sm2P256SqrInternal(SB)	// hsqr = hˆ2
  2193  	ST (hsqr)
  2194  
  2195  	LDt (h)
  2196  	CALL sm2P256MulInternal(SB)	// hcub = hˆ3
  2197  	ST (hcub)
  2198  
        	// The s2 slot is reused for s1 * hˆ3.
  2199  	LDt (s1)
  2200  	CALL sm2P256MulInternal(SB)
  2201  	ST (s2)
  2202  
  2203  	LDacc (z1in)
  2204  	LDt (z2in)
  2205  	CALL sm2P256MulInternal(SB)	// z1 * z2
  2206  	LDt (h)
  2207  	CALL sm2P256MulInternal(SB)	// z1 * z2 * h
  2208  	ST (zout)
  2209  
        	// The u2 slot is reused for hˆ2 * u1.
  2210  	LDacc (hsqr)
  2211  	LDt (u1)
  2212  	CALL sm2P256MulInternal(SB)	// hˆ2 * u1
  2213  	ST (u2)
  2214  
  2215  	p256MulBy2Inline	// u1 * hˆ2 * 2, inline
  2216  	LDacc (rsqr)
  2217  	CALL sm2P256SubInternal(SB)	// rˆ2 - u1 * hˆ2 * 2
  2218  
        	// xout = rˆ2 - 2 * u1 * hˆ2 - hˆ3
  2219  	LDt (hcub)
  2220  	CALL sm2P256SubInternal(SB)
  2221  	ST (xout)
  2222  
        	// yout = (u1 * hˆ2 - xout) * r - s1 * hˆ3
  2223  	MOVQ acc4, t0
  2224  	MOVQ acc5, t1
  2225  	MOVQ acc6, t2
  2226  	MOVQ acc7, t3
  2227  	LDacc (u2)
  2228  	CALL sm2P256SubInternal(SB)
  2229  
  2230  	LDt (r)
  2231  	CALL sm2P256MulInternal(SB)
  2232  
  2233  	LDt (s2)
  2234  	CALL sm2P256SubInternal(SB)
  2235  	ST (yout)
  2236  
  2237  	MOVOU xout(16*0), X0
  2238  	MOVOU xout(16*1), X1
  2239  	MOVOU yout(16*0), X2
  2240  	MOVOU yout(16*1), X3
  2241  	MOVOU zout(16*0), X4
  2242  	MOVOU zout(16*1), X5
  2243  	// Finally output the result
        	// The saved result pointer is cleared from the frame once reloaded.
  2244  	MOVQ rptr, AX
  2245  	MOVQ $0, rptr
  2246  	MOVOU X0, (16*0)(AX)
  2247  	MOVOU X1, (16*1)(AX)
  2248  	MOVOU X2, (16*2)(AX)
  2249  	MOVOU X3, (16*3)(AX)
  2250  	MOVOU X4, (16*4)(AX)
  2251  	MOVOU X5, (16*5)(AX)
  2252  
  2253  	MOVQ points_eq, AX
  2254  	MOVQ AX, ret+72(FP)
  2255  
  2256  	RET
        // NOTE(review): points_eq is defined above but has no matching #undef in
        // the list below.
  2257  #undef x1in
  2258  #undef y1in
  2259  #undef z1in
  2260  #undef x2in
  2261  #undef y2in
  2262  #undef z2in
  2263  #undef xout
  2264  #undef yout
  2265  #undef zout
  2266  #undef s1
  2267  #undef s2
  2268  #undef u1
  2269  #undef u2
  2270  #undef z1sqr
  2271  #undef z2sqr
  2272  #undef h
  2273  #undef r
  2274  #undef hsqr
  2275  #undef rsqr
  2276  #undef hcub
  2277  #undef rptr
  2278  /* ---------------------------------------*/
        // Local frame layout for p256PointDoubleAsm: 32-byte slots addressed off
        // SP. Slots 0..2 hold a copy of the input, 3..6 intermediates, and slot 7
        // the saved result pointer.
  2279  #define x(off) (32*0 + off)(SP)
  2280  #define y(off) (32*1 + off)(SP)
  2281  #define z(off) (32*2 + off)(SP)
  2282  
  2283  #define s(off)	(32*3 + off)(SP)
  2284  #define m(off)	(32*4 + off)(SP)
  2285  #define zsqr(off) (32*5 + off)(SP)
  2286  #define tmp(off)  (32*6 + off)(SP)
  2287  #define rptr	  (32*7)(SP)
  2288  
        // p256PointDoubleAsm reads 96 bytes from in (three 32-byte coordinates
        // x, y, z) and writes 96 bytes to res. The input is copied to the frame
        // before anything is stored through res. Output coordinates are written
        // in the order z, x, y.
        // In the step comments below, products and squares are the results of
        // sm2P256MulInternal / sm2P256SqrInternal.
  2289  //func p256PointDoubleAsm(res, in []uint64)
  2290  TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
  2291  	// Move input to stack in order to free registers
  2292  	MOVQ res+0(FP), AX
  2293  	MOVQ in+24(FP), BX
  2294  
  2295  	MOVOU (16*0)(BX), X0
  2296  	MOVOU (16*1)(BX), X1
  2297  	MOVOU (16*2)(BX), X2
  2298  	MOVOU (16*3)(BX), X3
  2299  	MOVOU (16*4)(BX), X4
  2300  	MOVOU (16*5)(BX), X5
  2301  
  2302  	MOVOU X0, x(16*0)
  2303  	MOVOU X1, x(16*1)
  2304  	MOVOU X2, y(16*0)
  2305  	MOVOU X3, y(16*1)
  2306  	MOVOU X4, z(16*0)
  2307  	MOVOU X5, z(16*1)
  2308  	// Store pointer to result
  2309  	MOVQ AX, rptr
  2310  	// Begin point double
        	// zsqr = zˆ2
  2311  	LDacc (z)
  2312  	CALL sm2P256SqrInternal(SB)
  2313  	ST (zsqr)
  2314  
        	// m = x + zsqr (p256AddInline leaves the sum in [t0..t3])
  2315  	LDt (x)
  2316  	p256AddInline
  2317  	STt (m)
  2318  
        	// z output = 2 * (z * y), written straight to res
  2319  	LDacc (z)
  2320  	LDt (y)
  2321  	CALL sm2P256MulInternal(SB)
  2322  	p256MulBy2Inline
  2323  	MOVQ rptr, AX
  2324  	// Store z
  2325  	MOVQ t0, (16*4 + 8*0)(AX)
  2326  	MOVQ t1, (16*4 + 8*1)(AX)
  2327  	MOVQ t2, (16*4 + 8*2)(AX)
  2328  	MOVQ t3, (16*4 + 8*3)(AX)
  2329  
        	// m = (x - zsqr) * (x + zsqr)
  2330  	LDacc (x)
  2331  	LDt (zsqr)
  2332  	CALL sm2P256SubInternal(SB)
  2333  	LDt (m)
  2334  	CALL sm2P256MulInternal(SB)
  2335  	ST (m)
  2336  	// Multiply by 3
        	// [t0..t3] = 2 * m, then m + 2 * m, stored back into the m slot.
  2337  	p256MulBy2Inline
  2338  	LDacc (m)
  2339  	p256AddInline
  2340  	STt (m)
  2341  	////////////////////////
        	// s = (2 * y)ˆ2; the second squaring leaves sˆ2 in [acc4..acc7].
  2342  	LDacc (y)
  2343  	p256MulBy2Inline
  2344  	t2acc
  2345  	CALL sm2P256SqrInternal(SB)
  2346  	ST (s)
  2347  	CALL sm2P256SqrInternal(SB)
  2348  	// Divide by 2
        	// Speculatively add the prime, keeping the carry out in mul0. If the
        	// original value is even (bit 0 of t0 clear), the original limbs are
        	// restored.
  2349  	XORQ mul0, mul0
  2350  	MOVQ acc4, t0
  2351  	MOVQ acc5, t1
  2352  	MOVQ acc6, t2
  2353  	MOVQ acc7, t3
  2354  
  2355  	ADDQ $-1, acc4
  2356  	ADCQ p256p<>+0x08(SB), acc5
  2357  	ADCQ $-1, acc6
  2358  	ADCQ p256p<>+0x018(SB), acc7
  2359  	ADCQ $0, mul0
  2360  	TESTQ $1, t0
  2361  
  2362  	CMOVQEQ t0, acc4
  2363  	CMOVQEQ t1, acc5
  2364  	CMOVQEQ t2, acc6
  2365  	CMOVQEQ t3, acc7
        	// mul0 is 0 or 1 here, so this keeps the carry only when bit 0 of t0
        	// is set, i.e. only when the sum with the prime was selected.
  2366  	ANDQ t0, mul0
  2367  
        	// Shift the 257-bit value mul0:[acc7..acc4] right by one bit. Each
        	// limb takes its new top bit from the next higher limb.
  2368  	SHRQ $1, acc5, acc4
  2369  	SHRQ $1, acc6, acc5
  2370  	SHRQ $1, acc7, acc6
  2371  	SHRQ $1, mul0, acc7
        	// The y slot is reused for sˆ2 / 2.
  2372  	ST (y)
  2373  	/////////////////////////
        	// s = x * s, tmp = 2 * s
  2374  	LDacc (x)
  2375  	LDt (s)
  2376  	CALL sm2P256MulInternal(SB)
  2377  	ST (s)
  2378  	p256MulBy2Inline
  2379  	STt (tmp)
  2380  
        	// x output = mˆ2 - tmp
  2381  	LDacc (m)
  2382  	CALL sm2P256SqrInternal(SB)
  2383  	LDt (tmp)
  2384  	CALL sm2P256SubInternal(SB)
  2385  
  2386  	MOVQ rptr, AX
  2387  	// Store x
  2388  	MOVQ acc4, (16*0 + 8*0)(AX)
  2389  	MOVQ acc5, (16*0 + 8*1)(AX)
  2390  	MOVQ acc6, (16*0 + 8*2)(AX)
  2391  	MOVQ acc7, (16*0 + 8*3)(AX)
  2392  
        	// y output = (s - x output) * m - (value saved in the y slot)
  2393  	acc2t
  2394  	LDacc (s)
  2395  	CALL sm2P256SubInternal(SB)
  2396  
  2397  	LDt (m)
  2398  	CALL sm2P256MulInternal(SB)
  2399  
  2400  	LDt (y)
  2401  	CALL sm2P256SubInternal(SB)
  2402  	MOVQ rptr, AX
  2403  	// Store y
  2404  	MOVQ acc4, (16*2 + 8*0)(AX)
  2405  	MOVQ acc5, (16*2 + 8*1)(AX)
  2406  	MOVQ acc6, (16*2 + 8*2)(AX)
  2407  	MOVQ acc7, (16*2 + 8*3)(AX)
  2408  	///////////////////////
        	// Clear the saved result pointer from the frame before returning.
  2409  	MOVQ $0, rptr
  2410  
  2411  	RET
  2412  /* ---------------------------------------*/
  2413