github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_macros_amd64.s (about)

        // Register aliases used by all arithmetic macros below (Go/Plan 9
        // amd64 assembly). res_ptr/x_ptr/y_ptr address the result and the
        // operands; acc0..acc5 hold 64-bit limbs of intermediate values;
        // t0 is scratch. Some macros also reuse y_ptr/x_ptr as extra limb
        // registers once the pointers are no longer needed.
     1  #define res_ptr DI
     2  #define x_ptr SI
     3  #define y_ptr CX
     4  
     5  #define acc0 R8
     6  #define acc1 R9
     7  #define acc2 R10
     8  #define acc3 R11
     9  #define acc4 R12
    10  #define acc5 R13
    11  #define t0 R14
    12  
        // Read-only constants, little-endian 64-bit limbs (GLOBL flag 8 = RODATA):
        //   p256p     - the SM2 prime p (limbs fffffffeffffffff ffffffffffffffff
        //               ffffffff00000000 ffffffffffffffff, high to low)
        //   p256ordK0 - per-limb multiplier for the Montgomery reduction modulo
        //               the group order (presumably -ord^-1 mod 2^64 — standard
        //               Montgomery k0; used with MULQ/MULXQ below)
        //   p256ord   - the SM2 group order n
        //   p256one   - not referenced in this file's macros; presumably 1 in the
        //               Montgomery domain (R mod p) — confirm against callers.
    13  DATA p256p<>+0x00(SB)/8, $0xffffffffffffffff
    14  DATA p256p<>+0x08(SB)/8, $0xffffffff00000000
    15  DATA p256p<>+0x10(SB)/8, $0xffffffffffffffff
    16  DATA p256p<>+0x18(SB)/8, $0xfffffffeffffffff
    17  DATA p256ordK0<>+0x00(SB)/8, $0x327f9e8872350975
    18  DATA p256ord<>+0x00(SB)/8, $0x53bbf40939d54123
    19  DATA p256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
    20  DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
    21  DATA p256ord<>+0x18(SB)/8, $0xfffffffeffffffff
    22  DATA p256one<>+0x00(SB)/8, $0x0000000000000001
    23  DATA p256one<>+0x08(SB)/8, $0x00000000ffffffff
    24  DATA p256one<>+0x10(SB)/8, $0x0000000000000000
    25  DATA p256one<>+0x18(SB)/8, $0x0000000100000000
    26  GLOBL p256p<>(SB), 8, $32
    27  GLOBL p256ordK0<>(SB), 8, $8
    28  GLOBL p256ord<>(SB), 8, $32
    29  GLOBL p256one<>(SB), 8, $32
    30  
        // p256SqrMontReduceInline: Montgomery reduction modulo the SM2 prime of
        // the 512-bit squaring result T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2,
        // acc1, acc0] (highest limb first). Because p has a special sparse form,
        // each of the four word-wise reduction steps is done with shifts and
        // add/sub carry chains instead of multiplications. After the four steps
        // the high 256 bits [x_ptr..acc4] are added in; the result is
        // [t0; acc3, acc2, acc1, acc0] with t0 the final carry (0 or 1), i.e.
        // possibly one conditional subtraction away from canonical form
        // (see p256PrimReduce). Clobbers AX and DX.
    31  #define p256SqrMontReduceInline \
    32  	\ // First reduction step, [p3, p2, p1, p0] = [1, -0x100000000, 0, (1 - 0x100000000), -1]
    33  	MOVQ acc0, AX     \
    34  	MOVQ acc0, DX     \
    35  	SHLQ $32, AX      \
    36  	SHRQ $32, DX      \
    37  	\// calculate the negative part: [1, -0x100000000, 0, -0x100000000] * acc0 + [0, acc3, acc2, acc1]
    38  	SUBQ AX, acc1     \ 
    39  	SBBQ DX, acc2     \
    40  	SBBQ AX, acc3     \
    41  	MOVQ acc0, AX     \
    42  	SBBQ DX, acc0     \
    43  	\ // calculate the positive part: [0, 0, 0, AX] + [acc0, acc3, acc2, acc1], 
    44  	\ // due to (-1) * acc0 + acc0 == 0, so last lowest limb 0 is dropped directly, no carry.
    45  	ADDQ AX, acc1     \
    46  	ADCQ $0, acc2     \
    47  	ADCQ $0, acc3     \
    48  	ADCQ $0, acc0     \
    49  	\ // Second reduction step
    50  	MOVQ acc1, AX     \
    51  	MOVQ acc1, DX     \
    52  	SHLQ $32, AX      \
    53  	SHRQ $32, DX      \
    54  	\
    55  	SUBQ AX, acc2     \
    56  	SBBQ DX, acc3     \
    57  	SBBQ AX, acc0     \
    58  	MOVQ acc1, AX     \
    59  	SBBQ DX, acc1     \
    60  	\
    61  	ADDQ AX, acc2     \
    62  	ADCQ $0, acc3     \
    63  	ADCQ $0, acc0     \
    64  	ADCQ $0, acc1     \
    65  	\ // Third reduction step
    66  	MOVQ acc2, AX     \
    67  	MOVQ acc2, DX     \
    68  	SHLQ $32, AX      \
    69  	SHRQ $32, DX      \
    70  	\
    71  	SUBQ AX, acc3     \
    72  	SBBQ DX, acc0     \
    73  	SBBQ AX, acc1     \
    74  	MOVQ acc2, AX     \
    75  	SBBQ DX, acc2     \
    76  	\
    77  	ADDQ AX, acc3     \
    78  	ADCQ $0, acc0     \
    79  	ADCQ $0, acc1     \
    80  	ADCQ $0, acc2     \
    81  	\ // Last reduction step
    82  	XORQ t0, t0       \
    83  	MOVQ acc3, AX     \
    84  	MOVQ acc3, DX     \
    85  	SHLQ $32, AX      \
    86  	SHRQ $32, DX      \
    87  	\
    88  	SUBQ AX, acc0     \
    89  	SBBQ DX, acc1     \
    90  	SBBQ AX, acc2     \
    91  	MOVQ acc3, AX     \
    92  	SBBQ DX, acc3     \
    93  	\
    94  	ADDQ AX, acc0     \
    95  	ADCQ $0, acc1     \
    96  	ADCQ $0, acc2     \
    97  	ADCQ $0, acc3     \
    98  	\ // Add bits [511:256] of the sqr result
    99  	ADCQ acc4, acc0   \
   100  	ADCQ acc5, acc1   \
   101  	ADCQ y_ptr, acc2  \
   102  	ADCQ x_ptr, acc3  \
   103  	ADCQ $0, t0
   104  
   105  /* ---------------------------------------*/
        // p256PrimReduce(a0..a3, a4, b0..b3, res): final conditional subtraction
        // modulo the SM2 prime. Subtracts p from the 5-limb value [a4; a3..a0]
        // (a4 is the carry limb); if the subtraction borrows, the saved copy in
        // b0..b3 is restored via CMOV (constant time, no branch). The canonical
        // 256-bit result is stored at (res). The $-1 immediates stand for the
        // all-ones limbs of p; 0x018 is just 0x18. Clobbers b0..b3 and flags.
   106  #define p256PrimReduce(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
   107  	MOVQ a0, b0                 \
   108  	MOVQ a1, b1                 \
   109  	MOVQ a2, b2                 \
   110  	MOVQ a3, b3                 \
   111  	\ // Subtract p256
   112  	SUBQ $-1, a0                \
   113  	SBBQ p256p<>+0x08(SB), a1   \
   114  	SBBQ $-1, a2                \
   115  	SBBQ p256p<>+0x018(SB), a3  \
   116  	SBBQ $0, a4                 \
   117  	\ // If the result of the subtraction is negative, restore the previous result
   118  	CMOVQCS b0, a0              \ // CMOVQCS: Move if below (CF == 1)
   119  	CMOVQCS b1, a1              \
   120  	CMOVQCS b2, a2              \
   121  	CMOVQCS b3, a3              \
   122  	\
   123  	MOVQ a0, (8*0)(res)         \
   124  	MOVQ a1, (8*1)(res)         \
   125  	MOVQ a2, (8*2)(res)         \
   126  	MOVQ a3, (8*3)(res)
   127  
   128  /* ---------------------------------------*/
        // p256OrdReduceInline(a0..a3, a4, b0..b3, res): final conditional
        // subtraction modulo the SM2 group order. Same shape as p256PrimReduce
        // but subtracts p256ord limb-by-limb from memory; restores the saved
        // copy b0..b3 on borrow (constant-time CMOV) and stores the canonical
        // result at (res). Clobbers b0..b3 and flags.
   129  #define p256OrdReduceInline(a0, a1, a2, a3, a4, b0, b1, b2, b3, res) \
   130  	\// Copy result [255:0]
   131  	MOVQ a0, b0                    \
   132  	MOVQ a1, b1                    \
   133  	MOVQ a2, b2                    \
   134  	MOVQ a3, b3                    \
   135  	\// Subtract p256ord
   136  	SUBQ p256ord<>+0x00(SB), a0    \
   137  	SBBQ p256ord<>+0x08(SB) ,a1    \
   138  	SBBQ p256ord<>+0x10(SB), a2    \
   139  	SBBQ p256ord<>+0x18(SB), a3    \
   140  	SBBQ $0, a4                    \
   141  	\ // If the result of the subtraction is negative, restore the previous result
   142  	CMOVQCS b0, a0                 \ // CMOVQCS: Move if below (CF == 1)
   143  	CMOVQCS b1, a1                 \
   144  	CMOVQCS b2, a2                 \
   145  	CMOVQCS b3, a3                 \
   146  	\
   147  	MOVQ a0, (8*0)(res)            \
   148  	MOVQ a1, (8*1)(res)            \
   149  	MOVQ a2, (8*2)(res)            \
   150  	MOVQ a3, (8*3)(res)
   151  
   152  /* ---------------------------------------*/
        // sm2P256SqrReductionInline: Montgomery reduction modulo the SM2 prime
        // of the 512-bit value [t3, t2, t1, t0, acc3, acc2, acc1, acc0], using
        // mul0/mul1 as scratch (register aliases defined elsewhere in the
        // package). After the four shift-based reduction steps the high half
        // [t3..t0] is added in, a conditional subtraction of p is performed,
        // and the fully reduced result is left in [acc7, acc6, acc5, acc4].
        // Clobbers mul0, mul1, t0..t3 and flags.
   153  #define sm2P256SqrReductionInline \
   154  	\ // First reduction step
   155  	MOVQ acc0, mul0             \
   156  	MOVQ acc0, mul1             \
   157  	SHLQ $32, mul0              \
   158  	SHRQ $32, mul1              \
   159  	\
   160  	SUBQ mul0, acc1             \
   161  	SBBQ mul1, acc2             \
   162  	SBBQ mul0, acc3             \
   163  	MOVQ acc0, mul0             \
   164  	SBBQ mul1, acc0             \
   165  	\
   166  	ADDQ mul0, acc1             \
   167  	ADCQ $0, acc2               \
   168  	ADCQ $0, acc3               \
   169  	ADCQ $0, acc0               \
   170  	\ // Second reduction step
   171  	MOVQ acc1, mul0             \
   172  	MOVQ acc1, mul1             \
   173  	SHLQ $32, mul0              \
   174  	SHRQ $32, mul1              \
   175  	\
   176  	SUBQ mul0, acc2             \
   177  	SBBQ mul1, acc3             \
   178  	SBBQ mul0, acc0             \
   179  	MOVQ acc1, mul0             \
   180  	SBBQ mul1, acc1             \
   181  	\
   182  	ADDQ mul0, acc2             \
   183  	ADCQ $0, acc3               \
   184  	ADCQ $0, acc0               \
   185  	ADCQ $0, acc1               \
   186  	\ // Third reduction step
   187  	MOVQ acc2, mul0             \
   188  	MOVQ acc2, mul1             \
   189  	SHLQ $32, mul0              \
   190  	SHRQ $32, mul1              \
   191  	\
   192  	SUBQ mul0, acc3             \
   193  	SBBQ mul1, acc0             \
   194  	SBBQ mul0, acc1             \
   195  	MOVQ acc2, mul0             \
   196  	SBBQ mul1, acc2             \
   197  	\
   198  	ADDQ mul0, acc3             \
   199  	ADCQ $0, acc0               \
   200  	ADCQ $0, acc1               \
   201  	ADCQ $0, acc2               \
   202  	\ // Last reduction step
   203  	MOVQ acc3, mul0             \
   204  	MOVQ acc3, mul1             \
   205  	SHLQ $32, mul0              \
   206  	SHRQ $32, mul1              \
   207  	\
   208  	SUBQ mul0, acc0             \
   209  	SBBQ mul1, acc1             \
   210  	SBBQ mul0, acc2             \
   211  	MOVQ acc3, mul0             \
   212  	SBBQ mul1, acc3             \
   213  	\
   214  	ADDQ mul0, acc0             \
   215  	ADCQ $0, acc1               \
   216  	ADCQ $0, acc2               \
   217  	ADCQ $0, acc3               \
   218  	MOVQ $0, mul0               \
   219  	\ // Add bits [511:256] of the result
   220  	ADCQ acc0, t0               \
   221  	ADCQ acc1, t1               \
   222  	ADCQ acc2, t2               \
   223  	ADCQ acc3, t3               \
   224  	ADCQ $0, mul0               \
   225  	\ // Copy result
   226  	MOVQ t0, acc4               \
   227  	MOVQ t1, acc5               \
   228  	MOVQ t2, acc6               \
   229  	MOVQ t3, acc7               \
   230  	\ // Subtract p256
   231  	SUBQ $-1, acc4              \
   232  	SBBQ p256p<>+0x08(SB), acc5 \
   233  	SBBQ $-1, acc6              \
   234  	SBBQ p256p<>+0x018(SB), acc7\
   235  	SBBQ $0, mul0               \
   236  	\ // If the result of the subtraction is negative, restore the previous result
   237  	CMOVQCS t0, acc4            \ // CMOVQCS: Move if below (CF == 1)
   238  	CMOVQCS t1, acc5            \
   239  	CMOVQCS t2, acc6            \
   240  	CMOVQCS t3, acc7
   241  
   242  /* ---------------------------------------*/
        // sm2P256MulReductionInline: the four word-wise reduction steps modulo
        // the SM2 prime for a multiplication's low half [acc3, acc2, acc1, acc0],
        // using mul0/mul1 as scratch (aliases defined elsewhere in the package).
        // Unlike sm2P256SqrReductionInline this macro does NOT add the high
        // 256 bits or do the final conditional subtraction — the caller is
        // expected to fold in bits [511:256] and reduce afterwards. Result is
        // left in [acc3, acc2, acc1, acc0]; clobbers mul0, mul1 and flags.
   243  #define sm2P256MulReductionInline \
   244  	\// First reduction step
   245  	MOVQ acc0, mul0              \
   246  	MOVQ acc0, mul1              \
   247  	SHLQ $32, mul0               \
   248  	SHRQ $32, mul1               \
   249  	\
   250  	SUBQ mul0, acc1              \
   251  	SBBQ mul1, acc2              \
   252  	SBBQ mul0, acc3              \
   253  	MOVQ acc0, mul0              \
   254  	SBBQ mul1, acc0              \
   255  	\
   256  	ADDQ mul0, acc1              \
   257  	ADCQ $0, acc2                \
   258  	ADCQ $0, acc3                \
   259  	ADCQ $0, acc0                \
   260  	\// Second reduction step
   261  	MOVQ acc1, mul0              \
   262  	MOVQ acc1, mul1              \
   263  	SHLQ $32, mul0               \
   264  	SHRQ $32, mul1               \
   265  	\
   266  	SUBQ mul0, acc2              \
   267  	SBBQ mul1, acc3              \
   268  	SBBQ mul0, acc0              \
   269  	MOVQ acc1, mul0              \
   270  	SBBQ mul1, acc1              \
   271  	\
   272  	ADDQ mul0, acc2              \
   273  	ADCQ $0, acc3                \
   274  	ADCQ $0, acc0                \
   275  	ADCQ $0, acc1                \
   276  	\// Third reduction step
   277  	MOVQ acc2, mul0              \
   278  	MOVQ acc2, mul1              \
   279  	SHLQ $32, mul0               \
   280  	SHRQ $32, mul1               \
   281  	\
   282  	SUBQ mul0, acc3              \
   283  	SBBQ mul1, acc0              \
   284  	SBBQ mul0, acc1              \
   285  	MOVQ acc2, mul0              \
   286  	SBBQ mul1, acc2              \
   287  	\
   288  	ADDQ mul0, acc3              \
   289  	ADCQ $0, acc0                \
   290  	ADCQ $0, acc1                \
   291  	ADCQ $0, acc2                \
   292  	\// Last reduction step
   293  	MOVQ acc3, mul0              \
   294  	MOVQ acc3, mul1              \
   295  	SHLQ $32, mul0               \
   296  	SHRQ $32, mul1               \
   297  	\
   298  	SUBQ mul0, acc0              \
   299  	SBBQ mul1, acc1              \
   300  	SBBQ mul0, acc2              \
   301  	MOVQ acc3, mul0              \
   302  	SBBQ mul1, acc3              \
   303  	\
   304  	ADDQ mul0, acc0              \
   305  	ADCQ $0, acc1                \
   306  	ADCQ $0, acc2                \
   307  	ADCQ $0, acc3
   308  
   309  /* ---------------------------------------*/
        // p256SqrRound(t1): one full Montgomery squaring round modulo the SM2
        // prime using legacy MULQ. Reads the 4-limb operand at (x_ptr), computes
        // the 512-bit square via the "off-diagonal products, double, add
        // diagonal" schoolbook method, Montgomery-reduces it
        // (p256SqrMontReduceInline) and stores the canonical result at
        // (res_ptr) via p256PrimReduce. Finally sets x_ptr = res_ptr so the
        // macro can be chained for repeated squarings. t1 is a caller-supplied
        // scratch register; clobbers AX, DX, acc0..acc5, t0, y_ptr, x_ptr.
   310  #define p256SqrRound(t1) \
   311  	\// y[1:] * y[0]
   312  	MOVQ (8*0)(x_ptr), t0;\
   313  	\
   314  	MOVQ (8*1)(x_ptr), AX;\
   315  	MULQ t0;\
   316  	MOVQ AX, acc1;\
   317  	MOVQ DX, acc2;\
   318  	\
   319  	MOVQ (8*2)(x_ptr), AX;\
   320  	MULQ t0;\
   321  	ADDQ AX, acc2;\
   322  	ADCQ $0, DX;\
   323  	MOVQ DX, acc3;\
   324  	\
   325  	MOVQ (8*3)(x_ptr), AX;\
   326  	MULQ t0;\
   327  	ADDQ AX, acc3;\
   328  	ADCQ $0, DX;\
   329  	MOVQ DX, acc4;\
   330  	\// y[2:] * y[1]
   331  	MOVQ (8*1)(x_ptr), t0;\
   332  	\
   333  	MOVQ (8*2)(x_ptr), AX;\
   334  	MULQ t0;\
   335  	ADDQ AX, acc3;\
   336  	ADCQ $0, DX;\
   337  	MOVQ DX, t1;\
   338  	\
   339  	MOVQ (8*3)(x_ptr), AX;\
   340  	MULQ t0;\
   341  	ADDQ t1, acc4;\
   342  	ADCQ $0, DX;\
   343  	ADDQ AX, acc4;\
   344  	ADCQ $0, DX;\
   345  	MOVQ DX, acc5;\
   346  	\// y[3] * y[2]
   347  	MOVQ (8*2)(x_ptr), t0;\
   348  	\
   349  	MOVQ (8*3)(x_ptr), AX;\
   350  	MULQ t0;\
   351  	ADDQ AX, acc5;\
   352  	ADCQ $0, DX;\
   353  	MOVQ DX, y_ptr;\
   354  	XORQ t1, t1;\
   355  	\// *2
   356  	ADDQ acc1, acc1;\
   357  	ADCQ acc2, acc2;\
   358  	ADCQ acc3, acc3;\
   359  	ADCQ acc4, acc4;\
   360  	ADCQ acc5, acc5;\
   361  	ADCQ y_ptr, y_ptr;\
   362  	ADCQ $0, t1;\
   363  	\// Missing products
   364  	MOVQ (8*0)(x_ptr), AX;\
   365  	MULQ AX;\
   366  	MOVQ AX, acc0;\
   367  	MOVQ DX, t0;\
   368  	\
   369  	MOVQ (8*1)(x_ptr), AX;\
   370  	MULQ AX;\
   371  	ADDQ t0, acc1;\
   372  	ADCQ AX, acc2;\
   373  	ADCQ $0, DX;\
   374  	MOVQ DX, t0;\
   375  	\
   376  	MOVQ (8*2)(x_ptr), AX;\
   377  	MULQ AX;\
   378  	ADDQ t0, acc3;\
   379  	ADCQ AX, acc4;\
   380  	ADCQ $0, DX;\
   381  	MOVQ DX, t0;\
   382  	\
   383  	MOVQ (8*3)(x_ptr), AX;\
   384  	MULQ AX;\
   385  	ADDQ t0, acc5;\
   386  	ADCQ AX, y_ptr;\
   387  	ADCQ DX, t1;\
   388  	MOVQ t1, x_ptr;\
   389  	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
   390  	p256SqrMontReduceInline;\
   391  	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
   392  	MOVQ res_ptr, x_ptr;
   393  
   394  /* ---------------------------------------*/
        // p256SqrRoundAdx(t1): same contract as p256SqrRound (square (x_ptr)
        // mod the SM2 prime, store at (res_ptr), then x_ptr = res_ptr), but
        // implemented with BMI2/ADX (MULXQ + the independent ADCXQ/ADOXQ carry
        // chains) for CPUs that support them. The XORQs clear both CF and OF
        // so the two carry chains start clean. t1 is caller-supplied scratch;
        // clobbers AX, DX, acc0..acc5, t0, y_ptr, x_ptr.
   395  #define p256SqrRoundAdx(t1) \
   396  	XORQ acc0, acc0;\
   397  	XORQ y_ptr, y_ptr;\
   398  	\// x[1:] * x[0]
   399  	MOVQ (8*0)(x_ptr), DX;\
   400  	MULXQ (8*1)(x_ptr), acc1, acc2;\
   401  	\
   402  	MULXQ (8*2)(x_ptr), AX, acc3;\
   403  	ADOXQ AX, acc2;\
   404  	\
   405  	MULXQ (8*3)(x_ptr), AX, acc4;\
   406  	ADOXQ AX, acc3;\
   407  	ADOXQ y_ptr, acc4;\
   408  	\
   409  	\// x[2:] * x[1]
   410  	MOVQ (8*1)(x_ptr), DX;\
   411  	MULXQ (8*2)(x_ptr), AX, t1;\
   412  	ADOXQ AX, acc3;\
   413  	\
   414  	MULXQ (8*3)(x_ptr), AX, acc5;\
   415  	ADCXQ t1, AX;\
   416  	ADOXQ AX, acc4;\
   417  	ADCXQ y_ptr, acc5;\
   418  	\
   419  	\// y[x] * x[2]
   420  	MOVQ (8*2)(x_ptr), DX;\
   421  	MULXQ (8*3)(x_ptr), AX, y_ptr ;\
   422  	ADOXQ AX, acc5;\
   423  	ADOXQ acc0, y_ptr;\
   424  	\
   425  	XORQ t1, t1;\
   426  	\
   427  	\// *2
   428  	ADOXQ acc1, acc1;\
   429  	ADOXQ acc2, acc2;\
   430  	ADOXQ acc3, acc3;\
   431  	ADOXQ acc4, acc4;\
   432  	ADOXQ acc5, acc5;\
   433  	ADOXQ y_ptr, y_ptr;\
   434  	ADOXQ acc0, t1;\
   435  	\
   436  	\// Missing products
   437  	MOVQ (8*0)(x_ptr), DX;\
   438  	MULXQ DX, acc0, t0;\
   439  	ADCXQ t0, acc1;\
   440  	\
   441  	MOVQ (8*1)(x_ptr), DX;\
   442  	MULXQ DX, AX, t0;\
   443  	ADCXQ AX, acc2;\
   444  	ADCXQ t0, acc3;\
   445  	\
   446  	MOVQ (8*2)(x_ptr), DX;\
   447  	MULXQ DX, AX, t0 ;\
   448  	ADCXQ AX, acc4;\
   449  	ADCXQ t0, acc5;\
   450  	\
   451  	MOVQ (8*3)(x_ptr), DX;\
   452  	MULXQ DX, AX, x_ptr;\
   453  	ADCXQ AX, y_ptr;\
   454  	ADCXQ t1, x_ptr;\
   455  	\
   456  	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
   457  	p256SqrMontReduceInline;\
   458  	p256PrimReduce(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
   459  	MOVQ res_ptr, x_ptr;
   460  
   461  /* ---------------------------------------*/
        // p256OrdSqrRound(t1): one Montgomery squaring round modulo the SM2
        // group order using legacy MULQ. First computes the 512-bit square of
        // the operand at (x_ptr) exactly as p256SqrRound does; then performs
        // four generic Montgomery reduction steps, each deriving the per-limb
        // multiplier with p256ordK0 and using the order's sparse upper limbs
        // (ord2 = 2^64-1, ord3 = 0xfffffffeffffffff) as shift-based sub chains,
        // with only ord0/ord1 needing real multiplications. The high half is
        // then added in and the result conditionally reduced and stored at
        // (res_ptr) via p256OrdReduceInline; finally x_ptr = res_ptr so the
        // macro chains. t1 is caller-supplied scratch; clobbers AX, DX,
        // acc0..acc5, t0, y_ptr, x_ptr.
   462  #define p256OrdSqrRound(t1) \
   463  	\// y[1:] * y[0]
   464  	MOVQ (8*0)(x_ptr), t0;\
   465  	\
   466  	MOVQ (8*1)(x_ptr), AX;\
   467  	MULQ t0;\
   468  	MOVQ AX, acc1;\
   469  	MOVQ DX, acc2;\
   470  	\
   471  	MOVQ (8*2)(x_ptr), AX;\
   472  	MULQ t0;\
   473  	ADDQ AX, acc2;\
   474  	ADCQ $0, DX;\
   475  	MOVQ DX, acc3;\
   476  	\
   477  	MOVQ (8*3)(x_ptr), AX;\
   478  	MULQ t0;\
   479  	ADDQ AX, acc3;\
   480  	ADCQ $0, DX;\
   481  	MOVQ DX, acc4;\
   482  	\// y[2:] * y[1]
   483  	MOVQ (8*1)(x_ptr), t0;\
   484  	\
   485  	MOVQ (8*2)(x_ptr), AX;\
   486  	MULQ t0;\
   487  	ADDQ AX, acc3;\
   488  	ADCQ $0, DX;\
   489  	MOVQ DX, t1;\
   490  	\
   491  	MOVQ (8*3)(x_ptr), AX;\
   492  	MULQ t0;\
   493  	ADDQ t1, acc4;\
   494  	ADCQ $0, DX;\
   495  	ADDQ AX, acc4;\
   496  	ADCQ $0, DX;\
   497  	MOVQ DX, acc5;\
   498  	\// y[3] * y[2]
   499  	MOVQ (8*2)(x_ptr), t0;\
   500  	\
   501  	MOVQ (8*3)(x_ptr), AX;\
   502  	MULQ t0;\
   503  	ADDQ AX, acc5;\
   504  	ADCQ $0, DX;\
   505  	MOVQ DX, y_ptr;\
   506  	XORQ t1, t1;\
   507  	\// *2
   508  	ADDQ acc1, acc1;\
   509  	ADCQ acc2, acc2;\
   510  	ADCQ acc3, acc3;\
   511  	ADCQ acc4, acc4;\
   512  	ADCQ acc5, acc5;\
   513  	ADCQ y_ptr, y_ptr;\
   514  	ADCQ $0, t1;\
   515  	\// Missing products
   516  	MOVQ (8*0)(x_ptr), AX;\
   517  	MULQ AX;\
   518  	MOVQ AX, acc0;\
   519  	MOVQ DX, t0;\
   520  	\
   521  	MOVQ (8*1)(x_ptr), AX;\
   522  	MULQ AX;\
   523  	ADDQ t0, acc1;\
   524  	ADCQ AX, acc2;\
   525  	ADCQ $0, DX;\
   526  	MOVQ DX, t0;\
   527  	\
   528  	MOVQ (8*2)(x_ptr), AX;\
   529  	MULQ AX;\
   530  	ADDQ t0, acc3;\
   531  	ADCQ AX, acc4;\
   532  	ADCQ $0, DX;\
   533  	MOVQ DX, t0;\
   534  	\
   535  	MOVQ (8*3)(x_ptr), AX;\
   536  	MULQ AX;\
   537  	ADDQ t0, acc5;\
   538  	ADCQ AX, y_ptr;\
   539  	ADCQ DX, t1;\
   540  	MOVQ t1, x_ptr;\
   541  	\
   542  	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
   543  	MOVQ acc0, AX;\
   544  	MULQ p256ordK0<>(SB);\
   545  	MOVQ AX, t0;\ // Y = t0 = (k0 * acc0) mod 2^64
   546  	\
   547  	MOVQ p256ord<>+0x00(SB), AX;\
   548  	MULQ t0;\
   549  	ADDQ AX, acc0;\ // (carry1, acc0) = acc0 + L(t0 * ord0)
   550  	ADCQ $0, DX;\ // DX = carry1 + H(t0 * ord0)
   551  	MOVQ DX, t1;\ // t1 = carry1 + H(t0 * ord0)
   552  	MOVQ t0, acc0;\ // acc0 =  t0
   553  	\
   554  	\// calculate the negative part: [acc0, acc3, acc2, acc1] - [0, 0x100000000, 1, 0] * t0
   555  	MOVQ t0, AX;\
   556  	MOVQ t0, DX;\
   557  	SHLQ $32, AX;\
   558  	SHRQ $32, DX;\
   559  	\
   560  	SUBQ t0, acc2;\
   561  	SBBQ AX, acc3;\
   562  	SBBQ DX, acc0;\
   563  	\
   564  	MOVQ p256ord<>+0x08(SB), AX;\
   565  	MULQ t0;\
   566  	ADDQ t1, acc1;\ // (carry2, acc1) = acc1 + t1
   567  	ADCQ $0, DX;\ // DX = carry2 + H(t0*ord1)
   568  	\
   569  	ADDQ AX, acc1;\ // (carry3, acc1) = acc1 + t1 + L(t0*ord1)
   570  	ADCQ DX, acc2;\
   571  	ADCQ $0, acc3;\
   572  	ADCQ $0, acc0;\
   573  	\
   574  	\// Second reduction step
   575  	MOVQ acc1, AX;\
   576  	MULQ p256ordK0<>(SB);\
   577  	MOVQ AX, t0;\
   578  	\
   579  	MOVQ p256ord<>+0x00(SB), AX;\
   580  	MULQ t0;\
   581  	ADDQ AX, acc1;\
   582  	ADCQ $0, DX;\
   583  	MOVQ DX, t1;\
   584  	MOVQ t0, acc1;\
   585  	\
   586  	MOVQ t0, AX;\
   587  	MOVQ t0, DX;\
   588  	SHLQ $32, AX;\
   589  	SHRQ $32, DX;\
   590  	\
   591  	SUBQ t0, acc3;\
   592  	SBBQ AX, acc0;\
   593  	SBBQ DX, acc1;\
   594  	\
   595  	MOVQ p256ord<>+0x08(SB), AX;\
   596  	MULQ t0;\
   597  	ADDQ t1, acc2;\
   598  	ADCQ $0, DX;\
   599  	\
   600  	ADDQ AX, acc2;\
   601  	ADCQ DX, acc3;\
   602  	ADCQ $0, acc0;\
   603  	ADCQ $0, acc1;\
   604  	\
   605  	\// Third reduction step
   606  	MOVQ acc2, AX;\
   607  	MULQ p256ordK0<>(SB);\
   608  	MOVQ AX, t0;\
   609  	\
   610  	MOVQ p256ord<>+0x00(SB), AX;\
   611  	MULQ t0;\
   612  	ADDQ AX, acc2;\
   613  	ADCQ $0, DX;\
   614  	MOVQ DX, t1;\
   615  	MOVQ t0, acc2;\
   616  	\
   617  	MOVQ t0, AX;\
   618  	MOVQ t0, DX;\
   619  	SHLQ $32, AX;\
   620  	SHRQ $32, DX;\
   621  	\
   622  	SUBQ t0, acc0;\
   623  	SBBQ AX, acc1;\
   624  	SBBQ DX, acc2;\
   625  	\
   626  	MOVQ p256ord<>+0x08(SB), AX;\
   627  	MULQ t0;\
   628  	ADDQ t1, acc3;\
   629  	ADCQ $0, DX;\
   630  	\
   631  	ADDQ AX, acc3;\
   632  	ADCQ DX, acc0;\
   633  	ADCQ $0, acc1;\
   634  	ADCQ $0, acc2;\
   635  	\
   636  	\// Last reduction step
   637  	MOVQ acc3, AX;\
   638  	MULQ p256ordK0<>(SB);\
   639  	MOVQ AX, t0;\
   640  	\
   641  	MOVQ p256ord<>+0x00(SB), AX;\
   642  	MULQ t0;\
   643  	ADDQ AX, acc3;\
   644  	ADCQ $0, DX;\
   645  	MOVQ DX, t1;\
   646  	MOVQ t0, acc3;\
   647  	\
   648  	MOVQ t0, AX;\
   649  	MOVQ t0, DX;\
   650  	SHLQ $32, AX;\
   651  	SHRQ $32, DX;\
   652  	\
   653  	SUBQ t0, acc1;\
   654  	SBBQ AX, acc2;\
   655  	SBBQ DX, acc3;\
   656  	\
   657  	MOVQ p256ord<>+0x08(SB), AX;\
   658  	MULQ t0;\
   659  	ADDQ t1, acc0;\
   660  	ADCQ $0, DX;\
   661  	\
   662  	ADDQ AX, acc0;\
   663  	ADCQ DX, acc1;\
   664  	ADCQ $0, acc2;\
   665  	ADCQ $0, acc3;\
   666  	XORQ t0, t0;\
   667  	\// Add bits [511:256] of the sqr result
   668  	ADCQ acc4, acc0;\
   669  	ADCQ acc5, acc1;\
   670  	ADCQ y_ptr, acc2;\
   671  	ADCQ x_ptr, acc3;\
   672  	ADCQ $0, t0;\
   673  	\
   674  	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
   675  	MOVQ res_ptr, x_ptr;
   676  
   677  /* ---------------------------------------*/
        // p256OrdSqrRoundAdx(t1): same contract as p256OrdSqrRound (Montgomery
        // square of (x_ptr) modulo the SM2 group order, stored at (res_ptr),
        // then x_ptr = res_ptr), implemented with BMI2/ADX. Each reduction step
        // derives the multiplier via MULXQ p256ordK0 and folds t*ord into the
        // accumulators with the dual ADCXQ/ADOXQ carry chains; all four order
        // limbs are multiplied directly here instead of using the shift trick.
        // t1 is caller-supplied scratch; clobbers AX, DX, acc0..acc5, t0,
        // y_ptr, x_ptr.
   678  #define p256OrdSqrRoundAdx(t1) \
   679  	XORQ acc0, acc0;\
   680  	XORQ y_ptr, y_ptr;\
   681  	\// y[1:] * y[0]
   682  	MOVQ (8*0)(x_ptr), DX;\
   683  	MULXQ (8*1)(x_ptr), acc1, acc2 ;\
   684  	\
   685  	MULXQ (8*2)(x_ptr), AX, acc3;\
   686  	ADOXQ AX, acc2;\
   687  	\
   688  	MULXQ (8*3)(x_ptr), AX, acc4;\
   689  	ADOXQ AX, acc3;\
   690  	ADOXQ y_ptr, acc4;\
   691  	\
   692  	\// y[2:] * y[1]
   693  	MOVQ (8*1)(x_ptr), DX;\
   694  	MULXQ (8*2)(x_ptr), AX, t1;\
   695  	ADOXQ AX, acc3;\
   696  	\
   697  	MULXQ (8*3)(x_ptr), AX, acc5;\
   698  	ADCXQ t1, AX;\
   699  	ADOXQ AX, acc4;\
   700  	ADCXQ y_ptr, acc5;\
   701  	\
   702  	\// y[3] * y[2]
   703  	MOVQ (8*2)(x_ptr), DX;\
   704  	MULXQ (8*3)(x_ptr), AX, y_ptr;\ 
   705  	ADOXQ AX, acc5;\
   706  	ADOXQ acc0, y_ptr;\
   707  	\
   708  	XORQ t1, t1;\
   709  	\// *2
   710  	ADOXQ acc1, acc1;\
   711  	ADOXQ acc2, acc2;\
   712  	ADOXQ acc3, acc3;\
   713  	ADOXQ acc4, acc4;\
   714  	ADOXQ acc5, acc5;\
   715  	ADOXQ y_ptr, y_ptr;\
   716  	ADOXQ acc0, t1;\
   717  	\
   718  	\// Missing products
   719  	MOVQ (8*0)(x_ptr), DX;\
   720  	MULXQ DX, acc0, t0;\
   721  	ADCXQ t0, acc1;\
   722  	\
   723  	MOVQ (8*1)(x_ptr), DX;\
   724  	MULXQ DX, AX, t0;\
   725  	ADCXQ AX, acc2;\
   726  	ADCXQ t0, acc3;\
   727  	\
   728  	MOVQ (8*2)(x_ptr), DX;\
   729  	MULXQ DX, AX, t0 ;\
   730  	ADCXQ AX, acc4;\
   731  	ADCXQ t0, acc5;\
   732  	\
   733  	MOVQ (8*3)(x_ptr), DX;\
   734  	MULXQ DX, AX, x_ptr;\
   735  	ADCXQ AX, y_ptr;\
   736  	ADCXQ t1, x_ptr;\
   737  	\
   738  	\// T = [x_ptr, y_ptr, acc5, acc4, acc3, acc2, acc1, acc0]
   739  	\// First reduction step
   740  	MOVQ acc0, DX;\
   741  	MULXQ p256ordK0<>(SB), DX, AX;\
   742  	\
   743  	MULXQ p256ord<>+0x00(SB), AX, t0;\
   744  	ADOXQ AX, acc0;\// (carry1, acc0) = acc0 + t0 * ord0
   745  	\
   746  	MULXQ p256ord<>+0x08(SB), AX, t1;\
   747  	ADCXQ t0, AX;\
   748  	ADOXQ AX, acc1;\
   749  	\
   750  	MULXQ p256ord<>+0x10(SB), AX, t0;\
   751  	ADCXQ t1, AX;\
   752  	ADOXQ AX, acc2;\
   753  	\
   754  	MULXQ p256ord<>+0x18(SB), AX, acc0;\
   755  	ADCXQ t0, AX;\
   756  	ADOXQ AX, acc3;\
   757  	MOVQ $0, t0;\
   758  	ADCXQ t0, acc0;\
   759  	ADOXQ t0, acc0;\
   760  	\
   761  	\// Second reduction step
   762  	MOVQ acc1, DX;\
   763  	MULXQ p256ordK0<>(SB), DX, AX;\
   764  	\
   765  	MULXQ p256ord<>+0x00(SB), AX, t0;\
   766  	ADOXQ AX, acc1;\
   767  	\
   768  	MULXQ p256ord<>+0x08(SB), AX, t1;\
   769  	ADCXQ t0, AX;\
   770  	ADOXQ AX, acc2;\
   771  	\
   772  	MULXQ p256ord<>+0x10(SB), AX, t0;\
   773  	ADCXQ t1, AX;\
   774  	ADOXQ AX, acc3;\
   775  	\
   776  	MULXQ p256ord<>+0x18(SB), AX, acc1;\
   777  	ADCXQ t0, AX;\
   778  	ADOXQ AX, acc0;\
   779  	MOVQ $0, t0;\
   780  	ADCXQ t0, acc1;\
   781  	ADOXQ t0, acc1;\
   782  	\
   783  	\// Third reduction step
   784  	MOVQ acc2, DX;\
   785  	MULXQ p256ordK0<>(SB), DX, AX;\
   786  	\
   787  	MULXQ p256ord<>+0x00(SB), AX, t0;\
   788  	ADOXQ AX, acc2;\
   789  	\
   790  	MULXQ p256ord<>+0x08(SB), AX, t1;\
   791  	ADCXQ t0, AX;\
   792  	ADOXQ AX, acc3;\
   793  	\
   794  	MULXQ p256ord<>+0x10(SB), AX, t0;\
   795  	ADCXQ t1, AX;\
   796  	ADOXQ AX, acc0;\
   797  	\
   798  	MULXQ p256ord<>+0x18(SB), AX, acc2;\
   799  	ADCXQ t0, AX;\
   800  	ADOXQ AX, acc1;\
   801  	MOVQ $0, t0;\
   802  	ADCXQ t0, acc2;\
   803  	ADOXQ t0, acc2;\
   804  	\
   805  	\// Last reduction step
   806  	MOVQ acc3, DX;\
   807  	MULXQ p256ordK0<>(SB), DX, AX;\
   808  	\
   809  	MULXQ p256ord<>+0x00(SB), AX, t0;\
   810  	ADOXQ AX, acc3;\
   811  	\
   812  	MULXQ p256ord<>+0x08(SB), AX, t1;\
   813  	ADCXQ t0, AX;\
   814  	ADOXQ AX, acc0;\
   815  	\
   816  	MULXQ p256ord<>+0x10(SB), AX, t0;\
   817  	ADCXQ t1, AX;\
   818  	ADOXQ AX, acc1;\
   819  	\
   820  	MULXQ p256ord<>+0x18(SB), AX, acc3;\
   821  	ADCXQ t0, AX;\
   822  	ADOXQ AX, acc2;\
   823  	MOVQ $0, t0;\
   824  	ADCXQ t0, acc3;\
   825  	ADOXQ t0, acc3;\
   826  	\
   827  	XORQ t1, t1;\
   828  	\// Add bits [511:256] of the sqr result
   829  	ADCXQ acc4, acc0;\
   830  	ADCXQ acc5, acc1;\
   831  	ADCXQ y_ptr, acc2;\
   832  	ADCXQ x_ptr, acc3;\
   833  	ADCXQ t1, t0;\
   834  	\
   835  	p256OrdReduceInline(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1, res_ptr);\
   836  	MOVQ res_ptr, x_ptr;
   837  
   838  // Below macros are used for point operations
   839  /* ---------------------------------------*/
   840  // [t3, t2, t1, t0] = 2[acc7, acc6, acc5, acc4]
        // p256MulBy2Inline: [t3, t2, t1, t0] = 2*[acc7, acc6, acc5, acc4] mod p.
        // Doubles with an add chain (carry-out into mul0), then conditionally
        // subtracts p via CMOV (constant time). Inputs acc4..acc7 are preserved;
        // clobbers mul0 and flags. mul0/t0..t3/acc6/acc7 are register aliases
        // defined elsewhere in the package.
   841  #define p256MulBy2Inline\
   842  	XORQ mul0, mul0;\
   843  	ADDQ acc4, acc4;\
   844  	ADCQ acc5, acc5;\
   845  	ADCQ acc6, acc6;\
   846  	ADCQ acc7, acc7;\
   847  	ADCQ $0, mul0;\
   848  	MOVQ acc4, t0;\
   849  	MOVQ acc5, t1;\
   850  	MOVQ acc6, t2;\
   851  	MOVQ acc7, t3;\
   852  	SUBQ $-1, t0;\
   853  	SBBQ p256p<>+0x08(SB), t1;\
   854  	SBBQ $-1, t2;\
   855  	SBBQ p256p<>+0x018(SB), t3;\
   856  	SBBQ $0, mul0;\
   857  	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
   858  	CMOVQCS acc5, t1;\
   859  	CMOVQCS acc6, t2;\
   860  	CMOVQCS acc7, t3;
   861  
   862  /* ---------------------------------------*/
   863  // [acc7, acc6, acc5, acc4] = 2[acc7, acc6, acc5, acc4]
        // p256MulBy2Inline2: [acc7, acc6, acc5, acc4] = 2*[acc7, acc6, acc5,
        // acc4] mod p. Same computation as p256MulBy2Inline but the reduced
        // result is written back in place (t0..t3 hold the pre-subtraction
        // copy used by the CMOV restore). Clobbers mul0, t0..t3 and flags.
   864  #define p256MulBy2Inline2\
   865  	XORQ mul0, mul0;\
   866  	ADDQ acc4, acc4;\
   867  	ADCQ acc5, acc5;\
   868  	ADCQ acc6, acc6;\
   869  	ADCQ acc7, acc7;\
   870  	ADCQ $0, mul0;\
   871  	MOVQ acc4, t0;\
   872  	MOVQ acc5, t1;\
   873  	MOVQ acc6, t2;\
   874  	MOVQ acc7, t3;\
   875  	SUBQ $-1, acc4;\
   876  	SBBQ p256p<>+0x08(SB), acc5;\
   877  	SBBQ $-1, acc6;\
   878  	SBBQ p256p<>+0x018(SB), acc7;\
   879  	SBBQ $0, mul0;\
   880  	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
   881  	CMOVQCS t1, acc5;\
   882  	CMOVQCS t2, acc6;\
   883  	CMOVQCS t3, acc7;
   884  
   885  /* ---------------------------------------*/
   886  // [t3, t2, t1, t0] = 3[acc7, acc6, acc5, acc4]
        // p256TripleInline: [t3, t2, t1, t0] = 3*[acc7, acc6, acc5, acc4] mod p.
        // Saves the operand in acc0..acc3, doubles in place with a conditional
        // subtraction of p (round 1), then adds the saved operand back with a
        // second conditional subtraction (round 2). Each round keeps the value
        // canonical, so a single subtraction per round suffices. Clobbers
        // mul0, acc0..acc7 and flags; result is in t0..t3.
   887  #define p256TripleInline\
   888  	XORQ mul0, mul0;\
   889  	MOVQ acc4, acc0;\
   890  	MOVQ acc5, acc1;\
   891  	MOVQ acc6, acc2;\
   892  	MOVQ acc7, acc3;\
   893  	ADDQ acc4, acc4;\
   894  	ADCQ acc5, acc5;\
   895  	ADCQ acc6, acc6;\
   896  	ADCQ acc7, acc7;\
   897  	ADCQ $0, mul0;\
   898  	MOVQ acc4, t0;\
   899  	MOVQ acc5, t1;\
   900  	MOVQ acc6, t2;\
   901  	MOVQ acc7, t3;\
   902  	SUBQ $-1, acc4;\
   903  	SBBQ p256p<>+0x08(SB), acc5;\
   904  	SBBQ $-1, acc6;\
   905  	SBBQ p256p<>+0x018(SB), acc7;\
   906  	SBBQ $0, mul0;\
   907  	CMOVQCS t0, acc4;\ // CMOVQCS: Move if below (CF == 1)
   908  	CMOVQCS t1, acc5;\
   909  	CMOVQCS t2, acc6;\
   910  	CMOVQCS t3, acc7;\
   911  	XORQ mul0, mul0;\
   912  	ADDQ acc0, acc4;\
   913  	ADCQ acc1, acc5;\
   914  	ADCQ acc2, acc6;\
   915  	ADCQ acc3, acc7;\
   916  	ADCQ $0, mul0;\
   917  	MOVQ acc4, t0;\
   918  	MOVQ acc5, t1;\
   919  	MOVQ acc6, t2;\
   920  	MOVQ acc7, t3;\
   921  	SUBQ $-1, t0;\
   922  	SBBQ p256p<>+0x08(SB), t1;\
   923  	SBBQ $-1, t2;\
   924  	SBBQ p256p<>+0x018(SB), t3;\
   925  	SBBQ $0, mul0;\
   926  	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
   927  	CMOVQCS acc5, t1;\
   928  	CMOVQCS acc6, t2;\
   929  	CMOVQCS acc7, t3;	
   930  
   931  /* ---------------------------------------*/
   932  // [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] + [t3, t2, t1, t0]
        // p256AddInline: [t3, t2, t1, t0] = ([acc7, acc6, acc5, acc4] +
        // [t3, t2, t1, t0]) mod p. Add chain with carry-out into mul0, then
        // one conditional subtraction of p via CMOV (constant time). The
        // intermediate sum is left in acc4..acc7; clobbers mul0 and flags.
   933  #define p256AddInline \
   934  	XORQ mul0, mul0;\
   935  	ADDQ t0, acc4;\
   936  	ADCQ t1, acc5;\
   937  	ADCQ t2, acc6;\
   938  	ADCQ t3, acc7;\
   939  	ADCQ $0, mul0;\
   940  	MOVQ acc4, t0;\
   941  	MOVQ acc5, t1;\
   942  	MOVQ acc6, t2;\
   943  	MOVQ acc7, t3;\
   944  	SUBQ $-1, t0;\
   945  	SBBQ p256p<>+0x08(SB), t1;\
   946  	SBBQ $-1, t2;\
   947  	SBBQ p256p<>+0x018(SB), t3;\
   948  	SBBQ $0, mul0;\
   949  	CMOVQCS acc4, t0;\ // CMOVQCS: Move if below (CF == 1)
   950  	CMOVQCS acc5, t1;\
   951  	CMOVQCS acc6, t2;\
   952  	CMOVQCS acc7, t3;
   953  
   954  /* ---------------------------------------*/
   955  // [t3, t2, t1, t0] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
        // p256SubInline: [t3, t2, t1, t0] = ([acc7, acc6, acc5, acc4] -
        // [t3, t2, t1, t0]) mod p. Subtracts, recording the borrow in mul0,
        // then speculatively adds p back into t0..t3. ANDQ $1, mul0 sets
        // ZF when there was no borrow, so CMOVQEQ restores the raw difference
        // (in acc4..acc7) in that case — constant time, no branch. The raw
        // difference stays in acc4..acc7; clobbers mul0 and flags.
   956  #define p256SubInline \
   957  	XORQ mul0, mul0;\
   958  	SUBQ t0, acc4;\
   959  	SBBQ t1, acc5;\
   960  	SBBQ t2, acc6;\
   961  	SBBQ t3, acc7;\
   962  	SBBQ $0, mul0;\
   963  	MOVQ acc4, t0;\
   964  	MOVQ acc5, t1;\
   965  	MOVQ acc6, t2;\
   966  	MOVQ acc7, t3;\
   967  	ADDQ $-1, t0;\
   968  	ADCQ p256p<>+0x08(SB), t1;\
   969  	ADCQ $-1, t2;\
   970  	ADCQ p256p<>+0x018(SB), t3;\
   971  	ANDQ $1, mul0;\
   972  	CMOVQEQ acc4, t0;\  // CMOVQEQ: Move if equal (ZF == 1)
   973  	CMOVQEQ acc5, t1;\
   974  	CMOVQEQ acc6, t2;\
   975  	CMOVQEQ acc7, t3;\
   976  
   977  /* ---------------------------------------*/
   978  // [acc7, acc6, acc5, acc4] = [acc7, acc6, acc5, acc4] - [t3, t2, t1, t0]
        // p256SubInline2: [acc7, acc6, acc5, acc4] = ([acc7, acc6, acc5, acc4]
        // - [t3, t2, t1, t0]) mod p. Same scheme as p256SubInline but writes
        // the reduced result back into acc4..acc7, using acc0..acc3 to hold the
        // raw (un-corrected) difference for the CMOV restore. t0..t3 are
        // preserved; clobbers mul0, acc0..acc3 and flags.
   979  #define p256SubInline2 \
   980  	XORQ mul0, mul0;\
   981  	SUBQ t0, acc4;\
   982  	SBBQ t1, acc5;\
   983  	SBBQ t2, acc6;\
   984  	SBBQ t3, acc7;\
   985  	SBBQ $0, mul0;\
   986  	MOVQ acc4, acc0;\
   987  	MOVQ acc5, acc1;\
   988  	MOVQ acc6, acc2;\
   989  	MOVQ acc7, acc3;\
   990  	ADDQ $-1, acc4;\
   991  	ADCQ p256p<>+0x08(SB), acc5;\
   992  	ADCQ $-1, acc6;\
   993  	ADCQ p256p<>+0x018(SB), acc7;\
   994  	ANDQ $1, mul0;\
   995  	CMOVQEQ acc0, acc4;\  // CMOVQEQ: Move if equal (ZF == 1)
   996  	CMOVQEQ acc1, acc5;\
   997  	CMOVQEQ acc2, acc6;\
   998  	CMOVQEQ acc3, acc7;\
   999  
        // p256SqrInternalInline: squares the 256-bit value [acc7, acc6, acc5,
        // acc4] (held entirely in registers, no memory operand) with legacy
        // MULQ — off-diagonal products into [t2..t0, acc3..acc1], doubled, then
        // the diagonal squares added in — and hands the 512-bit result
        // [t3..t0, acc3..acc0] to sm2P256SqrReductionInline, which leaves the
        // reduced value in acc4..acc7. mul0/mul1/t1..t3/acc6/acc7 are register
        // aliases defined elsewhere in the package; clobbers mul0, mul1,
        // acc0..acc3, t0..t3 and flags.
  1000  #define p256SqrInternalInline \
  1001  	MOVQ acc4, mul0;\
  1002  	MULQ acc5;\
  1003  	MOVQ mul0, acc1;\
  1004  	MOVQ mul1, acc2;\
  1005  	\
  1006  	MOVQ acc4, mul0;\
  1007  	MULQ acc6;\
  1008  	ADDQ mul0, acc2;\
  1009  	ADCQ $0, mul1;\
  1010  	MOVQ mul1, acc3;\
  1011  	\
  1012  	MOVQ acc4, mul0;\
  1013  	MULQ acc7;\
  1014  	ADDQ mul0, acc3;\
  1015  	ADCQ $0, mul1;\
  1016  	MOVQ mul1, t0;\
  1017  	\
  1018  	MOVQ acc5, mul0;\
  1019  	MULQ acc6;\
  1020  	ADDQ mul0, acc3;\
  1021  	ADCQ $0, mul1;\
  1022  	MOVQ mul1, acc0;\
  1023  	\
  1024  	MOVQ acc5, mul0;\
  1025  	MULQ acc7;\
  1026  	ADDQ acc0, t0;\
  1027  	ADCQ $0, mul1;\
  1028  	ADDQ mul0, t0;\
  1029  	ADCQ $0, mul1;\
  1030  	MOVQ mul1, t1;\
  1031  	\
  1032  	MOVQ acc6, mul0;\
  1033  	MULQ acc7;\
  1034  	ADDQ mul0, t1;\
  1035  	ADCQ $0, mul1;\
  1036  	MOVQ mul1, t2;\
  1037  	XORQ t3, t3;\
  1038  	\// *2
  1039  	ADDQ acc1, acc1;\
  1040  	ADCQ acc2, acc2;\
  1041  	ADCQ acc3, acc3;\
  1042  	ADCQ t0, t0;\
  1043  	ADCQ t1, t1;\
  1044  	ADCQ t2, t2;\
  1045  	ADCQ $0, t3;\
  1046  	\// Missing products
  1047  	MOVQ acc4, mul0;\
  1048  	MULQ mul0;\
  1049  	MOVQ mul0, acc0;\
  1050  	MOVQ mul1, acc4;\
  1051  	\
  1052  	MOVQ acc5, mul0;\
  1053  	MULQ mul0;\
  1054  	ADDQ acc4, acc1;\
  1055  	ADCQ mul0, acc2;\
  1056  	ADCQ $0, mul1;\
  1057  	MOVQ mul1, acc4;\
  1058  	\
  1059  	MOVQ acc6, mul0;\
  1060  	MULQ mul0;\
  1061  	ADDQ acc4, acc3;\
  1062  	ADCQ mul0, t0;\
  1063  	ADCQ $0, mul1;\
  1064  	MOVQ mul1, acc4;\
  1065  	\
  1066  	MOVQ acc7, mul0;\
  1067  	MULQ mul0;\
  1068  	ADDQ acc4, t1;\
  1069  	ADCQ mul0, t2;\
  1070  	ADCQ mul1, t3;\
  1071  	\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
  1072  	sm2P256SqrReductionInline;
  1073  
        // p256SqrInternalInlineAdx: same contract as p256SqrInternalInline
        // (square [acc7..acc4] in registers, reduce via
        // sm2P256SqrReductionInline into acc4..acc7), implemented with
        // BMI2/ADX (MULXQ with mul1 as the implicit DX source plus the dual
        // ADCXQ/ADOXQ carry chains). Clobbers mul0, mul1, acc0..acc3, t0..t3
        // and flags.
  1074  #define p256SqrInternalInlineAdx \
  1075  	XORQ acc0, acc0;\
  1076  	XORQ t2, t2;\
  1077  	MOVQ acc4, mul1;\
  1078  	MULXQ acc5, acc1, acc2;\
  1079  	\
  1080  	MULXQ acc6, mul0, acc3;\
  1081  	ADOXQ mul0, acc2;\
  1082  	\
  1083  	MULXQ acc7, mul0, t0;\
  1084  	ADOXQ mul0, acc3;\
  1085  	ADOXQ t2, t0;\
  1086  	\
  1087  	MOVQ acc5, mul1;\
  1088  	MULXQ acc6, mul0, t3;\
  1089  	ADOXQ mul0, acc3;\
  1090  	\
  1091  	MULXQ acc7, mul0, t1;\
  1092  	ADCXQ t3, mul0;\
  1093  	ADOXQ mul0, t0;\
  1094  	ADCXQ t2, t1;\
  1095  	\
  1096  	MOVQ acc6, mul1;\
  1097  	MULXQ acc7, mul0, t2;\
  1098  	ADOXQ mul0, t1;\
  1099  	ADOXQ acc0, t2;\
  1100  	XORQ t3, t3;\
  1101  	\
  1102  	\// *2
  1103  	ADOXQ acc1, acc1;\
  1104  	ADOXQ acc2, acc2;\
  1105  	ADOXQ acc3, acc3;\
  1106  	ADOXQ t0, t0;\
  1107  	ADOXQ t1, t1;\
  1108  	ADOXQ t2, t2;\
  1109  	ADOXQ acc0, t3;\
  1110  	\
  1111  	\// Missing products
  1112  	MOVQ acc4, mul1;\
  1113  	MULXQ mul1, acc0, acc4;\ 
  1114  	ADDQ acc4, acc1;\
  1115  	\
  1116  	MOVQ acc5, mul1;\
  1117  	MULXQ mul1, mul0, acc4;\
  1118  	ADCXQ mul0, acc2;\
  1119  	ADCXQ acc4, acc3;\
  1120  	\
  1121  	MOVQ acc6, mul1;\
  1122  	MULXQ mul1, mul0, acc4;\
  1123  	ADCXQ mul0, t0;\
  1124  	ADCXQ acc4, t1;\
  1125  	\
  1126  	MOVQ acc7, mul1;\
  1127  	MULXQ mul1, mul0, acc4;\
  1128  	ADCXQ mul0, t2;\
  1129  	ADCXQ acc4, t3;\
  1130  	\// T = [t3, t2, t1, t0, acc3, acc2, acc1, acc0]
  1131  	sm2P256SqrReductionInline;
  1132  
  1133  // p256IsZeroInline returns 1 in AX if [acc4..acc7] represents zero and zero
  1134  // otherwise. It writes to [acc4..acc7], t0 and t1.
        // Note: the input may be in [0, p], so "zero" means zero mod p — the
        // second check (XOR against the limbs of p, then OR-compare) catches
        // the case where the value equals p itself. Both checks feed the same
        // AX flag through constant-time CMOVs.
  1135  #define p256IsZeroInline \
  1136  	\// AX contains a flag that is set if the input is zero.
  1137  	XORQ AX, AX;\
  1138  	MOVQ $1, t1;\
  1139  	\// Check whether [acc4..acc7] are all zero.
  1140  	MOVQ acc4, t0;\
  1141  	ORQ acc5, t0;\
  1142  	ORQ acc6, t0;\
  1143  	ORQ acc7, t0;\
  1144  	\// Set the zero flag if so. (CMOV of a constant to a register doesn't
  1145  	\// appear to be supported in Go. Thus t1 = 1.)
  1146  	CMOVQEQ t1, AX;\  // CMOVQEQ: Move if equal (ZF == 1)
  1147  	\// XOR [acc4..acc7] with P and compare with zero again.
  1148  	XORQ $-1, acc4;\
  1149  	XORQ p256p<>+0x08(SB), acc5;\
  1150  	XORQ $-1, acc6;\
  1151  	XORQ p256p<>+0x018(SB), acc7;\
  1152  	ORQ acc5, acc4;\
  1153  	ORQ acc6, acc4;\
  1154  	ORQ acc7, acc4;\
  1155  	\// Set the zero flag if so.
  1156  	\// CMOVQEQ: Move if equal (ZF == 1)
  1157  	CMOVQEQ t1, AX;
  1158  
        // p256PointDoubleInit(): copies the 96-byte Jacobian point at (BX)
        // (three 32-byte coordinates) into the local x/y/z storage slots via
        // SSE registers X0..X5. The x/y/z macros are stack-offset helpers
        // defined elsewhere in the package — presumably local frame addresses;
        // confirm against the TEXT blocks that use this macro.
  1159  #define p256PointDoubleInit() \
  1160  	MOVOU (16*0)(BX), X0;\
  1161  	MOVOU (16*1)(BX), X1;\
  1162  	MOVOU (16*2)(BX), X2;\
  1163  	MOVOU (16*3)(BX), X3;\
  1164  	MOVOU (16*4)(BX), X4;\
  1165  	MOVOU (16*5)(BX), X5;\
  1166  	\
  1167  	MOVOU X0, x(16*0);\
  1168  	MOVOU X1, x(16*1);\
  1169  	MOVOU X2, y(16*0);\
  1170  	MOVOU X3, y(16*1);\
  1171  	MOVOU X4, z(16*0);\
  1172  	MOVOU X5, z(16*1);