github.com/cloudflare/circl@v1.5.0/ecc/p384/arith_amd64.s (about)

     1  // +build amd64,!purego
     2  
     3  #include "textflag.h"
     4  
     5  #define storeBlock(w0,w1,w2,w3,w4,w5, dst) \
        	\ // Write the 64-bit words w0..w5 to six consecutive 8-byte slots starting at dst.
     6  	MOVQ w0,  0+dst \
     7  	MOVQ w1,  8+dst \
     8  	MOVQ w2, 16+dst \
     9  	MOVQ w3, 24+dst \
    10  	MOVQ w4, 32+dst \
    11  	MOVQ w5, 40+dst
    12  
    13  #define loadBlock(src, w0,w1,w2,w3,w4,w5) \
        	\ // Read six consecutive 8-byte slots starting at src into w0..w5.
    14  	MOVQ  0+src, w0 \
    15  	MOVQ  8+src, w1 \
    16  	MOVQ 16+src, w2 \
    17  	MOVQ 24+src, w3 \
    18  	MOVQ 32+src, w4 \
    19  	MOVQ 40+src, w5
    20  
    21  #define fp384Carry(a0,a1,a2,a3,a4,a5,a6, b0,b1,b2,b3,b4,b5,b6) \
        	\ // Conditional subtraction of p from the seven-limb value a0..a6 (a0 is the lowest limb).
        	\ // b0..b6 are scratch and are overwritten; a6 is only read. Flags are clobbered.
        	\ // On exit a0..a5 hold a-p when that subtraction does not borrow, and are unchanged otherwise.
    22  	\ // b = a-p
    23  	MOVQ a0, b0 \
    24  	MOVQ a1, b1 \
    25  	MOVQ a2, b2 \
    26  	MOVQ a3, b3 \
    27  	MOVQ a4, b4 \
    28  	MOVQ a5, b5 \
    29  	MOVQ a6, b6 \
    30  	\ // p occupies six limbs; the seventh limb b6 only absorbs the final borrow
    31  	SUBQ ·p+0(SB), b0 \
    32  	SBBQ ·p+8(SB), b1 \
    33  	SBBQ ·p+16(SB), b2 \
    34  	SBBQ ·p+24(SB), b3 \
    35  	SBBQ ·p+32(SB), b4 \
    36  	SBBQ ·p+40(SB), b5 \
    37  	SBBQ $0, b6 \
    38  	\ // the carry flag left by the last SBBQ selects the result; CMOV keeps this branch-free
    39  	\ // if b is negative (borrow, carry set) then return a
    40  	\ // else (carry clear) return b
    41  	CMOVQCC b0, a0 \
    42  	CMOVQCC b1, a1 \
    43  	CMOVQCC b2, a2 \
    44  	CMOVQCC b3, a3 \
    45  	CMOVQCC b4, a4 \
    46  	CMOVQCC b5, a5
    47  
    48  #define mul(a0,a1,a2,a3,a4,a5, rb, stack) \
        	\ // Schoolbook multiplication using MULQ: the six limbs a0..a5 times the six limbs at rb.
        	\ // The twelve-limb product is written to stack[0..95], lowest limb first.
        	\ // Each ai is multiplied by all of rb into R8..R14 and then added into the running sum in stack.
        	\ // Clobbers AX, DX, R8-R14 and flags.
    49  	\ // a0: R8..R14 = a0 * rb (seven limbs)
    50  	MOVQ a0, AX \
    51  	MULQ 0+rb \
    52  	MOVQ AX, R8 \
    53  	MOVQ DX, R9 \
    54  	MOVQ a0, AX \
    55  	MULQ 8+rb \
    56  	ADDQ AX, R9 \
    57  	ADCQ $0, DX \
    58  	MOVQ DX, R10 \
    59  	MOVQ a0, AX \
    60  	MULQ 16+rb \
    61  	ADDQ AX, R10 \
    62  	ADCQ $0, DX \
    63  	MOVQ DX, R11 \
    64  	MOVQ a0, AX \
    65  	MULQ 24+rb \
    66  	ADDQ AX, R11 \
    67  	ADCQ $0, DX \
    68  	MOVQ DX, R12 \
    69  	MOVQ a0, AX \
    70  	MULQ 32+rb \
    71  	ADDQ AX, R12 \
    72  	ADCQ $0, DX \
    73  	MOVQ DX, R13 \
    74  	MOVQ a0, AX \
    75  	MULQ 40+rb \
    76  	ADDQ AX, R13 \
    77  	ADCQ $0, DX \
    78  	MOVQ DX, R14 \
    79  	\ // first row: nothing to accumulate yet, store it as the initial running sum
    80  	storeBlock(R8,R9,R10,R11,R12,R13, 0+stack) \
    81  	MOVQ R14, 48+stack \
    82  	\
    83  	\ // a1: R8..R14 = a1 * rb
    84  	MOVQ a1, AX \
    85  	MULQ 0+rb \
    86  	MOVQ AX, R8 \
    87  	MOVQ DX, R9 \
    88  	MOVQ a1, AX \
    89  	MULQ 8+rb \
    90  	ADDQ AX, R9 \
    91  	ADCQ $0, DX \
    92  	MOVQ DX, R10 \
    93  	MOVQ a1, AX \
    94  	MULQ 16+rb \
    95  	ADDQ AX, R10 \
    96  	ADCQ $0, DX \
    97  	MOVQ DX, R11 \
    98  	MOVQ a1, AX \
    99  	MULQ 24+rb \
   100  	ADDQ AX, R11 \
   101  	ADCQ $0, DX \
   102  	MOVQ DX, R12 \
   103  	MOVQ a1, AX \
   104  	MULQ 32+rb \
   105  	ADDQ AX, R12 \
   106  	ADCQ $0, DX \
   107  	MOVQ DX, R13 \
   108  	MOVQ a1, AX \
   109  	MULQ 40+rb \
   110  	ADDQ AX, R13 \
   111  	ADCQ $0, DX \
   112  	MOVQ DX, R14 \
   113  	\ // add the six overlapping limbs of the running sum; R14 takes the carry and becomes the new top limb
   114  	ADDQ 8+stack, R8 \
   115  	ADCQ 16+stack, R9 \
   116  	ADCQ 24+stack, R10 \
   117  	ADCQ 32+stack, R11 \
   118  	ADCQ 40+stack, R12 \
   119  	ADCQ 48+stack, R13 \
   120  	ADCQ $0, R14 \
   121  	storeBlock(R8,R9,R10,R11,R12,R13, 8+stack) \
   122  	MOVQ R14, 56+stack \
   123  	\
   124  	\ // a2: R8..R14 = a2 * rb
   125  	MOVQ a2, AX \
   126  	MULQ 0+rb \
   127  	MOVQ AX, R8 \
   128  	MOVQ DX, R9 \
   129  	MOVQ a2, AX \
   130  	MULQ 8+rb \
   131  	ADDQ AX, R9 \
   132  	ADCQ $0, DX \
   133  	MOVQ DX, R10 \
   134  	MOVQ a2, AX \
   135  	MULQ 16+rb \
   136  	ADDQ AX, R10 \
   137  	ADCQ $0, DX \
   138  	MOVQ DX, R11 \
   139  	MOVQ a2, AX \
   140  	MULQ 24+rb \
   141  	ADDQ AX, R11 \
   142  	ADCQ $0, DX \
   143  	MOVQ DX, R12 \
   144  	MOVQ a2, AX \
   145  	MULQ 32+rb \
   146  	ADDQ AX, R12 \
   147  	ADCQ $0, DX \
   148  	MOVQ DX, R13 \
   149  	MOVQ a2, AX \
   150  	MULQ 40+rb \
   151  	ADDQ AX, R13 \
   152  	ADCQ $0, DX \
   153  	MOVQ DX, R14 \
   154  	\ // accumulate into stack[16..71]
   155  	ADDQ 16+stack, R8 \
   156  	ADCQ 24+stack, R9 \
   157  	ADCQ 32+stack, R10 \
   158  	ADCQ 40+stack, R11 \
   159  	ADCQ 48+stack, R12 \
   160  	ADCQ 56+stack, R13 \
   161  	ADCQ $0, R14 \
   162  	storeBlock(R8,R9,R10,R11,R12,R13, 16+stack) \
   163  	MOVQ R14, 64+stack \
   164  	\
   165  	\ // a3: R8..R14 = a3 * rb
   166  	MOVQ a3, AX \
   167  	MULQ 0+rb \
   168  	MOVQ AX, R8 \
   169  	MOVQ DX, R9 \
   170  	MOVQ a3, AX \
   171  	MULQ 8+rb \
   172  	ADDQ AX, R9 \
   173  	ADCQ $0, DX \
   174  	MOVQ DX, R10 \
   175  	MOVQ a3, AX \
   176  	MULQ 16+rb \
   177  	ADDQ AX, R10 \
   178  	ADCQ $0, DX \
   179  	MOVQ DX, R11 \
   180  	MOVQ a3, AX \
   181  	MULQ 24+rb \
   182  	ADDQ AX, R11 \
   183  	ADCQ $0, DX \
   184  	MOVQ DX, R12 \
   185  	MOVQ a3, AX \
   186  	MULQ 32+rb \
   187  	ADDQ AX, R12 \
   188  	ADCQ $0, DX \
   189  	MOVQ DX, R13 \
   190  	MOVQ a3, AX \
   191  	MULQ 40+rb \
   192  	ADDQ AX, R13 \
   193  	ADCQ $0, DX \
   194  	MOVQ DX, R14 \
   195  	\ // accumulate into stack[24..79]
   196  	ADDQ 24+stack, R8 \
   197  	ADCQ 32+stack, R9 \
   198  	ADCQ 40+stack, R10 \
   199  	ADCQ 48+stack, R11 \
   200  	ADCQ 56+stack, R12 \
   201  	ADCQ 64+stack, R13 \
   202  	ADCQ $0, R14 \
   203  	storeBlock(R8,R9,R10,R11,R12,R13, 24+stack) \
   204  	MOVQ R14, 72+stack \
   205  	\
   206  	\ // a4: R8..R14 = a4 * rb
   207  	MOVQ a4, AX \
   208  	MULQ 0+rb \
   209  	MOVQ AX, R8 \
   210  	MOVQ DX, R9 \
   211  	MOVQ a4, AX \
   212  	MULQ 8+rb \
   213  	ADDQ AX, R9 \
   214  	ADCQ $0, DX \
   215  	MOVQ DX, R10 \
   216  	MOVQ a4, AX \
   217  	MULQ 16+rb \
   218  	ADDQ AX, R10 \
   219  	ADCQ $0, DX \
   220  	MOVQ DX, R11 \
   221  	MOVQ a4, AX \
   222  	MULQ 24+rb \
   223  	ADDQ AX, R11 \
   224  	ADCQ $0, DX \
   225  	MOVQ DX, R12 \
   226  	MOVQ a4, AX \
   227  	MULQ 32+rb \
   228  	ADDQ AX, R12 \
   229  	ADCQ $0, DX \
   230  	MOVQ DX, R13 \
   231  	MOVQ a4, AX \
   232  	MULQ 40+rb \
   233  	ADDQ AX, R13 \
   234  	ADCQ $0, DX \
   235  	MOVQ DX, R14 \
   236  	\ // accumulate into stack[32..87]
   237  	ADDQ 32+stack, R8 \
   238  	ADCQ 40+stack, R9 \
   239  	ADCQ 48+stack, R10 \
   240  	ADCQ 56+stack, R11 \
   241  	ADCQ 64+stack, R12 \
   242  	ADCQ 72+stack, R13 \
   243  	ADCQ $0, R14 \
   244  	storeBlock(R8,R9,R10,R11,R12,R13, 32+stack) \
   245  	MOVQ R14, 80+stack \
   246  	\
   247  	\ // a5: R8..R14 = a5 * rb
   248  	MOVQ a5, AX \
   249  	MULQ 0+rb \
   250  	MOVQ AX, R8 \
   251  	MOVQ DX, R9 \
   252  	MOVQ a5, AX \
   253  	MULQ 8+rb \
   254  	ADDQ AX, R9 \
   255  	ADCQ $0, DX \
   256  	MOVQ DX, R10 \
   257  	MOVQ a5, AX \
   258  	MULQ 16+rb \
   259  	ADDQ AX, R10 \
   260  	ADCQ $0, DX \
   261  	MOVQ DX, R11 \
   262  	MOVQ a5, AX \
   263  	MULQ 24+rb \
   264  	ADDQ AX, R11 \
   265  	ADCQ $0, DX \
   266  	MOVQ DX, R12 \
   267  	MOVQ a5, AX \
   268  	MULQ 32+rb \
   269  	ADDQ AX, R12 \
   270  	ADCQ $0, DX \
   271  	MOVQ DX, R13 \
   272  	MOVQ a5, AX \
   273  	MULQ 40+rb \
   274  	ADDQ AX, R13 \
   275  	ADCQ $0, DX \
   276  	MOVQ DX, R14 \
   277  	\ // final row: accumulate into stack[40..95], completing the twelve-limb product
   278  	ADDQ 40+stack, R8 \
   279  	ADCQ 48+stack, R9 \
   280  	ADCQ 56+stack, R10 \
   281  	ADCQ 64+stack, R11 \
   282  	ADCQ 72+stack, R12 \
   283  	ADCQ 80+stack, R13 \
   284  	ADCQ $0, R14 \
   285  	storeBlock(R8,R9,R10,R11,R12,R13, 40+stack) \
   286  	MOVQ R14, 88+stack
   287  
   288  #define fp384Reduce(stack) \
        	\ // Montgomery-style reduction (MULQ variant) of the twelve-limb value T held at stack[0..95].
        	\ // Frame use relative to stack: m at [96..143], the product m*p at [144..239].
        	\ // The six-limb result is left in R14, SI, AX, BX, CX, DX (lowest limb first); it is not stored.
        	\ // Clobbers AX, BX, CX, DX, SI, DI, R8-R15 and flags.
   289  	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
   290  	MOVQ ·pp+0(SB), AX \
   291  	MULQ 0+stack \
   292  	MOVQ AX, R8 ; MOVQ R8, 96+stack\ // park limb 0 of m in the frame: R8 is reused as scratch below
   293  	MOVQ DX, R9 \
   294  	MOVQ ·pp+0(SB), AX \
   295  	MULQ 8+stack \
   296  	ADDQ AX, R9 \
   297  	ADCQ $0, DX \
   298  	MOVQ DX, R10 \
   299  	MOVQ ·pp+0(SB), AX \
   300  	MULQ 16+stack \
   301  	ADDQ AX, R10 \
   302  	ADCQ $0, DX \
   303  	MOVQ DX, R11 \
   304  	MOVQ ·pp+0(SB), AX \
   305  	MULQ 24+stack \
   306  	ADDQ AX, R11 \
   307  	ADCQ $0, DX \
   308  	MOVQ DX, R12 \
   309  	MOVQ ·pp+0(SB), AX \
   310  	MULQ 32+stack \
   311  	ADDQ AX, R12 \
   312  	ADCQ $0, DX \
   313  	MOVQ DX, R13 \
   314  	MOVQ ·pp+0(SB), AX \
   315  	MULQ 40+stack \
   316  	ADDQ AX, R13 \
   317  	\ // NOTE(review): there is no MULQ by ·pp+8; T is added one limb up instead, which matches ·pp+8 == 1 - confirm against the definition of ·pp
   318  	ADDQ 0+stack, R9 \
   319  	ADCQ 8+stack, R10 \
   320  	ADCQ 16+stack, R11 \
   321  	ADCQ 24+stack, R12 \
   322  	ADCQ 32+stack, R13 \
   323  	\ // ·pp+16 row: contributes to limbs 2..5 of m, so only T[0..3] are needed; anything above limb 5 is dropped
   324  	MOVQ ·pp+16(SB), AX \
   325  	MULQ 0+stack \
   326  	MOVQ AX, R14 \
   327  	MOVQ DX, R8 \
   328  	MOVQ ·pp+16(SB), AX \
   329  	MULQ 8+stack \
   330  	ADDQ AX, R8 \
   331  	ADCQ $0, DX \
   332  	MOVQ DX, BX \
   333  	MOVQ ·pp+16(SB), AX \
   334  	MULQ 16+stack \
   335  	ADDQ AX, BX \
   336  	ADCQ $0, DX \
   337  	MOVQ DX, CX \
   338  	MOVQ ·pp+16(SB), AX \
   339  	MULQ 24+stack \
   340  	ADDQ AX, CX \
   341  	\
   342  	ADDQ R14, R10 \
   343  	ADCQ R8, R11 \
   344  	ADCQ BX, R12 \
   345  	ADCQ CX, R13 \
   346  	\ // ·pp+24 row: limbs 3..5 of m, using T[0..2]
   347  	MOVQ ·pp+24(SB), AX \
   348  	MULQ 0+stack \
   349  	MOVQ AX, R14 \
   350  	MOVQ DX, R8 \
   351  	MOVQ ·pp+24(SB), AX \
   352  	MULQ 8+stack \
   353  	ADDQ AX, R8 \
   354  	ADCQ $0, DX \
   355  	MOVQ DX, BX \
   356  	MOVQ ·pp+24(SB), AX \
   357  	MULQ 16+stack \
   358  	ADDQ AX, BX \
   359  	\
   360  	ADDQ R14, R11 \
   361  	ADCQ R8, R12 \
   362  	ADCQ BX, R13 \
   363  	\ // ·pp+32 row: limbs 4..5 of m, using T[0..1]
   364  	MOVQ ·pp+32(SB), AX \
   365  	MULQ 0+stack \
   366  	MOVQ AX, R14 \
   367  	MOVQ DX, R8 \
   368  	MOVQ ·pp+32(SB), AX \
   369  	MULQ 8+stack \
   370  	ADDQ AX, R8 \
   371  	\
   372  	ADDQ R14, R12 \
   373  	ADCQ R8, R13 \
   374  	\ // ·pp+40 row: only the low half of ·pp[5]*T[0] reaches limb 5 of m
   375  	MOVQ ·pp+40(SB), AX \
   376  	MULQ 0+stack \
   377  	ADDQ AX, R13 \
   378  	\ // reload limb 0 of m, which was parked in the frame at the start
   379  	MOVQ 96+stack, R8 \
   380  	\
   381  	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
   382  	\
   383  	\ // m * P
   384  	mul(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
   385  	\
   386  	\ // Add the 768-bit intermediate to m*N
   387  	MOVQ $0, R15 \
   388  	loadBlock(144+stack, R8,R9,R10,R11,R12,R13) \
   389  	loadBlock(192+stack, R14,SI,AX,BX,CX,DX) \
   390  	\ // twelve-limb addition of T; R15 collects the carry out of the top limb
   391  	ADDQ 0+stack, R8 \
   392  	ADCQ 8+stack, R9 \
   393  	ADCQ 16+stack, R10 \
   394  	ADCQ 24+stack, R11 \
   395  	ADCQ 32+stack, R12 \
   396  	ADCQ 40+stack, R13 \
   397  	ADCQ 48+stack, R14 \
   398  	ADCQ 56+stack, SI \
   399  	ADCQ 64+stack, AX \
   400  	ADCQ 72+stack, BX \
   401  	ADCQ 80+stack, CX \
   402  	ADCQ 88+stack, DX \
   403  	ADCQ $0, R15 \
   404  	\ // keep the high six limbs plus carry; the low-half registers R8..R13 are reused as scratch for the conditional subtraction
   405  	fp384Carry(R14,SI,AX,BX,CX,DX,R15, R8,R9,R10,R11,R12,R13,DI)
   406  
   407  #define mulBMI2(a0,a1,a2,a3,a4,a5, rb, stack) \
        	\ // MULX (BMI2) variant of mul: multiplies the six limbs a0..a5 by the six limbs at rb.
        	\ // Only the six low limbs of the product are written, to stack[0..47]; the six high limbs are
        	\ // left in R14, R8, R9, R10, R11, R12 (lowest first) for the caller to store or consume.
        	\ // The carry chains run across MULXQ and MOV instructions, which do not modify flags.
        	\ // Clobbers AX, BX, DX, R8-R14 and flags.
   408  	MOVQ a0, DX \
   409  	MULXQ 0+rb, R8, R9; MOVQ R8, 0+stack; MOVQ $0, R8 \
   410  	MULXQ 8+rb, AX, R10 \
   411  	ADDQ AX, R9 \
   412  	MULXQ 16+rb, AX, R11 \
   413  	ADCQ AX, R10 \
   414  	MULXQ 24+rb, AX, R12 \
   415  	ADCQ AX, R11 \
   416  	MULXQ 32+rb, AX, R13 \
   417  	ADCQ AX, R12 \
   418  	MULXQ 40+rb, AX, R14 \
   419  	ADCQ AX, R13 \
   420  	ADCQ $0, R14 \
   421  	\ // a1: products with rb limbs 0, 2, 4 form one carry chain, limbs 1, 3, 5 a second one; the finished low limb is stored and its register zeroed for reuse as the new top limb
   422  	MOVQ a1, DX \
   423  	MULXQ 0+rb, AX, BX \
   424  	ADDQ AX, R9; MOVQ R9, 8+stack; MOVL $0, R9 \
   425  	ADCQ BX, R10 \
   426  	MULXQ 16+rb, AX, BX \
   427  	ADCQ AX, R11 \
   428  	ADCQ BX, R12 \
   429  	MULXQ 32+rb, AX, BX \
   430  	ADCQ AX, R13 \
   431  	ADCQ BX, R14 \
   432  	ADCQ $0,  R8 \
   433  	MULXQ 8+rb, AX, BX \
   434  	ADDQ AX, R10 \
   435  	ADCQ BX, R11 \
   436  	MULXQ 24+rb, AX, BX \
   437  	ADCQ AX, R12 \
   438  	ADCQ BX, R13 \
   439  	MULXQ 40+rb, AX, BX \
   440  	ADCQ AX, R14 \
   441  	ADCQ BX, R8 \
   442  	ADCQ $0, R9 \
   443  	\ // a2: same pattern, one limb higher
   444  	MOVQ a2, DX \
   445  	MULXQ 0+rb, AX, BX \
   446  	ADDQ AX, R10; MOVQ R10, 16+stack; MOVL $0, R10 \
   447  	ADCQ BX, R11 \
   448  	MULXQ 16+rb, AX, BX \
   449  	ADCQ AX, R12 \
   450  	ADCQ BX, R13 \
   451  	MULXQ 32+rb, AX, BX \
   452  	ADCQ AX, R14 \
   453  	ADCQ BX, R8 \
   454  	ADCQ $0, R9 \
   455  	MULXQ 8+rb, AX, BX \
   456  	ADDQ AX, R11 \
   457  	ADCQ BX, R12 \
   458  	MULXQ 24+rb, AX, BX \
   459  	ADCQ AX, R13 \
   460  	ADCQ BX, R14 \
   461  	MULXQ 40+rb, AX, BX \
   462  	ADCQ AX, R8 \
   463  	ADCQ BX, R9 \
   464  	ADCQ $0, R10 \
   465  	\ // a3
   466  	MOVQ a3, DX \
   467  	MULXQ 0+rb, AX, BX \
   468  	ADDQ AX, R11; MOVQ R11, 24+stack; MOVL $0, R11 \
   469  	ADCQ BX, R12 \
   470  	MULXQ 16+rb, AX, BX \
   471  	ADCQ AX, R13 \
   472  	ADCQ BX, R14 \
   473  	MULXQ 32+rb, AX, BX \
   474  	ADCQ AX, R8 \
   475  	ADCQ BX, R9 \
   476  	ADCQ $0, R10 \
   477  	MULXQ 8+rb, AX, BX \
   478  	ADDQ AX, R12 \
   479  	ADCQ BX, R13 \
   480  	MULXQ 24+rb, AX, BX \
   481  	ADCQ AX, R14 \
   482  	ADCQ BX, R8 \
   483  	MULXQ 40+rb, AX, BX \
   484  	ADCQ AX, R9 \
   485  	ADCQ BX, R10 \
   486  	ADCQ $0, R11 \
   487  	\ // a4
   488  	MOVQ a4, DX \
   489  	MULXQ 0+rb, AX, BX \
   490  	ADDQ AX, R12; MOVQ R12, 32+stack; MOVL $0, R12 \
   491  	ADCQ BX, R13 \
   492  	MULXQ 16+rb, AX, BX \
   493  	ADCQ AX, R14 \
   494  	ADCQ BX, R8 \
   495  	MULXQ 32+rb, AX, BX \
   496  	ADCQ AX, R9 \
   497  	ADCQ BX, R10 \
   498  	ADCQ $0, R11 \
   499  	MULXQ 8+rb, AX, BX \
   500  	ADDQ AX, R13 \
   501  	ADCQ BX, R14 \
   502  	MULXQ 24+rb, AX, BX \
   503  	ADCQ AX, R8 \
   504  	ADCQ BX, R9 \
   505  	MULXQ 40+rb, AX, BX \
   506  	ADCQ AX, R10 \
   507  	ADCQ BX, R11 \
   508  	ADCQ $0, R12 \
   509  	\ // a5: last row; limb 5 goes to the frame and limbs 6..11 stay in R14, R8..R12
   510  	MOVQ a5, DX \
   511  	MULXQ 0+rb, AX, BX \
   512  	ADDQ AX, R13; MOVQ R13, 40+stack \
   513  	ADCQ BX, R14 \
   514  	MULXQ 16+rb, AX, BX \
   515  	ADCQ AX, R8 \
   516  	ADCQ BX, R9 \
   517  	MULXQ 32+rb, AX, BX \
   518  	ADCQ AX, R10 \
   519  	ADCQ BX, R11 \
   520  	ADCQ $0, R12 \
   521  	MULXQ 8+rb, AX, BX \
   522  	ADDQ AX, R14 \
   523  	ADCQ BX, R8 \
   524  	MULXQ 24+rb, AX, BX \
   525  	ADCQ AX, R9 \
   526  	ADCQ BX, R10 \
   527  	MULXQ 40+rb, AX, BX \
   528  	ADCQ AX, R11 \
   529  	ADCQ BX, R12
   530  
   531  #define fp384ReduceBMI2(stack) \
        	\ // Montgomery-style reduction (MULX/BMI2 variant) of the twelve-limb value T held at stack[0..95].
        	\ // Frame use relative to stack: m at [96..143], low half of m*p at [144..191]; stack[0] is overwritten near the end.
        	\ // The six-limb result is left in R14, R8, R9, R10, R11, R12 (lowest limb first); it is not stored.
        	\ // Clobbers AX, BX, CX, DX, SI, DI, R8-R14 and flags.
   532  	\ // m = (T * P') mod R, store m in R8:R9:R10:R11:R12:R13
   533  	MOVQ ·pp+0(SB), DX \
   534  	MULXQ 0+stack, R8, R9 \
   535  	MULXQ 8+stack, AX, R10 \
   536  	ADDQ AX, R9 \
   537  	MULXQ 16+stack, AX, R11 \
   538  	ADCQ AX, R10 \
   539  	MULXQ 24+stack, AX, R12 \
   540  	ADCQ AX, R11 \
   541  	MULXQ 32+stack, AX, R13 \
   542  	ADCQ AX, R12 \
   543  	MULXQ 40+stack, AX, BX \
   544  	ADCQ AX, R13 \
   545  	\ // NOTE(review): there is no MULXQ by ·pp+8; T is added one limb up instead, which matches ·pp+8 == 1 - confirm against the definition of ·pp
   546  	ADDQ 0+stack, R9 \
   547  	ADCQ 8+stack, R10 \
   548  	ADCQ 16+stack, R11 \
   549  	ADCQ 24+stack, R12 \
   550  	ADCQ 32+stack, R13 \
   551  	\ // ·pp+16 row: contributes to limbs 2..5 of m; anything above limb 5 is dropped
   552  	MOVQ ·pp+16(SB), DX \
   553  	MULXQ 0+stack, AX, BX \
   554  	ADDQ AX, R10 \
   555  	ADCQ BX, R11 \
   556  	MULXQ 16+stack, AX, BX \
   557  	ADCQ AX, R12 \
   558  	ADCQ BX, R13 \
   559  	MULXQ 8+stack, AX, BX \
   560  	ADDQ AX, R11 \
   561  	ADCQ BX, R12 \
   562  	MULXQ 24+stack, AX, BX \
   563  	ADCQ AX, R13 \
   564  	\ // ·pp+24 row: limbs 3..5 of m
   565  	MOVQ ·pp+24(SB), DX \
   566  	MULXQ 0+stack, AX, BX \
   567  	ADDQ AX, R11 \
   568  	ADCQ BX, R12 \
   569  	MULXQ 16+stack, AX, BX \
   570  	ADCQ AX, R13 \
   571  	MULXQ 8+stack, AX, BX \
   572  	ADDQ AX, R12 \
   573  	ADCQ BX, R13 \
   574  	\ // ·pp+32 row: limbs 4..5 of m
   575  	MOVQ ·pp+32(SB), DX \
   576  	MULXQ 0+stack, AX, BX \
   577  	ADDQ AX, R12 \
   578  	ADCQ BX, R13 \
   579  	MULXQ 8+stack, AX, BX \
   580  	ADDQ AX, R13 \
   581  	\ // ·pp+40 row: only the low half of the product reaches limb 5 of m
   582  	MOVQ ·pp+40(SB), DX \
   583  	MULXQ 0+stack, AX, BX \
   584  	ADDQ AX, R13 \
   585  	\
   586  	storeBlock(R8,R9,R10,R11,R12,R13, 96+stack) \
   587  	\
   588  	\ // m * P
   589  	mulBMI2(·p+0(SB),·p+8(SB),·p+16(SB),·p+24(SB),·p+32(SB),·p+40(SB), 96+stack, 144+stack) \
   590  	\
   591  	\ // Add the 768-bit intermediate to m*N
   592  	loadBlock(144+stack, AX,R13,BX,CX,DX,DI) \
   593  	\ // low half of m*p was reloaded from the frame above; its high half is still in R14, R8..R12 as left by mulBMI2
   594  	ADDQ 0+stack,  AX \
   595  	ADCQ 8+stack, R13 \
   596  	ADCQ 16+stack, BX \
   597  	ADCQ 24+stack, CX \
   598  	ADCQ 32+stack, DX \
   599  	ADCQ 40+stack, DI \
   600  	ADCQ 48+stack, R14 \
   601  	ADCQ 56+stack, R8 \
   602  	ADCQ 64+stack, R9 \
   603  	ADCQ 72+stack, R10 \
   604  	ADCQ 80+stack, R11 \
   605  	ADCQ 88+stack, R12 \
   606  	MOVQ $0, 0+stack \
   607  	ADCQ $0, 0+stack \
   608  	\ // the carry-out limb lives in stack[0] (that T limb was already consumed); MOVQ does not disturb the carry flag
   609  	fp384Carry(R14,R8,R9,R10,R11,R12, 0+stack, AX,R13,BX,CX,DX,DI,SI)
   610  
   611  TEXT ·fp384Neg(SB), NOSPLIT, $0-16
        	// Writes p - a to c, followed by one conditional subtraction of p (fp384Carry).
        	// c+0(FP) and a+8(FP) are pointers to six 64-bit limbs each.
   612  	MOVQ a+8(FP), DI
   613  	MOVQ $0, R15
   614
        	// R8..R13 = p - a. MOVQ leaves the flags alone, so each limb of p can be
        	// loaded right before it is used without breaking the borrow chain.
   615  	MOVQ ·p+0(SB), R8
   616  	SUBQ 0(DI), R8
   617  	MOVQ ·p+8(SB), R9
   618  	SBBQ 8(DI), R9
   619  	MOVQ ·p+16(SB), R10
   620  	SBBQ 16(DI), R10
   621  	MOVQ ·p+24(SB), R11
   622  	SBBQ 24(DI), R11
   623  	MOVQ ·p+32(SB), R12
   624  	SBBQ 32(DI), R12
   625  	MOVQ ·p+40(SB), R13
   626  	SBBQ 40(DI), R13
   627
        	// R15 is the (zero) seventh limb; DI and SI are scratch from here on.
   628  	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   629
   630  	MOVQ c+0(FP), DI
   631  	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   632  	RET
   633  
   634  TEXT ·fp384Add(SB), NOSPLIT, $0-24
        	// Writes a + b to c, followed by one conditional subtraction of p (fp384Carry).
        	// c+0(FP), a+8(FP) and b+16(FP) are pointers to six 64-bit limbs each.
   635  	MOVQ b+16(FP), SI
   636  	MOVQ a+8(FP), DI
   637
        	// Start from b and add a limb by limb; R15 receives the carry out of limb 5.
   638  	MOVQ $0, R15
   639  	loadBlock(0(SI), R8,R9,R10,R11,R12,R13)
   640
   641  	ADDQ  0(DI), R8
   642  	ADCQ  8(DI), R9
   643  	ADCQ 16(DI), R10
   644  	ADCQ 24(DI), R11
   645  	ADCQ 32(DI), R12
   646  	ADCQ 40(DI), R13
   647  	ADCQ $0, R15
   648
        	// Both input pointers are dead now, so DI and SI can serve as scratch.
   649  	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   650
   651  	MOVQ c+0(FP), DI
   652  	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   653  	RET
   654  
   655  TEXT ·fp384Sub(SB), NOSPLIT, $0-24
        	// Writes (p - b) + a to c, followed by one conditional subtraction of p (fp384Carry).
        	// c+0(FP), a+8(FP) and b+16(FP) are pointers to six 64-bit limbs each.
   656  	MOVQ b+16(FP), SI
   657  	MOVQ a+8(FP), DI
   658  	MOVQ $0, R15
   659
        	// R8..R13 = p - b. MOVQ does not touch the flags, so the loads of p can be
        	// interleaved with the borrow chain.
   660  	MOVQ ·p+0(SB), R8
   661  	SUBQ 0(SI), R8
   662  	MOVQ ·p+8(SB), R9
   663  	SBBQ 8(SI), R9
   664  	MOVQ ·p+16(SB), R10
   665  	SBBQ 16(SI), R10
   666  	MOVQ ·p+24(SB), R11
   667  	SBBQ 24(SI), R11
   668  	MOVQ ·p+32(SB), R12
   669  	SBBQ 32(SI), R12
   670  	MOVQ ·p+40(SB), R13
   671  	SBBQ 40(SI), R13
   672
        	// Add a; the carry out of limb 5 lands in R15, the seventh limb.
   673  	ADDQ 0(DI), R8
   674  	ADCQ 8(DI), R9
   675  	ADCQ 16(DI), R10
   676  	ADCQ 24(DI), R11
   677  	ADCQ 32(DI), R12
   678  	ADCQ 40(DI), R13
   679  	ADCQ $0, R15
   680
   681  	fp384Carry(R8,R9,R10,R11,R12,R13,R15, R14,AX,BX,CX,DX,DI,SI)
   682
   683  	MOVQ c+0(FP), DI
   684  	storeBlock(R8,R9,R10,R11,R12,R13, 0(DI))
   685  	RET
   686  
   687  TEXT ·fp384Mul(SB), NOSPLIT, $240-24
        	// Multiplies the six-limb values at a and b, reduces the product and stores six limbs at c.
        	// Frame layout (240 bytes): T = a*b at 0..95(SP), m at 96..143(SP), m*p at 144..239(SP).
   688  	MOVQ b+16(FP), SI
   689  	MOVQ a+8(FP), DI
   690
   691  	// MULX requires BMI2; fall back to the MULQ-based code when it is absent.
   692  	CMPB ·hasBMI2(SB), $0
   693  	JE   mulqPath
   694
   695  	// T = a * b. mulBMI2 writes only the low half; the high half comes back in registers.
   696  	mulBMI2(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
   697  	storeBlock(R14,R8,R9,R10,R11,R12, 48(SP))
   698
   699  	// Reduce T; the result is returned in R14, R8..R12.
   700  	fp384ReduceBMI2(0(SP))
   701
   702  	MOVQ c+0(FP), DI
   703  	storeBlock(R14,R8,R9,R10,R11,R12, 0(DI))
   704  	RET
   705
   706  mulqPath:
   707  	// T = a * b, written in full to 0..95(SP).
   708  	mul(0(DI),8(DI),16(DI),24(DI),32(DI),40(DI), 0(SI), 0(SP))
   709
   710  	// Reduce T; the result is returned in R14, SI, AX, BX, CX, DX.
   711  	fp384Reduce(0(SP))
   712
   713  	MOVQ c+0(FP), DI
   714  	storeBlock(R14,SI,AX,BX,CX,DX, 0(DI))
   715  	RET
   718  
   719  TEXT ·fp384Cmov(SB), NOSPLIT, $0-24
        	// Conditional move: if b != 0 the six limbs at y are copied over x, otherwise x keeps its value.
        	// x+0(FP) and y+8(FP) are pointers to six 64-bit limbs; b+16(FP) is read as a 64-bit word.
        	// Every limb of x and y is loaded and every limb of x is written back whatever b is,
        	// and the selection uses CMOV, so there is no branch on b.
        	// The argument size (three 8-byte words) is declared explicitly, like every other TEXT in this file.
   720  	MOVQ x+0(FP), DI
   721  	MOVQ y+8(FP), SI
   722  	MOVQ b+16(FP), BX
   723  	TESTQ BX, BX // ZF = (b == 0); the MOVQs below leave the flags untouched
   724  	MOVQ  0(DI), AX; MOVQ  0(SI), DX; CMOVQNE DX, AX; MOVQ AX,  0(DI)
   725  	MOVQ  8(DI), AX; MOVQ  8(SI), DX; CMOVQNE DX, AX; MOVQ AX,  8(DI)
   726  	MOVQ 16(DI), AX; MOVQ 16(SI), DX; CMOVQNE DX, AX; MOVQ AX, 16(DI)
   727  	MOVQ 24(DI), AX; MOVQ 24(SI), DX; CMOVQNE DX, AX; MOVQ AX, 24(DI)
   728  	MOVQ 32(DI), AX; MOVQ 32(SI), DX; CMOVQNE DX, AX; MOVQ AX, 32(DI)
   729  	MOVQ 40(DI), AX; MOVQ 40(SI), DX; CMOVQNE DX, AX; MOVQ AX, 40(DI)
   730  	RET