github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_cmn_amd64.s (about)

     1  //go:build !purego
     2  
     3  #include "textflag.h"
     4  
     5  #include "gfp_macros_amd64.s"
     6  
     7  TEXT ·gfpNeg(SB),NOSPLIT,$0-16
     8  	// Stores (·p2 - a) at c after a gfpCarryWithoutCarry pass (macro not defined in this file).
     9  	MOVQ a+8(FP), DI
    10  	MOVQ ·p2+0x00(SB), R8
    11  	MOVQ ·p2+0x08(SB), R9
    12  	MOVQ ·p2+0x10(SB), R10
    13  	MOVQ ·p2+0x18(SB), R11
    14  	// R11:R10:R9:R8 = ·p2 - a, with the borrow rippling up from the lowest limb.
    15  	SUBQ (8*0)(DI), R8
    16  	SBBQ (8*1)(DI), R9
    17  	SBBQ (8*2)(DI), R10
    18  	SBBQ (8*3)(DI), R11
    19  	// R12, R13, R14 and CX are passed as extra macro operands (presumably scratch; verify in the macro file).
    20  	gfpCarryWithoutCarry(R8,R9,R10,R11, R12,R13,R14,CX)
    21  	MOVQ c+0(FP), DI  // the source pointer is no longer needed, reuse DI for the destination
    22  	storeBlock(R8,R9,R10,R11, 0(DI))
    23  	RET
    24  
    25  TEXT ·gfpAdd(SB),NOSPLIT,$0-24
    26  	// Stores a + b at c after a gfpCarry pass (macro not defined in this file).
    27  	MOVQ b+16(FP), SI
    28  	MOVQ a+8(FP), DI
    29  	loadBlock(0(DI), R8,R9,R10,R11)
    30  	XORQ R12, R12  // R12 collects the carry out of the 256-bit addition
    31  	// R12:R11:R10:R9:R8 = a + b
    32  	ADDQ (8*0)(SI), R8
    33  	ADCQ (8*1)(SI), R9
    34  	ADCQ (8*2)(SI), R10
    35  	ADCQ (8*3)(SI), R11
    36  	ADCQ $0, R12
    37  	// R13, R14, CX and AX are passed as extra macro operands; R12 is the fifth (top) limb.
    38  	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
    39  	// All source limbs were read before this point, so c may alias a or b.
    40  	MOVQ c+0(FP), DI
    41  	storeBlock(R8,R9,R10,R11, 0(DI))
    42  	RET
    43  
    44  TEXT ·gfpDouble(SB),NOSPLIT,$0-16
    45  	// Doubles the value at FP slot 8 (named b here) and stores it at FP slot 0 (named a here).
    46  	MOVQ b+8(FP), SI
    47  	loadBlock(0(SI), R8,R9,R10,R11)
    48  	XORQ R12, R12  // R12 collects the bit shifted out of the top limb
    49  	// R12:R11:R10:R9:R8 = 2 * source, computed as source + source
    50  	ADDQ R8, R8
    51  	ADCQ R9, R9
    52  	ADCQ R10, R10
    53  	ADCQ R11, R11
    54  	ADCQ $0, R12
    55  	// gfpCarry is a macro from outside this file; R12 is handed over as the top limb.
    56  	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
    57  	// Fetch the destination only now that the result is ready.
    58  	MOVQ a+0(FP), DI
    59  	storeBlock(R8,R9,R10,R11, 0(DI))
    60  	RET
    61  
    62  TEXT ·gfpTriple(SB),NOSPLIT,$0-16
    63  	// Triples the value at FP slot 8 (named b here) and stores it at FP slot 0 (named a here).
    64  	// Done as (x + x) and then (+ x), with a gfpCarry pass after each addition.
    65  	MOVQ b+8(FP), SI
    66  	loadBlock(0(SI), R8,R9,R10,R11)
    67  	XORQ R12, R12
    68  	// R12:R11:R10:R9:R8 = 2 * source
    69  	ADDQ R8, R8
    70  	ADCQ R9, R9
    71  	ADCQ R10, R10
    72  	ADCQ R11, R11
    73  	ADCQ $0, R12
    74  	// gfpCarry is a macro from outside this file; R12 is handed over as the top limb.
    75  	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
    76  	// SI still addresses the source; add it once more on top of the doubled value.
    77  	XORQ R12, R12
    78  	ADDQ (8*0)(SI), R8
    79  	ADCQ (8*1)(SI), R9
    80  	ADCQ (8*2)(SI), R10
    81  	ADCQ (8*3)(SI), R11
    82  	ADCQ $0, R12
    83  	gfpCarry(R8,R9,R10,R11, R13,R14,CX,AX,R12)
    84  	// Fetch the destination only now that the result is ready.
    85  	MOVQ a+0(FP), DI
    86  	storeBlock(R8,R9,R10,R11, 0(DI))
    87  	RET
    88  
    89  TEXT ·gfpSub(SB),NOSPLIT,$0-24
    90  	// Stores a - b at c, adding ·p2 back when the 256-bit subtraction borrows.
    91  	MOVQ b+16(FP), SI
    92  	MOVQ a+8(FP), DI
    93  	loadBlock(0(DI), R8,R9,R10,R11)
    94  	// Prepare both possible correction terms up front: zero (AX) and ·p2 (CX:R14:R13:R12).
    95  	XORQ AX, AX
    96  	MOVQ ·p2+0x00(SB), R12
    97  	MOVQ ·p2+0x08(SB), R13
    98  	MOVQ ·p2+0x10(SB), R14
    99  	MOVQ ·p2+0x18(SB), CX
   100  	// R11:R10:R9:R8 = a - b; CF is set when the subtraction borrows.
   101  	SUBQ (8*0)(SI), R8
   102  	SBBQ (8*1)(SI), R9
   103  	SBBQ (8*2)(SI), R10
   104  	SBBQ (8*3)(SI), R11
   105  	// No borrow (carry clear): replace the correction term with zero. No branch is taken.
   106  	CMOVQCC AX, R12
   107  	CMOVQCC AX, R13
   108  	CMOVQCC AX, R14
   109  	CMOVQCC AX, CX
   110  	// Add the selected correction term (0 or ·p2).
   111  	ADDQ R12, R8
   112  	ADCQ R13, R9
   113  	ADCQ R14, R10
   114  	ADCQ CX, R11
   115  	// All source limbs were read before this point, so c may alias a or b.
   116  	MOVQ c+0(FP), DI
   117  	storeBlock(R8,R9,R10,R11, 0(DI))
   118  	RET
   119  
   120  TEXT ·gfpMul(SB),NOSPLIT,$0-24
        	// Multiplies the 4-limb values at in1 and in2 and stores a 4-limb result at res.
        	// Each "x * y[i]" pass is followed by a reduction step: the lowest live limb is
        	// multiplied by ·np (low 64 bits kept) and that multiple of ·p2 is added in,
        	// i.e. an interleaved Montgomery-style multiplication.
        	// x_ptr, y_ptr, res_ptr, acc0-acc5 and t0 are register aliases that are not
        	// defined in this file (presumably in gfp_macros_amd64.s).
        	// Two code paths: MULX/ADCX/ADOX when ·supportADX is non-zero, otherwise the
        	// MULQ/ADDQ/ADCQ path at noAdxMul. The accumulator limbs rotate by one
        	// position after every reduction step in both paths.
   121  	MOVQ in1+8(FP), x_ptr
   122  	MOVQ in2+16(FP), y_ptr
   123  	
   124  	CMPB ·supportADX(SB), $0
   125  	JE   noAdxMul
   126  
   127  	XORQ acc5, acc5  // acc5 = 0; XOR also clears CF and OF for the ADCX/ADOX chains
   128  	XORQ res_ptr, res_ptr  // res_ptr serves as a constant zero until it is reloaded before the store
   129  	// x * y[0]
   130  	MOVQ (8*0)(y_ptr), DX  // MULXQ takes its implicit multiplicand from DX
   131  	MULXQ (8*0)(x_ptr), acc0, acc1 
   132  
   133  	MULXQ (8*1)(x_ptr), AX, acc2
   134  	ADCXQ AX, acc1
   135  
   136  	MULXQ (8*2)(x_ptr), AX, acc3
   137  	ADCXQ AX, acc2
   138  
   139  	MULXQ (8*3)(x_ptr), AX, acc4
   140  	ADCXQ AX, acc3
   141  	ADCXQ acc5, acc4  // acc5 is zero here: only folds the pending CF into acc4
   142  
   143  	// First reduction step
   144  	MOVQ acc0, DX
   145  	MULXQ ·np+0x00(SB), DX, AX  // DX = low 64 bits of acc0 * np; AX (high half) is overwritten below
   146  
   147  	MULXQ ·p2+0x00(SB), AX, t0
   148  	ADOXQ AX, acc0  // OF chain accumulates into the limbs, CF chain joins high/low product halves
   149  
   150  	MULXQ ·p2+0x08(SB), AX, BX
   151  	ADCXQ t0, AX
   152  	ADOXQ AX, acc1
   153  
   154  	MULXQ ·p2+0x10(SB), AX, t0
   155  	ADCXQ BX, AX
   156  	ADOXQ AX, acc2
   157  
   158  	MULXQ ·p2+0x18(SB), AX, BX
   159  	ADCXQ t0, AX
   160  	ADOXQ AX, acc3
   161  
   162  	ADCXQ res_ptr, BX  // res_ptr is zero: folds the last CF into the top product word
   163  	ADOXQ BX, acc4
   164  	ADOXQ res_ptr, acc5  // acc5 receives the final OF carry
   165  	XORQ acc0, acc0  // It seems this line is optional.
   166  
   167  	// x * y[1]
   168  	MOVQ (8*1)(y_ptr), DX
   169  	MULXQ (8*0)(x_ptr), AX, t0
   170  	ADOXQ AX, acc1
   171  
   172  	MULXQ (8*1)(x_ptr), AX, BX 
   173  	ADCXQ t0, AX
   174  	ADOXQ AX, acc2
   175  
   176  	MULXQ (8*2)(x_ptr), AX, t0
   177  	ADCXQ BX, AX
   178  	ADOXQ AX, acc3
   179  
   180  	MULXQ (8*3)(x_ptr), AX, BX
   181  	ADCXQ t0, AX
   182  	ADOXQ AX, acc4
   183  
   184  	ADCXQ acc0, BX  // acc0 was zeroed just above, so it acts as the zero operand here
   185  	ADOXQ BX, acc5
   186  	ADOXQ res_ptr, acc0  // acc0 now becomes the new top limb
   187  
   188  	// Second reduction step
   189  	MOVQ acc1, DX
   190  	MULXQ ·np+0x00(SB), DX, AX
   191  
   192  	MULXQ ·p2+0x00(SB), AX, t0
   193  	ADOXQ AX, acc1
   194  
   195  	MULXQ ·p2+0x08(SB), AX, BX
   196  	ADCXQ t0, AX
   197  	ADOXQ AX, acc2
   198  
   199  	MULXQ ·p2+0x10(SB), AX, t0
   200  	ADCXQ BX, AX
   201  	ADOXQ AX, acc3
   202  
   203  	MULXQ ·p2+0x18(SB), AX, BX
   204  	ADCXQ t0, AX
   205  	ADOXQ AX, acc4
   206  
   207  	ADCXQ res_ptr, BX
   208  	ADOXQ BX, acc5
   209  	ADOXQ res_ptr, acc0
   210  	XORQ acc1, acc1  // It seems this line is optional.
   211  
   212  	// x * y[2]
   213  	MOVQ (8*2)(y_ptr), DX
   214  	MULXQ (8*0)(x_ptr), AX, t0
   215  	ADOXQ AX, acc2
   216  
   217  	MULXQ (8*1)(x_ptr), AX, BX 
   218  	ADCXQ t0, AX
   219  	ADOXQ AX, acc3
   220  
   221  	MULXQ (8*2)(x_ptr), AX, t0
   222  	ADCXQ BX, AX
   223  	ADOXQ AX, acc4
   224  
   225  	MULXQ (8*3)(x_ptr), AX, BX
   226  	ADCXQ t0, AX
   227  	ADOXQ AX, acc5
   228  
   229  	ADCXQ res_ptr, BX
   230  	ADOXQ BX, acc0
   231  	ADOXQ res_ptr, acc1  // acc1 now becomes the new top limb
   232  
   233  	// Third reduction step
   234  	MOVQ acc2, DX
   235  	MULXQ ·np+0x00(SB), DX, AX
   236  
   237  	MULXQ ·p2+0x00(SB), AX, t0
   238  	ADOXQ AX, acc2
   239  
   240  	MULXQ ·p2+0x08(SB), AX, BX
   241  	ADCXQ t0, AX
   242  	ADOXQ AX, acc3
   243  
   244  	MULXQ ·p2+0x10(SB), AX, t0
   245  	ADCXQ BX, AX
   246  	ADOXQ AX, acc4
   247  
   248  	MULXQ ·p2+0x18(SB), AX, BX
   249  	ADCXQ t0, AX
   250  	ADOXQ AX, acc5
   251  
   252  	ADCXQ res_ptr, BX
   253  	ADOXQ BX, acc0
   254  	ADOXQ res_ptr, acc1
   255  	XORQ acc2, acc2  // It seems this line is optional.
   256  
   257  	// x * y[3]
   258  	MOVQ (8*3)(y_ptr), DX
   259  	MULXQ (8*0)(x_ptr), AX, t0
   260  	ADOXQ AX, acc3
   261  
   262  	MULXQ (8*1)(x_ptr), AX, BX 
   263  	ADCXQ t0, AX
   264  	ADOXQ AX, acc4
   265  
   266  	MULXQ (8*2)(x_ptr), AX, t0 
   267  	ADCXQ BX, AX
   268  	ADOXQ AX, acc5
   269  
   270  	MULXQ (8*3)(x_ptr), AX, BX
   271  	ADCXQ t0, AX
   272  	ADOXQ AX, acc0
   273  
   274  	ADCXQ res_ptr, BX
   275  	ADOXQ BX, acc1
   276  	ADOXQ res_ptr, acc2  // acc2 now becomes the new top limb
   277  
   278  	// Last reduction step
   279  	MOVQ acc3, DX
   280  	MULXQ ·np+0x00(SB), DX, AX
   281  
   282  	MULXQ ·p2+0x00(SB), AX, t0
   283  	ADOXQ AX, acc3
   284  
   285  	MULXQ ·p2+0x08(SB), AX, BX
   286  	ADCXQ t0, AX
   287  	ADOXQ AX, acc4
   288  
   289  	MULXQ ·p2+0x10(SB), AX, t0
   290  	ADCXQ BX, AX
   291  	ADOXQ AX, acc5
   292  
   293  	MULXQ ·p2+0x18(SB), AX, BX
   294  	ADCXQ t0, AX
   295  	ADOXQ AX, acc0
   296  
   297  	ADCXQ res_ptr, BX
   298  	ADOXQ BX, acc1
   299  	ADOXQ res_ptr, acc2
   300  	// Result limbs are acc4,acc5,acc0,acc1 (low to high) with acc2 on top: run the
        	// gfpCarry macro over them (x_ptr, acc3, t0 and BX are passed as extra operands,
        	// presumably scratch), then reload res_ptr, which was used as zero, and store.
   301  	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
   302  	MOVQ res+0(FP), res_ptr
   303  	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
   304  	RET
   305  
   306  noAdxMul:
        	// Same computation without MULX/ADCX/ADOX: MULQ leaves its product in DX:AX,
        	// t0 holds the current multiplier and BX carries the high word to the next limb.
   307  	// x * y[0]
   308  	MOVQ (8*0)(y_ptr), t0
   309  
   310  	MOVQ (8*0)(x_ptr), AX
   311  	MULQ t0
   312  	MOVQ AX, acc0
   313  	MOVQ DX, acc1
   314  
   315  	MOVQ (8*1)(x_ptr), AX
   316  	MULQ t0
   317  	ADDQ AX, acc1
   318  	ADCQ $0, DX
   319  	MOVQ DX, acc2
   320  
   321  	MOVQ (8*2)(x_ptr), AX
   322  	MULQ t0
   323  	ADDQ AX, acc2
   324  	ADCQ $0, DX
   325  	MOVQ DX, acc3
   326  
   327  	MOVQ (8*3)(x_ptr), AX
   328  	MULQ t0
   329  	ADDQ AX, acc3
   330  	ADCQ $0, DX
   331  	MOVQ DX, acc4
   332  	XORQ acc5, acc5
   333  	// First reduction step
   334  	MOVQ acc0, AX
   335  	MULQ ·np+0x00(SB)
   336  	MOVQ AX, t0  // keep only the low 64 bits of acc0 * np
   337  
   338  	MOVQ ·p2+0x00(SB), AX
   339  	MULQ t0
   340  	ADDQ AX, acc0
   341  	ADCQ $0, DX
   342  	MOVQ DX, BX
   343  
   344  	MOVQ ·p2+0x08(SB), AX
   345  	MULQ t0
   346  	ADDQ BX, acc1
   347  	ADCQ $0, DX
   348  	ADDQ AX, acc1
   349  	ADCQ $0, DX
   350  	MOVQ DX, BX
   351  
   352  	MOVQ ·p2+0x10(SB), AX
   353  	MULQ t0
   354  	ADDQ BX, acc2
   355  	ADCQ $0, DX
   356  	ADDQ AX, acc2
   357  	ADCQ $0, DX
   358  	MOVQ DX, BX
   359  
   360  	MOVQ ·p2+0x18(SB), AX
   361  	MULQ t0
   362  	ADDQ BX, acc3
   363  	ADCQ $0, DX
   364  	ADDQ AX, acc3
   365  	ADCQ DX, acc4
   366  	ADCQ $0, acc5  // acc5 receives the carry out of acc4
   367  
   368  	XORQ acc0, acc0  // It seems this line is optional.
   369  	// x * y[1]
   370  	MOVQ (8*1)(y_ptr), t0
   371  
   372  	MOVQ (8*0)(x_ptr), AX
   373  	MULQ t0
   374  	ADDQ AX, acc1
   375  	ADCQ $0, DX
   376  	MOVQ DX, BX
   377  
   378  	MOVQ (8*1)(x_ptr), AX
   379  	MULQ t0
   380  	ADDQ BX, acc2
   381  	ADCQ $0, DX
   382  	ADDQ AX, acc2
   383  	ADCQ $0, DX
   384  	MOVQ DX, BX
   385  
   386  	MOVQ (8*2)(x_ptr), AX
   387  	MULQ t0
   388  	ADDQ BX, acc3
   389  	ADCQ $0, DX
   390  	ADDQ AX, acc3
   391  	ADCQ $0, DX
   392  	MOVQ DX, BX
   393  
   394  	MOVQ (8*3)(x_ptr), AX
   395  	MULQ t0
   396  	ADDQ BX, acc4
   397  	ADCQ $0, DX
   398  	ADDQ AX, acc4
   399  	ADCQ DX, acc5
   400  	ADCQ $0, acc0  // acc0 (zeroed above) becomes the new top limb
   401  	// Second reduction step
   402  	MOVQ acc1, AX
   403  	MULQ ·np+0x00(SB)
   404  	MOVQ AX, t0
   405  
   406  	MOVQ ·p2+0x00(SB), AX
   407  	MULQ t0
   408  	ADDQ AX, acc1
   409  	ADCQ $0, DX
   410  	MOVQ DX, BX
   411  
   412  	MOVQ ·p2+0x08(SB), AX
   413  	MULQ t0
   414  	ADDQ BX, acc2
   415  	ADCQ $0, DX
   416  	ADDQ AX, acc2
   417  	ADCQ $0, DX
   418  	MOVQ DX, BX
   419  
   420  	MOVQ ·p2+0x10(SB), AX
   421  	MULQ t0
   422  	ADDQ BX, acc3
   423  	ADCQ $0, DX
   424  	ADDQ AX, acc3
   425  	ADCQ $0, DX
   426  	MOVQ DX, BX
   427  
   428  	MOVQ ·p2+0x18(SB), AX
   429  	MULQ t0
   430  	ADDQ BX, acc4
   431  	ADCQ $0, DX
   432  	ADDQ AX, acc4
   433  	ADCQ DX, acc5
   434  	ADCQ $0, acc0
   435  
   436  	XORQ acc1, acc1  // It seems this line is optional.
   437  	// x * y[2]
   438  	MOVQ (8*2)(y_ptr), t0
   439  
   440  	MOVQ (8*0)(x_ptr), AX
   441  	MULQ t0
   442  	ADDQ AX, acc2
   443  	ADCQ $0, DX
   444  	MOVQ DX, BX
   445  
   446  	MOVQ (8*1)(x_ptr), AX
   447  	MULQ t0
   448  	ADDQ BX, acc3
   449  	ADCQ $0, DX
   450  	ADDQ AX, acc3
   451  	ADCQ $0, DX
   452  	MOVQ DX, BX
   453  
   454  	MOVQ (8*2)(x_ptr), AX
   455  	MULQ t0
   456  	ADDQ BX, acc4
   457  	ADCQ $0, DX
   458  	ADDQ AX, acc4
   459  	ADCQ $0, DX
   460  	MOVQ DX, BX
   461  
   462  	MOVQ (8*3)(x_ptr), AX
   463  	MULQ t0
   464  	ADDQ BX, acc5
   465  	ADCQ $0, DX
   466  	ADDQ AX, acc5
   467  	ADCQ DX, acc0
   468  	ADCQ $0, acc1  // acc1 (zeroed above) becomes the new top limb
   469  	// Third reduction step
   470  	MOVQ acc2, AX
   471  	MULQ ·np+0x00(SB)
   472  	MOVQ AX, t0
   473  
   474  	MOVQ ·p2+0x00(SB), AX
   475  	MULQ t0
   476  	ADDQ AX, acc2
   477  	ADCQ $0, DX
   478  	MOVQ DX, BX
   479  
   480  	MOVQ ·p2+0x08(SB), AX
   481  	MULQ t0
   482  	ADDQ BX, acc3
   483  	ADCQ $0, DX
   484  	ADDQ AX, acc3
   485  	ADCQ $0, DX
   486  	MOVQ DX, BX
   487  
   488  	MOVQ ·p2+0x10(SB), AX
   489  	MULQ t0
   490  	ADDQ BX, acc4
   491  	ADCQ $0, DX
   492  	ADDQ AX, acc4
   493  	ADCQ $0, DX
   494  	MOVQ DX, BX
   495  
   496  	MOVQ ·p2+0x18(SB), AX
   497  	MULQ t0
   498  	ADDQ BX, acc5
   499  	ADCQ $0, DX
   500  	ADDQ AX, acc5
   501  	ADCQ DX, acc0
   502  	ADCQ $0, acc1
   503  
   504  	XORQ acc2, acc2  // It seems this line is optional.
   505  	// x * y[3]
   506  	MOVQ (8*3)(y_ptr), t0
   507  
   508  	MOVQ (8*0)(x_ptr), AX
   509  	MULQ t0
   510  	ADDQ AX, acc3
   511  	ADCQ $0, DX
   512  	MOVQ DX, BX
   513  
   514  	MOVQ (8*1)(x_ptr), AX
   515  	MULQ t0
   516  	ADDQ BX, acc4
   517  	ADCQ $0, DX
   518  	ADDQ AX, acc4
   519  	ADCQ $0, DX
   520  	MOVQ DX, BX
   521  
   522  	MOVQ (8*2)(x_ptr), AX
   523  	MULQ t0
   524  	ADDQ BX, acc5
   525  	ADCQ $0, DX
   526  	ADDQ AX, acc5
   527  	ADCQ $0, DX
   528  	MOVQ DX, BX
   529  
   530  	MOVQ (8*3)(x_ptr), AX
   531  	MULQ t0
   532  	ADDQ BX, acc0
   533  	ADCQ $0, DX
   534  	ADDQ AX, acc0
   535  	ADCQ DX, acc1
   536  	ADCQ $0, acc2  // acc2 (zeroed above) becomes the new top limb
   537  	// Last reduction step
   538  	MOVQ acc3, AX
   539  	MULQ ·np+0x00(SB)
   540  	MOVQ AX, t0
   541  
   542  	MOVQ ·p2+0x00(SB), AX
   543  	MULQ t0
   544  	ADDQ AX, acc3
   545  	ADCQ $0, DX
   546  	MOVQ DX, BX
   547  
   548  	MOVQ ·p2+0x08(SB), AX
   549  	MULQ t0
   550  	ADDQ BX, acc4
   551  	ADCQ $0, DX
   552  	ADDQ AX, acc4
   553  	ADCQ $0, DX
   554  	MOVQ DX, BX
   555  
   556  	MOVQ ·p2+0x10(SB), AX
   557  	MULQ t0
   558  	ADDQ BX, acc5
   559  	ADCQ $0, DX
   560  	ADDQ AX, acc5
   561  	ADCQ $0, DX
   562  	MOVQ DX, BX
   563  
   564  	MOVQ ·p2+0x18(SB), AX
   565  	MULQ t0
   566  	ADDQ BX, acc0
   567  	ADCQ $0, DX
   568  	ADDQ AX, acc0
   569  	ADCQ DX, acc1
   570  	ADCQ $0, acc2
   571  	// Result limbs are acc4,acc5,acc0,acc1 (low to high) with acc2 on top: run the
        	// gfpCarry macro over them, then load the destination pointer and store.
   572  	gfpCarry(acc4,acc5,acc0,acc1, x_ptr,acc3,t0,BX,acc2)
   573  	MOVQ res+0(FP), res_ptr
   574  	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
   575  
   576  	RET
   577  
   578  /* ---------------------------------------*/
   579  // func gfpFromMont(res, in *gfP)
        // Applies four reduction steps to the 4-limb value at in and stores the 4-limb
        // result at res. Each step multiplies the lowest live limb by ·np (low 64 bits
        // kept as Y) and adds Y * ·p2, after which that limb is no longer needed.
        // The argument size (two pointers, 16 bytes) is declared so that go vet can
        // check the FP offsets against the Go prototype, as for the other routines here.
        // acc0-acc5, t0, x_ptr and res_ptr are register aliases not defined in this file.
   580  TEXT ·gfpFromMont(SB),NOSPLIT,$0-16
   581  	MOVQ res+0(FP), res_ptr
   582  	MOVQ in+8(FP), x_ptr
   583  
   584  	MOVQ (8*0)(x_ptr), acc0
   585  	MOVQ (8*1)(x_ptr), acc1
   586  	MOVQ (8*2)(x_ptr), acc2
   587  	MOVQ (8*3)(x_ptr), acc3
   588  	XORQ acc4, acc4  // acc4 starts as the empty fifth limb
   589  
   590  	// Only reduce, no multiplications are needed
   591  	// First reduction step
   592  	MOVQ acc0, AX
   593  	MULQ ·np+0x00(SB)
   594  	MOVQ AX, t0     // Y
   595  
   596  	// Calculate next T = T+Y*P
   597  	MOVQ ·p2+0x00(SB), AX
   598  	MULQ t0
   599  	ADDQ AX, acc0   // acc0 is free now
   600  	ADCQ $0, DX
   601  	MOVQ DX, BX     // carry
   602  	XORQ acc0, acc0  // cleared so it can collect the top carry of the third step
   603  
   604  	MOVQ ·p2+0x08(SB), AX
   605  	MULQ t0
   606  	ADDQ BX, acc1
   607  	ADCQ $0, DX
   608  	ADDQ AX, acc1
   609  	ADCQ $0, DX
   610  	MOVQ DX, BX     // carry
   611  
   612  	MOVQ ·p2+0x10(SB), AX
   613  	MULQ t0
   614  	ADDQ BX, acc2
   615  	ADCQ $0, DX
   616  	ADDQ AX, acc2
   617  	ADCQ $0, DX
   618  	MOVQ DX, BX     // carry
   619  
   620  	MOVQ ·p2+0x18(SB), AX
   621  	MULQ t0
   622  	ADDQ BX, acc3
   623  	ADCQ $0, DX
   624  	ADDQ AX, acc3
   625  	ADCQ DX, acc4
   626  	XORQ acc5, acc5  // acc5 is the next limb to receive a top carry
   627  
   628  	// Second reduction step
   629  	MOVQ acc1, AX
   630  	MULQ ·np+0x00(SB)
   631  	MOVQ AX, t0     // Y
   632  
   633  	// Calculate next T = T+Y*P
   634  	MOVQ ·p2+0x00(SB), AX
   635  	MULQ t0
   636  	ADDQ AX, acc1   // acc1 is free now
   637  	ADCQ $0, DX
   638  	MOVQ DX, BX     // carry
   639  	XORQ acc1, acc1  // cleared so it can collect the top carry of the last step
   640  
   641  	MOVQ ·p2+0x08(SB), AX
   642  	MULQ t0
   643  	ADDQ BX, acc2
   644  	ADCQ $0, DX
   645  	ADDQ AX, acc2
   646  	ADCQ $0, DX
   647  	MOVQ DX, BX     // carry
   648  
   649  	MOVQ ·p2+0x10(SB), AX
   650  	MULQ t0
   651  	ADDQ BX, acc3
   652  	ADCQ $0, DX
   653  	ADDQ AX, acc3
   654  	ADCQ $0, DX
   655  	MOVQ DX, BX     // carry
   656  
   657  	MOVQ ·p2+0x18(SB), AX
   658  	MULQ t0
   659  	ADDQ BX, acc4
   660  	ADCQ $0, DX
   661  	ADDQ AX, acc4
   662  	ADCQ DX, acc5
   663  
   664  	// Third reduction step
   665  	MOVQ acc2, AX
   666  	MULQ ·np+0x00(SB)
   667  	MOVQ AX, t0     // Y
   668  
   669  	// Calculate next T = T+Y*P
   670  	MOVQ ·p2+0x00(SB), AX
   671  	MULQ t0
   672  	ADDQ AX, acc2   // acc2 is free now
   673  	ADCQ $0, DX
   674  	MOVQ DX, BX     // carry
   675  
   676  	MOVQ ·p2+0x08(SB), AX
   677  	MULQ t0
   678  	ADDQ BX, acc3
   679  	ADCQ $0, DX
   680  	ADDQ AX, acc3
   681  	ADCQ $0, DX
   682  	MOVQ DX, BX     // carry
   683  
   684  	MOVQ ·p2+0x10(SB), AX
   685  	MULQ t0
   686  	ADDQ BX, acc4
   687  	ADCQ $0, DX
   688  	ADDQ AX, acc4
   689  	ADCQ $0, DX
   690  	MOVQ DX, BX     // carry
   691  
   692  	MOVQ ·p2+0x18(SB), AX
   693  	MULQ t0
   694  	ADDQ BX, acc5
   695  	ADCQ $0, DX
   696  	ADDQ AX, acc5
   697  	ADCQ DX, acc0
   698  
   699  	// Last reduction step
   700  	MOVQ acc3, AX
   701  	MULQ ·np+0x00(SB)
   702  	MOVQ AX, t0     // Y
   703  
   704  	// Calculate next T = T+Y*P
   705  	MOVQ ·p2+0x00(SB), AX
   706  	MULQ t0
   707  	ADDQ AX, acc3   // acc3 is free now
   708  	ADCQ $0, DX
   709  	MOVQ DX, BX     // carry
   710  	XORQ acc3, acc3
   711  
   712  	MOVQ ·p2+0x08(SB), AX
   713  	MULQ t0
   714  	ADDQ BX, acc4
   715  	ADCQ $0, DX
   716  	ADDQ AX, acc4
   717  	ADCQ $0, DX
   718  	MOVQ DX, BX     // carry
   719  
   720  	MOVQ ·p2+0x10(SB), AX
   721  	MULQ t0
   722  	ADDQ BX, acc5
   723  	ADCQ $0, DX
   724  	ADDQ AX, acc5
   725  	ADCQ $0, DX
   726  	MOVQ DX, BX     // carry
   727  
   728  	MOVQ ·p2+0x18(SB), AX
   729  	MULQ t0
   730  	ADDQ BX, acc0
   731  	ADCQ $0, DX
   732  	ADDQ AX, acc0
   733  	ADCQ DX, acc1
   734  
        	// Result limbs are acc4,acc5,acc0,acc1 (low to high); the macro below is defined
        	// outside this file and receives x_ptr, acc3, t0 and BX as extra operands.
   735  	gfpCarryWithoutCarry(acc4, acc5, acc0, acc1, x_ptr, acc3, t0, BX)
   736  	storeBlock(acc4,acc5,acc0,acc1, 0(res_ptr))
   737  	RET
   738  
   739  /* ---------------------------------------*/
   740  // func gfpUnmarshal(res *gfP, in *[32]byte)
        // Tail-jumps to gfpMarshal: that routine reverses the byte order of a 32-byte
        // value, and reversing twice restores the original, so one body serves both
        // directions. The arguments stay in place in the caller's frame.
        // The argument size (two pointers, 16 bytes) is declared like the arithmetic routines above.
   741  TEXT ·gfpUnmarshal(SB),NOSPLIT,$0-16
   742  	JMP ·gfpMarshal(SB)
   743  
   744  /* ---------------------------------------*/
   745  // func gfpMarshal(res *[32]byte, in *gfP)
        // Copies the four 64-bit limbs at in to res in reverse limb order with every limb
        // byte-swapped, i.e. the 32 bytes are written out in fully reversed order.
        // All limbs are loaded before the first store, so res may alias in.
        // The argument size (two pointers, 16 bytes) is declared like the arithmetic routines above.
   746  TEXT ·gfpMarshal(SB),NOSPLIT,$0-16
   747  	MOVQ res+0(FP), res_ptr
   748  	MOVQ in+8(FP), x_ptr
   749  
   750  	MOVQ (8*0)(x_ptr), acc0
   751  	MOVQ (8*1)(x_ptr), acc1
   752  	MOVQ (8*2)(x_ptr), acc2
   753  	MOVQ (8*3)(x_ptr), acc3
   754  
   755  	BSWAPQ acc0  // reverse the bytes inside each limb
   756  	BSWAPQ acc1
   757  	BSWAPQ acc2
   758  	BSWAPQ acc3
   759  
   760  	MOVQ acc3, (8*0)(res_ptr)  // highest limb goes first
   761  	MOVQ acc2, (8*1)(res_ptr)
   762  	MOVQ acc1, (8*2)(res_ptr)
   763  	MOVQ acc0, (8*3)(res_ptr)
   764  
   765  	RET