github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_amd64.s (about)

     1  //go:build !(purego || plugin)
     2  
     3  #include "textflag.h"
     4  #include "gfp_macros_amd64.s"
     5  #define t1 R15
     6  
     7  // func gfpSqr(res, in *gfP, n int)
        //
        // gfpSqr performs n successive squarings. Each pass reads four 64-bit limbs
        // through x_ptr, forms the 8-limb square, folds it back to four limbs with
        // four reduction steps against ·np and ·p2, and stores the result at res.
        // After the store x_ptr is pointed at res, so the next pass squares the
        // value that was just written.
        //
        // The reduction has the shape of word-by-word Montgomery reduction
        // (Y = low64(T[0]*np), T += Y*p2, drop the low limb).
        // NOTE(review): this assumes ·np holds -p^-1 mod 2^64 and ·p2 the four limbs
        // of the modulus; neither is defined in this file, confirm against their
        // declarations.
        //
        // n must be positive: the loop body runs before BX is decremented and
        // tested, so n == 0 wraps the counter instead of returning immediately.
        //
        // Two loops are provided: when ·supportADX is non-zero the MULX/ADCX/ADOX
        // loop is used, otherwise the MULQ/ADDQ/ADCQ loop.
        //
        // Register roles: BX = remaining iterations; acc0..acc5, y_ptr and x_ptr
        // hold the 8-limb product (y_ptr and x_ptr are reused as limbs 6 and 7 once
        // the last input limb has been loaded); t0, t1, AX and DX are scratch.
        // res_ptr, x_ptr, y_ptr, acc0..acc5 and t0 are not defined in this file;
        // presumably they come from the included gfp_macros_amd64.s.
     8  TEXT ·gfpSqr(SB),NOSPLIT,$0
     9  	MOVQ res+0(FP), res_ptr
    10  	MOVQ in+8(FP), x_ptr
    11  	MOVQ n+16(FP), BX            // BX = iteration counter
    12  
    13  	CMPB ·supportADX(SB), $0
    14  	JE   gfpSqrLoop              // flag is zero: take the MULQ loop
    15  
        // ADX loop. MULX leaves the flags untouched, ADCX carries through CF only
        // and ADOX through OF only, so two carry chains can be interleaved.
    16  gfpSqrLoopAdx:
    17  	XORQ acc0, acc0              // acc0 = 0; XOR also clears CF and OF
    18  	XORQ y_ptr, y_ptr            // y_ptr = 0, used as a zero addend below
    19  	// x[1..3] * x[0]  (DX is the implicit MULX multiplicand)
    20  	MOVQ (8*0)(x_ptr), DX
    21  	MULXQ (8*1)(x_ptr), acc1, acc2 // acc2:acc1 = x[0]*x[1]
    22  
    23  	MULXQ (8*2)(x_ptr), AX, acc3 // acc3:AX = x[0]*x[2]
    24  	ADOXQ AX, acc2
    25  
    26  	MULXQ (8*3)(x_ptr), AX, acc4 // acc4:AX = x[0]*x[3]
    27  	ADOXQ AX, acc3
    28  	ADOXQ y_ptr, acc4            // y_ptr is 0: fold the pending OF carry into acc4
    29  
    30  	// x[2..3] * x[1]
    31  	MOVQ (8*1)(x_ptr), DX
    32  	MULXQ (8*2)(x_ptr), AX, t1   // t1:AX = x[1]*x[2]
    33  	ADOXQ AX, acc3
    34  
    35  	MULXQ (8*3)(x_ptr), AX, acc5 // acc5:AX = x[1]*x[3]
    36  	ADCXQ t1, AX                 // CF chain: add the high half of x[1]*x[2]
    37  	ADOXQ AX, acc4
    38  	ADCXQ y_ptr, acc5            // y_ptr is 0: fold CF into acc5
    39  
    40  	// x[3] * x[2]
    41  	MOVQ (8*2)(x_ptr), DX
    42  	MULXQ (8*3)(x_ptr), AX, y_ptr // y_ptr:AX = x[2]*x[3]; y_ptr becomes limb 6
    43  	ADOXQ AX, acc5
    44  	ADOXQ acc0, y_ptr            // acc0 is 0: fold OF into y_ptr
    45  
    46  	XORQ t1, t1                  // t1 = 0; CF and OF cleared for the next two chains
    47  	// Double the cross products on the OF chain (reg + reg + OF)
    48  	ADOXQ acc1, acc1
    49  	ADOXQ acc2, acc2
    50  	ADOXQ acc3, acc3
    51  	ADOXQ acc4, acc4
    52  	ADOXQ acc5, acc5
    53  	ADOXQ y_ptr, y_ptr
    54  	ADOXQ acc0, t1               // acc0 is still 0: t1 = carry out of the doubling
    55  	
    56  	// Missing products: the diagonal terms x[i]*x[i], added on the CF chain
    57  	MOVQ (8*0)(x_ptr), DX
    58  	MULXQ DX, acc0, t0           // t0:acc0 = x[0]*x[0]
    59  	ADCXQ t0, acc1
    60  
    61  	MOVQ (8*1)(x_ptr), DX
    62  	MULXQ DX, AX, t0             // t0:AX = x[1]*x[1]
    63  	ADCXQ AX, acc2
    64  	ADCXQ t0, acc3
    65  
    66  	MOVQ (8*2)(x_ptr), DX
    67  	MULXQ DX, AX, t0 // t0:AX = x[2]*x[2]
    68  	ADCXQ AX, acc4
    69  	ADCXQ t0, acc5
    70  
    71  	MOVQ (8*3)(x_ptr), DX        // last read of the input in this pass
    72  	MULXQ DX, AX, x_ptr          // x_ptr:AX = x[3]*x[3]; x_ptr becomes limb 7
    73  	ADCXQ AX, y_ptr
    74  	ADCXQ t1, x_ptr
    75  
    76  	// First reduction step; T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
    77  	MOVQ acc0, DX
    78  	MULXQ ·np+0x00(SB), DX, AX   // DX = Y = low 64 bits of acc0*np; AX (high half) is unused
    79  
    80  	MULXQ ·p2+0x00(SB), AX, t0   // t0:AX = Y*p2[0]
    81  	ADOXQ AX, acc0               // acc0 += lo(Y*p2[0]); only the carry (OF) is used afterwards
    82  
    83  	MULXQ ·p2+0x08(SB), AX, t1
    84  	ADCXQ t0, AX
    85  	ADOXQ AX, acc1
    86  
    87  	MULXQ ·p2+0x10(SB), AX, t0
    88  	ADCXQ t1, AX
    89  	ADOXQ AX, acc2
    90  	
    91  	MULXQ ·p2+0x18(SB), AX, acc0 // acc0 is reused: it receives the high half of Y*p2[3]
    92  	ADCXQ t0, AX
    93  	ADOXQ AX, acc3
    94  	MOVQ $0, t0                  // MOVQ rather than XORQ so that CF and OF stay intact
    95  	ADCXQ t0, acc0               // fold CF into the new top limb
    96  	ADOXQ t0, acc0               // fold OF into the new top limb
    97  
    98  	// Second reduction step (same pattern; acc1 is consumed and reused as the top limb)
    99  	MOVQ acc1, DX
   100  	MULXQ ·np+0x00(SB), DX, AX
   101  
   102  	MULXQ ·p2+0x00(SB), AX, t0
   103  	ADOXQ AX, acc1
   104  
   105  	MULXQ ·p2+0x08(SB), AX, t1
   106  	ADCXQ t0, AX
   107  	ADOXQ AX, acc2
   108  
   109  	MULXQ ·p2+0x10(SB), AX, t0
   110  	ADCXQ t1, AX
   111  	ADOXQ AX, acc3
   112  
   113  	MULXQ ·p2+0x18(SB), AX, acc1
   114  	ADCXQ t0, AX
   115  	ADOXQ AX, acc0
   116  	MOVQ $0, t0
   117  	ADCXQ t0, acc1
   118  	ADOXQ t0, acc1
   119  
   120  	// Third reduction step (acc2 is consumed and reused as the top limb)
   121  	MOVQ acc2, DX
   122  	MULXQ ·np+0x00(SB), DX, AX
   123  
   124  	MULXQ ·p2+0x00(SB), AX, t0
   125  	ADOXQ AX, acc2
   126  
   127  	MULXQ ·p2+0x08(SB), AX, t1
   128  	ADCXQ t0, AX
   129  	ADOXQ AX, acc3
   130  
   131  	MULXQ ·p2+0x10(SB), AX, t0
   132  	ADCXQ t1, AX
   133  	ADOXQ AX, acc0
   134  
   135  	MULXQ ·p2+0x18(SB), AX, acc2
   136  	ADCXQ t0, AX
   137  	ADOXQ AX, acc1
   138  	MOVQ $0, t0
   139  	ADCXQ t0, acc2
   140  	ADOXQ t0, acc2
   141  
   142  	// Last reduction step (acc3 is consumed and reused as the top limb)
   143  	MOVQ acc3, DX
   144  	MULXQ ·np+0x00(SB), DX, AX
   145  
   146  	MULXQ ·p2+0x00(SB), AX, t0
   147  	ADOXQ AX, acc3
   148  
   149  	MULXQ ·p2+0x08(SB), AX, t1
   150  	ADCXQ t0, AX
   151  	ADOXQ AX, acc0
   152  
   153  	MULXQ ·p2+0x10(SB), AX, t0
   154  	ADCXQ t1, AX
   155  	ADOXQ AX, acc1
   156  
   157  	MULXQ ·p2+0x18(SB), AX, acc3
   158  	ADCXQ t0, AX
   159  	ADOXQ AX, acc2
   160  	MOVQ $0, t0                  // t0 stays 0 until the final ADCXQ below
   161  	ADCXQ t0, acc3
   162  	ADOXQ t0, acc3
   163  
   164  	XORQ t1, t1                  // t1 = 0; CF cleared for the addition below
   165  	// Add bits [511:256] of the sqr result
   166  	ADCXQ acc4, acc0
   167  	ADCXQ acc5, acc1
   168  	ADCXQ y_ptr, acc2
   169  	ADCXQ x_ptr, acc3
   170  	ADCXQ t1, t0                 // t0 = carry out of the 256-bit addition
   171  	
        	// gfpCarry and storeBlock are macros defined outside this file.
        	// NOTE(review): presumably gfpCarry performs the final conditional
        	// subtraction of the modulus (acc4, acc5, y_ptr, t1 as scratch, t0 as the
        	// carry) and storeBlock writes acc0..acc3 to res; confirm in the macro file.
   172  	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
   173  	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
   174  
   175  	MOVQ res_ptr, x_ptr          // next pass reads from res; also restores the clobbered x_ptr
   176  	DECQ BX
   177  	JNE gfpSqrLoopAdx            // repeat until the counter reaches zero
   178  
   179  	RET
   180  
        // Fallback loop for CPUs without ADX: MULQ leaves DX:AX = AX * operand, and
        // carries are propagated with ADDQ/ADCQ.
   181  gfpSqrLoop:
   182  
   183  	// x[1..3] * x[0]
   184  	MOVQ (8*0)(x_ptr), t0        // t0 = x[0], the multiplier for this group
   185  
   186  	MOVQ (8*1)(x_ptr), AX
   187  	MULQ t0
   188  	MOVQ AX, acc1
   189  	MOVQ DX, acc2
   190  
   191  	MOVQ (8*2)(x_ptr), AX
   192  	MULQ t0
   193  	ADDQ AX, acc2
   194  	ADCQ $0, DX
   195  	MOVQ DX, acc3
   196  
   197  	MOVQ (8*3)(x_ptr), AX
   198  	MULQ t0
   199  	ADDQ AX, acc3
   200  	ADCQ $0, DX
   201  	MOVQ DX, acc4
   202  	// x[2..3] * x[1]
   203  	MOVQ (8*1)(x_ptr), t0
   204  
   205  	MOVQ (8*2)(x_ptr), AX
   206  	MULQ t0
   207  	ADDQ AX, acc3
   208  	ADCQ $0, DX
   209  	MOVQ DX, t1                  // t1 = high half of x[1]*x[2], added into acc4 below
   210  
   211  	MOVQ (8*3)(x_ptr), AX
   212  	MULQ t0
   213  	ADDQ t1, acc4
   214  	ADCQ $0, DX
   215  	ADDQ AX, acc4
   216  	ADCQ $0, DX
   217  	MOVQ DX, acc5
   218  	// x[3] * x[2]
   219  	MOVQ (8*2)(x_ptr), t0
   220  
   221  	MOVQ (8*3)(x_ptr), AX
   222  	MULQ t0
   223  	ADDQ AX, acc5
   224  	ADCQ $0, DX
   225  	MOVQ DX, y_ptr               // y_ptr becomes limb 6
   226  	XORQ t1, t1
   227  	// Double the cross products; t1 receives the bit shifted out of the top
   228  	ADDQ acc1, acc1
   229  	ADCQ acc2, acc2
   230  	ADCQ acc3, acc3
   231  	ADCQ acc4, acc4
   232  	ADCQ acc5, acc5
   233  	ADCQ y_ptr, y_ptr
   234  	ADCQ $0, t1
   235  	// Missing products: the diagonal terms x[i]*x[i]
   236  	MOVQ (8*0)(x_ptr), AX
   237  	MULQ AX
   238  	MOVQ AX, acc0
   239  	MOVQ DX, t0
   240  
   241  	MOVQ (8*1)(x_ptr), AX
   242  	MULQ AX
   243  	ADDQ t0, acc1
   244  	ADCQ AX, acc2
   245  	ADCQ $0, DX
   246  	MOVQ DX, t0
   247  
   248  	MOVQ (8*2)(x_ptr), AX
   249  	MULQ AX
   250  	ADDQ t0, acc3
   251  	ADCQ AX, acc4
   252  	ADCQ $0, DX
   253  	MOVQ DX, t0
   254  
   255  	MOVQ (8*3)(x_ptr), AX        // last read of the input in this pass
   256  	MULQ AX
   257  	ADDQ t0, acc5
   258  	ADCQ AX, y_ptr
   259  	ADCQ DX, t1
   260  	MOVQ t1, x_ptr               // x_ptr becomes limb 7
   261  	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
   262  	// First reduction step
   263  	MOVQ acc0, AX
   264  	MULQ ·np+0x00(SB)
   265  	MOVQ AX, t0     // Y = low 64 bits of acc0*np
   266  
   267  	// Calculate next T = T+Y*P
   268  	MOVQ ·p2+0x00(SB), AX
   269  	MULQ t0
   270  	ADDQ AX, acc0   // only the carry is kept; acc0 is free now
   271  	ADCQ $0, DX
   272  	MOVQ DX, t1     // carry
   273  	XORQ acc0, acc0 // acc0 = 0, it will collect the new top limb
   274  
   275  	MOVQ ·p2+0x08(SB), AX
   276  	MULQ t0
   277  	ADDQ t1, acc1
   278  	ADCQ $0, DX
   279  	ADDQ AX, acc1
   280  	ADCQ $0, DX
   281  	MOVQ DX, t1     // carry
   282  
   283  	MOVQ ·p2+0x10(SB), AX
   284  	MULQ t0
   285  	ADDQ t1, acc2
   286  	ADCQ $0, DX
   287  	ADDQ AX, acc2
   288  	ADCQ $0, DX
   289  	MOVQ DX, t1     // carry
   290  
   291  	MOVQ ·p2+0x18(SB), AX
   292  	MULQ t0
   293  	ADDQ t1, acc3
   294  	ADCQ $0, DX
   295  	ADDQ AX, acc3
   296  	ADCQ DX, acc0   // acc0 = DX + carry (new top limb)
   297  
   298  	// Second reduction step
   299  	MOVQ acc1, AX
   300  	MULQ ·np+0x00(SB)
   301  	MOVQ AX, t0     // Y
   302  
   303  	// Calculate next T = T+Y*P
   304  	MOVQ ·p2+0x00(SB), AX
   305  	MULQ t0
   306  	ADDQ AX, acc1   // acc1 is free now
   307  	ADCQ $0, DX
   308  	MOVQ DX, t1     // carry
   309  	XORQ acc1, acc1
   310  
   311  	MOVQ ·p2+0x08(SB), AX
   312  	MULQ t0
   313  	ADDQ t1, acc2
   314  	ADCQ $0, DX
   315  	ADDQ AX, acc2
   316  	ADCQ $0, DX
   317  	MOVQ DX, t1     // carry
   318  
   319  	MOVQ ·p2+0x10(SB), AX
   320  	MULQ t0
   321  	ADDQ t1, acc3
   322  	ADCQ $0, DX
   323  	ADDQ AX, acc3
   324  	ADCQ $0, DX
   325  	MOVQ DX, t1     // carry
   326  
   327  	MOVQ ·p2+0x18(SB), AX
   328  	MULQ t0
   329  	ADDQ t1, acc0
   330  	ADCQ $0, DX
   331  	ADDQ AX, acc0
   332  	ADCQ DX, acc1
   333  
   334  	// Third reduction step
   335  	MOVQ acc2, AX
   336  	MULQ ·np+0x00(SB)
   337  	MOVQ AX, t0     // Y
   338  
   339  	// Calculate next T = T+Y*P
   340  	MOVQ ·p2+0x00(SB), AX
   341  	MULQ t0
   342  	ADDQ AX, acc2   // acc2 is free now
   343  	ADCQ $0, DX
   344  	MOVQ DX, t1     // carry
   345  	XORQ acc2, acc2
   346  
   347  	MOVQ ·p2+0x08(SB), AX
   348  	MULQ t0
   349  	ADDQ t1, acc3
   350  	ADCQ $0, DX
   351  	ADDQ AX, acc3
   352  	ADCQ $0, DX
   353  	MOVQ DX, t1     // carry
   354  
   355  	MOVQ ·p2+0x10(SB), AX
   356  	MULQ t0
   357  	ADDQ t1, acc0
   358  	ADCQ $0, DX
   359  	ADDQ AX, acc0
   360  	ADCQ $0, DX
   361  	MOVQ DX, t1     // carry
   362  
   363  	MOVQ ·p2+0x18(SB), AX
   364  	MULQ t0
   365  	ADDQ t1, acc1
   366  	ADCQ $0, DX
   367  	ADDQ AX, acc1
   368  	ADCQ DX, acc2
   369  
   370  	// Last reduction step
   371  	MOVQ acc3, AX
   372  	MULQ ·np+0x00(SB)
   373  	MOVQ AX, t0     // Y
   374  
   375  	// Calculate next T = T+Y*P
   376  	MOVQ ·p2+0x00(SB), AX
   377  	MULQ t0
   378  	ADDQ AX, acc3   // acc3 is free now
   379  	ADCQ $0, DX
   380  	MOVQ DX, t1     // carry
   381  	XORQ acc3, acc3
   382  
   383  	MOVQ ·p2+0x08(SB), AX
   384  	MULQ t0
   385  	ADDQ t1, acc0
   386  	ADCQ $0, DX
   387  	ADDQ AX, acc0
   388  	ADCQ $0, DX
   389  	MOVQ DX, t1     // carry
   390  
   391  	MOVQ ·p2+0x10(SB), AX
   392  	MULQ t0
   393  	ADDQ t1, acc1
   394  	ADCQ $0, DX
   395  	ADDQ AX, acc1
   396  	ADCQ $0, DX
   397  	MOVQ DX, t1     // carry
   398  
   399  	MOVQ ·p2+0x18(SB), AX
   400  	MULQ t0
   401  	ADDQ t1, acc2
   402  	ADCQ $0, DX
   403  	ADDQ AX, acc2
   404  	ADCQ DX, acc3
   405  
   406  	XORQ t0, t0                  // t0 = 0, it will receive the final carry
   407  	// Add bits [511:256] of the sqr result
   408  	ADDQ acc4, acc0
   409  	ADCQ acc5, acc1
   410  	ADCQ y_ptr, acc2
   411  	ADCQ x_ptr, acc3
   412  	ADCQ $0, t0                  // t0 = carry out of the 256-bit addition
   413  	
        	// gfpCarry and storeBlock are macros defined outside this file.
        	// NOTE(review): presumably gfpCarry performs the final conditional
        	// subtraction of the modulus and storeBlock writes acc0..acc3 to res;
        	// confirm in the macro file.
   414  	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,t1,t0)
   415  	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
   416  	MOVQ res_ptr, x_ptr          // next pass reads from res; also restores the clobbered x_ptr
   417  	DECQ BX
   418  	JNE gfpSqrLoop               // repeat until the counter reaches zero
   419  
   420  	RET