github.com/emmansun/gmsm@v0.29.1/sm9/bn256/gfp_plugin_amd64.s (about)

     1  //go:build plugin && !purego
     2  
     3  #include "textflag.h"
     4  #include "gfp_macros_amd64.s"
     5  
     6  // func gfpSqr(res, in *gfP, n int)
        //
        // gfpSqr squares the four 64-bit limbs at in, folds the 512-bit product back
        // to four limbs and stores them at res. It then repeats with res as the input
        // until the counter loaded from n reaches zero, i.e. n successive squarings.
        //
        // One iteration:
        //   1. cross products y[i]*y[j] (i<j), doubled, plus the squares y[i]*y[i]
        //      -> T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
        //   2. four reduction steps; each takes Y = low64(lowest limb * ·np[0]) and
        //      adds Y * ·p2 so that the lowest limb can be dropped (the shape of a
        //      word-wise Montgomery reduction; ·np and ·p2 are defined elsewhere,
        //      presumably np = -p^-1 mod 2^64 -- confirm)
        //   3. add the upper four limbs of T, then gfpCarry and storeBlock
        //
        // Two code paths are provided and selected once, before the loop, by the
        // byte ·supportADX: a MULXQ/ADCXQ/ADOXQ version and a MULQ/ADDQ/ADCQ version.
        //
        // Register notes: BP holds the remaining iteration count. x_ptr and y_ptr are
        // reused as data limbs inside an iteration; x_ptr is reloaded from res_ptr
        // before the next one. The aliases res_ptr, x_ptr, y_ptr, acc0..acc5 and t0
        // are not defined in this file (presumably in gfp_macros_amd64.s).
        //
        // The count is tested only after the loop body has run, so n is expected to
        // be >= 1; with n == 0 the DECQ wraps instead of terminating immediately.
        // NOTE(review): BP is overwritten in a $0-frame function without being
        // saved -- confirm this is acceptable for frame-pointer based tooling.
     7  TEXT ·gfpSqr(SB),NOSPLIT,$0
     8  	MOVQ res+0(FP), res_ptr  // res_ptr = res
     9  	MOVQ in+8(FP), x_ptr     // x_ptr = in
    10  	MOVQ n+16(FP), BP        // BP = iterations left
    11  
    12  	CMPB ·supportADX(SB), $0
    13  	JE   gfpSqrLoop          // flag byte is zero: use the MULQ-based loop
    14  
    15  gfpSqrLoopAdx:
    16  	XORQ acc0, acc0          // acc0 = 0; XOR also clears CF and OF
    17  	XORQ y_ptr, y_ptr        // y_ptr = 0, a zero addend until it is reused as a limb
    18  	// y[1:] * y[0]
    19  	MOVQ (8*0)(x_ptr), DX
    20  	MULXQ (8*1)(x_ptr), acc1, acc2 
    21  
    22  	MULXQ (8*2)(x_ptr), AX, acc3
    23  	ADOXQ AX, acc2
    24  
    25  	MULXQ (8*3)(x_ptr), AX, acc4
    26  	ADOXQ AX, acc3
    27  	ADOXQ y_ptr, acc4        // y_ptr is 0: only the pending OF carry is added
    28  
    29  	// y[2:] * y[1]
    30  	MOVQ (8*1)(x_ptr), DX
    31  	MULXQ (8*2)(x_ptr), AX, BX
    32  	ADOXQ AX, acc3
    33  
    34  	MULXQ (8*3)(x_ptr), AX, acc5
    35  	ADCXQ BX, AX             // CF chain: high limb of the previous product joins this low limb
    36  	ADOXQ AX, acc4
    37  	ADCXQ y_ptr, acc5        // y_ptr is 0: only the pending CF carry is added
    38  
    39  	// y[3] * y[2]
    40  	MOVQ (8*2)(x_ptr), DX
    41  	MULXQ (8*3)(x_ptr), AX, y_ptr  // y_ptr now receives the high limb of this product
    42  	ADOXQ AX, acc5
    43  	ADOXQ acc0, y_ptr        // acc0 is still 0: only the pending OF carry is added
    44  
    45  	XORQ BX, BX              // BX = 0; clears CF and OF before the doubling
    46  	// *2: double the cross products on the OF chain, carry-out collected in BX
    47  	ADOXQ acc1, acc1
    48  	ADOXQ acc2, acc2
    49  	ADOXQ acc3, acc3
    50  	ADOXQ acc4, acc4
    51  	ADOXQ acc5, acc5
    52  	ADOXQ y_ptr, y_ptr
    53  	ADOXQ acc0, BX           // BX = 0 + 0 + OF
    54  	
    55  	// Missing products: the squares y[i]*y[i], added on the CF chain
    56  	MOVQ (8*0)(x_ptr), DX
    57  	MULXQ DX, acc0, t0       // acc0 = lowest limb of the square
    58  	ADCXQ t0, acc1
    59  
    60  	MOVQ (8*1)(x_ptr), DX
    61  	MULXQ DX, AX, t0
    62  	ADCXQ AX, acc2
    63  	ADCXQ t0, acc3
    64  
    65  	MOVQ (8*2)(x_ptr), DX
    66  	MULXQ DX, AX, t0 
    67  	ADCXQ AX, acc4
    68  	ADCXQ t0, acc5
    69  
    70  	MOVQ (8*3)(x_ptr), DX
    71  	MULXQ DX, AX, x_ptr      // x_ptr is overwritten with the top limb; it is reloaded before the next iteration
    72  	ADCXQ AX, y_ptr
    73  	ADCXQ BX, x_ptr          // adds the doubling carry-out and the pending CF
    74  
    75  	// First reduction step; T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
    76  	MOVQ acc0, DX
    77  	MULXQ ·np+0x00(SB), DX, AX  // DX = Y = low 64 bits of acc0 * np[0]; the high half in AX is overwritten next
    78  
    79  	MULXQ ·p2+0x00(SB), AX, t0
    80  	ADOXQ AX, acc0               // acc0 += low(Y * p2[0]); only the OF carry is used, acc0 is overwritten below
    81  
    82  	MULXQ ·p2+0x08(SB), AX, BX
    83  	ADCXQ t0, AX
    84  	ADOXQ AX, acc1
    85  
    86  	MULXQ ·p2+0x10(SB), AX, t0
    87  	ADCXQ BX, AX
    88  	ADOXQ AX, acc2
    89  	
    90  	MULXQ ·p2+0x18(SB), AX, acc0  // acc0 is reused for the high limb of Y * p2[3]
    91  	ADCXQ t0, AX
    92  	ADOXQ AX, acc3
    93  	MOVQ $0, t0              // MOVQ does not modify flags, so both pending carries survive
    94  	ADCXQ t0, acc0           // acc0 += CF
    95  	ADOXQ t0, acc0           // acc0 += OF
    96  
    97  	// Second reduction step (same pattern, working limbs rotated: acc1, acc2, acc3, acc0)
    98  	MOVQ acc1, DX
    99  	MULXQ ·np+0x00(SB), DX, AX
   100  
   101  	MULXQ ·p2+0x00(SB), AX, t0
   102  	ADOXQ AX, acc1
   103  
   104  	MULXQ ·p2+0x08(SB), AX, BX
   105  	ADCXQ t0, AX
   106  	ADOXQ AX, acc2
   107  
   108  	MULXQ ·p2+0x10(SB), AX, t0
   109  	ADCXQ BX, AX
   110  	ADOXQ AX, acc3
   111  
   112  	MULXQ ·p2+0x18(SB), AX, acc1
   113  	ADCXQ t0, AX
   114  	ADOXQ AX, acc0
   115  	MOVQ $0, t0
   116  	ADCXQ t0, acc1
   117  	ADOXQ t0, acc1
   118  
   119  	// Third reduction step (working limbs: acc2, acc3, acc0, acc1)
   120  	MOVQ acc2, DX
   121  	MULXQ ·np+0x00(SB), DX, AX
   122  
   123  	MULXQ ·p2+0x00(SB), AX, t0
   124  	ADOXQ AX, acc2
   125  
   126  	MULXQ ·p2+0x08(SB), AX, BX
   127  	ADCXQ t0, AX
   128  	ADOXQ AX, acc3
   129  
   130  	MULXQ ·p2+0x10(SB), AX, t0
   131  	ADCXQ BX, AX
   132  	ADOXQ AX, acc0
   133  
   134  	MULXQ ·p2+0x18(SB), AX, acc2
   135  	ADCXQ t0, AX
   136  	ADOXQ AX, acc1
   137  	MOVQ $0, t0
   138  	ADCXQ t0, acc2
   139  	ADOXQ t0, acc2
   140  
   141  	// Last reduction step (working limbs: acc3, acc0, acc1, acc2)
   142  	MOVQ acc3, DX
   143  	MULXQ ·np+0x00(SB), DX, AX
   144  
   145  	MULXQ ·p2+0x00(SB), AX, t0
   146  	ADOXQ AX, acc3
   147  
   148  	MULXQ ·p2+0x08(SB), AX, BX
   149  	ADCXQ t0, AX
   150  	ADOXQ AX, acc0
   151  
   152  	MULXQ ·p2+0x10(SB), AX, t0
   153  	ADCXQ BX, AX
   154  	ADOXQ AX, acc1
   155  
   156  	MULXQ ·p2+0x18(SB), AX, acc3
   157  	ADCXQ t0, AX
   158  	ADOXQ AX, acc2
   159  	MOVQ $0, t0              // t0 stays 0 and collects the final carry below
   160  	ADCXQ t0, acc3
   161  	ADOXQ t0, acc3
   162  
   163  	XORQ BX, BX              // BX = 0; CF cleared for the addition below
   164  	// Add bits [511:256] of the sqr result
   165  	ADCXQ acc4, acc0
   166  	ADCXQ acc5, acc1
   167  	ADCXQ y_ptr, acc2
   168  	ADCXQ x_ptr, acc3
   169  	ADCXQ BX, t0             // t0 = carry out of the four-limb addition
   170  	
        // gfpCarry and storeBlock are macros that are not defined in this file.
        // Presumably gfpCarry brings [acc0..acc3] plus the carry in t0 into range
        // (using acc4, acc5, y_ptr, BX as scratch) and storeBlock writes the four
        // limbs to res -- confirm against gfp_macros_amd64.s.
   171  	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0)
   172  	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
   173  
   174  	MOVQ res_ptr, x_ptr      // the next iteration reads its input from res
   175  	DECQ BP
   176  	JNE gfpSqrLoopAdx        // repeat until the count reaches zero
   177  
   178  	RET
   179  
   180  gfpSqrLoop:
   181  	// y[1:] * y[0]
   182  	MOVQ (8*0)(x_ptr), t0
   183  
   184  	MOVQ (8*1)(x_ptr), AX
   185  	MULQ t0
   186  	MOVQ AX, acc1
   187  	MOVQ DX, acc2
   188  
   189  	MOVQ (8*2)(x_ptr), AX
   190  	MULQ t0
   191  	ADDQ AX, acc2
   192  	ADCQ $0, DX
   193  	MOVQ DX, acc3
   194  
   195  	MOVQ (8*3)(x_ptr), AX
   196  	MULQ t0
   197  	ADDQ AX, acc3
   198  	ADCQ $0, DX
   199  	MOVQ DX, acc4
   200  	// y[2:] * y[1]
   201  	MOVQ (8*1)(x_ptr), t0
   202  
   203  	MOVQ (8*2)(x_ptr), AX
   204  	MULQ t0
   205  	ADDQ AX, acc3
   206  	ADCQ $0, DX
   207  	MOVQ DX, BX              // BX = carry limb into acc4
   208  
   209  	MOVQ (8*3)(x_ptr), AX
   210  	MULQ t0
   211  	ADDQ BX, acc4
   212  	ADCQ $0, DX
   213  	ADDQ AX, acc4
   214  	ADCQ $0, DX
   215  	MOVQ DX, acc5
   216  	// y[3] * y[2]
   217  	MOVQ (8*2)(x_ptr), t0
   218  
   219  	MOVQ (8*3)(x_ptr), AX
   220  	MULQ t0
   221  	ADDQ AX, acc5
   222  	ADCQ $0, DX
   223  	MOVQ DX, y_ptr           // y_ptr is used as the seventh limb
   224  	XORQ BX, BX              // BX = 0, collects the carry-out of the doubling
   225  	// *2: double the cross products
   226  	ADDQ acc1, acc1
   227  	ADCQ acc2, acc2
   228  	ADCQ acc3, acc3
   229  	ADCQ acc4, acc4
   230  	ADCQ acc5, acc5
   231  	ADCQ y_ptr, y_ptr
   232  	ADCQ $0, BX
   233  	// Missing products: the squares y[i]*y[i]
   234  	MOVQ (8*0)(x_ptr), AX
   235  	MULQ AX
   236  	MOVQ AX, acc0
   237  	MOVQ DX, t0              // t0 carries each square's high limb to the next pair of adds
   238  
   239  	MOVQ (8*1)(x_ptr), AX
   240  	MULQ AX
   241  	ADDQ t0, acc1
   242  	ADCQ AX, acc2
   243  	ADCQ $0, DX
   244  	MOVQ DX, t0
   245  
   246  	MOVQ (8*2)(x_ptr), AX
   247  	MULQ AX
   248  	ADDQ t0, acc3
   249  	ADCQ AX, acc4
   250  	ADCQ $0, DX
   251  	MOVQ DX, t0
   252  
   253  	MOVQ (8*3)(x_ptr), AX
   254  	MULQ AX
   255  	ADDQ t0, acc5
   256  	ADCQ AX, y_ptr
   257  	ADCQ DX, BX
   258  	MOVQ BX, x_ptr           // top limb moves into x_ptr; the input pointer is reloaded before the next iteration
   259  	// T = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
   260  	// First reduction step
   261  	MOVQ acc0, AX
   262  	MULQ ·np+0x00(SB)
   263  	MOVQ AX, t0     // Y = low 64 bits of acc0 * np[0]
   264  
   265  	// Calculate next T = T+Y*P
   266  	MOVQ ·p2+0x00(SB), AX
   267  	MULQ t0
   268  	ADDQ AX, acc0   // acc0 is free now
   269  	ADCQ $0, DX
   270  	MOVQ DX, BX     // carry
   271  	XORQ acc0, acc0 // acc0 restarts at 0 and takes the top limb of this step
   272  
   273  	MOVQ ·p2+0x08(SB), AX
   274  	MULQ t0
   275  	ADDQ BX, acc1
   276  	ADCQ $0, DX
   277  	ADDQ AX, acc1
   278  	ADCQ $0, DX
   279  	MOVQ DX, BX     // carry
   280  
   281  	MOVQ ·p2+0x10(SB), AX
   282  	MULQ t0
   283  	ADDQ BX, acc2
   284  	ADCQ $0, DX
   285  	ADDQ AX, acc2
   286  	ADCQ $0, DX
   287  	MOVQ DX, BX     // carry
   288  
   289  	MOVQ ·p2+0x18(SB), AX
   290  	MULQ t0
   291  	ADDQ BX, acc3
   292  	ADCQ $0, DX
   293  	ADDQ AX, acc3
   294  	ADCQ DX, acc0   // acc0 = high limb plus the last carry
   295  
   296  	// Second reduction step
   297  	MOVQ acc1, AX
   298  	MULQ ·np+0x00(SB)
   299  	MOVQ AX, t0     // Y
   300  
   301  	// Calculate next T = T+Y*P
   302  	MOVQ ·p2+0x00(SB), AX
   303  	MULQ t0
   304  	ADDQ AX, acc1   // acc1 is free now
   305  	ADCQ $0, DX
   306  	MOVQ DX, BX     // carry
   307  	XORQ acc1, acc1
   308  
   309  	MOVQ ·p2+0x08(SB), AX
   310  	MULQ t0
   311  	ADDQ BX, acc2
   312  	ADCQ $0, DX
   313  	ADDQ AX, acc2
   314  	ADCQ $0, DX
   315  	MOVQ DX, BX     // carry
   316  
   317  	MOVQ ·p2+0x10(SB), AX
   318  	MULQ t0
   319  	ADDQ BX, acc3
   320  	ADCQ $0, DX
   321  	ADDQ AX, acc3
   322  	ADCQ $0, DX
   323  	MOVQ DX, BX     // carry
   324  
   325  	MOVQ ·p2+0x18(SB), AX
   326  	MULQ t0
   327  	ADDQ BX, acc0
   328  	ADCQ $0, DX
   329  	ADDQ AX, acc0
   330  	ADCQ DX, acc1
   331  
   332  	// Third reduction step
   333  	MOVQ acc2, AX
   334  	MULQ ·np+0x00(SB)
   335  	MOVQ AX, t0     // Y
   336  
   337  	// Calculate next T = T+Y*P
   338  	MOVQ ·p2+0x00(SB), AX
   339  	MULQ t0
   340  	ADDQ AX, acc2   // acc2 is free now
   341  	ADCQ $0, DX
   342  	MOVQ DX, BX     // carry
   343  	XORQ acc2, acc2
   344  
   345  	MOVQ ·p2+0x08(SB), AX
   346  	MULQ t0
   347  	ADDQ BX, acc3
   348  	ADCQ $0, DX
   349  	ADDQ AX, acc3
   350  	ADCQ $0, DX
   351  	MOVQ DX, BX     // carry
   352  
   353  	MOVQ ·p2+0x10(SB), AX
   354  	MULQ t0
   355  	ADDQ BX, acc0
   356  	ADCQ $0, DX
   357  	ADDQ AX, acc0
   358  	ADCQ $0, DX
   359  	MOVQ DX, BX     // carry
   360  
   361  	MOVQ ·p2+0x18(SB), AX
   362  	MULQ t0
   363  	ADDQ BX, acc1
   364  	ADCQ $0, DX
   365  	ADDQ AX, acc1
   366  	ADCQ DX, acc2
   367  
   368  	// Last reduction step
   369  	MOVQ acc3, AX
   370  	MULQ ·np+0x00(SB)
   371  	MOVQ AX, t0     // Y
   372  
   373  	// Calculate next T = T+Y*P
   374  	MOVQ ·p2+0x00(SB), AX
   375  	MULQ t0
   376  	ADDQ AX, acc3   // acc3 is free now
   377  	ADCQ $0, DX
   378  	MOVQ DX, BX     // carry
   379  	XORQ acc3, acc3
   380  
   381  	MOVQ ·p2+0x08(SB), AX
   382  	MULQ t0
   383  	ADDQ BX, acc0
   384  	ADCQ $0, DX
   385  	ADDQ AX, acc0
   386  	ADCQ $0, DX
   387  	MOVQ DX, BX     // carry
   388  
   389  	MOVQ ·p2+0x10(SB), AX
   390  	MULQ t0
   391  	ADDQ BX, acc1
   392  	ADCQ $0, DX
   393  	ADDQ AX, acc1
   394  	ADCQ $0, DX
   395  	MOVQ DX, BX     // carry
   396  
   397  	MOVQ ·p2+0x18(SB), AX
   398  	MULQ t0
   399  	ADDQ BX, acc2
   400  	ADCQ $0, DX
   401  	ADDQ AX, acc2
   402  	ADCQ DX, acc3
   403  
   404  	XORQ t0, t0              // t0 = 0, collects the carry of the addition below
   405  	// Add bits [511:256] of the sqr result
   406  	ADDQ acc4, acc0
   407  	ADCQ acc5, acc1
   408  	ADCQ y_ptr, acc2
   409  	ADCQ x_ptr, acc3
   410  	ADCQ $0, t0
   411  	
        // gfpCarry and storeBlock are macros that are not defined in this file.
        // Presumably gfpCarry brings [acc0..acc3] plus the carry in t0 into range
        // and storeBlock writes the four limbs to res -- confirm against
        // gfp_macros_amd64.s.
   412  	gfpCarry(acc0,acc1,acc2,acc3, acc4,acc5,y_ptr,BX,t0)
   413  	storeBlock(acc0,acc1,acc2,acc3, 0(res_ptr))
   414  	MOVQ res_ptr, x_ptr      // the next iteration reads its input from res
   415  	DECQ BP
   416  	JNE gfpSqrLoop           // repeat until the count reaches zero
   417  
   418  	RET