github.com/emmansun/gmsm@v0.29.1/sm9/bn256/select_amd64.s

//go:build !purego

#include "textflag.h"

#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

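// supportAVX2 is a package-level Go variable consulted by every routine in
// this file. A plausible sketch of how it could be populated, assuming
// detection via golang.org/x/sys/cpu (the package's actual init code may
// differ):
//
//	import "golang.org/x/sys/cpu"
//
//	var supportAVX2 = cpu.X86.HasAVX2
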
// func gfpCopy(res, a *gfP)
TEXT ·gfpCopy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  copygfp_avx2

	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)

	RET

copygfp_avx2:
	VMOVDQU (x_ptr), Y0
	VMOVDQU Y0, (res_ptr)
	VZEROUPPER
	RET

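// For reference, the routine above is semantically just a fixed-size value
// copy. A pure-Go sketch, assuming a gfP is four 64-bit limbs (the two
// 16-byte loads above move exactly 32 bytes); gfpCopyRef is an illustrative
// name, not part of this package:
//
//	func gfpCopyRef(res, a *[4]uint64) {
//		*res = *a // the compiler emits a 32-byte block copy
//	}
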
// func gfp2Copy(res, a *gfP2)
TEXT ·gfp2Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  copygfp2_avx2

	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	RET

copygfp2_avx2:
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)

	VZEROUPPER
	RET

// func gfp4Copy(res, a *gfP4)
TEXT ·gfp4Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  copygfp4_avx2

	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	RET

copygfp4_avx2:
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	VZEROUPPER
	RET

// func gfp6Copy(res, a *gfP6)
TEXT ·gfp6Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  copygfp6_avx2

	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU (16*8)(x_ptr), X8
	MOVOU (16*9)(x_ptr), X9
	MOVOU (16*10)(x_ptr), X10
	MOVOU (16*11)(x_ptr), X11

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	MOVOU X8, (16*8)(res_ptr)
	MOVOU X9, (16*9)(res_ptr)
	MOVOU X10, (16*10)(res_ptr)
	MOVOU X11, (16*11)(res_ptr)

	RET

copygfp6_avx2:
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3
	VMOVDQU (32*4)(x_ptr), Y4
	VMOVDQU (32*5)(x_ptr), Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	VZEROUPPER
	RET

// func gfp12Copy(res, a *gfP12)
// A gfP12 is 384 bytes (24 16-byte blocks); the SSE path copies it in three
// 128-byte rounds, reusing X0-X7, while the AVX2 path uses twelve 32-byte moves.
TEXT ·gfp12Copy(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  copygfp12_avx2

	MOVOU (16*0)(x_ptr), X0
	MOVOU (16*1)(x_ptr), X1
	MOVOU (16*2)(x_ptr), X2
	MOVOU (16*3)(x_ptr), X3

	MOVOU (16*4)(x_ptr), X4
	MOVOU (16*5)(x_ptr), X5
	MOVOU (16*6)(x_ptr), X6
	MOVOU (16*7)(x_ptr), X7

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)

	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)
	MOVOU X6, (16*6)(res_ptr)
	MOVOU X7, (16*7)(res_ptr)

	MOVOU (16*8)(x_ptr), X0
	MOVOU (16*9)(x_ptr), X1
	MOVOU (16*10)(x_ptr), X2
	MOVOU (16*11)(x_ptr), X3

	MOVOU (16*12)(x_ptr), X4
	MOVOU (16*13)(x_ptr), X5
	MOVOU (16*14)(x_ptr), X6
	MOVOU (16*15)(x_ptr), X7

	MOVOU X0, (16*8)(res_ptr)
	MOVOU X1, (16*9)(res_ptr)
	MOVOU X2, (16*10)(res_ptr)
	MOVOU X3, (16*11)(res_ptr)

	MOVOU X4, (16*12)(res_ptr)
	MOVOU X5, (16*13)(res_ptr)
	MOVOU X6, (16*14)(res_ptr)
	MOVOU X7, (16*15)(res_ptr)

	MOVOU (16*16)(x_ptr), X0
	MOVOU (16*17)(x_ptr), X1
	MOVOU (16*18)(x_ptr), X2
	MOVOU (16*19)(x_ptr), X3

	MOVOU (16*20)(x_ptr), X4
	MOVOU (16*21)(x_ptr), X5
	MOVOU (16*22)(x_ptr), X6
	MOVOU (16*23)(x_ptr), X7

	MOVOU X0, (16*16)(res_ptr)
	MOVOU X1, (16*17)(res_ptr)
	MOVOU X2, (16*18)(res_ptr)
	MOVOU X3, (16*19)(res_ptr)

	MOVOU X4, (16*20)(res_ptr)
	MOVOU X5, (16*21)(res_ptr)
	MOVOU X6, (16*22)(res_ptr)
	MOVOU X7, (16*23)(res_ptr)

	RET

copygfp12_avx2:
	VMOVDQU (32*0)(x_ptr), Y0
	VMOVDQU (32*1)(x_ptr), Y1
	VMOVDQU (32*2)(x_ptr), Y2
	VMOVDQU (32*3)(x_ptr), Y3

	VMOVDQU (32*4)(x_ptr), Y4
	VMOVDQU (32*5)(x_ptr), Y5
	VMOVDQU (32*6)(x_ptr), Y6
	VMOVDQU (32*7)(x_ptr), Y7

	VMOVDQU (32*8)(x_ptr), Y8
	VMOVDQU (32*9)(x_ptr), Y9
	VMOVDQU (32*10)(x_ptr), Y10
	VMOVDQU (32*11)(x_ptr), Y11

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)
	VMOVDQU Y6, (32*6)(res_ptr)
	VMOVDQU Y7, (32*7)(res_ptr)

	VMOVDQU Y8, (32*8)(res_ptr)
	VMOVDQU Y9, (32*9)(res_ptr)
	VMOVDQU Y10, (32*10)(res_ptr)
	VMOVDQU Y11, (32*11)(res_ptr)

	VZEROUPPER
	RET

// func gfP12MovCond(res, a, b *gfP12, cond int)
// Constant-time conditional move: res = a if cond != 0, res = b otherwise.
TEXT ·gfP12MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  move_avx2

	// Broadcast the low 32 bits of cond and compare with zero: X12 becomes
	// an all-ones mask when cond == 0 and an all-zeros mask otherwise.
	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	// res = (^mask & a) | (mask & b), combined with PXOR since the two
	// masked halves never overlap.
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*8)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*9)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*10)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*11)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7
	MOVOU (16*8)(y_ptr), X8
	MOVOU (16*9)(y_ptr), X9
	MOVOU (16*10)(y_ptr), X10
	MOVOU (16*11)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)
	MOVOU X2, (16*8)(res_ptr)
	MOVOU X3, (16*9)(res_ptr)
	MOVOU X4, (16*10)(res_ptr)
	MOVOU X5, (16*11)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*12)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*13)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*14)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*15)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*16)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*17)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*12)(y_ptr), X6
	MOVOU (16*13)(y_ptr), X7
	MOVOU (16*14)(y_ptr), X8
	MOVOU (16*15)(y_ptr), X9
	MOVOU (16*16)(y_ptr), X10
	MOVOU (16*17)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*12)(res_ptr)
	MOVOU X1, (16*13)(res_ptr)
	MOVOU X2, (16*14)(res_ptr)
	MOVOU X3, (16*15)(res_ptr)
	MOVOU X4, (16*16)(res_ptr)
	MOVOU X5, (16*17)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*18)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*19)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*20)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*21)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*22)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*23)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*18)(y_ptr), X6
	MOVOU (16*19)(y_ptr), X7
	MOVOU (16*20)(y_ptr), X8
	MOVOU (16*21)(y_ptr), X9
	MOVOU (16*22)(y_ptr), X10
	MOVOU (16*23)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*18)(res_ptr)
	MOVOU X1, (16*19)(res_ptr)
	MOVOU X2, (16*20)(res_ptr)
	MOVOU X3, (16*21)(res_ptr)
	MOVOU X4, (16*22)(res_ptr)
	MOVOU X5, (16*23)(res_ptr)

	RET

move_avx2:
	// Same mask construction and selection as above, 32 bytes at a time.
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3
	VPANDN (32*4)(x_ptr), Y12, Y4
	VPANDN (32*5)(x_ptr), Y12, Y5

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9
	VPAND (32*4)(y_ptr), Y12, Y10
	VPAND (32*5)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	VPANDN (32*6)(x_ptr), Y12, Y0
	VPANDN (32*7)(x_ptr), Y12, Y1
	VPANDN (32*8)(x_ptr), Y12, Y2
	VPANDN (32*9)(x_ptr), Y12, Y3
	VPANDN (32*10)(x_ptr), Y12, Y4
	VPANDN (32*11)(x_ptr), Y12, Y5

	VPAND (32*6)(y_ptr), Y12, Y6
	VPAND (32*7)(y_ptr), Y12, Y7
	VPAND (32*8)(y_ptr), Y12, Y8
	VPAND (32*9)(y_ptr), Y12, Y9
	VPAND (32*10)(y_ptr), Y12, Y10
	VPAND (32*11)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*6)(res_ptr)
	VMOVDQU Y1, (32*7)(res_ptr)
	VMOVDQU Y2, (32*8)(res_ptr)
	VMOVDQU Y3, (32*9)(res_ptr)
	VMOVDQU Y4, (32*10)(res_ptr)
	VMOVDQU Y5, (32*11)(res_ptr)

	VZEROUPPER
	RET

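// For reference, a pure-Go sketch of the selection implemented above,
// assuming cond is 0 or 1 (as produced by crypto/subtle helpers) and
// viewing a gfP12 as 48 contiguous 64-bit words; gfP12MovCondRef is an
// illustrative name, not part of this package:
//
//	func gfP12MovCondRef(res, a, b *[48]uint64, cond int) {
//		mask := -uint64(cond) // all ones when cond == 1, zero when cond == 0
//		for i := range res {
//			res[i] = (a[i] & mask) | (b[i] &^ mask)
//		}
//	}
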
// func curvePointMovCond(res, a, b *curvePoint, cond int)
// Constant-time conditional move of a 128-byte curvePoint:
// res = a if cond != 0, res = b otherwise (same masking as gfP12MovCond).
TEXT ·curvePointMovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  move_avx2

	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7

	PAND X12, X6
	PAND X12, X7

	PXOR X6, X0
	PXOR X7, X1

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)

	RET

move_avx2:
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)

	VZEROUPPER
	RET

// func twistPointMovCond(res, a, b *twistPoint, cond int)
// Constant-time conditional move of a 256-byte twistPoint:
// res = a if cond != 0, res = b otherwise (same masking as gfP12MovCond).
TEXT ·twistPointMovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+8(FP), x_ptr
	MOVQ b+16(FP), y_ptr
	MOVQ cond+24(FP), X12

	CMPB ·supportAVX2+0(SB), $0x01
	JEQ  move_avx2

	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12

	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*6)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*7)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*8)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*9)(x_ptr), X9
	PANDN X9, X3

	MOVOU X12, X4
	MOVOU (16*10)(x_ptr), X10
	PANDN X10, X4

	MOVOU X12, X5
	MOVOU (16*11)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*6)(y_ptr), X6
	MOVOU (16*7)(y_ptr), X7
	MOVOU (16*8)(y_ptr), X8
	MOVOU (16*9)(y_ptr), X9
	MOVOU (16*10)(y_ptr), X10
	MOVOU (16*11)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*6)(res_ptr)
	MOVOU X1, (16*7)(res_ptr)
	MOVOU X2, (16*8)(res_ptr)
	MOVOU X3, (16*9)(res_ptr)
	MOVOU X4, (16*10)(res_ptr)
	MOVOU X5, (16*11)(res_ptr)

	MOVOU X12, X0
	MOVOU (16*12)(x_ptr), X6
	PANDN X6, X0

	MOVOU X12, X1
	MOVOU (16*13)(x_ptr), X7
	PANDN X7, X1

	MOVOU X12, X2
	MOVOU (16*14)(x_ptr), X8
	PANDN X8, X2

	MOVOU X12, X3
	MOVOU (16*15)(x_ptr), X9
	PANDN X9, X3

	MOVOU (16*12)(y_ptr), X6
	MOVOU (16*13)(y_ptr), X7
	MOVOU (16*14)(y_ptr), X8
	MOVOU (16*15)(y_ptr), X9

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3

	MOVOU X0, (16*12)(res_ptr)
	MOVOU X1, (16*13)(res_ptr)
	MOVOU X2, (16*14)(res_ptr)
	MOVOU X3, (16*15)(res_ptr)

	RET

move_avx2:
	VPXOR Y13, Y13, Y13
	VPBROADCASTD X12, Y12
	VPCMPEQD Y13, Y12, Y12

	VPANDN (32*0)(x_ptr), Y12, Y0
	VPANDN (32*1)(x_ptr), Y12, Y1
	VPANDN (32*2)(x_ptr), Y12, Y2
	VPANDN (32*3)(x_ptr), Y12, Y3
	VPANDN (32*4)(x_ptr), Y12, Y4
	VPANDN (32*5)(x_ptr), Y12, Y5

	VPAND (32*0)(y_ptr), Y12, Y6
	VPAND (32*1)(y_ptr), Y12, Y7
	VPAND (32*2)(y_ptr), Y12, Y8
	VPAND (32*3)(y_ptr), Y12, Y9
	VPAND (32*4)(y_ptr), Y12, Y10
	VPAND (32*5)(y_ptr), Y12, Y11

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1
	VPXOR Y8, Y2, Y2
	VPXOR Y9, Y3, Y3
	VPXOR Y10, Y4, Y4
	VPXOR Y11, Y5, Y5

	VMOVDQU Y0, (32*0)(res_ptr)
	VMOVDQU Y1, (32*1)(res_ptr)
	VMOVDQU Y2, (32*2)(res_ptr)
	VMOVDQU Y3, (32*3)(res_ptr)
	VMOVDQU Y4, (32*4)(res_ptr)
	VMOVDQU Y5, (32*5)(res_ptr)

	VPANDN (32*6)(x_ptr), Y12, Y0
	VPANDN (32*7)(x_ptr), Y12, Y1

	VPAND (32*6)(y_ptr), Y12, Y6
	VPAND (32*7)(y_ptr), Y12, Y7

	VPXOR Y6, Y0, Y0
	VPXOR Y7, Y1, Y1

	VMOVDQU Y0, (32*6)(res_ptr)
	VMOVDQU Y1, (32*7)(res_ptr)

	VZEROUPPER
	RET
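
// A typical use of these conditional moves is a constant-time table lookup:
// scan the whole table and conditionally move the matching entry into res.
// An illustrative sketch (selectTwistPoint is a hypothetical helper, not
// this package's API):
//
//	import "crypto/subtle"
//
//	func selectTwistPoint(res *twistPoint, table []twistPoint, idx int) {
//		for i := range table {
//			// cond is 1 exactly when i == idx, computed without branching.
//			cond := subtle.ConstantTimeEq(int32(i), int32(idx))
//			twistPointMovCond(res, &table[i], res, cond)
//		}
//	}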