github.com/consensys/gnark-crypto@v0.14.0/ecc/bn254/internal/fptower/e2_amd64.s (about)

     1  // Copyright 2020 ConsenSys Software Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  #include "textflag.h"
    16  #include "funcdata.h"
    17  
    18  // modulus q: four 64-bit words with the least-significant word at offset 0
        // (REDUCE subtracts them with a SUBQ/SBBQ borrow chain that starts at q<>+0).
        // The same four values also appear as immediates in subE2, negE2,
        // mulNonResE2, mulAdxE2 and squareAdxE2.
        // Declared read-only and pointer-free (RODATA+NOPTR), 32 bytes in total.
    19  DATA q<>+0(SB)/8, $0x3c208c16d87cfd47
    20  DATA q<>+8(SB)/8, $0x97816a916871ca8d
    21  DATA q<>+16(SB)/8, $0xb85045b68181585d
    22  DATA q<>+24(SB)/8, $0x30644e72e131a029
    23  GLOBL q<>(SB), (RODATA+NOPTR), $32
    24  
    25  // qInv0 q'[0]: a single 64-bit word, read-only and pointer-free.
        // MUL() multiplies it by the lowest accumulator word (IMULQ R10, DX) to get
        // the factor m used in "accumulator += m*q". The low word of that sum is
        // then discarded, which is consistent with qInv0 = -q^{-1} mod 2^64.
        // NOTE(review): the value itself has not been re-derived here - confirm
        // against the generator if it is ever edited by hand.
    26  DATA qInv0<>(SB)/8, $0x87d20782e4866389
    27  GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
    28  
    // REDUCE(ra0..ra3, rb0..rb3): one conditional subtraction of the modulus q.
    //   in : ra0..ra3 = 256-bit value, least-significant word in ra0
    //   out: ra0..ra3 = ra - q when ra >= q, otherwise ra unchanged
    // It copies ra into rb, computes ra -= q with a SUBQ/SBBQ borrow chain, and
    // if the chain ends with a borrow (carry set) it restores ra from rb with
    // CMOVQCS. The interleaved MOVQ copies do not touch the flags.
    // Clobbers rb0..rb3 and the flags. Do not add comments inside the body:
    // every line ends in a backslash continuation.
    29  #define REDUCE(ra0, ra1, ra2, ra3, rb0, rb1, rb2, rb3) \
    30  	MOVQ    ra0, rb0;        \
    31  	SUBQ    q<>(SB), ra0;    \
    32  	MOVQ    ra1, rb1;        \
    33  	SBBQ    q<>+8(SB), ra1;  \
    34  	MOVQ    ra2, rb2;        \
    35  	SBBQ    q<>+16(SB), ra2; \
    36  	MOVQ    ra3, rb3;        \
    37  	SBBQ    q<>+24(SB), ra3; \
    38  	CMOVQCS rb0, ra0;        \
    39  	CMOVQCS rb1, ra1;        \
    40  	CMOVQCS rb2, ra2;        \
    41  	CMOVQCS rb3, ra3;        \
    42  
    43  // this code is generated and identical to fp.Mul(...)
        //
        // MUL() takes no macro parameters; it works on fixed registers:
        //   first operand  : R14, R15, CX, BX   (read only, least-significant word in R14)
        //   second operand : SI, DI, R8, R9     (read only, one word consumed per round)
        //   result         : R10, R11, R12, R13
        //   scratch        : AX, DX, BP, the flags, and one stack word (PUSHQ/POPQ BP)
        //
        // The body is four rounds, one per word of the second operand. Each round:
        //   1. loads the operand word into DX and multiplies it by the four words of
        //      the first operand with MULXQ, accumulating into R10..R13 and BP using
        //      the two independent carry chains ADCXQ (CF) and ADOXQ (OF);
        //      XORQ AX, AX clears both flags before each chain starts;
        //   2. parks the top word (BP) on the stack, computes m = qInv0 * R10
        //      (low 64 bits only), and adds m*q to the accumulator; the low word of
        //      that sum is dropped and the accumulator moves down by one word;
        //   3. adds the parked top word back into R13.
        // MOVQ $0, AX is used where a zero is needed without disturbing the live
        // carry flags (XORQ would clear them).
        //
        // The result is not conditionally reduced here; every use in this file is
        // followed by REDUCE(R10,R11,R12,R13,...). MULXQ/ADCXQ/ADOXQ are only
        // executed by callers that first checked ·supportAdx.
        // Do not add comments inside the body: every line ends in a backslash
        // continuation.
    44  #define MUL() \
    45  	XORQ  AX, AX;              \
    46  	MOVQ  SI, DX;              \
    47  	MULXQ R14, R10, R11;       \
    48  	MULXQ R15, AX, R12;        \
    49  	ADOXQ AX, R11;             \
    50  	MULXQ CX, AX, R13;         \
    51  	ADOXQ AX, R12;             \
    52  	MULXQ BX, AX, BP;          \
    53  	ADOXQ AX, R13;             \
    54  	MOVQ  $0, AX;              \
    55  	ADOXQ AX, BP;              \
    56  	PUSHQ BP;                  \
    57  	MOVQ  qInv0<>(SB), DX;     \
    58  	IMULQ R10, DX;             \
    59  	XORQ  AX, AX;              \
    60  	MULXQ q<>+0(SB), AX, BP;   \
    61  	ADCXQ R10, AX;             \
    62  	MOVQ  BP, R10;             \
    63  	POPQ  BP;                  \
    64  	ADCXQ R11, R10;            \
    65  	MULXQ q<>+8(SB), AX, R11;  \
    66  	ADOXQ AX, R10;             \
    67  	ADCXQ R12, R11;            \
    68  	MULXQ q<>+16(SB), AX, R12; \
    69  	ADOXQ AX, R11;             \
    70  	ADCXQ R13, R12;            \
    71  	MULXQ q<>+24(SB), AX, R13; \
    72  	ADOXQ AX, R12;             \
    73  	MOVQ  $0, AX;              \
    74  	ADCXQ AX, R13;             \
    75  	ADOXQ BP, R13;             \
    76  	XORQ  AX, AX;              \
    77  	MOVQ  DI, DX;              \
    78  	MULXQ R14, AX, BP;         \
    79  	ADOXQ AX, R10;             \
    80  	ADCXQ BP, R11;             \
    81  	MULXQ R15, AX, BP;         \
    82  	ADOXQ AX, R11;             \
    83  	ADCXQ BP, R12;             \
    84  	MULXQ CX, AX, BP;          \
    85  	ADOXQ AX, R12;             \
    86  	ADCXQ BP, R13;             \
    87  	MULXQ BX, AX, BP;          \
    88  	ADOXQ AX, R13;             \
    89  	MOVQ  $0, AX;              \
    90  	ADCXQ AX, BP;              \
    91  	ADOXQ AX, BP;              \
    92  	PUSHQ BP;                  \
    93  	MOVQ  qInv0<>(SB), DX;     \
    94  	IMULQ R10, DX;             \
    95  	XORQ  AX, AX;              \
    96  	MULXQ q<>+0(SB), AX, BP;   \
    97  	ADCXQ R10, AX;             \
    98  	MOVQ  BP, R10;             \
    99  	POPQ  BP;                  \
   100  	ADCXQ R11, R10;            \
   101  	MULXQ q<>+8(SB), AX, R11;  \
   102  	ADOXQ AX, R10;             \
   103  	ADCXQ R12, R11;            \
   104  	MULXQ q<>+16(SB), AX, R12; \
   105  	ADOXQ AX, R11;             \
   106  	ADCXQ R13, R12;            \
   107  	MULXQ q<>+24(SB), AX, R13; \
   108  	ADOXQ AX, R12;             \
   109  	MOVQ  $0, AX;              \
   110  	ADCXQ AX, R13;             \
   111  	ADOXQ BP, R13;             \
   112  	XORQ  AX, AX;              \
   113  	MOVQ  R8, DX;              \
   114  	MULXQ R14, AX, BP;         \
   115  	ADOXQ AX, R10;             \
   116  	ADCXQ BP, R11;             \
   117  	MULXQ R15, AX, BP;         \
   118  	ADOXQ AX, R11;             \
   119  	ADCXQ BP, R12;             \
   120  	MULXQ CX, AX, BP;          \
   121  	ADOXQ AX, R12;             \
   122  	ADCXQ BP, R13;             \
   123  	MULXQ BX, AX, BP;          \
   124  	ADOXQ AX, R13;             \
   125  	MOVQ  $0, AX;              \
   126  	ADCXQ AX, BP;              \
   127  	ADOXQ AX, BP;              \
   128  	PUSHQ BP;                  \
   129  	MOVQ  qInv0<>(SB), DX;     \
   130  	IMULQ R10, DX;             \
   131  	XORQ  AX, AX;              \
   132  	MULXQ q<>+0(SB), AX, BP;   \
   133  	ADCXQ R10, AX;             \
   134  	MOVQ  BP, R10;             \
   135  	POPQ  BP;                  \
   136  	ADCXQ R11, R10;            \
   137  	MULXQ q<>+8(SB), AX, R11;  \
   138  	ADOXQ AX, R10;             \
   139  	ADCXQ R12, R11;            \
   140  	MULXQ q<>+16(SB), AX, R12; \
   141  	ADOXQ AX, R11;             \
   142  	ADCXQ R13, R12;            \
   143  	MULXQ q<>+24(SB), AX, R13; \
   144  	ADOXQ AX, R12;             \
   145  	MOVQ  $0, AX;              \
   146  	ADCXQ AX, R13;             \
   147  	ADOXQ BP, R13;             \
   148  	XORQ  AX, AX;              \
   149  	MOVQ  R9, DX;              \
   150  	MULXQ R14, AX, BP;         \
   151  	ADOXQ AX, R10;             \
   152  	ADCXQ BP, R11;             \
   153  	MULXQ R15, AX, BP;         \
   154  	ADOXQ AX, R11;             \
   155  	ADCXQ BP, R12;             \
   156  	MULXQ CX, AX, BP;          \
   157  	ADOXQ AX, R12;             \
   158  	ADCXQ BP, R13;             \
   159  	MULXQ BX, AX, BP;          \
   160  	ADOXQ AX, R13;             \
   161  	MOVQ  $0, AX;              \
   162  	ADCXQ AX, BP;              \
   163  	ADOXQ AX, BP;              \
   164  	PUSHQ BP;                  \
   165  	MOVQ  qInv0<>(SB), DX;     \
   166  	IMULQ R10, DX;             \
   167  	XORQ  AX, AX;              \
   168  	MULXQ q<>+0(SB), AX, BP;   \
   169  	ADCXQ R10, AX;             \
   170  	MOVQ  BP, R10;             \
   171  	POPQ  BP;                  \
   172  	ADCXQ R11, R10;            \
   173  	MULXQ q<>+8(SB), AX, R11;  \
   174  	ADOXQ AX, R10;             \
   175  	ADCXQ R12, R11;            \
   176  	MULXQ q<>+16(SB), AX, R12; \
   177  	ADOXQ AX, R11;             \
   178  	ADCXQ R13, R12;            \
   179  	MULXQ q<>+24(SB), AX, R13; \
   180  	ADOXQ AX, R12;             \
   181  	MOVQ  $0, AX;              \
   182  	ADCXQ AX, R13;             \
   183  	ADOXQ BP, R13;             \
   184  
   185  TEXT ·addE2(SB), NOSPLIT, $0-24
        	// addE2: res = x + y, computed independently on the two 4-word halves
        	// (byte offsets 0..24 and 32..56) of the operands.
        	// Arguments (all pointers): res+0(FP), x+8(FP), y+16(FP). No stack frame.
        	// Each half is a 4-word ADDQ/ADCQ carry chain followed by REDUCE, i.e. one
        	// conditional subtraction of q.
        	// Registers: AX = x, DX = y, CX = res, BX/SI/DI/R8 = working words,
        	// R9..R15 = REDUCE temporaries.
        	// NOTE(review): the carry out of the top word is not examined, so the
        	// inputs are presumably expected to be < q - confirm against callers.

        	// first half: words at offsets 0..24
   186  	MOVQ x+8(FP), AX
   187  	MOVQ 0(AX), BX
   188  	MOVQ 8(AX), SI
   189  	MOVQ 16(AX), DI
   190  	MOVQ 24(AX), R8
   191  	MOVQ y+16(FP), DX
   192  	ADDQ 0(DX), BX
   193  	ADCQ 8(DX), SI
   194  	ADCQ 16(DX), DI
   195  	ADCQ 24(DX), R8
   196  
   197  	// reduce element(BX,SI,DI,R8) using temp registers (R9,R10,R11,R12)
   198  	REDUCE(BX,SI,DI,R8,R9,R10,R11,R12)
   199  
   200  	MOVQ res+0(FP), CX
   201  	MOVQ BX, 0(CX)
   202  	MOVQ SI, 8(CX)
   203  	MOVQ DI, 16(CX)
   204  	MOVQ R8, 24(CX)
        	// second half: words at offsets 32..56; AX and DX still hold x and y
   205  	MOVQ 32(AX), BX
   206  	MOVQ 40(AX), SI
   207  	MOVQ 48(AX), DI
   208  	MOVQ 56(AX), R8
   209  	ADDQ 32(DX), BX
   210  	ADCQ 40(DX), SI
   211  	ADCQ 48(DX), DI
   212  	ADCQ 56(DX), R8
   213  
   214  	// reduce element(BX,SI,DI,R8) using temp registers (R13,R14,R15,R9)
   215  	REDUCE(BX,SI,DI,R8,R13,R14,R15,R9)
   216  
   217  	MOVQ BX, 32(CX)
   218  	MOVQ SI, 40(CX)
   219  	MOVQ DI, 48(CX)
   220  	MOVQ R8, 56(CX)
   221  	RET
   222  
   223  TEXT ·doubleE2(SB), NOSPLIT, $0-16
        	// doubleE2: res = 2*x, computed independently on the two 4-word halves
        	// (byte offsets 0..24 and 32..56) of x.
        	// Arguments (pointers): res+0(FP), x+8(FP). No stack frame.
        	// Each half is added to itself with an ADDQ/ADCQ carry chain and then
        	// passed through REDUCE (one conditional subtraction of q).
        	// Registers: DX = res, AX = x, CX/BX/SI/DI = working words,
        	// R8..R15 = REDUCE temporaries.
   224  	MOVQ res+0(FP), DX
   225  	MOVQ x+8(FP), AX
        	// first half: words at offsets 0..24
   226  	MOVQ 0(AX), CX
   227  	MOVQ 8(AX), BX
   228  	MOVQ 16(AX), SI
   229  	MOVQ 24(AX), DI
   230  	ADDQ CX, CX
   231  	ADCQ BX, BX
   232  	ADCQ SI, SI
   233  	ADCQ DI, DI
   234  
   235  	// reduce element(CX,BX,SI,DI) using temp registers (R8,R9,R10,R11)
   236  	REDUCE(CX,BX,SI,DI,R8,R9,R10,R11)
   237  
   238  	MOVQ CX, 0(DX)
   239  	MOVQ BX, 8(DX)
   240  	MOVQ SI, 16(DX)
   241  	MOVQ DI, 24(DX)
        	// second half: words at offsets 32..56
   242  	MOVQ 32(AX), CX
   243  	MOVQ 40(AX), BX
   244  	MOVQ 48(AX), SI
   245  	MOVQ 56(AX), DI
   246  	ADDQ CX, CX
   247  	ADCQ BX, BX
   248  	ADCQ SI, SI
   249  	ADCQ DI, DI
   250  
   251  	// reduce element(CX,BX,SI,DI) using temp registers (R12,R13,R14,R15)
   252  	REDUCE(CX,BX,SI,DI,R12,R13,R14,R15)
   253  
   254  	MOVQ CX, 32(DX)
   255  	MOVQ BX, 40(DX)
   256  	MOVQ SI, 48(DX)
   257  	MOVQ DI, 56(DX)
   258  	RET
   259  
   260  TEXT ·subE2(SB), NOSPLIT, $0-24
        	// subE2: res = x - y, computed independently on the two 4-word halves
        	// (byte offsets 0..24 and 32..56) of the operands.
        	// Arguments (all pointers): res+0(FP), x+8(FP), y+16(FP). No stack frame.
        	// Each half: subtract with a SUBQ/SBBQ borrow chain, then add back either
        	// q (when the chain borrowed) or 0 (when it did not). The choice is made
        	// branch-free: the words of q are loaded as immediates and CMOVQCC replaces
        	// them with DI (= 0) when the carry flag is clear.
        	// MOVQ does not modify the flags, so the borrow produced by the last SBBQ
        	// is still live when the CMOVQCC instructions run.
        	// Registers: DI = constant 0, SI = x or y pointer (reloaded as needed),
        	// AX/DX/CX/BX = working words, R8..R11 and R13..R15 = correction words,
        	// R12 and finally SI = res pointer.
   261  	XORQ    DI, DI
        	// first half: words at offsets 0..24
   262  	MOVQ    x+8(FP), SI
   263  	MOVQ    0(SI), AX
   264  	MOVQ    8(SI), DX
   265  	MOVQ    16(SI), CX
   266  	MOVQ    24(SI), BX
   267  	MOVQ    y+16(FP), SI
   268  	SUBQ    0(SI), AX
   269  	SBBQ    8(SI), DX
   270  	SBBQ    16(SI), CX
   271  	SBBQ    24(SI), BX
        	// SI = x again, for the second-half loads further down
   272  	MOVQ    x+8(FP), SI
   273  	MOVQ    $0x3c208c16d87cfd47, R8
   274  	MOVQ    $0x97816a916871ca8d, R9
   275  	MOVQ    $0xb85045b68181585d, R10
   276  	MOVQ    $0x30644e72e131a029, R11
        	// no borrow -> correction becomes 0
   277  	CMOVQCC DI, R8
   278  	CMOVQCC DI, R9
   279  	CMOVQCC DI, R10
   280  	CMOVQCC DI, R11
   281  	ADDQ    R8, AX
   282  	ADCQ    R9, DX
   283  	ADCQ    R10, CX
   284  	ADCQ    R11, BX
   285  	MOVQ    res+0(FP), R12
   286  	MOVQ    AX, 0(R12)
   287  	MOVQ    DX, 8(R12)
   288  	MOVQ    CX, 16(R12)
   289  	MOVQ    BX, 24(R12)
        	// second half: words at offsets 32..56
   290  	MOVQ    32(SI), AX
   291  	MOVQ    40(SI), DX
   292  	MOVQ    48(SI), CX
   293  	MOVQ    56(SI), BX
   294  	MOVQ    y+16(FP), SI
   295  	SUBQ    32(SI), AX
   296  	SBBQ    40(SI), DX
   297  	SBBQ    48(SI), CX
   298  	SBBQ    56(SI), BX
   299  	MOVQ    $0x3c208c16d87cfd47, R13
   300  	MOVQ    $0x97816a916871ca8d, R14
   301  	MOVQ    $0xb85045b68181585d, R15
   302  	MOVQ    $0x30644e72e131a029, R8
        	// no borrow -> correction becomes 0
   303  	CMOVQCC DI, R13
   304  	CMOVQCC DI, R14
   305  	CMOVQCC DI, R15
   306  	CMOVQCC DI, R8
   307  	ADDQ    R13, AX
   308  	ADCQ    R14, DX
   309  	ADCQ    R15, CX
   310  	ADCQ    R8, BX
   311  	MOVQ    res+0(FP), SI
   312  	MOVQ    AX, 32(SI)
   313  	MOVQ    DX, 40(SI)
   314  	MOVQ    CX, 48(SI)
   315  	MOVQ    BX, 56(SI)
   316  	RET
   317  
   318  TEXT ·negE2(SB), NOSPLIT, $0-16
        	// negE2: res = -x, computed independently on the two 4-word halves
        	// (byte offsets 0..24 and 32..56) of x.
        	// Arguments (pointers): res+0(FP), x+8(FP). No stack frame.
        	// For each half: if all four words are zero the result half is zero,
        	// otherwise it is q - half, computed word by word with a SUBQ/SBBQ chain
        	// whose minuend words are the immediates of q loaded into CX.
        	// Unlike the other routines in this file, this one branches on its input
        	// (zero / non-zero half).
        	// Registers: DX = res, AX = x pointer then OR-accumulator,
        	// BX/SI/DI/R8 = the four input words, CX = current output word.
   319  	MOVQ  res+0(FP), DX
   320  	MOVQ  x+8(FP), AX
   321  	MOVQ  0(AX), BX
   322  	MOVQ  8(AX), SI
   323  	MOVQ  16(AX), DI
   324  	MOVQ  24(AX), R8
        	// AX = OR of the four words; zero iff the whole first half is zero
   325  	MOVQ  BX, AX
   326  	ORQ   SI, AX
   327  	ORQ   DI, AX
   328  	ORQ   R8, AX
   329  	TESTQ AX, AX
   330  	JNE   l1
        	// first half is zero: AX is 0 here, store it as the result
   331  	MOVQ  AX, 0(DX)
   332  	MOVQ  AX, 8(DX)
   333  	MOVQ  AX, 16(DX)
   334  	MOVQ  AX, 24(DX)
   335  	JMP   l3
   336  
   337  l1:
        	// first half is non-zero: res[0..24] = q - x[0..24].
        	// The MOVQ instructions between SUBQ/SBBQ leave the borrow flag intact.
   338  	MOVQ $0x3c208c16d87cfd47, CX
   339  	SUBQ BX, CX
   340  	MOVQ CX, 0(DX)
   341  	MOVQ $0x97816a916871ca8d, CX
   342  	SBBQ SI, CX
   343  	MOVQ CX, 8(DX)
   344  	MOVQ $0xb85045b68181585d, CX
   345  	SBBQ DI, CX
   346  	MOVQ CX, 16(DX)
   347  	MOVQ $0x30644e72e131a029, CX
   348  	SBBQ R8, CX
   349  	MOVQ CX, 24(DX)
   350  
   351  l3:
        	// second half (offsets 32..56): same procedure; x is reloaded because AX
        	// was reused as the OR-accumulator above
   352  	MOVQ  x+8(FP), AX
   353  	MOVQ  32(AX), BX
   354  	MOVQ  40(AX), SI
   355  	MOVQ  48(AX), DI
   356  	MOVQ  56(AX), R8
   357  	MOVQ  BX, AX
   358  	ORQ   SI, AX
   359  	ORQ   DI, AX
   360  	ORQ   R8, AX
   361  	TESTQ AX, AX
   362  	JNE   l2
        	// second half is zero: store zeros and return
   363  	MOVQ  AX, 32(DX)
   364  	MOVQ  AX, 40(DX)
   365  	MOVQ  AX, 48(DX)
   366  	MOVQ  AX, 56(DX)
   367  	RET
   368  
   369  l2:
        	// second half is non-zero: res[32..56] = q - x[32..56]
   370  	MOVQ $0x3c208c16d87cfd47, CX
   371  	SUBQ BX, CX
   372  	MOVQ CX, 32(DX)
   373  	MOVQ $0x97816a916871ca8d, CX
   374  	SBBQ SI, CX
   375  	MOVQ CX, 40(DX)
   376  	MOVQ $0xb85045b68181585d, CX
   377  	SBBQ DI, CX
   378  	MOVQ CX, 48(DX)
   379  	MOVQ $0x30644e72e131a029, CX
   380  	SBBQ R8, CX
   381  	MOVQ CX, 56(DX)
   382  	RET
   383  
   384  TEXT ·mulNonResE2(SB), NOSPLIT, $0-16
        	// mulNonResE2: with A0 = words of x at offsets 0..24 and A1 = words of x at
        	// offsets 32..56, this writes
        	//     res[0..24]  = 9*A0 - A1
        	//     res[32..56] = 9*A1 + A0
        	// each reduced below q by the REDUCE / add-q-on-borrow steps shown.
        	// NOTE(review): this is the product x*(9+u) if u*u = -1, which is the rule
        	// implied by the formula in mulAdxE2's comments - confirm against the Go side.
        	// Arguments (pointers): res+0(FP), x+8(FP). No stack frame.
        	// 9*v is built as three doublings (each followed by REDUCE) plus one more
        	// addition of v from memory.
        	// Registers: R10 = x (later res), AX/DX/CX/BX = first result half,
        	// SI/DI/R8/R9 = second result half, R11..R15 = temporaries.
        	// All stores happen at the very end, after the last read through x.
   385  	MOVQ x+8(FP), R10
   386  	MOVQ 0(R10), AX
   387  	MOVQ 8(R10), DX
   388  	MOVQ 16(R10), CX
   389  	MOVQ 24(R10), BX
        	// (AX,DX,CX,BX) = 2*A0
   390  	ADDQ AX, AX
   391  	ADCQ DX, DX
   392  	ADCQ CX, CX
   393  	ADCQ BX, BX
   394  
   395  	// reduce element(AX,DX,CX,BX) using temp registers (R11,R12,R13,R14)
   396  	REDUCE(AX,DX,CX,BX,R11,R12,R13,R14)
   397  
        	// (AX,DX,CX,BX) = 4*A0
   398  	ADDQ AX, AX
   399  	ADCQ DX, DX
   400  	ADCQ CX, CX
   401  	ADCQ BX, BX
   402  
   403  	// reduce element(AX,DX,CX,BX) using temp registers (R15,R11,R12,R13)
   404  	REDUCE(AX,DX,CX,BX,R15,R11,R12,R13)
   405  
        	// (AX,DX,CX,BX) = 8*A0
   406  	ADDQ AX, AX
   407  	ADCQ DX, DX
   408  	ADCQ CX, CX
   409  	ADCQ BX, BX
   410  
   411  	// reduce element(AX,DX,CX,BX) using temp registers (R14,R15,R11,R12)
   412  	REDUCE(AX,DX,CX,BX,R14,R15,R11,R12)
   413  
        	// (AX,DX,CX,BX) = 8*A0 + A0 = 9*A0
   414  	ADDQ 0(R10), AX
   415  	ADCQ 8(R10), DX
   416  	ADCQ 16(R10), CX
   417  	ADCQ 24(R10), BX
   418  
   419  	// reduce element(AX,DX,CX,BX) using temp registers (R13,R14,R15,R11)
   420  	REDUCE(AX,DX,CX,BX,R13,R14,R15,R11)
   421  
        	// (SI,DI,R8,R9) = A1; (AX,DX,CX,BX) = 9*A0 - A1, adding q back if the
        	// subtraction borrowed (R12 = 0 is the "no correction" value).
        	// XORQ comes before SUBQ, so it does not disturb the borrow.
   422  	MOVQ    32(R10), SI
   423  	MOVQ    40(R10), DI
   424  	MOVQ    48(R10), R8
   425  	MOVQ    56(R10), R9
   426  	XORQ    R12, R12
   427  	SUBQ    SI, AX
   428  	SBBQ    DI, DX
   429  	SBBQ    R8, CX
   430  	SBBQ    R9, BX
   431  	MOVQ    $0x3c208c16d87cfd47, R13
   432  	MOVQ    $0x97816a916871ca8d, R14
   433  	MOVQ    $0xb85045b68181585d, R15
   434  	MOVQ    $0x30644e72e131a029, R11
   435  	CMOVQCC R12, R13
   436  	CMOVQCC R12, R14
   437  	CMOVQCC R12, R15
   438  	CMOVQCC R12, R11
   439  	ADDQ    R13, AX
   440  	ADCQ    R14, DX
   441  	ADCQ    R15, CX
   442  	ADCQ    R11, BX
        	// (SI,DI,R8,R9) = 2*A1
   443  	ADDQ    SI, SI
   444  	ADCQ    DI, DI
   445  	ADCQ    R8, R8
   446  	ADCQ    R9, R9
   447  
   448  	// reduce element(SI,DI,R8,R9) using temp registers (R13,R14,R15,R11)
   449  	REDUCE(SI,DI,R8,R9,R13,R14,R15,R11)
   450  
        	// (SI,DI,R8,R9) = 4*A1
   451  	ADDQ SI, SI
   452  	ADCQ DI, DI
   453  	ADCQ R8, R8
   454  	ADCQ R9, R9
   455  
   456  	// reduce element(SI,DI,R8,R9) using temp registers (R12,R13,R14,R15)
   457  	REDUCE(SI,DI,R8,R9,R12,R13,R14,R15)
   458  
        	// (SI,DI,R8,R9) = 8*A1
   459  	ADDQ SI, SI
   460  	ADCQ DI, DI
   461  	ADCQ R8, R8
   462  	ADCQ R9, R9
   463  
   464  	// reduce element(SI,DI,R8,R9) using temp registers (R11,R12,R13,R14)
   465  	REDUCE(SI,DI,R8,R9,R11,R12,R13,R14)
   466  
        	// (SI,DI,R8,R9) = 8*A1 + A1 = 9*A1
   467  	ADDQ 32(R10), SI
   468  	ADCQ 40(R10), DI
   469  	ADCQ 48(R10), R8
   470  	ADCQ 56(R10), R9
   471  
   472  	// reduce element(SI,DI,R8,R9) using temp registers (R15,R11,R12,R13)
   473  	REDUCE(SI,DI,R8,R9,R15,R11,R12,R13)
   474  
        	// (SI,DI,R8,R9) = 9*A1 + A0
   475  	ADDQ 0(R10), SI
   476  	ADCQ 8(R10), DI
   477  	ADCQ 16(R10), R8
   478  	ADCQ 24(R10), R9
   479  
   480  	// reduce element(SI,DI,R8,R9) using temp registers (R14,R15,R11,R12)
   481  	REDUCE(SI,DI,R8,R9,R14,R15,R11,R12)
   482  
        	// R10 is repurposed as the res pointer; write both halves
   483  	MOVQ res+0(FP), R10
   484  	MOVQ AX, 0(R10)
   485  	MOVQ DX, 8(R10)
   486  	MOVQ CX, 16(R10)
   487  	MOVQ BX, 24(R10)
   488  	MOVQ SI, 32(R10)
   489  	MOVQ DI, 40(R10)
   490  	MOVQ R8, 48(R10)
   491  	MOVQ R9, 56(R10)
   492  	RET
   493  
   494  TEXT ·mulAdxE2(SB), $64-24
        	// mulAdxE2: res = x * y using three MUL() calls, following the formula in
        	// the comment block below.
        	// Arguments (all pointers): res+0(FP), x+8(FP), y+16(FP).
        	// Frame: 64 bytes. On the fast path s0..s3 hold a and s4..s7 hold c;
        	// on the fallback path the bottom 24 bytes carry the outgoing arguments.
        	// If ·supportAdx is not 1 the work is delegated to ·mulGenericE2 (label l4).
        	// A0 / A1 are the 4-word halves at byte offsets 0 and 32 of each operand.
   495  	NO_LOCAL_POINTERS
   496  
   497  	// var a, b, c fp.Element
   498  	// a.Add(&x.A0, &x.A1)
   499  	// b.Add(&y.A0, &y.A1)
   500  	// a.Mul(&a, &b)
   501  	// b.Mul(&x.A0, &y.A0)
   502  	// c.Mul(&x.A1, &y.A1)
   503  	// z.A1.Sub(&a, &b).Sub(&z.A1, &c)
   504  	// z.A0.Sub(&b, &c)
   505  
   506  	CMPB ·supportAdx(SB), $1
   507  	JNE  l4
        	// step 1: c = x.A1 * y.A1
   508  	MOVQ x+8(FP), AX
   509  	MOVQ y+16(FP), DX
   510  	MOVQ 32(AX), R14
   511  	MOVQ 40(AX), R15
   512  	MOVQ 48(AX), CX
   513  	MOVQ 56(AX), BX
   514  	MOVQ 32(DX), SI
   515  	MOVQ 40(DX), DI
   516  	MOVQ 48(DX), R8
   517  	MOVQ 56(DX), R9
   518  
   519  	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
   520  	MUL()
   521  
   522  	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
   523  	REDUCE(R10,R11,R12,R13,SI,DI,R8,R9)
   524  
        	// spill c to s4..s7
   525  	MOVQ R10, s4-40(SP)
   526  	MOVQ R11, s5-48(SP)
   527  	MOVQ R12, s6-56(SP)
   528  	MOVQ R13, s7-64(SP)
        	// step 2: a = (x.A0 + x.A1) * (y.A0 + y.A1).
        	// R14,R15,CX,BX still hold x.A1 (MUL and REDUCE above did not write them),
        	// so adding x.A0 from memory yields x.A0 + x.A1. AX and DX are reloaded
        	// because MUL clobbers them. The two sums go into MUL() without a REDUCE.
   529  	MOVQ x+8(FP), AX
   530  	MOVQ y+16(FP), DX
   531  	ADDQ 0(AX), R14
   532  	ADCQ 8(AX), R15
   533  	ADCQ 16(AX), CX
   534  	ADCQ 24(AX), BX
   535  	MOVQ 0(DX), SI
   536  	MOVQ 8(DX), DI
   537  	MOVQ 16(DX), R8
   538  	MOVQ 24(DX), R9
   539  	ADDQ 32(DX), SI
   540  	ADCQ 40(DX), DI
   541  	ADCQ 48(DX), R8
   542  	ADCQ 56(DX), R9
   543  
   544  	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
   545  	MUL()
   546  
   547  	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
   548  	REDUCE(R10,R11,R12,R13,SI,DI,R8,R9)
   549  
        	// spill a to s0..s3
   550  	MOVQ R10, s0-8(SP)
   551  	MOVQ R11, s1-16(SP)
   552  	MOVQ R12, s2-24(SP)
   553  	MOVQ R13, s3-32(SP)
        	// step 3: b = x.A0 * y.A0, left in (R10,R11,R12,R13)
   554  	MOVQ x+8(FP), AX
   555  	MOVQ y+16(FP), DX
   556  	MOVQ 0(AX), R14
   557  	MOVQ 8(AX), R15
   558  	MOVQ 16(AX), CX
   559  	MOVQ 24(AX), BX
   560  	MOVQ 0(DX), SI
   561  	MOVQ 8(DX), DI
   562  	MOVQ 16(DX), R8
   563  	MOVQ 24(DX), R9
   564  
   565  	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
   566  	MUL()
   567  
   568  	// reduce element(R10,R11,R12,R13) using temp registers (SI,DI,R8,R9)
   569  	REDUCE(R10,R11,R12,R13,SI,DI,R8,R9)
   570  
        	// step 4: res.A1 = a - b - c. DX = 0 is the "no correction" value; after
        	// each subtraction q is added back only if the SBBQ chain borrowed.
   571  	XORQ    DX, DX
   572  	MOVQ    s0-8(SP), R14
   573  	MOVQ    s1-16(SP), R15
   574  	MOVQ    s2-24(SP), CX
   575  	MOVQ    s3-32(SP), BX
        	// (R14,R15,CX,BX) = a - b
   576  	SUBQ    R10, R14
   577  	SBBQ    R11, R15
   578  	SBBQ    R12, CX
   579  	SBBQ    R13, BX
   580  	MOVQ    $0x3c208c16d87cfd47, SI
   581  	MOVQ    $0x97816a916871ca8d, DI
   582  	MOVQ    $0xb85045b68181585d, R8
   583  	MOVQ    $0x30644e72e131a029, R9
   584  	CMOVQCC DX, SI
   585  	CMOVQCC DX, DI
   586  	CMOVQCC DX, R8
   587  	CMOVQCC DX, R9
   588  	ADDQ    SI, R14
   589  	ADCQ    DI, R15
   590  	ADCQ    R8, CX
   591  	ADCQ    R9, BX
        	// (R14,R15,CX,BX) = (a - b) - c
   592  	SUBQ    s4-40(SP), R14
   593  	SBBQ    s5-48(SP), R15
   594  	SBBQ    s6-56(SP), CX
   595  	SBBQ    s7-64(SP), BX
   596  	MOVQ    $0x3c208c16d87cfd47, SI
   597  	MOVQ    $0x97816a916871ca8d, DI
   598  	MOVQ    $0xb85045b68181585d, R8
   599  	MOVQ    $0x30644e72e131a029, R9
   600  	CMOVQCC DX, SI
   601  	CMOVQCC DX, DI
   602  	CMOVQCC DX, R8
   603  	CMOVQCC DX, R9
   604  	ADDQ    SI, R14
   605  	ADCQ    DI, R15
   606  	ADCQ    R8, CX
   607  	ADCQ    R9, BX
   608  	MOVQ    res+0(FP), AX
   609  	MOVQ    R14, 32(AX)
   610  	MOVQ    R15, 40(AX)
   611  	MOVQ    CX, 48(AX)
   612  	MOVQ    BX, 56(AX)
        	// step 5: res.A0 = b - c (b is still in R10..R13, c is reloaded from s4..s7)
   613  	MOVQ    s4-40(SP), SI
   614  	MOVQ    s5-48(SP), DI
   615  	MOVQ    s6-56(SP), R8
   616  	MOVQ    s7-64(SP), R9
   617  	SUBQ    SI, R10
   618  	SBBQ    DI, R11
   619  	SBBQ    R8, R12
   620  	SBBQ    R9, R13
   621  	MOVQ    $0x3c208c16d87cfd47, R14
   622  	MOVQ    $0x97816a916871ca8d, R15
   623  	MOVQ    $0xb85045b68181585d, CX
   624  	MOVQ    $0x30644e72e131a029, BX
   625  	CMOVQCC DX, R14
   626  	CMOVQCC DX, R15
   627  	CMOVQCC DX, CX
   628  	CMOVQCC DX, BX
   629  	ADDQ    R14, R10
   630  	ADCQ    R15, R11
   631  	ADCQ    CX, R12
   632  	ADCQ    BX, R13
   633  	MOVQ    R10, 0(AX)
   634  	MOVQ    R11, 8(AX)
   635  	MOVQ    R12, 16(AX)
   636  	MOVQ    R13, 24(AX)
   637  	RET
   638  
   639  l4:
        	// fallback when ·supportAdx != 1: copy res, x, y into the outgoing
        	// argument slots at 0/8/16(SP) and call the generic implementation
   640  	MOVQ res+0(FP), AX
   641  	MOVQ AX, (SP)
   642  	MOVQ x+8(FP), AX
   643  	MOVQ AX, 8(SP)
   644  	MOVQ y+16(FP), AX
   645  	MOVQ AX, 16(SP)
   646  	CALL ·mulGenericE2(SB)
   647  	RET
   648  
   649  TEXT ·squareAdxE2(SB), $16-16
        	// squareAdxE2: res = x * x using two MUL() calls, following the formula in
        	// the comment block below.
        	// Arguments (pointers): res+0(FP), x+8(FP).
        	// Frame: 16 bytes, used only on the fallback path for the two outgoing
        	// arguments of ·squareGenericE2 (label l5), taken when ·supportAdx is not 1.
        	// A0 / A1 are the 4-word halves at byte offsets 0 and 32 of x.
   650  	NO_LOCAL_POINTERS
   651  
   652  	// z.A0 = (x.A0 + x.A1) * (x.A0 - x.A1)
   653  	// z.A1 = 2 * x.A0 * x.A1
   654  
   655  	CMPB ·supportAdx(SB), $1
   656  	JNE  l5
   657  
   658  	// 2 * x.A0 * x.A1
   659  	MOVQ x+8(FP), AX
   660  
   661  	// x.A0[0] -> SI
   662  	// x.A0[1] -> DI
   663  	// x.A0[2] -> R8
   664  	// x.A0[3] -> R9
   665  	MOVQ 0(AX), SI
   666  	MOVQ 8(AX), DI
   667  	MOVQ 16(AX), R8
   668  	MOVQ 24(AX), R9
   669  
   670  	// 2 * x.A1[0] -> R14
   671  	// 2 * x.A1[1] -> R15
   672  	// 2 * x.A1[2] -> CX
   673  	// 2 * x.A1[3] -> BX
   674  	MOVQ 32(AX), R14
   675  	MOVQ 40(AX), R15
   676  	MOVQ 48(AX), CX
   677  	MOVQ 56(AX), BX
        	// the doubled value goes into MUL() without a REDUCE
   678  	ADDQ R14, R14
   679  	ADCQ R15, R15
   680  	ADCQ CX, CX
   681  	ADCQ BX, BX
   682  
   683  	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
   684  	MUL()
   685  
   686  	// reduce element(R10,R11,R12,R13) using temp registers (R14,R15,CX,BX)
   687  	REDUCE(R10,R11,R12,R13,R14,R15,CX,BX)
   688  
        	// AX was clobbered by MUL; reload x
   689  	MOVQ x+8(FP), AX
   690  
   691  	// x.A1[0] -> R14
   692  	// x.A1[1] -> R15
   693  	// x.A1[2] -> CX
   694  	// x.A1[3] -> BX
   695  	MOVQ 32(AX), R14
   696  	MOVQ 40(AX), R15
   697  	MOVQ 48(AX), CX
   698  	MOVQ 56(AX), BX
        	// store res.A1 = 2*x.A0*x.A1. x.A1 was reloaded just above and x.A0 is
        	// still in SI,DI,R8,R9, so no word of x is read from memory after this.
   699  	MOVQ res+0(FP), DX
   700  	MOVQ R10, 32(DX)
   701  	MOVQ R11, 40(DX)
   702  	MOVQ R12, 48(DX)
   703  	MOVQ R13, 56(DX)
        	// keep a second copy of x.A1 in R10..R13 for the subtraction below
   704  	MOVQ R14, R10
   705  	MOVQ R15, R11
   706  	MOVQ CX, R12
   707  	MOVQ BX, R13
   708  
   709  	// Add(&x.A0, &x.A1)
        	// result in (R14,R15,CX,BX); not reduced before MUL()
   710  	ADDQ SI, R14
   711  	ADCQ DI, R15
   712  	ADCQ R8, CX
   713  	ADCQ R9, BX
        	// BP = 0 is the "no correction" value for the CMOVQCC instructions below
   714  	XORQ BP, BP
   715  
   716  	// Sub(&x.A0, &x.A1)
        	// result in (SI,DI,R8,R9); q is added back only if the SBBQ chain borrowed
   717  	SUBQ    R10, SI
   718  	SBBQ    R11, DI
   719  	SBBQ    R12, R8
   720  	SBBQ    R13, R9
   721  	MOVQ    $0x3c208c16d87cfd47, R10
   722  	MOVQ    $0x97816a916871ca8d, R11
   723  	MOVQ    $0xb85045b68181585d, R12
   724  	MOVQ    $0x30644e72e131a029, R13
   725  	CMOVQCC BP, R10
   726  	CMOVQCC BP, R11
   727  	CMOVQCC BP, R12
   728  	CMOVQCC BP, R13
   729  	ADDQ    R10, SI
   730  	ADCQ    R11, DI
   731  	ADCQ    R12, R8
   732  	ADCQ    R13, R9
   733  
   734  	// mul (R14,R15,CX,BX) with (SI,DI,R8,R9) into (R10,R11,R12,R13)
   735  	MUL()
   736  
   737  	// reduce element(R10,R11,R12,R13) using temp registers (R14,R15,CX,BX)
   738  	REDUCE(R10,R11,R12,R13,R14,R15,CX,BX)
   739  
        	// store res.A0 = (x.A0 + x.A1) * (x.A0 - x.A1)
   740  	MOVQ res+0(FP), AX
   741  	MOVQ R10, 0(AX)
   742  	MOVQ R11, 8(AX)
   743  	MOVQ R12, 16(AX)
   744  	MOVQ R13, 24(AX)
   745  	RET
   746  
   747  l5:
        	// fallback when ·supportAdx != 1: copy res and x into the outgoing
        	// argument slots at 0/8(SP) and call the generic implementation
   748  	MOVQ res+0(FP), AX
   749  	MOVQ AX, (SP)
   750  	MOVQ x+8(FP), AX
   751  	MOVQ AX, 8(SP)
   752  	CALL ·squareGenericE2(SB)
   753  	RET