github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_s390x.s (about)

     1  // This is a port of the NIST P256 s390x asm implementation to SM2 P256.
     2  //
     3  // Copyright 2019 The Go Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  //go:build !purego
     8  
     9  #include "textflag.h"
    10  #include "go_asm.h"
    11  
        // Constant pools. Multi-word values are stored most significant doubleword first.
        //
        // p256ordK0: 32-bit constant that p256OrdMul/p256OrdSqr load into K0; the
        //            OrdMul rounds multiply by it to obtain the per-round factor MK0.
        // p256ord:   256-bit modulus that p256OrdMul/p256OrdSqr load into M1||M0.
    12  DATA p256ordK0<>+0x00(SB)/4, $0x72350975
    13  DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
    14  DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
    15  DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
    16  DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123
        // p256: field prime followed by permute masks. p256FromMont loads the first
        //       48 bytes (prime high half, prime low half, SEL mask).
    17  DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
    18  DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
    19  DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
    20  DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    21  DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
    22  DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
    23  DATA p256<>+0x30(SB)/8, $0x0706050403020100 // LE2BE permute mask
    24  DATA p256<>+0x38(SB)/8, $0x0f0e0d0c0b0a0908 // LE2BE permute mask
        // p256mul: field prime, two permute masks and 2^256 mod P256. p256NegCond
        //          loads the first 32 bytes (the prime) from this pool.
    25  DATA p256mul<>+0x00(SB)/8, $0xfffffffeffffffff // P256
    26  DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
    27  DATA p256mul<>+0x10(SB)/8, $0xffffffff00000000 // P256
    28  DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    29  DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    30  DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    31  DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL  0  0 d1 d0
    32  DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    33  DATA p256mul<>+0x40(SB)/8, $0x0000000100000000 // (1*2^256)%P256
    34  DATA p256mul<>+0x48(SB)/8, $0x0000000000000000 // (1*2^256)%P256
    35  DATA p256mul<>+0x50(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
    36  DATA p256mul<>+0x58(SB)/8, $0x0000000000000001 // (1*2^256)%P256
        // NOTE(review): flag value 8 is presumably RODATA from textflag.h - confirm.
    37  GLOBL p256ordK0<>(SB), 8, $4
    38  GLOBL p256ord<>(SB), 8, $32
    39  GLOBL p256<>(SB), 8, $64
    40  GLOBL p256mul<>(SB), 8, $96
    41  
    42  // func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
        // Thin entry point: jumps straight to p256BigToLittle, which reads the same
        // (res, in) argument slots, so the one routine serves this direction too.
    43  TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0
    44  	JMP ·p256BigToLittle(SB)
    45  
    46  // func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
        // Thin entry point: jumps straight to p256BigToLittle, which reads the same
        // (res, in) argument slots.
    47  TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0
    48  	JMP ·p256BigToLittle(SB)
    49  
    50  // ---------------------------------------
    51  // func p256LittleToBig(res *[32]byte, in *p256Element)
        // Thin entry point: jumps straight to p256BigToLittle, which reads the same
        // (res, in) argument slots, so the one routine serves this direction too.
    52  TEXT ·p256LittleToBig(SB), NOSPLIT, $0
    53  	JMP ·p256BigToLittle(SB)
    54  
    55  // func p256BigToLittle(res *p256Element, in *[32]byte)
        //
        // Copies the 32 bytes at in to res with the order of the four 64-bit
        // doublewords reversed. Bytes inside each doubleword are left untouched.
        // Applying the routine twice restores the input, which is why the
        // *LittleToBig entry points jump here as well.
    56  #define res_ptr   R1
    57  #define in_ptr   R2
    58  #define T1L   V3
    59  #define T1H   V2
    60  #define T1L2  V1
    61  
    62  TEXT ·p256BigToLittle(SB), NOSPLIT, $0
    63  	MOVD res+0(FP), res_ptr
    64  	MOVD in+8(FP), in_ptr
    65  
        	// T1H (V2) = in[0:16], T1L (V3) = in[16:32]
    66  	VLM (in_ptr), T1H, T1L
    67  
        	// With both sources equal, VPDI $0x4 swaps the two doublewords of a vector.
        	// The swapped T1L goes to T1L2 (V1) so that V1,V2 form a consecutive range.
    68  	VPDI $0x4, T1L, T1L, T1L2
    69  	VPDI $0x4, T1H, T1H, T1H
    70  
        	// Store the swapped second half first, then the swapped first half.
    71  	VSTM T1L2, T1H, (res_ptr)
    72  
    73  	RET
    74  
    75  #undef res_ptr
    76  #undef in_ptr
    77  #undef T1L
    78  #undef T1H
    79  #undef T1L2
    80  
    81  // ---------------------------------------
    82  // iff cond == 1  val <- -val
    83  // func p256NegCond(val *p256Element, cond int)
        //
        // Branch-free conditional negation modulo the prime stored at p256mul<>+0x00.
        // Both val and P - val are always computed; a vector select picks the one
        // written back. The code only distinguishes cond == 0 (val kept) from
        // cond != 0 (val replaced by P - val).
    84  #define P1ptr   R1
    85  #define CPOOL   R4
    86  
    87  #define Y1L   V0
    88  #define Y1H   V1
    89  #define T1L   V2
    90  #define T1H   V3
    91  
    92  #define PL    V31
    93  #define PH    V30
    94  
    95  #define ZER   V4
    96  #define SEL1  V5
    97  #define CAR1  V6
    98  TEXT ·p256NegCond(SB), NOSPLIT, $0
    99  	MOVD val+0(FP), P1ptr
   100  
        	// PH||PL = prime (first 32 bytes of the p256mul pool)
   101  	MOVD $p256mul<>+0x00(SB), CPOOL
   102  	VLM   (CPOOL), PH, PL
   103  
        	// Load val and swap the doublewords inside each 16-byte half.
   104  	VLM   (P1ptr), Y1L, Y1H
   105  	VPDI $0x4, Y1H, Y1H, Y1H
   106  	VPDI $0x4, Y1L, Y1L, Y1L
   107  
        	// SEL1 = all ones when cond == 0, all zeros otherwise
   108  	VLREPG cond+8(FP), SEL1
   109  	VZERO  ZER
   110  	VCEQG  SEL1, ZER, SEL1
   111  
        	// T1H||T1L = PH||PL - Y1H||Y1L, borrow passed from the low to the high half via CAR1
   112  	VSCBIQ Y1L, PL, CAR1
   113  	VSQ    Y1L, PL, T1L
   114  	VSBIQ  PH, Y1H, CAR1, T1H
   115  
        	// Keep Y1 where SEL1 is set (cond == 0), otherwise take the negated value.
   116  	VSEL Y1L, T1L, SEL1, Y1L
   117  	VSEL Y1H, T1H, SEL1, Y1H
   118  
        	// Undo the doubleword swap and write the result back over val.
   119  	VPDI $0x4, Y1H, Y1H, Y1H
   120  	VPDI $0x4, Y1L, Y1L, Y1L
   121  	VSTM  Y1L, Y1H, (P1ptr)
   122  
   123  	RET
   124  
   125  #undef P1ptr
   126  #undef CPOOL
   127  #undef Y1L
   128  #undef Y1H
   129  #undef T1L
   130  #undef T1H
   131  #undef PL
   132  #undef PH
   133  #undef ZER
   134  #undef SEL1
   135  #undef CAR1
   136  
   137  // ---------------------------------------
   138  // if cond == 0 res <- b; else res <- a
   139  // func p256MovCond(res, a, b *P256Point, cond int)
        //
        // Branch-free select between two 96-byte points. Both a and b are always
        // loaded in full; the choice is made with vector selects on a mask derived
        // from cond, and 96 bytes are always stored to res.
   140  #define P3ptr   R1
   141  #define P1ptr   R2
   142  #define P2ptr   R3
   143  
   144  #define X1L    V1
   145  #define X1H    V0
   146  #define Y1L    V3
   147  #define Y1H    V2
   148  #define Z1L    V5
   149  #define Z1H    V4
   150  #define X2L    V7
   151  #define X2H    V6
   152  #define Y2L    V9
   153  #define Y2H    V8
   154  #define Z2L    V11
   155  #define Z2H    V10
   156  
   157  #define ZER   V18
   158  #define SEL1  V19
   159  TEXT ·p256MovCond(SB), NOSPLIT, $0
   160  	MOVD   res+0(FP), P3ptr
   161  	MOVD   a+8(FP), P1ptr
   162  	MOVD   b+16(FP), P2ptr
        	// SEL1 = all ones when cond == 0, all zeros otherwise
   163  	VLREPG cond+24(FP), SEL1
   164  	VZERO  ZER
   165  	VCEQG  SEL1, ZER, SEL1
   166  
        	// V0..V5 = a (X, Y, Z), V6..V11 = b (X, Y, Z)
   167  	VLM (P1ptr), X1H, Z1L
   168  	VLM (P2ptr), X2H, Z2L
   169  
        	// Where SEL1 is set take b's bits, elsewhere keep a's bits.
   170  	VSEL X2L, X1L, SEL1, X1L
   171  	VSEL X2H, X1H, SEL1, X1H
   172  	VSEL Y2L, Y1L, SEL1, Y1L
   173  	VSEL Y2H, Y1H, SEL1, Y1H
   174  	VSEL Z2L, Z1L, SEL1, Z1L
   175  	VSEL Z2H, Z1H, SEL1, Z1H
   176  
   177  	VSTM X1H, Z1L, (P3ptr)
   178  
   179  	RET
   180  
   181  #undef P3ptr
   182  #undef P1ptr
   183  #undef P2ptr
   184  #undef X1L
   185  #undef X1H
   186  #undef Y1L
   187  #undef Y1H
   188  #undef Z1L
   189  #undef Z1H
   190  #undef X2L
   191  #undef X2H
   192  #undef Y2L
   193  #undef Y2H
   194  #undef Z2L
   195  #undef Z2H
   196  #undef ZER
   197  #undef SEL1
   198  
   199  // ---------------------------------------
   200  // Constant time table access
   201  // Indexed from 1 to limit, with -1 offset (table entry 0 is matched by idx 1)
   202  // (index 0 is implicitly point at infinity)
   203  // func p256Select(res *P256Point, table *p256Table, idx int, limit int)
        //
        // Every one of the limit 96-byte entries is loaded on every call; the
        // entry whose 1-based position equals idx is merged into the result with
        // vector selects. If no position matches (e.g. idx == 0) the result stays
        // all zero. Only the least significant byte of idx is examined.
        // The loop body runs once before limit is first tested.
   204  #define P3ptr   R1
   205  #define P1ptr   R2
   206  #define LIMIT   R3
   207  #define COUNT   R4
   208  
   209  #define X1L    V1
   210  #define X1H    V0
   211  #define Y1L    V3
   212  #define Y1H    V2
   213  #define Z1L    V5
   214  #define Z1H    V4
   215  #define X2L    V7
   216  #define X2H    V6
   217  #define Y2L    V9
   218  #define Y2H    V8
   219  #define Z2L    V11
   220  #define Z2H    V10
   221  
   222  #define ONE   V18
   223  #define IDX   V19
   224  #define SEL1  V20
   225  #define SEL2  V21
   226  TEXT ·p256Select(SB), NOSPLIT, $0
   227  	MOVD   res+0(FP), P3ptr
   228  	MOVD   table+8(FP), P1ptr
   229  	MOVD   limit+24(FP), LIMIT
        	// idx occupies bytes 16..23 of the frame; byte 16+7 is its last (least
        	// significant on this big-endian target) byte, replicated into all 16 lanes.
   230  	VLREPB idx+(16+7)(FP), IDX
   231  	VREPIB $1, ONE
        	// SEL2 holds the 1-based position of the current entry in every byte lane.
   232  	VREPIB $1, SEL2
   233  	MOVD   $1, COUNT
   234  
        	// Result accumulator starts as all zeros.
   235  	VZERO X1H
   236  	VZERO X1L
   237  	VZERO Y1H
   238  	VZERO Y1L
   239  	VZERO Z1H
   240  	VZERO Z1L
   241  
   242  loop_select:
   243  	VLM (P1ptr), X2H, Z2L
   244  
        	// Doubleword compare of byte-replicated values: because all bytes of each
        	// operand are identical, SEL1 is all ones or all zeros.
   245  	VCEQG SEL2, IDX, SEL1
   246  
        	// On a match take the loaded entry, otherwise keep the accumulator.
   247  	VSEL X2L, X1L, SEL1, X1L
   248  	VSEL X2H, X1H, SEL1, X1H
   249  	VSEL Y2L, Y1L, SEL1, Y1L
   250  	VSEL Y2H, Y1H, SEL1, Y1H
   251  	VSEL Z2L, Z1L, SEL1, Z1L
   252  	VSEL Z2H, Z1H, SEL1, Z1H
   253  
        	// Advance position counter (vector and scalar) and step to the next 96-byte entry.
   254  	VAB  SEL2, ONE, SEL2
   255  	ADD  $96, P1ptr
   256  	ADD  $1, COUNT
   257  	CMPBLE  COUNT, LIMIT, loop_select
   258  
   259  	VSTM X1H, Z1L, (P3ptr)
   260  
   261  	RET
   262  
   263  #undef P3ptr
   264  #undef P1ptr
   265  #undef COUNT
   266  #undef LIMIT
   267  #undef X1L
   268  #undef X1H
   269  #undef Y1L
   270  #undef Y1H
   271  #undef Z1L
   272  #undef Z1H
   273  #undef X2L
   274  #undef X2H
   275  #undef Y2L
   276  #undef Y2H
   277  #undef Z2L
   278  #undef Z2H
   279  #undef ONE
   280  #undef IDX
   281  #undef SEL1
   282  #undef SEL2
   283  
   284  // ---------------------------------------
   285  
   286  //  func p256FromMont(res, in *p256Element)
        //
        // Runs four identical reduction rounds on the 256-bit input followed by one
        // conditional subtraction of the prime. Each round removes the low 64 bits
        // of the accumulator T2:T1:T0: with d = low 64 bits of T0, it forms
        // RED2:RED1 = d*2^192 - d*2^160 - d*2^32 + d using shifts and subtractions,
        // shifts the accumulator right by 64 bits and adds RED2:RED1.
        // Straight-line code; no branches depend on the data.
   287  #define res_ptr R1
   288  #define x_ptr   R2
   289  #define CPOOL   R4
   290  
   291  #define T0   V0
   292  #define T1   V1
   293  #define T2   V2
   294  #define TT0  V3
   295  #define TT1  V4
   296  
   297  #define ZER   V6
   298  #define CAR1  V9
   299  #define CAR2  V10
   300  #define RED1  V11
   301  #define RED2  V12
   302  #define PH    V13
   303  #define PL    V14
   304  #define SEL1  V15
   305  
   306  TEXT ·p256FromMont(SB), NOSPLIT, $0
   307  	MOVD res+0(FP), res_ptr
   308  	MOVD in+8(FP), x_ptr
   309  
   310  	VZERO T2
   311  	VZERO ZER
        	// V13..V15 = first 48 bytes of the p256 pool: PH, PL (prime) and SEL1 (permute mask)
   312  	MOVD  $p256<>+0x00(SB), CPOOL
   313  	VLM    (CPOOL), PH, SEL1
   314  
        	// Load the input and swap the doublewords inside each 16-byte half.
   315  	VLM (x_ptr), T0, T1
   316  	VPDI $0x4, T0, T0, T0
   317  	VPDI $0x4, T1, T1, T1
   318  
   319  	// First round
   320  	VPERM ZER, T0, SEL1, RED1   // 0 0 d1 d0
   321  	VSLDB $4, RED1, ZER, TT0    // 0 d1 d0 0
   322  	VSLDB $4, TT0, ZER, RED2    // d1 d0 0 0
        	// RED1 -= TT0 (borrow in CAR1), then RED2 = RED2 - TT0 - borrow
   323  	VSCBIQ  TT0, RED1, CAR1
   324  	VSQ	 TT0, RED1, RED1
   325  	VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
   326  
        	// T2:T1:T0 >>= 64 (8 bytes)
   327  	VSLDB $8, T1, T0, T0
   328  	VSLDB $8, T2, T1, T1
   329  
        	// T1:T0 += RED2:RED1; the carry out of the top is accumulated in T2
   330  	VACCQ  T0, RED1, CAR1
   331  	VAQ    T0, RED1, T0
   332  	VACCCQ T1, RED2, CAR1, CAR2
   333  	VACQ   T1, RED2, CAR1, T1
   334  	VAQ    T2, CAR2, T2
   335  
   336  	// Second round
   337  	VPERM ZER, T0, SEL1, RED1   // 0 0 d1 d0
   338  	VSLDB $4, RED1, ZER, TT0    // 0 d1 d0 0
   339  	VSLDB $4, TT0, ZER, RED2    // d1 d0 0 0
   340  	VSCBIQ  TT0, RED1, CAR1
   341  	VSQ	 TT0, RED1, RED1
   342  	VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
   343  
   344  	VSLDB $8, T1, T0, T0
   345  	VSLDB $8, T2, T1, T1
   346  
   347  	VACCQ  T0, RED1, CAR1
   348  	VAQ    T0, RED1, T0
   349  	VACCCQ T1, RED2, CAR1, CAR2
   350  	VACQ   T1, RED2, CAR1, T1
   351  	VAQ    T2, CAR2, T2
   352  
   353  	// Third round
   354  	VPERM ZER, T0, SEL1, RED1   // 0 0 d1 d0
   355  	VSLDB $4, RED1, ZER, TT0    // 0 d1 d0 0
   356  	VSLDB $4, TT0, ZER, RED2    // d1 d0 0 0
   357  	VSCBIQ  TT0, RED1, CAR1
   358  	VSQ	 TT0, RED1, RED1
   359  	VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
   360  
   361  	VSLDB $8, T1, T0, T0
   362  	VSLDB $8, T2, T1, T1
   363  
   364  	VACCQ  T0, RED1, CAR1
   365  	VAQ    T0, RED1, T0
   366  	VACCCQ T1, RED2, CAR1, CAR2
   367  	VACQ   T1, RED2, CAR1, T1
   368  	VAQ    T2, CAR2, T2
   369  
   370  	// Last round
   371  	VPERM ZER, T0, SEL1, RED1   // 0 0 d1 d0
   372  	VSLDB $4, RED1, ZER, TT0    // 0 d1 d0 0
   373  	VSLDB $4, TT0, ZER, RED2    // d1 d0 0 0
   374  	VSCBIQ  TT0, RED1, CAR1
   375  	VSQ	 TT0, RED1, RED1
   376  	VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow
   377  
   378  	VSLDB $8, T1, T0, T0
   379  	VSLDB $8, T2, T1, T1
   380  
   381  	VACCQ  T0, RED1, CAR1
   382  	VAQ    T0, RED1, T0
   383  	VACCCQ T1, RED2, CAR1, CAR2
   384  	VACQ   T1, RED2, CAR1, T1
   385  	VAQ    T2, CAR2, T2
   386  
   387  	// ---------------------------------------------------
   388  
        	// TT1:TT0 = T1:T0 - PH:PL; the borrow is folded into T2, which then
        	// serves as the select mask below.
   389  	VSCBIQ  PL, T0, CAR1
   390  	VSQ     PL, T0, TT0
   391  	VSBCBIQ T1, PH, CAR1, CAR2
   392  	VSBIQ   T1, PH, CAR1, TT1
   393  	VSBIQ   T2, ZER, CAR2, T2
   394  
   395  	// what output to use, TT1||TT0 or T1||T0?
        	// Bits set in T2 keep the unsubtracted T, clear bits take the subtracted TT.
   396  	VSEL T0, TT0, T2, T0
   397  	VSEL T1, TT1, T2, T1
   398  
        	// Swap doublewords back into the memory layout and store the 32-byte result.
   399  	VPDI $0x4, T0, T0, TT0
   400  	VPDI $0x4, T1, T1, TT1
   401  	VSTM  TT0, TT1, (res_ptr)
   402  
   403  	RET
   404  
   405  #undef res_ptr
   406  #undef x_ptr
   407  #undef CPOOL
   408  #undef T0
   409  #undef T1
   410  #undef T2
   411  #undef TT0
   412  #undef TT1
   413  #undef ZER
   414  #undef SEL1
   415  #undef CAR1
   416  #undef CAR2
   417  #undef RED1
   418  #undef RED2
   419  #undef PL
   420  #undef PH
   421  
   422  // Constant time table access
   423  // Indexed from 1 to 32, with -1 offset (table entry 0 is matched by idx 1)
   424  // (index 0 is implicitly point at infinity)
   425  // func p256SelectBase(point *p256Point, table []p256Point, idx int)
   426  // new : func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
        //
        // All 32 entries of 64 bytes each are loaded on every call; the entry whose
        // 1-based position equals idx is merged into the result with vector selects.
        // If no position matches (e.g. idx == 0) the 64-byte result is all zero.
        // Only the least significant byte of idx is examined.
   427  
   428  #define P3ptr   R1
   429  #define P1ptr   R2
   430  #define COUNT   R4
   431  #define CPOOL   R5
   432  
   433  #define X1L    V1
   434  #define X1H    V0
   435  #define Y1L    V3
   436  #define Y1H    V2
   437  #define X2L    V7
   438  #define X2H    V6
   439  #define Y2L    V9
   440  #define Y2H    V8
   441  
   442  #define ONE   V18
   443  #define IDX   V19
   444  #define SEL1  V20
   445  #define SEL2  V21
   446  
   447  TEXT ·p256SelectAffine(SB), NOSPLIT, $0
   448  	MOVD   res+0(FP), P3ptr
   449  	MOVD   table+8(FP), P1ptr
        	// NOTE(review): CPOOL is loaded here but never read in this function.
   450  	MOVD   $p256<>+0x00(SB), CPOOL
        	// Replicate the last (least significant, big-endian) byte of idx into all lanes.
   451  	VLREPB idx+(16+7)(FP), IDX
   452  	VREPIB $1, ONE
        	// SEL2 holds the 1-based position of the current entry in every byte lane.
   453  	VREPIB $1, SEL2
   454  	MOVD   $1, COUNT
   455  
        	// Result accumulator starts as all zeros.
   456  	VZERO X1H
   457  	VZERO X1L
   458  	VZERO Y1H
   459  	VZERO Y1L
   460  
   461  loop_select:
   462  	VLM (P1ptr), X2H, Y2L
   463  
        	// Doubleword compare of byte-replicated values: SEL1 is all ones or all zeros.
   464  	VCEQG SEL2, IDX, SEL1
   465  
        	// On a match take the loaded entry, otherwise keep the accumulator.
   466  	VSEL X2L, X1L, SEL1, X1L
   467  	VSEL X2H, X1H, SEL1, X1H
   468  	VSEL Y2L, Y1L, SEL1, Y1L
   469  	VSEL Y2H, Y1H, SEL1, Y1H
   470  
        	// Next position, next 64-byte entry; loop while COUNT < 33 (32 iterations).
   471  	VAB  SEL2, ONE, SEL2
   472  	ADDW $1, COUNT
   473  	ADD  $64, P1ptr
   474  	CMPW COUNT, $33
   475  	BLT  loop_select
   476  
   477  	VSTM X1H, Y1L, (P3ptr)
   478  
   479  	RET
   480  
   481  #undef P3ptr
   482  #undef P1ptr
   483  #undef COUNT
   484  #undef X1L
   485  #undef X1H
   486  #undef Y1L
   487  #undef Y1H
   488  #undef X2L
   489  #undef X2H
   490  #undef Y2L
   491  #undef Y2H
   492  #undef ONE
   493  #undef IDX
   494  #undef SEL1
   495  #undef SEL2
   496  #undef CPOOL
   497  
   498  // ---------------------------------------
   499  // sm2p256OrdMulInternal
        //
        // Register-to-register multiply-and-reduce helper used by p256OrdMul and,
        // via sm2p256OrdSqrInternal, by p256OrdSqr. It takes no stack arguments.
        //
        // Inputs (prepared by the caller):
        //   X1||X0, Y1||Y0  the two 256-bit operands
        //   M1||M0          the 256-bit modulus
        //   K0              32-bit constant; the low word of ADD1*K0 becomes MK0
        // Output:
        //   T1||T0          the result after the final conditional subtraction of M
        // Overwritten: T2, YDIG, ADD1, ADD1H, ADD2, ADD2H, RED1, RED1H, RED2, RED2H,
        //              CAR1, CAR1M, MK0.
        //
        // Eight rounds, one per 32-bit word of Y, taken in the order
        // Y0[3], Y0[2], Y0[1], Y0[0], Y1[3], Y1[2], Y1[1], Y1[0]. Every round
        //   1. multiply-accumulates X by the current word YDIG (low and high halves),
        //   2. derives MK0 from the low word of the partial sum and K0,
        //   3. adds M*MK0, and
        //   4. shifts the sum right by 32 bits (VSLDB $12) while recombining the
        //      high halves, leaving the running value in T2:T1:T0.
   500  #define X0    V0
   501  #define X1    V1
   502  #define Y0    V2
   503  #define Y1    V3
   504  #define M1    V4
   505  #define M0    V5
   506  #define T0    V6
   507  #define T1    V7
   508  #define T2    V8
   509  #define YDIG  V9
   510  
   511  #define ADD1  V16
   512  #define ADD1H V17
   513  #define ADD2  V18
   514  #define ADD2H V19
   515  #define RED1  V20
   516  #define RED1H V21
   517  #define RED2  V22
   518  #define RED2H V23
   519  #define CAR1  V24
   520  #define CAR1M V25
   521  
   522  #define MK0   V30
   523  #define K0    V31
   524  TEXT sm2p256OrdMulInternal<>(SB), NOSPLIT, $0-0
   525  	// ---------------------------------------------------------------------------/
        	// Round 1 (word Y0[3]): no accumulator yet, so plain multiplies are used
        	// instead of multiply-and-add.
   526  	VREPF $3, Y0, YDIG
   527  	VMLF  X0, YDIG, ADD1
        	// MK0 = low words of ADD1*K0; word element 3 is then replicated to all lanes
   528  	VMLF  ADD1, K0, MK0
   529  	VREPF $3, MK0, MK0
   530  
   531  	VMLF  X1, YDIG, ADD2
   532  	VMLHF X0, YDIG, ADD1H
   533  	VMLHF X1, YDIG, ADD2H
   534  
        	// RED = M*MK0 + ADD (low words in RED1/RED2, high words in RED1H/RED2H)
   535  	VMALF  M0, MK0, ADD1, RED1
   536  	VMALHF M0, MK0, ADD1, RED1H
   537  	VMALF  M1, MK0, ADD2, RED2
   538  	VMALHF M1, MK0, ADD2, RED2H
   539  
        	// RED2:RED1 >>= 32 bits; T2 is zero in this first round
   540  	VZERO T2
   541  	VSLDB $12, RED2, RED1, RED1
   542  	VSLDB $12, T2, RED2, RED2
   543  
        	// T0 = RED1 + ADD1H + RED1H, carries kept in CAR1 and CAR1M
   544  	VACCQ RED1, ADD1H, CAR1
   545  	VAQ   RED1, ADD1H, T0
   546  	VACCQ RED1H, T0, CAR1M
   547  	VAQ   RED1H, T0, T0
   548  
   549  	// << ready for next MK0
   550  
        	// T1 = RED2 + ADD2H + RED2H + carries; carries out of T1 are summed into T2
   551  	VACQ   RED2, ADD2H, CAR1, T1
   552  	VACCCQ RED2, ADD2H, CAR1, CAR1
   553  	VACCCQ RED2H, T1, CAR1M, T2
   554  	VACQ   RED2H, T1, CAR1M, T1
   555  	VAQ    CAR1, T2, T2
   556  
   557  	// ---------------------------------------------------
   558  /* *
   559   * ---+--------+--------+
   560   *  T2|   T1   |   T0   |
   561   * ---+--------+--------+
   562   *           *(add)*
   563   *    +--------+--------+
   564   *    |   X1   |   X0   |
   565   *    +--------+--------+
   566   *           *(mul)*
   567   *    +--------+--------+
   568   *    |  YDIG  |  YDIG  |
   569   *    +--------+--------+
   570   *           *(add)*
   571   *    +--------+--------+
   572   *    |   M1   |   M0   |
   573   *    +--------+--------+
   574   *           *(mul)*
   575   *    +--------+--------+
   576   *    |   MK0  |   MK0  |
   577   *    +--------+--------+
   578   *
   579   *   ---------------------
   580   *
   581   *    +--------+--------+
   582   *    |  ADD2  |  ADD1  |
   583   *    +--------+--------+
   584   *  +--------+--------+
   585   *  | ADD2H  | ADD1H  |
   586   *  +--------+--------+
   587   *    +--------+--------+
   588   *    |  RED2  |  RED1  |
   589   *    +--------+--------+
   590   *  +--------+--------+
   591   *  | RED2H  | RED1H  |
   592   *  +--------+--------+
   593   */
        	// Round 2 (word Y0[2]): from here on T1:T0 is added in by the multiply-and-add
        	// forms and T2 (carry from the previous round) is shifted in above RED2.
   594  	VREPF $2, Y0, YDIG
   595  	VMALF X0, YDIG, T0, ADD1
   596  	VMLF  ADD1, K0, MK0
   597  	VREPF $3, MK0, MK0
   598  
   599  	VMALF  X1, YDIG, T1, ADD2
   600  	VMALHF X0, YDIG, T0, ADD1H
   601  	VMALHF X1, YDIG, T1, ADD2H
   602  
   603  	VMALF  M0, MK0, ADD1, RED1
   604  	VMALHF M0, MK0, ADD1, RED1H
   605  	VMALF  M1, MK0, ADD2, RED2
   606  	VMALHF M1, MK0, ADD2, RED2H
   607  
   608  	VSLDB $12, RED2, RED1, RED1
   609  	VSLDB $12, T2, RED2, RED2
   610  
   611  	VACCQ RED1, ADD1H, CAR1
   612  	VAQ   RED1, ADD1H, T0
   613  	VACCQ RED1H, T0, CAR1M
   614  	VAQ   RED1H, T0, T0
   615  
   616  	// << ready for next MK0
   617  
   618  	VACQ   RED2, ADD2H, CAR1, T1
   619  	VACCCQ RED2, ADD2H, CAR1, CAR1
   620  	VACCCQ RED2H, T1, CAR1M, T2
   621  	VACQ   RED2H, T1, CAR1M, T1
   622  	VAQ    CAR1, T2, T2
   623  
   624  	// ---------------------------------------------------
        	// Round 3 (word Y0[1])
   625  	VREPF $1, Y0, YDIG
   626  	VMALF X0, YDIG, T0, ADD1
   627  	VMLF  ADD1, K0, MK0
   628  	VREPF $3, MK0, MK0
   629  
   630  	VMALF  X1, YDIG, T1, ADD2
   631  	VMALHF X0, YDIG, T0, ADD1H
   632  	VMALHF X1, YDIG, T1, ADD2H
   633  
   634  	VMALF  M0, MK0, ADD1, RED1
   635  	VMALHF M0, MK0, ADD1, RED1H
   636  	VMALF  M1, MK0, ADD2, RED2
   637  	VMALHF M1, MK0, ADD2, RED2H
   638  
   639  	VSLDB $12, RED2, RED1, RED1
   640  	VSLDB $12, T2, RED2, RED2
   641  
   642  	VACCQ RED1, ADD1H, CAR1
   643  	VAQ   RED1, ADD1H, T0
   644  	VACCQ RED1H, T0, CAR1M
   645  	VAQ   RED1H, T0, T0
   646  
   647  	// << ready for next MK0
   648  
   649  	VACQ   RED2, ADD2H, CAR1, T1
   650  	VACCCQ RED2, ADD2H, CAR1, CAR1
   651  	VACCCQ RED2H, T1, CAR1M, T2
   652  	VACQ   RED2H, T1, CAR1M, T1
   653  	VAQ    CAR1, T2, T2
   654  
   655  	// ---------------------------------------------------
        	// Round 4 (word Y0[0])
   656  	VREPF $0, Y0, YDIG
   657  	VMALF X0, YDIG, T0, ADD1
   658  	VMLF  ADD1, K0, MK0
   659  	VREPF $3, MK0, MK0
   660  
   661  	VMALF  X1, YDIG, T1, ADD2
   662  	VMALHF X0, YDIG, T0, ADD1H
   663  	VMALHF X1, YDIG, T1, ADD2H
   664  
   665  	VMALF  M0, MK0, ADD1, RED1
   666  	VMALHF M0, MK0, ADD1, RED1H
   667  	VMALF  M1, MK0, ADD2, RED2
   668  	VMALHF M1, MK0, ADD2, RED2H
   669  
   670  	VSLDB $12, RED2, RED1, RED1
   671  	VSLDB $12, T2, RED2, RED2
   672  
   673  	VACCQ RED1, ADD1H, CAR1
   674  	VAQ   RED1, ADD1H, T0
   675  	VACCQ RED1H, T0, CAR1M
   676  	VAQ   RED1H, T0, T0
   677  
   678  	// << ready for next MK0
   679  
   680  	VACQ   RED2, ADD2H, CAR1, T1
   681  	VACCCQ RED2, ADD2H, CAR1, CAR1
   682  	VACCCQ RED2H, T1, CAR1M, T2
   683  	VACQ   RED2H, T1, CAR1M, T1
   684  	VAQ    CAR1, T2, T2
   685  
   686  	// ---------------------------------------------------
        	// Round 5 (word Y1[3])
   687  	VREPF $3, Y1, YDIG
   688  	VMALF X0, YDIG, T0, ADD1
   689  	VMLF  ADD1, K0, MK0
   690  	VREPF $3, MK0, MK0
   691  
   692  	VMALF  X1, YDIG, T1, ADD2
   693  	VMALHF X0, YDIG, T0, ADD1H
   694  	VMALHF X1, YDIG, T1, ADD2H
   695  
   696  	VMALF  M0, MK0, ADD1, RED1
   697  	VMALHF M0, MK0, ADD1, RED1H
   698  	VMALF  M1, MK0, ADD2, RED2
   699  	VMALHF M1, MK0, ADD2, RED2H
   700  
   701  	VSLDB $12, RED2, RED1, RED1
   702  	VSLDB $12, T2, RED2, RED2
   703  
   704  	VACCQ RED1, ADD1H, CAR1
   705  	VAQ   RED1, ADD1H, T0
   706  	VACCQ RED1H, T0, CAR1M
   707  	VAQ   RED1H, T0, T0
   708  
   709  	// << ready for next MK0
   710  
   711  	VACQ   RED2, ADD2H, CAR1, T1
   712  	VACCCQ RED2, ADD2H, CAR1, CAR1
   713  	VACCCQ RED2H, T1, CAR1M, T2
   714  	VACQ   RED2H, T1, CAR1M, T1
   715  	VAQ    CAR1, T2, T2
   716  
   717  	// ---------------------------------------------------
        	// Round 6 (word Y1[2])
   718  	VREPF $2, Y1, YDIG
   719  	VMALF X0, YDIG, T0, ADD1
   720  	VMLF  ADD1, K0, MK0
   721  	VREPF $3, MK0, MK0
   722  
   723  	VMALF  X1, YDIG, T1, ADD2
   724  	VMALHF X0, YDIG, T0, ADD1H
   725  	VMALHF X1, YDIG, T1, ADD2H
   726  
   727  	VMALF  M0, MK0, ADD1, RED1
   728  	VMALHF M0, MK0, ADD1, RED1H
   729  	VMALF  M1, MK0, ADD2, RED2
   730  	VMALHF M1, MK0, ADD2, RED2H
   731  
   732  	VSLDB $12, RED2, RED1, RED1
   733  	VSLDB $12, T2, RED2, RED2
   734  
   735  	VACCQ RED1, ADD1H, CAR1
   736  	VAQ   RED1, ADD1H, T0
   737  	VACCQ RED1H, T0, CAR1M
   738  	VAQ   RED1H, T0, T0
   739  
   740  	// << ready for next MK0
   741  
   742  	VACQ   RED2, ADD2H, CAR1, T1
   743  	VACCCQ RED2, ADD2H, CAR1, CAR1
   744  	VACCCQ RED2H, T1, CAR1M, T2
   745  	VACQ   RED2H, T1, CAR1M, T1
   746  	VAQ    CAR1, T2, T2
   747  
   748  	// ---------------------------------------------------
        	// Round 7 (word Y1[1])
   749  	VREPF $1, Y1, YDIG
   750  	VMALF X0, YDIG, T0, ADD1
   751  	VMLF  ADD1, K0, MK0
   752  	VREPF $3, MK0, MK0
   753  
   754  	VMALF  X1, YDIG, T1, ADD2
   755  	VMALHF X0, YDIG, T0, ADD1H
   756  	VMALHF X1, YDIG, T1, ADD2H
   757  
   758  	VMALF  M0, MK0, ADD1, RED1
   759  	VMALHF M0, MK0, ADD1, RED1H
   760  	VMALF  M1, MK0, ADD2, RED2
   761  	VMALHF M1, MK0, ADD2, RED2H
   762  
   763  	VSLDB $12, RED2, RED1, RED1
   764  	VSLDB $12, T2, RED2, RED2
   765  
   766  	VACCQ RED1, ADD1H, CAR1
   767  	VAQ   RED1, ADD1H, T0
   768  	VACCQ RED1H, T0, CAR1M
   769  	VAQ   RED1H, T0, T0
   770  
   771  	// << ready for next MK0
   772  
   773  	VACQ   RED2, ADD2H, CAR1, T1
   774  	VACCCQ RED2, ADD2H, CAR1, CAR1
   775  	VACCCQ RED2H, T1, CAR1M, T2
   776  	VACQ   RED2H, T1, CAR1M, T1
   777  	VAQ    CAR1, T2, T2
   778  
   779  	// ---------------------------------------------------
        	// Round 8 (word Y1[0])
   780  	VREPF $0, Y1, YDIG
   781  	VMALF X0, YDIG, T0, ADD1
   782  	VMLF  ADD1, K0, MK0
   783  	VREPF $3, MK0, MK0
   784  
   785  	VMALF  X1, YDIG, T1, ADD2
   786  	VMALHF X0, YDIG, T0, ADD1H
   787  	VMALHF X1, YDIG, T1, ADD2H
   788  
   789  	VMALF  M0, MK0, ADD1, RED1
   790  	VMALHF M0, MK0, ADD1, RED1H
   791  	VMALF  M1, MK0, ADD2, RED2
   792  	VMALHF M1, MK0, ADD2, RED2H
   793  
   794  	VSLDB $12, RED2, RED1, RED1
   795  	VSLDB $12, T2, RED2, RED2
   796  
   797  	VACCQ RED1, ADD1H, CAR1
   798  	VAQ   RED1, ADD1H, T0
   799  	VACCQ RED1H, T0, CAR1M
   800  	VAQ   RED1H, T0, T0
   801  
   802  	// << ready for next MK0
   803  
   804  	VACQ   RED2, ADD2H, CAR1, T1
   805  	VACCCQ RED2, ADD2H, CAR1, CAR1
   806  	VACCCQ RED2H, T1, CAR1M, T2
   807  	VACQ   RED2H, T1, CAR1M, T1
   808  	VAQ    CAR1, T2, T2
   809  
   810  	// ---------------------------------------------------
   811  
        	// Final correction: ADD2:ADD1 = T1:T0 - M1:M0, with the borrow folded into
        	// T2 (RED1 is used as a zero operand). T2 then acts as the select mask.
   812  	VZERO   RED1
   813  	VSCBIQ  M0, T0, CAR1
   814  	VSQ     M0, T0, ADD1
   815  	VSBCBIQ T1, M1, CAR1, CAR1M
   816  	VSBIQ   T1, M1, CAR1, ADD2
   817  	VSBIQ   T2, RED1, CAR1M, T2
   818  
   819  	// what output to use, ADD2||ADD1 or T1||T0?
        	// Bits set in T2 keep the unsubtracted T, clear bits take the subtracted ADD.
   820  	VSEL T0, ADD1, T2, T0
   821  	VSEL T1, ADD2, T2, T1
   822  
   823  	RET
   824  
   825  #undef X0
   826  #undef X1
   827  #undef Y0
   828  #undef Y1
   829  #undef M0
   830  #undef M1
   831  #undef T0
   832  #undef T1
   833  #undef T2
   834  #undef YDIG
   835  
   836  #undef ADD1
   837  #undef ADD1H
   838  #undef ADD2
   839  #undef ADD2H
   840  #undef RED1
   841  #undef RED1H
   842  #undef RED2
   843  #undef RED2H
   844  #undef CAR1
   845  #undef CAR1M
   846  
   847  #undef MK0
   848  #undef K0
   849  
   850  // ---------------------------------------
   851  
        // sm2p256OrdSqrInternal
        // Squaring helper: copies the operand X1||X0 into Y1||Y0 and branches to
        // sm2p256OrdMulInternal, whose RET returns directly to this routine's caller.
        // Register inputs and outputs are otherwise those of sm2p256OrdMulInternal.
   852  // Parameters
   853  #define X0    V0
   854  #define X1    V1
   855  #define Y0    V2
   856  #define Y1    V3
   857  
   858  TEXT sm2p256OrdSqrInternal<>(SB), NOFRAME|NOSPLIT, $0
   859  	VLR X0, Y0
   860  	VLR X1, Y1
        	// Branch, not CALL: no return address is set up here.
   861  	BR  sm2p256OrdMulInternal<>(SB)
   862  
   863  #undef X0
   864  #undef X1
   865  #undef Y0
   866  #undef Y1
   867  
   868  // ---------------------------------------
   869  
   870  // func p256OrdMul(res, in1, in2 *p256OrdElement)
        //
        // Go-callable wrapper around sm2p256OrdMulInternal: loads K0 and the modulus
        // from the constant pools, loads both operands, calls the internal routine
        // and stores T1||T0 to res. Operands and result have the doublewords of each
        // 16-byte half swapped between memory and register layout (VPDI $0x4).
   871  #define res_ptr R1
   872  #define x_ptr R2
   873  #define y_ptr R3
   874  #define X0    V0
   875  #define X1    V1
   876  #define Y0    V2
   877  #define Y1    V3
   878  #define M0    V5
   879  #define M1    V4
   880  #define T0    V6
   881  #define T1    V7
   882  #define K0    V31
   883  TEXT ·p256OrdMul(SB), NOSPLIT, $0
   884  	MOVD res+0(FP), res_ptr
   885  	MOVD in1+8(FP), x_ptr
   886  	MOVD in2+16(FP), y_ptr
   887  
   888  	MOVD  $p256ordK0<>+0x00(SB), R4
   889  
        	// Load the 32-bit constant into word element 3 of K0; other elements are not set here.
   890  	VLEF    $3, 0(R4), K0
   891  	//WORD $0xE7F40000
   892  	//BYTE $0x38
   893  	//BYTE $0x03
        	// M1 (V4) = first 16 bytes of p256ord, M0 (V5) = last 16 bytes
   894  	MOVD $p256ord<>+0x00(SB), R4
   895  	VLM (R4), M1, M0
   896  
   897  	VLM (x_ptr), X0, X1
   898  	VPDI $0x4, X0, X0, X0
   899  	VPDI $0x4, X1, X1, X1
   900  	VLM (y_ptr), Y0, Y1
   901  	VPDI $0x4, Y0, Y0, Y0
   902  	VPDI $0x4, Y1, Y1, Y1
   903  
   904  	CALL sm2p256OrdMulInternal<>(SB)
   905  
        	// Result comes back in T1||T0; swap doublewords back and store 32 bytes.
   906  	VPDI $0x4, T0, T0, T0
   907  	VPDI $0x4, T1, T1, T1
   908  	VSTM T0, T1, (res_ptr)
   909  
   910  	RET
   911  
   912  #undef res_ptr
   913  #undef x_ptr
   914  #undef y_ptr
   915  #undef X0
   916  #undef X1
   917  #undef Y0
   918  #undef Y1
   919  #undef M0
   920  #undef M1
   921  #undef T0
   922  #undef T1
   923  #undef K0
   924  
   925  // ---------------------------------------
   926  //  func p256OrdSqr(res, in *p256OrdElement, n int)
        //
        // Applies sm2p256OrdSqrInternal repeatedly, feeding each result back in as
        // the next operand, and stores the final T1||T0 to res.
        // The loop tests its counter after the body, so the squaring runs at least
        // once even when n <= 0; for n >= 1 it runs n times. The counter and n are
        // compared as 32-bit values (ADDW/CMPW).
   927  #define res_ptr R1
   928  #define x_ptr R2
   929  #define COUNT   R5
   930  #define N       R6
   931  #define X0    V0
   932  #define X1    V1
   933  #define M0    V5
   934  #define M1    V4
   935  #define T0    V6
   936  #define T1    V7
   937  #define K0    V31
   938  TEXT ·p256OrdSqr(SB), NOSPLIT, $0
   939  	MOVD res+0(FP), res_ptr
   940  	MOVD in+8(FP), x_ptr
   941  	MOVD n+16(FP), N
   942  
   943  	MOVD $0, COUNT
   944  
   945  	MOVD  $p256ordK0<>+0x00(SB), R4
   946  
        	// Load the 32-bit constant into word element 3 of K0; other elements are not set here.
   947  	VLEF    $3, 0(R4), K0
   948  	//WORD $0xE7F40000
   949  	//BYTE $0x38
   950  	//BYTE $0x03
        	// M1 (V4) = first 16 bytes of p256ord, M0 (V5) = last 16 bytes
   951  	MOVD $p256ord<>+0x00(SB), R4
   952  	VLM (R4), M1, M0
   953  
        	// Load the operand and swap the doublewords inside each 16-byte half.
   954  	VLM   (x_ptr), X0, X1
   955  	VPDI $0x4, X0, X0, X0
   956  	VPDI $0x4, X1, X1, X1
   957  
   958  loop:
   959  	CALL sm2p256OrdSqrInternal<>(SB)
        	// Feed the result back as the operand of the next squaring.
   960  	VLR  T0, X0
   961  	VLR  T1, X1
   962  	ADDW $1, COUNT
   963  	CMPW COUNT, N
   964  	BLT  loop
   965  
        	// Swap doublewords back into the memory layout and store 32 bytes.
   966  	VPDI $0x4, T0, T0, T0
   967  	VPDI $0x4, T1, T1, T1
   968  	VSTM  T0, T1, (res_ptr)
   969  	
   970  	RET
   971  
   972  #undef res_ptr
   973  #undef x_ptr
   974  #undef COUNT
   975  #undef N
   976  #undef X0
   977  #undef X1
   978  #undef M0
   979  #undef M1
   980  #undef T0
   981  #undef T1
   982  #undef K0
   983  
   984  // ---------------------------------------
   985  // sm2p256MulInternal
   986  // V0-V3,V30,V31 - Not Modified
   987  // V4-V14 - Volatile
   988  
   989  #define CPOOL   R4
   990  
   991  // Parameters
   992  #define X0    V0 // Not modified
   993  #define X1    V1 // Not modified
   994  #define Y0    V2 // Not modified
   995  #define Y1    V3 // Not modified
   996  #define T0    V4
   997  #define T1    V5
   998  #define P0    V31 // Not modified
   999  #define P1    V30 // Not modified
  1000  
  1001  // Temporaries
  1002  #define YDIG  V6 // Overloaded with CAR2, ZER
  1003  #define ADD1H V7 // Overloaded with ADD3H
  1004  #define ADD2H V8 // Overloaded with ADD4H
  1005  #define ADD3  V9 // Overloaded with SEL2,SEL5
  1006  #define ADD4  V10 // Overloaded with SEL3,SEL6
  1007  #define RED1  V11 // Overloaded with CAR2
  1008  #define RED2  V12
  1009  #define RED3  V13 // Overloaded with SEL1
  1010  #define T2    V14
  1011  // Overloaded temporaries
  1012  #define ADD1  V4 // Overloaded with T0
  1013  #define ADD2  V5 // Overloaded with T1
  1014  #define ADD3H V7 // Overloaded with ADD1H
  1015  #define ADD4H V8 // Overloaded with ADD2H
  1016  #define ZER   V6 // Overloaded with YDIG, CAR2
  1017  #define CAR1  V6 // Overloaded with YDIG, ZER
  1018  #define CAR2  V11 // Overloaded with RED1
  1019  // Constant Selects
  1020  #define SEL1  V13 // Overloaded with RED3
  1021  #define SEL2  V9 // Overloaded with ADD3,SEL5
  1022  #define SEL3  V10 // Overloaded with ADD4,SEL6
  1023  #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
  1024  #define SEL5  V9 // Overloaded with ADD3,SEL2
  1025  #define SEL6  V10 // Overloaded with ADD4,SEL3
  1026  
  1027  TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-0
  1028  	// ---------------------------------------------------
  1029  
  1030  	VREPF $3, Y0, YDIG
  1031  	VMLHF X0, YDIG, ADD1H
  1032  	VMLHF X1, YDIG, ADD2H
  1033  	VMLF  X0, YDIG, ADD1
  1034  	VMLF  X1, YDIG, ADD2
  1035  
  1036  	VREPF  $2, Y0, YDIG
  1037  	VMALF  X0, YDIG, ADD1H, ADD3
  1038  	VMALF  X1, YDIG, ADD2H, ADD4
  1039  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1040  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1041  
  1042  	VZERO ZER
  1043  	VL    32(CPOOL), SEL1
  1044  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1045  
  1046  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1047  	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1048  
  1049  	VACCQ  T0, ADD3, CAR1
  1050  	VAQ    T0, ADD3, T0       // ADD3 Free
  1051  	VACCCQ T1, ADD4, CAR1, T2
  1052  	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1053  
  1054  	VL    48(CPOOL), SEL2
  1055  	VPERM RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
  1056  	VSLDB $4, RED1, ZER, RED3  // [ 0 d1 d0  0]
  1057  	VSLDB $4, RED3, ZER, RED2  // [d1 d0  0  0]
  1058  	VSCBIQ  RED3, RED1, CAR1
  1059  	VSQ	 RED3, RED1, RED1
  1060  	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
  1061  
  1062  	VSLDB $12, T1, T0, T0
  1063  	VSLDB $12, T2, T1, T1 // T2 Free
  1064  
  1065  	VACCQ  T0, ADD3H, CAR1
  1066  	VAQ    T0, ADD3H, T0
  1067  	VACCCQ T1, ADD4H, CAR1, T2
  1068  	VACQ   T1, ADD4H, CAR1, T1
  1069  
  1070  	VACCQ  T0, RED1, CAR1
  1071  	VAQ    T0, RED1, T0
  1072  	VACCCQ T1, RED2, CAR1, CAR2
  1073  	VACQ   T1, RED2, CAR1, T1
  1074  	VAQ   T2, CAR2, T2
  1075  	// ---------------------------------------------------
  1076  
  1077  	VREPF  $1, Y0, YDIG
  1078  	VMALHF X0, YDIG, T0, ADD1H
  1079  	VMALHF X1, YDIG, T1, ADD2H
  1080  	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1081  	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1082  
  1083  	VREPF  $0, Y0, YDIG
  1084  	VMALF  X0, YDIG, ADD1H, ADD3
  1085  	VMALF  X1, YDIG, ADD2H, ADD4
  1086  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1087  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1088  
  1089  	VZERO ZER
  1090  	VL    32(CPOOL), SEL1
  1091  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1092  
  1093  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1094  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1095  
  1096  	VACCQ  T0, ADD3, CAR1
  1097  	VAQ    T0, ADD3, T0
  1098  	VACCCQ T1, ADD4, CAR1, T2
  1099  	VACQ   T1, ADD4, CAR1, T1
  1100  
  1101  	VL    48(CPOOL), SEL2
  1102  	VPERM RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
  1103  	VSLDB $4, RED1, ZER, RED3  // [ 0 d1 d0  0]
  1104  	VSLDB $4, RED3, ZER, RED2  // [d1 d0  0  0]
  1105  	VSCBIQ  RED3, RED1, CAR1
  1106  	VSQ	 RED3, RED1, RED1
  1107  	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
  1108  
  1109  	VSLDB $12, T1, T0, T0
  1110  	VSLDB $12, T2, T1, T1
  1111  
  1112  	VACCQ  T0, ADD3H, CAR1
  1113  	VAQ    T0, ADD3H, T0
  1114  	VACCCQ T1, ADD4H, CAR1, T2
  1115  	VACQ   T1, ADD4H, CAR1, T1
  1116  
  1117  	VACCQ  T0, RED1, CAR1
  1118  	VAQ    T0, RED1, T0
  1119  	VACCCQ T1, RED2, CAR1, CAR2
  1120  	VACQ   T1, RED2, CAR1, T1
  1121  	VAQ    T2, CAR2, T2
  1122  	// ---------------------------------------------------
  1123  
  1124  	VREPF  $3, Y1, YDIG
  1125  	VMALHF X0, YDIG, T0, ADD1H
  1126  	VMALHF X1, YDIG, T1, ADD2H
  1127  	VMALF  X0, YDIG, T0, ADD1
  1128  	VMALF  X1, YDIG, T1, ADD2
  1129  
  1130  	VREPF  $2, Y1, YDIG
  1131  	VMALF  X0, YDIG, ADD1H, ADD3
  1132  	VMALF  X1, YDIG, ADD2H, ADD4
  1133  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1134  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1135  
  1136  	VZERO ZER
  1137  	VL    32(CPOOL), SEL1
  1138  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1139  
  1140  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1141  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1142  
  1143  	VACCQ  T0, ADD3, CAR1
  1144  	VAQ    T0, ADD3, T0
  1145  	VACCCQ T1, ADD4, CAR1, T2
  1146  	VACQ   T1, ADD4, CAR1, T1
  1147  
  1148  	VL    48(CPOOL), SEL2
  1149  	VPERM RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
  1150  	VSLDB $4, RED1, ZER, RED3  // [ 0 d1 d0  0]
  1151  	VSLDB $4, RED3, ZER, RED2  // [d1 d0  0  0]
  1152  	VSCBIQ  RED3, RED1, CAR1
  1153  	VSQ	 RED3, RED1, RED1
  1154  	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
  1155  
  1156  	VSLDB $12, T1, T0, T0
  1157  	VSLDB $12, T2, T1, T1
  1158  
  1159  	VACCQ  T0, ADD3H, CAR1
  1160  	VAQ    T0, ADD3H, T0
  1161  	VACCCQ T1, ADD4H, CAR1, T2
  1162  	VACQ   T1, ADD4H, CAR1, T1
  1163  
  1164  	VACCQ  T0, RED1, CAR1
  1165  	VAQ    T0, RED1, T0
  1166  	VACCCQ T1, RED2, CAR1, CAR2
  1167  	VACQ   T1, RED2, CAR1, T1
  1168  	VAQ    T2, CAR2, T2
  1169  	// ---------------------------------------------------
  1170  
  1171  	VREPF  $1, Y1, YDIG
  1172  	VMALHF X0, YDIG, T0, ADD1H
  1173  	VMALHF X1, YDIG, T1, ADD2H
  1174  	VMALF  X0, YDIG, T0, ADD1
  1175  	VMALF  X1, YDIG, T1, ADD2
  1176  
  1177  	VREPF  $0, Y1, YDIG
  1178  	VMALF  X0, YDIG, ADD1H, ADD3
  1179  	VMALF  X1, YDIG, ADD2H, ADD4
  1180  	VMALHF X0, YDIG, ADD1H, ADD3H
  1181  	VMALHF X1, YDIG, ADD2H, ADD4H
  1182  
  1183  	VZERO ZER
  1184  	VL    32(CPOOL), SEL1
  1185  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1186  
  1187  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1188  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1189  
  1190  	VACCQ  T0, ADD3, CAR1
  1191  	VAQ    T0, ADD3, T0
  1192  	VACCCQ T1, ADD4, CAR1, T2
  1193  	VACQ   T1, ADD4, CAR1, T1
  1194  
  1195  	VL    48(CPOOL), SEL2
  1196  	VPERM RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
  1197  	VSLDB $4, RED1, ZER, RED3  // [ 0 d1 d0  0]
  1198  	VSLDB $4, RED3, ZER, RED2  // [d1 d0  0  0]
  1199  	VSCBIQ  RED3, RED1, CAR1
  1200  	VSQ	 RED3, RED1, RED1
  1201  	VSBIQ  RED2, RED3, CAR1, RED2 // Guaranteed not to underflow
  1202  
  1203  	VSLDB $12, T1, T0, T0
  1204  	VSLDB $12, T2, T1, T1
  1205  
  1206  	VACCQ  T0, ADD3H, CAR1
  1207  	VAQ    T0, ADD3H, T0
  1208  	VACCCQ T1, ADD4H, CAR1, T2
  1209  	VACQ   T1, ADD4H, CAR1, T1
  1210  
  1211  	VACCQ  T0, RED1, CAR1
  1212  	VAQ    T0, RED1, T0
  1213  	VACCCQ T1, RED2, CAR1, CAR2
  1214  	VACQ   T1, RED2, CAR1, T1
  1215  	VAQ    T2, CAR2, T2
  1216  
  1217  	// ---------------------------------------------------
  1218  
  1219  	VZERO   RED3
  1220  	VSCBIQ  P0, T0, CAR1
  1221  	VSQ     P0, T0, ADD1H
  1222  	VSBCBIQ T1, P1, CAR1, CAR2
  1223  	VSBIQ   T1, P1, CAR1, ADD2H
  1224  	VSBIQ   T2, RED3, CAR2, T2
  1225  
  1226  	// what output to use, ADD2H||ADD1H or T1||T0?
  1227  	VSEL T0, ADD1H, T2, T0
  1228  	VSEL T1, ADD2H, T2, T1
  1229  	RET
  1230  
  1231  #undef CPOOL
  1232  
  1233  #undef X0
  1234  #undef X1
  1235  #undef Y0
  1236  #undef Y1
  1237  #undef T0
  1238  #undef T1
  1239  #undef P0
  1240  #undef P1
  1241  
  1242  #undef SEL1
  1243  #undef SEL2
  1244  #undef SEL3
  1245  #undef SEL4
  1246  #undef SEL5
  1247  #undef SEL6
  1248  
  1249  #undef YDIG
  1250  #undef ADD1H
  1251  #undef ADD2H
  1252  #undef ADD3
  1253  #undef ADD4
  1254  #undef RED1
  1255  #undef RED2
  1256  #undef RED3
  1257  #undef T2
  1258  #undef ADD1
  1259  #undef ADD2
  1260  #undef ADD3H
  1261  #undef ADD4H
  1262  #undef ZER
  1263  #undef CAR1
  1264  #undef CAR2
  1265  
  1266  // ---------------------------------------
  1267  
  1268  // Parameters: sm2p256SqrInternal squares the 256-bit value held in the register pair X1||X0.
  1269  #define X0    V0 // operand, low register of the pair
  1270  #define X1    V1 // operand, high register of the pair
  1271  #define Y0    V2 // second multiplier operand, overwritten with a copy of X0
  1272  #define Y1    V3 // second multiplier operand, overwritten with a copy of X1
  1273  // Copies X into Y and tail-branches (BR, not CALL) into sm2p256MulInternal, so that routine's RET returns straight to our caller.
  1274  TEXT sm2p256SqrInternal<>(SB), NOFRAME|NOSPLIT, $0
  1275  	VLR X0, Y0 // Y0 = X0
  1276  	VLR X1, Y1 // Y1 = X1
  1277  	BR  sm2p256MulInternal<>(SB) // X*Y with Y == X; callers in this file read the result from V4/V5 (their T0/T1)
  1278  
  1279  #undef X0
  1280  #undef X1
  1281  #undef Y0
  1282  #undef Y1
  1283  
  1284  #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ // T1||T0 = X1||X0 - Y1||Y0, with PH||PL added back when the subtraction borrows; outputs may alias either input pair (call sites in this file do both)
  1285  	VZERO   ZER                \ // clobbers ZER, CAR1, SEL1, TT0, TT1 (aliases supplied by the including function)
  1286  	VSCBIQ  Y0, X0, CAR1       \ // borrow indication of the low halves
  1287  	VSQ     Y0, X0, T0         \ // T0 = X0 - Y0
  1288  	VSBCBIQ X1, Y1, CAR1, SEL1 \ // borrow indication of the full 256-bit subtraction
  1289  	VSBIQ   X1, Y1, CAR1, T1   \ // T1 = X1 - Y1 - low borrow
  1290  	VSQ     SEL1, ZER, SEL1    \ // SEL1 = 0 - SEL1: widens the one-bit indication into a select mask
  1291  	                           \ // TT1||TT0 = T1||T0 + PH||PL; the carry out of the high half is not kept
  1292  	VACCQ   T0, PL, CAR1       \
  1293  	VAQ     T0, PL, TT0        \
  1294  	VACQ    T1, PH, CAR1, TT1  \
  1295  	                           \ // branch-free pick between the raw difference (T) and the corrected one (TT)
  1296  	VSEL    T0, TT0, SEL1, T0  \
  1297  	VSEL    T1, TT1, SEL1, T1  \
  1298  
  1299  #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ // T1||T0 = X1||X0 + Y1||Y0, with PH||PL subtracted once when needed; assumes both inputs are already below PH||PL — TODO confirm with callers
  1300  	VACCQ   X0, Y0, CAR1        \ // carry out of the low halves
  1301  	VAQ     X0, Y0, T0          \ // T0 = X0 + Y0
  1302  	VACCCQ  X1, Y1, CAR1, T2    \ // T2 = carry out of the 256-bit sum (T2 is the including function's scratch alias, not a macro argument)
  1303  	VACQ    X1, Y1, CAR1, T1    \ // T1 = X1 + Y1 + low carry
  1304  	                            \ // TT1||TT0 = T1||T0 - PH||PL
  1305  	VZERO   ZER                 \
  1306  	VSCBIQ  PL, T0, CAR1        \
  1307  	VSQ     PL, T0, TT0         \
  1308  	VSBCBIQ T1, PH, CAR1, CAR2  \
  1309  	VSBIQ   T1, PH, CAR1, TT1   \
  1310  	VSBIQ   T2, ZER, CAR2, SEL1 \ // SEL1 = T2 - 0 with CAR2 as borrow-in: folds the sum's carry and the subtraction's borrow into the select mask
  1311  	                            \ // branch-free pick between the raw sum (T) and the reduced sum (TT)
  1312  	VSEL    T0, TT0, SEL1, T0   \
  1313  	VSEL    T1, TT1, SEL1, T1
  1314  
  1315  #define p256HalfInternal(T1, T0, X1, X0) \ // T1||T0 = (X1||X0 or X1||X0 + PH||PL) >> 1, the choice driven by the lowest bit of X0
  1316  	VZERO  ZER                \ // clobbers ZER, SEL1, CAR1, T2, TT0, TT1
  1317  	VSBIQ  ZER, ZER, X0, SEL1 \ // 0 - 0 with X0 as the borrow-in operand: turns X0's lowest bit into a full-width select mask
  1318  	                          \ // T2:T1||T0 = X1||X0 + PH||PL, T2 holding the carry out
  1319  	VACCQ  X0, PL, CAR1       \
  1320  	VAQ    X0, PL, T0         \
  1321  	VACCCQ X1, PH, CAR1, T2   \
  1322  	VACQ   X1, PH, CAR1, T1   \
  1323  	                          \ // keep X unchanged (with a zero carry) or take X + P (with its carry)
  1324  	VSEL   X0, T0, SEL1, T0   \
  1325  	VSEL   X1, T1, SEL1, T1   \
  1326  	VSEL   ZER, T2, SEL1, T2  \
  1327  	                          \ // 257-bit right shift by one bit across T2:T1:T0
  1328  	VSLDB  $15, T2, ZER, TT1  \ // lowest byte of T2 moved to the top byte of TT1
  1329  	VSLDB  $15, T1, ZER, TT0  \ // lowest byte of T1 moved to the top byte of TT0
  1330  	VREPIB $1, SEL1           \ // shift count 1
  1331  	VSRL   SEL1, T0, T0       \ // T0 >>= 1
  1332  	VSRL   SEL1, T1, T1       \ // T1 >>= 1
  1333  	VREPIB $7, SEL1           \ // shift count 7
  1334  	VSL    SEL1, TT0, TT0     \ // lowest bit of the old T1 now sits in the top bit of TT0
  1335  	VSL    SEL1, TT1, TT1     \ // lowest bit of T2 now sits in the top bit of TT1
  1336  	VO     T0, TT0, T0        \ // merge the bit shifted in from above
  1337  	VO     T1, TT1, T1
  1338  
  1339  // ---------------------------------------
  1340  // func p256Mul(res, in1, in2 *p256Element)
  1341  #define res_ptr R1 // res
  1342  #define x_ptr   R2 // in1
  1343  #define y_ptr   R3 // in2
  1344  #define CPOOL   R4 // address of p256mul<>
  1345  // p256Mul loads *in1 and *in2, runs sm2p256MulInternal on them and stores the result to *res.
  1346  // Parameters
  1347  #define X0    V0
  1348  #define X1    V1
  1349  #define Y0    V2
  1350  #define Y1    V3
  1351  #define T0    V4
  1352  #define T1    V5
  1353  // NOTE(review): the (1*2^256)%P256 pool constant suggests operands are in Montgomery form — confirm against sm2p256MulInternal.
  1354  // Constants
  1355  #define P0    V31
  1356  #define P1    V30
  1357  TEXT ·p256Mul(SB), NOSPLIT, $0
  1358  	MOVD res+0(FP), res_ptr
  1359  	MOVD in1+8(FP), x_ptr
  1360  	MOVD in2+16(FP), y_ptr
  1361  
  1362  	VLM   (x_ptr), X0, X1 // X0 = bytes 0..15, X1 = bytes 16..31 of *in1
  1363  	VPDI $0x4, X0, X0, X0 // swap the two doublewords; presumably the element stores 64-bit limbs least-significant first — verify against p256Element
  1364  	VPDI $0x4, X1, X1, X1
  1365  	VLM   (y_ptr), Y0, Y1 // same load-and-swap for *in2
  1366  	VPDI $0x4, Y0, Y0, Y0
  1367  	VPDI $0x4, Y1, Y1, Y1
  1368  
  1369  	MOVD $p256mul<>+0x00(SB), CPOOL
  1370  	VLM   (CPOOL), P1, P0 // V30 (P1) = first 16 bytes of the pool, V31 (P0) = next 16 bytes: the modulus
  1371  
  1372  	CALL sm2p256MulInternal<>(SB) // result is read back from T1||T0
  1373  
  1374  	VPDI $0x4, T0, T0, T0 // swap doublewords back into the in-memory order
  1375  	VPDI $0x4, T1, T1, T1
  1376  	VSTM  T0, T1, (res_ptr) // stored only after both inputs were loaded
  1377  	RET
  1378  
  1379  #undef res_ptr
  1380  #undef x_ptr
  1381  #undef y_ptr
  1382  #undef CPOOL
  1383  
  1384  #undef X0
  1385  #undef X1
  1386  #undef Y0
  1387  #undef Y1
  1388  #undef T0
  1389  #undef T1
  1390  #undef P0
  1391  #undef P1
  1392  
  1393  // ---------------------------------------
  1394  //  func p256Sqr(res, in *p256Element, n int)
  1395  #define res_ptr R1 // res
  1396  #define x_ptr   R2 // in
  1397  #define y_ptr   R3 // defined for symmetry with p256Mul; not referenced in this function
  1398  #define CPOOL   R4 // address of p256mul<>
  1399  #define COUNT   R5 // number of squarings done so far
  1400  #define N       R6 // n
  1401  // p256Sqr loads *in, applies sm2p256SqrInternal repeatedly and stores the final value to *res.
  1402  // Parameters
  1403  #define X0    V0
  1404  #define X1    V1
  1405  #define T0    V4
  1406  #define T1    V5
  1407  
  1408  // Constants
  1409  #define P0    V31
  1410  #define P1    V30
  1411  TEXT ·p256Sqr(SB), NOSPLIT, $0
  1412  	MOVD res+0(FP), res_ptr
  1413  	MOVD in+8(FP), x_ptr
  1414  
  1415  	VLM   (x_ptr), X0, X1 // X0 = bytes 0..15, X1 = bytes 16..31 of *in
  1416  	VPDI $0x4, X0, X0, X0 // swap the two doublewords of each register after the load
  1417  	VPDI $0x4, X1, X1, X1
  1418  
  1419  	MOVD $p256mul<>+0x00(SB), CPOOL
  1420  	MOVD $0, COUNT
  1421  	MOVD n+16(FP), N
  1422  	VLM   (CPOOL), P1, P0 // V30 (P1) and V31 (P0) = first 32 bytes of the pool: the modulus
  1423  
  1424  loop: // the body runs before the count is tested, so at least one squaring happens even when n <= 0
  1425  	CALL sm2p256SqrInternal<>(SB)
  1426  	VLR  T0, X0 // feed the result back in as the next operand
  1427  	VLR  T1, X1
  1428  	ADDW $1, COUNT
  1429  	CMPW COUNT, N // word-sized compare: only the low 32 bits of n take part
  1430  	BLT  loop
  1431  
  1432  	VPDI $0x4, T0, T0, T0 // swap doublewords back into the in-memory order
  1433  	VPDI $0x4, T1, T1, T1
  1434  	VSTM  T0, T1, (res_ptr)
  1435  	RET
  1436  
  1437  #undef res_ptr
  1438  #undef x_ptr
  1439  #undef y_ptr
  1440  #undef CPOOL
  1441  #undef COUNT
  1442  #undef N
  1443  
  1444  #undef X0
  1445  #undef X1
  1446  #undef T0
  1447  #undef T1
  1448  #undef P0
  1449  #undef P1
  1450  
  1451  // Point add with P2 being affine point
  1452  // If sign == 1 -> P2 = -P2 (the code negates Y2 for any non-zero sign)
  1453  // If sel == 0 -> P3 = P1
  1454  // if zero == 0 -> P3 = P2 with Z3 taken from the constant pool; tested after sel, so it wins when both are 0
  1455  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1456  #define P3ptr   R1 // res
  1457  #define P1ptr   R2 // in1: X at +0, Y at +32, Z at +64
  1458  #define P2ptr   R3 // in2: X at +0, Y at +32
  1459  #define CPOOL   R4 // address of p256mul<>
  1460  
  1461  // Temporaries in REGs
  1462  #define Y2L    V15
  1463  #define Y2H    V16
  1464  #define T1L    V17
  1465  #define T1H    V18
  1466  #define T2L    V19
  1467  #define T2H    V20
  1468  #define T3L    V21
  1469  #define T3H    V22
  1470  #define T4L    V23
  1471  #define T4H    V24
  1472  
  1473  // Temps for Sub and Add
  1474  #define TT0  V11
  1475  #define TT1  V12
  1476  #define T2   V13
  1477  
  1478  // p256MulAsm Parameters
  1479  #define X0    V0
  1480  #define X1    V1
  1481  #define Y0    V2
  1482  #define Y1    V3
  1483  #define T0    V4
  1484  #define T1    V5
  1485  
  1486  #define PL    V31
  1487  #define PH    V30
  1488  
  1489  // Names for zero/sel selects (these alias the multiplier registers V0..V5 defined above)
  1490  #define X1L    V0
  1491  #define X1H    V1
  1492  #define Y1L    V2 // p256MulAsmParmY
  1493  #define Y1H    V3 // p256MulAsmParmY
  1494  #define Z1L    V4
  1495  #define Z1H    V5
  1496  #define X2L    V0
  1497  #define X2H    V1
  1498  #define Z2L    V4
  1499  #define Z2H    V5
  1500  #define X3L    V17 // T1L
  1501  #define X3H    V18 // T1H
  1502  #define Y3L    V21 // T3L
  1503  #define Y3H    V22 // T3H
  1504  #define Z3L    V28
  1505  #define Z3H    V29
  1506  
  1507  #define ZER   V6
  1508  #define SEL1  V7
  1509  #define CAR1  V8
  1510  #define CAR2  V9
  1511  /* *
  1512   * Three operand formula:
  1513   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1514   * T1 = Z1²
  1515   * T2 = T1*Z1
  1516   * T1 = T1*X2
  1517   * T2 = T2*Y2
  1518   * T1 = T1-X1
  1519   * T2 = T2-Y1
  1520   * Z3 = Z1*T1
  1521   * T3 = T1²
  1522   * T4 = T3*T1
  1523   * T3 = T3*X1
  1524   * T1 = 2*T3
  1525   * X3 = T2²
  1526   * X3 = X3-T1
  1527   * X3 = X3-T4
  1528   * T3 = T3-X3
  1529   * T3 = T3*T2
  1530   * T4 = T4*Y1
  1531   * Y3 = T3-T4
  1532  
  1533   * Three operand formulas, but with MulInternal X,Y used to store temps
  1534  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1535  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1536  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1537  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1538  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1539  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1540  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1541  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1542  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1543  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1544  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1545  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1546  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1547  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1548  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1549  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1550  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1551  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1552  
  1553  	*/
  1554  TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 // every load from in1/in2 happens before the stores to res at the very end
  1555  	MOVD res+0(FP), P3ptr
  1556  	MOVD in1+8(FP), P1ptr
  1557  	MOVD in2+16(FP), P2ptr
  1558  
  1559  	MOVD $p256mul<>+0x00(SB), CPOOL
  1560  	VL   16(CPOOL), PL // PL = pool bytes 16..31 (low half of the modulus)
  1561  	VL   0(CPOOL), PH // PH = pool bytes 0..15 (high half of the modulus)
  1562  
  1563  	//	if (sign == 1) {
  1564  	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1565  	//	}
  1566  
  1567  	VL   48(P2ptr), Y2H
  1568  	VPDI $0x4, Y2H, Y2H, Y2H // swap the two doublewords after the load; the same pattern follows every operand load below
  1569  	VL   32(P2ptr), Y2L
  1570  	VPDI $0x4, Y2L, Y2L, Y2L
  1571  
  1572  	VLREPG sign+24(FP), SEL1 // replicate sign into both doublewords
  1573  	VZERO  ZER
  1574  	VCEQG  SEL1, ZER, SEL1 // SEL1 = mask set where sign == 0
  1575  
  1576  	VSCBIQ Y2L, PL, CAR1 // T1H||T1L = P - Y2
  1577  	VSQ    Y2L, PL, T1L
  1578  	VSBIQ  PH, Y2H, CAR1, T1H
  1579  
  1580  	VSEL Y2L, T1L, SEL1, Y2L // keep Y2 when sign == 0, otherwise take P - Y2 (no branch on sign)
  1581  	VSEL Y2H, T1H, SEL1, Y2H
  1582  
  1583  /* *
  1584   * Three operand formula:
  1585   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1586   */
  1587  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1588  	VL   80(P1ptr), X1       // Z1H
  1589  	VPDI $0x4, X1, X1, X1
  1590  	VL   64(P1ptr), X0       // Z1L
  1591  	VPDI $0x4, X0, X0, X0
  1592  	VLR  X0, Y0 // sm2p256SqrInternal performs this same X -> Y copy itself
  1593  	VLR  X1, Y1
  1594  	CALL sm2p256SqrInternal<>(SB)
  1595  
  1596  	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1597  	VLR  T0, X0
  1598  	VLR  T1, X1
  1599  	CALL sm2p256MulInternal<>(SB)
  1600  	VLR  T0, T2L
  1601  	VLR  T1, T2H
  1602  
  1603  	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1604  	VL   16(P2ptr), Y1       // X2H
  1605  	VPDI $0x4, Y1, Y1, Y1
  1606  	VL   0(P2ptr), Y0        // X2L
  1607  	VPDI $0x4, Y0, Y0, Y0
  1608  	CALL sm2p256MulInternal<>(SB)
  1609  	VLR  T0, T1L
  1610  	VLR  T1, T1H
  1611  
  1612  	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1613  	VLR  T2L, X0
  1614  	VLR  T2H, X1
  1615  	VLR  Y2L, Y0
  1616  	VLR  Y2H, Y1
  1617  	CALL sm2p256MulInternal<>(SB)
  1618  
  1619  	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1620  	VL   48(P1ptr), Y1H
  1621  	VPDI $0x4, Y1H, Y1H, Y1H
  1622  	VL   32(P1ptr), Y1L
  1623  	VPDI $0x4, Y1L, Y1L, Y1L
  1624  	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1625  
  1626  	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1627  	VL   16(P1ptr), X1H
  1628  	VPDI $0x4, X1H, X1H, X1H
  1629  	VL   0(P1ptr), X1L
  1630  	VPDI $0x4, X1L, X1L, X1L
  1631  	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1632  
  1633  	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1634  	VL   80(P1ptr), X1       // Z1H
  1635  	VPDI $0x4, X1, X1, X1
  1636  	VL   64(P1ptr), X0       // Z1L
  1637  	VPDI $0x4, X0, X0, X0
  1638  	CALL sm2p256MulInternal<>(SB)
  1639  
  1640  	// VST T1, 64(P3ptr)
  1641  	// VST T0, 80(P3ptr)
  1642  	VLR T0, Z3L // Z3 is parked in V28/V29 and stored with the other coordinates at the end
  1643  	VLR T1, Z3H
  1644  
  1645  	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1646  	VLR  Y0, X0
  1647  	VLR  Y1, X1
  1648  	CALL sm2p256SqrInternal<>(SB)
  1649  	VLR  T0, X0
  1650  	VLR  T1, X1
  1651  
  1652  	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1653  	CALL sm2p256MulInternal<>(SB)
  1654  	VLR  T0, T4L
  1655  	VLR  T1, T4H
  1656  
  1657  	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1658  	VL   16(P1ptr), Y1       // X1H
  1659  	VPDI $0x4, Y1, Y1, Y1
  1660  	VL   0(P1ptr), Y0        // X1L
  1661  	VPDI $0x4, Y0, Y0, Y0
  1662  	CALL sm2p256MulInternal<>(SB)
  1663  	VLR  T0, T3L
  1664  	VLR  T1, T3H
  1665  
  1666  	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1667  	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1668  
  1669  	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1670  	VLR  T2L, X0
  1671  	VLR  T2H, X1
  1672  	VLR  T2L, Y0
  1673  	VLR  T2H, Y1
  1674  	CALL sm2p256SqrInternal<>(SB)
  1675  
  1676  	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1677  	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1678  
  1679  	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1680  	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1681  	VLR T0, X3L
  1682  	VLR T1, X3H
  1683  
  1684  	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1685  	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1686  
  1687  	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1688  	CALL sm2p256MulInternal<>(SB)
  1689  	VLR  T0, T3L
  1690  	VLR  T1, T3H
  1691  
  1692  	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1693  	VLR  T4L, X0
  1694  	VLR  T4H, X1
  1695  	VL   48(P1ptr), Y1       // Y1H
  1696  	VPDI $0x4, Y1, Y1, Y1
  1697  	VL   32(P1ptr), Y0       // Y1L
  1698  	VPDI $0x4, Y0, Y0, Y0
  1699  	CALL sm2p256MulInternal<>(SB)
  1700  
  1701  	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  1702  	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  1703  
  1704  	//	if (sel == 0) {
  1705  	//		copy(P3.x[:], X1)
  1706  	//		copy(P3.y[:], Y1)
  1707  	//		copy(P3.z[:], Z1)
  1708  	//	}
  1709  
  1710  	VL   16(P1ptr), X1H
  1711  	VPDI $0x4, X1H, X1H, X1H
  1712  	VL   0(P1ptr), X1L
  1713  	VPDI $0x4, X1L, X1L, X1L
  1714  
  1715  	// Y1 already loaded, left over from addition (Y1L/Y1H are V2/V3, filled just before the last multiplication; this relies on sm2p256MulInternal leaving them intact — TODO confirm)
  1716  	VL   80(P1ptr), Z1H
  1717  	VPDI $0x4, Z1H, Z1H, Z1H
  1718  	VL   64(P1ptr), Z1L
  1719  	VPDI $0x4, Z1L, Z1L, Z1L
  1720  
  1721  	VLREPG sel+32(FP), SEL1
  1722  	VZERO  ZER
  1723  	VCEQG  SEL1, ZER, SEL1 // SEL1 = mask set where sel == 0
  1724  
  1725  	VSEL X1L, X3L, SEL1, X3L // sel == 0: replace the computed point with P1, otherwise keep it
  1726  	VSEL X1H, X3H, SEL1, X3H
  1727  	VSEL Y1L, Y3L, SEL1, Y3L
  1728  	VSEL Y1H, Y3H, SEL1, Y3H
  1729  	VSEL Z1L, Z3L, SEL1, Z3L
  1730  	VSEL Z1H, Z3H, SEL1, Z3H
  1731  
  1732  	//	if (zero == 0) {
  1733  	//		copy(P3.x[:], X2)
  1734  	//		copy(P3.y[:], Y2)
  1735  	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
  1736  	//			0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p for the SM2 prime, most significant byte first; the code loads it from p256mul<>+0x40
  1737  	//	}
  1738  	VL   16(P2ptr), X2H
  1739  	VPDI $0x4, X2H, X2H, X2H
  1740  	VL   0(P2ptr), X2L
  1741  	VPDI $0x4, X2L, X2L, X2L
  1742  
  1743  	// Y2 already loaded (V15/V16, possibly negated by the sign step above)
  1744  	VL 64(CPOOL), Z2H // pool constant, loaded without a doubleword swap
  1745  	VL 80(CPOOL), Z2L
  1746  
  1747  	VLREPG zero+40(FP), SEL1
  1748  	VZERO  ZER
  1749  	VCEQG  SEL1, ZER, SEL1 // SEL1 = mask set where zero == 0
  1750  
  1751  	VSEL X2L, X3L, SEL1, X3L // zero == 0: replace the result with (X2, Y2, pool constant), otherwise keep it
  1752  	VSEL X2H, X3H, SEL1, X3H
  1753  	VSEL Y2L, Y3L, SEL1, Y3L
  1754  	VSEL Y2H, Y3H, SEL1, Y3H
  1755  	VSEL Z2L, Z3L, SEL1, Z3L
  1756  	VSEL Z2H, Z3H, SEL1, Z3H
  1757  
  1758  	// All done, store out the result!!!
  1759  	VPDI $0x4, X3H, X3H, X3H // swap doublewords back into the in-memory order before each store
  1760  	VST  X3H, 16(P3ptr)
  1761  	VPDI $0x4, X3L, X3L, X3L
  1762  	VST  X3L, 0(P3ptr)
  1763  	VPDI $0x4, Y3H, Y3H, Y3H
  1764  	VST  Y3H, 48(P3ptr)
  1765  	VPDI $0x4, Y3L, Y3L, Y3L
  1766  	VST  Y3L, 32(P3ptr)
  1767  	VPDI $0x4, Z3H, Z3H, Z3H
  1768  	VST  Z3H, 80(P3ptr)
  1769  	VPDI $0x4, Z3L, Z3L, Z3L
  1770  	VST  Z3L, 64(P3ptr)
  1771  
  1772  	RET
  1773  
  1774  #undef P3ptr
  1775  #undef P1ptr
  1776  #undef P2ptr
  1777  #undef CPOOL
  1778  
  1779  #undef Y2L
  1780  #undef Y2H
  1781  #undef T1L
  1782  #undef T1H
  1783  #undef T2L
  1784  #undef T2H
  1785  #undef T3L
  1786  #undef T3H
  1787  #undef T4L
  1788  #undef T4H
  1789  
  1790  #undef TT0
  1791  #undef TT1
  1792  #undef T2
  1793  
  1794  #undef X0
  1795  #undef X1
  1796  #undef Y0
  1797  #undef Y1
  1798  #undef T0
  1799  #undef T1
  1800  
  1801  #undef PL
  1802  #undef PH
  1803  
  1804  #undef X1L
  1805  #undef X1H
  1806  #undef Y1L
  1807  #undef Y1H
  1808  #undef Z1L
  1809  #undef Z1H
  1810  #undef X2L
  1811  #undef X2H
  1812  #undef Z2L
  1813  #undef Z2H
  1814  #undef X3L
  1815  #undef X3H
  1816  #undef Y3L
  1817  #undef Y3H
  1818  #undef Z3L
  1819  #undef Z3H
  1820  
  1821  #undef ZER
  1822  #undef SEL1
  1823  #undef CAR1
  1824  #undef CAR2
  1825  
  1826  // func p256PointDoubleAsm(res, in *P256Point)
  1827  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1828  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1829  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  1830  #define P3ptr   R1 // res
  1831  #define P1ptr   R2 // in: X at +0, Y at +32, Z at +64
  1832  #define CPOOL   R4 // address of p256mul<>
  1833  
  1834  // Temporaries in REGs
  1835  #define X3L    V15
  1836  #define X3H    V16
  1837  #define Y3L    V17
  1838  #define Y3H    V18
  1839  #define T1L    V19
  1840  #define T1H    V20
  1841  #define T2L    V21
  1842  #define T2H    V22
  1843  #define T3L    V23
  1844  #define T3H    V24
  1845  
  1846  #define X1L    V6
  1847  #define X1H    V7
  1848  #define Y1L    V8
  1849  #define Y1H    V9
  1850  #define Z1L    V10 // Z1L/Z1H are not referenced by the doubling code below
  1851  #define Z1H    V11 // same register as TT0
  1852  
  1853  // Temps for Sub and Add
  1854  #define TT0  V11
  1855  #define TT1  V12
  1856  #define T2   V13
  1857  
  1858  // p256MulAsm Parameters
  1859  #define X0    V0
  1860  #define X1    V1
  1861  #define Y0    V2
  1862  #define Y1    V3
  1863  #define T0    V4
  1864  #define T1    V5
  1865  
  1866  #define PL    V31
  1867  #define PH    V30
  1868  
  1869  #define Z3L    V23 // Z3L/Z3H share V23/V24 with T3L/T3H and are not referenced by the doubling code below
  1870  #define Z3H    V24
  1871  
  1872  #define ZER   V26
  1873  #define SEL1  V27
  1874  #define CAR1  V28
  1875  #define CAR2  V29
  1876  /*
  1877   * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1878   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1879   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1880   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1881   * 	B  = 2Y₁
  1882   * 	Z₃ = B×Z₁
  1883   * 	C  = B²
  1884   * 	D  = C×X₁
  1885   * 	X₃ = A²-2D
  1886   * 	Y₃ = (D-X₃)×A-C²/2
  1887   *
  1888   * Three-operand formula:
  1889   *       T1 = Z1²
  1890   *       T2 = X1-T1
  1891   *       T1 = X1+T1
  1892   *       T2 = T2*T1
  1893   *       T2 = 3*T2
  1894   *       Y3 = 2*Y1
  1895   *       Z3 = Y3*Z1
  1896   *       Y3 = Y3²
  1897   *       T3 = Y3*X1
  1898   *       Y3 = Y3²
  1899   *       Y3 = half*Y3
  1900   *       X3 = T2²
  1901   *       T1 = 2*T3
  1902   *       X3 = X3-T1
  1903   *       T1 = T3-X3
  1904   *       T1 = T1*T2
  1905   *       Y3 = T1-Y3
  1906   */
  1907  
  1908  #define p256PointDoubleRound(P1ptr, P3ptr) \ // one doubling: reads the point at P1ptr, writes the doubled point to P3ptr; each coordinate is stored only after its last load, so P3ptr may equal P1ptr
  1909  	\ // X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  1910  	VLM  64(P1ptr), X0, X1                 \ // Z1L, Z1H
  1911  	VPDI $0x4, X1, X1, X1                  \               
  1912  	VPDI $0x4, X0, X0, X0                  \
  1913  	VLR  X0, Y0                            \
  1914  	VLR  X1, Y1                            \
  1915  	CALL sm2p256SqrInternal<>(SB)          \
  1916  	\
  1917  	\ // SUB(X<X1-T)            // T2 = X1-T1
  1918  	VLM  (P1ptr), X1L, X1H                 \
  1919  	VPDI $0x4, X1H, X1H, X1H               \
  1920  	VPDI $0x4, X1L, X1L, X1L               \
  1921  	p256SubInternal(X1,X0,X1H,X1L,T1,T0)   \
  1922  	\
  1923  	\ // ADD(Y<X1+T)            // T1 = X1+T1
  1924  	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)   \
  1925  	\
  1926  	\ // X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  1927  	CALL sm2p256MulInternal<>(SB)          \
  1928  	\
  1929  	\ // ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  1930  	p256AddInternal(T2H,T2L,T1,T0,T1,T0)   \
  1931  	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) \
  1932  	\
  1933  	\// ADD(X<Y1+Y1)           // Y3 = 2*Y1
  1934  	VLM  32(P1ptr), Y1L, Y1H               \
  1935  	VPDI $0x4, Y1H, Y1H, Y1H               \
  1936  	VPDI $0x4, Y1L, Y1L, Y1L               \
  1937  	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) \
  1938  	\
  1939  	\// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  1940  	VLM  64(P1ptr), Y0, Y1                 \ // Z1L, Z1H
  1941  	VPDI $0x4, Y1, Y1, Y1                  \
  1942  	VPDI $0x4, Y0, Y0, Y0                  \
  1943  	CALL sm2p256MulInternal<>(SB)          \
  1944  	VPDI $0x4, T1, T1, TT1                 \ // swapped copies go to TT0/TT1 for the store; T0/T1 themselves are left untouched
  1945  	VPDI $0x4, T0, T0, TT0                 \
  1946  	VSTM  TT0, TT1, 64(P3ptr)              \ // Z3 written; Z1 is not read again in this round
  1947  	\
  1948  	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1949  	VLR  X0, Y0                            \
  1950  	VLR  X1, Y1                            \
  1951  	CALL sm2p256SqrInternal<>(SB)          \
  1952  	\
  1953  	\ // X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  1954  	VLR  T0, X0                            \
  1955  	VLR  T1, X1                            \
  1956  	VLM  0(P1ptr), Y0, Y1                  \ // X1L, X1H (last read of X1 in this round)
  1957  	VPDI $0x4, Y1, Y1, Y1                  \
  1958  	VPDI $0x4, Y0, Y0, Y0                  \
  1959  	CALL sm2p256MulInternal<>(SB)          \
  1960  	VLR  T0, T3L                           \
  1961  	VLR  T1, T3H                           \
  1962  	\
  1963  	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  1964  	VLR  X0, Y0                            \
  1965  	VLR  X1, Y1                            \
  1966  	CALL sm2p256SqrInternal<>(SB)          \
  1967  	\
  1968  	\ // HAL(Y3<T)              // Y3 = half*Y3
  1969  	p256HalfInternal(Y3H,Y3L, T1,T0)       \
  1970  	\
  1971  	\ // X=T2; Y=T2; MUL; T-    // X3 = T2²
  1972  	VLR  T2L, X0                           \
  1973  	VLR  T2H, X1                           \
  1974  	VLR  T2L, Y0                           \
  1975  	VLR  T2H, Y1                           \
  1976  	CALL sm2p256SqrInternal<>(SB)          \
  1977  	\
  1978  	\ // ADD(T1<T3+T3)          // T1 = 2*T3
  1979  	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) \
  1980  	\
  1981  	\ // SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  1982  	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) \
  1983  	VPDI $0x4, X3H, X3H, TT1               \ // swapped copies for the store; X3H/X3L keep the unswapped value for the next step
  1984  	VPDI $0x4, X3L, X3L, TT0               \
  1985  	VSTM  TT0, TT1, (P3ptr)                \ // X3 written
  1986  	\
  1987  	\ // SUB(X<T3-X3)           // T1 = T3-X3
  1988  	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) \
  1989  	\
  1990  	\ // X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  1991  	CALL sm2p256MulInternal<>(SB)          \
  1992  	\
  1993  	\ // SUB(Y3<T-Y3)           // Y3 = T1-Y3
  1994  	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) \
  1995  	\
  1996  	VPDI $0x4, Y3H, Y3H, Y3H               \
  1997  	VPDI $0x4, Y3L, Y3L, Y3L               \
  1998  	VSTM  Y3L, Y3H, 32(P3ptr)              \
  1999  
  2000  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 // writes the doubling of the point at in to res, using one p256PointDoubleRound
  2001  	MOVD res+0(FP), P3ptr
  2002  	MOVD in+8(FP), P1ptr
  2003  
  2004  	MOVD $p256mul<>+0x00(SB), CPOOL
  2005  	VLM   (CPOOL), PH, PL // V30 (PH) = pool bytes 0..15, V31 (PL) = pool bytes 16..31: the modulus used by the Add/Sub/Half macros
  2006  
  2007  	p256PointDoubleRound(P1ptr, P3ptr)
  2008  	RET
  2009  
  2010  TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0 // six successive doublings of the point at in, written to res; takes the same res/in arguments as p256PointDoubleAsm
  2011  	MOVD res+0(FP), P3ptr
  2012  	MOVD in+8(FP), P1ptr
  2013  
  2014  	MOVD $p256mul<>+0x00(SB), CPOOL
  2015  	VLM   (CPOOL), PH, PL // V30 (PH) = pool bytes 0..15, V31 (PL) = pool bytes 16..31: the modulus
  2016  	// The first round reads in and writes res; the remaining five rounds double res in place.
  2017  	p256PointDoubleRound(P1ptr, P3ptr)
  2018  	p256PointDoubleRound(P3ptr, P3ptr)
  2019  	p256PointDoubleRound(P3ptr, P3ptr)
  2020  	p256PointDoubleRound(P3ptr, P3ptr)
  2021  	p256PointDoubleRound(P3ptr, P3ptr)
  2022  	p256PointDoubleRound(P3ptr, P3ptr)
  2023  	
  2024  	RET
  2025  
  2026  #undef P3ptr
  2027  #undef P1ptr
  2028  #undef CPOOL
  2029  #undef X3L
  2030  #undef X3H
  2031  #undef Y3L
  2032  #undef Y3H
  2033  #undef T1L
  2034  #undef T1H
  2035  #undef T2L
  2036  #undef T2H
  2037  #undef T3L
  2038  #undef T3H
  2039  #undef X1L
  2040  #undef X1H
  2041  #undef Y1L
  2042  #undef Y1H
  2043  #undef Z1L
  2044  #undef Z1H
  2045  #undef TT0
  2046  #undef TT1
  2047  #undef T2
  2048  #undef X0
  2049  #undef X1
  2050  #undef Y0
  2051  #undef Y1
  2052  #undef T0
  2053  #undef T1
  2054  #undef PL
  2055  #undef PH
  2056  #undef Z3L
  2057  #undef Z3H
  2058  #undef ZER
  2059  #undef SEL1
  2060  #undef CAR1
  2061  #undef CAR2
  2062  
  2063  // func p256PointAddAsm(res, in1, in2 *P256Point) int
  2064  #define P3ptr  R1
  2065  #define P1ptr  R2
  2066  #define P2ptr  R3
  2067  #define CPOOL  R4
  2068  #define ISZERO R5
  2069  #define TRUE   R6
  2070  
  2071  // Temporaries in REGs
  2072  #define T1L   V16
  2073  #define T1H   V17
  2074  #define T2L   V18
  2075  #define T2H   V19
  2076  #define U1L   V20
  2077  #define U1H   V21
  2078  #define S1L   V22
  2079  #define S1H   V23
  2080  #define HL    V24
  2081  #define HH    V25
  2082  #define RL    V26
  2083  #define RH    V27
  2084  
  2085  // Temps for Sub and Add
  2086  #define ZER   V6
  2087  #define SEL1  V7
  2088  #define CAR1  V8
  2089  #define CAR2  V9
  2090  #define TT0  V11
  2091  #define TT1  V12
  2092  #define T2   V13
  2093  
  2094  // p256MulAsm Parameters
  2095  #define X0    V0
  2096  #define X1    V1
  2097  #define Y0    V2
  2098  #define Y1    V3
  2099  #define T0    V4
  2100  #define T1    V5
  2101  
  2102  #define PL    V31
  2103  #define PH    V30
  2104  /*
  2105   * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2106   *
  2107   * A = X₁×Z₂²
  2108   * B = Y₁×Z₂³
  2109   * C = X₂×Z₁²-A
  2110   * D = Y₂×Z₁³-B
  2111   * X₃ = D² - 2A×C² - C³
  2112   * Y₃ = D×(A×C² - X₃) - B×C³
  2113   * Z₃ = Z₁×Z₂×C
  2114   *
  2115   * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2116   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2117   *
  2118   * T1 = Z1*Z1
  2119   * T2 = Z2*Z2
  2120   * U1 = X1*T2
  2121   * H  = X2*T1
  2122   * H  = H-U1
  2123   * Z3 = Z1*Z2
  2124   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2125   *
  2126   * S1 = Z2*T2
  2127   * S1 = Y1*S1
  2128   * R  = Z1*T1
  2129   * R  = Y2*R
  2130   * R  = R-S1
  2131   *
  2132   * T1 = H*H
  2133   * T2 = H*T1
  2134   * U1 = U1*T1
  2135   *
  2136   * X3 = R*R
  2137   * X3 = X3-T2
  2138   * T1 = 2*U1
  2139   * X3 = X3-T1 << store-out X3 result reg
  2140   *
  2141   * T2 = S1*T2
  2142   * Y3 = U1-X3
  2143   * Y3 = R*Y3
  2144   * Y3 = Y3-T2 << store-out Y3 result reg
  2145  
  2146   	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2147  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2148  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2149  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2150  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2151  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2152  	// SUB(H<H-T)            // H  = H-U1
  2153  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2154  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2155  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2156  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2157  	// SUB(R<T-S1)           // R  = R-S1
  2158  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2159  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2160  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2161  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2162  	// SUB(T<T-T2)           // X3 = X3-T2
  2163  	// ADD(X<U1+U1)          // T1 = 2*U1
  2164  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2165  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2166  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2167  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2168  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2169  	*/
  2170  TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
        	// Adds the point at in1 to the point at in2 and writes the sum to res,
        	// following the step schedule listed in the comment block above.
        	//
        	// Frame slots used:
        	//   res+0(FP)  -> P3ptr : destination point
        	//   in1+8(FP)  -> P1ptr : first operand
        	//   in2+16(FP) -> P2ptr : second operand
        	//   ret+24(FP)          : integer flag written by this routine
        	//
        	// Each point is accessed as three 32-byte coordinates: X at offset 0,
        	// Y at offset 32 and Z at offset 64.
        	//
        	// ret is 1 only when H compares equal to 0 or to PH||PL AND R compares
        	// equal to 0 or to PH||PL; otherwise it is 0.
        	// NOTE(review): presumably the Go caller uses ret to detect that both
        	// inputs are the same point and falls back to doubling - confirm.
        	//
        	// Outputs are stored in the order Z3, X3, Y3. Each input coordinate is
        	// loaded for the last time before the output coordinate at the same
        	// offset is stored, so res pointing at the same memory as in1 or in2
        	// does not change what is read.
        	//
        	// The Mul/Sqr helpers are used with operands in X1||X0 and Y1||Y0 and
        	// the result in T1||T0. Steps marked "X-" or "Y-" rely on the helper
        	// leaving that operand intact from the previous call.
  2171  	MOVD res+0(FP), P3ptr
  2172  	MOVD in1+8(FP), P1ptr
  2173  	MOVD in2+16(FP), P2ptr
  2174  
        	// PH||PL <- first 32 bytes of p256mul<>, which the DATA section labels P256.
  2175  	MOVD $p256mul<>+0x00(SB), CPOOL
  2176  	VLM   (CPOOL), PH, PL
  2177  
  2178  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2179  	VLM  64(P1ptr), X0, X1   // Z1L, Z1H
        	// VPDI $0x4 with the same register for both sources swaps its two
        	// 64-bit halves (per the z/Architecture definition of VPDI).
        	// NOTE(review): presumably converts the in-memory limb order to the
        	// order the internal routines expect - confirm.
  2180  	VPDI $0x4, X1, X1, X1
  2181  	VPDI $0x4, X0, X0, X0
  2182  	VLR  X0, Y0
  2183  	VLR  X1, Y1
  2184  	CALL sm2p256SqrInternal<>(SB)
  2185  
  2186  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2187  	VLR  T0, Y0
  2188  	VLR  T1, Y1
  2189  	CALL sm2p256MulInternal<>(SB)
  2190  	VLR  T0, RL
  2191  	VLR  T1, RH
  2192  
  2193  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2194  	VLM  (P2ptr), X0, X1    // X2L, X2H
  2195  	VPDI $0x4, X1, X1, X1
  2196  	VPDI $0x4, X0, X0, X0
  2197  	CALL sm2p256MulInternal<>(SB)
  2198  	VLR  T0, HL
  2199  	VLR  T1, HH
  2200  
  2201  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2202  	VLM  64(P2ptr), X0, X1  // Z2L, Z2H
  2203  	VPDI $0x4, X1, X1, X1
  2204  	VPDI $0x4, X0, X0, X0
  2205  	VLR  X0, Y0
  2206  	VLR  X1, Y1
  2207  	CALL sm2p256SqrInternal<>(SB)
  2208  
  2209  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2210  	VLR  T0, Y0
  2211  	VLR  T1, Y1
  2212  	CALL sm2p256MulInternal<>(SB)
  2213  	VLR  T0, S1L
  2214  	VLR  T1, S1H
  2215  
  2216  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2217  	VLM  (P1ptr), X0, X1     // X1L, X1H
  2218  	VPDI $0x4, X1, X1, X1
  2219  	VPDI $0x4, X0, X0, X0
  2220  	CALL sm2p256MulInternal<>(SB)
  2221  	VLR  T0, U1L
  2222  	VLR  T1, U1H
  2223  
  2224  	// SUB(H<H-T)            // H  = H-U1
  2225  	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2226  
  2227  	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2228  	// clobbers T1H and T1L
        	// Two tests are made because H may be represented either as 0 or as
        	// PH||PL itself: first OR the halves of H and compare with zero, then
        	// XOR H with PH||PL, OR the halves and compare with zero again.
  2229  	MOVD   $0, ISZERO
  2230  	MOVD   $1, TRUE
  2231  	VZERO  ZER
  2232  	VO     HL, HH, T1H
  2233  	VCEQGS ZER, T1H, T1H
  2234  	MOVDEQ TRUE, ISZERO
  2235  	VX     HL, PL, T1L
  2236  	VX     HH, PH, T1H
  2237  	VO     T1L, T1H, T1H
  2238  	VCEQGS ZER, T1H, T1H
  2239  	MOVDEQ TRUE, ISZERO
  2240  	MOVD   ISZERO, ret+24(FP)
  2241  
  2242  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2243  	VLM  64(P1ptr), X0, X1   // Z1L, Z1H
  2244  	VPDI $0x4, X1, X1, X1
  2245  	VPDI $0x4, X0, X0, X0
  2246  	VLM  64(P2ptr), Y0, Y1   // Z2L, Z2H
  2247  	VPDI $0x4, Y1, Y1, Y1
  2248  	VPDI $0x4, Y0, Y0, Y0
  2249  	CALL sm2p256MulInternal<>(SB)
  2250  
  2251  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2252  	VLR  T0, X0
  2253  	VLR  T1, X1
  2254  	VLR  HL, Y0
  2255  	VLR  HH, Y1
  2256  	CALL sm2p256MulInternal<>(SB)
        	// Swap the halves back into TT1||TT0 and store Z3; Z1 and Z2 are not
        	// loaded again after this point.
  2257  	VPDI $0x4, T1, T1, TT1
  2258  	VPDI $0x4, T0, T0, TT0
  2259  	VSTM  TT0, TT1, 64(P3ptr)
  2260  
  2261  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2262  	VLM  32(P1ptr), X0, X1
  2263  	VPDI $0x4, X1, X1, X1
  2264  	VPDI $0x4, X0, X0, X0
  2265  	VLR  S1L, Y0
  2266  	VLR  S1H, Y1
  2267  	CALL sm2p256MulInternal<>(SB)
  2268  	VLR  T0, S1L
  2269  	VLR  T1, S1H
  2270  
  2271  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2272  	VLM  32(P2ptr), X0, X1
  2273  	VPDI $0x4, X1, X1, X1
  2274  	VPDI $0x4, X0, X0, X0
  2275  	VLR  RL, Y0
  2276  	VLR  RH, Y1
  2277  	CALL sm2p256MulInternal<>(SB)
  2278  
  2279  	// SUB(R<T-S1)           // R  = T-S1
  2280  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2281  
  2282  	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2283  	// clobbers T1H and T1L
        	// Same two-step zero test as for H; the outcome is ANDed with the flag
        	// already stored in ret+24(FP), so ret stays 1 only if both tests hit.
  2284  	MOVD   $0, ISZERO
  2285  	MOVD   $1, TRUE
  2286  	VZERO  ZER
  2287  	VO     RL, RH, T1H
  2288  	VCEQGS ZER, T1H, T1H
  2289  	MOVDEQ TRUE, ISZERO
  2290  	VX     RL, PL, T1L
  2291  	VX     RH, PH, T1H
  2292  	VO     T1L, T1H, T1H
  2293  	VCEQGS ZER, T1H, T1H
  2294  	MOVDEQ TRUE, ISZERO
  2295  	AND    ret+24(FP), ISZERO
  2296  	MOVD   ISZERO, ret+24(FP)
  2297  
  2298  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2299  	VLR  HL, X0
  2300  	VLR  HH, X1
  2301  	VLR  HL, Y0
  2302  	VLR  HH, Y1
  2303  	CALL sm2p256SqrInternal<>(SB)
  2304  
  2305  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2306  	VLR  T0, Y0
  2307  	VLR  T1, Y1
  2308  	CALL sm2p256MulInternal<>(SB)
  2309  	VLR  T0, T2L
  2310  	VLR  T1, T2H
  2311  
  2312  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2313  	VLR  U1L, X0
  2314  	VLR  U1H, X1
  2315  	CALL sm2p256MulInternal<>(SB)
  2316  	VLR  T0, U1L
  2317  	VLR  T1, U1H
  2318  
  2319  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2320  	VLR  RL, X0
  2321  	VLR  RH, X1
  2322  	VLR  RL, Y0
  2323  	VLR  RH, Y1
  2324  	CALL sm2p256SqrInternal<>(SB)
  2325  
  2326  	// SUB(T<T-T2)           // X3 = X3-T2
  2327  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2328  
  2329  	// ADD(X<U1+U1)          // T1 = 2*U1
  2330  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2331  
  2332  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2333  	p256SubInternal(T1,T0,T1,T0,X1,X0)
        	// The swapped copy goes to TT1||TT0 so that T1||T0 still holds X3 for
        	// the Y3 = U1-X3 step below.
  2334  	VPDI $0x4, T1, T1, TT1
  2335  	VPDI $0x4, T0, T0, TT0
  2336  	VSTM  TT0, TT1, (P3ptr)
  2337  
  2338  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2339  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2340  
  2341  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2342  	VLR  RL, X0
  2343  	VLR  RH, X1
  2344  	CALL sm2p256MulInternal<>(SB)
  2345  	VLR  T0, U1L
  2346  	VLR  T1, U1H
  2347  
  2348  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2349  	VLR  S1L, X0
  2350  	VLR  S1H, X1
  2351  	VLR  T2L, Y0
  2352  	VLR  T2H, Y1
  2353  	CALL sm2p256MulInternal<>(SB)
  2354  
  2355  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2356  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
        	// T1||T0 is not needed afterwards, so the halves are swapped in place.
  2357  	VPDI $0x4, T1, T1, T1
  2358  	VPDI $0x4, T0, T0, T0
  2359  	VSTM  T0, T1, 32(P3ptr)
  2360  
  2361  	RET
  2362  
  2363  #undef P3ptr
        // The #undef lines in this run drop the register aliases that
        // p256PointAddAsm refers to, so that the routine below can #define names
        // such as CPOOL, T0, T1, T2, TT0, TT1, ZER, CAR1, CAR2, PL and PH again
        // with its own register assignment.
        // NOTE(review): the matching #define lines sit above this part of the
        // file; confirm every name listed here is actually defined at this point.
  2364  #undef P1ptr
  2365  #undef P2ptr
  2366  #undef CPOOL
  2367  #undef ISZERO
  2368  #undef TRUE
  2369  #undef T1L
  2370  #undef T1H
  2371  #undef T2L
  2372  #undef T2H
  2373  #undef U1L
  2374  #undef U1H
  2375  #undef S1L
  2376  #undef S1H
  2377  #undef HL
  2378  #undef HH
  2379  #undef RL
  2380  #undef RH
  2381  #undef ZER
  2382  #undef SEL1
  2383  #undef CAR1
  2384  #undef CAR2
  2385  #undef TT0
  2386  #undef TT1
  2387  #undef T2
  2388  #undef X0
  2389  #undef X1
  2390  #undef Y0
  2391  #undef Y1
  2392  #undef T0
  2393  #undef T1
  2394  #undef PL
  2395  #undef PH
  2396  
  2397  //func p256OrdReduce(s *p256OrdElement)
        //
        // Conditionally subtracts the 256-bit constant stored in p256ord<> from
        // the 32-byte value the argument points to, in place. Exactly one
        // subtraction is attempted and the choice between the original value and
        // the difference is made with a vector select, not a branch.
        //
        // NOTE(review): the frame reference below is spelled res+0(FP) while the
        // signature comment names the parameter s; confirm which name the Go
        // declaration uses (go vet's asmdecl check compares them).
        //
        // Register aliases for this routine:
  2398  #define res_ptr R1
  2399  #define CPOOL   R4
  2400  
        // T1||T0   : value being reduced
        // T2       : starts at zero, ends up as the select mask
        // TT1||TT0 : value minus the constant; reused as store staging
  2401  #define T0   V0
  2402  #define T1   V1
  2403  #define T2   V2
  2404  #define TT0  V3
  2405  #define TT1  V4
  2406  
        // ZER        : all-zero vector
        // CAR1, CAR2 : borrow indications passed between the subtract steps
        // PH||PL     : the p256ord<> constant, first 16 bytes in PH, next 16 in PL
  2407  #define ZER   V6
  2408  #define CAR1  V7
  2409  #define CAR2  V8
  2410  #define PL    V10
  2411  #define PH    V9
  2412  
  2413  TEXT ·p256OrdReduce(SB),NOSPLIT,$0
  2414  	MOVD res+0(FP), res_ptr
  2415  
  2416  	VZERO T2
  2417  	VZERO ZER
  2418  	MOVD  $p256ord<>+0x00(SB), CPOOL
  2419  	VLM  (CPOOL), PH, PL
  2420  
        	// Load the 32-byte operand and swap the 64-bit halves of each vector
        	// (VPDI $0x4 with identical sources). The same swap is applied again
        	// before the store at the end.
  2421  	VLM  (res_ptr), T0, T1
  2422  	VPDI $0x4, T0, T0, T0
  2423  	VPDI $0x4, T1, T1, T1
  2424  
        	// 256-bit subtraction TT1||TT0 = T1||T0 - PH||PL done as two quadword
        	// steps: the low step produces the borrow indication CAR1, the high
        	// step consumes CAR1 and produces CAR2.
        	// The last instruction folds CAR2 into T2 (zero minus zero with
        	// borrow-in), turning the final borrow into the select mask used below.
        	// NOTE(review): operand order and borrow polarity follow the Go s390x
        	// assembler conventions for VSQ/VSBIQ - confirm against the ISA
        	// reference before changing anything here.
  2425  	VSCBIQ  PL, T0, CAR1
  2426  	VSQ     PL, T0, TT0
  2427  	VSBCBIQ T1, PH, CAR1, CAR2
  2428  	VSBIQ   T1, PH, CAR1, TT1
  2429  	VSBIQ   T2, ZER, CAR2, T2
  2430  
  2431  	// what output to use, TT1||TT0 or T1||T0?
        	// T2 is the per-bit selector: T1||T0 ends up holding either the
        	// original value or the difference.
  2432  	VSEL T0, TT0, T2, T0
  2433  	VSEL T1, TT1, T2, T1
  2434  
        	// Swap the halves back and write the selected value over the input.
  2435  	VPDI $0x4, T0, T0, TT0
  2436  	VPDI $0x4, T1, T1, TT1
  2437  	VSTM  TT0, TT1, (res_ptr)
  2438  
  2439  	RET
        // Drop the aliases defined above for this routine.
  2440  #undef res_ptr
  2441  #undef CPOOL
  2442  #undef T0
  2443  #undef T1
  2444  #undef T2
  2445  #undef TT0
  2446  #undef TT1
  2447  #undef ZER
  2448  #undef CAR1
  2449  #undef CAR2
  2450  #undef PL
  2451  #undef PH