github.com/emmansun/gmsm@v0.29.1/internal/sm2ec/p256_asm_ppc64le.s

// This is a port of the NIST P256 ppc64le asm implementation to SM2 P256.
//
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"

// This is a port of the s390x asm implementation
// to ppc64le.

// Some changes were needed due to differences in
// the Go opcodes and/or available instructions
// between s390x and ppc64le.

// 1. There were operand order differences in the
// VSUBUQM, VSUBCUQ, and VSEL instructions.

// 2. ppc64 does not have multiply-high and multiply-low
// instructions like s390x, so those were implemented
// using macros to compute the equivalent values.

// 3. The LVX and STVX instructions on ppc64 require
// 16-byte alignment of the data. To avoid that
// requirement, data is loaded using LXVD2X and
// STXVD2X with VPERM to reorder bytes correctly.

// I have identified some areas where changes would be
// needed to make this work for big endian; however,
// additional changes beyond those noted are most
// likely needed to make it work:
// - The string used with VPERM to swap the byte order
//   for loads and stores.
// - The constants that are loaded from CPOOL.
//

// The following constants are defined in an order
// that is correct for use with LXVD2X/STXVD2X
// on little endian.
DATA p256ord<>+0x00(SB)/8, $0xfffffffeffffffff
DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x10(SB)/8, $0x7203df6b21c6052b
DATA p256ord<>+0x18(SB)/8, $0x53bbf40939d54123
DATA p256ord<>+0x20(SB)/8, $0x7235097572350975 // p256ord K0
DATA p256ord<>+0x28(SB)/8, $0x7235097572350975 // p256ord K0
DATA p256<>+0x00(SB)/8, $0xfffffffeffffffff // P256
DATA p256<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x10(SB)/8, $0xffffffff00000000 // P256
DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256<>+0x20(SB)/8, $0x0000000000000000 // SEL 0 0 d1 d0
DATA p256<>+0x28(SB)/8, $0x18191a1b1c1d1e1f // SEL 0 0 d1 d0
DATA p256mul<>+0x00(SB)/8, $0xffffffff00000000 // P256 original
DATA p256mul<>+0x08(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x10(SB)/8, $0xfffffffeffffffff // P256 original
DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
DATA p256mul<>+0x30(SB)/8, $0x0405060708090a0b // SEL  0  0 d1 d0
DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
DATA p256mul<>+0x40(SB)/8, $0x00000000ffffffff // (1*2^256)%P256
DATA p256mul<>+0x48(SB)/8, $0x0000000000000001 // (1*2^256)%P256
DATA p256mul<>+0x50(SB)/8, $0x0000000100000000 // (1*2^256)%P256
DATA p256mul<>+0x58(SB)/8, $0x0000000000000000 // (1*2^256)%P256

// External declarations for constants
GLOBL p256ord<>(SB), 8, $48
GLOBL p256<>(SB), 8, $48
GLOBL p256mul<>(SB), 8, $96
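
// For reference, the values encoded above (big-endian hex):
//   SM2 prime p = 0xfffffffeffffffffffffffffffffffffffffffff00000000ffffffffffffffff
//   SM2 order n = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54123
// K0 = 0x72350975 is used below in the word-wise Montgomery loop; its use
// in sm2p256OrdMulInternal suggests it is -n^(-1) mod 2^32 (an assumption,
// not stated in the original). p256mul+0x40..0x58 is 2^256 mod p, i.e. 1
// in the Montgomery domain, as the "(1*2^256)%P256" comments state.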

// The following macros are used to implement the ppc64le
// equivalent function from the corresponding s390x
// instruction for vector multiply high, low, and add,
// since there aren't exact equivalent instructions.
// The corresponding s390x instructions appear in the
// comments.
// Implementation for big endian would have to be
// investigated; it would likely differ.
//
//
// Vector multiply word
//
//	VMLF  x0, x1, out_low
//	VMLHF x0, x1, out_hi
#define VMULT(x1, x2, out_low, out_hi) \
	VMULEUW x1, x2, TMP1; \
	VMULOUW x1, x2, TMP2; \
	VMRGEW TMP1, TMP2, out_hi; \
	VMRGOW TMP1, TMP2, out_low

//
// Vector multiply add word
//
//	VMALF  x0, x1, y, out_low
//	VMALHF x0, x1, y, out_hi
#define VMULT_ADD(x1, x2, y, one, out_low, out_hi) \
	VMULEUW  y, one, TMP1; \
	VMULOUW  y, one, TMP2; \
	VMULEUW  x1, x2, out_hi; \
	VMULOUW  x1, x2, out_low; \
	VADDUDM  TMP1, out_hi, TMP1; \
	VADDUDM  TMP2, out_low, TMP2; \
	VMRGEW   TMP1, TMP2, out_hi; \
	VMRGOW   TMP1, TMP2, out_low
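
// Per 32-bit lane, the macros above compute (a hedged Go model of the
// intended semantics, mirroring s390x VMLF/VMLHF and VMALF/VMALHF):
//
//	// VMULT: out_low[i] / out_hi[i] = low/high halves of x1[i]*x2[i]
//	// VMULT_ADD: same, but of x1[i]*x2[i] + y[i] (one[i] must be 1)
//	for i := 0; i < 4; i++ {
//		p := uint64(x1[i])*uint64(x2[i]) + uint64(y[i]) // y term omitted for VMULT
//		out_low[i] = uint32(p)
//		out_hi[i] = uint32(p >> 32)
//	}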

#define res_ptr R3
#define a_ptr R4

#undef res_ptr
#undef a_ptr

#define P1ptr   R3
#define CPOOL   R7

#define Y1L   V0
#define Y1H   V1
#define T1L   V2
#define T1H   V3

#define PL    V30
#define PH    V31

#define SEL   V4
#define ZER   V5
#define CAR1  V6
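
// p256NegCond conditionally replaces val with p-val in constant time.
// A hedged Go sketch of the intent (assumes cond is 0 or 1, val < p,
// and p holds the prime as four little-endian limbs; names illustrative):
//
//	func negCond(val *[4]uint64, cond int) {
//		var t [4]uint64 // t = p - val
//		var b uint64
//		t[0], b = bits.Sub64(p[0], val[0], 0)
//		t[1], b = bits.Sub64(p[1], val[1], b)
//		t[2], b = bits.Sub64(p[2], val[2], b)
//		t[3], _ = bits.Sub64(p[3], val[3], b)
//		mask := -uint64(cond) // 0 or all-ones
//		for i := range val {
//			val[i] = (val[i] &^ mask) | (t[i] & mask)
//		}
//	}
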
// func p256NegCond(val *p256Element, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0-16
	MOVD val+0(FP), P1ptr
	MOVD $16, R16
	MOVD $40, R17

	// cond is R1 + 8 (cond offset) + 32
	LXVDSX (R1)(R17), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (P1ptr)(R0), Y1L
	LXVD2X (P1ptr)(R16), Y1H

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	LXVD2X (CPOOL)(R0), PL
	LXVD2X (CPOOL)(R16), PH

	VSUBCUQ  PL, Y1L, CAR1      // subtract part2 giving carry
	VSUBUQM  PL, Y1L, T1L       // subtract part2 giving result
	VSUBEUQM PH, Y1H, CAR1, T1H // subtract part1 using carry from part2

	VSEL T1H, Y1H, SEL, Y1H
	VSEL T1L, Y1L, SEL, Y1L

	XXPERMDI Y1H, Y1H, $2, Y1H
	XXPERMDI Y1L, Y1L, $2, Y1L

	STXVD2X Y1L, (R0+P1ptr)
	STXVD2X Y1H, (R16+P1ptr)
	RET

#undef P1ptr
#undef CPOOL
#undef Y1L
#undef Y1H
#undef T1L
#undef T1H
#undef PL
#undef PH
#undef ZER
#undef SEL
#undef CAR1

#define P3ptr   R3
#define P1ptr   R4
#define P2ptr   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11
#define SEL    V12
#define ZER    V13

// This function uses LXVD2X and STXVD2X to avoid the
// data alignment requirement for LVX, STVX. Since
// this code is just moving bytes and not doing arithmetic,
// the order of the bytes doesn't matter.
//
// func p256MovCond(res, a, b *p256Point, cond int)
TEXT ·p256MovCond(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD a+8(FP), P1ptr
	MOVD b+16(FP), P2ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $56, R21
	MOVD $64, R19
	MOVD $80, R20
	// cond is R1 + 24 (cond offset) + 32
	LXVDSX (R1)(R21), SEL
	VSPLTISB $0, ZER
	// SEL controls whether to store a or b
	VCMPEQUD SEL, ZER, SEL

	LXVD2X (P1ptr+R0), X1H
	LXVD2X (P1ptr+R16), X1L
	LXVD2X (P1ptr+R17), Y1H
	LXVD2X (P1ptr+R18), Y1L
	LXVD2X (P1ptr+R19), Z1H
	LXVD2X (P1ptr+R20), Z1L

	LXVD2X (P2ptr+R0), X2H
	LXVD2X (P2ptr+R16), X2L
	LXVD2X (P2ptr+R17), Y2H
	LXVD2X (P2ptr+R18), Y2L
	LXVD2X (P2ptr+R19), Z2H
	LXVD2X (P2ptr+R20), Z2L

	VSEL X1H, X2H, SEL, X1H
	VSEL X1L, X2L, SEL, X1L
	VSEL Y1H, Y2H, SEL, Y1H
	VSEL Y1L, Y2L, SEL, Y1L
	VSEL Z1H, Z2H, SEL, Z1H
	VSEL Z1L, Z2L, SEL, Z1L

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)

	RET

#undef P3ptr
#undef P1ptr
#undef P2ptr
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef SEL
#undef ZER

#define P3ptr   R3
#define P1ptr   R4
#define COUNT   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21
// func p256Select(res *p256Point, table *p256Table, idx int, limit int)
TEXT ·p256Select(SB), NOSPLIT, $0-32
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD limit+24(FP), COUNT
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18
	MOVD $64, R19
	MOVD $80, R20

	LXVDSX   (R1)(R18), SEL1 // VLREPG idx+32(FP), SEL1
	VSPLTB   $7, SEL1, IDX   // splat byte
	VSPLTISB $1, ONE         // VREPIB $1, ONE
	VSPLTISB $1, SEL2        // VREPIB $1, SEL2
	MOVD     COUNT, CTR      // set up ctr

	VSPLTISB $0, X1H // VZERO  X1H
	VSPLTISB $0, X1L // VZERO  X1L
	VSPLTISB $0, Y1H // VZERO  Y1H
	VSPLTISB $0, Y1L // VZERO  Y1L
	VSPLTISB $0, Z1H // VZERO  Z1H
	VSPLTISB $0, Z1L // VZERO  Z1L

loop_select:

	// LXVD2X is used here since data alignment doesn't
	// matter.

	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L
	LXVD2X (P1ptr+R19), Z2H
	LXVD2X (P1ptr+R20), Z2L

	VCMPEQUD SEL2, IDX, SEL1 // VCEQG SEL2, IDX, SEL1 OK

	// This will result in SEL1 being all 0s or 1s, meaning
	// the result is either X1L or X2L, no individual byte
	// selection.

	VSEL X1L, X2L, SEL1, X1L
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H
	VSEL Z1L, Z2L, SEL1, Z1L
	VSEL Z1H, Z2H, SEL1, Z1H

	// Add 1 to all bytes in SEL2
	VADDUBM SEL2, ONE, SEL2 // VAB  SEL2, ONE, SEL2 OK
	ADD     $96, P1ptr
	BDNZ    loop_select

	// STXVD2X is used here so that alignment doesn't
	// need to be verified. Since values were loaded
	// using LXVD2X this is OK.
	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	STXVD2X Z1H, (P3ptr+R19)
	STXVD2X Z1L, (P3ptr+R20)
	RET
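
// A hedged Go model of the loop above: scan every entry, OR-accumulating
// the one whose 1-based position equals idx (idx == 0 selects nothing).
// Names and point layout are illustrative only:
//
//	func selectPoint(res *[12]uint64, table [][12]uint64, idx int) {
//		var acc [12]uint64
//		for i := range table {
//			mask := -uint64(subtle.ConstantTimeEq(int32(i+1), int32(idx)))
//			for j := range acc {
//				acc[j] |= table[i][j] & mask
//			}
//		}
//		*res = acc
//	}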

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// The following functions all reverse the byte order.

// func p256BigToLittle(res *p256Element, in *[32]byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)

// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)

// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB), NOSPLIT, $0-16
	MOVD	res+0(FP), R3
	MOVD	in+8(FP), R4
	BR	p256InternalEndianSwap<>(SB)

TEXT p256InternalEndianSwap<>(SB), NOSPLIT, $0-0
	// Index registers for the byte-reversed loads and stores
	MOVD	$8, R9
	MOVD	$16, R10
	MOVD	$24, R14

	MOVDBR	(R0)(R4), R5
	MOVDBR	(R9)(R4), R6
	MOVDBR	(R10)(R4), R7
	MOVDBR	(R14)(R4), R8

	MOVD	R8, 0(R3)
	MOVD	R7, 8(R3)
	MOVD	R6, 16(R3)
	MOVD	R5, 24(R3)

	RET
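
// Equivalent Go (hedged sketch): convert between a 32-byte big-endian
// value and four native 64-bit limbs, least-significant limb first.
//
//	func endianSwap(res *[4]uint64, in *[32]byte) {
//		res[0] = binary.BigEndian.Uint64(in[24:])
//		res[1] = binary.BigEndian.Uint64(in[16:])
//		res[2] = binary.BigEndian.Uint64(in[8:])
//		res[3] = binary.BigEndian.Uint64(in[0:])
//	}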

#define P3ptr   R3
#define P1ptr   R4
#define COUNT   R5

#define X1L    V0
#define X1H    V1
#define Y1L    V2
#define Y1H    V3
#define Z1L    V4
#define Z1H    V5
#define X2L    V6
#define X2H    V7
#define Y2L    V8
#define Y2H    V9
#define Z2L    V10
#define Z2H    V11

#define ONE   V18
#define IDX   V19
#define SEL1  V20
#define SEL2  V21

// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
TEXT ·p256SelectAffine(SB), NOSPLIT, $0-24
	MOVD res+0(FP), P3ptr
	MOVD table+8(FP), P1ptr
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	LXVDSX (R1)(R18), SEL1
	VSPLTB $7, SEL1, IDX // splat byte

	VSPLTISB $1, ONE  // Vector with byte 1s
	VSPLTISB $1, SEL2 // Vector with byte 1s
	MOVD     $32, COUNT
	MOVD     COUNT, CTR // loop count

	VSPLTISB $0, X1H // VZERO  X1H
	VSPLTISB $0, X1L // VZERO  X1L
	VSPLTISB $0, Y1H // VZERO  Y1H
	VSPLTISB $0, Y1L // VZERO  Y1L

loop_select:
	LXVD2X (P1ptr+R0), X2H
	LXVD2X (P1ptr+R16), X2L
	LXVD2X (P1ptr+R17), Y2H
	LXVD2X (P1ptr+R18), Y2L

	VCMPEQUD SEL2, IDX, SEL1 // Compare against idx

	VSEL X1L, X2L, SEL1, X1L // Select if idx matched
	VSEL X1H, X2H, SEL1, X1H
	VSEL Y1L, Y2L, SEL1, Y1L
	VSEL Y1H, Y2H, SEL1, Y1H

	VADDUBM SEL2, ONE, SEL2 // Increment SEL2 bytes by 1
	ADD     $64, P1ptr      // Next chunk
	BDNZ    loop_select

	STXVD2X X1H, (P3ptr+R0)
	STXVD2X X1L, (P3ptr+R16)
	STXVD2X Y1H, (P3ptr+R17)
	STXVD2X Y1L, (P3ptr+R18)
	RET

#undef P3ptr
#undef P1ptr
#undef COUNT
#undef X1L
#undef X1H
#undef Y1L
#undef Y1H
#undef Z1L
#undef Z1H
#undef X2L
#undef X2H
#undef Y2L
#undef Y2H
#undef Z2L
#undef Z2H
#undef ONE
#undef IDX
#undef SEL1
#undef SEL2

// ---------------------------------------
// sm2p256OrdMulInternal
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M1    V4
#define M0    V5
#define T0    V6
#define T1    V7
#define T2    V8
#define YDIG  V9

#define ADD1  V16
#define ADD1H V17
#define ADD2  V18
#define ADD2H V19
#define RED1  V20
#define RED1H V21
#define RED2  V22
#define RED2H V23
#define CAR1  V24
#define CAR1M V25

#define MK0   V30
#define K0    V31

// TMP1, TMP2 used in
// VMULT macros
#define TMP1  V13
#define TMP2  V27
#define ONE   V29 // 1s splatted by word

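// The function below performs word-by-word Montgomery multiplication:
// eight rounds, one per 32-bit digit of Y, each adding X times that digit
// plus a multiple of the order n chosen via K0 to zero the low word, then
// shifting right 32 bits. A hedged math/big model of that structure
// (helper name and the K0 interpretation are assumptions):
//
//	func ordMulModel(x, y, n *big.Int, k0 uint32) *big.Int {
//		mask := big.NewInt(0xffffffff)
//		t := new(big.Int)
//		for i := 0; i < 8; i++ {
//			yi := new(big.Int).Rsh(y, uint(32*i))
//			yi.And(yi, mask)
//			t.Add(t, yi.Mul(yi, x))
//			m := uint32(new(big.Int).And(t, mask).Uint64()) * k0
//			t.Add(t, new(big.Int).Mul(n, new(big.Int).SetUint64(uint64(m))))
//			t.Rsh(t, 32)
//		}
//		if t.Cmp(n) >= 0 {
//			t.Sub(t, n) // the final conditional subtraction at the end
//		}
//		return t // x*y*2^(-256) mod n
//	}
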
TEXT sm2p256OrdMulInternal<>(SB), NOSPLIT, $0
	// ---------------------------------------------------
	//	VREPF $3, Y0, YDIG
	VSPLTW $3, Y0, YDIG
	VSPLTISW $1, ONE

	//	VMLF  X0, YDIG, ADD1
	//	VMLF  X1, YDIG, ADD2
	//	VMLHF X0, YDIG, ADD1H
	//	VMLHF X1, YDIG, ADD2H
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSPLTISB $0, T2 // VZERO T2

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
/* *
 * ---+--------+--------+
 *  T2|   T1   |   T0   |
 * ---+--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   X1   |   X0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |  YDIG  |  YDIG  |
 *    +--------+--------+
 *           *(add)*
 *    +--------+--------+
 *    |   M1   |   M0   |
 *    +--------+--------+
 *           *(mul)*
 *    +--------+--------+
 *    |   MK0  |   MK0  |
 *    +--------+--------+
 *
 *   ---------------------
 *
 *    +--------+--------+
 *    |  ADD2  |  ADD1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | ADD2H  | ADD1H  |
 *  +--------+--------+
 *    +--------+--------+
 *    |  RED2  |  RED1  |
 *    +--------+--------+
 *  +--------+--------+
 *  | RED2H  | RED1H  |
 *  +--------+--------+
 */
	//	VREPF $2, Y0, YDIG
	VSPLTW $2, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $1, Y0, YDIG
	VSPLTW $1, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1 // VSLDB
	VSLDOI $12, T2, RED2, RED2   // VSLDB

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $0, Y0, YDIG
	VSPLTW $0, Y0, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $3, Y1, YDIG
	VSPLTW $3, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $2, Y1, YDIG
	VSPLTW $2, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $1, Y1, YDIG
	VSPLTW $1, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------
	//	VREPF $0, Y1, YDIG
	VSPLTW $0, Y1, YDIG

	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	//	VMLF  ADD1, K0, MK0
	//	VREPF $3, MK0, MK0
	VMULUWM ADD1, K0, MK0
	VSPLTW $3, MK0, MK0

	//	VMALF  M0, MK0, ADD1, RED1
	//	VMALHF M0, MK0, ADD1, RED1H
	//	VMALF  M1, MK0, ADD2, RED2
	//	VMALHF M1, MK0, ADD2, RED2H
	VMULT_ADD(M0, MK0, ADD1, ONE, RED1, RED1H)
	VMULT_ADD(M1, MK0, ADD2, ONE, RED2, RED2H)

	VSLDOI $12, RED2, RED1, RED1
	VSLDOI $12, T2, RED2, RED2

	VADDCUQ RED1, ADD1H, CAR1 // VACCQ
	VADDUQM RED1, ADD1H, T0   // VAQ
	VADDCUQ RED1H, T0, CAR1M  // VACCQ
	VADDUQM RED1H, T0, T0     // VAQ

	// << ready for next MK0

	VADDEUQM RED2, ADD2H, CAR1, T1   // VACQ
	VADDECUQ RED2, ADD2H, CAR1, CAR1 // VACCCQ
	VADDECUQ RED2H, T1, CAR1M, T2    // VACCCQ
	VADDEUQM RED2H, T1, CAR1M, T1    // VACQ
	VADDUQM  CAR1, T2, T2            // VAQ

	// ---------------------------------------------------

	//	VZERO   RED1
	//	VSCBIQ  M0, T0, CAR1
	//	VSQ     M0, T0, ADD1
	//	VSBCBIQ T1, M1, CAR1, CAR1M
	//	VSBIQ   T1, M1, CAR1, ADD2
	//	VSBIQ   T2, RED1, CAR1M, T2
	VSPLTISB $0, RED1            // VZERO RED1
	VSUBCUQ  T0, M0, CAR1        // VSCBIQ
	VSUBUQM  T0, M0, ADD1        // VSQ
	VSUBECUQ T1, M1, CAR1, CAR1M // VSBCBIQ
	VSUBEUQM T1, M1, CAR1, ADD2  // VSBIQ
	VSUBEUQM T2, RED1, CAR1M, T2 // VSBIQ

	// what output to use, ADD2||ADD1 or T1||T0?
	VSEL ADD1, T0, T2, T0
	VSEL ADD2, T1, T2, T1
	RET

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef T2
#undef YDIG

#undef ADD1
#undef ADD1H
#undef ADD2
#undef ADD2H
#undef RED1
#undef RED1H
#undef RED2
#undef RED2H
#undef CAR1
#undef CAR1M

#undef MK0
#undef K0
#undef TMP1
#undef TMP2
#undef ONE

// ---------------------------------------

// func p256OrdMul(res, in1, in2 *p256OrdElement)
#define res_ptr R3
#define x_ptr R4
#define y_ptr R5
#define CPOOL R7
#define N     R8

#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define M0    V5
#define M1    V4
#define T0    V6
#define T1    V7
#define K0    V31
TEXT ·p256OrdMul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (R16)(CPOOL), M0
	LXVD2X (R0)(CPOOL), M1
	LXVD2X (R17)(CPOOL), K0 // K0 = 0x72350975 replicated in each word

	CALL sm2p256OrdMulInternal<>(SB)

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

// func p256OrdSqr(res, in *p256OrdElement, n int)
TEXT ·p256OrdSqr(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr
	MOVD n+16(FP), N
	MOVD $16, R16
	MOVD $32, R17

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (R16)(CPOOL), M0
	LXVD2X (R0)(CPOOL), M1
	LXVD2X (R17)(CPOOL), K0 // K0 = 0x72350975 replicated in each word

sqrOrdLoop:
	// Sqr uses the same value for both inputs

	VOR	X0, X0, Y0
	VOR	X1, X1, Y1
	CALL sm2p256OrdMulInternal<>(SB)

	ADD	$-1, N
	CMP	N, $0
	BEQ	done

	VOR	T0, T0, X0
	VOR	T1, T1, X1
	BR	sqrOrdLoop

done:
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)

	RET

#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef CPOOL
#undef N
#undef X0
#undef X1
#undef Y0
#undef Y1
#undef M0
#undef M1
#undef T0
#undef T1
#undef K0

#define res_ptr R3
#define x_ptr   R4
#define CPOOL   R7

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define SEL1  V7
#define SEL2  V8
#define CAR1  V9
#define CAR2  V10
#define RED1  V11
#define RED2  V12
#define PL    V13
#define PH    V14

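// p256FromMont divides by 2^256 mod p: each of the four rounds below is
// one 64-bit Montgomery step. Since p ≡ -1 (mod 2^64), the digit m is
// simply t mod 2^64, so each round computes t = (t + m*p)/2^64. A hedged
// math/big model of that structure (helper name is an assumption):
//
//	func fromMontModel(t, p *big.Int) *big.Int {
//		mask64 := new(big.Int).SetUint64(^uint64(0))
//		for i := 0; i < 4; i++ {
//			m := new(big.Int).And(t, mask64)
//			t.Add(t, m.Mul(m, p))
//			t.Rsh(t, 64)
//		}
//		if t.Cmp(p) >= 0 {
//			t.Sub(t, p) // the conditional subtraction after the rounds
//		}
//		return t
//	}
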
// func p256FromMont(res, in *p256Element)
TEXT ·p256FromMont(SB), NOSPLIT, $0-16
	MOVD res+0(FP), res_ptr
	MOVD in+8(FP), x_ptr

	MOVD $16, R16
	MOVD $32, R17
	MOVD $p256<>+0x00(SB), CPOOL

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	// Constants are defined so that the LXVD2X is correct
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	// VPERM byte selections
	LXVD2X (CPOOL+R17), SEL1

	LXVD2X (R16)(x_ptr), T1
	LXVD2X (R0)(x_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	// First round
	VPERM ZER, T0, SEL1, RED1      // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0      // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2      // d1 d0 0 0
	VSUBCUQ  RED1, TT0, CAR1       // VSCBIQ TT0, RED1, CAR1
	VSUBUQM  RED1, TT0, RED1       // VSQ    TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Second round
	VPERM ZER, T0, SEL1, RED1      // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0      // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2      // d1 d0 0 0
	VSUBCUQ  RED1, TT0, CAR1       // VSCBIQ TT0, RED1, CAR1
	VSUBUQM  RED1, TT0, RED1       // VSQ    TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Third round
	VPERM ZER, T0, SEL1, RED1      // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0      // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2      // d1 d0 0 0
	VSUBCUQ  RED1, TT0, CAR1       // VSCBIQ TT0, RED1, CAR1
	VSUBUQM  RED1, TT0, RED1       // VSQ    TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// Last round
	VPERM ZER, T0, SEL1, RED1      // 0 0 d1 d0
	VSLDOI $4, RED1, ZER, TT0      // 0 d1 d0 0
	VSLDOI $4, TT0, ZER, RED2      // d1 d0 0 0
	VSUBCUQ  RED1, TT0, CAR1       // VSCBIQ TT0, RED1, CAR1
	VSUBUQM  RED1, TT0, RED1       // VSQ    TT0, RED1, RED1
	VSUBEUQM RED2, TT0, CAR1, RED2 // VSBIQ  RED2, TT0, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $8, T1, T0, T0 // VSLDB $8, T1, T0, T0
	VSLDOI $8, T2, T1, T1 // VSLDB $8, T2, T1, T1

	VADDCUQ  T0, RED1, CAR1       // VACCQ  T0, RED1, CAR1
	VADDUQM  T0, RED1, T0         // VAQ    T0, RED1, T0
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ T1, RED2, CAR1, CAR2
	VADDEUQM T1, RED2, CAR1, T1   // VACQ   T1, RED2, CAR1, T1
	VADDUQM  T2, CAR2, T2         // VAQ    T2, CAR2, T2

	// ---------------------------------------------------

	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for the XXPERMDI result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)
	RET

#undef res_ptr
#undef x_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef SEL1
#undef SEL2
#undef CAR1
#undef CAR2
#undef RED1
#undef RED2
#undef PL
#undef PH

// func p256OrdReduce(s *p256OrdElement)
#define res_ptr R3
#define CPOOL   R4

#define T0   V0
#define T1   V1
#define T2   V2
#define TT0  V3
#define TT1  V4

#define ZER   V6
#define CAR1  V7
#define CAR2  V8
#define PL    V9
#define PH    V10

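// p256OrdReduce is a single conditional subtraction of the group order n,
// selected branchlessly. A hedged Go sketch of the effect (limb layout
// and names are illustrative):
//
//	var d [4]uint64
//	var b uint64
//	d[0], b = bits.Sub64(s[0], n[0], 0)
//	d[1], b = bits.Sub64(s[1], n[1], b)
//	d[2], b = bits.Sub64(s[2], n[2], b)
//	d[3], b = bits.Sub64(s[3], n[3], b)
//	mask := -b // all-ones if s < n (keep s), zero otherwise (take d)
//	for i := range s {
//		s[i] = (s[i] & mask) | (d[i] &^ mask)
//	}
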
TEXT ·p256OrdReduce(SB), NOSPLIT, $0
	MOVD res+0(FP), res_ptr
	MOVD $16, R16

	VSPLTISB $0, T2  // VZERO T2
	VSPLTISB $0, ZER // VZERO ZER

	MOVD $p256ord<>+0x00(SB), CPOOL
	LXVD2X (CPOOL+R0), PH
	LXVD2X (CPOOL+R16), PL

	LXVD2X (R16)(res_ptr), T1
	LXVD2X (R0)(res_ptr), T0

	// Put in true little endian order
	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1

	VSUBCUQ  T0, PL, CAR1       // VSCBIQ  PL, T0, CAR1
	VSUBUQM  T0, PL, TT0        // VSQ     PL, T0, TT0
	VSUBECUQ T1, PH, CAR1, CAR2 // VSBCBIQ T1, PH, CAR1, CAR2
	VSUBEUQM T1, PH, CAR1, TT1  // VSBIQ   T1, PH, CAR1, TT1
	VSUBEUQM T2, ZER, CAR2, T2  // VSBIQ   T2, ZER, CAR2, T2

	VSEL TT0, T0, T2, T0
	VSEL TT1, T1, T2, T1

	// Reorder the bytes so STXVD2X can be used.
	// TT0, TT1 used for the XXPERMDI result in case
	// the caller expects T0, T1 to be good.
	XXPERMDI T0, T0, $2, TT0
	XXPERMDI T1, T1, $2, TT1

	STXVD2X TT0, (R0)(res_ptr)
	STXVD2X TT1, (R16)(res_ptr)

	RET
#undef res_ptr
#undef CPOOL
#undef T0
#undef T1
#undef T2
#undef TT0
#undef TT1
#undef ZER
#undef CAR1
#undef CAR2
#undef PL
#undef PH

// ---------------------------------------
// sm2p256MulInternal
// V0-V3 V30,V31 - Not Modified
// V4-V15 V28-V29 - Volatile

#define CPOOL   R7

// Parameters
#define X0    V0 // Not modified
#define X1    V1 // Not modified
#define Y0    V2 // Not modified
#define Y1    V3 // Not modified
#define T0    V4 // Result
#define T1    V5 // Result
#define P0    V30 // Not modified
#define P1    V31 // Not modified

// Temporaries: lots of reused vector regs
#define YDIG  V6 // Overloaded with CAR2
#define ADD1H V7 // Overloaded with ADD3H
#define ADD2H V8 // Overloaded with ADD4H
#define ADD3  V9 // Overloaded with SEL2,SEL5
#define ADD4  V10 // Overloaded with SEL3,SEL6
#define RED1  V11 // Overloaded with CAR2
#define RED2  V12 // Overloaded with TMP2
#define RED3  V13 // Overloaded with SEL1
#define T2    V14
// Overloaded temporaries
#define ADD1  V4 // Overloaded with T0
#define ADD2  V5 // Overloaded with T1
#define ADD3H V7 // Overloaded with ADD1H
#define ADD4H V8 // Overloaded with ADD2H
#define ZER   V28 // Overloaded with TMP1
#define CAR1  V6 // Overloaded with YDIG
#define CAR2  V11 // Overloaded with RED1
// Constant Selects
#define SEL1  V13 // Overloaded with RED3
#define SEL2  V9 // Overloaded with ADD3,SEL5
#define SEL3  V10 // Overloaded with ADD4,SEL6
#define SEL4  V6 // Overloaded with YDIG,CAR1
#define SEL5  V9 // Overloaded with ADD3,SEL2
#define SEL6  V10 // Overloaded with ADD4,SEL3

// TMP1, TMP2 used in
// VMULT macros
#define TMP1  V13 // Overloaded with RED3
#define TMP2  V12 // Overloaded with RED2
#define ONE   V29 // 1s splatted by word

TEXT sm2p256MulInternal<>(SB), NOSPLIT, $0-16
	// CPOOL loaded from caller
	MOVD $16, R16
	MOVD $32, R17
	MOVD $48, R18

	// ---------------------------------------------------

	VSPLTW $3, Y0, YDIG // VREPF Y0 is input

	//	VMLHF X0, YDIG, ADD1H
	//	VMLHF X1, YDIG, ADD2H
	//	VMLF  X0, YDIG, ADD1
	//	VMLF  X1, YDIG, ADD2
	//
	VMULT(X0, YDIG, ADD1, ADD1H)
	VMULT(X1, YDIG, ADD2, ADD2H)

	VSPLTISW $1, ONE
	VSPLTW $2, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	LXVD2X   (R17)(CPOOL), SEL1
	VSPLTISB $0, ZER               // VZERO ZER
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free	// VSLDB
	VSLDOI $12, ZER, ADD2, T1  // ADD2 Free	// VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // ADD3 Free	// VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // ADD4 Free	// VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	VPERM   RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3   // [ 0 d1 d0  0]
	VSLDOI $4, RED3, ZER, RED2   // [d1 d0  0  0]
	VSUBCUQ  RED1, RED3, CAR1
	VSUBUQM  RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// ---------------------------------------------------

	VSPLTW $1, Y0, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
	//	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y0, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free->T0		// VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free	// VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	VPERM   RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3   // [ 0 d1 d0  0]
	VSLDOI $4, RED3, ZER, RED2   // [d1 d0  0  0]
	VSUBCUQ  RED1, RED3, CAR1
	VSUBUQM  RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ
	// ---------------------------------------------------

	VSPLTW $3, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $2, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
	//	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // ADD1 Free		// VSLDB
	VSLDOI $12, T2, ADD2, T1   // ADD2 Free		// VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	VPERM   RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3   // [ 0 d1 d0  0]
	VSLDOI $4, RED3, ZER, RED2   // [d1 d0  0  0]
	VSUBCUQ  RED1, RED3, CAR1
	VSUBUQM  RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ
	// ---------------------------------------------------

	VSPLTW $1, Y1, YDIG // VREPF

	//	VMALHF X0, YDIG, T0, ADD1H
	//	VMALHF X1, YDIG, T1, ADD2H
	//	VMALF  X0, YDIG, T0, ADD1
	//	VMALF  X1, YDIG, T1, ADD2
	VMULT_ADD(X0, YDIG, T0, ONE, ADD1, ADD1H)
	VMULT_ADD(X1, YDIG, T1, ONE, ADD2, ADD2H)

	VSPLTW $0, Y1, YDIG // VREPF

	//	VMALF  X0, YDIG, ADD1H, ADD3
	//	VMALF  X1, YDIG, ADD2H, ADD4
	//	VMALHF X0, YDIG, ADD1H, ADD3H
	//	VMALHF X1, YDIG, ADD2H, ADD4H
	VMULT_ADD(X0, YDIG, ADD1H, ONE, ADD3, ADD3H)
	VMULT_ADD(X1, YDIG, ADD2H, ONE, ADD4, ADD4H)

	VSPLTISB $0, ZER               // VZERO ZER
	LXVD2X   (R17)(CPOOL), SEL1
	VPERM    ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]

	VSLDOI $12, ADD2, ADD1, T0 // VSLDB
	VSLDOI $12, T2, ADD2, T1   // VSLDB

	VADDCUQ  T0, ADD3, CAR1     // VACCQ
	VADDUQM  T0, ADD3, T0       // VAQ
	VADDECUQ T1, ADD4, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4, CAR1, T1 // VACQ

	LXVD2X  (R18)(CPOOL), SEL2
	VPERM   RED3, T0, SEL2, RED1 // [ 0  0 d1 d0]
	VSLDOI $4, RED1, ZER, RED3   // [ 0 d1 d0  0]
	VSLDOI $4, RED3, ZER, RED2   // [d1 d0  0  0]
	VSUBCUQ  RED1, RED3, CAR1
	VSUBUQM  RED1, RED3, RED1
	VSUBEUQM RED2, RED3, CAR1, RED2 // Guaranteed not to underflow

	VSLDOI $12, T1, T0, T0 // VSLDB
	VSLDOI $12, T2, T1, T1 // VSLDB

	VADDCUQ  T0, ADD3H, CAR1     // VACCQ
	VADDUQM  T0, ADD3H, T0       // VAQ
	VADDECUQ T1, ADD4H, CAR1, T2 // VACCCQ
	VADDEUQM T1, ADD4H, CAR1, T1 // VACQ

	VADDCUQ  T0, RED1, CAR1       // VACCQ
	VADDUQM  T0, RED1, T0         // VAQ
	VADDECUQ T1, RED2, CAR1, CAR2 // VACCCQ
	VADDEUQM T1, RED2, CAR1, T1   // VACQ
	VADDUQM  T2, CAR2, T2         // VAQ

	// ---------------------------------------------------

	VSPLTISB $0, RED3            // VZERO   RED3
	VSUBCUQ  T0, P0, CAR1        // VSCBIQ
	VSUBUQM  T0, P0, ADD1H       // VSQ
	VSUBECUQ T1, P1, CAR1, CAR2  // VSBCBIQ
	VSUBEUQM T1, P1, CAR1, ADD2H // VSBIQ
	VSUBEUQM T2, RED3, CAR2, T2  // VSBIQ

	// what output to use, ADD2H||ADD1H or T1||T0?
	VSEL ADD1H, T0, T2, T0
	VSEL ADD2H, T1, T2, T1
	RET

#undef CPOOL

#undef X0
#undef X1
#undef Y0
#undef Y1
#undef T0
#undef T1
#undef P0
#undef P1

#undef SEL1
#undef SEL2
#undef SEL3
#undef SEL4
#undef SEL5
#undef SEL6

#undef YDIG
#undef ADD1H
#undef ADD2H
#undef ADD3
#undef ADD4
#undef RED1
#undef RED2
#undef RED3
#undef T2
#undef ADD1
#undef ADD2
#undef ADD3H
#undef ADD4H
#undef ZER
#undef CAR1
#undef CAR2

#undef TMP1
#undef TMP2

#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
	VSPLTISB $0, ZER            \ // VZERO
	VSUBCUQ  X0, Y0, CAR1       \
	VSUBUQM  X0, Y0, T0         \
	VSUBECUQ X1, Y1, CAR1, SEL1 \
	VSUBEUQM X1, Y1, CAR1, T1   \
	VSUBUQM  ZER, SEL1, SEL1    \ // VSQ
	                            \
	VADDCUQ  T0, PL, CAR1       \ // VACCQ
	VADDUQM  T0, PL, TT0        \ // VAQ
	VADDEUQM T1, PH, CAR1, TT1  \ // VACQ
	                            \
	VSEL     TT0, T0, SEL1, T0  \
	VSEL     TT1, T1, SEL1, T1  \

#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
	VADDCUQ  X0, Y0, CAR1        \
	VADDUQM  X0, Y0, T0          \
	VADDECUQ X1, Y1, CAR1, T2    \ // VACCCQ
	VADDEUQM X1, Y1, CAR1, T1    \
	                             \
	VSPLTISB $0, ZER             \
	VSUBCUQ  T0, PL, CAR1        \ // VSCBIQ
	VSUBUQM  T0, PL, TT0         \
	VSUBECUQ T1, PH, CAR1, CAR2  \ // VSBCBIQ
	VSUBEUQM T1, PH, CAR1, TT1   \ // VSBIQ
	VSUBEUQM T2, ZER, CAR2, SEL1 \
	                             \
	VSEL     TT0, T0, SEL1, T0   \
	VSEL     TT1, T1, SEL1, T1

#define p256HalfInternal(T1, T0, X1, X0) \
	VSPLTISB $0, ZER            \
	VSUBEUQM ZER, ZER, X0, SEL1 \
	                            \
	VADDCUQ  X0, PL, CAR1       \
	VADDUQM  X0, PL, T0         \
	VADDECUQ X1, PH, CAR1, T2   \
	VADDEUQM X1, PH, CAR1, T1   \
	                            \
	VSEL     T0, X0, SEL1, T0   \
	VSEL     T1, X1, SEL1, T1   \
	VSEL     T2, ZER, SEL1, T2  \
	                            \
	VSLDOI   $15, T2, ZER, TT1  \
	VSLDOI   $15, T1, ZER, TT0  \
	VSPLTISB $1, SEL1           \
	VSR      T0, SEL1, T0       \ // VSRL
	VSR      T1, SEL1, T1       \
	VSPLTISB $7, SEL1           \ // VREPIB
	VSL      TT0, SEL1, TT0     \
	VSL      TT1, SEL1, TT1     \
	VOR      T0, TT0, T0        \
	VOR      T1, TT1, T1

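// Hedged Go models of the three macros above (x, y, p as 256-bit values):
//
//	sub: t := x - y; if borrow { t += p }    // p256SubInternal
//	add: t := x + y; if t >= p { t -= p }    // p256AddInternal
//
//	// p256HalfInternal: if x is odd, add p first (p is odd, so x+p is
//	// even), then shift right one bit, keeping the carry-out bit.
//	func halfModel(x, p *big.Int) *big.Int {
//		t := new(big.Int).Set(x)
//		if t.Bit(0) == 1 {
//			t.Add(t, p)
//		}
//		return t.Rsh(t, 1)
//	}
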
#define res_ptr R3
#define x_ptr   R4
#define y_ptr   R5
#define CPOOL   R7
#define TEMP    R8
#define N       R9

// Parameters
#define X0    V0
#define X1    V1
#define Y0    V2
#define Y1    V3
#define T0    V4
#define T1    V5

// Constants
#define P0    V30
#define P1    V31
// func p256Mul(res, in1, in2 *p256Element)
TEXT ·p256Mul(SB), NOSPLIT, $0-24
	MOVD res+0(FP), res_ptr
	MOVD in1+8(FP), x_ptr
	MOVD in2+16(FP), y_ptr
	MOVD $16, R16
	MOVD $32, R17

	MOVD $p256mul<>+0x00(SB), CPOOL

	LXVD2X (R0)(x_ptr), X0
	LXVD2X (R16)(x_ptr), X1

	XXPERMDI X0, X0, $2, X0
	XXPERMDI X1, X1, $2, X1

	LXVD2X (R0)(y_ptr), Y0
	LXVD2X (R16)(y_ptr), Y1

	XXPERMDI Y0, Y0, $2, Y0
	XXPERMDI Y1, Y1, $2, Y1

	LXVD2X (R16)(CPOOL), P1
	LXVD2X (R0)(CPOOL), P0

	CALL sm2p256MulInternal<>(SB)

	MOVD $p256mul<>+0x00(SB), CPOOL // appears redundant: sm2p256MulInternal does not clobber CPOOL (R7)

	XXPERMDI T0, T0, $2, T0
	XXPERMDI T1, T1, $2, T1
	STXVD2X T0, (R0)(res_ptr)
	STXVD2X T1, (R16)(res_ptr)
	RET

  1692  // func p256Sqr(res, in *p256Element, n int)
  1693  TEXT ·p256Sqr(SB), NOSPLIT, $0-24
  1694  	MOVD res+0(FP), res_ptr
  1695  	MOVD in+8(FP), x_ptr
  1696  	MOVD	n+16(FP), N	
  1697  	MOVD $16, R16
  1698  	MOVD $32, R17
  1699  
  1700  	MOVD $p256mul<>+0x00(SB), CPOOL
  1701  	LXVD2X (R16)(CPOOL), P1
  1702  	LXVD2X (R0)(CPOOL), P0
  1703  
  1704  	LXVD2X (R0)(x_ptr), X0
  1705  	LXVD2X (R16)(x_ptr), X1
  1706  
  1707  	XXPERMDI X0, X0, $2, X0
  1708  	XXPERMDI X1, X1, $2, X1
  1709  
  1710  sqrLoop:
  1711  	// Sqr uses same value for both
  1712  
  1713  	VOR	X0, X0, Y0
  1714  	VOR	X1, X1, Y1
  1715  
  1716  	CALL sm2p256MulInternal<>(SB)
  1717  
  1718  	ADD	$-1, N
  1719  	CMP	$0, N
  1720  	BEQ	done
  1721  	VOR	T0, T0, X0
  1722  	VOR	T1, T1, X1
  1723  	BR	sqrLoop
  1724  
  1725  done:
  1726  	XXPERMDI T0, T0, $2, T0
  1727  	XXPERMDI T1, T1, $2, T1
  1728  	STXVD2X T0, (R0)(res_ptr)
  1729  	STXVD2X T1, (R16)(res_ptr)
  1730  	RET
  1731  
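        // Behaviorally, p256Sqr applies n Montgomery squarings in a row
        // (sketch, reusing montMul from the note above; staying in the
        // Montgomery domain means no conversion is needed between passes):
        //
        //	for i := 0; i < n; i++ {
        //		x = montMul(x, x, p, rInv)
        //	}
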
  1732  #undef res_ptr
  1733  #undef x_ptr
  1734  #undef y_ptr
  1735  #undef CPOOL
  1736  
  1737  #undef X0
  1738  #undef X1
  1739  #undef Y0
  1740  #undef Y1
  1741  #undef T0
  1742  #undef T1
  1743  #undef P0
  1744  #undef P1
  1745  
  1746  #define P3ptr   R3
  1747  #define P1ptr   R4
  1748  #define P2ptr   R5
  1749  #define CPOOL   R7
  1750  
  1751  // Temporaries in REGs
  1752  #define Y2L    V15
  1753  #define Y2H    V16
  1754  #define T1L    V17
  1755  #define T1H    V18
  1756  #define T2L    V19
  1757  #define T2H    V20
  1758  #define T3L    V21
  1759  #define T3H    V22
  1760  #define T4L    V23
  1761  #define T4H    V24
  1762  
  1763  // Temps for Sub and Add
  1764  #define TT0  V11
  1765  #define TT1  V12
  1766  #define T2   V13
  1767  
  1768  // p256MulAsm Parameters
  1769  #define X0    V0
  1770  #define X1    V1
  1771  #define Y0    V2
  1772  #define Y1    V3
  1773  #define T0    V4
  1774  #define T1    V5
  1775  
  1776  #define PL    V30
  1777  #define PH    V31
  1778  
  1779  // Names for zero/sel selects
  1780  #define X1L    V0
  1781  #define X1H    V1
  1782  #define Y1L    V2 // p256MulAsmParmY
  1783  #define Y1H    V3 // p256MulAsmParmY
  1784  #define Z1L    V4
  1785  #define Z1H    V5
  1786  #define X2L    V0
  1787  #define X2H    V1
  1788  #define Z2L    V4
  1789  #define Z2H    V5
  1790  #define X3L    V17 // T1L
  1791  #define X3H    V18 // T1H
  1792  #define Y3L    V21 // T3L
  1793  #define Y3H    V22 // T3H
  1794  #define Z3L    V25
  1795  #define Z3H    V26
  1796  
  1797  #define ZER   V6
  1798  #define SEL1  V7
  1799  #define CAR1  V8
  1800  #define CAR2  V9
  1801  /* *
  1802   * Three operand formula:
  1803   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1804   * T1 = Z1²
  1805   * T2 = T1*Z1
  1806   * T1 = T1*X2
  1807   * T2 = T2*Y2
  1808   * T1 = T1-X1
  1809   * T2 = T2-Y1
  1810   * Z3 = Z1*T1
  1811   * T3 = T1²
  1812   * T4 = T3*T1
  1813   * T3 = T3*X1
  1814   * T1 = 2*T3
  1815   * X3 = T2²
  1816   * X3 = X3-T1
  1817   * X3 = X3-T4
  1818   * T3 = T3-X3
  1819   * T3 = T3*T2
  1820   * T4 = T4*Y1
  1821   * Y3 = T3-T4
  1822  
  1823   * Three operand formulas, but with MulInternal X,Y used to store temps
  1824  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1825  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1826  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1827  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1828  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1829  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1830  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1831  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1832  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1833  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1834  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1835  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1836  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1837  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1838  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1839  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1840  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1841  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1842  */
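        // A plain big.Int sketch of the schedule above (not part of this file;
        // values in ordinary, non-Montgomery form, reductions deferred to the
        // end, and the odd/identity cases handled by the caller):
        //
        //	func addAffine(x1, y1, z1, x2, y2, p *big.Int) (x3, y3, z3 *big.Int) {
        //		t1 := new(big.Int).Mul(z1, z1) // T1 = Z1²
        //		t2 := new(big.Int).Mul(t1, z1) // T2 = T1*Z1
        //		t1.Mul(t1, x2)                 // T1 = T1*X2
        //		t2.Mul(t2, y2)                 // T2 = T2*Y2
        //		t1.Sub(t1, x1)                 // T1 = T1-X1
        //		t2.Sub(t2, y1)                 // T2 = T2-Y1
        //		z3 = new(big.Int).Mul(z1, t1)  // Z3 = Z1*T1
        //		t3 := new(big.Int).Mul(t1, t1) // T3 = T1²
        //		t4 := new(big.Int).Mul(t3, t1) // T4 = T3*T1
        //		t3.Mul(t3, x1)                 // T3 = T3*X1
        //		d2 := new(big.Int).Lsh(t3, 1)  // T1 = 2*T3
        //		x3 = new(big.Int).Mul(t2, t2)  // X3 = T2²
        //		x3.Sub(x3, d2)                 // X3 = X3-T1
        //		x3.Sub(x3, t4)                 // X3 = X3-T4
        //		t3.Sub(t3, x3)                 // T3 = T3-X3
        //		t3.Mul(t3, t2)                 // T3 = T3*T2
        //		t4.Mul(t4, y1)                 // T4 = T4*Y1
        //		y3 = new(big.Int).Sub(t3, t4)  // Y3 = T3-T4
        //		x3.Mod(x3, p)
        //		y3.Mod(y3, p)
        //		z3.Mod(z3, p)
        //		return
        //	}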
  1844  // func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
  1845  TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
  1846  	MOVD res+0(FP), P3ptr
  1847  	MOVD in1+8(FP), P1ptr
  1848  	MOVD in2+16(FP), P2ptr
  1849  
  1850  	MOVD $p256mul<>+0x00(SB), CPOOL
  1851  
  1852  	MOVD $16, R16
  1853  	MOVD $32, R17
  1854  	MOVD $48, R18
  1855  	MOVD $64, R19
  1856  	MOVD $80, R20
  1857  	MOVD $96, R21
  1858  	MOVD $112, R22
  1859  	MOVD $128, R23
  1860  	MOVD $144, R24
  1861  	MOVD $160, R25
  1862  	MOVD $88, R26 // offset of sign+24(FP): 24 + 64
  1863  
  1864  	LXVD2X (R16)(CPOOL), PH
  1865  	LXVD2X (R0)(CPOOL), PL
  1866  
  1867  	LXVD2X (R17)(P2ptr), Y2L
  1868  	LXVD2X (R18)(P2ptr), Y2H
  1869  	XXPERMDI Y2H, Y2H, $2, Y2H
  1870  	XXPERMDI Y2L, Y2L, $2, Y2L
  1871  
  1872  	// Equivalent of VLREPG sign+24(FP), SEL1
  1873  	LXVDSX   (R1)(R26), SEL1
  1874  	VSPLTISB $0, ZER
  1875  	VCMPEQUD SEL1, ZER, SEL1
  1876  
  1877  	VSUBCUQ  PL, Y2L, CAR1
  1878  	VSUBUQM  PL, Y2L, T1L
  1879  	VSUBEUQM PH, Y2H, CAR1, T1H
  1880  
  1881  	VSEL T1L, Y2L, SEL1, Y2L
  1882  	VSEL T1H, Y2H, SEL1, Y2H
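        	// Net effect (sketch): if sign != 0 { Y2 = P - Y2 }; the affine
        	// point is conditionally negated in constant time via the SEL1 mask.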
  1883  
  1884  /* *
  1885   * Three operand formula:
  1886   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1887   */
  1888  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  1889  	LXVD2X (R19)(P1ptr), X0     // Z1L
  1890  	LXVD2X (R20)(P1ptr), X1     // Z1H
  1891  	XXPERMDI X0, X0, $2, X0
  1892  	XXPERMDI X1, X1, $2, X1
  1893  	VOR    X0, X0, Y0
  1894  	VOR    X1, X1, Y1
  1895  	CALL   sm2p256MulInternal<>(SB)
  1896  
  1897  	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
  1898  	VOR  T0, T0, X0
  1899  	VOR  T1, T1, X1
  1900  	CALL sm2p256MulInternal<>(SB)
  1901  	VOR  T0, T0, T2L
  1902  	VOR  T1, T1, T2H
  1903  
  1904  	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
  1905  	MOVD   in2+16(FP), P2ptr
  1906  	LXVD2X (R0)(P2ptr), Y0      // X2L
  1907  	LXVD2X (R16)(P2ptr), Y1     // X2H
  1908  	XXPERMDI Y0, Y0, $2, Y0
  1909  	XXPERMDI Y1, Y1, $2, Y1
  1910  	CALL   sm2p256MulInternal<>(SB)
  1911  	VOR    T0, T0, T1L
  1912  	VOR    T1, T1, T1H
  1913  
  1914  	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  1915  	VOR  T2L, T2L, X0
  1916  	VOR  T2H, T2H, X1
  1917  	VOR  Y2L, Y2L, Y0
  1918  	VOR  Y2H, Y2H, Y1
  1919  	CALL sm2p256MulInternal<>(SB)
  1920  
  1921  	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
  1922  	MOVD   in1+8(FP), P1ptr
  1923  	LXVD2X (R17)(P1ptr), Y1L
  1924  	LXVD2X (R18)(P1ptr), Y1H
  1925  	XXPERMDI Y1H, Y1H, $2, Y1H
  1926  	XXPERMDI Y1L, Y1L, $2, Y1L
  1927  	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  1928  
  1929  	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
  1930  	LXVD2X (R0)(P1ptr), X1L
  1931  	LXVD2X (R16)(P1ptr), X1H
  1932  	XXPERMDI X1H, X1H, $2, X1H
  1933  	XXPERMDI X1L, X1L, $2, X1L
  1934  	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  1935  
  1936  	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  1937  	LXVD2X (R19)(P1ptr), X0     // Z1L
  1938  	LXVD2X (R20)(P1ptr), X1     // Z1H
  1939  	XXPERMDI X0, X0, $2, X0
  1940  	XXPERMDI X1, X1, $2, X1
  1941  	CALL   sm2p256MulInternal<>(SB)
  1942  
  1943  	VOR T0, T0, Z3L
  1944  	VOR T1, T1, Z3H
  1945  
  1946  	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  1947  	VOR  Y0, Y0, X0
  1948  	VOR  Y1, Y1, X1
  1949  	CALL sm2p256MulInternal<>(SB)
  1950  	VOR  T0, T0, X0
  1951  	VOR  T1, T1, X1
  1952  
  1953  	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
  1954  	CALL sm2p256MulInternal<>(SB)
  1955  	VOR  T0, T0, T4L
  1956  	VOR  T1, T1, T4H
  1957  
  1958  	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  1959  	MOVD   in1+8(FP), P1ptr
  1960  	LXVD2X (R0)(P1ptr), Y0      // X1L
  1961  	LXVD2X (R16)(P1ptr), Y1     // X1H
  1962  	XXPERMDI Y1, Y1, $2, Y1
  1963  	XXPERMDI Y0, Y0, $2, Y0
  1964  	CALL   sm2p256MulInternal<>(SB)
  1965  	VOR    T0, T0, T3L
  1966  	VOR    T1, T1, T3H
  1967  
  1968  	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
  1969  	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  1970  
  1971  	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  1972  	VOR  T2L, T2L, X0
  1973  	VOR  T2H, T2H, X1
  1974  	VOR  T2L, T2L, Y0
  1975  	VOR  T2H, T2H, Y1
  1976  	CALL sm2p256MulInternal<>(SB)
  1977  
  1978  	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  1979  	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  1980  
  1981  	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
  1982  	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  1983  	VOR T0, T0, X3L
  1984  	VOR T1, T1, X3H
  1985  
  1986  	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  1987  	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  1988  
  1989  	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
  1990  	CALL sm2p256MulInternal<>(SB)
  1991  	VOR  T0, T0, T3L
  1992  	VOR  T1, T1, T3H
  1993  
  1994  	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
  1995  	VOR    T4L, T4L, X0
  1996  	VOR    T4H, T4H, X1
  1997  	MOVD   in1+8(FP), P1ptr
  1998  	LXVD2X (R17)(P1ptr), Y0     // Y1L
  1999  	LXVD2X (R18)(P1ptr), Y1     // Y1H
  2000  	XXPERMDI Y0, Y0, $2, Y0
  2001  	XXPERMDI Y1, Y1, $2, Y1
  2002  	CALL   sm2p256MulInternal<>(SB)
  2003  
  2004  	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
  2005  	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  2006  
  2007  	//	if (sel == 0) {
  2008  	//		copy(P3.x[:], X1)
  2009  	//		copy(P3.y[:], Y1)
  2010  	//		copy(P3.z[:], Z1)
  2011  	//	}
  2012  
  2013  	LXVD2X (R0)(P1ptr), X1L
  2014  	LXVD2X (R16)(P1ptr), X1H
  2015  	XXPERMDI X1H, X1H, $2, X1H
  2016  	XXPERMDI X1L, X1L, $2, X1L
  2017  
  2018  	// Y1L/Y1H already loaded above for the T2 = T2-Y1 subtraction
  2019  	LXVD2X (R19)(P1ptr), Z1L
  2020  	LXVD2X (R20)(P1ptr), Z1H
  2021  	XXPERMDI Z1H, Z1H, $2, Z1H
  2022  	XXPERMDI Z1L, Z1L, $2, Z1L
  2023  
  2024  	LXVDSX   (R1)(R21), SEL1    // Load the 64-bit value at sel+32(FP)
  2025  	VSPLTISB $0, ZER
  2026  	VCMPEQUD SEL1, ZER, SEL1
  2027  
  2028  	VSEL X3L, X1L, SEL1, X3L
  2029  	VSEL X3H, X1H, SEL1, X3H
  2030  	VSEL Y3L, Y1L, SEL1, Y3L
  2031  	VSEL Y3H, Y1H, SEL1, Y3H
  2032  	VSEL Z3L, Z1L, SEL1, Z3L
  2033  	VSEL Z3H, Z1H, SEL1, Z3H
  2034  
  2035  	MOVD   in2+16(FP), P2ptr
  2036  	LXVD2X (R0)(P2ptr), X2L
  2037  	LXVD2X (R16)(P2ptr), X2H
  2038  	XXPERMDI X2H, X2H, $2, X2H
  2039  	XXPERMDI X2L, X2L, $2, X2L
  2040  
  2041  	// Y2 already loaded
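        	// Z2 is loaded from the constant pool: R mod P, i.e. 1 in
        	// Montgomery form, since the affine in2 has an implicit Z = 1.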
  2042  	LXVD2X (R19)(CPOOL), Z2L
  2043  	LXVD2X (R20)(CPOOL), Z2H
  2044  
  2045  	LXVDSX   (R1)(R22), SEL1    // Get the value from zero+40(FP)
  2046  	VSPLTISB $0, ZER
  2047  	VCMPEQUD SEL1, ZER, SEL1
  2048  
  2049  	VSEL X3L, X2L, SEL1, X3L
  2050  	VSEL X3H, X2H, SEL1, X3H
  2051  	VSEL Y3L, Y2L, SEL1, Y3L
  2052  	VSEL Y3H, Y2H, SEL1, Y3H
  2053  	VSEL Z3L, Z2L, SEL1, Z3L
  2054  	VSEL Z3H, Z2H, SEL1, Z3H
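        	// Net effect of the two masked-select chains (sketch; the first case
        	// is spelled out in the Go-style comment above, the second mirrors it
        	// for in2):
        	//
        	//	if sel == 0  { P3 = P1 }
        	//	if zero == 0 { P3 = P2 }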
  2055  
  2056  	// Reorder the bytes so they can be stored using STXVD2X.
  2057  	MOVD    res+0(FP), P3ptr
  2058  	XXPERMDI X3H, X3H, $2, X3H
  2059  	XXPERMDI X3L, X3L, $2, X3L
  2060  	XXPERMDI Y3H, Y3H, $2, Y3H
  2061  	XXPERMDI Y3L, Y3L, $2, Y3L
  2062  	XXPERMDI Z3H, Z3H, $2, Z3H
  2063  	XXPERMDI Z3L, Z3L, $2, Z3L
  2064  	STXVD2X X3L, (R0)(P3ptr)
  2065  	STXVD2X X3H, (R16)(P3ptr)
  2066  	STXVD2X Y3L, (R17)(P3ptr)
  2067  	STXVD2X Y3H, (R18)(P3ptr)
  2068  	STXVD2X Z3L, (R19)(P3ptr)
  2069  	STXVD2X Z3H, (R20)(P3ptr)
  2070  
  2071  	RET
  2072  
  2073  #undef P3ptr
  2074  #undef P1ptr
  2075  #undef P2ptr
  2076  #undef CPOOL
  2077  
  2078  #undef Y2L
  2079  #undef Y2H
  2080  #undef T1L
  2081  #undef T1H
  2082  #undef T2L
  2083  #undef T2H
  2084  #undef T3L
  2085  #undef T3H
  2086  #undef T4L
  2087  #undef T4H
  2088  
  2089  #undef TT0
  2090  #undef TT1
  2091  #undef T2
  2092  
  2093  #undef X0
  2094  #undef X1
  2095  #undef Y0
  2096  #undef Y1
  2097  #undef T0
  2098  #undef T1
  2099  
  2100  #undef PL
  2101  #undef PH
  2102  
  2103  #undef X1L
  2104  #undef X1H
  2105  #undef Y1L
  2106  #undef Y1H
  2107  #undef Z1L
  2108  #undef Z1H
  2109  #undef X2L
  2110  #undef X2H
  2111  #undef Z2L
  2112  #undef Z2H
  2113  #undef X3L
  2114  #undef X3H
  2115  #undef Y3L
  2116  #undef Y3H
  2117  #undef Z3L
  2118  #undef Z3H
  2119  
  2120  #undef ZER
  2121  #undef SEL1
  2122  #undef CAR1
  2123  #undef CAR2
  2124  
  2125  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  2126  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  2127  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  2128  #define P3ptr   R3
  2129  #define P1ptr   R4
  2130  #define CPOOL   R7
  2131  
  2132  // Temporaries in REGs
  2133  #define X3L    V15
  2134  #define X3H    V16
  2135  #define Y3L    V17
  2136  #define Y3H    V18
  2137  #define T1L    V19
  2138  #define T1H    V20
  2139  #define T2L    V21
  2140  #define T2H    V22
  2141  #define T3L    V23
  2142  #define T3H    V24
  2143  
  2144  #define X1L    V6
  2145  #define X1H    V7
  2146  #define Y1L    V8
  2147  #define Y1H    V9
  2148  #define Z1L    V10
  2149  #define Z1H    V11
  2150  
  2151  // Temps for Sub and Add
  2152  #define TT0  V11
  2153  #define TT1  V12
  2154  #define T2   V13
  2155  
  2156  // p256MulAsm Parameters
  2157  #define X0    V0
  2158  #define X1    V1
  2159  #define Y0    V2
  2160  #define Y1    V3
  2161  #define T0    V4
  2162  #define T1    V5
  2163  
  2164  #define PL    V30
  2165  #define PH    V31
  2166  
  2167  #define Z3L    V23
  2168  #define Z3H    V24
  2169  
  2170  #define ZER   V26
  2171  #define SEL1  V27
  2172  #define CAR1  V28
  2173  #define CAR2  V29
  2174  /*
  2175   * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  2176   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  2177   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  2178   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  2179   * 	B  = 2Y₁
  2180   * 	Z₃ = B×Z₁
  2181   * 	C  = B²
  2182   * 	D  = C×X₁
  2183   * 	X₃ = A²-2D
  2184   * 	Y₃ = (D-X₃)×A-C²/2
  2185   *
  2186   * Three-operand formula:
  2187   *       T1 = Z1²
  2188   *       T2 = X1-T1
  2189   *       T1 = X1+T1
  2190   *       T2 = T2*T1
  2191   *       T2 = 3*T2
  2192   *       Y3 = 2*Y1
  2193   *       Z3 = Y3*Z1
  2194   *       Y3 = Y3²
  2195   *       T3 = Y3*X1
  2196   *       Y3 = Y3²
  2197   *       Y3 = half*Y3
  2198   *       X3 = T2²
  2199   *       T1 = 2*T3
  2200   *       X3 = X3-T1
  2201   *       T1 = T3-X3
  2202   *       T1 = T1*T2
  2203   *       Y3 = T1-Y3
  2204   */
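        //
        // A plain big.Int sketch of this schedule (not part of this file;
        // half = (p+1)/2 = 2⁻¹ mod p is an input of the sketch, and all
        // reductions are deferred to the end):
        //
        //	func double(x1, y1, z1, p, half *big.Int) (x3, y3, z3 *big.Int) {
        //		t1 := new(big.Int).Mul(z1, z1) // T1 = Z1²
        //		t2 := new(big.Int).Sub(x1, t1) // T2 = X1-T1
        //		t1.Add(x1, t1)                 // T1 = X1+T1
        //		t2.Mul(t2, t1)                 // T2 = T2*T1
        //		t2.Mul(t2, big.NewInt(3))      // T2 = 3*T2
        //		y3 = new(big.Int).Lsh(y1, 1)   // Y3 = 2*Y1
        //		z3 = new(big.Int).Mul(y3, z1)  // Z3 = Y3*Z1
        //		y3.Mul(y3, y3)                 // Y3 = Y3²
        //		t3 := new(big.Int).Mul(y3, x1) // T3 = Y3*X1
        //		y3.Mul(y3, y3)                 // Y3 = Y3²
        //		y3.Mul(y3, half)               // Y3 = half*Y3
        //		x3 = new(big.Int).Mul(t2, t2)  // X3 = T2²
        //		t1.Lsh(t3, 1)                  // T1 = 2*T3
        //		x3.Sub(x3, t1)                 // X3 = X3-T1
        //		t1.Sub(t3, x3)                 // T1 = T3-X3
        //		t1.Mul(t1, t2)                 // T1 = T1*T2
        //		y3.Sub(t1, y3)                 // Y3 = T1-Y3
        //		x3.Mod(x3, p)
        //		y3.Mod(y3, p)
        //		z3.Mod(z3, p)
        //		return
        //	}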
  2205  
  2206  #define p256PointDoubleRound(P1ptr, P3ptr) \
  2207  	\ // X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  2208  	LXVD2X (R19)(P1ptr), X0 \ // Z1L
  2209  	LXVD2X (R20)(P1ptr), X1 \ // Z1H
  2210  	\
  2211  	XXPERMDI X0, X0, $2, X0 \
  2212  	XXPERMDI X1, X1, $2, X1 \
  2213  	\
  2214  	VOR  X0, X0, Y0	\
  2215  	VOR  X1, X1, Y1	\
  2216  	CALL sm2p256MulInternal<>(SB)	\
  2217  	\
  2218  	\ // SUB(X<X1-T)            // T2 = X1-T1
  2219  	LXVD2X (R0)(P1ptr), X1L	\
  2220  	LXVD2X (R16)(P1ptr), X1H	\
  2221  	XXPERMDI X1L, X1L, $2, X1L	\
  2222  	XXPERMDI X1H, X1H, $2, X1H	\
  2223  	\
  2224  	p256SubInternal(X1,X0,X1H,X1L,T1,T0)	\
  2225  	\
  2226  	\ // ADD(Y<X1+T)            // T1 = X1+T1
  2227  	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)	\
  2228  	\
  2229  	\ // X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  2230  	CALL sm2p256MulInternal<>(SB)	\
  2231  	\
  2232  	\ // ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  2233  	p256AddInternal(T2H,T2L,T1,T0,T1,T0)	\
  2234  	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)	\
  2235  	\
  2236  	\ // ADD(X<Y1+Y1)           // Y3 = 2*Y1
  2237  	LXVD2X (R17)(P1ptr), Y1L	\
  2238  	LXVD2X (R18)(P1ptr), Y1H	\
  2239  	XXPERMDI Y1L, Y1L, $2, Y1L	\
  2240  	XXPERMDI Y1H, Y1H, $2, Y1H	\
  2241  	\
  2242  	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)	\
  2243  	\
  2244  	\ // X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
  2245  	LXVD2X (R19)(P1ptr), Y0	\
  2246  	LXVD2X (R20)(P1ptr), Y1	\
  2247  	XXPERMDI Y0, Y0, $2, Y0	\
  2248  	XXPERMDI Y1, Y1, $2, Y1	\
  2249  	\
  2250  	CALL sm2p256MulInternal<>(SB)	\
  2251  	\
  2252  	\ // Leave T0, T1 as is.
  2253  	XXPERMDI T0, T0, $2, TT0	\
  2254  	XXPERMDI T1, T1, $2, TT1	\
  2255  	STXVD2X TT0, (R19)(P3ptr)	\
  2256  	STXVD2X TT1, (R20)(P3ptr)	\
  2257  	\
  2258  	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2259  	VOR  X0, X0, Y0	\
  2260  	VOR  X1, X1, Y1	\
  2261  	CALL sm2p256MulInternal<>(SB)	\
  2262  	\
  2263  	\ // X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
  2264  	VOR    T0, T0, X0	\
  2265  	VOR    T1, T1, X1	\
  2266  	LXVD2X (R0)(P1ptr), Y0	\
  2267  	LXVD2X (R16)(P1ptr), Y1	\
  2268  	XXPERMDI Y0, Y0, $2, Y0	\
  2269  	XXPERMDI Y1, Y1, $2, Y1	\
  2270  	CALL   sm2p256MulInternal<>(SB)	\
  2271  	VOR    T0, T0, T3L	\
  2272  	VOR    T1, T1, T3H	\
  2273  	\
  2274  	\ // X-  ; Y=X ; MUL; T-    // Y3 = Y3²
  2275  	VOR  X0, X0, Y0	\
  2276  	VOR  X1, X1, Y1	\
  2277  	CALL sm2p256MulInternal<>(SB)	\
  2278  	\
  2279  	\ // HAL(Y3<T)              // Y3 = half*Y3
  2280  	p256HalfInternal(Y3H,Y3L, T1,T0)	\
  2281  	\
  2282  	\ // X=T2; Y=T2; MUL; T-    // X3 = T2²
  2283  	VOR  T2L, T2L, X0	\
  2284  	VOR  T2H, T2H, X1	\
  2285  	VOR  T2L, T2L, Y0	\
  2286  	VOR  T2H, T2H, Y1	\
  2287  	CALL sm2p256MulInternal<>(SB)	\
  2288  	\
  2289  	\ // ADD(T1<T3+T3)          // T1 = 2*T3
  2290  	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)	\
  2291  	\
  2292  	\ // SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  2293  	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)	\
  2294  	\
  2295  	XXPERMDI X3L, X3L, $2, TT0	\
  2296  	XXPERMDI X3H, X3H, $2, TT1	\
  2297  	STXVD2X TT0, (R0)(P3ptr)	\
  2298  	STXVD2X TT1, (R16)(P3ptr)	\
  2299  	\
  2300  	\ // SUB(X<T3-X3)           // T1 = T3-X3
  2301  	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)	\
  2302  	\
  2303  	\ // X-  ; Y-  ; MUL; T-    // T1 = T1*T2
  2304  	CALL sm2p256MulInternal<>(SB)	\
  2305  	\
  2306  	\ // SUB(Y3<T-Y3)           // Y3 = T1-Y3
  2307  	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)	\
  2308  	\
  2309  	XXPERMDI Y3L, Y3L, $2, Y3L	\
  2310  	XXPERMDI Y3H, Y3H, $2, Y3H	\
  2311  	STXVD2X Y3L, (R17)(P3ptr)	\
  2312  	STXVD2X Y3H, (R18)(P3ptr)
  2313  
  2314  // func p256PointDoubleAsm(res, in *P256Point)
  2315  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0-16
  2316  	MOVD res+0(FP), P3ptr
  2317  	MOVD in+8(FP), P1ptr
  2318  
  2319  	MOVD $p256mul<>+0x00(SB), CPOOL
  2320  
  2321  	MOVD $16, R16
  2322  	MOVD $32, R17
  2323  	MOVD $48, R18
  2324  	MOVD $64, R19
  2325  	MOVD $80, R20
  2326  
  2327  	LXVD2X (R16)(CPOOL), PH
  2328  	LXVD2X (R0)(CPOOL), PL
  2329  
  2330  	p256PointDoubleRound(P1ptr, P3ptr)
  2331  	RET
  2332  
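        // func p256PointDouble6TimesAsm(res, in *P256Point)
        //
        // Six chained doublings: the first round reads from in, the remaining
        // five read and write res in place.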
  2333  TEXT ·p256PointDouble6TimesAsm(SB), NOSPLIT, $0-16
  2334  	MOVD res+0(FP), P3ptr
  2335  	MOVD in+8(FP), P1ptr
  2336  
  2337  	MOVD $p256mul<>+0x00(SB), CPOOL
  2338  
  2339  	MOVD $16, R16
  2340  	MOVD $32, R17
  2341  	MOVD $48, R18
  2342  	MOVD $64, R19
  2343  	MOVD $80, R20
  2344  
  2345  	LXVD2X (R16)(CPOOL), PH
  2346  	LXVD2X (R0)(CPOOL), PL
  2347  
  2348  	p256PointDoubleRound(P1ptr, P3ptr)
  2349  	p256PointDoubleRound(P3ptr, P3ptr)
  2350  	p256PointDoubleRound(P3ptr, P3ptr)
  2351  	p256PointDoubleRound(P3ptr, P3ptr)
  2352  	p256PointDoubleRound(P3ptr, P3ptr)
  2353  	p256PointDoubleRound(P3ptr, P3ptr)
  2354  	RET
  2355  
  2356  #undef P3ptr
  2357  #undef P1ptr
  2358  #undef CPOOL
  2359  #undef X3L
  2360  #undef X3H
  2361  #undef Y3L
  2362  #undef Y3H
  2363  #undef T1L
  2364  #undef T1H
  2365  #undef T2L
  2366  #undef T2H
  2367  #undef T3L
  2368  #undef T3H
  2369  #undef X1L
  2370  #undef X1H
  2371  #undef Y1L
  2372  #undef Y1H
  2373  #undef Z1L
  2374  #undef Z1H
  2375  #undef TT0
  2376  #undef TT1
  2377  #undef T2
  2378  #undef X0
  2379  #undef X1
  2380  #undef Y0
  2381  #undef Y1
  2382  #undef T0
  2383  #undef T1
  2384  #undef PL
  2385  #undef PH
  2386  #undef Z3L
  2387  #undef Z3H
  2388  #undef ZER
  2389  #undef SEL1
  2390  #undef CAR1
  2391  #undef CAR2
  2392  
  2393  #define P3ptr  R3
  2394  #define P1ptr  R4
  2395  #define P2ptr  R5
  2396  #define CPOOL  R7
  2397  #define TRUE   R14
  2398  #define RES1   R9
  2399  #define RES2   R10
  2400  
  2401  // Temporaries in REGs
  2402  #define T1L   V16
  2403  #define T1H   V17
  2404  #define T2L   V18
  2405  #define T2H   V19
  2406  #define U1L   V20
  2407  #define U1H   V21
  2408  #define S1L   V22
  2409  #define S1H   V23
  2410  #define HL    V24
  2411  #define HH    V25
  2412  #define RL    V26
  2413  #define RH    V27
  2414  
  2415  // Temps for Sub and Add
  2416  #define ZER   V6
  2417  #define SEL1  V7
  2418  #define CAR1  V8
  2419  #define CAR2  V9
  2420  #define TT0  V11
  2421  #define TT1  V12
  2422  #define T2   V13
  2423  
  2424  // p256MulAsm Parameters
  2425  #define X0    V0
  2426  #define X1    V1
  2427  #define Y0    V2
  2428  #define Y1    V3
  2429  #define T0    V4
  2430  #define T1    V5
  2431  
  2432  #define PL    V30
  2433  #define PH    V31
  2434  /*
  2435   * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2436   *
  2437   * A = X₁×Z₂²
  2438   * B = Y₁×Z₂³
  2439   * C = X₂×Z₁²-A
  2440   * D = Y₂×Z₁³-B
  2441   * X₃ = D² - 2A×C² - C³
  2442   * Y₃ = D×(A×C² - X₃) - B×C³
  2443   * Z₃ = Z₁×Z₂×C
  2444   *
  2445   * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2446   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2447   *
  2448   * T1 = Z1*Z1
  2449   * T2 = Z2*Z2
  2450   * U1 = X1*T2
  2451   * H  = X2*T1
  2452   * H  = H-U1
  2453   * Z3 = Z1*Z2
  2454   * Z3 = Z3*H << store-out Z3 result reg; could override Z1 if the slices share a backing array
  2455   *
  2456   * S1 = Z2*T2
  2457   * S1 = Y1*S1
  2458   * R  = Z1*T1
  2459   * R  = Y2*R
  2460   * R  = R-S1
  2461   *
  2462   * T1 = H*H
  2463   * T2 = H*T1
  2464   * U1 = U1*T1
  2465   *
  2466   * X3 = R*R
  2467   * X3 = X3-T2
  2468   * T1 = 2*U1
  2469   * X3 = X3-T1 << store-out X3 result reg
  2470   *
  2471   * T2 = S1*T2
  2472   * Y3 = U1-X3
  2473   * Y3 = R*Y3
  2474   * Y3 = Y3-T2 << store-out Y3 result reg
  2475  
  2476  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2477  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2478  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2479  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2480  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2481  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2482  	// SUB(H<H-T)            // H  = H-U1
  2483  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2484  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg; could override Z1 if the slices share a backing array
  2485  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2486  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2487  	// SUB(R<T-S1)           // R  = R-S1
  2488  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2489  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2490  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2491  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2492  	// SUB(T<T-T2)           // X3 = X3-T2
  2493  	// ADD(X<U1+U1)          // T1 = 2*U1
  2494  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2495  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2496  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2497  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2498  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2499  	*/
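        //
        // A plain big.Int sketch of the schedule above (not part of this file;
        // ordinary domain, reductions deferred to the stored outputs, and the
        // P1 == P2 case left to the caller, as the ret flag below indicates):
        //
        //	func addJacobian(x1, y1, z1, x2, y2, z2, p *big.Int) (x3, y3, z3 *big.Int) {
        //		t1 := new(big.Int).Mul(z1, z1) // T1 = Z1*Z1
        //		t2 := new(big.Int).Mul(z2, z2) // T2 = Z2*Z2
        //		u1 := new(big.Int).Mul(x1, t2) // U1 = X1*T2
        //		h := new(big.Int).Mul(x2, t1)  // H  = X2*T1
        //		h.Sub(h, u1)                   // H  = H-U1
        //		z3 = new(big.Int).Mul(z1, z2)  // Z3 = Z1*Z2
        //		z3.Mul(z3, h).Mod(z3, p)       // Z3 = Z3*H
        //		s1 := new(big.Int).Mul(z2, t2) // S1 = Z2*T2
        //		s1.Mul(y1, s1)                 // S1 = Y1*S1
        //		r := new(big.Int).Mul(z1, t1)  // R  = Z1*T1
        //		r.Mul(y2, r)                   // R  = Y2*R
        //		r.Sub(r, s1)                   // R  = R-S1
        //		t1.Mul(h, h)                   // T1 = H*H
        //		t2.Mul(h, t1)                  // T2 = H*T1
        //		u1.Mul(u1, t1)                 // U1 = U1*T1
        //		x3 = new(big.Int).Mul(r, r)    // X3 = R*R
        //		x3.Sub(x3, t2)                 // X3 = X3-T2
        //		t1.Lsh(u1, 1)                  // T1 = 2*U1
        //		x3.Sub(x3, t1).Mod(x3, p)      // X3 = X3-T1
        //		t2.Mul(s1, t2)                 // T2 = S1*T2
        //		y3 = new(big.Int).Sub(u1, x3)  // Y3 = U1-X3
        //		y3.Mul(r, y3)                  // Y3 = R*Y3
        //		y3.Sub(y3, t2).Mod(y3, p)      // Y3 = Y3-T2
        //		return
        //	}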
  2500  // func p256PointAddAsm(res, in1, in2 *P256Point) int
  2501  TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
  2502  	MOVD res+0(FP), P3ptr
  2503  	MOVD in1+8(FP), P1ptr
  2504  	MOVD $p256mul<>+0x00(SB), CPOOL
  2505  	MOVD $16, R16
  2506  	MOVD $32, R17
  2507  	MOVD $48, R18
  2508  	MOVD $64, R19
  2509  	MOVD $80, R20
  2510  
  2511  	LXVD2X (R16)(CPOOL), PH
  2512  	LXVD2X (R0)(CPOOL), PL
  2513  
  2514  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2515  	LXVD2X (R19)(P1ptr), X0     // Z1L
  2516  	LXVD2X (R20)(P1ptr), X1     // Z1H
  2517  	XXPERMDI X0, X0, $2, X0
  2518  	XXPERMDI X1, X1, $2, X1
  2519  	VOR    X0, X0, Y0
  2520  	VOR    X1, X1, Y1
  2521  	CALL   sm2p256MulInternal<>(SB)
  2522  
  2523  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2524  	VOR  T0, T0, Y0
  2525  	VOR  T1, T1, Y1
  2526  	CALL sm2p256MulInternal<>(SB)
  2527  	VOR  T0, T0, RL            // SAVE: RL
  2528  	VOR  T1, T1, RH            // SAVE: RH
  2529  
  2530  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2531  	MOVD   in2+16(FP), P2ptr
  2532  	LXVD2X (R0)(P2ptr), X0      // X2L
  2533  	LXVD2X (R16)(P2ptr), X1     // X2H
  2534  	XXPERMDI X0, X0, $2, X0
  2535  	XXPERMDI X1, X1, $2, X1
  2536  	CALL   sm2p256MulInternal<>(SB)
  2537  	VOR    T0, T0, HL            // SAVE: HL
  2538  	VOR    T1, T1, HH            // SAVE: HH
  2539  
  2540  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2541  	MOVD   in2+16(FP), P2ptr
  2542  	LXVD2X (R19)(P2ptr), X0     // Z2L
  2543  	LXVD2X (R20)(P2ptr), X1     // Z2H
  2544  	XXPERMDI X0, X0, $2, X0
  2545  	XXPERMDI X1, X1, $2, X1
  2546  	VOR    X0, X0, Y0
  2547  	VOR    X1, X1, Y1
  2548  	CALL   sm2p256MulInternal<>(SB)
  2549  
  2550  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2551  	VOR  T0, T0, Y0
  2552  	VOR  T1, T1, Y1
  2553  	CALL sm2p256MulInternal<>(SB)
  2554  	VOR  T0, T0, S1L           // SAVE: S1L
  2555  	VOR  T1, T1, S1H           // SAVE: S1H
  2556  
  2557  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2558  	MOVD   in1+8(FP), P1ptr
  2559  	LXVD2X (R0)(P1ptr), X0      // X1L
  2560  	LXVD2X (R16)(P1ptr), X1     // X1H
  2561  	XXPERMDI X0, X0, $2, X0
  2562  	XXPERMDI X1, X1, $2, X1
  2563  	CALL   sm2p256MulInternal<>(SB)
  2564  	VOR    T0, T0, U1L           // SAVE: U1L
  2565  	VOR    T1, T1, U1H           // SAVE: U1H
  2566  
  2567  	// SUB(H<H-T)            // H  = H-U1
  2568  	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2569  
  2570  	// if H == 0 or H == P (i.e. H^P == 0) then ret=1 else ret=0
  2571  	// clobbers T1H and T1L
  2572  	MOVD       $1, TRUE
  2573  	VSPLTISB   $0, ZER
  2574  	VOR        HL, HH, T1H
  2575  	VCMPEQUDCC ZER, T1H, T1H
  2576  
  2577  	// 26 = CR6 NE
  2578  	ISEL       $26, R0, TRUE, RES1
  2579  	VXOR       HL, PL, T1L         // SAVE: T1L
  2580  	VXOR       HH, PH, T1H         // SAVE: T1H
  2581  	VOR        T1L, T1H, T1H
  2582  	VCMPEQUDCC ZER, T1H, T1H
  2583  
  2584  	// 26 = CR6 NE
  2585  	ISEL $26, R0, TRUE, RES2
  2586  	OR   RES2, RES1, RES1
  2587  	MOVD RES1, ret+24(FP)
  2588  
  2589  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2590  	MOVD   in1+8(FP), P1ptr
  2591  	MOVD   in2+16(FP), P2ptr
  2592  	LXVD2X (R19)(P1ptr), X0        // Z1L
  2593  	LXVD2X (R20)(P1ptr), X1        // Z1H
  2594  	XXPERMDI X0, X0, $2, X0
  2595  	XXPERMDI X1, X1, $2, X1
  2596  	LXVD2X (R19)(P2ptr), Y0        // Z2L
  2597  	LXVD2X (R20)(P2ptr), Y1        // Z2H
  2598  	XXPERMDI Y0, Y0, $2, Y0
  2599  	XXPERMDI Y1, Y1, $2, Y1
  2600  	CALL   sm2p256MulInternal<>(SB)
  2601  
  2602  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2603  	VOR     T0, T0, X0
  2604  	VOR     T1, T1, X1
  2605  	VOR     HL, HL, Y0
  2606  	VOR     HH, HH, Y1
  2607  	CALL    sm2p256MulInternal<>(SB)
  2608  	MOVD    res+0(FP), P3ptr
  2609  	XXPERMDI T1, T1, $2, TT1
  2610  	XXPERMDI T0, T0, $2, TT0
  2611  	STXVD2X TT0, (R19)(P3ptr)
  2612  	STXVD2X TT1, (R20)(P3ptr)
  2613  
  2614  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2615  	MOVD   in1+8(FP), P1ptr
  2616  	LXVD2X (R17)(P1ptr), X0
  2617  	LXVD2X (R18)(P1ptr), X1
  2618  	XXPERMDI X0, X0, $2, X0
  2619  	XXPERMDI X1, X1, $2, X1
  2620  	VOR    S1L, S1L, Y0
  2621  	VOR    S1H, S1H, Y1
  2622  	CALL   sm2p256MulInternal<>(SB)
  2623  	VOR    T0, T0, S1L
  2624  	VOR    T1, T1, S1H
  2625  
  2626  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2627  	MOVD   in2+16(FP), P2ptr
  2628  	LXVD2X (R17)(P2ptr), X0
  2629  	LXVD2X (R18)(P2ptr), X1
  2630  	XXPERMDI X0, X0, $2, X0
  2631  	XXPERMDI X1, X1, $2, X1
  2632  	VOR    RL, RL, Y0
  2633  	VOR    RH, RH, Y1
  2635  	CALL   sm2p256MulInternal<>(SB)
  2636  
  2637  	// SUB(R<T-S1)           // R  = R-S1
  2638  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2639  
  2640  	// if R == 0 or R == P (i.e. R^P == 0) then ret=ret else ret=0
  2641  	// clobbers T1H and T1L
  2642  	// TODO: consider rewriting this check sequence using ISEL throughout.
  2643  	MOVD       $1, TRUE
  2644  	VSPLTISB   $0, ZER
  2645  	VOR        RL, RH, T1H
  2646  	VCMPEQUDCC ZER, T1H, T1H
  2647  
  2648  	// 26 = CR6 NE
  2649  	ISEL       $26, R0, TRUE, RES1
  2650  	VXOR       RL, PL, T1L         // SAVE: T1L
  2651  	VXOR       RH, PH, T1H         // SAVE: T1H
  2652  	VOR        T1L, T1H, T1H
  2653  	VCMPEQUDCC ZER, T1H, T1H
  2654  
  2655  	// 26 = CR6 NE
  2656  	ISEL $26, R0, TRUE, RES2
  2657  	OR   RES2, RES1, RES1
  2658  	MOVD ret+24(FP), RES2
  2659  	AND  RES2, RES1, RES1
  2660  	MOVD RES1, ret+24(FP)
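        	// At this point ret = (H == 0 || H == P) && (R == 0 || R == P): the
        	// inputs had equal X and equal Y coordinates, a case this addition
        	// formula cannot handle, so the caller is expected to fall back to
        	// the doubling routine when ret != 0.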
  2661  
  2662  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2663  	VOR  HL, HL, X0
  2664  	VOR  HH, HH, X1
  2665  	VOR  HL, HL, Y0
  2666  	VOR  HH, HH, Y1
  2667  	CALL sm2p256MulInternal<>(SB)
  2668  
  2669  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2670  	VOR  T0, T0, Y0
  2671  	VOR  T1, T1, Y1
  2672  	CALL sm2p256MulInternal<>(SB)
  2673  	VOR  T0, T0, T2L
  2674  	VOR  T1, T1, T2H
  2675  
  2676  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2677  	VOR  U1L, U1L, X0
  2678  	VOR  U1H, U1H, X1
  2679  	CALL sm2p256MulInternal<>(SB)
  2680  	VOR  T0, T0, U1L
  2681  	VOR  T1, T1, U1H
  2682  
  2683  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2684  	VOR  RL, RL, X0
  2685  	VOR  RH, RH, X1
  2686  	VOR  RL, RL, Y0
  2687  	VOR  RH, RH, Y1
  2688  	CALL sm2p256MulInternal<>(SB)
  2689  
  2690  	// SUB(T<T-T2)           // X3 = X3-T2
  2691  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2692  
  2693  	// ADD(X<U1+U1)          // T1 = 2*U1
  2694  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2695  
  2696  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2697  	p256SubInternal(T1,T0,T1,T0,X1,X0)
  2698  	MOVD    res+0(FP), P3ptr
  2699  	XXPERMDI T1, T1, $2, TT1
  2700  	XXPERMDI T0, T0, $2, TT0
  2701  	STXVD2X TT0, (R0)(P3ptr)
  2702  	STXVD2X TT1, (R16)(P3ptr)
  2703  
  2704  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2705  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2706  
  2707  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2708  	VOR  RL, RL, X0
  2709  	VOR  RH, RH, X1
  2711  	CALL   sm2p256MulInternal<>(SB)
  2712  	VOR    T0, T0, U1L
  2713  	VOR    T1, T1, U1H
  2714  
  2715  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2716  	VOR  S1L, S1L, X0
  2717  	VOR  S1H, S1H, X1
  2718  	VOR  T2L, T2L, Y0
  2719  	VOR  T2H, T2H, Y1
  2720  	CALL sm2p256MulInternal<>(SB)
  2721  
  2722  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2723  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2724  	MOVD    res+0(FP), P3ptr
  2725  	XXPERMDI T1, T1, $2, TT1
  2726  	XXPERMDI T0, T0, $2, TT0
  2727  	STXVD2X TT0, (R17)(P3ptr)
  2728  	STXVD2X TT1, (R18)(P3ptr)
  2729  
  2730  	RET