github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/elliptic/p256_asm_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // p256ordK0: 32-bit word that p256OrdMul loads into K0 to derive each round's MK0 multiplier
     8  DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 // p256ord: 256-bit modulus used by p256OrdMul (the P-256 group order, per the symbol name), most-significant doubleword first
     9  DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff // p256ord, continued (offsets 0x00-0x0f are loaded as M1)
    10  DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 // p256ord, continued (offsets 0x10-0x1f are loaded as M0)
    11  DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 // p256ord, least-significant doubleword
    12  DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
    13  DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    14  DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
    15  DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    16  DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    17  DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    18  DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0
    19  DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    20  DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    21  DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    22  DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
    23  DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    24  DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
    25  DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    26  DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    27  DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    28  DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    29  DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    30  DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    31  DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    32  DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    33  DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    34  DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    35  DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    36  DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    37  DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    38  DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    39  DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    40  DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    41  DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    42  GLOBL p256ordK0<>(SB), 8, $4 // flag 8 is RODATA in textflag.h; 4 bytes
    43  GLOBL p256ord<>(SB), 8, $32 // read-only, 32 bytes
    44  GLOBL p256<>(SB), 8, $80 // read-only, 80 bytes: P256 plus three 16-byte VPERM selectors; p256FromMont loads offsets 0, 16, 48 and 64
    45  GLOBL p256mul<>(SB), 8, $160 // read-only, 160 bytes: P256, six 16-byte VPERM selectors and 2^256 mod P256; p256NegCond loads offsets 0 and 16
    46  
    47  // func hasVectorFacility() bool
        // Returns true only when both checks below pass:
        //   1. STFLE reports facility bit 129 (the vector facility in the
        //      z/Architecture facility list) as installed.
        //   2. A value written to a vector register can be read back.
        // Uses a 24-byte stack frame as the STFLE output buffer and a 1-byte result.
    48  TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
    49  	MOVD  $x-24(SP), R1 // R1 = address of the 24-byte scratch buffer
    50  	XC    $24, 0(R1), 0(R1) // clear the storage
    51  	MOVD  $2, R0            // R0 is the number of double words stored -1
    52  	WORD  $0xB2B01000       // STFLE 0(R1)
    53  	XOR   R0, R0            // reset the value of R0
    54  	MOVBZ z-8(SP), R1 // byte 16 of the buffer (x-24 + 16): facility bits 128-135, numbered from the most-significant bit
    55  	AND   $0x40, R1 // isolate the second-most-significant bit of that byte, i.e. facility bit 129
    56  	BEQ   novector // masked value is zero: facility not installed
    57  
    58  vectorinstalled:
    59  	// check if the vector instruction has been enabled
    60  	VLEIB  $0, $0xF, V16 // place the immediate 0xF in a byte element of V16
    61  	VLGVB  $0, V16, R1 // read that byte element back into R1
    62  	CMPBNE R1, $0xF, novector // a mismatch means the vector register did not hold the value
    63  	MOVB   $1, ret+0(FP) // have vx
    64  	RET
    65  
    66  novector:
    67  	MOVB $0, ret+0(FP)   // no vx
    68  	RET
    69  
    70  // ---------------------------------------
    71  // if cond == 0 val is left unchanged; otherwise val <- -val (Y <- P256 - Y)
        // Only bytes 32..63 of *val (the Y coordinate, per the register names) are
        // read and written. The select mask is built from "cond == 0", so any
        // nonzero cond negates; callers presumably pass only 0 or 1.
        // The negation is always computed and the result picked with VSEL, so
        // there is no branch on cond.
    72  // func p256NegCond(val *p256Point, cond int)
    73  #define P1ptr   R1
    74  #define CPOOL   R4
    75  
    76  #define Y1L   V0
    77  #define Y1H   V1
    78  #define T1L   V2
    79  #define T1H   V3
    80  
    81  #define PL    V30
    82  #define PH    V31
    83  
    84  #define ZER   V4
    85  #define SEL1  V5
    86  #define CAR1  V6
    87  TEXT ·p256NegCond(SB), NOSPLIT, $0
    88  	MOVD val+0(FP), P1ptr
    89  
    90  	MOVD $p256mul<>+0x00(SB), CPOOL
    91  	VL   16(CPOOL), PL // low 128 bits of P256 (pool offsets 0x10-0x1f)
    92  	VL   0(CPOOL), PH // high 128 bits of P256 (pool offsets 0x00-0x0f)
    93  
    94  	VL 32(P1ptr), Y1H // high half of Y is stored first
    95  	VL 48(P1ptr), Y1L
    96  
    97  	VLREPG cond+8(FP), SEL1 // replicate the 64-bit cond into both doubleword elements
    98  	VZERO  ZER
    99  	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when cond == 0, all zeros otherwise
   100  
   101  	VSCBIQ Y1L, PL, CAR1 // borrow indication for the low-half subtraction
   102  	VSQ    Y1L, PL, T1L // T1L = PL - Y1L
   103  	VSBIQ  PH, Y1H, CAR1, T1H // T1H = PH - Y1H with the borrow from the low half
   104  
   105  	VSEL Y1L, T1L, SEL1, Y1L // keep Y where SEL1 is set (cond == 0), else take P256 - Y
   106  	VSEL Y1H, T1H, SEL1, Y1H
   107  
   108  	VST Y1H, 32(P1ptr)
   109  	VST Y1L, 48(P1ptr)
   110  	RET
   111  
   112  #undef P1ptr
   113  #undef CPOOL
   114  #undef Y1L
   115  #undef Y1H
   116  #undef T1L
   117  #undef T1H
   118  #undef PL
   119  #undef PH
   120  #undef ZER
   121  #undef SEL1
   122  #undef CAR1
   123  
   124  // ---------------------------------------
   125  // if cond == 0 res <- b; else res <- a
        // Copies all 96 bytes of a point (X, Y, Z as six 16-byte vectors).
        // Both sources are always loaded in full and the choice is made with
        // VSEL, so there is no branch on cond.
   126  // func p256MovCond(res, a, b *p256Point, cond int)
   127  #define P3ptr   R1
   128  #define P1ptr   R2
   129  #define P2ptr   R3
   130  
   131  #define X1L    V0
   132  #define X1H    V1
   133  #define Y1L    V2
   134  #define Y1H    V3
   135  #define Z1L    V4
   136  #define Z1H    V5
   137  #define X2L    V6
   138  #define X2H    V7
   139  #define Y2L    V8
   140  #define Y2H    V9
   141  #define Z2L    V10
   142  #define Z2H    V11
   143  
   144  #define ZER   V18
   145  #define SEL1  V19
   146  TEXT ·p256MovCond(SB), NOSPLIT, $0
   147  	MOVD   res+0(FP), P3ptr
   148  	MOVD   a+8(FP), P1ptr
   149  	MOVD   b+16(FP), P2ptr
   150  	VLREPG cond+24(FP), SEL1 // replicate the 64-bit cond into both doubleword elements
   151  	VZERO  ZER
   152  	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones when cond == 0, all zeros otherwise
   153  
   154  	VL 0(P1ptr), X1H // point a: each coordinate is 32 bytes, high half first
   155  	VL 16(P1ptr), X1L
   156  	VL 32(P1ptr), Y1H
   157  	VL 48(P1ptr), Y1L
   158  	VL 64(P1ptr), Z1H
   159  	VL 80(P1ptr), Z1L
   160  
   161  	VL 0(P2ptr), X2H // point b, same layout
   162  	VL 16(P2ptr), X2L
   163  	VL 32(P2ptr), Y2H
   164  	VL 48(P2ptr), Y2L
   165  	VL 64(P2ptr), Z2H
   166  	VL 80(P2ptr), Z2L
   167  
   168  	VSEL X2L, X1L, SEL1, X1L // take b where SEL1 is set (cond == 0), else keep a
   169  	VSEL X2H, X1H, SEL1, X1H
   170  	VSEL Y2L, Y1L, SEL1, Y1L
   171  	VSEL Y2H, Y1H, SEL1, Y1H
   172  	VSEL Z2L, Z1L, SEL1, Z1L
   173  	VSEL Z2H, Z1H, SEL1, Z1H
   174  
   175  	VST X1H, 0(P3ptr)
   176  	VST X1L, 16(P3ptr)
   177  	VST Y1H, 32(P3ptr)
   178  	VST Y1L, 48(P3ptr)
   179  	VST Z1H, 64(P3ptr)
   180  	VST Z1L, 80(P3ptr)
   181  
   182  	RET
   183  
   184  #undef P3ptr
   185  #undef P1ptr
   186  #undef P2ptr
   187  #undef X1L
   188  #undef X1H
   189  #undef Y1L
   190  #undef Y1H
   191  #undef Z1L
   192  #undef Z1H
   193  #undef X2L
   194  #undef X2H
   195  #undef Y2L
   196  #undef Y2H
   197  #undef Z2L
   198  #undef Z2H
   199  #undef ZER
   200  #undef SEL1
   201  
   202  // ---------------------------------------
   203  // Constant time table access
   204  // Indexed from 1 to 16, with -1 offset (the loop below visits 16 entries)
   205  // (index 0 is implicitly point at infinity)
        // Every one of the 16 entries (96 bytes each) is loaded regardless of
        // idx; the matching one is merged into the result with VSEL. If no
        // counter value matches, the result stays all zeros.
        // Only the least-significant byte of idx is examined, and the slice
        // length of table is not consulted, so the caller must supply at least
        // 16 entries.
   206  // func p256Select(point *p256Point, table []p256Point, idx int)
   207  #define P3ptr   R1
   208  #define P1ptr   R2
   209  #define COUNT   R4
   210  
   211  #define X1L    V0
   212  #define X1H    V1
   213  #define Y1L    V2
   214  #define Y1H    V3
   215  #define Z1L    V4
   216  #define Z1H    V5
   217  #define X2L    V6
   218  #define X2H    V7
   219  #define Y2L    V8
   220  #define Y2H    V9
   221  #define Z2L    V10
   222  #define Z2H    V11
   223  
   224  #define ONE   V18
   225  #define IDX   V19
   226  #define SEL1  V20
   227  #define SEL2  V21
   228  TEXT ·p256Select(SB), NOSPLIT, $0
   229  	MOVD   point+0(FP), P3ptr
   230  	MOVD   table+8(FP), P1ptr // data pointer of the table slice
   231  	VLREPB idx+(32+7)(FP), IDX // idx starts at offset 32 (8-byte pointer + 24-byte slice header); +7 is its last byte, replicated to every byte of IDX
   232  	VREPIB $1, ONE // every byte = 1: the per-iteration increment
   233  	VREPIB $1, SEL2 // running entry number, starting at 1, held in every byte
   234  	MOVD   $1, COUNT // scalar loop counter, kept in step with SEL2
   235  
   236  	VZERO X1H // result accumulator starts as all zeros
   237  	VZERO X1L
   238  	VZERO Y1H
   239  	VZERO Y1L
   240  	VZERO Z1H
   241  	VZERO Z1L
   242  
   243  loop_select:
   244  	VL 0(P1ptr), X2H // load the current 96-byte table entry
   245  	VL 16(P1ptr), X2L
   246  	VL 32(P1ptr), Y2H
   247  	VL 48(P1ptr), Y2L
   248  	VL 64(P1ptr), Z2H
   249  	VL 80(P1ptr), Z2L
   250  
   251  	VCEQG SEL2, IDX, SEL1 // SEL1 = all ones when the entry number equals idx (both operands are byte-replicated)
   252  
   253  	VSEL X2L, X1L, SEL1, X1L // take the table entry where SEL1 is set, else keep the accumulator
   254  	VSEL X2H, X1H, SEL1, X1H
   255  	VSEL Y2L, Y1L, SEL1, Y1L
   256  	VSEL Y2H, Y1H, SEL1, Y1H
   257  	VSEL Z2L, Z1L, SEL1, Z1L
   258  	VSEL Z2H, Z1H, SEL1, Z1H
   259  
   260  	VAB  SEL2, ONE, SEL2 // next entry number (bytewise add)
   261  	ADDW $1, COUNT
   262  	ADD  $96, P1ptr // advance to the next entry
   263  	CMPW COUNT, $17
   264  	BLT  loop_select // loop while COUNT < 17, i.e. 16 iterations
   265  
   266  	VST X1H, 0(P3ptr)
   267  	VST X1L, 16(P3ptr)
   268  	VST Y1H, 32(P3ptr)
   269  	VST Y1L, 48(P3ptr)
   270  	VST Z1H, 64(P3ptr)
   271  	VST Z1L, 80(P3ptr)
   272  	RET
   273  
   274  #undef P3ptr
   275  #undef P1ptr
   276  #undef COUNT
   277  #undef X1L
   278  #undef X1H
   279  #undef Y1L
   280  #undef Y1H
   281  #undef Z1L
   282  #undef Z1H
   283  #undef X2L
   284  #undef X2H
   285  #undef Y2L
   286  #undef Y2H
   287  #undef Z2L
   288  #undef Z2H
   289  #undef ONE
   290  #undef IDX
   291  #undef SEL1
   292  #undef SEL2
   293  
   294  // ---------------------------------------
   295  // Constant time table access
   296  // Indexed from 1 to 64, with -1 offset (the loop below visits 64 entries)
   297  // (index 0 is implicitly point at infinity)
        // Same scheme as p256Select but over 64 entries of 96 bytes each: every
        // entry is loaded regardless of idx and the matching one is merged into
        // the result with VSEL. If no counter value matches, the result stays
        // all zeros.
        // Only the least-significant byte of idx is examined, and the slice
        // length of table is not consulted, so the caller must supply at least
        // 64 entries.
   298  // func p256SelectBase(point *p256Point, table []p256Point, idx int)
   299  #define P3ptr   R1
   300  #define P1ptr   R2
   301  #define COUNT   R4
   302  
   303  #define X1L    V0
   304  #define X1H    V1
   305  #define Y1L    V2
   306  #define Y1H    V3
   307  #define Z1L    V4
   308  #define Z1H    V5
   309  #define X2L    V6
   310  #define X2H    V7
   311  #define Y2L    V8
   312  #define Y2H    V9
   313  #define Z2L    V10
   314  #define Z2H    V11
   315  
   316  #define ONE   V18
   317  #define IDX   V19
   318  #define SEL1  V20
   319  #define SEL2  V21
   320  TEXT ·p256SelectBase(SB), NOSPLIT, $0
   321  	MOVD   point+0(FP), P3ptr
   322  	MOVD   table+8(FP), P1ptr // data pointer of the table slice
   323  	VLREPB idx+(32+7)(FP), IDX // idx starts at offset 32 (8-byte pointer + 24-byte slice header); +7 is its last byte, replicated to every byte of IDX
   324  	VREPIB $1, ONE // every byte = 1: the per-iteration increment
   325  	VREPIB $1, SEL2 // running entry number, starting at 1, held in every byte
   326  	MOVD   $1, COUNT // scalar loop counter, kept in step with SEL2
   327  
   328  	VZERO X1H // result accumulator starts as all zeros
   329  	VZERO X1L
   330  	VZERO Y1H
   331  	VZERO Y1L
   332  	VZERO Z1H
   333  	VZERO Z1L
   334  
   335  loop_select:
   336  	VL 0(P1ptr), X2H // load the current 96-byte table entry
   337  	VL 16(P1ptr), X2L
   338  	VL 32(P1ptr), Y2H
   339  	VL 48(P1ptr), Y2L
   340  	VL 64(P1ptr), Z2H
   341  	VL 80(P1ptr), Z2L
   342  
   343  	VCEQG SEL2, IDX, SEL1 // SEL1 = all ones when the entry number equals idx (both operands are byte-replicated)
   344  
   345  	VSEL X2L, X1L, SEL1, X1L // take the table entry where SEL1 is set, else keep the accumulator
   346  	VSEL X2H, X1H, SEL1, X1H
   347  	VSEL Y2L, Y1L, SEL1, Y1L
   348  	VSEL Y2H, Y1H, SEL1, Y1H
   349  	VSEL Z2L, Z1L, SEL1, Z1L
   350  	VSEL Z2H, Z1H, SEL1, Z1H
   351  
   352  	VAB  SEL2, ONE, SEL2 // next entry number (bytewise add)
   353  	ADDW $1, COUNT
   354  	ADD  $96, P1ptr // advance to the next entry
   355  	CMPW COUNT, $65
   356  	BLT  loop_select // loop while COUNT < 65, i.e. 64 iterations
   357  
   358  	VST X1H, 0(P3ptr)
   359  	VST X1L, 16(P3ptr)
   360  	VST Y1H, 32(P3ptr)
   361  	VST Y1L, 48(P3ptr)
   362  	VST Z1H, 64(P3ptr)
   363  	VST Z1L, 80(P3ptr)
   364  	RET
   365  
   366  #undef P3ptr
   367  #undef P1ptr
   368  #undef COUNT
   369  #undef X1L
   370  #undef X1H
   371  #undef Y1L
   372  #undef Y1H
   373  #undef Z1L
   374  #undef Z1H
   375  #undef X2L
   376  #undef X2H
   377  #undef Y2L
   378  #undef Y2H
   379  #undef Z2L
   380  #undef Z2H
   381  #undef ONE
   382  #undef IDX
   383  #undef SEL1
   384  #undef SEL2
   385  
   386  // ---------------------------------------
   387  // func p256FromMont(res, in []byte)
        // Reads 32 bytes from in (high 16 bytes first) and writes 32 bytes to
        // res. Neither slice length is checked.
        // Structure: four identical reduction rounds followed by one
        // conditional subtraction of P256. Each round appears to shift the
        // accumulator T2:T1:T0 right by 64 bits, so the four rounds together
        // presumably divide by 2^256 modulo P256, i.e. convert out of
        // Montgomery form as the name suggests -- TODO confirm.
   388  #define res_ptr R1
   389  #define x_ptr   R2
   390  #define CPOOL   R4
   391  
   392  #define T0   V0
   393  #define T1   V1
   394  #define T2   V2
   395  #define TT0  V3
   396  #define TT1  V4
   397  
   398  #define ZER   V6
   399  #define SEL1  V7
   400  #define SEL2  V8
   401  #define CAR1  V9
   402  #define CAR2  V10
   403  #define RED1  V11
   404  #define RED2  V12
   405  #define PL    V13
   406  #define PH    V14
   407  
   408  TEXT ·p256FromMont(SB), NOSPLIT, $0
   409  	MOVD res+0(FP), res_ptr
   410  	MOVD in+24(FP), x_ptr // in follows the 24-byte slice header of res
   411  
   412  	VZERO T2
   413  	VZERO ZER
   414  	MOVD  $p256<>+0x00(SB), CPOOL
   415  	VL    16(CPOOL), PL // low 128 bits of P256
   416  	VL    0(CPOOL), PH // high 128 bits of P256
   417  	VL    48(CPOOL), SEL2 // pool selector labelled "0 d1 d0 0"
   418  	VL    64(CPOOL), SEL1 // pool selector labelled "d1 d0 d1 d0"
   419  
   420  	VL (1*16)(x_ptr), T0 // low 128 bits of the input
   421  	VL (0*16)(x_ptr), T1 // high 128 bits of the input
   422  
   423  	// First round
   424  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   425  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   426  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   427  
   428  	VSLDB $8, T1, T0, T0 // T0 = low 8 bytes of T1 followed by high 8 bytes of T0
   429  	VSLDB $8, T2, T1, T1 // T1 = low 8 bytes of T2 followed by high 8 bytes of T1
   430  
   431  	VACCQ  T0, RED1, CAR1 // carry out of the low-half add
   432  	VAQ    T0, RED1, T0
   433  	VACCCQ T1, RED2, CAR1, CAR2 // carry out of the high-half add
   434  	VACQ   T1, RED2, CAR1, T1
   435  	VAQ    T2, CAR2, T2 // fold the top carry into T2
   436  
   437  	// Second round
   438  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   439  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   440  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   441  
   442  	VSLDB $8, T1, T0, T0
   443  	VSLDB $8, T2, T1, T1
   444  
   445  	VACCQ  T0, RED1, CAR1
   446  	VAQ    T0, RED1, T0
   447  	VACCCQ T1, RED2, CAR1, CAR2
   448  	VACQ   T1, RED2, CAR1, T1
   449  	VAQ    T2, CAR2, T2
   450  
   451  	// Third round
   452  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   453  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   454  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   455  
   456  	VSLDB $8, T1, T0, T0
   457  	VSLDB $8, T2, T1, T1
   458  
   459  	VACCQ  T0, RED1, CAR1
   460  	VAQ    T0, RED1, T0
   461  	VACCCQ T1, RED2, CAR1, CAR2
   462  	VACQ   T1, RED2, CAR1, T1
   463  	VAQ    T2, CAR2, T2
   464  
   465  	// Last round
   466  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   467  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   468  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   469  
   470  	VSLDB $8, T1, T0, T0
   471  	VSLDB $8, T2, T1, T1
   472  
   473  	VACCQ  T0, RED1, CAR1
   474  	VAQ    T0, RED1, T0
   475  	VACCCQ T1, RED2, CAR1, CAR2
   476  	VACQ   T1, RED2, CAR1, T1
   477  	VAQ    T2, CAR2, T2
   478  
   479  	// ---------------------------------------------------
   480  
        	// Trial subtraction TT1||TT0 = T1||T0 - P256; the borrow is propagated into T2.
   481  	VSCBIQ  PL, T0, CAR1
   482  	VSQ     PL, T0, TT0
   483  	VSBCBIQ T1, PH, CAR1, CAR2
   484  	VSBIQ   T1, PH, CAR1, TT1
   485  	VSBIQ   T2, ZER, CAR2, T2
   486  
   487  	// what output to use, TT1||TT0 or T1||T0?
   488  	VSEL T0, TT0, T2, T0 // T2 acts as the select mask: T where its bits are set, else TT
   489  	VSEL T1, TT1, T2, T1
   490  
   491  	VST T0, (1*16)(res_ptr) // low 128 bits go to the second half of res
   492  	VST T1, (0*16)(res_ptr)
   493  	RET
   494  
   495  #undef res_ptr
   496  #undef x_ptr
   497  #undef CPOOL
   498  #undef T0
   499  #undef T1
   500  #undef T2
   501  #undef TT0
   502  #undef TT1
   503  #undef ZER
   504  #undef SEL1
   505  #undef SEL2
   506  #undef CAR1
   507  #undef CAR2
   508  #undef RED1
   509  #undef RED2
   510  #undef PL
   511  #undef PH
   512  
   513  // ---------------------------------------
   514  // func p256OrdMul(res, in1, in2 []byte)
        // Reads 32 bytes from each of in1 and in2 (high 16 bytes first) and
        // writes 32 bytes to res. No slice length is checked.
        // Structure: eight rounds, one per 32-bit word of in2 starting with the
        // least-significant word. Each round multiplies in1 by that word, adds a
        // multiple MK0 of the p256ord modulus (MK0 is derived from the low
        // product word and K0), and shifts the accumulator right by 32 bits.
        // One conditional subtraction of the modulus follows.
        // This has the shape of a word-by-word Montgomery multiplication modulo
        // p256ord (result presumably in1*in2*2^-256 mod ord) -- TODO confirm.
   515  #define res_ptr R1
   516  #define x_ptr R2
   517  #define y_ptr R3
   518  #define X0    V0
   519  #define X1    V1
   520  #define Y0    V2
   521  #define Y1    V3
   522  #define M0    V4
   523  #define M1    V5
   524  #define T0    V6
   525  #define T1    V7
   526  #define T2    V8
   527  #define YDIG  V9
   528  
   529  #define ADD1  V16
   530  #define ADD1H V17
   531  #define ADD2  V18
   532  #define ADD2H V19
   533  #define RED1  V20
   534  #define RED1H V21
   535  #define RED2  V22
   536  #define RED2H V23
   537  #define CAR1  V24
   538  #define CAR1M V25
   539  
   540  #define MK0   V30
   541  #define K0    V31
   542  TEXT ·p256OrdMul(SB), NOSPLIT, $0
   543  	MOVD res+0(FP), res_ptr
   544  	MOVD in1+24(FP), x_ptr // each []byte argument occupies 24 bytes of the frame
   545  	MOVD in2+48(FP), y_ptr
   546  
   547  	VZERO T2
   548  	MOVD  $p256ordK0<>+0x00(SB), R4
   549  
   550  	// VLEF    $3, 0(R4), K0
   551  	WORD $0xE7F40000 // hand-encoded form of the instruction named in the comment above
   552  	BYTE $0x38
   553  	BYTE $0x03
   554  	MOVD $p256ord<>+0x00(SB), R4
   555  	VL   16(R4), M0 // low 128 bits of the modulus
   556  	VL   0(R4), M1 // high 128 bits of the modulus
   557  
   558  	VL (1*16)(x_ptr), X0 // low 128 bits of in1
   559  	VL (0*16)(x_ptr), X1 // high 128 bits of in1
   560  	VL (1*16)(y_ptr), Y0 // low 128 bits of in2
   561  	VL (0*16)(y_ptr), Y1 // high 128 bits of in2
   562  
   563  	// ---------------------------------------------------------------------------/
   564  	VREPF $3, Y0, YDIG // round 1 of 8: broadcast word 3 of Y0; no accumulator is added in this round (plain VMLF/VMLHF)
   565  	VMLF  X0, YDIG, ADD1
   566  	VMLF  ADD1, K0, MK0
   567  	VREPF $3, MK0, MK0 // broadcast word 3 of the K0 product as the modulus multiplier
   568  
   569  	VMLF  X1, YDIG, ADD2
   570  	VMLHF X0, YDIG, ADD1H
   571  	VMLHF X1, YDIG, ADD2H
   572  
   573  	VMALF  M0, MK0, ADD1, RED1
   574  	VMALHF M0, MK0, ADD1, RED1H
   575  	VMALF  M1, MK0, ADD2, RED2
   576  	VMALHF M1, MK0, ADD2, RED2H
   577  
   578  	VSLDB $12, RED2, RED1, RED1 // shift RED2||RED1 down by one 32-bit word
   579  	VSLDB $12, T2, RED2, RED2 // the vacated top word is filled from T2
   580  
   581  	VACCQ RED1, ADD1H, CAR1
   582  	VAQ   RED1, ADD1H, T0
   583  	VACCQ RED1H, T0, CAR1M
   584  	VAQ   RED1H, T0, T0
   585  
   586  	// << ready for next MK0
   587  
   588  	VACQ   RED2, ADD2H, CAR1, T1
   589  	VACCCQ RED2, ADD2H, CAR1, CAR1
   590  	VACCCQ RED2H, T1, CAR1M, T2
   591  	VACQ   RED2H, T1, CAR1M, T1
   592  	VAQ    CAR1, T2, T2 // T2 collects both carries out of the high half
   593  
   594  	// ---------------------------------------------------
   595  /* *
   596   * ---+--------+--------+
   597   *  T2|   T1   |   T0   |
   598   * ---+--------+--------+
   599   *           *(add)*
   600   *    +--------+--------+
   601   *    |   X1   |   X0   |
   602   *    +--------+--------+
   603   *           *(mul)*
   604   *    +--------+--------+
   605   *    |  YDIG  |  YDIG  |
   606   *    +--------+--------+
   607   *           *(add)*
   608   *    +--------+--------+
   609   *    |   M1   |   M0   |
   610   *    +--------+--------+
   611   *           *(mul)*
   612   *    +--------+--------+
   613   *    |   MK0  |   MK0  |
   614   *    +--------+--------+
   615   *
   616   *   ---------------------
   617   *
   618   *    +--------+--------+
   619   *    |  ADD2  |  ADD1  |
   620   *    +--------+--------+
   621   *  +--------+--------+
   622   *  | ADD2H  | ADD1H  |
   623   *  +--------+--------+
   624   *    +--------+--------+
   625   *    |  RED2  |  RED1  |
   626   *    +--------+--------+
   627   *  +--------+--------+
   628   *  | RED2H  | RED1H  |
   629   *  +--------+--------+
   630   */
   631  	VREPF $2, Y0, YDIG // round 2 of 8: word 2 of Y0; from here on the accumulator T1||T0 is added via VMALF/VMALHF
   632  	VMALF X0, YDIG, T0, ADD1
   633  	VMLF  ADD1, K0, MK0
   634  	VREPF $3, MK0, MK0
   635  
   636  	VMALF  X1, YDIG, T1, ADD2
   637  	VMALHF X0, YDIG, T0, ADD1H
   638  	VMALHF X1, YDIG, T1, ADD2H
   639  
   640  	VMALF  M0, MK0, ADD1, RED1
   641  	VMALHF M0, MK0, ADD1, RED1H
   642  	VMALF  M1, MK0, ADD2, RED2
   643  	VMALHF M1, MK0, ADD2, RED2H
   644  
   645  	VSLDB $12, RED2, RED1, RED1
   646  	VSLDB $12, T2, RED2, RED2
   647  
   648  	VACCQ RED1, ADD1H, CAR1
   649  	VAQ   RED1, ADD1H, T0
   650  	VACCQ RED1H, T0, CAR1M
   651  	VAQ   RED1H, T0, T0
   652  
   653  	// << ready for next MK0
   654  
   655  	VACQ   RED2, ADD2H, CAR1, T1
   656  	VACCCQ RED2, ADD2H, CAR1, CAR1
   657  	VACCCQ RED2H, T1, CAR1M, T2
   658  	VACQ   RED2H, T1, CAR1M, T1
   659  	VAQ    CAR1, T2, T2
   660  
   661  	// ---------------------------------------------------
   662  	VREPF $1, Y0, YDIG // round 3 of 8: word 1 of Y0
   663  	VMALF X0, YDIG, T0, ADD1
   664  	VMLF  ADD1, K0, MK0
   665  	VREPF $3, MK0, MK0
   666  
   667  	VMALF  X1, YDIG, T1, ADD2
   668  	VMALHF X0, YDIG, T0, ADD1H
   669  	VMALHF X1, YDIG, T1, ADD2H
   670  
   671  	VMALF  M0, MK0, ADD1, RED1
   672  	VMALHF M0, MK0, ADD1, RED1H
   673  	VMALF  M1, MK0, ADD2, RED2
   674  	VMALHF M1, MK0, ADD2, RED2H
   675  
   676  	VSLDB $12, RED2, RED1, RED1
   677  	VSLDB $12, T2, RED2, RED2
   678  
   679  	VACCQ RED1, ADD1H, CAR1
   680  	VAQ   RED1, ADD1H, T0
   681  	VACCQ RED1H, T0, CAR1M
   682  	VAQ   RED1H, T0, T0
   683  
   684  	// << ready for next MK0
   685  
   686  	VACQ   RED2, ADD2H, CAR1, T1
   687  	VACCCQ RED2, ADD2H, CAR1, CAR1
   688  	VACCCQ RED2H, T1, CAR1M, T2
   689  	VACQ   RED2H, T1, CAR1M, T1
   690  	VAQ    CAR1, T2, T2
   691  
   692  	// ---------------------------------------------------
   693  	VREPF $0, Y0, YDIG // round 4 of 8: word 0 of Y0
   694  	VMALF X0, YDIG, T0, ADD1
   695  	VMLF  ADD1, K0, MK0
   696  	VREPF $3, MK0, MK0
   697  
   698  	VMALF  X1, YDIG, T1, ADD2
   699  	VMALHF X0, YDIG, T0, ADD1H
   700  	VMALHF X1, YDIG, T1, ADD2H
   701  
   702  	VMALF  M0, MK0, ADD1, RED1
   703  	VMALHF M0, MK0, ADD1, RED1H
   704  	VMALF  M1, MK0, ADD2, RED2
   705  	VMALHF M1, MK0, ADD2, RED2H
   706  
   707  	VSLDB $12, RED2, RED1, RED1
   708  	VSLDB $12, T2, RED2, RED2
   709  
   710  	VACCQ RED1, ADD1H, CAR1
   711  	VAQ   RED1, ADD1H, T0
   712  	VACCQ RED1H, T0, CAR1M
   713  	VAQ   RED1H, T0, T0
   714  
   715  	// << ready for next MK0
   716  
   717  	VACQ   RED2, ADD2H, CAR1, T1
   718  	VACCCQ RED2, ADD2H, CAR1, CAR1
   719  	VACCCQ RED2H, T1, CAR1M, T2
   720  	VACQ   RED2H, T1, CAR1M, T1
   721  	VAQ    CAR1, T2, T2
   722  
   723  	// ---------------------------------------------------
   724  	VREPF $3, Y1, YDIG // round 5 of 8: word 3 of Y1 (the high half of in2 starts here)
   725  	VMALF X0, YDIG, T0, ADD1
   726  	VMLF  ADD1, K0, MK0
   727  	VREPF $3, MK0, MK0
   728  
   729  	VMALF  X1, YDIG, T1, ADD2
   730  	VMALHF X0, YDIG, T0, ADD1H
   731  	VMALHF X1, YDIG, T1, ADD2H
   732  
   733  	VMALF  M0, MK0, ADD1, RED1
   734  	VMALHF M0, MK0, ADD1, RED1H
   735  	VMALF  M1, MK0, ADD2, RED2
   736  	VMALHF M1, MK0, ADD2, RED2H
   737  
   738  	VSLDB $12, RED2, RED1, RED1
   739  	VSLDB $12, T2, RED2, RED2
   740  
   741  	VACCQ RED1, ADD1H, CAR1
   742  	VAQ   RED1, ADD1H, T0
   743  	VACCQ RED1H, T0, CAR1M
   744  	VAQ   RED1H, T0, T0
   745  
   746  	// << ready for next MK0
   747  
   748  	VACQ   RED2, ADD2H, CAR1, T1
   749  	VACCCQ RED2, ADD2H, CAR1, CAR1
   750  	VACCCQ RED2H, T1, CAR1M, T2
   751  	VACQ   RED2H, T1, CAR1M, T1
   752  	VAQ    CAR1, T2, T2
   753  
   754  	// ---------------------------------------------------
   755  	VREPF $2, Y1, YDIG // round 6 of 8: word 2 of Y1
   756  	VMALF X0, YDIG, T0, ADD1
   757  	VMLF  ADD1, K0, MK0
   758  	VREPF $3, MK0, MK0
   759  
   760  	VMALF  X1, YDIG, T1, ADD2
   761  	VMALHF X0, YDIG, T0, ADD1H
   762  	VMALHF X1, YDIG, T1, ADD2H
   763  
   764  	VMALF  M0, MK0, ADD1, RED1
   765  	VMALHF M0, MK0, ADD1, RED1H
   766  	VMALF  M1, MK0, ADD2, RED2
   767  	VMALHF M1, MK0, ADD2, RED2H
   768  
   769  	VSLDB $12, RED2, RED1, RED1
   770  	VSLDB $12, T2, RED2, RED2
   771  
   772  	VACCQ RED1, ADD1H, CAR1
   773  	VAQ   RED1, ADD1H, T0
   774  	VACCQ RED1H, T0, CAR1M
   775  	VAQ   RED1H, T0, T0
   776  
   777  	// << ready for next MK0
   778  
   779  	VACQ   RED2, ADD2H, CAR1, T1
   780  	VACCCQ RED2, ADD2H, CAR1, CAR1
   781  	VACCCQ RED2H, T1, CAR1M, T2
   782  	VACQ   RED2H, T1, CAR1M, T1
   783  	VAQ    CAR1, T2, T2
   784  
   785  	// ---------------------------------------------------
   786  	VREPF $1, Y1, YDIG // round 7 of 8: word 1 of Y1
   787  	VMALF X0, YDIG, T0, ADD1
   788  	VMLF  ADD1, K0, MK0
   789  	VREPF $3, MK0, MK0
   790  
   791  	VMALF  X1, YDIG, T1, ADD2
   792  	VMALHF X0, YDIG, T0, ADD1H
   793  	VMALHF X1, YDIG, T1, ADD2H
   794  
   795  	VMALF  M0, MK0, ADD1, RED1
   796  	VMALHF M0, MK0, ADD1, RED1H
   797  	VMALF  M1, MK0, ADD2, RED2
   798  	VMALHF M1, MK0, ADD2, RED2H
   799  
   800  	VSLDB $12, RED2, RED1, RED1
   801  	VSLDB $12, T2, RED2, RED2
   802  
   803  	VACCQ RED1, ADD1H, CAR1
   804  	VAQ   RED1, ADD1H, T0
   805  	VACCQ RED1H, T0, CAR1M
   806  	VAQ   RED1H, T0, T0
   807  
   808  	// << ready for next MK0
   809  
   810  	VACQ   RED2, ADD2H, CAR1, T1
   811  	VACCCQ RED2, ADD2H, CAR1, CAR1
   812  	VACCCQ RED2H, T1, CAR1M, T2
   813  	VACQ   RED2H, T1, CAR1M, T1
   814  	VAQ    CAR1, T2, T2
   815  
   816  	// ---------------------------------------------------
   817  	VREPF $0, Y1, YDIG // round 8 of 8: word 0 of Y1
   818  	VMALF X0, YDIG, T0, ADD1
   819  	VMLF  ADD1, K0, MK0
   820  	VREPF $3, MK0, MK0
   821  
   822  	VMALF  X1, YDIG, T1, ADD2
   823  	VMALHF X0, YDIG, T0, ADD1H
   824  	VMALHF X1, YDIG, T1, ADD2H
   825  
   826  	VMALF  M0, MK0, ADD1, RED1
   827  	VMALHF M0, MK0, ADD1, RED1H
   828  	VMALF  M1, MK0, ADD2, RED2
   829  	VMALHF M1, MK0, ADD2, RED2H
   830  
   831  	VSLDB $12, RED2, RED1, RED1
   832  	VSLDB $12, T2, RED2, RED2
   833  
   834  	VACCQ RED1, ADD1H, CAR1
   835  	VAQ   RED1, ADD1H, T0
   836  	VACCQ RED1H, T0, CAR1M
   837  	VAQ   RED1H, T0, T0
   838  
   839  	// << ready for next MK0
   840  
   841  	VACQ   RED2, ADD2H, CAR1, T1
   842  	VACCCQ RED2, ADD2H, CAR1, CAR1
   843  	VACCCQ RED2H, T1, CAR1M, T2
   844  	VACQ   RED2H, T1, CAR1M, T1
   845  	VAQ    CAR1, T2, T2
   846  
   847  	// ---------------------------------------------------
   848  
        	// Trial subtraction ADD2||ADD1 = T1||T0 - M1||M0; the borrow is propagated into T2.
   849  	VZERO   RED1 // RED1 is reused here as a zero operand
   850  	VSCBIQ  M0, T0, CAR1
   851  	VSQ     M0, T0, ADD1
   852  	VSBCBIQ T1, M1, CAR1, CAR1M
   853  	VSBIQ   T1, M1, CAR1, ADD2
   854  	VSBIQ   T2, RED1, CAR1M, T2
   855  
   856  	// what output to use, ADD2||ADD1 or T1||T0?
   857  	VSEL T0, ADD1, T2, T0 // T2 acts as the select mask: T where its bits are set, else the difference
   858  	VSEL T1, ADD2, T2, T1
   859  
   860  	VST T0, (1*16)(res_ptr) // low 128 bits go to the second half of res
   861  	VST T1, (0*16)(res_ptr)
   862  	RET
   863  
   864  #undef res_ptr
   865  #undef x_ptr
   866  #undef y_ptr
   867  #undef X0
   868  #undef X1
   869  #undef Y0
   870  #undef Y1
   871  #undef M0
   872  #undef M1
   873  #undef T0
   874  #undef T1
   875  #undef T2
   876  #undef YDIG
   877  
   878  #undef ADD1
   879  #undef ADD1H
   880  #undef ADD2
   881  #undef ADD2H
   882  #undef RED1
   883  #undef RED1H
   884  #undef RED2
   885  #undef RED2H
   886  #undef CAR1
   887  #undef CAR1M
   888  
   889  #undef MK0
   890  #undef K0
   891  
   892  // ---------------------------------------
   893  // p256MulInternal
   894  // V0-V3,V30,V31 - Not Modified
   895  // V4-V15 - Volatile
   896  
   897  #define CPOOL   R4
   898  
   899  // Parameters
   900  #define X0    V0 // Not modified
   901  #define X1    V1 // Not modified
   902  #define Y0    V2 // Not modified
   903  #define Y1    V3 // Not modified
   904  #define T0    V4
   905  #define T1    V5
   906  #define P0    V30 // Not modified
   907  #define P1    V31 // Not modified
   908  
   909  // Temporaries
   910  #define YDIG  V6 // Overloaded with CAR2, ZER
   911  #define ADD1H V7 // Overloaded with ADD3H
   912  #define ADD2H V8 // Overloaded with ADD4H
   913  #define ADD3  V9 // Overloaded with SEL2,SEL5
   914  #define ADD4  V10 // Overloaded with SEL3,SEL6
   915  #define RED1  V11 // Overloaded with CAR2
   916  #define RED2  V12
   917  #define RED3  V13 // Overloaded with SEL1
   918  #define T2    V14
   919  // Overloaded temporaries
   920  #define ADD1  V4 // Overloaded with T0
   921  #define ADD2  V5 // Overloaded with T1
   922  #define ADD3H V7 // Overloaded with ADD1H
   923  #define ADD4H V8 // Overloaded with ADD2H
   924  #define ZER   V6 // Overloaded with YDIG, CAR2
   925  #define CAR1  V6 // Overloaded with YDIG, ZER
   926  #define CAR2  V11 // Overloaded with RED1
   927  // Constant Selects
   928  #define SEL1  V13 // Overloaded with RED3
   929  #define SEL2  V9 // Overloaded with ADD3,SEL5
   930  #define SEL3  V10 // Overloaded with ADD4,SEL6
   931  #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   932  #define SEL5  V9 // Overloaded with ADD3,SEL2
   933  #define SEL6  V10 // Overloaded with ADD4,SEL3
   934  
   935  /* *
   936   * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   937   * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   938   * With you, SIMD be...
   939   *
   940   *                                           +--------+--------+
   941   *                                  +--------|  RED2  |  RED1  |
   942   *                                  |        +--------+--------+
   943   *                                  |       ---+--------+--------+
   944   *                                  |  +---- T2|   T1   |   T0   |--+
   945   *                                  |  |    ---+--------+--------+  |
   946   *                                  |  |                            |
   947   *                                  |  |    ======================= |
   948   *                                  |  |                            |
   949   *                                  |  |       +--------+--------+<-+
   950   *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   951   *                                  |  |       +--------+--------+  |     |
   952   *                                  |  |     +--------+--------+<---+     |
   953   *                                  |  |     | ADD2H  | ADD1H  |--+       |
   954   *                                  |  |     +--------+--------+  |       |
   955   *                                  |  |     +--------+--------+<-+       |
   956   *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   957   *                                  |  |     +--------+--------+  | |     |
   958   *                                  |  |   +--------+--------+<---+ |     |
   959   *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   960   *                                  |  |   +--------+--------+      | |   V
   961   *                                  |  | ------------------------   | | +--------+
   962   *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   963   *                                  |  |                            | | +--------+
   964   *                                  |  +---->+--------+--------+    | |   |
   965   *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   966   *                                  |        +--------+--------+    | |   |
   967   *                                  +---->---+--------+--------+    | |   |
   968   *                                         T2|   T1   |   T0   |----+ |   |
   969   *                                        ---+--------+--------+    | |   |
   970   *                                        ---+--------+--------+<---+ |   |
   971   *                                    +--- T2|   T1   |   T0   |----------+
   972   *                                    |   ---+--------+--------+      |   |
   973   *                                    |  +--------+--------+<-------------+
   974   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   975   *                                    |  +--------+--------+     |    |   |
   976   *                                    |  +--------+<----------------------+
   977   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   978   *                                    |  +--------+              |    |
   979   *                                    +--->+--------+--------+   |    |
   980   *                                         |   T1   |   T0   |--------+
   981   *                                         +--------+--------+   |    |
   982   *                                   --------------------------- |    |
   983   *                                                               |    |
   984   *                                       +--------+--------+<----+    |
   985   *                                       |  RED2  |  RED1  |          |
   986   *                                       +--------+--------+          |
   987   *                                      ---+--------+--------+<-------+
   988   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   989   *                                      ---+--------+--------+
   990   *
   991   *                                                                *Mi obra de arte de siglo XXI @vpaprots
   992   *
   993   *
   994   * First group is special, doesn't get the two inputs:
   995   *                                             +--------+--------+<-+
   996   *                                     +-------|  ADD2  |  ADD1  |--|-----+
   997   *                                     |       +--------+--------+  |     |
   998   *                                     |     +--------+--------+<---+     |
   999   *                                     |     | ADD2H  | ADD1H  |--+       |
  1000   *                                     |     +--------+--------+  |       |
  1001   *                                     |     +--------+--------+<-+       |
  1002   *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1003   *                                     |     +--------+--------+  | |     |
  1004   *                                     |   +--------+--------+<---+ |     |
  1005   *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1006   *                                     |   +--------+--------+      | |   V
  1007   *                                     | ------------------------   | | +--------+
  1008   *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1009   *                                     |                            | | +--------+
  1010   *                                     +---->+--------+--------+    | |   |
  1011   *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1012   *                                           +--------+--------+    | |   |
  1013   *                                        ---+--------+--------+<---+ |   |
  1014   *                                    +--- T2|   T1   |   T0   |----------+
  1015   *                                    |   ---+--------+--------+      |   |
  1016   *                                    |  +--------+--------+<-------------+
  1017   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1018   *                                    |  +--------+--------+     |    |   |
  1019   *                                    |  +--------+<----------------------+
  1020   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1021   *                                    |  +--------+              |    |
  1022   *                                    +--->+--------+--------+   |    |
  1023   *                                         |   T1   |   T0   |--------+
  1024   *                                         +--------+--------+   |    |
  1025   *                                   --------------------------- |    |
  1026   *                                                               |    |
  1027   *                                       +--------+--------+<----+    |
  1028   *                                       |  RED2  |  RED1  |          |
  1029   *                                       +--------+--------+          |
  1030   *                                      ---+--------+--------+<-------+
  1031   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1032   *                                      ---+--------+--------+
  1033   *
   1034   * Last 'group' needs RED2||RED1 shifted less
  1035   */
   1036  TEXT p256MulInternal<>(SB), NOSPLIT, $0-0
         	// Internal multiply helper, reached via CALL from the exported routines
         	// in this file. All register names are #defines made before this
         	// function.
         	//   in:  X1||X0 and Y1||Y0 (operands), P1||P0 (modulus),
         	//        CPOOL (base of the constant table holding the VPERM selectors)
         	//   out: T1||T0
         	// The body is four look-alike sections. Each consumes two 32-bit words
         	// of Y (VREPF picks the word), accumulates the partial products and
         	// folds in the reduction terms RED1..RED3 built with VPERM. A final
         	// trial subtraction of P picks the value returned in T1||T0.
         	//
         	// NOTE(review): SEL1..SEL4 are loaded again right before every use
         	// below, presumably because they share registers with temporaries (the
         	// #defines are outside this view); if so, these four loads are dead.
         	// Confirm before removing them.
   1037  	VL 32(CPOOL), SEL1
   1038  	VL 48(CPOOL), SEL2
   1039  	VL 64(CPOOL), SEL3
   1040  	VL 80(CPOOL), SEL4
   1041  
   1042  	// ---------------------------------------------------
         	// Section 1: words 3 and 2 of Y0. This is the "first group" of the
         	// diagram above: the first products use VMLF/VMLHF (no addend) and no
         	// RED1/RED2 terms are added yet.
   1043  
   1044  	VREPF $3, Y0, YDIG
   1045  	VMLHF X0, YDIG, ADD1H
   1046  	VMLHF X1, YDIG, ADD2H
   1047  	VMLF  X0, YDIG, ADD1
   1048  	VMLF  X1, YDIG, ADD2
   1049  
   1050  	VREPF  $2, Y0, YDIG
   1051  	VMALF  X0, YDIG, ADD1H, ADD3
   1052  	VMALF  X1, YDIG, ADD2H, ADD4
   1053  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1054  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1055  
   1056  	VZERO ZER
   1057  	VL    32(CPOOL), SEL1
   1058  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1059  
   1060  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1061  	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
   1062  
         	// T2||T1||T0 = (T1||T0) + (ADD4||ADD3); T2 receives the carry out.
   1063  	VACCQ  T0, ADD3, CAR1
   1064  	VAQ    T0, ADD3, T0       // ADD3 Free
   1065  	VACCCQ T1, ADD4, CAR1, T2
   1066  	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
   1067  
   1068  	VL    48(CPOOL), SEL2
   1069  	VL    64(CPOOL), SEL3
   1070  	VL    80(CPOOL), SEL4
   1071  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1072  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1073  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1074  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1075  
   1076  	VSLDB $12, T1, T0, T0
   1077  	VSLDB $12, T2, T1, T1
   1078  
   1079  	VACCQ  T0, ADD3H, CAR1
   1080  	VAQ    T0, ADD3H, T0
   1081  	VACCCQ T1, ADD4H, CAR1, T2
   1082  	VACQ   T1, ADD4H, CAR1, T1
   1083  
   1084  	// ---------------------------------------------------
         	// Section 2: words 1 and 0 of Y0. From here on the running value
         	// T1||T0 is the addend of the first products, and RED2||RED1 from the
         	// previous section is added in.
   1085  
   1086  	VREPF  $1, Y0, YDIG
   1087  	VMALHF X0, YDIG, T0, ADD1H
   1088  	VMALHF X1, YDIG, T1, ADD2H
   1089  	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
   1090  	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
   1091  
   1092  	VREPF  $0, Y0, YDIG
   1093  	VMALF  X0, YDIG, ADD1H, ADD3
   1094  	VMALF  X1, YDIG, ADD2H, ADD4
   1095  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
   1096  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
   1097  
   1098  	VZERO ZER
   1099  	VL    32(CPOOL), SEL1
   1100  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1101  
   1102  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
   1103  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
   1104  
   1105  	VACCQ  T0, RED1, CAR1
   1106  	VAQ    T0, RED1, T0
   1107  	VACCCQ T1, RED2, CAR1, T2
   1108  	VACQ   T1, RED2, CAR1, T1
   1109  
         	// Second addition of this step; its carry out (CAR2) is accumulated
         	// into T2.
   1110  	VACCQ  T0, ADD3, CAR1
   1111  	VAQ    T0, ADD3, T0
   1112  	VACCCQ T1, ADD4, CAR1, CAR2
   1113  	VACQ   T1, ADD4, CAR1, T1
   1114  	VAQ    T2, CAR2, T2
   1115  
   1116  	VL    48(CPOOL), SEL2
   1117  	VL    64(CPOOL), SEL3
   1118  	VL    80(CPOOL), SEL4
   1119  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1120  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1121  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1122  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1123  
   1124  	VSLDB $12, T1, T0, T0
   1125  	VSLDB $12, T2, T1, T1
   1126  
   1127  	VACCQ  T0, ADD3H, CAR1
   1128  	VAQ    T0, ADD3H, T0
   1129  	VACCCQ T1, ADD4H, CAR1, T2
   1130  	VACQ   T1, ADD4H, CAR1, T1
   1131  
   1132  	// ---------------------------------------------------
         	// Section 3: words 3 and 2 of Y1; same instruction sequence as
         	// section 2.
   1133  
   1134  	VREPF  $3, Y1, YDIG
   1135  	VMALHF X0, YDIG, T0, ADD1H
   1136  	VMALHF X1, YDIG, T1, ADD2H
   1137  	VMALF  X0, YDIG, T0, ADD1
   1138  	VMALF  X1, YDIG, T1, ADD2
   1139  
   1140  	VREPF  $2, Y1, YDIG
   1141  	VMALF  X0, YDIG, ADD1H, ADD3
   1142  	VMALF  X1, YDIG, ADD2H, ADD4
   1143  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
   1144  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
   1145  
   1146  	VZERO ZER
   1147  	VL    32(CPOOL), SEL1
   1148  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1149  
   1150  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
   1151  	VSLDB $12, T2, ADD2, T1   // ADD2 Free
   1152  
   1153  	VACCQ  T0, RED1, CAR1
   1154  	VAQ    T0, RED1, T0
   1155  	VACCCQ T1, RED2, CAR1, T2
   1156  	VACQ   T1, RED2, CAR1, T1
   1157  
   1158  	VACCQ  T0, ADD3, CAR1
   1159  	VAQ    T0, ADD3, T0
   1160  	VACCCQ T1, ADD4, CAR1, CAR2
   1161  	VACQ   T1, ADD4, CAR1, T1
   1162  	VAQ    T2, CAR2, T2
   1163  
   1164  	VL    48(CPOOL), SEL2
   1165  	VL    64(CPOOL), SEL3
   1166  	VL    80(CPOOL), SEL4
   1167  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
   1168  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
   1169  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
   1170  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
   1171  
   1172  	VSLDB $12, T1, T0, T0
   1173  	VSLDB $12, T2, T1, T1
   1174  
   1175  	VACCQ  T0, ADD3H, CAR1
   1176  	VAQ    T0, ADD3H, T0
   1177  	VACCCQ T1, ADD4H, CAR1, T2
   1178  	VACQ   T1, ADD4H, CAR1, T1
   1179  
   1180  	// ---------------------------------------------------
         	// Section 4: words 1 and 0 of Y1. The last reduction differs from the
         	// earlier sections: it uses the SEL5/SEL6 selectors (offsets 96 and
         	// 112 of the constant table) and its RED2||RED1 is added within this
         	// section, after the ADD4H||ADD3H addition.
   1181  
   1182  	VREPF  $1, Y1, YDIG
   1183  	VMALHF X0, YDIG, T0, ADD1H
   1184  	VMALHF X1, YDIG, T1, ADD2H
   1185  	VMALF  X0, YDIG, T0, ADD1
   1186  	VMALF  X1, YDIG, T1, ADD2
   1187  
   1188  	VREPF  $0, Y1, YDIG
   1189  	VMALF  X0, YDIG, ADD1H, ADD3
   1190  	VMALF  X1, YDIG, ADD2H, ADD4
   1191  	VMALHF X0, YDIG, ADD1H, ADD3H
   1192  	VMALHF X1, YDIG, ADD2H, ADD4H
   1193  
   1194  	VZERO ZER
   1195  	VL    32(CPOOL), SEL1
   1196  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
   1197  
   1198  	VSLDB $12, ADD2, ADD1, T0
   1199  	VSLDB $12, T2, ADD2, T1
   1200  
   1201  	VACCQ  T0, RED1, CAR1
   1202  	VAQ    T0, RED1, T0
   1203  	VACCCQ T1, RED2, CAR1, T2
   1204  	VACQ   T1, RED2, CAR1, T1
   1205  
   1206  	VACCQ  T0, ADD3, CAR1
   1207  	VAQ    T0, ADD3, T0
   1208  	VACCCQ T1, ADD4, CAR1, CAR2
   1209  	VACQ   T1, ADD4, CAR1, T1
   1210  	VAQ    T2, CAR2, T2
   1211  
   1212  	VL    96(CPOOL), SEL5
   1213  	VL    112(CPOOL), SEL6
   1214  	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
   1215  	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
   1216  	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
   1217  
   1218  	VSLDB $12, T1, T0, T0
   1219  	VSLDB $12, T2, T1, T1
   1220  
   1221  	VACCQ  T0, ADD3H, CAR1
   1222  	VAQ    T0, ADD3H, T0
   1223  	VACCCQ T1, ADD4H, CAR1, T2
   1224  	VACQ   T1, ADD4H, CAR1, T1
   1225  
   1226  	VACCQ  T0, RED1, CAR1
   1227  	VAQ    T0, RED1, T0
   1228  	VACCCQ T1, RED2, CAR1, CAR2
   1229  	VACQ   T1, RED2, CAR1, T1
   1230  	VAQ    T2, CAR2, T2
   1231  
   1232  	// ---------------------------------------------------
         	// Final step: trial subtraction ADD2H||ADD1H = (T1||T0) - (P1||P0).
         	// The borrow is carried into T2 (RED3 is zeroed and used as the zero
         	// operand), and T2 is then used as the VSEL mask below.
   1233  
   1234  	VZERO   RED3
   1235  	VSCBIQ  P0, T0, CAR1
   1236  	VSQ     P0, T0, ADD1H
   1237  	VSBCBIQ T1, P1, CAR1, CAR2
   1238  	VSBIQ   T1, P1, CAR1, ADD2H
   1239  	VSBIQ   T2, RED3, CAR2, T2
   1240  
   1241  	// what output to use, ADD2H||ADD1H or T1||T0?
   1242  	VSEL T0, ADD1H, T2, T0
   1243  	VSEL T1, ADD2H, T2, T1
   1244  	RET
  1245  
  1246  #undef CPOOL
  1247  
  1248  #undef X0
  1249  #undef X1
  1250  #undef Y0
  1251  #undef Y1
  1252  #undef T0
  1253  #undef T1
  1254  #undef P0
  1255  #undef P1
  1256  
  1257  #undef SEL1
  1258  #undef SEL2
  1259  #undef SEL3
  1260  #undef SEL4
  1261  #undef SEL5
  1262  #undef SEL6
  1263  
  1264  #undef YDIG
  1265  #undef ADD1H
  1266  #undef ADD2H
  1267  #undef ADD3
  1268  #undef ADD4
  1269  #undef RED1
  1270  #undef RED2
  1271  #undef RED3
  1272  #undef T2
  1273  #undef ADD1
  1274  #undef ADD2
  1275  #undef ADD3H
  1276  #undef ADD4H
  1277  #undef ZER
  1278  #undef CAR1
  1279  #undef CAR2
  1280  
  1281  #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1282  	VZERO   ZER                \
  1283  	VSCBIQ  Y0, X0, CAR1       \
  1284  	VSQ     Y0, X0, T0         \
  1285  	VSBCBIQ X1, Y1, CAR1, SEL1 \
  1286  	VSBIQ   X1, Y1, CAR1, T1   \
  1287  	VSQ     SEL1, ZER, SEL1    \
  1288  	                           \
  1289  	VACCQ   T0, PL, CAR1       \
  1290  	VAQ     T0, PL, TT0        \
  1291  	VACQ    T1, PH, CAR1, TT1  \
  1292  	                           \
  1293  	VSEL    T0, TT0, SEL1, T0  \
  1294  	VSEL    T1, TT1, SEL1, T1  \
  1295  
   1296  #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ // T1||T0 = X1||X0 + Y1||Y0, with PH||PL subtracted according to the SEL1 mask below
   1297  	VACCQ   X0, Y0, CAR1        \ // carry of X0 + Y0
   1298  	VAQ     X0, Y0, T0          \ // T0 = X0 + Y0
   1299  	VACCCQ  X1, Y1, CAR1, T2    \ // T2 = carry out of the high halves
   1300  	VACQ    X1, Y1, CAR1, T1    \ // T1 = X1 + Y1 + carry
   1301  	                            \ // clobbers CAR1, CAR2, T2, ZER, TT0, TT1 and SEL1
   1302  	VZERO   ZER                 \
   1303  	VSCBIQ  PL, T0, CAR1        \ // borrow indication of T0 - PL
   1304  	VSQ     PL, T0, TT0         \ // TT0 = T0 - PL
   1305  	VSBCBIQ T1, PH, CAR1, CAR2  \ // borrow indication of T1 - PH
   1306  	VSBIQ   T1, PH, CAR1, TT1   \ // TT1 = T1 - PH - borrow
   1307  	VSBIQ   T2, ZER, CAR2, SEL1 \ // SEL1 = T2 - 0 - borrow: the select mask
   1308  	                            \
   1309  	VSEL    T0, TT0, SEL1, T0   \ // keep the sum where SEL1 is set, else take sum - P
   1310  	VSEL    T1, TT1, SEL1, T1
  1311  
   1312  #define p256HalfInternal(T1, T0, X1, X0) \ // T1||T0 = (X1||X0, or X + PH||PL as chosen by SEL1) shifted right by one bit
   1313  	VZERO  ZER                \ // clobbers ZER, SEL1, CAR1, T2, TT0 and TT1
   1314  	VSBIQ  ZER, ZER, X0, SEL1 \ // SEL1 = 0 - 0 with X0 as borrow input: mask derived from X0 (even/odd test, presumably on its low bit)
   1315  	                          \
   1316  	VACCQ  X0, PL, CAR1       \ // carry of X0 + PL
   1317  	VAQ    X0, PL, T0         \ // T0 = X0 + PL
   1318  	VACCCQ X1, PH, CAR1, T2   \ // T2 = carry out of the high halves
   1319  	VACQ   X1, PH, CAR1, T1   \ // T1 = X1 + PH + carry
   1320  	                          \
   1321  	VSEL   X0, T0, SEL1, T0   \ // keep X where SEL1 is set, else take X + P
   1322  	VSEL   X1, T1, SEL1, T1   \
   1323  	VSEL   ZER, T2, SEL1, T2  \ // the carry bit only survives when X + P was taken
   1324  	                          \
   1325  	VSLDB  $15, T2, ZER, TT1  \ // lowest byte of T2 moved to the top byte of TT1
   1326  	VSLDB  $15, T1, ZER, TT0  \ // lowest byte of T1 moved to the top byte of TT0
   1327  	VREPIB $1, SEL1           \ // shift count 1
   1328  	VSRL   SEL1, T0, T0       \ // T0 >>= 1
   1329  	VSRL   SEL1, T1, T1       \ // T1 >>= 1
   1330  	VREPIB $7, SEL1           \ // shift count 7
   1331  	VSL    SEL1, TT0, TT0     \ // bit shifted out of T1 now sits in the top bit of TT0
   1332  	VSL    SEL1, TT1, TT1     \ // bit shifted out of T2 now sits in the top bit of TT1
   1333  	VO     T0, TT0, T0        \ // OR the bits that crossed the 128-bit boundaries back in
   1334  	VO     T1, TT1, T1
  1335  
  1336  // ---------------------------------------
  1337  // func p256MulAsm(res, in1, in2 []byte)
  1338  #define res_ptr R1
  1339  #define x_ptr   R2
  1340  #define y_ptr   R3
  1341  #define CPOOL   R4
  1342  
  1343  // Parameters
  1344  #define X0    V0
  1345  #define X1    V1
  1346  #define Y0    V2
  1347  #define Y1    V3
  1348  #define T0    V4
  1349  #define T1    V5
  1350  
  1351  // Constants
  1352  #define P0    V30
  1353  #define P1    V31
   1354  TEXT ·p256MulAsm(SB), NOSPLIT, $0
         	// Go entry point: func p256MulAsm(res, in1, in2 []byte).
         	// Only the data pointer of each slice is read; the offsets 0, 24 and
         	// 48 step over one three-word slice header per argument.
   1355  	MOVD res+0(FP), res_ptr
   1356  	MOVD in1+24(FP), x_ptr
   1357  	MOVD in2+48(FP), y_ptr
   1358  
         	// Each operand is read as two 16-byte halves: bytes 16..31 go to the
         	// "0" register and bytes 0..15 to the "1" register.
   1359  	VL (1*16)(x_ptr), X0
   1360  	VL (0*16)(x_ptr), X1
   1361  	VL (1*16)(y_ptr), Y0
   1362  	VL (0*16)(y_ptr), Y1
   1363  
         	// CPOOL must point at p256mul<> for p256MulInternal; the first 32
         	// bytes of that table are the P256 constant, loaded here as P1||P0.
   1364  	MOVD $p256mul<>+0x00(SB), CPOOL
   1365  	VL   16(CPOOL), P0
   1366  	VL   0(CPOOL), P1
   1367  
   1368  	CALL p256MulInternal<>(SB)
   1369  
         	// p256MulInternal leaves the product in T1||T0; store it with the
         	// same half ordering used for the inputs.
   1370  	VST T0, (1*16)(res_ptr)
   1371  	VST T1, (0*16)(res_ptr)
   1372  	RET
  1373  
  1374  #undef res_ptr
  1375  #undef x_ptr
  1376  #undef y_ptr
  1377  #undef CPOOL
  1378  
  1379  #undef X0
  1380  #undef X1
  1381  #undef Y0
  1382  #undef Y1
  1383  #undef T0
  1384  #undef T1
  1385  #undef P0
  1386  #undef P1
  1387  
  1388  // Point add with P2 being affine point
  1389  // If sign == 1 -> P2 = -P2
  1390  // If sel == 0 -> P3 = P1
   1391  // If zero == 0 -> P3 = P2
  1392  // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
  1393  #define P3ptr   R1
  1394  #define P1ptr   R2
  1395  #define P2ptr   R3
  1396  #define CPOOL   R4
  1397  
  1398  // Temporaries in REGs
  1399  #define Y2L    V15
  1400  #define Y2H    V16
  1401  #define T1L    V17
  1402  #define T1H    V18
  1403  #define T2L    V19
  1404  #define T2H    V20
  1405  #define T3L    V21
  1406  #define T3H    V22
  1407  #define T4L    V23
  1408  #define T4H    V24
  1409  
  1410  // Temps for Sub and Add
  1411  #define TT0  V11
  1412  #define TT1  V12
  1413  #define T2   V13
  1414  
  1415  // p256MulAsm Parameters
  1416  #define X0    V0
  1417  #define X1    V1
  1418  #define Y0    V2
  1419  #define Y1    V3
  1420  #define T0    V4
  1421  #define T1    V5
  1422  
  1423  #define PL    V30
  1424  #define PH    V31
  1425  
  1426  // Names for zero/sel selects
  1427  #define X1L    V0
  1428  #define X1H    V1
  1429  #define Y1L    V2 // p256MulAsmParmY
  1430  #define Y1H    V3 // p256MulAsmParmY
  1431  #define Z1L    V4
  1432  #define Z1H    V5
  1433  #define X2L    V0
  1434  #define X2H    V1
  1435  #define Z2L    V4
  1436  #define Z2H    V5
  1437  #define X3L    V17 // T1L
  1438  #define X3H    V18 // T1H
  1439  #define Y3L    V21 // T3L
  1440  #define Y3H    V22 // T3H
  1441  #define Z3L    V28
  1442  #define Z3H    V29
  1443  
  1444  #define ZER   V6
  1445  #define SEL1  V7
  1446  #define CAR1  V8
  1447  #define CAR2  V9
  1448  /* *
  1449   * Three operand formula:
  1450   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1451   * T1 = Z1²
  1452   * T2 = T1*Z1
  1453   * T1 = T1*X2
  1454   * T2 = T2*Y2
  1455   * T1 = T1-X1
  1456   * T2 = T2-Y1
  1457   * Z3 = Z1*T1
  1458   * T3 = T1²
  1459   * T4 = T3*T1
  1460   * T3 = T3*X1
  1461   * T1 = 2*T3
  1462   * X3 = T2²
  1463   * X3 = X3-T1
  1464   * X3 = X3-T4
  1465   * T3 = T3-X3
  1466   * T3 = T3*T2
  1467   * T4 = T4*Y1
  1468   * Y3 = T3-T4
  1469  
  1470   * Three operand formulas, but with MulInternal X,Y used to store temps
  1471  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1472  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1473  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1474  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1475  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1476  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1477  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1478  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1479  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1480  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1481  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1482  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1483  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1484  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1485  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1486  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1487  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1488  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1489  
  1490  	*/
   1491  TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
         	// Adds P1 and the affine point P2 (no Z2 is read from memory) and then
         	// overrides the result according to sel and zero.
         	// Arguments are read from the frame at P3+0, P1+8, P2+16, sign+24,
         	// sel+32 and zero+40.
         	// Points are accessed as 16-byte halves, high half first: X at 0/16,
         	// Y at 32/48 and (for P1 and P3) Z at 64/80.
         	// P3 is written only by the VSTs at the very end, after every load
         	// from P1 and P2.
   1492  	MOVD P3+0(FP), P3ptr
   1493  	MOVD P1+8(FP), P1ptr
   1494  	MOVD P2+16(FP), P2ptr
   1495  
   1496  	MOVD $p256mul<>+0x00(SB), CPOOL
   1497  	VL   16(CPOOL), PL
   1498  	VL   0(CPOOL), PH
   1499  
   1500  	//	if (sign == 1) {
   1501  	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
   1502  	//	}
   1503  
   1504  	VL 32(P2ptr), Y2H
   1505  	VL 48(P2ptr), Y2L
   1506  
         	// SEL1 = all ones when sign == 0 (compare-equal against zero).
   1507  	VLREPG sign+24(FP), SEL1
   1508  	VZERO  ZER
   1509  	VCEQG  SEL1, ZER, SEL1
   1510  
         	// T1 = P - Y2, computed unconditionally.
         	// NOTE(review): unlike the Go sketch above, no final Mod is applied,
         	// so Y2 == 0 would give P rather than 0 - confirm that this case
         	// cannot matter.
   1511  	VSCBIQ Y2L, PL, CAR1
   1512  	VSQ    Y2L, PL, T1L
   1513  	VSBIQ  PH, Y2H, CAR1, T1H
   1514  
         	// sign == 0 keeps Y2; any other value takes P - Y2.
   1515  	VSEL Y2L, T1L, SEL1, Y2L
   1516  	VSEL Y2H, T1H, SEL1, Y2H
   1517  
   1518  /* *
   1519   * Three operand formula:
   1520   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
   1521   */
   1522  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
   1523  	VL   64(P1ptr), X1       // Z1H
   1524  	VL   80(P1ptr), X0       // Z1L
   1525  	VLR  X0, Y0
   1526  	VLR  X1, Y1
   1527  	CALL p256MulInternal<>(SB)
   1528  
   1529  	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
   1530  	VLR  T0, X0
   1531  	VLR  T1, X1
   1532  	CALL p256MulInternal<>(SB)
   1533  	VLR  T0, T2L
   1534  	VLR  T1, T2H
   1535  
   1536  	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
   1537  	VL   0(P2ptr), Y1        // X2H
   1538  	VL   16(P2ptr), Y0       // X2L
   1539  	CALL p256MulInternal<>(SB)
   1540  	VLR  T0, T1L
   1541  	VLR  T1, T1H
   1542  
   1543  	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
   1544  	VLR  T2L, X0
   1545  	VLR  T2H, X1
   1546  	VLR  Y2L, Y0
   1547  	VLR  Y2H, Y1
   1548  	CALL p256MulInternal<>(SB)
   1549  
   1550  	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
   1551  	VL 32(P1ptr), Y1H
   1552  	VL 48(P1ptr), Y1L
   1553  	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
   1554  
   1555  	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
   1556  	VL 0(P1ptr), X1H
   1557  	VL 16(P1ptr), X1L
   1558  	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
   1559  
   1560  	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
   1561  	VL   64(P1ptr), X1       // Z1H
   1562  	VL   80(P1ptr), X0       // Z1L
   1563  	CALL p256MulInternal<>(SB)
   1564  
         	// Z3 is kept in registers rather than stored now, because the
         	// sel/zero selects at the end may still replace it.
   1565  	// VST T1, 64(P3ptr)
   1566  	// VST T0, 80(P3ptr)
   1567  	VLR T0, Z3L
   1568  	VLR T1, Z3H
   1569  
   1570  	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
   1571  	VLR  Y0, X0
   1572  	VLR  Y1, X1
   1573  	CALL p256MulInternal<>(SB)
   1574  	VLR  T0, X0
   1575  	VLR  T1, X1
   1576  
   1577  	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
   1578  	CALL p256MulInternal<>(SB)
   1579  	VLR  T0, T4L
   1580  	VLR  T1, T4H
   1581  
   1582  	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
   1583  	VL   0(P1ptr), Y1        // X1H
   1584  	VL   16(P1ptr), Y0       // X1L
   1585  	CALL p256MulInternal<>(SB)
   1586  	VLR  T0, T3L
   1587  	VLR  T1, T3H
   1588  
   1589  	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
   1590  	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
   1591  
   1592  	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
   1593  	VLR  T2L, X0
   1594  	VLR  T2H, X1
   1595  	VLR  T2L, Y0
   1596  	VLR  T2H, Y1
   1597  	CALL p256MulInternal<>(SB)
   1598  
   1599  	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
   1600  	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
   1601  
   1602  	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
   1603  	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
   1604  	VLR T0, X3L
   1605  	VLR T1, X3H
   1606  
   1607  	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
   1608  	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
   1609  
         	// Y still holds T2 (set for the X3 = T2*T2 multiply above), so only X
         	// had to be replaced for this product.
   1610  	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
   1611  	CALL p256MulInternal<>(SB)
   1612  	VLR  T0, T3L
   1613  	VLR  T1, T3H
   1614  
   1615  	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
   1616  	VLR  T4L, X0
   1617  	VLR  T4H, X1
   1618  	VL   32(P1ptr), Y1       // Y1H
   1619  	VL   48(P1ptr), Y0       // Y1L
   1620  	CALL p256MulInternal<>(SB)
   1621  
   1622  	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
   1623  	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
   1624  
   1625  	//	if (sel == 0) {
   1626  	//		copy(P3.x[:], X1)
   1627  	//		copy(P3.y[:], Y1)
   1628  	//		copy(P3.z[:], Z1)
   1629  	//	}
   1630  
   1631  	VL 0(P1ptr), X1H
   1632  	VL 16(P1ptr), X1L
   1633  
         	// Y1L/Y1H are the same registers as Y0/Y1, which were loaded with
         	// P1's Y for the last multiply.
   1634  	// Y1 already loaded, left over from addition
   1635  	VL 64(P1ptr), Z1H
   1636  	VL 80(P1ptr), Z1L
   1637  
         	// SEL1 = all ones when sel == 0; the selects then take P1's
         	// coordinates instead of the computed ones.
   1638  	VLREPG sel+32(FP), SEL1
   1639  	VZERO  ZER
   1640  	VCEQG  SEL1, ZER, SEL1
   1641  
   1642  	VSEL X1L, X3L, SEL1, X3L
   1643  	VSEL X1H, X3H, SEL1, X3H
   1644  	VSEL Y1L, Y3L, SEL1, Y3L
   1645  	VSEL Y1H, Y3H, SEL1, Y3H
   1646  	VSEL Z1L, Z3L, SEL1, Z3L
   1647  	VSEL Z1H, Z3H, SEL1, Z3H
   1648  
   1649  	//	if (zero == 0) {
   1650  	//		copy(P3.x[:], X2)
   1651  	//		copy(P3.y[:], Y2)
   1652  	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   1653  	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
   1654  	//	}
   1655  	VL 0(P2ptr), X2H
   1656  	VL 16(P2ptr), X2L
   1657  
         	// Y2L/Y2H still hold P2's Y (possibly replaced by P - Y2 above).
         	// Z2 comes from offsets 128/144 of the constant table, presumably the
         	// constant spelled out in the comment above (that part of the table
         	// is outside this view - confirm).
   1658  	// Y2 already loaded
   1659  	VL 128(CPOOL), Z2H
   1660  	VL 144(CPOOL), Z2L
   1661  
         	// SEL1 = all ones when zero == 0; the selects then take P2's
         	// coordinates. This runs after the sel override, so it takes
         	// precedence when both apply.
   1662  	VLREPG zero+40(FP), SEL1
   1663  	VZERO  ZER
   1664  	VCEQG  SEL1, ZER, SEL1
   1665  
   1666  	VSEL X2L, X3L, SEL1, X3L
   1667  	VSEL X2H, X3H, SEL1, X3H
   1668  	VSEL Y2L, Y3L, SEL1, Y3L
   1669  	VSEL Y2H, Y3H, SEL1, Y3H
   1670  	VSEL Z2L, Z3L, SEL1, Z3L
   1671  	VSEL Z2H, Z3H, SEL1, Z3H
   1672  
   1673  	// All done, store out the result!!!
   1674  	VST X3H, 0(P3ptr)
   1675  	VST X3L, 16(P3ptr)
   1676  	VST Y3H, 32(P3ptr)
   1677  	VST Y3L, 48(P3ptr)
   1678  	VST Z3H, 64(P3ptr)
   1679  	VST Z3L, 80(P3ptr)
   1680  
   1681  	RET
  1682  
  1683  #undef P3ptr
  1684  #undef P1ptr
  1685  #undef P2ptr
  1686  #undef CPOOL
  1687  
  1688  #undef Y2L
  1689  #undef Y2H
  1690  #undef T1L
  1691  #undef T1H
  1692  #undef T2L
  1693  #undef T2H
  1694  #undef T3L
  1695  #undef T3H
  1696  #undef T4L
  1697  #undef T4H
  1698  
  1699  #undef TT0
  1700  #undef TT1
  1701  #undef T2
  1702  
  1703  #undef X0
  1704  #undef X1
  1705  #undef Y0
  1706  #undef Y1
  1707  #undef T0
  1708  #undef T1
  1709  
  1710  #undef PL
  1711  #undef PH
  1712  
  1713  #undef X1L
  1714  #undef X1H
  1715  #undef Y1L
  1716  #undef Y1H
  1717  #undef Z1L
  1718  #undef Z1H
  1719  #undef X2L
  1720  #undef X2H
  1721  #undef Z2L
  1722  #undef Z2H
  1723  #undef X3L
  1724  #undef X3H
  1725  #undef Y3L
  1726  #undef Y3H
  1727  #undef Z3L
  1728  #undef Z3H
  1729  
  1730  #undef ZER
  1731  #undef SEL1
  1732  #undef CAR1
  1733  #undef CAR2
  1734  
  1735  // p256PointDoubleAsm(P3, P1 *p256Point)
  1736  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  1737  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  1738  // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
   1739  #define P3ptr   R1
   1740  #define P1ptr   R2
   1741  #define CPOOL   R4
   1742  
   1743  // Temporaries in REGs
   1744  #define X3L    V15
   1745  #define X3H    V16
   1746  #define Y3L    V17
   1747  #define Y3H    V18
   1748  #define T1L    V19
   1749  #define T1H    V20
   1750  #define T2L    V21
   1751  #define T2H    V22
   1752  #define T3L    V23
   1753  #define T3H    V24
   1754  
   1755  #define X1L    V6
   1756  #define X1H    V7
   1757  #define Y1L    V8
   1758  #define Y1H    V9
   1759  #define Z1L    V10 // not referenced by p256PointDoubleAsm: Z1 is loaded straight into X0/X1 and Y0/Y1
   1760  #define Z1H    V11 // not referenced either; V11 is also TT0 below
   1761  
   1762  // Temps for Sub and Add
   1763  #define TT0  V11
   1764  #define TT1  V12
   1765  #define T2   V13
   1766  
   1767  // p256MulAsm Parameters
   1768  #define X0    V0
   1769  #define X1    V1
   1770  #define Y0    V2
   1771  #define Y1    V3
   1772  #define T0    V4
   1773  #define T1    V5
   1774  
   1775  #define PL    V30
   1776  #define PH    V31
   1777  
   1778  #define Z3L    V23 // not referenced by p256PointDoubleAsm: Z3 is stored straight from T0/T1; V23 is also T3L
   1779  #define Z3H    V24 // not referenced either; V24 is also T3H
   1780  
   1781  #define ZER   V26
   1782  #define SEL1  V27
   1783  #define CAR1  V28
   1784  #define CAR2  V29
  1785  /*
  1786   * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  1787   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  1788   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1789   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  1790   * 	B  = 2Y₁
  1791   * 	Z₃ = B×Z₁
  1792   * 	C  = B²
  1793   * 	D  = C×X₁
  1794   * 	X₃ = A²-2D
  1795   * 	Y₃ = (D-X₃)×A-C²/2
  1796   *
  1797   * Three-operand formula:
  1798   *       T1 = Z1²
  1799   *       T2 = X1-T1
  1800   *       T1 = X1+T1
  1801   *       T2 = T2*T1
  1802   *       T2 = 3*T2
  1803   *       Y3 = 2*Y1
  1804   *       Z3 = Y3*Z1
  1805   *       Y3 = Y3²
  1806   *       T3 = Y3*X1
  1807   *       Y3 = Y3²
  1808   *       Y3 = half*Y3
  1809   *       X3 = T2²
  1810   *       T1 = 2*T3
  1811   *       X3 = X3-T1
  1812   *       T1 = T3-X3
  1813   *       T1 = T1*T2
  1814   *       Y3 = T1-Y3
  1815   */
  1816  
   1817  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
         	// Doubles P1 into P3 following the three-operand schedule in the
         	// comment above. Arguments are read from the frame at P3+0 and P1+8.
         	// Points are accessed as 16-byte halves, high half first: X at 0/16,
         	// Y at 32/48, Z at 64/80.
         	// Each coordinate of P1 is loaded for the last time before the
         	// matching coordinate of P3 is stored (Z3 first, then X3, then Y3).
         	// NOTE(review): that ordering looks safe for P3 == P1 - confirm
         	// against the callers.
   1818  	MOVD P3+0(FP), P3ptr
   1819  	MOVD P1+8(FP), P1ptr
   1820  
   1821  	MOVD $p256mul<>+0x00(SB), CPOOL
   1822  	VL   16(CPOOL), PL
   1823  	VL   0(CPOOL), PH
   1824  
   1825  	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
   1826  	VL   64(P1ptr), X1       // Z1H
   1827  	VL   80(P1ptr), X0       // Z1L
   1828  	VLR  X0, Y0
   1829  	VLR  X1, Y1
   1830  	CALL p256MulInternal<>(SB)
   1831  
   1832  	// SUB(X<X1-T)            // T2 = X1-T1
   1833  	VL 0(P1ptr), X1H
   1834  	VL 16(P1ptr), X1L
   1835  	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
   1836  
   1837  	// ADD(Y<X1+T)            // T1 = X1+T1
   1838  	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
   1839  
   1840  	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
   1841  	CALL p256MulInternal<>(SB)
   1842  
         	// First T2 = T + T, then T2 = T2 + T, which gives three times the
         	// product.
   1843  	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
   1844  	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
   1845  	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
   1846  
   1847  	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
   1848  	VL 32(P1ptr), Y1H
   1849  	VL 48(P1ptr), Y1L
   1850  	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
   1851  
   1852  	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
   1853  	VL   64(P1ptr), Y1       // Z1H
   1854  	VL   80(P1ptr), Y0       // Z1L
   1855  	CALL p256MulInternal<>(SB)
         	// Z3 is final at this point and is stored right away.
   1856  	VST  T1, 64(P3ptr)
   1857  	VST  T0, 80(P3ptr)
   1858  
   1859  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
   1860  	VLR  X0, Y0
   1861  	VLR  X1, Y1
   1862  	CALL p256MulInternal<>(SB)
   1863  
   1864  	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
   1865  	VLR  T0, X0
   1866  	VLR  T1, X1
   1867  	VL   0(P1ptr), Y1
   1868  	VL   16(P1ptr), Y0
   1869  	CALL p256MulInternal<>(SB)
   1870  	VLR  T0, T3L
   1871  	VLR  T1, T3H
   1872  
         	// X still holds Y3 from the previous step, so copying it into Y
         	// squares it.
   1873  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
   1874  	VLR  X0, Y0
   1875  	VLR  X1, Y1
   1876  	CALL p256MulInternal<>(SB)
   1877  
   1878  	// HAL(Y3<T)              // Y3 = half*Y3
   1879  	p256HalfInternal(Y3H,Y3L, T1,T0)
   1880  
   1881  	// X=T2; Y=T2; MUL; T-    // X3 = T2²
   1882  	VLR  T2L, X0
   1883  	VLR  T2H, X1
   1884  	VLR  T2L, Y0
   1885  	VLR  T2H, Y1
   1886  	CALL p256MulInternal<>(SB)
   1887  
   1888  	// ADD(T1<T3+T3)          // T1 = 2*T3
   1889  	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
   1890  
   1891  	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
   1892  	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
   1893  	VST X3H, 0(P3ptr)
   1894  	VST X3L, 16(P3ptr)
   1895  
   1896  	// SUB(X<T3-X3)           // T1 = T3-X3
   1897  	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
   1898  
         	// Y still holds T2 (set for the X3 = T2² multiply above).
   1899  	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
   1900  	CALL p256MulInternal<>(SB)
   1901  
   1902  	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
   1903  	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
   1904  
   1905  	VST Y3H, 32(P3ptr)
   1906  	VST Y3L, 48(P3ptr)
   1907  	RET
  1908  
  1909  #undef P3ptr
  1910  #undef P1ptr
  1911  #undef CPOOL
  1912  #undef X3L
  1913  #undef X3H
  1914  #undef Y3L
  1915  #undef Y3H
  1916  #undef T1L
  1917  #undef T1H
  1918  #undef T2L
  1919  #undef T2H
  1920  #undef T3L
  1921  #undef T3H
  1922  #undef X1L
  1923  #undef X1H
  1924  #undef Y1L
  1925  #undef Y1H
  1926  #undef Z1L
  1927  #undef Z1H
  1928  #undef TT0
  1929  #undef TT1
  1930  #undef T2
  1931  #undef X0
  1932  #undef X1
  1933  #undef Y0
  1934  #undef Y1
  1935  #undef T0
  1936  #undef T1
  1937  #undef PL
  1938  #undef PH
  1939  #undef Z3L
  1940  #undef Z3H
  1941  #undef ZER
  1942  #undef SEL1
  1943  #undef CAR1
  1944  #undef CAR2
  1945  
  1946  // p256PointAddAsm(P3, P1, P2 *p256Point)
  1947  #define P3ptr  R1
  1948  #define P1ptr  R2
  1949  #define P2ptr  R3
  1950  #define CPOOL  R4
  1951  #define ISZERO R5
  1952  #define TRUE   R6
  1953  
  1954  // Temporaries in REGs
  1955  #define T1L   V16
  1956  #define T1H   V17
  1957  #define T2L   V18
  1958  #define T2H   V19
  1959  #define U1L   V20
  1960  #define U1H   V21
  1961  #define S1L   V22
  1962  #define S1H   V23
  1963  #define HL    V24
  1964  #define HH    V25
  1965  #define RL    V26
  1966  #define RH    V27
  1967  
  1968  // Temps for Sub and Add
  1969  #define ZER   V6
  1970  #define SEL1  V7
  1971  #define CAR1  V8
  1972  #define CAR2  V9
  1973  #define TT0  V11
  1974  #define TT1  V12
  1975  #define T2   V13
  1976  
  1977  // p256MulAsm Parameters
  1978  #define X0    V0
  1979  #define X1    V1
  1980  #define Y0    V2
  1981  #define Y1    V3
  1982  #define T0    V4
  1983  #define T1    V5
  1984  
  1985  #define PL    V30
  1986  #define PH    V31
  1987  /*
  1988   * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  1989   *
  1990   * A = X₁×Z₂²
  1991   * B = Y₁×Z₂³
  1992   * C = X₂×Z₁²-A
  1993   * D = Y₂×Z₁³-B
  1994   * X₃ = D² - 2A×C² - C³
  1995   * Y₃ = D×(A×C² - X₃) - B×C³
  1996   * Z₃ = Z₁×Z₂×C
  1997   *
  1998   * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  1999   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2000   *
  2001   * T1 = Z1*Z1
  2002   * T2 = Z2*Z2
  2003   * U1 = X1*T2
  2004   * H  = X2*T1
  2005   * H  = H-U1
  2006   * Z3 = Z1*Z2
  2007   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2008   *
  2009   * S1 = Z2*T2
  2010   * S1 = Y1*S1
  2011   * R  = Z1*T1
  2012   * R  = Y2*R
  2013   * R  = R-S1
  2014   *
  2015   * T1 = H*H
  2016   * T2 = H*T1
  2017   * U1 = U1*T1
  2018   *
  2019   * X3 = R*R
  2020   * X3 = X3-T2
  2021   * T1 = 2*U1
  2022   * X3 = X3-T1 << store-out X3 result reg
  2023   *
  2024   * T2 = S1*T2
  2025   * Y3 = U1-X3
  2026   * Y3 = R*Y3
  2027   * Y3 = Y3-T2 << store-out Y3 result reg
  2028  
  2029   	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2030  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2031  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2032  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2033  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2034  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2035  	// SUB(H<H-T)            // H  = H-U1
  2036  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2037  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could overwrite Z1 if the slices share a backing array
  2038  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2039  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2040  	// SUB(R<T-S1)           // R  = R-S1
  2041  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2042  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2043  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2044  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2045  	// SUB(T<T-T2)           // X3 = X3-T2
  2046  	// ADD(X<U1+U1)          // T1 = 2*U1
  2047  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2048  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2049  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2050  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2051  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2052  	*/
  2053  TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
        	// Jacobian point addition P3 = P1 + P2, following the step list in the
        	// comment block that precedes this TEXT directive.
        	//
        	// Arguments: P3+0(FP), P1+8(FP) and P2+16(FP) are pointers, and
        	// ret+24(FP) receives an integer flag.
        	// Point layout as read and written below with 16-byte vector accesses:
        	//   offsets 0/16  = X high/low
        	//   offsets 32/48 = Y high/low
        	//   offsets 64/80 = Z high/low
        	// Result flag: 1 when both H and R are equal to 0 or to P, otherwise 0.
        	// NOTE(review): H and R both congruent to zero presumably tells the Go
        	// caller that the inputs are the same point so that it can switch to
        	// doubling - confirm against the caller.
        	//
        	// Shorthand used in the step comments: X=.. and Y=.. load the multiplier
        	// operands X1:X0 and Y1:Y0, MUL is CALL p256MulInternal, and dst=T copies
        	// the product out of T1:T0. X- or Y- means the operand is reused from the
        	// previous step, and T- means the product is left in T1:T0.
        	// NOTE(review): the reuse assumes p256MulInternal (defined elsewhere in
        	// the file) leaves X1:X0 and Y1:Y0 unchanged - confirm.
        	// The SUB/ADD macro calls are written as (dstH,dstL,aH,aL,bH,bL), matching
        	// the dst<a-b and dst<a+b notation of the comment next to each call.
  2054  	MOVD P3+0(FP), P3ptr
  2055  	MOVD P1+8(FP), P1ptr
  2056  	MOVD P2+16(FP), P2ptr
  2057  
  2058  	MOVD $p256mul<>+0x00(SB), CPOOL // CPOOL = address of the p256mul<> constant table
  2059  	VL   16(CPOOL), PL // table bytes 16..31: low half of the prime P
  2060  	VL   0(CPOOL), PH // table bytes 0..15: high half of the prime P
  2061  
  2062  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2063  	VL   64(P1ptr), X1       // Z1H
  2064  	VL   80(P1ptr), X0       // Z1L
  2065  	VLR  X0, Y0
  2066  	VLR  X1, Y1
  2067  	CALL p256MulInternal<>(SB)
  2068  
  2069  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2070  	VLR  T0, Y0
  2071  	VLR  T1, Y1
  2072  	CALL p256MulInternal<>(SB)
  2073  	VLR  T0, RL
  2074  	VLR  T1, RH
  2075  
  2076  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
        	// Y1:Y0 is not reloaded here, so it is expected to still hold Z1*Z1.
  2077  	VL   0(P2ptr), X1        // X2H
  2078  	VL   16(P2ptr), X0       // X2L
  2079  	CALL p256MulInternal<>(SB)
  2080  	VLR  T0, HL
  2081  	VLR  T1, HH
  2082  
  2083  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2084  	VL   64(P2ptr), X1       // Z2H
  2085  	VL   80(P2ptr), X0       // Z2L
  2086  	VLR  X0, Y0
  2087  	VLR  X1, Y1
  2088  	CALL p256MulInternal<>(SB)
  2089  
  2090  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2091  	VLR  T0, Y0
  2092  	VLR  T1, Y1
  2093  	CALL p256MulInternal<>(SB)
  2094  	VLR  T0, S1L
  2095  	VLR  T1, S1H
  2096  
  2097  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
        	// Y1:Y0 is not reloaded here, so it is expected to still hold Z2*Z2.
  2098  	VL   0(P1ptr), X1        // X1H
  2099  	VL   16(P1ptr), X0       // X1L
  2100  	CALL p256MulInternal<>(SB)
  2101  	VLR  T0, U1L
  2102  	VLR  T1, U1H
  2103  
  2104  	// SUB(H<H-T)            // H  = H-U1
        	// T1:T0 still holds U1 from the multiplication just above.
  2105  	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2106  
  2107  	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2108  	// clobbers T1H and T1L
        	// H^P is the XOR of H with the prime, which is zero exactly when H == P.
        	// Each test ORs the two halves together and compares the result with
        	// zero. MOVDEQ copies TRUE into ISZERO only when the compare reported
        	// all elements equal, so no branch is taken on the value of H.
  2109  	MOVD   $0, ISZERO
  2110  	MOVD   $1, TRUE
  2111  	VZERO  ZER
  2112  	VO     HL, HH, T1H // T1H = HL | HH, zero only if H == 0
  2113  	VCEQGS ZER, T1H, T1H
  2114  	MOVDEQ TRUE, ISZERO
  2115  	VX     HL, PL, T1L // T1L = HL ^ PL
  2116  	VX     HH, PH, T1H // T1H = HH ^ PH
  2117  	VO     T1L, T1H, T1H // zero only if H == P
  2118  	VCEQGS ZER, T1H, T1H
  2119  	MOVDEQ TRUE, ISZERO
  2120  	MOVD   ISZERO, ret+24(FP) // provisional result, combined with the R test further down
  2121  
  2122  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2123  	VL   64(P1ptr), X1       // Z1H
  2124  	VL   80(P1ptr), X0       // Z1L
  2125  	VL   64(P2ptr), Y1       // Z2H
  2126  	VL   80(P2ptr), Y0       // Z2L
  2127  	CALL p256MulInternal<>(SB)
  2128  
  2129  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2130  	VLR  T0, X0
  2131  	VLR  T1, X1
  2132  	VLR  HL, Y0
  2133  	VLR  HH, Y1
  2134  	CALL p256MulInternal<>(SB)
  2135  	VST  T1, 64(P3ptr) // store Z3 high half
  2136  	VST  T0, 80(P3ptr) // store Z3 low half
  2137  
  2138  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
        	// In this step comment Y1 is the Y coordinate of P1, loaded into the
        	// register pair X1:X0.
  2139  	VL   32(P1ptr), X1
  2140  	VL   48(P1ptr), X0
  2141  	VLR  S1L, Y0
  2142  	VLR  S1H, Y1
  2143  	CALL p256MulInternal<>(SB)
  2144  	VLR  T0, S1L
  2145  	VLR  T1, S1H
  2146  
  2147  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2148  	VL   32(P2ptr), X1
  2149  	VL   48(P2ptr), X0
  2150  	VLR  RL, Y0
  2151  	VLR  RH, Y1
  2152  	CALL p256MulInternal<>(SB)
  2153  
  2154  	// SUB(R<T-S1)           // R  = T-S1
  2155  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2156  
  2157  	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2158  	// clobbers T1H and T1L
        	// Same test as for H above, applied to R. The outcome is ANDed with the
        	// flag already stored in ret, so ret stays 1 only if both tests passed.
        	// NOTE(review): ISZERO, TRUE and ZER are set up again here, presumably
        	// because the calls and macros in between may clobber them - confirm.
  2159  	MOVD   $0, ISZERO
  2160  	MOVD   $1, TRUE
  2161  	VZERO  ZER
  2162  	VO     RL, RH, T1H // T1H = RL | RH, zero only if R == 0
  2163  	VCEQGS ZER, T1H, T1H
  2164  	MOVDEQ TRUE, ISZERO
  2165  	VX     RL, PL, T1L // T1L = RL ^ PL
  2166  	VX     RH, PH, T1H // T1H = RH ^ PH
  2167  	VO     T1L, T1H, T1H // zero only if R == P
  2168  	VCEQGS ZER, T1H, T1H
  2169  	MOVDEQ TRUE, ISZERO
  2170  	AND    ret+24(FP), ISZERO // ISZERO = ISZERO AND result of the H test
  2171  	MOVD   ISZERO, ret+24(FP) // final value of the returned flag
  2172  
  2173  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2174  	VLR  HL, X0
  2175  	VLR  HH, X1
  2176  	VLR  HL, Y0
  2177  	VLR  HH, Y1
  2178  	CALL p256MulInternal<>(SB)
  2179  
  2180  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2181  	VLR  T0, Y0
  2182  	VLR  T1, Y1
  2183  	CALL p256MulInternal<>(SB)
  2184  	VLR  T0, T2L
  2185  	VLR  T1, T2H
  2186  
  2187  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
        	// Y1:Y0 is not reloaded here, so it is expected to still hold H*H.
  2188  	VLR  U1L, X0
  2189  	VLR  U1H, X1
  2190  	CALL p256MulInternal<>(SB)
  2191  	VLR  T0, U1L
  2192  	VLR  T1, U1H
  2193  
  2194  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2195  	VLR  RL, X0
  2196  	VLR  RH, X1
  2197  	VLR  RL, Y0
  2198  	VLR  RH, Y1
  2199  	CALL p256MulInternal<>(SB)
  2200  
  2201  	// SUB(T<T-T2)           // X3 = X3-T2
  2202  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2203  
  2204  	// ADD(X<U1+U1)          // T1 = 2*U1
        	// The doubled value is placed in X1:X0, which is free between multiplies.
  2205  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2206  
  2207  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2208  	p256SubInternal(T1,T0,T1,T0,X1,X0)
  2209  	VST T1, 0(P3ptr) // store X3 high half
  2210  	VST T0, 16(P3ptr) // store X3 low half
  2211  
  2212  	// SUB(Y<U1-T)           // Y3 = U1-X3
        	// T1:T0 still holds X3, and the difference goes straight into Y1:Y0 as
        	// the second operand of the next multiplication.
  2213  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2214  
  2215  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2216  	VLR  RL, X0
  2217  	VLR  RH, X1
  2218  	CALL p256MulInternal<>(SB)
  2219  	VLR  T0, U1L
  2220  	VLR  T1, U1H
  2221  
  2222  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2223  	VLR  S1L, X0
  2224  	VLR  S1H, X1
  2225  	VLR  T2L, Y0
  2226  	VLR  T2H, Y1
  2227  	CALL p256MulInternal<>(SB)
  2228  
  2229  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
        	// U1H:U1L holds R*Y3 from the previous multiplication at this point.
  2230  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2231  	VST T1, 32(P3ptr) // store Y3 high half
  2232  	VST T0, 48(P3ptr) // store Y3 low half
  2233  
  2234  	RET