github.com/fisco-bcos/crypto@v0.0.0-20200202032121-bd8ab0b5d4f1/elliptic/p256_asm_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  #include "go_asm.h"
     7  
     8  
     9  DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f // p256ordK0: 32-bit constant that p256OrdMul loads into K0 and multiplies with the low word of each partial sum to derive MK0
    10  DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 // p256ord: 256-bit modulus used by p256OrdMul; bytes 0-15 are loaded as its high half (M1)
    11  DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
    12  DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 // p256ord bytes 16-31 are loaded as the low half (M0)
    13  DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
    14  DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 (bytes 0-15 of this table are loaded as the high half, PH, by p256FromMont)
    15  DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
    16  DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 (bytes 16-31 are loaded as the low half, PL, by p256FromMont)
    17  DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
    18  DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    19  DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    20  DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0  d1 d0  0 (VPERM selector loaded as SEL2 by p256FromMont)
    21  DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0  d1 d0  0
    22  DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 (VPERM selector loaded as SEL1 by p256FromMont)
    23  DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
    24  DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 (bytes 0-15 of this table are loaded as PH by p256NegCond)
    25  DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
    26  DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 (bytes 16-31 are loaded as PL by p256NegCond)
    27  DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
    28  DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0  0  0 d0
    29  DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0  0  0 d0
    30  DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0  0 d1 d0
    31  DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0  0 d1 d0
    32  DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL  0 d1 d0 d1
    33  DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL  0 d1 d0 d1
    34  DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL  0  0 d1 d0
    35  DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL  0  0 d1 d0
    36  DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    37  DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
    38  DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0  d1 d0  0
    39  DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0  d1 d0  0
    40  DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
    41  DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
    42  DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
    43  DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
    44  GLOBL p256ordK0<>(SB), 8, $4 // flag 8 is RODATA in textflag.h; symbol size 4 bytes
    45  GLOBL p256ord<>(SB), 8, $32 // read-only, 32 bytes (4 doublewords)
    46  GLOBL p256<>(SB), 8, $80 // read-only, 80 bytes: 32-byte prime followed by three 16-byte selectors
    47  GLOBL p256mul<>(SB), 8, $160 // read-only, 160 bytes: prime, six 16-byte selectors, then (1*2^256)%P256
    48  
    49  DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718 // p256vmsl: eight 16-byte tables of byte indices; NOTE(review): they look like VPERM selectors, but the code that uses them is not in this part of the file - confirm against the consumer
    50  DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
    51  DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
    52  DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
    53  DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
    54  DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
    55  DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
    56  DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
    57  DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
    58  DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
    59  DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
    60  DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
    61  DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
    62  DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
    63  DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
    64  DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
    65  GLOBL p256vmsl<>(SB), 8, $128 // flag 8 is RODATA in textflag.h; 128 bytes = 8 x 16-byte tables
    66  
    67  // ---------------------------------------
    68  // If cond != 0 (the original note says cond == 1): val <- -val, computed as P - Y on bytes 32-63 of *val; cond == 0 leaves val unchanged. Branch-free: both candidates are computed and VSEL picks one.
    69  // func p256NegCond(val *p256Point, cond int)
    70  #define P1ptr   R1 // pointer argument val
    71  #define CPOOL   R4 // address of the p256mul<> constant table
    72  
    73  #define Y1L   V0 // bytes 48-63 of *val (low half of the value being negated)
    74  #define Y1H   V1 // bytes 32-47 of *val (high half of the value being negated)
    75  #define T1L   V2 // low half of P - Y1
    76  #define T1H   V3 // high half of P - Y1
    77  
    78  #define PL    V30 // low half of the P256 prime
    79  #define PH    V31 // high half of the P256 prime
    80  
    81  #define ZER   V4 // all-zero vector used for the cond == 0 comparison
    82  #define SEL1  V5 // select mask: all ones when cond == 0, all zeros otherwise
    83  #define CAR1  V6 // borrow indication passed from the low-half subtraction to the high half
    84  TEXT ·p256NegCond(SB), NOSPLIT, $0
    85  	MOVD val+0(FP), P1ptr
    86  
    87  	MOVD $p256mul<>+0x00(SB), CPOOL
    88  	VL   16(CPOOL), PL // table bytes 16-31: low half of P
    89  	VL   0(CPOOL), PH // table bytes 0-15: high half of P
    90  
    91  	VL 32(P1ptr), Y1H
    92  	VL 48(P1ptr), Y1L
    93  
    94  	VLREPG cond+8(FP), SEL1 // replicate the 64-bit cond into both doublewords
    95  	VZERO  ZER
    96  	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones iff cond == 0
    97  
    98  	VSCBIQ Y1L, PL, CAR1 // borrow indication for PL - Y1L
    99  	VSQ    Y1L, PL, T1L // T1L = PL - Y1L
   100  	VSBIQ  PH, Y1H, CAR1, T1H // T1H = PH - Y1H, taking the low-half borrow into account
   101  
   102  	VSEL Y1L, T1L, SEL1, Y1L // cond == 0 keeps Y1, otherwise take P - Y1
   103  	VSEL Y1H, T1H, SEL1, Y1H
   104  
   105  	VST Y1H, 32(P1ptr) // the store happens unconditionally, whatever cond was
   106  	VST Y1L, 48(P1ptr)
   107  	RET
   108  
   109  #undef P1ptr
   110  #undef CPOOL
   111  #undef Y1L
   112  #undef Y1H
   113  #undef T1L
   114  #undef T1H
   115  #undef PL
   116  #undef PH
   117  #undef ZER
   118  #undef SEL1
   119  #undef CAR1
   120  
   121  // ---------------------------------------
   122  // if cond == 0 res <- b; else res <- a  (all 96 bytes are copied; both inputs are always loaded and VSEL picks one, so there is no branch on cond)
   123  // func p256MovCond(res, a, b *p256Point, cond int)
   124  #define P3ptr   R1 // res
   125  #define P1ptr   R2 // a
   126  #define P2ptr   R3 // b
   127  
   128  #define X1L    V0 // X1*/Y1*/Z1* hold the six 16-byte pieces of *a and, after VSEL, the result
   129  #define X1H    V1
   130  #define Y1L    V2
   131  #define Y1H    V3
   132  #define Z1L    V4
   133  #define Z1H    V5
   134  #define X2L    V6 // X2*/Y2*/Z2* hold the six 16-byte pieces of *b
   135  #define X2H    V7
   136  #define Y2L    V8
   137  #define Y2H    V9
   138  #define Z2L    V10
   139  #define Z2H    V11
   140  
   141  #define ZER   V18 // all-zero vector for the cond == 0 comparison
   142  #define SEL1  V19 // select mask: all ones when cond == 0
   143  TEXT ·p256MovCond(SB), NOSPLIT, $0
   144  	MOVD   res+0(FP), P3ptr
   145  	MOVD   a+8(FP), P1ptr
   146  	MOVD   b+16(FP), P2ptr
   147  	VLREPG cond+24(FP), SEL1 // replicate the 64-bit cond into both doublewords
   148  	VZERO  ZER
   149  	VCEQG  SEL1, ZER, SEL1 // SEL1 = all ones iff cond == 0
   150  
   151  	VL 0(P1ptr), X1H // load all 96 bytes of *a
   152  	VL 16(P1ptr), X1L
   153  	VL 32(P1ptr), Y1H
   154  	VL 48(P1ptr), Y1L
   155  	VL 64(P1ptr), Z1H
   156  	VL 80(P1ptr), Z1L
   157  
   158  	VL 0(P2ptr), X2H // load all 96 bytes of *b
   159  	VL 16(P2ptr), X2L
   160  	VL 32(P2ptr), Y2H
   161  	VL 48(P2ptr), Y2L
   162  	VL 64(P2ptr), Z2H
   163  	VL 80(P2ptr), Z2L
   164  
   165  	VSEL X2L, X1L, SEL1, X1L // mask set (cond == 0) takes the piece of b, otherwise the piece of a
   166  	VSEL X2H, X1H, SEL1, X1H
   167  	VSEL Y2L, Y1L, SEL1, Y1L
   168  	VSEL Y2H, Y1H, SEL1, Y1H
   169  	VSEL Z2L, Z1L, SEL1, Z1L
   170  	VSEL Z2H, Z1H, SEL1, Z1H
   171  
   172  	VST X1H, 0(P3ptr) // write the selected 96 bytes to *res
   173  	VST X1L, 16(P3ptr)
   174  	VST Y1H, 32(P3ptr)
   175  	VST Y1L, 48(P3ptr)
   176  	VST Z1H, 64(P3ptr)
   177  	VST Z1L, 80(P3ptr)
   178  
   179  	RET
   180  
   181  #undef P3ptr
   182  #undef P1ptr
   183  #undef P2ptr
   184  #undef X1L
   185  #undef X1H
   186  #undef Y1L
   187  #undef Y1H
   188  #undef Z1L
   189  #undef Z1H
   190  #undef X2L
   191  #undef X2H
   192  #undef Y2L
   193  #undef Y2H
   194  #undef Z2L
   195  #undef Z2H
   196  #undef ZER
   197  #undef SEL1
   198  
   199  // ---------------------------------------
   200  // Constant time table access: every one of the 16 entries is loaded on every call and VSEL keeps the one whose number equals idx
   201  // Indexed from 1 to 16, with -1 offset (the entry numbered k is the k-th 96-byte record, at byte offset (k-1)*96)
   202  // (index 0 is implicitly point at infinity: nothing matches, so the all-zero accumulator is stored)
   203  // func p256Select(point *p256Point, table []p256Point, idx int)
   204  #define P3ptr   R1 // destination point
   205  #define P1ptr   R2 // cursor over the table's backing array, advanced by 96 per iteration
   206  #define COUNT   R4 // loop counter, 1..16
   207  
   208  #define X1L    V0 // X1*/Y1*/Z1*: accumulator for the selected entry (starts as zero)
   209  #define X1H    V1
   210  #define Y1L    V2
   211  #define Y1H    V3
   212  #define Z1L    V4
   213  #define Z1H    V5
   214  #define X2L    V6 // X2*/Y2*/Z2*: the table entry loaded in the current iteration
   215  #define X2H    V7
   216  #define Y2L    V8
   217  #define Y2H    V9
   218  #define Z2L    V10
   219  #define Z2H    V11
   220  
   221  #define ONE   V18 // every byte = 1; increment for SEL2
   222  #define IDX   V19 // every byte = low byte of idx
   223  #define SEL1  V20 // per-iteration select mask
   224  #define SEL2  V21 // every byte = number of the current entry (1, 2, ...)
   225  TEXT ·p256Select(SB), NOSPLIT, $0
   226  	MOVD   point+0(FP), P3ptr
   227  	MOVD   table+8(FP), P1ptr
   228  	VLREPB idx+(32+7)(FP), IDX // only the last (least-significant, big-endian) byte of idx is read and replicated
   229  	VREPIB $1, ONE
   230  	VREPIB $1, SEL2
   231  	MOVD   $1, COUNT
   232  
   233  	VZERO X1H // result stays zero if no entry number matches idx
   234  	VZERO X1L
   235  	VZERO Y1H
   236  	VZERO Y1L
   237  	VZERO Z1H
   238  	VZERO Z1L
   239  
   240  loop_select:
   241  	VL 0(P1ptr), X2H
   242  	VL 16(P1ptr), X2L
   243  	VL 32(P1ptr), Y2H
   244  	VL 48(P1ptr), Y2L
   245  	VL 64(P1ptr), Z2H
   246  	VL 80(P1ptr), Z2L
   247  
   248  	VCEQG SEL2, IDX, SEL1 // doubleword compare of two byte-replicated vectors: all ones iff entry number == idx byte
   249  
   250  	VSEL X2L, X1L, SEL1, X1L // on a match take the loaded entry, otherwise keep the accumulator
   251  	VSEL X2H, X1H, SEL1, X1H
   252  	VSEL Y2L, Y1L, SEL1, Y1L
   253  	VSEL Y2H, Y1H, SEL1, Y1H
   254  	VSEL Z2L, Z1L, SEL1, Z1L
   255  	VSEL Z2H, Z1H, SEL1, Z1H
   256  
   257  	VAB  SEL2, ONE, SEL2 // next entry number (byte-wise +1)
   258  	ADDW $1, COUNT
   259  	ADD  $96, P1ptr // advance to the next 96-byte entry
   260  	CMPW COUNT, $17
   261  	BLT  loop_select // iterate while COUNT < 17, i.e. 16 entries in total
   262  
   263  	VST X1H, 0(P3ptr)
   264  	VST X1L, 16(P3ptr)
   265  	VST Y1H, 32(P3ptr)
   266  	VST Y1L, 48(P3ptr)
   267  	VST Z1H, 64(P3ptr)
   268  	VST Z1L, 80(P3ptr)
   269  	RET
   270  
   271  #undef P3ptr
   272  #undef P1ptr
   273  #undef COUNT
   274  #undef X1L
   275  #undef X1H
   276  #undef Y1L
   277  #undef Y1H
   278  #undef Z1L
   279  #undef Z1H
   280  #undef X2L
   281  #undef X2H
   282  #undef Y2L
   283  #undef Y2H
   284  #undef Z2L
   285  #undef Z2H
   286  #undef ONE
   287  #undef IDX
   288  #undef SEL1
   289  #undef SEL2
   290  
   291  // ---------------------------------------
   292  // Constant time table access: every one of the 64 entries is loaded on every call and VSEL keeps the one whose number equals idx
   293  // Indexed from 1 to 64, with -1 offset (the entry numbered k is the k-th 96-byte record, at byte offset (k-1)*96)
   294  // (index 0 is implicitly point at infinity: nothing matches, so the all-zero accumulator is stored)
   295  // func p256SelectBase(point *p256Point, table []p256Point, idx int)
   296  #define P3ptr   R1 // destination point
   297  #define P1ptr   R2 // cursor over the table's backing array, advanced by 96 per iteration
   298  #define COUNT   R4 // loop counter, 1..64
   299  
   300  #define X1L    V0 // X1*/Y1*/Z1*: accumulator for the selected entry (starts as zero)
   301  #define X1H    V1
   302  #define Y1L    V2
   303  #define Y1H    V3
   304  #define Z1L    V4
   305  #define Z1H    V5
   306  #define X2L    V6 // X2*/Y2*/Z2*: the table entry loaded in the current iteration
   307  #define X2H    V7
   308  #define Y2L    V8
   309  #define Y2H    V9
   310  #define Z2L    V10
   311  #define Z2H    V11
   312  
   313  #define ONE   V18 // every byte = 1; increment for SEL2
   314  #define IDX   V19 // every byte = low byte of idx
   315  #define SEL1  V20 // per-iteration select mask
   316  #define SEL2  V21 // every byte = number of the current entry (1, 2, ...)
   317  TEXT ·p256SelectBase(SB), NOSPLIT, $0
   318  	MOVD   point+0(FP), P3ptr
   319  	MOVD   table+8(FP), P1ptr
   320  	VLREPB idx+(32+7)(FP), IDX // only the last (least-significant, big-endian) byte of idx is read and replicated
   321  	VREPIB $1, ONE
   322  	VREPIB $1, SEL2
   323  	MOVD   $1, COUNT
   324  
   325  	VZERO X1H // result stays zero if no entry number matches idx
   326  	VZERO X1L
   327  	VZERO Y1H
   328  	VZERO Y1L
   329  	VZERO Z1H
   330  	VZERO Z1L
   331  
   332  loop_select:
   333  	VL 0(P1ptr), X2H
   334  	VL 16(P1ptr), X2L
   335  	VL 32(P1ptr), Y2H
   336  	VL 48(P1ptr), Y2L
   337  	VL 64(P1ptr), Z2H
   338  	VL 80(P1ptr), Z2L
   339  
   340  	VCEQG SEL2, IDX, SEL1 // doubleword compare of two byte-replicated vectors: all ones iff entry number == idx byte
   341  
   342  	VSEL X2L, X1L, SEL1, X1L // on a match take the loaded entry, otherwise keep the accumulator
   343  	VSEL X2H, X1H, SEL1, X1H
   344  	VSEL Y2L, Y1L, SEL1, Y1L
   345  	VSEL Y2H, Y1H, SEL1, Y1H
   346  	VSEL Z2L, Z1L, SEL1, Z1L
   347  	VSEL Z2H, Z1H, SEL1, Z1H
   348  
   349  	VAB  SEL2, ONE, SEL2 // next entry number (byte-wise +1)
   350  	ADDW $1, COUNT
   351  	ADD  $96, P1ptr // advance to the next 96-byte entry
   352  	CMPW COUNT, $65
   353  	BLT  loop_select // iterate while COUNT < 65, i.e. 64 entries in total
   354  
   355  	VST X1H, 0(P3ptr)
   356  	VST X1L, 16(P3ptr)
   357  	VST Y1H, 32(P3ptr)
   358  	VST Y1L, 48(P3ptr)
   359  	VST Z1H, 64(P3ptr)
   360  	VST Z1L, 80(P3ptr)
   361  	RET
   362  
   363  #undef P3ptr
   364  #undef P1ptr
   365  #undef COUNT
   366  #undef X1L
   367  #undef X1H
   368  #undef Y1L
   369  #undef Y1H
   370  #undef Z1L
   371  #undef Z1H
   372  #undef X2L
   373  #undef X2H
   374  #undef Y2L
   375  #undef Y2H
   376  #undef Z2L
   377  #undef Z2H
   378  #undef ONE
   379  #undef IDX
   380  #undef SEL1
   381  #undef SEL2
   382  
   383  // ---------------------------------------
   384  // func p256FromMont(res, in []byte)
   385  #define res_ptr R1 // data pointer of res
   386  #define x_ptr   R2 // data pointer of in
   387  #define CPOOL   R4 // address of the p256<> constant table
   388  
   389  #define T0   V0 // running value, low 128 bits
   390  #define T1   V1 // running value, high 128 bits
   391  #define T2   V2 // carry word above T1
   392  #define TT0  V3 // low half of (T - P)
   393  #define TT1  V4 // high half of (T - P)
   394  
   395  #define ZER   V6 // all zeros
   396  #define SEL1  V7 // VPERM selector "d1 d0 d1 d0"
   397  #define SEL2  V8 // VPERM selector "0 d1 d0 0"
   398  #define CAR1  V9 // carry/borrow out of the low half
   399  #define CAR2  V10 // carry/borrow out of the high half
   400  #define RED1  V11 // low half of the per-round addend
   401  #define RED2  V12 // high half of the per-round addend
   402  #define PL    V13 // low half of the P256 prime
   403  #define PH    V14 // high half of the P256 prime
   404  // Reads 32 bytes from in, runs four identical reduction rounds (each drops the low 64 bits of T and adds an addend built from them), then conditionally subtracts P and writes 32 bytes to res. Per the name this converts out of Montgomery form - presumably in * 2^-256 mod P; confirm against the Go caller.
   405  TEXT ·p256FromMont(SB), NOSPLIT, $0
   406  	MOVD res+0(FP), res_ptr
   407  	MOVD in+24(FP), x_ptr // in follows the 24-byte slice header of res
   408  
   409  	VZERO T2
   410  	VZERO ZER
   411  	MOVD  $p256<>+0x00(SB), CPOOL
   412  	VL    16(CPOOL), PL
   413  	VL    0(CPOOL), PH
   414  	VL    48(CPOOL), SEL2
   415  	VL    64(CPOOL), SEL1
   416  
   417  	VL (1*16)(x_ptr), T0 // bytes 16-31 of in: low half
   418  	VL (0*16)(x_ptr), T1 // bytes 0-15 of in: high half
   419  
   420  	// First round
   421  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   422  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   423  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   424  
   425  	VSLDB $8, T1, T0, T0 // T0 = bytes 8-23 of T1||T0, i.e. shift T2||T1||T0 right by 64 bits
   426  	VSLDB $8, T2, T1, T1
   427  
   428  	VACCQ  T0, RED1, CAR1 // carry out of T0 + RED1 (taken before T0 is overwritten)
   429  	VAQ    T0, RED1, T0
   430  	VACCCQ T1, RED2, CAR1, CAR2 // carry out of T1 + RED2 + CAR1 (taken before T1 is overwritten)
   431  	VACQ   T1, RED2, CAR1, T1
   432  	VAQ    T2, CAR2, T2
   433  
   434  	// Second round
   435  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   436  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   437  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   438  
   439  	VSLDB $8, T1, T0, T0
   440  	VSLDB $8, T2, T1, T1
   441  
   442  	VACCQ  T0, RED1, CAR1
   443  	VAQ    T0, RED1, T0
   444  	VACCCQ T1, RED2, CAR1, CAR2
   445  	VACQ   T1, RED2, CAR1, T1
   446  	VAQ    T2, CAR2, T2
   447  
   448  	// Third round
   449  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   450  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   451  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   452  
   453  	VSLDB $8, T1, T0, T0
   454  	VSLDB $8, T2, T1, T1
   455  
   456  	VACCQ  T0, RED1, CAR1
   457  	VAQ    T0, RED1, T0
   458  	VACCCQ T1, RED2, CAR1, CAR2
   459  	VACQ   T1, RED2, CAR1, T1
   460  	VAQ    T2, CAR2, T2
   461  
   462  	// Last round
   463  	VPERM T1, T0, SEL1, RED2    // d1 d0 d1 d0
   464  	VPERM ZER, RED2, SEL2, RED1 // 0  d1 d0  0
   465  	VSQ   RED1, RED2, RED2      // Guaranteed not to underflow
   466  
   467  	VSLDB $8, T1, T0, T0
   468  	VSLDB $8, T2, T1, T1
   469  
   470  	VACCQ  T0, RED1, CAR1
   471  	VAQ    T0, RED1, T0
   472  	VACCCQ T1, RED2, CAR1, CAR2
   473  	VACQ   T1, RED2, CAR1, T1
   474  	VAQ    T2, CAR2, T2
   475  
   476  	// --------------------------------------------------- final step: compute T - P and keep it unless the subtraction borrowed past T2
   477  
   478  	VSCBIQ  PL, T0, CAR1 // borrow indication for T0 - PL
   479  	VSQ     PL, T0, TT0 // TT0 = T0 - PL
   480  	VSBCBIQ T1, PH, CAR1, CAR2 // borrow indication for T1 - PH with the low-half borrow
   481  	VSBIQ   T1, PH, CAR1, TT1 // TT1 = T1 - PH with the low-half borrow
   482  	VSBIQ   T2, ZER, CAR2, T2 // T2 = T2 - 0 with the high-half borrow; the result is used as the select mask below
   483  
   484  	// what output to use, TT1||TT0 or T1||T0?
   485  	VSEL T0, TT0, T2, T0 // mask bits set keep T (unsubtracted), cleared bits take T - P
   486  	VSEL T1, TT1, T2, T1
   487  
   488  	VST T0, (1*16)(res_ptr) // low half to bytes 16-31 of res
   489  	VST T1, (0*16)(res_ptr) // high half to bytes 0-15 of res
   490  	RET
   491  
   492  #undef res_ptr
   493  #undef x_ptr
   494  #undef CPOOL
   495  #undef T0
   496  #undef T1
   497  #undef T2
   498  #undef TT0
   499  #undef TT1
   500  #undef ZER
   501  #undef SEL1
   502  #undef SEL2
   503  #undef CAR1
   504  #undef CAR2
   505  #undef RED1
   506  #undef RED2
   507  #undef PL
   508  #undef PH
   509  
   510  // ---------------------------------------
   511  // func p256OrdMul(res, in1, in2 []byte)
   512  #define res_ptr R1 // data pointer of res
   513  #define x_ptr R2 // data pointer of in1
   514  #define y_ptr R3 // data pointer of in2
   515  #define X0    V0 // in1, low 128 bits
   516  #define X1    V1 // in1, high 128 bits
   517  #define Y0    V2 // in2, low 128 bits
   518  #define Y1    V3 // in2, high 128 bits
   519  #define M0    V4 // modulus p256ord, low 128 bits
   520  #define M1    V5 // modulus p256ord, high 128 bits
   521  #define T0    V6 // running result, low 128 bits
   522  #define T1    V7 // running result, high 128 bits
   523  #define T2    V8 // running result, carry word above T1
   524  #define YDIG  V9 // the current 32-bit word of in2, replicated to all four words
   525  // Word-serial multiply/reduce of in1 by in2 with the modulus p256ord<>: eight rounds, one per 32-bit word of in2 (lowest word first). Each round adds in1*word to T, adds MK0*modulus, and shifts right by one word. A final conditional subtraction of the modulus follows. The same round structure repeats eight times unrolled.
   526  #define ADD1  V16 // low 32 bits of each X0*YDIG (+T0) word product
   527  #define ADD1H V17 // high 32 bits of each X0*YDIG (+T0) word product
   528  #define ADD2  V18 // low 32 bits of each X1*YDIG (+T1) word product
   529  #define ADD2H V19 // high 32 bits of each X1*YDIG (+T1) word product
   530  #define RED1  V20 // low 32 bits of each M0*MK0 + ADD1 word product
   531  #define RED1H V21 // high 32 bits of each M0*MK0 + ADD1 word product
   532  #define RED2  V22 // low 32 bits of each M1*MK0 + ADD2 word product
   533  #define RED2H V23 // high 32 bits of each M1*MK0 + ADD2 word product
   534  #define CAR1  V24 // carry out of the first low-half addition
   535  #define CAR1M V25 // carry out of the second low-half addition
   536  
   537  #define MK0   V30 // per-round multiplier: word 3 of ADD1*K0, replicated to all words
   538  #define K0    V31 // p256ordK0 in word element 3 (the other elements are never initialised and never used)
   539  TEXT ·p256OrdMul(SB), NOSPLIT, $0
   540  	MOVD res+0(FP), res_ptr
   541  	MOVD in1+24(FP), x_ptr
   542  	MOVD in2+48(FP), y_ptr
   543  
   544  	VZERO T2
   545  	MOVD  $p256ordK0<>+0x00(SB), R4
   546  
   547  	// VLEF    $3, 0(R4), K0  (hand-encoded in the WORD/BYTE directives below; presumably the assembler lacked the mnemonic when this was written)
   548  	WORD $0xE7F40000
   549  	BYTE $0x38
   550  	BYTE $0x03
   551  	MOVD $p256ord<>+0x00(SB), R4
   552  	VL   16(R4), M0
   553  	VL   0(R4), M1
   554  
   555  	VL (1*16)(x_ptr), X0 // bytes 16-31 of each input are the low half, bytes 0-15 the high half
   556  	VL (0*16)(x_ptr), X1
   557  	VL (1*16)(y_ptr), Y0
   558  	VL (0*16)(y_ptr), Y1
   559  
   560  	// ---------------------------------------------------------------------------/ round 1: T0/T1 hold nothing yet, so plain multiplies (VMLF/VMLHF) are used instead of multiply-and-add
   561  	VREPF $3, Y0, YDIG // word 3 of Y0: the least-significant 32-bit word of in2
   562  	VMLF  X0, YDIG, ADD1
   563  	VMLF  ADD1, K0, MK0 // only word 3 of this product is meaningful (K0 has just that element loaded)
   564  	VREPF $3, MK0, MK0 // broadcast that word to all four lanes
   565  
   566  	VMLF  X1, YDIG, ADD2
   567  	VMLHF X0, YDIG, ADD1H
   568  	VMLHF X1, YDIG, ADD2H
   569  
   570  	VMALF  M0, MK0, ADD1, RED1
   571  	VMALHF M0, MK0, ADD1, RED1H
   572  	VMALF  M1, MK0, ADD2, RED2
   573  	VMALHF M1, MK0, ADD2, RED2H
   574  
   575  	VSLDB $12, RED2, RED1, RED1 // RED1 = bytes 12-27 of RED2||RED1: shift the low-word vector right by one 32-bit word
   576  	VSLDB $12, T2, RED2, RED2 // the low word of T2 becomes the top word of RED2
   577  
   578  	VACCQ RED1, ADD1H, CAR1
   579  	VAQ   RED1, ADD1H, T0
   580  	VACCQ RED1H, T0, CAR1M // carry is taken before T0 is overwritten on the next line
   581  	VAQ   RED1H, T0, T0
   582  
   583  	// << ready for next MK0
   584  
   585  	VACQ   RED2, ADD2H, CAR1, T1 // the sum is taken first because the next line overwrites CAR1 with the carry-out
   586  	VACCCQ RED2, ADD2H, CAR1, CAR1
   587  	VACCCQ RED2H, T1, CAR1M, T2 // carry is taken before T1 is overwritten on the next line
   588  	VACQ   RED2H, T1, CAR1M, T1
   589  	VAQ    CAR1, T2, T2 // T2 = sum of the two high-half carries
   590  
   591  	// ---------------------------------------------------
   592  /* *
   593   * ---+--------+--------+
   594   *  T2|   T1   |   T0   |
   595   * ---+--------+--------+
   596   *           *(add)*
   597   *    +--------+--------+
   598   *    |   X1   |   X0   |
   599   *    +--------+--------+
   600   *           *(mul)*
   601   *    +--------+--------+
   602   *    |  YDIG  |  YDIG  |
   603   *    +--------+--------+
   604   *           *(add)*
   605   *    +--------+--------+
   606   *    |   M1   |   M0   |
   607   *    +--------+--------+
   608   *           *(mul)*
   609   *    +--------+--------+
   610   *    |   MK0  |   MK0  |
   611   *    +--------+--------+
   612   *
   613   *   ---------------------
   614   *
   615   *    +--------+--------+
   616   *    |  ADD2  |  ADD1  |
   617   *    +--------+--------+
   618   *  +--------+--------+
   619   *  | ADD2H  | ADD1H  |
   620   *  +--------+--------+
   621   *    +--------+--------+
   622   *    |  RED2  |  RED1  |
   623   *    +--------+--------+
   624   *  +--------+--------+
   625   *  | RED2H  | RED1H  |
   626   *  +--------+--------+
   627   */
   628  	VREPF $2, Y0, YDIG // round 2: word 2 of Y0; from here on T0/T1 are folded in with VMALF/VMALHF
   629  	VMALF X0, YDIG, T0, ADD1
   630  	VMLF  ADD1, K0, MK0
   631  	VREPF $3, MK0, MK0
   632  
   633  	VMALF  X1, YDIG, T1, ADD2
   634  	VMALHF X0, YDIG, T0, ADD1H
   635  	VMALHF X1, YDIG, T1, ADD2H
   636  
   637  	VMALF  M0, MK0, ADD1, RED1
   638  	VMALHF M0, MK0, ADD1, RED1H
   639  	VMALF  M1, MK0, ADD2, RED2
   640  	VMALHF M1, MK0, ADD2, RED2H
   641  
   642  	VSLDB $12, RED2, RED1, RED1
   643  	VSLDB $12, T2, RED2, RED2
   644  
   645  	VACCQ RED1, ADD1H, CAR1
   646  	VAQ   RED1, ADD1H, T0
   647  	VACCQ RED1H, T0, CAR1M
   648  	VAQ   RED1H, T0, T0
   649  
   650  	// << ready for next MK0
   651  
   652  	VACQ   RED2, ADD2H, CAR1, T1
   653  	VACCCQ RED2, ADD2H, CAR1, CAR1
   654  	VACCCQ RED2H, T1, CAR1M, T2
   655  	VACQ   RED2H, T1, CAR1M, T1
   656  	VAQ    CAR1, T2, T2
   657  
   658  	// ---------------------------------------------------
   659  	VREPF $1, Y0, YDIG // round 3: word 1 of Y0
   660  	VMALF X0, YDIG, T0, ADD1
   661  	VMLF  ADD1, K0, MK0
   662  	VREPF $3, MK0, MK0
   663  
   664  	VMALF  X1, YDIG, T1, ADD2
   665  	VMALHF X0, YDIG, T0, ADD1H
   666  	VMALHF X1, YDIG, T1, ADD2H
   667  
   668  	VMALF  M0, MK0, ADD1, RED1
   669  	VMALHF M0, MK0, ADD1, RED1H
   670  	VMALF  M1, MK0, ADD2, RED2
   671  	VMALHF M1, MK0, ADD2, RED2H
   672  
   673  	VSLDB $12, RED2, RED1, RED1
   674  	VSLDB $12, T2, RED2, RED2
   675  
   676  	VACCQ RED1, ADD1H, CAR1
   677  	VAQ   RED1, ADD1H, T0
   678  	VACCQ RED1H, T0, CAR1M
   679  	VAQ   RED1H, T0, T0
   680  
   681  	// << ready for next MK0
   682  
   683  	VACQ   RED2, ADD2H, CAR1, T1
   684  	VACCCQ RED2, ADD2H, CAR1, CAR1
   685  	VACCCQ RED2H, T1, CAR1M, T2
   686  	VACQ   RED2H, T1, CAR1M, T1
   687  	VAQ    CAR1, T2, T2
   688  
   689  	// ---------------------------------------------------
   690  	VREPF $0, Y0, YDIG // round 4: word 0 of Y0
   691  	VMALF X0, YDIG, T0, ADD1
   692  	VMLF  ADD1, K0, MK0
   693  	VREPF $3, MK0, MK0
   694  
   695  	VMALF  X1, YDIG, T1, ADD2
   696  	VMALHF X0, YDIG, T0, ADD1H
   697  	VMALHF X1, YDIG, T1, ADD2H
   698  
   699  	VMALF  M0, MK0, ADD1, RED1
   700  	VMALHF M0, MK0, ADD1, RED1H
   701  	VMALF  M1, MK0, ADD2, RED2
   702  	VMALHF M1, MK0, ADD2, RED2H
   703  
   704  	VSLDB $12, RED2, RED1, RED1
   705  	VSLDB $12, T2, RED2, RED2
   706  
   707  	VACCQ RED1, ADD1H, CAR1
   708  	VAQ   RED1, ADD1H, T0
   709  	VACCQ RED1H, T0, CAR1M
   710  	VAQ   RED1H, T0, T0
   711  
   712  	// << ready for next MK0
   713  
   714  	VACQ   RED2, ADD2H, CAR1, T1
   715  	VACCCQ RED2, ADD2H, CAR1, CAR1
   716  	VACCCQ RED2H, T1, CAR1M, T2
   717  	VACQ   RED2H, T1, CAR1M, T1
   718  	VAQ    CAR1, T2, T2
   719  
   720  	// ---------------------------------------------------
   721  	VREPF $3, Y1, YDIG // round 5: word 3 of Y1 (the high half of in2 starts here)
   722  	VMALF X0, YDIG, T0, ADD1
   723  	VMLF  ADD1, K0, MK0
   724  	VREPF $3, MK0, MK0
   725  
   726  	VMALF  X1, YDIG, T1, ADD2
   727  	VMALHF X0, YDIG, T0, ADD1H
   728  	VMALHF X1, YDIG, T1, ADD2H
   729  
   730  	VMALF  M0, MK0, ADD1, RED1
   731  	VMALHF M0, MK0, ADD1, RED1H
   732  	VMALF  M1, MK0, ADD2, RED2
   733  	VMALHF M1, MK0, ADD2, RED2H
   734  
   735  	VSLDB $12, RED2, RED1, RED1
   736  	VSLDB $12, T2, RED2, RED2
   737  
   738  	VACCQ RED1, ADD1H, CAR1
   739  	VAQ   RED1, ADD1H, T0
   740  	VACCQ RED1H, T0, CAR1M
   741  	VAQ   RED1H, T0, T0
   742  
   743  	// << ready for next MK0
   744  
   745  	VACQ   RED2, ADD2H, CAR1, T1
   746  	VACCCQ RED2, ADD2H, CAR1, CAR1
   747  	VACCCQ RED2H, T1, CAR1M, T2
   748  	VACQ   RED2H, T1, CAR1M, T1
   749  	VAQ    CAR1, T2, T2
   750  
   751  	// ---------------------------------------------------
   752  	VREPF $2, Y1, YDIG // round 6: word 2 of Y1
   753  	VMALF X0, YDIG, T0, ADD1
   754  	VMLF  ADD1, K0, MK0
   755  	VREPF $3, MK0, MK0
   756  
   757  	VMALF  X1, YDIG, T1, ADD2
   758  	VMALHF X0, YDIG, T0, ADD1H
   759  	VMALHF X1, YDIG, T1, ADD2H
   760  
   761  	VMALF  M0, MK0, ADD1, RED1
   762  	VMALHF M0, MK0, ADD1, RED1H
   763  	VMALF  M1, MK0, ADD2, RED2
   764  	VMALHF M1, MK0, ADD2, RED2H
   765  
   766  	VSLDB $12, RED2, RED1, RED1
   767  	VSLDB $12, T2, RED2, RED2
   768  
   769  	VACCQ RED1, ADD1H, CAR1
   770  	VAQ   RED1, ADD1H, T0
   771  	VACCQ RED1H, T0, CAR1M
   772  	VAQ   RED1H, T0, T0
   773  
   774  	// << ready for next MK0
   775  
   776  	VACQ   RED2, ADD2H, CAR1, T1
   777  	VACCCQ RED2, ADD2H, CAR1, CAR1
   778  	VACCCQ RED2H, T1, CAR1M, T2
   779  	VACQ   RED2H, T1, CAR1M, T1
   780  	VAQ    CAR1, T2, T2
   781  
   782  	// ---------------------------------------------------
   783  	VREPF $1, Y1, YDIG // round 7: word 1 of Y1
   784  	VMALF X0, YDIG, T0, ADD1
   785  	VMLF  ADD1, K0, MK0
   786  	VREPF $3, MK0, MK0
   787  
   788  	VMALF  X1, YDIG, T1, ADD2
   789  	VMALHF X0, YDIG, T0, ADD1H
   790  	VMALHF X1, YDIG, T1, ADD2H
   791  
   792  	VMALF  M0, MK0, ADD1, RED1
   793  	VMALHF M0, MK0, ADD1, RED1H
   794  	VMALF  M1, MK0, ADD2, RED2
   795  	VMALHF M1, MK0, ADD2, RED2H
   796  
   797  	VSLDB $12, RED2, RED1, RED1
   798  	VSLDB $12, T2, RED2, RED2
   799  
   800  	VACCQ RED1, ADD1H, CAR1
   801  	VAQ   RED1, ADD1H, T0
   802  	VACCQ RED1H, T0, CAR1M
   803  	VAQ   RED1H, T0, T0
   804  
   805  	// << ready for next MK0
   806  
   807  	VACQ   RED2, ADD2H, CAR1, T1
   808  	VACCCQ RED2, ADD2H, CAR1, CAR1
   809  	VACCCQ RED2H, T1, CAR1M, T2
   810  	VACQ   RED2H, T1, CAR1M, T1
   811  	VAQ    CAR1, T2, T2
   812  
   813  	// ---------------------------------------------------
   814  	VREPF $0, Y1, YDIG // round 8: word 0 of Y1, the most-significant word of in2
   815  	VMALF X0, YDIG, T0, ADD1
   816  	VMLF  ADD1, K0, MK0
   817  	VREPF $3, MK0, MK0
   818  
   819  	VMALF  X1, YDIG, T1, ADD2
   820  	VMALHF X0, YDIG, T0, ADD1H
   821  	VMALHF X1, YDIG, T1, ADD2H
   822  
   823  	VMALF  M0, MK0, ADD1, RED1
   824  	VMALHF M0, MK0, ADD1, RED1H
   825  	VMALF  M1, MK0, ADD2, RED2
   826  	VMALHF M1, MK0, ADD2, RED2H
   827  
   828  	VSLDB $12, RED2, RED1, RED1
   829  	VSLDB $12, T2, RED2, RED2
   830  
   831  	VACCQ RED1, ADD1H, CAR1
   832  	VAQ   RED1, ADD1H, T0
   833  	VACCQ RED1H, T0, CAR1M
   834  	VAQ   RED1H, T0, T0
   835  
   836  	// << ready for next MK0
   837  
   838  	VACQ   RED2, ADD2H, CAR1, T1
   839  	VACCCQ RED2, ADD2H, CAR1, CAR1
   840  	VACCCQ RED2H, T1, CAR1M, T2
   841  	VACQ   RED2H, T1, CAR1M, T1
   842  	VAQ    CAR1, T2, T2
   843  
   844  	// --------------------------------------------------- final step: compute T - M and keep it unless the subtraction borrowed past T2
   845  
   846  	VZERO   RED1 // RED1 is reused as a zero operand for the T2 subtraction
   847  	VSCBIQ  M0, T0, CAR1 // borrow indication for T0 - M0
   848  	VSQ     M0, T0, ADD1 // ADD1 = T0 - M0
   849  	VSBCBIQ T1, M1, CAR1, CAR1M // borrow indication for T1 - M1 with the low-half borrow
   850  	VSBIQ   T1, M1, CAR1, ADD2 // ADD2 = T1 - M1 with the low-half borrow
   851  	VSBIQ   T2, RED1, CAR1M, T2 // T2 = T2 - 0 with the high-half borrow; the result is used as the select mask below
   852  
   853  	// what output to use, ADD2||ADD1 or T1||T0?
   854  	VSEL T0, ADD1, T2, T0 // mask bits set keep T (unsubtracted), cleared bits take T - M
   855  	VSEL T1, ADD2, T2, T1
   856  
   857  	VST T0, (1*16)(res_ptr) // low half to bytes 16-31 of res
   858  	VST T1, (0*16)(res_ptr) // high half to bytes 0-15 of res
   859  	RET
   860  
   861  #undef res_ptr
   862  #undef x_ptr
   863  #undef y_ptr
   864  #undef X0
   865  #undef X1
   866  #undef Y0
   867  #undef Y1
   868  #undef M0
   869  #undef M1
   870  #undef T0
   871  #undef T1
   872  #undef T2
   873  #undef YDIG
   874  
   875  #undef ADD1
   876  #undef ADD1H
   877  #undef ADD2
   878  #undef ADD2H
   879  #undef RED1
   880  #undef RED1H
   881  #undef RED2
   882  #undef RED2H
   883  #undef CAR1
   884  #undef CAR1M
   885  
   886  #undef MK0
   887  #undef K0
   888  
   889  // ---------------------------------------
   890  // p256MulInternalVX
   891  // V0-V3,V30,V31 - Not Modified
   892  // V4-V15 - Volatile
   893  
   894  #define CPOOL   R4
   895  
   896  // Parameters
   897  #define X0    V0 // Not modified
   898  #define X1    V1 // Not modified
   899  #define Y0    V2 // Not modified
   900  #define Y1    V3 // Not modified
   901  #define T0    V4
   902  #define T1    V5
   903  #define P0    V30 // Not modified
   904  #define P1    V31 // Not modified
   905  
   906  // Temporaries
   907  #define YDIG  V6 // Overloaded with CAR2, ZER
   908  #define ADD1H V7 // Overloaded with ADD3H
   909  #define ADD2H V8 // Overloaded with ADD4H
   910  #define ADD3  V9 // Overloaded with SEL2,SEL5
   911  #define ADD4  V10 // Overloaded with SEL3,SEL6
   912  #define RED1  V11 // Overloaded with CAR2
   913  #define RED2  V12
   914  #define RED3  V13 // Overloaded with SEL1
   915  #define T2    V14
   916  // Overloaded temporaries
   917  #define ADD1  V4 // Overloaded with T0
   918  #define ADD2  V5 // Overloaded with T1
   919  #define ADD3H V7 // Overloaded with ADD1H
   920  #define ADD4H V8 // Overloaded with ADD2H
   921  #define ZER   V6 // Overloaded with YDIG, CAR2
   922  #define CAR1  V6 // Overloaded with YDIG, ZER
   923  #define CAR2  V11 // Overloaded with RED1
   924  // Constant Selects
   925  #define SEL1  V13 // Overloaded with RED3
   926  #define SEL2  V9 // Overloaded with ADD3,SEL5
   927  #define SEL3  V10 // Overloaded with ADD4,SEL6
   928  #define SEL4  V6 // Overloaded with YDIG,CAR2,ZER
   929  #define SEL5  V9 // Overloaded with ADD3,SEL2
   930  #define SEL6  V10 // Overloaded with ADD4,SEL3
   931  
   932  /* *
   933   * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
   934   * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
   935   * With you, SIMD be...
   936   *
   937   *                                           +--------+--------+
   938   *                                  +--------|  RED2  |  RED1  |
   939   *                                  |        +--------+--------+
   940   *                                  |       ---+--------+--------+
   941   *                                  |  +---- T2|   T1   |   T0   |--+
   942   *                                  |  |    ---+--------+--------+  |
   943   *                                  |  |                            |
   944   *                                  |  |    ======================= |
   945   *                                  |  |                            |
   946   *                                  |  |       +--------+--------+<-+
   947   *                                  |  +-------|  ADD2  |  ADD1  |--|-----+
   948   *                                  |  |       +--------+--------+  |     |
   949   *                                  |  |     +--------+--------+<---+     |
   950   *                                  |  |     | ADD2H  | ADD1H  |--+       |
   951   *                                  |  |     +--------+--------+  |       |
   952   *                                  |  |     +--------+--------+<-+       |
   953   *                                  |  |     |  ADD4  |  ADD3  |--|-+     |
   954   *                                  |  |     +--------+--------+  | |     |
   955   *                                  |  |   +--------+--------+<---+ |     |
   956   *                                  |  |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
   957   *                                  |  |   +--------+--------+      | |   V
   958   *                                  |  | ------------------------   | | +--------+
   959   *                                  |  |                            | | |  RED3  |  [d0 0 0 d0]
   960   *                                  |  |                            | | +--------+
   961   *                                  |  +---->+--------+--------+    | |   |
   962   *   (T2[1w]||ADD2[4w]||ADD1[3w])   +--------|   T1   |   T0   |    | |   |
   963   *                                  |        +--------+--------+    | |   |
   964   *                                  +---->---+--------+--------+    | |   |
   965   *                                         T2|   T1   |   T0   |----+ |   |
   966   *                                        ---+--------+--------+    | |   |
   967   *                                        ---+--------+--------+<---+ |   |
   968   *                                    +--- T2|   T1   |   T0   |----------+
   969   *                                    |   ---+--------+--------+      |   |
   970   *                                    |  +--------+--------+<-------------+
   971   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
   972   *                                    |  +--------+--------+     |    |   |
   973   *                                    |  +--------+<----------------------+
   974   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
   975   *                                    |  +--------+              |    |
   976   *                                    +--->+--------+--------+   |    |
   977   *                                         |   T1   |   T0   |--------+
   978   *                                         +--------+--------+   |    |
   979   *                                   --------------------------- |    |
   980   *                                                               |    |
   981   *                                       +--------+--------+<----+    |
   982   *                                       |  RED2  |  RED1  |          |
   983   *                                       +--------+--------+          |
   984   *                                      ---+--------+--------+<-------+
   985   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
   986   *                                      ---+--------+--------+
   987   *
   988   *                                                                *Mi obra de arte de siglo XXI @vpaprots
   989   *
   990   *
   991   * First group is special, doesn't get the two inputs:
   992   *                                             +--------+--------+<-+
   993   *                                     +-------|  ADD2  |  ADD1  |--|-----+
   994   *                                     |       +--------+--------+  |     |
   995   *                                     |     +--------+--------+<---+     |
   996   *                                     |     | ADD2H  | ADD1H  |--+       |
   997   *                                     |     +--------+--------+  |       |
   998   *                                     |     +--------+--------+<-+       |
   999   *                                     |     |  ADD4  |  ADD3  |--|-+     |
  1000   *                                     |     +--------+--------+  | |     |
  1001   *                                     |   +--------+--------+<---+ |     |
  1002   *                                     |   | ADD4H  | ADD3H  |------|-+   |(+vzero)
  1003   *                                     |   +--------+--------+      | |   V
  1004   *                                     | ------------------------   | | +--------+
  1005   *                                     |                            | | |  RED3  |  [d0 0 0 d0]
  1006   *                                     |                            | | +--------+
  1007   *                                     +---->+--------+--------+    | |   |
  1008   *   (T2[1w]||ADD2[4w]||ADD1[3w])            |   T1   |   T0   |----+ |   |
  1009   *                                           +--------+--------+    | |   |
  1010   *                                        ---+--------+--------+<---+ |   |
  1011   *                                    +--- T2|   T1   |   T0   |----------+
  1012   *                                    |   ---+--------+--------+      |   |
  1013   *                                    |  +--------+--------+<-------------+
  1014   *                                    |  |  RED2  |  RED1  |-----+    |   | [0 d1 d0 d1] [d0 0 d1 d0]
  1015   *                                    |  +--------+--------+     |    |   |
  1016   *                                    |  +--------+<----------------------+
  1017   *                                    |  |  RED3  |--------------+    |     [0 0 d1 d0]
  1018   *                                    |  +--------+              |    |
  1019   *                                    +--->+--------+--------+   |    |
  1020   *                                         |   T1   |   T0   |--------+
  1021   *                                         +--------+--------+   |    |
  1022   *                                   --------------------------- |    |
  1023   *                                                               |    |
  1024   *                                       +--------+--------+<----+    |
  1025   *                                       |  RED2  |  RED1  |          |
  1026   *                                       +--------+--------+          |
  1027   *                                      ---+--------+--------+<-------+
  1028   *                                       T2|   T1   |   T0   |            (H1P-H1P-H00RRAY!)
  1029   *                                      ---+--------+--------+
  1030   *
  1031   * The last 'group' needs RED2||RED1 shifted less
  1032   */
  1033  TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
        	// Vector-facility multiply with interleaved reduction, following the
        	// diagram in the comment above.
        	//
        	// Register contract (every name is a #define'd register alias):
        	//   in:   X1||X0, Y1||Y0 - the two operands
        	//         P1||P0         - only read by the final conditional subtraction
        	//         CPOOL          - base address of the constant pool; the VPERM
        	//                          masks SEL1..SEL6 are loaded from offsets 32..112
        	//   out:  T1||T0
        	//   temp: T2, YDIG, ADD1..ADD4, ADD1H..ADD4H, RED1..RED3, CAR1, CAR2,
        	//         ZER, SEL1..SEL6 are all overwritten.
        	//
        	// Y is consumed one 32-bit word at a time (VREPF into YDIG), two words
        	// per group and four groups in total:
        	//   Y0[3],Y0[2]  /  Y0[1],Y0[0]  /  Y1[3],Y1[2]  /  Y1[1],Y1[0]
        	// After each pair of words the RED1..RED3 terms are built with VPERM
        	// from the low words of the running sum and added back in.
  1034  	VL 32(CPOOL), SEL1
  1035  	VL 48(CPOOL), SEL2
  1036  	VL 64(CPOOL), SEL3
  1037  	VL 80(CPOOL), SEL4
  1038  
  1039  	// ---------------------------------------------------
        	// Group 1: words 3 and 2 of Y0. This group has no RED1/RED2 from a
        	// previous group, so T2 is not shifted in (ZER is used instead).
  1040  
  1041  	VREPF $3, Y0, YDIG
  1042  	VMLHF X0, YDIG, ADD1H
  1043  	VMLHF X1, YDIG, ADD2H
  1044  	VMLF  X0, YDIG, ADD1
  1045  	VMLF  X1, YDIG, ADD2
  1046  
  1047  	VREPF  $2, Y0, YDIG
  1048  	VMALF  X0, YDIG, ADD1H, ADD3
  1049  	VMALF  X1, YDIG, ADD2H, ADD4
  1050  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1051  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1052  
  1053  	VZERO ZER
  1054  	VL    32(CPOOL), SEL1
  1055  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1056  
  1057  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1058  	VSLDB $12, ZER, ADD2, T1  // ADD2 Free
  1059  
        	// T2||T1||T0 = T1||T0 + ADD4||ADD3, carry chained through CAR1
  1060  	VACCQ  T0, ADD3, CAR1
  1061  	VAQ    T0, ADD3, T0       // ADD3 Free
  1062  	VACCCQ T1, ADD4, CAR1, T2
  1063  	VACQ   T1, ADD4, CAR1, T1 // ADD4 Free
  1064  
  1065  	VL    48(CPOOL), SEL2
  1066  	VL    64(CPOOL), SEL3
  1067  	VL    80(CPOOL), SEL4
  1068  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1069  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1070  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1071  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1072  
  1073  	VSLDB $12, T1, T0, T0
  1074  	VSLDB $12, T2, T1, T1
  1075  
  1076  	VACCQ  T0, ADD3H, CAR1
  1077  	VAQ    T0, ADD3H, T0
  1078  	VACCCQ T1, ADD4H, CAR1, T2
  1079  	VACQ   T1, ADD4H, CAR1, T1
  1080  
  1081  	// ---------------------------------------------------
        	// Group 2: words 1 and 0 of Y0. From here on the multiply-adds fold in
        	// the running T1||T0, and RED2||RED1 of the previous group are added.
  1082  
  1083  	VREPF  $1, Y0, YDIG
  1084  	VMALHF X0, YDIG, T0, ADD1H
  1085  	VMALHF X1, YDIG, T1, ADD2H
  1086  	VMALF  X0, YDIG, T0, ADD1  // T0 Free->ADD1
  1087  	VMALF  X1, YDIG, T1, ADD2  // T1 Free->ADD2
  1088  
  1089  	VREPF  $0, Y0, YDIG
  1090  	VMALF  X0, YDIG, ADD1H, ADD3
  1091  	VMALF  X1, YDIG, ADD2H, ADD4
  1092  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
  1093  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
  1094  
  1095  	VZERO ZER
  1096  	VL    32(CPOOL), SEL1
  1097  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1098  
  1099  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
  1100  	VSLDB $12, T2, ADD2, T1   // ADD2 Free->T1, T2 Free
  1101  
  1102  	VACCQ  T0, RED1, CAR1
  1103  	VAQ    T0, RED1, T0
  1104  	VACCCQ T1, RED2, CAR1, T2
  1105  	VACQ   T1, RED2, CAR1, T1
  1106  
  1107  	VACCQ  T0, ADD3, CAR1
  1108  	VAQ    T0, ADD3, T0
  1109  	VACCCQ T1, ADD4, CAR1, CAR2
  1110  	VACQ   T1, ADD4, CAR1, T1
  1111  	VAQ    T2, CAR2, T2
  1112  
  1113  	VL    48(CPOOL), SEL2
  1114  	VL    64(CPOOL), SEL3
  1115  	VL    80(CPOOL), SEL4
  1116  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1117  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1118  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1119  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1120  
  1121  	VSLDB $12, T1, T0, T0
  1122  	VSLDB $12, T2, T1, T1
  1123  
  1124  	VACCQ  T0, ADD3H, CAR1
  1125  	VAQ    T0, ADD3H, T0
  1126  	VACCCQ T1, ADD4H, CAR1, T2
  1127  	VACQ   T1, ADD4H, CAR1, T1
  1128  
  1129  	// ---------------------------------------------------
        	// Group 3: words 3 and 2 of Y1; same instruction sequence as group 2.
  1130  
  1131  	VREPF  $3, Y1, YDIG
  1132  	VMALHF X0, YDIG, T0, ADD1H
  1133  	VMALHF X1, YDIG, T1, ADD2H
  1134  	VMALF  X0, YDIG, T0, ADD1
  1135  	VMALF  X1, YDIG, T1, ADD2
  1136  
  1137  	VREPF  $2, Y1, YDIG
  1138  	VMALF  X0, YDIG, ADD1H, ADD3
  1139  	VMALF  X1, YDIG, ADD2H, ADD4
  1140  	VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
  1141  	VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
  1142  
  1143  	VZERO ZER
  1144  	VL    32(CPOOL), SEL1
  1145  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1146  
  1147  	VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
  1148  	VSLDB $12, T2, ADD2, T1   // ADD2 Free
  1149  
  1150  	VACCQ  T0, RED1, CAR1
  1151  	VAQ    T0, RED1, T0
  1152  	VACCCQ T1, RED2, CAR1, T2
  1153  	VACQ   T1, RED2, CAR1, T1
  1154  
  1155  	VACCQ  T0, ADD3, CAR1
  1156  	VAQ    T0, ADD3, T0
  1157  	VACCCQ T1, ADD4, CAR1, CAR2
  1158  	VACQ   T1, ADD4, CAR1, T1
  1159  	VAQ    T2, CAR2, T2
  1160  
  1161  	VL    48(CPOOL), SEL2
  1162  	VL    64(CPOOL), SEL3
  1163  	VL    80(CPOOL), SEL4
  1164  	VPERM RED3, T0, SEL2, RED1 // [d0  0 d1 d0]
  1165  	VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
  1166  	VPERM RED3, T0, SEL4, RED3 // [ 0  0 d1 d0]
  1167  	VSQ   RED3, RED2, RED2     // Guaranteed not to underflow
  1168  
  1169  	VSLDB $12, T1, T0, T0
  1170  	VSLDB $12, T2, T1, T1
  1171  
  1172  	VACCQ  T0, ADD3H, CAR1
  1173  	VAQ    T0, ADD3H, T0
  1174  	VACCCQ T1, ADD4H, CAR1, T2
  1175  	VACQ   T1, ADD4H, CAR1, T1
  1176  
  1177  	// ---------------------------------------------------
        	// Group 4: words 1 and 0 of Y1. Last group: RED2/RED1 are built with
        	// the SEL5/SEL6 masks instead and are added at the end of this group,
        	// since no further group follows to absorb them.
  1178  
  1179  	VREPF  $1, Y1, YDIG
  1180  	VMALHF X0, YDIG, T0, ADD1H
  1181  	VMALHF X1, YDIG, T1, ADD2H
  1182  	VMALF  X0, YDIG, T0, ADD1
  1183  	VMALF  X1, YDIG, T1, ADD2
  1184  
  1185  	VREPF  $0, Y1, YDIG
  1186  	VMALF  X0, YDIG, ADD1H, ADD3
  1187  	VMALF  X1, YDIG, ADD2H, ADD4
  1188  	VMALHF X0, YDIG, ADD1H, ADD3H
  1189  	VMALHF X1, YDIG, ADD2H, ADD4H
  1190  
  1191  	VZERO ZER
  1192  	VL    32(CPOOL), SEL1
  1193  	VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
  1194  
  1195  	VSLDB $12, ADD2, ADD1, T0
  1196  	VSLDB $12, T2, ADD2, T1
  1197  
  1198  	VACCQ  T0, RED1, CAR1
  1199  	VAQ    T0, RED1, T0
  1200  	VACCCQ T1, RED2, CAR1, T2
  1201  	VACQ   T1, RED2, CAR1, T1
  1202  
  1203  	VACCQ  T0, ADD3, CAR1
  1204  	VAQ    T0, ADD3, T0
  1205  	VACCCQ T1, ADD4, CAR1, CAR2
  1206  	VACQ   T1, ADD4, CAR1, T1
  1207  	VAQ    T2, CAR2, T2
  1208  
  1209  	VL    96(CPOOL), SEL5
  1210  	VL    112(CPOOL), SEL6
  1211  	VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
  1212  	VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0  0]
  1213  	VSQ   RED1, RED2, RED2     // Guaranteed not to underflow
  1214  
  1215  	VSLDB $12, T1, T0, T0
  1216  	VSLDB $12, T2, T1, T1
  1217  
  1218  	VACCQ  T0, ADD3H, CAR1
  1219  	VAQ    T0, ADD3H, T0
  1220  	VACCCQ T1, ADD4H, CAR1, T2
  1221  	VACQ   T1, ADD4H, CAR1, T1
  1222  
  1223  	VACCQ  T0, RED1, CAR1
  1224  	VAQ    T0, RED1, T0
  1225  	VACCCQ T1, RED2, CAR1, CAR2
  1226  	VACQ   T1, RED2, CAR1, T1
  1227  	VAQ    T2, CAR2, T2
  1228  
  1229  	// ---------------------------------------------------
        	// Final conditional subtraction: ADD2H||ADD1H = T1||T0 - P1||P0, with
        	// the borrow chain run through T2 (RED3 is reused as a zero operand).
        	// T2 then serves as the VSEL mask choosing between the two candidates.
  1230  
  1231  	VZERO   RED3
  1232  	VSCBIQ  P0, T0, CAR1
  1233  	VSQ     P0, T0, ADD1H
  1234  	VSBCBIQ T1, P1, CAR1, CAR2
  1235  	VSBIQ   T1, P1, CAR1, ADD2H
  1236  	VSBIQ   T2, RED3, CAR2, T2
  1237  
  1238  	// what output to use, ADD2H||ADD1H or T1||T0?
  1239  	VSEL T0, ADD1H, T2, T0
  1240  	VSEL T1, ADD2H, T2, T1
  1241  	RET
  1242  
  1243  #undef CPOOL
  1244  
  1245  #undef X0
  1246  #undef X1
  1247  #undef Y0
  1248  #undef Y1
  1249  #undef T0
  1250  #undef T1
  1251  #undef P0
  1252  #undef P1
  1253  
  1254  #undef SEL1
  1255  #undef SEL2
  1256  #undef SEL3
  1257  #undef SEL4
  1258  #undef SEL5
  1259  #undef SEL6
  1260  
  1261  #undef YDIG
  1262  #undef ADD1H
  1263  #undef ADD2H
  1264  #undef ADD3
  1265  #undef ADD4
  1266  #undef RED1
  1267  #undef RED2
  1268  #undef RED3
  1269  #undef T2
  1270  #undef ADD1
  1271  #undef ADD2
  1272  #undef ADD3H
  1273  #undef ADD4H
  1274  #undef ZER
  1275  #undef CAR1
  1276  #undef CAR2
  1277  
  1278  // ---------------------------------------
  1279  // p256MulInternalVMSL
  1280  // V0-V3,V30,V31 - Not Modified
  1281  // V4-V14 - Volatile
  1282  
  1283  #define CPOOL   R4
  1284  #define SCRATCH R9
  1285  
  1286  // Parameters
  1287  #define X0    V0 // Not modified
  1288  #define X1    V1 // Not modified
  1289  #define Y0    V2 // Not modified
  1290  #define Y1    V3 // Not modified
  1291  #define T0    V4
  1292  #define T1    V5
  1293  #define T2    V6
  1294  #define P0    V30 // Not modified
  1295  #define P1    V31 // Not modified
  1296  
  1297  // input: d0
  1298  // output: h0, h1
  1299  // temp: TEMP, ZERO, BORROW
        // OBSERVATION3 is expanded by the VMSL multiply and square routines in
        // their two "96 bits" reduction steps; the call sites label the result
        // as "h0 | h1". It builds both outputs from byte-shifted copies of d0
        // (VSLDB against a zeroed register) combined with one 128-bit subtract
        // and one 128-bit add. ZERO is cleared by the macro itself; TEMP and
        // BORROW are overwritten. h0 is written before d0 is read for the last
        // time, so h0 presumably must not be the same register as d0 - the
        // visible call sites always pass distinct registers.
        // No comments may be placed inside the macro body: every line ends in a
        // continuation backslash.
  1300  #define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
  1301  	VZERO ZERO                   \
  1302  	VSLDB $4, d0, ZERO, h0       \
  1303  	VLR   h0, BORROW             \
  1304  	VSLDB $12, ZERO, h0, TEMP    \
  1305  	VSQ   TEMP, h0, h0           \
  1306  	VSLDB $12, d0, BORROW, h1    \
  1307  	VSLDB $8, ZERO, BORROW, TEMP \
  1308  	VAQ   TEMP, h0, h0           \
  1309  
        // OBSERVATION3A is the counterpart used in the "64 bits" third reduction
        // step of the same routines.
        // input: d2
        // output: h0, h1 (call sites label the result "h0 | h1")
        // temp: TEMP, ZERO (ZERO is cleared by the macro itself)
  1310  #define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
  1311  	VZERO ZERO                \
  1312  	VSLDB $8, d2, ZERO, TEMP  \
  1313  	VSLDB $8, d2, TEMP, h0    \
  1314  	VSLDB $12, ZERO, TEMP, h1 \
  1315  	VSQ   h1, h0, h0          \
  1316  
  1317  TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
        	// VMSL-based multiply followed by three reduction steps and a final
        	// conditional subtraction.
        	//
        	// Register contract:
        	//   in:   X1||X0 (V1||V0), Y1||Y0 (V3||V2) - operands, not modified
        	//         P1||P0 (V31||V30)                - read by the final subtraction
        	//         SCRATCH (R9)                     - must address at least 64
        	//                                            writable bytes; V16-V19 are
        	//                                            spilled there on entry and
        	//                                            reloaded before RET
        	//   out:  T1||T0 (V5||V4)
        	//   temp: V4-V14 are overwritten. CPOOL (R4) is overwritten: it points
        	//         at p256vmsl<> during the multiply and at p256mul<> on return.
        	//
        	// The "limbN" / "lN" names in the comments below are the pieces the
        	// operands are split into (7-byte limbs plus one 4-byte limb each, per
        	// the comments on the splitting code).
  1318  	VSTM V16, V19, (SCRATCH)
  1319  
  1320  	MOVD $p256vmsl<>+0x00(SB), CPOOL
  1321  
  1322  	// Divide input1 into 5 limbs
  1323  	VGBM  $0x007f, V14
  1324  	VZERO V12
  1325  	VSLDB $2, X1, X0, V13
  1326  	VSLDB $2, Y1, Y0, V8
  1327  	VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
  1328  	VSLDB $4, V12, Y1, V6  // V6: 4 bytes limb
  1329  
  1330  	VN V14, X0, V5   // V5: first 7 bytes limb
  1331  	VN V14, Y0, V10  // V10: first 7 bytes limb
  1332  	VN V14, V13, V13 // v13: third 7 bytes limb
  1333  	VN V14, V8, V8   // V8: third 7 bytes limb
  1334  
  1335  	VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
  1336  	VMSLG V8, V5, V12, V8   // v8: l8 x l5
  1337  	VMSLG V6, V13, V12, V13 // v13: l6 x l3
  1338  	VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
  1339  	VMSLG V6, V5, V12, V6   // v6: l6 x l5
  1340  
        	// NOTE(review): CPOOL already holds this address (loaded above and
        	// only vector instructions ran in between) - this reload looks
        	// redundant; confirm before removing.
  1341  	MOVD $p256vmsl<>+0x00(SB), CPOOL
  1342  	VGBM $0x7f7f, V14
  1343  
        	// Load the VPERM masks that pair up limbs into two-limb vectors.
  1344  	VL 0(CPOOL), V4
  1345  	VL 16(CPOOL), V7
  1346  	VL 32(CPOOL), V9
  1347  	VL 48(CPOOL), V5
  1348  	VLM 64(CPOOL), V16, V19
  1349  
  1350  	VPERM V12, X0, V4, V4   // v4: limb4 | limb5
  1351  	VPERM Y1, Y0, V7, V7
  1352  	VPERM V12, Y0, V9, V9   // v9: limb10 | limb9
  1353  	VPERM X1, X0, V5, V5
  1354  	VPERM X1, X0, V16, V16
  1355  	VPERM Y1, Y0, V17, V17
  1356  	VPERM X1, V12, V18, V18 // v18: limb1 | limb2
  1357  	VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
  1358  	VN    V14, V7, V7       // v7:  limb9 | limb8
  1359  	VN    V14, V5, V5       // v5:  limb3 | limb4
  1360  	VN    V14, V16, V16     // v16: limb2 | limb3
  1361  	VN    V14, V17, V17     // v17: limb8 | limb7
  1362  
        	// Each VMSLG multiplies the two limb pairs element-wise and adds the
        	// third operand, producing one column (or part of one) of the product.
  1363  	VMSLG V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
  1364  	VMSLG V9, V5, V8, V8     // v8: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
  1365  	VMSLG V9, V16, V12, V16  // v16: l10 x l2 + l9 x l3
  1366  	VMSLG V9, V18, V12, V9   // v9: l10 x l1 + l9 x l2
  1367  	VMSLG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2
  1368  	VMSLG V17, V4, V16, V16  // v16: l8 x l4 + l7 x l5 + l10 x l2 + l9 x l3 (column 4)
  1369  	VMSLG V17, V5, V9, V9    // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
  1370  	VMSLG V17, V18, V12, V17 // v17: l8 x l1 + l7 x l2
  1371  	VMSLG V19, V5, V7, V7    // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
  1372  	VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
  1373  	VAQ   V9, V6, V9         // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
  1374  	VAQ   V17, V13, V13      // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
  1375  
        	// Carry propagation between columns: the part of each column above
        	// its low 7 bytes (VSLDB $9 against zero V12) is added to the next one.
  1376  	VSLDB $9, V12, V10, V4
  1377  	VSLDB $9, V12, V7, V5
  1378  	VAQ   V4, V14, V14
  1379  	VAQ   V5, V13, V13
  1380  
  1381  	VSLDB $9, V12, V14, V4
  1382  	VSLDB $9, V12, V13, V5
  1383  	VAQ   V4, V8, V8
  1384  	VAQ   V5, V19, V19
  1385  
  1386  	VSLDB $9, V12, V8, V4
  1387  	VSLDB $9, V12, V19, V5
  1388  	VAQ   V4, V16, V16
  1389  	VAQ   V5, V11, V11
  1390  
  1391  	VSLDB $9, V12, V16, V4
  1392  	VAQ   V4, V9, V17
  1393  
        	// Mask every column down to its low bytes (V4: 7-byte mask, V5: 8-byte
        	// mask for the top column) before packing them together.
  1394  	VGBM $0x007f, V4
  1395  	VGBM $0x00ff, V5
  1396  
  1397  	VN V10, V4, V10
  1398  	VN V14, V4, V14
  1399  	VN V8, V4, V8
  1400  	VN V16, V4, V16
  1401  	VN V17, V4, V9
  1402  	VN V7, V4, V7
  1403  	VN V13, V4, V13
  1404  	VN V19, V4, V19
  1405  	VN V11, V5, V11
  1406  
  1407  	VSLDB $7, V14, V14, V14
  1408  	VSLDB $14, V8, V12, V4
  1409  	VSLDB $14, V12, V8, V8
  1410  	VSLDB $5, V16, V16, V16
  1411  	VSLDB $12, V9, V12, V5
  1412  
  1413  	VO V14, V10, V10
  1414  	VO V8, V16, V16
  1415  	VO V4, V10, V10  // first rightmost 128bits of the multiplication result
  1416  	VO V5, V16, V16  // second rightmost 128bits of the multiplication result
  1417  
  1418  	// adjust v7, v13, v19, v11
  1419  	VSLDB $7, V13, V13, V13
  1420  	VSLDB $14, V19, V12, V4
  1421  	VSLDB $14, V12, V19, V19
  1422  	VSLDB $5, V11, V12, V5
  1423  	VO    V13, V7, V7
  1424  	VO    V4, V7, V7
  1425  	VO    V19, V5, V11
  1426  
  1427  	VSLDB $9, V12, V17, V14
  1428  	VSLDB $12, V12, V9, V9
  1429  	VACCQ V7, V14, V13
  1430  	VAQ   V7, V14, V7
  1431  	VAQ   V11, V13, V11
  1432  
  1433  	// First reduction, 96 bits
  1434  	VSLDB $4, V16, V10, T0
  1435  	VSLDB $4, V12, V16, T1
  1436  	VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
  1437  	VSLDB $3, V7, V12, V7
  1438  	OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
  1439  	VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
  1440  	VACCQ T0, T2, V9
  1441  	VAQ   T0, T2, T2
  1442  	VACQ  T1, V8, V9, V8
  1443  
  1444  	// Second reduction 96 bits
  1445  	VSLDB $4, V8, T2, T0
  1446  	VSLDB $4, V12, V8, T1
  1447  	OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
  1448  	VACCQ T0, V8, T2
  1449  	VAQ   T0, V8, V8
  1450  	VACQ  T1, V9, T2, V9
  1451  
  1452  	// Third reduction 64 bits
  1453  	VSLDB  $8, V9, V8, T0
  1454  	VSLDB  $8, V12, V9, T1
  1455  	OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
  1456  	VACCQ  T0, V13, V12
  1457  	VAQ    T0, V13, V13
  1458  	VACQ   T1, V14, V12, V14
  1459  	VACCQ  V13, V7, V12
  1460  	VAQ    V13, V7, T0
  1461  	VACCCQ V14, V11, V12, T2
  1462  	VACQ   V14, V11, V12, T1 // results T2 | T1 | T0
  1463  
  1464  	// ---------------------------------------------------
        	// Point CPOOL back at p256mul<> (presumably what callers, which set
        	// it to that pool before the call, expect on return - confirm).
  1465  	MOVD $p256mul<>+0x00(SB), CPOOL
  1466  
        	// Final conditional subtraction: V9||V7 = T1||T0 - P1||P0, borrow
        	// chained into T2, which then drives the VSEL between both candidates.
  1467  	VZERO   V12
  1468  	VSCBIQ  P0, T0, V8
  1469  	VSQ     P0, T0, V7
  1470  	VSBCBIQ T1, P1, V8, V10
  1471  	VSBIQ   T1, P1, V8, V9
  1472  	VSBIQ   T2, V12, V10, T2
  1473  
  1474  	// what output to use, V9||V7 or T1||T0?
  1475  	VSEL T0, V7, T2, T0
  1476  	VSEL T1, V9, T2, T1
  1477  
        	// Restore the registers spilled on entry.
  1478  	VLM (SCRATCH), V16, V19
  1479  
  1480  	RET
  1481  
  1482  // ---------------------------------------
  1483  // p256SqrInternalVMSL
  1484  // V0-V1,V30,V31 - Not Modified
  1485  // V4-V14 - Volatile
  1486  
  1487  TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
        	// VMSL-based squaring: same overall structure as the VMSL multiply
        	// (limb split, column products, carry propagation, three reduction
        	// steps, final conditional subtraction), but with a single operand.
        	//
        	// Register contract:
        	//   in:   X1||X0 (V1||V0)   - operand, not modified
        	//         P1||P0 (V31||V30) - read by the final subtraction
        	//         SCRATCH (R9)      - must address at least 48 writable bytes;
        	//                             V16-V18 are spilled there on entry and
        	//                             reloaded before RET
        	//   out:  T1||T0 (V5||V4)
        	//   temp: V4-V14 are overwritten. CPOOL (R4) is overwritten: it points
        	//         at p256vmsl<> during the squaring and at p256mul<> on return.
        	//
        	// The limb comments keep the l1..l10 numbering of the multiply; here
        	// both factors are taken from X.
  1488  	VSTM V16, V18, (SCRATCH)
  1489  
  1490  	MOVD $p256vmsl<>+0x00(SB), CPOOL
  1491  	// Divide input into limbs
  1492  	VGBM  $0x007f, V14
  1493  	VZERO V12
  1494  	VSLDB $2, X1, X0, V13
  1495  	VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
  1496  
  1497  	VN V14, X0, V10  // V10: first 7 bytes limb
  1498  	VN V14, V13, V13 // v13: third 7 bytes limb
  1499  
  1500  	VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
  1501  	VMSLG V13, V13, V12, V13 // v13: l8 x l3
  1502  	VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)
  1503  
        	// NOTE(review): CPOOL already holds this address (loaded above and
        	// only vector instructions ran in between) - this reload looks
        	// redundant; confirm before removing.
  1504  	MOVD $p256vmsl<>+0x00(SB), CPOOL
  1505  	VGBM $0x7f7f, V14
  1506  
        	// Load the VPERM masks that pair up limbs into two-limb vectors.
  1507  	VL 0(CPOOL), V4
  1508  	VL 16(CPOOL), V7
  1509  	VL 32(CPOOL), V9
  1510  	VL 48(CPOOL), V5
  1511  	VLM 64(CPOOL), V16, V18
  1512  	VL 112(CPOOL), V8
  1513  
  1514  	VPERM V12, X0, V4, V4   // v4: limb4 | limb5
  1515  	VPERM X1, X0, V7, V7
  1516  	VPERM V12, X0, V9, V9   // v9: limb10 | limb9
  1517  	VPERM X1, X0, V5, V5
  1518  	VPERM X1, X0, V16, V16
  1519  	VPERM X1, X0, V17, V17
  1520  	VPERM X1, V12, V18, V18 // v18: limb1 | limb2
  1521  	VPERM X1, V12, V8, V8   // v8:  limb7 | limb6
  1522  	VN    V14, V7, V7       // v7:  limb9 | limb8
  1523  	VN    V14, V5, V5       // v5:  limb3 | limb4
  1524  	VN    V14, V16, V16     // v16: limb2 | limb3
  1525  	VN    V14, V17, V17     // v17: limb8 | limb7
  1526  
        	// Column products. The VMSLEG / VMSLEOG forms are used where the
        	// multiply needed two VMSLG steps for the same column (presumably they
        	// double the even / even+odd partial product so symmetric cross terms
        	// are counted twice - confirm against the instruction reference).
  1527  	VMSLEOG V9, V18, V13, V6   // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
  1528  	VMSLG   V9, V4, V12, V14   // v14: l10 x l4 + l9 x l5 (column 2)
  1529  	VMSLEOG V9, V16, V12, V16  // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
  1530  	VMSLEOG V7, V18, V12, V7   // v7: l9 x l1 + l8 x l2 (column 6)
  1531  	VMSLEG  V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
  1532  	VMSLG   V8, V18, V12, V8   // v8: l7 x l1 + l6 x l2 (column 8)
  1533  	VMSLEG  V9, V5, V12, V18   // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
  1534  
        	// Carry propagation between columns: the part of each column above
        	// its low 7 bytes (VSLDB $9 against zero V12) is added to the next one.
  1535  	VSLDB $9, V12, V10, V4
  1536  	VSLDB $9, V12, V7, V5
  1537  	VAQ   V4, V14, V14
  1538  	VAQ   V5, V13, V13
  1539  
  1540  	VSLDB $9, V12, V14, V4
  1541  	VSLDB $9, V12, V13, V5
  1542  	VAQ   V4, V18, V18
  1543  	VAQ   V5, V8, V8
  1544  
  1545  	VSLDB $9, V12, V18, V4
  1546  	VSLDB $9, V12, V8, V5
  1547  	VAQ   V4, V16, V16
  1548  	VAQ   V5, V11, V11
  1549  
  1550  	VSLDB $9, V12, V16, V4
  1551  	VAQ   V4, V6, V17
  1552  
        	// Mask every column down to its low bytes (V4: 7-byte mask, V5: 8-byte
        	// mask for the top column) before packing them together.
  1553  	VGBM $0x007f, V4
  1554  	VGBM $0x00ff, V5
  1555  
  1556  	VN V10, V4, V10
  1557  	VN V14, V4, V14
  1558  	VN V18, V4, V18
  1559  	VN V16, V4, V16
  1560  	VN V17, V4, V9
  1561  	VN V7, V4, V7
  1562  	VN V13, V4, V13
  1563  	VN V8, V4, V8
  1564  	VN V11, V5, V11
  1565  
  1566  	VSLDB $7, V14, V14, V14
  1567  	VSLDB $14, V18, V12, V4
  1568  	VSLDB $14, V12, V18, V18
  1569  	VSLDB $5, V16, V16, V16
  1570  	VSLDB $12, V9, V12, V5
  1571  
  1572  	VO V14, V10, V10
  1573  	VO V18, V16, V16
  1574  	VO V4, V10, V10  // first rightmost 128bits of the multiplication result
  1575  	VO V5, V16, V16  // second rightmost 128bits of the multiplication result
  1576  
  1577  	// adjust v7, v13, v8, v11
  1578  	VSLDB $7, V13, V13, V13
  1579  	VSLDB $14, V8, V12, V4
  1580  	VSLDB $14, V12, V8, V8
  1581  	VSLDB $5, V11, V12, V5
  1582  	VO    V13, V7, V7
  1583  	VO    V4, V7, V7
  1584  	VO    V8, V5, V11
  1585  
  1586  	VSLDB $9, V12, V17, V14
  1587  	VSLDB $12, V12, V9, V9
  1588  	VACCQ V7, V14, V13
  1589  	VAQ   V7, V14, V7
  1590  	VAQ   V11, V13, V11
  1591  
  1592  	// First reduction, 96 bits
  1593  	VSLDB $4, V16, V10, T0
  1594  	VSLDB $4, V12, V16, T1
  1595  	VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
  1596  	VSLDB $3, V7, V12, V7
  1597  	OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
  1598  	VO    V7, V9, V7       // third rightmost 128bits of the multiplication result
  1599  	VACCQ T0, T2, V9
  1600  	VAQ   T0, T2, T2
  1601  	VACQ  T1, V8, V9, V8
  1602  
  1603  	// Second reduction 96 bits
  1604  	VSLDB $4, V8, T2, T0
  1605  	VSLDB $4, V12, V8, T1
  1606  	OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
  1607  	VACCQ T0, V8, T2
  1608  	VAQ   T0, V8, V8
  1609  	VACQ  T1, V9, T2, V9
  1610  
  1611  	// Third reduction 64 bits
  1612  	VSLDB  $8, V9, V8, T0
  1613  	VSLDB  $8, V12, V9, T1
  1614  	OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
  1615  	VACCQ  T0, V13, V12
  1616  	VAQ    T0, V13, V13
  1617  	VACQ   T1, V14, V12, V14
  1618  	VACCQ  V13, V7, V12
  1619  	VAQ    V13, V7, T0
  1620  	VACCCQ V14, V11, V12, T2
  1621  	VACQ   V14, V11, V12, T1 // results T2 | T1 | T0
  1622  
  1623  	// ---------------------------------------------------
        	// Point CPOOL back at p256mul<> (presumably what callers, which set
        	// it to that pool before the call, expect on return - confirm).
  1624  	MOVD $p256mul<>+0x00(SB), CPOOL
  1625  
        	// Final conditional subtraction: V9||V7 = T1||T0 - P1||P0, borrow
        	// chained into T2, which then drives the VSEL between both candidates.
  1626  	VZERO   V12
  1627  	VSCBIQ  P0, T0, V8
  1628  	VSQ     P0, T0, V7
  1629  	VSBCBIQ T1, P1, V8, V10
  1630  	VSBIQ   T1, P1, V8, V9
  1631  	VSBIQ   T2, V12, V10, T2
  1632  
  1633  	// what output to use, V9||V7 or T1||T0?
  1634  	VSEL T0, V7, T2, T0
  1635  	VSEL T1, V9, T2, T1
  1636  
        	// Restore the registers spilled on entry.
  1637  	VLM (SCRATCH), V16, V18
  1638  	RET
  1639  
  1640  
  1641  
  1642  #undef CPOOL
  1643  #undef SCRATCH
  1644  #undef X0
  1645  #undef X1
  1646  #undef Y0
  1647  #undef Y1
  1648  #undef T0
  1649  #undef T1
  1650  #undef T2
  1651  #undef P0
  1652  #undef P1
  1653  
  1654  #define SCRATCH R9
  1655  
  1656  TEXT p256MulInternal<>(SB), NOSPLIT, $64-0
        	// Dispatcher for the field multiplication. The target is read from the
        	// p256MulInternalFacility slot, and SCRATCH is pointed at this
        	// function's 64-byte frame before control is handed over.
  1657  	MOVD ·p256MulInternalFacility+0x00(SB), R7 // R7 = address stored in the slot
  1658  	MOVD $scratch-64(SP), SCRATCH              // SCRATCH = base of the 64-byte frame area
  1659  	CALL (R7)
  1660  	RET
  1661  
  1662  TEXT ·p256MulInternalTrampolineSetup(SB), NOSPLIT|NOFRAME, $0
        	// Initial target of the p256MulInternalFacility slot (see the DATA
        	// line below). Picks an implementation from the HasVE1 CPU flag,
        	// stores it into the slot and branches straight to it.
  1663  	MOVD   $·p256MulInternalVX(SB), R8          // default choice: VX version
  1664  	MOVD   $·p256MulInternalFacility+0x00(SB), R7 // R7 = address of the slot
  1665  	MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
  1666  	CMPBEQ R0, $0, resolved                     // flag clear: keep the VX version
  1667  	MOVD   $·p256MulInternalVMSL(SB), R8        // flag set: switch to the VMSL version
  1668  resolved:
  1669  	MOVD   R8, 0(R7)                            // record the choice in the slot
  1670  	BR     (R8)                                 // and continue in the chosen routine
  1671  
  1672  GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
  1673  DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
  1674  
  1675  // Parameters
  1676  #define X0    V0
  1677  #define X1    V1
  1678  #define Y0    V2
  1679  #define Y1    V3
  1680  
  1681  TEXT ·p256SqrInternalVX(SB), NOSPLIT|NOFRAME, $0
        	// Squaring on the VX path: copy X into Y, then tail-branch into the
        	// VX multiply.
  1682  	VLR X1, Y1
  1683  	VLR X0, Y0
  1684  	BR  ·p256MulInternalVX(SB)
  1685  
  1686  #undef X0
  1687  #undef X1
  1688  #undef Y0
  1689  #undef Y1
  1690  
  1691  
  1692  TEXT p256SqrInternal<>(SB), NOSPLIT, $48-0
        	// Dispatcher for the field squaring. The target is read from the
        	// p256SqrInternalFacility slot, and SCRATCH is pointed at this
        	// function's 48-byte frame before control is handed over.
  1693  	MOVD ·p256SqrInternalFacility+0x00(SB), R7 // R7 = address stored in the slot
  1694  	MOVD $scratch-48(SP), SCRATCH              // SCRATCH = base of the 48-byte frame area
  1695  	CALL (R7)
  1696  	RET
  1697  
  1698  TEXT ·p256SqrInternalTrampolineSetup(SB), NOSPLIT|NOFRAME, $0
        	// Initial target of the p256SqrInternalFacility slot (see the DATA
        	// line below). Picks an implementation from the HasVE1 CPU flag,
        	// stores it into the slot and branches straight to it.
  1699  	MOVD   $·p256SqrInternalVX(SB), R8          // default choice: VX version
  1700  	MOVD   $·p256SqrInternalFacility+0x00(SB), R7 // R7 = address of the slot
  1701  	MOVBZ  internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
  1702  	CMPBEQ R0, $0, resolved                     // flag clear: keep the VX version
  1703  	MOVD   $·p256SqrInternalVMSL(SB), R8        // flag set: switch to the VMSL version
  1704  resolved:
  1705  	MOVD   R8, 0(R7)                            // record the choice in the slot
  1706  	BR     (R8)                                 // and continue in the chosen routine
  1707  
  1708  
  1709  GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
  1710  DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)
  1711  
  1712  #undef SCRATCH
  1713  
  1714  
        // p256SubInternal(T1, T0, X1, X0, Y1, Y0)
        //   Computes X1||X0 - Y1||Y0 into T1||T0, then also computes that
        //   difference plus PH||PL into TT1||TT0, and finally VSELs between the
        //   two candidates using SEL1, a mask derived from the borrow of the
        //   subtraction.
        //   Besides its parameters the macro uses ZER, CAR1, SEL1, TT0, TT1, PL
        //   and PH; these are not parameters and must be #define'd as register
        //   aliases wherever the macro is expanded. ZER, CAR1, SEL1, TT0 and TT1
        //   are overwritten.
        //   No comments may be placed inside the macro body: every line ends in
        //   a continuation backslash.
  1715  #define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
  1716  	VZERO   ZER                \
  1717  	VSCBIQ  Y0, X0, CAR1       \
  1718  	VSQ     Y0, X0, T0         \
  1719  	VSBCBIQ X1, Y1, CAR1, SEL1 \
  1720  	VSBIQ   X1, Y1, CAR1, T1   \
  1721  	VSQ     SEL1, ZER, SEL1    \
  1722  	                           \
  1723  	VACCQ   T0, PL, CAR1       \
  1724  	VAQ     T0, PL, TT0        \
  1725  	VACQ    T1, PH, CAR1, TT1  \
  1726  	                           \
  1727  	VSEL    T0, TT0, SEL1, T0  \
  1728  	VSEL    T1, TT1, SEL1, T1  \
  1729  
        // p256AddInternal(T1, T0, X1, X0, Y1, Y0)
        //   Computes X1||X0 + Y1||Y0 into T1||T0 with the carry-out in T2, then
        //   computes that sum minus PH||PL into TT1||TT0 with the borrow chain
        //   continued through T2 into SEL1, and finally VSELs between the two
        //   candidates using SEL1.
        //   Besides its parameters the macro uses T2, ZER, CAR1, CAR2, SEL1, TT0,
        //   TT1, PL and PH; these are not parameters and must be #define'd as
        //   register aliases wherever the macro is expanded. All of them except
        //   PL and PH are overwritten.
  1730  #define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
  1731  	VACCQ   X0, Y0, CAR1        \
  1732  	VAQ     X0, Y0, T0          \
  1733  	VACCCQ  X1, Y1, CAR1, T2    \
  1734  	VACQ    X1, Y1, CAR1, T1    \
  1735  	                            \
  1736  	VZERO   ZER                 \
  1737  	VSCBIQ  PL, T0, CAR1        \
  1738  	VSQ     PL, T0, TT0         \
  1739  	VSBCBIQ T1, PH, CAR1, CAR2  \
  1740  	VSBIQ   T1, PH, CAR1, TT1   \
  1741  	VSBIQ   T2, ZER, CAR2, SEL1 \
  1742  	                            \
  1743  	VSEL    T0, TT0, SEL1, T0   \
  1744  	VSEL    T1, TT1, SEL1, T1
  1745  
        // p256HalfInternal(T1, T0, X1, X0)
        //   Going by its name this halves X1||X0 - NOTE(review): confirm the
        //   exact modular semantics against the callers. What the body does:
        //     1. derives the select mask SEL1 from X0 with a VSBIQ against zero
        //        (presumably keyed on the lowest bit of X0, i.e. odd/even);
        //     2. computes X1||X0 + PH||PL into T2||T1||T0 (T2 = carry-out);
        //     3. VSELs, per SEL1, between X and that sum (and between zero and
        //        T2 for the top word);
        //     4. shifts the selected three-word value right by one bit: T0 and
        //        T1 are shifted with VSRL by 1, and the bit that crosses each
        //        128-bit boundary is rebuilt in TT0/TT1 (VSLDB $15 followed by
        //        VSL by 7) and OR'd back in.
        //   Besides its parameters the macro uses T2, ZER, CAR1, SEL1, TT0, TT1,
        //   PL and PH; these are not parameters and must be #define'd as register
        //   aliases wherever the macro is expanded. All of them except PL and PH
        //   are overwritten.
  1746  #define p256HalfInternal(T1, T0, X1, X0) \
  1747  	VZERO  ZER                \
  1748  	VSBIQ  ZER, ZER, X0, SEL1 \
  1749  	                          \
  1750  	VACCQ  X0, PL, CAR1       \
  1751  	VAQ    X0, PL, T0         \
  1752  	VACCCQ X1, PH, CAR1, T2   \
  1753  	VACQ   X1, PH, CAR1, T1   \
  1754  	                          \
  1755  	VSEL   X0, T0, SEL1, T0   \
  1756  	VSEL   X1, T1, SEL1, T1   \
  1757  	VSEL   ZER, T2, SEL1, T2  \
  1758  	                          \
  1759  	VSLDB  $15, T2, ZER, TT1  \
  1760  	VSLDB  $15, T1, ZER, TT0  \
  1761  	VREPIB $1, SEL1           \
  1762  	VSRL   SEL1, T0, T0       \
  1763  	VSRL   SEL1, T1, T1       \
  1764  	VREPIB $7, SEL1           \
  1765  	VSL    SEL1, TT0, TT0     \
  1766  	VSL    SEL1, TT1, TT1     \
  1767  	VO     T0, TT0, T0        \
  1768  	VO     T1, TT1, T1
  1769  
  1770  // ---------------------------------------
  1771  // func p256MulAsm(res, in1, in2 []byte)
  1772  #define res_ptr R1
  1773  #define x_ptr   R2
  1774  #define y_ptr   R3
  1775  #define CPOOL   R4
  1776  
  1777  // Parameters
  1778  #define X0    V0
  1779  #define X1    V1
  1780  #define Y0    V2
  1781  #define Y1    V3
  1782  #define T0    V4
  1783  #define T1    V5
  1784  
  1785  // Constants
  1786  #define P0    V30
  1787  #define P1    V31
  1788  TEXT ·p256MulAsm(SB), NOSPLIT, $0
        	// Go-callable wrapper: res = p256MulInternal(in1, in2).
        	// Each 32-byte slice is loaded as two 16-byte halves, with the first
        	// 16 bytes going to the "1" register and the second 16 to the "0" one.

        	// Modulus halves from the p256mul<> constant pool.
  1789  	MOVD $p256mul<>+0x00(SB), CPOOL
  1790  	VL   0(CPOOL), P1
  1791  	VL   16(CPOOL), P0
  1792  
        	// First operand.
  1793  	MOVD in1+24(FP), x_ptr
  1794  	VL   0(x_ptr), X1
  1795  	VL   16(x_ptr), X0
  1796  
        	// Second operand.
  1797  	MOVD in2+48(FP), y_ptr
  1798  	VL   0(y_ptr), Y1
  1799  	VL   16(y_ptr), Y0
  1800  
  1801  	MOVD res+0(FP), res_ptr
  1802  	CALL p256MulInternal<>(SB)
  1803  
        	// Store T1||T0 in the same half order the inputs were read in.
  1804  	VST  T1, 0(res_ptr)
  1805  	VST  T0, 16(res_ptr)
  1806  	RET
  1807  
  1808  #undef res_ptr
  1809  #undef x_ptr
  1810  #undef y_ptr
  1811  #undef CPOOL
  1812  
  1813  #undef X0
  1814  #undef X1
  1815  #undef Y0
  1816  #undef Y1
  1817  #undef T0
  1818  #undef T1
  1819  #undef P0
  1820  #undef P1
  1821  
  1822  // ---------------------------------------
  1823  // func p256SqrAsm(res, in1 []byte)
  1824  #define res_ptr R1
  1825  #define x_ptr   R2
  1826  #define y_ptr   R3
  1827  #define CPOOL   R4
  1828  
  1829  // Parameters
  1830  #define X0    V0
  1831  #define X1    V1
  1832  #define T0    V4
  1833  #define T1    V5
  1834  
  1835  // Constants
  1836  #define P0    V30
  1837  #define P1    V31
  1838  TEXT ·p256SqrAsm(SB), NOSPLIT, $0
        	// Go-callable wrapper: res = p256SqrInternal(in1).
        	// The 32-byte slice is loaded as two 16-byte halves, with the first
        	// 16 bytes going to X1 and the second 16 to X0.

        	// Modulus halves from the p256mul<> constant pool.
  1839  	MOVD $p256mul<>+0x00(SB), CPOOL
  1840  	VL   0(CPOOL), P1
  1841  	VL   16(CPOOL), P0
  1842  
        	// Operand.
  1843  	MOVD in1+24(FP), x_ptr
  1844  	VL   0(x_ptr), X1
  1845  	VL   16(x_ptr), X0
  1846  
  1847  	MOVD res+0(FP), res_ptr
  1848  
  1849  	CALL p256SqrInternal<>(SB)
  1850  
        	// Store T1||T0 in the same half order the input was read in.
  1851  	VST  T1, 0(res_ptr)
  1852  	VST  T0, 16(res_ptr)
  1853  	RET
  1854  
  1855  #undef res_ptr
  1856  #undef x_ptr
  1857  #undef y_ptr
  1858  #undef CPOOL
  1859  
  1860  #undef X0
  1861  #undef X1
  1862  #undef T0
  1863  #undef T1
  1864  #undef P0
  1865  #undef P1
  1866  
  1867  
  1868  // Point add with P2 being affine point
  1869  // If sign == 1 -> P2 = -P2
  1870  // If sel == 0 -> P3 = P1
  1871  // if zero == 0 -> P3 = P2
  1872  // p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
  1873  #define P3ptr   R1
  1874  #define P1ptr   R2
  1875  #define P2ptr   R3
  1876  #define CPOOL   R4
  1877  
  1878  // Temporaries in REGs
  1879  #define Y2L    V15
  1880  #define Y2H    V16
  1881  #define T1L    V17
  1882  #define T1H    V18
  1883  #define T2L    V19
  1884  #define T2H    V20
  1885  #define T3L    V21
  1886  #define T3H    V22
  1887  #define T4L    V23
  1888  #define T4H    V24
  1889  
  1890  // Temps for Sub and Add
  1891  #define TT0  V11
  1892  #define TT1  V12
  1893  #define T2   V13
  1894  
  1895  // p256MulAsm Parameters
  1896  #define X0    V0
  1897  #define X1    V1
  1898  #define Y0    V2
  1899  #define Y1    V3
  1900  #define T0    V4
  1901  #define T1    V5
  1902  
  1903  #define PL    V30
  1904  #define PH    V31
  1905  
  1906  // Names for zero/sel selects
  1907  #define X1L    V0
  1908  #define X1H    V1
  1909  #define Y1L    V2 // p256MulAsmParmY
  1910  #define Y1H    V3 // p256MulAsmParmY
  1911  #define Z1L    V4
  1912  #define Z1H    V5
  1913  #define X2L    V0
  1914  #define X2H    V1
  1915  #define Z2L    V4
  1916  #define Z2H    V5
  1917  #define X3L    V17 // T1L
  1918  #define X3H    V18 // T1H
  1919  #define Y3L    V21 // T3L
  1920  #define Y3H    V22 // T3H
  1921  #define Z3L    V28
  1922  #define Z3H    V29
  1923  
  1924  #define ZER   V6
  1925  #define SEL1  V7
  1926  #define CAR1  V8
  1927  #define CAR2  V9
  1928  /* *
  1929   * Three operand formula:
  1930   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  1931   * T1 = Z1²
  1932   * T2 = T1*Z1
  1933   * T1 = T1*X2
  1934   * T2 = T2*Y2
  1935   * T1 = T1-X1
  1936   * T2 = T2-Y1
  1937   * Z3 = Z1*T1
  1938   * T3 = T1²
  1939   * T4 = T3*T1
  1940   * T3 = T3*X1
  1941   * T1 = 2*T3
  1942   * X3 = T2²
  1943   * X3 = X3-T1
  1944   * X3 = X3-T4
  1945   * T3 = T3-X3
  1946   * T3 = T3*T2
  1947   * T4 = T4*Y1
  1948   * Y3 = T3-T4
  1949  
  1950   * Three operand formulas, but with MulInternal X,Y used to store temps
  1951  X=Z1; Y=Z1; MUL;T-   // T1 = Z1²      T1
  1952  X=T ; Y-  ; MUL;T2=T // T2 = T1*Z1    T1   T2
  1953  X-  ; Y=X2; MUL;T1=T // T1 = T1*X2    T1   T2
  1954  X=T2; Y=Y2; MUL;T-   // T2 = T2*Y2    T1   T2
  1955  SUB(T2<T-Y1)         // T2 = T2-Y1    T1   T2
  1956  SUB(Y<T1-X1)         // T1 = T1-X1    T1   T2
  1957  X=Z1; Y- ;  MUL;Z3:=T// Z3 = Z1*T1         T2
  1958  X=Y;  Y- ;  MUL;X=T  // T3 = T1*T1         T2
  1959  X- ;  Y- ;  MUL;T4=T // T4 = T3*T1         T2        T4
  1960  X- ;  Y=X1; MUL;T3=T // T3 = T3*X1         T2   T3   T4
  1961  ADD(T1<T+T)          // T1 = T3+T3    T1   T2   T3   T4
  1962  X=T2; Y=T2; MUL;T-   // X3 = T2*T2    T1   T2   T3   T4
  1963  SUB(T<T-T1)          // X3 = X3-T1    T1   T2   T3   T4
  1964  SUB(T<T-T4) X3:=T    // X3 = X3-T4         T2   T3   T4
  1965  SUB(X<T3-T)          // T3 = T3-X3         T2   T3   T4
  1966  X- ;  Y- ;  MUL;T3=T // T3 = T3*T2         T2   T3   T4
  1967  X=T4; Y=Y1; MUL;T-   // T4 = T4*Y1              T3   T4
  1968  SUB(T<T3-T) Y3:=T    // Y3 = T3-T4              T3   T4
  1969  
  1970  	*/
  1971  TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
        // Loads three pointers (P3, P1, P2) and three 64-bit flags (sign, sel, zero)
        // from the argument frame. P1 is read as X/Y/Z at byte offsets 0/32/64 and
        // P2 as X/Y at offsets 0/32; each coordinate is two 16-byte vectors with the
        // high half at the lower offset. X3/Y3/Z3 are written to offsets 0/32/64 of
        // P3 only at the very end, after every load from P1 and P2.
        // The flags never drive a branch: both alternatives are computed and VSEL
        // picks one under a mask built with VCEQG.
        // NOTE(review): p256MulInternal, p256SqrInternal, p256SubInternal and
        // p256AddInternal are defined outside this view; the operand conventions
        // noted below are read off the call sites and the existing pseudo-code.
  1972  	MOVD P3+0(FP), P3ptr
  1973  	MOVD P1+8(FP), P1ptr
  1974  	MOVD P2+16(FP), P2ptr
  1975  
        	// PH:PL = the P256 constant stored at p256mul<>+0x00..0x1f
        	// (PH from bytes 0-15, PL from bytes 16-31).
  1976  	MOVD $p256mul<>+0x00(SB), CPOOL
  1977  	VL   16(CPOOL), PL
  1978  	VL   0(CPOOL), PH
  1979  
  1980  	//	if (sign == 1) {
  1981  	//		Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2  = P-Y2
  1982  	//	}
  1983  
  1984  	VL 32(P2ptr), Y2H
  1985  	VL 48(P2ptr), Y2L
  1986  
        	// SEL1 becomes the mask for "sign == 0": the flag is replicated into both
        	// doublewords and compared against the zero vector.
  1987  	VLREPG sign+24(FP), SEL1
  1988  	VZERO  ZER
  1989  	VCEQG  SEL1, ZER, SEL1
  1990  
        	// T1H:T1L = P - Y2, computed as two 128-bit halves with CAR1 linking the
        	// low half to the high half.
  1991  	VSCBIQ Y2L, PL, CAR1
  1992  	VSQ    Y2L, PL, T1L
  1993  	VSBIQ  PH, Y2H, CAR1, T1H
  1994  
        	// Keep Y2 where the mask is set (sign == 0), otherwise take P - Y2; this
        	// reading of VSEL follows the pseudo-code above. From here on Y2L/Y2H hold
        	// the sign-adjusted Y2.
  1995  	VSEL Y2L, T1L, SEL1, Y2L
  1996  	VSEL Y2H, T1H, SEL1, Y2H
  1997  
  1998  /* *
  1999   * Three operand formula:
  2000   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  2001   */
  2002  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1²      T1
  2003  	VL   64(P1ptr), X1       // Z1H
  2004  	VL   80(P1ptr), X0       // Z1L
  2005  	VLR  X0, Y0
  2006  	VLR  X1, Y1
  2007  	CALL p256SqrInternal<>(SB)
  2008  
  2009  	// X=T ; Y-  ; MUL; T2=T // T2 = T1*Z1    T1   T2
        	// Y0/Y1 are not reloaded: this step relies on them still holding Z1 after
        	// the call above.
  2010  	VLR  T0, X0
  2011  	VLR  T1, X1
  2012  	CALL p256MulInternal<>(SB)
  2013  	VLR  T0, T2L
  2014  	VLR  T1, T2H
  2015  
  2016  	// X-  ; Y=X2; MUL; T1=T // T1 = T1*X2    T1   T2
        	// X0/X1 are left untouched and are expected to still hold Z1².
  2017  	VL   0(P2ptr), Y1        // X2H
  2018  	VL   16(P2ptr), Y0       // X2L
  2019  	CALL p256MulInternal<>(SB)
  2020  	VLR  T0, T1L
  2021  	VLR  T1, T1H
  2022  
  2023  	// X=T2; Y=Y2; MUL; T-   // T2 = T2*Y2    T1   T2
  2024  	VLR  T2L, X0
  2025  	VLR  T2H, X1
  2026  	VLR  Y2L, Y0
  2027  	VLR  Y2H, Y1
  2028  	CALL p256MulInternal<>(SB)
  2029  
  2030  	// SUB(T2<T-Y1)          // T2 = T2-Y1    T1   T2
        	// Y1H/Y1L are V3/V2, i.e. the multiplier's Y operand registers, so this
        	// load replaces the Y operand. The macro call writes T - Y1 into T2H:T2L
        	// (argument order appears to be dstH,dstL, aH,aL, bH,bL).
  2031  	VL 32(P1ptr), Y1H
  2032  	VL 48(P1ptr), Y1L
  2033  	p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
  2034  
  2035  	// SUB(Y<T1-X1)          // T1 = T1-X1    T1   T2
        	// X1H/X1L are V1/V0 (the X operand registers). The difference is written
        	// straight into Y1:Y0 so it is already in place as the Y operand for the
        	// next three calls.
  2036  	VL 0(P1ptr), X1H
  2037  	VL 16(P1ptr), X1L
  2038  	p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
  2039  
  2040  	// X=Z1; Y- ;  MUL; Z3:=T// Z3 = Z1*T1         T2
  2041  	VL   64(P1ptr), X1       // Z1H
  2042  	VL   80(P1ptr), X0       // Z1L
  2043  	CALL p256MulInternal<>(SB)
  2044  
        	// Z3 is parked in its own registers instead of being stored right away
        	// (the stores below are commented out); P3 is written only at the end.
  2045  	// VST T1, 64(P3ptr)
  2046  	// VST T0, 80(P3ptr)
  2047  	VLR T0, Z3L
  2048  	VLR T1, Z3H
  2049  
  2050  	// X=Y;  Y- ;  MUL; X=T  // T3 = T1*T1         T2
  2051  	VLR  Y0, X0
  2052  	VLR  Y1, X1
  2053  	CALL p256SqrInternal<>(SB)
  2054  	VLR  T0, X0
  2055  	VLR  T1, X1
  2056  
  2057  	// X- ;  Y- ;  MUL; T4=T // T4 = T3*T1         T2        T4
        	// No operand set-up: X was just set to T3 and Y is expected to still hold
        	// T1 from the subtraction above.
  2058  	CALL p256MulInternal<>(SB)
  2059  	VLR  T0, T4L
  2060  	VLR  T1, T4H
  2061  
  2062  	// X- ;  Y=X1; MUL; T3=T // T3 = T3*X1         T2   T3   T4
  2063  	VL   0(P1ptr), Y1        // X1H
  2064  	VL   16(P1ptr), Y0       // X1L
  2065  	CALL p256MulInternal<>(SB)
  2066  	VLR  T0, T3L
  2067  	VLR  T1, T3H
  2068  
  2069  	// ADD(T1<T+T)           // T1 = T3+T3    T1   T2   T3   T4
        	// T1:T0 still holds T3 from the multiplication above, so T + T is 2*T3.
  2070  	p256AddInternal(T1H,T1L, T1,T0,T1,T0)
  2071  
  2072  	// X=T2; Y=T2; MUL; T-   // X3 = T2*T2    T1   T2   T3   T4
  2073  	VLR  T2L, X0
  2074  	VLR  T2H, X1
  2075  	VLR  T2L, Y0
  2076  	VLR  T2H, Y1
  2077  	CALL p256SqrInternal<>(SB)
  2078  
  2079  	// SUB(T<T-T1)           // X3 = X3-T1    T1   T2   T3   T4  (T1 = X3)
  2080  	p256SubInternal(T1,T0,T1,T0,T1H,T1L)
  2081  
  2082  	// SUB(T<T-T4) X3:=T     // X3 = X3-T4         T2   T3   T4
        	// X3L/X3H are the same registers as T1L/T1H; the 2*T3 value they held was
        	// last used by the subtraction two steps up.
  2083  	p256SubInternal(T1,T0,T1,T0,T4H,T4L)
  2084  	VLR T0, X3L
  2085  	VLR T1, X3H
  2086  
  2087  	// SUB(X<T3-T)           // T3 = T3-X3         T2   T3   T4
  2088  	p256SubInternal(X1,X0,T3H,T3L,T1,T0)
  2089  
  2090  	// X- ;  Y- ;  MUL; T3=T // T3 = T3*T2         T2   T3   T4
        	// X was just set to T3 - X3; Y is expected to still hold T2 from the
        	// squaring set-up above.
  2091  	CALL p256MulInternal<>(SB)
  2092  	VLR  T0, T3L
  2093  	VLR  T1, T3H
  2094  
  2095  	// X=T4; Y=Y1; MUL; T-   // T4 = T4*Y1              T3   T4
        	// Y1:Y0 are also Y1H:Y1L, so this load doubles as the Y1 input of the
        	// sel select further down.
  2096  	VLR  T4L, X0
  2097  	VLR  T4H, X1
  2098  	VL   32(P1ptr), Y1       // Y1H
  2099  	VL   48(P1ptr), Y0       // Y1L
  2100  	CALL p256MulInternal<>(SB)
  2101  
  2102  	// SUB(T<T3-T) Y3:=T     // Y3 = T3-T4              T3   T4  (T3 = Y3)
        	// Y3H/Y3L are the T3H/T3L registers, so the result overwrites T3 in place.
  2103  	p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
  2104  
  2105  	//	if (sel == 0) {
  2106  	//		copy(P3.x[:], X1)
  2107  	//		copy(P3.y[:], Y1)
  2108  	//		copy(P3.z[:], Z1)
  2109  	//	}
  2110  
  2111  	VL 0(P1ptr), X1H
  2112  	VL 16(P1ptr), X1L
  2113  
  2114  	// Y1 already loaded, left over from addition
  2115  	VL 64(P1ptr), Z1H
  2116  	VL 80(P1ptr), Z1L
  2117  
        	// SEL1 = mask for "sel == 0", built the same way as the sign mask.
  2118  	VLREPG sel+32(FP), SEL1
  2119  	VZERO  ZER
  2120  	VCEQG  SEL1, ZER, SEL1
  2121  
        	// Replace the computed X3/Y3/Z3 by P1's coordinates when sel == 0.
  2122  	VSEL X1L, X3L, SEL1, X3L
  2123  	VSEL X1H, X3H, SEL1, X3H
  2124  	VSEL Y1L, Y3L, SEL1, Y3L
  2125  	VSEL Y1H, Y3H, SEL1, Y3H
  2126  	VSEL Z1L, Z3L, SEL1, Z3L
  2127  	VSEL Z1H, Z3H, SEL1, Z3H
  2128  
  2129  	//	if (zero == 0) {
  2130  	//		copy(P3.x[:], X2)
  2131  	//		copy(P3.y[:], Y2)
  2132  	//		copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  2133  	//			0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01})  //(p256.z*2^256)%p
  2134  	//	}
        	// X2H/X2L reuse V1/V0, which held X1 for the previous select.
  2135  	VL 0(P2ptr), X2H
  2136  	VL 16(P2ptr), X2L
  2137  
  2138  	// Y2 already loaded
        	// (Y2L/Y2H still hold the sign-adjusted Y2 produced at the top.)
        	// NOTE(review): the 32 bytes at offset 128 of the constant pool are outside
        	// this view; per the pseudo-code above they should be the constant Z value
        	// listed there - confirm against the p256mul<> DATA entries.
  2139  	VL 128(CPOOL), Z2H
  2140  	VL 144(CPOOL), Z2L
  2141  
        	// SEL1 = mask for "zero == 0".
  2142  	VLREPG zero+40(FP), SEL1
  2143  	VZERO  ZER
  2144  	VCEQG  SEL1, ZER, SEL1
  2145  
        	// Replace the result by (X2, Y2, constant Z) when zero == 0. This select
        	// runs after the sel select, so it takes precedence when both flags are 0.
  2146  	VSEL X2L, X3L, SEL1, X3L
  2147  	VSEL X2H, X3H, SEL1, X3H
  2148  	VSEL Y2L, Y3L, SEL1, Y3L
  2149  	VSEL Y2H, Y3H, SEL1, Y3H
  2150  	VSEL Z2L, Z3L, SEL1, Z3L
  2151  	VSEL Z2H, Z3H, SEL1, Z3H
  2152  
  2153  	// All done, store out the result!!!
  2154  	VST X3H, 0(P3ptr)
  2155  	VST X3L, 16(P3ptr)
  2156  	VST Y3H, 32(P3ptr)
  2157  	VST Y3L, 48(P3ptr)
  2158  	VST Z3H, 64(P3ptr)
  2159  	VST Z3L, 80(P3ptr)
  2160  
  2161  	RET
  2162  
  2163  #undef P3ptr
        // Release every register alias defined for p256PointAddAffineAsm; the routine
        // that follows in this file defines several of the same names again, some of
        // them with different registers.
  2164  #undef P1ptr
  2165  #undef P2ptr
  2166  #undef CPOOL
  2167  
  2168  #undef Y2L
  2169  #undef Y2H
  2170  #undef T1L
  2171  #undef T1H
  2172  #undef T2L
  2173  #undef T2H
  2174  #undef T3L
  2175  #undef T3H
  2176  #undef T4L
  2177  #undef T4H
  2178  
  2179  #undef TT0
  2180  #undef TT1
  2181  #undef T2
  2182  
  2183  #undef X0
  2184  #undef X1
  2185  #undef Y0
  2186  #undef Y1
  2187  #undef T0
  2188  #undef T1
  2189  
  2190  #undef PL
  2191  #undef PH
  2192  
  2193  #undef X1L
  2194  #undef X1H
  2195  #undef Y1L
  2196  #undef Y1H
  2197  #undef Z1L
  2198  #undef Z1H
  2199  #undef X2L
  2200  #undef X2H
  2201  #undef Z2L
  2202  #undef Z2H
  2203  #undef X3L
  2204  #undef X3H
  2205  #undef Y3L
  2206  #undef Y3H
  2207  #undef Z3L
  2208  #undef Z3H
  2209  
  2210  #undef ZER
  2211  #undef SEL1
  2212  #undef CAR1
  2213  #undef CAR2
  2214  
  2215  // p256PointDoubleAsm(P3, P1 *p256Point)
  2216  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
  2217  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
  2218  // https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
  2219  #define P3ptr   R1
        // Register aliases for p256PointDoubleAsm. R1/R2 receive the result and input
        // point pointers; R4 receives the address of the p256mul<> constant pool.
  2220  #define P1ptr   R2
  2221  #define CPOOL   R4
  2222  
  2223  // Temporaries in REGs
  2224  #define X3L    V15
  2225  #define X3H    V16
  2226  #define Y3L    V17
  2227  #define Y3H    V18
  2228  #define T1L    V19
  2229  #define T1H    V20
  2230  #define T2L    V21
  2231  #define T2H    V22
  2232  #define T3L    V23
  2233  #define T3H    V24
  2234  
        // Holding registers for coordinates loaded from P1. X1* and Y1* are used by
        // the routine; Z1L/Z1H are not referenced in the routine body (Z1 is loaded
        // directly into the multiplier operands), and Z1H shares V11 with TT0 below.
  2235  #define X1L    V6
  2236  #define X1H    V7
  2237  #define Y1L    V8
  2238  #define Y1H    V9
  2239  #define Z1L    V10
  2240  #define Z1H    V11
  2241  
  2242  // Temps for Sub and Add
        // Not referenced by name in the routine body.
        // NOTE(review): presumably consumed inside the p256SubInternal /
        // p256AddInternal macros, which are defined outside this view - confirm.
  2243  #define TT0  V11
  2244  #define TT1  V12
  2245  #define T2   V13
  2246  
  2247  // p256MulAsm Parameters
        // The routine sets X0/X1 and Y0/Y1 before each CALL to p256MulInternal or
        // p256SqrInternal and reads the product back from T0/T1.
  2248  #define X0    V0
  2249  #define X1    V1
  2250  #define Y0    V2
  2251  #define Y1    V3
  2252  #define T0    V4
  2253  #define T1    V5
  2254  
        // PL/PH are loaded from the first 32 bytes of the constant pool (the P256 entries).
  2255  #define PL    V30
  2256  #define PH    V31
  2257  
        // Z3L/Z3H share V23/V24 with T3L/T3H. They are not referenced in the routine
        // body, which stores Z3 straight from T1/T0.
  2258  #define Z3L    V23
  2259  #define Z3H    V24
  2260  
        // None of the four names below is referenced in the routine body.
  2261  #define ZER   V26
  2262  #define SEL1  V27
  2263  #define CAR1  V28
  2264  #define CAR2  V29
  2265  /*
  2266   * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
  2267   * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
  2268   * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
  2269   * 	A  = 3(X₁-Z₁²)×(X₁+Z₁²)
  2270   * 	B  = 2Y₁
  2271   * 	Z₃ = B×Z₁
  2272   * 	C  = B²
  2273   * 	D  = C×X₁
  2274   * 	X₃ = A²-2D
  2275   * 	Y₃ = (D-X₃)×A-C²/2
  2276   *
  2277   * Three-operand formula:
  2278   *       T1 = Z1²
  2279   *       T2 = X1-T1
  2280   *       T1 = X1+T1
  2281   *       T2 = T2*T1
  2282   *       T2 = 3*T2
  2283   *       Y3 = 2*Y1
  2284   *       Z3 = Y3*Z1
  2285   *       Y3 = Y3²
  2286   *       T3 = Y3*X1
  2287   *       Y3 = Y3²
  2288   *       Y3 = half*Y3
  2289   *       X3 = T2²
  2290   *       T1 = 2*T3
  2291   *       X3 = X3-T1
  2292   *       T1 = T3-X3
  2293   *       T1 = T1*T2
  2294   *       Y3 = T1-Y3
  2295   */
  2296  
  2297  TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
        // Loads the P3 and P1 pointers from the argument frame and follows the
        // three-operand schedule listed in the comment block above. P1 is read as
        // X/Y/Z at byte offsets 0/32/64, each coordinate as two 16-byte vectors with
        // the high half at the lower offset; P3 is written with the same layout.
        // Each coordinate of P3 is stored only after the last load of that same
        // coordinate from P1 (Z first, then X, then Y).
        // NOTE(review): that ordering looks intended to allow P3 and P1 to be the
        // same point - confirm with the callers.
        // NOTE(review): p256MulInternal, p256SqrInternal and the p256*Internal macros
        // are defined outside this view; operand conventions below are read off the
        // call sites and the existing pseudo-code.
  2298  	MOVD P3+0(FP), P3ptr
  2299  	MOVD P1+8(FP), P1ptr
  2300  
        	// PH:PL = the P256 constant stored at p256mul<>+0x00..0x1f.
  2301  	MOVD $p256mul<>+0x00(SB), CPOOL
  2302  	VL   16(CPOOL), PL
  2303  	VL   0(CPOOL), PH
  2304  
  2305  	// X=Z1; Y=Z1; MUL; T-    // T1 = Z1²
  2306  	VL   64(P1ptr), X1       // Z1H
  2307  	VL   80(P1ptr), X0       // Z1L
  2308  	VLR  X0, Y0
  2309  	VLR  X1, Y1
  2310  	CALL p256SqrInternal<>(SB)
  2311  
  2312  	// SUB(X<X1-T)            // T2 = X1-T1
        	// X1H/X1L are V7/V6 here, separate from the multiplier operands. The
        	// difference is written directly into the X operand.
  2313  	VL 0(P1ptr), X1H
  2314  	VL 16(P1ptr), X1L
  2315  	p256SubInternal(X1,X0,X1H,X1L,T1,T0)
  2316  
  2317  	// ADD(Y<X1+T)            // T1 = X1+T1
        	// The sum is written directly into the Y operand.
  2318  	p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
  2319  
  2320  	// X-  ; Y-  ; MUL; T-    // T2 = T2*T1
  2321  	CALL p256MulInternal<>(SB)
  2322  
  2323  	// ADD(T2<T+T); ADD(T2<T2+T)  // T2 = 3*T2
  2324  	p256AddInternal(T2H,T2L,T1,T0,T1,T0)
  2325  	p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
  2326  
  2327  	// ADD(X<Y1+Y1)           // Y3 = 2*Y1
        	// 2*Y1 goes into the X operand, ready for the multiplication by Z1.
  2328  	VL 32(P1ptr), Y1H
  2329  	VL 48(P1ptr), Y1L
  2330  	p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
  2331  
  2332  	// X-  ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
        	// Last read of Z1 from P1; Z3 is stored immediately afterwards.
  2333  	VL   64(P1ptr), Y1       // Z1H
  2334  	VL   80(P1ptr), Y0       // Z1L
  2335  	CALL p256MulInternal<>(SB)
  2336  	VST  T1, 64(P3ptr)
  2337  	VST  T0, 80(P3ptr)
  2338  
  2339  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
        	// X is expected to still hold 2*Y1; it is copied into Y for the squaring.
  2340  	VLR  X0, Y0
  2341  	VLR  X1, Y1
  2342  	CALL p256SqrInternal<>(SB)
  2343  
  2344  	// X=T ; Y=X1; MUL; T3=T  // T3 = Y3*X1
        	// X1 is reloaded from memory into the Y operand; this is the last read of
        	// offsets 0/16 of P1.
  2345  	VLR  T0, X0
  2346  	VLR  T1, X1
  2347  	VL   0(P1ptr), Y1
  2348  	VL   16(P1ptr), Y0
  2349  	CALL p256MulInternal<>(SB)
  2350  	VLR  T0, T3L
  2351  	VLR  T1, T3H
  2352  
  2353  	// X-  ; Y=X ; MUL; T-    // Y3 = Y3²
        	// X is expected to still hold the previous Y3 (copied from T before the
        	// last multiplication), so this squares it again.
  2354  	VLR  X0, Y0
  2355  	VLR  X1, Y1
  2356  	CALL p256SqrInternal<>(SB)
  2357  
  2358  	// HAL(Y3<T)              // Y3 = half*Y3
  2359  	p256HalfInternal(Y3H,Y3L, T1,T0)
  2360  
  2361  	// X=T2; Y=T2; MUL; T-    // X3 = T2²
  2362  	VLR  T2L, X0
  2363  	VLR  T2H, X1
  2364  	VLR  T2L, Y0
  2365  	VLR  T2H, Y1
  2366  	CALL p256SqrInternal<>(SB)
  2367  
  2368  	// ADD(T1<T3+T3)          // T1 = 2*T3
  2369  	p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
  2370  
  2371  	// SUB(X3<T-T1) X3:=X3    // X3 = X3-T1
  2372  	p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
  2373  	VST X3H, 0(P3ptr)
  2374  	VST X3L, 16(P3ptr)
  2375  
  2376  	// SUB(X<T3-X3)           // T1 = T3-X3
        	// The difference is written into the X operand.
  2377  	p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
  2378  
  2379  	// X-  ; Y-  ; MUL; T-    // T1 = T1*T2
        	// Y is expected to still hold T2 from the squaring set-up above.
  2380  	CALL p256MulInternal<>(SB)
  2381  
  2382  	// SUB(Y3<T-Y3)           // Y3 = T1-Y3
  2383  	p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
  2384  
  2385  	VST Y3H, 32(P3ptr)
  2386  	VST Y3L, 48(P3ptr)
  2387  	RET
  2388  
  2389  #undef P3ptr
        // Release every register alias defined for p256PointDoubleAsm; the routine
        // that follows in this file defines several of the same names again, some of
        // them with different registers.
  2390  #undef P1ptr
  2391  #undef CPOOL
  2392  #undef X3L
  2393  #undef X3H
  2394  #undef Y3L
  2395  #undef Y3H
  2396  #undef T1L
  2397  #undef T1H
  2398  #undef T2L
  2399  #undef T2H
  2400  #undef T3L
  2401  #undef T3H
  2402  #undef X1L
  2403  #undef X1H
  2404  #undef Y1L
  2405  #undef Y1H
  2406  #undef Z1L
  2407  #undef Z1H
  2408  #undef TT0
  2409  #undef TT1
  2410  #undef T2
  2411  #undef X0
  2412  #undef X1
  2413  #undef Y0
  2414  #undef Y1
  2415  #undef T0
  2416  #undef T1
  2417  #undef PL
  2418  #undef PH
  2419  #undef Z3L
  2420  #undef Z3H
  2421  #undef ZER
  2422  #undef SEL1
  2423  #undef CAR1
  2424  #undef CAR2
  2425  
  2426  // p256PointAddAsm(P3, P1, P2 *p256Point)
  2427  #define P3ptr  R1
        // Register aliases for p256PointAddAsm. R1-R3 receive the three point
        // pointers, R4 the address of the p256mul<> constant pool. ISZERO accumulates
        // the value written to the ret slot; TRUE holds the constant 1 that is
        // conditionally moved into ISZERO.
  2428  #define P1ptr  R2
  2429  #define P2ptr  R3
  2430  #define CPOOL  R4
  2431  #define ISZERO R5
  2432  #define TRUE   R6
  2433  
  2434  // Temporaries in REGs
        // T1L/T1H are only used as scratch by the two zero tests in the routine body.
  2435  #define T1L   V16
  2436  #define T1H   V17
  2437  #define T2L   V18
  2438  #define T2H   V19
  2439  #define U1L   V20
  2440  #define U1H   V21
  2441  #define S1L   V22
  2442  #define S1H   V23
  2443  #define HL    V24
  2444  #define HH    V25
  2445  #define RL    V26
  2446  #define RH    V27
  2447  
  2448  // Temps for Sub and Add
        // Only ZER is referenced by name in the routine body (as the zero vector for
        // the comparisons).
        // NOTE(review): the others are presumably consumed inside the
        // p256SubInternal / p256AddInternal macros defined outside this view - confirm.
  2449  #define ZER   V6
  2450  #define SEL1  V7
  2451  #define CAR1  V8
  2452  #define CAR2  V9
  2453  #define TT0  V11
  2454  #define TT1  V12
  2455  #define T2   V13
  2456  
  2457  // p256MulAsm Parameters
        // The routine sets X0/X1 and Y0/Y1 before each CALL to p256MulInternal or
        // p256SqrInternal and reads the product back from T0/T1.
  2458  #define X0    V0
  2459  #define X1    V1
  2460  #define Y0    V2
  2461  #define Y1    V3
  2462  #define T0    V4
  2463  #define T1    V5
  2464  
        // PL/PH are loaded from the first 32 bytes of the constant pool (the P256 entries).
  2465  #define PL    V30
  2466  #define PH    V31
  2467  /*
  2468   * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
  2469   *
  2470   * A = X₁×Z₂²
  2471   * B = Y₁×Z₂³
  2472   * C = X₂×Z₁²-A
  2473   * D = Y₂×Z₁³-B
  2474   * X₃ = D² - 2A×C² - C³
  2475   * Y₃ = D×(A×C² - X₃) - B×C³
  2476   * Z₃ = Z₁×Z₂×C
  2477   *
  2478   * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
  2479   * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
  2480   *
  2481   * T1 = Z1*Z1
  2482   * T2 = Z2*Z2
  2483   * U1 = X1*T2
  2484   * H  = X2*T1
  2485   * H  = H-U1
  2486   * Z3 = Z1*Z2
  2487   * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2488   *
  2489   * S1 = Z2*T2
  2490   * S1 = Y1*S1
  2491   * R  = Z1*T1
  2492   * R  = Y2*R
  2493   * R  = R-S1
  2494   *
  2495   * T1 = H*H
  2496   * T2 = H*T1
  2497   * U1 = U1*T1
  2498   *
  2499   * X3 = R*R
  2500   * X3 = X3-T2
  2501   * T1 = 2*U1
  2502   * X3 = X3-T1 << store-out X3 result reg
  2503   *
  2504   * T2 = S1*T2
  2505   * Y3 = U1-X3
  2506   * Y3 = R*Y3
  2507   * Y3 = Y3-T2 << store-out Y3 result reg
  2508  
  2509   	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2510  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
  2511  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
  2512  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2513  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2514  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
  2515  	// SUB(H<H-T)            // H  = H-U1
  2516  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
  2517  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
  2518  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2519  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2520  	// SUB(R<T-S1)           // R  = R-S1
  2521  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2522  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
  2523  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
  2524  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2525  	// SUB(T<T-T2)           // X3 = X3-T2
  2526  	// ADD(X<U1+U1)          // T1 = 2*U1
  2527  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2528  	// SUB(Y<U1-T)           // Y3 = U1-X3
  2529  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
  2530  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2531  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2532  	*/
  2533  TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
        // Loads the P3, P1 and P2 pointers from the argument frame and follows the
        // operand schedule listed in the comment block above. P1 and P2 are read as
        // X/Y/Z at byte offsets 0/32/64, each coordinate as two 16-byte vectors with
        // the high half at the lower offset; P3 is written with the same layout.
        // A 64-bit value is written to the ret slot at +24(FP): it ends up 1 only
        // when both H and R test as "equal to 0 or equal to P", and 0 otherwise.
        // NOTE(review): presumably this tells the caller that the two inputs were the
        // same point so that it can fall back to doubling - verify against the caller.
        // Each coordinate of P3 is stored only after the last load of that same
        // coordinate from P1 and P2 (Z first, then X, then Y).
        // NOTE(review): p256MulInternal, p256SqrInternal and the p256*Internal macros
        // are defined outside this view; operand conventions below are read off the
        // call sites and the existing pseudo-code.
  2534  	MOVD P3+0(FP), P3ptr
  2535  	MOVD P1+8(FP), P1ptr
  2536  	MOVD P2+16(FP), P2ptr
  2537  
        	// PH:PL = the P256 constant stored at p256mul<>+0x00..0x1f.
  2538  	MOVD $p256mul<>+0x00(SB), CPOOL
  2539  	VL   16(CPOOL), PL
  2540  	VL   0(CPOOL), PH
  2541  
  2542  	// X=Z1; Y=Z1; MUL; T-   // T1 = Z1*Z1
  2543  	VL   64(P1ptr), X1       // Z1H
  2544  	VL   80(P1ptr), X0       // Z1L
  2545  	VLR  X0, Y0
  2546  	VLR  X1, Y1
  2547  	CALL p256SqrInternal<>(SB)
  2548  
  2549  	// X-  ; Y=T ; MUL; R=T  // R  = Z1*T1
        	// X is expected to still hold Z1 after the squaring.
  2550  	VLR  T0, Y0
  2551  	VLR  T1, Y1
  2552  	CALL p256MulInternal<>(SB)
  2553  	VLR  T0, RL
  2554  	VLR  T1, RH
  2555  
  2556  	// X=X2; Y-  ; MUL; H=T  // H  = X2*T1
        	// Y is expected to still hold T1 = Z1*Z1.
  2557  	VL   0(P2ptr), X1        // X2H
  2558  	VL   16(P2ptr), X0       // X2L
  2559  	CALL p256MulInternal<>(SB)
  2560  	VLR  T0, HL
  2561  	VLR  T1, HH
  2562  
  2563  	// X=Z2; Y=Z2; MUL; T-   // T2 = Z2*Z2
  2564  	VL   64(P2ptr), X1       // Z2H
  2565  	VL   80(P2ptr), X0       // Z2L
  2566  	VLR  X0, Y0
  2567  	VLR  X1, Y1
  2568  	CALL p256SqrInternal<>(SB)
  2569  
  2570  	// X-  ; Y=T ; MUL; S1=T // S1 = Z2*T2
  2571  	VLR  T0, Y0
  2572  	VLR  T1, Y1
  2573  	CALL p256MulInternal<>(SB)
  2574  	VLR  T0, S1L
  2575  	VLR  T1, S1H
  2576  
  2577  	// X=X1; Y-  ; MUL; U1=T // U1 = X1*T2
        	// Y is expected to still hold T2 = Z2*Z2.
  2578  	VL   0(P1ptr), X1        // X1H
  2579  	VL   16(P1ptr), X0       // X1L
  2580  	CALL p256MulInternal<>(SB)
  2581  	VLR  T0, U1L
  2582  	VLR  T1, U1H
  2583  
  2584  	// SUB(H<H-T)            // H  = H-U1
        	// T1:T0 still holds U1 from the multiplication above.
  2585  	p256SubInternal(HH,HL,HH,HL,T1,T0)
  2586  
  2587  	// if H == 0 or H^P == 0 then ret=1 else ret=0
  2588  	// clobbers T1H and T1L
        	// Two tests, each OR-ing the two halves together and comparing against the
        	// zero vector with a condition-code-setting compare: first H itself, then
        	// H XOR P. MOVDEQ sets ISZERO to 1 when the compare reports equality.
  2589  	MOVD   $0, ISZERO
  2590  	MOVD   $1, TRUE
  2591  	VZERO  ZER
  2592  	VO     HL, HH, T1H
  2593  	VCEQGS ZER, T1H, T1H
  2594  	MOVDEQ TRUE, ISZERO
  2595  	VX     HL, PL, T1L
  2596  	VX     HH, PH, T1H
  2597  	VO     T1L, T1H, T1H
  2598  	VCEQGS ZER, T1H, T1H
  2599  	MOVDEQ TRUE, ISZERO
  2600  	MOVD   ISZERO, ret+24(FP)
  2601  
  2602  	// X=Z1; Y=Z2; MUL; T-   // Z3 = Z1*Z2
        	// Last reads of Z1 and Z2 from memory.
  2603  	VL   64(P1ptr), X1       // Z1H
  2604  	VL   80(P1ptr), X0       // Z1L
  2605  	VL   64(P2ptr), Y1       // Z2H
  2606  	VL   80(P2ptr), Y0       // Z2L
  2607  	CALL p256MulInternal<>(SB)
  2608  
  2609  	// X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
  2610  	VLR  T0, X0
  2611  	VLR  T1, X1
  2612  	VLR  HL, Y0
  2613  	VLR  HH, Y1
  2614  	CALL p256MulInternal<>(SB)
  2615  	VST  T1, 64(P3ptr)
  2616  	VST  T0, 80(P3ptr)
  2617  
  2618  	// X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
  2619  	VL   32(P1ptr), X1
  2620  	VL   48(P1ptr), X0
  2621  	VLR  S1L, Y0
  2622  	VLR  S1H, Y1
  2623  	CALL p256MulInternal<>(SB)
  2624  	VLR  T0, S1L
  2625  	VLR  T1, S1H
  2626  
  2627  	// X=Y2; Y=R ; MUL; T-   // R  = Y2*R
  2628  	VL   32(P2ptr), X1
  2629  	VL   48(P2ptr), X0
  2630  	VLR  RL, Y0
  2631  	VLR  RH, Y1
  2632  	CALL p256MulInternal<>(SB)
  2633  
  2634  	// SUB(R<T-S1)           // R  = T-S1
  2635  	p256SubInternal(RH,RL,T1,T0,S1H,S1L)
  2636  
  2637  	// if R == 0 or R^P == 0 then ret=ret else ret=0
  2638  	// clobbers T1H and T1L
        	// Same pair of tests as for H, applied to R. The result is ANDed with the
        	// value stored in the ret slot by the H test before being written back.
  2639  	MOVD   $0, ISZERO
  2640  	MOVD   $1, TRUE
  2641  	VZERO  ZER
  2642  	VO     RL, RH, T1H
  2643  	VCEQGS ZER, T1H, T1H
  2644  	MOVDEQ TRUE, ISZERO
  2645  	VX     RL, PL, T1L
  2646  	VX     RH, PH, T1H
  2647  	VO     T1L, T1H, T1H
  2648  	VCEQGS ZER, T1H, T1H
  2649  	MOVDEQ TRUE, ISZERO
  2650  	AND    ret+24(FP), ISZERO
  2651  	MOVD   ISZERO, ret+24(FP)
  2652  
  2653  	// X=H ; Y=H ; MUL; T-   // T1 = H*H
  2654  	VLR  HL, X0
  2655  	VLR  HH, X1
  2656  	VLR  HL, Y0
  2657  	VLR  HH, Y1
  2658  	CALL p256SqrInternal<>(SB)
  2659  
  2660  	// X-  ; Y=T ; MUL; T2=T // T2 = H*T1
        	// X is expected to still hold H after the squaring.
  2661  	VLR  T0, Y0
  2662  	VLR  T1, Y1
  2663  	CALL p256MulInternal<>(SB)
  2664  	VLR  T0, T2L
  2665  	VLR  T1, T2H
  2666  
  2667  	// X=U1; Y-  ; MUL; U1=T // U1 = U1*T1
        	// Y is expected to still hold T1 = H*H.
  2668  	VLR  U1L, X0
  2669  	VLR  U1H, X1
  2670  	CALL p256MulInternal<>(SB)
  2671  	VLR  T0, U1L
  2672  	VLR  T1, U1H
  2673  
  2674  	// X=R ; Y=R ; MUL; T-   // X3 = R*R
  2675  	VLR  RL, X0
  2676  	VLR  RH, X1
  2677  	VLR  RL, Y0
  2678  	VLR  RH, Y1
  2679  	CALL p256SqrInternal<>(SB)
  2680  
  2681  	// SUB(T<T-T2)           // X3 = X3-T2
  2682  	p256SubInternal(T1,T0,T1,T0,T2H,T2L)
  2683  
  2684  	// ADD(X<U1+U1)          // T1 = 2*U1
        	// The X operand registers are used as scratch for 2*U1.
  2685  	p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
  2686  
  2687  	// SUB(T<T-X) X3:=T      // X3 = X3-T1 << store-out X3 result reg
  2688  	p256SubInternal(T1,T0,T1,T0,X1,X0)
  2689  	VST T1, 0(P3ptr)
  2690  	VST T0, 16(P3ptr)
  2691  
  2692  	// SUB(Y<U1-T)           // Y3 = U1-X3
        	// T1:T0 still holds X3; the difference is written into the Y operand.
  2693  	p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
  2694  
  2695  	// X=R ; Y-  ; MUL; U1=T // Y3 = R*Y3
        	// U1L/U1H are reused to hold the intermediate Y3.
  2696  	VLR  RL, X0
  2697  	VLR  RH, X1
  2698  	CALL p256MulInternal<>(SB)
  2699  	VLR  T0, U1L
  2700  	VLR  T1, U1H
  2701  
  2702  	// X=S1; Y=T2; MUL; T-   // T2 = S1*T2
  2703  	VLR  S1L, X0
  2704  	VLR  S1H, X1
  2705  	VLR  T2L, Y0
  2706  	VLR  T2H, Y1
  2707  	CALL p256MulInternal<>(SB)
  2708  
  2709  	// SUB(T<U1-T); Y3:=T    // Y3 = Y3-T2 << store-out Y3 result reg
  2710  	p256SubInternal(T1,T0,U1H,U1L,T1,T0)
  2711  	VST T1, 32(P3ptr)
  2712  	VST T0, 48(P3ptr)
  2713  
  2714  	RET