github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/math/big/arith_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go && (ppc64 || ppc64le)
     6  // +build !math_big_pure_go
     7  // +build ppc64 ppc64le
     8  
     9  #include "textflag.h"
    10  
    11  // This file provides fast assembly versions for the elementary
    12  // arithmetic operations on vectors implemented in arith.go.
    13  
    14  // func addVV(z, x, y []Word) (c Word)
    15  // z[i] = x[i] + y[i] for all i, carrying
        //
        // Adds x and y word by word into z, feeding the carry out of each word
        // into the next one, and returns the carry out of the last word in c
        // (0 or 1). Only len(z) is consulted; that many words are read from x
        // and y. When len(z) == 0 nothing is stored and c = 0.
        //
        // Registers: R7 = words still to process, R8/R9/R10 = cursors into
        // x/y/z, R4 = c, CTR = number of 4-word loop iterations. R0 is used as
        // the constant zero. The running carry lives in CA from the first ADDC
        // until the ADDZE at final, so the ADD/CMP/MOVD instructions in between
        // are relied upon to leave CA untouched.
    16  TEXT ·addVV(SB), NOSPLIT, $0
    17  	MOVD  z_len+8(FP), R7   // R7 = z_len
    18  	MOVD  x+24(FP), R8      // R8 = x[]
    19  	MOVD  y+48(FP), R9      // R9 = y[]
    20  	MOVD  z+0(FP), R10      // R10 = z[]
    21  
    22  	// If z_len = 0, we are done
    23  	CMP   R0, R7
    24  	MOVD  R0, R4            // R4 = c = 0
    25  	BEQ   done
    26  
    27  	// Process the first iteration out of the loop so we can
    28  	// use MOVDU and avoid 3 index registers updates.
    29  	MOVD  0(R8), R11      // R11 = x[0]
    30  	MOVD  0(R9), R12      // R12 = y[0]
    31  	ADD   $-1, R7         // R7 = z_len - 1 (words left)
    32  	ADDC  R12, R11, R15   // R15 = x[0] + y[0], set CA (no carry in)
    33  	CMP   R0, R7
    34  	MOVD  R15, 0(R10)     // z[0]
    35  	BEQ   final          // If z_len was 1, we are done
    36  
    37  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, whole groups of 4 words left
    38  	CMP   R0, R5
    39  	MOVD  R5, CTR         // Set up loop counter
    40  	BEQ   tail            // If R5 = 0, we can't use the loop
    41  
    42  	// Process 4 elements per iteration. Unrolling this loop
    43  	// means a performance trade-off: we will lose performance
    44  	// for small values of z_len (0.90x in the worst case), but
    45  	// gain significant performance as z_len increases (up to
    46  	// 1.45x).
        	//
        	// The cursors point at the last word already handled, hence the
        	// offsets 8..32; the final MOVDU of each group advances its cursor
        	// by 32 bytes.
    47  
    48  	PCALIGN $16
    49  loop:
    50  	MOVD  8(R8), R11      // R11 = x[i]
    51  	MOVD  16(R8), R12     // R12 = x[i+1]
    52  	MOVD  24(R8), R14     // R14 = x[i+2]
    53  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 += 32
    54  	MOVD  8(R9), R16      // R16 = y[i]
    55  	MOVD  16(R9), R17     // R17 = y[i+1]
    56  	MOVD  24(R9), R18     // R18 = y[i+2]
    57  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 += 32
    58  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    59  	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    60  	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    61  	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    62  	MOVD  R20, 8(R10)     // z[i]
    63  	MOVD  R21, 16(R10)    // z[i+1]
    64  	MOVD  R22, 24(R10)    // z[i+2]
    65  	MOVDU R23, 32(R10)    // z[i+3], R10 += 32
    66  	ADD   $-4, R7         // R7 -= 4 (words left)
    67  	BC  16, 0, loop       // bdnz
    68  
    69  	// We may have more elements to read
    70  	CMP   R0, R7
    71  	BEQ   final
    72  
    73  	// Process the remaining elements, one at a time
        	// (1 to 3 words are left when control reaches tail)
    74  tail:
    75  	MOVDU 8(R8), R11      // R11 = x[i]
    76  	MOVDU 8(R9), R16      // R16 = y[i]
    77  	ADD   $-1, R7         // R7-- (words left)
    78  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    79  	CMP   R0, R7
    80  	MOVDU R20, 8(R10)     // z[i]
    81  	BEQ   final           // If R7 = 0, we are done
    82  
    83  	MOVDU 8(R8), R11      // second leftover word, same sequence
    84  	MOVDU 8(R9), R16
    85  	ADD   $-1, R7
    86  	ADDE  R11, R16, R20
    87  	CMP   R0, R7
    88  	MOVDU R20, 8(R10)
    89  	BEQ   final
    90  
    91  	MOVD  8(R8), R11      // third and last leftover word: no count or
    92  	MOVD  8(R9), R16      // cursor update needed any more
    93  	ADDE  R11, R16, R20
    94  	MOVD  R20, 8(R10)
    95  
    96  final:
    97  	ADDZE R4              // Capture CA: R4 = 0 + CA
    98  
    99  done:
   100  	MOVD  R4, c+72(FP)
   101  	RET
   102  
   103  // func subVV(z, x, y []Word) (c Word)
   104  // z[i] = x[i] - y[i] for all i, carrying
        //
        // Subtracts y from x word by word into z, propagating the borrow from
        // each word into the next, and returns the borrow out of the last word
        // in c (0 or 1). Only len(z) is consulted; that many words are read
        // from x and y. When len(z) == 0 nothing is stored and c = 0.
        //
        // Registers: R7 = words still to process, R8/R9/R10 = cursors into
        // x/y/z, R4 = c, CTR = number of 4-word loop iterations. R0 is used as
        // the constant zero. The borrow is carried in CA with inverted sense
        // (CA = 1 means "no borrow"), which is why final flips the captured
        // bit with XOR $1.
   105  TEXT ·subVV(SB), NOSPLIT, $0
   106  	MOVD  z_len+8(FP), R7 // R7 = z_len
   107  	MOVD  x+24(FP), R8    // R8 = x[]
   108  	MOVD  y+48(FP), R9    // R9 = y[]
   109  	MOVD  z+0(FP), R10    // R10 = z[]
   110  
   111  	// If z_len = 0, we are done
   112  	CMP   R0, R7
   113  	MOVD  R0, R4          // R4 = c = 0
   114  	BEQ   done
   115  
   116  	// Process the first iteration out of the loop so we can
   117  	// use MOVDU and avoid 3 index registers updates.
   118  	MOVD  0(R8), R11      // R11 = x[0]
   119  	MOVD  0(R9), R12      // R12 = y[0]
   120  	ADD   $-1, R7         // R7 = z_len - 1 (words left)
   121  	SUBC  R12, R11, R15   // R15 = x[0] - y[0], set CA (1 = no borrow)
   122  	CMP   R0, R7
   123  	MOVD  R15, 0(R10)     // z[0]
   124  	BEQ   final           // If z_len was 1, we are done
   125  
   126  	SRD   $2, R7, R5      // R5 = (z_len-1)/4, whole groups of 4 words left
   127  	CMP   R0, R5
   128  	MOVD  R5, CTR         // Set up loop counter
   129  	BEQ   tail            // If R5 = 0, we can't use the loop
   130  
   131  	// Process 4 elements per iteration. Unrolling this loop
   132  	// means a performance trade-off: we will lose performance
   133  	// for small values of z_len (0.92x in the worst case), but
   134  	// gain significant performance as z_len increases (up to
   135  	// 1.45x).
   136  
   137  	PCALIGN $16
   138  loop:
   139  	MOVD  8(R8), R11      // R11 = x[i]
   140  	MOVD  16(R8), R12     // R12 = x[i+1]
   141  	MOVD  24(R8), R14     // R14 = x[i+2]
   142  	MOVDU 32(R8), R15     // R15 = x[i+3], R8 += 32
   143  	MOVD  8(R9), R16      // R16 = y[i]
   144  	MOVD  16(R9), R17     // R17 = y[i+1]
   145  	MOVD  24(R9), R18     // R18 = y[i+2]
   146  	MOVDU 32(R9), R19     // R19 = y[i+3], R9 += 32
   147  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = !CA)
   148  	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - borrow
   149  	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - borrow
   150  	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - borrow
   151  	MOVD  R20, 8(R10)     // z[i]
   152  	MOVD  R21, 16(R10)    // z[i+1]
   153  	MOVD  R22, 24(R10)    // z[i+2]
   154  	MOVDU R23, 32(R10)    // z[i+3], R10 += 32
   155  	ADD   $-4, R7         // R7 -= 4 (words left)
   156  	BC  16, 0, loop       // bdnz
   157  
   158  	// We may have more elements to read
   159  	CMP   R0, R7
   160  	BEQ   final
   161  
   162  	// Process the remaining elements, one at a time
        	// (1 to 3 words are left when control reaches tail)
   163  tail:
   164  	MOVDU 8(R8), R11      // R11 = x[i]
   165  	MOVDU 8(R9), R16      // R16 = y[i]
   166  	ADD   $-1, R7         // R7-- (words left)
   167  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = !CA)
   168  	CMP   R0, R7
   169  	MOVDU R20, 8(R10)     // z[i]
   170  	BEQ   final           // If R7 = 0, we are done
   171  
   172  	MOVDU 8(R8), R11      // second leftover word, same sequence
   173  	MOVDU 8(R9), R16
   174  	ADD   $-1, R7
   175  	SUBE  R16, R11, R20
   176  	CMP   R0, R7
   177  	MOVDU R20, 8(R10)
   178  	BEQ   final
   179  
   180  	MOVD  8(R8), R11      // third and last leftover word: no count or
   181  	MOVD  8(R9), R16      // cursor update needed any more
   182  	SUBE  R16, R11, R20
   183  	MOVD  R20, 8(R10)
   184  
   185  final:
   186  	ADDZE R4              // R4 = 0 + CA
   187  	XOR   $1, R4          // c = borrow = CA ^ 1
   188  
   189  done:
   190  	MOVD  R4, c+72(FP)
   191  	RET
   192  
   193  // func addVW(z, x []Word, y Word) (c Word)
        //
        // Adds the single word y to the vector x: z[0] = x[0] + y, and every
        // following z[i] = x[i] + carry from the previous word. The carry out
        // of the last word is returned in c (0 or 1). Only len(z) is consulted.
        // When len(z) == 0 nothing is stored and c = y.
        //
        // Registers: R11 = words still to process, R8/R10 = cursors into x/z,
        // R4 = y on entry and c on exit, CTR = number of 4-word loop
        // iterations. R0 is used as the constant zero. The carry lives in CA
        // from the first ADDC until the ADDZE at final.
   194  TEXT ·addVW(SB), NOSPLIT, $0
   195  	MOVD z+0(FP), R10	// R10 = z[]
   196  	MOVD x+24(FP), R8	// R8 = x[]
   197  	MOVD y+48(FP), R4	// R4 = y = c
   198  	MOVD z_len+8(FP), R11	// R11 = z_len
   199  
   200  	CMP   R0, R11		// If z_len is zero, return
   201  	BEQ   done
   202  
   203  	// We will process the first iteration out of the loop so we capture
   204  	// the value of c. In the subsequent iterations, we will rely on the
   205  	// value of CA set here.
   206  	MOVD  0(R8), R20	// R20 = x[0]
   207  	ADD   $-1, R11		// R11 = z_len - 1 (words left)
   208  	ADDC  R20, R4, R6	// R6 = x[0] + y, set CA
   209  	CMP   R0, R11		// If z_len was 1, we are done
   210  	MOVD  R6, 0(R10)	// z[0]
   211  	BEQ   final
   212  
   213  	// We will read 4 elements per iteration
   214  	SRD   $2, R11, R9	// R9 = (z_len-1)/4, whole groups of 4 words left
   215  	DCBT  (R8)		// cache-touch hint for the start of x
   216  	CMP   R0, R9
   217  	MOVD  R9, CTR		// Set up the loop counter
   218  	BEQ   tail		// If R9 = 0, we can't use the loop
   219  	PCALIGN $16
   220  
   221  loop:
   222  	MOVD  8(R8), R20	// R20 = x[i]
   223  	MOVD  16(R8), R21	// R21 = x[i+1]
   224  	MOVD  24(R8), R22	// R22 = x[i+2]
   225  	MOVDU 32(R8), R23	// R23 = x[i+3], R8 += 32
   226  	ADDZE R20, R24		// R24 = x[i] + CA
   227  	ADDZE R21, R25		// R25 = x[i+1] + CA
   228  	ADDZE R22, R26		// R26 = x[i+2] + CA
   229  	ADDZE R23, R27		// R27 = x[i+3] + CA
   230  	MOVD  R24, 8(R10)	// z[i]
   231  	MOVD  R25, 16(R10)	// z[i+1]
   232  	MOVD  R26, 24(R10)	// z[i+2]
   233  	MOVDU R27, 32(R10)	// z[i+3], R10 += 32
   234  	ADD   $-4, R11		// R11 -= 4 (words left)
   235  	BC    16, 0, loop	// bdnz
   236  
   237  	// We may have some elements to read
   238  	CMP R0, R11
   239  	BEQ final
   240  
        	// 1 to 3 words are left when control reaches tail
   241  tail:
   242  	MOVDU 8(R8), R20	// R20 = x[i]
   243  	ADDZE R20, R24		// R24 = x[i] + CA
   244  	ADD $-1, R11		// R11-- (words left)
   245  	MOVDU R24, 8(R10)	// z[i]
   246  	CMP R0, R11
   247  	BEQ final
   248  
   249  	MOVDU 8(R8), R20	// second leftover word, same sequence
   250  	ADDZE R20, R24
   251  	ADD $-1, R11
   252  	MOVDU R24, 8(R10)
   253  	CMP R0, R11
   254  	BEQ final
   255  
   256  	MOVD 8(R8), R20		// third and last leftover word
   257  	ADDZE R20, R24
   258  	MOVD R24, 8(R10)
   259  
   260  final:
   261  	ADDZE R0, R4		// c = CA
   262  done:
   263  	MOVD  R4, c+56(FP)
   264  	RET
   265  
   266  // func subVW(z, x []Word, y Word) (c Word)
        //
        // Subtracts the single word y from the vector x: z[0] = x[0] - y, and
        // every following z[i] = x[i] - borrow from the previous word. The
        // borrow out of the last word is returned in c (0 or 1). Only len(z)
        // is consulted. When len(z) == 0 nothing is stored and c = y.
        //
        // Registers: R11 = words still to process, R8/R10 = cursors into x/z,
        // R4 = y on entry and c on exit, CTR = number of 4-word loop
        // iterations. R0 is used as the constant zero. The borrow is carried
        // in CA with inverted sense (CA = 1 means "no borrow").
   267  TEXT ·subVW(SB), NOSPLIT, $0
   268  	MOVD  z+0(FP), R10	// R10 = z[]
   269  	MOVD  x+24(FP), R8	// R8 = x[]
   270  	MOVD  y+48(FP), R4	// R4 = y = c
   271  	MOVD  z_len+8(FP), R11	// R11 = z_len
   272  
   273  	CMP   R0, R11		// If z_len is zero, return
   274  	BEQ   done
   275  
   276  	// We will process the first iteration out of the loop so we capture
   277  	// the value of c. In the subsequent iterations, we will rely on the
   278  	// value of CA set here.
   279  	MOVD  0(R8), R20	// R20 = x[0]
   280  	ADD   $-1, R11		// R11 = z_len - 1 (words left)
   281  	SUBC  R4, R20, R6	// R6 = x[0] - y, set CA (1 = no borrow)
   282  	CMP   R0, R11		// If z_len was 1, we are done
   283  	MOVD  R6, 0(R10)	// z[0]
   284  	BEQ   final
   285  
   286  	// We will read 4 elements per iteration
   287  	SRD   $2, R11, R9	// R9 = (z_len-1)/4, whole groups of 4 words left
   288  	DCBT  (R8)		// cache-touch hint for the start of x
   289  	CMP   R0, R9
   290  	MOVD  R9, CTR		// Set up the loop counter
   291  	BEQ   tail		// If R9 = 0, we can't use the loop
   292  
   293  	// The loop here is almost the same as the one used in s390x, but
   294  	// we don't need to capture CA every iteration because we've already
   295  	// done that above.
   296  
   297  	PCALIGN $16
   298  loop:
   299  	MOVD  8(R8), R20	// R20 = x[i]
   300  	MOVD  16(R8), R21	// R21 = x[i+1]
   301  	MOVD  24(R8), R22	// R22 = x[i+2]
   302  	MOVDU 32(R8), R23	// R23 = x[i+3], R8 += 32
   303  	SUBE  R0, R20		// R20 = x[i] - 0 - borrow (borrow = !CA)
   304  	SUBE  R0, R21		// R21 = x[i+1] - borrow
   305  	SUBE  R0, R22		// R22 = x[i+2] - borrow
   306  	SUBE  R0, R23		// R23 = x[i+3] - borrow
   307  	MOVD  R20, 8(R10)	// z[i]
   308  	MOVD  R21, 16(R10)	// z[i+1]
   309  	MOVD  R22, 24(R10)	// z[i+2]
   310  	MOVDU R23, 32(R10)	// z[i+3], R10 += 32
   311  	ADD   $-4, R11		// R11 -= 4 (words left)
   312  	BC    16, 0, loop	// bdnz
   313  
   314  	// We may have some elements to read
   315  	CMP   R0, R11
   316  	BEQ   final
   317  
        	// 1 to 3 words are left when control reaches tail
   318  tail:
   319  	MOVDU 8(R8), R20	// R20 = x[i]
   320  	SUBE  R0, R20		// R20 = x[i] - borrow
   321  	ADD   $-1, R11		// R11-- (words left)
   322  	MOVDU R20, 8(R10)	// z[i]
   323  	CMP   R0, R11
   324  	BEQ   final
   325  
   326  	MOVDU 8(R8), R20	// second leftover word, same sequence
   327  	SUBE  R0, R20
   328  	ADD   $-1, R11
   329  	MOVDU R20, 8(R10)
   330  	CMP   R0, R11
   331  	BEQ   final
   332  
   333  	MOVD  8(R8), R20	// third and last leftover word
   334  	SUBE  R0, R20
   335  	MOVD  R20, 8(R10)
   336  
   337  final:
   338  	// Capture CA
   339  	SUBE  R4, R4		// R4 = R4 - R4 - borrow = 0 or -1
   340  	NEG   R4, R4		// c = borrow = 0 or 1
   341  
   342  done:
   343  	MOVD  R4, c+56(FP)
   344  	RET
   345  
   346  //func shlVU(z, x []Word, s uint) (c Word)
        //
        // Shifts the vector x left by s bits into z, working from the most
        // significant word down so that bits flow from x[i-1] into z[i]:
        //   z[i] = x[i]<<s | x[i-1]>>(64-s)   for i = len(z)-1 .. 1
        //   z[0] = x[0]<<s
        // and returns the bits shifted out of the top word,
        // c = x[len(z)-1]>>(64-s). The shift path indexes x with len(z) only.
        //
        // Special cases, both returning c = 0:
        //   s == 0      -> plain copy of min(len(x), len(z)) words from x to z,
        //                  choosing the copy direction so overlap is handled;
        //                  nothing is touched when that count is zero
        //   len(z) == 0 -> nothing to do
        //
        // R0 is used as the constant zero.
   347  TEXT ·shlVU(SB), NOSPLIT, $0
   348  	MOVD    z+0(FP), R3
   349  	MOVD    x+24(FP), R6
   350  	MOVD    s+48(FP), R9
   351  	MOVD    z_len+8(FP), R4
   352  	MOVD    x_len+32(FP), R7
   353  	CMP     R9, R0          // s==0 copy(z,x)
   354  	BEQ     zeroshift
   355  	CMP     R4, R0          // len(z)==0 return
   356  	BEQ     done
   357  
   358  	ADD     $-1, R4, R5     // len(z)-1
   359  	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   360  	SLD     $3, R5, R7      // byte offset of the last word
   361  	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
   362  	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
   363  	MOVD    (R6)(R7), R14
   364  	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
   365  	CMP     R5, R0          // iterate from i=len(z)-1 to 0
   366  	BEQ     loopexit        // Already at end?
   367  	MOVD	0(R15),R10	// x[i]
   368  	PCALIGN $16
   369  shloop:
   370  	SLD     R9, R10, R10    // x[i]<<s
   371  	MOVDU   -8(R15), R14    // load x[i-1] and step R15 down to it
   372  	SRD     R4, R14, R11    // x[i-1]>>ŝ
   373  	OR      R11, R10, R10
   374  	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
   375  	MOVD	R14, R10	// reuse x[i-1] for next iteration
   376  	ADD     $-8, R16        // i--
   377  	CMP     R15, R6         // &x[i-1]>&x[0]?
   378  	BGT     shloop
   379  loopexit:
   380  	MOVD    0(R6), R4
   381  	SLD     R9, R4, R4
   382  	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
   383  	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
   384  	RET
   385  
   386  zeroshift:
   387  	CMP     R6, R0          // x is null, nothing to copy
   388  	BEQ     done
   389  	CMP     R6, R3          // if x is same as z, nothing to copy
   390  	BEQ     done
   391  	CMP     R7, R4
   392  	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
   393  	SLD     $3, R7, R7      // bytes to copy
        	CMP     R7, R0          // zero bytes: return now, because the copy
        	BEQ     done            // loops test at the bottom and would still move a word
   394  	SUB     R6, R3, R11     // dest - src
   395  	CMPU    R11, R7, CR2    // < len?
   396  	BLT     CR2, backward   // there is overlap, copy backwards
   397  	MOVD    $0, R14
   398  	// shlVU processes backwards, but added a forward copy option
   399  	// since its faster on POWER
   400  repeat:
   401  	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
   402  	MOVD    R15, (R3)(R14)
   403  	ADD     $8, R14
   404  	CMP     R14, R7         // More 8 bytes left?
   405  	BLT     repeat
   406  	BR      done
   407  backward:
   408  	ADD     $-8,R7, R14     // offset of the last word
   409  repeatback:
   410  	MOVD    (R6)(R14), R15  // copy x into z backwards
   411  	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
   412  	SUB     $8, R14
   413  	CMP     R14, $-8        // More 8 bytes left?
   414  	BGT     repeatback
   415  
   416  done:
   417  	MOVD    R0, c+56(FP)    // c=0
   418  	RET
   419  
   420  //func shrVU(z, x []Word, s uint) (c Word)
        //
        // Shifts the vector x right by s bits into z, working from the least
        // significant word up so that bits flow from x[i+1] into z[i]:
        //   z[i] = x[i]>>s | x[i+1]<<(64-s)   for i = 0 .. len(z)-2
        //   z[len(z)-1] = x[len(z)-1]>>s
        // and returns the bits shifted out of the bottom word,
        // c = x[0]<<(64-s). The shift path indexes x with len(z) only.
        //
        // For len(z) >= 3 the bulk of the work is done two words at a time
        // with VSX loads/stores and vector shifts; a scalar step handles a
        // leftover word pair and loopexit writes the top word.
        //
        // Special cases, both returning c = 0:
        //   s == 0      -> forward copy of min(len(x), len(z)) words from x to
        //                  z; nothing is touched when that count is zero
        //   len(z) == 0 -> nothing to do
        //
        // R0 is used as the constant zero.
   421  TEXT ·shrVU(SB), NOSPLIT, $0
   422  	MOVD    z+0(FP), R3
   423  	MOVD    x+24(FP), R6
   424  	MOVD    s+48(FP), R9
   425  	MOVD    z_len+8(FP), R4
   426  	MOVD    x_len+32(FP), R7
   427  
   428  	CMP     R9, R0          // s==0, copy(z,x)
   429  	BEQ     zeroshift
   430  	CMP     R4, R0          // len(z)==0 return
   431  	BEQ     done
   432  	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
   433  
   434  	MOVD    0(R6), R7
   435  	SLD     R5, R7, R7      // compute x[0]<<ŝ
   436  	MOVD    $1, R8          // iterate from i=1 to i<len(z)
   437  	CMP     R8, R4
   438  	BGE     loopexit        // Already at end?
   439  
   440  	// vectorize if len(z) is >=3, else jump to scalar loop
   441  	CMP     R4, $3
   442  	BLT     scalar
   443  	MTVSRD  R9, VS38        // s
   444  	VSPLTB  $7, V6, V4      // V4 = s as per-lane shift count (V6 is the register written as VS38)
   445  	MTVSRD  R5, VS39        // ŝ
   446  	VSPLTB  $7, V7, V2      // V2 = ŝ as per-lane shift count (V7 is the register written as VS39)
   447  	ADD     $-2, R4, R16    // R16 = len(z)-2, last i with a full pair
   448  	PCALIGN $16
   449  loopback:
   450  	ADD     $-1, R8, R10
   451  	SLD     $3, R10         // R10 = byte offset of word i-1
   452  	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
   453  	SLD     $3, R8, R12     // R12 = byte offset of word i
   454  	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]
   455  
   456  	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
   457  	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
   458  	VOR     V3, V5, V5      // Or(|) the two registers together
   459  	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
   460  	ADD     $2, R8          // Done processing 2 entries, i and i+1
   461  	CMP     R8, R16         // Are there at least a couple of more entries left?
   462  	BLE     loopback
   463  	CMP     R8, R4          // Are we at the last element?
   464  	BEQ     loopexit
   465  scalar:	
   466  	ADD     $-1, R8, R10
   467  	SLD     $3, R10
   468  	MOVD    (R6)(R10),R11
   469  	SRD     R9, R11, R11    // x[len(z)-2] >> s
   470  	SLD     $3, R8, R12
   471  	MOVD    (R6)(R12), R12
   472  	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
   473  	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   474  	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
   475  loopexit:
   476  	ADD     $-1, R4
   477  	SLD     $3, R4          // R4 = byte offset of the last word
   478  	MOVD    (R6)(R4), R5
   479  	SRD     R9, R5, R5      // x[len(z)-1]>>s
   480  	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
   481  	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
   482  	RET
   483  
   484  zeroshift:
   485  	CMP     R6, R0          // x is null, nothing to copy
   486  	BEQ     done
   487  	CMP     R6, R3          // if x is same as z, nothing to copy
   488  	BEQ     done
   489  	CMP     R7, R4
   490  	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
   491  	SLD     $3, R7, R7      // bytes to copy
        	CMP     R7, R0          // zero bytes: return now, because the copy
        	BEQ     done            // loop tests at the bottom and would still move a word
   492  	MOVD    $0, R14
   493  repeat:
   494  	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
   495  	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
   496  	ADD     $8, R14
   497  	CMP     R14, R7         // More 8 bytes left?
   498  	BLT     repeat
   499  done:
   500  	MOVD    R0, c+56(FP)
   501  	RET
   502  
   503  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // Multiplies the vector x by the single word y and adds r: starting
        // with c = r, each step forms the 128-bit value x[i]*y + c, stores its
        // low word in z[i] and keeps its high word as the next c. The final c
        // is returned. Only len(z) is consulted; when len(z) == 0 nothing is
        // stored and c = r.
        //
        // Registers: R11 = words still to process, R8/R10 = cursors into x/z,
        // R9 = y, R4 = running c, CTR = number of 4-word loop iterations.
        // R0 is used as the constant zero.
   504  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   505  	MOVD    z+0(FP), R10      // R10 = z[]
   506  	MOVD    x+24(FP), R8      // R8 = x[]
   507  	MOVD    y+48(FP), R9      // R9 = y
   508  	MOVD    r+56(FP), R4      // R4 = r = c
   509  	MOVD    z_len+8(FP), R11  // R11 = z_len
   510  
   511  	CMP     R0, R11           // If z_len is zero, return c = r
   512  	BEQ     done
   513  
        	// First word is handled outside the loop so the loop and tail can
        	// use pre-incrementing (MOVDU) addressing from offset 8.
   514  	MOVD    0(R8), R20        // R20 = x[0]
   515  	ADD     $-1, R11          // R11 = z_len - 1 (words left)
   516  	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   517  	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   518  	ADDC    R4, R6            // R6 = z0 + r
   519  	ADDZE   R7                // R7 = z1 + CA
   520  	CMP     R0, R11
   521  	MOVD    R7, R4            // R4 = c
   522  	MOVD    R6, 0(R10)        // z[i]
   523  	BEQ     done
   524  
   525  	// We will read 4 elements per iteration
   526  	SRD     $2, R11, R14      // R14 = (z_len-1)/4, whole groups of 4 words left
   527  	DCBT    (R8)              // cache-touch hint for the start of x
   528  	CMP     R0, R14
   529  	MOVD    R14, CTR          // Set up the loop counter
   530  	BEQ     tail              // If R14 = 0, we can't use the loop
   531  	PCALIGN $16
   532  
   533  loop:
   534  	MOVD    8(R8), R20        // R20 = x[i]
   535  	MOVD    16(R8), R21       // R21 = x[i+1]
   536  	MOVD    24(R8), R22       // R22 = x[i+2]
   537  	MOVDU   32(R8), R23       // R23 = x[i+3], R8 += 32
   538  	MULLD   R9, R20, R24      // R24 = z0[i]
   539  	MULHDU  R9, R20, R20      // R20 = z1[i]
   540  	ADDC    R4, R24           // R24 = z0[i] + c
   541  	ADDZE   R20               // R20 = z1[i] + CA
   542  	MULLD   R9, R21, R25      // R25 = z0[i+1]
   543  	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   544  	ADDC    R20, R25          // R25 = z0[i+1] + carry word from word i
   545  	ADDZE   R21               // R21 = z1[i+1] + CA
   546  	MULLD   R9, R22, R26      // R26 = z0[i+2]
   547  	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   548  	MULLD   R9, R23, R27      // R27 = z0[i+3]
   549  	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   550  	ADDC    R21, R26          // R26 = z0[i+2] + carry word from word i+1
   551  	ADDZE   R22               // R22 = z1[i+2] + CA
   552  	MOVD    R24, 8(R10)       // z[i]
   553  	MOVD    R25, 16(R10)      // z[i+1]
   554  	ADDC    R22, R27          // R27 = z0[i+3] + carry word from word i+2
   555  	ADDZE   R23,R4		  // update carry: c = z1[i+3] + CA
   556  	MOVD    R26, 24(R10)      // z[i+2]
   557  	MOVDU   R27, 32(R10)      // z[i+3], R10 += 32
   558  	ADD     $-4, R11          // R11 -= 4 (words left)
   559  	BC      16, 0, loop       // bdnz
   560  
   561  	// We may have some elements to read
   562  	CMP   R0, R11
   563  	BEQ   done
   564  
   565  	// Process the remaining elements, one at a time
        	// (1 to 3 words are left when control reaches tail)
   566  tail:
   567  	MOVDU   8(R8), R20        // R20 = x[i]
   568  	MULLD   R9, R20, R24      // R24 = z0[i]
   569  	MULHDU  R9, R20, R25      // R25 = z1[i]
   570  	ADD     $-1, R11          // R11-- (words left)
   571  	ADDC    R4, R24           // R24 = z0[i] + c
   572  	ADDZE   R25               // R25 = z1[i] + CA
   573  	MOVDU   R24, 8(R10)       // z[i]
   574  	CMP     R0, R11
   575  	MOVD    R25, R4           // R4 = c
   576  	BEQ     done              // If R11 = 0, we are done
   577  
   578  	MOVDU   8(R8), R20        // second leftover word, same sequence
   579  	MULLD   R9, R20, R24
   580  	MULHDU  R9, R20, R25
   581  	ADD     $-1, R11
   582  	ADDC    R4, R24
   583  	ADDZE   R25
   584  	MOVDU   R24, 8(R10)
   585  	CMP     R0, R11
   586  	MOVD    R25, R4
   587  	BEQ     done
   588  
   589  	MOVD    8(R8), R20        // third and last leftover word
   590  	MULLD   R9, R20, R24
   591  	MULHDU  R9, R20, R25
   592  	ADD     $-1, R11          // R11 is not read again after this point
   593  	ADDC    R4, R24
   594  	ADDZE   R25
   595  	MOVD    R24, 8(R10)
   596  	MOVD    R25, R4
   597  
   598  done:
   599  	MOVD    R4, c+64(FP)
   600  	RET
   601  
   602  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // Multiply-accumulate: starting with c = 0, each step forms the 128-bit
        // value x[i]*y + z[i] + c, stores its low word back into z[i] and keeps
        // its high word as the next c. The final c is returned. Only len(z) is
        // consulted; when len(z) == 0 nothing is stored and c = 0.
        //
        // Registers: R3 = byte offset of word i (shared by x and z),
        // R8/R10 = base of x/z, R9 = y, R4 = running c, CTR = words left.
        // R0 is used as the constant zero.
   603  TEXT ·addMulVVW(SB), NOSPLIT, $0
   604  	MOVD z+0(FP), R10	// R10 = z[]
   605  	MOVD x+24(FP), R8	// R8 = x[]
   606  	MOVD y+48(FP), R9	// R9 = y
   607  	MOVD z_len+8(FP), R22	// R22 = z_len
   608  
   609  	MOVD R0, R3		// R3 will be the index register
   610  	CMP  R0, R22
   611  	MOVD R0, R4		// R4 = c = 0
   612  	MOVD R22, CTR		// Initialize loop counter
   613  	BEQ  done		// z_len == 0: return c = 0
   614  	PCALIGN $16
   615  
   616  loop:
   617  	MOVD  (R8)(R3), R20	// Load x[i]
   618  	MOVD  (R10)(R3), R21	// Load z[i]
   619  	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   620  	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   621  	ADDC   R21, R6		// R6 = z0 = low + z[i], set CA
   622  	ADDZE  R7		// R7 = z1 = high + CA
   623  	ADDC   R4, R6		// R6 = z0 + c, set CA
   624  	ADDZE  R7, R4           // c = z1 + CA
   625  	MOVD   R6, (R10)(R3)	// Store z[i]
   626  	ADD    $8, R3		// advance to the next word
   627  	BC  16, 0, loop		// bdnz
   628  
   629  done:
   630  	MOVD R4, c+56(FP)
   631  	RET
   632  
   633