github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/math/big/arith_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word) -- unsigned x*y as a two-word result: z1 = high word, z0 = low word
    13  TEXT ·mulWW(SB), NOSPLIT, $0
    14  	MOVD   y+8(FP), R4      // R4 = y
    15  	MOVD   x+0(FP), R3      // R3 = x
    16  	MULLD  R3, R4, R5       // R5 = low 64 bits of x*y
    17  	MULHDU R3, R4, R6       // R6 = high 64 bits of x*y (unsigned)
    18  	MOVD   R5, z0+24(FP)    // return z0
    19  	MOVD   R6, z1+16(FP)    // return z1
    20  	RET
    21  
    22  // func addVV(z, x, y []Word) (c Word)
    23  // z[i] = x[i] + y[i] for all i < z_len, carrying; c is the carry out of the last word (0 or 1)
    24  TEXT ·addVV(SB), NOSPLIT, $0
    25  	MOVD  z_len+8(FP), R7   // R7 = z_len (the only length read); counts words remaining below
    26  	MOVD  x+24(FP), R8      // R8 = x[]
    27  	MOVD  y+48(FP), R9      // R9 = y[]
    28  	MOVD  z+0(FP), R10      // R10 = z[]
    29  
    30  	// If z_len = 0, we are done
    31  	CMP   R0, R7
    32  	MOVD  R0, R4          // R4 = c = 0 (R0 is used as the zero value)
    33  	BEQ   done
    34  
    35  	// Process the first iteration out of the loop so we can
    36  	// use MOVDU and avoid 3 index registers updates.
    37  	MOVD  0(R8), R11      // R11 = x[0]
    38  	MOVD  0(R9), R12      // R12 = y[0]
    39  	ADD   $-1, R7         // R7 = z_len - 1
    40  	ADDC  R12, R11, R15   // R15 = x[0] + y[0], set CA
    41  	CMP   R0, R7
    42  	MOVD  R15, 0(R10)     // z[0]
    43  	BEQ   final          // If z_len was 1, we are done
    44  
    45  	SRD   $2, R7, R5      // R5 = (z_len - 1)/4 = number of 4-word groups left
    46  	CMP   R0, R5
    47  	MOVD  R5, CTR         // Set up loop counter
    48  	BEQ   tail            // If R5 = 0, we can't use the loop
    49  
    50  	// Process 4 elements per iteration. Unrolling this loop
    51  	// means a performance trade-off: we will lose performance
    52  	// for small values of z_len (0.90x in the worst case), but
    53  	// gain significant performance as z_len increases (up to
    54  	// 1.45x).
    55  loop:
    56  	MOVD  8(R8), R11      // R11 = x[i] (R8 points at the last word consumed, so next is at +8)
    57  	MOVD  16(R8), R12     // R12 = x[i+1]
    58  	MOVD  24(R8), R14     // R14 = x[i+2]
    59  	MOVDU 32(R8), R15     // R15 = x[i+3]; R8 += 32
    60  	MOVD  8(R9), R16      // R16 = y[i]
    61  	MOVD  16(R9), R17     // R17 = y[i+1]
    62  	MOVD  24(R9), R18     // R18 = y[i+2]
    63  	MOVDU 32(R9), R19     // R19 = y[i+3]; R9 += 32
    64  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    65  	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
    66  	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
    67  	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
    68  	MOVD  R20, 8(R10)     // z[i]
    69  	MOVD  R21, 16(R10)    // z[i+1]
    70  	MOVD  R22, 24(R10)    // z[i+2]
    71  	MOVDU R23, 32(R10)    // z[i+3]; R10 += 32
    72  	ADD   $-4, R7         // R7 -= 4 words remaining; the carry chain relies on CA surviving this
    73  	BC  16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero
    74  
    75  	// We may have more elements to read
    76  	CMP   R0, R7
    77  	BEQ   final
    78  
    79  	// Process the remaining elements (at most three), one at a time
    80  tail:
    81  	MOVDU 8(R8), R11      // R11 = x[i]
    82  	MOVDU 8(R9), R16      // R16 = y[i]
    83  	ADD   $-1, R7         // R7 = words remaining - 1
    84  	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
    85  	CMP   R0, R7
    86  	MOVDU R20, 8(R10)     // z[i]
    87  	BEQ   final           // If R7 = 0, we are done
    88  
    89  	MOVDU 8(R8), R11      // second leftover word: same steps as above
    90  	MOVDU 8(R9), R16
    91  	ADD   $-1, R7
    92  	ADDE  R11, R16, R20
    93  	CMP   R0, R7
    94  	MOVDU R20, 8(R10)
    95  	BEQ   final
    96  
    97  	MOVD  8(R8), R11      // third and last leftover word: no pointer or counter update needed
    98  	MOVD  8(R9), R16
    99  	ADDE  R11, R16, R20
   100  	MOVD  R20, 8(R10)
   101  
   102  final:
   103  	ADDZE R4              // R4 = 0 + CA: capture the final carry as c
   104  
   105  done:
   106  	MOVD  R4, c+72(FP)
   107  	RET
   108  
   109  // func subVV(z, x, y []Word) (c Word)
   110  // z[i] = x[i] - y[i] for all i < z_len, carrying; c is the borrow out of the last word (0 or 1)
   111  TEXT ·subVV(SB), NOSPLIT, $0
   112  	MOVD  z_len+8(FP), R7 // R7 = z_len (the only length read); counts words remaining below
   113  	MOVD  x+24(FP), R8    // R8 = x[]
   114  	MOVD  y+48(FP), R9    // R9 = y[]
   115  	MOVD  z+0(FP), R10    // R10 = z[]
   116  
   117  	// If z_len = 0, we are done
   118  	CMP   R0, R7
   119  	MOVD  R0, R4          // R4 = c = 0 (R0 is used as the zero value)
   120  	BEQ   done
   121  
   122  	// Process the first iteration out of the loop so we can
   123  	// use MOVDU and avoid 3 index registers updates.
   124  	MOVD  0(R8), R11      // R11 = x[0]
   125  	MOVD  0(R9), R12      // R12 = y[0]
   126  	ADD   $-1, R7         // R7 = z_len - 1
   127  	SUBC  R12, R11, R15   // R15 = x[0] - y[0], set CA (CA = 1 means no borrow)
   128  	CMP   R0, R7
   129  	MOVD  R15, 0(R10)     // z[0]
   130  	BEQ   final           // If z_len was 1, we are done
   131  
   132  	SRD   $2, R7, R5      // R5 = (z_len - 1)/4 = number of 4-word groups left
   133  	CMP   R0, R5
   134  	MOVD  R5, CTR         // Set up loop counter
   135  	BEQ   tail            // If R5 = 0, we can't use the loop
   136  
   137  	// Process 4 elements per iteration. Unrolling this loop
   138  	// means a performance trade-off: we will lose performance
   139  	// for small values of z_len (0.92x in the worst case), but
   140  	// gain significant performance as z_len increases (up to
   141  	// 1.45x).
   142  loop:
   143  	MOVD  8(R8), R11      // R11 = x[i] (R8 points at the last word consumed, so next is at +8)
   144  	MOVD  16(R8), R12     // R12 = x[i+1]
   145  	MOVD  24(R8), R14     // R14 = x[i+2]
   146  	MOVDU 32(R8), R15     // R15 = x[i+3]; R8 += 32
   147  	MOVD  8(R9), R16      // R16 = y[i]
   148  	MOVD  16(R9), R17     // R17 = y[i+1]
   149  	MOVD  24(R9), R18     // R18 = y[i+2]
   150  	MOVDU 32(R9), R19     // R19 = y[i+3]; R9 += 32
   151  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (borrow = CA ^ 1)
   152  	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - borrow
   153  	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - borrow
   154  	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - borrow
   155  	MOVD  R20, 8(R10)     // z[i]
   156  	MOVD  R21, 16(R10)    // z[i+1]
   157  	MOVD  R22, 24(R10)    // z[i+2]
   158  	MOVDU R23, 32(R10)    // z[i+3]; R10 += 32
   159  	ADD   $-4, R7         // R7 -= 4 words remaining; the borrow chain relies on CA surviving this
   160  	BC  16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero
   161  
   162  	// We may have more elements to read
   163  	CMP   R0, R7
   164  	BEQ   final
   165  
   166  	// Process the remaining elements (at most three), one at a time
   167  tail:
   168  	MOVDU 8(R8), R11      // R11 = x[i]
   169  	MOVDU 8(R9), R16      // R16 = y[i]
   170  	ADD   $-1, R7         // R7 = words remaining - 1
   171  	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow
   172  	CMP   R0, R7
   173  	MOVDU R20, 8(R10)     // z[i]
   174  	BEQ   final           // If R7 = 0, we are done
   175  
   176  	MOVDU 8(R8), R11      // second leftover word: same steps as above
   177  	MOVDU 8(R9), R16
   178  	ADD   $-1, R7
   179  	SUBE  R16, R11, R20
   180  	CMP   R0, R7
   181  	MOVDU R20, 8(R10)
   182  	BEQ   final
   183  
   184  	MOVD  8(R8), R11      // third and last leftover word: no pointer or counter update needed
   185  	MOVD  8(R9), R16
   186  	SUBE  R16, R11, R20
   187  	MOVD  R20, 8(R10)
   188  
   189  final:
   190  	ADDZE R4              // R4 = 0 + CA
   191  	XOR   $1, R4          // c = CA ^ 1: the borrow is the inverse of the carry bit
   192  
   193  done:
   194  	MOVD  R4, c+72(FP)
   195  	RET
   196  
   197  // func addVW(z, x []Word, y Word) (c Word) -- z[i] = x[i] + carry-in, with y as the carry into word 0; c is the final carry (y itself if z_len = 0)
   198  TEXT ·addVW(SB), NOSPLIT, $0
   199  	MOVD z+0(FP), R10	// R10 = z[]
   200  	MOVD x+24(FP), R8	// R8 = x[]
   201  	MOVD y+48(FP), R4	// R4 = y = c
   202  	MOVD z_len+8(FP), R11	// R11 = z_len (the only length read)
   203  
   204  	CMP   R0, R11		// If z_len is zero, return (c = y)
   205  	BEQ   done
   206  
   207  	// We will process the first iteration out of the loop so we capture
   208  	// the value of c. In the subsequent iterations, we will rely on the
   209  	// value of CA set here.
   210  	MOVD  0(R8), R20	// R20 = x[0]
   211  	ADD   $-1, R11		// R11 = z_len - 1
   212  	ADDC  R20, R4, R6	// R6 = x[0] + c, set CA
   213  	CMP   R0, R11		// If z_len was 1, we are done
   214  	MOVD  R6, 0(R10)	// z[0]
   215  	BEQ   final
   216  
   217  	// We will read 4 elements per iteration
   218  	SRD   $2, R11, R9	// R9 = (z_len - 1)/4 = number of 4-word groups left
   219  	DCBT  (R8)		// Cache-touch hint on the x data
   220  	CMP   R0, R9
   221  	MOVD  R9, CTR		// Set up the loop counter
   222  	BEQ   tail		// If R9 = 0, we can't use the loop
   223  
   224  loop:
   225  	MOVD  8(R8), R20	// R20 = x[i] (R8 points at the last word consumed)
   226  	MOVD  16(R8), R21	// R21 = x[i+1]
   227  	MOVD  24(R8), R22	// R22 = x[i+2]
   228  	MOVDU 32(R8), R23	// R23 = x[i+3]; R8 += 32
   229  	ADDZE R20, R24		// R24 = x[i] + CA
   230  	ADDZE R21, R25		// R25 = x[i+1] + CA
   231  	ADDZE R22, R26		// R26 = x[i+2] + CA
   232  	ADDZE R23, R27		// R27 = x[i+3] + CA
   233  	MOVD  R24, 8(R10)	// z[i]
   234  	MOVD  R25, 16(R10)	// z[i+1]
   235  	MOVD  R26, 24(R10)	// z[i+2]
   236  	MOVDU R27, 32(R10)	// z[i+3]; R10 += 32
   237  	ADD   $-4, R11		// R11 -= 4 words remaining
   238  	BC    16, 0, loop	// bdnz: decrement CTR, loop while it is non-zero
   239  
   240  	// We may have some elements to read
   241  	CMP R0, R11
   242  	BEQ final
   243  
   244  tail:
   245  	MOVDU 8(R8), R20	// first leftover word (at most three remain)
   246  	ADDZE R20, R24		// R24 = x[i] + CA
   247  	ADD $-1, R11
   248  	MOVDU R24, 8(R10)	// z[i]
   249  	CMP R0, R11
   250  	BEQ final
   251  
   252  	MOVDU 8(R8), R20	// second leftover word
   253  	ADDZE R20, R24
   254  	ADD $-1, R11
   255  	MOVDU R24, 8(R10)
   256  	CMP R0, R11
   257  	BEQ final
   258  
   259  	MOVD 8(R8), R20		// third and last leftover word: no pointer or counter update needed
   260  	ADDZE R20, R24
   261  	MOVD R24, 8(R10)
   262  
   263  final:
   264  	ADDZE R0, R4		// c = 0 + CA
   265  done:
   266  	MOVD  R4, c+56(FP)
   267  	RET
   268  
   269  // func subVW(z, x []Word, y Word) (c Word) -- z[i] = x[i] - borrow-in, with y as the borrow into word 0; c is the final borrow (y itself if z_len = 0)
   270  TEXT ·subVW(SB), NOSPLIT, $0
   271  	MOVD  z+0(FP), R10	// R10 = z[]
   272  	MOVD  x+24(FP), R8	// R8 = x[]
   273  	MOVD  y+48(FP), R4	// R4 = y = c
   274  	MOVD  z_len+8(FP), R11	// R11 = z_len (the only length read)
   275  
   276  	CMP   R0, R11		// If z_len is zero, return (c = y)
   277  	BEQ   done
   278  
   279  	// We will process the first iteration out of the loop so we capture
   280  	// the value of c. In the subsequent iterations, we will rely on the
   281  	// value of CA set here.
   282  	MOVD  0(R8), R20	// R20 = x[0]
   283  	ADD   $-1, R11		// R11 = z_len - 1
   284  	SUBC  R4, R20, R6	// R6 = x[0] - c, set CA (CA = 1 means no borrow)
   285  	CMP   R0, R11		// If z_len was 1, we are done
   286  	MOVD  R6, 0(R10)	// z[0]
   287  	BEQ   final
   288  
   289  	// We will read 4 elements per iteration
   290  	SRD   $2, R11, R9	// R9 = (z_len - 1)/4 = number of 4-word groups left
   291  	DCBT  (R8)		// Cache-touch hint on the x data
   292  	CMP   R0, R9
   293  	MOVD  R9, CTR		// Set up the loop counter
   294  	BEQ   tail		// If R9 = 0, we can't use the loop
   295  
   296  	// The loop here is almost the same as the one used in s390x, but
   297  	// we don't need to capture CA every iteration because we've already
   298  	// done that above.
   299  loop:
   300  	MOVD  8(R8), R20	// R20 = x[i] (R8 points at the last word consumed)
   301  	MOVD  16(R8), R21	// R21 = x[i+1]
   302  	MOVD  24(R8), R22	// R22 = x[i+2]
   303  	MOVDU 32(R8), R23	// R23 = x[i+3]; R8 += 32
   304  	SUBE  R0, R20		// R20 = x[i] - 0 - borrow (borrow = CA ^ 1; R0 is used as zero)
   305  	SUBE  R0, R21		// R21 = x[i+1] - borrow
   306  	SUBE  R0, R22		// R22 = x[i+2] - borrow
   307  	SUBE  R0, R23		// R23 = x[i+3] - borrow
   308  	MOVD  R20, 8(R10)	// z[i]
   309  	MOVD  R21, 16(R10)	// z[i+1]
   310  	MOVD  R22, 24(R10)	// z[i+2]
   311  	MOVDU R23, 32(R10)	// z[i+3]; R10 += 32
   312  	ADD   $-4, R11		// R11 -= 4 words remaining
   313  	BC    16, 0, loop	// bdnz: decrement CTR, loop while it is non-zero
   314  
   315  	// We may have some elements to read
   316  	CMP   R0, R11
   317  	BEQ   final
   318  
   319  tail:
   320  	MOVDU 8(R8), R20	// first leftover word (at most three remain)
   321  	SUBE  R0, R20		// R20 = x[i] - borrow
   322  	ADD   $-1, R11
   323  	MOVDU R20, 8(R10)	// z[i]
   324  	CMP   R0, R11
   325  	BEQ   final
   326  
   327  	MOVDU 8(R8), R20	// second leftover word
   328  	SUBE  R0, R20
   329  	ADD   $-1, R11
   330  	MOVDU R20, 8(R10)
   331  	CMP   R0, R11
   332  	BEQ   final
   333  
   334  	MOVD  8(R8), R20	// third and last leftover word: no pointer or counter update needed
   335  	SUBE  R0, R20
   336  	MOVD  R20, 8(R10)
   337  
   338  final:
   339  	// Capture CA: SUBE of a register from itself yields CA - 1 (0 or -1); NEG turns that into the borrow (0 or 1)
   340  	SUBE  R4, R4
   341  	NEG   R4, R4
   342  
   343  done:
   344  	MOVD  R4, c+56(FP)
   345  	RET
   346  
   347  TEXT ·shlVU(SB), NOSPLIT, $0	// No assembly body here: shlVU only forwards to shlVU_g
   348  	BR ·shlVU_g(SB)		// Unconditional branch (not a call) to shlVU_g; presumably the generic Go version -- confirm in arith.go
   349  
   350  TEXT ·shrVU(SB), NOSPLIT, $0	// No assembly body here: shrVU only forwards to shrVU_g
   351  	BR ·shrVU_g(SB)		// Unconditional branch (not a call) to shrVU_g; presumably the generic Go version -- confirm in arith.go
   352  
   353  // func mulAddVWW(z, x []Word, y, r Word) (c Word) -- z[i] = low word of x[i]*y + c, where c starts as r and then becomes the high word plus carry; returns the last c (r itself if z_len = 0)
   354  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   355  	MOVD    z+0(FP), R10      // R10 = z[]
   356  	MOVD    x+24(FP), R8      // R8 = x[]
   357  	MOVD    y+48(FP), R9      // R9 = y
   358  	MOVD    r+56(FP), R4      // R4 = r = c
   359  	MOVD    z_len+8(FP), R11  // R11 = z_len (the only length read)
   360  
   361  	CMP     R0, R11           // If z_len is zero, return (c = r)
   362  	BEQ     done
   363  
   364  	MOVD    0(R8), R20        // R20 = x[0]; first word is done outside the loop so MOVDU can be used after it
   365  	ADD     $-1, R11          // R11 = z_len - 1
   366  	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
   367  	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
   368  	ADDC    R4, R6            // R6 = z0 + r, set CA
   369  	ADDZE   R7                // R7 = z1 + CA
   370  	CMP     R0, R11
   371  	MOVD    R7, R4            // R4 = c
   372  	MOVD    R6, 0(R10)        // z[0]
   373  	BEQ     done              // If z_len was 1, we are done
   374  
   375  	// We will read 4 elements per iteration
   376  	SRD     $2, R11, R14      // R14 = (z_len - 1)/4 = number of 4-word groups left
   377  	DCBT    (R8)              // Cache-touch hint on the x data
   378  	CMP     R0, R14
   379  	MOVD    R14, CTR          // Set up the loop counter
   380  	BEQ     tail              // If R14 = 0, we can't use the loop
   381  
   382  loop:
   383  	MOVD    8(R8), R20        // R20 = x[i] (R8 points at the last word consumed)
   384  	MOVD    16(R8), R21       // R21 = x[i+1]
   385  	MOVD    24(R8), R22       // R22 = x[i+2]
   386  	MOVDU   32(R8), R23       // R23 = x[i+3]; R8 += 32
   387  	MULLD   R9, R20, R24      // R24 = z0[i]
   388  	MULHDU  R9, R20, R20      // R20 = z1[i]
   389  	ADDC    R4, R24           // R24 = z0[i] + c
   390  	ADDZE   R20               // R20 = z1[i] + CA
   391  	MULLD   R9, R21, R25      // R25 = z0[i+1]
   392  	MULHDU  R9, R21, R21      // R21 = z1[i+1]
   393  	ADDC    R20, R25          // R25 = z0[i+1] + carry word from i
   394  	ADDZE   R21               // R21 = z1[i+1] + CA
   395  	MULLD   R9, R22, R26      // R26 = z0[i+2]
   396  	MULHDU  R9, R22, R22      // R22 = z1[i+2]
   397  	MULLD   R9, R23, R27      // R27 = z0[i+3]
   398  	MULHDU  R9, R23, R23      // R23 = z1[i+3]
   399  	ADDC    R21, R26          // R26 = z0[i+2] + carry word from i+1
   400  	ADDZE   R22               // R22 = z1[i+2] + CA
   401  	MOVD    R24, 8(R10)       // z[i]
   402  	MOVD    R25, 16(R10)      // z[i+1]
   403  	ADDC    R22, R27          // R27 = z0[i+3] + carry word from i+2
   404  	ADDZE   R23,R4		  // update carry: R4 = c = z1[i+3] + CA
   405  	MOVD    R26, 24(R10)      // z[i+2]
   406  	MOVDU   R27, 32(R10)      // z[i+3]; R10 += 32
   407  	ADD     $-4, R11          // R11 -= 4 words remaining
   408  	BC      16, 0, loop       // bdnz: decrement CTR, loop while it is non-zero
   409  
   410  	// We may have some elements to read
   411  	CMP   R0, R11
   412  	BEQ   done
   413  
   414  	// Process the remaining elements (at most three), one at a time
   415  tail:
   416  	MOVDU   8(R8), R20        // R20 = x[i]
   417  	MULLD   R9, R20, R24      // R24 = z0[i]
   418  	MULHDU  R9, R20, R25      // R25 = z1[i]
   419  	ADD     $-1, R11          // R11 = words remaining - 1
   420  	ADDC    R4, R24           // R24 = z0[i] + c
   421  	ADDZE   R25               // R25 = z1[i] + CA
   422  	MOVDU   R24, 8(R10)       // z[i]
   423  	CMP     R0, R11
   424  	MOVD    R25, R4           // R4 = c
   425  	BEQ     done              // If R11 = 0, we are done
   426  
   427  	MOVDU   8(R8), R20        // second leftover word: same steps as above
   428  	MULLD   R9, R20, R24
   429  	MULHDU  R9, R20, R25
   430  	ADD     $-1, R11
   431  	ADDC    R4, R24
   432  	ADDZE   R25
   433  	MOVDU   R24, 8(R10)
   434  	CMP     R0, R11
   435  	MOVD    R25, R4
   436  	BEQ     done
   437  
   438  	MOVD    8(R8), R20        // third and last leftover word
   439  	MULLD   R9, R20, R24
   440  	MULHDU  R9, R20, R25
   441  	// No R11 update here: this is the last possible word and R11 is not read again.
   442  	ADDC    R4, R24
   443  	ADDZE   R25
   444  	MOVD    R24, 8(R10)
   445  	MOVD    R25, R4
   446  
   447  done:
   448  	MOVD    R4, c+64(FP)
   449  	RET
   450  
   451  // func addMulVVW(z, x []Word, y Word) (c Word) -- z[i] += x[i]*y for all i < z_len, propagating the carry word; c is the carry word left after the last element
   452  TEXT ·addMulVVW(SB), NOSPLIT, $0
   453  	MOVD z+0(FP), R10	// R10 = z[]
   454  	MOVD x+24(FP), R8	// R8 = x[]
   455  	MOVD y+48(FP), R9	// R9 = y
   456  	MOVD z_len+8(FP), R22	// R22 = z_len (the only length read)
   457  
   458  	MOVD R0, R3		// R3 will be the index register (byte offset, starts at 0)
   459  	CMP  R0, R22
   460  	MOVD R0, R4		// R4 = c = 0
   461  	MOVD R22, CTR		// Initialize loop counter: one iteration per word
   462  	BEQ  done		// If z_len is zero, return c = 0
   463  
   464  loop:
   465  	MOVD  (R8)(R3), R20	// Load x[i]
   466  	MOVD  (R10)(R3), R21	// Load z[i]
   467  	MULLD  R9, R20, R6	// R6 = Low-order(x[i]*y)
   468  	MULHDU R9, R20, R7	// R7 = High-order(x[i]*y)
   469  	ADDC   R21, R6		// R6 = z0 = Low-order + z[i], set CA
   470  	ADDZE  R7		// R7 = z1 = High-order + CA
   471  	ADDC   R4, R6		// R6 = z0 + c, set CA
   472  	ADDZE  R7, R4           // R4 = c = z1 + CA (carry word for the next element)
   473  	MOVD   R6, (R10)(R3)	// Store z[i]
   474  	ADD    $8, R3		// Advance the byte offset by one 8-byte word
   475  	BC  16, 0, loop		// bdnz: decrement CTR, loop while it is non-zero
   476  
   477  done:
   478  	MOVD R4, c+56(FP)
   479  	RET
   480  
   481