github.com/s1s1ty/go@v0.0.0-20180207192209-104445e3140f/src/math/big/arith_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
    13  TEXT ·mulWW(SB), NOSPLIT, $0	// z1:z0 = x * y as a two-word unsigned product
    14  	MOVD   y+8(FP), R9		// R9 = y
    15  	MOVD   x+0(FP), R8		// R8 = x
    16  	MULLD  R8, R9, R11		// R11 = low-order word of x*y
    17  	MULHDU R8, R9, R10		// R10 = high-order word of x*y (unsigned)
    18  	MOVD   R11, z0+24(FP)		// z0 = low word
    19  	MOVD   R10, z1+16(FP)		// z1 = high word
    20  	RET
    21  
    22  // func addVV(z, x, y []Word) (c Word)
    23  // z[i] = x[i] + y[i] for all i, carrying; c is the carry out of the last word
    24  TEXT ·addVV(SB), NOSPLIT, $0
    25  	MOVD  z+0(FP), R10      // R10 = &z[0]
    26  	MOVD  x+24(FP), R8      // R8 = &x[0]
    27  	MOVD  y+48(FP), R9      // R9 = &y[0]
    28  	MOVD  z_len+8(FP), R7   // R7 = len(z), the word count
    29  
    30  	MOVD  R0, R6            // R6 = byte offset of word i, starts at 0
    31  	MOVD  R0, R4            // R4 = 0, becomes c at addend
    32  	ADDC  R4, R4            // 0 + 0: leaves R4 = 0 and CA = 0
    33  	CMP   R0, R7            // len(z) == 0?
    34  
    35  	MOVD  R7, CTR           // CTR = iterations left
    36  	BEQ   addend            // empty vectors: c = 0
    37  
    38  addloop:
    39  	MOVD  (R9)(R6), R12     // y[i]
    40  	MOVD  (R8)(R6), R11     // x[i]
    41  	ADDE  R11, R12, R15     // y[i] + x[i] + CA, carry out goes back into CA
    42  	MOVD  R15, (R10)(R6)    // z[i]
    43  
    44  	ADD   $8, R6            // next word; a plain ADD so the carry chain in CA survives
    45  	BC    16, 0, addloop    // bdnz: CTR--, loop while CTR != 0
    46  
    47  addend:
    48  	ADDZE R4                // R4 = 0 + CA = final carry
    49  	MOVD  R4, c+72(FP)
    50  	RET
    51  
    52  // func subVV(z, x, y []Word) (c Word)
    53  // z[i] = x[i] - y[i] for all i, borrowing; c is the borrow out of the last word
    54  TEXT ·subVV(SB), NOSPLIT, $0
    55  	MOVD  z+0(FP), R10      // R10 = &z[0]
    56  	MOVD  x+24(FP), R8      // R8 = &x[0]
    57  	MOVD  y+48(FP), R9      // R9 = &y[0]
    58  	MOVD  z_len+8(FP), R7   // R7 = len(z), the word count
    59  
    60  	MOVD  R0, R6            // R6 = byte offset of word i, starts at 0
    61  	MOVD  R0, R4            // R4 = 0, turned into c at subdone
    62  	SUBC  R0, R0            // R0 - R0: R0 stays 0 and CA = 1, i.e. no borrow pending
    63  	CMP   R0, R7            // len(z) == 0?
    64  
    65  	MOVD  R7, CTR           // CTR = iterations left
    66  	BEQ   subdone           // empty vectors: c = 0
    67  
    68  // Only SUBE writes CA inside the loop, so unlike amd64 (which saves and restores CF
    69  // around its loop bookkeeping) the borrow is simply left in CA and recovered at the end.
    70  subnext:
    71  	MOVD  (R9)(R6), R12     // y[i]
    72  	MOVD  (R8)(R6), R11     // x[i]
    73  
    74  	SUBE  R12, R11, R15     // x[i] - y[i] - borrow; CA = 0 when this step borrows
    75  	MOVD  R15, (R10)(R6)    // z[i]
    76  
    77  	ADD   $8, R6            // next word
    78  	BC    16, 0, subnext    // bdnz: CTR--, loop while CTR != 0
    79  
    80  subdone:
    81  
    82  	ADDZE R4                // R4 = 0 + CA
    83  	XOR   $1, R4            // c = CA ^ 1: a clear CA means a borrow is outstanding
    84  	MOVD  R4, c+72(FP)
    85  	RET
    86  
    87  TEXT ·addVW(SB), NOSPLIT, $0	// frameless stub: no ppc64x assembly body for addVW
    88  	BR ·addVW_g(SB)	// branch straight to ·addVW_g (presumably the generic Go version in arith.go; confirm)
    89  
    90  TEXT ·subVW(SB), NOSPLIT, $0	// frameless stub: no ppc64x assembly body for subVW
    91  	BR ·subVW_g(SB)	// branch straight to ·subVW_g (presumably the generic Go version in arith.go; confirm)
    92  
    93  TEXT ·shlVU(SB), NOSPLIT, $0	// frameless stub: no ppc64x assembly body for shlVU
    94  	BR ·shlVU_g(SB)	// branch straight to ·shlVU_g (presumably the generic Go version in arith.go; confirm)
    95  
    96  TEXT ·shrVU(SB), NOSPLIT, $0	// frameless stub: no ppc64x assembly body for shrVU
    97  	BR ·shrVU_g(SB)	// branch straight to ·shrVU_g (presumably the generic Go version in arith.go; confirm)
    98  
    99  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
   100  TEXT ·mulAddVWW(SB), NOSPLIT, $0	// z = x*y + r word by word; c = word carried out of the top
   101  	MOVD z_len+8(FP), R11	// R11 = len(z)
   102  	MOVD r+56(FP), R4	// R4 = running carry c, seeded with r
   103  	MOVD y+48(FP), R9	// R9 = y, the single-word multiplier
   104  	MOVD x+24(FP), R8	// R8 = &x[0]
   105  	MOVD z+0(FP), R10	// R10 = &z[0]
   106  
   107  	MOVD R0, R3		// R3 = byte offset of word i
   108  	MOVD R11, CTR		// CTR = iterations left
   109  	CMP  R0, R11		// len(z) == 0?
   110  	BEQ  mulend		// yes: c = r
   111  
   112  mulnext:
   113  	MOVD   (R8)(R3), R20	// R20 = x[i]
   114  	MULHDU R9, R20, R7	// R7 = high word of x[i]*y
   115  	MULLD  R9, R20, R6	// R6 = low word of x[i]*y
   116  	ADDC   R4, R6		// R6 = low + c, carry out in CA
   117  	ADDZE  R7		// R7 = high + CA
   118  	MOVD   R7, R4		// c for the next word
   119  	MOVD   R6, (R10)(R3)	// z[i] = R6
   120  	ADD    $8, R3		// next word
   121  	BC  16, 0, mulnext	// bdnz: CTR--, loop while CTR != 0
   122  
   123  mulend:
   124  	MOVD R4, c+64(FP)
   125  	RET
   126  
   127  // func addMulVVW(z, x []Word, y Word) (c Word)
   128  TEXT ·addMulVVW(SB), NOSPLIT, $0	// z += x*y word by word; c = word carried out of the top
   129  	MOVD z_len+8(FP), R22	// R22 = len(z)
   130  	MOVD y+48(FP), R9	// R9 = y, the single-word multiplier
   131  	MOVD x+24(FP), R8	// R8 = &x[0]
   132  	MOVD z+0(FP), R10	// R10 = &z[0]
   133  
   134  	MOVD R0, R4		// R4 = running carry c, starts at 0
   135  	MOVD R0, R3		// R3 = byte offset of word i
   136  	MOVD R22, CTR		// CTR = iterations left
   137  	CMP  R0, R22		// len(z) == 0?
   138  	BEQ  accdone		// yes: c = 0
   139  
   140  accnext:
   141  	MOVD  (R10)(R3), R21	// R21 = z[i]
   142  	MOVD  (R8)(R3), R20	// R20 = x[i]
   143  	MULHDU R9, R20, R7	// R7 = high word of x[i]*y
   144  	MULLD  R9, R20, R6	// R6 = low word of x[i]*y
   145  	ADDC   R21, R6		// R6 = low + z[i], carry out in CA
   146  	ADDZE  R7		// R7 = high + CA
   147  	ADDC   R4, R6		// R6 += c, carry out in CA
   148  	ADDZE  R7, R4		// c = R7 + CA, for the next word
   149  	MOVD   R6, (R10)(R3)	// z[i] = R6
   150  	ADD    $8, R3		// next word
   151  	BC  16, 0, accnext	// bdnz: CTR--, loop while CTR != 0
   152  
   153  accdone:
   154  	MOVD R4, c+56(FP)
   155  	RET
   156  
   157  // func divWW(x1, x0, y Word) (q, r Word)
   158  TEXT ·divWW(SB), NOSPLIT, $0	// q, r = quotient and remainder of the two-word value x1:x0 divided by y
   159  	MOVD x1+0(FP), R4	// R4 = x1, high word of the dividend
   160  	MOVD x0+8(FP), R5	// R5 = x0, low word of the dividend
   161  	MOVD y+16(FP), R6	// R6 = y, the divisor
   162  
   163  	CMPU R4, R6	// unsigned compare of x1 with y
   164  	BGE  divbigger	// x1 >= y: quotient does not fit in one word
   165  
   166  	// 128-by-64-bit unsigned divide, from the programmer's note in ch. 3 of the ISA manual, p.74
   167  	DIVDEU R6, R4, R3	// R3 = q1 = (x1 << 64) / y
   168  	DIVDU  R6, R5, R7	// R7 = q2 = x0 / y
   169  	MULLD  R6, R3, R8	// R8 = low word of q1*y
   170  	MULLD  R6, R7, R20	// R20 = q2*y
   171  	SUB    R20, R5, R10	// R10 = r2 = x0 - q2*y
   172  	ADD    R7, R3, R3	// R3 = q1 + q2, the tentative quotient
   173  	SUB    R8, R10, R4	// R4 = r2 - q1*y mod 2^64 = r1 + r2, with r1 = (x1<<64) - q1*y
   174  	CMPU   R4, R10	// did r1 + r2 wrap around 2^64?
   175  	BLT    adjust	// sum below r2: it wrapped, so one correction step is needed
   176  	CMPU   R4, R6	// otherwise compare the tentative remainder with y
   177  	BLT    end	// remainder < y: quotient and remainder are final
   178  
   179  adjust:
   180  	MOVD $1, R21
   181  	ADD  R21, R3, R3	// q = q + 1
   182  	SUB  R6, R4, R4	// r = r - y
   183  
   184  end:
   185  	MOVD R3, q+24(FP)
   186  	MOVD R4, r+32(FP)
   187  
   188  	RET
   189  
   190  divbigger:
   191  	MOVD $-1, R7	// all-ones word
   192  	MOVD R7, q+24(FP)	// q = all ones
   193  	MOVD R7, r+32(FP)	// r = all ones
   194  	RET
   195  
   196  TEXT ·divWVW(SB), NOSPLIT, $0	// frameless stub: no ppc64x assembly body for divWVW
   197  	BR ·divWVW_g(SB)	// branch straight to ·divWVW_g (presumably the generic Go version in arith.go; confirm)