github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/math/big/arith_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        // Unsigned 64x64 -> 128-bit multiply: z1 is the high word of x*y,
        // z0 is the low word.
    13  TEXT ·mulWW(SB), NOSPLIT, $0
    14  	MOVD   y+8(FP), R5
    15  	MOVD   x+0(FP), R4
    16  	MULLD  R5, R4, R7     // R7 = low 64 bits of x*y
    17  	MULHDU R5, R4, R6     // R6 = high 64 bits of x*y (unsigned)
    18  	MOVD   R7, z0+24(FP)
    19  	MOVD   R6, z1+16(FP)
    20  	RET
    21  
    22  // func addVV(z, x, y []Word) (c Word)
    23  // z[i] = x[i] + y[i] for all i, carrying
        // Processes z_len words. The running carry lives in the CA bit between
        // iterations; the final carry is returned in c.
    24  TEXT ·addVV(SB), NOSPLIT, $0
    25  	MOVD  z_len+8(FP), R7  // R7 = number of words to process
    26  	MOVD  x+24(FP), R8     // R8 = base address of x
    27  	MOVD  y+48(FP), R9     // R9 = base address of y
    28  	MOVD  z+0(FP), R10     // R10 = base address of z
    29  
    30  	MOVD  R0, R4  // R4 = 0 (R0 is relied on to hold zero); becomes c at done
    31  	MOVD  R0, R6  // R6 will be the address index (byte offset of element i)
    32  	ADDC R4, R4   // clear CA: 0 + 0 has no carry-out
    33  	MOVD  R7, CTR // CTR = z_len, counted down by the bdnz below
    34  
    35  	CMP   R0, R7
    36  	BEQ   done    // z_len == 0: skip the loop, c stays 0
    37  
    38  loop:
    39  	MOVD  (R8)(R6), R11   // x[i]
    40  	MOVD  (R9)(R6), R12   // y[i]
    41  	ADDE  R12, R11, R15   // x[i] + y[i] + CA
    42  	MOVD  R15, (R10)(R6)  // z[i]
    43  
    44  	ADD $8, R6            // next word; the carry in CA has to survive this for the next ADDE
    45  	BC  16, 0, loop	// bdnz: decrement CTR, loop while it is non-zero
    46  
    47  done:
    48  	ADDZE R4              // R4 = 0 + CA, i.e. the final carry
    49  	MOVD  R4, c+72(FP)
    50  	RET
    51  
    52  // func subVV(z, x, y []Word) (c Word)
    53  // z[i] = x[i] - y[i] for all i, carrying
        // Processes z_len words. During the subtract chain CA acts as an
        // inverted borrow (CA = 1 means "no borrow"), which is why the result
        // c is computed as CA XOR 1 at the end.
    54  TEXT ·subVV(SB), NOSPLIT, $0
    55  	MOVD z_len+8(FP), R7  // R7 = number of words to process
    56  	MOVD x+24(FP), R8     // R8 = base address of x
    57  	MOVD y+48(FP), R9     // R9 = base address of y
    58  	MOVD z+0(FP), R10     // R10 = base address of z
    59  
    60  	MOVD  R0, R4  // c = 0
    61  	MOVD  R0, R6  // R6 = byte offset of element i
    62  	SUBC R0, R0  // 0 - 0: leaves CA = 1, i.e. "no borrow" (CA is set, not cleared)
    63  	MOVD  R7, CTR // CTR = z_len, counted down by the bdnz below
    64  
    65  	CMP R0, R7
    66  	BEQ  sublend  // z_len == 0: skip the loop; CA = 1 yields c = 0 below
    67  
    68  // amd64 saves and restores CF, but I believe they only have to do that because all of
    69  // their math operations clobber it - we should just be able to recover it at the end.
    70  subloop:
    71  	MOVD  (R8)(R6), R11 // x[i]
    72  	MOVD  (R9)(R6), R12 // y[i]
    73  
    74  	SUBE R12, R11, R15  // x[i] - y[i] - borrow, where borrow = 1 - CA
    75  	MOVD R15, (R10)(R6) // z[i]
    76  
    77  	ADD $8, R6          // next word; CA has to survive this for the next SUBE
    78  	BC  16, 0, subloop  // bdnz
    79  
    80  sublend:
    81  
    82  	ADDZE R4            // R4 = 0 + CA
    83  	XOR   $1, R4        // invert: c = 1 when the last subtract borrowed
    84  	MOVD  R4, c+72(FP)
    85  	RET
    86  
    87  TEXT ·addVW(SB), NOSPLIT, $0
        	// No ppc64 assembly body: jump straight to ·addVW_g, which is defined
        	// outside this file (presumably the portable Go version in arith.go).
    88  	JMP ·addVW_g(SB)
    89  
    90  TEXT ·subVW(SB), NOSPLIT, $0
        	// No ppc64 assembly body: jump straight to ·subVW_g, which is defined
        	// outside this file (presumably the portable Go version in arith.go).
    91  	JMP ·subVW_g(SB)
    92  
    93  TEXT ·shlVU(SB), NOSPLIT, $0
        	// No ppc64 assembly body: jump straight to ·shlVU_g, which is defined
        	// outside this file (presumably the portable Go version in arith.go).
    94  	JMP ·shlVU_g(SB)
    95  
    96  TEXT ·shrVU(SB), NOSPLIT, $0
        	// No ppc64 assembly body: jump straight to ·shrVU_g, which is defined
        	// outside this file (presumably the portable Go version in arith.go).
    97  	JMP ·shrVU_g(SB)
    98  
    99  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // For each i < z_len: the 128-bit value x[i]*y + c is formed, its low word
        // is stored to z[i] and its high word becomes the next c. c starts as r
        // and its final value is returned.
   100  TEXT ·mulAddVWW(SB), NOSPLIT, $0
   101  	MOVD z+0(FP), R10     // R10 = base address of z
   102  	MOVD x+24(FP), R8     // R8 = base address of x
   103  	MOVD y+48(FP), R9     // R9 = multiplier y
   104  	MOVD r+56(FP), R4     // c = r
   105  	MOVD z_len+8(FP), R11 // R11 = number of words to process
   106  	MOVD $0, R3           // i = 0
   107  	MOVD $8, R18          // R18 = 8, bytes per word (index scale)
   108  	MOVD $1, R19          // R19 = 1, index increment
   109  
   110  	JMP e5                // test the loop condition first (handles z_len == 0)
   111  
   112  l5:
   113  	MULLD  R18, R3, R5    // R5 = i * 8, byte offset of element i
   114  	MOVD   (R8)(R5), R20  // R20 = x[i]
   115  	MULLD  R9, R20, R6    // R6 = low word of x[i]*y
   116  	MULHDU R9, R20, R7    // R7 = high word of x[i]*y
   117  	ADDC   R4, R6         // low += c, carry-out in CA
   118  	ADDZE  R7             // high += CA
   119  	MOVD   R6, (R10)(R5)  // z[i] = low
   120  	MOVD   R7, R4         // c = high
   121  	ADD    R19, R3        // i++
   122  
   123  e5:
   124  	CMP R3, R11
   125  	BLT l5                // loop while i < z_len
   126  
   127  	MOVD R4, c+64(FP)
   128  	RET
   129  
   130  // func addMulVVW(z, x []Word, y Word) (c Word)
        // For each i < z_len: z[i] = low word of (x[i]*y + z[i] + c), and the high
        // word becomes the next c. c starts at 0 and its final value is returned.
        // Elements are handled two at a time while at least two remain below
        // z_len rounded down to even; a one-element loop finishes the rest.
   131  TEXT ·addMulVVW(SB), NOSPLIT, $0
   132  	MOVD z+0(FP), R10     // R10 = base address of z
   133  	MOVD x+24(FP), R8     // R8 = base address of x
   134  	MOVD y+48(FP), R9     // R9 = multiplier y
   135  	MOVD z_len+8(FP), R22 // R22 = number of words to process
   136  
   137  	MOVD $0, R5   // i = 0
   138  	MOVD $0, R4   // c = 0
   139  	MOVD $8, R28  // R28 = 8, bytes per word (offset from z[i] to z[i+1])
   140  	MOVD $-2, R23
   141  	AND  R22, R23 // mask the last bit of z.len: R23 = z_len rounded down to even
   142  	MOVD $2, R24  // R24 = 2, unrolled-loop stride
   143  	CMP  R23, R24
   144  	BGE  unrolled // at least one full pair to process
   145  	JMP  end      // fewer than two elements: go straight to the tail loop
   146  
   147  unrolled:
   148  	MOVD  $8, R19         // no (RA)(RB*8) on power
   149  	MULLD R5, R19         // R19 = i * 8, byte offset of element i
   150  	MOVD  (R10)(R19), R11 // R11 = z[i]
   151  	MOVD  (R8)(R19), R16  // R16 = x[i]
   152  	ADD   R28, R19, R25   // R25 = byte offset of element i+1
   153  	MOVD  (R10)(R25), R17 // R17 = z[i+1]
   154  	MOVD  (R8)(R25), R18  // R18 = x[i+1]
   155  
   156  	MULLD  R9, R16, R12    // R12 = low word of x[i]*y
   157  	MULHDU R9, R16, R14    // R14 = high word of x[i]*y
   158  	MULLD  R9, R18, R6     // R6 = low word of x[i+1]*y
   159  	MULHDU R9, R18, R7     // R7 = high word of x[i+1]*y
   160  	ADDC   R4, R12         // low0 += incoming carry c
   161  	ADDZE  R14             // high0 += CA
   162  	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
   163  	ADDZE  R14             // carry = high order bits + add carry
   164  	MOVD   R12, (R10)(R19)
   165  	ADDC   R14, R6         // low1 += carry out of element i
   166  	ADDZE  R7              // high1 += CA
   167  	ADDC   R17, R6         // low1 += z[i+1]
   168  	ADDZE  R7              // high1 += CA
   169  	MOVD   R6, (R10)(R25)  // z[i+1] = low1
   170  	MOVD   R7, R4          // c = high1
   171  
   172  	ADD R24, R5            // i += 2
   173  	CMP R5, R23
   174  	BLT unrolled           // more full pairs remain
   175  	JMP end
   176  
   177  loop:
        	// Tail: same computation as above for a single element.
   178  	MOVD   $8, R19
   179  	MULLD  R5, R19         // R19 = i * 8, byte offset of element i
   180  	MOVD   (R10)(R19), R11 // R11 = z[i]
   181  	MOVD   (R8)(R19), R16  // R16 = x[i]
   182  	MULLD  R9, R16, R12    // R12 = low word of x[i]*y
   183  	MULHDU R9, R16, R14    // R14 = high word of x[i]*y
   184  	ADDC   R4, R12         // low += c
   185  	ADDZE  R14             // high += CA
   186  	ADDC   R11, R12        // low += z[i]
   187  	ADDZE  R14             // high += CA
   188  	MOVD   R12, (R10)(R19) // z[i] = low
   189  	MOVD   R14, R4         // c = high
   190  
   191  	MOVD $1, R15
   192  	ADD  R15, R5           // i++
   193  
   194  end:
   195  	CMP R5, R22
   196  	BLT loop               // loop while i < z_len
   197  
   198  	MOVD R4, c+56(FP)
   199  	RET
   200  
   201  // func divWW(x1, x0, y Word) (q, r Word)
        // Divides the two-word value x1:x0 by y, producing quotient q and
        // remainder r. When x1 >= y (unsigned) the quotient cannot fit in one
        // word; in that case both q and r are returned as all ones.
   202  TEXT ·divWW(SB), NOSPLIT, $0
   203  	MOVD x1+0(FP), R4
   204  	MOVD x0+8(FP), R5
   205  	MOVD y+16(FP), R6
   206  
   207  	CMPU R4, R6
   208  	BGE  divbigger       // x1 >= y: quotient overflow
   209  
   210  	// from the programmer's note in ch. 3 of the ISA manual, p.74
   211  	DIVDEU R6, R4, R3    // q1 = (x1 << 64) / y
   212  	DIVDU  R6, R5, R7    // q2 = x0 / y
   213  	MULLD  R6, R3, R8    // R8 = low word of q1 * y
   214  	MULLD  R6, R7, R20   // R20 = q2 * y
   215  	SUB    R20, R5, R10  // r2 = x0 - q2*y (remainder of the low-word divide)
   216  	ADD    R7, R3, R3    // q = q1 + q2
   217  	SUB    R8, R10, R4   // r = r2 - low(q1*y), i.e. the two partial remainders summed mod 2^64
   218  	CMPU   R4, R10
   219  	BLT    adjust        // r < r2: the remainder sum wrapped, so it is really >= y
   220  	CMPU   R4, R6
   221  	BLT    end           // r < y: q and r are already correct
   222  
   223  adjust:
        	// Remainder is too large by one y: bump the quotient and reduce r.
   224  	MOVD $1, R21
   225  	ADD  R21, R3, R3     // q++
   226  	SUB  R6, R4, R4      // r -= y
   227  
   228  end:
   229  	MOVD R3, q+24(FP)
   230  	MOVD R4, r+32(FP)
   231  
   232  	RET
   233  
   234  divbigger:
   235  	MOVD $-1, R7         // overflow result: all bits set
   236  	MOVD R7, q+24(FP)
   237  	MOVD R7, r+32(FP)
   238  	RET
   239  
   240  TEXT ·divWVW(SB), NOSPLIT, $0
        	// No ppc64 assembly body: jump straight to ·divWVW_g, which is defined
        	// outside this file (presumably the portable Go version in arith.go).
   241  	JMP ·divWVW_g(SB)