github.com/FenixAra/go@v0.0.0-20170127160404-96ea0918e670/src/math/big/arith_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        // Multiplies the two unsigned 64-bit words x and y and returns the
        // double-word product: z1 holds the upper 64 bits, z0 the lower 64 bits.
    13  TEXT ·mulWW(SB), NOSPLIT, $0
    14  	MOVD   x+0(FP), R4
    15  	MOVD   y+8(FP), R5
    16  	MULHDU R4, R5, R6 // R6 = high doubleword of the unsigned product x*y
    17  	MULLD  R4, R5, R7 // R7 = low doubleword of x*y
    18  	MOVD   R6, z1+16(FP)
    19  	MOVD   R7, z0+24(FP)
    20  	RET
    21  
    22  TEXT ·addVV(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to addVV_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
    23  	BR ·addVV_g(SB)
    24  
    25  // func subVV(z, x, y []Word) (c Word)
    26  // z[i] = x[i] - y[i] for all i, carrying
        // The borrow is passed from one word to the next through the CA bit, and
        // the borrow out of the last word is returned in c (1 if the subtraction
        // borrowed, 0 otherwise). The loop runs while i < z_len.
        //
        // Register use:
        //   R7  = z_len      R8  = base of x   R9 = base of y   R10 = base of z
        //   R5  = i          R6  = i*8 (byte offset of word i)
        //   R4  = c          R28 = 8, R29 = 1 (constants)
    27  TEXT ·subVV(SB), NOSPLIT, $0
    28  	MOVD z_len+8(FP), R7
    29  	MOVD x+24(FP), R8
    30  	MOVD y+48(FP), R9
    31  	MOVD z+0(FP), R10
    32  
    33  	MOVD $0, R4  // c = 0
    34  	MOVD $0, R5  // i = 0
    35  	MOVD $1, R29 // work around lack of ADDI
    36  	MOVD $8, R28 // work around lack of scaled addressing
    37  
    38  	SUBC R0, R0  // R0 - R0 leaves CA = 1, which for subtraction means "no borrow pending"
    39  	JMP  sublend
    40  
    41  // amd64 saves and restores CF, but I believe they only have to do that because all of
    42  // their math operations clobber it - we should just be able to recover it at the end.
        // The loop therefore relies on CA surviving from one SUBE to the next: the
        // index is advanced with ADD (not ADDC) so that the carry bit is not disturbed.
    43  subloop:
    44  	MULLD R5, R28, R6   // R6 = i*8, byte offset of word i
    45  	MOVD  (R8)(R6), R11 // x[i]
    46  	MOVD  (R9)(R6), R12 // y[i]
    47  
    48  	SUBE R12, R11, R15  // R15 = x[i] - y[i] - borrow; CA updated for the next word
    49  	MOVD R15, (R10)(R6) // z[i] = R15
    50  
    51  	ADD R29, R5 // i++
    52  
    53  sublend:
    54  	CMP R5, R7
    55  	BLT subloop // continue while i < z_len
    56  
    57  	ADDZE R4      // R4 = 0 + CA (1 means the last word did not borrow)
    58  	XOR   R29, R4 // flip the bit: c = 1 exactly when a borrow is outstanding
    59  	MOVD  R4, c+72(FP)
    60  	RET
    61  
    62  TEXT ·addVW(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to addVW_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
    63  	BR ·addVW_g(SB)
    64  
    65  TEXT ·subVW(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to subVW_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
    66  	BR ·subVW_g(SB)
    67  
    68  TEXT ·shlVU(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to shlVU_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
    69  	BR ·shlVU_g(SB)
    70  
    71  TEXT ·shrVU(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to shrVU_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
    72  	BR ·shrVU_g(SB)
    73  
    74  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // For each i < z_len, forms the double-word value x[i]*y + c (c starts as r),
        // stores its low word in z[i] and carries its high word into the next
        // iteration. The final high word is returned in c.
        //
        // Register use:
        //   R10 = base of z   R8 = base of x   R9 = y   R11 = z_len
        //   R3  = i           R5 = i*8 (byte offset of word i)
        //   R4  = c           R18 = 8, R19 = 1 (constants)
    75  TEXT ·mulAddVWW(SB), NOSPLIT, $0
    76  	MOVD z+0(FP), R10
    77  	MOVD x+24(FP), R8
    78  	MOVD y+48(FP), R9
    79  	MOVD r+56(FP), R4     // c = r
    80  	MOVD z_len+8(FP), R11
    81  	MOVD $0, R3           // i = 0
    82  	MOVD $8, R18
    83  	MOVD $1, R19
    84  
    85  	JMP e5
    86  
    87  l5:
    88  	MULLD  R18, R3, R5    // R5 = i*8
    89  	MOVD   (R8)(R5), R20  // R20 = x[i]
    90  	MULLD  R9, R20, R6    // R6 = low word of x[i]*y
    91  	MULHDU R9, R20, R7    // R7 = high word of x[i]*y (unsigned)
    92  	ADDC   R4, R6         // low word += c, carry out recorded in CA
    93  	ADDZE  R7             // high word += CA
    94  	MOVD   R6, (R10)(R5)  // z[i] = low word
    95  	MOVD   R7, R4         // c = high word
    96  	ADD    R19, R3        // i++
    97  
    98  e5:
    99  	CMP R3, R11
   100  	BLT l5                // continue while i < z_len
   101  
   102  	MOVD R4, c+64(FP)
   103  	RET
   104  
   105  // func addMulVVW(z, x []Word, y Word) (c Word)
        // For each i < z_len, forms the double-word value x[i]*y + z[i] + c (c starts
        // at 0), stores its low word back into z[i] and carries its high word into
        // the next iteration. The final high word is returned in c.
        //
        // The work is split in two: "unrolled" handles two words per pass over the
        // even-sized prefix of z (z_len with its low bit cleared), and "loop" then
        // handles whatever is left one word at a time.
        //
        // Register use:
        //   R10 = base of z   R8 = base of x   R9 = y
        //   R22 = z_len       R23 = z_len &^ 1 R24 = 2
        //   R5  = i           R4  = c          R28 = 8
    106  TEXT ·addMulVVW(SB), NOSPLIT, $0
   107  	MOVD z+0(FP), R10
   108  	MOVD x+24(FP), R8
   109  	MOVD y+48(FP), R9
   110  	MOVD z_len+8(FP), R22
   111  
   112  	MOVD $0, R5   // i = 0
   113  	MOVD $0, R4   // c = 0
   114  	MOVD $8, R28
   115  	MOVD $-2, R23
   116  	AND  R22, R23 // mask the last bit of z.len
   117  	MOVD $2, R24
   118  	CMP  R23, R24
   119  	BGE  unrolled // at least one pair of words: use the two-at-a-time loop
   120  	JMP  end      // otherwise go straight to the one-at-a-time loop test
   121  
   122  unrolled:
   123  	MOVD  $8, R19         // no (RA)(RB*8) on power
   124  	MULLD R5, R19         // R19 = i*8, byte offset of word i
   125  	MOVD  (R10)(R19), R11 // R11 = z[i]
   126  	MOVD  (R8)(R19), R16  // R16 = x[i]
   127  	ADD   R28, R19, R25   // R25 = (i+1)*8, byte offset of word i+1
   128  	MOVD  (R10)(R25), R17 // R17 = z[i+1]
   129  	MOVD  (R8)(R25), R18  // R18 = x[i+1]
   130  
   131  	MULLD  R9, R16, R12    // R14:R12 = x[i]*y
   132  	MULHDU R9, R16, R14
   133  	MULLD  R9, R18, R6     // R7:R6 = x[i+1]*y
   134  	MULHDU R9, R18, R7
   135  	ADDC   R4, R12         // add the incoming carry word to the low word
   136  	ADDZE  R14             // and fold the carry bit into the high word
   137  	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
   138  	ADDZE  R14             // carry = high order bits + add carry
   139  	MOVD   R12, (R10)(R19)
   140  	ADDC   R14, R6         // same steps for word i+1, using word i's high word as carry in
   141  	ADDZE  R7
   142  	ADDC   R17, R6         // z[i+1] = (x[i+1]*y) + z[i+1] + carry
   143  	ADDZE  R7
   144  	MOVD   R6, (R10)(R25)
   145  	MOVD   R7, R4          // c = high word from word i+1
   146  
   147  	ADD R24, R5            // i += 2
   148  	CMP R5, R23
   149  	BLT unrolled           // continue while i < z_len &^ 1
   150  	JMP end
   151  
   152  loop:
   153  	MOVD   $8, R19
   154  	MULLD  R5, R19         // R19 = i*8
   155  	MOVD   (R10)(R19), R11 // R11 = z[i]
   156  	MOVD   (R8)(R19), R16  // R16 = x[i]
   157  	MULLD  R9, R16, R12    // R14:R12 = x[i]*y
   158  	MULHDU R9, R16, R14
   159  	ADDC   R4, R12         // low word += c
   160  	ADDZE  R14             // high word += carry bit
   161  	ADDC   R11, R12        // low word += z[i]
   162  	ADDZE  R14             // high word += carry bit
   163  	MOVD   R12, (R10)(R19) // z[i] = low word
   164  	MOVD   R14, R4         // c = high word
   165  
   166  	MOVD $1, R15
   167  	ADD  R15, R5           // i++
   168  
   169  end:
   170  	CMP R5, R22
   171  	BLT loop               // continue while i < z_len
   172  
   173  	MOVD R4, c+56(FP)
   174  	RET
   175  
   176  TEXT ·divWVW(SB), NOSPLIT, $0
        	// No ppc64 assembly version is provided here: control is handed straight
        	// to divWVW_g, which is defined outside this file (presumably the portable
        	// Go implementation in arith.go - confirm there).
   177  	BR ·divWVW_g(SB)
   178  
   179  // func bitLen(x Word) int
        // Returns 64 minus the number of leading zero bits in x, i.e. the position
        // of the highest set bit counted from 1.
   180  TEXT ·bitLen(SB), NOSPLIT, $0
   181  	MOVD   x+0(FP), R4
   182  	CNTLZD R4, R4  // R4 = number of leading zero bits in x
   183  	MOVD   $64, R5
   184  	SUB    R4, R5  // R5 = 64 - leading zeros
   185  	MOVD   R5, n+8(FP)
   186  	RET