// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used throughout this file:
//   - R0 is read as the constant zero: it is used to clear registers
//     and to compare lengths against zero.
//   - A Word is 8 bytes, so vector elements are addressed with a byte
//     offset that advances by 8 per element.
//   - Loops are counted with CTR and closed with "BC 16, 0, label"
//     (bdnz: decrement CTR and branch while it is non-zero). Every
//     function tests the length against zero first and skips the loop
//     for an empty vector.

// func mulWW(x, y Word) (z1, z0 Word)
// mulWW returns the double-word product x*y: z1 is the high-order
// word and z0 is the low-order word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6    // R6 = z1 = High-order(x*y), unsigned
	MULLD  R4, R5, R7    // R7 = z0 = Low-order(x*y)
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// The carry travels from word to word in the CA bit (consumed and
// produced by ADDE); the carry out of the last word is returned in c.
// The element count is taken from len(z) only.
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7 // R7 = number of words to process
	MOVD x+24(FP), R8    // R8 = x[]
	MOVD y+48(FP), R9    // R9 = y[]
	MOVD z+0(FP), R10    // R10 = z[]

	MOVD R0, R4          // c = 0
	MOVD R0, R6          // R6 will be the address index
	ADDC R4, R4          // clear CA (0 + 0 produces no carry)
	MOVD R7, CTR         // loop counter = len(z)

	CMP R0, R7
	BEQ done             // empty vector: nothing to add, c stays 0

loop:
	MOVD (R8)(R6), R11   // x[i]
	MOVD (R9)(R6), R12   // y[i]
	ADDE R12, R11, R15   // x[i] + y[i] + CA
	MOVD R15, (R10)(R6)  // z[i]

	ADD $8, R6           // next word; plain ADD leaves CA intact for the next ADDE
	BC  16, 0, loop      // bdnz

done:
	ADDZE R4             // c = 0 + CA, the carry out of the last word
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, borrowing
//
// The borrow travels from word to word in the CA bit (consumed and
// produced by SUBE). CA holds the inverse of the borrow, which is why
// the result is flipped with XOR $1 before it is returned in c.
// The element count is taken from len(z) only.
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7 // R7 = number of words to process
	MOVD x+24(FP), R8    // R8 = x[]
	MOVD y+48(FP), R9    // R9 = y[]
	MOVD z+0(FP), R10    // R10 = z[]

	MOVD R0, R4          // c = 0
	MOVD R0, R6          // R6 will be the address index
	SUBC R0, R0          // 0 - 0: start with no borrow pending (CA = 1, see sublend)
	MOVD R7, CTR         // loop counter = len(z)

	CMP R0, R7
	BEQ sublend          // empty vector: nothing to subtract

	// amd64 saves and restores CF, but I believe they only have to do that because all of
	// their math operations clobber it - we should just be able to recover it at the end.
subloop:
	MOVD (R8)(R6), R11   // x[i]
	MOVD (R9)(R6), R12   // y[i]

	SUBE R12, R11, R15   // x[i] - y[i] - borrow (borrow is carried in CA)
	MOVD R15, (R10)(R6)  // z[i]

	ADD $8, R6           // next word; plain ADD leaves CA intact for the next SUBE
	BC  16, 0, subloop   // bdnz

sublend:

	ADDZE R4             // R4 = 0 + CA
	XOR   $1, R4         // c = borrow out = inverse of CA
	MOVD  R4, c+72(FP)
	RET

// addVW has no assembly version in this file; it branches straight to
// addVW_g (presumably the portable Go implementation in arith.go).
TEXT ·addVW(SB), NOSPLIT, $0
	BR ·addVW_g(SB)

// subVW has no assembly version in this file; it branches straight to
// subVW_g (presumably the portable Go implementation in arith.go).
TEXT ·subVW(SB), NOSPLIT, $0
	BR ·subVW_g(SB)

// shlVU has no assembly version in this file; it branches straight to
// shlVU_g (presumably the portable Go implementation in arith.go).
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// shrVU has no assembly version in this file; it branches straight to
// shrVU_g (presumably the portable Go implementation in arith.go).
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// For each i, the double-word value x[i]*y + c is computed, with c
// starting out as r. The low word is stored in z[i] and the high word
// becomes c for the next element. The final c is returned.
// The element count is taken from len(z) only.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10      // R10 = z[]
	MOVD x+24(FP), R8      // R8 = x[]
	MOVD y+48(FP), R9      // R9 = y
	MOVD r+56(FP), R4      // R4 = r = c
	MOVD z_len+8(FP), R11  // R11 = z_len

	MOVD R0, R3            // R3 will be the index register
	CMP  R0, R11
	MOVD R11, CTR          // Initialize loop counter
	BEQ  done              // empty vector: return c = r unchanged

loop:
	MOVD   (R8)(R3), R20   // x[i]
	MULLD  R9, R20, R6     // R6 = z0 = Low-order(x[i]*y)
	MULHDU R9, R20, R7     // R7 = z1 = High-order(x[i]*y)
	ADDC   R4, R6          // z0 += c, carry out recorded in CA
	ADDZE  R7              // z1 += CA
	MOVD   R6, (R10)(R3)   // z[i]
	MOVD   R7, R4          // c
	ADD    $8, R3          // advance byte offset to the next word
	BC     16, 0, loop     // bdnz

done:
	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// For each i, the double-word value z[i] + x[i]*y + c is computed, with
// c starting out as 0. The low word is stored back into z[i] and the
// high word becomes c for the next element. The final c is returned.
// The element count is taken from len(z) only.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10      // R10 = z[]
	MOVD x+24(FP), R8      // R8 = x[]
	MOVD y+48(FP), R9      // R9 = y
	MOVD z_len+8(FP), R22  // R22 = z_len

	MOVD R0, R3            // R3 will be the index register
	CMP  R0, R22
	MOVD R0, R4            // R4 = c = 0
	MOVD R22, CTR          // Initialize loop counter
	BEQ  done              // empty vector: return c = 0

loop:
	MOVD   (R8)(R3), R20   // Load x[i]
	MOVD   (R10)(R3), R21  // Load z[i]
	MULLD  R9, R20, R6     // R6 = Low-order(x[i]*y)
	MULHDU R9, R20, R7     // R7 = High-order(x[i]*y)
	ADDC   R21, R6         // R6 = z0 = Low-order(x[i]*y) + z[i], carry out in CA
	ADDZE  R7              // R7 = z1 = High-order(x[i]*y) + CA
	ADDC   R4, R6          // R6 = z0 + c, carry out in CA
	ADDZE  R7, R4          // c = z1 + CA
	MOVD   R6, (R10)(R3)   // Store z[i]
	ADD    $8, R3          // advance byte offset to the next word
	BC     16, 0, loop     // bdnz

done:
	MOVD R4, c+56(FP)
	RET

// func divWW(x1, x0, y Word) (q, r Word)
// divWW divides the double-word value x1:x0 (x1 is the high word) by y
// and returns the quotient q and remainder r.
//
// If x1 >= y (unsigned compare, which also covers y == 0) the quotient
// cannot be represented in one word; in that case both q and r are set
// to all ones.
//
// Otherwise the division is done in two halves, (x1<<64)/y and x0/y,
// whose quotients and remainders are summed and then corrected by at
// most one step.
TEXT ·divWW(SB), NOSPLIT, $0
	MOVD x1+0(FP), R4
	MOVD x0+8(FP), R5
	MOVD y+16(FP), R6

	CMPU R4, R6
	BGE  divbigger         // x1 >= y: quotient would overflow a Word

	// from the programmer's note in ch. 3 of the ISA manual, p.74
	DIVDEU R6, R4, R3      // R3 = q1 = (x1 << 64) / y
	DIVDU  R6, R5, R7      // R7 = q2 = x0 / y
	MULLD  R6, R3, R8      // R8 = Low-order(q1*y)
	MULLD  R6, R7, R20     // R20 = q2*y
	SUB    R20, R5, R10    // R10 = r2 = x0 - q2*y
	ADD    R7, R3, R3      // R3 = q = q1 + q2
	SUB    R8, R10, R4     // R4 = r = r2 - Low-order(q1*y), i.e. r1 + r2 modulo 2^64
	CMPU   R4, R10
	BLT    adjust          // r < r2: the remainder sum wrapped around
	CMPU   R4, R6
	BLT    end             // r < y: q and r are already final

adjust:
	MOVD $1, R21
	ADD  R21, R3, R3       // q++
	SUB  R6, R4, R4        // r -= y

end:
	MOVD R3, q+24(FP)
	MOVD R4, r+32(FP)

	RET

divbigger:
	MOVD $-1, R7           // all ones
	MOVD R7, q+24(FP)
	MOVD R7, r+32(FP)
	RET

// divWVW has no assembly version in this file; it branches straight to
// divWVW_g (presumably the portable Go implementation in arith.go).
TEXT ·divWVW(SB), NOSPLIT, $0
	BR ·divWVW_g(SB)