// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le

#include "textflag.h"

// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
//
// Conventions used below:
//   - R0 is read as the constant zero (see "MOVD R0, R4 // c = 0").
//   - Operand order is source(s) first, destination last, so
//     "SUB Ra, Rb, Rc" computes Rc = Rb - Ra.
//   - Vectors are indexed with a byte offset that advances by 8 per Word.

// func mulWW(x, y Word) (z1, z0 Word)
// Returns the full unsigned product x*y: z1 is the high word, z0 the low word.
TEXT ·mulWW(SB), NOSPLIT, $0
	MOVD   x+0(FP), R4
	MOVD   y+8(FP), R5
	MULHDU R4, R5, R6 // high 64 bits of x*y (unsigned)
	MULLD  R4, R5, R7 // low 64 bits of x*y
	MOVD   R6, z1+16(FP)
	MOVD   R7, z0+24(FP)
	RET

// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
// The loop runs len(z) times; c is the carry out of the last addition
// (0 when z is empty).
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R10

	MOVD R0, R4  // c = 0
	MOVD R0, R6  // R6 will be the address index
	ADDC R4, R4  // clear CA
	MOVD R7, CTR // loop counter = len(z)

	CMP R0, R7
	BEQ done     // nothing to do for an empty vector

loop:
	MOVD (R8)(R6), R11  // x[i]
	MOVD (R9)(R6), R12  // y[i]
	ADDE R12, R11, R15  // x[i] + y[i] + CA
	MOVD R15, (R10)(R6) // z[i]

	ADD $8, R6          // next word
	BC  16, 0, loop     // bdnz

done:
	ADDZE R4            // c = 0 + CA
	MOVD  R4, c+72(FP)
	RET

// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
// The loop runs len(z) times; c is the borrow out of the last subtraction
// (0 when z is empty).
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD z_len+8(FP), R7
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z+0(FP), R10

	MOVD R0, R4  // c = 0
	MOVD R0, R6  // R6 is the address index
	SUBC R0, R0  // 0 - 0 leaves CA = 1, which means "no borrow" for SUBE
	MOVD R7, CTR // loop counter = len(z)

	CMP R0, R7
	BEQ sublend  // nothing to do for an empty vector

// amd64 saves and restores CF, but I believe they only have to do that because all of
// their math operations clobber it - we should just be able to recover it at the end.
subloop:
	MOVD (R8)(R6), R11 // x[i]
	MOVD (R9)(R6), R12 // y[i]

	SUBE R12, R11, R15  // x[i] - y[i] - borrow
	MOVD R15, (R10)(R6) // z[i]

	ADD $8, R6          // next word
	BC  16, 0, subloop  // bdnz

sublend:

	ADDZE R4            // R4 = CA (1 = no borrow, 0 = borrow)
	XOR   $1, R4        // invert so that c = 1 means a borrow occurred
	MOVD  R4, c+72(FP)
	RET

// addVW has no assembly implementation here; it tail-branches to addVW_g.
TEXT ·addVW(SB), NOSPLIT, $0
	BR ·addVW_g(SB)

// subVW has no assembly implementation here; it tail-branches to subVW_g.
TEXT ·subVW(SB), NOSPLIT, $0
	BR ·subVW_g(SB)

// shlVU has no assembly implementation here; it tail-branches to shlVU_g.
TEXT ·shlVU(SB), NOSPLIT, $0
	BR ·shlVU_g(SB)

// shrVU has no assembly implementation here; it tail-branches to shrVU_g.
TEXT ·shrVU(SB), NOSPLIT, $0
	BR ·shrVU_g(SB)

// func mulAddVWW(z, x []Word, y, r Word) (c Word)
// Starting with c = r, for each i in [0, len(z)):
//   z[i] = low word of (x[i]*y + c)
//   c    = high word of (x[i]*y + c)
// The final c is returned.
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD r+56(FP), R4     // c = r
	MOVD z_len+8(FP), R11
	MOVD $0, R3           // i = 0
	MOVD $8, R18          // word size in bytes
	MOVD $1, R19          // index increment

	JMP e5                // test the loop condition first

l5:
	MULLD  R18, R3, R5    // byte offset = i * 8
	MOVD   (R8)(R5), R20  // x[i]
	MULLD  R9, R20, R6    // low word of x[i]*y
	MULHDU R9, R20, R7    // high word of x[i]*y
	ADDC   R4, R6         // low += c, carry into CA
	ADDZE  R7             // high += CA
	MOVD   R6, (R10)(R5)  // z[i] = low
	MOVD   R7, R4         // c = high
	ADD    R19, R3        // i++

e5:
	CMP R3, R11
	BLT l5                // loop while i < len(z)

	MOVD R4, c+64(FP)
	RET

// func addMulVVW(z, x []Word, y Word) (c Word)
// Starting with c = 0, for each i in [0, len(z)):
//   z[i] = low word of (x[i]*y + z[i] + c)
//   c    = high word of (x[i]*y + z[i] + c)
// The final c is returned. Words are processed two per iteration for the
// even part of len(z); a one-word loop handles any remaining element.
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10
	MOVD x+24(FP), R8
	MOVD y+48(FP), R9
	MOVD z_len+8(FP), R22

	MOVD $0, R5           // i = 0
	MOVD $0, R4           // c = 0
	MOVD $8, R28          // word size in bytes
	MOVD $-2, R23
	AND  R22, R23         // mask the last bit of z.len
	MOVD $2, R24
	CMP  R23, R24
	BGE  unrolled         // at least one pair of words to process
	JMP  end

unrolled:
	MOVD  $8, R19         // no (RA)(RB*8) on power
	MULLD R5, R19         // R19 = byte offset of element i
	MOVD  (R10)(R19), R11 // R11 = z[i]
	MOVD  (R8)(R19), R16  // R16 = x[i]
	ADD   R28, R19, R25   // R25 = byte offset of element i+1
	MOVD  (R10)(R25), R17 // R17 = z[i+1]
	MOVD  (R8)(R25), R18  // R18 = x[i+1]

	MULLD  R9, R16, R12   // low word of x[i]*y
	MULHDU R9, R16, R14   // high word of x[i]*y
	MULLD  R9, R18, R6    // low word of x[i+1]*y
	MULHDU R9, R18, R7    // high word of x[i+1]*y
	ADDC   R4, R12        // low += incoming carry
	ADDZE  R14
	ADDC   R11, R12       // z[i] = (x[i]*y) + z[i] + carry
	ADDZE  R14            // carry = high order bits + add carry
	MOVD   R12, (R10)(R19)
	ADDC   R14, R6        // second word: low += carry from first word
	ADDZE  R7
	ADDC   R17, R6        // low += z[i+1]
	ADDZE  R7
	MOVD   R6, (R10)(R25) // z[i+1]
	MOVD   R7, R4         // c = high word of the second product sum

	ADD R24, R5           // i += 2
	CMP R5, R23
	BLT unrolled          // loop while i < (len(z) &^ 1)
	JMP end

loop:
	MOVD   $8, R19
	MULLD  R5, R19        // R19 = byte offset of element i
	MOVD   (R10)(R19), R11 // z[i]
	MOVD   (R8)(R19), R16 // x[i]
	MULLD  R9, R16, R12   // low word of x[i]*y
	MULHDU R9, R16, R14   // high word of x[i]*y
	ADDC   R4, R12        // low += c
	ADDZE  R14
	ADDC   R11, R12       // low += z[i]
	ADDZE  R14
	MOVD   R12, (R10)(R19) // z[i] = low
	MOVD   R14, R4        // c = high

	MOVD $1, R15
	ADD  R15, R5          // i++

end:
	CMP R5, R22
	BLT loop              // handle any remaining element

	MOVD R4, c+56(FP)
	RET

// func divWW(x1, x0, y Word) (q, r Word)
// Divides the two-word value (x1, x0) by y, returning quotient q and
// remainder r. When x1 >= y (unsigned) the quotient does not fit in one
// word, and both q and r are set to all ones.
TEXT ·divWW(SB), NOSPLIT, $0
	MOVD x1+0(FP), R4
	MOVD x0+8(FP), R5
	MOVD y+16(FP), R6

	CMPU R4, R6
	BGE  divbigger        // x1 >= y: quotient would overflow

	// from the programmer's note in ch. 3 of the ISA manual, p.74
	DIVDEU R6, R4, R3     // q1 = (x1 << 64) / y
	DIVDU  R6, R5, R7     // q2 = x0 / y
	MULLD  R6, R3, R8     // q1 * y (low word)
	MULLD  R6, R7, R20    // q2 * y
	SUB    R20, R5, R10   // r2 = x0 - q2*y
	ADD    R7, R3, R3     // q = q1 + q2
	SUB    R8, R10, R4    // r = r2 - q1*y
	CMPU   R4, R10
	BLT    adjust         // r < r2: the remainder sum wrapped around
	CMPU   R4, R6
	BLT    end            // r < y: result is already normalized

adjust:
	MOVD $1, R21
	ADD  R21, R3, R3      // q++
	SUB  R6, R4, R4       // r -= y

end:
	MOVD R3, q+24(FP)
	MOVD R4, r+32(FP)

	RET

divbigger:
	MOVD $-1, R7
	MOVD R7, q+24(FP)
	MOVD R7, r+32(FP)
	RET

// divWVW has no assembly implementation here; it tail-branches to divWVW_g.
TEXT ·divWVW(SB), NOSPLIT, $0
	BR ·divWVW_g(SB)