github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/logops_n.asm

dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  and,ior,andn,nior,xor    iorn,xnor       nand
C                       cycles/limb        cycles/limb   cycles/limb
C 7400,7410 (G4):          1.39                ?              ?
C 744x,745x (G4+):         1.14               1.39           1.39
C 970:                     1.7                2.0            2.0

C STATUS
C  * Works for all sizes and alignment for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands
C    before u operands, since v operands are sometimes negated.
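
C Build note: this is a multi-function source file.  The mpn build machinery
C runs it through m4 once per operation, defining exactly one OPERATION_*
C symbol, roughly as in this illustrative invocation:
C
C   m4 -DOPERATION_andn_n logops_n.asm > tmp-andn_n.s
C
C The ifdef blocks below then expand func, logopS, logop (and the vnegb/vnega
C negation hooks) for the selected operation.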

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`vnegb', `')	C default neg-before to null
define(`vnega', `')	C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')

define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)		')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc	C Set VRSAVE bit 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0		')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)		')

	addi	rp, rp, 16	C update rp, but preserve its alignment

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7		C copy n to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers of rp, up, and vp.
C process words until rp is 16-byte aligned
C if (((up | vp) & 15) == 0)
C   process with VMX without any vperm
C else if ((up & 15) != 0 && (vp & 15) != 0)
C   process with VMX using vperm on store data
C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C
C	rlwinm,	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7
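
C Reference semantics (scalar C sketch, for illustration only; the vector code
C above computes the same result 16 bytes at a time).  Shown for
C OPERATION_andn_n; the other operations differ only in the limb-wise operator:
C
C   void
C   mpn_andn_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C   {
C     mp_size_t i;
C     for (i = 0; i < n; i++)
C       rp[i] = up[i] & ~vp[i];	/* logopS: andc */
C   }
C
C In the vector path, vnegb complements the vp-derived operand before the
C logop (iorn, xnor) and vnega complements the result after it (nand).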