github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/copyi.asm

dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                    16-byte coaligned      unaligned
C                       cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5                0.64
C 744x,745x (G4+):         0.75               0.82
C 970 (G5):                0.78               1.02      (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subfic	r7, r7, LIMBS_PER_VR
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16

	beq	L(up_aligned)

	lvsl	us, 0, up

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0	')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()
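
This file provides the assembly body of mpn_copyi, GMP's documented low-level
routine that copies n limbs from up to rp in increasing address order. As a
minimal usage sketch (not part of the file above), the C-level call looks as
follows, assuming a build linked against this GMP with -lgmp:

	/* Sketch: calling mpn_copyi, the routine implemented above.
	   It copies n limbs from src to dst, walking addresses upward. */
	#include <stdio.h>
	#include <gmp.h>

	int main(void)
	{
	    mp_limb_t src[4] = {1, 2, 3, 4};
	    mp_limb_t dst[4] = {0, 0, 0, 0};

	    mpn_copyi(dst, src, 4);     /* dst := src, limb by limb */

	    for (int i = 0; i < 4; i++)
	        printf("dst[%d] = %lu\n", i, (unsigned long) dst[i]);
	    return 0;
	}

The increasing copy direction is what the routine's head/tail loops above
preserve; whether overlapping operands are permitted is specified by the GMP
manual, not assumed here.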