github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/mod_34lsub1.asm

dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.

dnl  Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C                     cycles/limb
C 603e:               -
C 604e:               -
C 75x (G3):           -
C 7400,7410 (G4):     1       simple load-use scheduling results in 0.75
C 744x,745x (G4+):    0.75
C ppc970:             0.75
C power4:             -
C power5:             -

C TODO
C  * Either start using the low-end masking constants, or remove them.
C  * Merge multiple feed-in cases into a parameterized code block.
C  * Reduce register usage.  It should be possible to almost halve it.

define(`up', `r3')
define(`n',  `r4')

define(`a0', `v3')
define(`a1', `v4')
define(`a2', `v5')
define(`c0', `v6')
define(`c1', `v7')
define(`c2', `v8')
define(`z',  `v9')
define(`x0', `v10')
define(`x1', `v11')
define(`x2', `v12')
define(`x3', `v13')
define(`pv', `v14')
define(`y0', `v0')
define(`y1', `v1')
define(`y2', `v2')
define(`y3', `v15')
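C mpn_mod_34lsub1 returns a value congruent to {up,n} mod 2^24-1.  It
C relies on 2^24 == 1 (mod 2^24-1), so the operand can be cut into
C 24-bit fields that are simply summed.  Three 32-bit limbs make
C exactly four such fields, hence both paths below consume limbs three
C at a time.  A rough scalar equivalent for 32-bit limbs (illustrative
C C, not GMP source; the name is hypothetical and a 64-bit sum is used
C to sidestep overflow):
C
C	unsigned long long
C	mod_34lsub1_ref (const unsigned *up, long n)
C	{
C	  unsigned long long r = 0;
C	  long i;
C	  for (i = 0; i + 3 <= n; i += 3)
C	    {
C	      r += up[i] & 0xffffff;                           /* 24b of u0 */
C	      r += (up[i] >> 24) + ((up[i+1] & 0xffff) << 8);  /* 8b + 16b  */
C	      r += (up[i+1] >> 16) + ((up[i+2] & 0xff) << 16); /* 16b + 8b  */
C	      r += up[i+2] >> 8;                               /* 24b of u2 */
C	    }
C	  if (i < n)
C	    {                             /* 1st leftover limb, weight 1 */
C	      r += (up[i] & 0xffffff) + (up[i] >> 24);
C	      i++;
C	    }
C	  if (i < n)                      /* 2nd leftover limb, weight 2^8 */
C	    r += ((up[i] & 0xffff) << 8) + (up[i] >> 16);
C	  return r;
C	}
C
C The value returned is only congruent mod 2^24-1, not fully reduced;
C a caller can fold it further with r = (r & 0xffffff) + (r >> 24).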
ASM_START()
PROLOGUE(mpn_mod_34lsub1)
	cmpwi	cr0, n, 20		C tuned cutoff point
	bge	L(large)

	li	r9, 0			C result accumulator
	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
	srwi.	r10, r10, 5		C r10 = floor(n/3), valid for n < 32
	beq	L(small_tail)
	mtctr	r10
	lwz	r6, 0(up)
	lwz	r7, 4(up)
	lwzu	r8, 8(up)
	subf	n, r10, n
	subf	n, r10, n
	subf	n, r10, n		C n -= 3*r10; 0..2 limbs will remain
	bdz	L(small_end)

	ALIGN(16)
L(los):	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	lwz	r6, 4(up)
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	lwz	r7, 8(up)
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	lwzu	r8, 12(up)
	add	r9, r9, r0		C add 24b from u2
	bdnz	L(los)
L(small_end):
	rlwinm	r0, r6, 0,8,31
	add	r9, r9, r0		C add 24b from u0
	srwi	r0, r6, 24
	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
	add	r9, r9, r0		C add 8b from u0 and 16b from u1
	srwi	r0, r7, 16
	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
	add	r9, r9, r0		C add 16b from u1 and 8b from u2
	srwi	r0, r8, 8		C --222222
	add	r9, r9, r0		C add 24b from u2

	addi	up, up, 4
	rlwinm	r0, r9, 0,8,31
	srwi	r9, r9, 24
	add	r9, r9, r0		C fold the accumulator once

L(small_tail):
	cmpwi	cr0, n, 1
	blt	L(ret)

	lwz	r6, 0(up)		C 1st leftover limb, weight 1
	rlwinm	r0, r6, 0,8,31
	srwi	r6, r6, 24
	add	r9, r9, r0
	add	r9, r9, r6

	beq	L(ret)

	lwz	r6, 4(up)		C 2nd leftover limb, weight 2^8
	rlwinm	r0, r6, 8,8,23
	srwi	r6, r6, 16
	add	r9, r9, r0
	add	r9, r9, r6

L(ret):	mr	r3, r9
	blr
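C The VMX path below keeps a 12-limb working stride in three
C accumulators a0-a2 plus matching carry accumulators c0-c2: vadduwm
C adds four 32-bit words modulo 2^32 while vaddcuw returns the four
C carry-outs as 0/1 words, so carries are banked instead of propagated.
C Per word lane, each vaddcuw/vadduwm pair computes, in rough C
C (illustrative, not GMP source):
C
C	unsigned long long t = (unsigned long long) a + v;
C	c += (unsigned) (t >> 32);	/* vaddcuw: carry-out word   */
C	a  = (unsigned) t;		/* vadduwm: sum modulo 2^32  */
C
C A stride of 12 limbs is 384 = 16 * 24 bits, so a full stride has
C weight 1 mod 2^24-1 and can be accumulated without adjustment; only
C the final summation at L(sum) must weight individual words.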
L(large):
	stwu	r1, -32(r1)
	mfspr	r10, 256		C save caller's VRSAVE
	oris	r0, r10, 0xffff		C set VRSAVE bits 0-15
	mtspr	256, r0

	andi.	r7, up, 15
	vxor	a0, v0, v0
	lis	r9, 0xaaaa
	vxor	a1, v0, v0
	ori	r9, r9, 0xaaab		C r9 = 0xaaaaaaab, for n/12 below
	vxor	a2, v0, v0
	li	r5, 16
	vxor	c0, v0, v0
	li	r6, 32
	vxor	c1, v0, v0
	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
	vxor	c2, v0, v0
	vxor	z, v0, v0

	beq	L(aligned16)

	cmpwi	cr7, r7, 8
	bge	cr7, L(na4)

C up = 4 (mod 16): feed three leading limbs into a2, zeroing the rest
	lvx	a2, 0, up
	addi	up, up, 16
	vsldoi	a2, a2, z, 4
	vsldoi	a2, z, a2, 12

	addi	n, n, 9			C 9 phantom limbs make the feed-in a full stride
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(2)

L(na4):	bne	cr7, L(na8)

C up = 8 (mod 16): feed two leading limbs into a1, zeroing the rest
	lvx	a1, 0, up
	addi	up, up, -16
	vsldoi	a1, a1, z, 8
	vsldoi	a1, z, a1, 8

	addi	n, n, 6
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(1)

L(na8):
C up = 12 (mod 16): feed one leading limb into a0, zeroing the rest
	lvx	a0, 0, up
	vsldoi	a0, a0, z, 12
	vsldoi	a0, z, a0, 4

	addi	n, n, 3
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n
	b	L(0)

L(aligned16):
	mulhwu	r0, n, r9
	srwi	r0, r0, 3		C r0 = floor(n/12)
	mtctr	r0

	mulli	r8, r0, 12
	subf	n, r8, n

	lvx	a0, 0, up
L(0):	lvx	a1, r5, up
L(1):	lvx	a2, r6, up
	addi	up, up, 48
L(2):	bdz	L(end)
	li	r12, 256		C dcbt prefetch offsets
	li	r9, 288
	ALIGN(32)
L(top):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up
	dcbt	up, r12
	dcbt	up, r9
	addi	up, up, 48
	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	bdnz	L(top)

L(end):
C n = 0...11
	cmpwi	cr0, n, 0
	beq	L(sum)
	cmpwi	cr0, n, 4
	ble	L(tail.1..4)
	cmpwi	cr0, n, 8
	ble	L(tail.5..8)

L(tail.9..11):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up
	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10

	lvx	v2, r6, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4), mask index
	lvx	v11, r3, r8
	vand	v2, v2, v11		C zero limbs past the operand's end

	vaddcuw	v10, a2, v2
	vadduwm	a2, a2, v2
	vadduwm	c2, c2, v10
	b	L(sum)

L(tail.5..8):
	lvx	v0, 0, up
	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

	lvx	v1, r5, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4), mask index
	lvx	v11, r3, r8
	vand	v1, v1, v11

	vaddcuw	v10, a1, v1
	vadduwm	a1, a1, v1
	vadduwm	c1, c1, v10
	b	L(sum)

L(tail.1..4):
	lvx	v0, 0, up

	addi	r8, r11, 96
	rlwinm	r3, n, 4, 26, 27	C r3 = 16*(n mod 4), mask index
	lvx	v11, r3, r8
	vand	v0, v0, v11

	vaddcuw	v10, a0, v0
	vadduwm	a0, a0, v0
	vadduwm	c0, c0, v10

L(sum):	lvx	pv, 0, r11
	vperm	x0, a0, z, pv		C extract four 24-bit fields from a0
	vperm	y0, c2, z, pv		C extract four 24-bit fields from c2
	lvx	pv, r5, r11
	vperm	x1, a1, z, pv		C extract four 24-bit fields from a1
	vperm	y1, c0, z, pv		C extract four 24-bit fields from c0
	lvx	pv, r6, r11
	vperm	x2, a2, z, pv		C extract four 24-bit fields from a2
	vperm	y2, c1, z, pv		C extract four 24-bit fields from c1
	li	r7, 48			C use r7 here: r10 still holds the saved VRSAVE
	lvx	pv, r7, r11
	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
	vperm	y3, c2, z, pv		C extract remaining/partial c2 fields
	li	r7, 64
	lvx	pv, r7, r11
	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
	vperm	y3, c0, y3, pv		C insert remaining/partial c0 fields
	li	r7, 80
	lvx	pv, r7, r11
	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
	vperm	y3, c1, y3, pv		C insert remaining/partial c1 fields

C We now have 8 vectors of zero-extended 24-bit fields to sum
	vadduwm	x0, x0, x1
	vadduwm	x2, x2, x3
	vadduwm	x0, x0, x2

	vadduwm	y0, y0, y1
	vadduwm	y2, y2, y3
	vadduwm	y0, y0, y2

	vadduwm	x0, x0, y0

C Reduce the four 32-bit fields to a single word
	vsumsws	x0, x0, z

	li	r7, 16
	stvx	x0, r7, r1
	lwz	r3, 28(r1)

	mtspr	256, r10		C restore caller's VRSAVE
	addi	r1, r1, 32
	blr
EPILOGUE()
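C Limb i has weight 2^(32i) == 2^(8*(i mod 3)) (mod 2^24-1), so the
C four words of a0, a1, a2 follow the weight patterns (1,2^8,2^16,1),
C (2^8,2^16,1,2^8) and (2^16,1,2^8,2^16) respectively.  A carry-out is
C worth 2^32 == 2^8 times its word, which advances that pattern by one
C step; this is why L(sum) above pairs c0 with a1's permutation
C vector, c1 with a2's, and c2 with a0's.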
C How a 12-limb (384-bit) stride divides across the three vector
C pipes: 384 bits = 3 x 128 = 12 x 32 = 16 x 24 = 8 x 48.
C
C load   |   v0    |   v1    |   v2    |
C acc    |   a0    |   a1    |   a2    |
C carry  |   c0    |   c1    |   c2    |
C        |  0   1   2   3|  4   5   6   7|  8   9  10  11|  128
C        |---|---|---|---|---|---|---|---|---|---|---|---|  32
C        |--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|--|  24
C        |-----|-----|-----|-----|-----|-----|-----|-----|  48

C And how one 128-bit register splits into 16 + 4*24 + 16 bits:
C
C $---------------$---------------$---------------$---------------$
C |  . . . . . . . . . . . . . . .                                |
C |_______________________________________________________________|
C      |           |           |           |           |
C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->


DEF_OBJECT(cnsts,16)
C Permutation vectors in the order they are used above
C #     00   01   02   03   04   05   06   07   08   09   0a   0b   0c   0d   0e   0f
	.byte	0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a	C a0
	.byte	0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08	C a1
	.byte	0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09	C a2
	.byte	0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10	C part a0
	.byte	0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10	C part a1
	.byte	0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e	C part a2
C Masks for high end of number, indexed by 16*(n mod 4) from the tails
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number (currently unused; see TODO above)
C	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
C	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
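C A note on the tables above: a vperm control byte 0x00-0x0f selects a
C byte of the first source vector and 0x10-0x1f a byte of the second.
C In the plain extracts the second source is z (all zero), so each
C 0x10 contributes a zero pad byte and every 32-bit result word
C becomes one zero-extended 24-bit field; in the "part" vectors the
C second source is the partially built x3/y3, whose bytes (0x11,
C 0x12, ...) are passed through unchanged.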