dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/pa32/hppa1_1/submul_1.asm

dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
dnl  the result from a second limb vector.

dnl  Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C res_ptr	r26
C s1_ptr	r25
C size		r24
C s2_limb	r23

C This runs at 12 cycles/limb on a PA7000. With the instructions used, it
C cannot become faster due to data cache contention after a store. On the
C PA7100 it runs at 11 cycles/limb.

C There are some ideas described in mul_1.asm that apply to this code too.

C It seems possible to make this run as fast as mpn_addmul_1, if we use
C	sub,>>=	%r29,%r19,%r22
C	addi	1,%r28,%r28
C but that requires reworking the hairy software pipeline...
ASM_START()

C mpn_submul_1 -- multiply the limb vector at s1_ptr by s2_limb and subtract
C the product from the limb vector at res_ptr, in place.
C
C Registers on entry:
C   %r26  res_ptr   limbs are read and overwritten, pointer post-incremented
C   %r25  s1_ptr    limbs are read, pointer post-incremented
C   %r24  size      limb count, counted down to zero.  Must be at least 1,
C                   a zero count is not checked for.
C   %r23  s2_limb   multiplier, moved to %fr4 through the stack
C On exit %r28 holds the high word of the last product plus the last borrow.
C Scratch: %r1 %r19 %r22 %r29, %fr4 to %fr8, and the doubleword at -16(%r30).
C
C Each 64-bit xmpyu product is spilled to -16(%r30) and read back as two
C words: the high word at -16(%r30) and the low word at -12(%r30).
C
C Borrow detection: after  sub a,b,d  the pair  add d,b,%r0  wraps exactly
C when the subtraction borrowed, so the carry bit then holds the borrow and
C the following addc instructions propagate it.
C
C An instruction indented by one extra space sits in the delay slot of the
C branch above it.  None of the branches nullify, so it executes whether or
C not the branch is taken.

PROLOGUE(mpn_submul_1)
C	.callinfo	frame=64,no_calls

	ldo		64(%r30),%r30		C advance %r30 by 64 to open the scratch frame
	fldws,ma	4(%r25),%fr5		C fr5 = limb 0 of s1, s1_ptr += 4
	stw		%r23,-16(%r30)		C move s2_limb ...
	addib,=		-1,%r24,L(just_one_limb)	C size -= 1, branch if it reached zero
	 fldws		-16(%r30),%fr4		C ... into fr4
	add		%r0,%r0,%r0		C clear carry
	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * limb 0
	fldws,ma	4(%r25),%fr7		C fr7 = limb 1 of s1, s1_ptr += 4
	fstds		%fr6,-16(%r30)		C spill product 0
	xmpyu		%fr4,%fr7,%fr8		C fr8 = s2_limb * limb 1
	ldw		-12(%r30),%r19		C least significant limb in product
	ldw		-16(%r30),%r28		C r28 = high word of product 0

	fstds		%fr8,-16(%r30)		C spill product 1
	addib,=		-1,%r24,L(end)		C size -= 1, only two limbs: skip the loop
	 ldw		-12(%r30),%r1		C r1 = low word of product 1

C Main loop
C On entry to each iteration:
C   %r19  word still to be subtracted from the current res limb
C   %r28  high word of the product whose low word is already in %r19
C   %r1   low word of the next product
C   -16(%r30) holds the next product, its high word not yet loaded
LDEF(loop)
	ldws		0(%r26),%r29		C r29 = current res limb
	fldws,ma	4(%r25),%fr5		C fetch the next s1 limb, s1_ptr += 4
	sub		%r29,%r19,%r22		C r22 = res limb - pending word
	add		%r22,%r19,%r0		C result discarded, carry = borrow of the sub
	stws,ma		%r22,4(%r26)		C store the difference, res_ptr += 4
	addc		%r28,%r1,%r19		C pending = high + next low + borrow
	xmpyu		%fr4,%fr5,%fr6		C multiply the limb just fetched
	ldw		-16(%r30),%r28		C r28 = high word of the spilled product
	fstds		%fr6,-16(%r30)		C spill the new product over it
	addc		%r0,%r28,%r28		C add the carry of the previous addc
	addib,<>	-1,%r24,L(loop)		C size -= 1, loop while nonzero
	 ldw		-12(%r30),%r1		C r1 = low word of the new product

C Wind-down: the last two limbs, no further loads from s1.
LDEF(end)
	ldw		0(%r26),%r29		C r29 = second-to-last res limb
	sub		%r29,%r19,%r22		C r22 = res limb - pending word
	add		%r22,%r19,%r0		C carry = borrow of the sub
	stws,ma		%r22,4(%r26)		C store the difference, res_ptr += 4
	addc		%r28,%r1,%r19		C pending = high + last low + borrow
	ldw		-16(%r30),%r28		C r28 = high word of the last product
	ldws		0(%r26),%r29		C r29 = last res limb
	addc		%r0,%r28,%r28		C add the carry of the previous addc
	sub		%r29,%r19,%r22		C r22 = res limb - pending word
	add		%r22,%r19,%r0		C carry = borrow of the sub
	stws,ma		%r22,4(%r26)		C store the last difference
	addc		%r0,%r28,%r28		C r28 = high word + final borrow
	bv		0(%r2)			C branch to the address in %r2
	 ldo		-64(%r30),%r30		C step %r30 back by 64

C size == 1: one product, no pipelining needed.
LDEF(just_one_limb)
	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * limb 0
	ldw		0(%r26),%r29		C r29 = the only res limb
	fstds		%fr6,-16(%r30)		C spill the product
	ldw		-12(%r30),%r1		C r1 = low word of the product
	ldw		-16(%r30),%r28		C r28 = high word of the product
	sub		%r29,%r1,%r22		C r22 = res limb - low word
	add		%r22,%r1,%r0		C carry = borrow of the sub
	stw		%r22,0(%r26)		C store the difference
	addc		%r0,%r28,%r28		C r28 = high word + borrow
	bv		0(%r2)			C branch to the address in %r2
	 ldo		-64(%r30),%r30		C step %r30 back by 64
EPILOGUE()