dnl  Path: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mod_34lsub1.asm

dnl  Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C P6: 2.0 cycles/limb

C TODO
C  Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
C  with the current carry handling scheme.

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Returns in eax a value congruent to the size-limb number at src modulo
C 2^24-1.  The result is a sum of 24-bit-aligned pieces and is not reduced
C any further; for size==1 it is src[0] unchanged.
C
C src[0] is loaded before size is examined, so size is expected to be at
C least 1.
C
C Since 2^24 == 1 mod 2^24-1, a 32-bit limb at index i has weight
C 2^(8*(i mod 3)): weight 1 for 0mod3 limbs, 2^8 for 1mod3 limbs and 2^16
C for 2mod3 limbs.  Limbs are therefore summed into three accumulators and
C the accumulators are folded at bit 24, 16 and 8 respectively at the end.
C
C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
C into 2mod3, but at that point going into a separate carries total so we
C don't keep the carry flag live across the loop control.  Avoiding decl
C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
C
C Registers: eax, ecx, edx and the flags are clobbered.  ebx and esi are
C saved in the stack slots of the incoming parameters (which are thereby
C overwritten) and edi is pushed; all three are restored before returning.
C
C Several branches below consume flags set a few instructions earlier, with
C only movl, pushl or leal in between.  Those instructions leave the flags
C alone, which is why registers are cleared with movl $0 rather than xorl.
C Do not reorder or substitute them without re-checking flag liveness.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  re-use parameter space
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	C Flags from the subl are tested by both ja and jb; the movl between
	C them does not disturb flags.
	subl	$2, %ecx		C size-2
	movl	(%edx), %eax		C src[0]
	ja	L(three_or_more)	C size > 2 (unsigned)
	jb	L(one)			C size < 2, return src[0] as is

	C size==2
	C result = src[0] high 8 + src[0] low 24
	C        + src[1] high 16 + (src[1] low 16) << 8

	movl	4(%edx), %ecx		C src[1]

	movl	%eax, %edx		C src[0]
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low

	addl	%edx, %eax
	movl	%ecx, %edx		C src[1]
	shrl	$16, %ecx		C src[1] high

	andl	$0xFFFF, %edx		C src[1] low mask
	addl	%ecx, %eax

	shll	$8, %edx		C src[1] low

	addl	%edx, %eax
L(one):
	ret


L(three_or_more):
	C eax	src[0], initial acc 0mod3
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi
	C ebp

	C Both parameters are already in registers, so their stack slots are
	C free to hold the saved ebx and esi.
	movl	%ebx, SAVE_EBX
	movl	4(%edx), %ebx		C src[1], initial 1mod3
	subl	$3, %ecx		C size-5, flags used by jng below

	movl	%esi, SAVE_ESI
	movl	8(%edx), %esi		C src[2], initial 2mod3

	C pushl and movl keep the flags from the subl above intact.
	pushl	%edi		FRAME_pushl()
	movl	$0, %edi		C initial carries 0mod3
	jng	L(done)			C if size < 6


L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi	carries into 0mod3
	C ebp

	C The carry chain runs 0mod3 -> 1mod3 -> 2mod3 -> edi.  leal is used
	C to advance src because it leaves the carry flag untouched.
	addl	12(%edx), %eax
	adcl	16(%edx), %ebx
	adcl	20(%edx), %esi
	leal	12(%edx), %edx
	adcl	$0, %edi

	subl	$3, %ecx
	jg	L(top)			C at least 3 more to process


L(done):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
	cmpl	$-1, %ecx
	jl	L(done_0)		C if -2, meaning 0 more limbs

	C 1 or 2 more limbs
	C movl keeps the flags from the cmpl above for the je.
	movl	$0, %ecx		C 1mod3 addend when only 1 more limb
	je	L(done_1)		C if -1, meaning 1 more limb only
	movl	16(%edx), %ecx		C second remaining limb, a 1mod3 limb
L(done_1):
	addl	12(%edx), %eax		C 0mod3
	adcl	%ecx, %ebx		C 1mod3
	adcl	$0, %esi		C 2mod3
	adcl	$0, %edi		C carries 0mod3

L(done_0):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	carries 0mod3
	C ebp

	C Fold everything into eax:
	C   0mod3 and carries, weight 1:  high 8 bits + low 24 bits
	C   1mod3, weight 2^8:            high 16 bits + (low 16 bits) << 8
	C   2mod3, weight 2^16:           high 24 bits + (low 8 bits) << 16
	C The register restores are interleaved with the arithmetic.

	movl	%eax, %ecx		C 0mod3
	shrl	$24, %eax		C 0mod3 high initial total

	andl	$0xFFFFFF, %ecx		C 0mod3 low
	movl	%edi, %edx		C carries
	shrl	$24, %edi		C carries high

	addl	%ecx, %eax		C add 0mod3 low
	andl	$0xFFFFFF, %edx		C carries 0mod3 low
	movl	%ebx, %ecx		C 1mod3

	shrl	$16, %ebx		C 1mod3 high
	addl	%edi, %eax		C add carries high
	addl	%edx, %eax		C add carries 0mod3 low

	andl	$0xFFFF, %ecx		C 1mod3 low mask
	addl	%ebx, %eax		C add 1mod3 high
	movl	SAVE_EBX, %ebx

	shll	$8, %ecx		C 1mod3 low
	movl	%esi, %edx		C 2mod3
	popl	%edi		FRAME_popl()

	shrl	$8, %esi		C 2mod3 high
	andl	$0xFF, %edx		C 2mod3 low mask
	addl	%ecx, %eax		C add 1mod3 low

	shll	$16, %edx		C 2mod3 low
	addl	%esi, %eax		C add 2mod3 high
	movl	SAVE_ESI, %esi

	addl	%edx, %eax		C add 2mod3 low

	ret

EPILOGUE()