dnl  Source: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/mod_34lsub1.asm

dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 1.0
C AMD K10	 1.12
C Intel P4	 3.25
C Intel core2	 1.5
C Intel corei	 1.5
C Intel atom	 2.5
C VIA nano	 1.75


C INPUT PARAMETERS
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
C    sbb to placate Pentium4.
C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
C    without the dual loop exits.
C ----------------------------------------------------------------------------
C mpn_mod_34lsub1 -- fold the 64-bit limbs at ap into a single limb that is
C congruent to the whole operand modulo 2^48-1.
C
C In:    %rdi (ap)  pointer to the limbs, least significant limb first
C        %rsi       limb count
C Out:   %rax       result; it is a plain sum of pieces and is not reduced
C                   below 2^48-1 here (for a count of 1 the limb is returned
C                   unchanged)
C Writes %rax %rcx %rdx %rsi %rdi %r8 %r9 %r10 %r11 and the flags.
C
C Method: 2^48 == 1 (mod 2^48-1), hence 2^64 == 2^16, 2^128 == 2^32 and
C 2^192 == 1.  Limbs are therefore summed in three classes by index mod 3,
C each class with its own accumulator and its own count of carry-outs:
C
C        index mod 3   sum    carry count   weight of the sum
C            0         %rax   %r10          1
C            1         %rcx   %r8           2^16
C            2         %rdx   %r9           2^32
C
C A carry out of one accumulator is worth 2^64 times that accumulator's
C weight, i.e. it belongs in the next class (0 -> 1 -> 2 -> 0).
C %r11 holds the constant 2^48-1 throughout.
C
C NOTE(review): the count is presumably required to be at least 1; with a
C count of 0 the code below still loads from (ap) and returns that limb.
C NOTE(review): ABI_SUPPORT, FUNC_ENTRY and FUNC_EXIT come from config.m4,
C which is not visible here; presumably they adapt the DOS64 calling
C convention to the %rdi/%rsi register use below -- TODO confirm.
C ----------------------------------------------------------------------------

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C r11 = 2^48-1, low 48 bit mask

	sub	$2, %rsi		C rsi = count - 2, flags reused below
	ja	L(gt2)			C count > 2: general summation path

	mov	(ap), %rax		C rax = src[0], mov keeps the flags
	nop				C no effect on state; purpose not evident here
	jb	L(1)			C borrow from the sub: count < 2, return src[0]

C Exactly two limbs: src[0] + src[1] * 2^64, with 2^64 == 2^16.
	mov	8(ap), %rsi		C rsi = src[1]
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high 16 bits, weight 2^48 == 1

	and	%r11, %rdx		C src[0] low 48 bits
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C src[1] low 32 bits, zero extended

	shr	$32, %rsi		C src[1] high 32 bits, weight 2^96 == 1
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low 32 bits moved to weight 2^16
	add	%rdx, %rax

L(1):	FUNC_EXIT()
	ret


	ALIGN(16)
C General path, entered with rsi = count - 2 >= 1.  Clear the three sums
C and the three carry counters.
L(gt2):	xor	R32(%rax), R32(%rax)
	xor	R32(%rcx), R32(%rcx)
	xor	R32(%rdx), R32(%rdx)
	xor	%r8, %r8
	xor	%r9, %r9
	xor	%r10, %r10

C Loop invariant at L(top): rsi = (limbs not yet added) - 2, and at least
C three limbs remain.  Each pass adds up to six limbs, three per half, and
C has one exit after each half.
L(top):	add	(ap), %rax		C 0mod3 limb
	adc	$0, %r10		C count carry out of rax
	add	8(ap), %rcx		C 1mod3 limb
	adc	$0, %r8			C count carry out of rcx
	add	16(ap), %rdx		C 2mod3 limb
	adc	$0, %r9			C count carry out of rdx

	sub	$3, %rsi
	jng	L(end)			C 2 or fewer limbs left; ap not advanced,
					C so they sit at 24(ap) and 32(ap)

	add	24(ap), %rax		C second half, same pattern
	adc	$0, %r10
	add	32(ap), %rcx
	adc	$0, %r8
	add	40(ap), %rdx
	lea	48(ap), ap		C advance 6 limbs; lea keeps CF for the adc
	adc	$0, %r9

	sub	$3, %rsi
	jg	L(top)			C more than 2 limbs left: another pass


	add	$-24, ap		C rebase so leftovers are at 24(ap), 32(ap),
					C the same layout as the first exit
C Here rsi = (limbs left) - 2, one of -2, -1, 0.  Pass each carry count to
C the next class; the add/adc chain also propagates the carries these
C additions create.  CF after the last adc is a carry out of rdx, worth
C 2^96 == 1.
L(end):	add	%r9, %rax		C rdx carries wrap round to class 0
	adc	%r10, %rcx		C rax carries go to class 1
	adc	%r8, %rdx		C rcx carries go to class 2

C Add the 0, 1 or 2 leftover limbs.  inc, dec and mov leave CF untouched, so
C the pending carry rides through the adc chain.  r10 is kept equal to the
C weight of whatever carry is pending when L(combine) is reached.
	inc	%rsi			C sets SF only when no limb is left
	mov	$0x1, R32(%r10)		C pending carry is out of rdx: weight 1
	js	L(combine)

	mov	$0x10000, R32(%r10)	C carry will be out of rax: weight 2^16
	adc	24(ap), %rax		C leftover 0mod3 limb plus pending carry
	dec	%rsi			C sets SF when that was the last limb
	js	L(combine)

	adc	32(ap), %rcx		C leftover 1mod3 limb plus pending carry
	mov	$0x100000000, %r10	C carry is out of rcx: weight 2^32

C Fold rax + rcx * 2^16 + rdx * 2^32 + (pending carry) into rax, splitting
C each sum at the bit that reaches 2^48 and wrapping the upper part to
C weight 1.  rsi and rdi are free to use as scratch from here on.
L(combine):
	sbb	%rsi, %rsi		C carry: all ones if CF set, else 0
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high 16 bits

	and	%r10, %rsi		C carry masked to its weight
	and	%r11, %rdi		C 0mod3 low 48 bits
	mov	R32(%rcx), R32(%r10)	C 1mod3 low 32 bits, zero extended

	add	%rsi, %rax		C apply carry
	shr	$32, %rcx		C 1mod3 high 32 bits

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3 low 16 bits, zero extended
	shl	$16, %r10		C 1mod3 low moved to weight 2^16

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high 48 bits

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low moved to weight 2^32

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()