dnl  Source listing: github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/nano/dive_1.asm

dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb
C               norm    unorm
C AMD K8,K9     11      11
C AMD K10       11      11
C Intel P4       ?
C Intel core2   13.5    13.25
C Intel corei   14.25
C Intel atom    34      36
C VIA nano      19.25   19.25


C INPUT PARAMETERS
C rp       rdi
C up       rsi
C n        rdx
C divisor  rcx

C mpn_divexact_1 -- mpn by limb exact division.
C
C Register roles once setup is finished, as used by the code below:
C   rcx  shift count: bit index of the divisor's lowest set bit, 0 when the
C        divisor is odd
C   r11  divisor with its low zero bits shifted out
C   r10  inverse of r11, grown from a table byte by three Newton steps
C   r8   negative limb index, incremented until it reaches zero
C   rsi  up + 8*n
C   rdi  rp + 8*n - 8
C   rbx  carry bit handed from one limb to the next
C   rdx  carry limb, the high half produced by mul
C   r9   shifted path: low part of the next source limb; at the tail it
C        holds the value used for the top limb
C
C rbx is pushed on entry and popped again on both exit paths.
C Trailing numbers in the loop comments are carried over unchanged from the
C original annotations.

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_divexact_1)
        FUNC_ENTRY(4)
        push    %rbx

        mov     %rcx, %rax              C rax = divisor
        xor     R32(%rcx), R32(%rcx)    C shift count starts at 0
        mov     %rdx, %r8               C r8 = n

        bt      $0, R32(%rax)           C low divisor bit -> carry flag
        jc      L(dodd)                 C odd divisor: nothing to strip
        bsf     %rax, %rcx              C rcx = index of lowest set bit
        shr     R8(%rcx), %rax          C drop the low zero bits
L(dodd):
        mov     %rax, %rbx              C rbx = d, the stripped divisor
        shr     R32(%rax)
        and     $127, R32(%rax)         C table index = d/2, 7 bits

        LEA(    binvert_limb_table, %rdx)

        movzbl  (%rdx,%rax), R32(%rax)  C inv, 8 bits, from the table

        mov     %rbx, %r11              C r11 = d for the mul in the loops

C Three Newton steps, inv = 2*inv - inv*inv*d, each doubling the number
C of valid bits.
        lea     (%rax,%rax), R32(%rdx)  C 2*inv
        imul    R32(%rax), R32(%rax)    C inv*inv
        imul    R32(%rbx), R32(%rax)    C inv*inv*d
        sub     R32(%rax), R32(%rdx)    C rdx = inv, 16 bits

        lea     (%rdx,%rdx), R32(%rax)  C 2*inv
        imul    R32(%rdx), R32(%rdx)    C inv*inv
        imul    R32(%rbx), R32(%rdx)    C inv*inv*d
        sub     R32(%rdx), R32(%rax)    C rax = inv, 32 bits

        lea     (%rax,%rax), %r10       C 2*inv
        imul    %rax, %rax              C inv*inv
        imul    %rbx, %rax              C inv*inv*d
        sub     %rax, %r10              C r10 = inv, 64 bits

        lea     (%rsi,%r8,8), %rsi      C rsi = up + 8*n
        lea     -8(%rdi,%r8,8), %rdi    C rdi = rp + 8*n - 8
        neg     %r8                     C r8 = -n

        mov     (%rsi,%r8,8), %rax      C rax = up[0]

        inc     %r8                     C r8 = 1 - n
        jz      L(n1)                   C n == 1: single-limb exit

        test    R32(%rcx), R32(%rcx)
        jnz     L(shift)                C count != 0: shifted path
        xor     R32(%rbx), R32(%rbx)    C no carry bit yet
        jmp     L(nin)

C Loop for shift count 0: source limbs are used as they are.
        ALIGN(8)
L(nloop):
        mul     %r11                    C rdx = carry limb              0 10
        mov     -8(%rsi,%r8,8), %rax    C next source limb
        sub     %rbx, %rax              C take off the carry bit
        setc    %bl                     C remember that borrow
        sub     %rdx, %rax              C take off the carry limb       5
        adc     $0, %rbx                C add this borrow to rbx        6
L(nin):
        imul    %r10, %rax              C quotient limb = rax * inv     6
        mov     %rax, (%rdi,%r8,8)      C store it
        inc     %r8
        jnz     L(nloop)

        mov     -8(%rsi), %r9           C r9 = high source limb
        jmp     L(last)

C Shift count != 0.  neg on ecx flips cl between count and -count; a 64-bit
C shift only uses the low 6 bits of cl, so -count shifts by 64 - count.
L(shift):
        mov     (%rsi,%r8,8), %r9       C r9 = up[1]
        shr     R8(%rcx), %rax          C up[0] >> count
        neg     R32(%rcx)
        shl     R8(%rcx), %r9           C up[1] << (64 - count)
        neg     R32(%rcx)               C cl back to count
        or      %r9, %rax               C first shifted source limb
        xor     R32(%rbx), R32(%rbx)    C no carry bit yet
        jmp     L(sin)

        ALIGN(8)
L(sloop):
        mul     %r11                    C rdx = carry limb              0 10
        mov     (%rsi,%r8,8), %rax      C next source limb
        shl     R8(%rcx), %rax          C its low bits moved to the top
        neg     R32(%rcx)               C cl back to count
        or      %r9, %rax               C merge with the previous high part
        sub     %rbx, %rax              C take off the carry bit
        setc    %bl                     C remember that borrow
        sub     %rdx, %rax              C take off the carry limb       5
        adc     $0, %rbx                C add this borrow to rbx        6
L(sin):
        imul    %r10, %rax              C quotient limb = rax * inv     6
        mov     (%rsi,%r8,8), %r9       C reload the current source limb
        shr     R8(%rcx), %r9           C keep its high part for next round
        neg     R32(%rcx)               C cl = -count for the next shl
        mov     %rax, (%rdi,%r8,8)      C store the quotient limb
        inc     %r8
        jnz     L(sloop)

C Shared tail: produce the top quotient limb from r9.
L(last):
        mul     %r11                    C rdx = carry limb
        sub     %rbx, %r9               C take off the carry bit
        sub     %rdx, %r9               C take off the carry limb
        imul    %r10, %r9               C top quotient limb
        mov     %r9, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

C n == 1: shift the only limb and multiply by the inverse.
L(n1):
        shr     R8(%rcx), %rax
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()