github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/dive_1.asm

dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004-2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb
C AMD K8,K9     10
C AMD K10       10
C Intel P4      33
C Intel core2   13.25
C Intel corei   14
C Intel atom    42
C VIA nano      43

C A quick adoption of the 32-bit K7 code.


C INPUT PARAMETERS
C rp            rdi
C up            rsi
C n             rdx
C divisor       rcx

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_divexact_1)
        FUNC_ENTRY(4)
        push    %rbx

        mov     %rcx, %rax
        xor     R32(%rcx), R32(%rcx)    C shift count
        mov     %rdx, %r8

        bt      $0, R32(%rax)
        jnc     L(evn)                  C skip bsfq unless divisor is even

L(odd): mov     %rax, %rbx
        shr     R32(%rax)
        and     $127, R32(%rax)         C d/2, 7 bits

        LEA(    binvert_limb_table, %rdx)

        movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits

        mov     %rbx, %r11              C d without twos

        lea     (%rax,%rax), R32(%rdx)  C 2*inv
        imul    R32(%rax), R32(%rax)    C inv*inv
        imul    R32(%rbx), R32(%rax)    C inv*inv*d
        sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits

        lea     (%rdx,%rdx), R32(%rax)  C 2*inv
        imul    R32(%rdx), R32(%rdx)    C inv*inv
        imul    R32(%rbx), R32(%rdx)    C inv*inv*d
        sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits

        lea     (%rax,%rax), %r10       C 2*inv
        imul    %rax, %rax              C inv*inv
        imul    %rbx, %rax              C inv*inv*d
        sub     %rax, %r10              C inv = 2*inv - inv*inv*d, 64 bits

        lea     (%rsi,%r8,8), %rsi      C up end
        lea     -8(%rdi,%r8,8), %rdi    C rp end
        neg     %r8                     C -n

        mov     (%rsi,%r8,8), %rax      C up[0]

        inc     %r8
        jz      L(one)

        mov     (%rsi,%r8,8), %rdx      C up[1]

        shrd    R8(%rcx), %rdx, %rax

        xor     R32(%rbx), R32(%rbx)
        jmp     L(ent)

L(evn): bsf     %rax, %rcx
        shr     R8(%rcx), %rax
        jmp     L(odd)

        ALIGN(8)
L(top):
        C rax   q
        C rbx   carry bit, 0 or 1
        C rcx   shift
        C rdx
        C rsi   up end
        C rdi   rp end
        C r8    counter, limbs, negative
        C r10   d^(-1) mod 2^64
        C r11   d, shifted down

        mul     %r11                    C carry limb in rdx       0 10
        mov     -8(%rsi,%r8,8), %rax    C
        mov     (%rsi,%r8,8), %r9       C
        shrd    R8(%rcx), %r9, %rax     C
        nop                             C
        sub     %rbx, %rax              C apply carry bit
        setc    %bl                     C
        sub     %rdx, %rax              C apply carry limb        5
        adc     $0, %rbx                C                         6
L(ent): imul    %r10, %rax              C                         6
        mov     %rax, (%rdi,%r8,8)      C
        inc     %r8                     C
        jnz     L(top)

        mul     %r11                    C carry limb in rdx
        mov     -8(%rsi), %rax          C up high limb
        shr     R8(%rcx), %rax
        sub     %rbx, %rax              C apply carry bit
        sub     %rdx, %rax              C apply carry limb
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

L(one): shr     R8(%rcx), %rax
        imul    %r10, %rax
        mov     %rax, (%rdi)
        pop     %rbx
        FUNC_EXIT()
        ret

EPILOGUE()
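
The routine divides an n-limb number exactly by a single limb: for an odd divisor d it multiplies each borrow-adjusted limb by d^(-1) mod 2^64, seeded from the 8-bit binvert_limb_table lookup and refined by Newton steps (inv = 2*inv - inv*inv*d) that double the number of correct bits; an even divisor is first shifted down by its trailing zero bits, with shrd pulling the shifted-out bits in from the next limb. Below is a rough C sketch of the odd-divisor path only, not GMP code: the names binv64 and divexact_odd are illustrative, a computed 5-bit seed stands in for the table, and 64-bit limbs plus the GCC/Clang unsigned __int128 extension are assumed.

    #include <stdint.h>
    #include <stddef.h>

    /* d^(-1) mod 2^64 for odd d: 5-bit seed, then Newton steps, each of
       which doubles the number of correct low bits (5 -> 10 -> 20 -> 40 -> 80).
       The asm above instead seeds 8 bits from binvert_limb_table. */
    uint64_t binv64(uint64_t d)
    {
        uint64_t inv = (3 * d) ^ 2;      /* correct to ~5 bits */
        inv *= 2 - d * inv;              /* 10 bits */
        inv *= 2 - d * inv;              /* 20 bits */
        inv *= 2 - d * inv;              /* 40 bits */
        inv *= 2 - d * inv;              /* >= 64 bits */
        return inv;
    }

    /* qp[] = up[] / d over n limbs, for odd d that divides exactly. */
    void divexact_odd(uint64_t *qp, const uint64_t *up, size_t n, uint64_t d)
    {
        uint64_t inv = binv64(d);
        uint64_t borrow = 0;             /* carry limb from the previous step */
        for (size_t i = 0; i < n; i++) {
            uint64_t s = up[i] - borrow;             /* may wrap around */
            uint64_t q = s * inv;                    /* exact quotient limb */
            qp[i] = q;
            /* next borrow = high limb of q*d, plus 1 if the subtraction wrapped */
            unsigned __int128 p = (unsigned __int128)q * d;
            borrow = (uint64_t)(p >> 64) + (up[i] < borrow);
        }
    }

The real routine additionally folds the even-divisor case into the same loop through the rcx shift count (the shrd in L(top)), and skips the final full multiply for the top limb; the sketch omits both.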