github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/cnd_aors_n.asm

dnl  AMD64 mpn_cnd_add_n, mpn_cnd_sub_n

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C AMD K8,K9          2
C AMD K10            2
C AMD bd1            2.32
C AMD bobcat         3
C Intel P4          13
C Intel core2        2.9
C Intel NHM          2.8
C Intel SBR          2.4
C Intel atom         5.33
C VIA nano           3

C NOTES
C  * It might seem natural to use the cmov insn here, but since this function
C    is supposed to have the exact same execution pattern for cnd true and
C    false, and since cmov's documentation is not clear about whether it
C    actually reads both source operands and writes the register for a false
C    condition, we cannot use it.
C  * Two cases could be optimised: (1) cnd_add_n could use ADCSBB-from-memory
C    to save one insn/limb, and (2) when up=rp cnd_add_n and cnd_sub_n could
C    use ADCSBB-to-memory, again saving 1 insn/limb.
C  * This runs optimally at decoder bandwidth on K10.  It has not been tuned
C    for any other processor.
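
C The code below never branches on cnd: cnd is first turned into an all-zeros
C or all-ones mask, and each vp limb is ANDed with that mask before entering
C the add/subtract carry chain.  As a rough C-level sketch of the add case
C (cnd_add_n_ref is an illustrative name, not a symbol from this file; the
C sub case is analogous, returning a borrow instead of a carry):
C
C     mp_limb_t
C     cnd_add_n_ref (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp,
C                    mp_size_t n)
C     {
C       mp_limb_t mask = -(mp_limb_t) (cnd != 0);   /* 0 or all ones */
C       mp_limb_t cy = 0;
C       mp_size_t i;
C       for (i = 0; i < n; i++)
C         {
C           mp_limb_t v = vp[i] & mask;
C           mp_limb_t s = up[i] + v;
C           mp_limb_t r = s + cy;
C           cy = (s < v) | (r < s);                 /* carry out */
C           rp[i] = r;
C         }
C       return cy;
C     }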

C INPUT PARAMETERS
define(`cnd', `%rdi')   dnl rcx
define(`rp',  `%rsi')   dnl rdx
define(`up',  `%rdx')   dnl r8
define(`vp',  `%rcx')   dnl r9
define(`n',   `%r8')    dnl rsp+40

ifdef(`OPERATION_cnd_add_n', `
        define(ADDSUB, add)
        define(ADCSBB, adc)
        define(func,   mpn_cnd_add_n)')
ifdef(`OPERATION_cnd_sub_n', `
        define(ADDSUB, sub)
        define(ADCSBB, sbb)
        define(func,   mpn_cnd_sub_n)')

MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(func)
        FUNC_ENTRY(4)
IFDOS(` mov     56(%rsp), R32(%r8)')
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14

        neg     cnd
        sbb     cnd, cnd                C make cnd mask

        lea     (vp,n,8), vp
        lea     (up,n,8), up
        lea     (rp,n,8), rp

        mov     R32(n), R32(%rax)
        neg     n
        and     $3, R32(%rax)
        jz      L(top)                  C carry-save reg rax = 0 in this arc
        cmp     $2, R32(%rax)
        jc      L(b1)
        jz      L(b2)

L(b3):  mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     16(vp,n,8), %r14
        and     cnd, %r12
        mov     (up,n,8), %r10
        and     cnd, %r13
        mov     8(up,n,8), %rbx
        and     cnd, %r14
        mov     16(up,n,8), %rbp
        ADDSUB  %r12, %r10
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        ADCSBB  %r14, %rbp
        mov     %rbp, 16(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $3, n
        js      L(top)
        jmp     L(end)

L(b2):  mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     (up,n,8), %r10
        and     cnd, %r12
        mov     8(up,n,8), %rbx
        and     cnd, %r13
        ADDSUB  %r12, %r10
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $2, n
        js      L(top)
        jmp     L(end)

L(b1):  mov     (vp,n,8), %r12
        mov     (up,n,8), %r10
        and     cnd, %r12
        ADDSUB  %r12, %r10
        mov     %r10, (rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $1, n
        jns     L(end)

        ALIGN(16)
L(top): mov     (vp,n,8), %r12
        mov     8(vp,n,8), %r13
        mov     16(vp,n,8), %r14
        mov     24(vp,n,8), %r11
        and     cnd, %r12
        mov     (up,n,8), %r10
        and     cnd, %r13
        mov     8(up,n,8), %rbx
        and     cnd, %r14
        mov     16(up,n,8), %rbp
        and     cnd, %r11
        mov     24(up,n,8), %r9
        add     R32(%rax), R32(%rax)    C restore carry
        ADCSBB  %r12, %r10
        mov     %r10, (rp,n,8)
        ADCSBB  %r13, %rbx
        mov     %rbx, 8(rp,n,8)
        ADCSBB  %r14, %rbp
        mov     %rbp, 16(rp,n,8)
        ADCSBB  %r11, %r9
        mov     %r9, 24(rp,n,8)
        sbb     R32(%rax), R32(%rax)    C save carry
        add     $4, n
        js      L(top)

L(end): neg     R32(%rax)
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        FUNC_EXIT()
        ret
EPILOGUE()
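
C For reference, a typical caller-side pattern through the public gmp.h
C interface (a sketch only; reduce_once and its arguments are illustrative
C names, not symbols from this file):
C
C     #include <gmp.h>
C
C     /* Subtract {mp,n} from {rp,n} and add it back iff the subtraction
C        borrowed, i.e. rp = rp >= mp ? rp - mp : rp, with a memory access
C        pattern that does not depend on the comparison outcome.  */
C     static void
C     reduce_once (mp_ptr rp, mp_srcptr mp, mp_size_t n)
C     {
C       mp_limb_t bw = mpn_sub_n (rp, rp, mp, n);
C       mpn_cnd_add_n (bw, rp, rp, mp, n);
C     }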