dnl  Listing taken from:
dnl  github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/logops_n.asm
dnl
dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

NAILS_SUPPORT(0-31)


C Speeds in c/l for each combination of operand alignment.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C        A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6-2    1.2    1.5    1.5    1.2    1.2    1.5    1.5    1.2   and,andn,ior,xor
C K6-2    1.5    1.75   2.0    1.75   1.75   2.0    1.75   1.5   iorn,xnor
C K6-2    1.75   2.0    2.0    2.0    2.0    2.0    2.0    1.75  nand,nior
C
C K6      1.5    1.68   1.75   1.2    1.75   1.75   1.68   1.5   and,andn,ior,xor
C K6      2.0    2.0    2.25   2.25   2.25   2.25   2.0    2.0   iorn,xnor
C K6      2.0    2.25   2.25   2.25   2.25   2.25   2.25   2.0   nand,nior


dnl  Usage: M4_choose_op(name, p,p_neg_dst,p_neg_src2, i,i_neg_dst,i_neg_src2)
dnl
dnl  Does nothing unless OPERATION_<name> is defined.  When it is, the
dnl  following symbols are set up for the rest of the file:
dnl
dnl    M4_function      mpn_<name>
dnl    M4_operation     <name>
dnl    M4_p, M4_i       the MMX instruction and the integer instruction
dnl    M4_*_neg_dst     whether to negate the final result before writing it
dnl    M4_*_neg_src2    whether to negate the src2 values before using them
dnl
dnl  The M4_p_* symbols belong to the MMX code and the M4_i_* symbols to the
dnl  integer code.

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function',  `mpn_$1')
define(`M4_operation', `$1')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
')')

dnl  Notes on the choices made in the operation table:
dnl
dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.
dnl  Operation table.  Argument order on every line is
dnl    name,  MMX insn, neg_dst, neg_src2,  integer insn, neg_dst, neg_src2
dnl  andn_n has two variants: pandn when GMP_NAIL_BITS is 0, otherwise pand
dnl  applied to a negated src2.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

dnl  If none of the lines above matched, M4_function was never defined.

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Compute src1,size M4_operation src2,size and store the result at dst,size.
C
C Aligned movq loads and stores are a bit faster than unaligned ones.  For
C that reason the routine starts by testing the alignment of src1, and when
C necessary handles one limb on its own at the low end so that src1 becomes
C aligned.
C
C Without that alignment switch the raw speeds are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C        A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6      1.5    2.0    1.5    2.0     and,andn,ior,xor
C K6      1.75   2.2    2.0    2.28    iorn,xnor
C K6      2.0    2.25   2.35   2.28    nand,nior
C
C NOTE(review): only four figures per row are present and their positions
C under the eight headings are not recoverable from this listing; confirm
C the column placement against the upstream file.
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.
C Stack parameters, as offsets from the stack pointer at entry.
defframe(PARAM_DST, 4)
defframe(PARAM_SRC1,8)
defframe(PARAM_SRC2,12)
defframe(PARAM_SIZE,16)
deflit(`FRAME',0)

C M4_function (dst, src1, src2, size)
C
C Layout of the routine:
C   1. size of 1 or less: one limb with the integer instruction, return.
C   2. otherwise, if src1 has bit 2 set, one limb is done with the integer
C      instruction and all three pointers step forward by 4 bytes.
C   3. if just one limb then remains, it is done with the integer
C      instruction, return.
C   4. otherwise limbs are handled two at a time with movq and the MMX
C      instruction, followed by one integer limb when the count is odd.
C
C ebx and esi are pushed before use and popped again on every return path.
C The instruction order below is kept exactly as scheduled in the original.

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC1, %eax

	movl	PARAM_SRC2, %ebx
	cmpl	$1, %ecx

	movl	PARAM_DST, %edx
	ja	L(multi_limb)		C unsigned test, taken when size > 1


	C Single limb.  ebx is finished with as soon as the src2 limb has been
	C fetched, so it is restored straight away.

	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	movl	%ecx, (%edx)

	ret


L(multi_limb):
	C eax	src1
	C ebx	src2
	C ecx	size
	C edx	dst
	C esi	not yet in use, saved next
	C edi	unused
	C ebp	unused

	pushl	%esi		FRAME_pushl()
	testl	$4, %eax		C is src1 at 4 mod 8?
	jz	L(src1_aligned)

	C One limb done here, esi as scratch, then every pointer moves on a
	C limb and the count drops by one.

	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(src1_aligned):
	C esi keeps the limb count, ecx becomes the count of limb pairs, and
	C the bit shifted out of ecx lands in the carry flag.

	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(pairs)

	C No pairs, so exactly one limb is left.

	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	popl	%ebx
	movl	%ecx, (%edx)
	ret


L(pairs):
	C mm7 is the mask for the pxor negations, and is set up only for the
	C operations which negate something in the MMX code.
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

	ALIGN(16)
L(pair_loop):
	C eax	src1
	C ebx	src2
	C ecx	counter, limb pairs still to do, counting down
	C edx	dst
	C esi	limb count
	C edi	unused
	C ebp	unused
	C
	C carry bit is low of size, and must still be intact after the loop

	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	loop	L(pair_loop)


	jnc	L(finish)		C even count, nothing left over

	C Odd count, so the top limb (index esi-1) is still to do.  ebx is
	C reused as scratch, its src2 pointer not being needed afterwards.

	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	movl	%ebx, -4(%edx,%esi,4)
L(finish):

	popl	%esi
	popl	%ebx
	emms_or_femms
	ret

EPILOGUE()