github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/logops_n.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/logops_n.asm (about)

     1  dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
     2  dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
     3  
     4  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  NAILS_SUPPORT(0-31)
    35  
    36  
    37  C         alignment dst/src1/src2, A=0mod8, N=4mod8
    38  C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
    39  C
    40  C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
    41  C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
    42  C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
    43  C
    44  C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
    45  C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
    46  C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
    47  
    48  
    49  dnl  M4_choose_op takes effect only when OPERATION_<argument 1> is defined.  M4_p and M4_i are the MMX and integer instructions (arguments 2 and 5)
    50  dnl  M4_*_neg_dst (arguments 3 and 6) is 1 to complement the final result before writing, else 0
    51  dnl  M4_*_neg_src2 (arguments 4 and 7) is 1 to complement the src2 values before using them, else 0
    52  
    53  define(M4_choose_op,
    54  m4_assert_numargs(7)
    55  `ifdef(`OPERATION_$1',`
    56  define(`M4_function',  `mpn_$1')
    57  define(`M4_operation', `$1')
    58  define(`M4_p',         `$2')
    59  define(`M4_p_neg_dst', `$3')
    60  define(`M4_p_neg_src2',`$4')
    61  define(`M4_i',         `$5')
    62  define(`M4_i_neg_dst', `$6')
    63  define(`M4_i_neg_src2',`$7')
    64  ')')
    65  
    66  dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
    67  dnl  style (the two are equivalent for xor).
    68  dnl
    69  dnl  pandn can't be used with nails.
    70  
    71  M4_choose_op( and_n,  pand,0,0,  andl,0,0)	C dst = src1 AND src2
    72  ifelse(GMP_NAIL_BITS,0,
    73  `M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
    74  `M4_choose_op(andn_n, pand,0,1,  andl,0,1)')	C dst = src1 AND NOT src2
    75  M4_choose_op( nand_n, pand,1,0,  andl,1,0)	C dst = NOT of src1 AND src2
    76  M4_choose_op( ior_n,  por,0,0,   orl,0,0)	C dst = src1 OR src2
    77  M4_choose_op( iorn_n, por,0,1,   orl,0,1)	C dst = src1 OR NOT src2
    78  M4_choose_op( nior_n, por,1,0,   orl,1,0)	C dst = NOT of src1 OR src2
    79  M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)	C dst = src1 XOR src2
    80  M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)	C dst = src1 XOR NOT src2
    81  
    82  ifdef(`M4_function',,
    83  `m4_error(`Unrecognised or undefined OPERATION symbol
    84  ')')
    85  
    86  MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
    87  
    88  
    89  C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
    90  C                   mp_size_t size);
    91  C
    92  C Do src1,size M4_operation src2,size, storing the result in dst,size.
    93  C
    94  C Unaligned movq loads and stores are a bit slower than aligned ones.  The
    95  C test at the start of the routine checks the alignment of src1 and if
    96  C necessary processes one limb separately at the low end to make it aligned.
    97  C
    98  C The raw speeds without this alignment switch are as follows.
    99  C
   100  C           alignment dst/src1/src2, A=0mod8, N=4mod8
   101  C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
   102  C
   103  C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
   104  C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
   105  C K6                 2.0    2.25                2.35   2.28   nand,nior
   106  C
   107  C
   108  C Future:
   109  C
   110  C K6 can do one 64-bit load per cycle so each of these routines should be
   111  C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
   112  C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
   113  C The others are 4 instructions per 2 limbs, and so can only approach 1.0
   114  C because there's nowhere to hide some loop control.
   115  
   116  defframe(PARAM_SIZE,16)	C 4th prototype argument: size
   117  defframe(PARAM_SRC2,12)	C 3rd prototype argument: src2
   118  defframe(PARAM_SRC1,8)	C 2nd prototype argument: src1
   119  defframe(PARAM_DST, 4)	C 1st prototype argument: dst
   120  deflit(`FRAME',0)	C NOTE(review): presumably the running stack adjustment that FRAME_pushl updates -- confirm in x86-defs.m4
   121  
   122  	TEXT
   123  	ALIGN(32)
   124  PROLOGUE(M4_function)
   125  			movl	PARAM_SIZE, %ecx		C ecx = size in limbs
   126  			pushl	%ebx		FRAME_pushl()	C save ebx, it is restored before every ret
   127  
   128  			movl	PARAM_SRC1, %eax		C eax = src1
   129  
   130  			movl	PARAM_SRC2, %ebx		C ebx = src2
   131  			cmpl	$1, %ecx		C flags from this compare are consumed by the ja below
   132  
   133  			movl	PARAM_DST, %edx		C edx = dst, movl leaves the flags alone
   134  			ja	L(two_or_more)		C unsigned size above 1
   135  
   136  
   137  			movl	(%ebx), %ecx		C size not above 1: one limb done with integer code, ecx = src2 limb
   138  			popl	%ebx		C restore ebx
   139  ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
   140  			M4_i	(%eax), %ecx		C combine with the src1 limb
   141  ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
   142  			movl	%ecx, (%edx)		C store the result limb to dst
   143  
   144  			ret
   145  
   146  
   147  L(two_or_more):
   148  			C eax	src1
   149  			C ebx	src2
   150  			C ecx	size
   151  			C edx	dst
   152  			C esi
   153  			C edi
   154  			C ebp
   155  
   156  			pushl	%esi		FRAME_pushl()	C save esi, used as scratch and then as the saved size
   157  			testl	$4, %eax		C is bit 2 of the src1 address set, the 4mod8 case
   158  			jz	L(alignment_ok)		C bit clear, no separate low limb needed
   159  
   160  			movl	(%ebx), %esi		C low limb of src2
   161  			addl	$4, %ebx		C step src2 past that limb
   162  ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
   163  			M4_i	(%eax), %esi		C combine with the low limb of src1
   164  			addl	$4, %eax		C step src1 past that limb
   165  ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
   166  			movl	%esi, (%edx)		C store the low result limb
   167  			addl	$4, %edx		C step dst past that limb
   168  			decl	%ecx		C one limb fewer remains
   169  
   170  L(alignment_ok):
   171  			movl	%ecx, %esi		C esi = remaining size, kept for the odd limb at the end
   172  			shrl	%ecx		C ecx = number of limb pairs, carry = low bit of remaining size
   173  			jnz	L(still_two_or_more)		C at least one pair remains
   174  
   175  			movl	(%ebx), %ecx		C exactly one limb remains, ecx = src2 limb
   176  			popl	%esi		C restore esi
   177  ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
   178  			M4_i	(%eax), %ecx		C combine with the src1 limb
   179  ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
   180  			popl	%ebx		C restore ebx
   181  			movl	%ecx, (%edx)		C store the result limb, no MMX instruction has run on this path
   182  			ret
   183  
   184  
   185  L(still_two_or_more):
   186  ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
   187  			pcmpeqd	%mm7, %mm7		C all ones
   188  ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
   189  ')
   190  
   191  			ALIGN(16)
   192  L(top):
   193  			C eax	src1
   194  			C ebx	src2
   195  			C ecx	counter
   196  			C edx	dst
   197  			C esi
   198  			C edi
   199  			C ebp
   200  			C
   201  			C carry bit is low of size
   202  
   203  			movq	-8(%ebx,%ecx,8), %mm0		C two src2 limbs, pairs are taken from the high end downwards
   204  ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
   205  			M4_p	-8(%eax,%ecx,8), %mm0		C combine with the matching two src1 limbs
   206  ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
   207  			movq	%mm0, -8(%edx,%ecx,8)		C store two result limbs
   208  
   209  			loop	L(top)		C decrement ecx and repeat while nonzero, flags are not modified
   210  
   211  
   212  			jnc	L(no_extra)		C carry still from the shrl above: clear means no odd limb left
   213  
   214  			movl	-4(%ebx,%esi,4), %ebx		C highest src2 limb, the src2 pointer in ebx is no longer needed
   215  ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
   216  			M4_i	-4(%eax,%esi,4), %ebx		C combine with the highest src1 limb
   217  ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
   218  			movl	%ebx, -4(%edx,%esi,4)		C store the highest result limb
   219  L(no_extra):
   220  
   221  			popl	%esi		C restore esi
   222  			popl	%ebx		C restore ebx
   223  			emms_or_femms		C NOTE(review): presumably emits emms or femms to leave MMX state -- confirm macro definition
   224  			ret
   225  
   226  EPILOGUE()