github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/aors_n.asm (about)

     1  dnl  AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
     2  
     3  dnl  Copyright 1999-2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K7: 1.64 cycles/limb (at 16 limbs/loop).
    35  
    36  
    37  
    38  dnl  K7: UNROLL_COUNT cycles/limb
    39  dnl           8           1.9
    40  dnl          16           1.64
    41  dnl          32           1.7
    42  dnl          64           2.0
    43  dnl  Maximum possible with the current code is 64.
    44  dnl  16 is used: it has the lowest cycles/limb figure in the table above.
    45  deflit(UNROLL_COUNT, 16)
    46  
    47  dnl  Pick the carry-propagating instruction (adcl or sbbl) and the function names from whichever of OPERATION_add_n / OPERATION_sub_n is defined; m4_error is invoked if neither is.
    48  ifdef(`OPERATION_add_n', `
    49  	define(M4_inst,        adcl)
    50  	define(M4_function_n,  mpn_add_n)
    51  	define(M4_function_nc, mpn_add_nc)
    52  	define(M4_description, add)
    53  ',`ifdef(`OPERATION_sub_n', `
    54  	define(M4_inst,        sbbl)
    55  	define(M4_function_n,  mpn_sub_n)
    56  	define(M4_function_nc, mpn_sub_nc)
    57  	define(M4_description, subtract)
    58  ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
    59  ')')')
    60  dnl  Lists the four function names this one source can be built as (presumably read by the build machinery; confirm where MULFUNC_PROLOGUE is defined).
    61  MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
    62  
    63  
    64  C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
    65  C                         mp_size_t size);
    66  C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
    67  C	                   mp_size_t size, mp_limb_t carry);
    68  C
    69  C Calculate src1,size M4_description src2,size, and store the result in
    70  C dst,size.  The return value is the carry bit from the top of the result (1
    71  C or 0).
    72  C
    73  C The _nc version accepts 1 or 0 for an initial carry into the low limb of
    74  C the calculation.  Note values other than 1 or 0 here will lead to garbage
    75  C results.
    76  C
    77  C This code runs at 1.64 cycles/limb, which might be the best possible with
    78  C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
    79  C which can be done each cycle, leading to 1.5 c/l.
    80  dnl  Sizes of at least UNROLL_THRESHOLD limbs take the unrolled loop, smaller sizes the simple loop.  The PIC and non-PIC values are currently identical.
    81  dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
    82  ifdef(`PIC',`
    83  deflit(UNROLL_THRESHOLD, 8)
    84  ',`
    85  deflit(UNROLL_THRESHOLD, 8)
    86  ')
    87  dnl  Argument slots in prototype order: dst at 4 up to the _nc carry at 20 (presumably byte offsets from %esp at entry, confirm against defframe).
    88  defframe(PARAM_CARRY,20)
    89  defframe(PARAM_SIZE, 16)
    90  defframe(PARAM_SRC2, 12)
    91  defframe(PARAM_SRC1, 8)
    92  defframe(PARAM_DST,  4)
    93  dnl  Save slots for ebp, esi, ebx and edi, covered by the STACK_SPACE bytes the code subtracts from %esp.
    94  defframe(SAVE_EBP, -4)
    95  defframe(SAVE_ESI, -8)
    96  defframe(SAVE_EBX, -12)
    97  defframe(SAVE_EDI, -16)
    98  deflit(STACK_SPACE, 16)
    99  dnl  FRAME is 0 here and is set to STACK_SPACE after the subl on %esp; presumably it tells defframe how far %esp has moved (confirm).
   100  	TEXT
   101  	ALIGN(32)
   102  deflit(`FRAME',0)
   103  	C Entry point taking an explicit carry-in as a fifth argument.
   104  PROLOGUE(M4_function_nc)
   105  	movl	PARAM_CARRY, %eax	C eax = carry-in, documented as 0 or 1
   106  	jmp	L(start)		C join the shared body after its carry clear
   107  EPILOGUE()
   108  	C Entry point without a carry argument; the _nc entry point joins at L(start).
   109  PROLOGUE(M4_function_n)
   110  
   111  	xorl	%eax, %eax	C carry-in = 0
   112  L(start):
   113  	movl	PARAM_SIZE, %ecx	C ecx = size in limbs
   114  	subl	$STACK_SPACE, %esp	C make room for the register save slots
   115  deflit(`FRAME',STACK_SPACE)
   116  
   117  	movl	%edi, SAVE_EDI
   118  	movl	%ebx, SAVE_EBX
   119  	cmpl	$UNROLL_THRESHOLD, %ecx	C tested by the jae below; the loads in between leave the flags alone
   120  
   121  	movl	PARAM_SRC2, %edx
   122  	movl	PARAM_SRC1, %ebx
   123  	jae	L(unroll)		C size >= UNROLL_THRESHOLD, unsigned compare
   124  
   125  	movl	PARAM_DST, %edi
   126  	leal	(%ebx,%ecx,4), %ebx	C ebx = end of src1
   127  	leal	(%edx,%ecx,4), %edx	C edx = end of src2
   128  
   129  	leal	(%edi,%ecx,4), %edi	C edi = end of dst
   130  	negl	%ecx			C ecx = -size, counts up to zero
   131  	shrl	%eax			C carry flag = low bit of the carry-in; done last so the loop starts with it
   132  
   133  	C This loop is in a single 16 byte code block already, so no
   134  	C alignment necessary.
   135  L(simple):
   136  	C eax	scratch
   137  	C ebx	src1
   138  	C ecx	counter
   139  	C edx	src2
   140  	C esi
   141  	C edi	dst
   142  	C ebp
   143  
   144  	movl	(%ebx,%ecx,4), %eax	C limb from src1
   145  	M4_inst	(%edx,%ecx,4), %eax	C combine with the src2 limb and the carry flag
   146  	movl	%eax, (%edi,%ecx,4)	C store to dst
   147  	incl	%ecx			C incl does not touch the carry flag, so it chains to the next limb
   148  	jnz	L(simple)
   149  
   150  	movl	$0, %eax		C movl rather than xorl, which would clear the carry flag
   151  	movl	SAVE_EDI, %edi
   152  
   153  	movl	SAVE_EBX, %ebx
   154  	setc	%al			C return value = carry out of the top limb
   155  	addl	$STACK_SPACE, %esp
   156  
   157  	ret
   158  
   159  
   160  C -----------------------------------------------------------------------------
   161  	C This is at 0x55, close enough to aligned.
   162  L(unroll):			C size >= UNROLL_THRESHOLD: prepare a computed jump into the unrolled loop
   163  deflit(`FRAME',STACK_SPACE)
   164  	movl	%ebp, SAVE_EBP
   165  	andl	$-2, %ecx		C ecx = size with the low bit masked out
   166  	andl	$1, PARAM_SIZE		C argument slot keeps only the low bit of size, read back after the loop
   167  
   168  	movl	%ecx, %edi		C edi = even limb count
   169  	decl	%ecx
   170  	movl	PARAM_DST, %ebp		C ebp = dst, since edi is busy until the leal below
   171  
   172  	shrl	$UNROLL_LOG2, %ecx	C ecx = (even count - 1) >> UNROLL_LOG2, the loop counter tested with jns
   173  	negl	%edi
   174  	movl	%esi, SAVE_ESI
   175  
   176  	andl	$UNROLL_MASK, %edi	C edi = (-even count) & UNROLL_MASK, presumably the limbs to skip in the first pass (confirm UNROLL_MASK)
   177  
   178  ifdef(`PIC',`
   179  	call	L(pic_calc)		C esi = address to jump to, computed from the return address
   180  L(here):
   181  ',`
   182  	leal	L(entry) (%edi,%edi,8), %esi	C esi = L(entry) + 9*edi, 9 bytes of code per limb skipped
   183  ')
   184  	negl	%edi			C edi = minus the limbs skipped
   185  	shrl	%eax			C carry flag = carry-in; the leal and jmp that follow do not change flags
   186  	C Back each pointer up by the skipped limbs, and add a 128 byte bias when UNROLL_BYTES is 256.
   187  	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
   188  	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
   189  	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
   190  
   191  	jmp	*%esi
   192  
   193  
   194  ifdef(`PIC',`
   195  L(pic_calc):
   196  	C See mpn/x86/README about old gas bugs
   197  	leal	(%edi,%edi,8), %esi	C 9*edi
   198  	addl	$L(entry)-L(here), %esi	C plus the distance from L(here) to L(entry)
   199  	addl	(%esp), %esi		C plus the return address on the stack, which is L(here)
   200  	ret_internal
   201  ')
   202  
   203  
   204  C -----------------------------------------------------------------------------
   205  	ALIGN(32)
   206  L(top):
   207  	C eax	zero
   208  	C ebx	src1
   209  	C ecx	counter
   210  	C edx	src2
   211  	C esi	scratch (was computed jump)
   212  	C edi	dst
   213  	C ebp	scratch
   214  	C Only passes after the first come through L(top); the computed jump lands at or after L(entry), so edx stays one UNROLL_BYTES step behind ebx and edi.
   215  	leal	UNROLL_BYTES(%edx), %edx
   216  	C Each forloop step handles two limbs.  The displacements subtract 128 when UNROLL_BYTES is 256, matching the bias put on the pointers before the jump.
   217  L(entry):
   218  deflit(CHUNK_COUNT, 2)
   219  forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
   220  	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
   221  	deflit(`disp1', eval(disp0 + 4))
   222  
   223  Zdisp(	movl,	disp0,(%ebx), %esi)
   224  	movl	disp1(%ebx), %ebp
   225  Zdisp(	M4_inst,disp0,(%edx), %esi)
   226  Zdisp(	movl,	%esi, disp0,(%edi))
   227  	M4_inst	disp1(%edx), %ebp
   228  	movl	%ebp, disp1(%edi)
   229  ')
   230  	C decl and leal leave the carry flag alone, so it chains into the next pass.
   231  	decl	%ecx
   232  	leal	UNROLL_BYTES(%ebx), %ebx
   233  	leal	UNROLL_BYTES(%edi), %edi
   234  	jns	L(top)			C another pass while the counter is >= 0
   235  
   236  	C Tail: handle the odd limb if PARAM_SIZE (reduced to the low bit of size) is 1.
   237  	movl	PARAM_SIZE, %esi
   238  	movl	SAVE_EBP, %ebp
   239  	movl	$0, %eax		C movl rather than xorl, which would clear the carry flag
   240  
   241  	decl	%esi			C decl leaves the carry flag alone
   242  	js	L(even)
   243  	C Odd limb.  Remove the 128 byte pointer bias (present when UNROLL_BYTES is 256); edx is also still one UNROLL_BYTES step short.
   244  	movl	ifelse(UNROLL_BYTES,256,-128) (%ebx), %ecx
   245  	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
   246  	movl	%ecx, ifelse(UNROLL_BYTES,256,-128) (%edi)
   247  L(even):
   248  
   249  	movl	SAVE_EDI, %edi
   250  	movl	SAVE_EBX, %ebx
   251  	setc	%al			C return value = carry out of the top limb
   252  
   253  	movl	SAVE_ESI, %esi
   254  	addl	$STACK_SPACE, %esp
   255  
   256  	ret
   257  
   258  EPILOGUE()