github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/aors_n.asm (about)

     1  dnl  AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
     2  
     3  dnl  Copyright 1999-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
    35  
    36  
    37  ifdef(`OPERATION_add_n', `
    38  	define(M4_inst,        adcl)
    39  	define(M4_function_n,  mpn_add_n)
    40  	define(M4_function_nc, mpn_add_nc)
    41  	define(M4_description, add)
    42  ',`ifdef(`OPERATION_sub_n', `
    43  	define(M4_inst,        sbbl)
    44  	define(M4_function_n,  mpn_sub_n)
    45  	define(M4_function_nc, mpn_sub_nc)
    46  	define(M4_description, subtract)
    47  ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
    48  ')')')
    49  C Above: adcl or sbbl and the matching names are chosen, and it is an error if neither OPERATION symbol is set.  Below: presumably the entry points this file can supply - confirm against the build scripts.
    50  MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
    51  
    52  
    53  C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
    54  C                          mp_size_t size);
    55  C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
    56  C	                      mp_size_t size, mp_limb_t carry);
    57  C
    58  C Calculate src1,size M4_description src2,size, and store the result in
    59  C dst,size.  The return value is the carry bit from the top of the result
    60  C (1 or 0).
    61  C
    62  C The _nc version accepts 1 or 0 for an initial carry into the low limb of
    63  C the calculation.  Note values other than 1 or 0 here will lead to garbage
    64  C results.
    65  C
    66  C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
    67  C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
    68  C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
    69  
    70  define(PARAM_CARRY, `FRAME+20(%esp)')
    71  define(PARAM_SIZE,  `FRAME+16(%esp)')
    72  define(PARAM_SRC2,  `FRAME+12(%esp)')
    73  define(PARAM_SRC1,  `FRAME+8(%esp)')
    74  define(PARAM_DST,   `FRAME+4(%esp)')
    75  deflit(`FRAME',0)
    76  C Stack arguments as offsets from %esp.  FRAME starts at 0 and is presumably bumped by each FRAME_pushl so the offsets stay right after a push - confirm in the macro definitions.
    77  dnl  minimum 5: the in-place loop counts size-1 rounded down to a multiple of 4, which must be nonzero
    78  deflit(UNROLL_THRESHOLD, 5)
    79  
    80  	TEXT
    81  	ALIGN(32)
    82  
    83  PROLOGUE(M4_function_nc)
    84  	movl	PARAM_CARRY, %eax		C eax = caller supplied initial carry, expected to be 0 or 1
    85  	jmp	L(start)		C join the shared body just past the xorl of the plain entry point
    86  EPILOGUE()
    87  
    88  
    89  PROLOGUE(M4_function_n)
    90  	xorl	%eax, %eax		C plain entry: initial carry is 0
    91  L(start):
    92  	movl	PARAM_SIZE, %ecx
    93  	pushl	%ebx
    94  FRAME_pushl()
    95  	C ebx and edi are pushed here and popped again on every exit path
    96  	movl	PARAM_SRC1, %ebx
    97  	pushl	%edi
    98  FRAME_pushl()
    99  
   100  	movl	PARAM_SRC2, %edx
   101  	cmpl	$UNROLL_THRESHOLD, %ecx
   102  	C the movl below leaves the cmpl flags intact for jae
   103  	movl	PARAM_DST, %edi
   104  	jae	L(unroll)
   105  
   106  	C size below UNROLL_THRESHOLD: plain one limb per iteration loop
   107  	shrl	%eax		C CF = bit 0 of eax, the initial carry
   108  
   109  	C offset 0x21 here, close enough to aligned
   110  L(simple):
   111  	C eax	scratch
   112  	C ebx	src1
   113  	C ecx	counter, limbs remaining
   114  	C edx	src2
   115  	C esi
   116  	C edi	dst
   117  	C ebp
   118  	C
   119  	C The store to (%edi) could be done with a stosl; it'd be smaller
   120  	C code, but there's no speed gain and a cld would have to be added
   121  	C (per mpn/x86/README).
   122  	C leal and loop do not write the flags, so CF chains from limb to limb
   123  	movl	(%ebx), %eax
   124  	leal	4(%ebx), %ebx
   125  
   126  	M4_inst	(%edx), %eax
   127  
   128  	movl	%eax, (%edi)
   129  	leal	4(%edi), %edi
   130  
   131  	leal	4(%edx), %edx
   132  	loop	L(simple)
   133  
   134  	C movl $0 rather than xorl, so that CF is still there for setc
   135  	movl	$0, %eax
   136  	popl	%edi
   137  
   138  	setc	%al		C return value: carry or borrow out, 0 or 1
   139  
   140  	popl	%ebx
   141  	ret
   142  
   143  
   144  C -----------------------------------------------------------------------------
   145  L(unroll):
   146  	C eax	carry
   147  	C ebx	src1
   148  	C ecx	counter
   149  	C edx	src2
   150  	C esi
   151  	C edi	dst
   152  	C ebp
   153  	C size is at least UNROLL_THRESHOLD here; pick a loop by operand overlap
   154  	cmpl	%edi, %ebx
   155  	pushl	%esi		C esi will hold the leftover limb count; pushl keeps the flags
   156  
   157  	je	L(inplace)		C dst==src1
   158  
   159  ifdef(`OPERATION_add_n',`
   160  	cmpl	%edi, %edx
   161  
   162  	je	L(inplace_reverse)
   163  ')
   164  	C esi = size mod 4, ecx = size rounded down to a multiple of 4
   165  	movl	%ecx, %esi
   166  
   167  	andl	$-4, %ecx
   168  	andl	$3, %esi
   169  	C move each pointer past the multiple of 4 part, to be indexed by negative ecx
   170  	leal	(%ebx,%ecx,4), %ebx
   171  	leal	(%edx,%ecx,4), %edx
   172  	leal	(%edi,%ecx,4), %edi
   173  	C shrl comes after negl since negl writes CF too
   174  	negl	%ecx
   175  	shrl	%eax		C CF = initial carry
   176  
   177  	ALIGN(32)
   178  L(normal_top):
   179  	C eax	scratch
   180  	C ebx	src1
   181  	C ecx	counter, limbs, negative
   182  	C edx	src2
   183  	C esi	leftover limbs, 0 to 3
   184  	C edi	dst
   185  	C ebp
   186  	C leal adds 5 and loop takes 1 off, a net 4 limbs per pass; the -20 displacements cancel the early +5
   187  	movl	(%ebx,%ecx,4), %eax
   188  	leal	5(%ecx), %ecx
   189  	M4_inst	-20(%edx,%ecx,4), %eax
   190  	movl	%eax, -20(%edi,%ecx,4)
   191  
   192  	movl	4-20(%ebx,%ecx,4), %eax
   193  	M4_inst	4-20(%edx,%ecx,4), %eax
   194  	movl	%eax, 4-20(%edi,%ecx,4)
   195  
   196  	movl	8-20(%ebx,%ecx,4), %eax
   197  	M4_inst	8-20(%edx,%ecx,4), %eax
   198  	movl	%eax, 8-20(%edi,%ecx,4)
   199  
   200  	movl	12-20(%ebx,%ecx,4), %eax
   201  	M4_inst	12-20(%edx,%ecx,4), %eax
   202  	movl	%eax, 12-20(%edi,%ecx,4)
   203  
   204  	loop	L(normal_top)
   205  	C ecx is 0 here; decl sets ZF and SF but leaves CF alone
   206  
   207  	decl	%esi
   208  	jz	L(normal_finish_one)
   209  	js	L(normal_done)
   210  
   211  	C two or three more limbs
   212  
   213  	movl	(%ebx), %eax
   214  	M4_inst	(%edx), %eax
   215  	movl	%eax, (%edi)
   216  
   217  	movl	4(%ebx), %eax
   218  	M4_inst	4(%edx), %eax
   219  	decl	%esi
   220  	movl	%eax, 4(%edi)
   221  
   222  	jz	L(normal_done)
   223  	movl	$2, %ecx		C a third leftover limb is at index 2
   224  	C ecx is 0 when jumped to after the loop, 2 when falling in from above
   225  L(normal_finish_one):
   226  	movl	(%ebx,%ecx,4), %eax
   227  	M4_inst	(%edx,%ecx,4), %eax
   228  	movl	%eax, (%edi,%ecx,4)
   229  
   230  L(normal_done):
   231  	popl	%esi
   232  	popl	%edi
   233  	C movl $0 rather than xorl, so that CF is still there for setc
   234  	movl	$0, %eax
   235  	popl	%ebx
   236  
   237  	setc	%al
   238  
   239  	ret
   240  
   241  
   242  C -----------------------------------------------------------------------------
   243  
   244  ifdef(`OPERATION_add_n',`
   245  L(inplace_reverse):
   246  	C dst==src2, so src1 becomes the source operand
   247  
   248  	movl	%ebx, %edx
   249  ')
   250  
   251  L(inplace):
   252  	C eax	initial carry
   253  	C ebx
   254  	C ecx	size
   255  	C edx	src
   256  	C esi
   257  	C edi	dst
   258  	C ebp
   259  	C esi = (size-1) mod 4, ecx = size-1 rounded down to a multiple of 4
   260  	leal	-1(%ecx), %esi
   261  	decl	%ecx
   262  
   263  	andl	$-4, %ecx
   264  	andl	$3, %esi
   265  
   266  	movl	(%edx), %ebx		C src low limb
   267  	leal	(%edx,%ecx,4), %edx
   268  	C ebx always holds the next src limb, loaded one step ahead of its use
   269  	leal	(%edi,%ecx,4), %edi
   270  	negl	%ecx
   271  
   272  	shrl	%eax		C CF = initial carry
   273  
   274  
   275  	ALIGN(32)
   276  L(inplace_top):
   277  	C eax	scratch
   278  	C ebx	next src limb
   279  	C ecx	counter, limbs, negative
   280  	C edx	src
   281  	C esi	leftover limbs less one, 0 to 3
   282  	C edi	dst
   283  	C ebp
   284  	C each pass applies 4 src limbs to dst and preloads a fifth into ebx
   285  	M4_inst	%ebx, (%edi,%ecx,4)
   286  
   287  	movl	4(%edx,%ecx,4), %eax
   288  	leal	5(%ecx), %ecx
   289  
   290  	M4_inst	%eax, 4-20(%edi,%ecx,4)
   291  
   292  	movl	8-20(%edx,%ecx,4), %eax
   293  	movl	12-20(%edx,%ecx,4), %ebx
   294  
   295  	M4_inst	%eax, 8-20(%edi,%ecx,4)
   296  	M4_inst	%ebx, 12-20(%edi,%ecx,4)
   297  
   298  	movl	16-20(%edx,%ecx,4), %ebx
   299  	loop	L(inplace_top)
   300  
   301  
   302  	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
   303  
   304  	M4_inst	%ebx, (%edi)
   305  
   306  	decl	%esi
   307  	jz	L(inplace_finish_one)
   308  	js	L(inplace_done)
   309  
   310  	C two or three more limbs
   311  
   312  	movl	4(%edx), %eax
   313  	movl	8(%edx), %ebx
   314  	M4_inst	%eax, 4(%edi)
   315  	M4_inst	%ebx, 8(%edi)
   316  
   317  	decl	%esi
   318  	movl	$2, %ecx		C index for a fourth limb; movl keeps the decl flags
   319  
   320  	jz	L(normal_done)		C same exit sequence as at the inplace_done label
   321  
   322  L(inplace_finish_one):
   323  	movl	4(%edx,%ecx,4), %eax
   324  	M4_inst	%eax, 4(%edi,%ecx,4)
   325  
   326  L(inplace_done):
   327  	popl	%esi
   328  	popl	%edi
   329  	C movl $0 rather than xorl, so that CF is still there for setc
   330  	movl	$0, %eax
   331  	popl	%ebx
   332  
   333  	setc	%al
   334  
   335  	ret
   336  
   337  EPILOGUE()