github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mul_1.asm (about)

     1  dnl  AMD K6 mpn_mul_1 -- mpn by limb multiply.
     2  
     3  dnl  Copyright 1999, 2000, 2002, 2005 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C			    cycles/limb
    35  C P5
    36  C P6 model 0-8,10-12		 5.5
    37  C P6 model 9  (Banias)
    38  C P6 model 13 (Dothan)		 4.87
    39  C P4 model 0  (Willamette)
    40  C P4 model 1  (?)
    41  C P4 model 2  (Northwood)
    42  C P4 model 3  (Prescott)
    43  C P4 model 4  (Nocona)
    44  C AMD K6			 6.25
    45  C AMD K7
    46  C AMD K8
    47  
    48  
    49  C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    50  C                      mp_limb_t multiplier);
    51  C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
    52  C                       mp_limb_t multiplier, mp_limb_t carry);
    53  C
    54  C Multiply src,size by multiplier and store the result in dst,size.
    55  C Return the carry limb from the top of the result.
    56  C
    57  C mpn_mul_1c() accepts an initial carry for the calculation, which is added
    58  C into the low limb of the result.
    59  
    60  defframe(PARAM_CARRY,     20)	C initial carry limb, read only by mpn_mul_1c
    61  defframe(PARAM_MULTIPLIER,16)	C limb that each src limb is multiplied by
    62  defframe(PARAM_SIZE,      12)	C number of limbs in src and dst
    63  defframe(PARAM_SRC,       8)	C pointer to the source limbs
    64  defframe(PARAM_DST,       4)	C pointer to the destination limbs
    65  C NOTE(review): offsets look like stack argument slots above the return address - confirm in x86-defs.m4
    66  dnl  minimum 5: the unrolled code always does one 4-limb pass plus a final limb
    67  deflit(UNROLL_THRESHOLD, 5)	C sizes >= this take the unrolled path
    68  
    69  	TEXT
    70  	ALIGN(32)
    71  
    72  PROLOGUE(mpn_mul_1c)	C mpn_mul_1 variant that starts from a caller-supplied carry
    73  	pushl	%esi			C save esi, it holds the carry limb from here on
    74  deflit(`FRAME',4)
    75  	movl	PARAM_CARRY, %esi	C esi = initial carry argument
    76  	jmp	L(start_nc)		C join mpn_mul_1 just after its own carry setup
    77  EPILOGUE()
    78  
    79  
    80  PROLOGUE(mpn_mul_1)	C dst,size = src,size * multiplier, returns the carry limb
    81  	push	%esi		C save esi, it holds the carry limb throughout
    82  deflit(`FRAME',4)
    83  	xorl	%esi, %esi	C initial carry is zero
    84  C mpn_mul_1c jumps in at start_nc with its carry argument already in esi
    85  L(start_nc):
    86  	mov	PARAM_SIZE, %ecx	C ecx = size
    87  	push	%ebx
    88  FRAME_pushl()
    89  
    90  	movl	PARAM_SRC, %ebx		C ebx = src
    91  	push	%edi
    92  FRAME_pushl()
    93  
    94  	movl	PARAM_DST, %edi		C edi = dst
    95  	pushl	%ebp
    96  FRAME_pushl()
    97  
    98  	cmpl	$UNROLL_THRESHOLD, %ecx
    99  	movl	PARAM_MULTIPLIER, %ebp	C ebp = multiplier, movl leaves the cmpl flags intact
   100  
   101  	jae	L(unroll)		C size >= UNROLL_THRESHOLD takes the unrolled loop
   102  
   103  
   104  	C code offset 0x22 here, close enough to aligned
   105  L(simple):
   106  	C eax	scratch
   107  	C ebx	src
   108  	C ecx	counter, limbs remaining
   109  	C edx	scratch
   110  	C esi	carry
   111  	C edi	dst
   112  	C ebp	multiplier
   113  	C
   114  	C this loop 8 cycles/limb
   115  	C NOTE(review): size 0 is not handled, loop would wrap ecx - presumably callers pass size >= 1
   116  	movl	(%ebx), %eax		C eax = next src limb
   117  	addl	$4, %ebx
   118  
   119  	mull	%ebp			C edx:eax = limb * multiplier
   120  
   121  	addl	%esi, %eax		C add incoming carry to the low half, sets CF
   122  	movl	$0, %esi		C zero esi with movl so CF survives, xorl would clear it
   123  
   124  	adcl	%edx, %esi		C next carry = high half + CF
   125  
   126  	movl	%eax, (%edi)		C store the result limb
   127  	addl	$4, %edi
   128  
   129  	loop	L(simple)		C decrement ecx, repeat while it is non-zero
   130  
   131  	C restore saved registers in reverse push order
   132  	popl	%ebp
   133  
   134  	popl	%edi
   135  	popl	%ebx
   136  
   137  	movl	%esi, %eax		C return value = final carry limb
   138  	popl	%esi
   139  
   140  	ret
   141  
   142  
   143  C -----------------------------------------------------------------------------
   144  C The code for each limb is 6 cycles, with instruction decoding being the
   145  C limiting factor.  At 4 limbs/loop and 1 cycle/loop of overhead it's 6.25
   146  C cycles/limb in total.
   147  C
   148  C The secret ingredient to get 6.25 is to start the loop with the mul and
   149  C have the load/store pair at the end.  Rotating the load/store to the top
   150  C is a 0.5 c/l slowdown.  (Some address generation effect probably.)
   151  C
   152  C The whole unrolled loop fits nicely in exactly 80 bytes.
   153  
   154  
   155  	ALIGN(16)	C already aligned to 16 here actually
   156  L(unroll):
   157  	movl	(%ebx), %eax		C preload src[0], the loop starts with the mul
   158  	leal	-16(%ebx,%ecx,4), %ebx	C ebx = &src[size-4]
   159  
   160  	leal	-16(%edi,%ecx,4), %edi	C edi = &dst[size-4]
   161  	subl	$4, %ecx
   162  
   163  	negl	%ecx			C ecx = 4-size, negative since size >= 5
   164  
   165  
   166  	ALIGN(16)	C one byte nop for this alignment
   167  L(top):
   168  	C eax	scratch
   169  	C ebx	&src[size-4]
   170  	C ecx	counter, negative, counts up by 4 per pass
   171  	C edx	scratch
   172  	C esi	carry
   173  	C edi	&dst[size-4]
   174  	C ebp	multiplier
   175  	C first limb of the group, eax already holds the src limb
   176  	mull	%ebp			C edx:eax = limb * multiplier
   177  
   178  	addl	%esi, %eax		C add incoming carry, sets CF
   179  	movl	$0, %esi		C movl keeps CF for the adcl below
   180  
   181  	adcl	%edx, %esi		C next carry = high half + CF
   182  
   183  	movl	%eax, (%edi,%ecx,4)	C store the result limb
   184  	movl	4(%ebx,%ecx,4), %eax	C load the next src limb
   185  
   186  	C second limb, same pattern at offset 4
   187  	mull	%ebp
   188  
   189  	addl	%esi, %eax
   190  	movl	$0, %esi
   191  
   192  	adcl	%edx, %esi
   193  
   194  	movl	%eax, 4(%edi,%ecx,4)
   195  	movl	8(%ebx,%ecx,4), %eax
   196  
   197  	C third limb, same pattern at offset 8
   198  	mull	%ebp
   199  
   200  	addl	%esi, %eax
   201  	movl	$0, %esi
   202  
   203  	adcl	%edx, %esi
   204  
   205  	movl	%eax, 8(%edi,%ecx,4)
   206  	movl	12(%ebx,%ecx,4), %eax
   207  
   208  	C fourth limb, same pattern at offset 12
   209  	mull	%ebp
   210  
   211  	addl	%esi, %eax
   212  	movl	$0, %esi
   213  
   214  	adcl	%edx, %esi
   215  
   216  	movl	%eax, 12(%edi,%ecx,4)
   217  	movl	16(%ebx,%ecx,4), %eax	C preload the first limb of the next group or of the tail
   218  
   219  
   220  	addl	$4, %ecx
   221  	js	L(top)			C loop while the index is still negative
   222  
   223  	C the loop exits with 1 to 4 limbs still to do, eax holds the first of them
   224  
   225  	C eax	next src limb
   226  	C ebx	&src[size-4]
   227  	C ecx	0 to 3 representing respectively 4 to 1 further limbs
   228  	C edx
   229  	C esi	carry
   230  	C edi	&dst[size-4]
   231  	C ebp	multiplier
   232  	testb	$2, %cl
   233  	jnz	L(finish_not_two)	C ecx is 2 or 3, only 2 or 1 limbs remain, skip the pair
   234  	C ecx is 0 or 1 here, do two limbs and leave 2 or 1 for below
   235  	mull	%ebp
   236  
   237  	addl	%esi, %eax
   238  	movl	$0, %esi
   239  
   240  	adcl	%edx, %esi
   241  
   242  	movl	%eax, (%edi,%ecx,4)
   243  	movl	4(%ebx,%ecx,4), %eax
   244  
   245  
   246  	mull	%ebp
   247  
   248  	addl	%esi, %eax
   249  	movl	$0, %esi
   250  
   251  	adcl	%edx, %esi
   252  
   253  	movl	%eax, 4(%edi,%ecx,4)
   254  	movl	8(%ebx,%ecx,4), %eax
   255  
   256  	addl	$2, %ecx		C ecx is now 2 or 3
   257  L(finish_not_two):
   258  
   259  	C ecx is 2 or 3 on every path to this point
   260  	testb	$1, %cl
   261  	jnz	L(finish_not_one)	C ecx = 3, only the last limb remains
   262  	C ecx = 2 here, so the fixed offsets 8 and 12 replace the indexed forms
   263  	mull	%ebp
   264  
   265  	addl	%esi, %eax
   266  	movl	$0, %esi
   267  
   268  	adcl	%edx, %esi
   269  
   270  	movl	%eax, 8(%edi)		C dst[size-2]
   271  	movl	12(%ebx), %eax		C src[size-1]
   272  L(finish_not_one):
   273  
   274  	C last limb, the carry out is kept in edx instead of esi
   275  	mull	%ebp
   276  
   277  	addl	%esi, %eax		C add incoming carry, sets CF
   278  	popl	%ebp			C restores are interleaved here, popl does not alter CF
   279  
   280  	adcl	$0, %edx		C fold CF into the high half, giving the return carry
   281  
   282  	movl	%eax, 12(%edi)		C dst[size-1]
   283  	popl	%edi
   284  
   285  	popl	%ebx
   286  	movl	%edx, %eax		C return value = carry limb
   287  
   288  	popl	%esi
   289  
   290  	ret
   291  
   292  EPILOGUE()