github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/mul_basecase.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/mul_basecase.asm (about)

     1  dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
     2  dnl  in a third limb vector.
     3  
     4  dnl  Copyright 1996-2002 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C     cycles/crossproduct
    36  C P5	  15
    37  C P6	   7.5
    38  C K6	  12.5
    39  C K7	   5.5
    40  C P4	  24
    41  
    42  
    43  C void mpn_mul_basecase (mp_ptr wp,
    44  C                        mp_srcptr xp, mp_size_t xsize,
    45  C                        mp_srcptr yp, mp_size_t ysize);
    46  C
    47  C This was written in haste, since the Pentium-optimized code that was used
    48  C for all x86 machines was slow on the Pentium II.  This code would benefit
    49  C from some cleanup.
    50  C
    51  C To shave off some percentage of the run time, one should make 4 variants
    52  C of the L(outer) loop, one for each possible value of xsize mod 4.  That
    53  C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
    54  C part of the function, but since it is not very large, that would be
    55  C acceptable.
    56  C
    57  C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
    58  C unknown.
    59  
    60  defframe(PARAM_YSIZE,20)	C 5th argument: ysize
    61  defframe(PARAM_YP,   16)	C 4th argument: yp
    62  defframe(PARAM_XSIZE,12)	C 3rd argument: xsize
    63  defframe(PARAM_XP,   8)	C 2nd argument: xp
    64  defframe(PARAM_WP,   4)	C 1st argument: wp
    65  
    66  defframe(VAR_MULTIPLIER, -4)	C local: the y limb used by the current outer pass
    67  defframe(VAR_COUNTER,    -8)	C local: outer passes still to run
    68  deflit(VAR_STACK_SPACE,  8)	C bytes taken from the stack for the two locals
    69  
    70  	TEXT
    71  	ALIGN(8)
    72  
    73  PROLOGUE(mpn_mul_basecase)
        C Schoolbook multiply on 32-bit limbs: the xsize+ysize limbs at wp
        C receive {xp,xsize} * {yp,ysize}.
        C
        C   1. L(oopM) stores the first row, x * yp[0], into wp[0..xsize].
        C   2. L(outer) runs once per remaining y limb.  It adds x * yp[i] into
        C      wp[i..i+xsize-1] and stores the final carry at wp[i+xsize]:
        C      L(oop0) handles xsize mod 4 limbs singly, L(oopX) the rest four
        C      at a time.
        C
        C Register use:
        C   esi      walks the x limbs
        C   edi      walks the w limbs
        C   ebp      yp during the first row; a second carry limb inside L(oopX)
        C   ebx      carry limb handed from one limb position to the next
        C   ecx      inner loop counter
        C   edx:eax  product of each mull
        C
        C The argument slot PARAM_YP is overwritten with the advancing y pointer.
        C NOTE(review): the xsize = 1 exit ignores ysize, so callers presumably
        C guarantee xsize >= ysize >= 1 -- confirm.
    74  deflit(`FRAME',0)
    75  
    76  	subl	$VAR_STACK_SPACE,%esp	C room for VAR_MULTIPLIER and VAR_COUNTER
    77  	pushl	%esi
    78  	pushl	%ebp
    79  	pushl	%edi
        C NOTE(review): FRAME presumably records how far esp has moved since entry
        C (the local slots plus three pushes) so that the PARAM_ and VAR_ names
        C still resolve -- confirm against the defframe definition.
    80  deflit(`FRAME',eval(VAR_STACK_SPACE+12))
    81  
    82  	movl	PARAM_XP,%esi
    83  	movl	PARAM_WP,%edi
    84  	movl	PARAM_YP,%ebp
    85  
    86  	movl	(%esi),%eax		C load xp[0]
    87  	mull	(%ebp)			C multiply by yp[0]
    88  	movl	%eax,(%edi)		C store to wp[0]
    89  	movl	PARAM_XSIZE,%ecx	C xsize
    90  	decl	%ecx			C ecx = xsize-1; if xsize = 1, ysize = 1 too
    91  	jz	L(done)			C 1x1 product: only wp[1] is left to store
    92  
    93  	pushl	%ebx			C saved only from here on; L(done) never pops it
    94  FRAME_pushl()
    95  	movl	%edx,%ebx		C high half of xp[0]*yp[0] is the first carry limb
    96  
    97  	leal	4(%esi),%esi		C step past xp[0]
    98  	leal	4(%edi),%edi		C step past wp[0]
    99  
        	C First row: wp[j] = low half of xp[j]*yp[0] plus carry, j = 1 .. xsize-1
   100  L(oopM):
   101  	movl	(%esi),%eax		C load next limb at xp[j]
   102  	leal	4(%esi),%esi
   103  	mull	(%ebp)
   104  	addl	%ebx,%eax		C low half + carry limb
   105  	movl	%edx,%ebx		C mov leaves CF from the addl intact
   106  	adcl	$0,%ebx			C new carry limb = high half + CF
   107  	movl	%eax,(%edi)
   108  	leal	4(%edi),%edi
   109  	decl	%ecx
   110  	jnz	L(oopM)
   111  
   112  	movl	%ebx,(%edi)		C most significant limb of product
   113  	addl	$4,%edi			C increment wp
   114  	movl	PARAM_XSIZE,%eax
   115  	shll	$2,%eax			C eax = xsize*4 bytes
   116  	subl	%eax,%edi		C edi = wp+4: the next row lands one limb higher
   117  	subl	%eax,%esi		C esi = xp again
   118  
   119  	movl	PARAM_YSIZE,%eax	C ysize
   120  	decl	%eax
   121  	jz	L(skip)			C ysize = 1: the first row was the whole product
   122  	movl	%eax,VAR_COUNTER	C ysize-1 rows still to add
   123  
   124  L(outer):
   125  	movl	PARAM_YP,%ebp		C yp
   126  	addl	$4,%ebp			C make ebp point to next y limb
   127  	movl	%ebp,PARAM_YP		C keep it in the argument slot; L(oopX) reuses ebp
   128  	movl	(%ebp),%eax		C copy y limb ...
   129  	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
   130  	movl	PARAM_XSIZE,%ecx
   131  
   132  	xorl	%ebx,%ebx		C carry limb = 0 at the start of a row
   133  	andl	$3,%ecx			C ecx = xsize mod 4
   134  	jz	L(end0)
   135  
        	C One limb per pass: (edi) += xp limb * multiplier + carry limb
   136  L(oop0):
   137  	movl	(%esi),%eax
   138  	mull	VAR_MULTIPLIER
   139  	leal	4(%esi),%esi
   140  	addl	%ebx,%eax		C low half + carry limb
   141  	movl	$0,%ebx			C mov, not xor: CF from the addl must survive
   142  	adcl	%ebx,%edx		C high half += CF
   143  	addl	%eax,(%edi)		C accumulate into wp
   144  	adcl	%edx,%ebx		C propagate carry into cylimb
   145  
   146  	leal	4(%edi),%edi
   147  	decl	%ecx
   148  	jnz	L(oop0)
   149  
   150  L(end0):
   151  	movl	PARAM_XSIZE,%ecx
   152  	shrl	$2,%ecx			C ecx = xsize/4 groups of four limbs
   153  	jz	L(endX)
   154  
        	C Four limbs per pass.  ebx and ebp take turns as the carry limb, and
        	C each mull is issued before the previous sum is added into wp.
   155  	ALIGN(8)
   156  L(oopX):
   157  	movl	(%esi),%eax
   158  	mull	VAR_MULTIPLIER
   159  	addl	%eax,%ebx		C ebx = incoming carry + low half, limb 0
   160  	movl	$0,%ebp
   161  	adcl	%edx,%ebp		C ebp = high half of limb 0 + CF
   162  
   163  	movl	4(%esi),%eax
   164  	mull	VAR_MULTIPLIER
   165  	addl	%ebx,(%edi)		C add the limb 0 sum into wp
   166  	adcl	%eax,%ebp	C new lo + cylimb
   167  	movl	$0,%ebx
   168  	adcl	%edx,%ebx		C ebx = high half of limb 1 + CF
   169  
   170  	movl	8(%esi),%eax
   171  	mull	VAR_MULTIPLIER
   172  	addl	%ebp,4(%edi)		C add the limb 1 sum into wp
   173  	adcl	%eax,%ebx	C new lo + cylimb
   174  	movl	$0,%ebp
   175  	adcl	%edx,%ebp		C ebp = high half of limb 2 + CF
   176  
   177  	movl	12(%esi),%eax
   178  	mull	VAR_MULTIPLIER
   179  	addl	%ebx,8(%edi)		C add the limb 2 sum into wp
   180  	adcl	%eax,%ebp	C new lo + cylimb
   181  	movl	$0,%ebx
   182  	adcl	%edx,%ebx		C ebx = high half of limb 3 + CF
   183  
   184  	addl	%ebp,12(%edi)		C add the limb 3 sum into wp
   185  	adcl	$0,%ebx		C propagate carry into cylimb
   186  
   187  	leal	16(%esi),%esi
   188  	leal	16(%edi),%edi
   189  	decl	%ecx
   190  	jnz	L(oopX)
   191  
   192  L(endX):
   193  	movl	%ebx,(%edi)		C final carry becomes the new top limb of this row
   194  	addl	$4,%edi
   195  
   196  	C we incremented wp and xp in the loop above; compensate
   197  	movl	PARAM_XSIZE,%eax
   198  	shll	$2,%eax
   199  	subl	%eax,%edi		C net effect: edi is one limb above this row's start
   200  	subl	%eax,%esi		C esi = xp again
   201  
   202  	movl	VAR_COUNTER,%eax
   203  	decl	%eax
   204  	movl	%eax,VAR_COUNTER	C mov keeps ZF from the decl for the jnz
   205  	jnz	L(outer)
   206  
   207  L(skip):
   208  	popl	%ebx			C restore in reverse push order
   209  	popl	%edi
   210  	popl	%ebp
   211  	popl	%esi
   212  	addl	$VAR_STACK_SPACE,%esp	C release the local slots taken in the prologue
   213  	ret
   214  
   215  L(done):
   216  	movl	%edx,4(%edi)	   C store to wp[1]
   217  	popl	%edi			C ebx was never pushed on this path
   218  	popl	%ebp
   219  	popl	%esi
   220  	addl	$VAR_STACK_SPACE,%esp	C release the local slots taken in the prologue
   221  	ret
   222  
   223  EPILOGUE()