dnl  AMD K6 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                          divisor
C                       odd     even
C       K6:            10.0    12.0    cycles/limb
C       K6-2:          10.0    11.5


C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                      mp_limb_t divisor);
C
C A simple divl is used for size==1.  This is about 10 cycles faster for an
C odd divisor or 20 cycles for an even divisor.
C
C The loops are quite sensitive to code alignment, speeds should be
C rechecked (odd and even divisor, pic and non-pic) if contemplating
C changing anything.
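
C The loops below use the usual exact-division-by-inverse method: the
C divisor has its trailing zero bits stripped, a multiplicative inverse of
C the odd part modulo 2^32 is built from an 8-bit table lookup plus two
C Newton steps, and each quotient limb is then one multiply by that
C inverse, with a borrow propagated from the high half of quotient*divisor.
C As a rough C sketch (not part of the original file; 32-bit limbs assumed,
C odd divisor d, and the names c, c2, q, s invented for illustration):
C
C       inv = binvert_limb_table[(d >> 1) & 127];  /* 8 correct low bits  */
C       inv = 2*inv - inv*inv*d;                   /* 16 correct low bits */
C       inv = 2*inv - inv*inv*d;                   /* 32 correct low bits */
C
C       c = 0;                          /* borrow limb */
C       for (i = 0; i < size; i++)
C         {
C           s = src[i] - c;             /* apply borrow limb       */
C           c2 = (s > src[i]);          /* borrow out of that      */
C           q = s * inv;                /* quotient limb, mod 2^32 */
C           dst[i] = q;
C           c = (mp_limb_t) (((unsigned long long) q * d) >> 32) + c2;
C         }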

defframe(PARAM_DIVISOR,16)
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC,     8)
defframe(PARAM_DST,     4)

dnl  re-use parameter space
define(VAR_INVERSE,`PARAM_DST')

        TEXT

        ALIGN(32)
PROLOGUE(mpn_divexact_1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx

        movl    PARAM_SRC, %eax
        xorl    %edx, %edx

        cmpl    $1, %ecx
        jnz     L(two_or_more)

        movl    (%eax), %eax

        divl    PARAM_DIVISOR

        movl    PARAM_DST, %ecx
        movl    %eax, (%ecx)

        ret


L(two_or_more):
        movl    PARAM_DIVISOR, %eax
        pushl   %ebx            FRAME_pushl()

        movl    PARAM_SRC, %ebx
        pushl   %ebp            FRAME_pushl()

L(strip_twos):
        shrl    %eax
        incl    %edx                    C will get shift+1

        jnc     L(strip_twos)
        pushl   %esi            FRAME_pushl()

        leal    1(%eax,%eax), %esi      C d without twos
        andl    $127, %eax              C d/2, 7 bits

ifdef(`PIC',`
        LEA(    binvert_limb_table, %ebp)
Zdisp(  movzbl, 0,(%eax,%ebp), %eax)
',`
        movzbl  binvert_limb_table(%eax), %eax  C inv 8 bits
')
        pushl   %edi            FRAME_pushl()

        leal    (%eax,%eax), %ebp       C 2*inv

        imull   %eax, %eax              C inv*inv

        movl    PARAM_DST, %edi

        imull   %esi, %eax              C inv*inv*d

        subl    %eax, %ebp              C inv = 2*inv - inv*inv*d
        leal    (%ebp,%ebp), %eax       C 2*inv

        imull   %ebp, %ebp              C inv*inv

        movl    %esi, PARAM_DIVISOR     C d without twos
        leal    (%ebx,%ecx,4), %ebx     C src end

        imull   %esi, %ebp              C inv*inv*d

        leal    (%edi,%ecx,4), %edi     C dst end
        negl    %ecx                    C -size

        subl    %ebp, %eax              C inv = 2*inv - inv*inv*d
        subl    $1, %edx                C shift amount, and clear carry

        ASSERT(e,`      C expect d*inv == 1 mod 2^GMP_LIMB_BITS
        pushl   %eax    FRAME_pushl()
        imull   PARAM_DIVISOR, %eax
        cmpl    $1, %eax
        popl    %eax    FRAME_popl()')

        movl    %eax, VAR_INVERSE
        jnz     L(even)

        movl    (%ebx,%ecx,4), %esi     C src low limb
        jmp     L(odd_entry)


        ALIGN(16)
        nop                     C code alignment
L(odd_top):
        C eax   scratch
        C ebx   src end
        C ecx   counter, limbs, negative
        C edx   inverse
        C esi   next limb, adjusted for carry
        C edi   dst end
        C ebp   carry bit, 0 or -1

        imull   %edx, %esi

        movl    PARAM_DIVISOR, %eax
        movl    %esi, -4(%edi,%ecx,4)

        mull    %esi                    C carry limb in edx

        subl    %ebp, %edx              C apply carry bit
        movl    (%ebx,%ecx,4), %esi

L(odd_entry):
        subl    %edx, %esi              C apply carry limb
        movl    VAR_INVERSE, %edx

        sbbl    %ebp, %ebp              C 0 or -1

        incl    %ecx
        jnz     L(odd_top)


        imull   %edx, %esi

        movl    %esi, -4(%edi,%ecx,4)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        ret


L(even):
        C eax
        C ebx   src end
        C ecx   -size
        C edx   twos
        C esi
        C edi   dst end
        C ebp

        xorl    %ebp, %ebp
Zdisp(  movq,   0,(%ebx,%ecx,4), %mm0)  C src[0,1]

        movd    %edx, %mm7
        movl    VAR_INVERSE, %edx

        addl    $2, %ecx
        psrlq   %mm7, %mm0

        movd    %mm0, %esi
        jz      L(even_two)             C if only two limbs


C Out-of-order execution is good enough to hide the load/rshift/movd
C latency.  Having imul at the top of the loop gives 11.5 c/l instead of 12,
C on K6-2.  In fact there's only 11 of decode, but nothing running at 11 has
C been found.  Maybe the fact every second movq is unaligned costs the extra
C 0.5.
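
C For an even divisor, the loop below is the same multiply-by-inverse
C recurrence as the odd loop above, except that each source limb is taken
C from a 64-bit movq/psrlq of two adjacent limbs, shifted down by the
C number of trailing zero bits stripped from the divisor (held in mm7).
C In C terms (a sketch only; twos is the stripped bit count, 1 <= twos < 32):
C
C       s = (src[i] >> twos) | (src[i+1] << (32 - twos));
C
C after which q = s * inverse and the borrow handling are unchanged.  The
C final limb, handled at L(even_two), has no limb above it and is simply
C shifted down.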

L(even_top):
        C eax   scratch
        C ebx   src end
        C ecx   counter, limbs, negative
        C edx   inverse
        C esi   next limb, adjusted for carry
        C edi   dst end
        C ebp   carry bit, 0 or -1
        C
        C mm0   scratch, source limbs
        C mm7   twos

        imull   %edx, %esi

        movl    %esi, -8(%edi,%ecx,4)
        movl    PARAM_DIVISOR, %eax

        mull    %esi                    C carry limb in edx

        movq    -4(%ebx,%ecx,4), %mm0
        psrlq   %mm7, %mm0

        movd    %mm0, %esi
        subl    %ebp, %edx              C apply carry bit

        subl    %edx, %esi              C apply carry limb
        movl    VAR_INVERSE, %edx

        sbbl    %ebp, %ebp              C 0 or -1

        incl    %ecx
        jnz     L(even_top)


L(even_two):
        movd    -4(%ebx), %mm0          C src high limb
        psrlq   %mm7, %mm0

        imull   %edx, %esi

        movl    %esi, -8(%edi)
        movl    PARAM_DIVISOR, %eax

        mull    %esi                    C carry limb in edx

        movd    %mm0, %esi
        subl    %ebp, %edx              C apply carry bit

        movl    VAR_INVERSE, %eax
        subl    %edx, %esi              C apply carry limb

        imull   %eax, %esi

        movl    %esi, -4(%edi)

        popl    %edi
        popl    %esi

        popl    %ebp
        popl    %ebx

        emms_or_femms

        ret

EPILOGUE()
ASM_END()
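
C A usage sketch (illustrative only, not part of the original file): with
C the prototype given at the top of this file and 32-bit limbs, dividing a
C two-limb operand known to be an exact multiple of 12 would look like
C
C       mp_limb_t src[2] = { 24, 36 };          /* value 36*2^32 + 24 */
C       mp_limb_t dst[2];
C       mpn_divexact_1 (dst, src, 2, 12);       /* 12 = 4*3, an even divisor */
C       /* dst[0] == 2, dst[1] == 3, i.e. (36*2^32 + 24) / 12 */
C
C The divisor must divide the operand exactly; nothing in the code checks
C this.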