github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/dive_1.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/dive_1.asm (about)

     1  dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P4: 19.0 cycles/limb
    35  
    36  
    37  C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
    38  C                      mp_limb_t divisor);
    39  C
    40  C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
    41  C being on the dependent chain and there being plenty of cycles available,
    42  C using an unaligned movq on every second iteration measured about 23 c/l.
    43  C
    44  C Using divl for size==1 seems a touch quicker than mul-by-inverse.  The mul
    45  C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
    46  C that might be hidden by out-of-order execution, whereas divl is around 60.
    47  C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
    48  C faster.
    49  
    50  defframe(PARAM_DIVISOR,16)
    51  defframe(PARAM_SIZE,   12)
    52  defframe(PARAM_SRC,    8)
    53  defframe(PARAM_DST,    4)
    54  
    55  	TEXT
    56  
    57  	ALIGN(16)
    58  PROLOGUE(mpn_divexact_1)
    59  deflit(`FRAME',0)
    60  
    61  	movl	PARAM_SIZE, %edx
    62  
    63  	movl	PARAM_SRC, %eax
    64  
    65  	movl	PARAM_DIVISOR, %ecx
    66  	subl	$1, %edx
    67  	jnz	L(two_or_more)
    68  
    69  	movl	(%eax), %eax
    70  	xorl	%edx, %edx
    71  
    72  	divl	%ecx
    73  	movl	PARAM_DST, %ecx
    74  
    75  	movl	%eax, (%ecx)
    76  	ret
    77  
    78  
    79  L(two_or_more):
    80  	C eax	src
    81  	C ebx
    82  	C ecx	divisor
    83  	C edx	size-1
    84  
    85  	movl	%ecx, %eax
    86  	bsfl	%ecx, %ecx		C trailing twos
    87  
    88  	shrl	%cl, %eax		C d = divisor without twos
    89  	movd	%eax, %mm6
    90  	movd	%ecx, %mm7		C shift
    91  
    92  	shrl	%eax			C d/2
    93  
    94  	andl	$127, %eax		C d/2, 7 bits
    95  
    96  ifdef(`PIC',`
    97  	LEA(	binvert_limb_table, %ecx)
    98  	movzbl	(%eax,%ecx), %eax		C inv 8 bits
    99  ',`
   100  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
   101  ')
   102  
   103  	C
   104  
   105  	movd	%eax, %mm5		C inv
   106  
   107  	movd	%eax, %mm0		C inv
   108  
   109  	pmuludq	%mm5, %mm5		C inv*inv
   110  
   111  	C
   112  
   113  	pmuludq	%mm6, %mm5		C inv*inv*d
   114  	paddd	%mm0, %mm0		C 2*inv
   115  
   116  	C
   117  
   118  	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
   119  	pxor	%mm5, %mm5
   120  
   121  	paddd	%mm0, %mm5
   122  	pmuludq	%mm0, %mm0		C inv*inv
   123  
   124  	pcmpeqd	%mm4, %mm4
   125  	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
   126  
   127  	C
   128  
   129  	pmuludq	%mm6, %mm0		C inv*inv*d
   130  	paddd	%mm5, %mm5		C 2*inv
   131  
   132  	movl	PARAM_SRC, %eax
   133  	movl	PARAM_DST, %ecx
   134  	pxor	%mm1, %mm1		C initial carry limb
   135  
   136  	C
   137  
   138  	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
   139  
   140  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   141  	pushl	%eax	FRAME_pushl()
   142  	movq	%mm6, %mm0
   143  	pmuludq	%mm5, %mm0
   144  	movd	%mm0, %eax
   145  	cmpl	$1, %eax
   146  	popl	%eax	FRAME_popl()')
   147  
   148  	pxor	%mm0, %mm0		C initial carry bit
   149  
   150  
   151  C The dependent chain here is as follows.
   152  C
   153  C					latency
   154  C	psubq	 s = (src-cbit) - climb	   2
   155  C	pmuludq	 q = s*inverse		   8
   156  C	pmuludq	 prod = q*divisor	   8
   157  C	psrlq	 climb = high(prod)	   2
   158  C					  --
   159  C					  20
   160  C
   161  C Yet the loop measures 19.0 c/l, so obviously there's something gained
   162  C there over a straight reading of the chip documentation.
   163  
   164  L(top):
   165  	C eax	src, incrementing
   166  	C ebx
   167  	C ecx	dst, incrementing
   168  	C edx	counter, size-1 iterations
   169  	C
   170  	C mm0	carry bit
   171  	C mm1	carry limb
   172  	C mm4	0x00000000FFFFFFFF
   173  	C mm5	inverse
   174  	C mm6	divisor
   175  	C mm7	shift
   176  
   177  	movd	(%eax), %mm2
   178  	movd	4(%eax), %mm3
   179  	addl	$4, %eax
   180  	punpckldq %mm3, %mm2
   181  
   182  	psrlq	%mm7, %mm2
   183  	pand	%mm4, %mm2		C src
   184  	psubq	%mm0, %mm2		C src - cbit
   185  
   186  	psubq	%mm1, %mm2		C src - cbit - climb
   187  	movq	%mm2, %mm0
   188  	psrlq	$63, %mm0		C new cbit
   189  
   190  	pmuludq	%mm5, %mm2		C s*inverse
   191  	movd	%mm2, (%ecx)		C q
   192  	addl	$4, %ecx
   193  
   194  	movq	%mm6, %mm1
   195  	pmuludq	%mm2, %mm1		C q*divisor
   196  	psrlq	$32, %mm1		C new climb
   197  
   198  	subl	$1, %edx
   199  	jnz	L(top)
   200  
   201  
   202  L(done):
   203  	movd	(%eax), %mm2
   204  	psrlq	%mm7, %mm2		C src
   205  	psubq	%mm0, %mm2		C src - cbit
   206  
   207  	psubq	%mm1, %mm2		C src - cbit - climb
   208  
   209  	pmuludq	%mm5, %mm2		C s*inverse
   210  	movd	%mm2, (%ecx)		C q
   211  
   212  	emms
   213  	ret
   214  
   215  EPILOGUE()
   216  ASM_END()