github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/bdiv_q_1.asm (about)

     1  dnl  Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
     2  
     3  dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
     4  
     5  dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C P4: 19.0 cycles/limb
    37  
    38  C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
    39  C being on the dependent chain and there being plenty of cycles available,
    40  C using an unaligned movq on every second iteration measured about 23 c/l.
    41  C
    42  
    43  defframe(PARAM_SHIFT,  24)
    44  defframe(PARAM_INVERSE,20)
    45  defframe(PARAM_DIVISOR,16)
    46  defframe(PARAM_SIZE,   12)
    47  defframe(PARAM_SRC,    8)
    48  defframe(PARAM_DST,    4)
    49  
    50  	TEXT
    51  
    52  C mp_limb_t
    53  C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
    54  C		    mp_limb_t inverse, int shift)
    55  	ALIGN(32)
    56  PROLOGUE(mpn_pi1_bdiv_q_1)
    57  deflit(`FRAME',0)
    58  
    59  	movl	PARAM_SIZE, %edx
    60  
    61  	movl	PARAM_SRC, %eax
    62  
    63  	movl	PARAM_DIVISOR, %ecx
    64  
    65  	movd	%ecx, %mm6
    66  	movl	PARAM_SHIFT, %ecx
    67  
    68  	movd	%ecx, %mm7		C shift
    69  
    70  	C
    71  
    72  	movl	PARAM_INVERSE, %ecx
    73  	movd	%ecx, %mm5		C inv
    74  
    75  	movl	PARAM_DST, %ecx
    76  	pxor	%mm1, %mm1		C initial carry limb
    77  	pxor	%mm0, %mm0		C initial carry bit
    78  
    79  	subl	$1, %edx
    80  	jz	L(done)
    81  
    82  	pcmpeqd	%mm4, %mm4
    83  	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
    84  
    85  C The dependent chain here is as follows.
    86  C
    87  C					latency
    88  C	psubq	 s = (src-cbit) - climb	   2
    89  C	pmuludq	 q = s*inverse		   8
    90  C	pmuludq	 prod = q*divisor	   8
    91  C	psrlq	 climb = high(prod)	   2
    92  C					  --
    93  C					  20
    94  C
    95  C Yet the loop measures 19.0 c/l, so obviously there's something gained
    96  C there over a straight reading of the chip documentation.
    97  
    98  L(top):
    99  	C eax	src, incrementing
   100  	C ebx
   101  	C ecx	dst, incrementing
   102  	C edx	counter, size-1 iterations
   103  	C
   104  	C mm0	carry bit
   105  	C mm1	carry limb
   106  	C mm4	0x00000000FFFFFFFF
   107  	C mm5	inverse
   108  	C mm6	divisor
   109  	C mm7	shift
   110  
   111  	movd	(%eax), %mm2
   112  	movd	4(%eax), %mm3
   113  	addl	$4, %eax
   114  	punpckldq %mm3, %mm2
   115  
   116  	psrlq	%mm7, %mm2
   117  	pand	%mm4, %mm2		C src
   118  	psubq	%mm0, %mm2		C src - cbit
   119  
   120  	psubq	%mm1, %mm2		C src - cbit - climb
   121  	movq	%mm2, %mm0
   122  	psrlq	$63, %mm0		C new cbit
   123  
   124  	pmuludq	%mm5, %mm2		C s*inverse
   125  	movd	%mm2, (%ecx)		C q
   126  	addl	$4, %ecx
   127  
   128  	movq	%mm6, %mm1
   129  	pmuludq	%mm2, %mm1		C q*divisor
   130  	psrlq	$32, %mm1		C new climb
   131  
   132  L(entry):
   133  	subl	$1, %edx
   134  	jnz	L(top)
   135  
   136  L(done):
   137  	movd	(%eax), %mm2
   138  	psrlq	%mm7, %mm2		C src
   139  	psubq	%mm0, %mm2		C src - cbit
   140  
   141  	psubq	%mm1, %mm2		C src - cbit - climb
   142  
   143  	pmuludq	%mm5, %mm2		C s*inverse
   144  	movd	%mm2, (%ecx)		C q
   145  
   146  	emms
   147  	ret
   148  
   149  EPILOGUE()
   150  
   151  	ALIGN(16)
   152  C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
   153  C                           mp_limb_t divisor);
   154  C
   155  PROLOGUE(mpn_bdiv_q_1)
   156  deflit(`FRAME',0)
   157  
   158  	movl	PARAM_SIZE, %edx
   159  
   160  	movl	PARAM_DIVISOR, %ecx
   161  
   162  	C eax	src
   163  	C ebx
   164  	C ecx	divisor
   165  	C edx	size-1
   166  
   167  	movl	%ecx, %eax
   168  	bsfl	%ecx, %ecx		C trailing twos
   169  
   170  	shrl	%cl, %eax		C d = divisor without twos
   171  	movd	%eax, %mm6
   172  	movd	%ecx, %mm7		C shift
   173  
   174  	shrl	%eax			C d/2
   175  
   176  	andl	$127, %eax		C d/2, 7 bits
   177  
   178  ifdef(`PIC',`
   179  	LEA(	binvert_limb_table, %ecx)
   180  	movzbl	(%eax,%ecx), %eax		C inv 8 bits
   181  ',`
   182  	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
   183  ')
   184  
   185  	C
   186  
   187  	movd	%eax, %mm5		C inv
   188  
   189  	movd	%eax, %mm0		C inv
   190  
   191  	pmuludq	%mm5, %mm5		C inv*inv
   192  
   193  	C
   194  
   195  	pmuludq	%mm6, %mm5		C inv*inv*d
   196  	paddd	%mm0, %mm0		C 2*inv
   197  
   198  	C
   199  
   200  	psubd	%mm5, %mm0		C inv = 2*inv - inv*inv*d
   201  	pxor	%mm5, %mm5
   202  
   203  	paddd	%mm0, %mm5
   204  	pmuludq	%mm0, %mm0		C inv*inv
   205  
   206  	pcmpeqd	%mm4, %mm4
   207  	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
   208  
   209  	C
   210  
   211  	pmuludq	%mm6, %mm0		C inv*inv*d
   212  	paddd	%mm5, %mm5		C 2*inv
   213  
   214  	movl	PARAM_SRC, %eax
   215  	movl	PARAM_DST, %ecx
   216  	pxor	%mm1, %mm1		C initial carry limb
   217  
   218  	C
   219  
   220  	psubd	%mm0, %mm5		C inv = 2*inv - inv*inv*d
   221  
   222  	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
   223  	pushl	%eax	FRAME_pushl()
   224  	movq	%mm6, %mm0
   225  	pmuludq	%mm5, %mm0
   226  	movd	%mm0, %eax
   227  	cmpl	$1, %eax
   228  	popl	%eax	FRAME_popl()')
   229  
   230  	pxor	%mm0, %mm0		C initial carry bit
   231  	jmp	L(entry)
   232  
   233  EPILOGUE()
   234  ASM_END()