github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mod_1_4.asm (about)

     1  dnl  x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C TODO:
    36  C  * Optimize.  The present code was written quite straightforwardly.
    37  C  * Optimize post-loop reduction code.
    38  C  * Write a cps function that uses sse2 insns.
    39  
    40  C			    cycles/limb
    41  C P6 model 0-8,10-12		-
    42  C P6 model 9   (Banias)		?
    43  C P6 model 13  (Dothan)		3.4
    44  C P4 model 0-1 (Willamette)	?
    45  C P4 model 2   (Northwood)	4
    46  C P4 model 3-4 (Prescott)	4.5
    47  
    48  C INPUT PARAMETERS of mpn_mod_1s_4p (stack offsets at entry, before any push)
    49  C ap		sp + 4
    50  C n		sp + 8
    51  C b		sp + 12
    52  C cps		sp + 16
    53  
    54  define(`B1modb', `%mm1')	C constant read from cps byte offset 8
    55  define(`B2modb', `%mm2')	C constant read from cps byte offset 12
    56  define(`B3modb', `%mm3')	C constant read from cps byte offset 16
    57  define(`B4modb', `%mm4')	C constant read from cps byte offset 20
    58  define(`B5modb', `%mm5')	C constant read from cps byte offset 24
        C NOTE(review): the names suggest BKmodb = B^K mod b with B = 2^32;
        C confirm against the C reference implementation.
        C %mm1, %mm2 and %mm4 are reused as scratch once the main loop is done.
    59  define(`ap',     `%edx')	C limb pointer, walks from the top limb downwards
    60  define(`n',      `%eax')	C number of limbs not yet folded in
    61  
    62  ASM_START()
    63  	TEXT
    64  	ALIGN(16)
        C mpn_mod_1s_4p (ap, n, b, cps)
        C
        C Folds the n limbs at ap, most significant first, into a 64-bit
        C accumulator (%mm7) using the constants stored in cps, four limbs per
        C loop iteration.  The accumulator is then reduced with respect to b
        C using the precomputed inverse bi (cps offset 0) and the shift count
        C cnt (cps offset 4).  The 32-bit result is returned in %eax.
        C
        C Stack after the push of %ebx: ap at 8(%esp), n at 12(%esp),
        C b at 16(%esp), cps at 20(%esp).
        C
        C %ebx is saved and restored; %ecx, %edx and %mm0-%mm7 are overwritten.
        C emms is executed before every return.
        C
        C NOTE(review): n = 0 is not handled (the b0 path would read below ap
        C and the loop counter would never reach zero); presumably callers
        C guarantee n >= 1 -- confirm.
        C NOTE(review): the result is shifted right by cnt at the end, which
        C suggests b is expected to be passed already shifted left by cnt;
        C confirm against callers.
    65  PROLOGUE(mpn_mod_1s_4p)
    66  	push	%ebx			C saved here, restored before each ret
    67  	mov	8(%esp), ap		C first argument: limb pointer
    68  	mov	12(%esp), n		C second argument: limb count
    69  	mov	20(%esp), %ecx		C fourth argument: cps table pointer
    70  
    71  	movd	8(%ecx), B1modb		C load the five folding constants
    72  	movd	12(%ecx), B2modb
    73  	movd	16(%ecx), B3modb
    74  	movd	20(%ecx), B4modb
    75  	movd	24(%ecx), B5modb
    76  
    77  	mov	n, %ebx
    78  	lea	-4(ap,n,4), ap		C ap = address of the top limb, limb n-1
    79  	and	$3, %ebx		C n mod 4 selects the entry path
    80  	je	L(b0)			C n mod 4 = 0
    81  	cmp	$2, %ebx
    82  	jc	L(b1)			C n mod 4 = 1
    83  	je	L(b2)			C n mod 4 = 2, fall through for 3
    84  
        C n mod 4 = 3: start with limb[n-3] + limb[n-2]*B1modb + limb[n-1]*B2modb
    85  L(b3):	movd	-4(ap), %mm7		C limb n-2
    86  	pmuludq	B1modb, %mm7
    87  	movd	-8(ap), %mm6		C limb n-3
    88  	paddq	%mm6, %mm7
    89  	movd	(ap), %mm6		C limb n-1
    90  	pmuludq	B2modb, %mm6
    91  	paddq	%mm6, %mm7		C 64-bit sum of the top three limbs
    92  	lea	-24(ap), ap		C ap = lowest limb of the next group of four
    93  	add	$-3, n			C three limbs consumed
    94  	jz	L(end)			C nothing left, go reduce
    95  	jmp	L(top)
    96  
        C n mod 4 = 0: start with limb[n-4] + limb[n-3]*B1modb
        C                       + limb[n-2]*B2modb + limb[n-1]*B3modb
    97  L(b0):	movd	-8(ap), %mm7		C limb n-3
    98  	pmuludq	B1modb, %mm7
    99  	movd	-12(ap), %mm6		C limb n-4
   100  	paddq	%mm6, %mm7
   101  	movd	-4(ap), %mm6		C limb n-2
   102  	pmuludq	B2modb, %mm6
   103  	paddq	%mm6, %mm7
   104  	movd	(ap), %mm6		C limb n-1
   105  	pmuludq	B3modb, %mm6
   106  	paddq	%mm6, %mm7		C 64-bit sum of the top four limbs
   107  	lea	-28(ap), ap		C ap = lowest limb of the next group of four
   108  	add	$-4, n			C four limbs consumed
   109  	jz	L(end)
   110  	jmp	L(top)
   111  
        C n mod 4 = 1: start with the top limb alone
   112  L(b1):	movd	(ap), %mm7		C limb n-1, upper half of %mm7 is zero
   113  	lea	-16(ap), ap		C ap = lowest limb of the next group of four
   114  	dec	n
   115  	jz	L(x)			C n was 1: skip the 64-to-32 bit fold at L(end)
   116  	jmp	L(top)
   117  
        C n mod 4 = 2: start with the top two limbs as one 64-bit value
   118  L(b2):	movd	-4(ap), %mm7		C rl = limb n-2
   119  	punpckldq (ap), %mm7		C rh = limb n-1
   120  	lea	-20(ap), ap		C ap = lowest limb of the next group of four
   121  	add	$-2, n			C two limbs consumed
   122  	jz	L(end)			C otherwise fall into the loop
   123  
        C Main loop.  With rh:rl the two halves of the accumulator %mm7 and
        C l0..l3 the limbs at 0(ap)..12(ap), each pass computes
        C   %mm7 = l0 + l1*B1modb + l2*B2modb + l3*B3modb + rl*B4modb + rh*B5modb
   124  	ALIGN(8)
   125  L(top):	movd	4(ap), %mm0		C l1
   126  	pmuludq	B1modb, %mm0
   127  	movd	0(ap), %mm6		C l0
   128  	paddq	%mm6, %mm0
   129  
   130  	movd	8(ap), %mm6		C l2
   131  	pmuludq	B2modb, %mm6
   132  	paddq	%mm6, %mm0
   133  
   134  	movd	12(ap), %mm6		C l3
   135  	pmuludq	B3modb, %mm6
   136  	paddq	%mm6, %mm0		C %mm0 = contribution of the four new limbs
   137  
   138  	movq	%mm7, %mm6		C copy, pmuludq below uses only its low half (rl)
   139  	psrlq	$32, %mm7		C rh
   140  	pmuludq	B5modb, %mm7		C rh * B5modb
   141  	pmuludq	B4modb, %mm6		C rl * B4modb
   142  
   143  	paddq	%mm0, %mm7
   144  	paddq	%mm6, %mm7		C new accumulator
   145  
   146  	add	$-16, ap		C step down to the next four limbs
   147  	add	$-4, n
   148  	jnz	L(top)
   149  
        C Fold the 64-bit accumulator rh:rl into rh*B1modb + rl.
   150  L(end):	pcmpeqd	%mm4, %mm4		C all ones (B4modb is no longer needed)
   151  	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
   152  	pand	%mm7, %mm4		C rl
   153  	psrlq	$32, %mm7		C rh
   154  	pmuludq	B1modb, %mm7		C rh,cl
   155  	paddq	%mm4, %mm7		C rh,rl
        C Final reduction: shift left by cnt, estimate the quotient with bi,
        C subtract quotient * b, apply up to two corrections, shift back by cnt.
   156  L(x):	movd	4(%ecx), %mm4		C cnt
   157  	psllq	%mm4, %mm7		C rh,rl normalized
   158  	movq	%mm7, %mm2		C rl in low half
   159  	psrlq	$32, %mm7		C rh
   160  	movd	(%ecx), %mm1		C bi
   161  	pmuludq	%mm7, %mm1		C qh,ql
   162  	paddq	%mm2, %mm1		C qh-1,ql
   163  	movd	%mm1, %ecx		C ql (cps pointer is no longer needed)
   164  	psrlq	$32, %mm1		C qh-1
   165  	movd	16(%esp), %mm3		C b
   166  	pmuludq	%mm1, %mm3		C (qh-1) * b
   167  	psubq	%mm3, %mm2		C r in low half (could use psubd)
   168  	movd	%mm2, %eax		C r
   169  	mov	16(%esp), %ebx		C b
   170  	sub	%ebx, %eax		C r = r - b, i.e. rl - qh * b
   171  	cmp	%eax, %ecx		C carry iff ql < r (unsigned)
   172  	lea	(%eax,%ebx), %edx	C r + b, lea leaves the flags untouched
   173  	cmovc(	%edx, %eax)		C if ql < r then r = r + b
   174  	movd	%mm4, %ecx		C cnt
   175  	cmp	%ebx, %eax
   176  	jae	L(fix)			C r >= b: one more subtraction needed
   177  	emms				C leave MMX state before returning
   178  	pop	%ebx
   179  	shr	%cl, %eax		C undo the shift by cnt
   180  	ret
   181  
   182  L(fix):	sub	%ebx, %eax		C r = r - b
   183  	emms				C leave MMX state before returning
   184  	pop	%ebx
   185  	shr	%cl, %eax		C undo the shift by cnt
   186  	ret
   187  EPILOGUE()
   188  
   189  	ALIGN(16)
        C mpn_mod_1s_4p_cps (cps, b)
        C
        C Fills the seven-word table at cps that mpn_mod_1s_4p reads:
        C   offset 0       bi      quotient of (~bs:0xFFFFFFFF) / bs, bs = b << cnt
        C   offset 4       cnt     31 minus the index of the highest set bit of b
        C   offset 8..24   B1modb..B5modb, each shifted right by cnt before
        C                  being stored
        C
        C Stack after the four pushes: cps at 20(%esp), b at 24(%esp).
        C %ebp, %edi, %esi and %ebx are saved and restored; %eax, %ecx and %edx
        C are overwritten.
        C
        C NOTE(review): b = 0 is not handled (bsr leaves its destination
        C undefined for a zero source); presumably callers pass b != 0 -- confirm.
   190  PROLOGUE(mpn_mod_1s_4p_cps)
   191  C CAUTION: This is the same code as in k7/mod_1_4.asm
   192  	push	%ebp
   193  	push	%edi
   194  	push	%esi
   195  	push	%ebx
   196  	mov	20(%esp), %ebp		C cps pointer.  FIXME: avoid bp for 0-idx
   197  	mov	24(%esp), %ebx		C b
   198  	bsr	%ebx, %ecx		C index of the highest set bit of b
   199  	xor	$31, %ecx		C cnt = 31 - index
   200  	sal	%cl, %ebx		C b << cnt
   201  	mov	%ebx, %edx
   202  	not	%edx			C high word of the dividend = ~(b << cnt)
   203  	mov	$-1, %eax		C low word of the dividend = 0xFFFFFFFF
   204  	div	%ebx			C bi = quotient, left in %eax
   205  	xor	%edi, %edi
   206  	sub	%ebx, %edi		C %edi = -(b << cnt)
   207  	mov	$1, %esi
   208  	mov	%eax, (%ebp)		C store bi
   209  	mov	%ecx, 4(%ebp)		C store cnt
   210  	shld	%cl, %eax, %esi		C %esi = 1 followed by the top cnt bits of bi
   211  	imul	%edi, %esi		C times -(b << cnt), low 32 bits kept
   212  	mov	%eax, %edi		C keep bi in %edi for the rounds below
   213  	mul	%esi			C %edx:%eax = bi * %esi
   214  
   215  	add	%esi, %edx		C high word += %esi
   216  	shr	%cl, %esi		C shift right by cnt before storing
   217  	mov	%esi, 8(%ebp)		C store B1modb
   218  
        C Each round below derives the next constant from %edx:%eax of the
        C preceding mul and the shifted b in %ebx.
   219  	not	%edx
   220  	imul	%ebx, %edx		C low 32 bits of ~%edx * (b << cnt)
   221  	lea	(%edx,%ebx), %esi	C candidate with (b << cnt) added
   222  	cmp	%edx, %eax		C carry iff %eax < %edx (unsigned)
   223  	cmovnc(	%edx, %esi)		C no carry: take the value without the addition
   224  	mov	%edi, %eax		C reload bi
   225  	mul	%esi			C %edx:%eax = bi * %esi
   226  
   227  	add	%esi, %edx		C high word += %esi
   228  	shr	%cl, %esi		C shift right by cnt before storing
   229  	mov	%esi, 12(%ebp)		C store B2modb
   230  
   231  	not	%edx
   232  	imul	%ebx, %edx		C same round as above
   233  	lea	(%edx,%ebx), %esi
   234  	cmp	%edx, %eax
   235  	cmovnc(	%edx, %esi)
   236  	mov	%edi, %eax		C reload bi
   237  	mul	%esi
   238  
   239  	add	%esi, %edx
   240  	shr	%cl, %esi
   241  	mov	%esi, 16(%ebp)		C store B3modb
   242  
   243  	not	%edx
   244  	imul	%ebx, %edx		C same round as above
   245  	lea	(%edx,%ebx), %esi
   246  	cmp	%edx, %eax
   247  	cmovnc(	%edx, %esi)
   248  	mov	%edi, %eax		C reload bi
   249  	mul	%esi
   250  
   251  	add	%esi, %edx
   252  	shr	%cl, %esi
   253  	mov	%esi, 20(%ebp)		C store B4modb
   254  
   255  	not	%edx
   256  	imul	%ebx, %edx		C last round: no further mul follows
   257  	add	%edx, %ebx		C candidate built in %ebx, b << cnt is consumed
   258  	cmp	%edx, %eax		C carry iff %eax < %edx (unsigned)
   259  	cmovnc(	%edx, %ebx)		C no carry: take the value without the addition
   260  
   261  	shr	%cl, %ebx		C shift right by cnt before storing
   262  	mov	%ebx, 24(%ebp)		C store B5modb
   263  
   264  	pop	%ebx
   265  	pop	%esi
   266  	pop	%edi
   267  	pop	%ebp
   268  	ret
   269  EPILOGUE()