github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mod_1_1.asm (about)

     1  dnl  x86-32 mpn_mod_1_1p, requiring cmov.
     2  
     3  dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
     4  
     5  dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C			    cycles/limb
    36  C P5				 ?
    37  C P6 model 0-8,10-12		 ?
    38  C P6 model 9  (Banias)		 ?
    39  C P6 model 13 (Dothan)		 ?
    40  C P4 model 0  (Willamette)	 ?
    41  C P4 model 1  (?)		 ?
    42  C P4 model 2  (Northwood)	 ?
    43  C P4 model 3  (Prescott)	 ?
    44  C P4 model 4  (Nocona)		 ?
    45  C AMD K6			 ?
    46  C AMD K7			 7
    47  C AMD K8			 ?
    48  
        C Register roles used by mpn_mod_1_1p below.
    49  define(`B2mb', `%ebx')	C Loop constant, loaded with B2modb - b before the loop
    50  define(`r0', `%esi')	C Low limb of the running residue
    51  define(`r2', `%ebp')	C 0 or -1 carry mask from sbb, reused as a residue limb in the loop
    52  define(`t0', `%edi')	C Scratch
    53  define(`ap', `%ecx')  C Also shift count
    54  
    55  C Stack frame
    56  C	pre	36(%esp)
    57  C	b	32(%esp)
    58  C	n	28(%esp)
    59  C	ap	24(%esp)
    60  C	return	20(%esp)
    61  C	%ebp	16(%esp)
    62  C	%edi	12(%esp)
    63  C	%esi	8(%esp)
    64  C	%ebx	4(%esp)
    65  C	B2mod	(%esp)
        C
        C These offsets hold once the prologue has pushed the four saved registers
        C and the copy of B2modb, i.e. five words below the return address.
    66  
        C Memory operands for the frame slots listed above.
    67  define(`B2modb', `(%esp)')
    68  define(`n', `28(%esp)')
    69  define(`b', `32(%esp)')
    70  define(`pre', `36(%esp)')
    71  
    72  C mp_limb_t
    73  C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
    74  C
    75  C The pre array contains bi, cnt, B1modb, B2modb
    76  C Note: This implementation needs B1modb only when cnt > 0
        C
        C All arguments are read from the stack and the result is left in %eax.
        C %ebp, %edi, %esi and %ebx are saved on entry and restored before ret.
        C
        C The limbs are consumed from the most significant end downwards.  Each
        C step multiplies the current top limb by B2modb and adds the product into
        C the two limbs below it, until two limbs remain.  Those are then reduced
        C with a multiply by the precomputed inverse bi plus at most two
        C corrections by b.
        C
        C ap[n-1] and ap[n-2] are loaded unconditionally, so n >= 2 is assumed
        C (TODO confirm against the callers).
        C NOTE(review): b is used unshifted in the final division while the residue
        C is shifted left by cnt, so b is presumably passed already normalized.
    77  
    78  ASM_START()
    79  	TEXT
    80  	ALIGN(8)
    81  PROLOGUE(mpn_mod_1_1p)
    82  	push	%ebp			C Save callee-saved registers
    83  	push	%edi
    84  	push	%esi
    85  	push	%ebx
    86  	mov	32(%esp), %ebp		C pre[]
    87  
    88  	mov	12(%ebp), %eax		C B2modb
    89  	push	%eax			C Put it on stack
    90  
    91  	mov	n, %edx			C Limb count
    92  	mov	24(%esp), ap		C Source pointer
    93  
    94  	lea	(ap, %edx, 4), ap	C ap now points just past the top limb
    95  	mov	-4(ap), %eax		C Most significant limb
    96  	cmp	$3, %edx
    97  	jnc	L(first)		C n >= 3, fold the high limbs first
    98  	mov	-8(ap), r0		C Second limb from the top
    99  	jmp	L(reduce_two)
   100  
   101  L(first):
   102  	C First iteration, no r2
   103  	mull	B2modb			C edx:eax = top limb * B2modb
   104  	mov	-12(ap), r0		C Third limb from the top
   105  	add	%eax, r0
   106  	mov	-8(ap), %eax		C Second limb from the top
   107  	adc	%edx, %eax
   108  	sbb	r2, r2			C r2 = -carry, i.e. 0 or -1
   109  	subl	$3, n			C Three limbs consumed, ZF feeds the jz below
   110  	lea	-16(ap), ap		C Point at the next limb to load, flags untouched
   111  	jz	L(reduce_three)		C n was exactly 3
   112  
   113  	mov	B2modb, B2mb
   114  	sub	b, B2mb			C B2mb = B2modb - b
   115  	lea	(B2mb, r0), t0		C t0 = r0 + B2modb - b
   116  	jmp	L(mid)
   117  
   118  	ALIGN(16)
   119  L(top): C Loopmixed to 7 c/l on k7
   120  	add	%eax, r0		C New low limb plus low half of the product
   121  	lea	(B2mb, r0), t0		C t0 = r0 + B2modb - b, carry kept for the adc
   122  	mov	r2, %eax
   123  	adc	%edx, %eax		C eax = r2 + high half of the product + carry
   124  	sbb	r2, r2			C r2 = -carry, i.e. 0 or -1
   125  L(mid):	mull	B2modb			C edx:eax = eax * B2modb
   126  	and	B2modb, r2		C B2modb if the carry mask was set, else 0
   127  	add	r0, r2			C CF from here is consumed by the cmovc below
   128  	decl	n			C Leaves CF alone, ZF feeds the jnz below
   129  	mov	(ap), r0		C Next lower limb
   130  	cmovc(	t0, r2)			C On carry take t0 = r0 + B2modb - b instead
   131  	lea	-4(ap), ap
   132  	jnz	L(top)
   133  
        	C Loop exit, finish the last fold as in L(top) but without t0
   134  	add	%eax, r0
   135  	mov	r2, %eax
   136  	adc	%edx, %eax
   137  	sbb	r2, r2
   138  
   139  L(reduce_three):
   140  	C Eliminate r2
   141  	and	b, r2			C b if the carry mask is set, else 0
   142  	sub	r2, %eax
   143  
   144  L(reduce_two):
        	C Here the residue is the limb pair eax (high), r0 (low)
   145  	mov	pre, %ebp		C Reload pre[], overwriting r2
   146  	movb	4(%ebp), %cl		C cl = cnt, overwriting the low byte of ap
   147  	test	%cl, %cl
   148  	jz	L(normalized)		C cnt == 0
   149  
   150  	C Unnormalized, use B1modb to reduce to size < B b
   151  	mull	8(%ebp)			C edx:eax = high limb * B1modb
   152  	xor	t0, t0
   153  	add	%eax, r0
   154  	adc	%edx, t0		C t0:r0 = r0 + high limb * B1modb
   155  	mov	t0, %eax
   156  
   157  	C Left-shift to normalize
   158  	shld	%cl, r0, %eax C Always use shld?
   159  
   160  	shl	%cl, r0
   161  	jmp	L(udiv)
   162  
   163  L(normalized):
   164  	mov	%eax, t0
   165  	sub	b, t0
   166  	cmovnc(	t0, %eax)		C If eax >= b then eax -= b
   167  
   168  L(udiv):
        	C Reduce eax:r0 modulo b using the inverse bi at pre[0]
   169  	lea	1(%eax), t0		C t0 = high limb + 1
   170  	mull	(%ebp)			C edx:eax = high limb * bi
   171  	mov	b, %ebx		C Needed in register for lea
   172  	add	r0, %eax
   173  	adc	t0, %edx		C edx = quotient estimate
   174  	imul	%ebx, %edx
   175  	sub	%edx, r0		C r0 = low limb - estimate * b, modulo 2^32
   176  	cmp	r0, %eax		C CF set when eax < r0
   177  	lea	(%ebx, r0), %eax	C Candidate r0 + b
   178  	cmovnc(	r0, %eax)		C Keep plain r0 when CF is clear
   179  	cmp	%ebx, %eax
   180  	jnc	L(fix)			C Result >= b, subtract b once more
   181  L(ok):	shr	%cl, %eax		C Undo the normalization shift
   182  
   183  	add	$4, %esp		C Drop the B2modb slot
   184  	pop	%ebx
   185  	pop	%esi
   186  	pop	%edi
   187  	pop	%ebp
   188  
   189  	ret
   190  L(fix):	sub	%ebx, %eax
   191  	jmp	L(ok)
   192  EPILOGUE()
   193  
        C mpn_mod_1_1p_cps
        C
        C Fills the four-limb array addressed by the first stack argument with
        C bi, cnt, B1modb, B2modb (in that order) for the divisor b given as the
        C second stack argument.
        C Assumes b != 0: bsr leaves its destination undefined for a zero source
        C and the div below would fault.
   194  PROLOGUE(mpn_mod_1_1p_cps)
   195  	push	%ebp
   196  	mov	12(%esp), %ebp		C b, the second argument
   197  	push	%esi
   198  	bsr	%ebp, %ecx		C Index of the highest set bit of b
   199  	push	%ebx			C NOTE(review): %ebx is not otherwise used in this routine
   200  	xor	$31, %ecx		C cnt = 31 - index, the leading zero count
   201  	mov	16(%esp), %esi		C Output array, the first argument
   202  	sal	%cl, %ebp		C Normalize, b <<= cnt
   203  	mov	%ebp, %edx
   204  	not	%edx			C edx = ~b
   205  	mov	$-1, %eax
        	C Divide the two-limb value edx:eax = ~b : 0xffffffff by b, quotient in eax
   206  	div	%ebp			C On K7, invert_limb would be a few cycles faster.
   207  	mov	%eax, (%esi)		C store bi
   208  	mov	%ecx, 4(%esi)		C store cnt
   209  	neg	%ebp			C ebp = -b, with b normalized
   210  	mov	$1, %edx
   211  	shld	%cl, %eax, %edx		C edx = (1 << cnt) | top cnt bits of bi, stays 1 when cnt = 0
   212  	imul	%ebp, %edx		C edx *= -b
   213  	shr	%cl, %edx
   214  	imul	%ebp, %eax		C eax = -b * bi
   215  	mov	%edx, 8(%esi)		C store B1modb
   216  	mov	%eax, 12(%esi)		C store B2modb
   217  	pop	%ebx
   218  	pop	%esi
   219  	pop	%ebp
   220  	ret
   221  EPILOGUE()