github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mod_1_4.asm (about)

     1  dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C			    cycles/limb
    36  C P5				 ?
    37  C P6 model 0-8,10-12		 ?
    38  C P6 model 9  (Banias)		 ?
    39  C P6 model 13 (Dothan)		 6
    40  C P4 model 0  (Willamette)	 ?
    41  C P4 model 1  (?)		 ?
    42  C P4 model 2  (Northwood)	15.5
    43  C P4 model 3  (Prescott)	 ?
    44  C P4 model 4  (Nocona)		 ?
    45  C AMD K6			 ?
    46  C AMD K7			 4.75
    47  C AMD K8			 ?
    48  
    49  ASM_START()
    50  	TEXT
    51  	ALIGN(16)
    52  PROLOGUE(mpn_mod_1s_4p)
        C mpn_mod_1s_4p: fold the n limbs at up into a two-limb accumulator, four
        C limbs per loop pass, using the five multipliers cps[2..6], then reduce
        C the accumulator with the inverse cps[0] and the shift count cps[1].
        C The result is returned in %eax.
        C
        C Stack arguments as read below, offsets valid once the frame is set up:
        C   48(%esp) = up, 52(%esp) = n, 56(%esp) = b, 60(%esp) = cps
        C
        C Register roles:
        C   %esi        pointer into up, stepped downwards
        C   %edi:%ebp   accumulator, high:low limb
        C   %ecx:%ebx   partial sum for the current group of four limbs
        C
        C NOTE(review): b is multiplied in directly by the final reduction and the
        C result is shifted right by cnt, so b is presumably expected to arrive
        C already shifted left by cnt -- confirm against the callers.
        C NOTE(review): n = 0 would take the L(b0) path and read the four limbs
        C just below up; presumably callers always pass n >= 1 -- confirm.
    53  	push	%ebp			C save the registers overwritten below
    54  	push	%edi
    55  	push	%esi
    56  	push	%ebx
    57  	sub	$28, %esp		C scratch frame, slots 4..20(%esp) are used
    58  	mov	60(%esp), %edi		C cps[]
    59  	mov	8(%edi), %eax		C fetch cps[2]..cps[6]
    60  	mov	12(%edi), %edx
    61  	mov	16(%edi), %ecx
    62  	mov	20(%edi), %esi
    63  	mov	24(%edi), %edi		C cps pointer is overwritten, reloaded at L(end)
    64  	mov	%eax, 4(%esp)		C frame copy of cps[2]
    65  	mov	%edx, 8(%esp)		C frame copy of cps[3]
    66  	mov	%ecx, 12(%esp)		C frame copy of cps[4]
    67  	mov	%esi, 16(%esp)		C frame copy of cps[5]
    68  	mov	%edi, 20(%esp)		C frame copy of cps[6]
    69  	mov	52(%esp), %eax		C n
    70  	xor	%edi, %edi		C high accumulator limb = 0
    71  	mov	48(%esp), %esi		C up
    72  	lea	-12(%esi,%eax,4), %esi	C esi = up + 4*n - 12, the address of up[n-3]
    73  	and	$3, %eax		C dispatch on n mod 4
    74  	je	L(b0)			C n mod 4 = 0
    75  	cmp	$2, %eax
    76  	jc	L(b1)			C n mod 4 = 1
    77  	je	L(b2)			C n mod 4 = 2
    78  
        C n mod 4 = 3: accumulator = up[n-3] + up[n-2]*cps[2] + up[n-1]*cps[3]
    79  L(b3):	mov	4(%esi), %eax
    80  	mull	4(%esp)			C up[n-2] * cps[2]
    81  	mov	(%esi), %ebp		C up[n-3]
    82  	add	%eax, %ebp
    83  	adc	%edx, %edi
    84  	mov	8(%esi), %eax
    85  	mull	8(%esp)			C up[n-1] * cps[3], added in at L(m0)
    86  	lea	-12(%esi), %esi		C three limbs consumed
    87  	jmp	L(m0)
    88  
        C n mod 4 = 0: accumulator = up[n-4] + up[n-3]*cps[2] + up[n-2]*cps[3]
        C + up[n-1]*cps[4]
    89  L(b0):	mov	(%esi), %eax
    90  	mull	4(%esp)			C up[n-3] * cps[2]
    91  	mov	-4(%esi), %ebp		C up[n-4]
    92  	add	%eax, %ebp
    93  	adc	%edx, %edi
    94  	mov	4(%esi), %eax
    95  	mull	8(%esp)			C up[n-2] * cps[3]
    96  	add	%eax, %ebp
    97  	adc	%edx, %edi
    98  	mov	8(%esi), %eax
    99  	mull	12(%esp)		C up[n-1] * cps[4], added in at L(m0)
   100  	lea	-16(%esi), %esi		C four limbs consumed
   101  	jmp	L(m0)
   102  
        C n mod 4 = 1: accumulator = up[n-1], high limb stays zero
   103  L(b1):	mov	8(%esi), %ebp
   104  	lea	-4(%esi), %esi		C one limb consumed
   105  	jmp	L(m1)
   106  
        C n mod 4 = 2: accumulator = up[n-1]:up[n-2]
   107  L(b2):	mov	8(%esi), %edi
   108  	mov	4(%esi), %ebp
   109  	lea	-8(%esi), %esi		C two limbs consumed
   110  	jmp	L(m1)
   111  
        C Main loop.  The next four limbs sit at -4, 0, 4 and 8(%esi), lowest
        C first.  New accumulator = limb0 + limb1*cps[2] + limb2*cps[3]
        C + limb3*cps[4] + old low*cps[5] + old high*cps[6].
   112  	ALIGN(16)
   113  L(top):	mov	(%esi), %eax
   114  	mull	4(%esp)			C limb1 * cps[2]
   115  	mov	-4(%esi), %ebx		C limb0
   116  	xor	%ecx, %ecx
   117  	add	%eax, %ebx
   118  	adc	%edx, %ecx
   119  	mov	4(%esi), %eax
   120  	mull	8(%esp)			C limb2 * cps[3]
   121  	add	%eax, %ebx
   122  	adc	%edx, %ecx
   123  	mov	8(%esi), %eax
   124  	mull	12(%esp)		C limb3 * cps[4]
   125  	add	%eax, %ebx
   126  	adc	%edx, %ecx
   127  	lea	-16(%esi), %esi		C step down four limbs
   128  	mov	16(%esp), %eax
   129  	mul	%ebp			C old low accumulator limb * cps[5]
   130  	add	%eax, %ebx
   131  	adc	%edx, %ecx
   132  	mov	20(%esp), %eax
   133  	mul	%edi			C old high accumulator limb * cps[6]
   134  	mov	%ebx, %ebp		C accumulator = partial sum
   135  	mov	%ecx, %edi
   136  L(m0):	add	%eax, %ebp		C add the product still pending in edx:eax
   137  	adc	%edx, %edi
   138  L(m1):	subl	$4, 52(%esp)		C n -= 4, updated in the argument slot
   139  	ja	L(top)			C continue while the result is nonzero without borrow
   140  
        C Final reduction of the two-limb accumulator edi:ebp.
        C NOTE(review): if cnt were 0 the right shift below would use a count of 0
        C and the OR would merge the whole low limb into the high one; presumably
        C cnt is never 0 here -- confirm.
   141  L(end):	mov	4(%esp), %eax
   142  	mul	%edi			C high accumulator limb * cps[2]
   143  	mov	60(%esp), %edi		C reload the cps pointer
   144  	add	%eax, %ebp
   145  	adc	$0, %edx		C edx:ebp = low limb + high limb * cps[2]
   146  	mov	4(%edi), %ecx		C cnt = cps[1]
   147  	mov	%edx, %esi
   148  	mov	%ebp, %eax
   149  	sal	%cl, %esi		C high << cnt
   150  	mov	%ecx, %ebx		C keep cnt while ecx is negated
   151  	neg	%ecx
   152  	shr	%cl, %eax		C low >> (-cnt mod 32)
   153  	or	%esi, %eax		C eax = upper limb of edx:ebp shifted left by cnt
   154  	lea	1(%eax), %esi		C esi = upper limb + 1
   155  	mull	(%edi)			C edx:eax = upper limb * cps[0]
   156  	mov	%ebx, %ecx		C restore cnt
   157  	mov	%eax, %ebx		C ebx = low half of the product
   158  	mov	%ebp, %eax
   159  	mov	56(%esp), %ebp		C b
   160  	sal	%cl, %eax		C eax = low limb << cnt
   161  	add	%eax, %ebx		C ebx = low product + shifted low limb
   162  	adc	%esi, %edx		C edx = high product + upper limb + 1 + carry
   163  	imul	%ebp, %edx		C edx = edx * b, low 32 bits
   164  	sub	%edx, %eax		C eax = shifted low limb - edx, candidate result
   165  	lea	(%eax,%ebp), %edx	C edx = candidate + b
   166  	cmp	%eax, %ebx
   167  	cmovc(	%edx, %eax)		C if ebx < eax unsigned, take candidate + b
   168  	mov	%eax, %edx
   169  	sub	%ebp, %eax
   170  	cmovc(	%edx, %eax)		C undo the subtraction of b if it borrowed
   171  	add	$28, %esp		C release the frame
   172  	pop	%ebx
   173  	pop	%esi
   174  	pop	%edi
   175  	pop	%ebp
   176  	shr	%cl, %eax		C return value = result >> cnt, ecx still holds cnt
   177  	ret
   178  EPILOGUE()
   179  
   180  	ALIGN(16)
   181  PROLOGUE(mpn_mod_1s_4p_cps)
   182  C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
        C
        C mpn_mod_1s_4p_cps: fill a seven-word table derived from b.
        C Stack arguments as read below, after the four pushes:
        C   20(%esp) = table pointer, 24(%esp) = b
        C
        C Table layout written here, byte offsets from the table pointer:
        C    0  bi      quotient of the 64-bit value (~bn):0xffffffff divided by
        C               bn, where bn = b << cnt
        C    4  cnt     31 minus the index of the highest set bit of b
        C    8  B1modb  \
        C   12  B2modb   \  each produced from the previous one by the same
        C   16  B3modb    > multiply and correct step, and each stored shifted
        C   20  B4modb   /  right by cnt
        C   24  B5modb  /
        C
        C NOTE(review): the names suggest B^k mod b with B = 2^32 -- confirm.
        C NOTE(review): bsr leaves its destination undefined for a zero source,
        C so b is presumably never 0 here -- confirm against the callers.
        C
        C Register roles: %ebp table pointer, %ebx bn, %ecx cnt, %edi bi once it
        C has been computed, %esi current table value before the final shift.
   183  	push	%ebp			C save the registers overwritten below
   184  	push	%edi
   185  	push	%esi
   186  	push	%ebx
   187  	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
   188  	mov	24(%esp), %ebx		C b
   189  	bsr	%ebx, %ecx		C index of the highest set bit of b
   190  	xor	$31, %ecx		C cnt = 31 - index
   191  	sal	%cl, %ebx		C b << cnt
   192  	mov	%ebx, %edx
   193  	not	%edx			C dividend high word = ~bn
   194  	mov	$-1, %eax		C dividend low word = 0xffffffff
   195  	div	%ebx			C eax = bi
   196  	xor	%edi, %edi
   197  	sub	%ebx, %edi		C edi = -bn
   198  	mov	$1, %esi
   199  	mov	%eax, (%ebp)		C store bi
   200  	mov	%ecx, 4(%ebp)		C store cnt
   201  	shld	%cl, %eax, %esi		C esi = 1 << cnt, low bits filled from the top of bi
   202  	imul	%edi, %esi		C esi = -bn * esi, first value before the shift
   203  	mov	%eax, %edi		C edi = bi for the remaining steps
   204  	mul	%esi			C edx:eax = bi * current value
   205  
   206  	add	%esi, %edx		C edx = high product + current value
   207  	shr	%cl, %esi		C values are stored shifted right by cnt
   208  	mov	%esi, 8(%ebp)		C store B1modb
   209  
   210  	not	%edx
   211  	imul	%ebx, %edx		C edx = ~edx * bn, low 32 bits
   212  	lea	(%edx,%ebx), %esi	C esi = edx + bn
   213  	cmp	%edx, %eax
   214  	cmovnc(	%edx, %esi)		C if eax >= edx unsigned, next value = edx
   215  	mov	%edi, %eax
   216  	mul	%esi			C edx:eax = bi * current value
   217  
   218  	add	%esi, %edx		C same step as above
   219  	shr	%cl, %esi
   220  	mov	%esi, 12(%ebp)		C store B2modb
   221  
   222  	not	%edx
   223  	imul	%ebx, %edx
   224  	lea	(%edx,%ebx), %esi
   225  	cmp	%edx, %eax
   226  	cmovnc(	%edx, %esi)
   227  	mov	%edi, %eax
   228  	mul	%esi
   229  
   230  	add	%esi, %edx		C same step as above
   231  	shr	%cl, %esi
   232  	mov	%esi, 16(%ebp)		C store B3modb
   233  
   234  	not	%edx
   235  	imul	%ebx, %edx
   236  	lea	(%edx,%ebx), %esi
   237  	cmp	%edx, %eax
   238  	cmovnc(	%edx, %esi)
   239  	mov	%edi, %eax
   240  	mul	%esi
   241  
   242  	add	%esi, %edx		C same step as above
   243  	shr	%cl, %esi
   244  	mov	%esi, 20(%ebp)		C store B4modb
   245  
   246  	not	%edx
   247  	imul	%ebx, %edx
   248  	add	%edx, %ebx		C last step: bn is not needed again, so the sum lands in ebx
   249  	cmp	%edx, %eax
   250  	cmovnc(	%edx, %ebx)		C if eax >= edx unsigned, last value = edx
   251  
   252  	shr	%cl, %ebx
   253  	mov	%ebx, 24(%ebp)		C store B5modb
   254  
   255  	pop	%ebx			C restore the saved registers
   256  	pop	%esi
   257  	pop	%edi
   258  	pop	%ebp
   259  	ret
   260  EPILOGUE()