github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/lshsub_n.asm (about)

     1  dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.
     2  
     3  dnl  Copyright 2006 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)
    34  
    35  C (1) The loop is not scheduled in any way, and scheduling attempts have not
    36  C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
    37  C     at all wants to use MMX.
    38  C (2) We could save a register by not alternatingly using eax and edx in the
    39  C     loop.
    40  
    41  define(`rp',	`%edi')
    42  define(`up',	`%esi')
    43  define(`vp',	`%ebx')
    44  define(`n',	`%ecx')
    45  define(`cnt',	`%mm7')
    46  
    47  ASM_START()
    48  
    49  	TEXT
    50  	ALIGN(16)
    51  
    52  PROLOGUE(mpn_lshsub_n)
    53  	push	%edi
    54  	push	%esi
    55  	push	%ebx
    56  
    57  	mov	16(%esp), rp
    58  	mov	20(%esp), up
    59  	mov	24(%esp), vp
    60  	mov	28(%esp), n
    61  	mov	$32, %eax
    62  	sub	32(%esp), %eax
    63  	movd	%eax, cnt
    64  
    65  	lea	(up,n,4), up
    66  	lea	(vp,n,4), vp
    67  	lea	(rp,n,4), rp
    68  
    69  	neg	n
    70  	mov	n, %eax
    71  	and	$-8, n
    72  	and	$7, %eax
    73  	shl	%eax				C eax = 2x
    74  	lea	(%eax,%eax,4), %edx		C edx = 10x
    75  ifdef(`PIC',`
    76  	call	L(pic_calc)
    77  L(here):
    78  ',`
    79  	lea	L(ent)(%eax,%edx,2), %eax	C eax = 22x
    80  ')
    81  
    82  	pxor	%mm1, %mm1
    83  	pxor	%mm0, %mm0
    84  
    85  	jmp	*%eax
    86  
    87  ifdef(`PIC',`
    88  L(pic_calc):
    89  	C See mpn/x86/README about old gas bugs
    90  	lea	(%eax,%edx,2), %eax
    91  	add	$L(ent)-L(here), %eax
    92  	add	(%esp), %eax
    93  	ret_internal
    94  ')
    95  
    96  L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
    97  	sbb	%eax, %eax
    98  	neg	%eax
    99  	mov	32(%esp), %ecx
   100  	shld	%cl, %edx, %eax
   101  
   102  	emms
   103  
   104  	pop	%ebx
   105  	pop	%esi
   106  	pop	%edi
   107  	ret
   108  	ALIGN(16)
   109  L(top):	jecxz	L(end)
   110  L(ent):	mov	   0(up,n,4), %eax
   111  	sbb	   0(vp,n,4), %eax
   112  	movd	   %eax, %mm0
   113  	punpckldq  %mm0, %mm1
   114  	psrlq	   %mm7, %mm1
   115  	movd	   %mm1, 0(rp,n,4)
   116  
   117  	mov	   4(up,n,4), %edx
   118  	sbb	   4(vp,n,4), %edx
   119  	movd	   %edx, %mm1
   120  	punpckldq  %mm1, %mm0
   121  	psrlq	   %mm7, %mm0
   122  	movd	   %mm0, 4(rp,n,4)
   123  
   124  	mov	   8(up,n,4), %eax
   125  	sbb	   8(vp,n,4), %eax
   126  	movd	   %eax, %mm0
   127  	punpckldq  %mm0, %mm1
   128  	psrlq	   %mm7, %mm1
   129  	movd	   %mm1, 8(rp,n,4)
   130  
   131  	mov	   12(up,n,4), %edx
   132  	sbb	   12(vp,n,4), %edx
   133  	movd	   %edx, %mm1
   134  	punpckldq  %mm1, %mm0
   135  	psrlq	   %mm7, %mm0
   136  	movd	   %mm0, 12(rp,n,4)
   137  
   138  	mov	   16(up,n,4), %eax
   139  	sbb	   16(vp,n,4), %eax
   140  	movd	   %eax, %mm0
   141  	punpckldq  %mm0, %mm1
   142  	psrlq	   %mm7, %mm1
   143  	movd	   %mm1, 16(rp,n,4)
   144  
   145  	mov	   20(up,n,4), %edx
   146  	sbb	   20(vp,n,4), %edx
   147  	movd	   %edx, %mm1
   148  	punpckldq  %mm1, %mm0
   149  	psrlq	   %mm7, %mm0
   150  	movd	   %mm0, 20(rp,n,4)
   151  
   152  	mov	   24(up,n,4), %eax
   153  	sbb	   24(vp,n,4), %eax
   154  	movd	   %eax, %mm0
   155  	punpckldq  %mm0, %mm1
   156  	psrlq	   %mm7, %mm1
   157  	movd	   %mm1, 24(rp,n,4)
   158  
   159  	mov	   28(up,n,4), %edx
   160  	sbb	   28(vp,n,4), %edx
   161  	movd	   %edx, %mm1
   162  	punpckldq  %mm1, %mm0
   163  	psrlq	   %mm7, %mm0
   164  	movd	   %mm0, 28(rp,n,4)
   165  
   166  	lea	   8(n), n
   167  	jmp	   L(top)
   168  
   169  EPILOGUE()