github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/sublsh1_n.asm (about)

     1  dnl  AMD K7 mpn_sublsh1_n_ip1 -- rp[] = rp[] - (up[] << 1)
     2  
     3  dnl  Copyright 2011 Free Software Foundation, Inc.
     4  
     5  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C This is an attempt at a sublsh1_n for x86-32, not relying on sse2 insns.  The
    36  C inner loop is 2*4-way unrolled, which is the best we can do with the available
    37  C registers.  It seems tricky to use the same structure for rsblsh1_n, since we
    38  C cannot feed carry between operations there.
    39  
    40  C			    cycles/limb
    41  C P5
    42  C P6 model 0-8,10-12
    43  C P6 model 9  (Banias)
    44  C P6 model 13 (Dothan)
    45  C P4 model 0  (Willamette)
    46  C P4 model 1  (?)
    47  C P4 model 2  (Northwood)
    48  C P4 model 3  (Prescott)
    49  C P4 model 4  (Nocona)
    50  C Intel Atom			 6.75
    51  C AMD K6
    52  C AMD K7
    53  C AMD K8
    54  
    55  C This is a basic sublsh1_n for k7, atom, and perhaps some other x86-32
    56  C processors.  It uses 2*4-way unrolling, for good reasons.
    57  C
    58  C Breaking the carry recurrence might be a good idea.  We would then need
    59  C separate registers for the shift carry and add/subtract carry, which in turn
    60  C would force us to 2*2-way unrolling.
    61  
    62  defframe(PARAM_SIZE,	12)	C number of limbs to process
    63  defframe(PARAM_SRC,	 8)	C source pointer, loaded into up
    64  defframe(PARAM_DST,	 4)	C destination pointer, loaded into rp
    65  
    66  dnl  Re-use parameter slots as scratch storage.  Each slot is only written
        dnl  after the parameter it held has been loaded into a register.
    67  define(VAR_COUNT,`PARAM_SIZE')	C counter for the 8-limb loop
    68  define(SAVE_EBX,`PARAM_SRC')	C holds the caller ebx during the routine
    69  define(SAVE_EBP,`PARAM_DST')	C holds the caller ebp during the 8-limb loop
    70  
    71  ASM_START()
    72  	TEXT
    73  	ALIGN(8)
    74  PROLOGUE(mpn_sublsh1_n_ip1)
    75  deflit(`FRAME',0)
    76  
        C mpn_sublsh1_n_ip1: rp[] = rp[] - (up[] << 1), updating rp[] in place.
        C
        C Stack parameters: PARAM_DST (loaded into rp), PARAM_SRC (loaded into up)
        C and PARAM_SIZE (limb count; limbs are 4 bytes each in this code).
        C
        C Returns in eax the sum of the two carry bits left after the last limb:
        C the bit shifted out by the final adc r,r and the borrow from the final sbb.
        C
        C esi, edi, ebx and ebp are saved and restored (ebx and ebp in re-used
        C parameter slots, not by push).  eax, ecx and edx are used as scratch.
        C
        C Carry handling: two carry chains are live, the shift carry produced by
        C adc r,r and the borrow produced by sbb.  One of them sits in CF while the
        C other waits in edx; rcr and adc on edx exchange them.  Both bits end up
        C added into the amount subtracted from the next limb, so it does not
        C matter which of the two is in CF at a given moment.  When CPU_P6 is
        C defined, both bits are kept in edx across the loop branches instead of
        C leaving one of them in CF.
        C
        C The code depends on inc, dec, lea, mov and not leaving CF unchanged.
    77  define(`rp',  `%edi')
    78  define(`up',  `%esi')
    79  
    80  	mov	PARAM_SIZE, %eax	C eax = size in limbs
    81  	push	up			FRAME_pushl()	C save caller esi
    82  	push	rp			FRAME_pushl()	C save caller edi
    83  	xor	%edx, %edx		C edx = saved carry bits, none yet
    84  	mov	PARAM_SRC, up
    85  	mov	PARAM_DST, rp
    86  	mov	%ebx, SAVE_EBX		C park ebx in the SRC slot, already loaded
    87  	mov	%eax, %ebx
    88  	shr	$3, %eax		C eax = size\8, number of 8-limb blocks
    89  
    90  	not	%eax			C eax = -(size\8)-1
    91  	and	$7, %ebx		C ebx = size % 8; and also leaves CF = 0
    92  	jz	L(exact)		C no leftover limbs, go to the block loop
    93  
    94  L(oop):				C one limb per pass, ebx = leftover limbs to do
    95  ifdef(`CPU_P6',`
    96  	shr	%edx ')			C P6: CF = edx bit 0 (2nd saved carry), edx >>= 1
    97  	mov	(up), %ecx
    98  	adc	%ecx, %ecx		C ecx = 2*limb + CF, CF = bit shifted out
    99  	rcr	%edx			C CF = saved bit from edx, edx bit 31 = old CF
   100  	lea	4(up), up		C advance source, flags untouched
   101  	sbb	%ecx, (rp)		C (rp) -= ecx + CF, CF = borrow
   102  	lea	4(rp), rp		C advance destination, flags untouched
   103  	adc	%edx, %edx		C edx bit 0 = CF, CF = bit parked in edx bit 31
   104  ifdef(`CPU_P6',`
   105  	adc	%edx, %edx ')		C P6: also move the carry still in CF into edx
   106  	dec	%ebx			C dec leaves CF alone
   107  	jnz	L(oop)
   108  L(exact):
   109  	inc	%eax			C eax = -(size\8); inc leaves CF alone
   110  	jz	L(end)			C no complete 8-limb block
   111  	mov	%eax, VAR_COUNT		C negative block count, kept in the SIZE slot
   112  	mov	%ebp, SAVE_EBP		C park ebp in the DST slot, already loaded
   113  
   114  	ALIGN(16)
   115  L(top):				C 8 limbs per pass, as two groups of 4
   116  ifdef(`CPU_P6',`
   117  	shr	%edx ')			C P6: CF = edx bit 0 (2nd saved carry), edx >>= 1
   118  	mov	(up), %eax		C limbs 0-3: double through one adc chain
   119  	adc	%eax, %eax
   120  	mov	4(up), %ebx
   121  	adc	%ebx, %ebx
   122  	mov	8(up), %ecx
   123  	adc	%ecx, %ecx
   124  	mov	12(up), %ebp
   125  	adc	%ebp, %ebp		C CF = bit shifted out of limb 3
   126  
   127  	rcr	%edx			C CF = saved bit from edx, edx bit 31 = old CF
   128  
   129  	sbb	%eax, (rp)		C limbs 0-3: subtract the doubled values
   130  	sbb	%ebx, 4(rp)
   131  	sbb	%ecx, 8(rp)
   132  	sbb	%ebp, 12(rp)		C CF = borrow out of limb 3
   133  
   134  	mov	16(up), %eax		C limbs 4-7: the borrow in CF enters as low bit
   135  	adc	%eax, %eax
   136  	mov	20(up), %ebx
   137  	adc	%ebx, %ebx
   138  	mov	24(up), %ecx
   139  	adc	%ecx, %ecx
   140  	mov	28(up), %ebp
   141  	adc	%ebp, %ebp		C CF = bit shifted out of limb 7
   142  
   143  	lea	32(up), up		C advance source, flags untouched
   144  	adc	%edx, %edx		C edx bit 0 = CF, CF = bit parked in edx bit 31
   145  
   146  	sbb	%eax, 16(rp)		C limbs 4-7: subtract the doubled values
   147  	sbb	%ebx, 20(rp)
   148  	sbb	%ecx, 24(rp)
   149  	sbb	%ebp, 28(rp)		C CF = borrow out of limb 7
   150  
   151  ifdef(`CPU_P6',`
   152  	adc	%edx, %edx ')		C P6: also move the carry still in CF into edx
   153  	incl	VAR_COUNT		C count up towards zero; inc leaves CF alone
   154  	lea	32(rp), rp		C advance destination, flags untouched
   155  	jne	L(top)			C tests the result of incl
   156  
   157  	mov	SAVE_EBP, %ebp		C restore caller ebp
   158  L(end):
   159  	mov	SAVE_EBX, %ebx		C restore caller ebx
   160  
   161  ifdef(`CPU_P6',`
   162  	xor	%eax, %eax		C eax = 0
   163  	shr	$1, %edx		C CF = one saved bit, edx = the other
   164  	adc	%edx, %eax		C return value = edx + CF
   165  ',`
   166  	adc	$0, %edx		C edx = saved bit + bit still in CF
   167  	mov	%edx, %eax		C return value
   168  ')
   169  	pop	rp			FRAME_popl()	C restore caller edi
   170  	pop	up			FRAME_popl()	C restore caller esi
   171  	ret
   172  EPILOGUE()
   173  ASM_END()