github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/atom/lshift.asm (about)

     1  dnl  Intel Atom mpn_lshift -- mpn left shift.
     2  
     3  dnl  Copyright 2011 Free Software Foundation, Inc.
     4  
     5  dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
    36  C			unsigned cnt);
    37  
    38  C				  cycles/limb
    39  C				cnt!=1	cnt==1
    40  C P5
    41  C P6 model 0-8,10-12
    42  C P6 model 9  (Banias)
    43  C P6 model 13 (Dothan)
    44  C P4 model 0  (Willamette)
    45  C P4 model 1  (?)
    46  C P4 model 2  (Northwood)
    47  C P4 model 3  (Prescott)
    48  C P4 model 4  (Nocona)
    49  C Intel Atom			 5	 2.5
    50  C AMD K6
    51  C AMD K7
    52  C AMD K8
    53  C AMD K10
    54  
    55  defframe(PARAM_CNT, 16)			C 4th argument: unsigned cnt
    56  defframe(PARAM_SIZE,12)			C 3rd argument: mp_size_t size
    57  defframe(PARAM_SRC,  8)			C 2nd argument: mp_srcptr src
    58  defframe(PARAM_DST,  4)			C 1st argument: mp_ptr dst
    59  
    60  dnl  re-use parameter space: each slot below is written only after its parameter has been loaded
    61  define(SAVE_UP,`PARAM_CNT')		C holds the caller %esi while up is in use
    62  define(VAR_COUNT,`PARAM_SIZE')		C in-memory counter for the L(top) loop
    63  define(SAVE_EBX,`PARAM_SRC')		C holds the caller %ebx
    64  define(SAVE_EBP,`PARAM_DST')		C holds the caller %ebp
    65  
    66  define(`rp',  `%edi')			C destination (result) pointer
    67  define(`up',  `%esi')			C source pointer
    68  define(`cnt',  `%ecx')			C shift count; the variable shifts take it from %cl
    69  
    70  ASM_START()
    71  	TEXT
    72  	ALIGN(8)
    73  deflit(`FRAME',0)			C presumably bytes pushed so far (none yet); see FRAME_pushl/FRAME_popl
    74  PROLOGUE(mpn_lshift)			C {dst,size} = {src,size} << cnt; %eax = bits shifted out
    75  	mov	PARAM_CNT, cnt		C %ecx = shift count
    76  	mov	PARAM_SIZE, %edx	C %edx = number of limbs
    77  	mov	up, SAVE_UP		C stash %esi in the cnt slot, which has been read
    78  	mov	PARAM_SRC, up		C up = source pointer
    79  	push	rp			FRAME_pushl()
    80  	mov	PARAM_DST, rp		C rp = destination pointer
    81  
    82  C We can use faster code for shift-by-1 under certain conditions.
    83  	cmp	$1,cnt
    84  	jne	L(normal)		C cnt != 1: use the general code
    85  	cmpl	rp, up
    86  	jnc	L(special)		C jump if s_ptr >= res_ptr
    87  	leal	(up,%edx,4),%eax	C %eax = s_ptr + size (4-byte limbs)
    88  	cmpl	%eax,rp
    89  	jnc	L(special)		C jump if res_ptr >= s_ptr + size
    90  
    91  L(normal):				C general case: start at the highest-addressed limb
    92  	lea	-4(up,%edx,4), up	C up -> last source limb
    93  	mov	%ebx, SAVE_EBX		C stash %ebx in the src slot, which has been read
    94  	lea	-4(rp,%edx,4), rp	C rp -> last destination limb
    95  
    96  	shr	%edx			C %edx = size / 2, CF = size & 1
    97  	mov	(up), %eax		C %eax = top source limb (mov leaves CF alone)
    98  	mov	%edx, VAR_COUNT		C pair counter lives in the size slot
    99  	jnc	L(evn)			C even size
   100  
   101  	mov	%eax, %ebx		C odd size: peel off the top limb
   102  	shl	%cl, %ebx		C %ebx = top limb << cnt
   103  	neg	cnt			C %cl now shifts by 32 - cnt (count is taken mod 32)
   104  	shr	%cl, %eax		C %eax = bits shifted out of the top limb
   105  	test	%edx, %edx
   106  	jnz	L(gt1)			C size > 1
   107  	mov	%ebx, (rp)		C size == 1: store the only result limb
   108  	jmp	L(quit)
   109  
   110  L(gt1):	mov	%ebp, SAVE_EBP		C stash %ebp in the dst slot, which has been read
   111  	push	%eax			C park the return value; popped after the loop
   112  	mov	-4(up), %eax		C next limb down
   113  	mov	%eax, %ebp		C raw copy, shifted left by cnt later
   114  	shr	%cl, %eax		C its top cnt bits, merged into the top result at L(lo1)
   115  	jmp	L(lo1)			C enter mid-loop with %cl still negated
   116  
   117  L(evn):	mov	%ebp, SAVE_EBP		C stash %ebp in the dst slot, which has been read
   118  	neg	cnt			C %cl now shifts by 32 - cnt
   119  	mov	%eax, %ebp		C %ebp = raw top limb
   120  	mov	-4(up), %edx		C %edx = next limb down
   121  	shr	%cl, %eax		C %eax = bits shifted out of the top limb
   122  	mov	%edx, %ebx		C %ebx = raw copy of the lower limb
   123  	shr	%cl, %edx		C %edx = its top cnt bits
   124  	neg	cnt			C %cl = cnt again
   125  	decl	VAR_COUNT		C first pair is loaded; ZF is consumed by jz below
   126  	lea	4(rp), rp		C bias rp for the -4/-8 stores (lea keeps flags)
   127  	lea	-4(up), up		C up -> lower limb of the pair
   128  	jz	L(end)			C size == 2: nothing left for the loop
   129  	push	%eax			FRAME_pushl()
   130  
   131  	ALIGN(8)
   132  L(top):	shl	%cl, %ebp		C two limbs per pass: upper limb << cnt
   133  	or	%ebp, %edx		C merge in the top bits of the limb below
   134  	shl	%cl, %ebx		C lower limb << cnt, completed at L(lo1)
   135  	neg	cnt			C %cl now shifts by 32 - cnt
   136  	mov	-4(up), %eax		C load the next limb down
   137  	mov	%eax, %ebp		C raw copy, shifted left by cnt later
   138  	mov	%edx, -4(rp)		C store result limb
   139  	shr	%cl, %eax		C its top cnt bits
   140  	lea	-8(rp), rp
   141  L(lo1):	mov	-8(up), %edx		C load the limb below that
   142  	or	%ebx, %eax		C second result limb of the pass
   143  	mov	%edx, %ebx		C raw copy
   144  	shr	%cl, %edx		C its top cnt bits
   145  	lea	-8(up), up
   146  	neg	cnt			C %cl = cnt again
   147  	mov	%eax, (rp)		C store result limb
   148  	decl	VAR_COUNT
   149  	jg	L(top)			C loop while VAR_COUNT > 0
   150  
   151  	pop	%eax			FRAME_popl()
   152  L(end):					C wind down: the last two result limbs
   153  	shl	%cl, %ebp		C upper limb << cnt
   154  	shl	%cl, %ebx		C lowest limb << cnt; zeros enter at the bottom
   155  	or	%ebp, %edx
   156  	mov	SAVE_EBP, %ebp		C restore %ebp
   157  	mov	%edx, -4(rp)
   158  	mov	%ebx, -8(rp)		C lowest result limb
   159  
   160  L(quit):
   161  	mov	SAVE_UP, up		C restore %esi
   162  	mov	SAVE_EBX, %ebx		C restore %ebx
   163  	pop	rp			FRAME_popl()
   164  	ret				C %eax = bits shifted out of the top limb
   165  
   166  L(special):				C cnt == 1: double each limb, ascending, carry in CF
   167  deflit(`FRAME',4)			C only the push of rp is outstanding on this path
   168  	lea	3(%edx), %eax		C size + 3
   169  	dec	%edx			C size - 1
   170  	mov	(up), %ecx		C %ecx = lowest source limb (cnt is no longer needed)
   171  	shr	$2, %eax		C (size + 3) / 4
   172  	and	$3, %edx		C (size - 1) % 4 (the and also clears CF)
   173  	jz	L(goloop)		C jump if  size == 1 (mod 4)
   174  	shr	%edx
   175  	jnc	L(odd)			C jump if  size == 3 (mod 4)
   176  
   177  	add	%ecx, %ecx		C size even: do one limb up front, CF = its top bit
   178  	lea	4(up), up
   179  	mov	%ecx, (rp)
   180  	mov	(up), %ecx
   181  	lea	4(rp), rp
   182  
   183  	dec	%edx			C dec leaves CF from the add above intact
   184  	jnz	L(goloop)		C jump if  size == 2 (mod 4)
   185  L(odd):	lea	-8(up), up		C bias pointers for the 8/12 offsets at L(sentry)
   186  	lea	-8(rp), rp
   187  	jmp	L(sentry)		C reached if size == 0 or 3 (mod 4)
   188  
   189  L(sloop):				C four limbs per pass; adc carries each top bit upward
   190  	adc	%ecx, %ecx		C limb * 2 + carry in; CF = bit shifted out
   191  	mov	4(up), %edx
   192  	mov	%ecx, (rp)
   193  	adc	%edx, %edx
   194  	mov	8(up), %ecx
   195  	mov	%edx, 4(rp)
   196  L(sentry):				C mid-loop entry used when size == 0 or 3 (mod 4)
   197  	adc	%ecx, %ecx
   198  	mov	12(up), %edx
   199  	mov	%ecx, 8(rp)
   200  	adc	%edx, %edx
   201  	lea	16(up), up		C lea and mov do not disturb CF
   202  	mov	%edx, 12(rp)
   203  	lea	16(rp), rp
   204  	mov	(up), %ecx		C preload the next limb
   205  L(goloop):
   206  	decl	%eax			C passes left; dec preserves CF
   207  	jnz	L(sloop)
   208  
   209  L(squit):
   210  	adc	%ecx, %ecx		C final (top) limb
   211  	mov	%ecx, (rp)
   212  	adc	%eax, %eax		C %eax was 0, so it becomes the bit shifted out
   213  
   214  	mov	SAVE_UP, up		C restore %esi
   215  	pop	rp			FRAME_popl()
   216  	ret
   217  EPILOGUE()
   218  ASM_END()