github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/rsh1aors_n.asm

dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C TODO
C  * Schedule loop less.  It is now almost surely overscheduled, resulting in
C    large feed-in and wind-down code.

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 5.25
C VIA nano	 ?

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')

ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')
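
C The operation computed by func_n, sketched as C-like pseudocode (an
C informal reading of the code below, not a separate specification):
C
C	t = {up,n} ADDSUB {vp,n}	C n-limb sum or difference, keeping
C					C the final carry/borrow as bit 64*n
C	{rp,n} = t >> 1			C carry/borrow ends up in the top bit
C	return t & 1			C the bit shifted out at the low end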

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func_n)
	FUNC_ENTRY(4)
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(up), %r15
	ADDSUB	(vp), %r15
	sbb	R32(%rbx), R32(%rbx)
	xor	R32(%rax), R32(%rax)
	shr	%r15
	adc	R32(%rax), R32(%rax)	C return value
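C At this point %rbx holds the low limb's carry/borrow saved as 0 or -1,
C %rax holds the bit shifted out at the bottom (the return value), and %r15
C holds the low result limb, still missing its top bit.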

	mov	R32(n), R32(%rbp)
	and	$3, R32(%rbp)
	jz	L(b0)
	cmp	$2, R32(%rbp)
	jae	L(b23)
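C Dispatch on n mod 4: the main loop below processes four limbs per
C iteration, so each residue class gets its own feed-in path (L(b0), L(b1),
C and L(b23), which splits further into the 2- and 3-limb cases).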

L(b1):	dec	n
	jnz	L(gt1)
	shl	$63, %rbx
	add	%rbx, %r15
	mov	%r15, (rp)
	jmp	L(cj1)
L(gt1):	lea	24(up), up
	lea	24(vp), vp
	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	lea	24(rp), rp
	mov	(up), %r11
	ADCSBB	-16(vp), %r9
	ADCSBB	-8(vp), %r10
	mov	%r15, %r12
	ADCSBB	(vp), %r11
	mov	%r9, %r13
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	mov	%r10, %r14
	shl	$63, %r11
	shl	$63, %r10
	shl	$63, %r9
	or	%r9, %r12
	shr	%r13
	mov	8(up), %r8
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	sub	$4, n
	jz	L(cj5)
L(gt5):	mov	16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	32(up), %r11
	jmp	L(lo1)

L(b23):	jnz	L(b3)
	mov	8(up), %r8
	sub	$2, n
	jnz	L(gt2)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	mov	%r8, %r12
	jmp	L(cj2)
L(gt2):	mov	16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	32(up), %r11
	ADCSBB	16(vp), %r9
	lea	32(up), up
	ADCSBB	24(vp), %r10
	mov	%r9, %r13
	ADCSBB	32(vp), %r11
	mov	%r8, %r12
	jmp	L(lo2)

L(b3):	lea	40(up), up
	lea	8(vp), vp
	mov	%r15, %r14
	add	R32(%rbx), R32(%rbx)
	mov	-32(up), %r11
	ADCSBB	0(vp), %r11
	lea	8(rp), rp
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	shl	$63, %r11
	mov	-24(up), %r8
	shr	%r15
	or	%r11, %r14
	sub	$3, n
	jnz	L(gt3)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	jmp	L(cj3)
L(gt3):	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	(up), %r11
	ADCSBB	16(vp), %r9
	ADCSBB	24(vp), %r10
	mov	%r8, %r12
	jmp	L(lo3)

L(b0):	lea	48(up), up
	lea	16(vp), vp
	add	R32(%rbx), R32(%rbx)
	mov	-40(up), %r10
	lea	16(rp), rp
	mov	-32(up), %r11
	ADCSBB	-8(vp), %r10
	mov	%r15, %r13
	ADCSBB	(vp), %r11
	sbb	R32(%rbx), R32(%rbx)
	mov	%r11, %r15
	mov	%r10, %r14
	shl	$63, %r11
	shl	$63, %r10
	mov	-24(up), %r8
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	sub	$4, n
	jnz	L(gt4)
	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	jmp	L(cj4)
L(gt4):	mov	-16(up), %r9
	add	R32(%rbx), R32(%rbx)
	mov	-8(up), %r10
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	mov	(up), %r11
	ADCSBB	16(vp), %r9
	jmp	L(lo0)

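C Main loop: four limbs per iteration, software pipelined (rather heavily,
C per the TODO above, hence the long feed-in and wind-down paths).  Each
C round rematerialises the saved carry/borrow from %rbx, runs a four-limb
C ADCSBB chain, forms each result limb as (t >> 1) merged with the next
C limb's low bit shifted up to bit 63, and stores the limbs begun in the
C previous round.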
	ALIGN(8)
L(top):	mov	16(up), %r9
	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	add	R32(%rbx), R32(%rbx)
	mov	24(up), %r10
	mov	%rbp, (rp)
	ADCSBB	8(vp), %r8
	mov	%r15, %rbp
	lea	32(rp), rp
	mov	32(up), %r11
L(lo1):	ADCSBB	16(vp), %r9
	lea	32(up), up
	mov	%r12, -24(rp)
L(lo0):	ADCSBB	24(vp), %r10
	mov	%r8, %r12
	mov	%r13, -16(rp)
L(lo3):	ADCSBB	32(vp), %r11
	mov	%r9, %r13
	mov	%r14, -8(rp)
L(lo2):	sbb	R32(%rbx), R32(%rbx)
	shl	$63, %r8
	mov	%r11, %r15
	shr	%r12
	mov	%r10, %r14
	shl	$63, %r9
	lea	32(vp), vp
	shl	$63, %r10
	or	%r8, %rbp
	shl	$63, %r11
	or	%r9, %r12
	shr	%r13
	mov	8(up), %r8
	sub	$4, n
	jg	L(top)

L(end):	shr	%r14
	or	%r10, %r13
	shr	%r15
	or	%r11, %r14
	mov	%rbp, (rp)
	lea	32(rp), rp
L(cj5):	add	R32(%rbx), R32(%rbx)
	ADCSBB	8(vp), %r8
	mov	%r12, -24(rp)
L(cj4):	mov	%r13, -16(rp)
L(cj3):	mov	%r8, %r12
	mov	%r14, -8(rp)
L(cj2):	sbb	R32(%rbx), R32(%rbx)
	shl	$63, %r8
	shr	%r12
	or	%r8, %r15
	shl	$63, %rbx
	add	%rbx, %r12
	mov	%r15, (rp)
	mov	%r12, 8(rp)
L(cj1):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()