github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/pentium4/rsh1aors_n.asm (about)

     1  dnl  x86-64 mpn_rsh1add_n/mpn_rsh1sub_n optimized for Pentium 4.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')	C assembler/ABI helper macros (PROLOGUE, IFDOS, R32, ...)
    34  
    35  
    36  C	     cycles/limb
    37  C AMD K8,K9	 4.13
    38  C AMD K10	 4.13
    39  C Intel P4	 5.70
    40  C Intel core2	 4.75
    41  C Intel corei	 5
    42  C Intel atom	 8.75
    43  C VIA nano	 5.25
    44  
    45  C TODO
    46  C  * Try to make this smaller, 746 bytes seem excessive for this 2nd class
    47  C    function.  Less sw pipelining would help, and since we now probably
    48  C    pipeline somewhat too deeply, it might not affect performance too much.
    49  C  * A separate small-n loop might speed things as well as make things smaller.
    50  C    That loop should be selected before pushing registers.
    51  
    52  C INPUT PARAMETERS
    53  define(`rp',	`%rdi')	C destination limb pointer
    54  define(`up',	`%rsi')	C first source operand
    55  define(`vp',	`%rdx')	C second source operand
    56  define(`n',	`%rcx')	C limb count; must be >= 1 ((up)/(vp) are read unconditionally)
    57  define(`cy',	`%r8')	C carry-in (0 or 1), used by the _nc entry point only
    58  
        C This file is a template: the build system defines exactly one
        C OPERATION_* symbol, selecting add vs sub and the exported names.
    59  ifdef(`OPERATION_rsh1add_n', `
    60  	define(ADDSUB,	      add)
    61  	define(func,	      mpn_rsh1add_n)
    62  	define(func_nc,	      mpn_rsh1add_nc)')
    63  ifdef(`OPERATION_rsh1sub_n', `
    64  	define(ADDSUB,	      sub)
    65  	define(func,	      mpn_rsh1sub_n)
    66  	define(func_nc,	      mpn_rsh1sub_nc)')
    67  
    68  ABI_SUPPORT(DOS64)
    69  ABI_SUPPORT(STD64)
    70  
    71  MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)
    72  
    73  ASM_START()
    74  	TEXT
        C Plain (no carry-in) entry: clear cy, then continue at func_nc's
        C L(ent).  On STD64 execution simply falls through the EPILOGUE into
        C the next PROLOGUE; on DOS64 an explicit jump is used instead,
        C presumably because func_nc's FUNC_ENTRY is non-empty there and must
        C not be re-executed -- TODO confirm against asm-defs.m4.
    75  PROLOGUE(func)
    76  	FUNC_ENTRY(4)
    77  	xor	%r8, %r8	C cy = 0
    78  IFDOS(`	jmp	L(ent)		')
    79  EPILOGUE()
        C func_nc(rp, up, vp, n, cy): rp[] = (up[] ADDSUB vp[] + cy) >> 1,
        C n limbs; returns (in %rax) the bit shifted out at the low end
        C (bit 0 of the limb-0 sum, see the `and $1' before the return).
        C
        C Carries are propagated by hand (setc into a byte reg, then a
        C separate ADDSUB plus a jnc/mov-$1 fixup) rather than with adc/sbb,
        C presumably because adc is slow on Pentium 4 -- TODO confirm.
        C Each stored limb is built as (sum_i >> 1) | (sum_{i+1} << 63),
        C via the recurring shr / shl $63 / or triple.
        C
        C The loop is software-pipelined four limbs deep; the entry code
        C dispatches on n mod 4 to one of the loop entry points
        C L(L00)/L(L01)/L(L10)/L(L11), with rax/rbx alternating as carry
        C holders and r8/r10/r11/r12 as the rotating sum registers.
    80  PROLOGUE(func_nc)
    81  	FUNC_ENTRY(4)
    82  IFDOS(`	mov	56(%rsp), %r8	')
    83  L(ent):	push	%rbx		C save callee-saved registers
    84  	push	%r12
    85  	push	%r13
    86  	push	%r14
    87  	push	%r15
    88  
    89  	mov	(vp), %r9	C v[0]
    90  	mov	(up), %r15	C u[0]; its sum's low bit is the return value
    91  
    92  	mov	R32(n), R32(%rax)
    93  	and	$3, R32(%rax)	C dispatch on n mod 4
    94  	jne	L(n00)
    95  
    96  	mov	R32(%r8), R32(%rbx)	C n = 0, 4, 8, ...
    97  	mov	8(up), %r10
    98  	ADDSUB	%r9, %r15
    99  	mov	8(vp), %r9
   100  	setc	R8(%rax)
   101  	ADDSUB	%rbx, %r15		C return bit
   102  	jnc	1f
   103  	mov	$1, R8(%rax)
   104  1:	mov	16(up), %r12
   105  	ADDSUB	%r9, %r10
   106  	mov	16(vp), %r9
   107  	setc	R8(%rbx)
   108  	mov	%r15, %r13
   109  	ADDSUB	%rax, %r10
   110  	jnc	1f
   111  	mov	$1, R8(%rbx)
   112  1:	mov	24(up), %r11
   113  	ADDSUB	%r9, %r12
   114  	lea	32(up), up
   115  	mov	24(vp), %r9
   116  	lea	32(vp), vp
   117  	setc	R8(%rax)
   118  	mov	%r10, %r14
   119  	shl	$63, %r10
   120  	shr	%r13
   121  	jmp	L(L00)
   122  
   123  L(n00):	cmp	$2, R32(%rax)
   124  	jnc	L(n01)
   125  	xor	R32(%rbx), R32(%rbx)	C n = 1, 5, 9, ...
   126  	lea	-24(rp), rp		C bias rp so the loop's fixed offsets line up
   127  	mov	R32(%r8), R32(%rax)
   128  	dec	n
   129  	jnz	L(gt1)
   130  	ADDSUB	%r9, %r15	C n == 1: single limb, no loop
   131  	setc	R8(%rbx)
   132  	ADDSUB	%rax, %r15
   133  	jnc	1f
   134  	mov	$1, R8(%rbx)
   135  1:	mov	%r15, %r14
   136  	shl	$63, %rbx
   137  	shr	%r14
   138  	jmp	L(cj1)
   139  L(gt1):	mov	8(up), %r8
   140  	ADDSUB	%r9, %r15
   141  	mov	8(vp), %r9
   142  	setc	R8(%rbx)
   143  	ADDSUB	%rax, %r15
   144  	jnc	1f
   145  	mov	$1, R8(%rbx)
   146  1:	mov	16(up), %r10
   147  	ADDSUB	%r9, %r8
   148  	mov	16(vp), %r9
   149  	setc	R8(%rax)
   150  	mov	%r15, %r14
   151  	ADDSUB	%rbx, %r8
   152  	jnc	1f
   153  	mov	$1, R8(%rax)
   154  1:	mov	24(up), %r12
   155  	ADDSUB	%r9, %r10
   156  	mov	24(vp), %r9
   157  	setc	R8(%rbx)
   158  	mov	%r8, %r13
   159  	shl	$63, %r8
   160  	shr	%r14
   161  	lea	8(up), up
   162  	lea	8(vp), vp
   163  	jmp	L(L01)
   164  
   165  L(n01):	jne	L(n10)
   166  	lea	-16(rp), rp		C n = 2, 6, 10, ...
   167  	mov	R32(%r8), R32(%rbx)
   168  	mov	8(up), %r11
   169  	ADDSUB	%r9, %r15
   170  	mov	8(vp), %r9
   171  	setc	R8(%rax)
   172  	ADDSUB	%rbx, %r15
   173  	jnc	1f
   174  	mov	$1, R8(%rax)
   175  1:	sub	$2, n
   176  	jnz	L(gt2)
   177  	ADDSUB	%r9, %r11	C n == 2: no loop needed
   178  	setc	R8(%rbx)
   179  	mov	%r15, %r13
   180  	ADDSUB	%rax, %r11
   181  	jnc	1f
   182  	mov	$1, R8(%rbx)
   183  1:	mov	%r11, %r14
   184  	shl	$63, %r11
   185  	shr	%r13
   186  	jmp	L(cj2)
   187  L(gt2):	mov	16(up), %r8
   188  	ADDSUB	%r9, %r11
   189  	mov	16(vp), %r9
   190  	setc	R8(%rbx)
   191  	mov	%r15, %r13
   192  	ADDSUB	%rax, %r11
   193  	jnc	1f
   194  	mov	$1, R8(%rbx)
   195  1:	mov	24(up), %r10
   196  	ADDSUB	%r9, %r8
   197  	mov	24(vp), %r9
   198  	setc	R8(%rax)
   199  	mov	%r11, %r14
   200  	shl	$63, %r11
   201  	shr	%r13
   202  	lea	16(up), up
   203  	lea	16(vp), vp
   204  	jmp	L(L10)
   205  
   206  L(n10):	xor	R32(%rbx), R32(%rbx)	C n = 3, 7, 11, ...
   207  	lea	-8(rp), rp
   208  	mov	R32(%r8), R32(%rax)
   209  	mov	8(up), %r12
   210  	ADDSUB	%r9, %r15
   211  	mov	8(vp), %r9
   212  	setc	R8(%rbx)
   213  	ADDSUB	%rax, %r15
   214  	jnc	1f
   215  	mov	$1, R8(%rbx)
   216  1:	mov	16(up), %r11
   217  	ADDSUB	%r9, %r12
   218  	mov	16(vp), %r9
   219  	setc	R8(%rax)
   220  	mov	%r15, %r14
   221  	ADDSUB	%rbx, %r12
   222  	jnc	1f
   223  	mov	$1, R8(%rax)
   224  1:	sub	$3, n
   225  	jnz	L(gt3)
   226  	ADDSUB	%r9, %r11	C n == 3: no loop needed
   227  	setc	R8(%rbx)
   228  	mov	%r12, %r13
   229  	shl	$63, %r12
   230  	shr	%r14
   231  	jmp	L(cj3)
   232  L(gt3):	mov	24(up), %r8
   233  	ADDSUB	%r9, %r11
   234  	mov	24(vp), %r9
   235  	setc	R8(%rbx)
   236  	mov	%r12, %r13
   237  	shl	$63, %r12
   238  	shr	%r14
   239  	lea	24(up), up
   240  	lea	24(vp), vp
   241  	jmp	L(L11)
   242  
        C Out-of-line carry-ripple fixups for the main loop: set the saved
        C carry byte to 1 and rejoin right after the jc that got us here.
   243  L(c0):	mov	$1, R8(%rbx)
   244  	jmp	L(rc0)
   245  L(c1):	mov	$1, R8(%rax)
   246  	jmp	L(rc1)
   247  L(c2):	mov	$1, R8(%rbx)
   248  	jmp	L(rc2)
   249  
        C Main loop: four limbs per iteration, one store per quarter.
   250  	ALIGN(16)
   251  L(top):	mov	(up), %r8	C not on critical path
   252  	or	%r13, %r10
   253  	ADDSUB	%r9, %r11	C not on critical path
   254  	mov	(vp), %r9	C not on critical path
   255  	setc	R8(%rbx)	C save carry out
   256  	mov	%r12, %r13	C new for later
   257  	shl	$63, %r12	C shift new right
   258  	shr	%r14		C shift old left
   259  	mov	%r10, (rp)
   260  L(L11):	ADDSUB	%rax, %r11	C apply previous carry out
   261  	jc	L(c0)		C jump if ripple
   262  L(rc0):	mov	8(up), %r10
   263  	or	%r14, %r12
   264  	ADDSUB	%r9, %r8
   265  	mov	8(vp), %r9
   266  	setc	R8(%rax)
   267  	mov	%r11, %r14
   268  	shl	$63, %r11
   269  	shr	%r13
   270  	mov	%r12, 8(rp)
   271  L(L10):	ADDSUB	%rbx, %r8
   272  	jc	L(c1)
   273  L(rc1):	mov	16(up), %r12
   274  	or	%r13, %r11
   275  	ADDSUB	%r9, %r10
   276  	mov	16(vp), %r9
   277  	setc	R8(%rbx)
   278  	mov	%r8, %r13
   279  	shl	$63, %r8
   280  	shr	%r14
   281  	mov	%r11, 16(rp)
   282  L(L01):	ADDSUB	%rax, %r10
   283  	jc	L(c2)
   284  L(rc2):	mov	24(up), %r11
   285  	or	%r14, %r8
   286  	ADDSUB	%r9, %r12
   287  	lea	32(up), up
   288  	mov	24(vp), %r9
   289  	lea	32(vp), vp
   290  	setc	R8(%rax)
   291  	mov	%r10, %r14
   292  	shl	$63, %r10
   293  	shr	%r13
   294  	mov	%r8, 24(rp)
   295  	lea	32(rp), rp
   296  L(L00):	ADDSUB	%rbx, %r12
   297  	jc	L(c3)
   298  L(rc3):	sub	$4, n
   299  	ja	L(top)		C four more limbs to do?
   300  
        C Wind-down: drain the software pipeline and store the last limbs.
   301  L(end):	or	%r13, %r10
   302  	ADDSUB	%r9, %r11
   303  	setc	R8(%rbx)
   304  	mov	%r12, %r13
   305  	shl	$63, %r12
   306  	shr	%r14
   307  	mov	%r10, (rp)
   308  L(cj3):	ADDSUB	%rax, %r11
   309  	jnc	1f
   310  	mov	$1, R8(%rbx)
   311  1:	or	%r14, %r12
   312  	mov	%r11, %r14
   313  	shl	$63, %r11
   314  	shr	%r13
   315  	mov	%r12, 8(rp)
   316  L(cj2):	or	%r13, %r11
   317  	shl	$63, %rbx	C final carry becomes the top result bit
   318  	shr	%r14
   319  	mov	%r11, 16(rp)
   320  L(cj1):	or	%r14, %rbx
   321  	mov	%rbx, 24(rp)
   322  
   323  	mov	R32(%r15), R32(%rax)	C return the bit shifted out at the
   324  	and	$1, R32(%rax)		C low end (bit 0 of the limb-0 sum)
   325  	pop	%r15
   326  	pop	%r14
   327  	pop	%r13
   328  	pop	%r12
   329  	pop	%rbx
   330  	FUNC_EXIT()
   331  	ret
   332  L(c3):	mov	$1, R8(%rax)	C out-of-line carry fixup for the L(L00) leg
   333  	jmp	L(rc3)
   334  EPILOGUE()