github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/atom/sublsh1_n.asm (about)

     1  dnl  AMD64 mpn_sublsh1_n optimised for Intel Atom.
     2  dnl  Used also for AMD bd1.
     3  
     4  dnl  Contributed to the GNU project by Torbjorn Granlund.
     5  
     6  dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
     7  
     8  dnl  This file is part of the GNU MP Library.
     9  dnl
    10  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    11  dnl  it under the terms of either:
    12  dnl
    13  dnl    * the GNU Lesser General Public License as published by the Free
    14  dnl      Software Foundation; either version 3 of the License, or (at your
    15  dnl      option) any later version.
    16  dnl
    17  dnl  or
    18  dnl
    19  dnl    * the GNU General Public License as published by the Free Software
    20  dnl      Foundation; either version 2 of the License, or (at your option) any
    21  dnl      later version.
    22  dnl
    23  dnl  or both in parallel, as here.
    24  dnl
    25  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    26  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    27  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    28  dnl  for more details.
    29  dnl
    30  dnl  You should have received copies of the GNU General Public License and the
    31  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    32  dnl  see https://www.gnu.org/licenses/.
    33  
    34  include(`../config.m4')
    35  
    36  C TODO
    37  C  * This code is slightly large at 501 bytes.
    38  C  * aorrlsh1_n.asm and this file use the same basic pattern.
    39  
    40  C	     cycles/limb
    41  C AMD K8,K9	 ?
    42  C AMD K10	 ?
    43  C AMD bd1	 2.3
    44  C AMD bobcat	 ?
    45  C Intel P4	 ?
    46  C Intel core2	 ?
    47  C Intel NHM	 ?
    48  C Intel SBR	 ?
    49  C Intel atom	 5	(4.875 is probably possible)
    50  C VIA nano	 ?
    51  
    52  C INPUT PARAMETERS
    53  define(`rp',       `%rdi')	C destination limb pointer; results are stored through it
    54  define(`up',       `%rsi')	C source limb pointer; the minuend limbs are loaded from it
    55  define(`vp',       `%rdx')	C source limb pointer; these limbs are doubled, then subtracted
    56  define(`n',        `%rcx')	C limb count
    57  define(`cy',       `%r8')	C carry-in; only the _nc entry point reads it (as a literal r8)
    58  
    59  ABI_SUPPORT(DOS64)		C macro defined outside this file; presumably lists the ABIs
    60  ABI_SUPPORT(STD64)		C this source can be built for -- TODO confirm
    61  
    62  ASM_START()
    63  	TEXT
    64  	ALIGN(16)		C presumably 16-byte alignment for the first entry point
    65  PROLOGUE(mpn_sublsh1_n)
        C Computes {rp,n} = {up,n} - 2*{vp,n}.  Each vp limb is doubled with
        C add/adc, so the bit shifted out of one limb enters the next, and the
        C doubled limb is then subtracted from the matching up limb with sbb.
        C
        C Return value (eax): the bit shifted out of the last doubled limb plus
        C the final subtract borrow, i.e. a value in 0..2.
        C
        C Carry bookkeeping: two carry chains share the single CF flag.
        C   eax = scy, the pending shift carry, parked as 0 or -1
        C   ebp = acy, the pending subtract borrow, parked as 0 or -1
        C An sbb of a register with itself parks CF as 0/-1, and an add of
        C that register to itself regenerates CF from the parked value.
        C
        C Structure: n mod 4 leading limbs (b1/b2/b3), then 4 limbs if bit 2 of
        C n is set (b0), then one 8-limb pass per full group of 8 in n (top).
    66  	FUNC_ENTRY(4)		C macro defined outside this file; presumably ABI entry glue -- TODO confirm
    67  	push	%rbp		C rbp and r15 are used below and popped again before ret
    68  	push	%r15
    69  	xor	R32(%rbp), R32(%rbp)	C acy = 0, no incoming borrow
    70  L(ent):	mov	R32(n), R32(%rax)	C shared entry; mpn_sublsh1_nc jumps here with acy preset
    71  	and	$3, R32(%rax)	C eax = n mod 4; a zero result doubles as scy = 0 for b0
    72  	jz	L(b0)		C no leading limbs
    73  	cmp	$2, R32(%rax)
    74  	jz	L(b2)		C n mod 4 = 2
    75  	jg	L(b3)		C n mod 4 = 3; fall through when it is 1
    76  C ---- n mod 4 = 1: one leading limb ----
    77  L(b1):	mov	(vp), %r8
    78  	add	%r8, %r8	C r8 = 2*vp[0], CF = shifted-out bit
    79  	lea	8(vp), vp	C lea does not touch CF
    80  	sbb	R32(%rax), R32(%rax)	C save scy (eax = 0 or -1)
    81  	add	R32(%rbp), R32(%rbp)	C restore acy into CF
    82  	mov	(up), %r15
    83  	sbb	%r8, %r15	C r15 = up[0] - 2*vp[0] - acy
    84  	mov	%r15, (rp)
    85  	sbb	R32(%rbp), R32(%rbp)	C save acy (ebp = 0 or -1)
    86  	lea	8(up), up
    87  	lea	8(rp), rp
    88  	jmp	L(b0)
    89  C ---- n mod 4 = 2: two leading limbs ----
    90  L(b2):	mov	(vp), %r8
    91  	add	%r8, %r8	C first doubling starts the shift chain with add
    92  	mov	8(vp), %r9
    93  	adc	%r9, %r9	C later limbs take in the previous shifted-out bit
    94  	lea	16(vp), vp
    95  	sbb	R32(%rax), R32(%rax)	C save scy
    96  	add	R32(%rbp), R32(%rbp)	C restore acy
    97  	mov	(up), %r15
    98  	sbb	%r8, %r15
    99  	mov	%r15, (rp)
   100  	mov	8(up), %r15
   101  	sbb	%r9, %r15
   102  	mov	%r15, 8(rp)
   103  	sbb	R32(%rbp), R32(%rbp)	C save acy
   104  	lea	16(up), up
   105  	lea	16(rp), rp
   106  	jmp	L(b0)
   107  C ---- n mod 4 = 3: three leading limbs, then fall into b0 ----
   108  L(b3):	mov	(vp), %r8
   109  	add	%r8, %r8
   110  	mov	8(vp), %r9
   111  	adc	%r9, %r9
   112  	mov	16(vp), %r10
   113  	adc	%r10, %r10
   114  	lea	24(vp), vp
   115  	sbb	R32(%rax), R32(%rax)	C save scy
   116  	add	R32(%rbp), R32(%rbp)	C restore acy
   117  	mov	(up), %r15
   118  	sbb	%r8, %r15
   119  	mov	%r15, (rp)
   120  	mov	8(up), %r15
   121  	sbb	%r9, %r15
   122  	mov	%r15, 8(rp)
   123  	mov	16(up), %r15
   124  	sbb	%r10, %r15
   125  	mov	%r15, 16(rp)
   126  	sbb	R32(%rbp), R32(%rbp)	C save acy
   127  	lea	24(up), up
   128  	lea	24(rp), rp
   129  C ---- optional 4-limb block, taken when bit 2 of n is set ----
   130  L(b0):	test	$4, R8(n)
   131  	jz	L(skp)
   132  	add	R32(%rax), R32(%rax)	C restore scy
   133  	mov	(vp), %r8
   134  	adc	%r8, %r8	C adc from the first limb on: scy from earlier limbs is live
   135  	mov	8(vp), %r9
   136  	adc	%r9, %r9
   137  	mov	16(vp), %r10
   138  	adc	%r10, %r10
   139  	mov	24(vp), %r11
   140  	adc	%r11, %r11
   141  	lea	32(vp), vp
   142  	sbb	R32(%rax), R32(%rax)	C save scy
   143  	add	R32(%rbp), R32(%rbp)	C restore acy
   144  	mov	(up), %r15
   145  	sbb	%r8, %r15
   146  	mov	%r15, (rp)
   147  	mov	8(up), %r15
   148  	sbb	%r9, %r15
   149  	mov	%r15, 8(rp)
   150  	mov	16(up), %r15
   151  	sbb	%r10, %r15
   152  	mov	%r15, 16(rp)
   153  	mov	24(up), %r15
   154  	sbb	%r11, %r15
   155  	mov	%r15, 24(rp)
   156  	lea	32(up), up
   157  	lea	32(rp), rp
   158  	sbb	R32(%rbp), R32(%rbp)	C save acy
   159  C ---- 8-limb loop, entered only when n is at least 8 ----
   160  L(skp):	cmp	$8, n
   161  	jl	L(rtn)		C n below 8: everything is done, skip the loop and its pushes
   162  
   163  	push	%r12		C r12, r13, r14 and rbx are needed only by the loop
   164  	push	%r13
   165  	push	%r14
   166  	push	%rbx
   167  	lea	-64(rp), rp	C bias rp; the loop adds 64 back before its first store
   168  	jmp	L(x)		C enter at the loop test
   169  
   170  	ALIGN(16)
   171  L(top):	mov	(vp), %r8
   172  	add	R32(%rax), R32(%rax)	C restore scy into CF
   173  	lea	64(vp), vp	C advance vp early; the loads below use negative offsets
   174  	adc	%r8, %r8	C r8..r15 receive the doubled vp limbs 0..7
   175  	mov	-56(vp), %r9
   176  	adc	%r9, %r9
   177  	mov	-48(vp), %r10
   178  	adc	%r10, %r10
   179  	mov	-40(vp), %r11
   180  	adc	%r11, %r11
   181  	mov	-32(vp), %r12
   182  	adc	%r12, %r12
   183  	mov	-24(vp), %r13
   184  	adc	%r13, %r13
   185  	mov	-16(vp), %r14
   186  	adc	%r14, %r14
   187  	mov	-8(vp), %r15
   188  	adc	%r15, %r15
   189  	sbb	R32(%rax), R32(%rax)	C save scy
   190  	add	R32(%rbp), R32(%rbp)	C restore acy into CF
   191  	mov	(up), %rbp	C acy now lives in CF, so rbp is reused for data; mov keeps CF
   192  	lea	64(rp), rp	C rp now addresses the 8 limbs of this pass
   193  	mov	8(up), %rbx
   194  	sbb	%r8, %rbp	C limb 0
   195  	mov	32(up), %r8	C r8 is free after limb 0 and is reused for up limb 4
   196  	mov	%rbp, (rp)
   197  	sbb	%r9, %rbx	C limb 1
   198  	mov	16(up), %rbp
   199  	mov	%rbx, 8(rp)
   200  	sbb	%r10, %rbp	C limb 2
   201  	mov	24(up), %rbx
   202  	mov	%rbp, 16(rp)
   203  	sbb	%r11, %rbx	C limb 3
   204  	mov	%rbx, 24(rp)
   205  	sbb	%r12, %r8	C limb 4
   206  	mov	40(up), %r9	C r9 is free after limb 1 and is reused for up limb 5
   207  	mov	%r8, 32(rp)
   208  	sbb	%r13, %r9	C limb 5
   209  	mov	48(up), %rbp
   210  	mov	%r9, 40(rp)
   211  	sbb	%r14, %rbp	C limb 6
   212  	mov	56(up), %rbx
   213  	mov	%rbp, 48(rp)
   214  	sbb	%r15, %rbx	C limb 7
   215  	lea	64(up), up
   216  	mov	%rbx, 56(rp)
   217  	sbb	R32(%rbp), R32(%rbp)	C save acy; rbp holds the 0/-1 borrow again
   218  L(x):	sub	$8, n		C loop entry point and loop test
   219  	jge	L(top)		C another pass while n stays non-negative
   220  
   221  L(end):	pop	%rbx		C undo the four loop-only pushes, in reverse order
   222  	pop	%r14
   223  	pop	%r13
   224  	pop	%r12
   225  L(rtn):
   226  	add	R32(%rbp), R32(%rax)	C eax = parked scy + parked acy, each 0 or -1
   227  	neg	R32(%rax)	C return value = number of pending carries, 0..2
   228  
   229  	pop	%r15		C matches the two pushes made at entry
   230  	pop	%rbp
   231  	FUNC_EXIT()		C macro defined outside this file; presumably undoes FUNC_ENTRY -- TODO confirm
   232  	ret
   233  EPILOGUE()
   234  PROLOGUE(mpn_sublsh1_nc)
        C Carry-in variant of mpn_sublsh1_n.  It turns the carry-in held in r8
        C into the parked subtract borrow (ebp = 0 or -1) and then jumps into
        C the shared body at the ent label inside mpn_sublsh1_n.  Any nonzero
        C carry-in becomes a borrow of exactly one, because only CF is kept.
   235  	FUNC_ENTRY(4)		C macro defined outside this file; presumably ABI entry glue -- TODO confirm
   236  IFDOS(`	mov	56(%rsp), %r8	')	C presumably DOS64 only, where the fifth argument is on the stack
   237  	push	%rbp		C same two pushes as mpn_sublsh1_n, so the shared
   238  	push	%r15		C exit path pops a matching frame
   239  	neg	%r8			C set CF exactly when the carry-in is nonzero
   240  	sbb	R32(%rbp), R32(%rbp)	C save acy (ebp = 0 or -1)
   241  	jmp	L(ent)		C continue in the common code; it also performs the ret
   242  EPILOGUE()