github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/submul_1.asm (about)

     1  dnl  Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
     2  dnl  subtract the result from a second limb vector.
     3  
     4  dnl  Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C			    cycles/limb
    36  C P6 model 0-8,10-12		-
    37  C P6 model 9   (Banias)		6.8
    38  C P6 model 13  (Dothan)		6.9
    39  C P4 model 0-1 (Willamette)	?
    40  C P4 model 2   (Northwood)	5.87
    41  C P4 model 3-4 (Prescott)	6.5
    42  
    43  C This code represents a step forwards compared to the code available before
    44  C GMP 5.1, but it is not carefully tuned for either P6 or P4.  In fact, it is
    45  C not good for P6.  For P4 it saved a bit over 1 c/l for both Northwood and
    46  C Prescott compared to the old code.
    47  C
    48  C The arrangements made here to get a two instruction dependent chain are
    49  C slightly subtle.  In the loop the carry (or rather the borrow) is held as a
    50  C negative value so that a paddq can be used to give a low limb ready to
    51  C store, and a high limb ready to become the new carry after a psrlq.
    52  C
    53  C If the carry was a simple twos complement negative then the psrlq shift would
    54  C need to bring in 0 bits or 1 bits according to whether the high was zero or
    55  C non-zero, since a non-zero value would represent a negative needing sign
    56  C extension.  That wouldn't be particularly easy to arrange and certainly would
    57  C add an instruction to the dependent chain, so instead an offset is applied so
    58  C that the high limb will be 0xFFFFFFFF+c.  With c in the range -0xFFFFFFFF to
    59  C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
    60  C always positive and can always have 0 bits shifted in, which is what psrlq
    61  C does.
    62  C
    63  C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
    64  C done off the dependent chain.  The total adjustment then is to add
    65  C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
    66  C to remove the offset from the current carry, for a net add of
    67  C 0xFFFFFFFE00000001.  In the code this is applied to the destination limb when
    68  C fetched.
    69  C
    70  C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
    71  C negative, which is how it's undone for the return value, but that doesn't
    72  C seem as clear.
    73  
    74  defframe(PARAM_CARRY,     20)	C initial borrow limb; read only by mpn_submul_1c
    75  defframe(PARAM_MULTIPLIER,16)	C limb that every source limb is multiplied by
    76  defframe(PARAM_SIZE,      12)	C number of limbs to process
    77  defframe(PARAM_SRC,       8)	C pointer to the source limbs (up)
    78  defframe(PARAM_DST,       4)	C pointer to the destination limbs (rp), updated in place
    79  
    80  	TEXT
    81  	ALIGN(16)
    82  
    83  PROLOGUE(mpn_submul_1c)		C variant of mpn_submul_1 that starts from a caller-supplied borrow
    84  deflit(`FRAME',0)
    85  	movd	PARAM_CARRY, %mm1	C mm1 = initial borrow argument
    86  	jmp	L(start_1c)		C continue in the common code inside mpn_submul_1
    87  EPILOGUE()
    88  
    89  PROLOGUE(mpn_submul_1)		C rp[] -= up[] * multiplier over size limbs; eax returns the borrow out
    90  deflit(`FRAME',0)
    91  	pxor	%mm1, %mm1		C mm1 = 0: no initial borrow on this entry
    92  
    93  L(start_1c):			C mpn_submul_1c joins here with its borrow in mm1
    94  	mov	PARAM_SRC, %eax		C eax = up, the source pointer
    95  	pcmpeqd	%mm0, %mm0		C mm0 = all ones
    96  
    97  	movd	PARAM_MULTIPLIER, %mm7	C mm7 = multiplier, zero extended by movd
    98  	pcmpeqd	%mm6, %mm6		C mm6 = all ones
    99  
   100  	mov	PARAM_DST, %edx		C edx = rp, the destination pointer
   101  	psrlq	$32, %mm0		C mm0 = 0x00000000FFFFFFFF
   102  
   103  	mov	PARAM_SIZE, %ecx	C ecx = size, counted down to zero
   104  	psllq	$32, %mm6		C mm6 = 0xFFFFFFFF00000000
   105  
   106  	psubq	%mm0, %mm6		C mm6 = 0xFFFFFFFE00000001, added to every rp limb
   107  
   108  	psubq	%mm1, %mm0		C mm0 = 0xFFFFFFFF - borrow, the offset carry
   109  C Limb 0 is loaded and its product started before size is tested, so this
   110  C code relies on size being at least 1.
   111  	movd	(%eax), %mm3		C mm3 = up[0]
   112  	movd	(%edx), %mm4		C mm4 = rp[0]
   113  
   114  	add	$-1, %ecx		C sets ZF for the jnz below; the MMX ops between leave flags alone
   115  	paddq	%mm6, %mm4		C mm4 = rp[0] + 0xFFFFFFFE00000001
   116  	pmuludq	%mm7, %mm3		C mm3 = up[0] * multiplier, 64-bit product
   117  	jnz	L(gt1)			C size > 1
   118  	psubq	%mm3, %mm4		C size == 1: subtract the product
   119  	paddq	%mm4, %mm0		C add the offset carry
   120  	movd	%mm0, (%edx)		C rp[0] = low half
   121  	jmp	L(rt)			C high half gives the return value
   122  
   123  L(gt1):	movd	4(%eax), %mm1		C mm1 = up[1]
   124  	movd	4(%edx), %mm2		C mm2 = rp[1]
   125  
   126  	add	$-1, %ecx
   127  	jz	L(eev)			C size == 2: both limbs are finished in the even tail
   128  C Two limbs per pass; mm3/mm4 hold the limb for (edx), mm1/mm2 the limb for 4(edx).
   129  	ALIGN(16)
   130  L(top):	paddq	%mm6, %mm2		C mm2 = rp limb for 4(edx) + 0xFFFFFFFE00000001
   131  	pmuludq	%mm7, %mm1		C mm1 = its product
   132  	psubq	%mm3, %mm4		C limb for (edx): subtract the product
   133  	movd	8(%eax), %mm3		C mm3 = next up limb
   134  	paddq	%mm4, %mm0		C add the offset carry
   135  	movd	8(%edx), %mm4		C mm4 = next rp limb
   136  	movd	%mm0, (%edx)		C store the low half as the result limb
   137  	psrlq	$32, %mm0		C high half becomes the next offset carry
   138  
   139  	add	$-1, %ecx
   140  	jz	L(eod)			C the limb just loaded for 8(edx) is the last one
   141  
   142  	paddq	%mm6, %mm4		C mm4 = rp limb for 8(edx) + 0xFFFFFFFE00000001
   143  	pmuludq	%mm7, %mm3		C mm3 = its product
   144  	psubq	%mm1, %mm2		C limb for 4(edx): subtract the product
   145  	movd	12(%eax), %mm1		C mm1 = next up limb
   146  	paddq	%mm2, %mm0		C add the offset carry
   147  	movd	12(%edx), %mm2		C mm2 = next rp limb
   148  	movd	%mm0, 4(%edx)		C store the low half as the result limb
   149  	psrlq	$32, %mm0		C high half becomes the next offset carry
   150  
   151  	lea	8(%eax), %eax		C up += 2 limbs
   152  	lea	8(%edx), %edx		C rp += 2 limbs
   153  	add	$-1, %ecx
   154  	jnz	L(top)
   155  C Even tail: mm3/mm4 hold the limb for (edx) and mm1/mm2 the final limb for
   156  C 4(edx), already loaded but not yet offset or multiplied.
   157  L(eev):	paddq	%mm6, %mm2		C final limb: add 0xFFFFFFFE00000001
   158  	pmuludq	%mm7, %mm1		C final limb: product
   159  	psubq	%mm3, %mm4		C limb for (edx): subtract the product
   160  	paddq	%mm4, %mm0		C add the offset carry
   161  	movd	%mm0, (%edx)		C store the result limb
   162  	psrlq	$32, %mm0		C carry for the final limb
   163  	psubq	%mm1, %mm2		C final limb: subtract the product
   164  	paddq	%mm2, %mm0		C add the offset carry
   165  	movd	%mm0, 4(%edx)		C store the final result limb
   166  L(rt):	psrlq	$32, %mm0		C mm0 = 0xFFFFFFFF - borrow out
   167  	movd	%mm0, %eax
   168  	not	%eax			C ones complement removes the offset: eax = borrow out
   169  	emms				C empty the MMX state before returning
   170  	ret
   171  C Odd tail: mm1/mm2 hold the limb for 4(edx) and mm3/mm4 the final limb for 8(edx).
   172  L(eod):	paddq	%mm6, %mm4		C final limb: add 0xFFFFFFFE00000001
   173  	pmuludq	%mm7, %mm3		C final limb: product
   174  	psubq	%mm1, %mm2		C limb for 4(edx): subtract the product
   175  	paddq	%mm2, %mm0		C add the offset carry
   176  	movd	%mm0, 4(%edx)		C store the result limb
   177  	psrlq	$32, %mm0		C carry for the final limb
   178  	psubq	%mm3, %mm4		C final limb: subtract the product
   179  	paddq	%mm4, %mm0		C add the offset carry
   180  	movd	%mm0, 8(%edx)		C store the final result limb
   181  	jmp	L(rt)			C shared return sequence
   182  EPILOGUE()