github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mod_34lsub1.asm (about)

     1  dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
     2  
     3  dnl  Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C         cycles/limb
    35  C Athlon:     1
    36  C Hammer:     1
    37  
    38  
    39  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
    40  C
    41  C The loop form below and the 64 byte code alignment seem necessary for the
    42  C claimed speed.  This is a bit strange, since normally k7 isn't very
    43  C sensitive to such things.  Perhaps there has to be 6 instructions in the
    44  C first 16 bytes for the BTB entry or something.
    45  
    46  defframe(PARAM_SIZE, 8)
    47  defframe(PARAM_SRC,  4)
    48  
    49  dnl  re-use parameter space: the size slot is free once size is in ecx
    50  define(SAVE_EDI, `PARAM_SIZE')
    51  
    52  	TEXT
    53  	ALIGN(64)		C see the alignment note above
    54  PROLOGUE(mpn_mod_34lsub1)
    55  deflit(`FRAME',0)
    56  	C Returns a value congruent to {src,size} mod 2^24-1, not fully reduced.
    57  	movl	PARAM_SIZE, %ecx	C ecx = size
    58  	movl	PARAM_SRC, %edx		C edx = src
    59  
    60  	subl	$2, %ecx		C these flags serve both branches below
    61  	ja	L(three_or_more)	C size > 2
    62  
    63  	movl	(%edx), %eax		C src[0], movl leaves the subl flags intact
    64  	jb	L(one)			C size < 2, return src[0] as is
    65  	C size == 2: fold both limbs using 2^24 == 1 mod 2^24-1
    66  	movl	4(%edx), %ecx		C src[1]
    67  	movl	%eax, %edx
    68  	shrl	$24, %eax		C src[0] high 8 bits
    69  
    70  	andl	$0xFFFFFF, %edx		C src[0] low 24 bits
    71  	addl	%edx, %eax
    72  	movl	%ecx, %edx
    73  
    74  	andl	$0xFFFF, %ecx
    75  	shrl	$16, %edx		C src[1] high, worth 2^48 == 1
    76  	addl	%edx, %eax
    77  
    78  	shll	$8, %ecx		C src[1] low, worth 2^32 == 2^8
    79  	addl	%ecx, %eax
    80  
    81  L(one):
    82  	ret
    83  
    84  
    85  L(three_or_more):
    86  	C eax
    87  	C ebx
    88  	C ecx	size-2
    89  	C edx	src
    90  	C esi
    91  	C edi
    92  
    93  	pushl	%ebx	FRAME_pushl()
    94  	xorl	%eax, %eax
    95  	xorl	%ebx, %ebx
    96  
    97  	movl	%edi, SAVE_EDI		C size slot is free, size is already in ecx
    98  	pushl	%esi	FRAME_pushl()
    99  	xorl	%esi, %esi		C and clear carry flag
   100  	C Limb i is worth 2^(8*(i mod 3)) mod 2^24-1, so three accumulators suffice;
   101  	C a carry out of one is worth one unit of the next, so adcl chains them.
   102  	C code offset 0x40 at this point
   103  L(top):
   104  	C eax	acc 0mod3
   105  	C ebx	acc 1mod3
   106  	C ecx	counter, limbs
   107  	C edx	src
   108  	C esi	acc 2mod3
   109  	C edi
   110  
   111  	leal	24(%edx), %edx		C leal and decl keep CF for the adcl chain
   112  	leal	-2(%ecx), %ecx
   113  	adcl	-24(%edx), %eax
   114  	adcl	-20(%edx), %ebx
   115  	adcl	-16(%edx), %esi
   116  
   117  	decl	%ecx			C counter is down by 3 in total
   118  	jng	L(done_loop)		C at most 2 limbs left, next at -12(%edx)
   119  
   120  	leal	-2(%ecx), %ecx
   121  	adcl	-12(%edx), %eax
   122  	adcl	-8(%edx), %ebx
   123  	adcl	-4(%edx), %esi
   124  
   125  	decl	%ecx
   126  	jg	L(top)
   127  
   128  
   129  	leal	12(%edx), %edx		C next limb at -12(%edx), as on the jng exit
   130  
   131  
   132  L(done_loop):
   133  	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
   134  
   135  	incl	%ecx			C sign set if no limbs left, CF kept
   136  	movl	$0xFFFFFFFF, %edi	C pending carry is worth 1
   137  	js	L(combine)
   138  
   139  	adcl	-12(%edx), %eax
   140  	decl	%ecx			C sign set if that was the last limb
   141  	movl	$0xFFFFFF00, %edi	C pending carry is worth 2^8
   142  	js	L(combine)
   143  
   144  	adcl	-8(%edx), %ebx
   145  	movl	$0xFFFF0000, %edi	C pending carry is worth 2^16
   146  
   147  
   148  L(combine):
   149  	C eax	acc 0mod3
   150  	C ebx	acc 1mod3
   151  	C ecx
   152  	C edx
   153  	C esi	acc 2mod3
   154  	C edi	mask, minus the worth of the pending carry
   155  
   156  	sbbl	%ecx, %ecx		C carry: -1 if set, else 0
   157  	movl	%eax, %edx		C 0mod3
   158  	shrl	$24, %eax		C 0mod3 high
   159  
   160  	andl	%edi, %ecx		C carry masked: minus its worth, or 0
   161  	andl	$0x00FFFFFF, %edx	C 0mod3 low
   162  	movl	%ebx, %edi		C 1mod3
   163  
   164  	subl	%ecx, %eax		C apply carry, subtracting minus its worth
   165  	shrl	$16, %ebx		C 1mod3 high
   166  	andl	$0xFFFF, %edi
   167  
   168  	addl	%edx, %eax		C apply 0mod3 low
   169  	movl	%esi, %edx		C 2mod3
   170  	shll	$8, %edi		C 1mod3 low
   171  
   172  	addl	%ebx, %eax		C apply 1mod3 high
   173  	shrl	$8, %esi		C 2mod3 high
   174  	movzbl	%dl, %edx		C 2mod3 low
   175  
   176  	addl	%edi, %eax		C apply 1mod3 low
   177  	shll	$16, %edx		C 2mod3 low
   178  
   179  	addl	%esi, %eax		C apply 2mod3 high
   180  	popl	%esi	FRAME_popl()
   181  
   182  	movl	SAVE_EDI, %edi
   183  	addl	%edx, %eax		C apply 2mod3 low
   184  	popl	%ebx	FRAME_popl()
   185  
   186  	ret
   187  
   188  EPILOGUE()