github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/p6/mod_34lsub1.asm (about)

     1  dnl  Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
     2  
     3  dnl  Copyright 2000-2002, 2004 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C P6: 2.0 cycles/limb
    35  
    36  C TODO
    37  C  Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
    38  C  with the current carry handling scheme.
    39  
    40  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
    41  C Returns a limb congruent to {src,size} mod 2^24-1, not necessarily < 2^24-1.
    42  C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
    43  C into 2mod3, but at that point going into a separate carries total so we
    44  C don't keep the carry flag live across the loop control.  Avoiding decl
    45  C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
    46  C 2^24 == 1 mod 2^24-1, so 32-bit limb k has weight 2^(8*(k mod 3)).
    47  
    48  defframe(PARAM_SIZE, 8)
    49  defframe(PARAM_SRC,  4)
    50  
    51  dnl  re-use parameter space as save slots once the arguments are in registers
    52  define(SAVE_EBX, `PARAM_SIZE')
    53  define(SAVE_ESI, `PARAM_SRC')
    54  
    55  	TEXT
    56  	ALIGN(16)
    57  PROLOGUE(mpn_mod_34lsub1)
    58  deflit(`FRAME',0)
    59  
    60  	movl	PARAM_SIZE, %ecx	C size
    61  	movl	PARAM_SRC, %edx		C src
    62  
    63  	subl	$2, %ecx		C size-2
    64  	movl	(%edx), %eax		C src[0], fetched before any size branch; flags kept
    65  	ja	L(three_or_more)	C if size > 2
    66  	jb	L(one)			C if size < 2, return src[0] as is
    67  
    68  	C size==2
    69  
    70  	movl	4(%edx), %ecx		C src[1]
    71  
    72  	movl	%eax, %edx		C src[0]
    73  	shrl	$24, %eax		C src[0] high
    74  
    75  	andl	$0xFFFFFF, %edx		C src[0] low
    76  
    77  	addl	%edx, %eax		C src[0] high + low
    78  	movl	%ecx, %edx		C src[1]
    79  	shrl	$16, %ecx		C src[1] high
    80  
    81  	andl	$0xFFFF, %edx		C src[1] low mask
    82  	addl	%ecx, %eax		C add src[1] high
    83  
    84  	shll	$8, %edx		C src[1] low, moved to weight 2^8
    85  
    86  	addl	%edx, %eax		C add src[1] low
    87  L(one):
    88  	ret				C result in eax
    89  
    90  
    91  L(three_or_more):
    92  	C eax	src[0], initial acc 0mod3
    93  	C ebx
    94  	C ecx	size-2
    95  	C edx	src
    96  	C esi
    97  	C edi
    98  	C ebp
    99  
   100  	movl	%ebx, SAVE_EBX		C save ebx in the size slot
   101  	movl	4(%edx), %ebx		C src[1], initial 1mod3
   102  	subl	$3, %ecx		C size-5
   103  
   104  	movl	%esi, SAVE_ESI		C save esi in the src slot
   105  	movl	8(%edx), %esi		C src[2], initial 2mod3
   106  
   107  	pushl	%edi	FRAME_pushl()	C save edi
   108  	movl	$0, %edi		C initial carries 0mod3, flags from subl kept
   109  	jng	L(done)			C if size < 6
   110  
   111  
   112  L(top):
   113  	C eax	acc 0mod3
   114  	C ebx	acc 1mod3
   115  	C ecx	counter, limbs
   116  	C edx	src
   117  	C esi	acc 2mod3
   118  	C edi	carries into 0mod3
   119  	C ebp
   120  
   121  	addl	12(%edx), %eax		C 0mod3 += limb
   122  	adcl	16(%edx), %ebx		C 1mod3 += limb + carry
   123  	adcl	20(%edx), %esi		C 2mod3 += limb + carry
   124  	leal	12(%edx), %edx		C src += 3 limbs, carry flag untouched
   125  	adcl	$0, %edi		C collect carry out of 2mod3
   126  
   127  	subl	$3, %ecx		C 3 limbs done
   128  	jg	L(top)			C at least 3 more to process
   129  
   130  
   131  L(done):
   132  	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
   133  	cmpl	$-1, %ecx		C flags also used by je below
   134  	jl	L(done_0)		C if -2, meaning 0 more limbs
   135  
   136  	C 1 or 2 more limbs
   137  	movl	$0, %ecx		C 1mod3 addend 0 if only 1 limb, flags kept
   138  	je	L(done_1)		C if -1, meaning 1 more limb only
   139  	movl	16(%edx), %ecx		C second of 2 limbs, goes to 1mod3
   140  L(done_1):
   141  	addl	12(%edx), %eax		C 0mod3
   142  	adcl	%ecx, %ebx		C 1mod3
   143  	adcl	$0, %esi		C 2mod3
   144  	adcl	$0, %edi		C carries 0mod3
   145  
   146  L(done_0):
   147  	C eax	acc 0mod3
   148  	C ebx	acc 1mod3
   149  	C ecx
   150  	C edx
   151  	C esi	acc 2mod3
   152  	C edi	carries 0mod3
   153  	C ebp
   154  
   155  	movl	%eax, %ecx		C 0mod3
   156  	shrl	$24, %eax		C 0mod3 high initial total
   157  
   158  	andl	$0xFFFFFF, %ecx		C 0mod3 low
   159  	movl	%edi, %edx		C carries
   160  	shrl	$24, %edi		C carries high
   161  
   162  	addl	%ecx, %eax		C add 0mod3 low
   163  	andl	$0xFFFFFF, %edx		C carries 0mod3 low
   164  	movl	%ebx, %ecx		C 1mod3
   165  
   166  	shrl	$16, %ebx		C 1mod3 high
   167  	addl	%edi, %eax		C add carries high
   168  	addl	%edx, %eax		C add carries 0mod3 low
   169  
   170  	andl	$0xFFFF, %ecx		C 1mod3 low mask
   171  	addl	%ebx, %eax		C add 1mod3 high
   172  	movl	SAVE_EBX, %ebx		C restore ebx
   173  
   174  	shll	$8, %ecx		C 1mod3 low
   175  	movl	%esi, %edx		C 2mod3
   176  	popl	%edi	FRAME_popl()	C restore edi
   177  
   178  	shrl	$8, %esi		C 2mod3 high
   179  	andl	$0xFF, %edx		C 2mod3 low mask
   180  	addl	%ecx, %eax		C add 1mod3 low
   181  
   182  	shll	$16, %edx		C 2mod3 low
   183  	addl	%esi, %eax		C add 2mod3 high
   184  	movl	SAVE_ESI, %esi		C restore esi
   185  
   186  	addl	%edx, %eax		C add 2mod3 low
   187  
   188  	ret				C result in eax
   189  
   190  EPILOGUE()