github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mod_34lsub1.asm (about)

     1  dnl  AMD K6 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
     2  
     3  dnl  Copyright 2000-2002 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C K6: 2.66 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
    38  C
    39  C An attempt was made to use a loop like
    40  C
    41  C L(top):
    42  C	adcl	(%edx), %eax
    43  C	adcl	4(%edx), %ebx
    44  C	adcl	8(%edx), %esi
    45  C	leal	12(%edx), %edx
    46  C	loop	L(top)
    47  C
    48  C with %ecx starting from floor(size/3), but it still measured 2.66 c/l.
    49  C The form used instead can save about 6 cycles by not dividing by 3.
    50  C
    51  C In the code used, putting the "leal"s at the top of the loop is necessary
    52  C for the claimed speed, anywhere else costs an extra cycle per loop.
    53  C Perhaps a tight loop like this needs short decode instructions at the
    54  C branch target, which would explain the leal/loop form above taking 8
    55  C cycles instead of 7 too.
    56  
    57  defframe(PARAM_SIZE, 8)
    58  defframe(PARAM_SRC,  4)
    59  
    60  dnl  re-use parameter space: both parameters are copied into registers at function entry, so their stack slots are free to hold the saved ebx and esi
    61  define(SAVE_EBX, `PARAM_SIZE')
    62  define(SAVE_ESI, `PARAM_SRC')
    63  
    64  	TEXT
    65  	ALIGN(16)
    66  PROLOGUE(mpn_mod_34lsub1)
    67  deflit(`FRAME',0)
    68  C Returns in eax a value congruent to {src,size} modulo 2^24-1; it is not reduced below the modulus (size 1 returns src[0] unchanged).
    69  	movl	PARAM_SIZE, %eax
    70  	movl	PARAM_SRC, %edx
    71  
    72  	subl	$2, %eax		C eax = size-2, flags pick the path below
    73  	ja	L(three_or_more)	C size >= 3
    74  
    75  Zdisp(	movl,	0,(%edx), %eax)		C avoid code cache line boundary
    76  	jne	L(one)			C flags still from subl (movl sets none): size was not 2, return src[0] as is
    77  C size == 2: fold src[0] (weight 1) and src[1] (weight 2^32 == 2^8 mod 2^24-1)
    78  	movl	%eax, %ecx
    79  	movl	4(%edx), %edx
    80  
    81  	shrl	$24, %eax		C src[0] high
    82  	andl	$0x00FFFFFF, %ecx	C src[0] low
    83  
    84  	addl	%ecx, %eax
    85  	movl	%edx, %ecx
    86  
    87  	shll	$8, %edx
    88  	andl	$0x00FFFF00, %edx	C src[1] low 16 bits, moved to bits 8-23
    89  
    90  	shrl	$16, %ecx		C src[1] high 16 bits
    91  	addl	%ecx, %eax
    92  
    93  	addl	%edx, %eax
    94  
    95  L(one):
    96  	ret
    97  
    98  
    99  L(three_or_more):
   100  	C eax	size-2
   101  	C ebx
   102  	C ecx
   103  	C edx	src
   104  
   105  	movl	%ebx, SAVE_EBX		C parameter slots double as the save area for ebx and esi
   106  	xorl	%ebx, %ebx
   107  
   108  	movl	%esi, SAVE_ESI
   109  	pushl	%edi	FRAME_pushl()
   110  
   111  	xorl	%esi, %esi
   112  	xorl	%edi, %edi		C and clear carry flag
   113  
   114  L(top):
   115  	C eax	counter, limbs
   116  	C ebx	acc 0mod3
   117  	C ecx
   118  	C edx	src, incrementing
   119  	C esi	acc 1mod3
   120  	C edi	acc 2mod3
   121  	C ebp
   122  
   123  	leal	-2(%eax), %eax		C leal leaves the carry from the previous pass untouched
   124  	leal	12(%edx), %edx
   125  
   126  	adcl	-12(%edx), %ebx
   127  	adcl	-8(%edx), %esi
   128  	adcl	-4(%edx), %edi
   129  
   130  	decl	%eax			C counter is down 3 per pass; decl does not alter carry
   131  	jg	L(top)
   132  
   133  
   134  	C eax is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
   135  
   136  	movb	$0, %cl			C cl = bit position where the final carry gets applied
   137  	incl	%eax
   138  
   139  	js	L(combine)		C 0 more
   140  
   141  Zdisp(	adcl,	0,(%edx), %ebx)		C avoid code cache line crossings
   142  
   143  	movb	$8, %cl
   144  	decl	%eax
   145  
   146  	js	L(combine)		C 1 more
   147  
   148  	adcl	4(%edx), %esi
   149  
   150  	movb	$16, %cl
   151  
   152  
   153  L(combine):
   154  	sbbl	%edx, %edx		C edx = -1 if a carry is pending, else 0
   155  
   156  	shll	%cl, %edx		C carry
   157  	movl	%ebx, %eax		C 0mod3
   158  
   159  	shrl	$24, %eax		C 0mod3 high
   160  	andl	$0x00FFFFFF, %ebx	C 0mod3 low
   161  
   162  	subl	%edx, %eax		C apply carry
   163  	movl	%esi, %ecx		C 1mod3
   164  
   165  	shrl	$16, %esi		C 1mod3 high
   166  	addl	%ebx, %eax		C apply 0mod3 low
   167  
   168  	andl	$0x0000FFFF, %ecx
   169  	addl	%esi, %eax		C apply 1mod3 high
   170  
   171  	shll	$8, %ecx		C 1mod3 low
   172  	movl	%edi, %edx		C 2mod3
   173  
   174  	shrl	$8, %edx		C 2mod3 high
   175  	addl	%ecx, %eax		C apply 1mod3 low
   176  
   177  	addl	%edx, %eax		C apply 2mod3 high
   178  	andl	$0x000000FF, %edi
   179  
   180  	shll	$16, %edi		C 2mod3 low
   181  	movl	SAVE_EBX, %ebx
   182  
   183  	addl	%edi, %eax		C apply 2mod3 low
   184  	movl	SAVE_ESI, %esi
   185  
   186  	popl	%edi
   187  
   188  	ret
   189  
   190  EPILOGUE()