github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/mod_34lsub1.asm (about)

     1  dnl  Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
     2  
     3  dnl  Copyright 2000-2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C Pentium4: 1.0 cycles/limb
    35  
    36  
    37  C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
    38  C
    39  C Enhancements:
    40  C
    41  C There might be a couple of cycles to save by using plain integer code for
    42  C more small sizes.  2 limbs measures about 20 cycles, but 3 limbs jumps to
    43  C about 46 (inclusive of some function call overheads).
    44  
    45  defframe(PARAM_SIZE, 8)
    46  defframe(PARAM_SRC,  4)
    47  
    48  dnl  re-use parameter space (neither SAVE_ name is referenced in the code below)
    49  define(SAVE_EBX, `PARAM_SRC')
    50  define(SAVE_ESI, `PARAM_SIZE')
    51  C Out: eax congruent to {src,size} mod 2^24-1, not fully reduced.  Touches eax, ecx, edx and, for 3 or more limbs, mm0 to mm7.
    52  	TEXT
    53  	ALIGN(16)
    54  PROLOGUE(mpn_mod_34lsub1)
    55  deflit(`FRAME',0)
    56  	C src[0] is loaded before size is tested, so size is presumably at least 1 (confirm against callers)
    57  	movl	PARAM_SIZE, %ecx
    58  	movl	PARAM_SRC, %edx
    59  	movl	(%edx), %eax		C src[0]
    60  
    61  	subl	$2, %ecx
    62  	ja	L(three_or_more)	C size > 2 (unsigned)
    63  	jne	L(one)			C size < 2, src[0] is returned as is
    64  	C size == 2: fold src[0] + src[1]*2^32 using 2^24 == 1 and 2^32 == 2^8 mod 2^24-1
    65  	movl	4(%edx), %edx
    66  	movl	%eax, %ecx
    67  	shrl	$24, %eax		C src[0] bits 24-31, weight 2^24 == 1
    68  
    69  	andl	$0x00FFFFFF, %ecx	C src[0] bits 0-23
    70  	addl	%ecx, %eax
    71  
    72  	movl	%edx, %ecx
    73  	shll	$8, %edx		C src[1] moved up to weight 2^8, top 8 bits shifted out
    74  
    75  	shrl	$16, %ecx		C src[1] bits 16-31, weight 2^48 == 1
    76  	addl	%ecx, %eax
    77  
    78  	andl	$0x00FFFF00, %edx	C src[1] bits 0-15, at weight 2^8
    79  	addl	%edx, %eax
    80  
    81  L(one):
    82  	ret
    83  
    84  	C 3 or more limbs: 2^32 == 2^8, 2^64 == 2^16, 2^96 == 1 mod 2^24-1, so limbs are summed by index mod 3
    85  L(three_or_more):
    86  	pxor	%mm0, %mm0
    87  	pxor	%mm1, %mm1
    88  	pxor	%mm2, %mm2
    89  
    90  	pcmpeqd	%mm7, %mm7
    91  	psrlq	$32, %mm7	C 0x00000000FFFFFFFF, low 32 bits
    92  
    93  	pcmpeqd	%mm6, %mm6
    94  	psrlq	$40, %mm6	C 0x0000000000FFFFFF, low 24 bits
    95  
    96  L(top):
    97  	C eax	src[0] from the entry load, not read again
    98  	C ebx	not used
    99  	C ecx	counter, size-2 to 0, -1 or -2
   100  	C edx	src, incrementing
   101  	C
   102  	C mm0	sum 0mod3
   103  	C mm1	sum 1mod3
   104  	C mm2	sum 2mod3
   105  	C mm3	scratch, the limb being added
   106  	C mm4	not used in the loop
   107  	C mm5	not used in the loop
   108  	C mm6	0x0000000000FFFFFF
   109  	C mm7	0x00000000FFFFFFFF
   110  
   111  	movd	(%edx), %mm3
   112  	paddq	%mm3, %mm0
   113  
   114  	movd	4(%edx), %mm3
   115  	paddq	%mm3, %mm1
   116  
   117  	movd	8(%edx), %mm3
   118  	paddq	%mm3, %mm2
   119  
   120  	addl	$12, %edx
   121  	subl	$3, %ecx
   122  	ja	L(top)			C again if 3 or more limbs remain
   123  
   124  
   125  	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
   126  
   127  	addl	$1, %ecx
   128  	js	L(combine)		C 0 more
   129  
   130  	movd	(%edx), %mm3
   131  	paddq	%mm3, %mm0
   132  	C the jz below still tests the addl result, relying on movd and paddq leaving the flags alone
   133  	jz	L(combine)		C 1 more
   134  
   135  	movd	4(%edx), %mm3
   136  	paddq	%mm3, %mm1
   137  
   138  L(combine):
   139  	movq	%mm7, %mm3		C low halves
   140  	pand	%mm0, %mm3
   141  
   142  	movq	%mm7, %mm4
   143  	pand	%mm1, %mm4
   144  
   145  	movq	%mm7, %mm5
   146  	pand	%mm2, %mm5
   147  
   148  	psrlq	$32, %mm0		C high halves
   149  	psrlq	$32, %mm1
   150  	psrlq	$32, %mm2
   151  	C a high half carries an extra 2^32 == 2^8, so it moves up one class: mm0 high to 1mod3, mm1 high to 2mod3, mm2 high to 0mod3
   152  	paddq	%mm0, %mm4		C fold high halves to give 33 bits each
   153  	paddq	%mm1, %mm5
   154  	paddq	%mm2, %mm3
   155  
   156  	psllq	$8, %mm4		C combine at respective offsets
   157  	psllq	$16, %mm5
   158  	paddq	%mm4, %mm3
   159  	paddq	%mm5, %mm3		C 0x000cxxxxxxxxxxxx, 50 bits
   160  
   161  	pand	%mm3, %mm6		C fold at 24 bits
   162  	psrlq	$24, %mm3
   163  	C 24 low bits plus 26 high bits, so the sum fits in the low dword that is moved to eax
   164  	paddq	%mm6, %mm3
   165  	movd	%mm3, %eax
   166  
   167  	ASSERT(z,	C nothing left in high dword
   168  	`psrlq	$32, %mm3
   169  	movd	%mm3, %ecx
   170  	orl	%ecx, %ecx')
   171  
   172  	emms				C empty the MMX state before returning
   173  	ret
   174  
   175  EPILOGUE()