github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/mmx/popham.asm (about)

     1  dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
     2  dnl  hamming distance.
     3  
     4  dnl  Copyright 2000-2002, 2007 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C			     popcount	     hamdist
    36  C P3 model 9  (Banias)		?		?
    37  C P3 model 13 (Dothan)		6		6
    38  C P4 model 0  (Willamette)
    39  C P4 model 1  (?)
    40  C P4 model 2  (Northwood)	8		9
    41  C P4 model 3  (Prescott)	8		9
    42  C P4 model 4  (Nocona)
    43  
    44  C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
    45  C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
    46  C
    47  C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
    48  C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
    49  C and using them saves fiddling about with alignment testing on entry.
    50  C
    51  C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
    52  C might be possible, but 8 c/l relying on out-of-order execution is already
    53  C quite reasonable.
    54  dnl  Invoke m4_error with the message below unless OPERATION_popcount or OPERATION_hamdist is defined.
    55  ifdef(`OPERATION_popcount',,
    56  `ifdef(`OPERATION_hamdist',,
    57  `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
    58  ')')')
    59  dnl  HAM(text) expands to text when OPERATION_hamdist is defined and to nothing otherwise.
    60  define(HAM,
    61  m4_assert_numargs(1)
    62  `ifdef(`OPERATION_hamdist',`$1')')
    63  dnl  POP(text) expands to text when OPERATION_popcount is defined and to nothing otherwise.
    64  define(POP,
    65  m4_assert_numargs(1)
    66  `ifdef(`OPERATION_popcount',`$1')')
    67  dnl  Per-operation defframe entries (PARAM_SRC at 4; hamdist adds PARAM_SRC2 at 8, moving PARAM_SIZE to 12) and the M4_function name later passed to PROLOGUE.
    68  HAM(`
    69  defframe(PARAM_SIZE, 12)
    70  defframe(PARAM_SRC2,  8)
    71  defframe(PARAM_SRC,   4)
    72  define(M4_function,mpn_hamdist)
    73  ')
    74  POP(`
    75  defframe(PARAM_SIZE,  8)
    76  defframe(PARAM_SRC,   4)
    77  define(M4_function,mpn_popcount)
    78  ')
    79  dnl  Both function names this source can assemble to (see M4_function above) are passed to MULFUNC_PROLOGUE.
    80  MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
    81  
    82  dnl  Emitted only when PIC is not defined: the three 64-bit masks as read-only data for the movq loads in the function.
    83  ifdef(`PIC',,`
    84  	dnl  non-PIC: each mask is written as two identical .long halves
    85  	RODATA
    86  	ALIGN(8)
    87  L(rodata_AAAAAAAAAAAAAAAA):
    88  	.long	0xAAAAAAAA
    89  	.long	0xAAAAAAAA
    90  L(rodata_3333333333333333):
    91  	.long	0x33333333
    92  	.long	0x33333333
    93  L(rodata_0F0F0F0F0F0F0F0F):
    94  	.long	0x0F0F0F0F
    95  	.long	0x0F0F0F0F
    96  ')
    97  
    98  	TEXT
    99  	ALIGN(16)
   100  	C Arguments are read through the PARAM_ frame macros; the bit count is returned in eax.
   101  PROLOGUE(M4_function)
   102  deflit(`FRAME',0)
   103  	C ecx = size (limb count), eax = src
   104  	movl	PARAM_SIZE, %ecx
   105  	movl	PARAM_SRC, %eax
   106  	C Masks go in mm7, mm6, mm5: built from immediates under PIC, loaded from rodata otherwise.
   107  ifdef(`PIC',`
   108  	movl	$0xAAAAAAAA, %edx
   109  	movd	%edx, %mm7
   110  	punpckldq %mm7, %mm7
   111  	C punpckldq of a register with itself copies its low dword into the high dword
   112  	movl	$0x33333333, %edx
   113  	movd	%edx, %mm6
   114  	punpckldq %mm6, %mm6
   115  
   116  	movl	$0x0F0F0F0F, %edx
   117  	movd	%edx, %mm5
   118  	punpckldq %mm5, %mm5
   119  	C edx served as scratch for the immediates so src2 is loaded only now
   120  HAM(`	movl	PARAM_SRC2, %edx')
   121  
   122  ',`
   123  	dnl non-PIC
   124  HAM(`	movl	PARAM_SRC2, %edx')
   125  	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
   126  	movq	L(rodata_3333333333333333), %mm6
   127  	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
   128  ')
   129  
   130  	pxor	%mm4, %mm4		C zero
   131  	pxor	%mm0, %mm0		C total
   132  	C ecx = size-1: nonzero enters the two-limb loop and zero (size 1) falls into L(last).
   133  	subl	$1, %ecx
   134  	ja	L(top)
   135  	C NOTE(review): size 0 would borrow above and also reach L(last), with ecx = -1 -- presumably callers pass size >= 1.
   136  L(last):
   137  	movd	(%eax,%ecx,4), %mm1		C src high limb
   138  HAM(`	movd	(%edx,%ecx,4), %mm2
   139  	pxor	%mm2, %mm1
   140  ')
   141  	jmp	L(loaded)
   142  	C movd zero-extends, so the unused upper dword of mm1 adds nothing to the count.
   143  	C Main loop: two limbs per pass.
   144  L(top):
   145  	C eax	src
   146  	C ebx
   147  	C ecx	counter, size-1 to 2 or 1, inclusive
   148  	C edx	[hamdist] src2
   149  	C
   150  	C mm0	total (low dword)
   151  	C mm1	(scratch)
   152  	C mm2	(scratch)
   153  	C mm3	[hamdist] (scratch)
   154  	C mm4	0x0000000000000000
   155  	C mm5	0x0F0F0F0F0F0F0F0F
   156  	C mm6	0x3333333333333333
   157  	C mm7	0xAAAAAAAAAAAAAAAA
   158  	C mm1 = next two limbs of src: (eax) in the low dword and 4(eax) in the high dword
   159  	movd	(%eax), %mm1
   160  	movd	4(%eax), %mm2
   161  	punpckldq %mm2, %mm1
   162  	addl	$8, %eax
   163  	C [hamdist] xor in the matching two limbs of src2 so mm1 holds the differing bits
   164  HAM(`	movd	(%edx), %mm2
   165  	movd	4(%edx), %mm3
   166  	punpckldq %mm3, %mm2
   167  	pxor	%mm2, %mm1
   168  	addl	$8, %edx
   169  ')
   170  	C Step 1 below: subtracting the odd bits shifted down by one leaves a 2-bit count in each pair.
   171  L(loaded):
   172  	movq	%mm7, %mm2
   173  	pand	%mm1, %mm2
   174  	psrlq	$1, %mm2
   175  	psubd	%mm2, %mm1	C bit pairs
   176  	C Step 2: add each 2-bit count to its neighbour; mm6 selects alternate fields.
   177  	movq	%mm6, %mm2
   178  	pand	%mm1, %mm2
   179  	psrlq	$2, %mm1
   180  	pand	%mm6, %mm1
   181  	paddd	%mm2, %mm1	C nibbles
   182  	C Step 3: add each 4-bit count to its neighbour; mm5 selects alternate fields.
   183  	movq	%mm5, %mm2
   184  	pand	%mm1, %mm2
   185  	psrlq	$4, %mm1
   186  	pand	%mm5, %mm1
   187  	paddd	%mm2, %mm1	C bytes
   188  	C psadbw against the zero in mm4 adds up the eight byte counts of mm1.
   189  	psadbw(	%mm4, %mm1)
   190  	paddd	%mm1, %mm0	C to total
   191  	C ecx is (limbs left) - 1 at L(top); repeat while at least two limbs are left.
   192  	subl	$2, %ecx
   193  	jg	L(top)
   194  
   195  	C ecx is 0 or -1 representing respectively 1 or 0 further limbs
   196  	jz	L(last)
   197  	C Falling through here (ecx -1, or -2 after an L(last) pass) means no limbs are left.
   198  	C Result is the low dword of mm0; emms empties the MMX state before the return.
   199  	movd	%mm0, %eax
   200  	emms
   201  	ret
   202  
   203  EPILOGUE()