github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k7/mmx/popham.asm (about)

     1  dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
     2  dnl  distance.
     3  
     4  dnl  Copyright 2000-2002 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C			     popcount	     hamdist
    36  C P3 generic			6.5		7
    37  C P3 model 9  (Banias)          5.7		6.1
    38  C P3 model 13 (Dothan)		5.75		6
    39  C K7				5		6
    40  
    41  C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
    42  C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
    43  C
    44  C The code here is almost certainly not optimal, but is already a 3x speedup
    45  C over the generic C code.  The main improvement would be to interleave
    46  C processing of two qwords in the loop so as to fully exploit the available
    47  C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
    48  C
    49  C The loop is based on the example "Efficient 64-bit population count using
    50  C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
    51  C page 158 of rev E (reference in mpn/x86/k7/README).
    52  dnl  Report an error via m4_error if neither OPERATION_ symbol is defined.
    53  ifdef(`OPERATION_popcount',,
    54  `ifdef(`OPERATION_hamdist',,
    55  `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
    56  ')')')
    57  dnl  HAM(text) expands to text when OPERATION_hamdist is defined, else to nothing.
    58  define(HAM,
    59  m4_assert_numargs(1)
    60  `ifdef(`OPERATION_hamdist',`$1')')
    61  dnl  POP(text) expands to text when OPERATION_popcount is defined, else to nothing.
    62  define(POP,
    63  m4_assert_numargs(1)
    64  `ifdef(`OPERATION_popcount',`$1')')
    65  dnl  Select PARAM_ frame slots and M4_function per operation (defframe is defined elsewhere).
    66  HAM(`
    67  defframe(PARAM_SIZE,   12)
    68  defframe(PARAM_SRC2,   8)
    69  defframe(PARAM_SRC,    4)
    70  define(M4_function,mpn_hamdist)
    71  ')
    72  POP(`
    73  defframe(PARAM_SIZE,   8)
    74  defframe(PARAM_SRC,    4)
    75  define(M4_function,mpn_popcount)
    76  ')
    77  dnl  NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; here it lists both function names.
    78  MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
    79  
    80  dnl  Mask constants in read-only data; emitted only when PIC is not defined.
    81  ifdef(`PIC',,`
    82  	dnl  non-PIC
    83  
    84  	RODATA
    85  	ALIGN(8)
    86  	C 8-byte masks, each loaded whole with movq.  0xAA..: the high bit of every bit pair.
    87  L(rodata_AAAAAAAAAAAAAAAA):
    88  	.long	0xAAAAAAAA
    89  	.long	0xAAAAAAAA
    90  	C 0x33..: the low two bits of every nibble.
    91  L(rodata_3333333333333333):
    92  	.long	0x33333333
    93  	.long	0x33333333
    94  	C 0x0F..: the low nibble of every byte.
    95  L(rodata_0F0F0F0F0F0F0F0F):
    96  	.long	0x0F0F0F0F
    97  	.long	0x0F0F0F0F
    98  ')
    99  
   100  	TEXT
   101  	ALIGN(32)
   102  C Out: eax = bit count.  Clobbers ecx, flags, MMX registers; may clobber edx.
   103  PROLOGUE(M4_function)
   104  deflit(`FRAME',0)
   105  C NOTE(review): size 0 is not handled here; assumes size >= 1 -- confirm callers.
   106  	movl	PARAM_SIZE, %ecx	C ecx = size, counted in 32-bit limbs
   107  C Load the three masks: from immediates if PIC is defined, else from rodata.
   108  ifdef(`PIC',`
   109  	movl	$0xAAAAAAAA, %eax
   110  	movl	$0x33333333, %edx
   111  
   112  	movd	%eax, %mm7
   113  	movd	%edx, %mm6
   114  
   115  	movl	$0x0F0F0F0F, %eax
   116  
   117  	punpckldq %mm7, %mm7	C duplicate low dword into high dword
   118  	punpckldq %mm6, %mm6
   119  
   120  	movd	%eax, %mm5
   121  
   122  
   123  	punpckldq %mm5, %mm5
   124  
   125  ',`
   126  	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
   127  	movq	L(rodata_3333333333333333), %mm6
   128  	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
   129  ')
   130  	pxor	%mm4, %mm4	C mm4 = 0, the reference operand for psadbw below
   131  
   132  define(REG_AAAAAAAAAAAAAAAA,%mm7)
   133  define(REG_3333333333333333,%mm6)
   134  define(REG_0F0F0F0F0F0F0F0F,%mm5)
   135  define(REG_0000000000000000,%mm4)
   136  
   137  C An odd size is handled first: the last limb enters the loop body at L(loaded).
   138  	movl	PARAM_SRC, %eax
   139  HAM(`	movl	PARAM_SRC2, %edx')
   140  
   141  	pxor	%mm2, %mm2	C total
   142  
   143  	shrl	%ecx		C ecx = qword count, CF = size was odd
   144  	jnc	L(top)
   145  
   146  	movd	(%eax,%ecx,8), %mm1	C odd limb, zero-extended to 64 bits
   147  
   148  HAM(`	movd	(%edx,%ecx,8), %mm0
   149  	pxor	%mm0, %mm1
   150  ')
   151  	orl	%ecx, %ecx	C ZF = no qwords remain, tested by the jnz at loop end
   152  	jmp	L(loaded)
   153  
   154  
   155  	ALIGN(16)
   156  L(top):
   157  	C eax	src
   158  	C ebx
   159  	C ecx	counter, qwords, decrementing
   160  	C edx	[hamdist] src2
   161  	C
   162  	C mm0	(scratch)
   163  	C mm1	(scratch)
   164  	C mm2	total (low dword)
   165  	C mm3
   166  	C mm4	\
   167  	C mm5	| special constants
   168  	C mm6	|
   169  	C mm7	/
   170  
   171  	movq	-8(%eax,%ecx,8), %mm1	C next qword, working down from the end
   172  
   173  HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
   174  	decl	%ecx		C sets ZF for the jnz below; MMX ops leave EFLAGS alone
   175  
   176  L(loaded):
   177  	movq	%mm1, %mm0
   178  	pand	REG_AAAAAAAAAAAAAAAA, %mm1
   179  
   180  	psrlq	$1, %mm1
   181  	C x - ((x & 0xAA..) >> 1) leaves a 2-bit count in every bit pair
   182  	psubd	%mm1, %mm0	C bit pairs
   183  
   184  	C add neighbouring 2-bit counts into 4-bit counts
   185  	movq	%mm0, %mm1
   186  	psrlq	$2, %mm0
   187  
   188  	pand	REG_3333333333333333, %mm0
   189  	pand	REG_3333333333333333, %mm1
   190  
   191  	paddd	%mm1, %mm0	C nibbles
   192  
   193  	C add neighbouring 4-bit counts into per-byte counts
   194  	movq	%mm0, %mm1
   195  	psrlq	$4, %mm0
   196  
   197  	pand	REG_0F0F0F0F0F0F0F0F, %mm0
   198  	pand	REG_0F0F0F0F0F0F0F0F, %mm1
   199  
   200  	paddd	%mm1, %mm0	C bytes
   201  
   202  	C with mm4 = 0, psadbw sums the eight byte counts into the low word of mm0
   203  	psadbw(	%mm4, %mm0)
   204  
   205  	paddd	%mm0, %mm2	C add to total
   206  	jnz	L(top)		C flags are still those set by decl or orl above
   207  
   208  
   209  	movd	%mm2, %eax	C return the low dword of the total
   210  	emms
   211  	ret
   212  
   213  EPILOGUE()