github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/k6/mmx/popham.asm (about)

     1  dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
     2  dnl  hamming distance.
     3  
     4  dnl  Copyright 2000-2002 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C        popcount  hamdist
    36  C K6-2:    9.0       11.5   cycles/limb
    37  C K6:      12.5      13.0
    38  
    39  
    40  C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
    41  C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
    42  C
    43  C The code here isn't optimal, but it's already a 2x speedup over the plain
    44  C integer mpn/generic/popcount.c,hamdist.c.
    45  
    46  
        dnl  Stop with an error message and m4 exit status 1 unless
        dnl  OPERATION_popcount or OPERATION_hamdist is defined.
    47  ifdef(`OPERATION_popcount',,
    48  `ifdef(`OPERATION_hamdist',,
    49  `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
    50  ')m4exit(1)')')
    51  
        dnl  Usage: HAM(text)
        dnl
        dnl  Expands to its argument when OPERATION_hamdist is defined and to
        dnl  nothing otherwise.  m4_assert_numargs is defined outside this file;
        dnl  presumably it checks that exactly one argument was given.
    52  define(HAM,
    53  m4_assert_numargs(1)
    54  `ifdef(`OPERATION_hamdist',`$1')')
    55  
        dnl  Usage: POP(text)
        dnl
        dnl  Expands to its argument when OPERATION_popcount is defined and to
        dnl  nothing otherwise.
    56  define(POP,
    57  m4_assert_numargs(1)
    58  `ifdef(`OPERATION_popcount',`$1')')
    59  
        dnl  Parameter offsets and function name for the selected operation.
        dnl  The offsets follow the argument order of the C prototypes near the
        dnl  top of this file, one 4-byte slot per argument starting at offset 4.
        dnl  defframe is defined outside this file; presumably the offsets are
        dnl  relative to the stack pointer at function entry -- to be confirmed
        dnl  against its definition.
    60  HAM(`
    61  defframe(PARAM_SIZE,   12)
    62  defframe(PARAM_SRC2,   8)
    63  defframe(PARAM_SRC,    4)
    64  define(M4_function,mpn_hamdist)
    65  ')
    66  POP(`
    67  defframe(PARAM_SIZE,   8)
    68  defframe(PARAM_SRC,    4)
    69  define(M4_function,mpn_popcount)
    70  ')
    71  
        dnl  MULFUNC_PROLOGUE is defined outside this file; it is given the names
        dnl  of both functions that this one source file can produce.
    72  MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
    73  
    74  
        dnl  Non-PIC builds keep the four 64-bit mask constants in read-only
        dnl  data, each written as two identical .long values, and the function
        dnl  below loads them with movq.  PIC builds emit nothing here and build
        dnl  the same values in registers from immediates instead.
        dnl
        dnl    AAAA...  upper bit of every 2-bit field
        dnl    3333...  low 2 bits of every nibble
        dnl    0F0F...  low nibble of every byte
        dnl    000000FF low byte of every dword
    75  ifdef(`PIC',,`
    76  	dnl  non-PIC
    77  
    78  	RODATA
    79  	ALIGN(8)
    80  
    81  L(rodata_AAAAAAAAAAAAAAAA):
    82  	.long	0xAAAAAAAA
    83  	.long	0xAAAAAAAA
    84  
    85  L(rodata_3333333333333333):
    86  	.long	0x33333333
    87  	.long	0x33333333
    88  
    89  L(rodata_0F0F0F0F0F0F0F0F):
    90  	.long	0x0F0F0F0F
    91  	.long	0x0F0F0F0F
    92  
    93  L(rodata_000000FF000000FF):
    94  	.long	0x000000FF
    95  	.long	0x000000FF
    96  ')
    97  
        C Function body shared by mpn_popcount and mpn_hamdist; the name comes
        C from M4_function.
        C
        C Counts the set bits of the size limbs at src, or for hamdist the set
        C bits of src XOR src2, and returns the count in eax.  Limbs are treated
        C as 4 bytes each.  They are processed two at a time as one 64-bit MMX
        C value, working from the high end of the operand down to the low end.
        C An odd size is handled by processing the top limb alone first.
        C
        C NOTE(review): there is no check for size == 0.  With size == 0 the
        C loop would be entered with ecx == 0 and the loop instruction would
        C wrap the counter, so size is assumed to be at least 1 -- confirm
        C against the callers.
    98  	TEXT
    99  	ALIGN(32)
   100  
   101  POP(`ifdef(`PIC', `
   102  	C avoid shrl crossing a 32-byte boundary
   103  	nop')')
   104  
   105  PROLOGUE(M4_function)
   106  deflit(`FRAME',0)
   107  
   108  	movl	PARAM_SIZE, %ecx	C ecx = size in limbs
   109  
        	C Load the four mask constants into mm7, mm6, mm5 and mm4.  The PIC
        	C variant builds each one from a 32-bit immediate: movd puts it in
        	C the low dword and punpckldq of a register with itself copies the
        	C low dword into the high dword.  The non-PIC variant loads the
        	C 64-bit values from the read-only data labels.
   110  ifdef(`PIC',`
   111  	movl	$0xAAAAAAAA, %eax
   112  	movl	$0x33333333, %edx
   113  
   114  	movd	%eax, %mm7
   115  	movd	%edx, %mm6
   116  
   117  	movl	$0x0F0F0F0F, %eax
   118  	movl	$0x000000FF, %edx
   119  
   120  	punpckldq %mm7, %mm7
   121  	punpckldq %mm6, %mm6
   122  
   123  	movd	%eax, %mm5
   124  	movd	%edx, %mm4
   125  
   126  	punpckldq %mm5, %mm5
   127  	punpckldq %mm4, %mm4
   128  ',`
   129  
   130  	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
   131  	movq	L(rodata_3333333333333333), %mm6
   132  	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
   133  	movq	L(rodata_000000FF000000FF), %mm4
   134  ')
   135  
        C Symbolic names for the registers holding the masks, used in the
        C counting code below.
   136  define(REG_AAAAAAAAAAAAAAAA, %mm7)
   137  define(REG_3333333333333333, %mm6)
   138  define(REG_0F0F0F0F0F0F0F0F, %mm5)
   139  define(REG_000000FF000000FF, %mm4)
   140  
   141  
        	C eax = src, and for hamdist edx = src2
   142  	movl	PARAM_SRC, %eax
   143  HAM(`	movl	PARAM_SRC2, %edx')
   144  
   145  	pxor	%mm2, %mm2	C total
   146  
   147  	shrl	%ecx		C ecx = size/2 qwords, carry set if size is odd
   148  	jnc	L(top)		C even size: go straight to the qword loop
   149  
        	C Odd size: load the top limb (limb number size-1) alone.  movd
        	C clears the high dword of the MMX register, so the single limb
        	C goes through the same counting code as a full qword.  Zdisp is
        	C defined outside this file; presumably it controls how the zero
        	C displacement is encoded -- confirm against its definition.
   150  Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)
   151  
   152  HAM(`
   153  Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
   154  	pxor	%mm0, %mm1
   155  ')
   156  
   157  	incl	%ecx		C one extra iteration for the odd limb
   158  	jmp	L(loaded)
   159  
   160  
   161  	ALIGN(16)
   162  POP(`	nop	C alignment to avoid crossing 32-byte boundaries')
   163  
   164  L(top):
   165  	C eax	src
   166  	C ebx
   167  	C ecx	counter, qwords, decrementing
   168  	C edx	[hamdist] src2
   169  	C
   170  	C mm0	(scratch)
   171  	C mm1	(scratch)
   172  	C mm2	total (low dword)
   173  	C mm3
   174  	C mm4	\
   175  	C mm5	| special constants
   176  	C mm6	|
   177  	C mm7	/
   178  
        	C Fetch qword number ecx-1 (counting from 0); for hamdist XOR it with
        	C the matching src2 qword so the set bits are the differing bits.
   179  	movq	-8(%eax,%ecx,8), %mm1
   180  HAM(`	pxor	-8(%edx,%ecx,8), %mm1')
   181  
   182  L(loaded):
        	C mm1 = value x whose set bits are to be counted.
        	C
        	C Step 1: mm0 = x - ((x & AAAA...) >> 1).  Every 2-bit field then
        	C holds the number of bits that were set in it, 0 to 2.
   183  	movq	%mm1, %mm0
   184  	pand	REG_AAAAAAAAAAAAAAAA, %mm1
   185  
   186  	psrlq	$1, %mm1
   187  HAM(`	nop			C code alignment')
   188  
   189  	psubd	%mm1, %mm0	C bit pairs
   190  HAM(`	nop			C code alignment')
   191  
   192  
        	C Step 2: add neighbouring 2-bit fields.  Every nibble then holds
        	C a count of 0 to 4.
   193  	movq	%mm0, %mm1
   194  	psrlq	$2, %mm0
   195  
   196  	pand	REG_3333333333333333, %mm0
   197  	pand	REG_3333333333333333, %mm1
   198  
   199  	paddd	%mm1, %mm0	C nibbles
   200  
   201  
        	C Step 3: add neighbouring nibbles.  Every byte then holds a count
        	C of 0 to 8.
   202  	movq	%mm0, %mm1
   203  	psrlq	$4, %mm0
   204  
   205  	pand	REG_0F0F0F0F0F0F0F0F, %mm0
   206  	pand	REG_0F0F0F0F0F0F0F0F, %mm1
   207  
   208  	paddd	%mm1, %mm0	C bytes
   209  
        	C Step 4: add to every byte the byte above it.  The low byte of
        	C each 16-bit word then holds the count for that word, 0 to 16.
        	C The other bytes are not needed and get masked off further down.
   210  	movq	%mm0, %mm1
   211  	psrlq	$8, %mm0
   212  
   213  
   214  	paddb	%mm1, %mm0	C words
   215  
   216  
        	C Step 5: add to every word the word above it.  The low byte of
        	C each dword then holds the count for that dword, 0 to 32.
   217  	movq	%mm0, %mm1
   218  	psrlq	$16, %mm0
   219  
   220  	paddd	%mm1, %mm0	C dwords
   221  
        	C Keep only the low byte of each dword, then add both dword counts
        	C to the running total in the low dword of mm2.
   222  	pand	REG_000000FF000000FF, %mm0
   223  
   224  	paddd	%mm0, %mm2	C low to total
   225  	psrlq	$32, %mm0
   226  
   227  	paddd	%mm0, %mm2	C high to total
   228  	loop	L(top)		C decrement ecx, repeat while it is non-zero
   229  
   230  
   231  
        	C Return the total in eax.  emms_or_femms is defined outside this
        	C file; presumably it emits the instruction that ends the MMX
        	C section -- confirm against its definition.
   232  	movd	%mm2, %eax
   233  	emms_or_femms
   234  	ret
   235  
   236  EPILOGUE()