github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86/pentium4/sse2/popcount.asm (about)

     1  dnl  X86-32 and X86-64 mpn_popcount using SSE2.
     2  
     3  dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C 32-bit		     popcount	     hamdist
    36  C			    cycles/limb	    cycles/limb
    37  C P5				-
    38  C P6 model 0-8,10-12		-
    39  C P6 model 9  (Banias)		?
    40  C P6 model 13 (Dothan)		4
    41  C P4 model 0  (Willamette)	?
    42  C P4 model 1  (?)		?
    43  C P4 model 2  (Northwood)	3.9
    44  C P4 model 3  (Prescott)	?
    45  C P4 model 4  (Nocona)		?
    46  C AMD K6			-
    47  C AMD K7			-
    48  C AMD K8			?
    49  
    50  C 64-bit		     popcount	     hamdist
    51  C			    cycles/limb	    cycles/limb
    52  C P4 model 4 (Nocona):		8
    53  C AMD K8,K9			7.5
    54  C AMD K10			3.5
    55  C Intel core2			3.68
    56  C Intel corei			3.15
    57  C Intel atom		       10.8
    58  C VIA nano			6.5
    59  
    60  C TODO
    61  C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
    62  C    using movdqu for one operand and movdqa for the other, or by painfully
     63  C    shifting as we go.  Unfortunately, there seems to be no usable shift
    64  C    instruction, except for one that takes an immediate count.
    65  C  * It would probably be possible to cut a few cycles/limb using software
    66  C    pipelining.
    67  C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
    68  C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
    69  C    integer based popcount.  Such a combined loop would handle 6 limbs in
    70  C    about 30 cycles on K8.
    71  C  * We could save a byte or two by using 32-bit operations on areg.
     72  C  * Check if using movdqa to a temp and then a register-based pand is faster.
    73  
C Register roles, selected per ABI.  `up' = pointer to the limb operand
C and `n' = limb count (the two function arguments); `areg'/`breg' are
C scalar scratch (breg holds the constant-table pointer); `zero' names
C the xmm register used as the all-zero psadbw operand (on 32-bit it
C aliases %xmm4, which also carries data, so it is re-zeroed before each
C psadbw).  LIMB32/LIMB64 emit their argument only in the matching build
C and expand to `dnl' (discard line) in the other.
     74  ifelse(GMP_LIMB_BITS,`32',
     75  `	define(`up',  `%edx')
     76  	define(`n',   `%ecx')
     77  	define(`areg',`%eax')
     78  	define(`breg',`%ebx')
     79  	define(`zero',`%xmm4')
     80  	define(`LIMB32',`	$1')
     81  	define(`LIMB64',`dnl')
     82  ',`
     83  	define(`up',  `%rdi')
     84  	define(`n',   `%rsi')
     85  	define(`areg',`%rax')
     86  	define(`breg',`%rdx')
     87  	define(`zero',`%xmm8')
     88  	define(`LIMB32',`dnl')
     89  	define(`LIMB64',`	$1')
     90  ')
    91  
C xmm registers holding the three SWAR bit masks, loaded from the
C constant table below (bytes 0x55, 0x33 and 0x0f respectively).
     92  define(`mm01010101',`%xmm6')
     93  define(`mm00110011',`%xmm7')
     94  define(`mm00001111',`%xmm2')
     95
C Limb geometry: bytes per limb, and limbs per one / two xmm registers.
     96  define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
     97  define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
     98  define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
    99  
C NOTE(review): config.m4 apparently may define `psadbw' as an m4 macro
C (per the comment below); undefine it so the bare mnemonic is emitted.
    100  undefine(`psadbw')			C override inherited m4 version
    101
    102  C This file is shared between 32-bit and 64-bit builds.  Only the former has
    103  C LEAL.  Default LEAL as an alias of LEA.
    104  ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
   105  
    106  ASM_START()
    107
    108  C Make cnsts global to work around Apple relocation bug.
    109  ifdef(`DARWIN',`
    110  	define(`cnsts', MPN(popccnsts))
    111  	GLOBL	cnsts')
    112
C Code section; align the function entry to a 32-byte boundary.
    113  	TEXT
    114  	ALIGN(32)
C mp_limb_t mpn_popcount (mp_srcptr up, mp_size_t n)
C
C Return the total number of 1 bits in the n-limb operand {up, n};
C result in areg (%eax/%rax).  %xmm3 accumulates the grand total as two
C 64-bit halves, folded together at L(rt).
C
C Method: round `up' down to a 16-byte boundary and mask away the bytes
C below the true start (low-end mask table at cnsts+64), then process 32
C bytes per loop iteration with the classic SWAR reduction
C   x - ((x >> 1) & 0x55..)          -> 2-bit field counts
C   (x & 0x33..) + ((x>>2) & 0x33..) -> 4-bit (nibble) counts
C   fold with 0x0f.. masks (>> 4)    -> per-byte counts
C finishing each chunk with psadbw against an all-zero register, which
C sums the 16 byte counts into two 64-bit lanes.  The final (partial)
C chunk is masked with the high-end table at cnsts.
    115  PROLOGUE(mpn_popcount)
    116
C 32-bit: fetch the two stack arguments and save callee-saved %ebx,
C which serves as breg (the constant-table pointer).
    117  LIMB32(`mov	4(%esp), up	')
    118  LIMB32(`mov	8(%esp), n	')
    119  LIMB32(`push	%ebx		')
    120
    121  	pxor	%xmm3, %xmm3		C zero grand total count
    122  LIMB64(`pxor	zero, zero	')
C Point breg at `cnsts' (position-independent via LEAL when PIC).
    123  ifdef(`PIC',`
    124  	LEAL(	cnsts, breg)
    125  ',`
    126  LIMB32(`mov	$cnsts, breg	')
    127  LIMB64(`movabs	$cnsts, breg	')
    128  ')
    129
C Load the three SWAR masks stored just below the cnsts label.
    130  	movdqa	-48(breg), mm01010101
    131  	movdqa	-32(breg), mm00110011
    132  	movdqa	-16(breg), mm00001111
    133
C Alignment: areg = byte offset of `up' within its 16-byte block.  Read
C the first aligned 16 bytes and mask away the bytes below the operand's
C true start (low-end mask table at cnsts+64, 16 bytes per entry, hence
C the *4 scaling of the 0/4/8/12 offset).  Then convert areg from bytes
C to limbs and add to n, compensating for the rounded-down pointer.
    134  	mov	up, areg
    135  	and	$-16, up		C round `up' down to 128-bit boundary
    136  	and	$12, areg		C 32:areg = 0, 4, 8, 12
    137  					C 64:areg = 0, 8
    138  	movdqa	(up), %xmm0
    139  	pand	64(breg,areg,4), %xmm0
    140  	shr	$m4_log2(GMP_LIMB_BYTES), %eax
    141  	add	areg, n			C compensate n for rounded down `up'
    142
C Dispatch: at most one xmm of limbs -> straight to the final sum with
C %xmm4 zeroed; at most two -> L(lsum) loads the second register; more
C -> enter the unrolled two-register loop at L(ent) (first register of
C the pair is already loaded and masked in %xmm0).
    143  	pxor	%xmm4, %xmm4
    144  	sub	$LIMBS_PER_XMM, n
    145  	jbe	L(sum)
    146
    147  	sub	$LIMBS_PER_XMM, n
    148  	ja	L(ent)
    149  	jmp	L(lsum)
    150
    151  	ALIGN(16)
C Main loop: count the bits of 32 bytes (%xmm0 and %xmm4) per iteration.
    152  L(top):	movdqa	(up), %xmm0
    153  L(ent):	movdqa	16(up), %xmm4
    154
C Step 1: x -= (x >> 1) & 0x55...; each 2-bit field now holds the
C popcount of its two bits.
    155  	movdqa	%xmm0, %xmm1
    156  	movdqa	%xmm4, %xmm5
    157  	psrld	$1, %xmm0
    158  	psrld	$1, %xmm4
    159  	pand	mm01010101, %xmm0
    160  	pand	mm01010101, %xmm4
    161  	psubd	%xmm0, %xmm1
    162  	psubd	%xmm4, %xmm5
    163
C Step 2: (x & 0x33...) + ((x >> 2) & 0x33...); each 4-bit field now
C holds the popcount of its nibble.
    164  	movdqa	%xmm1, %xmm0
    165  	movdqa	%xmm5, %xmm4
    166  	psrlq	$2, %xmm1
    167  	psrlq	$2, %xmm5
    168  	pand	mm00110011, %xmm0
    169  	pand	mm00110011, %xmm4
    170  	pand	mm00110011, %xmm1
    171  	pand	mm00110011, %xmm5
    172  	paddq	%xmm0, %xmm1
    173  	paddq	%xmm4, %xmm5
    174
C 32-bit only: `zero' aliases %xmm4, which carried data above, so
C re-zero it before the psadbw below.
    175  LIMB32(`pxor	zero, zero	')
    176
    177  	add	$32, up
    178  	sub	$LIMBS_PER_2XMM, n
    179
C Step 3: merge the two registers (each 4-bit field is at most 4+4=8,
C so no overflow), fold nibbles into per-byte counts, then psadbw
C against zero sums all 16 byte counts into two 64-bit lanes of %xmm1.
    180  	paddq	%xmm5, %xmm1
    181  	movdqa	%xmm1, %xmm0
    182  	psrlq	$4, %xmm1
    183  	pand	mm00001111, %xmm0
    184  	pand	mm00001111, %xmm1
    185  	paddq	%xmm0, %xmm1
    186
    187  	psadbw	zero, %xmm1
    188  	paddq	%xmm1, %xmm3		C add to grand total
    189
C Loop while the `sub $LIMBS_PER_2XMM, n' above did not borrow (the
C intervening SSE instructions leave the flags untouched).
    190  	jnc	L(top)
    191  L(end):
C Tail: zero, one or two xmm registers' worth of limbs remain.
    192  	add	$LIMBS_PER_2XMM, n
    193  	jz	L(rt)
    194  	movdqa	(up), %xmm0
    195  	pxor	%xmm4, %xmm4
    196  	sub	$LIMBS_PER_XMM, n
    197  	jbe	L(sum)
    198  L(lsum):
    199  	movdqa	%xmm0, %xmm4
    200  	movdqa	16(up), %xmm0
C Mask away the bytes beyond the operand's end: index the high-end mask
C table at cnsts (16 bytes per entry) by the number of valid limbs in
C the last register, converted to a byte offset.
    201  L(sum):
    202  	shl	$m4_log2(GMP_LIMB_BYTES), n
    203  	and	$12, n
    204  	pand	(breg,n,4), %xmm0
    205
C Same three SWAR steps as the main loop, for the final 1-2 registers.
    206  	movdqa	%xmm0, %xmm1
    207  	movdqa	%xmm4, %xmm5
    208  	psrld	$1, %xmm0
    209  	psrld	$1, %xmm4
    210  	pand	mm01010101, %xmm0
    211  	pand	mm01010101, %xmm4
    212  	psubd	%xmm0, %xmm1
    213  	psubd	%xmm4, %xmm5
    214
    215  	movdqa	%xmm1, %xmm0
    216  	movdqa	%xmm5, %xmm4
    217  	psrlq	$2, %xmm1
    218  	psrlq	$2, %xmm5
    219  	pand	mm00110011, %xmm0
    220  	pand	mm00110011, %xmm4
    221  	pand	mm00110011, %xmm1
    222  	pand	mm00110011, %xmm5
    223  	paddq	%xmm0, %xmm1
    224  	paddq	%xmm4, %xmm5
    225
    226  LIMB32(`pxor	zero, zero	')
    227
    228  	paddq	%xmm5, %xmm1
    229  	movdqa	%xmm1, %xmm0
    230  	psrlq	$4, %xmm1
    231  	pand	mm00001111, %xmm0
    232  	pand	mm00001111, %xmm1
    233  	paddq	%xmm0, %xmm1
    234
    235  	psadbw	zero, %xmm1
    236  	paddq	%xmm1, %xmm3		C add to grand total
    237
    238
    239  C Add the two 64-bit halves of the grand total counter
    240  L(rt):	movdqa	%xmm3, %xmm0
    241  	psrldq	$8, %xmm3
    242  	paddq	%xmm3, %xmm0
    243  	movd	%xmm0, areg		C movq avoided due to gas bug
    244
C 32-bit: restore callee-saved %ebx (breg).
    245  LIMB32(`pop	%ebx		')
    246  	ret
    247
    248  EPILOGUE()
C Constant table.  The three SWAR masks sit BELOW the `cnsts' label so
C the code reaches them with negative offsets (-48, -32, -16); the
C partial-chunk masks for the operand's two ends sit at and after it:
C high-end table at cnsts+0, low-end table at cnsts+64, one 16-byte
C entry per possible limb offset within an xmm register.
    249  DEF_OBJECT(dummy,16)
    250  C Three magic constants used for masking out bits
    251  	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    252  	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
    253
    254  	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    255  	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
    256
    257  	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    258  	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
    259  cnsts:
    260  C Masks for high end of number
    261  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    262  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    263
    264  	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    265  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    266
    267  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    268  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    269
    270  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    271  	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
    272  C Masks for low end of number
    273  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    274  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    275
    276  	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    277  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    278
    279  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    280  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
    281
    282  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
    283  	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
    284  END_OBJECT(dummy)
    285  ASM_END()