github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/popham.asm (about)

     1  dnl  AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance.
     2  
     3  dnl  Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C		     popcount	      hamdist
    36  C		    cycles/limb	    cycles/limb
    37  C AMD K8,K9		 6		 7
    38  C AMD K10		 6		 7
    39  C Intel P4		12		14.3
    40  C Intel core2		 7		 8
    41  C Intel corei		 ?		 7.3
    42  C Intel atom		16.5		17.5
    43  C VIA nano		 8.75		10.4
    44  
    45  C TODO
    46  C  * Tune.  It should be possible to reach 5 c/l for popcount and 6 c/l for
    47  C    hamdist for K8/K9.
    48  
    49  
    50  ifdef(`OPERATION_popcount',`
    51    define(`POP', `$1')			C popcount-only lines are emitted
    52    define(`HAM', `dnl')			C hamdist-only lines are dropped
    53    define(`up', `%rdi')			C limb pointer
    54    define(`n', `%rsi')			C limb count
    55    define(`h01010101', `%rdx')		C byte-sum multiplier
    56    define(`h0f0f0f0f', `%rcx')		C nibble mask
    57    define(`h33333333', `%r11')		C 2-bit-pair mask
    58    define(`h55555555', `%r10')		C alternate-bit mask
    59    define(`func', `mpn_popcount')
    60  ')
    61  ifdef(`OPERATION_hamdist',`
    62    define(`HAM', `$1')			C hamdist-only lines are emitted
    63    define(`POP', `dnl')			C popcount-only lines are dropped
    64    define(`up', `%rdi')			C first limb pointer
    65    define(`vp', `%rsi')			C second limb pointer
    66    define(`n', `%rdx')			C limb count
    67    define(`h01010101', `%r14')		C byte-sum multiplier, rdx is taken by n
    68    define(`h0f0f0f0f', `%rcx')		C nibble mask
    69    define(`h33333333', `%r11')		C 2-bit-pair mask
    70    define(`h55555555', `%r10')		C alternate-bit mask
    71    define(`func', `mpn_hamdist')
    72  ')
    73  C func is mpn_popcount(up, n) or mpn_hamdist(up, vp, n), chosen by the OPERATION_ macros.
    74  C Returns in %rax the count of 1 bits in the n limbs at up (hamdist: in up[i] ^ vp[i]).
    75  MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
    76  
    77  ABI_SUPPORT(DOS64)
    78  ABI_SUPPORT(STD64)
    79  
    80  ASM_START()
    81  	TEXT
    82  	ALIGN(32)
    83  PROLOGUE(func)
    84   POP(`	FUNC_ENTRY(2)		')
    85   HAM(`	FUNC_ENTRY(3)		')
    86  	push	%r12			C r12, r13: second-limb temporaries in the loop
    87  	push	%r13
    88   HAM(`	push	%r14		')	C hamdist only: r14 holds h01010101
    89  C Constants for the reduction: three field masks and the byte-summing multiplier.
    90  	mov	$0x5555555555555555, h55555555
    91  	mov	$0x3333333333333333, h33333333
    92  	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
    93  	mov	$0x0101010101010101, h01010101
    94  C Bias pointers to the operand end and negate n: (up,n,8) then walks forward as n rises to 0.
    95  	lea	(up,n,8), up
    96   HAM(`	lea	(vp,n,8), vp	')
    97  	neg	n
    98  
    99  	xor	R32(%rax), R32(%rax)	C running total = 0
   100  C Odd n: peel one limb first.  n = 0 also tests even and would enter the loop: need n >= 1.
   101  	bt	$0, R32(n)		C CF = bit 0 of n (parity survives neg)
   102  	jnc	L(top)			C even: straight to the two-limb loop
   103  
   104  	mov	(up,n,8), %r8		C the odd limb
   105   HAM(`	xor	(vp,n,8), %r8	')	C hamdist: count the differing bits
   106  
   107  	mov	%r8, %r9
   108  	shr	%r8
   109  	and	h55555555, %r8
   110  	sub	%r8, %r9		C 32 2-bit fields (0..2)
   111  
   112  	mov	%r9, %r8
   113  	shr	$2, %r9
   114  	and	h33333333, %r8
   115  	and	h33333333, %r9
   116  	add	%r8, %r9		C 16 4-bit fields (0..4)
   117  
   118  	mov	%r9, %r8
   119  	shr	$4, %r9
   120  	and	h0f0f0f0f, %r8
   121  	and	h0f0f0f0f, %r9
   122  	add	%r8, %r9		C 8 8-bit fields (0..8)
   123  
   124  	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
   125  	shr	$56, %r9		C bit count of this limb
   126  
   127  	mov	%r9, %rax		C start the total with this limb
   128  	add	$1, n
   129  	jz	L(end)			C that was the only limb
   130  C Two limbs per pass: r8/r9 reduce the first limb, r12/r13 the second.
   131  	ALIGN(16)
   132  L(top):	mov	(up,n,8), %r8
   133  	mov	8(up,n,8), %r12
   134   HAM(`	xor	(vp,n,8), %r8	')
   135   HAM(`	xor	8(vp,n,8), %r12	')
   136  
   137  	mov	%r8, %r9
   138  	mov	%r12, %r13
   139  	shr	%r8
   140  	shr	%r12
   141  	and	h55555555, %r8
   142  	and	h55555555, %r12
   143  	sub	%r8, %r9		C 32 2-bit fields (0..2)
   144  	sub	%r12, %r13		C 32 2-bit fields (0..2)
   145  
   146  	mov	%r9, %r8
   147  	mov	%r13, %r12
   148  	shr	$2, %r9
   149  	shr	$2, %r13
   150  	and	h33333333, %r8
   151  	and	h33333333, %r9
   152  	and	h33333333, %r12
   153  	and	h33333333, %r13
   154  	add	%r8, %r9		C 16 4-bit fields (0..4)
   155  	add	%r12, %r13		C 16 4-bit fields (0..4)
   156  C Merge both limbs now so one 8-bit step and one multiply serve the pair.
   157  	add	%r13, %r9		C 16 4-bit fields (0..8)
   158  	mov	%r9, %r8
   159  	shr	$4, %r9
   160  	and	h0f0f0f0f, %r8
   161  	and	h0f0f0f0f, %r9
   162  	add	%r8, %r9		C 8 8-bit fields (0..16)
   163  C The eight fields total at most 8*16 = 128, so the byte sum below cannot overflow.
   164  	imul	h01010101, %r9		C sum the 8 fields in high 8 bits
   165  	shr	$56, %r9
   166  
   167  	add	%r9, %rax		C add to total
   168  	add	$2, n			C n is even here: carries only on the step from -2 to 0
   169  	jnc	L(top)
   170  
   171  L(end):
   172   HAM(`	pop	%r14		')
   173  	pop	%r13
   174  	pop	%r12			C restored in reverse push order
   175  	FUNC_EXIT()
   176  	ret
   177  EPILOGUE()