github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/vmx/popcount.asm (about)

     1  dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
     2  
     3  dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C                   cycles/limb
    34  C 7400,7410 (G4):       ?
    35  C 744x,745x (G4+):      1.125
    36  C 970 (G5):             2.25
    37  
    38  C TODO
    39  C  * Rewrite the awkward huge n outer loop code.
    40  C  * Two lvx, two vperm, and two vxor would turn this into a similar hamdist.
    41  C  * Compress cnsts table in 64-bit mode, only half the values are needed.
    42  
C Limb geometry: bytes per limb, and how many limbs fit in one and in two
C 16-byte vector registers (the main loop consumes two VRs per iteration).
    43  define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
    44  define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
    45  define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
    46  
    47  define(`OPERATION_popcount')
    48  
C Incoming argument registers (PowerPC calling convention): ap = source limb
C pointer, n = limb count.
    49  define(`ap',	`r3')
    50  define(`n',	`r4')
    51  
C Vector registers live across the whole routine: rtab holds the 16-entry
C per-nibble popcount table, cnt4 a splat of 4 used as the nibble shift count.
    52  define(`rtab',	`v10')
    53  define(`cnt4',	`v11')
    54  
C LIMB32(x)/LIMB64(x) expand their argument only for the matching limb size,
C giving line-granular conditional assembly for 32-bit vs 64-bit limb builds.
    55  ifelse(GMP_LIMB_BITS,32,`
    56  	define(`LIMB32',`	$1')
    57  	define(`LIMB64',`')
    58  ',`
    59  	define(`LIMB32',`')
    60  	define(`LIMB64',`	$1')
    61  ')
    62  
    63  C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow
    64  C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
    65  define(`LIMBS_PER_CHUNK', 0x1000)
    66  define(`LIMBS_CHUNK_THRES', 0x1001)
    67  
    68  ASM_START()
    69  PROLOGUE(mpn_popcount,toc)
C mp_limb_t mpn_popcount (mp_srcptr ap, mp_size_t n)
C
C Return (in r3) the number of set bits in the n-limb operand at ap.
C Strategy: lvx ignores the low 4 address bits, so the first/last vectors are
C loaded from the 16-byte-aligned addresses around the operand and the bytes
C outside it are cleared with masks from the cnsts table; each 16-byte vector
C is then translated nibble-by-nibble to bit counts via vperm into rtab, and
C the byte counts are accumulated into 32-bit words with vsum4ubs.
C
C Register roles: r8 = grand total, r10 = saved VRSAVE, r11 = cnsts base,
C r12 = constant 16 (second-vector offset), v12/v13 = per-word partial sums;
C r0/r9/cr7 carry the huge-n chunking state (64-bit limbs only).
	mfspr	r10, 256
	oris	r0, r10, 0xfffc		C Set VRSAVE bits 0-13 (v0-v13 in use)
	mtspr	256, r0
    73  
    74  ifdef(`HAVE_ABI_mode32',
    75  `	rldicl	n, n, 0, 32')		C zero extend n
    76  
    77  C Load various constants into vector registers
    78  	LEAL(	r11, cnsts)
    79  	li	r12, 16
    80  	vspltisb cnt4, 4		C 0x0404...04 used as shift count
    81  
    82  	li	r7, 160
    83  	lvx	rtab, 0, r11
    84  
C 64-bit limbs: decide up front (cr7) whether n is large enough that the
C vsum4ubs word accumulators could saturate, requiring chunked processing.
    85  LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
    86  LIMB64(`cmpd	cr7, n, r0		')
    87  
C Load the first (alignment-rounded-down) vector and clear the bytes that
C precede ap, using a mask selected by ap's misalignment within 16 bytes.
    88  	lvx	v0, 0, ap
    89  	addi	r7, r11, 80
    90  	rlwinm	r6, ap, 2,26,29
    91  	lvx	v8, r7, r6
    92  	vand	v0, v0, v8
    93  
C r8 = limbs skipped by rounding ap down (ap's misalignment in limbs)
    94  LIMB32(`rlwinm	r8, ap, 30,30,31	')
    95  LIMB64(`rlwinm	r8, ap, 29,31,31	')
    96  	add	n, n, r8		C compensate n for rounded down `ap'
    97  
    98  	vxor	v1, v1, v1
    99  	li	r8, 0			C grand total count
   100  
   101  	vxor	v12, v12, v12		C zero total count
   102  	vxor	v13, v13, v13		C zero total count
   103  
C Short operands: at most one vector -> straight to the tail summation;
C at most two vectors -> enter via L(lsum) to shuffle v0 into v1 first.
   104  	addic.	n, n, -LIMBS_PER_VR
   105  	ble	L(sum)
   106  
   107  	addic.	n, n, -LIMBS_PER_VR
   108  	ble	L(lsum)
   109  
   110  C For 64-bit machines, handle huge n that would overflow vsum4ubs
   111  LIMB64(`ble	cr7, L(small)		')
   112  LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	') C remaining n
   113  LIMB64(`lis	n, LIMBS_PER_CHUNK	')
   114  
   115  	ALIGN(16)
   116  L(small):
   117  LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
   118  LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
   119  	addi	r7, r7, 1
   120  	mtctr	r7			C copy n to count register
   121  	b	L(ent)
   122  
C Main loop: two 16-byte vectors per iteration.  Each byte is split into its
C two nibbles (vsrb by 4 for the high one), each nibble indexes rtab through
C vperm to yield its popcount, the two byte-count vectors are added (vaddubm),
C and vsum4ubs folds groups of 4 bytes into the 32-bit word accumulators.
   123  	ALIGN(16)
   124  L(top):
   125  	lvx	v0, 0, ap
   126  L(ent):	lvx	v1, r12, ap
   127  	addi	ap, ap, 32
   128  	vsrb	v8, v0, cnt4
   129  	vsrb	v9, v1, cnt4
   130  	vperm	v2, rtab, rtab, v0
   131  	vperm	v3, rtab, rtab, v8
   132  	vperm	v4, rtab, rtab, v1
   133  	vperm	v5, rtab, rtab, v9
   134  	vaddubm	v6, v2, v3
   135  	vaddubm	v7, v4, v5
   136  	vsum4ubs v12, v6, v12
   137  	vsum4ubs v13, v7, v13
   138  	bdnz	L(top)
   139  
C Tail: 0..LIMBS_PER_2VR-1 limbs remain.  Load at most two more vectors and
C mask off the bytes past the operand's end before one final count round.
   140  	andi.	n, n, eval(LIMBS_PER_2VR-1)
   141  	beq	L(rt)
   142  
   143  	lvx	v0, 0, ap
   144  	vxor	v1, v1, v1
   145  	cmpwi	n, LIMBS_PER_VR
   146  	ble	L(sum)
L(lsum):
C Two vectors in the tail: keep the full first vector in v1, put the partial
C final vector in v0 so the mask below applies to it.
	vor	v1, v0, v0
   149  	lvx	v0, r12, ap
   150  L(sum):
C Select the high-end mask for the residual limb count (table at cnsts+16).
   151  LIMB32(`rlwinm	r6, n, 4,26,27	')
   152  LIMB64(`rlwinm	r6, n, 5,26,26	')
   153  	addi	r7, r11, 16
   154  	lvx	v8, r7, r6
   155  	vand	v0, v0, v8
   156  	vsrb	v8, v0, cnt4
   157  	vsrb	v9, v1, cnt4
   158  	vperm	v2, rtab, rtab, v0
   159  	vperm	v3, rtab, rtab, v8
   160  	vperm	v4, rtab, rtab, v1
   161  	vperm	v5, rtab, rtab, v9
   162  	vaddubm	v6, v2, v3
   163  	vaddubm	v7, v4, v5
   164  	vsum4ubs v12, v6, v12
   165  	vsum4ubs v13, v7, v13
   166  
C Reduce: add the two accumulators, spill the vector just below the stack
C pointer, and sum its four 32-bit words into the scalar total r8.
	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: do all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?
   171  
   172  	lwz	r7, -16(r1)
   173  	add	r8, r8, r7
   174  	lwz	r7, -12(r1)
   175  	add	r8, r8, r7
   176  	lwz	r7, -8(r1)
   177  	add	r8, r8, r7
   178  	lwz	r7, -4(r1)
   179  	add	r8, r8, r7
   180  
   181  C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
   182  LIMB64(`ble	cr7, L(ret)
   183  	vxor	v12, v12, v12		C zero total count
   184  	vxor	v13, v13, v13		C zero total count
   185  	mr	n, r9
   186  	cmpd	cr7, n, r0
   187  	ble	cr7, L(2)
   188  	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
   189  	lis	n, LIMBS_PER_CHUNK
   190  L(2):	srdi	r7, n, 2		C loop count corresponding to n
   191  	mtctr	r7			C copy n to count register
   192  	b	L(top)
   193  ')
   194  
C Return: result in r3, restore the caller's VRSAVE.
	ALIGN(16)
L(ret):	mr	r3, r8
	mtspr	256, r10
	blr
EPILOGUE()
   200  
C Constant table, 16-byte aligned so each row can be fetched with a single
C lvx.  Layout: 1 popcount lookup row, then 5 high-end masks, 5 low-end masks.
DEF_OBJECT(cnsts,16)
C Counts for vperm: entry i is popcount(i) for i = 0..15, so a vperm indexed
C by nibble values translates each nibble directly to its bit count.
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number: clear the bytes beyond the operand's last
C limb in the final (alignment-rounded) vector, in 4-byte (limb) steps.
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   208  
   209  	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
   210  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   211  
   212  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   213  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   214  
   215  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   216  	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number: clear the bytes that precede ap in the first
C vector, selected by ap's misalignment within 16 bytes.
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   220  
   221  	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
   222  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   223  
   224  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   225  	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   226  
   227  	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   228  	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()