github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm64/popcount.asm (about)

     1  dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
     2  
     3  dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C Cortex-A53	 ?
    35  C Cortex-A57	 ?
    36  
    37  C TODO
    38  C  * Consider greater unrolling.
    39  C  * Arrange to align the pointer, if that helps performance.  Use the same
    40  C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
    41  C    valgrind!)
    42  C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
    43  C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
    44  
    45  changecom(@&*$)
    46  C From here on # no longer starts an m4 comment, so #maxsize etc. expand.
    47  C INPUT PARAMETERS
    48  define(`ap', x0)			C pointer to the limbs; post-incremented by loads
    49  define(`n',  x1)			C number of limbs; counted down while summing
    50  
    51  C We sum into 16 16-bit counters in v4,v5, and at the end add them pairwise
    52  C into 8 16-bit counters.  Every 128-bit load adds at most 16 to each final
    53  C counter, and the single-limb load adds at most 16 more to the low four, so
    54  C n limbs can give 16*ceil(n/2).  That must not exceed 0xffff, i.e., n must be
    55  C at most 0x1ffe (0x1fff limbs of all ones would wrap the low four counters).
    56  define(`maxsize',  0x1ffe)
    57  define(`chunksize',0x1ff0)		C near maxsize; multiple of 8, so L(chu) can be entered
    58  
    59  ASM_START()
    60  PROLOGUE(mpn_popcount)
    61  C Returns in x0 the number of 1 bits in the n 64-bit limbs starting at ap.
    62  	mov	x11, #maxsize			C x11 keeps maxsize; L(gt8k) compares against it
    63  	cmp	n, x11
    64  	b.hi	L(gt8k)				C n > maxsize (unsigned): sum in chunks
    65  C Entry for n <= maxsize; L(gt8k) also enters here with bl for the last part.
    66  L(lt8k):
    67  	movi	v4.16b, #0			C clear summation register
    68  	movi	v5.16b, #0			C clear summation register
    69  C Deal with the low three bits of n first, leaving a multiple of 8 limbs.
    70  	tbz	n, #0, L(xx0)			C bit 0 clear: no single limb
    71  	sub	n, n, #1
    72  	ld1	{v0.1d}, [ap], #8		C load 1 limb
    73  	cnt	v6.16b, v0.16b			C bit count of each byte
    74  	uadalp	v4.8h,  v6.16b			C could also splat
    75  
    76  L(xx0):	tbz	n, #1, L(x00)			C bit 1 clear: no limb pair
    77  	sub	n, n, #2
    78  	ld1	{v0.2d}, [ap], #16		C load 2 limbs
    79  	cnt	v6.16b, v0.16b
    80  	uadalp	v4.8h,  v6.16b			C add byte-count pairs into 16-bit lanes
    81  
    82  L(x00):	tbz	n, #2, L(000)			C bit 2 clear: no group of 4 limbs
    83  	subs	n, n, #4
    84  	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
    85  	b.ls	L(sum)				C n now 0: only v0,v1 are left to count
    86  C More limbs follow: load 4 more and join the loop at L(mid).
    87  L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
    88  	sub	n, n, #4			C account for the limbs in v2,v3
    89  	cnt	v6.16b, v0.16b
    90  	cnt	v7.16b, v1.16b
    91  	b	L(mid)				C v6,v7 hold counts not yet accumulated
    92  C Here n is a multiple of 8 (possibly 0).
    93  L(000):	subs	n, n, #8
    94  	b.lo	L(e0)				C n was 0: the counts so far are all in v4
    95  C L(gt8k) enters at L(chu) with n = chunksize-8 and v4,v5 already cleared.
    96  L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
    97  	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
    98  	cnt	v6.16b, v2.16b
    99  	cnt	v7.16b, v3.16b
   100  	subs	n, n, #8
   101  	b.lo	L(end)				C those 8 limbs were the last ones
   102  C Loop, 8 limbs per iteration; v6,v7 carry byte counts awaiting accumulation.
   103  L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
   104  	uadalp	v4.8h,  v6.16b
   105  	cnt	v6.16b, v0.16b
   106  	uadalp	v5.8h,  v7.16b
   107  	cnt	v7.16b, v1.16b
   108  L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
   109  	subs	n, n, #8			C flags are consumed by b.hs below
   110  	uadalp	v4.8h,  v6.16b
   111  	cnt	v6.16b, v2.16b
   112  	uadalp	v5.8h,  v7.16b
   113  	cnt	v7.16b, v3.16b
   114  	b.hs	L(top)				C loop until the subtraction borrows
   115  
   116  L(end):	uadalp	v4.8h,  v6.16b			C accumulate the pending counts in v6,v7
   117  	uadalp	v5.8h,  v7.16b
   118  L(sum):	cnt	v6.16b, v0.16b			C count the last 4 limbs, still in v0,v1
   119  	cnt	v7.16b, v1.16b
   120  	uadalp	v4.8h,  v6.16b
   121  	uadalp	v5.8h,  v7.16b
   122  	add	v4.8h, v4.8h, v5.8h		C lanewise add modulo 2^16; no lane may exceed 0xffff
   123  					C we have 8 16-bit counts
   124  L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
   125  	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
   126  	mov	x0, v4.d[0]
   127  	mov	x1, v4.d[1]
   128  	add	x0, x0, x1			C x0 = bit count of the limbs summed since v4,v5 were cleared
   129  	ret					C to the caller, or back into L(gt8k) after its bl
   130  
   131  C Code for count > maxsize.  Splits operand and calls above code.
   132  define(`ap2', x5)			C caller-saves reg not used above
   133  L(gt8k):
   134  	mov	x8, x30			C save return address; the bl instructions below overwrite x30
   135  	mov	x7, n			C full count (caller-saves reg not used above)
   136  	mov	x4, #0			C total sum  (caller-saves reg not used above)
   137  	mov	x9, #chunksize*8	C caller-saves reg not used above
   138  	mov	x10, #chunksize		C caller-saves reg not used above
   139  
   140  1:	add	ap2, ap, x9		C point at subsequent block
   141  	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
   142  	movi	v4.16b, #0		C clear chunk summation register
   143  	movi	v5.16b, #0		C clear chunk summation register
   144  	bl	L(chu)			C jump deep inside code
   145  	add	x4, x4, x0		C add the count of this chunk to the total
   146  	mov	ap, ap2			C put chunk pointer in place for calls
   147  	sub	x7, x7, x10		C limbs still to be counted
   148  	cmp	x7, x11			C x11 still holds maxsize
   149  	b.hi	1b			C more than maxsize limbs left: do another chunk
   150  
   151  	mov	n, x7			C count for final invocation
   152  	bl	L(lt8k)
   153  	add	x0, x4, x0		C grand total = chunk totals + final part
   154  	mov	x30, x8			C restore the return address saved on entry
   155  	ret
   156  EPILOGUE()