github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/popcount.asm (about)

     1  dnl  ARM Neon mpn_popcount -- mpn bit population count.
     2  
     3  dnl  Copyright 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C StrongARM:	 -
    35  C XScale	 -
    36  C Cortex-A7	 ?
    37  C Cortex-A8	 ?
    38  C Cortex-A9	 1.125
    39  C Cortex-A15	 0.56
    40  
    41  C TODO
    42  C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
    43  C    64 bits at a time, which will mess up in big-endian mode -- although not
    44  C    for popcount, except perhaps for the edge loads.)
    45  C  * Arrange to align the pointer, if that helps performance.  Use the same
    46  C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
    47  C    valgrind!)
    48  C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
    49  C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
    50  
    51  C INPUT PARAMETERS
    52  define(`ap', r0)			C pointer to the limbs to be counted
    53  define(`n',  r1)			C number of limbs at ap
    54  
    55  C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
    56  C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
    57  C 8(2^16-1)/32 = 0x3fff limbs (rounded down).  We use a chunksize close to
    58  C that, but which can be encoded as an ARM immediate (an 8-bit value rotated
    59  C into place): 0x3f80 = 0xfe << 6, and chunksize*4 = 0xfe << 8.
    60  define(`chunksize',0x3f80)
    61  
    62  ASM_START()
        C mpn_popcount -- return the number of set bits in the n limbs at ap.
        C
        C In:    ap (r0) = pointer to the limbs, n (r1) = limb count.  Limbs are
        C        loaded as 32-bit elements.
        C Out:   r0 = bit count.
        C Uses:  r0, r1, q0-q3, q8, q9, q12, q13 and the flags.  The large-count
        C        path also uses r2 and r3, and pushes/pops r4 and the return address.
        C
        C Counts of at most chunksize limbs are handled by the code at L(lt16k).
        C Larger counts go to L(gt16k), which calls L(lt16k) once per chunk and
        C adds up the partial results.
    63  PROLOGUE(mpn_popcount)
    64  
    65  	cmp	n, #chunksize
    66  	bhi	L(gt16k)		C too many limbs for the 16-bit counters
    67  
        C Count n <= chunksize limbs.  Also entered with bl from L(gt16k): only r0,
        C r1, Neon registers and the flags are written here, and the return is by
        C bx lr.  q8 and q9 each hold 8 16-bit partial sums; q12 and q13 hold
        C per-byte counts from vcnt that have not yet been added to the sums.
    68  L(lt16k):
    69  	vmov.i64   q8, #0		C clear summation register
    70  	vmov.i64   q9, #0		C clear summation register
    71  
        C Peel off 1, 2 and 4 limbs according to the low three bits of n, so that
        C a multiple of 8 limbs remains.
    72  	tst	   n, #1
    73  	beq	   L(xxx0)
    74  	vmov.i64   d0, #0		C zero the half of d0 that is not loaded
    75  	sub	   n, n, #1
    76  	vld1.32   {d0[0]}, [ap]!	C load 1 limb
    77  	vcnt.8	   d24, d0
    78  	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
    79  
    80  L(xxx0):tst	   n, #2
    81  	beq	   L(xx00)
    82  	sub	   n, n, #2
    83  	vld1.32    {d0}, [ap]!		C load 2 limbs
    84  	vcnt.8	   d24, d0
    85  	vpadal.u8  d16, d24
    86  
    87  L(xx00):tst	   n, #4
    88  	beq	   L(x000)
    89  	sub	   n, n, #4
    90  	vld1.32    {q0}, [ap]!		C load 4 limbs
    91  	vcnt.8	   q12, q0
    92  	vpadal.u8  q8, q12
    93  
        C Bit 3 of n selects how the 16-limbs-per-iteration loop is entered.
    94  L(x000):tst	   n, #8
    95  	beq	   L(0000)
    96  
    97  	subs	   n, n, #8
    98  	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
    99  	bls	   L(sum)		C n reached 0: just count q0,q1 and finish
   100  
        C n is now a non-zero multiple of 16 and q0,q1 hold 8 uncounted limbs.
        C Fetch 8 more and join the loop at its midpoint.
   101  L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   102  	sub	   n, n, #8
   103  	vcnt.8	   q12, q0
   104  	vcnt.8	   q13, q1
   105  	b	   L(mid)
   106  
        C n is a multiple of 16 here.
   107  L(0000):subs	   n, n, #16
   108  	blo	   L(e0)		C n was 0: q9 is still zero, skip the vadd
   109  
   110  	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   111  	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
   112  	vcnt.8	   q12, q2
   113  	vcnt.8	   q13, q3
   114  	subs	   n, n, #16
   115  	blo	   L(end)		C only those 16 limbs: no loop iteration
   116  
        C Main loop, 16 limbs per iteration.  Each vpadal adds in the byte counts
        C produced by the preceding vcnt into the same register, while each vcnt
        C works on limbs loaded half an iteration earlier.
   117  L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   118  	vpadal.u8  q8, q12
   119  	vcnt.8	   q12, q0
   120  	vpadal.u8  q9, q13
   121  	vcnt.8	   q13, q1
   122  L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
   123  	subs	   n, n, #16
   124  	vpadal.u8  q8, q12
   125  	vcnt.8	   q12, q2
   126  	vpadal.u8  q9, q13
   127  	vcnt.8	   q13, q3
   128  	bhs	   L(top)		C loop until the subs above borrows
   129  
        C All limbs are loaded.  q12,q13 hold byte counts not yet summed (L(end)),
        C and q0,q1 hold 8 limbs not yet counted (L(sum)).
   130  L(end):	vpadal.u8  q8, q12
   131  	vpadal.u8  q9, q13
   132  L(sum):	vcnt.8	   q12, q0
   133  	vcnt.8	   q13, q1
   134  	vpadal.u8  q8, q12
   135  	vpadal.u8  q9, q13
   136  	vadd.i16   q8, q8, q9
   137  					C we have 8 16-bit counts
   138  L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
   139  	vpaddl.u32 q8, q8		C we have 2 64-bit counts
   140  	vmov.32    r0, d16[0]		C low word of each 64-bit count; one pass
   141  	vmov.32    r1, d17[0]		C covers at most chunksize*32 = 0x7f000 bits
   142  	add	   r0, r0, r1
   143  	bx	lr
   144  
   145  C Code for large count.  Splits operand and calls above code.
        C r3 = limbs still to count, r4 = running total, ap2 = start of the current
        C chunk.  ap2 is kept separately since L(lt16k) advances ap and returns its
        C count in r0.
   146  define(`ap2', r2)			C caller-saves reg not used above
   147  L(gt16k):
   148  	push	{r4,r14}		C save r4 and the return address
   149  	mov	ap2, ap
   150  	mov	r3, n			C full count
   151  	mov	r4, #0			C total sum
   152  
   153  1:	mov	n, #chunksize		C count for this invocation
   154  	bl	L(lt16k)		C could jump deep inside code
   155  	add	ap2, ap2, #chunksize*4	C point at next chunk
   156  	add	r4, r4, r0		C add this chunk's count to the total
   157  	mov	ap, ap2			C put chunk pointer in place for call
   158  	sub	r3, r3, #chunksize
   159  	cmp	r3, #chunksize
   160  	bhi	1b			C more than one chunk left (unsigned)
   161  
   162  	mov	n, r3			C count for final invocation
   163  	bl	L(lt16k)
   164  	add	r0, r4, r0		C result = total of all chunks
   165  	pop	{r4,pc}			C restore r4 and return
   166  EPILOGUE()