github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/hamdist.asm (about)

     1  dnl  ARM Neon mpn_hamdist -- mpn bit hamming distance.
     2  
     3  dnl  Copyright 2013 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C StrongARM:	 -
    35  C XScale	 -
    36  C Cortex-A7	 ?
    37  C Cortex-A8	 ?
    38  C Cortex-A9	 1.89
    39  C Cortex-A15	 0.95
    40  
    41  C TODO
    42  C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
    43  C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
    44  C    popcount. Except perhaps also for popcount for the edge loads.)
    45  C  * Arrange to align the pointer, if that helps performance.  Use the same
    46  C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
    47  C    valgrind!)
    48  C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
    49  C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
    50  
    51  C INPUT PARAMETERS (m4 aliases for the registers holding the arguments)
    52  define(`ap', r0)		C pointer to the limbs of the first operand
    53  define(`bp', r1)		C pointer to the limbs of the second operand
    54  define(`n',  r2)		C number of limbs to read from each operand
    55  
    56  C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
    57  C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
    58  C 8(2^16-1)/32 = 0x3fff limbs (rounded down).  We use a chunksize close to
    59  C that, but which can be encoded as an ARM immediate (a rotated 8-bit value).
    60  C
    61  define(`chunksize',0x3f80)	C 16256 limbs per pass; 0x3f80 = 0xfe << 6
    62  
    63  ASM_START()
    64  PROLOGUE(mpn_hamdist)
        C mpn_hamdist -- bit hamming distance of two limb vectors.
        C
        C XORs the n limbs at ap with the n limbs at bp and returns, in r0, the
        C number of set bits in the result.  Limbs are treated as 32-bit words
        C (the single-limb case below loads one 32-bit lane per limb).
        C
        C In:   ap (r0), bp (r1) = operand pointers, n (r2) = limb count
        C Out:  r0 = number of differing bits
        C Uses: r0-r2, q0-q3, q8-q15 and the flags.  The large-count path also
        C       uses r3, plus r4-r6 and lr which it saves and restores.
        C
        C Counts above chunksize go to L(gt16k), which splits the operands and
        C calls L(lt16k) once per chunk.  L(lt16k) first peels off 1, 2, 4 and 8
        C limbs according to the low four bits of n, then runs a loop that
        C consumes 16 limbs per iteration.
    65  
    66  	cmp	n, #chunksize
    67  	bhi	L(gt16k)		C unsigned n > chunksize: process in chunks
    68  
    69  L(lt16k):			C n <= chunksize here; also entered via bl from L(gt16k)
    70  	vmov.i64   q8, #0		C clear summation register
    71  	vmov.i64   q9, #0		C clear summation register
    72  
    73  	tst	   n, #1		C bit 0 of n set: peel off one limb
    74  	beq	   L(xxx0)
    75  	vmov.i64   d0, #0		C zero d0; only lane 0 is loaded below
    76  	vmov.i64   d20, #0		C zero d20; only lane 0 is loaded below
    77  	sub	   n, n, #1
    78  	vld1.32   {d0[0]}, [ap]!	C load 1 limb
    79  	vld1.32   {d20[0]}, [bp]!	C load 1 limb
    80  	veor	   d0, d0, d20		C bits that differ
    81  	vcnt.8	   d24, d0		C set-bit count of each byte
    82  	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat
    83  
    84  L(xxx0):tst	   n, #2		C bit 1 of n set: peel off two limbs
    85  	beq	   L(xx00)
    86  	sub	   n, n, #2
    87  	vld1.32    {d0}, [ap]!		C load 2 limbs
    88  	vld1.32    {d20}, [bp]!		C load 2 limbs
    89  	veor	   d0, d0, d20
    90  	vcnt.8	   d24, d0
    91  	vpadal.u8  d16, d24		C add adjacent byte counts into 16-bit counters
    92  
    93  L(xx00):tst	   n, #4		C bit 2 of n set: peel off four limbs
    94  	beq	   L(x000)
    95  	sub	   n, n, #4
    96  	vld1.32    {q0}, [ap]!		C load 4 limbs
    97  	vld1.32    {q10}, [bp]!		C load 4 limbs
    98  	veor	   q0, q0, q10
    99  	vcnt.8	   q12, q0
   100  	vpadal.u8  q8, q12
   101  
   102  L(x000):tst	   n, #8		C bit 3 of n set: start with an 8-limb block
   103  	beq	   L(0000)
   104  
   105  	subs	   n, n, #8		C bit 3 was set, so this cannot borrow
   106  	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
   107  	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
   108  	bls	   L(sum)		C n is now 0: only these 8 limbs remain to count
   109  
   110  L(gt8):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   111  	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
   112  	veor	   q0, q0, q10		C xor the 8 limbs loaded at L(x000)
   113  	veor	   q1, q1, q11
   114  	sub	   n, n, #8		C no flag update; the loop test is the subs at L(mid)
   115  	vcnt.8	   q12, q0		C q12,q13 = counts still to be accumulated
   116  	vcnt.8	   q13, q1
   117  	b	   L(mid)		C enter the loop at its second half
   118  
   119  L(0000):subs	   n, n, #16
   120  	blo	   L(e0)		C under 16 limbs left: skip loop; q9 is still zero
   121  
   122  	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   123  	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
   124  	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
   125  	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
   126  	veor	   q2, q2, q14		C xor and count the first 8 limbs now;
   127  	veor	   q3, q3, q15		C q0,q1 / q10,q11 stay pending
   128  	vcnt.8	   q12, q2
   129  	vcnt.8	   q13, q3
   130  	subs	   n, n, #16
   131  	blo	   L(end)		C no further 16-limb block: drain pending work
   132  
        C Main loop, 16 limbs per iteration.  Each half loads 8 new limbs while
        C xoring the previously loaded 8 and accumulating the counts produced
        C by the half before it.
   133  L(top):	vld1.32    {q2,q3}, [ap]!	C load 8 limbs
   134  	vld1.32    {q14,q15}, [bp]!	C load 8 limbs
   135  	veor	   q0, q0, q10		C xor limbs loaded by the previous half
   136  	veor	   q1, q1, q11
   137  	vpadal.u8  q8, q12		C accumulate pending counts into q8
   138  	vcnt.8	   q12, q0
   139  	vpadal.u8  q9, q13		C accumulate pending counts into q9
   140  	vcnt.8	   q13, q1
   141  L(mid):	vld1.32    {q0,q1}, [ap]!	C load 8 limbs
   142  	vld1.32    {q10,q11}, [bp]!	C load 8 limbs
   143  	veor	   q2, q2, q14
   144  	veor	   q3, q3, q15
   145  	subs	   n, n, #16		C sets the flags tested by bhs below
   146  	vpadal.u8  q8, q12
   147  	vcnt.8	   q12, q2
   148  	vpadal.u8  q9, q13
   149  	vcnt.8	   q13, q3
   150  	bhs	   L(top)		C no borrow: another 16 limbs to load
   151  
   152  L(end):	vpadal.u8  q8, q12		C accumulate the counts still held in q12,q13
   153  	vpadal.u8  q9, q13
   154  L(sum):	veor	   q0, q0, q10		C process the last 8 limbs, loaded but not yet xored
   155  	veor	   q1, q1, q11
   156  	vcnt.8	   q12, q0
   157  	vcnt.8	   q13, q1
   158  	vpadal.u8  q8, q12
   159  	vpadal.u8  q9, q13
   160  	vadd.i16   q8, q8, q9		C merge the two sets of 16-bit counters
   161  					C we have 8 16-bit counts
   162  L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
   163  	vpaddl.u32 q8, q8		C we have 2 64-bit counts
   164  	vmov.32    r0, d16[0]		C low word of the first 64-bit count
   165  	vmov.32    r1, d17[0]		C low word of the second 64-bit count
   166  	add	   r0, r0, r1		C r0 = total bit count
   167  	bx	lr
   168  
   169  C Code for large count.  Splits operand and calls above code.
   170  define(`ap2', r5)		C start of the current chunk of the first operand
   171  define(`bp2', r6)		C start of the current chunk of the second operand
   172  L(gt16k):
   173  	push	{r4,r5,r6,r14}		C save r4-r6 and the return address
   174  	mov	ap2, ap
   175  	mov	bp2, bp
   176  	mov	r3, n			C full count
   177  	mov	r4, #0			C total sum
   178  
        C r3-r6 stay live across the calls; the code at L(lt16k) only writes
        C r0-r2 and NEON registers.
   179  1:	mov	n, #chunksize		C count for this invocation
   180  	bl	L(lt16k)		C could jump deep inside code
   181  	add	ap2, ap2, #chunksize*4	C point at next chunk
   182  	add	bp2, bp2, #chunksize*4	C point at next chunk
   183  	add	r4, r4, r0		C add this chunk's count to the total
   184  	mov	ap, ap2			C put chunk pointer in place for call
   185  	mov	bp, bp2			C put chunk pointer in place for call
   186  	sub	r3, r3, #chunksize	C limbs still to be processed
   187  	cmp	r3, #chunksize
   188  	bhi	1b			C more than one chunk left: go round again
   189  
   190  	mov	n, r3			C count for final invocation
   191  	bl	L(lt16k)
   192  	add	r0, r4, r0		C return total over all chunks
   193  	pop	{r4,r5,r6,pc}		C restore r4-r6 and return to the caller
   194  EPILOGUE()