github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm64/hamdist.asm (about)

     1  dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
     2  
     3  dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C	     cycles/limb
    34  C Cortex-A53	 ?
    35  C Cortex-A57	 ?
    36  
    37  C TODO
    38  C  * Consider greater unrolling.
    39  C  * Arrange to align the pointer, if that helps performance.  Use the same
    40  C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
    41  C    valgrind!)
    42  C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
    43  C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
    44  
    45  changecom(@&*$)
    46  
    47  C INPUT PARAMETERS
    48  define(`ap', x0)
    49  define(`bp', x1)
    50  define(`n',  x2)
    51  
    52  C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
    53  C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
    54  C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
    55  C  allows the huge count code to jump deep into the code (at L(chu)).
    56  
    57  define(`maxsize',  0x1fff)
    58  define(`chunksize',0x1ff0)
    59  
    60  ASM_START()
    61  PROLOGUE(mpn_hamdist)
    62  
    63  	mov	x11, #maxsize
    64  	cmp	n, x11
    65  	b.hi	L(gt8k)
    66  
    67  L(lt8k):
    68  	movi	v4.16b, #0			C clear summation register
    69  	movi	v5.16b, #0			C clear summation register
    70  
    71  	tbz	n, #0, L(xx0)
    72  	sub	n, n, #1
    73  	ld1	{v0.1d}, [ap], #8		C load 1 limb
    74  	ld1	{v16.1d}, [bp], #8		C load 1 limb
    75  	eor	v0.16b, v0.16b, v16.16b
    76  	cnt	v6.16b, v0.16b
    77  	uadalp	v4.8h,  v6.16b			C could also splat
    78  
    79  L(xx0):	tbz	n, #1, L(x00)
    80  	sub	n, n, #2
    81  	ld1	{v0.2d}, [ap], #16		C load 2 limbs
    82  	ld1	{v16.2d}, [bp], #16		C load 2 limbs
    83  	eor	v0.16b, v0.16b, v16.16b
    84  	cnt	v6.16b, v0.16b
    85  	uadalp	v4.8h,  v6.16b
    86  
    87  L(x00):	tbz	n, #2, L(000)
    88  	subs	n, n, #4
    89  	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
    90  	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
    91  	b.ls	L(sum)
    92  
    93  L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
    94  	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
    95  	eor	v0.16b, v0.16b, v16.16b
    96  	eor	v1.16b, v1.16b, v17.16b
    97  	sub	n, n, #4
    98  	cnt	v6.16b, v0.16b
    99  	cnt	v7.16b, v1.16b
   100  	b	L(mid)
   101  
   102  L(000):	subs	n, n, #8
   103  	b.lo	L(e0)
   104  
   105  L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
   106  	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
   107  	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
   108  	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
   109  	eor	v2.16b, v2.16b, v18.16b
   110  	eor	v3.16b, v3.16b, v19.16b
   111  	cnt	v6.16b, v2.16b
   112  	cnt	v7.16b, v3.16b
   113  	subs	n, n, #8
   114  	b.lo	L(end)
   115  
   116  L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
   117  	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
   118  	eor	v0.16b, v0.16b, v16.16b
   119  	eor	v1.16b, v1.16b, v17.16b
   120  	uadalp	v4.8h,  v6.16b
   121  	cnt	v6.16b, v0.16b
   122  	uadalp	v5.8h,  v7.16b
   123  	cnt	v7.16b, v1.16b
   124  L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
   125  	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
   126  	eor	v2.16b, v2.16b, v18.16b
   127  	eor	v3.16b, v3.16b, v19.16b
   128  	subs	n, n, #8
   129  	uadalp	v4.8h,  v6.16b
   130  	cnt	v6.16b, v2.16b
   131  	uadalp	v5.8h,  v7.16b
   132  	cnt	v7.16b, v3.16b
   133  	b.hs	L(top)
   134  
   135  L(end):	uadalp	v4.8h,  v6.16b
   136  	uadalp	v5.8h,  v7.16b
   137  L(sum):	eor	v0.16b, v0.16b, v16.16b
   138  	eor	v1.16b, v1.16b, v17.16b
   139  	cnt	v6.16b, v0.16b
   140  	cnt	v7.16b, v1.16b
   141  	uadalp	v4.8h,  v6.16b
   142  	uadalp	v5.8h,  v7.16b
   143  	add	v4.8h, v4.8h, v5.8h
   144  					C we have 8 16-bit counts
   145  L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
   146  	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
   147  	mov	x0, v4.d[0]
   148  	mov	x1, v4.d[1]
   149  	add	x0, x0, x1
   150  	ret
   151  
   152  C Code for count > maxsize.  Splits operand and calls above code.
   153  define(`ap2', x5)			C caller-saves reg not used above
   154  define(`bp2', x6)			C caller-saves reg not used above
   155  L(gt8k):
   156  	mov	x8, x30
   157  	mov	x7, n			C full count (caller-saves reg not used above)
   158  	mov	x4, #0			C total sum  (caller-saves reg not used above)
   159  	mov	x9, #chunksize*8	C caller-saves reg not used above
   160  	mov	x10, #chunksize		C caller-saves reg not used above
   161  
   162  1:	add	ap2, ap, x9		C point at subsequent block
   163  	add	bp2, bp, x9		C point at subsequent block
   164  	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
   165  	movi	v4.16b, #0		C clear chunk summation register
   166  	movi	v5.16b, #0		C clear chunk summation register
   167  	bl	L(chu)			C jump deep inside code
   168  	add	x4, x4, x0
   169  	mov	ap, ap2			C put chunk pointer in place for calls
   170  	mov	bp, bp2			C put chunk pointer in place for calls
   171  	sub	x7, x7, x10
   172  	cmp	x7, x11
   173  	b.hi	1b
   174  
   175  	mov	n, x7			C count for final invocation
   176  	bl	L(lt8k)
   177  	add	x0, x4, x0
   178  	mov	x30, x8
   179  	ret
   180  EPILOGUE()