github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/lorrshift.asm

dnl  ARM Neon mpn_lshift and mpn_rshift.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C StrongARM	 -		 -
C XScale	 -		 -
C Cortex-A7	 ?		 ?
C Cortex-A8	 ?		 ?
C Cortex-A9	 3		 3				Y
C Cortex-A15	 1.5		 1.5				Y


C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should be possible to read also using 64-bit alignment, by manipulating
C the shift count for unaligned operands.  Not done, since it does not seem to
C matter for A9 or A15.
C
C This will not work in big-endian mode.
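
C For orientation, a rough C equivalent of what the two entry points
C compute (a sketch only, not GMP's generic code; 32-bit limbs as in
C this ABI, and 1 <= cnt <= 31 assumed):
C
C   unsigned mpn_lshift (unsigned *rp, const unsigned *ap,
C                        long n, unsigned cnt)
C   {
C     unsigned retval = ap[n - 1] >> (32 - cnt);  /* bits shifted out */
C     for (long i = n - 1; i > 0; i--)
C       rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (32 - cnt));
C     rp[0] = ap[0] << cnt;
C     return retval;
C   }
C
C mpn_rshift mirrors this: it walks from low to high addresses and
C returns the bits shifted out of ap[0], left-justified in the limb.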

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the
C    feed-in code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.
C INPUT PARAMETERS
define(`rp',  `r0')
define(`ap',  `r1')
define(`n',   `r2')
define(`cnt', `r3')

ifdef(`OPERATION_lshift',`
	define(`IFLSH', `$1')
	define(`IFRSH', `')
	define(`X',`0')
	define(`Y',`1')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`IFLSH', `')
	define(`IFRSH', `$1')
	define(`X',`1')
	define(`Y',`0')
	define(`func',`mpn_rshift')
')
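
C The build assembles this file once per function, defining exactly one
C of OPERATION_lshift and OPERATION_rshift.  IFLSH/IFRSH keep their
C argument only in the matching build, so lines wrapped in one of them
C drop out of the other function entirely, while X and Y pick 32-bit
C lanes of 64-bit d-registers: X is the lane holding the return-value
C bits (vmov.32 r0, d18[X] at the exits) and Y the lane used for
C single-limb loads and stores.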

MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(func)
IFLSH(`	mov	r12, n, lsl #2	')
IFLSH(`	add	rp, rp, r12	')
IFLSH(`	add	ap, ap, r12	')

	cmp	n, #4			C SIMD code n limit
	ble	L(base)

ifdef(`OPERATION_lshift',`
	vdup.32	d6, r3			C left shift count is positive
	sub	r3, r3, #64		C right shift count is negative
	vdup.32	d7, r3
	mov	r12, #-8')		C lshift pointer update offset
ifdef(`OPERATION_rshift',`
	rsb	r3, r3, #0		C right shift count is negative
	vdup.32	d6, r3
	add	r3, r3, #64		C left shift count is positive
	vdup.32	d7, r3
	mov	r12, #8')		C rshift pointer update offset
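
C Neon vshl.u64 treats a per-element register shift count as signed, a
C negative count shifting right.  Hence d6 holds the shift in the
C requested direction and d7 the complementary shift in the opposite
C direction.  E.g. for mpn_lshift with cnt = 1 we get d6 = 1 and
C d7 = -63, so for a 64-bit word w (dA/dB being placeholder registers)
C
C   vshl.u64 dA, w, d6		computes w << 1
C   vshl.u64 dB, w, d7		computes w >> 63
C
C and ORing such results from adjacent words stitches limbs across the
C 64-bit boundaries.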

IFLSH(`	sub	ap, ap, #8	')
	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
	vshl.u64 d18, d19, d7		C retval

	tst	rp, #4			C is rp 64-bit aligned already?
	beq	L(rp_aligned)		C yes, skip
IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
	vshl.u64 d4, d19, d6
	sub	n, n, #1		C first limb handled
IFLSH(`	sub	 rp, rp, #4	')
	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]
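
C If rp was only 32-bit aligned, the code above has stored one shifted
C limb on its own (lane Y of d4) and reduced n, so from here on rp is
C 64-bit aligned and the loop can use 64-bit aligned stores ([rp:64]).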

L(rp_aligned):
IFLSH(`	sub	rp, rp, #8	')
	subs	n, n, #6
	blt	L(two_or_three_more)
	tst	n, #2
	beq	L(2)

L(1):	vld1.32	 {d17}, [ap], r12
	vshl.u64 d5, d19, d6
	vld1.32	 {d16}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	sub	n, n, #2
	b	 L(mid)

L(2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vld1.32	 {d17}, [ap], r12
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	subs	n, n, #4
	blt	L(end)

L(top):	vld1.32	 {d16}, [ap], r12
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
L(mid):	vld1.32	 {d17}, [ap], r12
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	subs	n, n, #4
	bge	L(top)
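
C The L(top)/L(mid) loop is software pipelined: both vshl results for a
C word are computed half an iteration before the vorr that consumes
C them, hiding the shift latency.  In effect each stored 64-bit word is
C
C   out = (w SHIFT cnt) | (w' SHIFTC (64 - cnt))
C
C where w' is the neighbouring input word on the carry side, SHIFT the
C requested direction (count in d6) and SHIFTC the opposite one (count
C in d7).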

L(end):	tst	 n, #1
	beq	 L(evn)

	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
	b	 L(cj1)

L(evn):	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d16, d17, d6
	vst1.32	 {d2}, [rp:64], r12
	vorr	 d2, d5, d0
	b	 L(cj2)

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
	tst	n, #1
	beq	L(l2)

L(l3):	vshl.u64 d5, d19, d6
	vld1.32	 {d17}, [ap], r12
L(cj1):	veor	 d16, d16, d16
IFLSH(`	add	 ap, ap, #4	')
	vld1.32	 {d16[Y]}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
IFLSH(`	add	 rp, rp, #4	')
	vst1.32	 {d5[Y]}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr

L(l2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vshl.u64 d1, d16, d7
	vshl.u64 d16, d16, d6
	vorr	 d2, d4, d1
L(cj2):	vst1.32	 {d2}, [rp:64], r12
	vst1.32	 {d16}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr


define(`tnc', `r12')
L(base):
	push	{r4, r6, r7, r8}
ifdef(`OPERATION_lshift',`
	ldr	r4, [ap, #-4]!
	rsb	tnc, cnt, #32

	mov	r7, r4, lsl cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #-4]!
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #-4]!
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4
L(tp):	ldr	r8, [ap, #-4]!
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(md):	ldr	r6, [ap, #-4]!
	orr	r7, r7, r8, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r8, lsl cnt

L(ed):	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(ed1):	str	r7, [rp, #-4]
	mov	r0, r4, lsr tnc
')
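
C The basecase is the plain word-at-a-time loop from the C sketch near
C the top of the file, unrolled two ways (L(tp)/L(md)); r7 carries the
C partially assembled output limb from step to step, and tnc holds the
C complementary count 32 - cnt.
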
ifdef(`OPERATION_rshift',`
	ldr	r4, [ap]
	rsb	tnc, cnt, #32

	mov	r7, r4, lsr cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #4]!
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #4]!
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4

L(tp):	ldr	r8, [ap, #4]!
	orr	r7, r7, r6, lsl tnc
	str	r7, [rp], #4
	mov	r7, r6, lsr cnt
L(md):	ldr	r6, [ap, #4]!
	orr	r7, r7, r8, lsl tnc
	str	r7, [rp], #4
	mov	r7, r8, lsr cnt

L(ed):	orr	r7, r7, r6, lsl tnc
	str	r7, [rp], #4
	mov	r7, r6, lsr cnt
L(ed1):	str	r7, [rp], #4
	mov	r0, r4, lsl tnc
')
	pop	{r4, r6, r7, r8}
	bx	r14
EPILOGUE()