github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/neon/lshiftc.asm

dnl  ARM Neon mpn_lshiftc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C StrongARM	 -		 -
C XScale	 -		 -
C Cortex-A7	 ?		 ?
C Cortex-A8	 ?		 ?
C Cortex-A9	 3.5		 3.5				Y
C Cortex-A15	 1.75		 1.75				Y


C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should be possible to read also using 64-bit alignment, by manipulating
C the shift count for unaligned operands.  Not done, since it does not seem to
C matter for A9 or A15.
C
C This will not work in big-endian mode.

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the feed-in
C    code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.

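C The following C-level sketch is only illustrative (it is not GMP's
C reference code, and the name lshiftc_ref is made up here).  Assuming
C full limbs (no nail bits) and 1 <= cnt < GMP_NUMB_BITS, the operation
C implemented by this file is roughly:
C
C	mp_limb_t
C	lshiftc_ref (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C	{
C	  mp_limb_t retval = ap[n - 1] >> (GMP_NUMB_BITS - cnt);
C	  mp_size_t i;
C	  for (i = n - 1; i > 0; i--)
C	    rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (GMP_NUMB_BITS - cnt)));
C	  rp[0] = ~(ap[0] << cnt);
C	  return retval;
C	}
C
C I.e., shift the n-limb operand at ap left by cnt bits, store the one's
C complement of the shifted result at rp, and return the uncomplemented
C bits shifted out at the top.
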
C INPUT PARAMETERS
define(`rp',  `r0')
define(`ap',  `r1')
define(`n',   `r2')
define(`cnt', `r3')

	define(`IFLSH', `$1')
	define(`IFRSH', `')
	define(`X',`0')
	define(`Y',`1')
	define(`func',`mpn_lshiftc')
define(`OPERATION_lshiftc',1)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
IFLSH(`	mov	r12, n, lsl #2	')
IFLSH(`	add	rp, rp, r12	')
IFLSH(`	add	ap, ap, r12	')

	cmp	n, #4			C SIMD code n limit
	ble	L(base)

ifdef(`OPERATION_lshiftc',`
	vdup.32	d6, r3			C left shift count is positive
	sub	r3, r3, #64		C right shift count is negative
	vdup.32	d7, r3
	mov	r12, #-8')		C lshift pointer update offset
ifdef(`OPERATION_rshift',`
	rsb	r3, r3, #0		C right shift count is negative
	vdup.32	d6, r3
	add	r3, r3, #64		C left shift count is positive
	vdup.32	d7, r3
	mov	r12, #8')		C rshift pointer update offset
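C Note: vshl.u64 interprets its per-element shift count as signed, so the
C negative count set up above makes the corresponding shifts go right,
C while the positive count shifts left.  One instruction form thus covers
C both directions.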

IFLSH(`	sub	ap, ap, #8	')
	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
	vshl.u64 d18, d19, d7		C retval

	tst	rp, #4			C is rp 64-bit aligned already?
	beq	L(rp_aligned)		C yes, skip
	vmvn	 d19, d19
IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
	vshl.u64 d4, d19, d6
	sub	n, n, #1		C first limb handled
IFLSH(`	sub	 rp, rp, #4	')
	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]

L(rp_aligned):
IFLSH(`	sub	rp, rp, #8	')
	subs	n, n, #6
	vmvn	 d19, d19
	blt	L(two_or_three_more)
	tst	n, #2
	beq	L(2)

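C The main loop below is software pipelined, consuming two 64-bit loads
C (4 limbs) per iteration; the feed-in code above picks the entry point
C from n mod 4.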
L(1):	vld1.32	 {d17}, [ap], r12
	vshl.u64 d5, d19, d6
	vmvn	 d17, d17
	vld1.32	 {d16}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	sub	n, n, #2
	b	 L(mid)

L(2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	 d16, d16
	vld1.32	 {d17}, [ap], r12
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	subs	n, n, #4
	blt	L(end)

L(top):	vmvn	 d17, d17
	vld1.32	 {d16}, [ap], r12
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
L(mid):	vmvn	 d16, d16
	vld1.32	 {d17}, [ap], r12
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	subs	n, n, #4
	bge	L(top)

L(end):	tst	 n, #1
	beq	 L(evn)

	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
	b	 L(cj1)

L(evn):	vmvn	 d17, d17
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
	vmov.u8	 d17, #255
	vorr	 d2, d5, d0
	vshl.u64 d0, d17, d7
	vorr	 d3, d4, d0
	b	 L(cj2)
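
C In the wind-down, an all-ones word (vmov.u8 with #255, or the
C complemented #0 loaded at cj1) stands in for the non-existent limb
C below ap[0]: shifted right by 64-cnt it supplies the cnt low one bits
C of the least significant result limb, i.e. the complement of the zero
C bits a plain left shift would bring in.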

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
	tst	n, #1
	beq	L(l2)

L(l3):	vshl.u64 d5, d19, d6
	vld1.32	 {d17}, [ap], r12
L(cj1):	vmov.u8	 d16, #0
IFLSH(`	add	 ap, ap, #4	')
	vmvn	 d17, d17
	vld1.32	 {d16[Y]}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vmvn	 d16, d16
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
IFLSH(`	add	 rp, rp, #4	')
	vst1.32	 {d5[Y]}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr

L(l2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	 d16, d16
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vmov.u8	 d17, #255
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vorr	 d3, d5, d0
L(cj2):	vst1.32	 {d2}, [rp:64], r12
	vst1.32	 {d3}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr


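C Scalar basecase for n <= 4.  Work from the most significant limb down:
C each stored word is the complement of (limb << cnt) combined with the
C next lower limb shifted right by 32-cnt; for the lowest word the final
C mvn of #0 provides the ones that complement the zero bits shifted in.
C The return value is the top cnt bits of the uncomplemented high limb.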
define(`tnc', `r12')
L(base):
	push	{r4, r6, r7, r8}
	ldr	r4, [ap, #-4]!
	rsb	tnc, cnt, #32
	mvn	r6, r4

	mov	r7, r6, lsl cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #-4]!
	mvn	r8, r8
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #-4]!
	mvn	r6, r6
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4
L(tp):	ldr	r8, [ap, #-4]!
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r8, r8
	mov	r7, r6, lsl cnt
L(md):	ldr	r6, [ap, #-4]!
	orr	r7, r7, r8, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r6, r6
	mov	r7, r8, lsl cnt

L(ed):	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(ed1):	mvn	r6, #0
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]
	mov	r0, r4, lsr tnc
	pop	{r4, r6, r7, r8}
	bx	r14
EPILOGUE()