github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/v6/addmul_3.asm (about)

     1  dnl  ARM mpn_addmul_3.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C	     cycles/limb
    36  C StrongARM:	 -
    37  C XScale	 -
    38  C ARM11		 4.33
    39  C Cortex-A7	 3.23
    40  C Cortex-A8	 3.19
    41  C Cortex-A9	 2.125
    42  C Cortex-A15	 2
    43  
    44  C TODO
    45  C  * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
    46  C    avoiding the current multiply.
    47  C  * Start the first multiply or multiplies early.
    48  
        C Argument registers.  In mpn_addmul_3 below, rp is the base for the w
        C loads and stores, up is the base for the u loads, n is the limb count
        C that drives the dispatch and the loop, and vp is read once by the ldm
        C that fills v0-v2.
    49  define(`rp',`r0')
    50  define(`up',`r1')
    51  define(`n', `r2')
    52  define(`vp',`r3')
    53  
        C Working registers:
        C   v0-v2    the three limbs loaded from vp; not written again afterwards
        C   u0, u1   limbs loaded from up, used alternately
        C   w0-w2    limbs loaded from rp that accumulate products before the store
        C   cy0-cy2  high words carried from one umaal step to the next
        C u0 shares r3 with vp, so vp is gone once u0 is first loaded.
        C cy2 is r12, which the push and pop lists of mpn_addmul_3 do not include.
    54  define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
    55  define(`u0',`r3')  define(`u1',`r14')
    56  define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
    57  define(`cy0',`r10')  define(`cy1',`r11') define(`cy2',`r12')
    58  
    59  
    60  ASM_START()
        C mpn_addmul_3: multiply the limbs at up by the three limbs at vp and
        C add the product into the limbs at rp.
        C
        C In:    r0 = rp, r1 = up, r2 = n, r3 = vp (names from the defines above).
        C Out:   r0 = cy2 plus the last carry flag, i.e. the top limb of the result.
        C Mem:   loads limbs from rp and up and stores limbs back to rp.  The end
        C        block performs the last four stores; the upper two of those are
        C        built from zero and the carries only (w0 is cleared with mov, and
        C        cy1 comes out of a umaal), not from a limb loaded there.
        C Regs:  r4-r11 and r14 are pushed on entry and restored on exit; r12
        C        (cy2) is used without being saved -- presumably relying on the
        C        procedure call standard treating r12 as scratch, TODO confirm.
        C Size:  the shortest path (entry at lo3, then the end block) consumes
        C        three limbs of up, so this code looks like it needs n >= 3 --
        C        TODO confirm against the callers.
    61  PROLOGUE(mpn_addmul_3)
    62  	push	{ r4-r11, r14 }
    63  
        C w0 is only a scratch register here; it holds the dispatch index until
        C the jump below.
    64  	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
        C Fetch all three multiplier limbs now; r3 is reused as u0 afterwards.
    65  	ldm	vp, { v0,v1,v2 }
        C Clear the three carry limbs that the umaal chains run through.
    66  	mov	cy0, #0
    67  	mov	cy1, #0
    68  	mov	cy2, #0
    69  
    70  C Tricky n mod 6
        C After the mask, bit 0 of w0 is the parity of n (the inverse is odd)
        C and bits 31:30 encode n mod 3: 00, 10 and 01 for remainders 0, 1 and 2.
        C This relies on n/3 being too small to carry into those two bits.
    71  	mul	w0, w0, n		C n * 3^{-1} mod 2^32
    72  	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
        C Bias the count by 3.  The loop subtracts 6 per pass and continues
        C while the result is non-negative.
    73  	sub	n, n, #3
        C Dispatch.  ror by 28 moves bit 0 to bit 4 and bits 31:30 to bits 3:2,
        C giving a byte offset of 4 * slot with slot in 0..6.  Slot order is
        C b0, b2, b4, unused, b3, b5, b1: even n in slots 0-2, odd n in slots 4-6.
        C Slot 3 has no entry label; it holds udf in the branch table and 0 in
        C the address table.  In ARM state pc reads as the address of the current
        C instruction plus 8, which is why one nop sits between the dispatching
        C instruction and the table.  With PIC the table is a run of branches
        C reached by adding to pc; otherwise it is a table of absolute addresses
        C loaded into pc.
    74  ifdef(`PIC',`
    75  	add	pc, pc, w0, ror $28
    76  	nop
    77  	b	L(b0)
    78  	b	L(b2)
    79  	b	L(b4)
    80  	.word	0xe7f000f0	C udf
    81  	b	L(b3)
    82  	b	L(b5)
    83  	b	L(b1)
    84  ',`
    85  	ldr	pc, [pc, w0, ror $28]
    86  	nop
    87  	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
    88  ')
    89  
        C Entry stubs, one per value of n mod 6.  Each stub biases rp and/or up
        C so that the fixed offsets used at its loop entry point address the
        C first limbs, preloads two limbs from rp into the w registers and one
        C limb from up into u0 or u1 as that entry point expects, and then jumps
        C into the unrolled loop.  The b2 stub has no branch; it falls through
        C into top.
    90  L(b5):	add	up, up, #-8
    91  	ldr	w1, [rp, #0]
    92  	ldr	w2, [rp, #4]
    93  	ldr	u1, [up, #8]
    94  	b	L(lo5)
    95  
    96  L(b4):	add	rp, rp, #-4
    97  	add	up, up, #-12
    98  	ldr	w2, [rp, #4]
    99  	ldr	w0, [rp, #8]
   100  	ldr	u0, [up, #12]
   101  	b	L(lo4)
   102  
   103  L(b3):	add	rp, rp, #-8
   104  	add	up, up, #-16
   105  	ldr	w0, [rp, #8]
   106  	ldr	w1, [rp, #12]
   107  	ldr	u1, [up, #16]
   108  	b	L(lo3)
   109  
   110  L(b1):	add	rp, rp, #8
   111  	ldr	w2, [rp, #-8]
   112  	ldr	w0, [rp, #-4]
   113  	ldr	u1, [up, #0]
   114  	b	L(lo1)
   115  
   116  L(b0):	add	rp, rp, #4
   117  	add	up, up, #-4
   118  	ldr	w0, [rp, #-4]
   119  	ldr	w1, [rp, #0]
   120  	ldr	u0, [up, #4]
   121  	b	L(lo0)
   122  
   123  L(b2):	add	rp, rp, #12
   124  	add	up, up, #4
   125  	ldr	w1, [rp, #-12]
   126  	ldr	w2, [rp, #-8]
   127  	ldr	u0, [up, #-4]
   128  
        C Main loop, unrolled six times.  Every step handles one limb u of up:
        C   ldr   wC from rp            the next limb to accumulate into
        C   umaal wA, cy0, u, v0        third and last product for wA
        C   ldr   the other u from up   the limb for the following step
        C   umaal wB, cy1, u, v1        second product for wB
        C   str   wA to rp              wA is complete
        C   umaal wC, cy2, u, v2        first product for wC
        C umaal computes hi:lo = a * b + lo + hi with lo the first operand and hi
        C the second, so the low word stays in the w register and the high word
        C moves on in cy0, cy1 or cy2.  The roles of w0-w2 rotate every step and
        C u0 and u1 alternate.  The labels lo1, lo0, lo5, lo4 and lo3 are the
        C entry points used by the stubs above.
   129  	ALIGN(16)
   130  L(top):	ldr	w0, [rp, #-4]
   131  	umaal	w1, cy0, u0, v0
   132  	ldr	u1, [up, #0]
   133  	umaal	w2, cy1, u0, v1
   134  	str	w1, [rp, #-12]
   135  	umaal	w0, cy2, u0, v2
   136  L(lo1):	ldr	w1, [rp, #0]
   137  	umaal	w2, cy0, u1, v0
   138  	ldr	u0, [up, #4]
   139  	umaal	w0, cy1, u1, v1
   140  	str	w2, [rp, #-8]
   141  	umaal	w1, cy2, u1, v2
   142  L(lo0):	ldr	w2, [rp, #4]
   143  	umaal	w0, cy0, u0, v0
   144  	ldr	u1, [up, #8]
   145  	umaal	w1, cy1, u0, v1
   146  	str	w0, [rp, #-4]
   147  	umaal	w2, cy2, u0, v2
   148  L(lo5):	ldr	w0, [rp, #8]
   149  	umaal	w1, cy0, u1, v0
   150  	ldr	u0, [up, #12]
   151  	umaal	w2, cy1, u1, v1
   152  	str	w1, [rp, #0]
   153  	umaal	w0, cy2, u1, v2
   154  L(lo4):	ldr	w1, [rp, #12]
   155  	umaal	w2, cy0, u0, v0
   156  	ldr	u1, [up, #16]
   157  	umaal	w0, cy1, u0, v1
   158  	str	w2, [rp, #4]
   159  	umaal	w1, cy2, u0, v2
   160  L(lo3):	ldr	w2, [rp, #16]
   161  	umaal	w0, cy0, u1, v0
   162  	ldr	u0, [up, #20]
   163  	umaal	w1, cy1, u1, v1
   164  	str	w0, [rp, #8]
   165  	umaal	w2, cy2, u1, v2
        C Loop bookkeeping: take 6 off the biased count, advance both pointers
        C by 24 bytes (six limbs), and go round again while the count is still
        C non-negative.  The add instructions do not touch the flags set by subs.
   166  L(lo2):	subs	n, n, #6
   167  	add	up, up, #24
   168  	add	rp, rp, #24
   169  	bge	L(top)
   170  
        C Wind-down.  Two limbs of up remain: u0, already loaded by the last
        C loop step, and u1, loaded here.  w1 and w2 still hold limbs loaded
        C from rp, while w0 starts from zero instead of a loaded limb.  The
        C final umaal folds the last product straight into cy1 and cy2.
   171  L(end):	umaal	w1, cy0, u0, v0
   172  	ldr	u1, [up, #0]
   173  	umaal	w2, cy1, u0, v1
   174  	str	w1, [rp, #-12]
   175  	mov	w0, #0
   176  	umaal	w0, cy2, u0, v2
   177  	umaal	w2, cy0, u1, v0
   178  	umaal	w0, cy1, u1, v1
   179  	str	w2, [rp, #-8]
   180  	umaal	cy1, cy2, u1, v2
        C Ripple the leftover carries upward: cy0 into w0, then the carry flag
        C into cy1 and into cy2.  The str instructions in between leave the
        C flags alone.  The last sum lands in r0 as the function result.
   181  	adds	w0, w0, cy0
   182  	str	w0, [rp, #-4]
   183  	adcs	w1, cy1, #0
   184  	str	w1, [rp, #0]
   185  	adc	r0, cy2, #0
   186  
        C Restore r4-r11 and return by popping the saved r14 into pc.
   187  	pop	{ r4-r11, pc }
   188  EPILOGUE()