github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/arm/v6/sqr_basecase.asm (about)

     1  dnl  ARM v6 mpn_sqr_basecase.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C Code structure:
    36  C
    37  C
    38  C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
    39  C           |               |               |               |
    40  C           |               |               |               |
    41  C           |               |               |               |
    42  C          \|/             \|/             \|/             \|/
    43  C              ____________                   ____________
    44  C             /            \                 /            \
    45  C            \|/            \               \|/            \
    46  C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
    47  C            \            /|\                \            /|\
    48  C             \____________/                  \____________/
    49  C                       \                        /
    50  C                        \                      /
    51  C                         \                    /
    52  C                         cor3             cor2
    53  C                            \              /
    54  C                             \            /
    55  C                            sqr_diag_addlsh1
    56  
    57  C TODO
    58  C  * Align more labels.
    59  C  * Further tweak counter and updates in outer loops.  (This could save
    60  C    perhaps 5n cycles).
    61  C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
    62  C    initialise loop counter i with a right shift.
    63  C  * Try to use fewer registers.  Perhaps coalesce r10 branch target and n_saved.
    64  C    (This could save 2-3 cycles for n > 4.)
    65  C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
    66  C    propagation.
    67  C  * Stop loops earlier suppressing writes of upper-most rp[] values.
    68  C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
    69  C    particularly on Cortex-A8.
    70  
    71  
    72  define(`rp',      r0)		C destination pointer; all stores go through it
    73  define(`up',      r1)		C source pointer; operand limbs are loaded through it
    74  define(`n',       r2)		C limb count on entry; reduced as the passes get shorter
    75  
    76  define(`v0',      r3)		C first multiplier limb of the current two-limb pass
    77  define(`v1',      r6)		C second multiplier limb of the current two-limb pass
    78  define(`i',       r8)		C inner loop counter, stepped down by 4 per iteration
    79  define(`n_saved', r14)		C copy of the entry value of n, read again by sqr_diag_addlsh1
    80  define(`cya',     r11)		C carry limb of the umaal chain that multiplies by v0
    81  define(`cyb',     r12)		C carry limb of the umaal chain that multiplies by v1
    82  define(`u0',      r7)		C multiplicand limb loaded from up (alternates with u1)
    83  define(`u1',      r9)		C multiplicand limb loaded from up (alternates with u0)
    84  
    85  ASM_START()
    86  PROLOGUE(mpn_sqr_basecase)
        C Entry point.  r0 = rp, r1 = up, r2 = n per the defines above.  The code
        C below selects one of eight paths from n: straight-line code for n <= 4,
        C or one of four loop entry stubs chosen by n mod 4 when n > 4.
    87  	and	r12, n, #3		C r12 = n mod 4
    88  	cmp	n, #4
    89  	addgt	r12, r12, #4		C n > 4: use the second half of the table
    90  	add	pc, pc, r12, lsl #2	C computed branch; pc reads as this insn + 8, the first table entry
    91  	nop				C filler between the add and the table; never executed
    92  	b	L(4)			C index 0: n mod 4 = 0 and n <= 4
    93  	b	L(1)			C index 1: n = 1
    94  	b	L(2)			C index 2: n = 2
    95  	b	L(3)			C index 3: n = 3
    96  	b	L(0m4)			C index 4: n > 4, n mod 4 = 0
    97  	b	L(1m4)			C index 5: n > 4, n mod 4 = 1
    98  	b	L(2m4)			C index 6: n > 4, n mod 4 = 2
    99  	b	L(3m4)			C index 7: n > 4, n mod 4 = 3
   100  
   101  
   102  L(1m4):	push	{r4-r11, r14}		C entry stub for n > 4 with n mod 4 = 1
   103  	mov	n_saved, n		C r14 was pushed above, so it can hold the original n
   104  	sub	i, n, #4		C counter for the mul_2 loop
   105  	sub	n, n, #2
   106  	add	r10, pc, #L(am2_2m4)-.-8	C r10 = address of L(am2_2m4), used by the bx r10 after the mul_2 loop
   107  	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
   108  	sub	up, up, #4		C bias up to match the offsets used from L(ko0) onward
   109  	mov	cyb, #0
   110  	mov	r5, #0
   111  	umull	r4, cya, v1, v0		C cya:r4 = up[1] * up[0]
   112  	str	r4, [rp], #-12		C store the low limb at rp[0], then step rp back by 12 bytes
   113  	mov	r4, #0
   114  	b	L(ko0)			C join the unrolled mul_2 loop at its fourth quarter
   115  
   116  L(3m4):	push	{r4-r11, r14}		C entry stub for n > 4 with n mod 4 = 3
   117  	mov	n_saved, n		C r14 was pushed above, so it can hold the original n
   118  	sub	i, n, #4		C counter for the mul_2 loop
   119  	sub	n, n, #2
   120  	add	r10, pc, #L(am2_0m4)-.-8	C r10 = address of L(am2_0m4), used by the bx r10 after the mul_2 loop
   121  	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
   122  	add	up, up, #4		C bias up to match the offsets used from L(ko2) onward
   123  	mov	cyb, #0
   124  	mov	r5, #0
   125  	umull	r4, cya, v1, v0		C cya:r4 = up[1] * up[0]
   126  	str	r4, [rp], #-4		C store the low limb at rp[0], then step rp back by 4 bytes
   127  	mov	r4, #0
   128  	b	L(ko2)			C join the unrolled mul_2 loop at its second quarter
   129  
   130  L(2m4):	push	{r4-r11, r14}		C entry stub for n > 4 with n mod 4 = 2
   131  	mov	n_saved, n		C r14 was pushed above, so it can hold the original n
   132  	sub	i, n, #4		C counter for the mul_2 loop
   133  	sub	n, n, #2
   134  	add	r10, pc, #L(am2_3m4)-.-8	C r10 = address of L(am2_3m4), used by the bx r10 after the mul_2 loop
   135  	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
   136  	mov	cyb, #0
   137  	mov	r4, #0
   138  	umull	r5, cya, v1, v0		C cya:r5 = up[1] * up[0]
   139  	str	r5, [rp], #-8		C store the low limb at rp[0], then step rp back by 8 bytes
   140  	mov	r5, #0
   141  	b	L(ko1)			C join the unrolled mul_2 loop at its third quarter
   142  
   143  L(0m4):	push	{r4-r11, r14}		C entry stub for n > 4 with n mod 4 = 0
   144  	mov	n_saved, n		C r14 was pushed above, so it can hold the original n
   145  	sub	i, n, #4		C counter for the mul_2 loop
   146  	sub	n, n, #2
   147  	add	r10, pc, #L(am2_1m4)-.-8	C r10 = address of L(am2_1m4), used by the bx r10 after the mul_2 loop
   148  	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
   149  	mov	cyb, #0
   150  	mov	r4, #0
   151  	add	up, up, #8		C up now addresses the limb held in u1
   152  	umull	r5, cya, v1, v0		C cya:r5 = up[1] * up[0]
   153  	str	r5, [rp, #0]		C store the low limb at rp[0]; rp is left unbiased here
   154  	mov	r5, #0			C no branch follows: execution falls into L(top)
   155  
   156  L(top):	ldr	u0, [up, #4]		C mul_2 loop, four limbs per trip; the stubs enter at top, ko2, ko1 or ko0
   157  	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
   158  	str	r4, [rp, #4]
   159  	mov	r4, #0			C nothing is loaded from rp in this loop, so the accumulator restarts at 0
   160  	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
   161  L(ko2):	ldr	u1, [up, #8]
   162  	umaal	r5, cya, u0, v0
   163  	str	r5, [rp, #8]
   164  	mov	r5, #0
   165  	umaal	r4, cyb, u0, v1
   166  L(ko1):	ldr	u0, [up, #12]
   167  	umaal	r4, cya, u1, v0
   168  	str	r4, [rp, #12]
   169  	mov	r4, #0
   170  	umaal	r5, cyb, u1, v1
   171  L(ko0):	ldr	u1, [up, #16]!		C pre-indexed with writeback: up advances by 4 limbs
   172  	umaal	r5, cya, u0, v0
   173  	str	r5, [rp, #16]!		C pre-indexed with writeback: rp advances by 4 limbs
   174  	mov	r5, #0
   175  	umaal	r4, cyb, u0, v1
   176  	subs	i, i, #4
   177  	bhi	L(top)			C repeat while i is still above zero after the subtraction
   178  
   179  	umaal	r4, cya, u1, v0		C wind-down: the two remaining multiplicand limbs, no further loop
   180  	ldr	u0, [up, #4]
   181  	umaal	r5, cyb, u1, v1
   182  	str	r4, [rp, #4]
   183  	umaal	r5, cya, u0, v0
   184  	umaal	cya, cyb, u0, v1	C cyb:cya = u0 * v1 + cya + cyb, the two top limbs
   185  	str	r5, [rp, #8]
   186  	str	cya, [rp, #12]
   187  	str	cyb, [rp, #16]
   188  
   189  	add	up, up, #4
   190  	sub	n, n, #1
   191  	add	rp, rp, #8
   192  	bx	r10			C continue at the L(am2_*) label whose address the entry stub put in r10
   193  
   194  L(evnloop):
        C Outer loop for the am2_2m4 / am2_0m4 pair of the diagram at the top of
        C the file.  One trip runs two addmul_2 passes (inner loops ua2 and ua0).
        C Each pass loads limbs from rp, adds products of v0 and v1 with up[]
        C limbs via umaal, stores them back, and lowers n by 2.
   195  	subs	i, n, #6
   196  	sub	n, n, #2		C plain sub: the flags of the subs above stay intact
   197  	blt	L(cor2)			C leave for the straight-line tail when n - 6 is negative
   198  	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
   199  	add	up, up, #8
   200  	mov	cya, #0
   201  	mov	cyb, #0
   202  	ldr	r4, [rp, #-4]
   203  	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + rp[-1]
   204  	str	r4, [rp, #-4]
   205  	ldr	r4, [rp, #0]
   206  
   207  	ALIGN(16)
   208  L(ua2):	ldr	r5, [rp, #4]		C addmul_2 inner loop, four limbs per trip
   209  	umaal	r4, cya, u1, v0
   210  	ldr	u0, [up, #4]
   211  	umaal	r5, cyb, u1, v1
   212  	str	r4, [rp, #0]
   213  	ldr	r4, [rp, #8]
   214  	umaal	r5, cya, u0, v0
   215  	ldr	u1, [up, #8]
   216  	umaal	r4, cyb, u0, v1
   217  	str	r5, [rp, #4]
   218  	ldr	r5, [rp, #12]
   219  	umaal	r4, cya, u1, v0
   220  	ldr	u0, [up, #12]
   221  	umaal	r5, cyb, u1, v1
   222  	str	r4, [rp, #8]
   223  	ldr	r4, [rp, #16]!		C writeback: rp advances by 4 limbs
   224  	umaal	r5, cya, u0, v0
   225  	ldr	u1, [up, #16]!		C writeback: up advances by 4 limbs
   226  	umaal	r4, cyb, u0, v1
   227  	str	r5, [rp, #-4]		C rp has already advanced, hence the negative offset
   228  	subs	i, i, #4
   229  	bhs	L(ua2)			C repeat while the subtraction did not borrow
   230  
   231  	umaal	r4, cya, u1, v0		C wind-down of the ua2 pass
   232  	umaal	cya, cyb, u1, v1	C cyb:cya = the two top limbs of this pass
   233  	str	r4, [rp, #0]
   234  	str	cya, [rp, #4]
   235  	str	cyb, [rp, #8]
   236  L(am2_0m4):				C also reached by bx r10 from the mul_2 code when L(3m4) was the entry
   237  	sub	rp, rp, n, lsl #2	C step rp back by n limbs
   238  	sub	up, up, n, lsl #2	C step up back by n limbs
   239  	add	rp, rp, #8
   240  
   241  	sub	i, n, #4
   242  	sub	n, n, #2
   243  	ldm	up, {v0,v1,u1}		C v0 = up[0], v1 = up[1], u1 = up[2]
   244  	mov	cya, #0
   245  	mov	cyb, #0
   246  	ldr	r4, [rp, #4]
   247  	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + rp[1]
   248  	str	r4, [rp, #4]
   249  	ldr	r4, [rp, #8]
   250  	b	L(lo0)			C enter the unrolled loop at its midpoint
   251  
   252  	ALIGN(16)
   253  L(ua0):	ldr	r5, [rp, #4]		C addmul_2 inner loop, four limbs per trip
   254  	umaal	r4, cya, u1, v0
   255  	ldr	u0, [up, #4]
   256  	umaal	r5, cyb, u1, v1
   257  	str	r4, [rp, #0]
   258  	ldr	r4, [rp, #8]
   259  	umaal	r5, cya, u0, v0
   260  	ldr	u1, [up, #8]
   261  	umaal	r4, cyb, u0, v1
   262  	str	r5, [rp, #4]
   263  L(lo0):	ldr	r5, [rp, #12]
   264  	umaal	r4, cya, u1, v0
   265  	ldr	u0, [up, #12]
   266  	umaal	r5, cyb, u1, v1
   267  	str	r4, [rp, #8]
   268  	ldr	r4, [rp, #16]!		C writeback: rp advances by 4 limbs
   269  	umaal	r5, cya, u0, v0
   270  	ldr	u1, [up, #16]!		C writeback: up advances by 4 limbs
   271  	umaal	r4, cyb, u0, v1
   272  	str	r5, [rp, #-4]
   273  	subs	i, i, #4
   274  	bhs	L(ua0)			C repeat while the subtraction did not borrow
   275  
   276  	umaal	r4, cya, u1, v0		C wind-down of the ua0 pass
   277  	umaal	cya, cyb, u1, v1	C cyb:cya = the two top limbs of this pass
   278  	str	r4, [rp, #0]
   279  	str	cya, [rp, #4]
   280  	str	cyb, [rp, #8]
   281  L(am2_2m4):				C also reached by bx r10 from the mul_2 code when L(1m4) was the entry
   282  	sub	rp, rp, n, lsl #2	C step rp back by n limbs
   283  	sub	up, up, n, lsl #2	C step up back by n limbs
   284  	add	rp, rp, #16
   285  	b	L(evnloop)
   286  
   287  
   288  L(oddloop):
        C Outer loop for the am2_3m4 / am2_1m4 pair of the diagram at the top of
        C the file.  One trip runs two addmul_2 passes (inner loops ua1 and ua3).
        C Each pass loads limbs from rp, adds products of v0 and v1 with up[]
        C limbs via umaal, stores them back, and lowers n by 2.  The exit test
        C sits at L(am2_3m4).
   289  	sub	i, n, #5
   290  	sub	n, n, #2
   291  	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
   292  	mov	cya, #0
   293  	mov	cyb, #0
   294  	ldr	r5, [rp, #0]
   295  	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + rp[0]
   296  	str	r5, [rp, #0]
   297  	ldr	r5, [rp, #4]
   298  	add	up, up, #4
   299  	b	L(lo1)			C enter the unrolled loop after its first quarter
   300  
   301  	ALIGN(16)
   302  L(ua1):	ldr	r5, [rp, #4]		C addmul_2 inner loop, four limbs per trip
   303  	umaal	r4, cya, u1, v0
   304  	ldr	u0, [up, #4]
   305  	umaal	r5, cyb, u1, v1
   306  	str	r4, [rp, #0]
   307  L(lo1):	ldr	r4, [rp, #8]
   308  	umaal	r5, cya, u0, v0
   309  	ldr	u1, [up, #8]
   310  	umaal	r4, cyb, u0, v1
   311  	str	r5, [rp, #4]
   312  	ldr	r5, [rp, #12]
   313  	umaal	r4, cya, u1, v0
   314  	ldr	u0, [up, #12]
   315  	umaal	r5, cyb, u1, v1
   316  	str	r4, [rp, #8]
   317  	ldr	r4, [rp, #16]!		C writeback: rp advances by 4 limbs
   318  	umaal	r5, cya, u0, v0
   319  	ldr	u1, [up, #16]!		C writeback: up advances by 4 limbs
   320  	umaal	r4, cyb, u0, v1
   321  	str	r5, [rp, #-4]		C rp has already advanced, hence the negative offset
   322  	subs	i, i, #4
   323  	bhs	L(ua1)			C repeat while the subtraction did not borrow
   324  
   325  	umaal	r4, cya, u1, v0		C wind-down of the ua1 pass
   326  	umaal	cya, cyb, u1, v1	C cyb:cya = the two top limbs of this pass
   327  	str	r4, [rp, #0]
   328  	str	cya, [rp, #4]
   329  	str	cyb, [rp, #8]
   330  L(am2_3m4):				C also reached by bx r10 from the mul_2 code when L(2m4) was the entry
   331  	sub	rp, rp, n, lsl #2	C step rp back by n limbs
   332  	sub	up, up, n, lsl #2	C step up back by n limbs
   333  	add	rp, rp, #4
   334  
   335  	subs	i, n, #3
   336  	beq	L(cor3)			C n = 3: finish in the straight-line tail instead of looping
   337  	sub	n, n, #2
   338  	ldm	up, {v0,v1,u0}		C v0 = up[0], v1 = up[1], u0 = up[2]
   339  	mov	cya, #0
   340  	mov	cyb, #0
   341  	ldr	r5, [rp, #8]
   342  	sub	up, up, #4
   343  	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + rp[2]
   344  	str	r5, [rp, #8]
   345  	ldr	r5, [rp, #12]
   346  	b	L(lo3)			C enter the unrolled loop at its last quarter
   347  
   348  	ALIGN(16)
   349  L(ua3):	ldr	r5, [rp, #4]		C addmul_2 inner loop, four limbs per trip
   350  	umaal	r4, cya, u1, v0
   351  	ldr	u0, [up, #4]
   352  	umaal	r5, cyb, u1, v1
   353  	str	r4, [rp, #0]
   354  	ldr	r4, [rp, #8]
   355  	umaal	r5, cya, u0, v0
   356  	ldr	u1, [up, #8]
   357  	umaal	r4, cyb, u0, v1
   358  	str	r5, [rp, #4]
   359  	ldr	r5, [rp, #12]
   360  	umaal	r4, cya, u1, v0
   361  	ldr	u0, [up, #12]
   362  	umaal	r5, cyb, u1, v1
   363  	str	r4, [rp, #8]
   364  L(lo3):	ldr	r4, [rp, #16]!		C writeback: rp advances by 4 limbs
   365  	umaal	r5, cya, u0, v0
   366  	ldr	u1, [up, #16]!		C writeback: up advances by 4 limbs
   367  	umaal	r4, cyb, u0, v1
   368  	str	r5, [rp, #-4]
   369  	subs	i, i, #4
   370  	bhs	L(ua3)			C repeat while the subtraction did not borrow
   371  
   372  	umaal	r4, cya, u1, v0		C wind-down of the ua3 pass
   373  	umaal	cya, cyb, u1, v1	C cyb:cya = the two top limbs of this pass
   374  	str	r4, [rp, #0]
   375  	str	cya, [rp, #4]
   376  	str	cyb, [rp, #8]
   377  L(am2_1m4):				C also reached by bx r10 from the mul_2 code when L(0m4) was the entry
   378  	sub	rp, rp, n, lsl #2	C step rp back by n limbs
   379  	sub	up, up, n, lsl #2	C step up back by n limbs
   380  	add	rp, rp, #12
   381  	b	L(oddloop)
   382  
   383  
   384  L(cor3):ldm	up, {v0,v1,u0}		C straight-line tail reached from L(am2_3m4); v0 = up[0], v1 = up[1], u0 = up[2]
   385  	ldr	r5, [rp, #8]
   386  	mov	cya, #0
   387  	mov	cyb, #0
   388  	umaal	r5, cya, v1, v0		C cya:r5 = v1 * v0 + rp[2]
   389  	str	r5, [rp, #8]
   390  	ldr	r5, [rp, #12]
   391  	ldr	r4, [rp, #16]
   392  	umaal	r5, cya, u0, v0		C cya:r5 = u0 * v0 + rp[3] + cya
   393  	ldr	u1, [up, #12]		C u1 = up[3]
   394  	umaal	r4, cyb, u0, v1		C cyb:r4 = u0 * v1 + rp[4]
   395  	str	r5, [rp, #12]
   396  	umaal	r4, cya, u1, v0
   397  	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
   398  	str	r4, [rp, #16]
   399  	str	cya, [rp, #20]
   400  	str	cyb, [rp, #24]
   401  	add	up, up, #16
   402  	mov	cya, cyb		C the top limb just stored becomes the addend of the last product
   403  	adds	rp, rp, #36		C clear cy
   404  	mov	cyb, #0
   405  	umaal	cya, cyb, u1, u0	C cyb:cya = up[3] * up[2] + cya
   406  	b	L(sqr_diag_addlsh1)	C its first two instructions store cya and cyb
   407  
   408  L(cor2):
        C Straight-line tail reached from the blt at the top of L(evnloop).
        C It handles the products among three freshly loaded limbs and then
        C falls into L(sqr_diag_addlsh1).
   409  	ldm	up!, {v0,v1,u0}		C load three limbs; writeback advances up past them
   410  	mov	r4, cya			C cya and cyb are unchanged since the wind-down of the preceding pass
   411  	mov	r5, cyb
   412  	mov	cya, #0
   413  	umaal	r4, cya, v1, v0		C cya:r4 = v1 * v0 + r4
   414  	mov	cyb, #0
   415  	umaal	r5, cya, u0, v0		C cya:r5 = u0 * v0 + r5 + cya
   416  	strd	r4, r5, [rp, #-4]	C r4 goes to rp[-1], r5 to rp[0]
   417  	umaal	cya, cyb, u0, v1	C cyb:cya = u0 * v1 + cya
   418  	add	rp, rp, #16
   419  C	b	L(sqr_diag_addlsh1)	(branch left commented out: execution falls through to that label)
   420  
   421  
   422  define(`w0',  r6)		C limb read back from rp; same register as v1
   423  define(`w1',  r7)		C limb read back from rp; same register as u0
   424  define(`w2',  r8)		C carry out of the doubling; same register as i
   425  define(`rbx', r9)		C high square limb plus doubling carry; same register as u1
   426  
   427  L(sqr_diag_addlsh1):
        C Final pass.  Each limb read back from rp is doubled (adcs x, x, x) and
        C the square of one up[] limb (umull with both operands r3) is added in.
        C Two result limbs are completed per trip.
   428  	str	cya, [rp, #-12]		C store the two carry limbs left in registers by cor2 / cor3
   429  	str	cyb, [rp, #-8]
   430  	sub	n, n_saved, #1		C trip count = original n - 1
   431  	sub	up, up, n_saved, lsl #2	C step up back by the original n limbs
   432  	sub	rp, rp, n_saved, lsl #3	C step rp back by twice the original n limbs
   433  	ldr	r3, [up], #4		C post-indexed: load a limb, then advance up
   434  	umull	w1, r5, r3, r3		C r5:w1 = square of that limb
   435  	mov	w2, #0
   436  	mov	r10, #0			C constant zero for the adc below (r10 is no longer a branch target)
   437  C	cmn	r0, #0			C clear cy (already clear)
   438  	b	L(lm)
   439  
   440  L(tsd):	adds	w0, w0, rbx		C add high limb of the previous square plus the previous doubling carry
   441  	adcs	w1, w1, r4		C add low limb of the current square; its carry out feeds the next doubling
   442  	str	w0, [rp, #0]
   443  L(lm):	ldr	w0, [rp, #4]
   444  	str	w1, [rp, #4]		C the slot just read into w0 receives the finished limb in w1
   445  	ldr	w1, [rp, #8]!		C writeback: rp advances by 2 limbs
   446  	add	rbx, r5, w2		C plain add: flags are left as they were
   447  	adcs	w0, w0, w0		C double with carry in; the ldr/str/add since L(tsd) do not touch flags
   448  	ldr	r3, [up], #4
   449  	adcs	w1, w1, w1
   450  	adc	w2, r10, r10		C w2 = carry out of the doubling, captured before subs rewrites the flags
   451  	umull	r4, r5, r3, r3		C r5:r4 = square of the next limb
   452  	subs	n, n, #1
   453  	bne	L(tsd)
   454  
   455  	adds	w0, w0, rbx		C last trip: same additions as at L(tsd)
   456  	adcs	w1, w1, r4
   457  	adc	w2, r5, w2		C top limb = high square limb + doubling carry + carry in
   458  	stm	rp, {w0,w1,w2}		C store the final three limbs
   459  
   460  	pop	{r4-r11, pc}		C restore registers and return; pc takes the pushed r14
   461  
   462  
   463  C Straight line code for n <= 4
   464  
   465  L(1):	ldr	r3, [up, #0]		C n = 1: r3 = up[0]
   466  	umull	r1, r2, r3, r3		C r2:r1 = up[0]^2; r1 (up) and r2 (n) are reused as outputs
   467  	stm	rp, {r1,r2}		C rp[0] = low limb, rp[1] = high limb
   468  	bx	r14			C return; nothing was pushed on this path
   469  
   470  L(2):	push	{r4-r5}			C n = 2: only r4 and r5 need saving on this path
   471  	ldm	up, {r5,r12}		C r5 = up[0], r12 = up[1]
   472  	umull	r1, r2, r5, r5		C r2:r1 = up[0]^2
   473  	umull	r3, r4, r12, r12	C r4:r3 = up[1]^2
   474  	umull	r5, r12, r5, r12	C r12:r5 = up[0] * up[1]
   475  	adds	r5, r5, r5		C double the cross product
   476  	adcs	r12, r12, r12
   477  	adc	r4, r4, #0		C carry out of the doubling goes into the top limb
   478  	adds	r2, r2, r5		C add the doubled cross product at limb 1
   479  	adcs	r3, r3, r12
   480  	adc	r4, r4, #0
   481  	stm	rp, {r1,r2,r3,r4}	C store the four result limbs
   482  	pop	{r4-r5}
   483  	bx	r14
   484  
   485  L(3):	push	{r4-r11}		C n = 3: r14 is not pushed, the return is bx r14
   486  	ldm	up, {r7,r8,r9}		C r7 = up[0], r8 = up[1], r9 = up[2]
   487  	umull	r1, r2, r7, r7		C r2:r1 = up[0]^2
   488  	umull	r3, r4, r8, r8		C r4:r3 = up[1]^2
   489  	umull	r5, r6, r9, r9		C r6:r5 = up[2]^2
   490  	umull	r10, r11, r7, r8	C r11:r10 = up[0] * up[1]
   491  	mov	r12, #0
   492  	umlal	r11, r12, r7, r9	C r12:r11 += up[0] * up[2]
   493  	mov	r7, #0			C up[0] is no longer needed; r7 becomes the top cross limb
   494  	umlal	r12, r7, r8, r9		C r7:r12 += up[1] * up[2]
   495  	adds	r10, r10, r10		C double the cross-product sum held in r10, r11, r12, r7
   496  	adcs	r11, r11, r11
   497  	adcs	r12, r12, r12
   498  	adcs	r7, r7, r7
   499  	adc	r6, r6, #0		C carry out of the doubling goes into the top limb
   500  	adds	r2, r2, r10		C add the doubled sum to the squares, starting at limb 1
   501  	adcs	r3, r3, r11
   502  	adcs	r4, r4, r12
   503  	adcs	r5, r5, r7
   504  	adc	r6, r6, #0
   505  	stm	rp, {r1,r2,r3,r4,r5,r6}	C store the six result limbs
   506  	pop	{r4-r11}
   507  	bx	r14
   508  
   509  L(4):	push	{r4-r11, r14}		C n = 4: fully unrolled; r14 is saved because it is used as a temporary
   510  	ldm	up, {r9,r10,r11,r12}	C r9..r12 = up[0..3]; r1 (up) and r2 (n) are free after this
   511  	umull	r1, r2, r9, r9		C r2:r1 = up[0]^2
   512  	umull	r3, r4, r10, r10	C r4:r3 = up[1]^2
   513  	umull	r5, r6, r11, r11	C r6:r5 = up[2]^2
   514  	umull	r7, r8, r12, r12	C r8:r7 = up[3]^2
   515  	stm	rp, {r1,r2,r3,r4,r5,r6,r7}	C park square limbs 0..6 in rp; limb 7 stays in r8
   516  	umull	r1, r2, r9, r10		C r2:r1 = up[0] * up[1]; r1..r6 now collect the cross products
   517  	mov	r3, #0
   518  	umlal	r2, r3, r9, r11		C r3:r2 += up[0] * up[2]
   519  	mov	r4, #0
   520  	umlal	r3, r4, r9, r12		C r4:r3 += up[0] * up[3]
   521  	mov	r5, #0
   522  	umlal	r3, r5, r10, r11	C r5:r3 += up[1] * up[2]
   523  	umaal	r4, r5, r10, r12	C r5:r4 = up[1] * up[3] + r4 + r5
   524  	mov	r6, #0
   525  	umlal	r5, r6, r11, r12	C r6:r5 += up[2] * up[3]
   526  	adds	r1, r1, r1		C double the cross-product sum
   527  	adcs	r2, r2, r2
   528  	adcs	r3, r3, r3
   529  	adcs	r4, r4, r4
   530  	adcs	r5, r5, r5
   531  	adcs	r6, r6, r6
   532  	add	rp, rp, #4		C plain add keeps the carry; rp now addresses limb 1
   533  	adc	r7, r8, #0		C r7 = top square limb + carry out of the doubling
   534  	ldm	rp, {r8,r9,r10,r11,r12,r14}	C reload square limbs 1..6
   535  	adds	r1, r1, r8		C add the squares to the doubled cross products
   536  	adcs	r2, r2, r9
   537  	adcs	r3, r3, r10
   538  	adcs	r4, r4, r11
   539  	adcs	r5, r5, r12
   540  	adcs	r6, r6, r14
   541  	adc	r7, r7, #0
   542  	stm	rp, {r1,r2,r3,r4,r5,r6,r7}	C write result limbs 1..7; limb 0 was stored by the earlier stm
   543  	pop	{r4-r11, pc}		C restore registers and return; pc takes the pushed r14
   544  EPILOGUE()