github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/sub_n.asm (about)

     1  dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
     2  dnl  and store difference in a third limb vector.
     3  
     4  dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.
     5  
     6  dnl  This file is part of the GNU MP Library.
     7  dnl
     8  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     9  dnl  it under the terms of either:
    10  dnl
    11  dnl    * the GNU Lesser General Public License as published by the Free
    12  dnl      Software Foundation; either version 3 of the License, or (at your
    13  dnl      option) any later version.
    14  dnl
    15  dnl  or
    16  dnl
    17  dnl    * the GNU General Public License as published by the Free Software
    18  dnl      Foundation; either version 2 of the License, or (at your option) any
    19  dnl      later version.
    20  dnl
    21  dnl  or both in parallel, as here.
    22  dnl
    23  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    24  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    25  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    26  dnl  for more details.
    27  dnl
    28  dnl  You should have received copies of the GNU General Public License and the
    29  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    30  dnl  see https://www.gnu.org/licenses/.
    31  
    32  include(`../config.m4')
    33  
    34  C      cycles/limb
    35  C EV4:     ?
    36  C EV5:     5.4
    37  C EV6:     2.125
    38  
    39  C  INPUT PARAMETERS
    40  C  rp	r16
    41  C  up	r17
    42  C  vp	r18
    43  C  n	r19
    44  C  cy	r20   (for mpn_sub_nc)
    45  
    46  C TODO
    47  C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
    48  C   Use multi-pronged feed-in.
    49  C   Perform additional micro-tuning
    50  
    51  C  This code was written in cooperation with ev6 pipeline expert Steve Root.
    52  
    53  C  Pair loads and stores where possible
    54  C  Store pairs oct-aligned where possible (didn't need it here)
    55  C  Stores are delayed every third cycle
    56  C  Loads and stores are delayed by fills
    57  C  U stays still, put code there where possible (note alternation of U1 and U0)
    58  C  L moves because of loads and stores
    59  C  Note dampers in L to limit damage
    60  
    61  C  This odd-looking optimization expects that we have random bits in our
    62  C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
    63  C  case to help the common case.
    64  
    65  define(`u0', `r0')  define(`u1', `r3')
    66  define(`v0', `r1')  define(`v1', `r4')
        C u0/u1 and v0/v1 name the registers that receive the limb pairs loaded
        C from up (r17) and vp (r18).  The unrolled code alternates between the
        C two pairs.
    67  
    68  define(`cy0', `r20')  define(`cy1', `r21')
        C cy0/cy1 name two of the borrow registers.  cy0 is r20, which is also
        C the borrow-in argument of mpn_sub_nc.  r22 and r23 are used as borrow
        C registers as well, without aliases (see the TODO list above).
    69  
        C NOTE(review): the line below presumably tells the build machinery
        C which entry points this file provides; its definition is not in
        C this file, so confirm against the GMP m4 macro sources.
    70  MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)
    71  
    72  ASM_START()
        C mpn_sub_nc: subtract with a caller-supplied borrow-in.  The borrow-in
        C arrives in cy0 (r20), so this entry point only branches to the $entry
        C label inside mpn_sub_n, which lies just past the instruction that
        C clears cy0.  All remaining work and the return happen there.
    73  PROLOGUE(mpn_sub_nc)
    74  	br	r31,	$entry		C join mpn_sub_n with cy0 left as passed in
    75  EPILOGUE()
    76  PROLOGUE(mpn_sub_n)
        C mpn_sub_n: store up[] - vp[] into rp[] for n limbs (n > 0 per the file
        C header) and leave the borrow out in r0.
        C
        C Structure:
        C   - n < 5 is handled entirely by the one-limb-per-pass loop at $Lsmall.
        C   - Otherwise a feed-in section starts five limbs, $Loop processes
        C     eight limbs per pass while at least eight more remain, $Lend
        C     drains the limbs in flight, and any leftover limbs (0 to 7) are
        C     finished by the loop at $Lsmall.
        C
        C Borrow handling: each result is (u - v) - borrow_in.  cmpult u,v gives
        C the borrow of the raw u - v.  Subtracting borrow_in can only borrow
        C when the raw difference is exactly zero, so that case branches to an
        C out-of-line $fix label which ORs borrow_in into the borrow out.
    77  	bis	r31,	r31,	cy0	C clear borrow in
        C mpn_sub_nc branches to $entry with its borrow-in already in cy0.
    78  $entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
    79  	ldq	u1,	0(r17)		C L0 get next ones
    80  	ldq	v1,	0(r18)		C L1
    81  	bne	r22,	$Lsmall		C n < 5: simple loop only
    82  
        C Feed-in.  On reaching $Loop or $Lend:
        C   r24, r25  finished results for limbs 0 and 1, not yet stored
        C   r7        raw difference of limb 2
        C   r22       borrow out of limb 1
        C   r23       raw borrow of limb 2
        C   u0/v0     limb 3,  u1/v1  limb 4
    83  	ldq	u0,	8(r17)		C L0 get next ones
    84  	ldq	v0,	8(r18)		C L1
    85  	subq	u1,	v1,	r5	C U0 sub two data
    86  
    87  	cmpult	u1,	v1,	r23	C U0 did it borrow
    88  	ldq	u1,	16(r17)		C L0 get next ones
    89  	ldq	v1,	16(r18)		C L1
    90  
    91  	subq	u0,	v0,	r8	C U1 sub two data
    92  	subq	r5,	cy0,	r24	C U0 borrow in
    93  
    94  	cmpult	u0,	v0,	r22	C U1 did it borrow
    95  	beq	r5,	$fix5f		C U0 fix exact zero
    96  $ret5f:	ldq	u0,	24(r17)		C L0 get next ones
    97  	ldq	v0,	24(r18)		C L1
    98  
    99  	subq	r8,	r23,	r25	C U1 borrow from last
   100  	subq	u1,	v1,	r7	C U0 sub two data
   101  
   102  	beq	r8,	$fix6f		C U1 fix exact zero
   103  $ret6f:	cmpult	u1,	v1,	r23	C U0 did it borrow
   104  	ldq	u1,	32(r17)		C L0 get next ones
   105  	ldq	v1,	32(r18)		C L1
   106  
   107  	lda	r17,	40(r17)		C L0 move pointer
   108  	lda	r18,	40(r18)		C L1 move pointer
   109  
        C rp is biased down by one limb: the first pending result goes to
        C 8(r16).  The count is reduced by 13 = 5 limbs already started plus 8
        C limbs loaded by one pass of $Loop; a negative result skips $Loop.
   110  	lda	r16,	-8(r16)
   111  	lda	r19,	-13(r19)	C L1 move counter
   112  	blt	r19,	$Lend		C U1 loop control
   113  
   114  
   115  C Main loop.  8-way unrolled.
        C Each pass stores eight results and loads eight new limb pairs, so the
        C register state at the bottom matches the state described at feed-in,
        C shifted by eight limbs.
   116  	ALIGN(16)
   117  $Loop:	subq	u0,	v0,	r2	C U1 sub two data
   118  	stq	r24,	8(r16)		C L0 put an answer
   119  	subq	r7,	r22,	r24	C U0 borrow from last
   120  	stq	r25,	16(r16)		C L1 pair
   121  
   122  	cmpult	u0,	v0,	cy1	C U1 did it borrow
   123  	beq	r7,	$fix7		C U0 fix exact 0
   124  $ret7:	ldq	u0,	0(r17)		C L0 get next ones
   125  	ldq	v0,	0(r18)		C L1
   126  
   127  	bis	r31,	r31,	r31	C L  damp out
   128  	subq	r2,	r23,	r25	C U1 borrow from last
   129  	bis	r31,	r31,	r31	C L  moves in L !
   130  	subq	u1,	v1,	r5	C U0 sub two data
   131  
   132  	beq	r2,	$fix0		C U1 fix exact zero
   133  $ret0:	cmpult	u1,	v1,	cy0	C U0 did it borrow
   134  	ldq	u1,	8(r17)		C L0 get next ones
   135  	ldq	v1,	8(r18)		C L1
   136  
   137  	subq	u0,	v0,	r8	C U1 sub two data
   138  	stq	r24,	24(r16)		C L0 store pair
   139  	subq	r5,	cy1,	r24	C U0 borrow from last
   140  	stq	r25,	32(r16)		C L1
   141  
   142  	cmpult	u0,	v0,	r22	C U1 did it borrow
   143  	beq	r5,	$fix1		C U0 fix exact zero
   144  $ret1:	ldq	u0,	16(r17)		C L0 get next ones
   145  	ldq	v0,	16(r18)		C L1
   146  
        C rp advances by eight limbs here, mid-pass; the remaining four stores
        C of this pass therefore use offsets -24, -16, -8 and 0 from the new r16.
   147  	lda	r16,	64(r16)		C L0 move pointer
   148  	subq	r8,	cy0,	r25	C U1 borrow from last
   149  	lda	r19,	-8(r19)		C L1 move counter
   150  	subq	u1,	v1,	r7	C U0 sub two data
   151  
   152  	beq	r8,	$fix2		C U1 fix exact zero
   153  $ret2:	cmpult	u1,	v1,	r23	C U0 did it borrow
   154  	ldq	u1,	24(r17)		C L0 get next ones
   155  	ldq	v1,	24(r18)		C L1
   156  
   157  	subq	u0,	v0,	r2	C U1 sub two data
   158  	stq	r24,	-24(r16)	C L0 put an answer
   159  	subq	r7,	r22,	r24	C U0 borrow from last
   160  	stq	r25,	-16(r16)	C L1 pair
   161  
   162  	cmpult	u0,	v0,	cy1	C U1 did it borrow
   163  	beq	r7,	$fix3		C U0 fix exact 0
   164  $ret3:	ldq	u0,	32(r17)		C L0 get next ones
   165  	ldq	v0,	32(r18)		C L1
   166  
   167  	bis	r31,	r31,	r31	C L  damp out
   168  	subq	r2,	r23,	r25	C U1 borrow from last
   169  	bis	r31,	r31,	r31	C L  moves in L !
   170  	subq	u1,	v1,	r5	C U0 sub two data
   171  
   172  	beq	r2,	$fix4		C U1 fix exact zero
   173  $ret4:	cmpult	u1,	v1,	cy0	C U0 did it borrow
   174  	ldq	u1,	40(r17)		C L0 get next ones
   175  	ldq	v1,	40(r18)		C L1
   176  
   177  	subq	u0,	v0,	r8	C U1 sub two data
   178  	stq	r24,	-8(r16)		C L0 store pair
   179  	subq	r5,	cy1,	r24	C U0 borrow from last
   180  	stq	r25,	0(r16)		C L1
   181  
   182  	cmpult	u0,	v0,	r22	C U1 did it borrow
   183  	beq	r5,	$fix5		C U0 fix exact zero
   184  $ret5:	ldq	u0,	48(r17)		C L0 get next ones
   185  	ldq	v0,	48(r18)		C L1
   186  
        C The two ldl instructions below target r31, so the loaded value is
        C discarded; they are marked as prefetches 256 bytes ahead of up and vp.
   187  	ldl	r31, 256(r17)		C L0 prefetch
   188  	subq	r8,	cy0,	r25	C U1 borrow from last
   189  	ldl	r31, 256(r18)		C L1 prefetch
   190  	subq	u1,	v1,	r7	C U0 sub two data
   191  
   192  	beq	r8,	$fix6		C U1 fix exact zero
   193  $ret6:	cmpult	u1,	v1,	r23	C U0 did it borrow
   194  	ldq	u1,	56(r17)		C L0 get next ones
   195  	ldq	v1,	56(r18)		C L1
   196  
   197  	lda	r17,	64(r17)		C L0 move pointer
   198  	bis	r31,	r31,	r31	C U
   199  	lda	r18,	64(r18)		C L1 move pointer
   200  	bge	r19,	$Loop		C U1 loop control
   201  C ==== main loop end
   202  
        C Drain: store the two finished results and complete the three limbs
        C still in flight, without loading anything new.  Five results are
        C stored in total and the borrow out of the last one ends up in cy0.
   203  $Lend:	subq	u0,	v0,	r2	C U1 sub two data
   204  	stq	r24,	8(r16)		C L0 put an answer
   205  	subq	r7,	r22,	r24	C U0 borrow from last
   206  	stq	r25,	16(r16)		C L1 pair
   207  	cmpult	u0,	v0,	cy1	C U1 did it borrow
   208  	beq	r7,	$fix7c		C U0 fix exact 0
   209  $ret7c:	subq	r2,	r23,	r25	C U1 borrow from last
   210  	subq	u1,	v1,	r5	C U0 sub two data
   211  	beq	r2,	$fix0c		C U1 fix exact zero
   212  $ret0c:	cmpult	u1,	v1,	cy0	C U0 did it borrow
   213  	stq	r24,	24(r16)		C L0 store pair
   214  	subq	r5,	cy1,	r24	C U0 borrow from last
   215  	stq	r25,	32(r16)		C L1
   216  	beq	r5,	$fix1c		C U0 fix exact zero
   217  $ret1c:	stq	r24,	40(r16)		C L0 put an answer
   218  	lda	r16,	48(r16)		C L0 move pointer
   219  
        C Undo the look-ahead bias of 8 from the feed-in: r19 becomes the number
        C of limbs not yet processed (0 to 7).  Zero means done.
   220  	lda	r19,	8(r19)
   221  	beq	r19,	$Lret
   222  
        C Load the first leftover limb pair, as $Lsmall expects it in u1/v1.
   223  	ldq	u1,	0(r17)
   224  	ldq	v1,	0(r18)
        C Simple loop, one limb per pass.  Entry state: u1/v1 hold the current
        C limb pair, r19 is the number of limbs left, cy0 is the borrow in.
        C The last limb is handled at $Lend0, which does not load a next pair.
   225  $Lsmall:
   226  	lda	r19,	-1(r19)
   227  	beq	r19,	$Lend0
   228  
   229  	ALIGN(8)
   230  $Loop0:	subq	u1,	v1,	r2	C main sub
   231  	cmpult	u1,	v1,	r8	C compute bw from last sub
   232  	ldq	u1,	8(r17)
   233  	ldq	v1,	8(r18)
   234  	subq	r2,	cy0,	r5	C borrow sub
   235  	lda	r17,	8(r17)
   236  	lda	r18,	8(r18)
   237  	stq	r5,	0(r16)
   238  	cmpult	r2,	cy0,	cy0	C compute bw from last sub
   239  	lda	r19,	-1(r19)		C decr loop cnt
   240  	bis	r8,	cy0,	cy0	C combine bw from the two subs
   241  	lda	r16,	8(r16)
   242  	bne	r19,	$Loop0
   243  $Lend0:	subq	u1,	v1,	r2	C main sub
   244  	subq	r2,	cy0,	r5	C borrow sub
   245  	cmpult	u1,	v1,	r8	C compute bw from last sub
   246  	cmpult	r2,	cy0,	cy0	C compute bw from last sub
   247  	stq	r5,	0(r16)
   248  	bis	r8,	cy0,	r0	C combine bw from the two subs
   249  	ret	r31,(r26),1
   250  
   251  	ALIGN(8)
   252  $Lret:	lda	r0,	0(cy0)		C copy borrow into return register
   253  	ret	r31,(r26),1
   254  
        C Out-of-line fixups, reached only when a raw difference u - v was
        C exactly zero.  Each one ORs the borrow that came into that limb into
        C the borrow going out of it, then branches back to the matching $ret
        C label.  The f suffix marks feed-in fixups and the c suffix marks the
        C ones used by the drain code at $Lend.
   255  $fix5f:	bis	r23,	cy0,	r23	C bring forward borrow
   256  	br	r31,	$ret5f
   257  $fix6f:	bis	r22,	r23,	r22	C bring forward borrow
   258  	br	r31,	$ret6f
   259  $fix0:	bis	cy1,	r23,	cy1	C bring forward borrow
   260  	br	r31,	$ret0
   261  $fix1:	bis	cy0,	cy1,	cy0	C bring forward borrow
   262  	br	r31,	$ret1
   263  $fix2:	bis	r22,	cy0,	r22	C bring forward borrow
   264  	br	r31,	$ret2
   265  $fix3:	bis	r23,	r22,	r23	C bring forward borrow
   266  	br	r31,	$ret3
   267  $fix4:	bis	cy1,	r23,	cy1	C bring forward borrow
   268  	br	r31,	$ret4
   269  $fix5:	bis	cy1,	cy0,	cy0	C bring forward borrow
   270  	br	r31,	$ret5
   271  $fix6:	bis	r22,	cy0,	r22	C bring forward borrow
   272  	br	r31,	$ret6
   273  $fix7:	bis	r23,	r22,	r23	C bring forward borrow
   274  	br	r31,	$ret7
   275  $fix0c:	bis	cy1,	r23,	cy1	C bring forward borrow
   276  	br	r31,	$ret0c
   277  $fix1c:	bis	cy0,	cy1,	cy0	C bring forward borrow
   278  	br	r31,	$ret1c
   279  $fix7c:	bis	r23,	r22,	r23	C bring forward borrow
   280  	br	r31,	$ret7c
   281  
   282  EPILOGUE()
   283  ASM_END()