github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/ev6/mod_1_4.asm (about)

     1  dnl Alpha mpn_mod_1s_4p
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C TODO:
    36  C  * Optimise.  2.75 c/l should be possible.
    37  C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
    38  C  * Optimise feed-in code, starting the sw pipeline in switch code.
    39  C  * Shorten software pipeline.  The mul instructions are scheduled too far
    40  C    from their users.  Fixing this will allow us to use fewer registers.
    41  C  * If we cannot reduce register usage, perhaps write a small-n basecase.
    42  C  * Does this work for PIC?
    43  
    44  C      cycles/limb
    45  C EV4:     ?
    46  C EV5:    23
    47  C EV6:     3
    48  
    49  define(`ap',     `r16')	C pointer into the limb vector; all limb loads are relative to it
    50  define(`n',      `r17')	C limb count, decremented by 4 per group
    51  define(`pl',     `r24')	C low word of the partial sum being accumulated
    52  define(`ph',     `r25')	C high word of the partial sum being accumulated
    53  define(`rl',     `r6')	C low word of the running two-word accumulator
    54  define(`rh',     `r7')	C high word of the running two-word accumulator
    55  define(`B1modb', `r1')	C multiplier loaded from 16(r19)
    56  define(`B2modb', `r2')	C multiplier loaded from 24(r19)
    57  define(`B3modb', `r3')	C multiplier loaded from 32(r19)
    58  define(`B4modb', `r4')	C multiplier loaded from 40(r19)
    59  define(`B5modb', `r5')	C multiplier loaded from 48(r19)
    60  
    61  ASM_START()
    62  PROLOGUE(mpn_mod_1s_4p)	C in: r16 = ap, r17 = n, r18 = b, r19 = table of constants; result in r0
    63  	lda	r30, -64(r30)		C allocate a 64-byte stack frame
    64  	stq	r9, 8(r30)		C save r9-r13; they are used as temporaries and restored before ret
    65  	ldq	B1modb, 16(r19)		C the five multipliers come from 16..48(r19)
    66  	stq	r10, 16(r30)
    67  	ldq	B2modb, 24(r19)
    68  	stq	r11, 24(r30)
    69  	ldq	B3modb, 32(r19)
    70  	stq	r12, 32(r30)
    71  	ldq	B4modb, 40(r19)
    72  	stq	r13, 40(r30)
    73  	ldq	B5modb, 48(r19)
    74  	s8addq	n, ap, ap		C point ap at vector end
    75  
    76  	and	n, 3, r0		C r0 = n mod 4, selects the feed-in case
    77  	lda	n, -4(n)		C n -= 4
    78  	beq	r0, L(b0)		C n mod 4 = 0
    79  	lda	r6, -2(r0)		C r6 = (n mod 4) - 2, r6 is only scratch here
    80  	blt	r6, L(b1)		C n mod 4 = 1
    81  	beq	r6, L(b2)		C n mod 4 = 2; fall through when n mod 4 = 3
    82  
    83  L(b3):	ldq	r21, -16(ap)		C top three limbs: r22 (highest), r21, r20 (lowest)
    84  	ldq	r22, -8(ap)
    85  	ldq	r20, -24(ap)
    86  	mulq	r21, B1modb, r8		C r12:r8 = r21 * B1modb
    87  	umulh	r21, B1modb, r12
    88  	mulq	r22, B2modb, r9		C r13:r9 = r22 * B2modb
    89  	umulh	r22, B2modb, r13
    90  	addq	r8, r20, pl		C ph:pl = r20 + r21 * B1modb
    91  	cmpult	pl, r8, r0		C r0 = carry out of the low-word add
    92  	addq	r0, r12, ph
    93  	addq	r9, pl, rl		C rh:rl = ph:pl + r22 * B2modb
    94  	cmpult	rl, r9, r0
    95  	addq	r13, ph, ph
    96  	addq	r0, ph, rh
    97  	lda	ap, -56(ap)		C step ap so that 0..24(ap) is the next 4-limb group
    98  	br	L(com)
    99  
   100  L(b0):	ldq	r21, -24(ap)		C top four limbs: r23 (highest), r22, r21, r20 (lowest)
   101  	ldq	r22, -16(ap)
   102  	ldq	r23, -8(ap)
   103  	ldq	r20, -32(ap)
   104  	mulq	r21, B1modb, r8		C r12:r8 = r21 * B1modb
   105  	umulh	r21, B1modb, r12
   106  	mulq	r22, B2modb, r9		C r13:r9 = r22 * B2modb
   107  	umulh	r22, B2modb, r13
   108  	mulq	r23, B3modb, r10		C r27:r10 = r23 * B3modb
   109  	umulh	r23, B3modb, r27
   110  	addq	r8, r20, pl		C ph:pl = r20 + r21 * B1modb
   111  	cmpult	pl, r8, r0
   112  	addq	r0, r12, ph
   113  	addq	r9, pl, pl		C ph:pl += r22 * B2modb
   114  	cmpult	pl, r9, r0
   115  	addq	r13, ph, ph
   116  	addq	r0, ph, ph
   117  	addq	r10, pl, rl		C rh:rl = ph:pl + r23 * B3modb
   118  	cmpult	rl, r10, r0
   119  	addq	r27, ph, ph
   120  	addq	r0, ph, rh
   121  	lda	ap, -64(ap)		C step ap so that 0..24(ap) is the next 4-limb group
   122  	br	L(com)
   123  
   124  L(b1):	bis	r31, r31, rh		C rh = 0
   125  	ldq	rl, -8(ap)		C rl = top limb
   126  	lda	ap, -40(ap)		C step ap so that 0..24(ap) is the next 4-limb group
   127  	br	L(com)
   128  
   129  L(b2):	ldq	rh, -8(ap)		C rh:rl = top two limbs
   130  	ldq	rl, -16(ap)
   131  	lda	ap, -48(ap)		C step ap so that 0..24(ap) is the next 4-limb group
   132  
   133  L(com):	ble	n, L(ed3)		C no complete 4-limb group left
   134  	ldq	r21, 8(ap)		C load next group: r20 (lowest) .. r23 (highest)
   135  	ldq	r22, 16(ap)
   136  	ldq	r23, 24(ap)
   137  	ldq	r20, 0(ap)
   138  	lda	n, -4(n)
   139  	lda	ap, -32(ap)		C advance to the following group
   140  	mulq	r21, B1modb, r8		C start this group's products; consumed in L(top) or L(ed2)
   141  	umulh	r21, B1modb, r12
   142  	mulq	r22, B2modb, r9
   143  	umulh	r22, B2modb, r13
   144  	mulq	r23, B3modb, r10
   145  	umulh	r23, B3modb, r27
   146  	mulq	rl, B4modb, r11		C r28:r11 = rl * B4modb
   147  	umulh	rl, B4modb, r28
   148  	ble	n, L(ed2)		C that was the last group: skip the loop
   149  
   150  	ALIGN(16)
   151  L(top):	ldq	r21, 8(ap)		C begin loading the next group
   152  	mulq	rh, B5modb, rl		C rl = low(rh * B5modb); old rl was already used for rl * B4modb
   153  	addq	r8, r20, pl		C ph:pl = r20 + r21 * B1modb, using products started earlier
   154  	ldq	r22, 16(ap)
   155  	cmpult	pl, r8, r0
   156  	umulh	rh, B5modb, rh		C rh = high(rh * B5modb)
   157  	ldq	r23, 24(ap)
   158  	addq	r0, r12, ph
   159  	addq	r9, pl, pl		C ph:pl += r22 * B2modb
   160  	mulq	r21, B1modb, r8		C r8 is free again: start the next group's products
   161  	cmpult	pl, r9, r0
   162  	addq	r13, ph, ph
   163  	umulh	r21, B1modb, r12
   164  	lda	ap, -32(ap)
   165  	addq	r0, ph, ph
   166  	addq	r10, pl, pl		C ph:pl += r23 * B3modb
   167  	mulq	r22, B2modb, r9
   168  	cmpult	pl, r10, r0
   169  	addq	r27, ph, ph
   170  	addq	r11, pl, pl		C ph:pl += rl * B4modb
   171  	umulh	r22, B2modb, r13
   172  	addq	r0, ph, ph
   173  	cmpult	pl, r11, r0
   174  	addq	r28, ph, ph
   175  	mulq	r23, B3modb, r10
   176  	ldq	r20, 32(ap)		C lowest limb of the group just loaded (ap was already stepped)
   177  	addq	pl, rl, rl		C new rl = pl + low(rh * B5modb)
   178  	umulh	r23, B3modb, r27
   179  	addq	r0, ph, ph
   180  	cmpult	rl, pl, r0
   181  	mulq	rl, B4modb, r11		C new rl feeds the next pass
   182  	addq	ph, rh, rh		C new rh = ph + high(rh * B5modb) + carry
   183  	umulh	rl, B4modb, r28
   184  	addq	r0, rh, rh
   185  	lda	n, -4(n)
   186  	bgt	n, L(top)		C loop while groups remain
   187  
   188  L(ed2):	mulq	rh, B5modb, rl		C wind-down: same accumulation as L(top), no loads or new products
   189  	addq	r8, r20, pl
   190  	umulh	rh, B5modb, rh
   191  	cmpult	pl, r8, r0
   192  	addq	r0, r12, ph
   193  	addq	r9, pl, pl
   194  	cmpult	pl, r9, r0
   195  	addq	r13, ph, ph
   196  	addq	r0, ph, ph
   197  	addq	r10, pl, pl
   198  	cmpult	pl, r10, r0
   199  	addq	r27, ph, ph
   200  	addq	r11, pl, pl
   201  	addq	r0, ph, ph
   202  	cmpult	pl, r11, r0
   203  	addq	r28, ph, ph
   204  	addq	pl, rl, rl
   205  	addq	r0, ph, ph
   206  	cmpult	rl, pl, r0
   207  	addq	ph, rh, rh
   208  	addq	r0, rh, rh
   209  
   210  L(ed3):	mulq	rh, B1modb, r8		C fold the high word: rh:rl = rl + rh * B1modb
   211  	umulh	rh, B1modb, rh
   212  	addq	r8, rl, rl
   213  	cmpult	rl, r8, r0
   214  	addq	r0, rh, rh
   215  
   216  	ldq	r24, 8(r19)		C cnt
   217  	sll	rh, r24, rh		C shift rh:rl left by cnt bits
   218  	subq	r31, r24, r25		C r25 = -cnt; Alpha shifts use only the low 6 bits of the count
   219  	srl	rl, r25, r2		C top cnt bits of rl; NOTE(review): only correct for cnt != 0, confirm callers guarantee it
   220  	sll	rl, r24, rl
   221  	or	r2, rh, rh
   222  
   223  	ldq	r23, 0(r19)		C bi
   224  	mulq	rh, r23, r8		C r9:r8 = rh * bi
   225  	umulh	rh, r23, r9
   226  	addq	rh, 1, r7		C r7 is rh, so this is rh += 1
   227  	addq	r8, rl, r8		C ql
   228  	cmpult	r8, rl, r0		C carry out of ql
   229  	addq	r9, r7, r9
   230  	addq	r0, r9, r9		C qh
   231  	mulq	r9, r18, r21		C qh * b
   232  	subq	rl, r21, rl		C rl -= qh * b (low 64 bits)
   233  	cmpult	r8, rl, r0		C rl > ql
   234  	negq	r0, r0			C turn the flag into an all-ones mask
   235  	and	r0, r18, r0		C b or 0
   236  	addq	rl, r0, rl		C conditionally add b back
   237  	cmpule	r18, rl, r0		C rl >= b
   238  	negq	r0, r0
   239  	and	r0, r18, r0
   240  	subq	rl, r0, rl		C conditionally subtract b
   241  
   242  	srl	rl, r24, r0		C return value = rl >> cnt
   243  
   244  	ldq	r9, 8(r30)		C restore the saved registers
   245  	ldq	r10, 16(r30)
   246  	ldq	r11, 24(r30)
   247  	ldq	r12, 32(r30)
   248  	ldq	r13, 40(r30)
   249  	lda	r30, 64(r30)		C release the stack frame
   250  	ret	r31, (r26), 1
   251  EPILOGUE()
   252  
   253  PROLOGUE(mpn_mod_1s_4p_cps,gp)	C in: r16 = table to fill (7 quadwords), r17 = operand, presumably the divisor b
   254  	lda	r30, -32(r30)		C allocate a 32-byte stack frame
   255  	stq	r26, 0(r30)		C save return address, overwritten by the jsr below
   256  	stq	r9, 8(r30)		C r9-r11 hold values that must survive the call
   257  	stq	r10, 16(r30)
   258  	stq	r11, 24(r30)
   259  	mov	r16, r11		C r11 = table pointer
   260  	LEA(	r4, __clz_tab)		C r4 = address of __clz_tab
   261  	lda	r10, 65(r31)		C r10 = 65, reduced below to the shift count
   262  	cmpbge	r31, r17, r1		C bit i of r1 is set iff byte i of r17 is zero
   263  	srl	r1, 1, r1		C drop the flag for byte 0
   264  	xor	r1, 127, r1		C invert: 7 flags marking the nonzero bytes 1..7
   265  	addq	r1, r4, r1		C address of __clz_tab entry r1
   266  	ldq_u	r2, 0(r1)		C ldq_u + extbl: load the byte at that address
   267  	extbl	r2, r1, r2
   268  	s8subq	r2, 7, r2		C r2 = 8 * r2 - 7
   269  	srl	r17, r2, r3		C r3 = r17 >> r2, used as the second table index
   270  	subq	r10, r2, r10
   271  	addq	r3, r4, r3
   272  	ldq_u	r1, 0(r3)		C r1 = byte at __clz_tab + r3
   273  	extbl	r1, r3, r1
   274  	subq	r10, r1, r10		C r10 = 65 - r2 - r1, the shift count stored at 8(r11) below
   275  	sll	r17, r10, r9		C r9 = r17 << r10
   276  	mov	r9, r16			C argument for mpn_invert_limb
   277  	jsr	r26, mpn_invert_limb	C result is returned in r0
   278  	LDGP(	r29, 0(r26))		C re-establish gp after the call
   279  	subq	r31, r10, r2		C r2 = -r10
   280  	lda	r1, 1(r31)
   281  	sll	r1, r10, r1		C r1 = 1 << r10
   282  	subq	r31, r9, r3		C r3 = -r9
   283  	srl	r0, r2, r2		C r2 = r0 >> (-r10 mod 64); shifts use the low 6 bits of the count
   284  	ldq	r26, 0(r30)		C restore return address
   285  	bis	r2, r1, r2		C r2 = (1 << r10) | shifted r0
   286  	stq	r0, 0(r11)		C offset 0: value returned by mpn_invert_limb
   287  	stq	r10, 8(r11)		C offset 8: shift count
   288  	mulq	r2, r3, r2		C r2 = r2 * (-r9), low 64 bits
   289  	srl	r2, r10, r3
   290  	umulh	r2, r0, r1		C step 1: derive the next value (r1) from r2 using r0 and r9
   291  	stq	r3, 16(r11)		C offset 16: r2 >> r10
   292  	mulq	r2, r0, r3		C r3 = low(r2 * r0)
   293  	ornot	r31, r1, r1		C r1 = ~high(r2 * r0)
   294  	subq	r1, r2, r1		C r1 -= r2
   295  	mulq	r1, r9, r1		C r1 *= r9
   296  	addq	r1, r9, r2		C r2 = r1 + r9, the adjusted candidate
   297  	cmpule	r1, r3, r3		C r3 = (r1 <= low product)
   298  	cmoveq	r3, r2, r1		C if r1 > low product, take the adjusted candidate
   299  	srl	r1, r10, r3
   300  	umulh	r1, r0, r2		C step 2: same computation, r1 -> r2
   301  	stq	r3, 24(r11)		C offset 24: r1 >> r10
   302  	mulq	r1, r0, r3
   303  	ornot	r31, r2, r2
   304  	subq	r2, r1, r2
   305  	mulq	r2, r9, r2
   306  	addq	r2, r9, r1
   307  	cmpule	r2, r3, r3
   308  	cmoveq	r3, r1, r2
   309  	srl	r2, r10, r1
   310  	umulh	r2, r0, r3		C step 3: same computation, r2 -> r3
   311  	stq	r1, 32(r11)		C offset 32: r2 >> r10
   312  	mulq	r2, r0, r1
   313  	ornot	r31, r3, r3
   314  	subq	r3, r2, r3
   315  	mulq	r3, r9, r3
   316  	addq	r3, r9, r2
   317  	cmpule	r3, r1, r1
   318  	cmoveq	r1, r2, r3
   319  	srl	r3, r10, r2
   320  	umulh	r3, r0, r1		C step 4: same computation, r3 -> r1
   321  	stq	r2, 40(r11)		C offset 40: r3 >> r10
   322  	mulq	r3, r0, r0		C r0 is overwritten here; it is not needed afterwards
   323  	ornot	r31, r1, r1
   324  	subq	r1, r3, r1
   325  	mulq	r1, r9, r1
   326  	addq	r1, r9, r9		C r9 is overwritten with the adjusted candidate
   327  	cmpule	r1, r0, r0
   328  	cmoveq	r0, r9, r1
   329  	ldq	r9, 8(r30)		C restore saved registers, interleaved with the last store
   330  	srl	r1, r10, r1
   331  	ldq	r10, 16(r30)
   332  	stq	r1, 48(r11)		C offset 48: r1 >> r10
   333  	ldq	r11, 24(r30)
   334  	lda	r30, 32(r30)		C release the stack frame
   335  	ret	r31, (r26), 1
   336  EPILOGUE()