github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc64/mode64/mod_1_4.asm (about)

     1  dnl  PowerPC-64 mpn_mod_1s_4p
     2  
     3  dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C                   cycles/limb
    34  C POWER3/PPC630          ?
    35  C POWER4/PPC970          9
    36  C POWER5                 9
    37  C POWER6                13
    38  C POWER7                3.5
    39  
    40  C TODO
    41  C  * Optimise, in particular the cps function.  This was compiler-generated and
    42  C    then hand optimised.
    43  
    44  C INPUT PARAMETERS
    45  define(`ap',  `r3')	C base address of the operand limbs
    46  define(`n',   `r4')	C number of 64-bit limbs at ap (scaled by 8 to get bytes)
    47  define(`d',   `r5')	C divisor used in the final reduction step
    48  define(`cps', `r6')	C pointer to the table read at byte offsets 0..48
    49  
    50  ASM_START()
    51  
    52  EXTERN_FUNC(mpn_invert_limb)	C called from mpn_mod_1s_4p_cps
    53  
    54  PROLOGUE(mpn_mod_1s_4p)
        C mpn_mod_1s_4p(ap, n, d, cps)
        C
        C Folds the n limbs at ap, four per loop iteration, into a two-limb value
        C held in r9 (high) and r0 (low) using the multipliers read from the cps
        C table, then reduces that value with the word at 0(cps) and d and leaves
        C the result in r3.
        C
        C Table words read here (byte offsets from cps):
        C    0      multiplier used for the quotient estimate in the final step
        C    8      shift count (only its low 32 bits are loaded)
        C   16-32   multipliers for the three upper limbs of each 4-limb group
        C   40      B4modb, multiplied by rl (r0)
        C   48      B5modb, multiplied by rh (r9)
        C mpn_mod_1s_4p_cps in this file stores exactly these seven words.
        C NOTE(review): the final right shift by the count suggests d is expected
        C to be already shifted left by that count -- confirm against the callers.
        C
        C r23-r31 are saved below r1 without moving r1; this routine makes no calls.
    55  	std	r23, -72(r1)
    56  	ld	r23, 48(cps)		C r23 = B5modb
    57  	std	r24, -64(r1)
    58  	std	r25, -56(r1)
    59  	ld	r24, 32(cps)		C r24 = word at 32, presumably B3modb
    60  	ld	r25, 24(cps)		C r25 = word at 24, presumably B2modb
    61  	std	r26, -48(r1)
    62  	std	r27, -40(r1)
    63  	ld	r26, 16(cps)		C r26 = word at 16, presumably B1modb
    64  	std	r28, -32(r1)
    65  	std	r29, -24(r1)
    66  	std	r30, -16(r1)
    67  	std	r31, -8(r1)
    68  	ld	r30, 40(cps)		C r30 = B4modb
    69  
    70  	rldicl.	r0, n, 0,62		C r0 = n mod 4, cr0 compares it with 0
    71  	sldi	r31, n, 3		C r31 = size of the operand in bytes
    72  	add	ap, ap, r31		C make ap point at end of operand
    73  
        C Dispatch on n mod 4 so that the limbs left for the loop are a multiple of 4.
    74  	cmpdi	cr7, r0, 2
    75  	beq	cr0, L(b00)		C n mod 4 = 0
    76  	blt	cr7, L(b01)		C n mod 4 = 1
    77  	beq	cr7, L(b10)		C n mod 4 = 2
    78  
        C n mod 4 = 3: r9:r0 = ap[n-3] + ap[n-2]*r26 + ap[n-1]*r25
    79  L(b11):	ld	r11, -16(ap)
    80  	ld	r9, -8(ap)
    81  	ld	r0, -24(ap)
    82  	mulhdu	r27, r11, r26
    83  	mulld	r8, r11, r26
    84  	mulhdu	r11, r9, r25
    85  	mulld	r9, r9, r25
    86  	addc	r31, r8, r0
    87  	addze	r10, r27
    88  	addc	r0, r9, r31
    89  	adde	r9, r11, r10
    90  	addi	ap, ap, -40		C 8(ap) is now the highest unread limb
    91  	b	L(6)
    92  
        C n mod 4 = 0: r9:r0 = ap[n-4] + ap[n-3]*r26 + ap[n-2]*r25 + ap[n-1]*r24
    93  	ALIGN(16)
    94  L(b00):	ld	r11, -24(ap)
    95  	ld	r10, -16(ap)
    96  	ld	r9, -8(ap)
    97  	ld	r0, -32(ap)
    98  	mulld	r8, r11, r26
    99  	mulhdu	r7, r10, r25
   100  	mulhdu	r27, r11, r26
   101  	mulhdu	r11, r9, r24
   102  	mulld	r10, r10, r25
   103  	mulld	r9, r9, r24
   104  	addc	r31, r8, r0
   105  	addze	r0, r27
   106  	addc	r8, r31, r10
   107  	adde	r10, r0, r7
   108  	addc	r0, r9, r8
   109  	adde	r9, r11, r10
   110  	addi	ap, ap, -48		C 8(ap) is now the highest unread limb
   111  	b	L(6)
   112  
        C n mod 4 = 1: r9:r0 = ap[n-1]
   113  	ALIGN(16)
   114  L(b01):	li	r9, 0
   115  	ld	r0, -8(ap)
   116  	addi	ap, ap, -24		C 8(ap) is now the highest unread limb
   117  	b	L(6)
   118  
        C n mod 4 = 2: r9 = ap[n-1], r0 = ap[n-2]
   119  	ALIGN(16)
   120  L(b10):	ld	r9, -8(ap)
   121  	ld	r0, -16(ap)
   122  	addi	ap, ap, -32		C 8(ap) is now the highest unread limb
   123  
        C ctr = (n+3)/4.  bdz decrements ctr first, so the loop below runs
        C (n+3)/4 - 1 times and is skipped entirely when that is zero.
   124  	ALIGN(16)
   125  L(6):	addi	r10, n, 3
   126  	srdi	r7, r10, 2
   127  	mtctr	r7
   128  	bdz	L(end)
   129  
        C Each iteration consumes four limbs and steps ap down by 32 bytes:
        C r9:r0 = [-16] + [-8]*r26 + [0]*r25 + [8]*r24 + rl*B4modb + rh*B5modb
        C where rl = r0 and rh = r9 on entry and [k] is the limb at k(ap).
   130  	ALIGN(16)
   131  L(top):	ld	r31, -16(ap)		C lowest limb of the group, added as is
   132  	ld	r10, -8(ap)
   133  	ld	r11, 8(ap)		C highest limb of the group
   134  	ld	r12, 0(ap)
   135  	mulld	r29, r0, r30		C rl * B4modb
   136  	mulhdu	r0,  r0, r30		C rl * B4modb
   137  	mulhdu	r27, r10, r26
   138  	mulld	r10, r10, r26
   139  	mulhdu	r7, r9, r23		C rh * B5modb
   140  	mulld	r9, r9, r23		C rh * B5modb
   141  	mulhdu	r28, r11, r24
   142  	mulld	r11, r11, r24
   143  	mulhdu	r4, r12, r25		C r4 (n) is reused as scratch from here on
   144  	mulld	r12, r12, r25
   145  	addc	r8, r10, r31		C sum the low halves with addc and
   146  	addze	r10, r27		C the high halves plus carry with addze/adde
   147  	addi	ap, ap, -32
   148  	addc	r27, r8, r12
   149  	adde	r12, r10, r4
   150  	addc	r11, r27, r11
   151  	adde	r31, r12, r28
   152  	addc	r12, r11, r29
   153  	adde	r4, r31, r0
   154  	addc	r0, r9, r12		C new rl
   155  	adde	r9, r7, r4		C new rh
   156  	bdnz	L(top)
   157  
        C r3 = low 32 bits of the doubleword at 8(cps), the shift count; the
        C byte offset of that half depends on the limb endianness.
   158  L(end):
   159  ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
   160  `	lwz	r3, 8(cps)',
   161  `	lwz	r3, 12(cps)')
   162  	mulld	r10, r9, r26
   163  	mulhdu	r9, r9, r26
   164  	addc	r11, r0, r10		C r9:r11 = rl + rh*r26
   165  	addze	r9, r9
   166  	ld	r10, 0(cps)		C r10 = word at 0, stored by the cps routine
   167  	subfic	r8, r3, 64		C r8 = 64 - count
   168  	sld	r9, r9, r3
   169  	srd	r8, r11, r8
   170  	sld	r11, r11, r3		C r11 = low limb shifted left by count
   171  	or	r9, r8, r9		C r9 = high limb shifted, with bits from r11
   172  	mulld	r0, r9, r10
   173  	mulhdu	r10, r9, r10		C r10:r0 = r9 * word at 0(cps)
   174  	addi	r9, r9, 1
   175  	addc	r8, r0, r11		C r8 = low product + r11, carry kept
   176  	adde	r0, r10, r9		C quotient estimate = high + r9 + 1 + carry
   177  	mulld	r0, r0, d
   178  	subf	r0, r0, r11		C r0 = r11 - estimate*d (low 64 bits)
   179  	cmpld	cr7, r8, r0
   180  	bge	cr7, L(9)
   181  	add	r0, r0, d		C taken when r8 < r0 unsigned: add d back
   182  L(9):	cmpld	cr7, r0, d
   183  	bge-	cr7, L(16)		C hinted as unlikely: r0 >= d needs one more fix
   184  L(10):	srd	r3, r0, r3		C result = r0 shifted right by count
   185  	ld	r23, -72(r1)
   186  	ld	r24, -64(r1)
   187  	ld	r25, -56(r1)
   188  	ld	r26, -48(r1)
   189  	ld	r27, -40(r1)
   190  	ld	r28, -32(r1)
   191  	ld	r29, -24(r1)
   192  	ld	r30, -16(r1)
   193  	ld	r31, -8(r1)
   194  	blr
   195  
   196  L(16):	subf	r0, d, r0		C r0 = r0 - d
   197  	b	L(10)
   198  EPILOGUE()
   199  
   200  PROLOGUE(mpn_mod_1s_4p_cps,toc)
        C mpn_mod_1s_4p_cps(r3 = table pointer, r4 = divisor)
        C
        C Fills the seven 64-bit words that mpn_mod_1s_4p reads:
        C    0(table)   value returned by mpn_invert_limb for the shifted divisor
        C    8(table)   cnt, the number of leading zero bits of the divisor
        C   16-48       five multipliers, each stored shifted right by cnt; the
        C               ones at 40 and 48 are what mpn_mod_1s_4p calls B4modb
        C               and B5modb, the first three are presumably B1modb-B3modb
        C
        C Register roles across the call: r29 = table pointer, r30 = divisor
        C shifted left by cnt (b below), r31 = cnt.  All three are saved on entry
        C and restored before returning.
   201  	mflr	r0
   202  	std	r29, -24(r1)
   203  	std	r30, -16(r1)
   204  	mr	r29, r3			C keep the table pointer; r3 is reused below
   205  	std	r0, 16(r1)		C save the return address
   206  	std	r31, -8(r1)
   207  	stdu	r1, -144(r1)		C allocate a 144-byte frame for the call
   208  	cntlzd	r31, r4			C cnt = leading zero count of the divisor
   209  	sld	r30, r4, r31		C b = divisor << cnt
   210  	mr	r3, r30
        C r3 = mpn_invert_limb(b), called bi in the comments below
   211  	CALL(	mpn_invert_limb)
        C First multiplier: r10 = -b * ((1 << cnt) | (bi >> (64 - cnt)))
   212  	subfic	r9, r31, 64
   213  	li	r10, 1
   214  	sld	r10, r10, r31
   215  	srd	r9, r3, r9
   216  	neg	r0, r30
   217  	or	r10, r10, r9
   218  	mulld	r10, r10, r0
        C Each later multiplier Y is derived from the previous one X the same way:
        C   Y = ~(high(X*bi) + X) * b, then Y += b if low(X*bi) < Y (unsigned).
        C Second multiplier in r11, from r10 (here written as ~high - X):
   219  	mulhdu	r11, r10, r3
   220  	nor	r11, r11, r11
   221  	subf	r11, r10, r11
   222  	mulld	r11, r11, r30
   223  	mulld	r0, r10, r3
   224  	cmpld	cr7, r0, r11
   225  	bge	cr7, L(18)
   226  	add	r11, r11, r30
        C Third multiplier in r9, from r11
   227  L(18):	mulhdu	r9, r11, r3
   228  	add	r9, r11, r9
   229  	nor	r9, r9, r9
   230  	mulld	r9, r9, r30
   231  	mulld	r0, r11, r3
   232  	cmpld	cr7, r0, r9
   233  	bge	cr7, L(19)
   234  	add	r9, r9, r30
        C Fourth multiplier in r0, from r9
   235  L(19):	mulhdu	r0, r9, r3
   236  	add	r0, r9, r0
   237  	nor	r0, r0, r0
   238  	mulld	r0, r0, r30
   239  	mulld	r8, r9, r3
   240  	cmpld	cr7, r8, r0
   241  	bge	cr7, L(20)
   242  	add	r0, r0, r30
        C Fifth multiplier in r8, from r0
   243  L(20):	mulhdu	r8, r0, r3
   244  	add	r8, r0, r8
   245  	nor	r8, r8, r8
   246  	mulld	r8, r8, r30
   247  	mulld	r7, r0, r3
   248  	cmpld	cr7, r7, r8
   249  	bge	cr7, L(21)
   250  	add	r8, r8, r30
        C Shift every multiplier right by cnt and store the table; the frame
        C teardown and register restores are interleaved with the stores.
   251  L(21):	srd	r0, r0, r31
   252  	addi	r1, r1, 144		C release the frame
   253  	srd	r8, r8, r31
   254  	srd	r10, r10, r31
   255  	srd	r11, r11, r31
   256  	std	r0, 40(r29)		C fourth multiplier; must precede the r0 reload
   257  	std	r31, 8(r29)		C cnt
   258  	srd	r9, r9, r31
   259  	ld	r0, 16(r1)		C reload the saved return address
   260  	ld	r30, -16(r1)
   261  	std	r8, 48(r29)		C fifth multiplier
   262  	std	r3, 0(r29)		C bi
   263  	mtlr	r0
   264  	ld	r31, -8(r1)
   265  	std	r10, 16(r29)		C first multiplier
   266  	std	r11, 24(r29)		C second multiplier
   267  	std	r9, 32(r29)		C third multiplier
   268  	ld	r29, -24(r1)		C restored last: r29 addressed the stores above
   269  	blr
   270  EPILOGUE()