github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/mod_34lsub1.asm (about)

     1  dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
     2  
     3  dnl  Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  
    32  include(`../config.m4')
    33  
    34  
    35  C                cycles/limb
    36  C 603e:              -
    37  C 604e:              -
    38  C 75x (G3):          -
    39  C 7400,7410 (G4):    1          simple load-use scheduling results in 0.75
    40  C 744x,745x (G4+):   0.75
    41  C ppc970:            0.75
    42  C power4:            -
    43  C power5:            -
    44  
    45  C TODO
    46  C  * Either start using the low-end masking constants, or remove them.
    47  C  * Merge multiple feed-in cases into a parameterized code block.
    48  C  * Reduce register usage.  It should be possible to almost halve it.
    49  
C Symbolic register names (m4 defines).
C Scalar arguments per the mpn ABI: up = source limb pointer, n = limb count.
    50  define(`up', `r3')
    51  define(`n', `r4')
    52  
C VMX working set:
C   a0-a2 - three 4x32-bit sum accumulators (one per 16-byte load stream)
C   c0-c2 - per-accumulator carry-out counts (accumulated vaddcuw results)
C   z     - all-zero vector (cleared with vxor below)
C   x0-x3 - 24-bit fields gathered from a0-a2 via vperm
C   y0-y3 - 24-bit fields gathered from c0-c2 via vperm
C   pv    - vperm control vector, loaded from the `cnsts' tables
    53  define(`a0', `v3')
    54  define(`a1', `v4')
    55  define(`a2', `v5')
    56  define(`c0', `v6')
    57  define(`c1', `v7')
    58  define(`c2', `v8')
    59  define(`z',  `v9')
    60  define(`x0', `v10')
    61  define(`x1', `v11')
    62  define(`x2', `v12')
    63  define(`x3', `v13')
    64  define(`pv', `v14')
    65  define(`y0', `v0')
    66  define(`y1', `v1')
    67  define(`y2', `v2')
    68  define(`y3', `v15')
    69  
C mpn_mod_34lsub1(up, n): return a value congruent to {up,n} mod 2^24-1.
C n >= 20 goes to the VMX path at L(large); this scalar path handles n < 20
C by folding each group of three 32-bit limbs (96 bits = four 24-bit fields)
C into the accumulator r9, then folding r9 itself once at the end.
    70  ASM_START()
    71  PROLOGUE(mpn_mod_34lsub1)
    72  	cmpwi	cr0, n, 20		C tuned cutoff point
    73  	bge	L(large)
    74  
    75  	li	r9, 0			C result accumulator
    76  	mulli	r10, n, 0xb		C 0xb = ceil(32/3)
    77  	srwi.	r10, r10, 5		C r10 = floor(n/3), n < 32
    78  	beq	L(small_tail)
    79  	mtctr	r10
    80  	lwz	r6, 0(up)
    81  	lwz	r7, 4(up)
    82  	lwzu	r8, 8(up)
C n -= 3*floor(n/3); the 0..2 leftover limbs are handled at L(small_tail).
    83  	subf	n, r10, n
    84  	subf	n, r10, n
    85  	subf	n, r10, n
    86  	bdz	L(small_end)
    87  
C Loop body: r6,r7,r8 hold limbs u0,u1,u2; digit diagrams (e.g. --111100)
C show which source limb each byte of the assembled 24-bit field comes from.
    88  	ALIGN(16)
    89  L(los):	rlwinm	r0, r6, 0,8,31
    90  	add	r9, r9, r0		C add 24b from u0
    91  	srwi	r0, r6, 24
    92  	lwz	r6, 4(up)
    93  	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
    94  	add	r9, r9, r0		C add 8b from u0 and 16b from u1
    95  	srwi	r0, r7, 16
    96  	lwz	r7, 8(up)
    97  	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
    98  	add	r9, r9, r0		C add 16b from u1 and 8b from u2
    99  	srwi	r0, r8, 8		C --222222
   100  	lwzu	r8, 12(up)
   101  	add	r9, r9, r0		C add 24b from u2
   102  	bdnz	L(los)
C Same folding as the loop body, but without the next-iteration loads.
   103  L(small_end):
   104  	rlwinm	r0, r6, 0,8,31
   105  	add	r9, r9, r0		C add 24b from u0
   106  	srwi	r0, r6, 24
   107  	rlwimi	r0, r7, 8, 0x00ffff00	C --111100
   108  	add	r9, r9, r0		C add 8b from u0 and 16b from u1
   109  	srwi	r0, r7, 16
   110  	rlwimi	r0, r8, 16, 0x00ff0000	C --221111
   111  	add	r9, r9, r0		C add 16b from u1 and 8b from u2
   112  	srwi	r0, r8, 8		C --222222
   113  	add	r9, r9, r0		C add 24b from u2
   114  
C Fold r9 once: r9 = (r9 mod 2^24) + (r9 >> 24), still congruent mod 2^24-1.
   115  	addi	up, up, 4
   116  	rlwinm	r0, r9, 0,8,31
   117  	srwi	r9, r9, 24
   118  	add	r9, r9, r0
   119  
C Handle the 0, 1 or 2 limbs left over after the 3-limb groups.
   120  L(small_tail):
   121  	cmpi	cr0, n, 1
   122  	blt	L(ret)
   123  
   124  	lwz	r6, 0(up)
   125  	rlwinm	r0, r6, 0,8,31
   126  	srwi	r6, r6, 24
   127  	add	r9, r9, r0
   128  	add	r9, r9, r6
   129  
   130  	beq	L(ret)
   131  
   132  	lwz	r6, 4(up)
   133  	rlwinm	r0, r6, 8,8,23
   134  	srwi	r6, r6, 16
   135  	add	r9, r9, r0
   136  	add	r9, r9, r6
   137  
   138  L(ret):	mr	r3, r9
   139  	blr
   140  
   141  
   142  L(large):
   143  	stwu	r1, -32(r1)
   144  	mfspr	r10, 256
   145  	oris	r0, r10, 0xffff		C Set VRSAVE bit 0-15
   146  	mtspr	256, r0
   147  
   148  	andi.	r7, up, 15
   149  	vxor	a0, v0, v0
   150  	lis	r9, 0xaaaa
   151  	vxor	a1, v0, v0
   152  	ori	r9, r9, 0xaaab
   153  	vxor	a2, v0, v0
   154  	li	r5, 16
   155  	vxor	c0, v0, v0
   156  	li	r6, 32
   157  	vxor	c1, v0, v0
   158  	LEAL(	r11, cnsts)		C CAUTION clobbers r0 for elf, darwin
   159  	vxor	c2, v0, v0
   160  	vxor	z, v0, v0
   161  
   162  	beq	L(aligned16)
   163  
   164  	cmpwi	cr7, r7, 8
   165  	bge	cr7, L(na4)
   166  
   167  	lvx	a2, 0, up
   168  	addi	up, up, 16
   169  	vsldoi	a2, a2, z, 4
   170  	vsldoi	a2, z, a2, 12
   171  
   172  	addi	n, n, 9
   173  	mulhwu	r0, n, r9
   174  	srwi	r0, r0, 3		C r0 = floor(n/12)
   175  	mtctr	r0
   176  
   177  	mulli	r8, r0, 12
   178  	subf	n, r8, n
   179  	b	L(2)
   180  
   181  L(na4):	bne	cr7, L(na8)
   182  
   183  	lvx	a1, 0, up
   184  	addi	up, up, -16
   185  	vsldoi	a1, a1, z, 8
   186  	vsldoi	a1, z, a1, 8
   187  
   188  	addi	n, n, 6
   189  	mulhwu	r0, n, r9
   190  	srwi	r0, r0, 3		C r0 = floor(n/12)
   191  	mtctr	r0
   192  
   193  	mulli	r8, r0, 12
   194  	subf	n, r8, n
   195  	b	L(1)
   196  
   197  L(na8):
   198  	lvx	a0, 0, up
   199  	vsldoi	a0, a0, z, 12
   200  	vsldoi	a0, z, a0, 4
   201  
   202  	addi	n, n, 3
   203  	mulhwu	r0, n, r9
   204  	srwi	r0, r0, 3		C r0 = floor(n/12)
   205  	mtctr	r0
   206  
   207  	mulli	r8, r0, 12
   208  	subf	n, r8, n
   209  	b	L(0)
   210  
   211  L(aligned16):
   212  	mulhwu	r0, n, r9
   213  	srwi	r0, r0, 3		C r0 = floor(n/12)
   214  	mtctr	r0
   215  
   216  	mulli	r8, r0, 12
   217  	subf	n, r8, n
   218  
   219  	lvx	a0, 0, up
   220  L(0):	lvx	a1, r5, up
   221  L(1):	lvx	a2, r6, up
   222  	addi	up, up, 48
   223  L(2):	bdz	L(end)
   224  	li	r12, 256
   225  	li	r9, 288
   226  	ALIGN(32)
   227  L(top):
   228  	lvx	v0, 0, up
   229  	vaddcuw	v10, a0, v0
   230  	vadduwm	a0, a0, v0
   231  	vadduwm	c0, c0, v10
   232  
   233  	lvx	v1, r5, up
   234  	vaddcuw	v10, a1, v1
   235  	vadduwm	a1, a1, v1
   236  	vadduwm	c1, c1, v10
   237  
   238  	lvx	v2, r6, up
   239  	dcbt	up, r12
   240  	dcbt	up, r9
   241  	addi	up, up, 48
   242  	vaddcuw	v10, a2, v2
   243  	vadduwm	a2, a2, v2
   244  	vadduwm	c2, c2, v10
   245  	bdnz	L(top)
   246  
   247  L(end):
   248  C n = 0...11
   249  	cmpwi	cr0, n, 0
   250  	beq	L(sum)
   251  	cmpwi	cr0, n, 4
   252  	ble	L(tail.1..4)
   253  	cmpwi	cr0, n, 8
   254  	ble	L(tail.5..8)
   255  
   256  L(tail.9..11):
   257  	lvx	v0, 0, up
   258  	vaddcuw	v10, a0, v0
   259  	vadduwm	a0, a0, v0
   260  	vadduwm	c0, c0, v10
   261  
   262  	lvx	v1, r5, up
   263  	vaddcuw	v10, a1, v1
   264  	vadduwm	a1, a1, v1
   265  	vadduwm	c1, c1, v10
   266  
   267  	lvx	v2, r6, up
   268  
   269  	addi	r8, r11, 96
   270  	rlwinm	r3, n ,4,26,27
   271  	lvx	v11, r3, r8
   272  	vand	v2, v2, v11
   273  
   274  	vaddcuw	v10, a2, v2
   275  	vadduwm	a2, a2, v2
   276  	vadduwm	c2, c2, v10
   277  	b	L(sum)
   278  
   279  L(tail.5..8):
   280  	lvx	v0, 0, up
   281  	vaddcuw	v10, a0, v0
   282  	vadduwm	a0, a0, v0
   283  	vadduwm	c0, c0, v10
   284  
   285  	lvx	v1, r5, up
   286  
   287  	addi	r8, r11, 96
   288  	rlwinm	r3, n ,4,26,27
   289  	lvx	v11, r3, r8
   290  	vand	v1, v1, v11
   291  
   292  	vaddcuw	v10, a1, v1
   293  	vadduwm	a1, a1, v1
   294  	vadduwm	c1, c1, v10
   295  	b	L(sum)
   296  
   297  L(tail.1..4):
   298  	lvx	v0, 0, up
   299  
   300  	addi	r8, r11, 96
   301  	rlwinm	r3, n ,4,26,27
   302  	lvx	v11, r3, r8
   303  	vand	v0, v0, v11
   304  
   305  	vaddcuw	v10, a0, v0
   306  	vadduwm	a0, a0, v0
   307  	vadduwm	c0, c0, v10
   308  
   309  L(sum):	lvx	pv, 0, r11
   310  	vperm	x0, a0, z, pv		C extract 4 24-bit field from a0
   311  	vperm	y0, c2, z, pv
   312  	lvx	pv, r5, r11
   313  	vperm	x1, a1, z, pv		C extract 4 24-bit field from a1
   314  	vperm	y1, c0, z, pv		C extract 4 24-bit field from a1
   315  	lvx	pv, r6, r11
   316  	vperm	x2, a2, z, pv		C extract 4 24-bit field from a1
   317  	vperm	y2, c1, z, pv		C extract 4 24-bit field from a1
   318  	li	r10,  48
   319  	lvx	pv, r10, r11
   320  	vperm	x3, a0, z, pv		C extract remaining/partial a0 fields
   321  	vperm	y3, c2, z, pv		C extract remaining/partial a0 fields
   322  	li	r10,  64
   323  	lvx	pv, r10, r11
   324  	vperm	x3, a1, x3, pv		C insert remaining/partial a1 fields
   325  	vperm	y3, c0, y3, pv		C insert remaining/partial a1 fields
   326  	li	r10,  80
   327  	lvx	pv, r10, r11
   328  	vperm	x3, a2, x3, pv		C insert remaining/partial a2 fields
   329  	vperm	y3, c1, y3, pv		C insert remaining/partial a2 fields
   330  
   331  C We now have 4 128-bit accumulators to sum
   332  	vadduwm	x0, x0, x1
   333  	vadduwm	x2, x2, x3
   334  	vadduwm	x0, x0, x2
   335  
   336  	vadduwm	y0, y0, y1
   337  	vadduwm	y2, y2, y3
   338  	vadduwm	y0, y0, y2
   339  
   340  	vadduwm	x0, x0, y0
   341  
   342  C Reduce 32-bit fields
   343  	vsumsws	x0, x0, z
   344  
   345  	li	r7, 16
   346  	stvx	x0, r7, r1
   347  	lwz	r3, 28(r1)
   348  
   349  	mtspr	256, r10
   350  	addi	r1, r1, 32
   351  	blr
   352  EPILOGUE()
   353  
   354  C load	|      v0       |      v1       |      v2       |
   355  C acc	|      a0       |      a1       |      a2       |
   356  C carry	|      c0       |      c1       |      c2       |
   357  C	| 0   1   2   3 | 4   5   6   7 | 8   9  10  11 |  128
   358  C	|---|---|---|---|---|---|---|---|---|---|---|---|   32
   359  C	|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |   24
   360  C	|     |     |     |     |     |     |     |     |   48
   361  
   362  C       $---------------$---------------$---------------$---------------$
   363  C       |   .   .   .   .   .   .   .   .   .   .   .   .   .   .   .   |
   364  C       |_______________________________________________________________|
   365  C   |           |           |           |           |           |           |
   366  C       <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
   367  
   368  
C Constant tables used by L(sum) and the tail masking above.  In the vperm
C control vectors, byte indices 0x00-0x0f select from the first source
C (an accumulator) and 0x10-0x1f from the second source; since the second
C source is the all-zero vector z, index 0x10 inserts a zero byte.
   369  DEF_OBJECT(cnsts,16)
   370  C Permutation vectors in the order they are used above
   371  C #      00   01   02   03    04   05   06   07    08   09   0a   0b    0c   0d   0e   0f
   372   .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
   373   .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
   374   .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
   375   .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
   376   .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
   377   .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
C Masks for high end of number
C (indexed by 16*(n mod 4) at cnsts+96; keep the first n mod 4 words)
   378   .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   379   .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   380   .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
   381   .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number (currently unused; see TODO at top of file)
   383  C .byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   384  C .byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   385  C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
   386  C .byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
   388  END_OBJECT(cnsts)