github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/copyi.asm

dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                    16-byte coaligned      unaligned
C                       cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5                0.64
C 744x,745x (G4+):         0.75               0.82
C 970 (G5):                0.78               1.02      (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subfic	r7, r7, LIMBS_PER_VR
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16

	beq	L(up_aligned)

	lvsl	us, 0, up

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0	')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()
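
This file provides the assembly body of mpn_copyi, GMP's documented low-level
routine that copies n limbs from up to rp in increasing address order. As a
minimal usage sketch (not part of the file above), the C-level call looks as
follows, assuming a build linked against this GMP with -lgmp:

	/* Sketch: calling mpn_copyi, the routine implemented above.
	   It copies n limbs from src to dst, walking addresses upward. */
	#include <stdio.h>
	#include <gmp.h>

	int main(void)
	{
	    mp_limb_t src[4] = {1, 2, 3, 4};
	    mp_limb_t dst[4] = {0, 0, 0, 0};

	    mpn_copyi(dst, src, 4);     /* dst := src, limb by limb */

	    for (int i = 0; i < 4; i++)
	        printf("dst[%d] = %lu\n", i, (unsigned long) dst[i]);
	    return 0;
	}

The increasing copy direction is what the routine's head/tail loops above
preserve; whether overlapping operands are permitted is specified by the GMP
manual, not assumed here.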