github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/copyd.asm (about)

dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                16-byte coaligned      unaligned
C                   cycles/limb        cycles/limb
C 7400,7410 (G4):       0.5                0.64
C 744x,745x (G4+):      0.75               0.82
C 970 (G5):             0.78               1.02		(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
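C With 32-bit limbs a vector register holds 4 limbs and the unrolled loops
C below move 8 limbs per iteration; with 64-bit limbs, 2 and 4 respectively.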


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


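C For reference: mpn_copyd copies n limbs from up to rp in descending
C address order, which makes it usable when the operands overlap with
C rp >= up.  A minimal C sketch of the semantics implemented below
C (illustration only, using GMP's mp_limb_t/mp_size_t types):
C
C	void mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  while (n-- != 0)
C	    rp[n] = up[n];
C	}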
ASM_START()
PROLOGUE(mpn_copyd)

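C r0 = n in bytes; advance rp and up just past their operands, since the
C copy proceeds downwards.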
LIMB32(`slwi.	r0, n, 2	')
LIMB64(`sldi.	r0, n, 3	')
	add	rp, rp, r0
	add	up, up, r0

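C Use plain scalar code below the VMX cutoff: n < 11 limbs with 32-bit
C limbs, n < 5 limbs with 64-bit limbs.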
LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

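C cr0 still holds the result of the size shift above, so this returns
C for n = 0.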
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, -4(up)	')
LIMB64(`ld	r0, -8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`stw	r0, -4(rp)	')
LIMB64(`std	r0, -8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi	rp, rp, -16
	addi	up, up, -16
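C Save the caller's VRSAVE mask and mark v0-v4 as live; the old mask is
C restored at L(ret).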
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bits 0-4
	mtspr	256, r0

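C If rp is not 16-byte aligned, peel off high limbs one at a time until
C it is.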
LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 12(up)	')
LIMB64(`ld	r0, 8(up)	')
	addi	up, up, -GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 12(rp)	')
LIMB64(`std	r0, 8(rp)	')
	addi	rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne	L(top0)		')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count: 4 limbs per iteration
LIMB32(`srwi	r7, n, 3	')	C loop count: 8 limbs per iteration
	mtctr	r7			C move loop count into CTR

	li	r10, -16

	beq	L(up_aligned)

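C rp is aligned but up is not: lvsl derives a permute mask from the low
C address bits of up, and each store is formed by merging two adjacent
C aligned loads with vperm (the usual AltiVec unaligned-access idiom).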
	lvsl	us, 0, up

	addi	up, up, 16
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	addi	up, up, -32
	addi	rp, rp, -16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, -16
	b	L(lpu)

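C Unaligned inner loop: two aligned loads and two vperm merges store
C 32 bytes per iteration.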
	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, -32
	vperm	v3, v2, v0, us
	stvx	v3, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpu)

	b	L(tail)

L(up_aligned):

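C Both rp and up are now 16-byte aligned; if the limb count includes an
C odd 16-byte chunk, copy one vector before entering the 32-byte loop.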
LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0,   up
	stvx	v0, 0,   rp
	addi	up, up, -16
	addi	rp, rp, -16
	b	L(lpa)

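C Aligned inner loop: two lvx/stvx pairs move 32 bytes per iteration; the
C nop presumably pads the loop body for dispatch-group alignment.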
	ALIGN(32)
L(lpa):	lvx	v0, 0,   up
	lvx	v1, r10, up
	addi	up, up, -32
	nop
	stvx	v0, 0,   rp
	stvx	v1, r10, rp
	addi	rp, rp, -32
	bdnz	L(lpa)

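C Copy any remaining limbs (n mod 4 with 32-bit limbs, n mod 2 with
C 64-bit limbs) with scalar code.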
L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 12		')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 8(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 8(rp)	')
LIMB32(`addi	r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

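C Restore the caller's VRSAVE mask before returning.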
L(ret):	mtspr	256, r12
	blr
EPILOGUE()