github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/powerpc32/vmx/logops_n.asm

dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  and,ior,andn,nior,xor    iorn,xnor       nand
C                       cycles/limb        cycles/limb   cycles/limb
C 7400,7410 (G4):          1.39                ?              ?
C 744x,745x (G4+):         1.14               1.39           1.39
C 970:                     1.7                2.0            2.0

C STATUS
C  * Works for all sizes and alignment for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands
C    before u operands, since v operands are sometimes negated.
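
C Build note: this is a multi-function source file.  The mpn build machinery
C runs it through m4 once per operation, defining exactly one OPERATION_*
C symbol, roughly as in this illustrative invocation:
C
C   m4 -DOPERATION_andn_n logops_n.asm > tmp-andn_n.s
C
C The ifdef blocks below then expand func, logopS, logop (and the vnegb/vnega
C negation hooks) for the selected operation.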

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`vnegb', `')	C default neg-before to null
define(`vnega', `')	C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')

define(`us',	`v8')
define(`vs',	`v9')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)		')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc	C Set VRSAVE bit 0-13 FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute 0 or 1
C times for 64-bit machines, and 0 to 3 times for 32-bit machines.

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0		')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)		')

	addi	rp, rp, 16	C update rp, but preserve its alignment

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7		C copy n to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)

	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)		')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned
C in 2 relevant ways, which means we can always find a pair of aligned
C pointers of rp, up, and vp.
C process words until rp is 16-byte aligned
C if (((up | vp) & 15) == 0)
C   process with VMX without any vperm
C else if ((up & 15) != 0 && (vp & 15) != 0)
C   process with VMX using vperm on store data
C else if ((up & 15) != 0)
C   process with VMX using vperm on up data
C else
C   process with VMX using vperm on vp data
C
C	rlwinm,	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7
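
C Reference semantics (scalar C sketch, for illustration only; the vector code
C above computes the same result 16 bytes at a time).  Shown for
C OPERATION_andn_n; the other operations differ only in the limb-wise operator:
C
C   void
C   mpn_andn_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
C   {
C     mp_size_t i;
C     for (i = 0; i < n; i++)
C       rp[i] = up[i] & ~vp[i];	/* logopS: andc */
C   }
C
C In the vector path, vnegb complements the vp-derived operand before the
C logop (iorn, xnor) and vnega complements the result after it (nand).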