github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/com.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/alpha/com.asm (about)

     1  dnl  Alpha mpn_com -- mpn one's complement.
     2  
     3  dnl  Copyright 2003 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  
    34  C      cycles/limb
    35  C EV4:    4.75
    36  C EV5:    2.0
    37  C EV6:    1.5
    38  
    39  
    40  C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
    41  C
    42  C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
    43  C of 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
    44  C will be 1.5+2/N c/l.
    45  C
    46  C 2 cycles of loop control are unavoidable, for pointer updates and the
    47  C taken branch bubble, but also since ldq cannot issue two cycles after stq
    48  C (and with a run of stqs that means neither of the two cycles at the end of
    49  C the loop).
    50  C
    51  C The fbeq is forced into the second cycle of the loop using unops, since
    52  C the first time through it must wait for the cvtqt result.  Once that
    53  C result is ready (a 1 cycle stall) then both the branch and following loads
    54  C can issue together.
    55  C
    56  C The main loop handles an odd count of limbs, being two limbs loaded before
    57  C each size test, plus one pipelined around from the previous iteration (or
    58  C setup in the entry sequence).
    59  C
    60  C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
    61  C entry sequence, and an increment of the pointers.  For an odd size there's
    62  C no increment and the first store in the loop (r24) is a repeat of dst[0].
    63  C
    64  C Note that the load for r24 after the possible pointer increment is done
    65  C before the explicit store to dst[0], in case src==dst.
    66  
    67  
    68  ASM_START()
    69  
    70  FLOAT64(L(dat), 2.0)
    71  
    72  	ALIGN(16)
    73  
    74  PROLOGUE(mpn_com,gp)
    75  	C Store the one's complement (~) of each of the size limbs at src to dst.
    76  	C r16	dst
    77  	C r17	src
    78  	C r18	size
    79  	C src[0] is read and dst[0] written unconditionally: size>=1 is assumed.
    80  	lda	r30, -16(r30)		C temporary stack space
    81  	lda	r7, -3(r18)		C size - 3
    82  
    83  	ldq	r20, 0(r17)		C src[0]
    84  	srl	r7, 1, r6		C (size-3)/2
    85  
    86  	stq	r6, 8(r30)		C (size-3)/2 to stack, to be reloaded into f0
    87  	and	r7, 1, r5		C 1 if size even
    88  
    89  	LEA(	r8, L(dat))		C r8 = address of the 2.0 constant
    90  	s8addq	r5, r17, r17		C skip src[0] if even
    91  
    92  	ornot	r31, r20, r20		C ~src[0]
    93  	unop
    94  
    95  	ldt	f0, 8(r30)		C (size-3)/2, still an integer bit pattern
    96  	ldq	r24, 0(r17)		C src[0 or 1], loaded before the dst[0] store in case src==dst
    97  
    98  	stq	r20, 0(r16)		C dst[0]
    99  	s8addq	r5, r16, r19		C skip dst[0] if even
   100  
   101  	ldt	f1, 0(r8)		C data 2.0
   102  	lda	r30, 16(r30)		C restore stack
   103  	unop
   104  	cvtqt	f0, f0			C (size-3)/2 as float
   105  
   106  	ornot	r31, r24, r24		C ~src[0 or 1], first result pipelined into the loop
   107  	blt	r7, L(done_1)		C if size<=2
   108  	unop
   109  	unop
   110  
   111  
   112  	C 16-byte alignment here
   113  L(top):
   114  	C r17	src, incrementing
   115  	C r19	dst, incrementing
   116  	C r24	dst[i] result, ready to store
   117  	C f0	(size-3)/2, decrementing
   118  	C f1	2.0
   119  
   120  	ldq	r20, 8(r17)		C src[i+1]
   121  	ldq	r21, 16(r17)		C src[i+2]
   122  	unop				C pad: keeps fbeq in the 2nd cycle of the loop
   123  	unop
   124  
   125  	fbeq	f0, L(done_2)		C count is zero: exactly 3 limbs remain to store
   126  	unop
   127  	ldq	r22, 24(r17)		C src[i+3]
   128  	ldq	r23, 32(r17)		C src[i+4]
   129  
   130  	stq	r24, 0(r19)		C dst[i]
   131  	ornot	r31, r20, r20		C ~src[i+1]
   132  	subt	f0, f1, f0		C count -= 2
   133  	unop
   134  
   135  	stq	r20, 8(r19)		C dst[i+1]
   136  	ornot	r31, r21, r21		C ~src[i+2]
   137  	unop
   138  	unop
   139  
   140  	stq	r21, 16(r19)		C dst[i+2]
   141  	ornot	r31, r22, r22		C ~src[i+3]
   142  
   143  	stq	r22, 24(r19)		C dst[i+3]
   144  	ornot	r31, r23, r24		C ~src[i+4], carried in r24 to the next store
   145  
   146  	lda	r17, 32(r17)		C src += 4
   147  	lda	r19, 32(r19)		C dst += 4
   148  	unop
   149  	fbge	f0, L(top)		C loop while count >= 0
   150  
   151  
   152  L(done_1):
   153  	C r19	&dst[size-1]
   154  	C r24	result for dst[size-1]
   155  
   156  	stq	r24, 0(r19)		C dst[size-1]
   157  	ret	r31, (r26), 1
   158  
   159  
   160  L(done_2):
   161  	C r19	&dst[size-3]
   162  	C r20	src[size-2]
   163  	C r21	src[size-1]
   164  	C r24	result for dst[size-3]
   165  
   166  	stq	r24, 0(r19)		C dst[size-3]
   167  	ornot	r31, r20, r20		C ~src[size-2]
   168  
   169  	stq	r20, 8(r19)		C dst[size-2]
   170  	ornot	r31, r21, r21		C ~src[size-1]
   171  
   172  	stq	r21, 16(r19)		C dst[size-1]
   173  	ret	r31, (r26), 1
   174  
   175  EPILOGUE()
   176  ASM_END()