github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/s390_64/lshift.asm (about)

     1  dnl  S/390-64 mpn_lshift.
     2  
     3  dnl  Copyright 2011, 2012, 2014 Free Software Foundation, Inc.
     4  
     5  dnl  This file is part of the GNU MP Library.
     6  dnl
     7  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
     8  dnl  it under the terms of either:
     9  dnl
    10  dnl    * the GNU Lesser General Public License as published by the Free
    11  dnl      Software Foundation; either version 3 of the License, or (at your
    12  dnl      option) any later version.
    13  dnl
    14  dnl  or
    15  dnl
    16  dnl    * the GNU General Public License as published by the Free Software
    17  dnl      Foundation; either version 2 of the License, or (at your option) any
    18  dnl      later version.
    19  dnl
    20  dnl  or both in parallel, as here.
    21  dnl
    22  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    23  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    24  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    25  dnl  for more details.
    26  dnl
    27  dnl  You should have received copies of the GNU General Public License and the
    28  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    29  dnl  see https://www.gnu.org/licenses/.
    30  
    31  include(`../config.m4')
    32  
    33  C            cycles/limb
    34  C z900		 7
    35  C z990           3
    36  C z9		 ?
    37  C z10		 6
    38  C z196		 ?
    39  
    40  C NOTES
    41  C  * This uses discrete loads and stores in a software pipeline.  Using lmg and
    42  C    stmg is not faster.
    43  C  * One could assume more pipelining could approach 2.5 c/l, but we have not
    44  C    found any 8-way loop that runs better than the current 4-way loop.
    45  C  * Consider using the same feed-in code for 1 <= n <= 3 as for n mod 4,
    46  C    similarly to the x86_64 sqr_basecase feed-in.
    47  
    48  C INPUT PARAMETERS
    49  define(`rp',	`%r2')	C destination limb pointer; %r2 also carries the return value
    50  define(`up',	`%r3')	C source limb pointer
    51  define(`n',	`%r4')	C operand size in limbs
    52  define(`cnt',	`%r5')	C shift count in bits
    53  
    54  define(`tnc',	`%r6')	C negated shift count, computed in mpn_lshift with lcgr
    55  
    56  ASM_START()
    57  PROLOGUE(mpn_lshift)
        C Shift the n-limb operand at up left by cnt bits, store the n result
        C limbs at rp, and return in %r2 the bits shifted out of the most
        C significant limb.  Limbs are 8 bytes each and are processed from the
        C most significant end downwards; the lowest result limb gets zeros in
        C its low cnt bits.
        C
        C tnc is set to -cnt.  NOTE(review): this relies on srlg/sllg using only
        C the low 6 bits of the shift-amount operand, so that -cnt acts as
        C 64-cnt.  Assumes 1 <= cnt <= 63 and n >= 1 -- TODO confirm against
        C callers; n = 0 would branch to the slot just before L(tab).
        C
        C Sizes 1..3 are dispatched through the jump table at L(tab).  Larger
        C sizes use the 4-way unrolled, software-pipelined loop at L(top),
        C entered at L(lm0)..L(lm3) according to n mod 4.  In that loop:
        C   r7, r8        source limbs loaded ahead of use
        C   r10, r11, r13 left-shifted limbs waiting for the bits from below
        C   r4, r12       right-shifted bits to be OR-ed into the limb above
        C   r9            return value,  r0  loop counter
    58  	cghi	n, 3
    59  	jh	L(gt1)			C n > 3: use the unrolled loop
    60  
    61  	stmg	%r6, %r7, 48(%r15)	C save r6 (tnc) and r7 (scratch)
    62  	larl	%r1, L(tab)-4		C table base, biased since n starts at 1
    63  	lcgr	tnc, cnt		C tnc = -cnt
    64  	sllg	n, n, 2			C n *= 4, table entries are 4 bytes apart
    65  	b	0(n,%r1)		C jump to entry n-1 of L(tab)
    66  L(tab):	j	L(n1)
    67  	j	L(n2)
    68  	j	L(n3)
    69  
    70  L(n1):	lg	%r1, 0(up)		C r1 = up[0], the only limb
    71  	sllg	%r0, %r1, 0(cnt)
    72  	stg	%r0, 0(rp)		C rp[0] = up[0] << cnt
    73  	srlg	%r2, %r1, 0(tnc)	C return value; overwrites rp, no longer needed
    74  	lg	%r6, 48(%r15)		C restoring r7 not needed
    75  	br	%r14
    76  
    77  L(n2):	lg	%r1, 8(up)		C r1 = up[1], the top limb
    78  	srlg	%r4, %r1, 0(tnc)	C r4 = bits shifted out (return value)
    79  	sllg	%r0, %r1, 0(cnt)	C r0 = high part of rp[1]
    80  	j	L(cj)			C finish with the shared low-limb code
    81  
    82  L(n3):	lg	%r1, 16(up)		C r1 = up[2], the top limb
    83  	srlg	%r4, %r1, 0(tnc)	C r4 = bits shifted out (return value)
    84  	sllg	%r0, %r1, 0(cnt)	C r0 = high part of rp[2]
    85  	lg	%r1, 8(up)		C r1 = up[1]
    86  	srlg	%r7, %r1, 0(tnc)	C bits of up[1] that move into rp[2]
    87  	ogr	%r7, %r0
    88  	sllg	%r0, %r1, 0(cnt)	C r0 = high part of rp[1]
    89  	stg	%r7, 16(rp)		C store rp[2]
    90  L(cj):	lg	%r1, 0(up)		C r1 = up[0]
    91  	srlg	%r7, %r1, 0(tnc)	C bits of up[0] that move into rp[1]
    92  	ogr	%r7, %r0		C merge with the high part from the limb above
    93  	sllg	%r0, %r1, 0(cnt)	C r0 = rp[0], low cnt bits are zero
    94  	stg	%r7, 8(rp)		C store rp[1]
    95  	stg	%r0, 0(rp)		C store rp[0]
    96  	lgr	%r2, %r4		C return the shifted-out bits
    97  	lmg	%r6, %r7, 48(%r15)	C restore r6, r7
    98  	br	%r14
    99  
   100  L(gt1):	stmg	%r6, %r13, 48(%r15)	C save r6..r13, all of them are used below
   101  	lcgr	tnc, cnt		C tnc = -cnt
   102  
   103  	sllg	%r1, n, 3		C r1 = 8*n, operand size in bytes
   104  	srlg	%r0, n, 2		C loop count = n / 4, rounded down
   105  
   106  	agr	up, %r1			C point up at end of U
   107  	agr	rp, %r1			C point rp at end of R
   108  	aghi	up, -56			C bias: 48(up) is now the top source limb
   109  	aghi	rp, -40			C bias: each feed-in adds its own offset to rp
   110  
   111  	lghi	%r7, 3
   112  	ngr	%r7, n			C r7 = n mod 4, condition code set by the AND
   113  	je	L(b0)			C n mod 4 = 0
   114  	cghi	%r7, 2
   115  	jl	L(b1)			C n mod 4 = 1
   116  	je	L(b2)			C n mod 4 = 2
   117  C n mod 4 = 3 falls through to L(b3)
   118  L(b3):	lg	%r7, 48(up)		C top source limb
   119  	srlg	%r9, %r7, 0(tnc)	C r9 = bits shifted out (return value)
   120  	sllg	%r11, %r7, 0(cnt)
   121  	lg	%r8, 40(up)		C next two limbs below the top
   122  	lg	%r7, 32(up)
   123  	srlg	%r4, %r8, 0(tnc)
   124  	sllg	%r13, %r8, 0(cnt)	C completed and stored after L(lm2)
   125  	ogr	%r11, %r4		C r11 = top result limb, stored at L(lm3)
   126  	la	rp, 16(rp)		C align rp with the store offsets at L(lm3)
   127  	j	L(lm3)
   128  
   129  L(b2):	lg	%r8, 48(up)		C top source limb
   130  	lg	%r7, 40(up)		C limb below the top
   131  	srlg	%r9, %r8, 0(tnc)	C r9 = bits shifted out (return value)
   132  	sllg	%r13, %r8, 0(cnt)	C completed and stored after L(lm2)
   133  	la	rp, 24(rp)		C align rp and up with the offsets at L(lm2)
   134  	la	up, 8(up)
   135  	j	L(lm2)
   136  
   137  L(b1):	lg	%r7, 48(up)		C top source limb
   138  	srlg	%r9, %r7, 0(tnc)	C r9 = bits shifted out (return value)
   139  	sllg	%r11, %r7, 0(cnt)
   140  	lg	%r8, 40(up)		C next two limbs below the top
   141  	lg	%r7, 32(up)
   142  	srlg	%r4, %r8, 0(tnc)
   143  	sllg	%r10, %r8, 0(cnt)	C completed after L(lm0), stored on the next pass
   144  	ogr	%r11, %r4		C r11 = top result limb, stored at L(lm1)
   145  	la	rp, 32(rp)		C align rp and up with the offsets at L(lm1)
   146  	la	up, 16(up)
   147  	j	L(lm1)
   148  
   149  L(b0):	lg	%r8, 48(up)		C top source limb
   150  	lg	%r7, 40(up)		C limb below the top
   151  	srlg	%r9, %r8, 0(tnc)	C r9 = bits shifted out (return value)
   152  	sllg	%r10, %r8, 0(cnt)	C completed after L(lm0), stored on the next pass
   153  	la	rp, 40(rp)		C align rp and up with the offsets at L(lm0)
   154  	la	up, 24(up)
   155  	j	L(lm0)
   156  
   157  	ALIGN(8)
   158  L(top):	srlg	%r4, %r8, 0(tnc)	C r8, r7 were loaded at the end of the previous pass
   159  	sllg	%r13, %r8, 0(cnt)
   160  	ogr	%r11, %r4		C complete the limb started after L(lm0)
   161  	stg	%r10, 24(rp)		C r10 was completed just before the brctg
   162  L(lm3):	stg	%r11, 16(rp)
   163  L(lm2):	srlg	%r12, %r7, 0(tnc)
   164  	sllg	%r11, %r7, 0(cnt)
   165  	lg	%r8, 24(up)		C load the next two lower source limbs
   166  	lg	%r7, 16(up)
   167  	ogr	%r13, %r12		C complete r13 with the bits from the limb below
   168  	srlg	%r4, %r8, 0(tnc)
   169  	sllg	%r10, %r8, 0(cnt)
   170  	ogr	%r11, %r4
   171  	stg	%r13, 8(rp)
   172  L(lm1):	stg	%r11, 0(rp)
   173  L(lm0):	srlg	%r12, %r7, 0(tnc)
   174  	aghi	rp, -32			C step rp down by four limbs
   175  	sllg	%r11, %r7, 0(cnt)	C completed at L(top) or L(end)
   176  	lg	%r8, 8(up)		C load two more limbs before stepping up
   177  	lg	%r7, 0(up)
   178  	aghi	up, -32			C step up down by four limbs
   179  	ogr	%r10, %r12		C r10 complete, stored at L(top) or L(end)
   180  	brctg	%r0, L(top)		C decrement r0, loop while it is nonzero
   181  C wind-down: r8 and r7 hold the two lowest source limbs, four stores remain
   182  L(end):	srlg	%r4, %r8, 0(tnc)
   183  	sllg	%r13, %r8, 0(cnt)
   184  	ogr	%r11, %r4
   185  	stg	%r10, 24(rp)
   186  	stg	%r11, 16(rp)
   187  	srlg	%r12, %r7, 0(tnc)
   188  	sllg	%r11, %r7, 0(cnt)	C lowest result limb, nothing is OR-ed in from below
   189  	ogr	%r13, %r12
   190  	stg	%r13, 8(rp)
   191  	stg	%r11, 0(rp)
   192  	lgr	%r2, %r9		C return the bits shifted out of the top limb
   193  
   194  	lmg	%r6, %r13, 48(%r15)	C restore r6..r13
   195  	br	%r14
   196  EPILOGUE()