github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/sec_tabselect.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/sec_tabselect.asm (about)

     1  dnl  AMD64 mpn_sec_tabselect.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2011-2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C	     cycles/limb          good for cpu
    37  C AMD K8,K9	 1.5			Y
    38  C AMD K10	 1.4
    39  C AMD bd1	 2.64
    40  C AMD bobcat	 2.15			Y
    41  C Intel P4	 4
    42  C Intel core2	 1.38
    43  C Intel NHM	 1.75
    44  C Intel SBR	 1.25
    45  C Intel atom	 2.5			Y
    46  C VIA nano	 1.75			Y
    47  
    48  C NOTES
    49  C  * This has not been tuned for any specific processor.  Its speed should not
    50  C    be too bad, though.
    51  C  * Using SSE2/AVX2 could result in many-fold speedup.
    52  C  * Handles any n: a 4-limb main loop plus 2-limb and 1-limb tail loops.
    53  
    54  C mpn_sec_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
    55  define(`rp',     `%rdi')	C result area; n limbs are stored here
    56  define(`tp',     `%rsi')	C table pointer; consecutive entries are n limbs apart
    57  define(`n',      `%rdx')	C limbs per entry
    58  define(`nents',  `%rcx')	C number of entries scanned
    59  define(`which',  `%r8')	C number of the entry to select
    60  C Loop counters.
    61  define(`i',      `%rbp')	C entries left in the current scan
    62  define(`j',      `%r9')	C starts at n-4, drops by 4 per main-loop pass
    63  
    64  C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
    65  C mask cnt nents  n   rp   tab   i   which j    *    *    *    *    *    *
    66  C * = temporaries: r10/r11 hold loaded limbs, r12-r15 accumulate the result.
    67  ABI_SUPPORT(DOS64)
    68  ABI_SUPPORT(STD64)
    69  C Store at rp the n-limb table entry number which; every entry is read and masked, none is skipped.
    70  ASM_START()
    71  	TEXT
    72  	ALIGN(16)
    73  PROLOGUE(mpn_sec_tabselect)
    74  	FUNC_ENTRY(4)
    75  IFDOS(`	mov	56(%rsp), %r8d	')	C DOS64 only: load which from the stack (low 32 bits)
    76  C Preserve the callee-saved registers used below (restored at L(b00)).
    77  	push	%rbx
    78  	push	%rbp
    79  	push	%r12
    80  	push	%r13
    81  	push	%r14
    82  	push	%r15
    83  C Main part: one outer pass per group of 4 limbs; skipped entirely when n < 4.
    84  	mov	n, j
    85  	add	$-4, j	C j = n - 4
    86  	js	L(outer_end)
    87  
    88  L(outer_top):
    89  	mov	nents, i	C i = entries still to scan
    90  	push	tp	C remember where this 4-limb column starts
    91  	xor	R32(%r12), R32(%r12)	C clear the four result accumulators
    92  	xor	R32(%r13), R32(%r13)
    93  	xor	R32(%r14), R32(%r14)
    94  	xor	R32(%r15), R32(%r15)
    95  	mov	which, %rbx	C rbx counts down from which, once per entry
    96  C In each scan the mask is all ones only for entry number which, so a which >= nents yields zero limbs.
    97  	ALIGN(16)
    98  L(top):	sub	$1, %rbx	C borrow occurs only when rbx was 0
    99  	sbb	%rax, %rax	C rax = all ones on borrow, else 0
   100  	mov	0(tp), %r10
   101  	mov	8(tp), %r11
   102  	and	%rax, %r10	C keep the loaded limbs only under the mask
   103  	and	%rax, %r11
   104  	or	%r10, %r12	C merge into the accumulators
   105  	or	%r11, %r13
   106  	mov	16(tp), %r10
   107  	mov	24(tp), %r11
   108  	and	%rax, %r10
   109  	and	%rax, %r11
   110  	or	%r10, %r14
   111  	or	%r11, %r15
   112  	lea	(tp,n,8), tp	C same column of the next entry (n limbs further on)
   113  	add	$-1, i
   114  	jne	L(top)
   115  
   116  	mov	%r12, 0(rp)	C store the 4 selected limbs
   117  	mov	%r13, 8(rp)
   118  	mov	%r14, 16(rp)
   119  	mov	%r15, 24(rp)
   120  	pop	tp	C back to the first entry
   121  	lea	32(tp), tp	C advance table and result by 4 limbs
   122  	lea	32(rp), rp
   123  	add	$-4, j
   124  	jns	L(outer_top)	C another pass while j has not gone negative
   125  L(outer_end):
   126  C Tail: the n mod 4 leftover limbs, as an optional 2-limb scan then an optional 1-limb scan.
   127  	test	$2, R8(n)
   128  	jz	L(b0x)
   129  L(b1x):	mov	nents, i
   130  	push	tp	C restored after the 2-limb scan
   131  	xor	R32(%r12), R32(%r12)
   132  	xor	R32(%r13), R32(%r13)
   133  	mov	which, %rbx
   134  	ALIGN(16)
   135  L(tp2):	sub	$1, %rbx	C same mask generation as in L(top)
   136  	sbb	%rax, %rax
   137  	mov	0(tp), %r10
   138  	mov	8(tp), %r11
   139  	and	%rax, %r10
   140  	and	%rax, %r11
   141  	or	%r10, %r12
   142  	or	%r11, %r13
   143  	lea	(tp,n,8), tp
   144  	add	$-1, i
   145  	jne	L(tp2)
   146  	mov	%r12, 0(rp)
   147  	mov	%r13, 8(rp)
   148  	pop	tp
   149  	lea	16(tp), tp	C advance table and result by 2 limbs
   150  	lea	16(rp), rp
   151  
   152  L(b0x):	test	$1, R8(n)
   153  	jz	L(b00)
   154  L(b01):	mov	nents, i
   155  	xor	R32(%r12), R32(%r12)	C tp is not saved here; it is not used after this scan
   156  	mov	which, %rbx
   157  	ALIGN(16)
   158  L(tp1):	sub	$1, %rbx	C same mask generation as in L(top)
   159  	sbb	%rax, %rax
   160  	mov	0(tp), %r10
   161  	and	%rax, %r10
   162  	or	%r10, %r12
   163  	lea	(tp,n,8), tp
   164  	add	$-1, i
   165  	jne	L(tp1)
   166  	mov	%r12, 0(rp)
   167  
   168  L(b00):	pop	%r15	C restore saved registers in reverse push order
   169  	pop	%r14
   170  	pop	%r13
   171  	pop	%r12
   172  	pop	%rbp
   173  	pop	%rbx
   174  	FUNC_EXIT()
   175  	ret
   176  EPILOGUE()