github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/sec_tabselect.asm

github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/sec_tabselect.asm (about)

     1  dnl  AMD64 SSE mpn_sec_tabselect.
     2  
     3  dnl  Contributed to the GNU project by Torbjörn Granlund.
     4  
     5  dnl  Copyright 2011-2013 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  
    36  C	     cycles/limb     cycles/limb     cycles/limb
    37  C	      ali,evn n	     unal,evn n	      other cases
    38  C AMD K8,K9	 1.65		1.65		 1.8
    39  C AMD K10	 0.78		0.78		 0.85
    40  C AMD bd1	 0.80		0.91		 1.25
    41  C AMD bobcat	 2.15		2.15		 2.37
    42  C Intel P4	 2.5		2.5		 2.95
    43  C Intel core2	 1.17		1.25		 1.25
    44  C Intel NHM	 0.87		0.90		 0.90
    45  C Intel SBR	 0.63		0.79		 0.77
    46  C Intel atom	 4.3		 4.3		 4.3	slower than plain code
    47  C VIA nano	 1.4		 5.1		 3.14	too alignment dependent
    48  
    49  C NOTES
    50  C  * We only honour the least significant 32 bits of the `which' and `nents'
    51  C    arguments to allow efficient code using just SSE2.  Honouring all 64
    52  C    bits would need the SSE4_1 pcmpeqq or some other SSE2 sequence.
    53  C  * We use movd for copying between xmm and plain registers, since old gas
    54  C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
    55  
    56  define(`rp',     `%rdi')	C destination; receives the selected limbs
    57  define(`tp',     `%rsi')	C table pointer; stepped by 8*n bytes per entry
    58  define(`n',      `%rdx')	C limbs per table entry
    59  define(`nents',  `%rcx')	C number of table entries; all of them are read
    60  define(`which',  `%r8')		C index of the entry to copy
    61  
    62  define(`i',      `%r10')	C entries left in the current pass over the table
    63  define(`j',      `%r9')		C n-8 at start, minus 8 per 8-limb block done
    64  
    65  C rax  rbx  rcx  rdx  rdi  rsi  rbp   r8   r9  r10  r11  r12  r13  r14  r15
    66  C tmp      nents  n   rp   tp        which j    i   temp  *    *    *    *
    67  
    68  ABI_SUPPORT(DOS64)
    69  ABI_SUPPORT(STD64)
    70  
    71  C mpn_sec_tabselect (rp, tp, n, nents, which)
    72  C
    73  C Copy entry number `which' of the table at tp to rp.  The table holds nents
    74  C entries of n limbs each.  Every entry is loaded and merged under a mask, so
    75  C neither the addresses touched nor the branches taken depend on `which'.
    76  C nents must be non-zero, since the entry loops test their counter only after
    77  C decrementing it.
    78  C
    79  C The limbs are handled in column blocks: 8 limbs per pass in the main loop,
    80  C then 4, 2 and 1 limbs according to the low bits of n.
    81  C
    82  C xmm0        mask, all ones while the running index equals `which'
    83  C xmm1        running entry index, 4 copies
    84  C xmm2, xmm3  table data
    85  C xmm4-xmm7   accumulated result
    86  C xmm8        4 copies of `which'
    87  C xmm9        4 copies of 1
    88  C
    89  C The DOS64 ABI makes xmm6-xmm15 callee-saved, so xmm6-xmm9 are spilled to the
    90  C stack on entry and reloaded on exit.  movdqu is used for the spills so that
    91  C nothing depends on the stack alignment.
    92  
    93  ASM_START()
    94  	TEXT
    95  	ALIGN(16)
    96  PROLOGUE(mpn_sec_tabselect)
    97  	FUNC_ENTRY(4)
    98  IFDOS(`	mov	56(%rsp), %r8d	')	C which arrives on the stack
    99  
   100  IFDOS(`	add	$-72, %rsp	')	C room for xmm6-xmm9, plus 8 bytes padding
   101  IFDOS(`	movdqu	%xmm6, 0(%rsp)	')
   102  IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
   103  IFDOS(`	movdqu	%xmm8, 32(%rsp)	')
   104  IFDOS(`	movdqu	%xmm9, 48(%rsp)	')
   105  
   106  	movd	which, %xmm8
   107  	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
   108  	mov	$1, R32(%rax)
   109  	movd	%rax, %xmm9
   110  	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1
   111  
   112  	mov	n, j
   113  	add	$-8, j
   114  	js	L(outer_end)		C fewer than 8 limbs
   115  
   116  L(outer_top):
   117  	mov	nents, i
   118  	mov	tp, %r11		C start of this column block
   119  	pxor	%xmm1, %xmm1		C entry index = 0
   120  	pxor	%xmm4, %xmm4
   121  	pxor	%xmm5, %xmm5
   122  	pxor	%xmm6, %xmm6
   123  	pxor	%xmm7, %xmm7
   124  	ALIGN(16)
   125  L(top):	movdqa	%xmm8, %xmm0
   126  	pcmpeqd	%xmm1, %xmm0		C mask = (index == which)
   127  	paddd	%xmm9, %xmm1		C index++
   128  	movdqu	0(tp), %xmm2
   129  	movdqu	16(tp), %xmm3
   130  	pand	%xmm0, %xmm2
   131  	pand	%xmm0, %xmm3
   132  	por	%xmm2, %xmm4
   133  	por	%xmm3, %xmm5
   134  	movdqu	32(tp), %xmm2
   135  	movdqu	48(tp), %xmm3
   136  	pand	%xmm0, %xmm2
   137  	pand	%xmm0, %xmm3
   138  	por	%xmm2, %xmm6
   139  	por	%xmm3, %xmm7
   140  	lea	(tp,n,8), tp		C same columns, next entry
   141  	add	$-1, i
   142  	jne	L(top)
   143  
   144  	movdqu	%xmm4, 0(rp)
   145  	movdqu	%xmm5, 16(rp)
   146  	movdqu	%xmm6, 32(rp)
   147  	movdqu	%xmm7, 48(rp)
   148  
   149  	lea	64(%r11), tp		C first entry again, next 8 limbs
   150  	lea	64(rp), rp
   151  	add	$-8, j
   152  	jns	L(outer_top)
   153  L(outer_end):
   154  
   155  	test	$4, R8(n)
   156  	je	L(b0xx)
   157  L(b1xx):mov	nents, i		C 4 more limbs
   158  	mov	tp, %r11
   159  	pxor	%xmm1, %xmm1
   160  	pxor	%xmm4, %xmm4
   161  	pxor	%xmm5, %xmm5
   162  	ALIGN(16)
   163  L(tp4):	movdqa	%xmm8, %xmm0
   164  	pcmpeqd	%xmm1, %xmm0
   165  	paddd	%xmm9, %xmm1
   166  	movdqu	0(tp), %xmm2
   167  	movdqu	16(tp), %xmm3
   168  	pand	%xmm0, %xmm2
   169  	pand	%xmm0, %xmm3
   170  	por	%xmm2, %xmm4
   171  	por	%xmm3, %xmm5
   172  	lea	(tp,n,8), tp
   173  	add	$-1, i
   174  	jne	L(tp4)
   175  	movdqu	%xmm4, 0(rp)
   176  	movdqu	%xmm5, 16(rp)
   177  	lea	32(%r11), tp
   178  	lea	32(rp), rp
   179  
   180  L(b0xx):test	$2, R8(n)
   181  	je	L(b00x)
   182  L(b01x):mov	nents, i		C 2 more limbs
   183  	mov	tp, %r11
   184  	pxor	%xmm1, %xmm1
   185  	pxor	%xmm4, %xmm4
   186  	ALIGN(16)
   187  L(tp2):	movdqa	%xmm8, %xmm0
   188  	pcmpeqd	%xmm1, %xmm0
   189  	paddd	%xmm9, %xmm1
   190  	movdqu	0(tp), %xmm2
   191  	pand	%xmm0, %xmm2
   192  	por	%xmm2, %xmm4
   193  	lea	(tp,n,8), tp
   194  	add	$-1, i
   195  	jne	L(tp2)
   196  	movdqu	%xmm4, 0(rp)
   197  	lea	16(%r11), tp
   198  	lea	16(rp), rp
   199  
   200  L(b00x):test	$1, R8(n)
   201  	je	L(b000)
   202  L(b001):mov	nents, i		C 1 more limb
   203  	mov	tp, %r11
   204  	pxor	%xmm1, %xmm1
   205  	pxor	%xmm4, %xmm4
   206  	ALIGN(16)
   207  L(tp1):	movdqa	%xmm8, %xmm0
   208  	pcmpeqd	%xmm1, %xmm0
   209  	paddd	%xmm9, %xmm1
   210  	movq	0(tp), %xmm2		C load a single limb
   211  	pand	%xmm0, %xmm2
   212  	por	%xmm2, %xmm4
   213  	lea	(tp,n,8), tp
   214  	add	$-1, i
   215  	jne	L(tp1)
   216  	movq	%xmm4, 0(rp)
   217  
   218  L(b000):
   219  IFDOS(`	movdqu	0(%rsp), %xmm6	')	C reload the caller's xmm6-xmm9
   220  IFDOS(`	movdqu	16(%rsp), %xmm7	')
   221  IFDOS(`	movdqu	32(%rsp), %xmm8	')
   222  IFDOS(`	movdqu	48(%rsp), %xmm9	')
   223  IFDOS(`	add	$72, %rsp	')
   224  	FUNC_EXIT()
   225  	ret
   226  EPILOGUE()