github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/copyi-palignr.asm

dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                 cycles/limb   cycles/limb   cycles/limb     good
C                  aligned       unaligned     best seen     for cpu?
C AMD K8,K9         2.0           illop         1.0/1.0         N
C AMD K10           0.85          illop                         Y/N
C AMD bull          0.70          0.66                          Y
C AMD pile          0.68          0.66                          Y
C AMD steam         ?             ?
C AMD excavator     ?             ?
C AMD bobcat        1.97          8.16          1.5/1.5         N
C AMD jaguar        0.77          0.93          0.65/opt        N/Y
C Intel P4          2.26          illop                         Y/N
C Intel core        0.52          0.64          opt/opt         Y
C Intel NHM         0.52          0.71          opt/opt         Y
C Intel SBR         0.51          0.54          opt/0.51        Y
C Intel IBR         0.50          0.54          opt/opt         Y
C Intel HWL         0.50          0.51          opt/opt         Y
C Intel BWL         0.55          0.55          opt/opt         Y
C Intel atom        1.16          1.61          opt/opt         Y
C Intel SLM         1.02          1.07          opt/opt         Y
C VIA nano          1.09          1.08          opt/opt         Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
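
C A sketch of the 8-byte palignr step used in the unaligned path below, for
C two generic xmm registers a and b (AT&T operand order, b is the
C destination):
C
C       palignr($8, a, b)       C concatenate b:a into a 32-byte value,
C                               C shift right by 8 bytes, keep the low 16:
C                               C   new low  qword of b = old high qword of a
C                               C   new high qword of b = old low  qword of b
C
C Two source limbs that straddle a 16-byte boundary at up are thereby merged
C into one 16-byte value that an aligned movdqa can store at rp.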

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
dnl define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyi)
        FUNC_ENTRY(3)

        cmp     $COPYI_SSE_THRESHOLD, n
        jbe     L(bc)

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(rp_aligned)           C jump if rp aligned

        movsq                           C copy one limb
        dec     n

L(rp_aligned):
        test    $8, R8(up)
        jnz     L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  0(up), %xmm0
        movdqa  16(up), %xmm1
        movdqa  32(up), %xmm2
        movdqa  48(up), %xmm3
        lea     64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        test    $4, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        movdqa  16(up), %xmm1
        lea     32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp

1:      test    $2, R8(n)
        jz      1f
        movdqa  (up), %xmm0
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):
C Code handling up - rp = 8 (mod 16)

        cmp     $16, n
        jc      L(ued0)

IFDOS(` add     $-56, %rsp      ')
IFDOS(` movdqa  %xmm6, (%rsp)   ')
IFDOS(` movdqa  %xmm7, 16(%rsp) ')
IFDOS(` movdqa  %xmm8, 32(%rsp) ')

        movaps  120(up), %xmm7
        movaps  104(up), %xmm6
        movaps  88(up), %xmm5
        movaps  72(up), %xmm4
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        lea     128(up), up
        sub     $32, n
        jc      L(ued1)

        ALIGN(16)
L(utop):movaps  -104(up), %xmm1
        sub     $16, n
        movaps  -120(up), %xmm0
        palignr($8, %xmm6, %xmm7)
        movaps  -136(up), %xmm8
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movaps  120(up), %xmm7
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movaps  104(up), %xmm6
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movaps  88(up), %xmm5
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movaps  72(up), %xmm4
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movaps  56(up), %xmm3
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movaps  40(up), %xmm2
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        lea     128(up), up
        movdqa  %xmm0, (rp)
        lea     128(rp), rp
        jnc     L(utop)

L(ued1):movaps  -104(up), %xmm1
        movaps  -120(up), %xmm0
        movaps  -136(up), %xmm8
        palignr($8, %xmm6, %xmm7)
        movdqa  %xmm7, 112(rp)
        palignr($8, %xmm5, %xmm6)
        movdqa  %xmm6, 96(rp)
        palignr($8, %xmm4, %xmm5)
        movdqa  %xmm5, 80(rp)
        palignr($8, %xmm3, %xmm4)
        movdqa  %xmm4, 64(rp)
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm8, %xmm0)
        movdqa  %xmm0, (rp)
        lea     128(rp), rp

IFDOS(` movdqa  (%rsp), %xmm6   ')
IFDOS(` movdqa  16(%rsp), %xmm7 ')
IFDOS(` movdqa  32(%rsp), %xmm8 ')
IFDOS(` add     $56, %rsp       ')

L(ued0):test    $8, R8(n)
        jz      1f
        movaps  56(up), %xmm3
        movaps  40(up), %xmm2
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        movaps  -8(up), %xmm4
        palignr($8, %xmm2, %xmm3)
        movdqa  %xmm3, 48(rp)
        palignr($8, %xmm1, %xmm2)
        movdqa  %xmm2, 32(rp)
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm4, %xmm0)
        lea     64(up), up
        movdqa  %xmm0, (rp)
        lea     64(rp), rp

1:      test    $4, R8(n)
        jz      1f
        movaps  24(up), %xmm1
        movaps  8(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movaps  -8(up), %xmm3
        movdqa  %xmm1, 16(rp)
        palignr($8, %xmm3, %xmm0)
        lea     32(up), up
        movdqa  %xmm0, (rp)
        lea     32(rp), rp

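C Copy a remaining 2-limb chunk, again recombined with palignr, and then a
C possible final odd limb with a plain 64-bit mov.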
1:      test    $2, R8(n)
        jz      1f
        movdqa  8(up), %xmm0
        movdqa  -8(up), %xmm3
        palignr($8, %xmm3, %xmm0)
        lea     16(up), up
        movdqa  %xmm0, (rp)
        lea     16(rp), rp

1:      test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.

L(bc):  lea     -8(rp), rp
        sub     $4, R32(n)
        jc      L(end)

        ALIGN(16)
L(top): mov     (up), %r8
        mov     8(up), %r9
        lea     32(rp), rp
        mov     16(up), %r10
        mov     24(up), %r11
        lea     32(up), up
        mov     %r8, -24(rp)
        mov     %r9, -16(rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, -8(rp)
        mov     %r11, (rp)
ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): test    $1, R8(n)
        jz      1f
        mov     (up), %r8
        mov     %r8, 8(rp)
        lea     8(rp), rp
        lea     8(up), up
1:      test    $2, R8(n)
        jz      1f
        mov     (up), %r8
        mov     8(up), %r9
        mov     %r8, 8(rp)
        mov     %r9, 16(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
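
C All of the paths above implement the same limb copy.  As a minimal
C reference sketch in C (using GMP's mp_limb_t and mp_size_t types), the
C operation performed by mpn_copyi is:
C
C       void
C       mpn_copyi (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C       {
C         mp_size_t i;
C         for (i = 0; i < n; i++)       /* low limbs copied first */
C           rp[i] = up[i];
C       }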