github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/fastsse/com-palignr.asm

dnl  AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C AMD K8,K9	 2.0		 illop		1.0/1.0		N
C AMD K10	 0.85		 illop				Y/N
C AMD bull	 1.39		 ? 1.45				Y/N
C AMD pile     0.8-1.4	       0.7-1.4				Y
C AMD steam
C AMD excavator
C AMD bobcat	 1.97		 ? 8.17		1.5/1.5		N
C AMD jaguar	 1.02		 1.02		0.91/0.91	N
C Intel P4	 2.26		 illop				Y/N
C Intel core	 0.52		 0.95		opt/0.74	Y
C Intel NHM	 0.52		 0.65		opt/opt		Y
C Intel SBR	 0.51		 0.65		opt/opt		Y
C Intel IBR	 0.50		 0.64		opt/0.57	Y
C Intel HWL	 0.51		 0.58		opt/opt		Y
C Intel BWL	 0.57		 0.69		opt/0.65	Y
C Intel atom	 1.16		 1.70		opt/opt		Y
C Intel SLM	 1.02		 1.52				N
C VIA nano	 1.09		 1.10		opt/opt		Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
C from the x86_64 default code.
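C
C In C terms, mpn_com stores the one's complement of each of the n source
C limbs, i.e. rp[i] = ~up[i] for 0 <= i < n.  A minimal sketch of that
C semantics (not GMP's actual generic code) is:
C
C	void
C	mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  while (n-- != 0)
C	    *rp++ = ~*up++;
C	}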

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
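C (Its load form needs no 66h operand-size prefix, so movaps is one byte
C shorter than movdqa or movapd.)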
define(`movdqa', ``movaps'')

ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_com)
	FUNC_ENTRY(3)

	cmp	$COM_SSE_THRESHOLD, n
	jbe	L(bc)

	pcmpeqb	%xmm7, %xmm7		C set to 111...111

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(rp_aligned)		C jump if rp aligned

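C Complement one limb so that rp becomes 16-byte aligned.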
	mov	(up), %r8
	lea	8(up), up
	not	%r8
	mov	%r8, (rp)
	lea	8(rp), rp
	dec	n

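C rp is now 16-byte aligned.  If up is 8 (mod 16), we must go through the
C palignr code at L(uent); otherwise up is 16-byte aligned as well and the
C plain aligned loop below applies.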
L(rp_aligned):
	test	$8, R8(up)
	jnz	L(uent)

ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

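C Main aligned loop: complement 8 limbs (64 bytes) per iteration.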
	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	pxor	%xmm7, %xmm2
	pxor	%xmm7, %xmm3
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

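C Fewer than 8 limbs remain; handle 4, then 2, then a final limb.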
	test	$4, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	pxor	%xmm7, %xmm0
	pxor	%xmm7, %xmm1
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
	sub	rp, %rax
	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
	jbe	L(bc)			C deflect to plain loop
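C (%rax = up - rp - 40, so the single unsigned compare branches to L(bc)
C exactly when 5 <= up - rp <= 15 limbs, the overlap cases the palignr code
C cannot handle.)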

	sub	$16, n
	jc	L(uend)

	movdqa	120(up), %xmm3

	sub	$16, n
	jmp	L(um)

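C Main unaligned loop: complement 16 limbs (128 bytes) per iteration.  All
C movdqa loads are 16-byte aligned (up = 8 mod 16 relative to the aligned rp),
C and palignr stitches each pair of adjacent loads into the 16 bytes destined
C for an aligned store.  The block is traversed from the top down, and the
C store of its lowest 16 bytes is deferred to the next iteration (or to the
C code just after the loop).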
	ALIGN(16)
L(utop):movdqa	120(up), %xmm3
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, -128(rp)
	sub	$16, n
L(um):	movdqa	104(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	88(up), %xmm1
	pxor	%xmm7, %xmm3
	movdqa	%xmm3, 112(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	72(up), %xmm0
	pxor	%xmm7, %xmm2
	movdqa	%xmm2, 96(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	56(up), %xmm3
	pxor	%xmm7, %xmm1
	movdqa	%xmm1, 80(rp)
	palignr($8, %xmm3, %xmm0)
	movdqa	40(up), %xmm2
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	pxor	%xmm7, %xmm3
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	pxor	%xmm7, %xmm2
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm7, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	128(up), up
	lea	128(rp), rp
	jnc	L(utop)

	pxor	%xmm7, %xmm0
	movdqa	%xmm0, -128(rp)

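C Handle the remaining n mod 16 limbs: 8, then 4, then 2, then a final limb.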
L(uend):test	$8, R8(n)
	jz	1f
	movdqa	56(up), %xmm3
	movdqa	40(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	pxor	%xmm7, %xmm3
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	pxor	%xmm7, %xmm2
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm7, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	64(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	test	$4, R8(n)
	jz	1f
	movdqa	24(up), %xmm1
	movdqa	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	pxor	%xmm7, %xmm1
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	test	$2, R8(n)
	jz	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	pxor	%xmm7, %xmm0
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small-operand speed, not for correctness as
C the above code is currently written.

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

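C Unrolled 64-bit loop, complementing 4 limbs per iteration.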
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	ALIGN(16)')
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	not	%r8
	not	%r9
	not	%r10
	not	%r11
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	test	$1, R8(n)
	jz	1f
	mov	(up), %r8
	not	%r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	test	$2, R8(n)
	jz	1f
	mov	(up), %r8
	mov	8(up), %r9
	not	%r8
	not	%r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()