github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/ia64/hamdist.asm (about)

     1  dnl  IA-64 mpn_hamdist -- mpn hamming distance.
     2  
     3  dnl  Contributed to the GNU project by Torbjorn Granlund.
     4  
     5  dnl  Copyright 2003-2005 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  include(`../config.m4')
    34  
    35  C           cycles/limb
    36  C Itanium:       2
    37  C Itanium 2:     1
    38  
    39  C INPUT PARAMETERS
        C up and vp are the two limb pointers; the routine reads them with
        C post-incrementing ld8 (8 bytes per limb).  n is the limb count.
    40  define(`up', `r32')
    41  define(`vp', `r33')
    42  define(`n', `r34')
    43  
        C Working registers, organised as four pipeline slots K = 0..3:
        C   uK, vK  destinations of the two ld8 for one limb pair
        C   xK      uK xor vK
        C   cK      popcnt of xK
        C s (r8) is the running sum of the cK values and holds the result at
        C every br.ret.
    44  define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
    45  define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
    46  define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
    47  define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
    48  define(`s',`r8')
    49  
    50  
    51  ASM_START()
        C mpn_hamdist: leaves in s (r8) the sum, over the n limb pairs read
        C from up and vp, of popcnt(up[i] xor vp[i]).
        C
        C Structure:
        C   * The first limb pair is loaded unconditionally into r10/r11, then
        C     the code dispatches on n mod 4 to .Lb00, .Lb01, .Lb10 or .Lb11.
        C   * p15 is set when n > 4.  With p15 clear an entry handles its 1 to
        C     4 limbs without looping.  With p15 set it primes the pipeline
        C     and joins the 4-way unrolled loop (.Loop/.LL00/.LL11/.LL10) or
        C     its wind-down code (.Lend/.Lcj8/.Lcj7/.Lcj6/.Lcj5/.Lcj4).
        C   * Per limb the stages are: ld8 uK,vK; xor xK; popcnt cK; add to s.
        C   * ar.lc is saved in r2 and restored on the exit after .Lcj4.  The
        C     three early br.ret paths return before ar.lc has been written.
        C
        C NOTE(review): n = 0 is not special-cased; it would take the .Lb00
        C path and load four limb pairs.  Presumably callers guarantee n >= 1.
    52  PROLOGUE(mpn_hamdist)
    53  	.prologue
    54  ifdef(`HAVE_ABI_32',
    55  `	addp4		up = 0, up		C 32-bit ptr to 64-bit	M I
    56  	addp4		vp = 0, vp		C 32-bit ptr to 64-bit	M I
    57  	zxt4		n = n			C zero-extend count	I
    58  	;;
    59  ')
    60  
    61   {.mmi;	ld8		r10 = [up], 8		C load first ulimb	M01
    62  	ld8		r11 = [vp], 8		C load first vlimb	M01
    63  	mov.i		r2 = ar.lc		C save ar.lc		I0
    64  }{.mmi;	and		r14 = 3, n		C r14 = n mod 4		M I
    65  	cmp.lt		p15, p0 = 4, n		C p15 set if n > 4	M I
    66  	add		n = -5, n		C n - 5, shifted later	M I
    67  	;;
    68  }{.mmi;	cmp.eq		p6, p0 = 1, r14		C n mod 4 == 1?		M I
    69  	cmp.eq		p7, p0 = 2, r14		C n mod 4 == 2?		M I
    70  	cmp.eq		p8, p0 = 3, r14		C n mod 4 == 3?		M I
    71  }{.bbb
    72    (p6)	br.dptk		.Lb01			C			B
    73    (p7)	br.dptk		.Lb10			C			B
    74    (p8)	br.dptk		.Lb11			C			B
    75  }
    76  
    77  
        C n mod 4 == 0 (fall through).  Limb 0 is in r10/r11 and goes to slot
        C 0; limbs 1, 2, 3 are loaded here into slots 1, 2, 3.
    78  .Lb00:	ld8		u1 = [up], 8		C			M01
    79  	ld8		v1 = [vp], 8		C			M01
    80  	shr.u		n = n, 2		C n = (n-5) >> 2	I0
    81  	xor		x0 = r10, r11		C limb 0		M I
    82  	;;
    83  	ld8		u2 = [up], 8		C			M01
    84  	ld8		v2 = [vp], 8		C			M01
    85  	mov.i		ar.lc = n		C set even if p15 clear	I0
    86  	xor		x1 = u1, v1		C			M I
    87  	;;
    88  	ld8		u3 = [up], 8		C			M01
    89  	ld8		v3 = [vp], 8		C			M01
    90  	xor		x2 = u2, v2		C			M I
    91  	mov		s = 0			C			M I
    92    (p15)	br.cond.dptk	.grt4			C n > 4			B
    93  	;;
        C n == 4: popcnt slots 0-2 here; .Lcj4 does slot 3, the four adds and
        C the ar.lc restore.
    94  	popcnt		c0 = x0			C			I0
    95  	xor		x3 = u3, v3		C			M I
    96  	;;
    97  	popcnt		c1 = x1			C			I0
    98  	;;
    99  	popcnt		c2 = x2			C			I0
   100  	br		.Lcj4			C			B
   101  
        C n >= 8: load limbs 4-7 while slots 0-3 still hold limbs 0-3.
        C NOTE(review): the two xors into x1 and x2 below recompute values
        C already produced in .Lb00 from the same u1/v1 and u2/v2.
   102  .grt4:	ld8		u0 = [up], 8		C			M01
   103  	ld8		v0 = [vp], 8		C			M01
   104  	xor		x1 = u1, v1		C			M I
   105  	;;
   106  	ld8		u1 = [up], 8		C			M01
   107  	ld8		v1 = [vp], 8		C			M01
   108  	xor		x2 = u2, v2		C			M I
   109  	;;
   110  	ld8		u2 = [up], 8		C			M01
   111  	ld8		v2 = [vp], 8		C			M01
   112  	popcnt		c0 = x0			C			I0
   113  	xor		x3 = u3, v3		C			M I
   114  	;;
   115  	ld8		u3 = [up], 8		C			M01
   116  	ld8		v3 = [vp], 8		C			M01
   117  	popcnt		c1 = x1			C			I0
   118  	xor		x0 = u0, v0		C			M I
   119  	br.cloop.dpnt	.grt8			C taken if n >= 12	B
   120  
        C n == 8: nothing left to load, drain from .Lcj8.
   121  	popcnt		c2 = x2			C			I0
   122  	xor		x1 = u1, v1		C			M I
   123  	br		.Lcj8			C			B
   124  
        C Same work as the first group of .Loop minus the add of c3, which has
        C not been computed yet on this path; then join the loop at .LL00.
   125  .grt8:	ld8		u0 = [up], 8		C			M01
   126  	ld8		v0 = [vp], 8		C			M01
   127  	popcnt		c2 = x2			C			I0
   128  	xor		x1 = u1, v1		C			M I
   129  	br		.LL00			C			B
   130  
   131  
        C n mod 4 == 1.  Limb 0 goes to slot 3.
   132  .Lb01:	xor		x3 = r10, r11		C limb 0		M I
   133  	shr.u		n = n, 2		C n = (n-5) >> 2	I0
   134    (p15)	br.cond.dptk	.grt1			C n > 4			B
   135  	;;
        C n == 1: single popcnt, return.  ar.lc has not been written.
   136  	popcnt		r8 = x3			C			I0
   137  	br.ret.sptk.many b0			C			B
   138  
        C n >= 5: limbs 1-4 are loaded into slots 0-3.  Slot 3 is reloaded
        C while x3 still holds the xor of limb 0.
   139  .grt1:	ld8		u0 = [up], 8		C			M01
   140  	ld8		v0 = [vp], 8		C			M01
   141  	mov.i		ar.lc = n		C loop count		I0
   142  	;;
   143  	ld8		u1 = [up], 8		C			M01
   144  	ld8		v1 = [vp], 8		C			M01
   145  	mov		s = 0			C			M I
   146  	;;
   147  	ld8		u2 = [up], 8		C			M01
   148  	ld8		v2 = [vp], 8		C			M01
   149  	;;
   150  	ld8		u3 = [up], 8		C			M01
   151  	ld8		v3 = [vp], 8		C			M01
   152  	xor		x0 = u0, v0		C			M I
   153  	br.cloop.dpnt	.grt5			C taken if n >= 9	B
   154  
        C n == 5: no more loads; finish the xors and join the drain at .Lcj5.
   155  	xor		x1 = u1, v1		C			M I
   156  	;;
   157  	popcnt		c3 = x3			C limb 0		I0
   158  	xor		x2 = u2, v2		C			M I
   159  	;;
   160  	popcnt		c0 = x0			C			I0
   161  	xor		x3 = u3, v3		C			M I
   162  	;;
   163  	popcnt		c1 = x1			C			I0
   164  	br		.Lcj5			C			B
   165  
        C n >= 9: load four more limb pairs, then either enter the loop at its
        C top or go straight to the wind-down code.
   166  .grt5:	ld8		u0 = [up], 8		C			M01
   167  	ld8		v0 = [vp], 8		C			M01
   168  	xor		x1 = u1, v1		C			M I
   169  	;;
   170  	ld8		u1 = [up], 8		C			M01
   171  	ld8		v1 = [vp], 8		C			M01
   172  	popcnt		c3 = x3			C limb 0		I0
   173  	xor		x2 = u2, v2		C			M I
   174  	;;
   175  	ld8		u2 = [up], 8		C			M01
   176  	ld8		v2 = [vp], 8		C			M01
   177  	popcnt		c0 = x0			C			I0
   178  	xor		x3 = u3, v3		C			M I
   179  	;;
   180  	ld8		u3 = [up], 8		C			M01
   181  	ld8		v3 = [vp], 8		C			M01
   182  	popcnt		c1 = x1			C			I0
   183  	xor		x0 = u0, v0		C			M I
   184  	br.cloop.dpnt	.Loop			C taken if n >= 13	B
   185  	br		.Lend			C n == 9		B
   186  
   187  
        C n mod 4 == 2.  Limb 0 goes to slot 2, limb 1 to slot 3.
   188  .Lb10:	ld8		u3 = [up], 8		C limb 1		M01
   189  	ld8		v3 = [vp], 8		C limb 1		M01
   190  	xor		x2 = r10, r11		C limb 0		M I
   191    (p15)	br.cond.dptk	.grt2			C n > 4			B
   192  	;;
        C n == 2: two popcnts, one add, return.  ar.lc has not been written.
   193  	xor		x3 = u3, v3		C			M I
   194  	;;
   195  	popcnt		c2 = x2			C			I0
   196  	;;
   197  	popcnt		c3 = x3			C			I0
   198  	;;
   199  	add		s = c2, c3		C			M I
   200  	br.ret.sptk.many b0			C			B
   201  
        C n >= 6: limbs 2-5 are loaded into slots 0-3.
   202  .grt2:	ld8		u0 = [up], 8		C			M01
   203  	ld8		v0 = [vp], 8		C			M01
   204  	shr.u		n = n, 2		C n = (n-5) >> 2	I0
   205  	;;
   206  	ld8		u1 = [up], 8		C			M01
   207  	ld8		v1 = [vp], 8		C			M01
   208  	mov.i		ar.lc = n		C loop count		I0
   209  	mov		s = 0			C			M I
   210  	;;
   211  	ld8		u2 = [up], 8		C			M01
   212  	ld8		v2 = [vp], 8		C			M01
   213  	xor		x3 = u3, v3		C limb 1		M I
   214  	;;
   215  	ld8		u3 = [up], 8		C			M01
   216  	ld8		v3 = [vp], 8		C			M01
   217  	xor		x0 = u0, v0		C			M I
   218  	br.cloop.dptk	.grt6			C taken if n >= 10	B
   219  
        C n == 6: no more loads; join the drain at .Lcj6.
   220  	popcnt		c2 = x2			C limb 0		I0
   221  	xor		x1 = u1, v1		C			M I
   222  	;;
   223  	popcnt		c3 = x3			C limb 1		I0
   224  	xor		x2 = u2, v2		C			M I
   225  	;;
   226  	popcnt		c0 = x0			C			I0
   227  	xor		x3 = u3, v3		C			M I
   228  	br		.Lcj6			C			B
   229  
        C n >= 10: load three more limb pairs and join the loop at .LL10.
   230  .grt6:	ld8		u0 = [up], 8		C			M01
   231  	ld8		v0 = [vp], 8		C			M01
   232  	popcnt		c2 = x2			C limb 0		I0
   233  	xor		x1 = u1, v1		C			M I
   234  	;;
   235  	ld8		u1 = [up], 8		C			M01
   236  	ld8		v1 = [vp], 8		C			M01
   237  	popcnt		c3 = x3			C limb 1		I0
   238  	xor		x2 = u2, v2		C			M I
   239  	;;
   240  	ld8		u2 = [up], 8		C			M01
   241  	ld8		v2 = [vp], 8		C			M01
   242  	popcnt		c0 = x0			C			I0
   243  	xor		x3 = u3, v3		C			M I
   244  	br		.LL10			C			B
   245  
   246  
        C n mod 4 == 3.  Limb 0 goes to slot 1, limb 1 to slot 2, limb 2 to
        C slot 3.
   247  .Lb11:	ld8		u2 = [up], 8		C limb 1		M01
   248  	ld8		v2 = [vp], 8		C limb 1		M01
   249  	shr.u		n = n, 2		C n = (n-5) >> 2	I0
   250  	xor		x1 = r10, r11		C limb 0		M I
   251  	;;
   252  	ld8		u3 = [up], 8		C limb 2		M01
   253  	ld8		v3 = [vp], 8		C limb 2		M01
   254  	xor		x2 = u2, v2		C			M I
   255    (p15)	br.cond.dptk	.grt3			C n > 4			B
   256  	;;
        C n == 3: three popcnts, two adds, return.  ar.lc has not been
        C written.
   257  	xor		x3 = u3, v3		C			M I
   258  	;;
   259  	popcnt		c1 = x1			C			I0
   260  	;;
   261  	popcnt		c2 = x2			C			I0
   262  	;;
   263  	popcnt		c3 = x3			C			I0
   264  	;;
   265  	add		s = c1, c2		C			M I
   266  	;;
   267  	add		s = s, c3		C			M I
   268  	br.ret.sptk.many b0			C			B
   269  
        C n >= 7: limbs 3-6 are loaded into slots 0-3.
   270  .grt3:	ld8		u0 = [up], 8		C			M01
   271  	ld8		v0 = [vp], 8		C			M01
   272  	mov.i		ar.lc = n		C loop count		I0
   273  	;;
   274  	ld8		u1 = [up], 8		C			M01
   275  	ld8		v1 = [vp], 8		C			M01
   276  	mov		s = 0			C			M I
   277  	;;
   278  	ld8		u2 = [up], 8		C			M01
   279  	ld8		v2 = [vp], 8		C			M01
   280  	xor		x3 = u3, v3		C limb 2		M I
   281  	;;
   282  	ld8		u3 = [up], 8		C			M01
   283  	ld8		v3 = [vp], 8		C			M01
   284  	popcnt		c1 = x1			C limb 0		I0
   285  	xor		x0 = u0, v0		C			M I
   286  	br.cloop.dptk	.grt7			C taken if n >= 11	B
        C n == 7: no more loads; join the drain at .Lcj7.
   287  	popcnt		c2 = x2			C limb 1		I0
   288  	xor		x1 = u1, v1		C			M I
   289  	;;
   290  	popcnt		c3 = x3			C limb 2		I0
   291  	xor		x2 = u2, v2		C			M I
   292  	br		.Lcj7			C			B
   293  
        C n >= 11: load two more limb pairs and join the loop at .LL11.
   294  .grt7:	ld8		u0 = [up], 8		C			M01
   295  	ld8		v0 = [vp], 8		C			M01
   296  	popcnt		c2 = x2			C limb 1		I0
   297  	xor		x1 = u1, v1		C			M I
   298  	;;
   299  	ld8		u1 = [up], 8		C			M01
   300  	ld8		v1 = [vp], 8		C			M01
   301  	popcnt		c3 = x3			C limb 2		I0
   302  	xor		x2 = u2, v2		C			M I
   303  	br		.LL11			C			B
   304  
   305  
        C Main loop, four limb pairs per br.cloop iteration.  Each group loads
        C one limb pair, popcnts an xor produced earlier, adds an earlier
        C count into s, and xors a pair loaded earlier.  The .LL00, .LL11 and
        C .LL10 labels are the mid-loop entry points used by .grt8, .grt7 and
        C .grt6.
   306  	ALIGN(32)
   307  .Loop:	ld8		u0 = [up], 8		C			M01
   308  	ld8		v0 = [vp], 8		C			M01
   309  	popcnt		c2 = x2			C			I0
   310  	add		s = s, c3		C			M I
   311  	xor		x1 = u1, v1		C			M I
   312  	nop.b		1			C			-
   313  	;;
   314  .LL00:	ld8		u1 = [up], 8		C			M01
   315  	ld8		v1 = [vp], 8		C			M01
   316  	popcnt		c3 = x3			C			I0
   317  	add		s = s, c0		C			M I
   318  	xor		x2 = u2, v2		C			M I
   319  	nop.b		1			C			-
   320  	;;
   321  .LL11:	ld8		u2 = [up], 8		C			M01
   322  	ld8		v2 = [vp], 8		C			M01
   323  	popcnt		c0 = x0			C			I0
   324  	add		s = s, c1		C			M I
   325  	xor		x3 = u3, v3		C			M I
   326  	nop.b		1			C			-
   327  	;;
   328  .LL10:	ld8		u3 = [up], 8		C			M01
   329  	ld8		v3 = [vp], 8		C			M01
   330  	popcnt		c1 = x1			C			I0
   331  	add		s = s, c2		C			M I
   332  	xor		x0 = u0, v0		C			M I
   333  	br.cloop.dptk	.Loop			C			B
   334  	;;
   335  
        C Wind-down: no more loads.  The groups below repeat the loop pattern
        C with the finished stages dropped, draining the xors, popcnts and
        C adds still pending.  The .LcjN labels are entered directly by the
        C paths above that never reach the loop.
   336  .Lend:	popcnt		c2 = x2			C			I0
   337  	add		s = s, c3		C			M I
   338  	xor		x1 = u1, v1		C			M I
   339  	;;
   340  .Lcj8:	popcnt		c3 = x3			C			I0
   341  	add		s = s, c0		C			M I
   342  	xor		x2 = u2, v2		C			M I
   343  	;;
   344  .Lcj7:	popcnt		c0 = x0			C			I0
   345  	add		s = s, c1		C			M I
   346  	xor		x3 = u3, v3		C			M I
   347  	;;
   348  .Lcj6:	popcnt		c1 = x1			C			I0
   349  	add		s = s, c2		C			M I
   350  	;;
   351  .Lcj5:	popcnt		c2 = x2			C			I0
   352  	add		s = s, c3		C			M I
   353  	;;
   354  .Lcj4:	popcnt		c3 = x3			C			I0
   355  	add		s = s, c0		C			M I
   356  	;;
   357  	add		s = s, c1		C			M I
   358  	;;
   359  	add		s = s, c2		C			M I
   360  	;;
   361  	add		s = s, c3		C final sum in r8	M I
   362  	mov.i		ar.lc = r2		C restore ar.lc		I0
   363  	br.ret.sptk.many b0			C			B
   364  EPILOGUE()
   365  ASM_END()