github.com/aergoio/aergo@v1.3.1/libtool/src/gmp-6.1.2/mpn/x86_64/k8/mulmid_basecase.asm (about)

     1  dnl  AMD64 mpn_mulmid_basecase
     2  
     3  dnl  Contributed by David Harvey.
     4  
     5  dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
     6  
     7  dnl  This file is part of the GNU MP Library.
     8  dnl
     9  dnl  The GNU MP Library is free software; you can redistribute it and/or modify
    10  dnl  it under the terms of either:
    11  dnl
    12  dnl    * the GNU Lesser General Public License as published by the Free
    13  dnl      Software Foundation; either version 3 of the License, or (at your
    14  dnl      option) any later version.
    15  dnl
    16  dnl  or
    17  dnl
    18  dnl    * the GNU General Public License as published by the Free Software
    19  dnl      Foundation; either version 2 of the License, or (at your option) any
    20  dnl      later version.
    21  dnl
    22  dnl  or both in parallel, as here.
    23  dnl
    24  dnl  The GNU MP Library is distributed in the hope that it will be useful, but
    25  dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    26  dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
    27  dnl  for more details.
    28  dnl
    29  dnl  You should have received copies of the GNU General Public License and the
    30  dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
    31  dnl  see https://www.gnu.org/licenses/.
    32  
    33  
    34  include(`../config.m4')
    35  
    36  C	     cycles/limb
    37  C K8,K9:	 2.375  (2.5 when un - vn is "small")
    38  C K10:		 ?
    39  C P4:		 ?
    40  C P6-15:	 ?
    41  
    42  C INPUT PARAMETERS
        C The five arguments, under the register names this code reads them by.
    43  define(`rp',      `%rdi')
    44  define(`up',      `%rsi')
    45  define(`un_param',`%rdx')
    46  define(`vp_param',`%rcx')
    47  define(`vn',      `%r8')
    48  
        C v0, v1: the limbs loaded from (vp) and 8(vp) for the current outer pass.
    49  define(`v0', `%r12')
    50  define(`v1', `%r9')
    51  
        C w0..w3: accumulator registers for partial products and carries.
        C NOTE: w1 is %rcx, the same register as vp_param, so vp_param must be
        C copied into vp before w1 is first written.
    52  define(`w0', `%rbx')
    53  define(`w1', `%rcx')
    54  define(`w2', `%rbp')
    55  define(`w3', `%r10')
    56  
        C n:          inner loop index/counter.
        C outer_addr: code address re-entered by "jmp *outer_addr" for each
        C             further outer pass.
        C un:         row length (un_param - vn + 1); later negated in place.
        C vp:         working copy of vp_param.
    57  define(`n',  `%r11')
    58  define(`outer_addr', `%r14')
    59  define(`un',  `%r13')
    60  define(`vp',  `%r15')
    61  
        C vp_inner is %r10, the same register as w3.  It is referenced only by
        C the diagonal code, which uses w0..w2 but never w3.
    62  define(`vp_inner', `%r10')
    63  
    64  ABI_SUPPORT(DOS64)
    65  ABI_SUPPORT(STD64)
    66  
    67  ASM_START()
    68  	TEXT
    69  	ALIGN(16)
        C mpn_mulmid_basecase: function entry and strategy dispatch.
        C
        C Saves %rbx, %rbp and %r12-%r15, copies vp_param into vp (w1 is defined
        C as the same register as vp_param), computes the row length
        C un = un_param - vn + 1 and advances rp by un limbs.  Then:
        C   un <  4 (unsigned)  -> L(diagonal), up left as passed in
        C   un >= 4, vn even    -> L(mul_2), up advanced by un_param limbs
        C   un >= 4, vn odd     -> falls through to L(mul_1), up advanced likewise
    70  PROLOGUE(mpn_mulmid_basecase)
        C FUNC_ENTRY and IFDOS are macros from config.m4 (not visible here);
        C presumably they adapt the DOS64 argument passing -- TODO confirm.
        C The IFDOS line loads vn from the stack into %r8d, which also clears
        C the upper half of %r8.
    71  	FUNC_ENTRY(4)
    72  IFDOS(`	mov	56(%rsp), %r8d	')
        	C save the registers that L(ret) restores
    73  	push	%rbx
    74  	push	%rbp
    75  	push	%r12
    76  	push	%r13
    77  	push	%r14
    78  	push	%r15
    79  
        	C must happen before the first write to w1 (same register)
    80  	mov	vp_param, vp
    81  
    82  	C use un for row length (= un_param - vn + 1)
    83  	lea	1(un_param), un
    84  	sub	vn, un
    85  
        	C rp now points un limbs past the start of the output
    86  	lea	(rp,un,8), rp
    87  
        	C unsigned compare: short rows are handled by the diagonal code
    88  	cmp	$4, un		C TODO: needs tuning
    89  	jc	L(diagonal)
    90  
        	C row-wise paths index up with negative offsets from here
    91  	lea	(up,un_param,8), up
    92  
        	C even vn: start with two limbs of vp; odd vn: fall into mul_1
    93  	test	$1, vn
    94  	jz	L(mul_2)
    95  
    96  C ===========================================================
    97  C     mul_1 for vp[0] if vn is odd
        C
        C     Stores (does not accumulate) the row up[] * vp[0]: un+2 limbs
        C     ending at 8(rp), the topmost of them zero.  The loop is unrolled
        C     four ways and un mod 4 picks the entry point for the first,
        C     possibly partial, pass.  Each prologue also records in outer_addr
        C     the addmul prologue for the same un mod 4 case; that address is
        C     jumped to for the remaining limbs of vp.
    98  
    99  L(mul_1):
        	C keep the low bits of un in w0 for the mod 4 dispatch
   100  	mov	R32(un), R32(w0)
   101  
        	C up was advanced by un_param limbs at entry, so negative indices
        	C are used; start the first product up[-un] * vp[0] right away
   102  	neg	un
   103  	mov	(up,un,8), %rax
   104  	mov	(vp), v0
   105  	mul	v0
   106  
   107  	and	$-4, un		C round down to multiple of 4
   108  	mov	un, n
   109  
        	C dispatch on un mod 4: 0, then 1 (below 2), 2 (equal), else 3.
        	C The product stays in %rdx:%rax meanwhile.
   110  	and	$3, R32(w0)
   111  	jz	L(mul_1_prologue_0)
   112  	cmp	$2, R32(w0)
   113  	jc	L(mul_1_prologue_1)
   114  	jz	L(mul_1_prologue_2)
   115  
        	C Each prologue moves the first product into the register pair
        	C that its entry point expects and zeroes what that entry point
        	C reads before writing.
   116  L(mul_1_prologue_3):
   117  	mov	%rax, w3
   118  	mov	%rdx, w0
   119  	lea	L(addmul_prologue_3)(%rip), outer_addr
   120  	jmp	L(mul_1_entry_3)
   121  
   122  	ALIGN(16)
   123  L(mul_1_prologue_0):
        	C w0 is zero here because it holds un mod 4
   124  	mov	%rax, w2
   125  	mov	%rdx, w3		C note already w0 == 0
   126  	lea	L(addmul_prologue_0)(%rip), outer_addr
   127  	jmp	L(mul_1_entry_0)
   128  
   129  	ALIGN(16)
   130  L(mul_1_prologue_1):
        	C entry_1 addresses rp with offset -8 relative to n, so n is
        	C stepped forward first; %rax is preloaded as L(mul_1_top) does
   131  	add	$4, n
   132  	mov	%rax, w1
   133  	mov	%rdx, w2
   134  	mov	$0, R32(w3)
   135  	mov	(up,n,8), %rax
   136  	lea	L(addmul_prologue_1)(%rip), outer_addr
   137  	jmp	L(mul_1_entry_1)
   138  
   139  	ALIGN(16)
   140  L(mul_1_prologue_2):
        	C %rax is preloaded the same way the loop does before entry_2
   141  	mov	%rax, w0
   142  	mov	%rdx, w1
   143  	mov	24(up,n,8), %rax
   144  	mov	$0, R32(w2)
   145  	mov	$0, R32(w3)
   146  	lea	L(addmul_prologue_2)(%rip), outer_addr
   147  	jmp	L(mul_1_entry_2)
   148  
   149  
   150  	C this loop is 10 c/loop = 2.5 c/l on K8
        	C Four multiplications by v0 and four limbs stored per pass.
        	C The stores are interleaved with the add/adc pairs; mov does not
        	C modify the flags, so the carry chains are not disturbed.
   151  
   152  	ALIGN(16)
   153  L(mul_1_top):
   154  	mov	w0, -16(rp,n,8)
   155  	add	%rax, w1
   156  	mov	(up,n,8), %rax
   157  	adc	%rdx, w2
   158  L(mul_1_entry_1):
   159  	mov	$0, R32(w0)
   160  	mul	v0
   161  	mov	w1, -8(rp,n,8)
   162  	add	%rax, w2
   163  	adc	%rdx, w3
   164  L(mul_1_entry_0):
   165  	mov	8(up,n,8), %rax
   166  	mul	v0
   167  	mov	w2, (rp,n,8)
   168  	add	%rax, w3
   169  	adc	%rdx, w0
   170  L(mul_1_entry_3):
   171  	mov	16(up,n,8), %rax
   172  	mul	v0
   173  	mov	w3, 8(rp,n,8)
   174  	mov	$0, R32(w2)		C zero
   175  	mov	w2, w3			C zero
   176  	add	%rax, w0
   177  	mov	24(up,n,8), %rax
   178  	mov	w2, w1			C zero
   179  	adc	%rdx, w1
   180  L(mul_1_entry_2):
        	C loop while n is still negative
   181  	mul	v0
   182  	add	$4, n
   183  	js	L(mul_1_top)
   184  
        	C wind down: store the last limbs.  w2 is zero when it is stored
        	C to 8(rp); the carry from the add survives the intervening movs
        	C and is consumed by the adc.
   185  	mov	w0, -16(rp)
   186  	add	%rax, w1
   187  	mov	w1, -8(rp)
   188  	mov	w2, 8(rp)		C zero last limb of output
   189  	adc	%rdx, w2
   190  	mov	w2, (rp)
   191  
        	C one limb of vp consumed; finished if it was the only one
   192  	dec	vn
   193  	jz	L(ret)
   194  
        	C next outer pass: up back one limb, vp forward one limb, n reset
        	C to the rounded negative un, v0/v1 loaded for the addmul code
   195  	lea	-8(up), up
   196  	lea	8(vp), vp
   197  
   198  	mov	un, n
   199  	mov	(vp), v0
   200  	mov	8(vp), v1
   201  
   202  	jmp	*outer_addr
   203  
   204  C ===========================================================
   205  C     mul_2 for vp[0], vp[1] if vn is even
        C
        C     Stores (does not accumulate) the first row using v0 = vp[0] and
        C     v1 = vp[1] together: each stored limb combines a product by v0
        C     with the product of the preceding up limb by v1, plus carries.
        C     The loop is unrolled four ways; un mod 4 picks the entry point
        C     for the first pass and the addmul prologue kept in outer_addr.
   206  
   207  	ALIGN(16)
   208  L(mul_2):
        	C keep the low bits of un in w0 for the mod 4 dispatch
   209  	mov	R32(un), R32(w0)
   210  
        	C up was advanced by un_param limbs at entry, so negative indices
        	C are used; start the first product up[-un-1] * vp[1] right away
   211  	neg	un
   212  	mov	-8(up,un,8), %rax
   213  	mov	(vp), v0
   214  	mov	8(vp), v1
   215  	mul	v1
   216  
   217  	and	$-4, un		C round down to multiple of 4
   218  	mov	un, n
   219  
        	C dispatch on un mod 4: 0, then 1 (below 2), 2 (equal), else 3.
        	C The product stays in %rdx:%rax meanwhile.
   220  	and	$3, R32(w0)
   221  	jz	L(mul_2_prologue_0)
   222  	cmp	$2, R32(w0)
   223  	jc	L(mul_2_prologue_1)
   224  	jz	L(mul_2_prologue_2)
   225  
        	C Each prologue moves the first product into the register pair
        	C that its entry point expects and zeroes what that entry point
        	C reads before writing.
   226  L(mul_2_prologue_3):
   227  	mov	%rax, w1
   228  	mov	%rdx, w2
   229  	lea	L(addmul_prologue_3)(%rip), outer_addr
   230  	jmp	L(mul_2_entry_3)
   231  
   232  	ALIGN(16)
   233  L(mul_2_prologue_0):
   234  	mov	%rax, w0
   235  	mov	%rdx, w1
   236  	lea	L(addmul_prologue_0)(%rip), outer_addr
   237  	jmp	L(mul_2_entry_0)
   238  
   239  	ALIGN(16)
   240  L(mul_2_prologue_1):
   241  	mov	%rax, w3
   242  	mov	%rdx, w0
   243  	mov	$0, R32(w1)
   244  	lea	L(addmul_prologue_1)(%rip), outer_addr
   245  	jmp	L(mul_2_entry_1)
   246  
   247  	ALIGN(16)
   248  L(mul_2_prologue_2):
        	C %rax is preloaded the same way the loop does before entry_2
   249  	mov	%rax, w2
   250  	mov	%rdx, w3
   251  	mov	$0, R32(w0)
   252  	mov	16(up,n,8), %rax
   253  	lea	L(addmul_prologue_2)(%rip), outer_addr
   254  	jmp	L(mul_2_entry_2)
   255  
   256  
   257  	C this loop is 18 c/loop = 2.25 c/l on K8
        	C Eight multiplications (four by v0, four by v1) and four limbs
        	C stored per pass.  w0..w3 rotate through the roles of low limb,
        	C high limb and overflow limb; the register about to become the
        	C overflow limb is zeroed with mov, which leaves the flags alone.
   258  
   259  	ALIGN(16)
   260  L(mul_2_top):
   261  	mov     -8(up,n,8), %rax
   262  	mul     v1
   263  	add     %rax, w0
   264  	adc     %rdx, w1
   265  L(mul_2_entry_0):
   266  	mov     $0, R32(w2)
   267  	mov     (up,n,8), %rax
   268  	mul     v0
   269  	add     %rax, w0
   270  	mov     (up,n,8), %rax
   271  	adc     %rdx, w1
   272  	adc     $0, R32(w2)
   273  	mul     v1
   274  	add     %rax, w1
   275  	mov     w0, (rp,n,8)
   276  	adc     %rdx, w2
   277  L(mul_2_entry_3):
   278  	mov     8(up,n,8), %rax
   279  	mul     v0
   280  	mov     $0, R32(w3)
   281  	add     %rax, w1
   282  	adc     %rdx, w2
   283  	mov     $0, R32(w0)
   284  	adc     $0, R32(w3)
   285  	mov     8(up,n,8), %rax
   286  	mov     w1, 8(rp,n,8)
   287  	mul     v1
   288  	add     %rax, w2
   289  	mov     16(up,n,8), %rax
   290  	adc     %rdx, w3
   291  L(mul_2_entry_2):
   292  	mov     $0, R32(w1)
   293  	mul     v0
   294  	add     %rax, w2
   295  	mov     16(up,n,8), %rax
   296  	adc     %rdx, w3
   297  	adc     $0, R32(w0)
   298  	mul     v1
   299  	add     %rax, w3
   300  	mov     w2, 16(rp,n,8)
   301  	adc     %rdx, w0
   302  L(mul_2_entry_1):
   303  	mov     24(up,n,8), %rax
   304  	mul     v0
   305  	add     %rax, w3
   306  	adc     %rdx, w0
   307  	adc     $0, R32(w1)
        	C the store below does not change the zero flag set by this add
   308  	add     $4, n
   309  	mov     w3, -8(rp,n,8)
   310  	jnz     L(mul_2_top)
   311  
        	C wind down: the two limbs still held in registers
   312  	mov	w0, (rp)
   313  	mov	w1, 8(rp)
   314  
        	C two limbs of vp consumed; finished if none remain
   315  	sub	$2, vn
   316  	jz	L(ret)
   317  
        	C next outer pass: vp forward two limbs, up back two limbs, n
        	C reset to the rounded negative un, next v0/v1 loaded
   318  	lea	16(vp), vp
   319  	lea	-16(up), up
   320  
   321  	mov	un, n
   322  	mov	(vp), v0
   323  	mov	8(vp), v1
   324  
   325  	jmp	*outer_addr
   326  
   327  C ===========================================================
   328  C     addmul_2 for remaining vp's
        C
        C     Adds the row for the next two limbs of vp (v0, v1) into the limbs
        C     already at rp.  Reached only through "jmp *outer_addr", which
        C     lands on the prologue matching un mod 4.  On arrival n holds the
        C     rounded negative un and v0/v1 are loaded.  Nothing in this section
        C     writes outer_addr, so every further pass re-enters through the
        C     same prologue until vn reaches zero.
   329  
        	C Each prologue forms the first product by v1 for its case, places
        	C it in the register pair its entry point expects, and zeroes the
        	C accumulators that are read before being written.
   330  	ALIGN(16)
   331  L(addmul_prologue_0):
   332  	mov	-8(up,n,8), %rax
   333  	mul	v1
   334  	mov	%rax, w1
   335  	mov	%rdx, w2
   336  	mov	$0, R32(w3)
   337  	jmp	L(addmul_entry_0)
   338  
   339  	ALIGN(16)
   340  L(addmul_prologue_1):
        	C %rax is preloaded the same way the loop does before entry_1
   341  	mov	16(up,n,8), %rax
   342  	mul	v1
   343  	mov	%rax, w0
   344  	mov	%rdx, w1
   345  	mov	$0, R32(w2)
   346  	mov	24(up,n,8), %rax
   347  	jmp	L(addmul_entry_1)
   348  
   349  	ALIGN(16)
   350  L(addmul_prologue_2):
   351  	mov	8(up,n,8), %rax
   352  	mul	v1
   353  	mov	%rax, w3
   354  	mov	%rdx, w0
   355  	mov	$0, R32(w1)
   356  	jmp	L(addmul_entry_2)
   357  
   358  	ALIGN(16)
   359  L(addmul_prologue_3):
   360  	mov	(up,n,8), %rax
   361  	mul	v1
   362  	mov	%rax, w2
   363  	mov	%rdx, w3
   364  	mov	$0, R32(w0)
   365  	mov	$0, R32(w1)
   366  	jmp	L(addmul_entry_3)
   367  
   368  	C this loop is 19 c/loop = 2.375 c/l on K8
        	C Eight multiplications per pass; four limbs of rp are updated
        	C in place with "add wN, mem".  The carry out of each memory add
        	C is picked up by the adc that follows it, so only flag-preserving
        	C instructions (mov, nop) are placed in between.
   369  
   370  	ALIGN(16)
   371  L(addmul_top):
   372  	mov	$0, R32(w3)
   373  	add	%rax, w0
   374  	mov	-8(up,n,8), %rax
   375  	adc	%rdx, w1
   376  	adc	$0, R32(w2)
   377  	mul	v1
   378  	add	w0, -8(rp,n,8)
   379  	adc	%rax, w1
   380  	adc	%rdx, w2
   381  L(addmul_entry_0):
   382  	mov	(up,n,8), %rax
   383  	mul	v0
   384  	add	%rax, w1
   385  	mov	(up,n,8), %rax
   386  	adc	%rdx, w2
   387  	adc	$0, R32(w3)
   388  	mul	v1
   389  	add	w1, (rp,n,8)
   390  	mov	$0, R32(w1)
   391  	adc	%rax, w2
   392  	mov	$0, R32(w0)
   393  	adc	%rdx, w3
   394  L(addmul_entry_3):
   395  	mov	8(up,n,8), %rax
   396  	mul	v0
   397  	add	%rax, w2
   398  	mov	8(up,n,8), %rax
   399  	adc	%rdx, w3
   400  	adc	$0, R32(w0)
   401  	mul	v1
   402  	add	w2, 8(rp,n,8)
   403  	adc	%rax, w3
   404  	adc	%rdx, w0
   405  L(addmul_entry_2):
   406  	mov	16(up,n,8), %rax
   407  	mul	v0
   408  	add	%rax, w3
   409  	mov	16(up,n,8), %rax
   410  	adc	%rdx, w0
   411  	adc	$0, R32(w1)
   412  	mul	v1
   413  	add	w3, 16(rp,n,8)
   414  	nop			C don't ask...
   415  	adc	%rax, w0
   416  	mov	$0, R32(w2)
   417  	mov	24(up,n,8), %rax
   418  	adc	%rdx, w1
   419  L(addmul_entry_1):
        	C loop until n reaches zero
   420  	mul	v0
   421  	add	$4, n
   422  	jnz	L(addmul_top)
   423  
        	C wind down: fold in the last product, then add the three limbs
        	C still in registers to -8(rp), (rp) and 8(rp) as one carry chain
   424  	add	%rax, w0
   425  	adc	%rdx, w1
   426  	adc	$0, R32(w2)
   427  
   428  	add	w0, -8(rp)
   429  	adc	w1, (rp)
   430  	adc	w2, 8(rp)
   431  
        	C two limbs of vp consumed; finished if none remain
   432  	sub	$2, vn
   433  	jz	L(ret)
   434  
        	C next outer pass: vp forward two limbs, up back two limbs, n
        	C reset to the rounded negative un, next v0/v1 loaded
   435  	lea	16(vp), vp
   436  	lea	-16(up), up
   437  
   438  	mov	un, n
   439  	mov	(vp), v0
   440  	mov	8(vp), v1
   441  
   442  	jmp	*outer_addr
   443  
   444  C ===========================================================
   445  C     accumulate along diagonals if un - vn is small
        C
        C     Taken when the row length un is below 4.  Each outer pass sums
        C     the products of up limbs with vp limbs that feed one output limb
        C     into the three-limb accumulator w2:w1:w0, stores w0, and shifts
        C     the accumulator down by one limb.  un, negated, counts the outer
        C     passes up to zero; the last two accumulator limbs are stored at
        C     L(diag_end).
        C
        C     The inner loop is unrolled four ways.  vn is rounded up to a
        C     multiple of 4 and vp is offset so that entering the loop part-way
        C     lines the first product up with the first limb of vp.
        C     outer_addr holds the re-entry address used by every later pass.
   446  
   447  	ALIGN(16)
   448  L(diagonal):
        	C clear the accumulator
   449  	xor	R32(w0), R32(w0)
   450  	xor	R32(w1), R32(w1)
   451  	xor	R32(w2), R32(w2)
   452  
        	C un becomes a negative index relative to the advanced rp
   453  	neg	un
   454  
        	C dispatch on vn mod 4: 0, then 1 (below 2), 2 (equal), else 3
   455  	mov	R32(vn), %eax
   456  	and	$3, %eax
   457  	jz	L(diag_prologue_0)
   458  	cmp	$2, %eax
   459  	jc	L(diag_prologue_1)
   460  	jz	L(diag_prologue_2)
   461  
   462  L(diag_prologue_3):
        	C entry_3 begins with its own load of %rax, so it can serve
        	C directly as the re-entry address
   463  	lea	-8(vp), vp
   464  	mov	vp, vp_inner
   465  	add	$1, vn
   466  	mov	vn, n
   467  	lea	L(diag_entry_3)(%rip), outer_addr
   468  	jmp	L(diag_entry_3)
   469  
        	C In the three prologues below, "lea 0(%rip)" yields the address
        	C of the instruction that follows it.  Re-entering there repeats
        	C the preload of %rax and the jump into the loop.
   470  L(diag_prologue_0):
   471  	mov	vp, vp_inner
   472  	mov	vn, n
   473  	lea	0(%rip), outer_addr
   474  	mov     -8(up,n,8), %rax
   475  	jmp	L(diag_entry_0)
   476  
   477  L(diag_prologue_1):
        	C entry_1 is reached after the loop has stepped vp_inner forward,
        	C so the first limb is addressed as -8(vp_inner)
   478  	lea	8(vp), vp
   479  	mov	vp, vp_inner
   480  	add	$3, vn
   481  	mov	vn, n
   482  	lea	0(%rip), outer_addr
   483  	mov     -8(vp_inner), %rax
   484  	jmp	L(diag_entry_1)
   485  
   486  L(diag_prologue_2):
   487  	lea	-16(vp), vp
   488  	mov	vp, vp_inner
   489  	add	$2, vn
   490  	mov	vn, n
   491  	lea	0(%rip), outer_addr
   492  	mov	16(vp_inner), %rax
   493  	jmp	L(diag_entry_2)
   494  
   495  
   496  	C this loop is 10 c/loop = 2.5 c/l on K8
        	C Four products per pass: up is read at descending offsets from
        	C n while vp_inner is read at ascending offsets.  One operand
        	C of each product is loaded into %rax and the other is the memory
        	C operand of mulq.  n counts down by 4 to zero.
   497  
   498  	ALIGN(16)
   499  L(diag_top):
   500  	add     %rax, w0
   501  	adc     %rdx, w1
   502  	mov     -8(up,n,8), %rax
   503  	adc     $0, w2
   504  L(diag_entry_0):
   505  	mulq    (vp_inner)
   506  	add     %rax, w0
   507  	adc     %rdx, w1
   508  	adc     $0, w2
   509  L(diag_entry_3):
   510  	mov     -16(up,n,8), %rax
   511  	mulq    8(vp_inner)
   512  	add     %rax, w0
   513  	mov     16(vp_inner), %rax
   514  	adc     %rdx, w1
   515  	adc     $0, w2
   516  L(diag_entry_2):
   517  	mulq    -24(up,n,8)
   518  	add     %rax, w0
   519  	mov     24(vp_inner), %rax
   520  	adc     %rdx, w1
        	C lea steps vp_inner without touching the carry the adc needs
   521  	lea     32(vp_inner), vp_inner
   522  	adc     $0, w2
   523  L(diag_entry_1):
   524  	mulq    -32(up,n,8)
   525  	sub     $4, n
   526  	jnz	L(diag_top)
   527  
        	C fold in the last product of this diagonal
   528  	add	%rax, w0
   529  	adc	%rdx, w1
   530  	adc	$0, w2
   531  
        	C low limb of the accumulator is the finished output limb
   532  	mov	w0, (rp,un,8)
   533  
   534  	inc	un
   535  	jz	L(diag_end)
   536  
        	C set up the next diagonal: reset the counters, slide the window
        	C over up by one limb, shift the accumulator down one limb
   537  	mov	vn, n
   538  	mov	vp, vp_inner
   539  
   540  	lea	8(up), up
   541  	mov	w1, w0
   542  	mov	w2, w1
   543  	xor	R32(w2), R32(w2)
   544  
   545  	jmp	*outer_addr
   546  
   547  L(diag_end):
        	C the two remaining accumulator limbs complete the output
   548  	mov	w1, (rp)
   549  	mov	w2, 8(rp)
   550  
        C Common exit for every path: restore the saved registers in the
        C reverse order of the pushes at entry, then return.  FUNC_EXIT is a
        C macro from config.m4 (not visible here); presumably the DOS64
        C counterpart of FUNC_ENTRY -- TODO confirm.
   551  L(ret):	pop	%r15
   552  	pop	%r14
   553  	pop	%r13
   554  	pop	%r12
   555  	pop	%rbp
   556  	pop	%rbx
   557  	FUNC_EXIT()
   558  	ret
   559  EPILOGUE()