github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        //
        // mulWW computes the full double-word product of x and y with a single
        // unsigned MULQ: the high word is returned in z1, the low word in z0.
    13  TEXT ·mulWW(SB),NOSPLIT,$0
    14  	MOVQ x+0(FP), AX
    15  	MULQ y+8(FP)		// DX:AX = AX * y
    16  	MOVQ DX, z1+16(FP)	// z1 = high word of the product
    17  	MOVQ AX, z0+24(FP)	// z0 = low word of the product
    18  	RET
    19  
    20  
    21  
    22  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    23  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    24  // This is faster than using rotate instructions.
    25  
    26  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV stores x[i] + y[i] plus the running carry into z[i] for every
        // i in [0, len(z)) and returns the final carry (0 or 1) in c.
        // Only len(z) is read here; x and y are presumably at least that long
        // (not checked in this routine -- confirm against the callers).
        //
        // Register use:
        //	DI       n, words still to process (biased by -4 inside the unrolled loop)
        //	SI       i, index of the current word
        //	R8, R9   base addresses of x and y
        //	R10      base address of z
        //	CX       saved carry: 0, or -1 when a carry is pending
        //	R11-R14  scratch words
    27  TEXT ·addVV(SB),NOSPLIT,$0
    28  	MOVQ z_len+8(FP), DI
    29  	MOVQ x+24(FP), R8
    30  	MOVQ y+48(FP), R9
    31  	MOVQ z+0(FP), R10
    32  
    33  	MOVQ $0, CX		// c = 0
    34  	MOVQ $0, SI		// i = 0
    35  
    36  	// s/JL/JMP/ below to disable the unrolled loop
    37  	SUBQ $4, DI		// n -= 4
    38  	JL V1			// if n < 0 goto V1
    39  
    40  U1:	// n >= 0
    41  	// regular loop body unrolled 4x
    42  	ADDQ CX, CX		// restore CF
    43  	MOVQ 0(R8)(SI*8), R11
    44  	MOVQ 8(R8)(SI*8), R12
    45  	MOVQ 16(R8)(SI*8), R13
    46  	MOVQ 24(R8)(SI*8), R14
    47  	ADCQ 0(R9)(SI*8), R11
    48  	ADCQ 8(R9)(SI*8), R12
    49  	ADCQ 16(R9)(SI*8), R13
    50  	ADCQ 24(R9)(SI*8), R14
    51  	MOVQ R11, 0(R10)(SI*8)
    52  	MOVQ R12, 8(R10)(SI*8)
    53  	MOVQ R13, 16(R10)(SI*8)
    54  	MOVQ R14, 24(R10)(SI*8)
    55  	SBBQ CX, CX		// save CF
    56  
        	// The index and counter updates below overwrite CF, which is why
        	// the carry was parked in CX first.
    57  	ADDQ $4, SI		// i += 4
    58  	SUBQ $4, DI		// n -= 4
    59  	JGE U1			// if n >= 0 goto U1
    60  
    61  V1:	ADDQ $4, DI		// n += 4
    62  	JLE E1			// if n <= 0 goto E1
    63  
        	// Tail loop: at most 3 words remain when reached from the unrolled loop.
    64  L1:	// n > 0
    65  	ADDQ CX, CX		// restore CF
    66  	MOVQ 0(R8)(SI*8), R11
    67  	ADCQ 0(R9)(SI*8), R11
    68  	MOVQ R11, 0(R10)(SI*8)
    69  	SBBQ CX, CX		// save CF
    70  
    71  	ADDQ $1, SI		// i++
    72  	SUBQ $1, DI		// n--
    73  	JG L1			// if n > 0 goto L1
    74  
    75  E1:	NEGQ CX			// turn the saved carry (-1 or 0) into 1 or 0
    76  	MOVQ CX, c+72(FP)	// return c
    77  	RET
    78  
    79  
    80  // func subVV(z, x, y []Word) (c Word)
    81  // (same as addVV except for SBBQ instead of ADCQ and label names)
        //
        // subVV stores x[i] - y[i] minus the running borrow into z[i] for every
        // i in [0, len(z)) and returns the final borrow (0 or 1) in c.
        // Only len(z) is read here; x and y are presumably at least that long
        // (not checked in this routine -- confirm against the callers).
        //
        // Register use:
        //	DI       n, words still to process (biased by -4 inside the unrolled loop)
        //	SI       i, index of the current word
        //	R8, R9   base addresses of x and y
        //	R10      base address of z
        //	CX       saved borrow: 0, or -1 when a borrow is pending
        //	R11-R14  scratch words
    82  TEXT ·subVV(SB),NOSPLIT,$0
    83  	MOVQ z_len+8(FP), DI
    84  	MOVQ x+24(FP), R8
    85  	MOVQ y+48(FP), R9
    86  	MOVQ z+0(FP), R10
    87  
    88  	MOVQ $0, CX		// c = 0
    89  	MOVQ $0, SI		// i = 0
    90  
    91  	// s/JL/JMP/ below to disable the unrolled loop
    92  	SUBQ $4, DI		// n -= 4
    93  	JL V2			// if n < 0 goto V2
    94  
    95  U2:	// n >= 0
    96  	// regular loop body unrolled 4x
    97  	ADDQ CX, CX		// restore CF
    98  	MOVQ 0(R8)(SI*8), R11
    99  	MOVQ 8(R8)(SI*8), R12
   100  	MOVQ 16(R8)(SI*8), R13
   101  	MOVQ 24(R8)(SI*8), R14
   102  	SBBQ 0(R9)(SI*8), R11
   103  	SBBQ 8(R9)(SI*8), R12
   104  	SBBQ 16(R9)(SI*8), R13
   105  	SBBQ 24(R9)(SI*8), R14
   106  	MOVQ R11, 0(R10)(SI*8)
   107  	MOVQ R12, 8(R10)(SI*8)
   108  	MOVQ R13, 16(R10)(SI*8)
   109  	MOVQ R14, 24(R10)(SI*8)
   110  	SBBQ CX, CX		// save CF
   111  
        	// The index and counter updates below overwrite CF, which is why
        	// the borrow was parked in CX first.
   112  	ADDQ $4, SI		// i += 4
   113  	SUBQ $4, DI		// n -= 4
   114  	JGE U2			// if n >= 0 goto U2
   115  
   116  V2:	ADDQ $4, DI		// n += 4
   117  	JLE E2			// if n <= 0 goto E2
   118  
        	// Tail loop: at most 3 words remain when reached from the unrolled loop.
   119  L2:	// n > 0
   120  	ADDQ CX, CX		// restore CF
   121  	MOVQ 0(R8)(SI*8), R11
   122  	SBBQ 0(R9)(SI*8), R11
   123  	MOVQ R11, 0(R10)(SI*8)
   124  	SBBQ CX, CX		// save CF
   125  
   126  	ADDQ $1, SI		// i++
   127  	SUBQ $1, DI		// n--
   128  	JG L2			// if n > 0 goto L2
   129  
   130  E2:	NEGQ CX			// turn the saved borrow (-1 or 0) into 1 or 0
   131  	MOVQ CX, c+72(FP)	// return c
   132  	RET
   133  
   134  
   135  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the vector x: z[i] = x[i] + c for every
        // i in [0, len(z)), where c starts out as y and afterwards is the carry
        // (0 or 1) out of the previous word. The final c is returned.
        // When len(z) > 32 (signed compare) the work is handed to addVWlarge,
        // which is defined outside this section, via a tail jump.
        //
        // Register use:
        //	DI       n, words still to process (biased by -4 inside the unrolled loop)
        //	SI       i, index of the current word
        //	R8, R10  base addresses of x and z
        //	CX       c as a plain value (y at first, then 0 or 1)
        //	R11-R14  scratch words
   136  TEXT ·addVW(SB),NOSPLIT,$0
   137  	MOVQ z_len+8(FP), DI
   138  	CMPQ DI, $32
   139  	JG large
   140  	MOVQ x+24(FP), R8
   141  	MOVQ y+48(FP), CX	// c = y
   142  	MOVQ z+0(FP), R10
   143  
   144  	MOVQ $0, SI		// i = 0
   145  
   146  	// s/JL/JMP/ below to disable the unrolled loop
   147  	SUBQ $4, DI		// n -= 4
   148  	JL V3			// if n < 4 goto V3
   149  
   150  U3:	// n >= 0
   151  	// regular loop body unrolled 4x
   152  	MOVQ 0(R8)(SI*8), R11
   153  	MOVQ 8(R8)(SI*8), R12
   154  	MOVQ 16(R8)(SI*8), R13
   155  	MOVQ 24(R8)(SI*8), R14
   156  	ADDQ CX, R11		// first word absorbs c
   157  	ADCQ $0, R12		// remaining words only propagate the carry
   158  	ADCQ $0, R13
   159  	ADCQ $0, R14
   160  	SBBQ CX, CX		// save CF
   161  	NEGQ CX			// c = 0 or 1
   162  	MOVQ R11, 0(R10)(SI*8)
   163  	MOVQ R12, 8(R10)(SI*8)
   164  	MOVQ R13, 16(R10)(SI*8)
   165  	MOVQ R14, 24(R10)(SI*8)
   166  
   167  	ADDQ $4, SI		// i += 4
   168  	SUBQ $4, DI		// n -= 4
   169  	JGE U3			// if n >= 0 goto U3
   170  
   171  V3:	ADDQ $4, DI		// n += 4
   172  	JLE E3			// if n <= 0 goto E3
   173  
   174  L3:	// n > 0
   175  	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   176  	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   177  	SBBQ CX, CX		// save CF
   178  	NEGQ CX			// c = 0 or 1
   179  
   180  	ADDQ $1, SI		// i++
   181  	SUBQ $1, DI		// n--
   182  	JG L3			// if n > 0 goto L3
   183  
   184  E3:	MOVQ CX, c+56(FP)	// return c
   185  	RET
   186  large:
   187  	JMP ·addVWlarge(SB)
   188  
   189  
   190  // func subVW(z, x []Word, y Word) (c Word)
   191  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        //
        // subVW subtracts the single word y from the vector x: z[i] = x[i] - c for
        // every i in [0, len(z)), where c starts out as y and afterwards is the
        // borrow (0 or 1) out of the previous word. The final c is returned.
        // When len(z) > 32 (signed compare) the work is handed to subVWlarge,
        // which is defined outside this section, via a tail jump.
        //
        // Register use:
        //	DI       n, words still to process (biased by -4 inside the unrolled loop)
        //	SI       i, index of the current word
        //	R8, R10  base addresses of x and z
        //	CX       c as a plain value (y at first, then 0 or 1)
        //	R11-R14  scratch words
   192  TEXT ·subVW(SB),NOSPLIT,$0
   193  	MOVQ z_len+8(FP), DI
   194  	CMPQ DI, $32
   195  	JG large
   196  	MOVQ x+24(FP), R8
   197  	MOVQ y+48(FP), CX	// c = y
   198  	MOVQ z+0(FP), R10
   199  
   200  	MOVQ $0, SI		// i = 0
   201  
   202  	// s/JL/JMP/ below to disable the unrolled loop
   203  	SUBQ $4, DI		// n -= 4
   204  	JL V4			// if n < 4 goto V4
   205  
   206  U4:	// n >= 0
   207  	// regular loop body unrolled 4x
   208  	MOVQ 0(R8)(SI*8), R11
   209  	MOVQ 8(R8)(SI*8), R12
   210  	MOVQ 16(R8)(SI*8), R13
   211  	MOVQ 24(R8)(SI*8), R14
   212  	SUBQ CX, R11		// first word absorbs c
   213  	SBBQ $0, R12		// remaining words only propagate the borrow
   214  	SBBQ $0, R13
   215  	SBBQ $0, R14
   216  	SBBQ CX, CX		// save CF
   217  	NEGQ CX			// c = 0 or 1
   218  	MOVQ R11, 0(R10)(SI*8)
   219  	MOVQ R12, 8(R10)(SI*8)
   220  	MOVQ R13, 16(R10)(SI*8)
   221  	MOVQ R14, 24(R10)(SI*8)
   222  
   223  	ADDQ $4, SI		// i += 4
   224  	SUBQ $4, DI		// n -= 4
   225  	JGE U4			// if n >= 0 goto U4
   226  
   227  V4:	ADDQ $4, DI		// n += 4
   228  	JLE E4			// if n <= 0 goto E4
   229  
   230  L4:	// n > 0
   231  	MOVQ 0(R8)(SI*8), R11
   232  	SUBQ CX, R11		// R11 = x[i] - c
   233  	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
   234  	SBBQ CX, CX		// save CF
   235  	NEGQ CX			// c = 0 or 1
   236  
   237  	ADDQ $1, SI		// i++
   238  	SUBQ $1, DI		// n--
   239  	JG L4			// if n > 0 goto L4
   240  
   241  E4:	MOVQ CX, c+56(FP)	// return c
   242  	RET
   243  large:
   244  	JMP ·subVWlarge(SB)
   245  
   246  
   247  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU shifts the len(z)-word vector x left by s bits into z and returns
        // in c the bits that fall out of the top word x[n-1]. If len(z) <= 0 it
        // only stores c = 0. Words are processed from index n-1 down to 0.
        //
        // The three-operand SHLQ CX, AX, DX is a double-register shift: DX is
        // shifted left by CX and its vacated low bits are filled from the top
        // bits of AX. In the comments below ŝ stands for the complementary
        // shift count, i.e. 64 - s.
        // NOTE(review): s is presumably in [0, 64) -- confirm against the callers.
   248  TEXT ·shlVU(SB),NOSPLIT,$0
   249  	MOVQ z_len+8(FP), BX	// i = len(z)
   250  	SUBQ $1, BX		// i--
   251  	JL X8b			// i < 0	(n <= 0)
   252  
   253  	// n > 0
   254  	MOVQ z+0(FP), R10
   255  	MOVQ x+24(FP), R8
   256  	MOVQ s+48(FP), CX
   257  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   258  	MOVQ $0, DX
   259  	SHLQ CX, AX, DX		// w1>>ŝ
   260  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   261  
   262  	CMPQ BX, $0
   263  	JLE X8a			// i <= 0
   264  
   265  	// i > 0
   266  L8:	MOVQ AX, DX		// w = w1
   267  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   268  	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   269  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   270  	SUBQ $1, BX		// i--
   271  	JG L8			// i > 0
   272  
   273  	// i <= 0
   274  X8a:	SHLQ CX, AX		// w1<<s
   275  	MOVQ AX, (R10)		// z[0] = w1<<s
   276  	RET
   277  
   278  X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   279  	RET
   280  
   281  
   282  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU shifts the len(z)-word vector x right by s bits into z and returns
        // in c the bits that fall out of the bottom word x[0]. If len(z) <= 0 it
        // only stores c = 0. Words are processed from index 0 up to n-1.
        //
        // The three-operand SHRQ CX, AX, DX is a double-register shift: DX is
        // shifted right by CX and its vacated high bits are filled from the low
        // bits of AX. In the comments below ŝ stands for the complementary
        // shift count, i.e. 64 - s.
        // NOTE(review): s is presumably in [0, 64) -- confirm against the callers.
   283  TEXT ·shrVU(SB),NOSPLIT,$0
   284  	MOVQ z_len+8(FP), R11
   285  	SUBQ $1, R11		// n--
   286  	JL X9b			// n < 0	(n <= 0)
   287  
   288  	// n > 0
        	// From here on R11 holds n-1, the index of the last word.
   289  	MOVQ z+0(FP), R10
   290  	MOVQ x+24(FP), R8
   291  	MOVQ s+48(FP), CX
   292  	MOVQ (R8), AX		// w1 = x[0]
   293  	MOVQ $0, DX
   294  	SHRQ CX, AX, DX		// w1<<ŝ
   295  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   296  
   297  	MOVQ $0, BX		// i = 0
   298  	JMP E9
   299  
   300  	// i < n-1
   301  L9:	MOVQ AX, DX		// w = w1
   302  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   303  	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   304  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   305  	ADDQ $1, BX		// i++
   306  
   307  E9:	CMPQ BX, R11
   308  	JL L9			// i < n-1
   309  
   310  	// i >= n-1
   311  X9a:	SHRQ CX, AX		// w1>>s
   312  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   313  	RET
   314  
   315  X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   316  	RET
   317  
   318  
   319  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW multiplies the vector x by the word y and adds a carry word:
        // for every i in [0, len(z)) the double-word value x[i]*y + c is formed,
        // its low word is stored in z[i] and its high word becomes the next c.
        // c starts out as r; the final c is returned.
        //
        // Register use:
        //	R10, R8  base addresses of z and x
        //	R9       y
        //	CX       c, the running carry word
        //	R11      n = len(z)
        //	BX       i, index of the current word
        //	DX:AX    product / scratch
   320  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321  	MOVQ z+0(FP), R10
   322  	MOVQ x+24(FP), R8
   323  	MOVQ y+48(FP), R9
   324  	MOVQ r+56(FP), CX	// c = r
   325  	MOVQ z_len+8(FP), R11
   326  	MOVQ $0, BX		// i = 0
   327  
   328  	CMPQ R11, $4
   329  	JL E5			// fewer than 4 words: use the single-word loop only
   330  
   331  U5:	// i+4 <= n
   332  	// regular loop body unrolled 4x
   333  	MOVQ (0*8)(R8)(BX*8), AX
   334  	MULQ R9			// DX:AX = x[i] * y
   335  	ADDQ CX, AX		// add c to the low word ...
   336  	ADCQ $0, DX		// ... and carry into the high word
   337  	MOVQ AX, (0*8)(R10)(BX*8)
   338  	MOVQ DX, CX		// c = high word
   339  	MOVQ (1*8)(R8)(BX*8), AX
   340  	MULQ R9
   341  	ADDQ CX, AX
   342  	ADCQ $0, DX
   343  	MOVQ AX, (1*8)(R10)(BX*8)
   344  	MOVQ DX, CX
   345  	MOVQ (2*8)(R8)(BX*8), AX
   346  	MULQ R9
   347  	ADDQ CX, AX
   348  	ADCQ $0, DX
   349  	MOVQ AX, (2*8)(R10)(BX*8)
   350  	MOVQ DX, CX
   351  	MOVQ (3*8)(R8)(BX*8), AX
   352  	MULQ R9
   353  	ADDQ CX, AX
   354  	ADCQ $0, DX
   355  	MOVQ AX, (3*8)(R10)(BX*8)
   356  	MOVQ DX, CX
   357  	ADDQ $4, BX		// i += 4
   358  
   359  	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was copied to CX)
   360  	CMPQ DX, R11
   361  	JLE U5			// another full group of 4 fits
   362  	JMP E5
   363  
   364  L5:	MOVQ (R8)(BX*8), AX
   365  	MULQ R9
   366  	ADDQ CX, AX
   367  	ADCQ $0, DX
   368  	MOVQ AX, (R10)(BX*8)
   369  	MOVQ DX, CX
   370  	ADDQ $1, BX		// i++
   371  
   372  E5:	CMPQ BX, R11		// i < n
   373  	JL L5
   374  
   375  	MOVQ CX, c+64(FP)	// return c
   376  	RET
   377  
   378  
   379  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW adds x*y to z in place: for every i in [0, len(z)) the
        // double-word value z[i] + x[i]*y + c is formed, its low word is written
        // back to z[i] and its high word becomes the next c. c starts at 0 and
        // the final c is returned.
        //
        // Two implementations follow. If the byte ·support_adx (defined outside
        // this section) equals 1, the MULXQ/ADCXQ/ADOXQ code at label adx runs;
        // otherwise the MULQ-based loops directly below are used.
   380  TEXT ·addMulVVW(SB),NOSPLIT,$0
   381  	CMPB    ·support_adx(SB), $1
   382  	JEQ adx
        	// MULQ path. R10 = z, R8 = x, R9 = y, R11 = n, BX = i, CX = c.
   383  	MOVQ z+0(FP), R10
   384  	MOVQ x+24(FP), R8
   385  	MOVQ y+48(FP), R9
   386  	MOVQ z_len+8(FP), R11
   387  	MOVQ $0, BX		// i = 0
   388  	MOVQ $0, CX		// c = 0
   389  	MOVQ R11, R12
   390  	ANDQ $-2, R12		// R12 = n rounded down to a multiple of 2
   391  	CMPQ R11, $2
   392  	JAE A6			// at least 2 words: run the 2x unrolled loop
   393  	JMP E6
   394  
        	// 2x unrolled loop, runs while i < R12.
   395  A6:
   396  	MOVQ (R8)(BX*8), AX
   397  	MULQ R9			// DX:AX = x[i] * y
   398  	ADDQ (R10)(BX*8), AX	// low word += z[i]
   399  	ADCQ $0, DX
   400  	ADDQ CX, AX		// low word += c
   401  	ADCQ $0, DX
   402  	MOVQ DX, CX		// c = high word
   403  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   404  
   405  	MOVQ (8)(R8)(BX*8), AX
   406  	MULQ R9
   407  	ADDQ (8)(R10)(BX*8), AX
   408  	ADCQ $0, DX
   409  	ADDQ CX, AX
   410  	ADCQ $0, DX
   411  	MOVQ DX, CX
   412  	MOVQ AX, (8)(R10)(BX*8)
   413  
   414  	ADDQ $2, BX
   415  	CMPQ BX, R12
   416  	JL A6
   417  	JMP E6
   418  
        	// Single-word loop for whatever the unrolled loop left over.
   419  L6:	MOVQ (R8)(BX*8), AX
   420  	MULQ R9
   421  	ADDQ CX, AX
   422  	ADCQ $0, DX
   423  	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
   424  	ADCQ $0, DX
   425  	MOVQ DX, CX
   426  	ADDQ $1, BX		// i++
   427  
   428  E6:	CMPQ BX, R11		// i < n
   429  	JL L6
   430  
   431  	MOVQ CX, c+56(FP)	// return c
   432  	RET
   433  
        	// ADX path. R11 = n, R10 = z, R8 = x, DX = y (the implicit MULXQ
        	// operand), BX = i, CX = c.
   434  adx:
   435  	MOVQ z_len+8(FP), R11
   436  	MOVQ z+0(FP), R10
   437  	MOVQ x+24(FP), R8
   438  	MOVQ y+48(FP), DX
   439  	MOVQ $0, BX   // i = 0
   440  	MOVQ $0, CX   // carry
   441  	CMPQ R11, $8
   442  	JAE  adx_loop_header
   443  	CMPQ BX, R11
   444  	JL adx_short	// 1..7 words: single-word loop only
   445  	MOVQ CX, c+56(FP)	// n == 0: return c = 0
   446  	RET
   447  
   448  adx_loop_header:
   449  	MOVQ  R11, R13
   450  	ANDQ  $-8, R13	// R13 = n rounded down to a multiple of 8
        	// 8x unrolled loop. Each MULXQ yields a low word (second operand) and
        	// a high word (third operand); SI/DI and AX/CX are used alternately.
        	// The previous high word is added to the current low word through
        	// the CF chain (ADCXQ), and z[i] is added through the separate OF
        	// chain (ADOXQ), so the two additions do not disturb each other's
        	// carry. Here R8 and R10 are advanced as pointers; BX only counts.
   451  adx_loop:
   452  	XORQ  R9, R9  // unset flags
   453  	MULXQ (R8), SI, DI
   454  	ADCXQ CX,SI
   455  	ADOXQ (R10), SI
   456  	MOVQ  SI,(R10)
   457  
   458  	MULXQ 8(R8), AX, CX
   459  	ADCXQ DI, AX
   460  	ADOXQ 8(R10), AX
   461  	MOVQ  AX, 8(R10)
   462  
   463  	MULXQ 16(R8), SI, DI
   464  	ADCXQ CX, SI
   465  	ADOXQ 16(R10), SI
   466  	MOVQ  SI, 16(R10)
   467  
   468  	MULXQ 24(R8), AX, CX
   469  	ADCXQ DI, AX
   470  	ADOXQ 24(R10), AX
   471  	MOVQ  AX, 24(R10)
   472  
   473  	MULXQ 32(R8), SI, DI
   474  	ADCXQ CX, SI
   475  	ADOXQ 32(R10), SI
   476  	MOVQ  SI, 32(R10)
   477  
   478  	MULXQ 40(R8), AX, CX
   479  	ADCXQ DI, AX
   480  	ADOXQ 40(R10), AX
   481  	MOVQ  AX, 40(R10)
   482  
   483  	MULXQ 48(R8), SI, DI
   484  	ADCXQ CX, SI
   485  	ADOXQ 48(R10), SI
   486  	MOVQ  SI, 48(R10)
   487  
   488  	MULXQ 56(R8), AX, CX
   489  	ADCXQ DI, AX
   490  	ADOXQ 56(R10), AX
   491  	MOVQ  AX, 56(R10)
   492  
        	// R9 is still 0: fold the pending CF and OF carries into the last
        	// high word, which is the carry into the next group.
   493  	ADCXQ R9, CX
   494  	ADOXQ R9, CX
   495  
   496  	ADDQ $64, R8	// advance x by 8 words
   497  	ADDQ $64, R10	// advance z by 8 words
   498  	ADDQ $8, BX	// i += 8
   499  
   500  	CMPQ BX, R13
   501  	JL adx_loop
        	// Reload the original base addresses: adx_short indexes with BX.
   502  	MOVQ z+0(FP), R10
   503  	MOVQ x+24(FP), R8
   504  	CMPQ BX, R11
   505  	JL adx_short
   506  	MOVQ CX, c+56(FP)	// return c
   507  	RET
   508  
        	// Single-word loop for the remaining words (i < n).
   509  adx_short:
   510  	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
   511  	ADDQ CX, SI		// low word += c
   512  	ADCQ $0, DI
   513  	ADDQ SI, (R10)(BX*8)	// z[i] += low word
   514  	ADCQ $0, DI
   515  	MOVQ DI, CX		// c = high word
   516  	ADDQ $1, BX		// i++
   517  
   518  	CMPQ BX, R11
   519  	JL adx_short
   520  
   521  	MOVQ CX, c+56(FP)	// return c
   522  	RET
   523  
   524  
   525