github.com/c12o16h1/go/src@v0.0.0-20200114212001-5a151c0f00ed/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        // mulWW returns the full two-word product of x and y:
        // z1 receives the high word and z0 the low word.
    13  TEXT ·mulWW(SB),NOSPLIT,$0
    14  	MOVQ x+0(FP), AX
    15  	MULQ y+8(FP)		// unsigned multiply: DX:AX = AX * y
    16  	MOVQ DX, z1+16(FP)	// high word of the product
    17  	MOVQ AX, z0+24(FP)	// low word of the product
    18  	RET
    19  
    20  
    21  // func divWW(x1, x0, y Word) (q, r Word)
        // divWW divides the two-word value x1:x0 by y and returns the
        // quotient in q and the remainder in r.
        // No checks are performed here: DIVQ raises a divide error if y == 0
        // or if the quotient does not fit in one word (i.e. when x1 >= y).
    22  TEXT ·divWW(SB),NOSPLIT,$0
    23  	MOVQ x1+0(FP), DX	// high word of the dividend
    24  	MOVQ x0+8(FP), AX	// low word of the dividend
    25  	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26  	MOVQ AX, q+24(FP)
    27  	MOVQ DX, r+32(FP)
    28  	RET
    29  
    30  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32  // This is faster than using rotate instructions.
    33  
    34  // func addVV(z, x, y []Word) (c Word)
        // addVV computes z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1 and
        // returns the carry out of the last word (0 or 1) in c.
        // Only len(z) is read; x and y are indexed over the same range and
        // their lengths are not checked here.
        // Registers: DI = n (words left), SI = i, R8 = &x[0], R9 = &y[0],
        // R10 = &z[0], CX = saved carry (0 or -1, negated to 0 or 1 on return).
        // The carry flag must be parked in CX across the index/counter updates
        // because ADDQ/SUBQ on SI and DI overwrite CF.
    35  TEXT ·addVV(SB),NOSPLIT,$0
    36  	MOVQ z_len+8(FP), DI
    37  	MOVQ x+24(FP), R8
    38  	MOVQ y+48(FP), R9
    39  	MOVQ z+0(FP), R10
    40  
    41  	MOVQ $0, CX		// c = 0
    42  	MOVQ $0, SI		// i = 0
    43  
    44  	// s/JL/JMP/ below to disable the unrolled loop
    45  	SUBQ $4, DI		// n -= 4
    46  	JL V1			// if n < 0 goto V1
    47  
    48  U1:	// n >= 0
    49  	// regular loop body unrolled 4x
    50  	ADDQ CX, CX		// restore CF
    51  	MOVQ 0(R8)(SI*8), R11
    52  	MOVQ 8(R8)(SI*8), R12
    53  	MOVQ 16(R8)(SI*8), R13
    54  	MOVQ 24(R8)(SI*8), R14
    55  	ADCQ 0(R9)(SI*8), R11	// carry ripples through the four ADCQs
    56  	ADCQ 8(R9)(SI*8), R12
    57  	ADCQ 16(R9)(SI*8), R13
    58  	ADCQ 24(R9)(SI*8), R14
    59  	MOVQ R11, 0(R10)(SI*8)
    60  	MOVQ R12, 8(R10)(SI*8)
    61  	MOVQ R13, 16(R10)(SI*8)
    62  	MOVQ R14, 24(R10)(SI*8)
    63  	SBBQ CX, CX		// save CF
    64  
    65  	ADDQ $4, SI		// i += 4
    66  	SUBQ $4, DI		// n -= 4
    67  	JGE U1			// if n >= 0 goto U1
    68  
    69  V1:	ADDQ $4, DI		// n += 4
    70  	JLE E1			// if n <= 0 goto E1
    71  
    72  L1:	// n > 0
        	// remaining 1..3 words, one per iteration
    73  	ADDQ CX, CX		// restore CF
    74  	MOVQ 0(R8)(SI*8), R11
    75  	ADCQ 0(R9)(SI*8), R11
    76  	MOVQ R11, 0(R10)(SI*8)
    77  	SBBQ CX, CX		// save CF
    78  
    79  	ADDQ $1, SI		// i++
    80  	SUBQ $1, DI		// n--
    81  	JG L1			// if n > 0 goto L1
    82  
    83  E1:	NEGQ CX			// saved carry 0/-1 -> 0/1
    84  	MOVQ CX, c+72(FP)	// return c
    85  	RET
    86  
    87  
    88  // func subVV(z, x, y []Word) (c Word)
    89  // (same as addVV except for SBBQ instead of ADCQ and label names)
        // subVV computes z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1 and
        // returns the borrow out of the last word (0 or 1) in c.
        // Only len(z) is read; x and y are indexed over the same range and
        // their lengths are not checked here.
        // Registers: DI = n (words left), SI = i, R8 = &x[0], R9 = &y[0],
        // R10 = &z[0], CX = saved borrow (0 or -1, negated to 0 or 1 on return).
    90  TEXT ·subVV(SB),NOSPLIT,$0
    91  	MOVQ z_len+8(FP), DI
    92  	MOVQ x+24(FP), R8
    93  	MOVQ y+48(FP), R9
    94  	MOVQ z+0(FP), R10
    95  
    96  	MOVQ $0, CX		// c = 0
    97  	MOVQ $0, SI		// i = 0
    98  
    99  	// s/JL/JMP/ below to disable the unrolled loop
   100  	SUBQ $4, DI		// n -= 4
   101  	JL V2			// if n < 0 goto V2
   102  
   103  U2:	// n >= 0
   104  	// regular loop body unrolled 4x
   105  	ADDQ CX, CX		// restore CF
   106  	MOVQ 0(R8)(SI*8), R11
   107  	MOVQ 8(R8)(SI*8), R12
   108  	MOVQ 16(R8)(SI*8), R13
   109  	MOVQ 24(R8)(SI*8), R14
   110  	SBBQ 0(R9)(SI*8), R11	// borrow ripples through the four SBBQs
   111  	SBBQ 8(R9)(SI*8), R12
   112  	SBBQ 16(R9)(SI*8), R13
   113  	SBBQ 24(R9)(SI*8), R14
   114  	MOVQ R11, 0(R10)(SI*8)
   115  	MOVQ R12, 8(R10)(SI*8)
   116  	MOVQ R13, 16(R10)(SI*8)
   117  	MOVQ R14, 24(R10)(SI*8)
   118  	SBBQ CX, CX		// save CF
   119  
   120  	ADDQ $4, SI		// i += 4
   121  	SUBQ $4, DI		// n -= 4
   122  	JGE U2			// if n >= 0 goto U2
   123  
   124  V2:	ADDQ $4, DI		// n += 4
   125  	JLE E2			// if n <= 0 goto E2
   126  
   127  L2:	// n > 0
        	// remaining 1..3 words, one per iteration
   128  	ADDQ CX, CX		// restore CF
   129  	MOVQ 0(R8)(SI*8), R11
   130  	SBBQ 0(R9)(SI*8), R11
   131  	MOVQ R11, 0(R10)(SI*8)
   132  	SBBQ CX, CX		// save CF
   133  
   134  	ADDQ $1, SI		// i++
   135  	SUBQ $1, DI		// n--
   136  	JG L2			// if n > 0 goto L2
   137  
   138  E2:	NEGQ CX			// saved borrow 0/-1 -> 0/1
   139  	MOVQ CX, c+72(FP)	// return c
   140  	RET
   141  
   142  
   143  // func addVW(z, x []Word, y Word) (c Word)
        // addVW adds the single word y to the vector x, writing the result to z
        // and propagating the carry through len(z) words; the final carry
        // (0 or 1, or y itself when len(z) == 0) is returned in c.
        // For len(z) > 32 control is transferred to ·addVWlarge, which is
        // defined outside this listing (presumably with the same signature,
        // since it is entered with this function's argument frame).
        // Registers: DI = n (words left), SI = i, R8 = &x[0], R10 = &z[0],
        // CX = value carried into the next word (y at first, then 0 or 1).
   144  TEXT ·addVW(SB),NOSPLIT,$0
   145  	MOVQ z_len+8(FP), DI
   146  	CMPQ DI, $32
   147  	JG large		// long vectors are handled by addVWlarge
   148  	MOVQ x+24(FP), R8
   149  	MOVQ y+48(FP), CX	// c = y
   150  	MOVQ z+0(FP), R10
   151  
   152  	MOVQ $0, SI		// i = 0
   153  
   154  	// s/JL/JMP/ below to disable the unrolled loop
   155  	SUBQ $4, DI		// n -= 4
   156  	JL V3			// if n < 0 goto V3
   157  
   158  U3:	// n >= 0
   159  	// regular loop body unrolled 4x
   160  	MOVQ 0(R8)(SI*8), R11
   161  	MOVQ 8(R8)(SI*8), R12
   162  	MOVQ 16(R8)(SI*8), R13
   163  	MOVQ 24(R8)(SI*8), R14
   164  	ADDQ CX, R11		// add incoming carry/y to the lowest word
   165  	ADCQ $0, R12		// then ripple the carry upward
   166  	ADCQ $0, R13
   167  	ADCQ $0, R14
   168  	SBBQ CX, CX		// save CF
   169  	NEGQ CX			// CX = 0 or 1
   170  	MOVQ R11, 0(R10)(SI*8)
   171  	MOVQ R12, 8(R10)(SI*8)
   172  	MOVQ R13, 16(R10)(SI*8)
   173  	MOVQ R14, 24(R10)(SI*8)
   174  
   175  	ADDQ $4, SI		// i += 4
   176  	SUBQ $4, DI		// n -= 4
   177  	JGE U3			// if n >= 0 goto U3
   178  
   179  V3:	ADDQ $4, DI		// n += 4
   180  	JLE E3			// if n <= 0 goto E3
   181  
   182  L3:	// n > 0
   183  	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   184  	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   185  	SBBQ CX, CX		// save CF
   186  	NEGQ CX			// CX = 0 or 1
   187  
   188  	ADDQ $1, SI		// i++
   189  	SUBQ $1, DI		// n--
   190  	JG L3			// if n > 0 goto L3
   191  
   192  E3:	MOVQ CX, c+56(FP)	// return c
   193  	RET
   194  large:
   195  	JMP ·addVWlarge(SB)
   196  
   197  
   198  // func subVW(z, x []Word, y Word) (c Word)
   199  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        // subVW subtracts the single word y from the vector x, writing the
        // result to z and propagating the borrow through len(z) words; the
        // final borrow (0 or 1, or y itself when len(z) == 0) is returned in c.
        // For len(z) > 32 control is transferred to ·subVWlarge, which is
        // defined outside this listing (presumably with the same signature,
        // since it is entered with this function's argument frame).
        // Registers: DI = n (words left), SI = i, R8 = &x[0], R10 = &z[0],
        // CX = value borrowed from the next word (y at first, then 0 or 1).
   200  TEXT ·subVW(SB),NOSPLIT,$0
   201  	MOVQ z_len+8(FP), DI
   202  	CMPQ DI, $32
   203  	JG large		// long vectors are handled by subVWlarge
   204  	MOVQ x+24(FP), R8
   205  	MOVQ y+48(FP), CX	// c = y
   206  	MOVQ z+0(FP), R10
   207  
   208  	MOVQ $0, SI		// i = 0
   209  
   210  	// s/JL/JMP/ below to disable the unrolled loop
   211  	SUBQ $4, DI		// n -= 4
   212  	JL V4			// if n < 0 goto V4
   213  
   214  U4:	// n >= 0
   215  	// regular loop body unrolled 4x
   216  	MOVQ 0(R8)(SI*8), R11
   217  	MOVQ 8(R8)(SI*8), R12
   218  	MOVQ 16(R8)(SI*8), R13
   219  	MOVQ 24(R8)(SI*8), R14
   220  	SUBQ CX, R11		// subtract incoming borrow/y from the lowest word
   221  	SBBQ $0, R12		// then ripple the borrow upward
   222  	SBBQ $0, R13
   223  	SBBQ $0, R14
   224  	SBBQ CX, CX		// save CF
   225  	NEGQ CX			// CX = 0 or 1
   226  	MOVQ R11, 0(R10)(SI*8)
   227  	MOVQ R12, 8(R10)(SI*8)
   228  	MOVQ R13, 16(R10)(SI*8)
   229  	MOVQ R14, 24(R10)(SI*8)
   230  
   231  	ADDQ $4, SI		// i += 4
   232  	SUBQ $4, DI		// n -= 4
   233  	JGE U4			// if n >= 0 goto U4
   234  
   235  V4:	ADDQ $4, DI		// n += 4
   236  	JLE E4			// if n <= 0 goto E4
   237  
   238  L4:	// n > 0
   239  	MOVQ 0(R8)(SI*8), R11
   240  	SUBQ CX, R11		// R11 = x[i] - c
   241  	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
   242  	SBBQ CX, CX		// save CF
   243  	NEGQ CX			// CX = 0 or 1
   244  
   245  	ADDQ $1, SI		// i++
   246  	SUBQ $1, DI		// n--
   247  	JG L4			// if n > 0 goto L4
   248  
   249  E4:	MOVQ CX, c+56(FP)	// return c
   250  	RET
   251  large:
   252  	JMP ·subVWlarge(SB)
   253  
   254  
   255  // func shlVU(z, x []Word, s uint) (c Word)
        // shlVU sets z = x << s, walking from the most significant word
        // (index len(z)-1) down to index 0, and returns in c the bits shifted
        // out of the top word. If len(z) == 0 it only stores c = 0.
        // In the comments below ŝ denotes 64-s.
        // The three-operand SHLQ CX, AX, DX is the double-word shift: DX is
        // shifted left by CX and the vacated low bits are filled from the
        // high bits of AX.
        // NOTE(review): s is used directly as the hardware shift count, which
        // the CPU takes modulo 64; this presumably requires s < 64 - confirm
        // against callers.
        // Registers: BX = i, R10 = &z[0], R8 = &x[0], CX = s, AX = w1, DX = w.
   256  TEXT ·shlVU(SB),NOSPLIT,$0
   257  	MOVQ z_len+8(FP), BX	// i = len(z)
   258  	SUBQ $1, BX		// i--
   259  	JL X8b			// i < 0	(n <= 0)
   260  
   261  	// n > 0
   262  	MOVQ z+0(FP), R10
   263  	MOVQ x+24(FP), R8
   264  	MOVQ s+48(FP), CX
   265  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   266  	MOVQ $0, DX
   267  	SHLQ CX, AX, DX		// w1>>ŝ
   268  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   269  
   270  	CMPQ BX, $0
   271  	JLE X8a			// i <= 0
   272  
   273  	// i > 0
   274  L8:	MOVQ AX, DX		// w = w1
   275  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   276  	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   277  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   278  	SUBQ $1, BX		// i--
   279  	JG L8			// i > 0
   280  
   281  	// i <= 0
   282  X8a:	SHLQ CX, AX		// w1<<s
   283  	MOVQ AX, (R10)		// z[0] = w1<<s
   284  	RET
   285  
   286  X8b:	MOVQ $0, c+56(FP)
   287  	RET
   288  
   289  
   290  // func shrVU(z, x []Word, s uint) (c Word)
        // shrVU sets z = x >> s, walking from index 0 up to index len(z)-1,
        // and returns in c the bits shifted out of the lowest word (x[0]<<ŝ).
        // If len(z) == 0 it only stores c = 0.
        // In the comments below ŝ denotes 64-s.
        // The three-operand SHRQ CX, AX, DX is the double-word shift: DX is
        // shifted right by CX and the vacated high bits are filled from the
        // low bits of AX.
        // NOTE(review): s is used directly as the hardware shift count, which
        // the CPU takes modulo 64; this presumably requires s < 64 - confirm
        // against callers.
        // Registers: R11 = n-1, BX = i, R10 = &z[0], R8 = &x[0], CX = s,
        // AX = w1, DX = w.
   291  TEXT ·shrVU(SB),NOSPLIT,$0
   292  	MOVQ z_len+8(FP), R11
   293  	SUBQ $1, R11		// n--
   294  	JL X9b			// n < 0	(n <= 0)
   295  
   296  	// n > 0
   297  	MOVQ z+0(FP), R10
   298  	MOVQ x+24(FP), R8
   299  	MOVQ s+48(FP), CX
   300  	MOVQ (R8), AX		// w1 = x[0]
   301  	MOVQ $0, DX
   302  	SHRQ CX, AX, DX		// w1<<ŝ
   303  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   304  
   305  	MOVQ $0, BX		// i = 0
   306  	JMP E9
   307  
   308  	// i < n-1
   309  L9:	MOVQ AX, DX		// w = w1
   310  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   311  	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   312  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   313  	ADDQ $1, BX		// i++
   314  
   315  E9:	CMPQ BX, R11
   316  	JL L9			// i < n-1
   317  
   318  	// i >= n-1
   319  X9a:	SHRQ CX, AX		// w1>>s
   320  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   321  	RET
   322  
   323  X9b:	MOVQ $0, c+56(FP)
   324  	RET
   325  
   326  
   327  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // mulAddVWW computes z = x*y + r word by word: for each i in
        // 0 .. len(z)-1 it forms the two-word value x[i]*y + c (c starts as r),
        // stores the low word in z[i] and carries the high word into the next
        // step. The last high word is returned in c.
        // Registers: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n, BX = i;
        // AX and DX are the implicit MULQ operands/results.
   328  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   329  	MOVQ z+0(FP), R10
   330  	MOVQ x+24(FP), R8
   331  	MOVQ y+48(FP), R9
   332  	MOVQ r+56(FP), CX	// c = r
   333  	MOVQ z_len+8(FP), R11
   334  	MOVQ $0, BX		// i = 0
   335  
   336  	CMPQ R11, $4
   337  	JL E5			// fewer than 4 words: skip the unrolled loop
   338  
   339  U5:	// i+4 <= n
   340  	// regular loop body unrolled 4x
   341  	MOVQ (0*8)(R8)(BX*8), AX
   342  	MULQ R9			// DX:AX = x[i] * y
   343  	ADDQ CX, AX		// add incoming carry word
   344  	ADCQ $0, DX		// propagate into the high word
   345  	MOVQ AX, (0*8)(R10)(BX*8)
   346  	MOVQ DX, CX		// c = high word
   347  	MOVQ (1*8)(R8)(BX*8), AX
   348  	MULQ R9
   349  	ADDQ CX, AX
   350  	ADCQ $0, DX
   351  	MOVQ AX, (1*8)(R10)(BX*8)
   352  	MOVQ DX, CX
   353  	MOVQ (2*8)(R8)(BX*8), AX
   354  	MULQ R9
   355  	ADDQ CX, AX
   356  	ADCQ $0, DX
   357  	MOVQ AX, (2*8)(R10)(BX*8)
   358  	MOVQ DX, CX
   359  	MOVQ (3*8)(R8)(BX*8), AX
   360  	MULQ R9
   361  	ADDQ CX, AX
   362  	ADCQ $0, DX
   363  	MOVQ AX, (3*8)(R10)(BX*8)
   364  	MOVQ DX, CX
   365  	ADDQ $4, BX		// i += 4
   366  
   367  	LEAQ 4(BX), DX		// DX = i+4 (DX is free: its value was copied to CX)
   368  	CMPQ DX, R11
   369  	JLE U5			// another full group of 4 fits
   370  	JMP E5
   371  
   372  L5:	MOVQ (R8)(BX*8), AX
   373  	MULQ R9
   374  	ADDQ CX, AX
   375  	ADCQ $0, DX
   376  	MOVQ AX, (R10)(BX*8)
   377  	MOVQ DX, CX
   378  	ADDQ $1, BX		// i++
   379  
   380  E5:	CMPQ BX, R11		// i < n
   381  	JL L5
   382  
   383  	MOVQ CX, c+64(FP)
   384  	RET
   385  
   386  
   387  // func addMulVVW(z, x []Word, y Word) (c Word)
        // addMulVVW computes z += x*y word by word over len(z) words: each step
        // forms z[i] + x[i]*y + c, stores the low word back to z[i] and carries
        // the high word into the next step. The last carry word is returned in c.
        // Two implementations are selected at run time by the byte flag
        // ·support_adx (defined outside this listing):
        //   - flag != 1: MULQ-based loop, unrolled 2x (labels A6/L6/E6);
        //   - flag == 1: MULXQ/ADCXQ/ADOXQ loop, unrolled 8x (labels adx*).
        // Generic path registers: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n,
        // R12 = n rounded down to a multiple of 2, BX = i, CX = c.
   388  TEXT ·addMulVVW(SB),NOSPLIT,$0
   389  	CMPB    ·support_adx(SB), $1
   390  	JEQ adx
   391  	MOVQ z+0(FP), R10
   392  	MOVQ x+24(FP), R8
   393  	MOVQ y+48(FP), R9
   394  	MOVQ z_len+8(FP), R11
   395  	MOVQ $0, BX		// i = 0
   396  	MOVQ $0, CX		// c = 0
   397  	MOVQ R11, R12
   398  	ANDQ $-2, R12		// R12 = n with the low bit cleared
   399  	CMPQ R11, $2
   400  	JAE A6			// at least one pair of words
   401  	JMP E6
   402  
   403  A6:
   404  	MOVQ (R8)(BX*8), AX
   405  	MULQ R9			// DX:AX = x[i] * y
   406  	ADDQ (R10)(BX*8), AX	// + z[i]
   407  	ADCQ $0, DX
   408  	ADDQ CX, AX		// + c
   409  	ADCQ $0, DX
   410  	MOVQ DX, CX		// c = high word
   411  	MOVQ AX, (R10)(BX*8)
   412  
   413  	MOVQ (8)(R8)(BX*8), AX
   414  	MULQ R9
   415  	ADDQ (8)(R10)(BX*8), AX
   416  	ADCQ $0, DX
   417  	ADDQ CX, AX
   418  	ADCQ $0, DX
   419  	MOVQ DX, CX
   420  	MOVQ AX, (8)(R10)(BX*8)
   421  
   422  	ADDQ $2, BX
   423  	CMPQ BX, R12
   424  	JL A6
   425  	JMP E6
   426  
   427  L6:	MOVQ (R8)(BX*8), AX
   428  	MULQ R9
   429  	ADDQ CX, AX
   430  	ADCQ $0, DX
   431  	ADDQ AX, (R10)(BX*8)	// z[i] += low word (read-modify-write)
   432  	ADCQ $0, DX
   433  	MOVQ DX, CX
   434  	ADDQ $1, BX		// i++
   435  
   436  E6:	CMPQ BX, R11		// i < n
   437  	JL L6
   438  
   439  	MOVQ CX, c+56(FP)
   440  	RET
   441  
        	// ADX path. DX holds y because MULXQ takes its multiplicand from DX.
        	// R11 = n, R13 = n rounded down to a multiple of 8, BX = i, CX = c.
   442  adx:
   443  	MOVQ z_len+8(FP), R11
   444  	MOVQ z+0(FP), R10
   445  	MOVQ x+24(FP), R8
   446  	MOVQ y+48(FP), DX
   447  	MOVQ $0, BX   // i = 0
   448  	MOVQ $0, CX   // carry
   449  	CMPQ R11, $8
   450  	JAE  adx_loop_header
   451  	CMPQ BX, R11
   452  	JL adx_short
   453  	MOVQ CX, c+56(FP)	// n == 0: return c = 0
   454  	RET
   455  
   456  adx_loop_header:
   457  	MOVQ  R11, R13
   458  	ANDQ  $-8, R13
        	// Each group of 8 words runs two independent carry chains:
        	// ADCXQ (CF) adds the previous product's high word to the current
        	// low word, and ADOXQ (OF) adds in the existing z word.
        	// The low/high destination registers alternate between SI,DI and AX,CX.
   459  adx_loop:
   460  	XORQ  R9, R9  // unset flags
   461  	MULXQ (R8), SI, DI
   462  	ADCXQ CX,SI
   463  	ADOXQ (R10), SI
   464  	MOVQ  SI,(R10)
   465  
   466  	MULXQ 8(R8), AX, CX
   467  	ADCXQ DI, AX
   468  	ADOXQ 8(R10), AX
   469  	MOVQ  AX, 8(R10)
   470  
   471  	MULXQ 16(R8), SI, DI
   472  	ADCXQ CX, SI
   473  	ADOXQ 16(R10), SI
   474  	MOVQ  SI, 16(R10)
   475  
   476  	MULXQ 24(R8), AX, CX
   477  	ADCXQ DI, AX
   478  	ADOXQ 24(R10), AX
   479  	MOVQ  AX, 24(R10)
   480  
   481  	MULXQ 32(R8), SI, DI
   482  	ADCXQ CX, SI
   483  	ADOXQ 32(R10), SI
   484  	MOVQ  SI, 32(R10)
   485  
   486  	MULXQ 40(R8), AX, CX
   487  	ADCXQ DI, AX
   488  	ADOXQ 40(R10), AX
   489  	MOVQ  AX, 40(R10)
   490  
   491  	MULXQ 48(R8), SI, DI
   492  	ADCXQ CX, SI
   493  	ADOXQ 48(R10), SI
   494  	MOVQ  SI, 48(R10)
   495  
   496  	MULXQ 56(R8), AX, CX
   497  	ADCXQ DI, AX
   498  	ADOXQ 56(R10), AX
   499  	MOVQ  AX, 56(R10)
   500  
        	// Fold both pending carries into the outgoing carry word (R9 is 0).
   501  	ADCXQ R9, CX
   502  	ADOXQ R9, CX
   503  
   504  	ADDQ $64, R8		// advance x by 8 words
   505  	ADDQ $64, R10		// advance z by 8 words
   506  	ADDQ $8, BX
   507  
   508  	CMPQ BX, R13
   509  	JL adx_loop
        	// R8/R10 were advanced in the loop; reload the base pointers because
        	// the tail loop addresses x and z as base + BX*8.
   510  	MOVQ z+0(FP), R10
   511  	MOVQ x+24(FP), R8
   512  	CMPQ BX, R11
   513  	JL adx_short
   514  	MOVQ CX, c+56(FP)
   515  	RET
   516  
        	// Tail: one word per iteration for the remaining n mod 8 words.
   517  adx_short:
   518  	MULXQ (R8)(BX*8), SI, DI
   519  	ADDQ CX, SI
   520  	ADCQ $0, DI
   521  	ADDQ SI, (R10)(BX*8)
   522  	ADCQ $0, DI
   523  	MOVQ DI, CX
   524  	ADDQ $1, BX		// i++
   525  
   526  	CMPQ BX, R11
   527  	JL adx_short
   528  
   529  	MOVQ CX, c+56(FP)
   530  	RET
   531  
   532  
   533  
   534  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        // divWVW divides the multi-word value whose top word is xn and whose
        // remaining words are x[len(z)-1] .. x[0] by the single word y.
        // It walks from the highest index down to 0, storing each quotient
        // word in z[i], and returns the final remainder in r.
        // DX carries the running remainder between steps: it is the high word
        // of each DIVQ dividend and receives the new remainder afterwards.
        // No checks are performed here: DIVQ raises a divide error if y == 0
        // or if a quotient word overflows (i.e. when xn >= y).
        // Registers: R10 = &z[0], R8 = &x[0], R9 = y, BX = i, DX = r.
   535  TEXT ·divWVW(SB),NOSPLIT,$0
   536  	MOVQ z+0(FP), R10
   537  	MOVQ xn+24(FP), DX	// r = xn
   538  	MOVQ x+32(FP), R8
   539  	MOVQ y+56(FP), R9
   540  	MOVQ z_len+8(FP), BX	// i = len(z)
   541  	JMP E7
   542  
   543  L7:	MOVQ (R8)(BX*8), AX
   544  	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
   545  	MOVQ AX, (R10)(BX*8)	// z[i] = quotient word
   546  
   547  E7:	SUBQ $1, BX		// i--
   548  	JGE L7			// i >= 0
   549  
   550  	MOVQ DX, r+64(FP)
   551  	RET