github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        // mulWW multiplies the two unsigned 64-bit words x and y and returns the
        // full 128-bit product: z1 receives the high word, z0 the low word.
    13  TEXT ·mulWW(SB),NOSPLIT,$0
    14  	MOVQ x+0(FP), AX
    15  	MULQ y+8(FP)		// DX:AX = AX * y (unsigned)
    16  	MOVQ DX, z1+16(FP)	// high word of the product
    17  	MOVQ AX, z0+24(FP)	// low word of the product
    18  	RET
    19  
    20  
    21  // func divWW(x1, x0, y Word) (q, r Word)
        // divWW divides the unsigned 128-bit value x1:x0 by y and returns the
        // quotient in q and the remainder in r.
        // NOTE(review): no operand checks are made here. DIVQ raises a hardware
        // divide error if y == 0 or if the quotient does not fit in one word
        // (x1 >= y), so callers must rule those cases out.
    22  TEXT ·divWW(SB),NOSPLIT,$0
    23  	MOVQ x1+0(FP), DX	// high word of the dividend
    24  	MOVQ x0+8(FP), AX	// low word of the dividend
    25  	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26  	MOVQ AX, q+24(FP)
    27  	MOVQ DX, r+32(FP)
    28  	RET
    29  
    30  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32  // This is faster than using rotate instructions.
    33  
    34  // func addVV(z, x, y []Word) (c Word)
        // addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and
        // returns the final carry (0 or 1) in c.
        // Only len(z) is read; x and y are indexed with the same count, so they
        // are assumed to hold at least len(z) words (TODO confirm against callers).
        // Registers: DI = n (words remaining), SI = i, CX = saved carry (0 or -1),
        // R8 = &x[0], R9 = &y[0], R10 = &z[0], R11-R14 = scratch words.
    35  TEXT ·addVV(SB),NOSPLIT,$0
    36  	MOVQ z_len+8(FP), DI
    37  	MOVQ x+24(FP), R8
    38  	MOVQ y+48(FP), R9
    39  	MOVQ z+0(FP), R10
    40  
    41  	MOVQ $0, CX		// c = 0
    42  	MOVQ $0, SI		// i = 0
    43  
    44  	// s/JL/JMP/ below to disable the unrolled loop
    45  	SUBQ $4, DI		// n -= 4
    46  	JL V1			// if n < 0 goto V1 (fewer than 4 words: skip the unrolled loop)
    47  
    48  U1:	// n >= 0, i.e. at least 4 words remain
    49  	// regular loop body unrolled 4x
    50  	ADDQ CX, CX		// restore CF (CX is 0 or -1)
    51  	MOVQ 0(R8)(SI*8), R11
    52  	MOVQ 8(R8)(SI*8), R12
    53  	MOVQ 16(R8)(SI*8), R13
    54  	MOVQ 24(R8)(SI*8), R14
    55  	ADCQ 0(R9)(SI*8), R11	// the MOVQs above and below leave CF untouched,
    56  	ADCQ 8(R9)(SI*8), R12	// so the carry ripples through all four words
    57  	ADCQ 16(R9)(SI*8), R13
    58  	ADCQ 24(R9)(SI*8), R14
    59  	MOVQ R11, 0(R10)(SI*8)
    60  	MOVQ R12, 8(R10)(SI*8)
    61  	MOVQ R13, 16(R10)(SI*8)
    62  	MOVQ R14, 24(R10)(SI*8)
    63  	SBBQ CX, CX		// save CF as 0 or -1; the ADDQ/SUBQ below overwrite the flags
    64  
    65  	ADDQ $4, SI		// i += 4
    66  	SUBQ $4, DI		// n -= 4
    67  	JGE U1			// if n >= 0 goto U1
    68  
    69  V1:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
    70  	JLE E1			// if n <= 0 goto E1
    71  
    72  L1:	// n > 0: one word per iteration
    73  	ADDQ CX, CX		// restore CF
    74  	MOVQ 0(R8)(SI*8), R11
    75  	ADCQ 0(R9)(SI*8), R11
    76  	MOVQ R11, 0(R10)(SI*8)
    77  	SBBQ CX, CX		// save CF
    78  
    79  	ADDQ $1, SI		// i++
    80  	SUBQ $1, DI		// n--
    81  	JG L1			// if n > 0 goto L1
    82  
    83  E1:	NEGQ CX			// CX is 0 or -1; negate to obtain c = 0 or 1
    84  	MOVQ CX, c+72(FP)	// return c
    85  	RET
    86  
    87  
    88  // func subVV(z, x, y []Word) (c Word)
    89  // (same as addVV except for SBBQ instead of ADCQ and label names)
        // subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and
        // returns the final borrow (0 or 1) in c.
        // Only len(z) is read; x and y are indexed with the same count, so they
        // are assumed to hold at least len(z) words (TODO confirm against callers).
        // Registers: DI = n (words remaining), SI = i, CX = saved borrow (0 or -1),
        // R8 = &x[0], R9 = &y[0], R10 = &z[0], R11-R14 = scratch words.
    90  TEXT ·subVV(SB),NOSPLIT,$0
    91  	MOVQ z_len+8(FP), DI
    92  	MOVQ x+24(FP), R8
    93  	MOVQ y+48(FP), R9
    94  	MOVQ z+0(FP), R10
    95  
    96  	MOVQ $0, CX		// c = 0
    97  	MOVQ $0, SI		// i = 0
    98  
    99  	// s/JL/JMP/ below to disable the unrolled loop
   100  	SUBQ $4, DI		// n -= 4
   101  	JL V2			// if n < 0 goto V2 (fewer than 4 words: skip the unrolled loop)
   102  
   103  U2:	// n >= 0, i.e. at least 4 words remain
   104  	// regular loop body unrolled 4x
   105  	ADDQ CX, CX		// restore CF (CX is 0 or -1)
   106  	MOVQ 0(R8)(SI*8), R11
   107  	MOVQ 8(R8)(SI*8), R12
   108  	MOVQ 16(R8)(SI*8), R13
   109  	MOVQ 24(R8)(SI*8), R14
   110  	SBBQ 0(R9)(SI*8), R11	// the MOVQs above and below leave CF untouched,
   111  	SBBQ 8(R9)(SI*8), R12	// so the borrow ripples through all four words
   112  	SBBQ 16(R9)(SI*8), R13
   113  	SBBQ 24(R9)(SI*8), R14
   114  	MOVQ R11, 0(R10)(SI*8)
   115  	MOVQ R12, 8(R10)(SI*8)
   116  	MOVQ R13, 16(R10)(SI*8)
   117  	MOVQ R14, 24(R10)(SI*8)
   118  	SBBQ CX, CX		// save CF as 0 or -1; the ADDQ/SUBQ below overwrite the flags
   119  
   120  	ADDQ $4, SI		// i += 4
   121  	SUBQ $4, DI		// n -= 4
   122  	JGE U2			// if n >= 0 goto U2
   123  
   124  V2:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   125  	JLE E2			// if n <= 0 goto E2
   126  
   127  L2:	// n > 0: one word per iteration
   128  	ADDQ CX, CX		// restore CF
   129  	MOVQ 0(R8)(SI*8), R11
   130  	SBBQ 0(R9)(SI*8), R11
   131  	MOVQ R11, 0(R10)(SI*8)
   132  	SBBQ CX, CX		// save CF
   133  
   134  	ADDQ $1, SI		// i++
   135  	SUBQ $1, DI		// n--
   136  	JG L2			// if n > 0 goto L2
   137  
   138  E2:	NEGQ CX			// CX is 0 or -1; negate to obtain c = 0 or 1
   139  	MOVQ CX, c+72(FP)	// return c
   140  	RET
   141  
   142  
   143  // func addVW(z, x []Word, y Word) (c Word)
        // addVW computes z = x + y for a single-word y: y is added to x[0] and the
        // resulting carry is propagated through the remaining words. The final
        // carry is returned in c; when len(z) <= 0 nothing is stored and c = y.
        // Only len(z) is read; x is assumed to hold at least len(z) words
        // (TODO confirm against callers).
        // Registers: DI = n (words remaining), SI = i, CX = c (y on entry, then
        // the carry as 0 or 1), R8 = &x[0], R10 = &z[0], R11-R14 = scratch words.
   144  TEXT ·addVW(SB),NOSPLIT,$0
   145  	MOVQ z_len+8(FP), DI
   146  	MOVQ x+24(FP), R8
   147  	MOVQ y+48(FP), CX	// c = y
   148  	MOVQ z+0(FP), R10
   149  
   150  	MOVQ $0, SI		// i = 0
   151  
   152  	// s/JL/JMP/ below to disable the unrolled loop
   153  	SUBQ $4, DI		// n -= 4
   154  	JL V3			// if n < 0 goto V3 (fewer than 4 words: skip the unrolled loop)
   155  
   156  U3:	// n >= 0, i.e. at least 4 words remain
   157  	// regular loop body unrolled 4x
   158  	MOVQ 0(R8)(SI*8), R11
   159  	MOVQ 8(R8)(SI*8), R12
   160  	MOVQ 16(R8)(SI*8), R13
   161  	MOVQ 24(R8)(SI*8), R14
   162  	ADDQ CX, R11		// x[i] + c
   163  	ADCQ $0, R12		// ripple the carry through the next three words
   164  	ADCQ $0, R13
   165  	ADCQ $0, R14
   166  	SBBQ CX, CX		// save CF
   167  	NEGQ CX			// c = 0 or 1, ready to be added in the next iteration
   168  	MOVQ R11, 0(R10)(SI*8)
   169  	MOVQ R12, 8(R10)(SI*8)
   170  	MOVQ R13, 16(R10)(SI*8)
   171  	MOVQ R14, 24(R10)(SI*8)
   172  
   173  	ADDQ $4, SI		// i += 4
   174  	SUBQ $4, DI		// n -= 4
   175  	JGE U3			// if n >= 0 goto U3
   176  
   177  V3:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   178  	JLE E3			// if n <= 0 goto E3
   179  
   180  L3:	// n > 0: one word per iteration
   181  	ADDQ 0(R8)(SI*8), CX	// c += x[i]
   182  	MOVQ CX, 0(R10)(SI*8)	// z[i] = low word of the sum
   183  	SBBQ CX, CX		// save CF
   184  	NEGQ CX			// c = 0 or 1
   185  
   186  	ADDQ $1, SI		// i++
   187  	SUBQ $1, DI		// n--
   188  	JG L3			// if n > 0 goto L3
   189  
   190  E3:	MOVQ CX, c+56(FP)	// return c
   191  	RET
   192  
   193  
   194  // func subVW(z, x []Word, y Word) (c Word)
   195  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        // subVW computes z = x - y for a single-word y: y is subtracted from x[0]
        // and the resulting borrow is propagated through the remaining words. The
        // final borrow is returned in c; when len(z) <= 0 nothing is stored and
        // c = y.
        // Only len(z) is read; x is assumed to hold at least len(z) words
        // (TODO confirm against callers).
        // Registers: DI = n (words remaining), SI = i, CX = c (y on entry, then
        // the borrow as 0 or 1), R8 = &x[0], R10 = &z[0], R11-R14 = scratch words.
   196  TEXT ·subVW(SB),NOSPLIT,$0
   197  	MOVQ z_len+8(FP), DI
   198  	MOVQ x+24(FP), R8
   199  	MOVQ y+48(FP), CX	// c = y
   200  	MOVQ z+0(FP), R10
   201  
   202  	MOVQ $0, SI		// i = 0
   203  
   204  	// s/JL/JMP/ below to disable the unrolled loop
   205  	SUBQ $4, DI		// n -= 4
   206  	JL V4			// if n < 0 goto V4 (fewer than 4 words: skip the unrolled loop)
   207  
   208  U4:	// n >= 0, i.e. at least 4 words remain
   209  	// regular loop body unrolled 4x
   210  	MOVQ 0(R8)(SI*8), R11
   211  	MOVQ 8(R8)(SI*8), R12
   212  	MOVQ 16(R8)(SI*8), R13
   213  	MOVQ 24(R8)(SI*8), R14
   214  	SUBQ CX, R11		// x[i] - c
   215  	SBBQ $0, R12		// ripple the borrow through the next three words
   216  	SBBQ $0, R13
   217  	SBBQ $0, R14
   218  	SBBQ CX, CX		// save CF
   219  	NEGQ CX			// c = 0 or 1, ready to be subtracted in the next iteration
   220  	MOVQ R11, 0(R10)(SI*8)
   221  	MOVQ R12, 8(R10)(SI*8)
   222  	MOVQ R13, 16(R10)(SI*8)
   223  	MOVQ R14, 24(R10)(SI*8)
   224  
   225  	ADDQ $4, SI		// i += 4
   226  	SUBQ $4, DI		// n -= 4
   227  	JGE U4			// if n >= 0 goto U4
   228  
   229  V4:	ADDQ $4, DI		// n += 4 (undo the bias; n is now the 0..3 leftover words)
   230  	JLE E4			// if n <= 0 goto E4
   231  
   232  L4:	// n > 0: one word per iteration
   233  	MOVQ 0(R8)(SI*8), R11
   234  	SUBQ CX, R11		// x[i] - c
   235  	MOVQ R11, 0(R10)(SI*8)
   236  	SBBQ CX, CX		// save CF
   237  	NEGQ CX			// c = 0 or 1
   238  
   239  	ADDQ $1, SI		// i++
   240  	SUBQ $1, DI		// n--
   241  	JG L4			// if n > 0 goto L4
   242  
   243  E4:	MOVQ CX, c+56(FP)	// return c
   244  	RET
   245  
   246  
   247  // func shlVU(z, x []Word, s uint) (c Word)
        // shlVU sets z = x << s, walking from the most significant word
        // (index len(z)-1) down to index 0, and returns in c the bits shifted out
        // of the top word (x[n-1] >> ŝ). When len(z) <= 0 it stores nothing and
        // returns c = 0.
        // In the comments below ŝ stands for 64-s.
        // The two-register form "SHLQ CX, DX:AX" shifts DX left by CX bits, filling
        // the vacated low bits from the high bits of AX; AX itself is unchanged.
        // NOTE(review): s is loaded into CX unchecked; presumably callers pass
        // 0 <= s < 64 — confirm.
        // Registers: BX = i, CX = s, R8 = &x[0], R10 = &z[0], AX = w1, DX = w.
   248  TEXT ·shlVU(SB),NOSPLIT,$0
   249  	MOVQ z_len+8(FP), BX	// i = len(z)
   250  	SUBQ $1, BX		// i--
   251  	JL X8b			// i < 0	(n <= 0)
   252  
   253  	// n > 0
   254  	MOVQ z+0(FP), R10
   255  	MOVQ x+24(FP), R8
   256  	MOVQ s+48(FP), CX
   257  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   258  	MOVQ $0, DX
   259  	SHLQ CX, DX:AX		// w1>>ŝ
   260  	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word
   261  
   262  	CMPQ BX, $0
   263  	JLE X8a			// i <= 0
   264  
   265  	// i > 0
   266  L8:	MOVQ AX, DX		// w = w1
   267  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   268  	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   269  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   270  	SUBQ $1, BX		// i--
   271  	JG L8			// i > 0
   272  
   273  	// i <= 0
   274  X8a:	SHLQ CX, AX		// w1<<s
   275  	MOVQ AX, (R10)		// z[0] = w1<<s
   276  	RET
   277  
   278  X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   279  	RET
   280  
   281  
   282  // func shrVU(z, x []Word, s uint) (c Word)
        // shrVU sets z = x >> s, walking from index 0 up to index len(z)-1, and
        // returns in c the bits shifted out of the bottom word (x[0] << ŝ). When
        // len(z) <= 0 it stores nothing and returns c = 0.
        // In the comments below ŝ stands for 64-s.
        // The two-register form "SHRQ CX, DX:AX" shifts DX right by CX bits,
        // filling the vacated high bits from the low bits of AX; AX itself is
        // unchanged.
        // NOTE(review): s is loaded into CX unchecked; presumably callers pass
        // 0 <= s < 64 — confirm.
        // Registers: R11 = n-1, BX = i, CX = s, R8 = &x[0], R10 = &z[0],
        // AX = w1, DX = w.
   283  TEXT ·shrVU(SB),NOSPLIT,$0
   284  	MOVQ z_len+8(FP), R11
   285  	SUBQ $1, R11		// n--
   286  	JL X9b			// n < 0	(n <= 0)
   287  
   288  	// n > 0
   289  	MOVQ z+0(FP), R10
   290  	MOVQ x+24(FP), R8
   291  	MOVQ s+48(FP), CX
   292  	MOVQ (R8), AX		// w1 = x[0]
   293  	MOVQ $0, DX
   294  	SHRQ CX, DX:AX		// w1<<ŝ
   295  	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word
   296  
   297  	MOVQ $0, BX		// i = 0
   298  	JMP E9			// test the loop condition before the first iteration
   299  
   300  	// i < n-1
   301  L9:	MOVQ AX, DX		// w = w1
   302  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   303  	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   304  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   305  	ADDQ $1, BX		// i++
   306  
   307  E9:	CMPQ BX, R11
   308  	JL L9			// i < n-1
   309  
   310  	// i >= n-1
   311  X9a:	SHRQ CX, AX		// w1>>s
   312  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   313  	RET
   314  
   315  X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   316  	RET
   317  
   318  
   319  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // mulAddVWW computes z = x*y + r for a single-word y and r: for each i in
        // [0, len(z)) the 128-bit value x[i]*y + c is formed, its low word is
        // stored in z[i] and its high word becomes the next c. c starts as r and
        // its final value is returned.
        // Only len(z) is read; x is assumed to hold at least len(z) words
        // (TODO confirm against callers).
        // Registers: R10 = &z[0], R8 = &x[0], R9 = y, CX = c, R11 = n, BX = i,
        // DX:AX = product (implicit operands of MULQ).
   320  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321  	MOVQ z+0(FP), R10
   322  	MOVQ x+24(FP), R8
   323  	MOVQ y+48(FP), R9
   324  	MOVQ r+56(FP), CX	// c = r
   325  	MOVQ z_len+8(FP), R11
   326  	MOVQ $0, BX		// i = 0
   327  
   328  	CMPQ R11, $4
   329  	JL E5			// fewer than 4 words: skip the unrolled loop
   330  
   331  U5:	// i+4 <= n
   332  	// regular loop body unrolled 4x
   333  	MOVQ (0*8)(R8)(BX*8), AX
   334  	MULQ R9			// DX:AX = x[i] * y
   335  	ADDQ CX, AX		// add the incoming carry to the low word
   336  	ADCQ $0, DX		// and propagate into the high word
   337  	MOVQ AX, (0*8)(R10)(BX*8)
   338  	MOVQ DX, CX		// c = high word
   339  	MOVQ (1*8)(R8)(BX*8), AX
   340  	MULQ R9
   341  	ADDQ CX, AX
   342  	ADCQ $0, DX
   343  	MOVQ AX, (1*8)(R10)(BX*8)
   344  	MOVQ DX, CX
   345  	MOVQ (2*8)(R8)(BX*8), AX
   346  	MULQ R9
   347  	ADDQ CX, AX
   348  	ADCQ $0, DX
   349  	MOVQ AX, (2*8)(R10)(BX*8)
   350  	MOVQ DX, CX
   351  	MOVQ (3*8)(R8)(BX*8), AX
   352  	MULQ R9
   353  	ADDQ CX, AX
   354  	ADCQ $0, DX
   355  	MOVQ AX, (3*8)(R10)(BX*8)
   356  	MOVQ DX, CX
   357  	ADDQ $4, BX		// i += 4
   358  
   359  	LEAQ 4(BX), DX		// DX = i+4 (DX is free: the high word is already in CX)
   360  	CMPQ DX, R11
   361  	JLE U5			// another full group of 4 fits: i+4 <= n
   362  	JMP E5
   363  
   364  L5:	MOVQ (R8)(BX*8), AX	// tail: one word per iteration
   365  	MULQ R9
   366  	ADDQ CX, AX
   367  	ADCQ $0, DX
   368  	MOVQ AX, (R10)(BX*8)
   369  	MOVQ DX, CX
   370  	ADDQ $1, BX		// i++
   371  
   372  E5:	CMPQ BX, R11		// i < n
   373  	JL L5
   374  
   375  	MOVQ CX, c+64(FP)	// return c
   376  	RET
   377  
   378  
   379  // func addMulVVW(z, x []Word, y Word) (c Word)
        // addMulVVW computes z += x*y for a single-word y: for each i in
        // [0, len(z)) the value z[i] + x[i]*y + c is formed, its low word is
        // stored back in z[i] and its high word becomes the next c. c starts at 0
        // and its final value is returned.
        // Only len(z) is read; x is assumed to hold at least len(z) words
        // (TODO confirm against callers).
        // Two implementations follow. When the byte ·support_adx equals 1 the
        // "adx" path (MULXQ/ADCXQ/ADOXQ, unrolled 8x) is taken; otherwise the
        // generic MULQ path (unrolled 2x) is used. ·support_adx is defined outside
        // this file; presumably it reports CPU support for those instructions —
        // confirm at its definition.
        // Generic path registers: R10 = &z[0], R8 = &x[0], R9 = y, R11 = n,
        // R12 = n rounded down to a multiple of 2, BX = i, CX = c.
   380  TEXT ·addMulVVW(SB),NOSPLIT,$0
   381  	CMPB    ·support_adx(SB), $1
   382  	JEQ adx
   383  	MOVQ z+0(FP), R10
   384  	MOVQ x+24(FP), R8
   385  	MOVQ y+48(FP), R9
   386  	MOVQ z_len+8(FP), R11
   387  	MOVQ $0, BX		// i = 0
   388  	MOVQ $0, CX		// c = 0
   389  	MOVQ R11, R12
   390  	ANDQ $-2, R12		// R12 = n with the low bit cleared (end of the 2x loop)
   391  	CMPQ R11, $2
   392  	JAE A6			// n >= 2: run the 2x unrolled loop
   393  	JMP E6
   394  
   395  A6:	// two words per iteration while i < R12
   396  	MOVQ (R8)(BX*8), AX
   397  	MULQ R9			// DX:AX = x[i] * y
   398  	ADDQ (R10)(BX*8), AX	// + z[i]
   399  	ADCQ $0, DX
   400  	ADDQ CX, AX		// + c
   401  	ADCQ $0, DX
   402  	MOVQ DX, CX		// c = high word
   403  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   404  
   405  	MOVQ (8)(R8)(BX*8), AX
   406  	MULQ R9			// DX:AX = x[i+1] * y
   407  	ADDQ (8)(R10)(BX*8), AX
   408  	ADCQ $0, DX
   409  	ADDQ CX, AX
   410  	ADCQ $0, DX
   411  	MOVQ DX, CX
   412  	MOVQ AX, (8)(R10)(BX*8)
   413  
   414  	ADDQ $2, BX		// i += 2
   415  	CMPQ BX, R12
   416  	JL A6
   417  	JMP E6
   418  
   419  L6:	MOVQ (R8)(BX*8), AX	// tail: one word per iteration
   420  	MULQ R9
   421  	ADDQ CX, AX		// low word + c
   422  	ADCQ $0, DX
   423  	ADDQ AX, (R10)(BX*8)	// z[i] += low word
   424  	ADCQ $0, DX
   425  	MOVQ DX, CX		// c = high word
   426  	ADDQ $1, BX		// i++
   427  
   428  E6:	CMPQ BX, R11		// i < n
   429  	JL L6
   430  
   431  	MOVQ CX, c+56(FP)	// return c
   432  	RET
   433  
   434  adx:	// MULXQ path. DX = y (implicit multiplicand of MULXQ), R11 = n, BX = i, CX = c.
   435  	MOVQ z_len+8(FP), R11
   436  	MOVQ z+0(FP), R10
   437  	MOVQ x+24(FP), R8
   438  	MOVQ y+48(FP), DX
   439  	MOVQ $0, BX   // i = 0
   440  	MOVQ $0, CX   // carry
   441  	CMPQ R11, $8
   442  	JAE  adx_loop_header	// n >= 8: run the 8x unrolled loop
   443  	CMPQ BX, R11
   444  	JL adx_short		// 0 < n < 8: word-at-a-time loop only
   445  	MOVQ CX, c+56(FP)	// n <= 0: return c = 0
   446  	RET
   447  
   448  adx_loop_header:
   449  	MOVQ  R11, R13
   450  	ANDQ  $-8, R13		// R13 = n rounded down to a multiple of 8
   451  adx_loop:
        	// Eight words per iteration using two independent carry chains:
        	// ADCXQ (uses CF) adds the previous product's high word (or c) to the
        	// current low word; ADOXQ (uses OF) adds the existing z word.
        	// MULXQ writes the low word to its first register and the high word to
        	// its second, without touching the flags. SI/DI and AX/CX alternate as
        	// the low/high pair, so CX holds the high word after the eighth product.
   452  	XORQ  R9, R9  // unset flags (clears CF and OF; R9 = 0 for the final carry folds)
   453  	MULXQ (R8), SI, DI
   454  	ADCXQ CX,SI
   455  	ADOXQ (R10), SI
   456  	MOVQ  SI,(R10)
   457  
   458  	MULXQ 8(R8), AX, CX
   459  	ADCXQ DI, AX
   460  	ADOXQ 8(R10), AX
   461  	MOVQ  AX, 8(R10)
   462  
   463  	MULXQ 16(R8), SI, DI
   464  	ADCXQ CX, SI
   465  	ADOXQ 16(R10), SI
   466  	MOVQ  SI, 16(R10)
   467  
   468  	MULXQ 24(R8), AX, CX
   469  	ADCXQ DI, AX
   470  	ADOXQ 24(R10), AX
   471  	MOVQ  AX, 24(R10)
   472  
   473  	MULXQ 32(R8), SI, DI
   474  	ADCXQ CX, SI
   475  	ADOXQ 32(R10), SI
   476  	MOVQ  SI, 32(R10)
   477  
   478  	MULXQ 40(R8), AX, CX
   479  	ADCXQ DI, AX
   480  	ADOXQ 40(R10), AX
   481  	MOVQ  AX, 40(R10)
   482  
   483  	MULXQ 48(R8), SI, DI
   484  	ADCXQ CX, SI
   485  	ADOXQ 48(R10), SI
   486  	MOVQ  SI, 48(R10)
   487  
   488  	MULXQ 56(R8), AX, CX
   489  	ADCXQ DI, AX
   490  	ADOXQ 56(R10), AX
   491  	MOVQ  AX, 56(R10)
   492  
   493  	ADCXQ R9, CX		// fold the pending CF into c (R9 = 0)
   494  	ADOXQ R9, CX		// fold the pending OF into c
   495  
   496  	ADDQ $64, R8		// advance both pointers by 8 words;
   497  	ADDQ $64, R10		// the loop body uses fixed offsets, not BX
   498  	ADDQ $8, BX		// i += 8
   499  
   500  	CMPQ BX, R13
   501  	JL adx_loop
   502  	MOVQ z+0(FP), R10	// reload the base pointers: adx_short indexes by BX
   503  	MOVQ x+24(FP), R8
   504  	CMPQ BX, R11
   505  	JL adx_short		// 1..7 words left
   506  	MOVQ CX, c+56(FP)	// return c
   507  	RET
   508  
   509  adx_short:	// one word per iteration; SI = low word, DI = high word
   510  	MULXQ (R8)(BX*8), SI, DI
   511  	ADDQ CX, SI		// low word + c
   512  	ADCQ $0, DI
   513  	ADDQ SI, (R10)(BX*8)	// z[i] += low word
   514  	ADCQ $0, DI
   515  	MOVQ DI, CX		// c = high word
   516  	ADDQ $1, BX		// i++
   517  
   518  	CMPQ BX, R11
   519  	JL adx_short
   520  
   521  	MOVQ CX, c+56(FP)	// return c
   522  	RET
   523  
   524  
   525  
   526  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        // divWVW divides the multi-word value formed by xn (most significant)
        // followed by x[len(z)-1] ... x[0] by the single word y. It walks from the
        // top word down: each step divides r:x[i] by y, stores the quotient word
        // in z[i] and keeps the remainder as the next r. r starts as xn and its
        // final value is returned.
        // Only len(z) is read; x is assumed to hold at least len(z) words
        // (TODO confirm against callers).
        // NOTE(review): no operand checks are made here. DIVQ raises a hardware
        // divide error if y == 0 or if the quotient does not fit in one word,
        // which for the first step means callers must ensure xn < y.
        // Registers: R10 = &z[0], R8 = &x[0], R9 = y, BX = i, DX = r, AX = x[i] / q.
   527  TEXT ·divWVW(SB),NOSPLIT,$0
   528  	MOVQ z+0(FP), R10
   529  	MOVQ xn+24(FP), DX	// r = xn
   530  	MOVQ x+32(FP), R8
   531  	MOVQ y+56(FP), R9
   532  	MOVQ z_len+8(FP), BX	// i = len(z)
   533  	JMP E7			// test the loop condition before the first iteration
   534  
   535  L7:	MOVQ (R8)(BX*8), AX	// low word of the dividend = x[i]; high word = r in DX
   536  	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
   537  	MOVQ AX, (R10)(BX*8)	// z[i] = quotient word
   538  
   539  E7:	SUBQ $1, BX		// i--
   540  	JGE L7			// i >= 0
   541  
   542  	MOVQ DX, r+64(FP)	// return the final remainder
   543  	RET