github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go
     6  // +build !math_big_pure_go
     7  
     8  #include "textflag.h"
     9  
    10  // This file provides fast assembly versions for the elementary
    11  // arithmetic operations on vectors implemented in arith.go.
    12  
    13  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    14  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    15  // This is faster than using rotate instructions.
    16  
    17  // func addVV(z, x, y []Word) (c Word): z[i] = x[i] + y[i] + carry for 0 <= i < len(z), returns the final carry (0 or 1)
    18  TEXT ·addVV(SB),NOSPLIT,$0
    19  	MOVQ z_len+8(FP), DI	// n = len(z), the only length consulted (x and y are indexed without a length check)
    20  	MOVQ x+24(FP), R8	// R8 = base of x
    21  	MOVQ y+48(FP), R9	// R9 = base of y
    22  	MOVQ z+0(FP), R10	// R10 = base of z
    23  
    24  	MOVQ $0, CX		// c = 0 (CX holds the saved carry as 0 or -1 while looping)
    25  	MOVQ $0, SI		// i = 0
    26  
    27  	// s/JL/JMP/ below to disable the unrolled loop
    28  	SUBQ $4, DI		// n -= 4
    29  	JL V1			// if n < 0 goto V1
    30  
    31  U1:	// n >= 0
    32  	// regular loop body unrolled 4x
    33  	ADDQ CX, CX		// restore CF
    34  	MOVQ 0(R8)(SI*8), R11	// R11..R14 = x[i..i+3] (MOVQ leaves CF untouched)
    35  	MOVQ 8(R8)(SI*8), R12
    36  	MOVQ 16(R8)(SI*8), R13
    37  	MOVQ 24(R8)(SI*8), R14
    38  	ADCQ 0(R9)(SI*8), R11	// x[i] + y[i] + CF, the carry ripples through all four words
    39  	ADCQ 8(R9)(SI*8), R12
    40  	ADCQ 16(R9)(SI*8), R13
    41  	ADCQ 24(R9)(SI*8), R14
    42  	MOVQ R11, 0(R10)(SI*8)	// z[i..i+3] = sums (stores do not disturb CF)
    43  	MOVQ R12, 8(R10)(SI*8)
    44  	MOVQ R13, 16(R10)(SI*8)
    45  	MOVQ R14, 24(R10)(SI*8)
    46  	SBBQ CX, CX		// save CF
    47  
    48  	ADDQ $4, SI		// i += 4
    49  	SUBQ $4, DI		// n -= 4
    50  	JGE U1			// if n >= 0 goto U1
    51  
    52  V1:	ADDQ $4, DI		// n += 4 (undo the bias, n is now the number of words left)
    53  	JLE E1			// if n <= 0 goto E1
    54  
    55  L1:	// n > 0
    56  	ADDQ CX, CX		// restore CF
    57  	MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
    58  	ADCQ 0(R9)(SI*8), R11	// R11 = x[i] + y[i] + CF
    59  	MOVQ R11, 0(R10)(SI*8)	// z[i] = R11
    60  	SBBQ CX, CX		// save CF
    61  
    62  	ADDQ $1, SI		// i++
    63  	SUBQ $1, DI		// n--
    64  	JG L1			// if n > 0 goto L1
    65  
    66  E1:	NEGQ CX			// turn the saved carry (0 or -1) into 0 or 1
    67  	MOVQ CX, c+72(FP)	// return c
    68  	RET
    69  
    70  
    71  // func subVV(z, x, y []Word) (c Word): z[i] = x[i] - y[i] - borrow for 0 <= i < len(z), returns the final borrow (0 or 1)
    72  // (same as addVV except for SBBQ instead of ADCQ and label names)
    73  TEXT ·subVV(SB),NOSPLIT,$0
    74  	MOVQ z_len+8(FP), DI	// n = len(z), the only length consulted (x and y are indexed without a length check)
    75  	MOVQ x+24(FP), R8	// R8 = base of x
    76  	MOVQ y+48(FP), R9	// R9 = base of y
    77  	MOVQ z+0(FP), R10	// R10 = base of z
    78  
    79  	MOVQ $0, CX		// c = 0 (CX holds the saved borrow as 0 or -1 while looping)
    80  	MOVQ $0, SI		// i = 0
    81  
    82  	// s/JL/JMP/ below to disable the unrolled loop
    83  	SUBQ $4, DI		// n -= 4
    84  	JL V2			// if n < 0 goto V2
    85  
    86  U2:	// n >= 0
    87  	// regular loop body unrolled 4x
    88  	ADDQ CX, CX		// restore CF
    89  	MOVQ 0(R8)(SI*8), R11	// R11..R14 = x[i..i+3] (MOVQ leaves CF untouched)
    90  	MOVQ 8(R8)(SI*8), R12
    91  	MOVQ 16(R8)(SI*8), R13
    92  	MOVQ 24(R8)(SI*8), R14
    93  	SBBQ 0(R9)(SI*8), R11	// x[i] - y[i] - CF, the borrow ripples through all four words
    94  	SBBQ 8(R9)(SI*8), R12
    95  	SBBQ 16(R9)(SI*8), R13
    96  	SBBQ 24(R9)(SI*8), R14
    97  	MOVQ R11, 0(R10)(SI*8)	// z[i..i+3] = differences (stores do not disturb CF)
    98  	MOVQ R12, 8(R10)(SI*8)
    99  	MOVQ R13, 16(R10)(SI*8)
   100  	MOVQ R14, 24(R10)(SI*8)
   101  	SBBQ CX, CX		// save CF
   102  
   103  	ADDQ $4, SI		// i += 4
   104  	SUBQ $4, DI		// n -= 4
   105  	JGE U2			// if n >= 0 goto U2
   106  
   107  V2:	ADDQ $4, DI		// n += 4 (undo the bias, n is now the number of words left)
   108  	JLE E2			// if n <= 0 goto E2
   109  
   110  L2:	// n > 0
   111  	ADDQ CX, CX		// restore CF
   112  	MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   113  	SBBQ 0(R9)(SI*8), R11	// R11 = x[i] - y[i] - CF
   114  	MOVQ R11, 0(R10)(SI*8)	// z[i] = R11
   115  	SBBQ CX, CX		// save CF
   116  
   117  	ADDQ $1, SI		// i++
   118  	SUBQ $1, DI		// n--
   119  	JG L2			// if n > 0 goto L2
   120  
   121  E2:	NEGQ CX			// turn the saved borrow (0 or -1) into 0 or 1
   122  	MOVQ CX, c+72(FP)	// return c
   123  	RET
   124  
   125  
   126  // func addVW(z, x []Word, y Word) (c Word): z = x + y over len(z) words, returns the carry out (y itself when len(z) == 0)
   127  TEXT ·addVW(SB),NOSPLIT,$0
   128  	MOVQ z_len+8(FP), DI	// n = len(z)
   129  	CMPQ DI, $32
   130  	JG large		// n > 32 (signed compare): hand the whole call to addVWlarge
   131  	MOVQ x+24(FP), R8	// R8 = base of x
   132  	MOVQ y+48(FP), CX	// c = y
   133  	MOVQ z+0(FP), R10	// R10 = base of z
   134  
   135  	MOVQ $0, SI		// i = 0
   136  
   137  	// s/JL/JMP/ below to disable the unrolled loop
   138  	SUBQ $4, DI		// n -= 4
   139  	JL V3			// if n < 0 (fewer than 4 words in total) goto V3
   140  
   141  U3:	// n >= 0
   142  	// regular loop body unrolled 4x
   143  	MOVQ 0(R8)(SI*8), R11	// R11..R14 = x[i..i+3]
   144  	MOVQ 8(R8)(SI*8), R12
   145  	MOVQ 16(R8)(SI*8), R13
   146  	MOVQ 24(R8)(SI*8), R14
   147  	ADDQ CX, R11		// x[i] + c, sets CF
   148  	ADCQ $0, R12		// propagate the carry through the next three words
   149  	ADCQ $0, R13
   150  	ADCQ $0, R14
   151  	SBBQ CX, CX		// save CF
   152  	NEGQ CX			// c = 0 or 1, kept as a plain word here because it is added as a value next round
   153  	MOVQ R11, 0(R10)(SI*8)	// z[i..i+3] = sums
   154  	MOVQ R12, 8(R10)(SI*8)
   155  	MOVQ R13, 16(R10)(SI*8)
   156  	MOVQ R14, 24(R10)(SI*8)
   157  
   158  	ADDQ $4, SI		// i += 4
   159  	SUBQ $4, DI		// n -= 4
   160  	JGE U3			// if n >= 0 goto U3
   161  
   162  V3:	ADDQ $4, DI		// n += 4 (undo the bias, n is now the number of words left)
   163  	JLE E3			// if n <= 0 goto E3
   164  
   165  L3:	// n > 0
   166  	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c, sets CF
   167  	MOVQ CX, 0(R10)(SI*8)	// z[i] = CX (MOVQ leaves CF untouched)
   168  	SBBQ CX, CX		// save CF
   169  	NEGQ CX			// c = 0 or 1
   170  
   171  	ADDQ $1, SI		// i++
   172  	SUBQ $1, DI		// n--
   173  	JG L3			// if n > 0 goto L3
   174  
   175  E3:	MOVQ CX, c+56(FP)	// return c
   176  	RET
   177  large:
   178  	JMP ·addVWlarge(SB)	// tail jump, nothing is written before it; addVWlarge is defined outside this section
   179  
   180  
   181  // func subVW(z, x []Word, y Word) (c Word): z = x - y over len(z) words, returns the borrow out (y itself when len(z) == 0)
   182  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
   183  TEXT ·subVW(SB),NOSPLIT,$0
   184  	MOVQ z_len+8(FP), DI	// n = len(z)
   185  	CMPQ DI, $32
   186  	JG large		// n > 32 (signed compare): hand the whole call to subVWlarge
   187  	MOVQ x+24(FP), R8	// R8 = base of x
   188  	MOVQ y+48(FP), CX	// c = y
   189  	MOVQ z+0(FP), R10	// R10 = base of z
   190  
   191  	MOVQ $0, SI		// i = 0
   192  
   193  	// s/JL/JMP/ below to disable the unrolled loop
   194  	SUBQ $4, DI		// n -= 4
   195  	JL V4			// if n < 0 (fewer than 4 words in total) goto V4
   196  
   197  U4:	// n >= 0
   198  	// regular loop body unrolled 4x
   199  	MOVQ 0(R8)(SI*8), R11	// R11..R14 = x[i..i+3]
   200  	MOVQ 8(R8)(SI*8), R12
   201  	MOVQ 16(R8)(SI*8), R13
   202  	MOVQ 24(R8)(SI*8), R14
   203  	SUBQ CX, R11		// x[i] - c, sets CF on borrow
   204  	SBBQ $0, R12		// propagate the borrow through the next three words
   205  	SBBQ $0, R13
   206  	SBBQ $0, R14
   207  	SBBQ CX, CX		// save CF
   208  	NEGQ CX			// c = 0 or 1, kept as a plain word here because it is subtracted as a value next round
   209  	MOVQ R11, 0(R10)(SI*8)	// z[i..i+3] = differences
   210  	MOVQ R12, 8(R10)(SI*8)
   211  	MOVQ R13, 16(R10)(SI*8)
   212  	MOVQ R14, 24(R10)(SI*8)
   213  
   214  	ADDQ $4, SI		// i += 4
   215  	SUBQ $4, DI		// n -= 4
   216  	JGE U4			// if n >= 0 goto U4
   217  
   218  V4:	ADDQ $4, DI		// n += 4 (undo the bias, n is now the number of words left)
   219  	JLE E4			// if n <= 0 goto E4
   220  
   221  L4:	// n > 0
   222  	MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   223  	SUBQ CX, R11		// R11 = x[i] - c, sets CF on borrow
   224  	MOVQ R11, 0(R10)(SI*8)	// z[i] = R11 (MOVQ leaves CF untouched)
   225  	SBBQ CX, CX		// save CF
   226  	NEGQ CX			// c = 0 or 1
   227  
   228  	ADDQ $1, SI		// i++
   229  	SUBQ $1, DI		// n--
   230  	JG L4			// if n > 0 goto L4
   231  
   232  E4:	MOVQ CX, c+56(FP)	// return c
   233  	RET
   234  large:
   235  	JMP ·subVWlarge(SB)	// tail jump, nothing is written before it; subVWlarge is defined outside this section
   236  
   237  
   238  // func shlVU(z, x []Word, s uint) (c Word): z = x << s over len(z) words, c = the bits shifted out of x[n-1] (0 when len(z) == 0)
   239  TEXT ·shlVU(SB),NOSPLIT,$0
   240  	MOVQ z_len+8(FP), BX	// i = n = len(z)
   241  	SUBQ $1, BX		// i--
   242  	JL X8b			// i < 0	(n <= 0)
   243  
   244  	// n > 0
   245  	MOVQ z+0(FP), R10	// R10 = base of z
   246  	MOVQ x+24(FP), R8	// R8 = base of x
   247  	MOVQ s+48(FP), CX	// shift count (the shift instructions below take it from CX)
   248  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   249  	MOVQ $0, DX
   250  	SHLQ CX, AX, DX		// w1>>ŝ (double shift: DX = DX<<s with the top s bits of AX shifted in, ŝ = 64-s)
   251  	MOVQ DX, c+56(FP)	// c = bits shifted out of the top word
   252  
   253  	CMPQ BX, $0
   254  	JLE X8a			// i <= 0
   255  
   256  	// i > 0
   257  L8:	MOVQ AX, DX		// w = w1
   258  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   259  	SHLQ CX, AX, DX		// w<<s | w1>>ŝ
   260  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   261  	SUBQ $1, BX		// i--
   262  	JG L8			// i > 0
   263  
   264  	// i <= 0
   265  X8a:	SHLQ CX, AX		// w1<<s
   266  	MOVQ AX, (R10)		// z[0] = w1<<s
   267  	RET
   268  
   269  X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   270  	RET
   271  
   272  
   273  // func shrVU(z, x []Word, s uint) (c Word): z = x >> s over len(z) words, c = the bits shifted out of x[0] (0 when len(z) == 0)
   274  TEXT ·shrVU(SB),NOSPLIT,$0
   275  	MOVQ z_len+8(FP), R11	// n = len(z)
   276  	SUBQ $1, R11		// n--
   277  	JL X9b			// n < 0	(n <= 0)
   278  
   279  	// n > 0
   280  	MOVQ z+0(FP), R10	// R10 = base of z
   281  	MOVQ x+24(FP), R8	// R8 = base of x
   282  	MOVQ s+48(FP), CX	// shift count (the shift instructions below take it from CX)
   283  	MOVQ (R8), AX		// w1 = x[0]
   284  	MOVQ $0, DX
   285  	SHRQ CX, AX, DX		// w1<<ŝ (double shift: DX = DX>>s with the low s bits of AX shifted in at the top, ŝ = 64-s)
   286  	MOVQ DX, c+56(FP)	// c = bits shifted out of the bottom word
   287  
   288  	MOVQ $0, BX		// i = 0
   289  	JMP E9			// test the loop condition first
   290  
   291  	// i < n-1
   292  L9:	MOVQ AX, DX		// w = w1
   293  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   294  	SHRQ CX, AX, DX		// w>>s | w1<<ŝ
   295  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   296  	ADDQ $1, BX		// i++
   297  
   298  E9:	CMPQ BX, R11		// R11 holds the original n minus 1
   299  	JL L9			// i < n-1
   300  
   301  	// i >= n-1
   302  X9a:	SHRQ CX, AX		// w1>>s
   303  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   304  	RET
   305  
   306  X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   307  	RET
   308  
   309  
   310  // func mulAddVWW(z, x []Word, y, r Word) (c Word): z = x*y + r over len(z) words, returns the final high (carry) word
   311  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   312  	MOVQ z+0(FP), R10	// R10 = base of z
   313  	MOVQ x+24(FP), R8	// R8 = base of x
   314  	MOVQ y+48(FP), R9	// R9 = multiplier y
   315  	MOVQ r+56(FP), CX	// c = r
   316  	MOVQ z_len+8(FP), R11	// n = len(z)
   317  	MOVQ $0, BX		// i = 0
   318  
   319  	CMPQ R11, $4
   320  	JL E5			// fewer than 4 words: skip the unrolled loop
   321  
   322  U5:	// i+4 <= n
   323  	// regular loop body unrolled 4x
   324  	MOVQ (0*8)(R8)(BX*8), AX	// AX = x[i]
   325  	MULQ R9			// DX:AX = x[i] * y
   326  	ADDQ CX, AX		// low word += c
   327  	ADCQ $0, DX		// carry into the high word
   328  	MOVQ AX, (0*8)(R10)(BX*8)	// z[i] = low word
   329  	MOVQ DX, CX		// c = high word
   330  	MOVQ (1*8)(R8)(BX*8), AX	// same step for x[i+1]
   331  	MULQ R9
   332  	ADDQ CX, AX
   333  	ADCQ $0, DX
   334  	MOVQ AX, (1*8)(R10)(BX*8)
   335  	MOVQ DX, CX
   336  	MOVQ (2*8)(R8)(BX*8), AX	// same step for x[i+2]
   337  	MULQ R9
   338  	ADDQ CX, AX
   339  	ADCQ $0, DX
   340  	MOVQ AX, (2*8)(R10)(BX*8)
   341  	MOVQ DX, CX
   342  	MOVQ (3*8)(R8)(BX*8), AX	// same step for x[i+3]
   343  	MULQ R9
   344  	ADDQ CX, AX
   345  	ADCQ $0, DX
   346  	MOVQ AX, (3*8)(R10)(BX*8)
   347  	MOVQ DX, CX
   348  	ADDQ $4, BX		// i += 4
   349  
   350  	LEAQ 4(BX), DX		// DX = i+4 (DX is free here, the carry already lives in CX)
   351  	CMPQ DX, R11
   352  	JLE U5			// another full group of 4 fits: i+4 <= n
   353  	JMP E5
   354  
   355  L5:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   356  	MULQ R9			// DX:AX = x[i] * y
   357  	ADDQ CX, AX		// low word += c
   358  	ADCQ $0, DX		// carry into the high word
   359  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   360  	MOVQ DX, CX		// c = high word
   361  	ADDQ $1, BX		// i++
   362  
   363  E5:	CMPQ BX, R11		// i < n
   364  	JL L5
   365  
   366  	MOVQ CX, c+64(FP)	// return c
   367  	RET
   368  
   369  
   370  // func addMulVVW(z, x []Word, y Word) (c Word): z += x*y over len(z) words, returns the final carry word
   371  TEXT ·addMulVVW(SB),NOSPLIT,$0
   372  	CMPB ·support_adx(SB), $1	// flag defined outside this section, 1 selects the MULXQ/ADCXQ/ADOXQ path
   373  	JEQ adx
   374  	MOVQ z+0(FP), R10	// R10 = base of z
   375  	MOVQ x+24(FP), R8	// R8 = base of x
   376  	MOVQ y+48(FP), R9	// R9 = multiplier y
   377  	MOVQ z_len+8(FP), R11	// n = len(z)
   378  	MOVQ $0, BX		// i = 0
   379  	MOVQ $0, CX		// c = 0
   380  	MOVQ R11, R12
   381  	ANDQ $-2, R12		// R12 = n with the low bit cleared (end of the 2x unrolled part)
   382  	CMPQ R11, $2
   383  	JAE A6			// at least two words: run the 2x unrolled loop
   384  	JMP E6
   385  
   386  A6:
   387  	MOVQ (R8)(BX*8), AX	// AX = x[i]
   388  	MULQ R9			// DX:AX = x[i] * y
   389  	ADDQ (R10)(BX*8), AX	// low word += z[i]
   390  	ADCQ $0, DX		// carry into the high word
   391  	ADDQ CX, AX		// low word += c
   392  	ADCQ $0, DX		// carry into the high word
   393  	MOVQ DX, CX		// c = high word
   394  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   395  
   396  	MOVQ (8)(R8)(BX*8), AX	// same step for x[i+1] and z[i+1]
   397  	MULQ R9
   398  	ADDQ (8)(R10)(BX*8), AX
   399  	ADCQ $0, DX
   400  	ADDQ CX, AX
   401  	ADCQ $0, DX
   402  	MOVQ DX, CX
   403  	MOVQ AX, (8)(R10)(BX*8)
   404  
   405  	ADDQ $2, BX		// i += 2
   406  	CMPQ BX, R12
   407  	JL A6			// more pairs left
   408  	JMP E6
   409  
   410  L6:	MOVQ (R8)(BX*8), AX	// AX = x[i]
   411  	MULQ R9			// DX:AX = x[i] * y
   412  	ADDQ CX, AX		// low word += c
   413  	ADCQ $0, DX		// carry into the high word
   414  	ADDQ AX, (R10)(BX*8)	// z[i] += low word (read-modify-write in memory)
   415  	ADCQ $0, DX		// carry into the high word
   416  	MOVQ DX, CX		// c = high word
   417  	ADDQ $1, BX		// i++
   418  
   419  E6:	CMPQ BX, R11		// i < n
   420  	JL L6
   421  
   422  	MOVQ CX, c+56(FP)	// return c
   423  	RET
   424  
   425  adx:
   426  	MOVQ z_len+8(FP), R11	// n = len(z)
   427  	MOVQ z+0(FP), R10	// R10 = base of z
   428  	MOVQ x+24(FP), R8	// R8 = base of x
   429  	MOVQ y+48(FP), DX	// y goes in DX, the implicit factor of MULXQ
   430  	MOVQ $0, BX   // i = 0
   431  	MOVQ $0, CX   // carry
   432  	CMPQ R11, $8
   433  	JAE  adx_loop_header	// at least 8 words: run the 8x unrolled loop
   434  	CMPQ BX, R11
   435  	JL adx_short		// 0 < n < 8: word-at-a-time loop only
   436  	MOVQ CX, c+56(FP)	// n <= 0: return c = 0
   437  	RET
   438  
   439  adx_loop_header:
   440  	MOVQ  R11, R13
   441  	ANDQ  $-8, R13		// R13 = n with the low three bits cleared (end of the 8x unrolled part)
   442  adx_loop:
   443  	XORQ  R9, R9  // unset flags: R9 = 0 and CF = OF = 0, so both carry chains start clean
   444  	MULXQ (R8), SI, DI	// DI:SI = x[i] * y (MULXQ does not modify flags)
   445  	ADCXQ CX,SI		// CF chain: low word += c
   446  	ADOXQ (R10), SI		// OF chain: low word += z[i]
   447  	MOVQ  SI,(R10)		// z[i] = low word
   448  
   449  	MULXQ 8(R8), AX, CX	// CX:AX = x[i+1] * y, destination pairs alternate between DI:SI and CX:AX
   450  	ADCXQ DI, AX		// CF chain: += previous high word + CF
   451  	ADOXQ 8(R10), AX	// OF chain: += z[i+1] + OF
   452  	MOVQ  AX, 8(R10)
   453  
   454  	MULXQ 16(R8), SI, DI
   455  	ADCXQ CX, SI
   456  	ADOXQ 16(R10), SI
   457  	MOVQ  SI, 16(R10)
   458  
   459  	MULXQ 24(R8), AX, CX
   460  	ADCXQ DI, AX
   461  	ADOXQ 24(R10), AX
   462  	MOVQ  AX, 24(R10)
   463  
   464  	MULXQ 32(R8), SI, DI
   465  	ADCXQ CX, SI
   466  	ADOXQ 32(R10), SI
   467  	MOVQ  SI, 32(R10)
   468  
   469  	MULXQ 40(R8), AX, CX
   470  	ADCXQ DI, AX
   471  	ADOXQ 40(R10), AX
   472  	MOVQ  AX, 40(R10)
   473  
   474  	MULXQ 48(R8), SI, DI
   475  	ADCXQ CX, SI
   476  	ADOXQ 48(R10), SI
   477  	MOVQ  SI, 48(R10)
   478  
   479  	MULXQ 56(R8), AX, CX	// last word of the group, its high word stays in CX as the next carry
   480  	ADCXQ DI, AX
   481  	ADOXQ 56(R10), AX
   482  	MOVQ  AX, 56(R10)
   483  
   484  	ADCXQ R9, CX		// fold the pending CF into c (R9 is 0)
   485  	ADOXQ R9, CX		// fold the pending OF into c
   486  
   487  	ADDQ $64, R8		// advance x by 8 words (flags may be clobbered now, both carries are in CX)
   488  	ADDQ $64, R10		// advance z by 8 words
   489  	ADDQ $8, BX		// i += 8
   490  
   491  	CMPQ BX, R13
   492  	JL adx_loop		// another full group of 8 left
   493  	MOVQ z+0(FP), R10	// reload the base pointers: the tail loop indexes from the start with BX
   494  	MOVQ x+24(FP), R8
   495  	CMPQ BX, R11
   496  	JL adx_short		// leftover words (n not a multiple of 8)
   497  	MOVQ CX, c+56(FP)	// return c
   498  	RET
   499  
   500  adx_short:
   501  	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i] * y
   502  	ADDQ CX, SI		// low word += c
   503  	ADCQ $0, DI		// carry into the high word
   504  	ADDQ SI, (R10)(BX*8)	// z[i] += low word (read-modify-write in memory)
   505  	ADCQ $0, DI		// carry into the high word
   506  	MOVQ DI, CX		// c = high word
   507  	ADDQ $1, BX		// i++
   508  
   509  	CMPQ BX, R11
   510  	JL adx_short		// i < n
   511  
   512  	MOVQ CX, c+56(FP)	// return c
   513  	RET
   514  
   515  
   516