github.com/c9s/go@v0.0.0-20180120015821-984e81f64e0c/src/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        //
        // mulWW returns the full double-word unsigned product x*y:
        // z1 receives the high word and z0 the low word.
    13  TEXT ·mulWW(SB),NOSPLIT,$0
    14  	MOVQ y+8(FP), AX	// AX = y
    15  	MULQ x+0(FP)		// DX:AX = AX * x
    16  	MOVQ AX, z0+24(FP)	// z0 = low word of the product
    17  	MOVQ DX, z1+16(FP)	// z1 = high word of the product
    18  	RET
    19  
    20  
    21  // func divWW(x1, x0, y Word) (q, r Word)
        //
        // divWW divides the double-word value x1:x0 by y and returns the
        // quotient q and the remainder r.
        // NOTE(review): DIVQ faults when y == 0 or when the quotient does not
        // fit in one word (x1 >= y); callers are presumably required to
        // guarantee x1 < y - confirm against arith.go.
    22  TEXT ·divWW(SB),NOSPLIT,$0
    23  	MOVQ x0+8(FP), AX	// AX = low word of the dividend
    24  	MOVQ x1+0(FP), DX	// DX = high word of the dividend
    25  	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26  	MOVQ DX, r+32(FP)	// r = remainder
    27  	MOVQ AX, q+24(FP)	// q = quotient
    28  	RET
    29  
    30  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32  // This is faster than using rotate instructions.
    33  
    34  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV computes z[i] = x[i] + y[i] + carry for i in [0, len(z)) and
        // returns the carry out of the last word (0 or 1) in c.
        // Only len(z) is read; x and y are indexed with the same i, so they
        // are presumably required to be at least as long as z - confirm
        // against the callers.
        //
        // Register use:
        //	DI  = n, number of words still to process
        //	SI  = i, current word index
        //	R8  = &x[0], R9 = &y[0], R10 = &z[0]
        //	CX  = saved carry flag (0 or -1) between loop iterations
        //	R11-R14 = scratch words
    35  TEXT ·addVV(SB),NOSPLIT,$0
    36  	MOVQ z_len+8(FP), DI
    37  	MOVQ x+24(FP), R8
    38  	MOVQ y+48(FP), R9
    39  	MOVQ z+0(FP), R10
    40  
    41  	MOVQ $0, CX		// c = 0
    42  	MOVQ $0, SI		// i = 0
    43  
    44  	// s/JL/JMP/ below to disable the unrolled loop
    45  	SUBQ $4, DI		// n -= 4
    46  	JL V1			// if n < 0 goto V1 (fewer than 4 words)
    47  
    48  U1:	// n >= 0
    49  	// regular loop body unrolled 4x
    50  	ADDQ CX, CX		// restore CF
    51  	MOVQ 0(R8)(SI*8), R11
    52  	MOVQ 8(R8)(SI*8), R12
    53  	MOVQ 16(R8)(SI*8), R13
    54  	MOVQ 24(R8)(SI*8), R14
    55  	ADCQ 0(R9)(SI*8), R11	// the four ADCQs form one carry chain
    56  	ADCQ 8(R9)(SI*8), R12
    57  	ADCQ 16(R9)(SI*8), R13
    58  	ADCQ 24(R9)(SI*8), R14
    59  	MOVQ R11, 0(R10)(SI*8)	// MOVQ leaves CF untouched
    60  	MOVQ R12, 8(R10)(SI*8)
    61  	MOVQ R13, 16(R10)(SI*8)
    62  	MOVQ R14, 24(R10)(SI*8)
    63  	SBBQ CX, CX		// save CF (the ADDQ/SUBQ below clobber it)
    64  
    65  	ADDQ $4, SI		// i += 4
    66  	SUBQ $4, DI		// n -= 4
    67  	JGE U1			// if n >= 0 goto U1
    68  
    69  V1:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
    70  	JLE E1			// if n <= 0 goto E1
    71  
    72  L1:	// n > 0
        	// one word per iteration for the remaining words
    73  	ADDQ CX, CX		// restore CF
    74  	MOVQ 0(R8)(SI*8), R11
    75  	ADCQ 0(R9)(SI*8), R11
    76  	MOVQ R11, 0(R10)(SI*8)
    77  	SBBQ CX, CX		// save CF
    78  
    79  	ADDQ $1, SI		// i++
    80  	SUBQ $1, DI		// n--
    81  	JG L1			// if n > 0 goto L1
    82  
    83  E1:	NEGQ CX			// turn the saved carry (0 or -1) into 0 or 1
    84  	MOVQ CX, c+72(FP)	// return c
    85  	RET
    86  
    87  
    88  // func subVV(z, x, y []Word) (c Word)
    89  // (same as addVV except for SBBQ instead of ADCQ and label names)
        //
        // subVV computes z[i] = x[i] - y[i] - borrow for i in [0, len(z)) and
        // returns the borrow out of the last word (0 or 1) in c.
        // Only len(z) is read; x and y are indexed with the same i, so they
        // are presumably required to be at least as long as z - confirm
        // against the callers.
        //
        // Register use:
        //	DI  = n, number of words still to process
        //	SI  = i, current word index
        //	R8  = &x[0], R9 = &y[0], R10 = &z[0]
        //	CX  = saved carry (borrow) flag, 0 or -1, between iterations
        //	R11-R14 = scratch words
    90  TEXT ·subVV(SB),NOSPLIT,$0
    91  	MOVQ z_len+8(FP), DI
    92  	MOVQ x+24(FP), R8
    93  	MOVQ y+48(FP), R9
    94  	MOVQ z+0(FP), R10
    95  
    96  	MOVQ $0, CX		// c = 0
    97  	MOVQ $0, SI		// i = 0
    98  
    99  	// s/JL/JMP/ below to disable the unrolled loop
   100  	SUBQ $4, DI		// n -= 4
   101  	JL V2			// if n < 0 goto V2 (fewer than 4 words)
   102  
   103  U2:	// n >= 0
   104  	// regular loop body unrolled 4x
   105  	ADDQ CX, CX		// restore CF
   106  	MOVQ 0(R8)(SI*8), R11
   107  	MOVQ 8(R8)(SI*8), R12
   108  	MOVQ 16(R8)(SI*8), R13
   109  	MOVQ 24(R8)(SI*8), R14
   110  	SBBQ 0(R9)(SI*8), R11	// the four SBBQs form one borrow chain
   111  	SBBQ 8(R9)(SI*8), R12
   112  	SBBQ 16(R9)(SI*8), R13
   113  	SBBQ 24(R9)(SI*8), R14
   114  	MOVQ R11, 0(R10)(SI*8)	// MOVQ leaves CF untouched
   115  	MOVQ R12, 8(R10)(SI*8)
   116  	MOVQ R13, 16(R10)(SI*8)
   117  	MOVQ R14, 24(R10)(SI*8)
   118  	SBBQ CX, CX		// save CF (the ADDQ/SUBQ below clobber it)
   119  
   120  	ADDQ $4, SI		// i += 4
   121  	SUBQ $4, DI		// n -= 4
   122  	JGE U2			// if n >= 0 goto U2
   123  
   124  V2:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
   125  	JLE E2			// if n <= 0 goto E2
   126  
   127  L2:	// n > 0
        	// one word per iteration for the remaining words
   128  	ADDQ CX, CX		// restore CF
   129  	MOVQ 0(R8)(SI*8), R11
   130  	SBBQ 0(R9)(SI*8), R11
   131  	MOVQ R11, 0(R10)(SI*8)
   132  	SBBQ CX, CX		// save CF
   133  
   134  	ADDQ $1, SI		// i++
   135  	SUBQ $1, DI		// n--
   136  	JG L2			// if n > 0 goto L2
   137  
   138  E2:	NEGQ CX			// turn the saved borrow (0 or -1) into 0 or 1
   139  	MOVQ CX, c+72(FP)	// return c
   140  	RET
   141  
   142  
   143  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the vector x, storing the result in
        // z[0:len(z)], and returns the carry out of the last word in c.
        // y is used as the initial carry-in, so for len(z) == 0 the value
        // returned is y itself.
        // Only len(z) is read; x is presumably at least as long as z -
        // confirm against the callers.
        //
        // Register use:
        //	DI  = n, number of words still to process
        //	SI  = i, current word index
        //	R8  = &x[0], R10 = &z[0]
        //	CX  = c, the word to add next (y at first, then a 0/1 carry)
        //	R11-R14 = scratch words
   144  TEXT ·addVW(SB),NOSPLIT,$0
   145  	MOVQ z_len+8(FP), DI
   146  	MOVQ x+24(FP), R8
   147  	MOVQ y+48(FP), CX	// c = y
   148  	MOVQ z+0(FP), R10
   149  
   150  	MOVQ $0, SI		// i = 0
   151  
   152  	// s/JL/JMP/ below to disable the unrolled loop
   153  	SUBQ $4, DI		// n -= 4
   154  	JL V3			// if n < 0 goto V3 (fewer than 4 words)
   155  
   156  U3:	// n >= 0
   157  	// regular loop body unrolled 4x
   158  	MOVQ 0(R8)(SI*8), R11
   159  	MOVQ 8(R8)(SI*8), R12
   160  	MOVQ 16(R8)(SI*8), R13
   161  	MOVQ 24(R8)(SI*8), R14
   162  	ADDQ CX, R11		// add c to the lowest word ...
   163  	ADCQ $0, R12		// ... and ripple the carry through the rest
   164  	ADCQ $0, R13
   165  	ADCQ $0, R14
   166  	SBBQ CX, CX		// save CF
   167  	NEGQ CX			// c = 0 or 1
   168  	MOVQ R11, 0(R10)(SI*8)
   169  	MOVQ R12, 8(R10)(SI*8)
   170  	MOVQ R13, 16(R10)(SI*8)
   171  	MOVQ R14, 24(R10)(SI*8)
   172  
   173  	ADDQ $4, SI		// i += 4
   174  	SUBQ $4, DI		// n -= 4
   175  	JGE U3			// if n >= 0 goto U3
   176  
   177  V3:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
   178  	JLE E3			// if n <= 0 goto E3
   179  
   180  L3:	// n > 0
   181  	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c
   182  	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c (MOVQ leaves CF untouched)
   183  	SBBQ CX, CX		// save CF
   184  	NEGQ CX			// c = 0 or 1
   185  
   186  	ADDQ $1, SI		// i++
   187  	SUBQ $1, DI		// n--
   188  	JG L3			// if n > 0 goto L3
   189  
   190  E3:	MOVQ CX, c+56(FP)	// return c
   191  	RET
   192  
   193  
   194  // func subVW(z, x []Word, y Word) (c Word)
   195  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        //
        // subVW subtracts the single word y from the vector x, storing the
        // result in z[0:len(z)], and returns the borrow out of the last word
        // in c. y is used as the initial borrow-in, so for len(z) == 0 the
        // value returned is y itself.
        // Only len(z) is read; x is presumably at least as long as z -
        // confirm against the callers.
        //
        // Register use:
        //	DI  = n, number of words still to process
        //	SI  = i, current word index
        //	R8  = &x[0], R10 = &z[0]
        //	CX  = c, the word to subtract next (y at first, then a 0/1 borrow)
        //	R11-R14 = scratch words
   196  TEXT ·subVW(SB),NOSPLIT,$0
   197  	MOVQ z_len+8(FP), DI
   198  	MOVQ x+24(FP), R8
   199  	MOVQ y+48(FP), CX	// c = y
   200  	MOVQ z+0(FP), R10
   201  
   202  	MOVQ $0, SI		// i = 0
   203  
   204  	// s/JL/JMP/ below to disable the unrolled loop
   205  	SUBQ $4, DI		// n -= 4
   206  	JL V4			// if n < 0 goto V4 (fewer than 4 words)
   207  
   208  U4:	// n >= 0
   209  	// regular loop body unrolled 4x
   210  	MOVQ 0(R8)(SI*8), R11
   211  	MOVQ 8(R8)(SI*8), R12
   212  	MOVQ 16(R8)(SI*8), R13
   213  	MOVQ 24(R8)(SI*8), R14
   214  	SUBQ CX, R11		// subtract c from the lowest word ...
   215  	SBBQ $0, R12		// ... and ripple the borrow through the rest
   216  	SBBQ $0, R13
   217  	SBBQ $0, R14
   218  	SBBQ CX, CX		// save CF
   219  	NEGQ CX			// c = 0 or 1
   220  	MOVQ R11, 0(R10)(SI*8)
   221  	MOVQ R12, 8(R10)(SI*8)
   222  	MOVQ R13, 16(R10)(SI*8)
   223  	MOVQ R14, 24(R10)(SI*8)
   224  
   225  	ADDQ $4, SI		// i += 4
   226  	SUBQ $4, DI		// n -= 4
   227  	JGE U4			// if n >= 0 goto U4
   228  
   229  V4:	ADDQ $4, DI		// n += 4 (undo the bias; n = words left, 0..3)
   230  	JLE E4			// if n <= 0 goto E4
   231  
   232  L4:	// n > 0
   233  	MOVQ 0(R8)(SI*8), R11
   234  	SUBQ CX, R11		// R11 = x[i] - c
   235  	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c (MOVQ leaves CF untouched)
   236  	SBBQ CX, CX		// save CF
   237  	NEGQ CX			// c = 0 or 1
   238  
   239  	ADDQ $1, SI		// i++
   240  	SUBQ $1, DI		// n--
   241  	JG L4			// if n > 0 goto L4
   242  
   243  E4:	MOVQ CX, c+56(FP)	// return c
   244  	RET
   245  
   246  
   247  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU shifts the vector x left by s bits, storing the result in
        // z[0:len(z)], and returns in c the bits shifted out of the top word
        // x[len(z)-1]. For len(z) == 0 it returns 0 and touches no memory.
        // Words are processed from the highest index down to index 0.
        // In the comments below ŝ stands for 64-s.
        // NOTE(review): the hardware uses only the low 6 bits of the shift
        // count, so s is presumably required to be < 64 - confirm against
        // the callers.
        //
        // Register use:
        //	BX  = i, current word index (counts down)
        //	R8  = &x[0], R10 = &z[0]
        //	CX  = s
        //	AX  = w1, the lower of the two words being combined
        //	DX  = w, the upper word / combined result
   248  TEXT ·shlVU(SB),NOSPLIT,$0
   249  	MOVQ z_len+8(FP), BX	// i = len(z)
   250  	SUBQ $1, BX		// i--
   251  	JL X8b			// i < 0	(n <= 0)
   252  
   253  	// n > 0
   254  	MOVQ z+0(FP), R10
   255  	MOVQ x+24(FP), R8
   256  	MOVQ s+48(FP), CX
   257  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   258  	MOVQ $0, DX
   259  	SHLQ CX, DX:AX		// w1>>ŝ (double-word shift: DX = DX<<s | AX>>ŝ, AX unchanged)
   260  	MOVQ DX, c+56(FP)
   261  
   262  	CMPQ BX, $0
   263  	JLE X8a			// i <= 0
   264  
   265  	// i > 0
   266  L8:	MOVQ AX, DX		// w = w1
   267  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   268  	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   269  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   270  	SUBQ $1, BX		// i--
   271  	JG L8			// i > 0
   272  
   273  	// i <= 0
   274  X8a:	SHLQ CX, AX		// w1<<s
   275  	MOVQ AX, (R10)		// z[0] = w1<<s
   276  	RET
   277  
   278  X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   279  	RET
   280  
   281  
   282  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU shifts the vector x right by s bits, storing the result in
        // z[0:len(z)], and returns in c the bits shifted out of the bottom
        // word x[0], positioned at the top of the word (x[0]<<ŝ).
        // For len(z) == 0 it returns 0 and touches no memory.
        // Words are processed from index 0 up to the highest index.
        // In the comments below ŝ stands for 64-s.
        // NOTE(review): the hardware uses only the low 6 bits of the shift
        // count, so s is presumably required to be < 64 - confirm against
        // the callers.
        //
        // Register use:
        //	R11 = n-1, index of the last word
        //	BX  = i, current word index (counts up)
        //	R8  = &x[0], R10 = &z[0]
        //	CX  = s
        //	AX  = w1, the higher of the two words being combined
        //	DX  = w, the lower word / combined result
   283  TEXT ·shrVU(SB),NOSPLIT,$0
   284  	MOVQ z_len+8(FP), R11
   285  	SUBQ $1, R11		// n--
   286  	JL X9b			// n < 0	(n <= 0)
   287  
   288  	// n > 0
   289  	MOVQ z+0(FP), R10
   290  	MOVQ x+24(FP), R8
   291  	MOVQ s+48(FP), CX
   292  	MOVQ (R8), AX		// w1 = x[0]
   293  	MOVQ $0, DX
   294  	SHRQ CX, DX:AX		// w1<<ŝ (double-word shift: DX = DX>>s | AX<<ŝ, AX unchanged)
   295  	MOVQ DX, c+56(FP)
   296  
   297  	MOVQ $0, BX		// i = 0
   298  	JMP E9
   299  
   300  	// i < n-1
   301  L9:	MOVQ AX, DX		// w = w1
   302  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   303  	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   304  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   305  	ADDQ $1, BX		// i++
   306  
   307  E9:	CMPQ BX, R11
   308  	JL L9			// i < n-1
   309  
   310  	// i >= n-1
   311  X9a:	SHRQ CX, AX		// w1>>s
   312  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   313  	RET
   314  
   315  X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   316  	RET
   317  
   318  
   319  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW computes, for i in [0, len(z)), the double-word value
        // x[i]*y + c, stores its low word in z[i] and keeps its high word as
        // the carry c for the next index. The carry starts as r, and the
        // final carry is returned.
        // Only len(z) is read; x is presumably at least as long as z -
        // confirm against the callers.
        //
        // Register use:
        //	R10 = &z[0], R8 = &x[0], R9 = y
        //	R11 = n = len(z)
        //	BX  = i, current word index
        //	CX  = c, running carry word
        //	DX:AX = product written by MULQ (DX also reused as a temporary)
   320  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   321  	MOVQ z+0(FP), R10
   322  	MOVQ x+24(FP), R8
   323  	MOVQ y+48(FP), R9
   324  	MOVQ r+56(FP), CX	// c = r
   325  	MOVQ z_len+8(FP), R11
   326  	MOVQ $0, BX		// i = 0
   327  	
   328  	CMPQ R11, $4
   329  	JL E5			// fewer than 4 words: skip the unrolled loop
   330  	
   331  U5:	// i+4 <= n
   332  	// regular loop body unrolled 4x
   333  	MOVQ (0*8)(R8)(BX*8), AX
   334  	MULQ R9			// DX:AX = x[i] * y
   335  	ADDQ CX, AX		// add the incoming carry to the low word
   336  	ADCQ $0, DX		// and propagate into the high word
   337  	MOVQ AX, (0*8)(R10)(BX*8)
   338  	MOVQ DX, CX		// high word becomes the next carry
   339  	MOVQ (1*8)(R8)(BX*8), AX
   340  	MULQ R9
   341  	ADDQ CX, AX
   342  	ADCQ $0, DX
   343  	MOVQ AX, (1*8)(R10)(BX*8)
   344  	MOVQ DX, CX
   345  	MOVQ (2*8)(R8)(BX*8), AX
   346  	MULQ R9
   347  	ADDQ CX, AX
   348  	ADCQ $0, DX
   349  	MOVQ AX, (2*8)(R10)(BX*8)
   350  	MOVQ DX, CX
   351  	MOVQ (3*8)(R8)(BX*8), AX
   352  	MULQ R9
   353  	ADDQ CX, AX
   354  	ADCQ $0, DX
   355  	MOVQ AX, (3*8)(R10)(BX*8)
   356  	MOVQ DX, CX
   357  	ADDQ $4, BX		// i += 4
   358  	
   359  	LEAQ 4(BX), DX		// DX = i+4 (DX is free here: the carry was copied to CX)
   360  	CMPQ DX, R11
   361  	JLE U5			// if i+4 <= n goto U5
   362  	JMP E5
   363  
        	// one word per iteration for the remaining words
   364  L5:	MOVQ (R8)(BX*8), AX
   365  	MULQ R9
   366  	ADDQ CX, AX
   367  	ADCQ $0, DX
   368  	MOVQ AX, (R10)(BX*8)
   369  	MOVQ DX, CX
   370  	ADDQ $1, BX		// i++
   371  
   372  E5:	CMPQ BX, R11		// i < n
   373  	JL L5
   374  
   375  	MOVQ CX, c+64(FP)	// return c
   376  	RET
   377  
   378  
   379  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW computes, for i in [0, len(z)), the double-word value
        // z[i] + x[i]*y + c, stores its low word back into z[i] and keeps its
        // high word as the carry c for the next index. The carry starts at 0
        // and the final carry is returned.
        // Only len(z) is read; x is presumably at least as long as z -
        // confirm against the callers.
        //
        // Register use:
        //	R10 = &z[0], R8 = &x[0], R9 = y
        //	R11 = n = len(z)
        //	R12 = n with its low bit cleared (end index of the 2x unrolled loop)
        //	BX  = i, current word index
        //	CX  = c, running carry word
        //	DX:AX = product written by MULQ
   380  TEXT ·addMulVVW(SB),NOSPLIT,$0
   381  	MOVQ z+0(FP), R10
   382  	MOVQ x+24(FP), R8
   383  	MOVQ y+48(FP), R9
   384  	MOVQ z_len+8(FP), R11
   385  	MOVQ $0, BX		// i = 0
   386  	MOVQ $0, CX		// c = 0
   387  	MOVQ R11, R12
   388  	ANDQ $-2, R12		// R12 = n rounded down to an even count
   389  	CMPQ R11, $2
   390  	JAE A6			// if n >= 2 (unsigned) run the unrolled loop
   391  	JMP E6
   392  
   393  A6:
        	// loop body unrolled 2x, runs while i < R12
   394  	MOVQ (R8)(BX*8), AX
   395  	MULQ R9			// DX:AX = x[i] * y
   396  	ADDQ (R10)(BX*8), AX	// add z[i] to the low word
   397  	ADCQ $0, DX
   398  	ADDQ CX, AX		// add the incoming carry to the low word
   399  	ADCQ $0, DX
   400  	MOVQ DX, CX		// high word becomes the next carry
   401  	MOVQ AX, (R10)(BX*8)
   402  
   403  	MOVQ (8)(R8)(BX*8), AX
   404  	MULQ R9			// DX:AX = x[i+1] * y
   405  	ADDQ (8)(R10)(BX*8), AX
   406  	ADCQ $0, DX
   407  	ADDQ CX, AX
   408  	ADCQ $0, DX
   409  	MOVQ DX, CX
   410  	MOVQ AX, (8)(R10)(BX*8)
   411  
   412  	ADDQ $2, BX		// i += 2
   413  	CMPQ BX, R12
   414  	JL A6			// if i < R12 goto A6
   415  	JMP E6
   416  
        	// one word per iteration for the remaining words
   417  L6:	MOVQ (R8)(BX*8), AX
   418  	MULQ R9
   419  	ADDQ CX, AX
   420  	ADCQ $0, DX
   421  	ADDQ AX, (R10)(BX*8)	// z[i] += low word (read-modify-write in memory)
   422  	ADCQ $0, DX
   423  	MOVQ DX, CX
   424  	ADDQ $1, BX		// i++
   425  
   426  E6:	CMPQ BX, R11		// i < n
   427  	JL L6
   428  
   429  	MOVQ CX, c+56(FP)	// return c
   430  	RET
   431  
   432  
   433  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        //
        // divWVW divides the multi-word value whose words are x[0:len(z)],
        // extended by xn as an extra most-significant word, by the single
        // word y. It walks i from len(z)-1 down to 0, dividing the
        // double-word value r:x[i] by y, storing the quotient in z[i] and
        // carrying the remainder r into the next step. r starts as xn and
        // the final remainder is returned.
        // NOTE(review): DIVQ faults when y == 0 or when the quotient does not
        // fit in one word (r >= y); callers are presumably required to
        // guarantee xn < y - confirm against arith.go.
        //
        // Register use:
        //	R10 = &z[0], R8 = &x[0], R9 = y
        //	BX  = i, current word index (counts down)
        //	DX  = r, running remainder (high word of each dividend)
        //	AX  = x[i] before DIVQ, quotient after
   434  TEXT ·divWVW(SB),NOSPLIT,$0
   435  	MOVQ z+0(FP), R10
   436  	MOVQ xn+24(FP), DX	// r = xn
   437  	MOVQ x+32(FP), R8
   438  	MOVQ y+56(FP), R9
   439  	MOVQ z_len+8(FP), BX	// i = len(z)
   440  	JMP E7
   441  
   442  L7:	MOVQ (R8)(BX*8), AX
   443  	DIVQ R9			// AX = r:x[i] / y, DX = r:x[i] % y
   444  	MOVQ AX, (R10)(BX*8)	// z[i] = quotient
   445  
   446  E7:	SUBQ $1, BX		// i--
   447  	JGE L7			// i >= 0
   448  
   449  	MOVQ DX, r+64(FP)	// return r
   450  	RET