github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // func mulWW(x, y Word) (z1, z0 Word)
        //
        // mulWW returns the full double-word product of x and y:
        // z1 is the high word and z0 the low word of x*y.
    13  TEXT ·mulWW(SB),NOSPLIT,$0
    14  	MOVQ y+8(FP), AX	// AX = y
    15  	MULQ x+0(FP)		// DX:AX = x * y
    16  	MOVQ AX, z0+24(FP)	// z0 = low word
    17  	MOVQ DX, z1+16(FP)	// z1 = high word
    18  	RET
    19  
    20  
    21  // func divWW(x1, x0, y Word) (q, r Word)
        //
        // divWW divides the double-word value x1:x0 by y and returns the
        // quotient q and the remainder r.
        // NOTE: the x86 DIV instruction raises a divide error when y == 0 or
        // when the quotient does not fit in one word (x1 >= y); nothing here
        // checks for that, so callers presumably guarantee x1 < y.
    22  TEXT ·divWW(SB),NOSPLIT,$0
    23  	MOVQ x0+8(FP), AX	// low word of the dividend
    24  	MOVQ x1+0(FP), DX	// high word of the dividend
    25  	DIVQ y+16(FP)		// AX = DX:AX / y, DX = DX:AX % y
    26  	MOVQ DX, r+32(FP)	// r = remainder
    27  	MOVQ AX, q+24(FP)	// q = quotient
    28  	RET
    29  
    30  // The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
    31  // It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
    32  // This is faster than using rotate instructions.
    33  //
    34  // CAUTION: Note that MOVQ $0, Rx is translated to XORQ Rx, Rx which clears the carry bit!
    35  
    36  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV sets z[i] = x[i] + y[i] + carry for i = 0 .. len(z)-1, passing
        // the carry from each word to the next, and returns the final carry
        // (0 or 1) in c. Only len(z) is loaded; x and y are indexed up to the
        // same bound.
        //
        // Register use:
        //	DI		n, the number of words still to process
        //	SI		i, the current word index
        //	R8, R9, R10	base addresses of x, y and z
        //	CX		carry saved between iterations as 0 or -1
        //	R11-R14		scratch words
    37  TEXT ·addVV(SB),NOSPLIT,$0
    38  	MOVQ z_len+8(FP), DI	// n = len(z)
    39  	MOVQ x+24(FP), R8	// R8 = base of x
    40  	MOVQ y+48(FP), R9	// R9 = base of y
    41  	MOVQ z+0(FP), R10	// R10 = base of z
    42  
    43  	MOVQ $0, CX		// c = 0
    44  	MOVQ $0, SI		// i = 0
    45  
    46  	// s/JL/JMP/ below to disable the unrolled loop
    47  	SUBQ $4, DI		// n -= 4
    48  	JL V1			// if n < 0 goto V1 (fewer than 4 words)
    49  
    50  U1:	// n >= 0: at least 4 more words to process
    51  	// regular loop body unrolled 4x
    52  	ADDQ CX, CX		// restore CF from the saved carry (0 or -1)
    53  	MOVQ 0(R8)(SI*8), R11	// load x[i] .. x[i+3]; the loads do not disturb CF
    54  	MOVQ 8(R8)(SI*8), R12
    55  	MOVQ 16(R8)(SI*8), R13
    56  	MOVQ 24(R8)(SI*8), R14
    57  	ADCQ 0(R9)(SI*8), R11	// add y[i] .. y[i+3], chaining the carry through CF
    58  	ADCQ 8(R9)(SI*8), R12
    59  	ADCQ 16(R9)(SI*8), R13
    60  	ADCQ 24(R9)(SI*8), R14
    61  	MOVQ R11, 0(R10)(SI*8)	// store z[i] .. z[i+3]
    62  	MOVQ R12, 8(R10)(SI*8)
    63  	MOVQ R13, 16(R10)(SI*8)
    64  	MOVQ R14, 24(R10)(SI*8)
    65  	SBBQ CX, CX		// save CF; the ADDQ/SUBQ below overwrite the flags
    66  
    67  	ADDQ $4, SI		// i += 4
    68  	SUBQ $4, DI		// n -= 4
    69  	JGE U1			// if n >= 0 goto U1
    70  
    71  V1:	ADDQ $4, DI		// n += 4 (undo the bias: n is the count of leftover words)
    72  	JLE E1			// if n <= 0 goto E1
    73  
    74  L1:	// n > 0: one word per iteration
    75  	ADDQ CX, CX		// restore CF
    76  	MOVQ 0(R8)(SI*8), R11	// x[i]
    77  	ADCQ 0(R9)(SI*8), R11	// x[i] + y[i] + carry
    78  	MOVQ R11, 0(R10)(SI*8)	// z[i] = sum
    79  	SBBQ CX, CX		// save CF
    80  
    81  	ADDQ $1, SI		// i++
    82  	SUBQ $1, DI		// n--
    83  	JG L1			// if n > 0 goto L1
    84  
    85  E1:	NEGQ CX			// turn the saved carry 0/-1 into the result 0/1
    86  	MOVQ CX, c+72(FP)	// return c
    87  	RET
    88  
    89  
    90  // func subVV(z, x, y []Word) (c Word)
    91  // (same as addVV except for SBBQ instead of ADCQ and label names)
        //
        // subVV sets z[i] = x[i] - y[i] - borrow for i = 0 .. len(z)-1, passing
        // the borrow from each word to the next, and returns the final borrow
        // (0 or 1) in c. Only len(z) is loaded; x and y are indexed up to the
        // same bound.
        //
        // Register use:
        //	DI		n, the number of words still to process
        //	SI		i, the current word index
        //	R8, R9, R10	base addresses of x, y and z
        //	CX		borrow saved between iterations as 0 or -1
        //	R11-R14		scratch words
    92  TEXT ·subVV(SB),NOSPLIT,$0
    93  	MOVQ z_len+8(FP), DI	// n = len(z)
    94  	MOVQ x+24(FP), R8	// R8 = base of x
    95  	MOVQ y+48(FP), R9	// R9 = base of y
    96  	MOVQ z+0(FP), R10	// R10 = base of z
    97  
    98  	MOVQ $0, CX		// c = 0
    99  	MOVQ $0, SI		// i = 0
   100  
   101  	// s/JL/JMP/ below to disable the unrolled loop
   102  	SUBQ $4, DI		// n -= 4
   103  	JL V2			// if n < 0 goto V2 (fewer than 4 words)
   104  
   105  U2:	// n >= 0: at least 4 more words to process
   106  	// regular loop body unrolled 4x
   107  	ADDQ CX, CX		// restore CF from the saved borrow (0 or -1)
   108  	MOVQ 0(R8)(SI*8), R11	// load x[i] .. x[i+3]; the loads do not disturb CF
   109  	MOVQ 8(R8)(SI*8), R12
   110  	MOVQ 16(R8)(SI*8), R13
   111  	MOVQ 24(R8)(SI*8), R14
   112  	SBBQ 0(R9)(SI*8), R11	// subtract y[i] .. y[i+3], chaining the borrow through CF
   113  	SBBQ 8(R9)(SI*8), R12
   114  	SBBQ 16(R9)(SI*8), R13
   115  	SBBQ 24(R9)(SI*8), R14
   116  	MOVQ R11, 0(R10)(SI*8)	// store z[i] .. z[i+3]
   117  	MOVQ R12, 8(R10)(SI*8)
   118  	MOVQ R13, 16(R10)(SI*8)
   119  	MOVQ R14, 24(R10)(SI*8)
   120  	SBBQ CX, CX		// save CF; the ADDQ/SUBQ below overwrite the flags
   121  
   122  	ADDQ $4, SI		// i += 4
   123  	SUBQ $4, DI		// n -= 4
   124  	JGE U2			// if n >= 0 goto U2
   125  
   126  V2:	ADDQ $4, DI		// n += 4 (undo the bias: n is the count of leftover words)
   127  	JLE E2			// if n <= 0 goto E2
   128  
   129  L2:	// n > 0: one word per iteration
   130  	ADDQ CX, CX		// restore CF
   131  	MOVQ 0(R8)(SI*8), R11	// x[i]
   132  	SBBQ 0(R9)(SI*8), R11	// x[i] - y[i] - borrow
   133  	MOVQ R11, 0(R10)(SI*8)	// z[i] = difference
   134  	SBBQ CX, CX		// save CF
   135  
   136  	ADDQ $1, SI		// i++
   137  	SUBQ $1, DI		// n--
   138  	JG L2			// if n > 0 goto L2
   139  
   140  E2:	NEGQ CX			// turn the saved borrow 0/-1 into the result 0/1
   141  	MOVQ CX, c+72(FP)	// return c
   142  	RET
   143  
   144  
   145  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the vector x: starting with c = y it
        // sets z[i] = x[i] + c and then c = carry out of that addition, for
        // i = 0 .. len(z)-1. The last c is returned; for len(z) <= 0 that is y
        // itself. Only len(z) is loaded; x is indexed up to the same bound.
        //
        // Register use:
        //	DI		n, the number of words still to process
        //	SI		i, the current word index
        //	R8, R10		base addresses of x and z
        //	CX		c: y on entry, afterwards the carry as 0 or 1
        //	R11-R14		scratch words
   146  TEXT ·addVW(SB),NOSPLIT,$0
   147  	MOVQ z_len+8(FP), DI	// n = len(z)
   148  	MOVQ x+24(FP), R8	// R8 = base of x
   149  	MOVQ y+48(FP), CX	// c = y
   150  	MOVQ z+0(FP), R10	// R10 = base of z
   151  
   152  	MOVQ $0, SI		// i = 0
   153  
   154  	// s/JL/JMP/ below to disable the unrolled loop
   155  	SUBQ $4, DI		// n -= 4
   156  	JL V3			// if n < 0 goto V3 (fewer than 4 words)
   157  
   158  U3:	// n >= 0: at least 4 more words to process
   159  	// regular loop body unrolled 4x
   160  	MOVQ 0(R8)(SI*8), R11	// load x[i] .. x[i+3]
   161  	MOVQ 8(R8)(SI*8), R12
   162  	MOVQ 16(R8)(SI*8), R13
   163  	MOVQ 24(R8)(SI*8), R14
   164  	ADDQ CX, R11		// x[i] + c
   165  	ADCQ $0, R12		// ripple the carry through the next three words
   166  	ADCQ $0, R13
   167  	ADCQ $0, R14
   168  	SBBQ CX, CX		// save CF
   169  	NEGQ CX			// c = carry as 0 or 1, ready to be added next round
   170  	MOVQ R11, 0(R10)(SI*8)	// store z[i] .. z[i+3]
   171  	MOVQ R12, 8(R10)(SI*8)
   172  	MOVQ R13, 16(R10)(SI*8)
   173  	MOVQ R14, 24(R10)(SI*8)
   174  
   175  	ADDQ $4, SI		// i += 4
   176  	SUBQ $4, DI		// n -= 4
   177  	JGE U3			// if n >= 0 goto U3
   178  
   179  V3:	ADDQ $4, DI		// n += 4 (undo the bias: n is the count of leftover words)
   180  	JLE E3			// if n <= 0 goto E3
   181  
   182  L3:	// n > 0: one word per iteration
   183  	ADDQ 0(R8)(SI*8), CX	// c += x[i]
   184  	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   185  	SBBQ CX, CX		// save CF
   186  	NEGQ CX			// c = carry as 0 or 1
   187  
   188  	ADDQ $1, SI		// i++
   189  	SUBQ $1, DI		// n--
   190  	JG L3			// if n > 0 goto L3
   191  
   192  E3:	MOVQ CX, c+56(FP)	// return c
   193  	RET
   194  
   195  
   196  // func subVW(z, x []Word, y Word) (c Word)
   197  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        //
        // subVW subtracts the single word y from the vector x: starting with
        // c = y it sets z[i] = x[i] - c and then c = borrow out of that
        // subtraction, for i = 0 .. len(z)-1. The last c is returned; for
        // len(z) <= 0 that is y itself. Only len(z) is loaded; x is indexed up
        // to the same bound.
        //
        // Register use:
        //	DI		n, the number of words still to process
        //	SI		i, the current word index
        //	R8, R10		base addresses of x and z
        //	CX		c: y on entry, afterwards the borrow as 0 or 1
        //	R11-R14		scratch words
   198  TEXT ·subVW(SB),NOSPLIT,$0
   199  	MOVQ z_len+8(FP), DI	// n = len(z)
   200  	MOVQ x+24(FP), R8	// R8 = base of x
   201  	MOVQ y+48(FP), CX	// c = y
   202  	MOVQ z+0(FP), R10	// R10 = base of z
   203  
   204  	MOVQ $0, SI		// i = 0
   205  
   206  	// s/JL/JMP/ below to disable the unrolled loop
   207  	SUBQ $4, DI		// n -= 4
   208  	JL V4			// if n < 0 goto V4 (fewer than 4 words)
   209  
   210  U4:	// n >= 0: at least 4 more words to process
   211  	// regular loop body unrolled 4x
   212  	MOVQ 0(R8)(SI*8), R11	// load x[i] .. x[i+3]
   213  	MOVQ 8(R8)(SI*8), R12
   214  	MOVQ 16(R8)(SI*8), R13
   215  	MOVQ 24(R8)(SI*8), R14
   216  	SUBQ CX, R11		// x[i] - c
   217  	SBBQ $0, R12		// ripple the borrow through the next three words
   218  	SBBQ $0, R13
   219  	SBBQ $0, R14
   220  	SBBQ CX, CX		// save CF
   221  	NEGQ CX			// c = borrow as 0 or 1, ready to be subtracted next round
   222  	MOVQ R11, 0(R10)(SI*8)	// store z[i] .. z[i+3]
   223  	MOVQ R12, 8(R10)(SI*8)
   224  	MOVQ R13, 16(R10)(SI*8)
   225  	MOVQ R14, 24(R10)(SI*8)
   226  
   227  	ADDQ $4, SI		// i += 4
   228  	SUBQ $4, DI		// n -= 4
   229  	JGE U4			// if n >= 0 goto U4
   230  
   231  V4:	ADDQ $4, DI		// n += 4 (undo the bias: n is the count of leftover words)
   232  	JLE E4			// if n <= 0 goto E4
   233  
   234  L4:	// n > 0: one word per iteration
   235  	MOVQ 0(R8)(SI*8), R11	// x[i]
   236  	SUBQ CX, R11		// x[i] - c
   237  	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
   238  	SBBQ CX, CX		// save CF
   239  	NEGQ CX			// c = borrow as 0 or 1
   240  
   241  	ADDQ $1, SI		// i++
   242  	SUBQ $1, DI		// n--
   243  	JG L4			// if n > 0 goto L4
   244  
   245  E4:	MOVQ CX, c+56(FP)	// return c
   246  	RET
   247  
   248  
   249  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU shifts the vector x left by s bits into z, working from the
        // most significant word down to word 0, and returns in c the bits
        // shifted out of the top word x[n-1] (n = len(z)). For n <= 0 nothing
        // is written and c = 0. In the comments below ŝ stands for 64-s.
   250  TEXT ·shlVU(SB),NOSPLIT,$0
   251  	MOVQ z_len+8(FP), BX	// i = len(z)
   252  	SUBQ $1, BX		// i = n-1
   253  	JL empty8		// n <= 0: nothing to shift
   254  
   255  	// at least one word: derive c from the top word first
   256  	MOVQ s+48(FP), CX	// shift count
   257  	MOVQ x+24(FP), R8	// R8 = base of x
   258  	MOVQ z+0(FP), R10	// R10 = base of z
   259  	XORQ DX, DX		// DX = 0
   260  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   261  	SHLQ CX, DX:AX		// DX = w1>>ŝ
   262  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[n-1]
   263  
   264  	TESTQ BX, BX
   265  	JLE last8		// i <= 0: only z[0] remains
   266  
   267  	// i > 0: z[i] takes its high bits from x[i] and its low bits from x[i-1]
   268  loop8:	MOVQ AX, DX		// w = w1
   269  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   270  	SHLQ CX, DX:AX		// DX = w<<s | w1>>ŝ
   271  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   272  	SUBQ $1, BX		// i--
   273  	JG loop8		// repeat while i > 0
   274  
   275  	// the lowest word has nothing below it to shift in
   276  last8:	SHLQ CX, AX		// w1<<s
   277  	MOVQ AX, (R10)		// z[0] = w1<<s
   278  	RET
   279  
   280  empty8:	MOVQ $0, c+56(FP)	// c = 0
   281  	RET
   282  
   283  
   284  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU shifts the vector x right by s bits into z, working from word 0
        // up to the most significant word, and returns in c the bits shifted
        // out of x[0], left-aligned (x[0]<<ŝ). For len(z) <= 0 nothing is
        // written and c = 0. In the comments below ŝ stands for 64-s.
   285  TEXT ·shrVU(SB),NOSPLIT,$0
   286  	MOVQ z_len+8(FP), R11	// n = len(z)
   287  	SUBQ $1, R11		// R11 = n-1, the index of the last word
   288  	JL empty9		// n <= 0: nothing to shift
   289  
   290  	// at least one word: derive c from the bottom word first
   291  	MOVQ s+48(FP), CX	// shift count
   292  	MOVQ x+24(FP), R8	// R8 = base of x
   293  	MOVQ z+0(FP), R10	// R10 = base of z
   294  	XORQ DX, DX		// DX = 0
   295  	MOVQ (R8), AX		// w1 = x[0]
   296  	SHRQ CX, DX:AX		// DX = w1<<ŝ
   297  	MOVQ DX, c+56(FP)	// c = bits shifted out of x[0]
   298  
   299  	XORQ BX, BX		// i = 0
   300  	TESTQ R11, R11
   301  	JLE last9		// n-1 <= 0: only one word to write
   302  
   303  loop9:	// i < n-1: z[i] takes its low bits from x[i] and its high bits from x[i+1]
   304  	MOVQ AX, DX		// w = w1
   305  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   306  	SHRQ CX, DX:AX		// DX = w>>s | w1<<ŝ
   307  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   308  	ADDQ $1, BX		// i++
   309  	CMPQ BX, R11
   310  	JL loop9		// repeat while i < n-1
   311  
   312  	// the highest word has nothing above it to shift in
   313  last9:	SHRQ CX, AX		// w1>>s
   314  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   315  	RET
   316  
   317  empty9:	MOVQ $0, c+56(FP)	// c = 0
   318  	RET
   319  
   320  
   321  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW multiplies the vector x by the word y and adds the word r:
        // starting with c = r, each step forms the double-word value
        // x[i]*y + c, stores its low word in z[i] and keeps its high word as
        // the next c, for i = 0 .. len(z)-1. The last c is returned.
        //
        // Register use: R8/R10 = base of x/z, R9 = y, R11 = n, BX = i, CX = c,
        // DX:AX = product (MULQ always writes these two registers).
   322  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   323  	MOVQ z_len+8(FP), R11	// n = len(z)
   324  	MOVQ r+56(FP), CX	// c = r
   325  	MOVQ y+48(FP), R9	// R9 = y
   326  	MOVQ x+24(FP), R8	// R8 = base of x
   327  	MOVQ z+0(FP), R10	// R10 = base of z
   328  	XORQ BX, BX		// i = 0
   329  
   330  	CMPQ R11, $4
   331  	JL tail5		// fewer than 4 words: one-word loop only
   332  
   333  quad5:	// i+4 <= n
   334  	// four words per iteration, the same step repeated at offsets 0, 8, 16, 24
   335  	MOVQ 0(R8)(BX*8), AX	// x[i]
   336  	MULQ R9			// DX:AX = x[i] * y
   337  	ADDQ CX, AX		// low word += c
   338  	ADCQ $0, DX		// carry into the high word
   339  	MOVQ DX, CX		// c = high word
   340  	MOVQ AX, 0(R10)(BX*8)	// z[i] = low word
   341  	MOVQ 8(R8)(BX*8), AX	// x[i+1]
   342  	MULQ R9
   343  	ADDQ CX, AX
   344  	ADCQ $0, DX
   345  	MOVQ DX, CX
   346  	MOVQ AX, 8(R10)(BX*8)	// z[i+1]
   347  	MOVQ 16(R8)(BX*8), AX	// x[i+2]
   348  	MULQ R9
   349  	ADDQ CX, AX
   350  	ADCQ $0, DX
   351  	MOVQ DX, CX
   352  	MOVQ AX, 16(R10)(BX*8)	// z[i+2]
   353  	MOVQ 24(R8)(BX*8), AX	// x[i+3]
   354  	MULQ R9
   355  	ADDQ CX, AX
   356  	ADCQ $0, DX
   357  	MOVQ DX, CX
   358  	MOVQ AX, 24(R10)(BX*8)	// z[i+3]
   359  	ADDQ $4, BX		// i += 4
   360  
   361  	LEAQ 4(BX), DX		// DX = i+4 (DX is free until the next MULQ)
   362  	CMPQ DX, R11
   363  	JLE quad5		// another full group of four fits
   364  	JMP tail5
   365  
   366  one5:	MOVQ (R8)(BX*8), AX	// x[i]
   367  	MULQ R9			// DX:AX = x[i] * y
   368  	ADDQ CX, AX		// low word += c
   369  	ADCQ $0, DX		// carry into the high word
   370  	MOVQ DX, CX		// c = high word
   371  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   372  	ADDQ $1, BX		// i++
   373  
   374  tail5:	CMPQ BX, R11		// i < n
   375  	JL one5
   376  
   377  	MOVQ CX, c+64(FP)	// return c
   378  	RET
   379  
   380  
   381  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW adds the product of the vector x and the word y to z in
        // place: starting with c = 0, each step forms the double-word value
        // x[i]*y + z[i] + c, stores its low word back in z[i] and keeps its
        // high word as the next c, for i = 0 .. len(z)-1. The last c is
        // returned.
        //
        // Register use: R8/R10 = base of x/z, R9 = y, R11 = n, R12 = n rounded
        // down to an even count, BX = i, CX = c, DX:AX = product.
   382  TEXT ·addMulVVW(SB),NOSPLIT,$0
   383  	MOVQ z_len+8(FP), R11	// n = len(z)
   384  	MOVQ y+48(FP), R9	// R9 = y
   385  	MOVQ x+24(FP), R8	// R8 = base of x
   386  	MOVQ z+0(FP), R10	// R10 = base of z
   387  	XORQ CX, CX		// c = 0
   388  	XORQ BX, BX		// i = 0
   389  	MOVQ R11, R12
   390  	ANDQ $-2, R12		// R12 = n with the low bit cleared
   391  	CMPQ R11, $2
   392  	JB tail6		// n < 2 (unsigned): no pair to process
   393  
   394  	// two words per iteration while i < R12
   395  pair6:
   396  	MOVQ (R8)(BX*8), AX	// x[i]
   397  	MULQ R9			// DX:AX = x[i] * y
   398  	ADDQ (R10)(BX*8), AX	// low word += z[i]
   399  	ADCQ $0, DX
   400  	ADDQ CX, AX		// low word += c
   401  	ADCQ $0, DX
   402  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   403  	MOVQ DX, CX		// c = high word
   404  
   405  	MOVQ 8(R8)(BX*8), AX	// x[i+1]
   406  	MULQ R9			// DX:AX = x[i+1] * y
   407  	ADDQ 8(R10)(BX*8), AX	// low word += z[i+1]
   408  	ADCQ $0, DX
   409  	ADDQ CX, AX		// low word += c
   410  	ADCQ $0, DX
   411  	MOVQ AX, 8(R10)(BX*8)	// z[i+1] = low word
   412  	MOVQ DX, CX		// c = high word
   413  
   414  	ADDQ $2, BX		// i += 2
   415  	CMPQ BX, R12
   416  	JL pair6		// more pairs to go
   417  	JMP tail6
   418  
   419  one6:	MOVQ (R8)(BX*8), AX	// x[i]
   420  	MULQ R9			// DX:AX = x[i] * y
   421  	ADDQ CX, AX		// low word += c
   422  	ADCQ $0, DX
   423  	ADDQ AX, (R10)(BX*8)	// z[i] += low word, directly in memory
   424  	ADCQ $0, DX
   425  	MOVQ DX, CX		// c = high word
   426  	ADDQ $1, BX		// i++
   427  
   428  tail6:	CMPQ BX, R11		// i < n
   429  	JL one6
   430  
   431  	MOVQ CX, c+56(FP)	// return c
   432  	RET
   433  
   434  
   435  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        //
        // divWVW divides the vector x by the word y, from the most significant
        // word x[len(z)-1] down to x[0]. The running remainder starts as xn;
        // each step divides remainder:x[i] by y, stores the quotient word in
        // z[i] and keeps the new remainder. The final remainder is returned
        // (xn itself when len(z) <= 0).
        // NOTE: the x86 DIV instruction raises a divide error when y == 0 or
        // a quotient does not fit in one word; nothing here checks for that,
        // so callers presumably guarantee xn < y.
   436  TEXT ·divWVW(SB),NOSPLIT,$0
   437  	MOVQ z_len+8(FP), BX	// i = len(z)
   438  	MOVQ y+56(FP), R9	// R9 = y
   439  	MOVQ x+32(FP), R8	// R8 = base of x
   440  	MOVQ z+0(FP), R10	// R10 = base of z
   441  	MOVQ xn+24(FP), DX	// r = xn
   442  	SUBQ $1, BX		// i--
   443  	JL done7		// i < 0: no words at all
   444  
   445  loop7:	MOVQ (R8)(BX*8), AX	// DX:AX = r:x[i]
   446  	DIVQ R9			// AX = quotient, DX = new r
   447  	MOVQ AX, (R10)(BX*8)	// z[i] = quotient
   448  	SUBQ $1, BX		// i--
   449  	JGE loop7		// repeat while i >= 0
   450  
   451  done7:	MOVQ DX, r+64(FP)	// return r
   452  	RET