github.com/hongwozai/go-src-1.4.3@v0.0.0-20191127132709-dc3fce3dbccb/src/math/big/arith_amd64.s (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "textflag.h"
     6  
     7  // This file provides fast assembly versions for the elementary
     8  // arithmetic operations on vectors implemented in arith.go.
     9  
    10  // ZERO_CX is the literal machine encoding of MOVQ $0, CX.
    11  // (MOVQ $0, reg is translated to XORQ reg, reg and clears CF.)
        // The bytes are spelled out by hand: 0x48 0xc7 0xc1 encode a 64-bit
        // MOV of a 32-bit immediate into CX, and the four 0x00 bytes are that
        // immediate. A real MOV leaves the flags alone, so CX can be zeroed
        // while CF still holds a carry or borrow, as addVW and subVW need.
    12  #define ZERO_CX BYTE $0x48; \
    13  		BYTE $0xc7; \
    14  		BYTE $0xc1; \
    15  		BYTE $0x00; \
    16  		BYTE $0x00; \
    17  		BYTE $0x00; \
    18  		BYTE $0x00
    19  
    20  // func mulWW(x, y Word) (z1, z0 Word)
        //
        // mulWW returns the full 128-bit unsigned product of x and y:
        // z1 receives the high word and z0 the low word.
    21  TEXT ·mulWW(SB),NOSPLIT,$0
    22  	MOVQ y+8(FP), AX	// AX = y
    23  	MULQ x+0(FP)		// DX:AX = y * x
    24  	MOVQ AX, z0+24(FP)	// z0 = low word of the product
    25  	MOVQ DX, z1+16(FP)	// z1 = high word of the product
    26  	RET
    27  
    28  
    29  // func divWW(x1, x0, y Word) (q, r Word)
        //
        // divWW divides the 128-bit value x1:x0 by y and returns the
        // quotient q and remainder r. DIVQ raises a hardware divide error
        // when y is zero or the quotient does not fit in one word, so
        // callers need x1 < y.
    30  TEXT ·divWW(SB),NOSPLIT,$0
    31  	MOVQ x0+8(FP), AX	// AX = low dividend word
    32  	MOVQ x1+0(FP), DX	// DX = high dividend word
    33  	DIVQ y+16(FP)		// AX = quotient, DX = remainder
    34  	MOVQ DX, r+32(FP)	// r = remainder
    35  	MOVQ AX, q+24(FP)	// q = quotient
    36  	RET
    37  
    38  
    39  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV computes z[i] = x[i] + y[i] + carry for 0 <= i < len(z),
        // starting with carry 0, and returns the final carry in c.
        // Only len(z) is consulted; x and y are read at the same indices.
        //
        // Register use: DI = n (words still to process), SI = i,
        // R8/R9/R10 = base addresses of x/y/z, CX = c, R11-R14 = data words.
        //
        // The loop bookkeeping (ADDQ/SUBQ on SI and DI) overwrites CF, so
        // the carry is parked in CX between iterations: RCRQ $1, CX moves
        // it from bit 0 of CX into CF ahead of the ADCQ chain, and
        // RCLQ $1, CX moves the resulting CF back into bit 0 afterwards.
    40  TEXT ·addVV(SB),NOSPLIT,$0
    41  	MOVQ z_len+8(FP), DI	// n = len(z)
    42  	MOVQ x+24(FP), R8	// R8 = &x[0]
    43  	MOVQ y+48(FP), R9	// R9 = &y[0]
    44  	MOVQ z+0(FP), R10	// R10 = &z[0]
    45  
    46  	MOVQ $0, CX		// c = 0
    47  	MOVQ $0, SI		// i = 0
    48  
    49  	// s/JL/JMP/ below to disable the unrolled loop
    50  	SUBQ $4, DI		// n -= 4
    51  	JL V1			// if n < 0 goto V1 (fewer than 4 words left)
    52  
    53  U1:	// n >= 0
    54  	// regular loop body unrolled 4x
        	// (the MOVQ loads and stores do not modify CF, so the carry
        	// ripples through the four ADCQ instructions undisturbed)
    55  	RCRQ $1, CX		// CF = c
    56  	MOVQ 0(R8)(SI*8), R11
    57  	MOVQ 8(R8)(SI*8), R12
    58  	MOVQ 16(R8)(SI*8), R13
    59  	MOVQ 24(R8)(SI*8), R14
    60  	ADCQ 0(R9)(SI*8), R11
    61  	ADCQ 8(R9)(SI*8), R12
    62  	ADCQ 16(R9)(SI*8), R13
    63  	ADCQ 24(R9)(SI*8), R14
    64  	MOVQ R11, 0(R10)(SI*8)
    65  	MOVQ R12, 8(R10)(SI*8)
    66  	MOVQ R13, 16(R10)(SI*8)
    67  	MOVQ R14, 24(R10)(SI*8)
    68  	RCLQ $1, CX		// c = CF
    69  
    70  	ADDQ $4, SI		// i += 4
    71  	SUBQ $4, DI		// n -= 4
    72  	JGE U1			// if n >= 0 goto U1
    73  
    74  V1:	ADDQ $4, DI		// n += 4 (undo the bias; n = words remaining)
    75  	JLE E1			// if n <= 0 goto E1
    76  
    77  L1:	// n > 0
        	// one word per iteration for the remaining 1 to 3 words
    78  	RCRQ $1, CX		// CF = c
    79  	MOVQ 0(R8)(SI*8), R11
    80  	ADCQ 0(R9)(SI*8), R11
    81  	MOVQ R11, 0(R10)(SI*8)
    82  	RCLQ $1, CX		// c = CF
    83  
    84  	ADDQ $1, SI		// i++
    85  	SUBQ $1, DI		// n--
    86  	JG L1			// if n > 0 goto L1
    87  
    88  E1:	MOVQ CX, c+72(FP)	// return c
    89  	RET
    90  
    91  
    92  // func subVV(z, x, y []Word) (c Word)
    93  // (same as addVV except for SBBQ instead of ADCQ and label names)
        //
        // subVV computes z[i] = x[i] - y[i] - borrow for 0 <= i < len(z),
        // starting with borrow 0, and returns the final borrow in c.
        // Only len(z) is consulted; x and y are read at the same indices.
        //
        // Register use: DI = n (words still to process), SI = i,
        // R8/R9/R10 = base addresses of x/y/z, CX = c, R11-R14 = data words.
        //
        // The borrow is parked in bit 0 of CX between iterations because
        // the ADDQ/SUBQ loop bookkeeping overwrites CF: RCRQ $1, CX moves
        // it into CF before the SBBQ chain, RCLQ $1, CX moves it back out.
    94  TEXT ·subVV(SB),NOSPLIT,$0
    95  	MOVQ z_len+8(FP), DI	// n = len(z)
    96  	MOVQ x+24(FP), R8	// R8 = &x[0]
    97  	MOVQ y+48(FP), R9	// R9 = &y[0]
    98  	MOVQ z+0(FP), R10	// R10 = &z[0]
    99  
   100  	MOVQ $0, CX		// c = 0
   101  	MOVQ $0, SI		// i = 0
   102  
   103  	// s/JL/JMP/ below to disable the unrolled loop
   104  	SUBQ $4, DI		// n -= 4
   105  	JL V2			// if n < 0 goto V2 (fewer than 4 words left)
   106  
   107  U2:	// n >= 0
   108  	// regular loop body unrolled 4x
        	// (the MOVQ loads and stores do not modify CF, so the borrow
        	// ripples through the four SBBQ instructions undisturbed)
   109  	RCRQ $1, CX		// CF = c
   110  	MOVQ 0(R8)(SI*8), R11
   111  	MOVQ 8(R8)(SI*8), R12
   112  	MOVQ 16(R8)(SI*8), R13
   113  	MOVQ 24(R8)(SI*8), R14
   114  	SBBQ 0(R9)(SI*8), R11
   115  	SBBQ 8(R9)(SI*8), R12
   116  	SBBQ 16(R9)(SI*8), R13
   117  	SBBQ 24(R9)(SI*8), R14
   118  	MOVQ R11, 0(R10)(SI*8)
   119  	MOVQ R12, 8(R10)(SI*8)
   120  	MOVQ R13, 16(R10)(SI*8)
   121  	MOVQ R14, 24(R10)(SI*8)
   122  	RCLQ $1, CX		// c = CF
   123  
   124  	ADDQ $4, SI		// i += 4
   125  	SUBQ $4, DI		// n -= 4
   126  	JGE U2			// if n >= 0 goto U2
   127  
   128  V2:	ADDQ $4, DI		// n += 4 (undo the bias; n = words remaining)
   129  	JLE E2			// if n <= 0 goto E2
   130  
   131  L2:	// n > 0
        	// one word per iteration for the remaining 1 to 3 words
   132  	RCRQ $1, CX		// CF = c
   133  	MOVQ 0(R8)(SI*8), R11
   134  	SBBQ 0(R9)(SI*8), R11
   135  	MOVQ R11, 0(R10)(SI*8)
   136  	RCLQ $1, CX		// c = CF
   137  
   138  	ADDQ $1, SI		// i++
   139  	SUBQ $1, DI		// n--
   140  	JG L2			// if n > 0 goto L2
   141  
   142  E2:	MOVQ CX, c+72(FP)	// return c
   143  	RET
   144  
   145  
   146  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the vector x: z[i] = x[i] + c for
        // 0 <= i < len(z), where c starts out as y and afterwards is the
        // carry out of the previous word. The final c is returned.
        //
        // Register use: DI = n (words still to process), SI = i,
        // R8/R10 = base addresses of x/z, CX = c, R11-R14 = data words.
        //
        // CX has to be cleared while CF is still live, which is why the
        // flag-preserving ZERO_CX macro is used instead of MOVQ $0, CX.
   147  TEXT ·addVW(SB),NOSPLIT,$0
   148  	MOVQ z_len+8(FP), DI	// n = len(z)
   149  	MOVQ x+24(FP), R8	// R8 = &x[0]
   150  	MOVQ y+48(FP), CX	// c = y
   151  	MOVQ z+0(FP), R10	// R10 = &z[0]
   152  
   153  	MOVQ $0, SI		// i = 0
   154  
   155  	// s/JL/JMP/ below to disable the unrolled loop
   156  	SUBQ $4, DI		// n -= 4
   157  	JL V3			// if n < 0 goto V3 (fewer than 4 words left)
   158  
   159  U3:	// n >= 0
   160  	// regular loop body unrolled 4x
   161  	MOVQ 0(R8)(SI*8), R11
   162  	MOVQ 8(R8)(SI*8), R12
   163  	MOVQ 16(R8)(SI*8), R13
   164  	MOVQ 24(R8)(SI*8), R14
   165  	ADDQ CX, R11		// x[i] + c, CF = carry out
   166  	ZERO_CX			// CX = 0, CF preserved
   167  	ADCQ $0, R12		// ripple the carry through the next three words
   168  	ADCQ $0, R13
   169  	ADCQ $0, R14
   170  	SETCS CX		// c = CF
   171  	MOVQ R11, 0(R10)(SI*8)
   172  	MOVQ R12, 8(R10)(SI*8)
   173  	MOVQ R13, 16(R10)(SI*8)
   174  	MOVQ R14, 24(R10)(SI*8)
   175  
   176  	ADDQ $4, SI		// i += 4
   177  	SUBQ $4, DI		// n -= 4
   178  	JGE U3			// if n >= 0 goto U3
   179  
   180  V3:	ADDQ $4, DI		// n += 4 (undo the bias; n = words remaining)
   181  	JLE E3			// if n <= 0 goto E3
   182  
   183  L3:	// n > 0
   184  	ADDQ 0(R8)(SI*8), CX	// CX = x[i] + c, CF = carry out
   185  	MOVQ CX, 0(R10)(SI*8)	// z[i] = x[i] + c
   186  	ZERO_CX			// CX = 0, CF preserved
   187  	RCLQ $1, CX		// c = CF
   188  
   189  	ADDQ $1, SI		// i++
   190  	SUBQ $1, DI		// n--
   191  	JG L3			// if n > 0 goto L3
   192  
   193  E3:	MOVQ CX, c+56(FP)	// return c
   194  	RET
   195  
   196  
   197  // func subVW(z, x []Word, y Word) (c Word)
   198  // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
        //
        // subVW subtracts the single word y from the vector x:
        // z[i] = x[i] - c for 0 <= i < len(z), where c starts out as y and
        // afterwards is the borrow out of the previous word. The final c
        // is returned.
        //
        // Register use: DI = n (words still to process), SI = i,
        // R8/R10 = base addresses of x/z, CX = c, R11-R14 = data words.
        //
        // CX has to be cleared while CF is still live, which is why the
        // flag-preserving ZERO_CX macro is used instead of MOVQ $0, CX.
   199  TEXT ·subVW(SB),NOSPLIT,$0
   200  	MOVQ z_len+8(FP), DI	// n = len(z)
   201  	MOVQ x+24(FP), R8	// R8 = &x[0]
   202  	MOVQ y+48(FP), CX	// c = y
   203  	MOVQ z+0(FP), R10	// R10 = &z[0]
   204  	
   205  	MOVQ $0, SI		// i = 0
   206  
   207  	// s/JL/JMP/ below to disable the unrolled loop
   208  	SUBQ $4, DI		// n -= 4
   209  	JL V4			// if n < 0 goto V4 (fewer than 4 words left)
   210  
   211  U4:	// n >= 0
   212  	// regular loop body unrolled 4x
   213  	MOVQ 0(R8)(SI*8), R11
   214  	MOVQ 8(R8)(SI*8), R12
   215  	MOVQ 16(R8)(SI*8), R13
   216  	MOVQ 24(R8)(SI*8), R14
   217  	SUBQ CX, R11		// x[i] - c, CF = borrow out
   218  	ZERO_CX			// CX = 0, CF preserved
   219  	SBBQ $0, R12		// ripple the borrow through the next three words
   220  	SBBQ $0, R13
   221  	SBBQ $0, R14
   222  	SETCS CX		// c = CF
   223  	MOVQ R11, 0(R10)(SI*8)
   224  	MOVQ R12, 8(R10)(SI*8)
   225  	MOVQ R13, 16(R10)(SI*8)
   226  	MOVQ R14, 24(R10)(SI*8)
   227  
   228  	ADDQ $4, SI		// i += 4
   229  	SUBQ $4, DI		// n -= 4
   230  	JGE U4			// if n >= 0 goto U4
   231  
   232  V4:	ADDQ $4, DI		// n += 4 (undo the bias; n = words remaining)
   233  	JLE E4			// if n <= 0 goto E4
   234  
   235  L4:	// n > 0
   236  	MOVQ 0(R8)(SI*8), R11	// R11 = x[i]
   237  	SUBQ CX, R11		// x[i] - c, CF = borrow out
   238  	MOVQ R11, 0(R10)(SI*8)	// z[i] = x[i] - c
   239  	ZERO_CX			// CX = 0, CF preserved
   240  	RCLQ $1, CX		// c = CF
   241  
   242  	ADDQ $1, SI		// i++
   243  	SUBQ $1, DI		// n--
   244  	JG L4			// if n > 0 goto L4
   245  
   246  E4:	MOVQ CX, c+56(FP)	// return c
   247  	RET
   248  
   249  
   250  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU stores x shifted left by s bits into z, for len(z) words,
        // and returns in c the bits shifted out of the top word (x[n-1]>>ŝ).
        // In the comments ŝ stands for 64-s, the complementary shift that
        // the double-register SHLQ applies to its fill operand.
        // Words are processed from index n-1 down to 0. If len(z) <= 0,
        // nothing is stored and c is 0.
        //
        // Register use: BX = i, R8/R10 = base addresses of x/z, CX = s,
        // AX = w1 (the lower source word), DX = w (the word being built).
   251  TEXT ·shlVU(SB),NOSPLIT,$0
   252  	MOVQ z_len+8(FP), BX	// i = len(z)
   253  	SUBQ $1, BX		// i--
   254  	JL X8b			// i < 0	(n <= 0)
   255  
   256  	// n > 0
   257  	MOVQ z+0(FP), R10
   258  	MOVQ x+24(FP), R8
   259  	MOVQ s+48(FP), CX
   260  	MOVQ (R8)(BX*8), AX	// w1 = x[n-1]
   261  	MOVQ $0, DX
   262  	SHLQ CX, DX:AX		// w1>>ŝ (DX was 0, so only the fill bits remain)
   263  	MOVQ DX, c+56(FP)	// c = x[n-1]>>ŝ
   264  
   265  	CMPQ BX, $0
   266  	JLE X8a			// i <= 0
   267  
   268  	// i > 0
   269  L8:	MOVQ AX, DX		// w = w1
   270  	MOVQ -8(R8)(BX*8), AX	// w1 = x[i-1]
   271  	SHLQ CX, DX:AX		// w<<s | w1>>ŝ
   272  	MOVQ DX, (R10)(BX*8)	// z[i] = w<<s | w1>>ŝ
   273  	SUBQ $1, BX		// i--
   274  	JG L8			// i > 0
   275  
   276  	// i <= 0
   277  X8a:	SHLQ CX, AX		// w1<<s
   278  	MOVQ AX, (R10)		// z[0] = w1<<s
   279  	RET
   280  
   281  X8b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   282  	RET
   283  
   284  
   285  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU stores x shifted right by s bits into z, for len(z) words,
        // and returns in c the bits shifted out of the bottom word (x[0]<<ŝ).
        // In the comments ŝ stands for 64-s, the complementary shift that
        // the double-register SHRQ applies to its fill operand.
        // Words are processed from index 0 up to n-1. If len(z) <= 0,
        // nothing is stored and c is 0.
        //
        // Register use: R11 = n-1, BX = i, R8/R10 = base addresses of x/z,
        // CX = s, AX = w1 (the higher source word), DX = w (the word
        // being built).
   286  TEXT ·shrVU(SB),NOSPLIT,$0
   287  	MOVQ z_len+8(FP), R11	// n = len(z)
   288  	SUBQ $1, R11		// n--
   289  	JL X9b			// n < 0	(n <= 0)
   290  
   291  	// n > 0
   292  	MOVQ z+0(FP), R10
   293  	MOVQ x+24(FP), R8
   294  	MOVQ s+48(FP), CX
   295  	MOVQ (R8), AX		// w1 = x[0]
   296  	MOVQ $0, DX
   297  	SHRQ CX, DX:AX		// w1<<ŝ (DX was 0, so only the fill bits remain)
   298  	MOVQ DX, c+56(FP)	// c = x[0]<<ŝ
   299  
   300  	MOVQ $0, BX		// i = 0
   301  	JMP E9
   302  
   303  	// i < n-1
   304  L9:	MOVQ AX, DX		// w = w1
   305  	MOVQ 8(R8)(BX*8), AX	// w1 = x[i+1]
   306  	SHRQ CX, DX:AX		// w>>s | w1<<ŝ
   307  	MOVQ DX, (R10)(BX*8)	// z[i] = w>>s | w1<<ŝ
   308  	ADDQ $1, BX		// i++
   309  	
   310  E9:	CMPQ BX, R11
   311  	JL L9			// i < n-1
   312  
   313  	// i >= n-1
   314  X9a:	SHRQ CX, AX		// w1>>s
   315  	MOVQ AX, (R10)(R11*8)	// z[n-1] = w1>>s
   316  	RET
   317  
   318  X9b:	MOVQ $0, c+56(FP)	// empty vector: c = 0
   319  	RET
   320  
   321  
   322  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW computes z[i] = low word of (x[i]*y + carry) for
        // 0 <= i < len(z), where the carry starts as r and afterwards is
        // the high word of the previous step. The final carry is returned.
   323  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   324  	MOVQ z_len+8(FP), R11	// n = len(z)
   325  	MOVQ z+0(FP), R10	// R10 = &z[0]
   326  	MOVQ x+24(FP), R8	// R8 = &x[0]
   327  	MOVQ y+48(FP), R9	// R9 = y
   328  	MOVQ r+56(FP), CX	// carry = r
   329  	MOVQ $0, BX		// i = 0
   330  
   331  mulAddVWWloop:
   332  	CMPQ BX, R11
   333  	JGE mulAddVWWdone	// leave once i >= n
   334  	MOVQ (R8)(BX*8), AX	// AX = x[i]
   335  	MULQ R9			// DX:AX = x[i] * y
   336  	ADDQ CX, AX		// low word += carry
   337  	ADCQ $0, DX		// fold the overflow into the high word
   338  	MOVQ AX, (R10)(BX*8)	// z[i] = low word
   339  	MOVQ DX, CX		// carry = high word
   340  	ADDQ $1, BX		// i++
   341  	JMP mulAddVWWloop
   342  mulAddVWWdone:
   343  	MOVQ CX, c+64(FP)	// return carry
   344  	RET
   345  
   346  
   347  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW adds x*y into z in place: for 0 <= i < len(z) it forms
        // x[i]*y + carry, adds the low word to z[i], and carries the high
        // word (plus any overflow) into the next step. The carry starts at
        // 0 and its final value is returned.
   348  TEXT ·addMulVVW(SB),NOSPLIT,$0
   349  	MOVQ z_len+8(FP), R11	// n = len(z)
   350  	MOVQ z+0(FP), R10	// R10 = &z[0]
   351  	MOVQ x+24(FP), R8	// R8 = &x[0]
   352  	MOVQ y+48(FP), R9	// R9 = y
   353  	MOVQ $0, CX		// carry = 0
   354  	MOVQ $0, BX		// i = 0
   355  
   356  addMulVVWloop:
   357  	CMPQ BX, R11
   358  	JGE addMulVVWdone	// leave once i >= n
   359  	MOVQ (R8)(BX*8), AX	// AX = x[i]
   360  	MULQ R9			// DX:AX = x[i] * y
   361  	ADDQ CX, AX		// low word += carry
   362  	ADCQ $0, DX		// fold the overflow into the high word
   363  	ADDQ AX, (R10)(BX*8)	// z[i] += low word
   364  	ADCQ $0, DX		// fold that overflow in as well
   365  	MOVQ DX, CX		// carry = high word
   366  	ADDQ $1, BX		// i++
   367  	JMP addMulVVWloop
   368  addMulVVWdone:
   369  	MOVQ CX, c+56(FP)	// return carry
   370  	RET
   371  
   372  
   373  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        //
        // divWVW divides the vector x, extended above by the word xn, by the
        // single word y. Going from index len(z)-1 down to 0 it divides
        // r:x[i] by y, stores the quotient in z[i] and keeps the remainder
        // as the next r. r starts as xn and its final value is returned.
   374  TEXT ·divWVW(SB),NOSPLIT,$0
   375  	MOVQ z_len+8(FP), BX	// i = len(z)
   376  	MOVQ z+0(FP), R10	// R10 = &z[0]
   377  	MOVQ x+32(FP), R8	// R8 = &x[0]
   378  	MOVQ y+56(FP), R9	// R9 = y
   379  	MOVQ xn+24(FP), DX	// r = xn
   380  
   381  divWVWloop:
   382  	SUBQ $1, BX		// i--
   383  	JL divWVWdone		// i < 0: every word has been divided
   384  	MOVQ (R8)(BX*8), AX	// DX:AX = r:x[i]
   385  	DIVQ R9			// AX = quotient, DX = remainder
   386  	MOVQ AX, (R10)(BX*8)	// z[i] = quotient
   387  	JMP divWVWloop
   388  divWVWdone:
   389  	MOVQ DX, r+64(FP)	// return r
   390  	RET
   391  
   392  // func bitLen(x Word) (n int)
        //
        // bitLen returns the number of bits needed to represent x: the index
        // of the highest set bit plus one, or 0 when x is 0.
   393  TEXT ·bitLen(SB),NOSPLIT,$0
   394  	BSRQ x+0(FP), AX	// AX = index of the highest set bit; ZF = 1 if x == 0
   395  	JNE bitLenNonzero
   396  	MOVQ $0, n+8(FP)	// x == 0: n = 0
   397  	RET
   398  bitLenNonzero:
   399  	ADDQ $1, AX		// bit index -> bit count
   400  	MOVQ AX, n+8(FP)
   401  	RET