github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/math/big/arith_arm64.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // TODO: Consider re-implementing using Advanced SIMD
    13  // once the assembler supports those instructions.
    14  
    15  // func mulWW(x, y Word) (z1, z0 Word)
        // Full-width unsigned multiply of two words: the double-word product
        // x*y is returned as z1 (upper half) and z0 (lower half).
    16  TEXT ·mulWW(SB),NOSPLIT,$0
    17  	MOVD	y+8(FP), R5
    18  	MOVD	x+0(FP), R4
    19  	UMULH	R5, R4, R6	// R6 = upper 64 bits of x*y
    20  	MUL	R5, R4, R7	// R7 = lower 64 bits of x*y
    21  	MOVD	R7, z0+24(FP)
    22  	MOVD	R6, z1+16(FP)
    23  	RET
    24  
    25  
    26  // func addVV(z, x, y []Word) (c Word)
        // addVV computes z[i] = x[i] + y[i] with carry propagation for i in
        // [0, len(z)) and returns the carry out of the last word in c. Only len(z)
        // is consulted; x and y are read for that many words.
        // The count is consumed low bits first: one word if bit 0 is set, two words
        // if bit 1 is set, then four words per loop iteration. The carry lives in
        // the C flag the whole time, so everything between the ADCS steps (SUB,
        // TBZ, CBZ, loads and stores) is chosen because it does not modify flags.
    27  TEXT ·addVV(SB),NOSPLIT,$0
    28  	MOVD	z_len+8(FP), R0	// R0 = words left to process
    29  	MOVD	x+24(FP), R8	// R8 = read cursor into x
    30  	MOVD	y+48(FP), R9	// R9 = read cursor into y
    31  	MOVD	z+0(FP), R10	// R10 = write cursor into z
    32  	ADDS	$0, R0		// clear carry flag
    33  	TBZ	$0, R0, two	// even count: no single leading word
    34  	MOVD.P	8(R8), R11
    35  	MOVD.P	8(R9), R15
    36  	ADCS	R15, R11	// R11 = x[i] + y[i] + C
    37  	MOVD.P	R11, 8(R10)
    38  	SUB	$1, R0		// SUB (not SUBS) keeps the carry flag intact
    39  two:
    40  	TBZ	$1, R0, loop	// bit 1 clear: what remains is a multiple of 4
    41  	LDP.P	16(R8), (R11, R12)
    42  	LDP.P	16(R9), (R15, R16)
    43  	ADCS	R15, R11
    44  	ADCS	R16, R12
    45  	STP.P	(R11, R12), 16(R10)
    46  	SUB	$2, R0
    47  loop:
    48  	CBZ	R0, done	// careful not to touch the carry flag
    49  	LDP.P	32(R8), (R11, R12)	// first two x words; cursor moves past all four
    50  	LDP	-16(R8), (R13, R14)	// last two x words, addressed back from the moved cursor
    51  	LDP.P	32(R9), (R15, R16)	// same pattern for y
    52  	LDP	-16(R9), (R17, R19)
    53  	ADCS	R15, R11	// four-word carry chain, lowest word first
    54  	ADCS	R16, R12
    55  	ADCS	R17, R13
    56  	ADCS	R19, R14
    57  	STP.P	(R11, R12), 32(R10)
    58  	STP	(R13, R14), -16(R10)
    59  	SUB	$4, R0
    60  	B	loop
    61  done:
    62  	CSET	HS, R0		// extract carry flag
    63  	MOVD	R0, c+72(FP)
    64  	RET
    65  
    66  
    67  // func subVV(z, x, y []Word) (c Word)
        // subVV computes z[i] = x[i] - y[i] with borrow propagation for i in
        // [0, len(z)) and returns the final borrow (0 or 1) in c. Only len(z) is
        // consulted; x and y are read for that many words.
        // On ARM64 the C flag is the inverse of a borrow: C == 1 means "no borrow".
        // The routine therefore starts with C set and reports c = 1 when C ends up
        // clear. Between SBCS steps only instructions that leave the flags alone
        // (SUB, TBZ, CBZ, loads and stores) are used.
    68  TEXT ·subVV(SB),NOSPLIT,$0
    69  	MOVD	z_len+8(FP), R0	// R0 = words left to process
    70  	MOVD	x+24(FP), R8	// R8 = read cursor into x
    71  	MOVD	y+48(FP), R9	// R9 = read cursor into y
    72  	MOVD	z+0(FP), R10	// R10 = write cursor into z
    73  	CMP	R0, R0		// set carry flag
    74  	TBZ	$0, R0, two	// even count: no single leading word
    75  	MOVD.P	8(R8), R11
    76  	MOVD.P	8(R9), R15
    77  	SBCS	R15, R11	// R11 = x[i] - y[i] - borrow
    78  	MOVD.P	R11, 8(R10)
    79  	SUB	$1, R0		// SUB (not SUBS) keeps the carry flag intact
    80  two:
    81  	TBZ	$1, R0, loop	// bit 1 clear: what remains is a multiple of 4
    82  	LDP.P	16(R8), (R11, R12)
    83  	LDP.P	16(R9), (R15, R16)
    84  	SBCS	R15, R11
    85  	SBCS	R16, R12
    86  	STP.P	(R11, R12), 16(R10)
    87  	SUB	$2, R0
    88  loop:
    89  	CBZ	R0, done	// careful not to touch the carry flag
    90  	LDP.P	32(R8), (R11, R12)	// first two x words; cursor moves past all four
    91  	LDP	-16(R8), (R13, R14)	// last two x words, addressed back from the moved cursor
    92  	LDP.P	32(R9), (R15, R16)	// same pattern for y
    93  	LDP	-16(R9), (R17, R19)
    94  	SBCS	R15, R11	// four-word borrow chain, lowest word first
    95  	SBCS	R16, R12
    96  	SBCS	R17, R13
    97  	SBCS	R19, R14
    98  	STP.P	(R11, R12), 32(R10)
    99  	STP	(R13, R14), -16(R10)
   100  	SUB	$4, R0
   101  	B	loop
   102  done:
   103  	CSET	LO, R0		// extract carry flag
   104  	MOVD	R0, c+72(FP)
   105  	RET
   106  
        // vwOneOp processes a single word: it loads the word at R1 into R4
        // (advancing R1 by 8), executes "instr op1, R4", and stores R4 at R3
        // (advancing R3 by 8). R4 is clobbered; the flags change only as instr
        // dictates.
   107  #define vwOneOp(instr, op1)				\
   108  	MOVD.P	8(R1), R4;				\
   109  	instr	op1, R4;				\
   110  	MOVD.P	R4, 8(R3);
   111  
   112  // handle the first 1~4 elements before starting iteration in addVW/subVW
        // instr1 combines the first word with R2; instr2 is then applied with
        // operand $0 to up to three further words (the callers pass ADCS/SBCS, so
        // this propagates the carry). After each of the first three words, counter
        // is decremented and control jumps to target as soon as it reaches zero.
        // A counter that starts at 0 never reaches zero here, so all four words
        // are processed and execution falls out of the bottom of the macro.
        // Built on vwOneOp: R1 and R3 advance by 8 per word and R4 is clobbered.
   113  #define vwPreIter(instr1, instr2, counter, target)	\
   114  	vwOneOp(instr1, R2);				\
   115  	SUB	$1, counter;				\
   116  	CBZ	counter, target;			\
   117  	vwOneOp(instr2, $0);				\
   118  	SUB	$1, counter;				\
   119  	CBZ	counter, target;			\
   120  	vwOneOp(instr2, $0);				\
   121  	SUB	$1, counter;				\
   122  	CBZ	counter, target;			\
   123  	vwOneOp(instr2, $0);
   124  
   125  // do one iteration of add or sub in addVW/subVW
        // Branches to exit when counter is zero. Otherwise it loads four words
        // from R1 (R1 += 32), applies "instr $0, word" to each in ascending order
        // so the carry flag ripples through them, stores the four results at R3
        // (R3 += 32), and subtracts 4 from counter. Clobbers R4-R11. CBZ, the
        // loads/stores and SUB leave the flags alone, so the carry produced by the
        // last instr is still live when the macro is expanded again.
   126  #define vwOneIter(instr, counter, exit)	\
   127  	CBZ	counter, exit;		\	// careful not to touch the carry flag
   128  	LDP.P	32(R1), (R4, R5);	\
   129  	LDP	-16(R1), (R6, R7);	\
   130  	instr	$0, R4, R8;		\
   131  	instr	$0, R5, R9;		\
   132  	instr	$0, R6, R10;		\
   133  	instr	$0, R7, R11;		\
   134  	STP.P	(R8, R9), 32(R3);	\
   135  	STP	(R10, R11), -16(R3);	\
   136  	SUB	$4, counter;
   137  
   138  // do one iteration of copy in addVW/subVW
        // Branches to exit when counter is zero. Otherwise it copies four words
        // from R1 to R3 unchanged (both pointers advance by 32) and subtracts 4
        // from counter. Clobbers R4-R7; no instruction here modifies the flags.
   139  #define vwOneIterCopy(counter, exit)			\
   140  	CBZ	counter, exit;				\
   141  	LDP.P	32(R1), (R4, R5);			\
   142  	LDP	-16(R1), (R6, R7);			\
   143  	STP.P	(R4, R5), 32(R3);			\
   144  	STP	(R6, R7), -16(R3);			\
   145  	SUB	$4, counter;
   146  
   147  // func addVW(z, x []Word, y Word) (c Word)
        // addVW computes z = x + y, where the single word y is added at x[0] and the
        // carry is rippled through the remaining words; the final carry is returned
        // in c. When len(z) == 0 nothing is read or written and c = y.
   148  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   149  // and switches to copy if we are done with carries. The copying is skipped as well
   150  // if 'x' and 'z' happen to share the same underlying storage (equal data pointers).
   151  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   152  // so a threshold of 32 words is used and the small-size path is left free of those checks.
   153  TEXT ·addVW(SB),NOSPLIT,$0
   154  	MOVD	z+0(FP), R3	// R3 = write cursor into z
   155  	MOVD	z_len+8(FP), R0	// R0 = words remaining
   156  	MOVD	x+24(FP), R1	// R1 = read cursor into x
   157  	MOVD	y+48(FP), R2	// R2 = y; later reused for the returned carry
   158  	CMP	$32, R0
   159  	BGE	large		// large-sized 'z' and 'x'
   160  	CBZ	R0, len0	// the length of z is 0
   161  	MOVD.P	8(R1), R4
   162  	ADDS	R2, R4		// z[0] = x[0] + y, set carry
   163  	MOVD.P	R4, 8(R3)
   164  	SUB	$1, R0
   165  	CBZ	R0, len1	// the length of z is 1
   166  	TBZ	$0, R0, two
   167  	MOVD.P	8(R1), R4	// do it once
   168  	ADCS	$0, R4
   169  	MOVD.P	R4, 8(R3)
   170  	SUB	$1, R0
   171  two:				// do it twice
   172  	TBZ	$1, R0, loop
   173  	LDP.P	16(R1), (R4, R5)
   174  	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   175  	ADCS	$0, R5, R9
   176  	STP.P	(R8, R9), 16(R3)
   177  	SUB	$2, R0
   178  loop:				// do four times per round
   179  	vwOneIter(ADCS, R0, len1)
   180  	B	loop
   181  len1:
   182  	CSET	HS, R2		// extract carry flag
   183  len0:
   184  	MOVD	R2, c+56(FP)	// R2 is the carry (via len1) or still y (empty z)
   185  done:
   186  	RET
   187  large:
   188  	AND	$0x3, R0, R10	// R10 = len & 3: words handled by the unrolled prefix (0 means 4)
   189  	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
   190  	// unrolling for the first 1~4 elements to avoid saving the carry
   191  	// flag in each step, adjust $R0 if we unrolled 4 elements
   192  	vwPreIter(ADDS, ADCS, R10, add4)
   193  	SUB	$4, R0		// reached only when the prefix consumed a full group of 4
   194  add4:
   195  	BCC	copy		// carry clear: no further word can change, only copying remains
   196  	vwOneIter(ADCS, R0, len1)
   197  	B	add4
   198  copy:
   199  	MOVD	ZR, c+56(FP)	// the carry died out, so the result carry is 0
   200  	CMP	R1, R3		// both cursors advanced equally, so equal here means x and z start at the same address
   201  	BEQ	done
   202  copy_4:				// no carry flag, copy the rest
   203  	vwOneIterCopy(R0, done)
   204  	B	copy_4
   205  
   206  // func subVW(z, x []Word, y Word) (c Word)
        // subVW computes z = x - y, where the single word y is subtracted from x[0]
        // and the borrow is rippled through the remaining words; the final borrow
        // (0 or 1) is returned in c. When len(z) == 0 nothing is read or written
        // and c = y. On ARM64 a clear C flag means "borrow", a set C flag "no borrow".
   207  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   208  // and switches to copy if we are done with carries. The copying is skipped as well
   209  // if 'x' and 'z' happen to share the same underlying storage (equal data pointers).
   210  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   211  // so a threshold of 32 words is used and the small-size path is left free of those checks.
   212  TEXT ·subVW(SB),NOSPLIT,$0
   213  	MOVD	z+0(FP), R3	// R3 = write cursor into z
   214  	MOVD	z_len+8(FP), R0	// R0 = words remaining
   215  	MOVD	x+24(FP), R1	// R1 = read cursor into x
   216  	MOVD	y+48(FP), R2	// R2 = y; later reused for the returned borrow
   217  	CMP	$32, R0
   218  	BGE	large		// large-sized 'z' and 'x'
   219  	CBZ	R0, len0	// the length of z is 0
   220  	MOVD.P	8(R1), R4
   221  	SUBS	R2, R4		// z[0] = x[0] - y, set carry
   222  	MOVD.P	R4, 8(R3)
   223  	SUB	$1, R0
   224  	CBZ	R0, len1	// the length of z is 1
   225  	TBZ	$0, R0, two	// do it once
   226  	MOVD.P	8(R1), R4
   227  	SBCS	$0, R4
   228  	MOVD.P	R4, 8(R3)
   229  	SUB	$1, R0
   230  two:				// do it twice
   231  	TBZ	$1, R0, loop
   232  	LDP.P	16(R1), (R4, R5)
   233  	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
   234  	SBCS	$0, R5, R9
   235  	STP.P	(R8, R9), 16(R3)
   236  	SUB	$2, R0
   237  loop:				// do four times per round
   238  	vwOneIter(SBCS, R0, len1)
   239  	B	loop
   240  len1:
   241  	CSET	LO, R2		// extract carry flag
   242  len0:
   243  	MOVD	R2, c+56(FP)	// R2 is the borrow (via len1) or still y (empty z)
   244  done:
   245  	RET
   246  large:
   247  	AND	$0x3, R0, R10	// R10 = len & 3: words handled by the unrolled prefix (0 means 4)
   248  	AND	$~0x3, R0	// R0 = len rounded down to a multiple of 4
   249  	// unrolling for the first 1~4 elements to avoid saving the carry
   250  	// flag in each step, adjust $R0 if we unrolled 4 elements
   251  	vwPreIter(SUBS, SBCS, R10, sub4)
   252  	SUB	$4, R0		// reached only when the prefix consumed a full group of 4
   253  sub4:
   254  	BCS	copy		// carry set: no borrow pending, only copying remains
   255  	vwOneIter(SBCS, R0, len1)
   256  	B	sub4
   257  copy:
   258  	MOVD	ZR, c+56(FP)	// the borrow died out, so the result borrow is 0
   259  	CMP	R1, R3		// both cursors advanced equally, so equal here means x and z start at the same address
   260  	BEQ	done
   261  copy_4:				// no carry flag, copy the rest
   262  	vwOneIterCopy(R0, done)
   263  	B	copy_4
   264  
   265  // func shlVU(z, x []Word, s uint) (c Word)
        // shlVU computes z = x << s across len(z) words and returns in c the bits
        // shifted out of the most significant word, x[n-1] >> (64 - s).
        // With len(z) == 0 it returns 0. With s == 0 it copies x to z (the copy is
        // skipped when both slices end at the same address) and returns 0.
        // NOTE(review): the shift path appears to rely on 0 < s < 64 — confirm with callers.
   266  // This implementation handles the shift operation from the high word to the low word,
   267  // which may be an error for the case where the low word of x overlaps with the high
   268  // word of z. When calling this function directly, you need to pay attention to this
   269  // situation.
   270  TEXT ·shlVU(SB),NOSPLIT,$0
   271  	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
   272  	MOVD	x+24(FP), R2
   273  	MOVD	s+48(FP), R3
   274  	ADD	R1<<3, R0	// R0 = &z[n]
   275  	ADD	R1<<3, R2	// R2 = &x[n]
   276  	CBZ	R1, len0
   277  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   278  	MOVD	$64, R4
   279  	SUB	R3, R4		// R4 = 64 - s
   280  	// handling the most significant element x[n-1]
   281  	MOVD.W	-8(R2), R6
   282  	LSR	R4, R6, R5	// return value
   283  	LSL	R3, R6, R8	// x[i] << s
   284  	SUB	$1, R1		// R8 now holds the upper part of a z word that is still waiting for its lower bits
   285  one:	TBZ	$0, R1, two
   286  	MOVD.W	-8(R2), R6	// next lower x word
   287  	LSR	R4, R6, R7
   288  	ORR	R8, R7		// complete the pending z word with this word's top bits
   289  	LSL	R3, R6, R8	// start the next pending z word
   290  	SUB	$1, R1
   291  	MOVD.W	R7, -8(R0)
   292  two:
   293  	TBZ	$1, R1, loop
   294  	LDP.W	-16(R2), (R6, R7)	// R7 is the higher of the two words
   295  	LSR	R4, R7, R10
   296  	ORR	R8, R10
   297  	LSL	R3, R7
   298  	LSR	R4, R6, R9
   299  	ORR	R7, R9
   300  	LSL	R3, R6, R8
   301  	SUB	$2, R1
   302  	STP.W	(R9, R10), -16(R0)
   303  loop:
   304  	CBZ	R1, done
   305  	LDP.W	-32(R2), (R10, R11)	// lower two of the next four words; R2 steps back by 32
   306  	LDP	16(R2), (R12, R13)	// upper two; R13 is the highest
   307  	LSR	R4, R13, R23
   308  	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
   309  	LSL	R3, R13
   310  	LSR	R4, R12, R22
   311  	ORR	R13, R22
   312  	LSL	R3, R12
   313  	LSR	R4, R11, R21
   314  	ORR	R12, R21
   315  	LSL	R3, R11
   316  	LSR	R4, R10, R20
   317  	ORR	R11, R20
   318  	LSL	R3, R10, R8	// pending upper part carried into the next iteration
   319  	STP.W	(R20, R21), -32(R0)
   320  	STP	(R22, R23), 16(R0)
   321  	SUB	$4, R1
   322  	B	loop
   323  done:
   324  	MOVD.W	R8, -8(R0)	// z[0] = x[0] << s, the last pending word
   325  	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
   326  	RET
   327  copy:
   328  	CMP	R0, R2		// identical end pointers: source and destination coincide, nothing to copy
   329  	BEQ	len0
   330  	TBZ	$0, R1, ctwo	// copy from the high end down: 1 word, then 2, then 4 per round
   331  	MOVD.W	-8(R2), R4
   332  	MOVD.W	R4, -8(R0)
   333  	SUB	$1, R1
   334  ctwo:
   335  	TBZ	$1, R1, cloop
   336  	LDP.W	-16(R2), (R4, R5)
   337  	STP.W	(R4, R5), -16(R0)
   338  	SUB	$2, R1
   339  cloop:
   340  	CBZ	R1, len0
   341  	LDP.W	-32(R2), (R4, R5)
   342  	LDP	16(R2), (R6, R7)
   343  	STP.W	(R4, R5), -32(R0)
   344  	STP	(R6, R7), 16(R0)
   345  	SUB	$4, R1
   346  	B	cloop
   347  len0:
   348  	MOVD	$0, c+56(FP)
   349  	RET
   350  
   351  // func shrVU(z, x []Word, s uint) (c Word)
        // shrVU computes z = x >> s across len(z) words and returns in c the bits
        // shifted out of the least significant word, x[0] << (64 - s).
        // With len(z) == 0 it returns 0. With s == 0 it copies x to z (the copy is
        // skipped when both slices start at the same address) and returns 0.
        // NOTE(review): the shift path appears to rely on 0 < s < 64 — confirm with callers.
   352  // This implementation handles the shift operation from the low word to the high word,
   353  // which may be an error for the case where the high word of x overlaps with the low
   354  // word of z. When calling this function directly, you need to pay attention to this
   355  // situation.
   356  TEXT ·shrVU(SB),NOSPLIT,$0
   357  	MOVD	z+0(FP), R0	// R0 = write cursor into z
   358  	MOVD	z_len+8(FP), R1	// R1 = words remaining
   359  	MOVD	x+24(FP), R2	// R2 = read cursor into x
   360  	MOVD	s+48(FP), R3
   361  	MOVD	$0, R8		// R8 carries the pending low part of a z word; it is rewritten before first use on the shift path
   362  	MOVD	$64, R4
   363  	SUB	R3, R4		// R4 = 64 - s
   364  	CBZ	R1, len0
   365  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   366  
   367  	MOVD.P	8(R2), R20
   368  	LSR	R3, R20, R8	// pending low part of z[0]
   369  	LSL	R4, R20		// bits shifted out of x[0]
   370  	MOVD	R20, c+56(FP)	// deal with the first element
   371  	SUB	$1, R1
   372  
   373  	TBZ	$0, R1, two
   374  	MOVD.P	8(R2), R6	// next higher x word
   375  	LSL	R4, R6, R20
   376  	ORR	R8, R20		// complete the pending z word with this word's low bits
   377  	LSR	R3, R6, R8	// start the next pending z word
   378  	MOVD.P	R20, 8(R0)
   379  	SUB	$1, R1
   380  two:
   381  	TBZ	$1, R1, loop
   382  	LDP.P	16(R2), (R6, R7)
   383  	LSL	R4, R6, R20
   384  	LSR	R3, R6
   385  	ORR	R8, R20
   386  	LSL	R4, R7, R21
   387  	LSR	R3, R7, R8
   388  	ORR	R6, R21
   389  	STP.P	(R20, R21), 16(R0)
   390  	SUB	$2, R1
   391  loop:
   392  	CBZ	R1, done
   393  	LDP.P	32(R2), (R10, R11)	// lower two of the next four words; R2 moves past all four
   394  	LDP	-16(R2), (R12, R13)	// upper two, addressed back from the moved cursor
   395  	LSL	R4, R10, R20
   396  	LSR	R3, R10
   397  	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   398  	LSL	R4, R11, R21
   399  	LSR	R3, R11
   400  	ORR	R10, R21
   401  	LSL	R4, R12, R22
   402  	LSR	R3, R12
   403  	ORR	R11, R22
   404  	LSL	R4, R13, R23
   405  	LSR	R3, R13, R8	// pending low part carried into the next iteration
   406  	ORR	R12, R23
   407  	STP.P	(R20, R21), 32(R0)
   408  	STP	(R22, R23), -16(R0)
   409  	SUB	$4, R1
   410  	B	loop
   411  done:
   412  	MOVD	R8, (R0)	// deal with the last element
   413  	RET
   414  copy:
   415  	CMP	R0, R2		// identical start pointers: source and destination coincide, nothing to copy
   416  	BEQ	len0
   417  	TBZ	$0, R1, ctwo	// copy from the low end up: 1 word, then 2, then 4 per round
   418  	MOVD.P	8(R2), R3
   419  	MOVD.P	R3, 8(R0)
   420  	SUB	$1, R1
   421  ctwo:
   422  	TBZ	$1, R1, cloop
   423  	LDP.P	16(R2), (R4, R5)
   424  	STP.P	(R4, R5), 16(R0)
   425  	SUB	$2, R1
   426  cloop:
   427  	CBZ	R1, len0
   428  	LDP.P	32(R2), (R4, R5)
   429  	LDP	-16(R2), (R6, R7)
   430  	STP.P	(R4, R5), 32(R0)
   431  	STP	(R6, R7), -16(R0)
   432  	SUB	$4, R1
   433  	B	cloop
   434  len0:
   435  	MOVD	$0, c+56(FP)
   436  	RET
   437  
   438  
   439  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // mulAddVWW computes z = x*y + r over len(z) words, where y and r are single
        // words, and returns the most significant (overflow) word in c.
        // R4 is the running carry word: it starts as r, and after each group it
        // holds the high half of the last product plus the carry out of the group.
        // With len(z) == 0 nothing is read or written and c = r.
        // MUL/UMULH do not touch the flags, so they can sit between the ADDS/ADCS
        // steps of a carry chain.
   440  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   441  	MOVD	z+0(FP), R1	// R1 = write cursor into z
   442  	MOVD	z_len+8(FP), R0	// R0 = words remaining
   443  	MOVD	x+24(FP), R2	// R2 = read cursor into x
   444  	MOVD	y+48(FP), R3	// R3 = multiplier y
   445  	MOVD	r+56(FP), R4	// R4 = carry-in word
   446  	// c, z = x * y + r
   447  	TBZ	$0, R0, two
   448  	MOVD.P	8(R2), R5
   449  	MUL	R3, R5, R7	// low half of x[i]*y
   450  	UMULH	R3, R5, R8	// high half of x[i]*y
   451  	ADDS	R4, R7
   452  	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
   453  	MOVD.P	R7, 8(R1)
   454  	SUB	$1, R0
   455  two:
   456  	TBZ	$1, R0, loop
   457  	LDP.P	16(R2), (R5, R6)
   458  	MUL	R3, R5, R10
   459  	UMULH	R3, R5, R11
   460  	ADDS	R4, R10		// z[i] = lo0 + carry-in
   461  	MUL	R3, R6, R12
   462  	UMULH	R3, R6, R13
   463  	ADCS	R12, R11	// z[i+1] = hi0 + lo1 + C
   464  	ADC	$0, R13, R4	// carry-out = hi1 + C
   465  
   466  	STP.P	(R10, R11), 16(R1)
   467  	SUB	$2, R0
   468  loop:
   469  	CBZ	R0, done
   470  	LDP.P	32(R2), (R5, R6)
   471  	LDP	-16(R2), (R7, R8)
   472  
   473  	MUL	R3, R5, R10
   474  	UMULH	R3, R5, R11
   475  	ADDS	R4, R10		// z[i] = lo0 + carry-in
   476  	MUL	R3, R6, R12
   477  	UMULH	R3, R6, R13
   478  	ADCS	R11, R12	// z[i+1] = lo1 + hi0 + C
   479  
   480  	MUL	R3, R7, R14
   481  	UMULH	R3, R7, R15
   482  	ADCS	R13, R14	// z[i+2] = lo2 + hi1 + C
   483  	MUL	R3, R8, R16
   484  	UMULH	R3, R8, R17
   485  	ADCS	R15, R16	// z[i+3] = lo3 + hi2 + C
   486  	ADC	$0, R17, R4	// carry-out = hi3 + C
   487  
   488  	STP.P	(R10, R12), 32(R1)
   489  	STP	(R14, R16), -16(R1)
   490  	SUB	$4, R0
   491  	B	loop
   492  done:
   493  	MOVD	R4, c+64(FP)
   494  	RET
   495  
   496  
   497  // func addMulVVW(z, x []Word, y Word) (c Word)
        // addMulVVW computes z += x*y over len(z) words, where y is a single word,
        // and returns the most significant (overflow) word in c.
        // R4 is the running carry word and starts at 0; with len(z) == 0 nothing is
        // read or written and c = 0. Each group runs two carry chains: one folds in
        // the incoming carry word and the product halves that align one word up,
        // the other folds in the remaining product halves. MUL/UMULH, loads and
        // stores do not touch the flags, so they are interleaved with the chains.
   498  TEXT ·addMulVVW(SB),NOSPLIT,$0
   499  	MOVD	z+0(FP), R1	// R1 = read/write cursor into z
   500  	MOVD	z_len+8(FP), R0	// R0 = words remaining
   501  	MOVD	x+24(FP), R2	// R2 = read cursor into x
   502  	MOVD	y+48(FP), R3	// R3 = multiplier y
   503  	MOVD	$0, R4		// R4 = running carry word
   504  
   505  	TBZ	$0, R0, two
   506  
   507  	MOVD.P	8(R2), R5
   508  	MOVD	(R1), R6
   509  
   510  	MUL	R5, R3, R7
   511  	UMULH	R5, R3, R8
   512  
   513  	ADDS	R7, R6		// z[i] += low half of x[i]*y
   514  	ADC	$0, R8, R4	// carry = high half + C
   515  
   516  	MOVD.P	R6, 8(R1)
   517  	SUB	$1, R0
   518  
   519  two:
   520  	TBZ	$1, R0, loop
   521  
   522  	LDP.P	16(R2), (R5, R10)
   523  	LDP	(R1), (R6, R11)
   524  
   525  	MUL	R10, R3, R13	// lo1
   526  	UMULH	R10, R3, R12	// hi1
   527  
   528  	MUL	R5, R3, R7	// lo0
   529  	UMULH	R5, R3, R8	// hi0
   530  
   531  	ADDS	R4, R6		// first chain: z[i] += carry-in
   532  	ADCS	R13, R11	// z[i+1] += lo1 + C
   533  	ADC	$0, R12		// hi1 += C
   534  
   535  	ADDS	R7, R6		// second chain: z[i] += lo0
   536  	ADCS	R8, R11		// z[i+1] += hi0 + C
   537  	ADC	$0, R12, R4	// carry-out = hi1 + C
   538  
   539  	STP.P	(R6, R11), 16(R1)
   540  	SUB	$2, R0
   541  
   542  // The main loop of this code operates on a block of 4 words every iteration
   543  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   544  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   545  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
   546  loop:
   547  	CBZ	R0, done
   548  
   549  	LDP.P	16(R2), (R5, R6)
   550  	LDP.P	16(R2), (R7, R8)
   551  
   552  	LDP	(R1), (R9, R10)
   553  	ADDS	R4, R9		// first chain: carry-in plus the low halves of x[1..3]*y
   554  	MUL	R6, R3, R14
   555  	ADCS	R14, R10
   556  	MUL	R7, R3, R15
   557  	LDP	16(R1), (R11, R12)
   558  	ADCS	R15, R11
   559  	MUL	R8, R3, R16
   560  	ADCS	R16, R12
   561  	UMULH	R8, R3, R20
   562  	ADC	$0, R20		// R20 = high half of x[3]*y + C
   563  
   564  	MUL	R5, R3, R13
   565  	ADDS	R13, R9		// second chain: low half of x[0]*y plus the high halves of x[0..2]*y
   566  	UMULH	R5, R3, R17
   567  	ADCS	R17, R10
   568  	UMULH	R6, R3, R21
   569  	STP.P	(R9, R10), 16(R1)
   570  	ADCS	R21, R11
   571  	UMULH	R7, R3, R19
   572  	ADCS	R19, R12
   573  	STP.P	(R11, R12), 16(R1)
   574  	ADC	$0, R20, R4	// carry-out for the next block
   575  
   576  	SUB	$4, R0
   577  	B	loop
   578  
   579  done:
   580  	MOVD	R4, c+56(FP)
   581  	RET
   582  
   583