github.com/twelsh-aw/go/src@v0.0.0-20230516233729-a56fe86a7c81/math/big/arith_arm64.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go
     6  // +build !math_big_pure_go
     7  
     8  #include "textflag.h"
     9  
    10  // This file provides fast assembly versions for the elementary
    11  // arithmetic operations on vectors implemented in arith.go.
    12  
    13  // TODO: Consider re-implementing using Advanced SIMD
    14  // once the assembler supports those instructions.
    15  
    16  // func addVV(z, x, y []Word) (c Word)
        // Computes z[i] = x[i] + y[i] + carry for every i in [0, len(z)) and returns
        // the final carry (0 or 1) in c. Only len(z) is read; x and y are walked for
        // that same number of words.
        // Registers: R0 = words left, R8 = x cursor, R9 = y cursor, R10 = z cursor.
        // The carry is kept in the CPU carry flag between the ADCS instructions, so
        // everything executed in between (SUB, TBZ, CBZ, loads, stores) must leave
        // the flags untouched.
    17  TEXT ·addVV(SB),NOSPLIT,$0
    18  	MOVD	z_len+8(FP), R0
    19  	MOVD	x+24(FP), R8
    20  	MOVD	y+48(FP), R9
    21  	MOVD	z+0(FP), R10
    22  	ADDS	$0, R0		// clear carry flag
    23  	TBZ	$0, R0, two	// bit 0 of the count is clear: skip the single-word step
    24  	MOVD.P	8(R8), R11
    25  	MOVD.P	8(R9), R15
    26  	ADCS	R15, R11
    27  	MOVD.P	R11, 8(R10)
    28  	SUB	$1, R0
    29  two:	// R0 is even here
    30  	TBZ	$1, R0, loop	// bit 1 of the count is clear: skip the two-word step
    31  	LDP.P	16(R8), (R11, R12)
    32  	LDP.P	16(R9), (R15, R16)
    33  	ADCS	R15, R11
    34  	ADCS	R16, R12
    35  	STP.P	(R11, R12), 16(R10)
    36  	SUB	$2, R0
    37  loop:	// R0 is a multiple of 4 here; four words per round
    38  	CBZ	R0, done	// careful not to touch the carry flag
    39  	LDP.P	32(R8), (R11, R12)
    40  	LDP	-16(R8), (R13, R14)	// R8 has already advanced by 32: this is the second pair
    41  	LDP.P	32(R9), (R15, R16)
    42  	LDP	-16(R9), (R17, R19)	// second pair of y, same addressing trick
    43  	ADCS	R15, R11
    44  	ADCS	R16, R12
    45  	ADCS	R17, R13
    46  	ADCS	R19, R14
    47  	STP.P	(R11, R12), 32(R10)
    48  	STP	(R13, R14), -16(R10)
    49  	SUB	$4, R0
    50  	B	loop
    51  done:
    52  	CSET	HS, R0		// extract carry flag
    53  	MOVD	R0, c+72(FP)
    54  	RET
    55  
    56  
    57  // func subVV(z, x, y []Word) (c Word)
        // Computes z[i] = x[i] - y[i] - borrow for every i in [0, len(z)) and returns
        // the final borrow (0 or 1) in c. Only len(z) is read; x and y are walked for
        // that same number of words.
        // Registers: R0 = words left, R8 = x cursor, R9 = y cursor, R10 = z cursor.
        // The borrow is kept in the CPU carry flag in inverted form (carry set means
        // "no borrow"), so everything executed between the SBCS instructions (SUB,
        // TBZ, CBZ, loads, stores) must leave the flags untouched.
    58  TEXT ·subVV(SB),NOSPLIT,$0
    59  	MOVD	z_len+8(FP), R0
    60  	MOVD	x+24(FP), R8
    61  	MOVD	y+48(FP), R9
    62  	MOVD	z+0(FP), R10
    63  	CMP	R0, R0		// set carry flag, i.e. start with no borrow
    64  	TBZ	$0, R0, two	// bit 0 of the count is clear: skip the single-word step
    65  	MOVD.P	8(R8), R11
    66  	MOVD.P	8(R9), R15
    67  	SBCS	R15, R11
    68  	MOVD.P	R11, 8(R10)
    69  	SUB	$1, R0
    70  two:	// R0 is even here
    71  	TBZ	$1, R0, loop	// bit 1 of the count is clear: skip the two-word step
    72  	LDP.P	16(R8), (R11, R12)
    73  	LDP.P	16(R9), (R15, R16)
    74  	SBCS	R15, R11
    75  	SBCS	R16, R12
    76  	STP.P	(R11, R12), 16(R10)
    77  	SUB	$2, R0
    78  loop:	// R0 is a multiple of 4 here; four words per round
    79  	CBZ	R0, done	// careful not to touch the carry flag
    80  	LDP.P	32(R8), (R11, R12)
    81  	LDP	-16(R8), (R13, R14)	// R8 has already advanced by 32: this is the second pair
    82  	LDP.P	32(R9), (R15, R16)
    83  	LDP	-16(R9), (R17, R19)	// second pair of y, same addressing trick
    84  	SBCS	R15, R11
    85  	SBCS	R16, R12
    86  	SBCS	R17, R13
    87  	SBCS	R19, R14
    88  	STP.P	(R11, R12), 32(R10)
    89  	STP	(R13, R14), -16(R10)
    90  	SUB	$4, R0
    91  	B	loop
    92  done:
    93  	CSET	LO, R0		// c = 1 when the carry flag is clear, i.e. a borrow is outstanding
    94  	MOVD	R0, c+72(FP)
    95  	RET
    96  
        // vwOneOp processes a single word for addVW/subVW: it loads the word at (R1)
        // into R4, executes `instr op1, R4`, and stores R4 to (R3). R1 and R3 are
        // both post-incremented by 8 bytes. R4 is clobbered.
    97  #define vwOneOp(instr, op1)				\
    98  	MOVD.P	8(R1), R4;				\
    99  	instr	op1, R4;				\
   100  	MOVD.P	R4, 8(R3);
   101  
   102  // handle the first 1~4 elements before starting iteration in addVW/subVW
        // instr1 is applied to the first element with R2 as its operand; instr2 is
        // applied with $0 to each following element. After each of the first three
        // elements, counter is decremented and control jumps to target as soon as it
        // reaches zero. If counter starts at 0 the decrements wrap around, none of
        // the CBZs fire, and all four elements are processed before the macro falls
        // through to the code that follows it.
        // counter is modified; R4 is clobbered through vwOneOp.
   103  #define vwPreIter(instr1, instr2, counter, target)	\
   104  	vwOneOp(instr1, R2);				\
   105  	SUB	$1, counter;				\
   106  	CBZ	counter, target;			\
   107  	vwOneOp(instr2, $0);				\
   108  	SUB	$1, counter;				\
   109  	CBZ	counter, target;			\
   110  	vwOneOp(instr2, $0);				\
   111  	SUB	$1, counter;				\
   112  	CBZ	counter, target;			\
   113  	vwOneOp(instr2, $0);
   114  
   115  // do one iteration of add or sub in addVW/subVW
        // Jumps to exit when counter is zero. Otherwise loads four words from (R1),
        // executes `instr $0, src, dst` on each of them in order (R4->R8, R5->R9,
        // R6->R10, R7->R11), stores the four results to (R3), advances R1 and R3 by
        // 32 bytes and subtracts 4 from counter. Clobbers R4-R11.
        // Only instr writes the flags here, so with ADCS/SBCS the carry/borrow passes
        // through the surrounding CBZ/LDP/STP/SUB instructions unchanged.
   116  #define vwOneIter(instr, counter, exit)	\
   117  	CBZ	counter, exit;		\	// careful not to touch the carry flag
   118  	LDP.P	32(R1), (R4, R5);	\
   119  	LDP	-16(R1), (R6, R7);	\
   120  	instr	$0, R4, R8;		\
   121  	instr	$0, R5, R9;		\
   122  	instr	$0, R6, R10;		\
   123  	instr	$0, R7, R11;		\
   124  	STP.P	(R8, R9), 32(R3);	\
   125  	STP	(R10, R11), -16(R3);	\
   126  	SUB	$4, counter;
   127  
   128  // do one iteration of copy in addVW/subVW
        // Jumps to exit when counter is zero. Otherwise copies four words from (R1)
        // to (R3) without any arithmetic, advances both pointers by 32 bytes and
        // subtracts 4 from counter. Clobbers R4-R7.
   129  #define vwOneIterCopy(counter, exit)			\
   130  	CBZ	counter, exit;				\
   131  	LDP.P	32(R1), (R4, R5);			\
   132  	LDP	-16(R1), (R6, R7);			\
   133  	STP.P	(R4, R5), 32(R3);			\
   134  	STP	(R6, R7), -16(R3);			\
   135  	SUB	$4, counter;
   136  
   137  // func addVW(z, x []Word, y Word) (c Word)
   138  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   139  // and switches to copy if we are done with carries. The copying is skipped as well
   140  // if 'x' and 'z' happen to share the same underlying storage.
   141  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   142  // so a threshold of 32 is set, and the small-sized path is left entirely untouched.
        //
        // Computes z = x + y over len(z) words, where y is a single word added at
        // z[0] and then propagated as a carry; c receives the final carry.
        // Registers: R0 = words left, R1 = x cursor, R3 = z cursor,
        // R2 = y on entry and the value stored to c on the small path.
   143  TEXT ·addVW(SB),NOSPLIT,$0
   144  	MOVD	z+0(FP), R3
   145  	MOVD	z_len+8(FP), R0
   146  	MOVD	x+24(FP), R1
   147  	MOVD	y+48(FP), R2
   148  	CMP	$32, R0
   149  	BGE	large		// large-sized 'z' and 'x'
   150  	CBZ	R0, len0	// the length of z is 0
   151  	MOVD.P	8(R1), R4
   152  	ADDS	R2, R4		// z[0] = x[0] + y, set carry
   153  	MOVD.P	R4, 8(R3)
   154  	SUB	$1, R0
   155  	CBZ	R0, len1	// the length of z is 1
   156  	TBZ	$0, R0, two	// even number of words left: skip the single step
   157  	MOVD.P	8(R1), R4	// do it once
   158  	ADCS	$0, R4
   159  	MOVD.P	R4, 8(R3)
   160  	SUB	$1, R0
   161  two:				// do it twice
   162  	TBZ	$1, R0, loop	// remaining count is a multiple of 4: skip the pair step
   163  	LDP.P	16(R1), (R4, R5)
   164  	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   165  	ADCS	$0, R5, R9
   166  	STP.P	(R8, R9), 16(R3)
   167  	SUB	$2, R0
   168  loop:				// do four times per round
   169  	vwOneIter(ADCS, R0, len1)
   170  	B	loop
   171  len1:
   172  	CSET	HS, R2		// extract carry flag
   173  len0:				// reached directly when len(z) == 0: R2 still holds y, so c = y
   174  	MOVD	R2, c+56(FP)
   175  done:
   176  	RET
   177  large:
   178  	AND	$0x3, R0, R10	// R10 = len(z) % 4
   179  	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
   180  	// unrolling for the first 1~4 elements to avoid saving the carry
   181  	// flag in each step, adjust $R0 if we unrolled 4 elements
   182  	vwPreIter(ADDS, ADCS, R10, add4)
   183  	SUB	$4, R0	// only reached when len(z) % 4 == 0, where vwPreIter consumed four words
   184  add4:
   185  	BCC	copy	// carry clear: no more additions needed, the rest is a plain copy
        	// If the words run out while the carry is still set, len1 returns it.
   186  	vwOneIter(ADCS, R0, len1)
   187  	B	add4
   188  copy:
   189  	MOVD	ZR, c+56(FP)	// the carry died out, so the result carry is 0
   190  	CMP	R1, R3
   191  	BEQ	done	// x and z cursors are equal: same storage, nothing to copy
   192  copy_4:				// no carry flag, copy the rest
   193  	vwOneIterCopy(R0, done)
   194  	B	copy_4
   195  
   196  // func subVW(z, x []Word, y Word) (c Word)
   197  // The 'large' branch handles large 'z'. It checks the carry flag on every iteration
   198  // and switches to copy if we are done with carries. The copying is skipped as well
   199  // if 'x' and 'z' happen to share the same underlying storage.
   200  // The overhead of the checking and branching is visible when 'z' is small (~5%),
   201  // so a threshold of 32 is set, and the small-sized path is left entirely untouched.
        //
        // Computes z = x - y over len(z) words, where y is a single word subtracted
        // at z[0] and then propagated as a borrow; c receives the final borrow.
        // The borrow is held in the carry flag in inverted form (carry set means
        // "no borrow").
        // Registers: R0 = words left, R1 = x cursor, R3 = z cursor,
        // R2 = y on entry and the value stored to c on the small path.
   202  TEXT ·subVW(SB),NOSPLIT,$0
   203  	MOVD	z+0(FP), R3
   204  	MOVD	z_len+8(FP), R0
   205  	MOVD	x+24(FP), R1
   206  	MOVD	y+48(FP), R2
   207  	CMP	$32, R0
   208  	BGE	large		// large-sized 'z' and 'x'
   209  	CBZ	R0, len0	// the length of z is 0
   210  	MOVD.P	8(R1), R4
   211  	SUBS	R2, R4		// z[0] = x[0] - y, set carry
   212  	MOVD.P	R4, 8(R3)
   213  	SUB	$1, R0
   214  	CBZ	R0, len1	// the length of z is 1
   215  	TBZ	$0, R0, two	// even number of words left: skip the single step
   216  	MOVD.P	8(R1), R4	// do it once
   217  	SBCS	$0, R4
   218  	MOVD.P	R4, 8(R3)
   219  	SUB	$1, R0
   220  two:				// do it twice
   221  	TBZ	$1, R0, loop	// remaining count is a multiple of 4: skip the pair step
   222  	LDP.P	16(R1), (R4, R5)
   223  	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
   224  	SBCS	$0, R5, R9
   225  	STP.P	(R8, R9), 16(R3)
   226  	SUB	$2, R0
   227  loop:				// do four times per round
   228  	vwOneIter(SBCS, R0, len1)
   229  	B	loop
   230  len1:
   231  	CSET	LO, R2		// c = 1 when the carry flag is clear, i.e. a borrow is outstanding
   232  len0:				// reached directly when len(z) == 0: R2 still holds y, so c = y
   233  	MOVD	R2, c+56(FP)
   234  done:
   235  	RET
   236  large:
   237  	AND	$0x3, R0, R10	// R10 = len(z) % 4
   238  	AND	$~0x3, R0	// R0 = len(z) rounded down to a multiple of 4
   239  	// unrolling for the first 1~4 elements to avoid saving the carry
   240  	// flag in each step, adjust $R0 if we unrolled 4 elements
   241  	vwPreIter(SUBS, SBCS, R10, sub4)
   242  	SUB	$4, R0	// only reached when len(z) % 4 == 0, where vwPreIter consumed four words
   243  sub4:
   244  	BCS	copy	// carry set: no borrow outstanding, the rest is a plain copy
        	// If the words run out while a borrow is still pending, len1 returns it.
   245  	vwOneIter(SBCS, R0, len1)
   246  	B	sub4
   247  copy:
   248  	MOVD	ZR, c+56(FP)	// the borrow died out, so the result borrow is 0
   249  	CMP	R1, R3
   250  	BEQ	done	// x and z cursors are equal: same storage, nothing to copy
   251  copy_4:				// no carry flag, copy the rest
   252  	vwOneIterCopy(R0, done)
   253  	B	copy_4
   254  
   255  // func shlVU(z, x []Word, s uint) (c Word)
   256  // This implementation handles the shift operation from the high word to the low word,
   257  // which may be an error for the case where the low word of x overlaps with the high
   258  // word of z. When calling this function directly, you need to pay attention to this
   259  // situation.
        //
        // Registers: R0 = z cursor and R2 = x cursor, both starting one past the last
        // word and moving downwards; R1 = words left; R3 = s; R4 = 64 - s;
        // R5 = bits shifted out of x[n-1], returned as c;
        // R8 = (most recently loaded word) << s, waiting to be combined with the top
        // bits of the next lower word.
        // When s == 0 the words are copied unchanged (or left alone if the two cursors
        // are equal) and c = 0. When len(z) == 0, c = 0.
   260  TEXT ·shlVU(SB),NOSPLIT,$0
   261  	LDP	z+0(FP), (R0, R1)	// R0 = z.ptr, R1 = len(z)
   262  	MOVD	x+24(FP), R2
   263  	MOVD	s+48(FP), R3
   264  	ADD	R1<<3, R0	// R0 = &z[n]
   265  	ADD	R1<<3, R2	// R2 = &x[n]
   266  	CBZ	R1, len0	// empty z: nothing to shift, c = 0
   267  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   268  	MOVD	$64, R4
   269  	SUB	R3, R4	// R4 = 64 - s
   270  	// handling the most significant element x[n-1]
   271  	MOVD.W	-8(R2), R6
   272  	LSR	R4, R6, R5	// return value
   273  	LSL	R3, R6, R8	// x[i] << s
   274  	SUB	$1, R1
   275  one:	TBZ	$0, R1, two	// even number of words left: skip the single step
   276  	MOVD.W	-8(R2), R6
   277  	LSR	R4, R6, R7
   278  	ORR	R8, R7	// combine with the pending high part from the word above
   279  	LSL	R3, R6, R8	// new pending high part
   280  	SUB	$1, R1
   281  	MOVD.W	R7, -8(R0)
   282  two:
   283  	TBZ	$1, R1, loop	// remaining count is a multiple of 4: skip the pair step
   284  	LDP.W	-16(R2), (R6, R7)	// R7 is the higher of the two words
   285  	LSR	R4, R7, R10
   286  	ORR	R8, R10
   287  	LSL	R3, R7
   288  	LSR	R4, R6, R9
   289  	ORR	R7, R9
   290  	LSL	R3, R6, R8
   291  	SUB	$2, R1
   292  	STP.W	(R9, R10), -16(R0)
   293  loop:
   294  	CBZ	R1, done
   295  	LDP.W	-32(R2), (R10, R11)
   296  	LDP	16(R2), (R12, R13)	// upper pair of this group; R13 is the highest word
   297  	LSR	R4, R13, R23
   298  	ORR	R8, R23		// z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
   299  	LSL	R3, R13
   300  	LSR	R4, R12, R22
   301  	ORR	R13, R22
   302  	LSL	R3, R12
   303  	LSR	R4, R11, R21
   304  	ORR	R12, R21
   305  	LSL	R3, R11
   306  	LSR	R4, R10, R20
   307  	ORR	R11, R20
   308  	LSL	R3, R10, R8	// pending high part for the next round
   309  	STP.W	(R20, R21), -32(R0)
   310  	STP	(R22, R23), 16(R0)
   311  	SUB	$4, R1
   312  	B	loop
   313  done:
   314  	MOVD.W	R8, -8(R0)	// z[0] = x[0] << s; nothing is shifted in from below
   315  	MOVD	R5, c+56(FP)	// the part moved out from x[n-1]
   316  	RET
   317  copy:	// s == 0: copy x to z from the high end downwards
   318  	CMP	R0, R2
   319  	BEQ	len0	// both cursors are equal: nothing to copy
   320  	TBZ	$0, R1, ctwo
   321  	MOVD.W	-8(R2), R4
   322  	MOVD.W	R4, -8(R0)
   323  	SUB	$1, R1
   324  ctwo:
   325  	TBZ	$1, R1, cloop
   326  	LDP.W	-16(R2), (R4, R5)
   327  	STP.W	(R4, R5), -16(R0)
   328  	SUB	$2, R1
   329  cloop:
   330  	CBZ	R1, len0
   331  	LDP.W	-32(R2), (R4, R5)
   332  	LDP	16(R2), (R6, R7)
   333  	STP.W	(R4, R5), -32(R0)
   334  	STP	(R6, R7), 16(R0)
   335  	SUB	$4, R1
   336  	B	cloop
   337  len0:	// shared exit for the empty and the s == 0 cases
   338  	MOVD	$0, c+56(FP)
   339  	RET
   340  
   341  // func shrVU(z, x []Word, s uint) (c Word)
   342  // This implementation handles the shift operation from the low word to the high word,
   343  // which may be an error for the case where the high word of x overlaps with the low
   344  // word of z. When calling this function directly, you need to pay attention to this
   345  // situation.
        //
        // Registers: R0 = z cursor and R2 = x cursor, both moving upwards from the
        // first word; R1 = words left; R3 = s; R4 = 64 - s;
        // R8 = (most recently loaded word) >> s, waiting to be combined with the low
        // bits of the next higher word.
        // c is x[0] << (64 - s) and is stored as soon as x[0] has been read.
        // When s == 0 the words are copied unchanged (or left alone if the two cursors
        // are equal) and c = 0. When len(z) == 0, c = 0.
   346  TEXT ·shrVU(SB),NOSPLIT,$0
   347  	MOVD	z+0(FP), R0
   348  	MOVD	z_len+8(FP), R1
   349  	MOVD	x+24(FP), R2
   350  	MOVD	s+48(FP), R3
   351  	MOVD	$0, R8
   352  	MOVD	$64, R4
   353  	SUB	R3, R4	// R4 = 64 - s
   354  	CBZ	R1, len0	// empty z: nothing to shift, c = 0
   355  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   356  
   357  	MOVD.P	8(R2), R20
   358  	LSR	R3, R20, R8	// pending low part: x[0] >> s
   359  	LSL	R4, R20	// bits shifted out of x[0]
   360  	MOVD	R20, c+56(FP)	// deal with the first element
   361  	SUB	$1, R1
   362  
   363  	TBZ	$0, R1, two	// even number of words left: skip the single step
   364  	MOVD.P	8(R2), R6
   365  	LSL	R4, R6, R20
   366  	ORR	R8, R20	// combine with the pending low part from the word below
   367  	LSR	R3, R6, R8	// new pending low part
   368  	MOVD.P	R20, 8(R0)
   369  	SUB	$1, R1
   370  two:
   371  	TBZ	$1, R1, loop	// remaining count is a multiple of 4: skip the pair step
   372  	LDP.P	16(R2), (R6, R7)
   373  	LSL	R4, R6, R20
   374  	LSR	R3, R6
   375  	ORR	R8, R20
   376  	LSL	R4, R7, R21
   377  	LSR	R3, R7, R8
   378  	ORR	R6, R21
   379  	STP.P	(R20, R21), 16(R0)
   380  	SUB	$2, R1
   381  loop:
   382  	CBZ	R1, done
   383  	LDP.P	32(R2), (R10, R11)
   384  	LDP	-16(R2), (R12, R13)	// R2 has already advanced by 32: this is the upper pair
   385  	LSL	R4, R10, R20
   386  	LSR	R3, R10
   387  	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   388  	LSL	R4, R11, R21
   389  	LSR	R3, R11
   390  	ORR	R10, R21
   391  	LSL	R4, R12, R22
   392  	LSR	R3, R12
   393  	ORR	R11, R22
   394  	LSL	R4, R13, R23
   395  	LSR	R3, R13, R8	// pending low part for the next round
   396  	ORR	R12, R23
   397  	STP.P	(R20, R21), 32(R0)
   398  	STP	(R22, R23), -16(R0)
   399  	SUB	$4, R1
   400  	B	loop
   401  done:
   402  	MOVD	R8, (R0)	// deal with the last element
   403  	RET
   404  copy:	// s == 0: copy x to z from the low end upwards
   405  	CMP	R0, R2
   406  	BEQ	len0	// both cursors are equal: nothing to copy
   407  	TBZ	$0, R1, ctwo
   408  	MOVD.P	8(R2), R3	// R3 (s, known to be 0 here) is reused as scratch
   409  	MOVD.P	R3, 8(R0)
   410  	SUB	$1, R1
   411  ctwo:
   412  	TBZ	$1, R1, cloop
   413  	LDP.P	16(R2), (R4, R5)
   414  	STP.P	(R4, R5), 16(R0)
   415  	SUB	$2, R1
   416  cloop:
   417  	CBZ	R1, len0
   418  	LDP.P	32(R2), (R4, R5)
   419  	LDP	-16(R2), (R6, R7)
   420  	STP.P	(R4, R5), 32(R0)
   421  	STP	(R6, R7), -16(R0)
   422  	SUB	$4, R1
   423  	B	cloop
   424  len0:	// shared exit for the empty and the s == 0 cases
   425  	MOVD	$0, c+56(FP)
   426  	RET
   427  
   428  
   429  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // Computes z = x*y + r over len(z) words and returns the word carried out of
        // the top in c.
        // Registers: R0 = words left, R1 = z cursor, R2 = x cursor, R3 = y,
        // R4 = carry word into the next step (starts as r, ends as c).
        // For each x[i], MUL produces the low half and UMULH the high half of x[i]*y.
        // The low half plus the incoming carry word becomes z[i]; the high half plus
        // the resulting carry flag becomes the carry word for the next element.
   430  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   431  	MOVD	z+0(FP), R1
   432  	MOVD	z_len+8(FP), R0
   433  	MOVD	x+24(FP), R2
   434  	MOVD	y+48(FP), R3
   435  	MOVD	r+56(FP), R4
   436  	// c, z = x * y + r
   437  	TBZ	$0, R0, two	// even count: skip the single-word step
   438  	MOVD.P	8(R2), R5
   439  	MUL	R3, R5, R7
   440  	UMULH	R3, R5, R8
   441  	ADDS	R4, R7
   442  	ADC	$0, R8, R4	// c, z[i] = x[i] * y +  r
   443  	MOVD.P	R7, 8(R1)
   444  	SUB	$1, R0
   445  two:
   446  	TBZ	$1, R0, loop	// count is a multiple of 4: skip the two-word step
   447  	LDP.P	16(R2), (R5, R6)
   448  	MUL	R3, R5, R10
   449  	UMULH	R3, R5, R11
   450  	ADDS	R4, R10	// first result word: low half + incoming carry word
   451  	MUL	R3, R6, R12
   452  	UMULH	R3, R6, R13
   453  	ADCS	R12, R11	// second result word: previous high half + this low half + carry flag
   454  	ADC	$0, R13, R4	// outgoing carry word: last high half + carry flag
   455  
   456  	STP.P	(R10, R11), 16(R1)
   457  	SUB	$2, R0
   458  loop:
   459  	CBZ	R0, done
   460  	LDP.P	32(R2), (R5, R6)
   461  	LDP	-16(R2), (R7, R8)	// R2 has already advanced by 32: this is the second pair
   462  
   463  	MUL	R3, R5, R10
   464  	UMULH	R3, R5, R11
   465  	ADDS	R4, R10	// first result word: low half + incoming carry word
   466  	MUL	R3, R6, R12
   467  	UMULH	R3, R6, R13
   468  	ADCS	R11, R12	// each later result word: previous high half + this low half + carry flag
   469  
   470  	MUL	R3, R7, R14
   471  	UMULH	R3, R7, R15
   472  	ADCS	R13, R14
   473  	MUL	R3, R8, R16
   474  	UMULH	R3, R8, R17
   475  	ADCS	R15, R16
   476  	ADC	$0, R17, R4	// outgoing carry word: last high half + carry flag
   477  
   478  	STP.P	(R10, R12), 32(R1)
   479  	STP	(R14, R16), -16(R1)
   480  	SUB	$4, R0
   481  	B	loop
   482  done:
   483  	MOVD	R4, c+64(FP)
   484  	RET
   485  
   486  
   487  // func addMulVVW(z, x []Word, y Word) (c Word)
        // Computes z += x*y over len(z) words and returns the word carried out of the
        // top in c.
        // Registers: R0 = words left, R1 = z cursor, R2 = x cursor, R3 = y,
        // R4 = carry word between steps (starts at 0, ends as c).
        // In the two- and four-word steps the additions are done as two separate
        // carry chains (one ADDS followed by ADCS/ADC each); the MUL, UMULH, LDP and
        // STP instructions placed between them do not modify the flags.
   488  TEXT ·addMulVVW(SB),NOSPLIT,$0
   489  	MOVD	z+0(FP), R1
   490  	MOVD	z_len+8(FP), R0
   491  	MOVD	x+24(FP), R2
   492  	MOVD	y+48(FP), R3
   493  	MOVD	$0, R4
   494  
   495  	TBZ	$0, R0, two	// even count: skip the single-word step
   496  
   497  	MOVD.P	8(R2), R5
   498  	MOVD	(R1), R6	// current z word; R1 is advanced by the store below
   499  
   500  	MUL	R5, R3, R7
   501  	UMULH	R5, R3, R8
   502  
   503  	ADDS	R7, R6	// z word += low half (the incoming carry word is still 0 here)
   504  	ADC	$0, R8, R4	// carry word = high half + carry flag
   505  
   506  	MOVD.P	R6, 8(R1)
   507  	SUB	$1, R0
   508  
   509  two:
   510  	TBZ	$1, R0, loop	// count is a multiple of 4: skip the two-word step
   511  
   512  	LDP.P	16(R2), (R5, R10)
   513  	LDP	(R1), (R6, R11)
   514  
   515  	MUL	R10, R3, R13
   516  	UMULH	R10, R3, R12
   517  
   518  	MUL	R5, R3, R7
   519  	UMULH	R5, R3, R8
   520  
   521  	ADDS	R4, R6	// first chain: add the incoming carry word to the lower z word
   522  	ADCS	R13, R11	// upper z word += low half of the upper product + carry flag
   523  	ADC	$0, R12	// fold the carry flag into the high half of the upper product
   524  
   525  	ADDS	R7, R6	// second chain: lower z word += low half of the lower product
   526  	ADCS	R8, R11	// upper z word += high half of the lower product + carry flag
   527  	ADC	$0, R12, R4	// outgoing carry word
   528  
   529  	STP.P	(R6, R11), 16(R1)
   530  	SUB	$2, R0
   531  
   532  // The main loop of this code operates on a block of 4 words every iteration
   533  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   534  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   535  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
   536  loop:
   537  	CBZ	R0, done
   538  
   539  	LDP.P	16(R2), (R5, R6)
   540  	LDP.P	16(R2), (R7, R8)
   541  
   542  	LDP	(R1), (R9, R10)
   543  	ADDS	R4, R9	// first chain: incoming carry word, then the low halves of words 1..3
   544  	MUL	R6, R3, R14
   545  	ADCS	R14, R10
   546  	MUL	R7, R3, R15
   547  	LDP	16(R1), (R11, R12)
   548  	ADCS	R15, R11
   549  	MUL	R8, R3, R16
   550  	ADCS	R16, R12
   551  	UMULH	R8, R3, R20
   552  	ADC	$0, R20	// high half of the top product + carry flag of the first chain
   553  
   554  	MUL	R5, R3, R13
   555  	ADDS	R13, R9	// second chain: low half of word 0, then the high halves of words 0..2
   556  	UMULH	R5, R3, R17
   557  	ADCS	R17, R10
   558  	UMULH	R6, R3, R21
   559  	STP.P	(R9, R10), 16(R1)
   560  	ADCS	R21, R11
   561  	UMULH	R7, R3, R19
   562  	ADCS	R19, R12
   563  	STP.P	(R11, R12), 16(R1)
   564  	ADC	$0, R20, R4	// outgoing carry word for the next block
   565  
   566  	SUB	$4, R0
   567  	B	loop
   568  
   569  done:
   570  	MOVD	R4, c+56(FP)
   571  	RET
   572  
   573