github.com/megatontech/mynoteforgo@v0.0.0-20200507084910-5d0c6ea6e890/源码/math/big/arith_arm64.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  // TODO: Consider re-implementing using Advanced SIMD
    13  // once the assembler supports those instructions.
    14  
    15  // func mulWW(x, y Word) (z1, z0 Word)
        //
        // mulWW multiplies the two 64-bit words x and y and returns the 128-bit
        // product split in two: z1 receives the high 64 bits, z0 the low 64 bits.
    16  TEXT ·mulWW(SB),NOSPLIT,$0
    17  	MOVD	y+8(FP), R5
    18  	MOVD	x+0(FP), R4
    19  	UMULH	R4, R5, R6	// R6 = high half of x*y
    20  	MUL	R4, R5, R7	// R7 = low half of x*y
    21  	MOVD	R7, z0+24(FP)
    22  	MOVD	R6, z1+16(FP)
    23  	RET
    24  
    25  
    26  // func divWW(x1, x0, y Word) (q, r Word)
        //
        // divWW has no assembly body: it branches straight to divWW_g (not defined
        // in this listing), leaving the argument frame untouched so that routine
        // reads the arguments and writes the results.
    27  TEXT ·divWW(SB),NOSPLIT,$0
    28  	B	·divWW_g(SB) // ARM64 has no multiword division
    29  
    30  
    31  // func addVV(z, x, y []Word) (c Word)
        //
        // addVV stores x[i] + y[i] plus the running carry into z[i] for each of the
        // len(z) words and returns the final carry (0 or 1) in c.
        // Only z's length is read; x and y are assumed to hold at least len(z)
        // words -- TODO confirm against callers.
        //
        // Register use: R0 = words remaining, R8 = x pointer, R9 = y pointer,
        // R10 = z pointer. The carry is kept in the C flag between ADCS
        // instructions, so nothing that sets flags is placed between them.
        // The count is consumed as one word (if bit 0 is set), then two words
        // (if bit 1 is set), then blocks of four.
    32  TEXT ·addVV(SB),NOSPLIT,$0
    33  	MOVD	z_len+8(FP), R0
    34  	MOVD	x+24(FP), R8
    35  	MOVD	y+48(FP), R9
    36  	MOVD	z+0(FP), R10
    37  	ADDS	$0, R0		// clear carry flag
    38  	TBZ	$0, R0, two	// even count: skip the single-word step
    39  	MOVD.P	8(R8), R11	// R11 = *x, then advance x by one word
    40  	MOVD.P	8(R9), R15	// R15 = *y, then advance y by one word
    41  	ADCS	R15, R11	// R11 += R15 + carry
    42  	MOVD.P	R11, 8(R10)	// *z = R11, then advance z by one word
    43  	SUB	$1, R0
    44  two:
    45  	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
    46  	LDP.P	16(R8), (R11, R12)
    47  	LDP.P	16(R9), (R15, R16)
    48  	ADCS	R15, R11
    49  	ADCS	R16, R12
    50  	STP.P	(R11, R12), 16(R10)
    51  	SUB	$2, R0
    52  loop:				// bits 0 and 1 of R0 are clear from here on
    53  	CBZ	R0, done	// careful not to touch the carry flag
    54  	LDP.P	32(R8), (R11, R12)	// first two words; x advances by four words
    55  	LDP	-16(R8), (R13, R14)	// last two words of the same block
    56  	LDP.P	32(R9), (R15, R16)
    57  	LDP	-16(R9), (R17, R19)
    58  	ADCS	R15, R11
    59  	ADCS	R16, R12
    60  	ADCS	R17, R13
    61  	ADCS	R19, R14
    62  	STP.P	(R11, R12), 32(R10)	// first two results; z advances by four words
    63  	STP	(R13, R14), -16(R10)	// last two results of the same block
    64  	SUB	$4, R0
    65  	B	loop
    66  done:
    67  	CSET	HS, R0		// R0 = 1 if the carry flag is set, else 0
    68  	MOVD	R0, c+72(FP)
    69  	RET
    70  
    71  
    72  // func subVV(z, x, y []Word) (c Word)
        //
        // subVV stores x[i] - y[i] minus the running borrow into z[i] for each of
        // the len(z) words and returns the final borrow (0 or 1) in c.
        // Only z's length is read; x and y are assumed to hold at least len(z)
        // words -- TODO confirm against callers.
        //
        // Register use: R0 = words remaining, R8 = x pointer, R9 = y pointer,
        // R10 = z pointer. The borrow is kept in the C flag (C set means "no
        // borrow") between SBCS instructions, so nothing that sets flags is placed
        // between them. The count is consumed as one word (if bit 0 is set), then
        // two words (if bit 1 is set), then blocks of four.
    73  TEXT ·subVV(SB),NOSPLIT,$0
    74  	MOVD	z_len+8(FP), R0
    75  	MOVD	x+24(FP), R8
    76  	MOVD	y+48(FP), R9
    77  	MOVD	z+0(FP), R10
    78  	CMP	R0, R0		// set carry flag, i.e. start with no borrow
    79  	TBZ	$0, R0, two	// even count: skip the single-word step
    80  	MOVD.P	8(R8), R11	// R11 = *x, then advance x by one word
    81  	MOVD.P	8(R9), R15	// R15 = *y, then advance y by one word
    82  	SBCS	R15, R11	// R11 = R11 - R15 - borrow
    83  	MOVD.P	R11, 8(R10)	// *z = R11, then advance z by one word
    84  	SUB	$1, R0
    85  two:
    86  	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
    87  	LDP.P	16(R8), (R11, R12)
    88  	LDP.P	16(R9), (R15, R16)
    89  	SBCS	R15, R11
    90  	SBCS	R16, R12
    91  	STP.P	(R11, R12), 16(R10)
    92  	SUB	$2, R0
    93  loop:				// bits 0 and 1 of R0 are clear from here on
    94  	CBZ	R0, done	// careful not to touch the carry flag
    95  	LDP.P	32(R8), (R11, R12)	// first two words; x advances by four words
    96  	LDP	-16(R8), (R13, R14)	// last two words of the same block
    97  	LDP.P	32(R9), (R15, R16)
    98  	LDP	-16(R9), (R17, R19)
    99  	SBCS	R15, R11
   100  	SBCS	R16, R12
   101  	SBCS	R17, R13
   102  	SBCS	R19, R14
   103  	STP.P	(R11, R12), 32(R10)	// first two results; z advances by four words
   104  	STP	(R13, R14), -16(R10)	// last two results of the same block
   105  	SUB	$4, R0
   106  	B	loop
   107  done:
   108  	CSET	LO, R0		// R0 = 1 if the carry flag is clear (a borrow), else 0
   109  	MOVD	R0, c+72(FP)
   110  	RET
   111  
   112  
   113  // func addVW(z, x []Word, y Word) (c Word)
        //
        // addVW adds the single word y to the multi-word value x: z[0] = x[0] + y,
        // and each following z[i] = x[i] + carry. The final carry (0 or 1) is
        // returned in c. When len(z) is 0 nothing is stored and c is y itself.
        // Only z's length is read; x is assumed to hold at least len(z) words --
        // TODO confirm against callers.
        //
        // Register use: R0 = words remaining, R1 = x pointer, R3 = z pointer,
        // R2 = y on entry and the result c on exit. After the first word the carry
        // is kept in the C flag, so nothing that sets flags is placed between the
        // ADCS instructions.
   114  TEXT ·addVW(SB),NOSPLIT,$0
   115  	MOVD	z+0(FP), R3
   116  	MOVD	z_len+8(FP), R0
   117  	MOVD	x+24(FP), R1
   118  	MOVD	y+48(FP), R2
   119  	CBZ	R0, len0	// the length of z is 0
   120  	MOVD.P	8(R1), R4
   121  	ADDS	R2, R4		// z[0] = x[0] + y, set carry
   122  	MOVD.P	R4, 8(R3)
   123  	SUB	$1, R0
   124  	CBZ	R0, len1	// the length of z is 1
   125  	TBZ	$0, R0, two	// even number of words left: skip the single-word step
   126  	MOVD.P	8(R1), R4	// do it once
   127  	ADCS	$0, R4
   128  	MOVD.P	R4, 8(R3)
   129  	SUB	$1, R0
   130  two:				// do it twice
   131  	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
   132  	LDP.P	16(R1), (R4, R5)
   133  	ADCS	$0, R4, R8	// c, z[i] = x[i] + c
   134  	ADCS	$0, R5, R9
   135  	STP.P	(R8, R9), 16(R3)
   136  	SUB	$2, R0
   137  loop:				// do four times per round
   138  	CBZ	R0, len1	// careful not to touch the carry flag
   139  	LDP.P	32(R1), (R4, R5)	// first two words; x advances by four words
   140  	LDP	-16(R1), (R6, R7)	// last two words of the same block
   141  	ADCS	$0, R4, R8
   142  	ADCS	$0, R5, R9
   143  	ADCS	$0, R6, R10
   144  	ADCS	$0, R7, R11
   145  	STP.P	(R8, R9), 32(R3)	// first two results; z advances by four words
   146  	STP	(R10, R11), -16(R3)	// last two results of the same block
   147  	SUB	$4, R0
   148  	B	loop
   149  len1:
   150  	CSET	HS, R2		// R2 = 1 if the carry flag is set, else 0
   151  len0:
   152  	MOVD	R2, c+56(FP)	// R2 still holds y when entered via the len0 branch
   153  	RET
   154  
   155  // func subVW(z, x []Word, y Word) (c Word)
        //
        // subVW subtracts the single word y from the multi-word value x:
        // z[0] = x[0] - y, and each following z[i] = x[i] - borrow. The final
        // borrow (0 or 1) is returned in c. When len(z) is 0 nothing is stored and
        // c is y itself.
        // Only z's length is read; x is assumed to hold at least len(z) words --
        // TODO confirm against callers.
        //
        // Register use: R0 = words remaining, R1 = x pointer, R3 = z pointer,
        // R2 = y on entry and the result c on exit. After the first word the
        // borrow is kept in the C flag (C set means "no borrow"), so nothing that
        // sets flags is placed between the SBCS instructions.
   156  TEXT ·subVW(SB),NOSPLIT,$0
   157  	MOVD	z+0(FP), R3
   158  	MOVD	z_len+8(FP), R0
   159  	MOVD	x+24(FP), R1
   160  	MOVD	y+48(FP), R2
   161  	CBZ	R0, len0	// the length of z is 0
   162  	MOVD.P	8(R1), R4
   163  	SUBS	R2, R4		// z[0] = x[0] - y, set carry
   164  	MOVD.P	R4, 8(R3)
   165  	SUB	$1, R0
   166  	CBZ	R0, len1	// the length of z is 1
   167  	TBZ	$0, R0, two	// do it once
   168  	MOVD.P	8(R1), R4
   169  	SBCS	$0, R4
   170  	MOVD.P	R4, 8(R3)
   171  	SUB	$1, R0
   172  two:				// do it twice
   173  	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
   174  	LDP.P	16(R1), (R4, R5)
   175  	SBCS	$0, R4, R8	// c, z[i] = x[i] - c
   176  	SBCS	$0, R5, R9
   177  	STP.P	(R8, R9), 16(R3)
   178  	SUB	$2, R0
   179  loop:				// do four times per round
   180  	CBZ	R0, len1	// careful not to touch the carry flag
   181  	LDP.P	32(R1), (R4, R5)	// first two words; x advances by four words
   182  	LDP	-16(R1), (R6, R7)	// last two words of the same block
   183  	SBCS	$0, R4, R8
   184  	SBCS	$0, R5, R9
   185  	SBCS	$0, R6, R10
   186  	SBCS	$0, R7, R11
   187  	STP.P	(R8, R9), 32(R3)	// first two results; z advances by four words
   188  	STP	(R10, R11), -16(R3)	// last two results of the same block
   189  	SUB	$4, R0
   190  	B	loop
   191  len1:
   192  	CSET	LO, R2		// R2 = 1 if the carry flag is clear (a borrow), else 0
   193  len0:
   194  	MOVD	R2, c+56(FP)	// R2 still holds y when entered via the len0 branch
   195  	RET
   196  
   197  
   198  // func shlVU(z, x []Word, s uint) (c Word)
        //
        // shlVU shifts the multi-word value x left by s bits into z and returns in
        // c the bits shifted out of the top word.
        //
        // The hand-written loop that used to live here walked the words from
        // index 0 upward and stored z[i] before the higher words of x had been
        // loaded. When z overlapped x at a higher address, those stores overwrote
        // input words that had not been read yet and the result was wrong (see
        // upstream Go issue #31084). Until an assembly version that is safe for
        // overlapping operands is available, defer to shlVU_g in the same way
        // divWW and divWVW defer to their _g counterparts. shlVU_g is not defined
        // in this file; it is expected to be provided next to divWW_g.
   199  TEXT ·shlVU(SB),NOSPLIT,$0
   200  	B	·shlVU_g(SB)	// tail branch: the argument frame is passed through untouched
   271  
   272  
   273  // func shrVU(z, x []Word, s uint) (c Word)
        //
        // shrVU shifts the multi-word value x right by s bits into z:
        // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)), with the top word
        // z[len(z)-1] = x[len(z)-1] >> s. The bits shifted out of x[0], i.e.
        // x[0] << (64 - s), are returned in c.
        // Special cases: len(z) == 0 returns c = 0 without touching memory;
        // s == 0 copies x to z word by word and returns c = 0.
        // Only z's length is read; x is assumed to hold at least len(z) words --
        // TODO confirm against callers.
        //
        // Register use: R0 = z pointer, R1 = words remaining, R2 = x pointer,
        // R3 = s, R4 = 64 - s, R8 = (previous x word) >> s, still waiting to be
        // OR-ed with the low bits of the next x word.
        // NOTE(review): words are processed from index 0 upward, so an output
        // range that overlaps x at a higher address would overwrite inputs before
        // they are read -- confirm callers never pass such operands.
   274  TEXT ·shrVU(SB),NOSPLIT,$0
   275  	MOVD	z+0(FP), R0
   276  	MOVD	z_len+8(FP), R1
   277  	MOVD	x+24(FP), R2
   278  	MOVD	s+48(FP), R3
   279  	MOVD	$0, R8		// overwritten below before its first read on the shift path
   280  	MOVD	$64, R4
   281  	SUB	R3, R4		// R4 = 64 - s
   282  	CBZ	R1, len0
   283  	CBZ	R3, copy	// if the number of shift is 0, just copy x to z
   284  
   285  	MOVD.P	8(R2), R20	// R20 = x[0]
   286  	LSR	R3, R20, R8	// R8 = x[0] >> s, pending for z[0]
   287  	LSL	R4, R20		// R20 = x[0] << (64 - s)
   288  	MOVD	R20, c+56(FP)	// deal with the first element
   289  	SUB	$1, R1
   290  
   291  	TBZ	$0, R1, two	// even number of words left: skip the single-word step
   292  	MOVD.P	8(R2), R6
   293  	LSL	R4, R6, R20
   294  	ORR	R8, R20		// combine with the pending high part of the previous word
   295  	LSR	R3, R6, R8	// new pending part
   296  	MOVD.P	R20, 8(R0)
   297  	SUB	$1, R1
   298  two:
   299  	TBZ	$1, R1, loop	// bit 1 of the count clear: skip the two-word step
   300  	LDP.P	16(R2), (R6, R7)
   301  	LSL	R4, R6, R20
   302  	LSR	R3, R6
   303  	ORR	R8, R20
   304  	LSL	R4, R7, R21
   305  	LSR	R3, R7, R8
   306  	ORR	R6, R21
   307  	STP.P	(R20, R21), 16(R0)
   308  	SUB	$2, R1
   309  loop:
   310  	CBZ	R1, done
   311  	LDP.P	32(R2), (R10, R11)	// first two words; x advances by four words
   312  	LDP	-16(R2), (R12, R13)	// last two words of the same block
   313  	LSL	R4, R10, R20
   314  	LSR	R3, R10
   315  	ORR	R8, R20		// z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
   316  	LSL	R4, R11, R21
   317  	LSR	R3, R11
   318  	ORR	R10, R21
   319  	LSL	R4, R12, R22
   320  	LSR	R3, R12
   321  	ORR	R11, R22
   322  	LSL	R4, R13, R23
   323  	LSR	R3, R13, R8	// pending part carried into the next block (or into done)
   324  	ORR	R12, R23
   325  	STP.P	(R20, R21), 32(R0)	// first two results; z advances by four words
   326  	STP	(R22, R23), -16(R0)	// last two results of the same block
   327  	SUB	$4, R1
   328  	B	loop
   329  done:
   330  	MOVD	R8, (R0)	// deal with the last element
   331  	RET
   332  copy:				// s == 0: plain copy, one word, then two, then four at a time
   333  	TBZ	$0, R1, ctwo
   334  	MOVD.P	8(R2), R3
   335  	MOVD.P	R3, 8(R0)
   336  	SUB	$1, R1
   337  ctwo:
   338  	TBZ	$1, R1, cloop
   339  	LDP.P	16(R2), (R4, R5)
   340  	STP.P	(R4, R5), 16(R0)
   341  	SUB	$2, R1
   342  cloop:
   343  	CBZ	R1, len0
   344  	LDP.P	32(R2), (R4, R5)
   345  	LDP	-16(R2), (R6, R7)
   346  	STP.P	(R4, R5), 32(R0)
   347  	STP	(R6, R7), -16(R0)
   348  	SUB	$4, R1
   349  	B	cloop
   350  len0:
   351  	MOVD	$0, c+56(FP)	// nothing was shifted out
   352  	RET
   353  
   354  
   355  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        //
        // mulAddVWW computes z = x*y + r one word at a time: each step forms the
        // 128-bit product x[i]*y, adds the running carry word, stores the low half
        // in z[i] and keeps the high half as the next carry. The carry starts as r
        // and its final value is returned in c (so c is r when len(z) is 0).
        //
        // Register use: R5 = words remaining, R6 = z pointer, R7 = x pointer,
        // R8 = y, R9 = carry word.
   356  TEXT ·mulAddVWW(SB),NOSPLIT,$0
   357  	MOVD	z_len+8(FP), R5
   358  	MOVD	z+0(FP), R6
   359  	MOVD	x+24(FP), R7
   360  	MOVD	y+48(FP), R8
   361  	MOVD	r+56(FP), R9
   362  	CBZ	R5, done	// empty z: the carry is returned unchanged
   363  loop:
   364  	MOVD.P	8(R7), R10	// R10 = x[i], then advance x
   365  	MUL	R10, R8, R11	// R11 = low half of x[i]*y
   366  	UMULH	R10, R8, R12	// R12 = high half of x[i]*y
   367  	ADDS	R9, R11		// low half += carry word, sets the C flag
   368  	ADC	$0, R12		// fold that carry bit into the high half
   369  	MOVD.P	R11, 8(R6)	// z[i] = low half, then advance z
   370  	MOVD	R12, R9		// high half becomes the next carry word
   371  	SUB	$1, R5
   372  	CBNZ	R5, loop
   373  done:
   374  	MOVD	R9, c+64(FP)
   375  	RET
   376  
   377  
   378  // func addMulVVW(z, x []Word, y Word) (c Word)
        //
        // addMulVVW adds x*y to z in place: for each of the len(z) words,
        // z[i] = z[i] + x[i]*y + carry, where the carry is a full word. The final
        // carry word is returned in c (0 when len(z) is 0).
        // Only z's length is read; x is assumed to hold at least len(z) words --
        // TODO confirm against callers.
        //
        // Register use: R0 = words remaining, R1 = z pointer, R2 = x pointer,
        // R3 = y, R4 = carry word between steps.
        // The count is consumed as one word (if bit 0 is set), then two words
        // (if bit 1 is set), then blocks of four.
   379  TEXT ·addMulVVW(SB),NOSPLIT,$0
   380  	MOVD	z+0(FP), R1
   381  	MOVD	z_len+8(FP), R0
   382  	MOVD	x+24(FP), R2
   383  	MOVD	y+48(FP), R3
   384  	MOVD	$0, R4		// carry word starts at zero
   385  
   386  	TBZ	$0, R0, two	// even count: skip the single-word step
   387  
   388  	MOVD.P	8(R2), R5	// R5 = x[0], then advance x
   389  	MOVD	(R1), R6	// R6 = z[0]
   390  
   391  	MUL	R5, R3, R7	// R7 = low half of x[0]*y
   392  	UMULH	R5, R3, R8	// R8 = high half of x[0]*y
   393  
   394  	ADDS	R7, R6		// z[0] += low half, sets the C flag
   395  	ADC	$0, R8, R4	// carry word = high half + C
   396  
   397  	MOVD.P	R6, 8(R1)
   398  	SUB	$1, R0
   399  
   400  two:
   401  	TBZ	$1, R0, loop	// bit 1 of the count clear: skip the two-word step
   402  
   403  	LDP.P	16(R2), (R5, R10)	// two words of x
   404  	LDP	(R1), (R6, R11)	// the matching two words of z
   405  
   406  	MUL	R10, R3, R13
   407  	UMULH	R10, R3, R12
   408  
   409  	MUL	R5, R3, R7
   410  	UMULH	R5, R3, R8
   411  
   412  	ADDS	R4, R6		// first chain: incoming carry word and the upper product's low half
   413  	ADCS	R13, R11
   414  	ADC	$0, R12
   415  
   416  	ADDS	R7, R6		// second chain: the lower product's low and high halves
   417  	ADCS	R8, R11
   418  	ADC	$0, R12, R4	// outgoing carry word
   419  
   420  	STP.P	(R6, R11), 16(R1)
   421  	SUB	$2, R0
   422  
   423  // The main loop of this code operates on a block of 4 words every iteration
   424  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
   425  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
   426  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
   427  loop:
   428  	CBZ	R0, done
   429  
   430  	LDP.P	16(R2), (R5, R6)
   431  	LDP.P	16(R2), (R7, R8)
   432  
   433  	LDP	(R1), (R9, R10)
   434  	ADDS	R4, R9		// first chain: incoming carry word, then low halves of words 1-3
   435  	MUL	R6, R3, R14
   436  	ADCS	R14, R10
   437  	MUL	R7, R3, R15
   438  	LDP	16(R1), (R11, R12)
   439  	ADCS	R15, R11
   440  	MUL	R8, R3, R16
   441  	ADCS	R16, R12
   442  	UMULH	R8, R3, R20
   443  	ADC	$0, R20		// R20 = high half of word 3 plus the first chain's carry out
   444  
   445  	MUL	R5, R3, R13
   446  	ADDS	R13, R9		// second chain: low half of word 0, then high halves of words 0-2
   447  	UMULH	R5, R3, R17
   448  	ADCS	R17, R10
   449  	UMULH	R6, R3, R21
   450  	STP.P	(R9, R10), 16(R1)	// stores do not touch the flags, so the chain continues
   451  	ADCS	R21, R11
   452  	UMULH	R7, R3, R19
   453  	ADCS	R19, R12
   454  	STP.P	(R11, R12), 16(R1)
   455  	ADC	$0, R20, R4	// carry word for the next block
   456  
   457  	SUB	$4, R0
   458  	B	loop
   459  
   460  done:
   461  	MOVD	R4, c+56(FP)
   462  	RET
   463  
   464  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
        //
        // divWVW has no assembly body: it branches straight to divWVW_g (not
        // defined in this listing), leaving the argument frame untouched so that
        // routine reads the arguments and writes the result.
   465  TEXT ·divWVW(SB),NOSPLIT,$0
   466  	B ·divWVW_g(SB)