github.com/emmansun/gmsm@v0.29.1/internal/bigmod/nat_arm64.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego
     6  
     7  #include "textflag.h"
     8  
     9  // func addMulVVW256(z, x *uint, y uint) (c uint)
        //
        // 256-bit entry point: selects a length of 4 words (4 x 64 bits) in R0 and
        // tail-jumps to the shared addMulVVWy loop. That loop reads z, x and y and
        // stores c through the same argument frame, so nothing is copied here.
    10  TEXT ·addMulVVW256(SB), $0-32
    11  	MOVD	$4, R0	// R0 = word count consumed by addMulVVWy, 4 words per loop pass
    12  	JMP		addMulVVWy(SB)	// branch without link: addMulVVWy's RET returns to our caller
    13  
    14  // func addMulVVW1024(z, x *uint, y uint) (c uint)
        //
        // 1024-bit entry point: selects a length of 16 words (16 x 64 bits) in R0
        // and tail-jumps to the shared addMulVVWy loop. That loop reads z, x and y
        // and stores c through the same argument frame, so nothing is copied here.
    15  TEXT ·addMulVVW1024(SB), $0-32
    16  	MOVD	$16, R0	// R0 = word count consumed by addMulVVWy, 4 words per loop pass
    17  	JMP		addMulVVWy(SB)	// branch without link: addMulVVWy's RET returns to our caller
    18  
    19  // func addMulVVW1536(z, x *uint, y uint) (c uint)
        //
        // 1536-bit entry point: selects a length of 24 words (24 x 64 bits) in R0
        // and tail-jumps to the shared addMulVVWy loop. That loop reads z, x and y
        // and stores c through the same argument frame, so nothing is copied here.
    20  TEXT ·addMulVVW1536(SB), $0-32
    21  	MOVD	$24, R0	// R0 = word count consumed by addMulVVWy, 4 words per loop pass
    22  	JMP		addMulVVWy(SB)	// branch without link: addMulVVWy's RET returns to our caller
    23  
    24  // func addMulVVW2048(z, x *uint, y uint) (c uint)
        //
        // 2048-bit entry point: selects a length of 32 words (32 x 64 bits) in R0
        // and tail-jumps to the shared addMulVVWy loop. That loop reads z, x and y
        // and stores c through the same argument frame, so nothing is copied here.
    25  TEXT ·addMulVVW2048(SB), $0-32
    26  	MOVD	$32, R0	// R0 = word count consumed by addMulVVWy, 4 words per loop pass
    27  	JMP		addMulVVWy(SB)	// branch without link: addMulVVWy's RET returns to our caller
    28  
    29  TEXT addMulVVWy(SB), NOFRAME|NOSPLIT, $0
        // Shared body of the addMulVVW* entry points: computes z += x * y over R0
        // 64-bit words and returns the final carry word in c.
        //
        // Register roles:
        //   R0  words still to process, preset by the entry point that jumped here.
        //       It must be a multiple of 4: the loop subtracts 4 per pass and only
        //       tests for exactly zero (the entry points pass 4, 16, 24 and 32).
        //   R1  cursor into z (advanced by the post-indexed stores)
        //   R2  cursor into x (advanced by the post-indexed loads)
        //   R3  the multiplier y
        //   R4  carry word handed from one iteration to the next; final result c
    30  	MOVD	z+0(FP), R1
    31  	MOVD	x+8(FP), R2
    32  	MOVD	y+16(FP), R3
    33  	MOVD	$0, R4	// no carry into the first block
    34  
    35  // The main loop of this code operates on a block of 4 words every iteration
    36  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
    37  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
    38  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
        //
        // The sum is built with two carry chains over the same four z words:
        //   chain 1: z0 += R4,         z1 += lo(x1*y), z2 += lo(x2*y), z3 += lo(x3*y)
        //   chain 2: z0 += lo(x0*y),   z1 += hi(x0*y), z2 += hi(x1*y), z3 += hi(x2*y)
        // hi(x3*y) plus the carry out of each chain becomes the next R4.
        // The instruction order matters: each ADCS/ADC consumes the carry flag left
        // by the previous ADDS/ADCS. The loads, stores, MULs and UMULHs interleaved
        // between them do not write the condition flags, so they can sit inside a
        // chain without breaking it.
    39  loop:
    40  	CBZ	R0, done	// all words processed
    41  
    42  	LDP.P	16(R2), (R5, R6)	// R5, R6 = x0, x1; R2 += 16
    43  	LDP.P	16(R2), (R7, R8)	// R7, R8 = x2, x3; R2 += 16
    44  
    45  	LDP	(R1), (R9, R10)	// R9, R10 = z0, z1 (R1 not advanced yet)
    46  	ADDS	R4, R9	// chain 1 start: z0 += carry-in, sets the carry flag
    47  	MUL	R6, R3, R14	// R14 = lo(x1*y)
    48  	ADCS	R14, R10	// z1 += lo(x1*y) + carry
    49  	MUL	R7, R3, R15	// R15 = lo(x2*y)
    50  	LDP	16(R1), (R11, R12)	// R11, R12 = z2, z3
    51  	ADCS	R15, R11	// z2 += lo(x2*y) + carry
    52  	MUL	R8, R3, R16	// R16 = lo(x3*y)
    53  	ADCS	R16, R12	// z3 += lo(x3*y) + carry
    54  	UMULH	R8, R3, R20	// R20 = hi(x3*y)
    55  	ADC	$0, R20	// chain 1 end: R20 += carry out of z3
    56  
    57  	MUL	R5, R3, R13	// R13 = lo(x0*y)
    58  	ADDS	R13, R9	// chain 2 start: z0 += lo(x0*y), sets the carry flag
    59  	UMULH	R5, R3, R17	// R17 = hi(x0*y)
    60  	ADCS	R17, R10	// z1 += hi(x0*y) + carry
    61  	UMULH	R6, R3, R21	// R21 = hi(x1*y)
    62  	STP.P	(R9, R10), 16(R1)	// store finished z0, z1; R1 += 16
    63  	ADCS	R21, R11	// z2 += hi(x1*y) + carry
    64  	UMULH	R7, R3, R19	// R19 = hi(x2*y)
    65  	ADCS	R19, R12	// z3 += hi(x2*y) + carry
    66  	STP.P	(R11, R12), 16(R1)	// store finished z2, z3; R1 += 16
    67  	ADC	$0, R20, R4	// chain 2 end: R4 = R20 + carry, the carry word for the next block
    68  
    69  	SUB	$4, R0	// four words consumed
    70  	B	loop
    71  
    72  done:
    73  	MOVD	R4, c+24(FP)	// return the last carry word as c
    74  	RET