github.com/emmansun/gmsm@v0.29.1/internal/bigmod/nat_ppc64x.s (about)

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !purego && (ppc64 || ppc64le)
     6  
     7  #include "textflag.h"
     8  
     9  // func addMulVVW256(z, x *uint, y uint) (c uint)
        //
        // Entry point for 4-word operands: one pass of the 4-word loop body in
        // addMulVVWy<>. This stub only sets the iteration count in R6; the
        // arguments z, x and y are loaded through FP, and the result c is stored
        // and the RET executed, by addMulVVWy<>.
    10  TEXT ·addMulVVW256(SB), $0-32
    11  	MOVD	$1, R6 // R6 = z_len/4: 1 iteration x 4 words x 64 bits = 256 bits
    12  	JMP		addMulVVWy<>(SB) // jump (not a call) into the shared loop
    13  
    14  // func addMulVVW1024(z, x *uint, y uint) (c uint)
        //
        // Entry point for 16-word operands: four passes of the 4-word loop body
        // in addMulVVWy<>. This stub only sets the iteration count in R6; the
        // arguments z, x and y are loaded through FP, and the result c is stored
        // and the RET executed, by addMulVVWy<>.
    15  TEXT ·addMulVVW1024(SB), $0-32
    16  	MOVD	$4, R6 // R6 = z_len/4: 4 iterations x 4 words x 64 bits = 1024 bits
    17  	JMP		addMulVVWy<>(SB) // jump (not a call) into the shared loop
    18  
    19  // func addMulVVW1536(z, x *uint, y uint) (c uint)
        //
        // Entry point for 24-word operands: six passes of the 4-word loop body
        // in addMulVVWy<>. This stub only sets the iteration count in R6; the
        // arguments z, x and y are loaded through FP, and the result c is stored
        // and the RET executed, by addMulVVWy<>.
    20  TEXT ·addMulVVW1536(SB), $0-32
    21  	MOVD	$6, R6 // R6 = z_len/4: 6 iterations x 4 words x 64 bits = 1536 bits
    22  	JMP		addMulVVWy<>(SB) // jump (not a call) into the shared loop
    23  
    24  // func addMulVVW2048(z, x *uint, y uint) (c uint)
        //
        // Entry point for 32-word operands: eight passes of the 4-word loop body
        // in addMulVVWy<>. This stub only sets the iteration count in R6; the
        // arguments z, x and y are loaded through FP, and the result c is stored
        // and the RET executed, by addMulVVWy<>.
    25  TEXT ·addMulVVW2048(SB), $0-32
    26  	MOVD	$8, R6 // R6 = z_len/4: 8 iterations x 4 words x 64 bits = 2048 bits
    27  	JMP		addMulVVWy<>(SB) // jump (not a call) into the shared loop
    28  
    29  // This local function expects to be called only by
    30  // callers above. R6 contains the z length/4
    31  // since 4 values are processed for each
    32  // loop iteration, and is guaranteed to be > 0.
    33  // If other callers are added this function might
    34  // need to change.
        //
        // For every 64-bit word i in [0, 4*R6) it computes
        //     z[i] = low64(x[i]*y + z[i] + c);  c = high64(x[i]*y + z[i] + c)
        // starting with c = 0, and stores the final c to c+24(FP).
        // x[i]*y + z[i] + c is at most 2^128 - 1, so the high half never overflows.
        //
        // Register roles:
        //   R3  = pointer into z (advanced by 32 bytes per iteration)
        //   R4  = pointer into x (advanced by 32 bytes per iteration)
        //   R5  = y
        //   R6  = iteration count, set by the entry point; copied to CTR
        //   R9  = carry word c passed from one word to the next
        //   R14-R21 = loaded x/z words; R14-R19 are reused for later products
        //             once their loaded value has been consumed
        //   R10/R11, R14/R15, R16/R17, R18/R19 = low/high halves of the products
        //
        // NOTE(review): no argument size is declared here and the arguments are
        // read through FP; presumably this relies on being entered by JMP from a
        // $0-frame entry point so that FP still addresses that entry point's
        // arguments. Confirm before adding any other caller.
    35  TEXT addMulVVWy<>(SB), NOSPLIT, $0
    36  	MOVD	z+0(FP), R3
    37  	MOVD	x+8(FP), R4
    38  	MOVD	y+16(FP), R5
    39  
    40  	MOVD	$0, R9		// R9 = c = 0
    41  	MOVD	R6, CTR		// Initialize loop counter
    42  	PCALIGN	$16		// align the loop entry to 16 bytes
    43  
    44  loop:
        	// Load four words of x and of z up front.
    45  	MOVD	0(R4), R14	// x[i]
    46  	MOVD	8(R4), R16	// x[i+1]
    47  	MOVD	16(R4), R18	// x[i+2]
    48  	MOVD	24(R4), R20	// x[i+3]
    49  	MOVD	0(R3), R15	// z[i]
    50  	MOVD	8(R3), R17	// z[i+1]
    51  	MOVD	16(R3), R19	// z[i+2]
    52  	MOVD	24(R3), R21	// z[i+3]
        	// Word i: R11:R10 = x[i]*y + z[i] + c; the new c is left in R9.
    53  	MULLD	R5, R14, R10	// low x[i]*y
    54  	MULHDU	R5, R14, R11	// high x[i]*y
    55  	ADDC	R15, R10	// low += z[i], records the carry bit
    56  	ADDZE	R11		// high += carry bit
    57  	ADDC	R9, R10		// low += incoming c, records the carry bit
    58  	ADDZE	R11, R9		// c = high + carry bit
        	// Word i+1: R14/R15 are free now (x[i] and z[i] were consumed above).
    59  	MULLD	R5, R16, R14	// low x[i+1]*y
    60  	MULHDU	R5, R16, R15	// high x[i+1]*y
    61  	ADDC	R17, R14	// low += z[i+1], records the carry bit
    62  	ADDZE	R15		// high += carry bit
    63  	ADDC	R9, R14		// low += incoming c, records the carry bit
    64  	ADDZE	R15, R9		// c = high + carry bit
        	// Word i+2: R16/R17 are free now (x[i+1] and z[i+1] were consumed above).
    65  	MULLD	R5, R18, R16	// low x[i+2]*y
    66  	MULHDU	R5, R18, R17	// high x[i+2]*y
    67  	ADDC	R19, R16	// low += z[i+2], records the carry bit
    68  	ADDZE	R17		// high += carry bit
    69  	ADDC	R9, R16		// low += incoming c, records the carry bit
    70  	ADDZE	R17, R9		// c = high + carry bit
        	// Word i+3: R18/R19 are free now (x[i+2] and z[i+2] were consumed above).
    71  	MULLD	R5, R20, R18	// low x[i+3]*y
    72  	MULHDU	R5, R20, R19	// high x[i+3]*y
    73  	ADDC	R21, R18	// low += z[i+3], records the carry bit
    74  	ADDZE	R19		// high += carry bit
    75  	ADDC	R9, R18		// low += incoming c, records the carry bit
    76  	ADDZE	R19, R9		// c = high + carry bit
        	// Write the four updated words back to z.
    77  	MOVD	R10, 0(R3)	// z[i]
    78  	MOVD	R14, 8(R3)	// z[i+1]
    79  	MOVD	R16, 16(R3)	// z[i+2]
    80  	MOVD	R18, 24(R3)	// z[i+3]
    81  	ADD	$32, R3		// advance z by 4 words
    82  	ADD	$32, R4		// advance x by 4 words
    83  	BDNZ	loop		// decrement CTR; repeat while it is nonzero
    84  
        // Reached only by falling out of the loop; nothing in this function
        // branches to this label.
    85  done:
    86  	MOVD	R9, c+24(FP)	// return the final carry word
    87  	RET