github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/crypto/sha512/sha512block_ppc64x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Based on CRYPTOGAMS code with the following comment:
     6  // # ====================================================================
     7  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
     8  // # project. The module is, however, dual licensed under OpenSSL and
     9  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    10  // # details see http://www.openssl.org/~appro/cryptogams/.
    11  // # ====================================================================
    12  
    13  //go:build ppc64 || ppc64le
    14  
    15  #include "textflag.h"
    16  
    17  // SHA512 block routine. See sha512block.go for Go equivalent.
    18  //
    19  // The algorithm is detailed in FIPS 180-4:
    20  //
    21  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    22  //
    23  // Wt = Mt; for 0 <= t <= 15
    24  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
    25  //
    26  // a = H0
    27  // b = H1
    28  // c = H2
    29  // d = H3
    30  // e = H4
    31  // f = H5
    32  // g = H6
    33  // h = H7
    34  //
    35  // for t = 0 to 79 {
    36  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    37  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    38  //    h = g
    39  //    g = f
    40  //    f = e
    41  //    e = d + T1
    42  //    d = c
    43  //    c = b
    44  //    b = a
    45  //    a = T1 + T2
    46  // }
    47  //
    48  // H0 = a + H0
    49  // H1 = b + H1
    50  // H2 = c + H2
    51  // H3 = d + H3
    52  // H4 = e + H4
    53  // H5 = f + H5
    54  // H6 = g + H6
    55  // H7 = h + H7
    56  
    57  #define CTX	R3	// dig argument: address the eight hash words are loaded from and stored to
    58  #define INP	R4	// p_base: current input pointer, advanced 16 bytes after each message load
    59  #define END	R5	// INP + rounded-down LEN: the main loop runs while INP < END
    60  #define TBL	R6	// address of the kcon table
    61  #define IDX	R7	// byte offset into kcon of the next entry to load into KI
    62  #define CNT	R8	// receives R0 at entry of block; not read afterwards
    63  #define LEN	R9	// p_len, rounded down to whole blocks before END is computed
    64  #define OFFLOAD	R11	// receives a copy of R1 at entry of block; not read afterwards
    65  #define TEMP	R12	// staging register for the loop count moved into CTR
    66  
    67  #define HEX00	R0	// index register for offset 0 (relies on R0 holding zero -- NOTE(review): confirm against the Go ppc64 register conventions)
    68  #define HEX10	R10	// set to 0x10 at entry of block
    69  #define HEX20	R25	// set to 0x20 at entry of block
    70  #define HEX30	R26	// set to 0x30 at entry of block
    71  
    72  // V0-V7 are A-H (the working variables a..h of the algorithm above)
    73  // V8-V23 are used for the message schedule (sixteen words Wt)
    74  #define KI	V24	// round constant most recently loaded from kcon
    75  #define FUNC	V25	// scratch: holds Ch(e,f,g), then the Maj(a,b,c) computation
    76  #define S0	V26	// BIGSIGMA0(a), later BIGSIGMA0(a) + Maj(a,b,c)
    77  #define S1	V27	// BIGSIGMA1(e)
    78  #define s0	V28	// SIGMA0 term of the message schedule (SHA512ROUND1 only)
    79  #define s1	V29	// SIGMA1 term of the message schedule (SHA512ROUND1 only)
    80  #define LEMASK	V31	// Permutation control register for little endian
    81  
    82  // VPERM is needed on LE to switch the bytes.
        // VPERMLE(va,vb,vc,vt) expands to "VPERM va, vb, vc, vt" when building for
        // ppc64le and to nothing otherwise. block invokes it on each freshly loaded
        // message vector with LEMASK as the control vector vc; LEMASK is built so
        // that the bytes inside each doubleword are reversed.
    83  
    84  #ifdef GOARCH_ppc64le
    85  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
    86  #else
    87  #define VPERMLE(va,vb,vc,vt)
    88  #endif
    89  
    90  // kcon holds 2 copies of each Kt, to fill both doublewords of a vector register
        // with a single 16-byte load. The 80 round constants occupy offsets
        // 0x000-0x4FF; two further 16-byte entries follow at 0x500 and 0x510
        // because every round loads the entry after the one it consumes.
    91  DATA  ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
    92  DATA  ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
    93  DATA  ·kcon+0x010(SB)/8, $0x7137449123ef65cd
    94  DATA  ·kcon+0x018(SB)/8, $0x7137449123ef65cd
    95  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
    96  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
    97  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
    98  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
    99  DATA  ·kcon+0x040(SB)/8, $0x3956c25bf348b538
   100  DATA  ·kcon+0x048(SB)/8, $0x3956c25bf348b538
   101  DATA  ·kcon+0x050(SB)/8, $0x59f111f1b605d019
   102  DATA  ·kcon+0x058(SB)/8, $0x59f111f1b605d019
   103  DATA  ·kcon+0x060(SB)/8, $0x923f82a4af194f9b
   104  DATA  ·kcon+0x068(SB)/8, $0x923f82a4af194f9b
   105  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
   106  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
   107  DATA  ·kcon+0x080(SB)/8, $0xd807aa98a3030242
   108  DATA  ·kcon+0x088(SB)/8, $0xd807aa98a3030242
   109  DATA  ·kcon+0x090(SB)/8, $0x12835b0145706fbe
   110  DATA  ·kcon+0x098(SB)/8, $0x12835b0145706fbe
   111  DATA  ·kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
   112  DATA  ·kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
   113  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
   114  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
   115  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
   116  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
   117  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
   118  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
   119  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
   120  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
   121  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174cf692694
   122  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174cf692694
   123  DATA  ·kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
   124  DATA  ·kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
   125  DATA  ·kcon+0x110(SB)/8, $0xefbe4786384f25e3
   126  DATA  ·kcon+0x118(SB)/8, $0xefbe4786384f25e3
   127  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
   128  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
   129  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
   130  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
   131  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f592b0275
   132  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f592b0275
   133  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
   134  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
   135  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
   136  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
   137  DATA  ·kcon+0x170(SB)/8, $0x76f988da831153b5
   138  DATA  ·kcon+0x178(SB)/8, $0x76f988da831153b5
   139  DATA  ·kcon+0x180(SB)/8, $0x983e5152ee66dfab
   140  DATA  ·kcon+0x188(SB)/8, $0x983e5152ee66dfab
   141  DATA  ·kcon+0x190(SB)/8, $0xa831c66d2db43210
   142  DATA  ·kcon+0x198(SB)/8, $0xa831c66d2db43210
   143  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c898fb213f
   144  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c898fb213f
   145  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
   146  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
   147  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
   148  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
   149  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147930aa725
   150  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147930aa725
   151  DATA  ·kcon+0x1E0(SB)/8, $0x06ca6351e003826f
   152  DATA  ·kcon+0x1E8(SB)/8, $0x06ca6351e003826f
   153  DATA  ·kcon+0x1F0(SB)/8, $0x142929670a0e6e70
   154  DATA  ·kcon+0x1F8(SB)/8, $0x142929670a0e6e70
   155  DATA  ·kcon+0x200(SB)/8, $0x27b70a8546d22ffc
   156  DATA  ·kcon+0x208(SB)/8, $0x27b70a8546d22ffc
   157  DATA  ·kcon+0x210(SB)/8, $0x2e1b21385c26c926
   158  DATA  ·kcon+0x218(SB)/8, $0x2e1b21385c26c926
   159  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
   160  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
   161  DATA  ·kcon+0x230(SB)/8, $0x53380d139d95b3df
   162  DATA  ·kcon+0x238(SB)/8, $0x53380d139d95b3df
   163  DATA  ·kcon+0x240(SB)/8, $0x650a73548baf63de
   164  DATA  ·kcon+0x248(SB)/8, $0x650a73548baf63de
   165  DATA  ·kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
   166  DATA  ·kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
   167  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e47edaee6
   168  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e47edaee6
   169  DATA  ·kcon+0x270(SB)/8, $0x92722c851482353b
   170  DATA  ·kcon+0x278(SB)/8, $0x92722c851482353b
   171  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
   172  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
   173  DATA  ·kcon+0x290(SB)/8, $0xa81a664bbc423001
   174  DATA  ·kcon+0x298(SB)/8, $0xa81a664bbc423001
   175  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
   176  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
   177  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a30654be30
   178  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a30654be30
   179  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
   180  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
   181  DATA  ·kcon+0x2D0(SB)/8, $0xd69906245565a910
   182  DATA  ·kcon+0x2D8(SB)/8, $0xd69906245565a910
   183  DATA  ·kcon+0x2E0(SB)/8, $0xf40e35855771202a
   184  DATA  ·kcon+0x2E8(SB)/8, $0xf40e35855771202a
   185  DATA  ·kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
   186  DATA  ·kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
   187  DATA  ·kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
   188  DATA  ·kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
   189  DATA  ·kcon+0x310(SB)/8, $0x1e376c085141ab53
   190  DATA  ·kcon+0x318(SB)/8, $0x1e376c085141ab53
   191  DATA  ·kcon+0x320(SB)/8, $0x2748774cdf8eeb99
   192  DATA  ·kcon+0x328(SB)/8, $0x2748774cdf8eeb99
   193  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
   194  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
   195  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
   196  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
   197  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
   198  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
   199  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f7763e373
   200  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f7763e373
   201  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
   202  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
   203  DATA  ·kcon+0x380(SB)/8, $0x748f82ee5defb2fc
   204  DATA  ·kcon+0x388(SB)/8, $0x748f82ee5defb2fc
   205  DATA  ·kcon+0x390(SB)/8, $0x78a5636f43172f60
   206  DATA  ·kcon+0x398(SB)/8, $0x78a5636f43172f60
   207  DATA  ·kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
   208  DATA  ·kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
   209  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
   210  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
   211  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa23631e28
   212  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa23631e28
   213  DATA  ·kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
   214  DATA  ·kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
   215  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
   216  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
   217  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2e372532b
   218  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2e372532b
   219  DATA  ·kcon+0x400(SB)/8, $0xca273eceea26619c
   220  DATA  ·kcon+0x408(SB)/8, $0xca273eceea26619c
   221  DATA  ·kcon+0x410(SB)/8, $0xd186b8c721c0c207
   222  DATA  ·kcon+0x418(SB)/8, $0xd186b8c721c0c207
   223  DATA  ·kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
   224  DATA  ·kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
   225  DATA  ·kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
   226  DATA  ·kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
   227  DATA  ·kcon+0x440(SB)/8, $0x06f067aa72176fba
   228  DATA  ·kcon+0x448(SB)/8, $0x06f067aa72176fba
   229  DATA  ·kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
   230  DATA  ·kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
   231  DATA  ·kcon+0x460(SB)/8, $0x113f9804bef90dae
   232  DATA  ·kcon+0x468(SB)/8, $0x113f9804bef90dae
   233  DATA  ·kcon+0x470(SB)/8, $0x1b710b35131c471b
   234  DATA  ·kcon+0x478(SB)/8, $0x1b710b35131c471b
   235  DATA  ·kcon+0x480(SB)/8, $0x28db77f523047d84
   236  DATA  ·kcon+0x488(SB)/8, $0x28db77f523047d84
   237  DATA  ·kcon+0x490(SB)/8, $0x32caab7b40c72493
   238  DATA  ·kcon+0x498(SB)/8, $0x32caab7b40c72493
   239  DATA  ·kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
   240  DATA  ·kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
   241  DATA  ·kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
   242  DATA  ·kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
   243  DATA  ·kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
   244  DATA  ·kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
   245  DATA  ·kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
   246  DATA  ·kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
   247  DATA  ·kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
   248  DATA  ·kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
   249  DATA  ·kcon+0x4F0(SB)/8, $0x6c44198c4a475817
   250  DATA  ·kcon+0x4F8(SB)/8, $0x6c44198c4a475817
   251  DATA  ·kcon+0x500(SB)/8, $0x0000000000000000	// zero entry: the 80th round adds this instead of a further constant
   252  DATA  ·kcon+0x508(SB)/8, $0x0000000000000000
   253  DATA  ·kcon+0x510(SB)/8, $0x1011121314151617	// byte-index pattern: last entry loaded into KI per block,
   254  DATA  ·kcon+0x518(SB)/8, $0x0001020304050607	// used as the VPERM control when block repacks the state
   255  GLOBL ·kcon(SB), RODATA, $1312
   256  // SHA512ROUND0 performs one compression round without touching the message
        // schedule (used while the words come straight from the input block).
        //   a..h  vector registers holding the working variables for this round
        //   xi    vector register holding the message word Wt for this round
        // h must already include Kt on entry: the caller pre-adds K0, and every
        // round adds the constant held in KI to g, which the next invocation
        // receives as its h.
        // Results: d += T1, h = T1 + T2 (the caller passes this register as a in
        // the next round), g += KI. KI is then reloaded from (TBL)(IDX) and IDX
        // is advanced by 16.
        // Clobbers FUNC, S0 and S1.
   257  #define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
   258  	VSEL		g, f, e, FUNC; \
   259  	VSHASIGMAD	$15, e, $1, S1; \
   260  	VADDUDM		xi, h, h; \
   261  	VSHASIGMAD	$0, a, $1, S0; \
   262  	VADDUDM		FUNC, h, h; \
   263  	VXOR		b, a, FUNC; \
   264  	VADDUDM		S1, h, h; \
   265  	VSEL		b, c, FUNC, FUNC; \
   266  	VADDUDM		KI, g, g; \
   267  	VADDUDM		h, d, d; \
   268  	VADDUDM		FUNC, S0, S0; \
   269  	LVX		(TBL)(IDX), KI; \
   270  	ADD		$16, IDX; \
   271  	VADDUDM		S0, h, h
   272  // SHA512ROUND1 performs one compression round exactly like SHA512ROUND0
        // (same a..h, xi, KI/IDX handling) and, interleaved with it, advances one
        // message-schedule word in place:
        //   xj += SIGMA0(xj_1) + xj_9 + SIGMA1(xj_14)
        // With xj holding W[j] and xj_1, xj_9, xj_14 holding W[j+1], W[j+9],
        // W[j+14], xj becomes W[j+16] per the schedule formula in the file header.
        // Clobbers FUNC, S0, S1, s0 and s1.
   273  #define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
   274  	VSHASIGMAD	$0, xj_1, $0, s0; \
   275  	VSEL		g, f, e, FUNC; \
   276  	VSHASIGMAD	$15, e, $1, S1; \
   277  	VADDUDM		xi, h, h; \
   278  	VSHASIGMAD	$0, a, $1, S0; \
   279  	VSHASIGMAD	$15, xj_14, $0, s1; \
   280  	VADDUDM		FUNC, h, h; \
   281  	VXOR		b, a, FUNC; \
   282  	VADDUDM		xj_9, xj, xj; \
   283  	VADDUDM		S1, h, h; \
   284  	VSEL		b, c, FUNC, FUNC; \
   285  	VADDUDM		KI, g, g; \
   286  	VADDUDM		h, d, d; \
   287  	VADDUDM		FUNC, S0, S0; \
   288  	VADDUDM		s0, xj, xj; \
   289  	LVX		(TBL)(IDX), KI; \
   290  	ADD		$16, IDX; \
   291  	VADDUDM		S0, h, h; \
   292  	VADDUDM		s1, xj, xj
   293  
   294  // func block(dig *digest, p []byte)
        //
        // Runs the SHA-512 compression function over every complete 128-byte
        // block of p and stores the updated hash words back through dig.
        //   dig     address of the eight 64-bit hash words (64 bytes are read and
        //           written starting at this address)
        //   p       input bytes; p_len is rounded down to a multiple of 128, so a
        //           trailing partial block is ignored
        // When p holds no complete block the routine returns without reading p
        // or writing dig.
        // Per block: eight 16-byte loads feed V8-V23, then 80 rounds run as
        // 15 x SHA512ROUND0, one SHA512ROUND1 and 4 passes of 16 x SHA512ROUND1.
   295  TEXT ·block(SB),0,$0-32
   296  	MOVD	dig+0(FP), CTX
   297  	MOVD	p_base+8(FP), INP
   298  	MOVD	p_len+16(FP), LEN
   299  
        	// Round LEN down to whole SHA-512 blocks. A block is 128 bytes (1<<7):
        	// the loop below consumes 128 bytes per iteration, so rounding to any
        	// smaller unit would let the last iteration read past the rounded length.
   300  	SRD	$7, LEN
   301  	SLD	$7, LEN
   302  
   303  	ADD	INP, LEN, END	// END = first byte after the last whole block
   304  
   305  	CMP	INP, END
   306  	BEQ	end		// no whole block: nothing to do
   307  
   308  	MOVD	$·kcon(SB), TBL
   309  	MOVD	R1, OFFLOAD	// OFFLOAD is not read anywhere in this routine
   310  
   311  	MOVD	R0, CNT		// CNT is not read anywhere in this routine
   312  	MOVWZ	$0x10, HEX10
   313  	MOVWZ	$0x20, HEX20
   314  	MOVWZ	$0x30, HEX30
   315  
   316  // Generate the mask used with VPERM for LE
        // (LVSL at address 8 yields bytes 0x08..0x17; XOR with 0x0F in every byte
        // turns that into 07..00, 1F..18, i.e. a byte reversal of each doubleword.)
   317  
   318  #ifdef GOARCH_ppc64le
   319  	MOVWZ	$8, IDX
   320  	LVSL	(IDX)(R0), LEMASK
   321  	VSPLTISB	$0x0F, KI
   322  	VXOR	KI, LEMASK, LEMASK
   323  #endif
   324  
        	// Load the eight hash words, two per 16-byte load, into V0, V2, V4, V6.
   325  	LXVD2X	(CTX)(HEX00), VS32	// v0 = vs32
   326  	LXVD2X	(CTX)(HEX10), VS34	// v2 = vs34
   327  	LXVD2X	(CTX)(HEX20), VS36	// v4 = vs36
   328  	// unpack the input values into vector registers
        	// (VSLDOI $8 rotates each pair by 8 bytes into the odd-numbered register)
   329  	VSLDOI	$8, V0, V0, V1
   330  	LXVD2X	(CTX)(HEX30), VS38	// v6 = vs38
   331  	VSLDOI	$8, V2, V2, V3
   332  	VSLDOI	$8, V4, V4, V5
   333  	VSLDOI	$8, V6, V6, V7
   334  
        // One iteration per 128-byte block.
   335  loop:
   336  	LVX	(TBL)(HEX00), KI	// KI = first kcon entry (K0)
   337  	MOVWZ	$16, IDX		// next kcon entry to load is at offset 16
   338  
   339  	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
   340  	ADD	$16, INP
   341  
   342  	// Copy V0-V7 to VS24-VS31
        	// (kept so the incoming hash words can be added back after the 80 rounds)
   343  
   344  	XXLOR	V0, V0, VS24
   345  	XXLOR	V1, V1, VS25
   346  	XXLOR	V2, V2, VS26
   347  	XXLOR	V3, V3, VS27
   348  	XXLOR	V4, V4, VS28
   349  	XXLOR	V5, V5, VS29
   350  	XXLOR	V6, V6, VS30
   351  	XXLOR	V7, V7, VS31
   352  
   353  	VADDUDM	KI, V7, V7	// h+K[i]
   354  	LVX	(TBL)(IDX), KI		// KI = K1, consumed by the first round via g
   355  	ADD	$16, IDX
   356  
        	// Rounds 0-14: message words come straight from the input. Each 16-byte
        	// load supplies two words; VSLDOI $8 exposes the second one. The a..h
        	// arguments rotate by one register per round instead of moving data.
   357  	VPERMLE(V8,V8,LEMASK,V8)
   358  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
   359  	LXVD2X	(INP)(R0), VS42	// load v10 (=vs42) in advance
   360  	ADD	$16, INP, INP
   361  	VSLDOI	$8, V8, V8, V9
   362  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
   363  	VPERMLE(V10,V10,LEMASK,V10)
   364  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
   365  	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
   366  	ADD	$16, INP, INP
   367  	VSLDOI	$8, V10, V10, V11
   368  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
   369  	VPERMLE(V12,V12,LEMASK,V12)
   370  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
   371  	LXVD2X	(INP)(R0), VS46	// load v14 (=vs46) in advance
   372  	ADD	$16, INP, INP
   373  	VSLDOI	$8, V12, V12, V13
   374  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
   375  	VPERMLE(V14,V14,LEMASK,V14)
   376  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
   377  	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
   378  	ADD	$16, INP, INP
   379  	VSLDOI	$8, V14, V14, V15
   380  	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
   381  	VPERMLE(V16,V16,LEMASK,V16)
   382  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
   383  	LXVD2X	(INP)(R0), VS50	// load v18 (=vs50) in advance
   384  	ADD	$16, INP, INP
   385  	VSLDOI	$8, V16, V16, V17
   386  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
   387  	VPERMLE(V18,V18,LEMASK,V18)
   388  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
   389  	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
   390  	ADD	$16, INP, INP
   391  	VSLDOI	$8, V18, V18, V19
   392  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
   393  	VPERMLE(V20,V20,LEMASK,V20)
   394  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
   395  	LXVD2X	(INP)(R0), VS54	// load v22 (=vs54) in advance
   396  	ADD	$16, INP, INP
   397  	VSLDOI	$8, V20, V20, V21
   398  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
   399  	VPERMLE(V22,V22,LEMASK,V22)
   400  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
   401  	VSLDOI	$8, V22, V22, V23
        	// Round 15 consumes the last input word (V23) and already produces the
        	// first scheduled word in V8.
   402  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   403  
        	// Rounds 16-79: CTR = 4 passes over the 16 rounds below.
   404  	MOVWZ	$4, TEMP
   405  	MOVWZ	TEMP, CTR
   406  
   407  L16_xx:
   408  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
   409  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
   410  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
   411  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
   412  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
   413  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
   414  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
   415  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
   416  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
   417  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
   418  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
   419  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
   420  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
   421  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
   422  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
   423  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   424  
   425  	BC	0x10, 0, L16_xx		// bdnz
   426  
        	// Add the hash words saved in VS24-VS31 back into a..h; V10-V17 are free
        	// to serve as temporaries because the schedule is no longer needed.
   427  	XXLOR	VS24, VS24, V10
   428  	XXLOR	VS25, VS25, V11
   429  	XXLOR	VS26, VS26, V12
   430  	XXLOR	VS27, VS27, V13
   431  	XXLOR	VS28, VS28, V14
   432  	XXLOR	VS29, VS29, V15
   433  	XXLOR	VS30, VS30, V16
   434  	XXLOR	VS31, VS31, V17
   435  	VADDUDM	V10, V0, V0
   436  	VADDUDM	V11, V1, V1
   437  	VADDUDM	V12, V2, V2
   438  	VADDUDM	V13, V3, V3
   439  	VADDUDM	V14, V4, V4
   440  	VADDUDM	V15, V5, V5
   441  	VADDUDM	V16, V6, V6
   442  	VADDUDM	V17, V7, V7
   443  
   444  	CMPU	INP, END
   445  	BLT	loop
   446  
        	// Repack the eight words into four registers for the store. KI holds the
        	// kcon entry at offset 0x510 here (the load issued by the 80th round),
        	// which serves as the VPERM control vector.
   447  #ifdef GOARCH_ppc64le
   448  	VPERM	V0, V1, KI, V0
   449  	VPERM	V2, V3, KI, V2
   450  	VPERM	V4, V5, KI, V4
   451  	VPERM	V6, V7, KI, V6
   452  #else
   453  	VPERM	V1, V0, KI, V0
   454  	VPERM	V3, V2, KI, V2
   455  	VPERM	V5, V4, KI, V4
   456  	VPERM	V7, V6, KI, V6
   457  #endif
   458  	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
   459  	STXVD2X	VS34, (CTX+HEX10)	// v2 = vs34
   460  	STXVD2X	VS36, (CTX+HEX20)	// v4 = vs36
   461  	STXVD2X	VS38, (CTX+HEX30)	// v6 = vs38
   462  
   463  end:
   464  	RET
   465