github.com/bir3/gocompiler@v0.3.205/src/cmd/internal/notsha256/sha256block_ppc64x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //
     6  // WARNING: this file is built by the bootstrap compiler, thus
     7  // it must maintain compatibility with the oldest supported
     8  // bootstrap toolchain.
     9  //
    10  
    11  //go:build !purego && (ppc64 || ppc64le)
    12  // +build !purego
    13  // +build ppc64 ppc64le
    14  
    15  // Based on CRYPTOGAMS code with the following comment:
    16  // # ====================================================================
    17  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    18  // # project. The module is, however, dual licensed under OpenSSL and
    19  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    20  // # details see http://www.openssl.org/~appro/cryptogams/.
    21  // # ====================================================================
    22  
    23  #include "textflag.h"
    24  
    25  // SHA256 block routine. See sha256block.go for Go equivalent.
    26  //
    27  // The algorithm is detailed in FIPS 180-4:
    28  //
    29  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    30  //
    31  // Wt = Mt; for 0 <= t <= 15
    32  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    33  //
    34  // a = H0
    35  // b = H1
    36  // c = H2
    37  // d = H3
    38  // e = H4
    39  // f = H5
    40  // g = H6
    41  // h = H7
    42  //
    43  // for t = 0 to 63 {
    44  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    45  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    46  //    h = g
    47  //    g = f
    48  //    f = e
    49  //    e = d + T1
    50  //    d = c
    51  //    c = b
    52  //    b = a
    53  //    a = T1 + T2
    54  // }
    55  //
    56  // H0 = a + H0
    57  // H1 = b + H1
    58  // H2 = c + H2
    59  // H3 = d + H3
    60  // H4 = e + H4
    61  // H5 = f + H5
    62  // H6 = g + H6
    63  // H7 = h + H7
    64  
    65  #define CTX	R3 // dig argument: address of the hash state words that are loaded, updated and stored back
    66  #define INP	R4 // p_base argument: address of the next 64-byte input block (advanced by 0x40 per block)
    67  #define END	R5 // INP + LEN: address just past the last whole input block
    68  #define TBL	R6 // Pointer into kcon table
    69  #define LEN	R9 // p_len argument, rounded down to a multiple of 64
    70  #define TEMP	R12 // scratch: address operand for LVSL, then the CTR loop count
    71  
    72  #define TBL_STRT	R7 // Pointer to start of kcon table; copied into TBL at the top of every block iteration.
    73  
    74  #define R_x000	R0 // index register for offset 0x000; R0 is never loaded in this file, so it is relied on to hold zero
    75  #define R_x010	R8 // R_x010..R_x110: index registers that block loads with the constant byte offsets 0x010..0x110
    76  #define R_x020	R10
    77  #define R_x030	R11
    78  #define R_x040	R14
    79  #define R_x050	R15
    80  #define R_x060	R16
    81  #define R_x070	R17
    82  #define R_x080	R18
    83  #define R_x090	R19
    84  #define R_x0a0	R20
    85  #define R_x0b0	R21
    86  #define R_x0c0	R22
    87  #define R_x0d0	R23
    88  #define R_x0e0	R24
    89  #define R_x0f0	R25
    90  #define R_x100	R26
    91  #define R_x110	R27
    92  
    93  
    94  // V0-V7 are A-H (the eight working variables, one per vector register)
    95  // V8-V23 are used for the message schedule (sixteen schedule words, one per vector register)
    96  #define KI	V24 // round constant vector most recently loaded from kcon
    97  #define FUNC	V25 // scratch for Ch(e,f,g) and then Maj(a,b,c)
    98  #define S0	V26 // BIGSIGMA0(a), then T2 = BIGSIGMA0(a) + Maj(a,b,c)
    99  #define S1	V27 // BIGSIGMA1(e)
   100  #define s0	V28 // SIGMA0 term of the message schedule (SHA256ROUND1 only)
   101  #define s1	V29 // SIGMA1 term of the message schedule (SHA256ROUND1 only)
   102  #define LEMASK	V31 // Permutation control register for little endian
   103  
   104  // 4 copies of each Kt, to fill all 4 words of a vector register
   105  DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98 // K[0]
   106  DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
   107  DATA  ·kcon+0x010(SB)/8, $0x7137449171374491 // K[1]
   108  DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
   109  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf // K[2]
   110  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
   111  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5 // K[3]
   112  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
   113  DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b // K[4]
   114  DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
   115  DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1 // K[5]
   116  DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
   117  DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4 // K[6]
   118  DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
   119  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5 // K[7]
   120  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
   121  DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98 // K[8]
   122  DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
   123  DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01 // K[9]
   124  DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
   125  DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be // K[10]
   126  DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
   127  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3 // K[11]
   128  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
   129  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74 // K[12]
   130  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
   131  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe // K[13]
   132  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
   133  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7 // K[14]
   134  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
   135  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174 // K[15]
   136  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
   137  DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1 // K[16]
   138  DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
   139  DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786 // K[17]
   140  DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
   141  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6 // K[18]
   142  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
   143  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc // K[19]
   144  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
   145  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f // K[20]
   146  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
   147  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa // K[21]
   148  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
   149  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc // K[22]
   150  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
   151  DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da // K[23]
   152  DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
   153  DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152 // K[24]
   154  DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
   155  DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d // K[25]
   156  DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
   157  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8 // K[26]
   158  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
   159  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7 // K[27]
   160  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
   161  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3 // K[28]
   162  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
   163  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147 // K[29]
   164  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
   165  DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351 // K[30]
   166  DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
   167  DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967 // K[31]
   168  DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
   169  DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85 // K[32]
   170  DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
   171  DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138 // K[33]
   172  DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
   173  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc // K[34]
   174  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
   175  DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13 // K[35]
   176  DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
   177  DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354 // K[36]
   178  DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
   179  DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb // K[37]
   180  DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
   181  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e // K[38]
   182  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
   183  DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85 // K[39]
   184  DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
   185  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1 // K[40]
   186  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
   187  DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b // K[41]
   188  DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
   189  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70 // K[42]
   190  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
   191  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3 // K[43]
   192  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
   193  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819 // K[44]
   194  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
   195  DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624 // K[45]
   196  DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
   197  DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585 // K[46]
   198  DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
   199  DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070 // K[47]
   200  DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
   201  DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116 // K[48]
   202  DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
   203  DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08 // K[49]
   204  DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
   205  DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c // K[50]
   206  DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
   207  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5 // K[51]
   208  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
   209  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3 // K[52]
   210  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
   211  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a // K[53]
   212  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
   213  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f // K[54]
   214  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
   215  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3 // K[55]
   216  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
   217  DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee // K[56]
   218  DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
   219  DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f // K[57]
   220  DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
   221  DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814 // K[58]
   222  DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
   223  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208 // K[59]
   224  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
   225  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa // K[60]
   226  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
   227  DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb // K[61]
   228  DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
   229  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7 // K[62]
   230  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
   231  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2 // K[63]
   232  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
   233  DATA  ·kcon+0x400(SB)/8, $0x0000000000000000 // all-zero entry: each round pre-adds the following round's K, so the
   234  DATA  ·kcon+0x408(SB)/8, $0x0000000000000000 // last round (63) adds this zero vector, which has no effect
   235  
   236  #ifdef GOARCH_ppc64le
   237  DATA  ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
   238  DATA  ·kcon+0x418(SB)/8, $0x1011121300010203 // kcon+0x410: left in KI by the last round's LVX, used to merge V1 into V0 and V5 into V4
   239  DATA  ·kcon+0x420(SB)/8, $0x1011121310111213 // kcon+0x420: loaded into V8 after the last block, used to merge V2 and V6
   240  DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
   241  DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b // kcon+0x430: loaded into V9 after the last block, used to merge V3 and V7
   242  DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
   243  #else
   244  DATA  ·kcon+0x410(SB)/8, $0x1011121300010203 // big-endian layout of the same three control vectors
   245  DATA  ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
   246  DATA  ·kcon+0x420(SB)/8, $0x0405060700010203
   247  DATA  ·kcon+0x428(SB)/8, $0x1011121310111213
   248  DATA  ·kcon+0x430(SB)/8, $0x0001020304050607
   249  DATA  ·kcon+0x438(SB)/8, $0x08090a0b10111213
   250  #endif
   251  
   252  GLOBL ·kcon(SB), RODATA, $1088 // 1088 = 0x440 bytes = 68 16-byte entries: 64 K vectors, 1 zero vector, 3 permutation control vectors
   253  
   254  #define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \ // one round without schedule update: xi = Wt, idx = index register for the next K load; callers rotate a-h instead of moving registers
   255  	VSEL		g, f, e, FUNC; \ // FUNC = Ch(e,f,g): f where e is set, g elsewhere
   256  	VSHASIGMAW	$15, e, $1, S1; \ // S1 = BIGSIGMA1(e)
   257  	VADDUWM		xi, h, h; \ // h += Wt (Kt was already added to h before this round)
   258  	VSHASIGMAW	$0, a, $1, S0; \ // S0 = BIGSIGMA0(a)
   259  	VADDUWM		FUNC, h, h; \ // h += Ch(e,f,g)
   260  	VXOR		b, a, FUNC; \ // FUNC = a ^ b, selector for Maj
   261  	VADDUWM		S1, h, h; \ // h += BIGSIGMA1(e): h now holds T1
   262  	VSEL		b, c, FUNC, FUNC; \ // FUNC = Maj(a,b,c): c where a and b differ, b elsewhere
   263  	VADDUWM		KI, g, g; \ // pre-add the next round's K to g, which is passed as h in the next round
   264  	VADDUWM		h, d, d; \ // d += T1 (passed as e in the next round)
   265  	VADDUWM		FUNC, S0, S0; \ // S0 = T2 = BIGSIGMA0(a) + Maj(a,b,c)
   266  	LVX		(TBL)(idx), KI; \ // load the K vector at TBL+idx, consumed by the next invocation
   267  	VADDUWM		S0, h, h // h = T1 + T2 (passed as a in the next round)
   268  
   269  #define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \ // same round as SHA256ROUND0 with xi = Wt, interleaved with the in-place schedule update xj += SIGMA0(xj_1) + xj_9 + SIGMA1(xj_14)
   270  	VSHASIGMAW	$0, xj_1, $0, s0; \ // s0 = SIGMA0(xj_1)
   271  	VSEL		g, f, e, FUNC; \ // FUNC = Ch(e,f,g)
   272  	VSHASIGMAW	$15, e, $1, S1; \ // S1 = BIGSIGMA1(e)
   273  	VADDUWM		xi, h, h; \ // h += Wt (Kt was already added to h before this round)
   274  	VSHASIGMAW	$0, a, $1, S0; \ // S0 = BIGSIGMA0(a)
   275  	VSHASIGMAW	$15, xj_14, $0, s1; \ // s1 = SIGMA1(xj_14)
   276  	VADDUWM		FUNC, h, h; \ // h += Ch(e,f,g)
   277  	VXOR		b, a, FUNC; \ // FUNC = a ^ b, selector for Maj
   278  	VADDUWM		xj_9, xj, xj; \ // xj += xj_9
   279  	VADDUWM		S1, h, h; \ // h += BIGSIGMA1(e): h now holds T1
   280  	VSEL		b, c, FUNC, FUNC; \ // FUNC = Maj(a,b,c)
   281  	VADDUWM		KI, g, g; \ // pre-add the next round's K to g, which is passed as h in the next round
   282  	VADDUWM		h, d, d; \ // d += T1 (passed as e in the next round)
   283  	VADDUWM		FUNC, S0, S0; \ // S0 = T2 = BIGSIGMA0(a) + Maj(a,b,c)
   284  	VADDUWM		s0, xj, xj; \ // xj += SIGMA0(xj_1)
   285  	LVX		(TBL)(idx), KI; \ // load the K vector at TBL+idx, consumed by the next invocation
   286  	VADDUWM		S0, h, h; \ // h = T1 + T2 (passed as a in the next round)
   287  	VADDUWM		s1, xj, xj // xj += SIGMA1(xj_14): xj now holds the schedule word 16 positions after its old value
   288  
   289  #ifdef GOARCH_ppc64le
   290  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt // little endian: permute the freshly loaded input vector (block passes LEMASK as vc)
   291  #else
   292  #define VPERMLE(va,vb,vc,vt) // big endian: expands to nothing, the loaded input is used as is
   293  #endif
   294  
   295  // func block(dig *digest, p []byte)
   296  TEXT ·block(SB),0,$0-32 // no local frame, 32 bytes of arguments; runs the compression function over every whole 64-byte block of p and updates the first 32 bytes at dig
   297  	MOVD	dig+0(FP), CTX
   298  	MOVD	p_base+8(FP), INP
   299  	MOVD	p_len+16(FP), LEN
   300  
   301  	SRD	$6, LEN // LEN = len(p) / 64 ...
   302  	SLD	$6, LEN // ... * 64: any trailing partial block is ignored
   303  	ADD	INP, LEN, END // END = address just past the last whole block
   304  
   305  	CMP	INP, END
   306  	BEQ	end // no whole block to process: return without touching dig
   307  
   308  	MOVD	$·kcon(SB), TBL_STRT
   309  	MOVD	$0x10, R_x010 // needed right away for the second state load below
   310  
   311  #ifdef GOARCH_ppc64le
   312  	MOVWZ	$8, TEMP
   313  	LVSL	(TEMP)(R0), LEMASK // shift-left permute control for address 8
   314  	VSPLTISB	$0x0F, KI // KI = 0x0F in every byte (KI is free until the loop loads K into it)
   315  	VXOR	KI, LEMASK, LEMASK // LEMASK = control vector applied by VPERMLE to each loaded input vector
   316  #endif
   317  
   318  	LXVW4X	(CTX)(R_x000), V0 // first 16 bytes of the state
   319  	LXVW4X	(CTX)(R_x010), V4 // second 16 bytes of the state
   320  
   321  	// unpack the input values into vector registers
   322  	VSLDOI	$4, V0, V0, V1 // V1 = V0 rotated left by 4 bytes
   323  	VSLDOI	$8, V0, V0, V2 // V2 = V0 rotated left by 8 bytes
   324  	VSLDOI	$12, V0, V0, V3 // V3 = V0 rotated left by 12 bytes
   325  	VSLDOI	$4, V4, V4, V5 // V5..V7: the same rotations of V4
   326  	VSLDOI	$8, V4, V4, V6
   327  	VSLDOI	$12, V4, V4, V7
   328  
   329  	MOVD	$0x020, R_x020 // remaining index registers: constant byte offsets 0x020..0x110
   330  	MOVD	$0x030, R_x030
   331  	MOVD	$0x040, R_x040
   332  	MOVD	$0x050, R_x050
   333  	MOVD	$0x060, R_x060
   334  	MOVD	$0x070, R_x070
   335  	MOVD	$0x080, R_x080
   336  	MOVD	$0x090, R_x090
   337  	MOVD	$0x0a0, R_x0a0
   338  	MOVD	$0x0b0, R_x0b0
   339  	MOVD	$0x0c0, R_x0c0
   340  	MOVD	$0x0d0, R_x0d0
   341  	MOVD	$0x0e0, R_x0e0
   342  	MOVD	$0x0f0, R_x0f0
   343  	MOVD	$0x100, R_x100
   344  	MOVD	$0x110, R_x110
   345  
   346  loop: // one iteration per 64-byte input block
   347  	MOVD	TBL_STRT, TBL // restart at the beginning of kcon
   348  	LVX	(TBL)(R_x000), KI // KI = K[0]
   349  
   350  	LXVD2X	(INP)(R_x000), V8 // load v8 in advance
   351  
   352  	// Offload to VSR24-31 (aka FPR24-31)
   353  	XXLOR	V0, V0, VS24 // copies of a..h on entry to this block, added back after the 64 rounds
   354  	XXLOR	V1, V1, VS25
   355  	XXLOR	V2, V2, VS26
   356  	XXLOR	V3, V3, VS27
   357  	XXLOR	V4, V4, VS28
   358  	XXLOR	V5, V5, VS29
   359  	XXLOR	V6, V6, VS30
   360  	XXLOR	V7, V7, VS31
   361  
   362  	VADDUWM	KI, V7, V7        // h+K[i]
   363  	LVX	(TBL)(R_x010), KI // KI = K[1], pre-added inside round 0
   364  
   365  	VPERMLE(V8, V8, LEMASK, V8)
   366  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020) // round 0
   367  	VSLDOI	$4, V8, V8, V9 // each VSLDOI rotates the next input word into place for the following round
   368  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030) // round 1
   369  	VSLDOI	$4, V9, V9, V10
   370  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040) // round 2
   371  	LXVD2X	(INP)(R_x010), V12 // load v12 in advance
   372  	VSLDOI	$4, V10, V10, V11
   373  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050) // round 3
   374  	VPERMLE(V12, V12, LEMASK, V12)
   375  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060) // round 4
   376  	VSLDOI	$4, V12, V12, V13
   377  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070) // round 5
   378  	VSLDOI	$4, V13, V13, V14
   379  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080) // round 6
   380  	LXVD2X	(INP)(R_x020), V16 // load v16 in advance
   381  	VSLDOI	$4, V14, V14, V15
   382  	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090) // round 7
   383  	VPERMLE(V16, V16, LEMASK, V16)
   384  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0) // round 8
   385  	VSLDOI	$4, V16, V16, V17
   386  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0) // round 9
   387  	VSLDOI	$4, V17, V17, V18
   388  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0) // round 10
   389  	VSLDOI	$4, V18, V18, V19
   390  	LXVD2X	(INP)(R_x030), V20 // load v20 in advance
   391  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0) // round 11
   392  	VPERMLE(V20, V20, LEMASK, V20)
   393  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0) // round 12
   394  	VSLDOI	$4, V20, V20, V21
   395  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0) // round 13
   396  	VSLDOI	$4, V21, V21, V22
   397  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100) // round 14
   398  	VSLDOI	$4, V22, V22, V23
   399  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110) // round 15, and the first schedule update (V8)
   400  
   401  	MOVD	$3, TEMP
   402  	MOVD	TEMP, CTR // 3 passes of 16 rounds each: rounds 16-63
   403  	ADD	$0x120, TBL // step past the 18 kcon entries (0x000-0x110) loaded so far
   404  	ADD	$0x40, INP // this block's input is fully loaded; point at the next block
   405  
   406  L16_xx: // 16 rounds per pass; each round also updates one schedule register
   407  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
   408  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
   409  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
   410  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
   411  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
   412  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
   413  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
   414  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
   415  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
   416  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
   417  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
   418  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
   419  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
   420  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
   421  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
   422  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
   423  	ADD	$0x100, TBL // step past the 16 kcon entries loaded in this pass
   424  
   425  	BDNZ	L16_xx
   426  
   427  	XXLOR	VS24, VS24, V10 // bring the saved a..h back into V10..V17 (the schedule registers are free now)
   428  
   429  	XXLOR	VS25, VS25, V11
   430  	VADDUWM	V10, V0, V0 // add each saved value to the corresponding working variable
   431  	XXLOR	VS26, VS26, V12
   432  	VADDUWM	V11, V1, V1
   433  	XXLOR	VS27, VS27, V13
   434  	VADDUWM	V12, V2, V2
   435  	XXLOR	VS28, VS28, V14
   436  	VADDUWM	V13, V3, V3
   437  	XXLOR	VS29, VS29, V15
   438  	VADDUWM	V14, V4, V4
   439  	XXLOR	VS30, VS30, V16
   440  	VADDUWM	V15, V5, V5
   441  	XXLOR	VS31, VS31, V17
   442  	VADDUWM	V16, V6, V6
   443  	VADDUWM	V17, V7, V7
   444  
   445  	CMPU	INP, END
   446  	BLT	loop // more whole blocks remain
   447  
   448  	LVX	(TBL)(R_x000), V8 // TBL is kcon+0x420 here: second permutation control vector
   449  	VPERM	V0, V1, KI, V0 // KI holds the vector at kcon+0x410, loaded by the last round; merge V1 into V0
   450  	LVX	(TBL)(R_x010), V9 // third permutation control vector (kcon+0x430)
   451  	VPERM	V4, V5, KI, V4 // merge V5 into V4
   452  	VPERM	V0, V2, V8, V0 // merge V2 into V0
   453  	VPERM	V4, V6, V8, V4 // merge V6 into V4
   454  	VPERM	V0, V3, V9, V0 // merge V3 into V0: V0 is now packed for the store
   455  	VPERM	V4, V7, V9, V4 // merge V7 into V4: V4 is now packed for the store
   456  	STXVD2X	V0, (CTX+R_x000) // write back the first 16 bytes of the state
   457  	STXVD2X	V4, (CTX+R_x010) // write back the second 16 bytes of the state
   458  
   459  end:
   460  	RET
   461