github.com/bir3/gocompiler@v0.9.2202/src/cmd/internal/notsha256/sha256block_ppc64x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //
     6  // WARNING: this file is built by the bootstrap compiler, thus
     7  // it must maintain compatibility with the oldest supported
     8  // bootstrap toolchain.
     9  //
    10  
    11  //go:build !purego && (ppc64 || ppc64le)
    12  
    13  // Based on CRYPTOGAMS code with the following comment:
    14  // # ====================================================================
    15  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
    16  // # project. The module is, however, dual licensed under OpenSSL and
    17  // # CRYPTOGAMS licenses depending on where you obtain it. For further
    18  // # details see http://www.openssl.org/~appro/cryptogams/.
    19  // # ====================================================================
    20  
    21  #include "textflag.h"
    22  
    23  // SHA256 block routine. See sha256block.go for Go equivalent.
    24  //
    25  // The algorithm is detailed in FIPS 180-4:
    26  //
    27  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    28  //
    29  // Wt = Mt; for 0 <= t <= 15
    30  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    31  //
    32  // a = H0
    33  // b = H1
    34  // c = H2
    35  // d = H3
    36  // e = H4
    37  // f = H5
    38  // g = H6
    39  // h = H7
    40  //
    41  // for t = 0 to 63 {
    42  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    43  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    44  //    h = g
    45  //    g = f
    46  //    f = e
    47  //    e = d + T1
    48  //    d = c
    49  //    c = b
    50  //    b = a
    51  //    a = T1 + T2
    52  // }
    53  //
    54  // H0 = a + H0
    55  // H1 = b + H1
    56  // H2 = c + H2
    57  // H3 = d + H3
    58  // H4 = e + H4
    59  // H5 = f + H5
    60  // H6 = g + H6
    61  // H7 = h + H7
    62  
    63  #define CTX	R3 // dig argument: address the 32-byte state is loaded from and stored back to
    64  #define INP	R4 // p_base; advanced by 0x40 for every processed block
    65  #define END	R5 // INP + (p_len rounded down to a multiple of 64)
    66  #define TBL	R6 // Pointer into kcon table
    67  #define LEN	R9 // p_len
    68  #define TEMP	R12 // scratch (LVSL address, CTR setup)
    69  
    70  #define TBL_STRT	R7 // Pointer to start of kcon table.
    71  
    72  #define R_x000	R0 // index 0; never written in this file, so this relies on R0 reading as zero (presumably Go's ppc64 convention - confirm)
    73  #define R_x010	R8 // R_xNNN: index register that block loads with the constant 0xNNN
    74  #define R_x020	R10
    75  #define R_x030	R11
    76  #define R_x040	R14
    77  #define R_x050	R15
    78  #define R_x060	R16
    79  #define R_x070	R17
    80  #define R_x080	R18
    81  #define R_x090	R19
    82  #define R_x0a0	R20
    83  #define R_x0b0	R21
    84  #define R_x0c0	R22
    85  #define R_x0d0	R23
    86  #define R_x0e0	R24
    87  #define R_x0f0	R25
    88  #define R_x100	R26
    89  #define R_x110	R27
    90  
    91  
    92  // V0-V7 are A-H (the working variables a..h of the algorithm described at the top of the file)
    93  // V8-V23 are used for the message schedule (16 words W[t], one per register, updated in place by SHA256ROUND1)
    94  #define KI	V24 // round constant vector most recently loaded from the kcon table
    95  #define FUNC	V25 // scratch: Ch(e,f,g) first, then Maj(a,b,c)
    96  #define S0	V26 // BIGSIGMA0(a), later BIGSIGMA0(a) + Maj(a,b,c)
    97  #define S1	V27 // BIGSIGMA1(e)
    98  #define s0	V28 // SIGMA0 term of the message schedule update
    99  #define s1	V29 // SIGMA1 term of the message schedule update
   100  #define LEMASK	V31 // Permutation control register for little endian
   101  
   102  // kcon: 4 copies of each Kt (t = 0..63, 16 bytes per constant), to fill all 4 words of a vector register
   103  DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
   104  DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
   105  DATA  ·kcon+0x010(SB)/8, $0x7137449171374491
   106  DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
   107  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
   108  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
   109  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
   110  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
   111  DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
   112  DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
   113  DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1
   114  DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
   115  DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
   116  DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
   117  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
   118  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
   119  DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
   120  DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
   121  DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01
   122  DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
   123  DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be
   124  DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
   125  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
   126  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
   127  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
   128  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
   129  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
   130  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
   131  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
   132  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
   133  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
   134  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
   135  DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
   136  DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
   137  DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
   138  DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
   139  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
   140  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
   141  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
   142  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
   143  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
   144  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
   145  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
   146  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
   147  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
   148  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
   149  DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da
   150  DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
   151  DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152
   152  DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
   153  DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d
   154  DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
   155  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
   156  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
   157  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
   158  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
   159  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
   160  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
   161  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
   162  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
   163  DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
   164  DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
   165  DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967
   166  DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
   167  DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
   168  DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
   169  DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
   170  DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
   171  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
   172  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
   173  DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13
   174  DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
   175  DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354
   176  DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
   177  DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
   178  DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
   179  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
   180  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
   181  DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85
   182  DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
   183  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
   184  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
   185  DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
   186  DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
   187  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
   188  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
   189  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
   190  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
   191  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
   192  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
   193  DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
   194  DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
   195  DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
   196  DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
   197  DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
   198  DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
   199  DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
   200  DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
   201  DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08
   202  DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
   203  DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c
   204  DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
   205  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
   206  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
   207  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
   208  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
   209  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
   210  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
   211  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
   212  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
   213  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
   214  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
   215  DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
   216  DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
   217  DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
   218  DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
   219  DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
   220  DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
   221  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
   222  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
   223  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
   224  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
   225  DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
   226  DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
   227  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
   228  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
   229  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
   230  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
   231  DATA  ·kcon+0x400(SB)/8, $0x0000000000000000 // zero row: each round pre-adds the next constant into g, so the entry after K[63] must add nothing
   232  DATA  ·kcon+0x408(SB)/8, $0x0000000000000000
   233  // The three vectors below are the VPERM controls block uses after the last round to merge V0-V3 into V0 and V4-V7 into V4.
   234  #ifdef GOARCH_ppc64le
   235  DATA  ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
   236  DATA  ·kcon+0x418(SB)/8, $0x1011121300010203
   237  DATA  ·kcon+0x420(SB)/8, $0x1011121310111213
   238  DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
   239  DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b
   240  DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
   241  #else
   242  DATA  ·kcon+0x410(SB)/8, $0x1011121300010203
   243  DATA  ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
   244  DATA  ·kcon+0x420(SB)/8, $0x0405060700010203
   245  DATA  ·kcon+0x428(SB)/8, $0x1011121310111213
   246  DATA  ·kcon+0x430(SB)/8, $0x0001020304050607
   247  DATA  ·kcon+0x438(SB)/8, $0x08090a0b10111213
   248  #endif
   249  
   250  GLOBL ·kcon(SB), RODATA, $1088 // 0x440 bytes: 64 constants * 16 + 16-byte zero row + 3 * 16-byte permutation vectors
   251  // SHA256ROUND0: one round without a schedule update. h (K already pre-added) += xi + Ch(e,f,g) + BIGSIGMA1(e); d += h; h += BIGSIGMA0(a) + Maj(a,b,c). Also adds KI to g (the next round's h) and then reloads KI from (TBL)(idx).
   252  #define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
   253  	VSEL		g, f, e, FUNC; \
   254  	VSHASIGMAW	$15, e, $1, S1; \
   255  	VADDUWM		xi, h, h; \
   256  	VSHASIGMAW	$0, a, $1, S0; \
   257  	VADDUWM		FUNC, h, h; \
   258  	VXOR		b, a, FUNC; \
   259  	VADDUWM		S1, h, h; \
   260  	VSEL		b, c, FUNC, FUNC; \
   261  	VADDUWM		KI, g, g; \
   262  	VADDUWM		h, d, d; \
   263  	VADDUWM		FUNC, S0, S0; \
   264  	LVX		(TBL)(idx), KI; \
   265  	VADDUWM		S0, h, h
   266  // SHA256ROUND1: the same round as SHA256ROUND0, interleaved with an in-place schedule update: xj (W[j]) becomes W[j+16] = W[j] + xj_9 (W[j+9]) + SIGMA0(xj_1 = W[j+1]) + SIGMA1(xj_14 = W[j+14]).
   267  #define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
   268  	VSHASIGMAW	$0, xj_1, $0, s0; \
   269  	VSEL		g, f, e, FUNC; \
   270  	VSHASIGMAW	$15, e, $1, S1; \
   271  	VADDUWM		xi, h, h; \
   272  	VSHASIGMAW	$0, a, $1, S0; \
   273  	VSHASIGMAW	$15, xj_14, $0, s1; \
   274  	VADDUWM		FUNC, h, h; \
   275  	VXOR		b, a, FUNC; \
   276  	VADDUWM		xj_9, xj, xj; \
   277  	VADDUWM		S1, h, h; \
   278  	VSEL		b, c, FUNC, FUNC; \
   279  	VADDUWM		KI, g, g; \
   280  	VADDUWM		h, d, d; \
   281  	VADDUWM		FUNC, S0, S0; \
   282  	VADDUWM		s0, xj, xj; \
   283  	LVX		(TBL)(idx), KI; \
   284  	VADDUWM		S0, h, h; \
   285  	VADDUWM		s1, xj, xj
   286  // VPERMLE: a plain VPERM on ppc64le and nothing at all on ppc64; block applies it, with LEMASK as the control vector, to each 16-byte input load.
   287  #ifdef GOARCH_ppc64le
   288  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
   289  #else
   290  #define VPERMLE(va,vb,vc,vt)
   291  #endif
   292  // block runs the SHA256 block routine over every whole 64-byte chunk of p, updating the 32-byte state at dig; a trailing partial chunk is ignored.
   293  // func block(dig *digest, p []byte)
   294  TEXT ·block(SB),0,$0-32 // frame size 0, 32 bytes of arguments
   295  	MOVD	dig+0(FP), CTX
   296  	MOVD	p_base+8(FP), INP
   297  	MOVD	p_len+16(FP), LEN
   298  
   299  	SRD	$6, LEN
   300  	SLD	$6, LEN // LEN = p_len rounded down to a multiple of 64
   301  	ADD	INP, LEN, END // END = INP + LEN, one past the last whole block
   302  
   303  	CMP	INP, END
   304  	BEQ	end // no whole block to process; dig is left untouched
   305  
   306  	MOVD	$·kcon(SB), TBL_STRT
   307  	MOVD	$0x10, R_x010
   308  
   309  #ifdef GOARCH_ppc64le
   310  	MOVWZ	$8, TEMP
   311  	LVSL	(TEMP)(R0), LEMASK
   312  	VSPLTISB	$0x0F, KI // KI is only a temporary here; it is reloaded at the top of loop
   313  	VXOR	KI, LEMASK, LEMASK // LEMASK = LVSL result for address 8 with 0x0F XORed into every byte
   314  #endif
   315  
   316  	LXVW4X	(CTX)(R_x000), V0 // first 16 bytes of the state
   317  	LXVW4X	(CTX)(R_x010), V4 // second 16 bytes of the state
   318  
   319  	// unpack the input values into vector registers
   320  	VSLDOI	$4, V0, V0, V1 // V1..V3 are V0 rotated left by 4, 8 and 12 bytes
   321  	VSLDOI	$8, V0, V0, V2
   322  	VSLDOI	$12, V0, V0, V3
   323  	VSLDOI	$4, V4, V4, V5 // V5..V7 are V4 rotated left by 4, 8 and 12 bytes
   324  	VSLDOI	$8, V4, V4, V6
   325  	VSLDOI	$12, V4, V4, V7
   326  	// Load the remaining index registers with the byte offsets their names spell out.
   327  	MOVD	$0x020, R_x020
   328  	MOVD	$0x030, R_x030
   329  	MOVD	$0x040, R_x040
   330  	MOVD	$0x050, R_x050
   331  	MOVD	$0x060, R_x060
   332  	MOVD	$0x070, R_x070
   333  	MOVD	$0x080, R_x080
   334  	MOVD	$0x090, R_x090
   335  	MOVD	$0x0a0, R_x0a0
   336  	MOVD	$0x0b0, R_x0b0
   337  	MOVD	$0x0c0, R_x0c0
   338  	MOVD	$0x0d0, R_x0d0
   339  	MOVD	$0x0e0, R_x0e0
   340  	MOVD	$0x0f0, R_x0f0
   341  	MOVD	$0x100, R_x100
   342  	MOVD	$0x110, R_x110
   343  
   344  loop: // one iteration per 64-byte input block
   345  	MOVD	TBL_STRT, TBL // rewind the table pointer to K[0]
   346  	LVX	(TBL)(R_x000), KI // KI = K[0]
   347  
   348  	LXVD2X	(INP)(R_x000), V8 // load v8 in advance
   349  
   350  	// Offload to VSR24-31 (aka FPR24-31)
   351  	XXLOR	V0, V0, VS24 // copy of the incoming a..h; added back in after the last round
   352  	XXLOR	V1, V1, VS25
   353  	XXLOR	V2, V2, VS26
   354  	XXLOR	V3, V3, VS27
   355  	XXLOR	V4, V4, VS28
   356  	XXLOR	V5, V5, VS29
   357  	XXLOR	V6, V6, VS30
   358  	XXLOR	V7, V7, VS31
   359  
   360  	VADDUWM	KI, V7, V7        // h+K[i]
   361  	LVX	(TBL)(R_x010), KI // KI = K[1]; from here each round macro pre-adds KI into the next h and loads the constant after it
   362  
   363  	VPERMLE(V8, V8, LEMASK, V8)
   364  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
   365  	VSLDOI	$4, V8, V8, V9
   366  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
   367  	VSLDOI	$4, V9, V9, V10
   368  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
   369  	LXVD2X	(INP)(R_x010), V12 // load v12 in advance
   370  	VSLDOI	$4, V10, V10, V11
   371  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
   372  	VPERMLE(V12, V12, LEMASK, V12)
   373  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
   374  	VSLDOI	$4, V12, V12, V13
   375  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
   376  	VSLDOI	$4, V13, V13, V14
   377  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
   378  	LXVD2X	(INP)(R_x020), V16 // load v16 in advance
   379  	VSLDOI	$4, V14, V14, V15
   380  	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
   381  	VPERMLE(V16, V16, LEMASK, V16)
   382  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
   383  	VSLDOI	$4, V16, V16, V17
   384  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
   385  	VSLDOI	$4, V17, V17, V18
   386  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
   387  	VSLDOI	$4, V18, V18, V19
   388  	LXVD2X	(INP)(R_x030), V20 // load v20 in advance
   389  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
   390  	VPERMLE(V20, V20, LEMASK, V20)
   391  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
   392  	VSLDOI	$4, V20, V20, V21
   393  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
   394  	VSLDOI	$4, V21, V21, V22
   395  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
   396  	VSLDOI	$4, V22, V22, V23
   397  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
   398  	// The remaining 48 rounds run as three passes over the 16-round body at L16_xx, counted in CTR.
   399  	MOVD	$3, TEMP
   400  	MOVD	TEMP, CTR
   401  	ADD	$0x120, TBL // skip the 18 table entries (offsets 0x000-0x110) loaded so far
   402  	ADD	$0x40, INP // all 64 bytes of this block are already in V8-V23
   403  
   404  L16_xx:
   405  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
   406  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
   407  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
   408  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
   409  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
   410  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
   411  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
   412  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
   413  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
   414  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
   415  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
   416  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
   417  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
   418  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
   419  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
   420  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
   421  	ADD	$0x100, TBL // 16 table entries consumed per pass
   422  
   423  	BDNZ	L16_xx // decrement CTR and repeat while it is non-zero
   424  	// Add the state saved at the top of loop back into the working variables.
   425  	XXLOR	VS24, VS24, V10
   426  
   427  	XXLOR	VS25, VS25, V11
   428  	VADDUWM	V10, V0, V0
   429  	XXLOR	VS26, VS26, V12
   430  	VADDUWM	V11, V1, V1
   431  	XXLOR	VS27, VS27, V13
   432  	VADDUWM	V12, V2, V2
   433  	XXLOR	VS28, VS28, V14
   434  	VADDUWM	V13, V3, V3
   435  	XXLOR	VS29, VS29, V15
   436  	VADDUWM	V14, V4, V4
   437  	XXLOR	VS30, VS30, V16
   438  	VADDUWM	V15, V5, V5
   439  	XXLOR	VS31, VS31, V17
   440  	VADDUWM	V16, V6, V6
   441  	VADDUWM	V17, V7, V7
   442  
   443  	CMPU	INP, END
   444  	BLT	loop // unsigned compare: more whole blocks remain
   445  	// Merge V0-V3 into V0 and V4-V7 into V4 with the permutation control vectors stored after the constants, then store the 32-byte state.
   446  	LVX	(TBL)(R_x000), V8 // TBL is kcon+0x420 here (0x120 + 3*0x100)
   447  	VPERM	V0, V1, KI, V0 // KI still holds the vector at kcon+0x410, loaded by the final round
   448  	LVX	(TBL)(R_x010), V9 // vector at kcon+0x430
   449  	VPERM	V4, V5, KI, V4
   450  	VPERM	V0, V2, V8, V0
   451  	VPERM	V4, V6, V8, V4
   452  	VPERM	V0, V3, V9, V0
   453  	VPERM	V4, V7, V9, V4
   454  	STXVD2X	V0, (CTX+R_x000)
   455  	STXVD2X	V4, (CTX+R_x010)
   456  
   457  end:
   458  	RET
   459