github.com/mattn/go@v0.0.0-20171011075504-07f7db3ea99f/src/crypto/sha256/sha256block_ppc64le.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // This is a derived work from OpenSSL of SHA-2 using assembly optimizations. The
     6  // original code was written by Andy Polyakov <appro@openssl.org> and it's dual
     7  // licensed under OpenSSL and CRYPTOGAMS licenses depending on where you obtain
     8  // it. For further details see http://www.openssl.org/~appro/cryptogams/.
     9  
    10  #include "textflag.h"
    11  
    12  // SHA256 block routine. See sha256block.go for Go equivalent.
    13  //
    14  // The algorithm is detailed in FIPS 180-4:
    15  //
    16  //  http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
    17  //
    18  // Wt = Mt; for 0 <= t <= 15
    19  // Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
    20  //
    21  // a = H0
    22  // b = H1
    23  // c = H2
    24  // d = H3
    25  // e = H4
    26  // f = H5
    27  // g = H6
    28  // h = H7
    29  //
    30  // for t = 0 to 63 {
    31  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
    32  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
    33  //    h = g
    34  //    g = f
    35  //    f = e
    36  //    e = d + T1
    37  //    d = c
    38  //    c = b
    39  //    b = a
    40  //    a = T1 + T2
    41  // }
    42  //
    43  // H0 = a + H0
    44  // H1 = b + H1
    45  // H2 = c + H2
    46  // H3 = d + H3
    47  // H4 = e + H4
    48  // H5 = f + H5
    49  // H6 = g + H6
    50  // H7 = h + H7
    51  
    52  #define CTX	R3	// dig argument; the hash state is loaded from and stored back through it
    53  #define INP	R4	// p_base; advanced by 16 after each 16-byte message load
    54  #define END	R5	// INP + LEN: one past the last whole 64-byte block
    55  #define TBL	R6	// address of ·kcon
    56  #define IDX	R7	// byte offset into ·kcon of the next vector to fetch with LVX
    57  #define CNT	R8	// copied from R0 in block; not referenced again in the code visible here
    58  #define LEN	R9	// p_len rounded down to a multiple of 64
    59  #define OFFLOAD	R11	// copy of R1; base address used to spill/reload V0-V7 for each block
    60  #define TEMP	R12	// scratch; carries the L16_xx iteration count into CTR
    61  
    62  #define HEX00	R0	// used as a zero index; never written here, so R0 must already hold 0
    63  #define HEX10	R10	// HEX10..HEX70 are loaded with 0x10..0x70 in block and used as index registers
    64  #define HEX20	R25
    65  #define HEX30	R26
    66  #define HEX40	R27
    67  #define HEX50	R28
    68  #define HEX60	R29
    69  #define HEX70	R31
    70  
    71  // V0-V7 are A-H
    72  // V8-V23 are used for the message schedule
    73  #define KI	V24	// round constant vector fetched one step ahead from ·kcon
    74  #define FUNC	V25	// scratch for the VSEL/VXOR results inside the round macros
    75  #define S0	V26	// VSHASIGMAW $0, x, $1 result (applied to a)
    76  #define S1	V27	// VSHASIGMAW $15, x, $1 result (applied to e)
    77  #define s0	V28	// VSHASIGMAW $0, x, $0 result (message schedule)
    78  #define s1	V29	// VSHASIGMAW $15, x, $0 result (message schedule)
    79  #define LEMASK	V31	// Permutation control register for little endian
    80  
    81  // 4 copies of each Kt, to fill all 4 words of a vector register
        //
        // Layout of ·kcon (1088 = 0x440 bytes in total):
        //   0x000-0x3FF  the 64 round constants K0..K63, 16 bytes each
        //   0x400-0x40F  one all-zero vector; the last round still pre-adds and
        //                pre-fetches "the next constant", and this entry makes
        //                that extra add a no-op
        //   0x410-0x43F  three VPERM control vectors used by block to repack
        //                V0-V3 and V4-V7 before the final store
        // block reads the table with LVX at a running offset (IDX) that grows
        // by 16 per fetch, so the order of the entries below is significant.
    82  DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
    83  DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
    84  DATA  ·kcon+0x010(SB)/8, $0x7137449171374491
    85  DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
    86  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
    87  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
    88  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
    89  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
    90  DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
    91  DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
    92  DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1
    93  DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
    94  DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
    95  DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
    96  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
    97  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
    98  DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
    99  DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
   100  DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01
   101  DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
   102  DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be
   103  DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
   104  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
   105  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
   106  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
   107  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
   108  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
   109  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
   110  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
   111  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
   112  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
   113  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
   114  DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
   115  DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
   116  DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
   117  DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
   118  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
   119  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
   120  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
   121  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
   122  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
   123  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
   124  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
   125  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
   126  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
   127  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
   128  DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da
   129  DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
   130  DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152
   131  DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
   132  DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d
   133  DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
   134  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
   135  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
   136  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
   137  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
   138  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
   139  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
   140  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
   141  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
   142  DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
   143  DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
   144  DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967
   145  DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
   146  DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
   147  DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
   148  DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
   149  DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
   150  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
   151  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
   152  DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13
   153  DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
   154  DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354
   155  DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
   156  DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
   157  DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
   158  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
   159  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
   160  DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85
   161  DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
   162  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
   163  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
   164  DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
   165  DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
   166  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
   167  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
   168  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
   169  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
   170  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
   171  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
   172  DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
   173  DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
   174  DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
   175  DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
   176  DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
   177  DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
   178  DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
   179  DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
   180  DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08
   181  DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
   182  DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c
   183  DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
   184  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
   185  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
   186  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
   187  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
   188  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
   189  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
   190  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
   191  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
   192  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
   193  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
   194  DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
   195  DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
   196  DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
   197  DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
   198  DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
   199  DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
   200  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
   201  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
   202  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
   203  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
   204  DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
   205  DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
   206  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
   207  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
   208  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
   209  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
        // Zero vector: fetched into KI by the second-to-last round, so the last
        // round's "VADDUWM KI, g, g" adds nothing.
   210  DATA  ·kcon+0x400(SB)/8, $0x0000000000000000
   211  DATA  ·kcon+0x408(SB)/8, $0x0000000000000000
   212  DATA  ·kcon+0x410(SB)/8, $0x1011121310111213	// permutation control vectors
   213  DATA  ·kcon+0x418(SB)/8, $0x1011121300010203
   214  DATA  ·kcon+0x420(SB)/8, $0x1011121310111213
   215  DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
   216  DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b
   217  DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
   218  GLOBL ·kcon(SB), RODATA, $1088
   219  
   // SHA256ROUND0 performs one compression round that consumes message word xi
   // without extending the message schedule (used for the first 15 rounds).
   //
   // a..h name the vector registers currently playing those roles; the caller
   // rotates the argument list from one invocation to the next instead of
   // moving data between registers.
   //
   // On entry h must already include this round's constant: the macro adds KI
   // (the constant fetched for the *next* round) to g, which the next
   // invocation receives as h, and then reloads KI from (TBL)(IDX) and
   // advances IDX by 16.
   //
   // Per the Power ISA definitions of vsel/vshasigmaw, the steps are:
   //   FUNC = (g &^ e) | (f & e)                   -- Ch(e,f,g)
   //   S1   = BIGSIGMA1(e), S0 = BIGSIGMA0(a)
   //   h   += xi + Ch + S1                         -- T1 (K was pre-added)
   //   FUNC = VSEL(b, c, a^b)                      -- Maj(a,b,c)
   //   d   += h                                    -- new e = d + T1
   //   h   += S0 + Maj                             -- new a = T1 + T2
   // Clobbers FUNC, S0, S1, KI and IDX. No comments may be placed inside the
   // macro body: a // comment would swallow the line-continuation backslash.
   220  #define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
   221  	VSEL		g, f, e, FUNC; \
   222  	VSHASIGMAW	$15, e, $1, S1; \
   223  	VADDUWM		xi, h, h; \
   224  	VSHASIGMAW	$0, a, $1, S0; \
   225  	VADDUWM		FUNC, h, h; \
   226  	VXOR		b, a, FUNC; \
   227  	VADDUWM		S1, h, h; \
   228  	VSEL		b, c, FUNC, FUNC; \
   229  	VADDUWM		KI, g, g; \
   230  	VADDUWM		h, d, d; \
   231  	VADDUWM		FUNC, S0, S0; \
   232  	LVX		(TBL)(IDX), KI; \
   233  	ADD		$16, IDX; \
   234  	VADDUWM		S0, h, h
   235  
   // SHA256ROUND1 performs the same compression round as SHA256ROUND0 on
   // (a..h, xi) and, interleaved with it, extends the message schedule in
   // place:
   //
   //   xj += xj_9 + sigma0(xj_1) + sigma1(xj_14)
   //
   // where sigma0/sigma1 are the VSHASIGMAW forms with the second immediate
   // equal to 0 (results held in s0/s1). With xj holding W[t-16], xj_1 holding
   // W[t-15], xj_9 holding W[t-7] and xj_14 holding W[t-2], xj becomes W[t].
   //
   // As in SHA256ROUND0, h must already include this round's constant; KI is
   // added to g for the next round, then reloaded from (TBL)(IDX), and IDX is
   // advanced by 16.
   // Clobbers FUNC, S0, S1, s0, s1, KI and IDX, and overwrites xj.
   236  #define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
   237  	VSHASIGMAW	$0, xj_1, $0, s0; \
   238  	VSEL		g, f, e, FUNC; \
   239  	VSHASIGMAW	$15, e, $1, S1; \
   240  	VADDUWM		xi, h, h; \
   241  	VSHASIGMAW	$0, a, $1, S0; \
   242  	VSHASIGMAW	$15, xj_14, $0, s1; \
   243  	VADDUWM		FUNC, h, h; \
   244  	VXOR		b, a, FUNC; \
   245  	VADDUWM		xj_9, xj, xj; \
   246  	VADDUWM		S1, h, h; \
   247  	VSEL		b, c, FUNC, FUNC; \
   248  	VADDUWM		KI, g, g; \
   249  	VADDUWM		h, d, d; \
   250  	VADDUWM		FUNC, S0, S0; \
   251  	VADDUWM		s0, xj, xj; \
   252  	LVX		(TBL)(IDX), KI; \
   253  	ADD		$16, IDX; \
   254  	VADDUWM		S0, h, h; \
   255  	VADDUWM		s1, xj, xj
   256  
   257  // func block(dig *digest, p []byte)
        //
        // Updates the hash state at dig with every whole 64-byte block of p.
        // The length is rounded down to a multiple of 64; if that leaves nothing
        // to process the function returns without touching dig.
        //
        // Frame: $128-32 declares 128 bytes of locals and 32 bytes of arguments
        // (dig at +0, p_base at +8, p_len at +16).
        //
        // Per block the code runs 64 rounds: 15 x SHA256ROUND0 and
        // 1 x SHA256ROUND1 straight-line, then the 16-round L16_xx body three
        // times (CTR = 3).
        //
        // NOTE(review): OFFLOAD is the raw value of R1 and the state is spilled to
        // OFFLOAD+0x00..0x7F. Whether that range is exactly the 128 bytes of locals
        // declared above, or overlaps the fixed area the Go ppc64 toolchain keeps
        // at the base of the frame, cannot be determined from this file - confirm
        // against the ppc64 frame layout.
   258  TEXT ·block(SB),0,$128-32
   259  	MOVD	dig+0(FP), CTX
   260  	MOVD	p_base+8(FP), INP
   261  	MOVD	p_len+16(FP), LEN
   262  
        	// LEN &^= 63: only whole 64-byte blocks are processed.
   263  	SRD	$6, LEN
   264  	SLD	$6, LEN
   265  
   266  	ADD	INP, LEN, END
   267  
        	// Nothing to do when there is no complete block.
   268  	CMP	INP, END
   269  	BEQ	end
   270  
   271  	MOVD	$·kcon(SB), TBL
   272  	MOVD	R1, OFFLOAD
   273  
        	// CNT is written here but not read anywhere in the visible code.
   274  	MOVD	R0, CNT
   275  	MOVWZ	$0x10, HEX10
   276  	MOVWZ	$0x20, HEX20
   277  	MOVWZ	$0x30, HEX30
   278  	MOVWZ	$0x40, HEX40
   279  	MOVWZ	$0x50, HEX50
   280  	MOVWZ	$0x60, HEX60
   281  	MOVWZ	$0x70, HEX70
   282  
        	// Build LEMASK: the LVSL pattern for address 8, with every byte XORed
        	// with 0x0F. It is used below by VPERM to reorder the bytes of each
        	// 16-byte message chunk after it is loaded with LXVD2X.
   283  	MOVWZ	$8, IDX
   284  	LVSL	(IDX)(R0), LEMASK
   285  	VSPLTISB	$0x0F, KI
   286  	VXOR	KI, LEMASK, LEMASK
   287  
        	// Load the 32 bytes of state at CTX: first half into V0, second into V4.
   288  	LXVW4X	(CTX)(HEX00), VS32	// v0 = vs32
   289  	LXVW4X	(CTX)(HEX10), VS36	// v4 = vs36
   290  
   291  	// unpack the input values into vector registers
        	// (V1-V3 are V0 rotated by 4/8/12 bytes, V5-V7 likewise from V4)
   292  	VSLDOI	$4, V0, V0, V1
   293  	VSLDOI	$8, V0, V0, V2
   294  	VSLDOI	$12, V0, V0, V3
   295  	VSLDOI	$4, V4, V4, V5
   296  	VSLDOI	$8, V4, V4, V6
   297  	VSLDOI	$12, V4, V4, V7
   298  
        	// One iteration per 64-byte block.
   299  loop:
        	// KI = first table entry; IDX restarts at the second entry.
   300  	LVX	(TBL)(HEX00), KI
   301  	MOVWZ	$16, IDX
   302  
   303  	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
   304  	ADD	$16, INP
   305  
        	// Spill the incoming state; it is added back after the 64 rounds.
   306  	STVX	V0, (OFFLOAD+HEX00)
   307  	STVX	V1, (OFFLOAD+HEX10)
   308  	STVX	V2, (OFFLOAD+HEX20)
   309  	STVX	V3, (OFFLOAD+HEX30)
   310  	STVX	V4, (OFFLOAD+HEX40)
   311  	STVX	V5, (OFFLOAD+HEX50)
   312  	STVX	V6, (OFFLOAD+HEX60)
   313  	STVX	V7, (OFFLOAD+HEX70)
   314  
        	// Prime the pipeline: the round macros expect h to contain the current
        	// constant already and KI to hold the constant for the following round.
   315  	VADDUWM	KI, V7, V7	// h+K[i]
   316  	LVX	(TBL)(IDX), KI
   317  	ADD	$16, IDX
   318  
        	// First 16 rounds. Each 16-byte chunk is byte-permuted with LEMASK and
        	// then rotated by 4 bytes at a time (VSLDOI $4) into the next three
        	// schedule registers, so V8-V23 each feed one round.
   319  	VPERM	V8, V8, LEMASK, V8
   320  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
   321  	VSLDOI	$4, V8, V8, V9
   322  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
   323  	VSLDOI	$4, V9, V9, V10
   324  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
   325  	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
   326  	ADD	$16, INP, INP
   327  	VSLDOI	$4, V10, V10, V11
   328  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
   329  	VPERM	V12, V12, LEMASK, V12
   330  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
   331  	VSLDOI	$4, V12, V12, V13
   332  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
   333  	VSLDOI	$4, V13, V13, V14
   334  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
   335  	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
   336  	ADD	$16, INP, INP
   337  	VSLDOI	$4, V14, V14, V15
   338  	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
   339  	VPERM	V16, V16, LEMASK, V16
   340  	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
   341  	VSLDOI	$4, V16, V16, V17
   342  	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
   343  	VSLDOI	$4, V17, V17, V18
   344  	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
   345  	VSLDOI	$4, V18, V18, V19
   346  	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
   347  	ADD	$16, INP, INP
   348  	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
   349  	VPERM	V20, V20, LEMASK, V20
   350  	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
   351  	VSLDOI	$4, V20, V20, V21
   352  	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
   353  	VSLDOI	$4, V21, V21, V22
   354  	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
   355  	VSLDOI	$4, V22, V22, V23
        	// 16th round: consumes V23 and starts extending the schedule (V8).
   356  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   357  
        	// CTR = 3: the 16-round body below runs three times (rounds 16-63).
   358  	MOVWZ	$3, TEMP
   359  	MOVWZ	TEMP, CTR
   360  
   361  L16_xx:
   362  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
   363  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
   364  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
   365  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
   366  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
   367  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
   368  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
   369  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
   370  	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
   371  	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
   372  	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
   373  	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
   374  	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
   375  	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
   376  	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
   377  	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
   378  
   379  	BC	0x10, 0, L16_xx		// bdnz
   380  
        	// Reload the spilled state (V10-V17 are free now) and add it to the
        	// working variables: H[i] += a..h.
   381  	LVX	(OFFLOAD)(HEX00), V10
   382  
   383  	LVX	(OFFLOAD)(HEX10), V11
   384  	VADDUWM	V10, V0, V0
   385  	LVX	(OFFLOAD)(HEX20), V12
   386  	VADDUWM	V11, V1, V1
   387  	LVX	(OFFLOAD)(HEX30), V13
   388  	VADDUWM	V12, V2, V2
   389  	LVX	(OFFLOAD)(HEX40), V14
   390  	VADDUWM	V13, V3, V3
   391  	LVX	(OFFLOAD)(HEX50), V15
   392  	VADDUWM	V14, V4, V4
   393  	LVX	(OFFLOAD)(HEX60), V16
   394  	VADDUWM	V15, V5, V5
   395  	LVX	(OFFLOAD)(HEX70), V17
   396  	VADDUWM	V16, V6, V6
   397  	VADDUWM	V17, V7, V7
   398  
        	// Unsigned pointer compare: continue while input remains.
   399  	CMPU	INP, END
   400  	BLT	loop
   401  
        	// Repack V0-V3 into V0 and V4-V7 into V4. After the last round KI holds
        	// the first permutation control vector (table offset 0x410) and IDX
        	// points at the second; V8 and V9 receive the second and third.
   402  	LVX	(TBL)(IDX), V8
   403  	ADD	$16, IDX
   404  	VPERM	V0, V1, KI, V0
   405  	LVX	(TBL)(IDX), V9
   406  	VPERM	V4, V5, KI, V4
   407  	VPERM	V0, V2, V8, V0
   408  	VPERM	V4, V6, V8, V4
   409  	VPERM	V0, V3, V9, V0
   410  	VPERM	V4, V7, V9, V4
        	// Write the updated 32-byte state back through CTX.
   411  	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
   412  	STXVD2X	VS36, (CTX+HEX10)	// v4 = vs36
   413  
   414  end:
   415  	RET
   416