github.com/waltonchain/waltonchain_gwtc_src@v1.1.4-0.20201225072101-8a298c95a819/crypto/x11/simd/simd.go (about)

     1  // Use of this source code is governed by an ISC
     2  // license that can be found in the LICENSE file.
     3  
     4  package simd
     5  
     6  import (
     7  	"fmt"
     8  
     9  	"github.com/wtc/go-wtc/crypto/hash"
    10  )
    11  
    12  // HashSize holds the size of a hash in bytes.
    13  const HashSize = int(64)
    14  
    15  // BlockSize holds the size of a block in bytes.
    16  const BlockSize = uintptr(128)
    17  
    18  ////////////////
    19  
// digest carries the running state of one hash computation.
type digest struct {
	// ptr is the number of bytes currently buffered in b; Write resets it
	// to zero each time a full block has been compressed.
	ptr uintptr
	// ch is the high word of the compressed-block counter; Write bumps it
	// whenever cl wraps around to zero.
	ch  uint32
	// cl is the low word of the compressed-block counter; Write increments
	// it once per full block.
	cl  uint32

	// h is the chaining state. Reset seeds it from kInit and compress
	// overwrites it with the updated state.
	h [32]uint32

	// b buffers input bytes until a whole block is available.
	b [BlockSize]byte
}
    29  
    30  // New returns a new digest to compute a SIMD512 hash.
    31  func New() hash.Digest {
    32  	ref := &digest{}
    33  	ref.Reset()
    34  	return ref
    35  }
    36  
    37  ////////////////
    38  
    39  // Reset resets the digest to its initial state.
    40  func (ref *digest) Reset() {
    41  	ref.ptr = 0
    42  	ref.cl, ref.ch = 0, 0
    43  	copy(ref.h[:], kInit[:])
    44  }
    45  
    46  // Sum appends the current hash to dst and returns the result
    47  // as a slice. It does not change the underlying hash state.
    48  func (ref *digest) Sum(dst []byte) []byte {
    49  	dgt := *ref
    50  	hsh := [64]byte{}
    51  	dgt.Close(hsh[:], 0, 0)
    52  	return append(dst, hsh[:]...)
    53  }
    54  
    55  // Write more data to the running hash, never returns an error.
    56  func (ref *digest) Write(src []byte) (int, error) {
    57  	sln := uintptr(len(src))
    58  	fln := len(src)
    59  
    60  	for sln > 0 {
    61  		cln := BlockSize - ref.ptr
    62  
    63  		if cln > sln {
    64  			cln = sln
    65  		}
    66  		sln -= cln
    67  
    68  		copy(ref.b[ref.ptr:], src[:cln])
    69  		src = src[cln:]
    70  
    71  		ref.ptr += cln
    72  		if ref.ptr == BlockSize {
    73  			ref.compress(0)
    74  			ref.ptr = 0
    75  
    76  			ref.cl += 1
    77  			if ref.cl == 0 {
    78  				ref.ch++
    79  			}
    80  		}
    81  	}
    82  
    83  	return fln, nil
    84  }
    85  
    86  // Close the digest by writing the last bits and storing the hash
    87  // in dst. This prepares the digest for reuse by calling reset. A call
    88  // to Close with a dst that is smaller then HashSize will return an error.
    89  func (ref *digest) Close(dst []byte, bits uint8, bcnt uint8) error {
    90  	if ln := len(dst); HashSize > ln {
    91  		return fmt.Errorf("Simd Close: dst min length: %d, got %d", HashSize, ln)
    92  	}
    93  
    94  	if ref.ptr > 0 || bcnt > 0 {
    95  		memset(ref.b[ref.ptr:], 0)
    96  		ref.b[ref.ptr] = uint8(bits & (0xFF << (8 - bcnt)))
    97  		ref.compress(0)
    98  	}
    99  
   100  	memset(ref.b[:], 0)
   101  	{
   102  		low := uint32(ref.cl << 10)
   103  		low += uint32(ref.ptr<<3) + uint32(bcnt)
   104  		high := uint32(ref.ch<<10) + (ref.cl >> 22)
   105  		encUInt32le(ref.b[:], low)
   106  		encUInt32le(ref.b[4:], high)
   107  	}
   108  	ref.compress(1)
   109  
   110  	for u := int(0); u < 16; u++ {
   111  		encUInt32le(dst[(u<<2):], ref.h[u])
   112  	}
   113  
   114  	ref.Reset()
   115  	return nil
   116  }
   117  
   118  // Size returns the number of bytes required to store the hash.
   119  func (*digest) Size() int {
   120  	return HashSize
   121  }
   122  
   123  // BlockSize returns the block size of the hash.
   124  func (*digest) BlockSize() int {
   125  	return int(BlockSize)
   126  }
   127  
   128  ////////////////
   129  
   130  func memset(dst []byte, src byte) {
   131  	for i := range dst {
   132  		dst[i] = src
   133  	}
   134  }
   135  
   136  func decUInt32le(src []byte) uint32 {
   137  	return (uint32(src[0]) |
   138  		uint32(src[1])<<8 |
   139  		uint32(src[2])<<16 |
   140  		uint32(src[3])<<24)
   141  }
   142  
   143  func encUInt32le(dst []uint8, src uint32) {
   144  	dst[0] = uint8(src)
   145  	dst[1] = uint8(src >> 8)
   146  	dst[2] = uint8(src >> 16)
   147  	dst[3] = uint8(src >> 24)
   148  }
   149  
   150  func (ref *digest) compress(last int) {
   151  	var q [256]int32
   152  	var w [64]uint32
   153  	var st [32]uint32
   154  
   155  	mixoutRound(ref.b[:], q[:], 1<<2)
   156  	mixoutRound(ref.b[2:], q[64:], 1<<2)
   157  
   158  	{
   159  		var t int32
   160  		var u, v uintptr
   161  
   162  		m := q[0]
   163  		n := q[64]
   164  		q[0] = m + n
   165  		q[64] = m - n
   166  
   167  		m = q[u+1]
   168  		n = q[u+1+64]
   169  		t = (n * kAlphaTab[v+1*2])
   170  		t = ((t) & 0xFFFF) + ((t) >> 16)
   171  		q[u+1] = m + t
   172  		q[u+1+64] = m - t
   173  		m = q[u+2]
   174  		n = q[u+2+64]
   175  		t = (n * kAlphaTab[v+2*2])
   176  		t = ((t) & 0xFFFF) + ((t) >> 16)
   177  		q[u+2] = m + t
   178  		q[u+2+64] = m - t
   179  		m = q[u+3]
   180  		n = q[u+3+64]
   181  		t = (n * kAlphaTab[v+3*2])
   182  		t = ((t) & 0xFFFF) + ((t) >> 16)
   183  		q[u+3] = m + t
   184  		q[u+3+64] = m - t
   185  
   186  		u = 4
   187  		v = 4 * 2
   188  		for u < 64 {
   189  			m = q[u]
   190  			n = q[u+(64)]
   191  			t = (n * kAlphaTab[v+0*2])
   192  			t = ((t) & 0xFFFF) + ((t) >> 16)
   193  			q[u] = m + t
   194  			q[u+(64)] = m - t
   195  
   196  			m = q[u+1]
   197  			n = q[u+1+64]
   198  			t = (n * kAlphaTab[v+1*2])
   199  			t = ((t) & 0xFFFF) + ((t) >> 16)
   200  			q[u+1] = m + t
   201  			q[u+1+64] = m - t
   202  			m = q[u+2]
   203  			n = q[u+2+64]
   204  			t = (n * kAlphaTab[v+2*2])
   205  			t = ((t) & 0xFFFF) + ((t) >> 16)
   206  			q[u+2] = m + t
   207  			q[u+2+64] = m - t
   208  			m = q[u+3]
   209  			n = q[u+3+64]
   210  			t = (n * kAlphaTab[v+3*2])
   211  			t = ((t) & 0xFFFF) + ((t) >> 16)
   212  			q[u+3] = m + t
   213  			q[u+3+64] = m - t
   214  
   215  			u += 4
   216  			v += 4 * 2
   217  		}
   218  	}
   219  
   220  	mixoutRound(ref.b[1:], q[128:], 1<<2)
   221  	mixoutRound(ref.b[3:], q[192:], 1<<2)
   222  
   223  	{
   224  		var t int32
   225  		var u, v uintptr
   226  
   227  		m := q[128]
   228  		n := q[128+64]
   229  		q[128] = m + n
   230  		q[128+64] = m - n
   231  
   232  		m = q[128+u+1]
   233  		n = q[128+u+1+64]
   234  		t = (n * kAlphaTab[v+1*2])
   235  		t = ((t) & 0xFFFF) + ((t) >> 16)
   236  		q[128+u+1] = m + t
   237  		q[128+u+1+64] = m - t
   238  		m = q[128+u+2]
   239  		n = q[128+u+2+64]
   240  		t = (n * kAlphaTab[v+2*2])
   241  		t = ((t) & 0xFFFF) + ((t) >> 16)
   242  		q[128+u+2] = m + t
   243  		q[128+u+2+64] = m - t
   244  		m = q[128+u+3]
   245  		n = q[128+u+3+64]
   246  		t = (n * kAlphaTab[v+3*2])
   247  		t = ((t) & 0xFFFF) + ((t) >> 16)
   248  		q[128+u+3] = m + t
   249  		q[128+u+3+64] = m - t
   250  
   251  		u = 4
   252  		v = 4 * 2
   253  		for u < 64 {
   254  			m = q[128+u]
   255  			n = q[128+u+64]
   256  			t = (n * kAlphaTab[v+0*2])
   257  			t = ((t) & 0xFFFF) + ((t) >> 16)
   258  			q[128+u] = m + t
   259  			q[128+u+64] = m - t
   260  
   261  			m = q[128+u+1]
   262  			n = q[128+u+1+64]
   263  			t = (n * kAlphaTab[v+1*2])
   264  			t = ((t) & 0xFFFF) + ((t) >> 16)
   265  			q[128+u+1] = m + t
   266  			q[128+u+1+64] = m - t
   267  			m = q[128+u+2]
   268  			n = q[128+u+2+64]
   269  			t = (n * kAlphaTab[v+2*2])
   270  			t = ((t) & 0xFFFF) + ((t) >> 16)
   271  			q[128+u+2] = m + t
   272  			q[128+u+2+64] = m - t
   273  			m = q[128+u+3]
   274  			n = q[128+u+3+64]
   275  			t = (n * kAlphaTab[v+3*2])
   276  			t = ((t) & 0xFFFF) + ((t) >> 16)
   277  			q[128+u+3] = m + t
   278  			q[128+u+3+64] = m - t
   279  
   280  			u += 4
   281  			v += 4 * 2
   282  		}
   283  	}
   284  
   285  	{
   286  		var t int32
   287  		var u, v uintptr
   288  
   289  		m := q[0]
   290  		n := q[128]
   291  		q[0] = m + n
   292  		q[128] = m - n
   293  
   294  		m = q[u+1]
   295  		n = q[u+1+128]
   296  		t = (n * kAlphaTab[v+1])
   297  		t = ((t) & 0xFFFF) + ((t) >> 16)
   298  		q[u+1] = m + t
   299  		q[u+1+128] = m - t
   300  		m = q[u+2]
   301  		n = q[u+2+128]
   302  		t = (n * kAlphaTab[v+2])
   303  		t = ((t) & 0xFFFF) + ((t) >> 16)
   304  		q[u+2] = m + t
   305  		q[u+2+128] = m - t
   306  		m = q[u+3]
   307  		n = q[u+3+128]
   308  		t = (n * kAlphaTab[v+3])
   309  		t = ((t) & 0xFFFF) + ((t) >> 16)
   310  		q[u+3] = m + t
   311  		q[u+3+128] = m - t
   312  
   313  		u = 4
   314  		v = 4
   315  		for u < 128 {
   316  			m = q[u]
   317  			n = q[u+128]
   318  			t = (n * kAlphaTab[v+0])
   319  			t = ((t) & 0xFFFF) + ((t) >> 16)
   320  			q[u] = m + t
   321  			q[u+128] = m - t
   322  
   323  			m = q[u+1]
   324  			n = q[u+1+128]
   325  			t = (n * kAlphaTab[v+1])
   326  			t = ((t) & 0xFFFF) + ((t) >> 16)
   327  			q[u+1] = m + t
   328  			q[u+1+128] = m - t
   329  			m = q[u+2]
   330  			n = q[u+2+128]
   331  			t = (n * kAlphaTab[v+2])
   332  			t = ((t) & 0xFFFF) + ((t) >> 16)
   333  			q[u+2] = m + t
   334  			q[u+2+128] = m - t
   335  			m = q[u+3]
   336  			n = q[u+3+128]
   337  			t = (n * kAlphaTab[v+3])
   338  			t = ((t) & 0xFFFF) + ((t) >> 16)
   339  			q[u+3] = m + t
   340  			q[u+3+128] = m - t
   341  
   342  			u += 4
   343  			v += 4
   344  		}
   345  	}
   346  
   347  	if last == 1 {
   348  		var tq int32
   349  		for i := uintptr(0); i < 256; i++ {
   350  			tq = q[i] + kYOffB[i]
   351  			tq = (((tq) & 0xFFFF) + ((tq) >> 16))
   352  			tq = (((tq) & 0xFF) - ((tq) >> 8))
   353  			tq = (((tq) & 0xFF) - ((tq) >> 8))
   354  			if tq <= 128 {
   355  				q[i] = tq
   356  			} else {
   357  				q[i] = tq - 257
   358  			}
   359  		}
   360  	} else {
   361  		var tq int32
   362  		for i := uintptr(0); i < 256; i++ {
   363  			tq = q[i] + kYOffA[i]
   364  			tq = (((tq) & 0xFFFF) + ((tq) >> 16))
   365  			tq = (((tq) & 0xFF) - ((tq) >> 8))
   366  			tq = (((tq) & 0xFF) - ((tq) >> 8))
   367  			if tq <= 128 {
   368  				q[i] = tq
   369  			} else {
   370  				q[i] = tq - 257
   371  			}
   372  		}
   373  	}
   374  
   375  	{
   376  		b := ref.b[:]
   377  		s := ref.h[:]
   378  		for i := uintptr(0); i < 32; i += 8 {
   379  			st[i+0] = s[i+0] ^ decUInt32le(b[4*(i+0):])
   380  			st[i+1] = s[i+1] ^ decUInt32le(b[4*(i+1):])
   381  			st[i+2] = s[i+2] ^ decUInt32le(b[4*(i+2):])
   382  			st[i+3] = s[i+3] ^ decUInt32le(b[4*(i+3):])
   383  			st[i+4] = s[i+4] ^ decUInt32le(b[4*(i+4):])
   384  			st[i+5] = s[i+5] ^ decUInt32le(b[4*(i+5):])
   385  			st[i+6] = s[i+6] ^ decUInt32le(b[4*(i+6):])
   386  			st[i+7] = s[i+7] ^ decUInt32le(b[4*(i+7):])
   387  		}
   388  	}
   389  
   390  	for u := uintptr(0); u < 64; u += 8 {
   391  		v := uintptr(wbp[(u >> 3)])
   392  
   393  		w[u+0] = ((uint32(q[v+2*0]*185) & uint32(0xFFFF)) +
   394  			(uint32(q[v+2*0+1]*185) << 16))
   395  		w[u+1] = ((uint32(q[v+2*1]*185) & uint32(0xFFFF)) +
   396  			(uint32(q[v+2*1+1]*185) << 16))
   397  		w[u+2] = ((uint32(q[v+2*2]*185) & uint32(0xFFFF)) +
   398  			(uint32(q[v+2*2+1]*185) << 16))
   399  		w[u+3] = ((uint32(q[v+2*3]*185) & uint32(0xFFFF)) +
   400  			(uint32(q[v+2*3+1]*185) << 16))
   401  		w[u+4] = ((uint32(q[v+2*4]*185) & uint32(0xFFFF)) +
   402  			(uint32(q[v+2*4+1]*185) << 16))
   403  		w[u+5] = ((uint32(q[v+2*5]*185) & uint32(0xFFFF)) +
   404  			(uint32(q[v+2*5+1]*185) << 16))
   405  		w[u+6] = ((uint32(q[v+2*6]*185) & uint32(0xFFFF)) +
   406  			(uint32(q[v+2*6+1]*185) << 16))
   407  		w[u+7] = ((uint32(q[v+2*7]*185) & uint32(0xFFFF)) +
   408  			(uint32(q[v+2*7+1]*185) << 16))
   409  	}
   410  
   411  	mixinRound(st[:], w[:], 0, 3, 23, 17, 27)
   412  
   413  	for u := uintptr(0); u < 64; u += 8 {
   414  		v := uintptr(wbp[(u>>3)+8])
   415  
   416  		w[u+0] = (uint32(q[v+2*0]*185) & uint32(0xFFFF)) +
   417  			(uint32(q[v+2*0+1]*185) << 16)
   418  		w[u+1] = (uint32(q[v+2*1]*185) & uint32(0xFFFF)) +
   419  			(uint32(q[v+2*1+1]*185) << 16)
   420  		w[u+2] = (uint32(q[v+2*2]*185) & uint32(0xFFFF)) +
   421  			(uint32(q[v+2*2+1]*185) << 16)
   422  		w[u+3] = (uint32(q[v+2*3]*185) & uint32(0xFFFF)) +
   423  			(uint32(q[v+2*3+1]*185) << 16)
   424  		w[u+4] = (uint32(q[v+2*4]*185) & uint32(0xFFFF)) +
   425  			(uint32(q[v+2*4+1]*185) << 16)
   426  		w[u+5] = (uint32(q[v+2*5]*185) & uint32(0xFFFF)) +
   427  			(uint32(q[v+2*5+1]*185) << 16)
   428  		w[u+6] = (uint32(q[v+2*6]*185) & uint32(0xFFFF)) +
   429  			(uint32(q[v+2*6+1]*185) << 16)
   430  		w[u+7] = (uint32(q[v+2*7]*185) & uint32(0xFFFF)) +
   431  			(uint32(q[v+2*7+1]*185) << 16)
   432  	}
   433  	mixinRound(st[:], w[:], 1, 28, 19, 22, 7)
   434  
   435  	for u := uintptr(0); u < 64; u += 8 {
   436  		v := uintptr(wbp[(u>>3)+16])
   437  
   438  		w[u+0] = ((uint32(q[v+2*0-256]*(233)) & uint32(0xFFFF)) +
   439  			(uint32((q[v+2*0-128])*(233)) << 16))
   440  		w[u+1] = ((uint32(q[v+2*1-256]*(233)) & uint32(0xFFFF)) +
   441  			(uint32((q[v+2*1-128])*(233)) << 16))
   442  		w[u+2] = ((uint32(q[v+2*2-256]*(233)) & uint32(0xFFFF)) +
   443  			(uint32((q[v+2*2-128])*(233)) << 16))
   444  		w[u+3] = ((uint32(q[v+2*3-256]*(233)) & uint32(0xFFFF)) +
   445  			(uint32((q[v+2*3-128])*(233)) << 16))
   446  		w[u+4] = ((uint32(q[v+2*4-256]*(233)) & uint32(0xFFFF)) +
   447  			(uint32((q[v+2*4-128])*(233)) << 16))
   448  		w[u+5] = ((uint32(q[v+2*5-256]*(233)) & uint32(0xFFFF)) +
   449  			(uint32((q[v+2*5-128])*(233)) << 16))
   450  		w[u+6] = ((uint32(q[v+2*6-256]*(233)) & uint32(0xFFFF)) +
   451  			(uint32((q[v+2*6-128])*(233)) << 16))
   452  		w[u+7] = ((uint32(q[v+2*7-256]*(233)) & uint32(0xFFFF)) +
   453  			(uint32((q[v+2*7-128])*(233)) << 16))
   454  	}
   455  	mixinRound(st[:], w[:], 2, 29, 9, 15, 5)
   456  
   457  	for u := uintptr(0); u < 64; u += 8 {
   458  		v := uintptr(wbp[(u>>3)+24])
   459  
   460  		w[u+0] = ((uint32(q[v+2*0-383]*233) & uint32(0xFFFF)) +
   461  			(uint32(q[v+2*0-255]*233) << 16))
   462  		w[u+1] = ((uint32(q[v+2*1-383]*233) & uint32(0xFFFF)) +
   463  			(uint32(q[v+2*1-255]*233) << 16))
   464  		w[u+2] = ((uint32(q[v+2*2-383]*233) & uint32(0xFFFF)) +
   465  			(uint32(q[v+2*2-255]*233) << 16))
   466  		w[u+3] = ((uint32(q[v+2*3-383]*233) & uint32(0xFFFF)) +
   467  			(uint32(q[v+2*3-255]*233) << 16))
   468  		w[u+4] = ((uint32(q[v+2*4-383]*233) & uint32(0xFFFF)) +
   469  			(uint32(q[v+2*4-255]*233) << 16))
   470  		w[u+5] = ((uint32(q[v+2*5-383]*233) & uint32(0xFFFF)) +
   471  			(uint32(q[v+2*5-255]*233) << 16))
   472  		w[u+6] = ((uint32(q[v+2*6-383]*233) & uint32(0xFFFF)) +
   473  			(uint32(q[v+2*6-255]*233) << 16))
   474  		w[u+7] = ((uint32(q[v+2*7-383]*233) & uint32(0xFFFF)) +
   475  			(uint32(q[v+2*7-255]*233) << 16))
   476  	}
   477  	mixinRound(st[:], w[:], 3, 4, 13, 10, 25)
   478  
   479  	{
   480  		var tp uint32
   481  		var tA [8]uint32
   482  
   483  		sta := ref.h[:]
   484  
   485  		tA[0] = ((st[0] << 4) | (st[0] >> (32 - 4)))
   486  		tA[1] = ((st[1] << 4) | (st[1] >> (32 - 4)))
   487  		tA[2] = ((st[2] << 4) | (st[2] >> (32 - 4)))
   488  		tA[3] = ((st[3] << 4) | (st[3] >> (32 - 4)))
   489  		tA[4] = ((st[4] << 4) | (st[4] >> (32 - 4)))
   490  		tA[5] = ((st[5] << 4) | (st[5] >> (32 - 4)))
   491  		tA[6] = ((st[6] << 4) | (st[6] >> (32 - 4)))
   492  		tA[7] = ((st[7] << 4) | (st[7] >> (32 - 4)))
   493  
   494  		tp = uint32(st[kIdxD[0]] + sta[0] +
   495  			(((st[kIdxB[0]] ^ st[kIdxC[0]]) & st[kIdxA[0]]) ^ st[kIdxC[0]]))
   496  		st[kIdxA[0]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][0]]
   497  		st[kIdxD[0]] = st[kIdxC[0]]
   498  		st[kIdxC[0]] = st[kIdxB[0]]
   499  		st[kIdxB[0]] = tA[0]
   500  
   501  		tp = uint32(st[kIdxD[1]] + sta[1] +
   502  			(((st[kIdxB[1]] ^ st[kIdxC[1]]) & st[kIdxA[1]]) ^ st[kIdxC[1]]))
   503  		st[kIdxA[1]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][1]]
   504  		st[kIdxD[1]] = st[kIdxC[1]]
   505  		st[kIdxC[1]] = st[kIdxB[1]]
   506  		st[kIdxB[1]] = tA[1]
   507  
   508  		tp = uint32(st[kIdxD[2]] + sta[2] +
   509  			(((st[kIdxB[2]] ^ st[kIdxC[2]]) & st[kIdxA[2]]) ^ st[kIdxC[2]]))
   510  		st[kIdxA[2]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][2]]
   511  		st[kIdxD[2]] = st[kIdxC[2]]
   512  		st[kIdxC[2]] = st[kIdxB[2]]
   513  		st[kIdxB[2]] = tA[2]
   514  
   515  		tp = uint32(st[kIdxD[3]] + sta[3] +
   516  			(((st[kIdxB[3]] ^ st[kIdxC[3]]) & st[kIdxA[3]]) ^ st[kIdxC[3]]))
   517  		st[kIdxA[3]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][3]]
   518  		st[kIdxD[3]] = st[kIdxC[3]]
   519  		st[kIdxC[3]] = st[kIdxB[3]]
   520  		st[kIdxB[3]] = tA[3]
   521  
   522  		tp = uint32(st[kIdxD[4]] + sta[4] +
   523  			(((st[kIdxB[4]] ^ st[kIdxC[4]]) & st[kIdxA[4]]) ^ st[kIdxC[4]]))
   524  		st[kIdxA[4]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][4]]
   525  		st[kIdxD[4]] = st[kIdxC[4]]
   526  		st[kIdxC[4]] = st[kIdxB[4]]
   527  		st[kIdxB[4]] = tA[4]
   528  
   529  		tp = uint32(st[kIdxD[5]] + sta[5] +
   530  			(((st[kIdxB[5]] ^ st[kIdxC[5]]) & st[kIdxA[5]]) ^ st[kIdxC[5]]))
   531  		st[kIdxA[5]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][5]]
   532  		st[kIdxD[5]] = st[kIdxC[5]]
   533  		st[kIdxC[5]] = st[kIdxB[5]]
   534  		st[kIdxB[5]] = tA[5]
   535  
   536  		tp = uint32(st[kIdxD[6]] + sta[6] +
   537  			(((st[kIdxB[6]] ^ st[kIdxC[6]]) & st[kIdxA[6]]) ^ st[kIdxC[6]]))
   538  		st[kIdxA[6]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][6]]
   539  		st[kIdxD[6]] = st[kIdxC[6]]
   540  		st[kIdxC[6]] = st[kIdxB[6]]
   541  		st[kIdxB[6]] = tA[6]
   542  
   543  		tp = uint32(st[kIdxD[7]] + sta[7] +
   544  			(((st[kIdxB[7]] ^ st[kIdxC[7]]) & st[kIdxA[7]]) ^ st[kIdxC[7]]))
   545  		st[kIdxA[7]] = ((tp << 13) | (tp >> (32 - 13))) + tA[kPrem[4][7]]
   546  		st[kIdxD[7]] = st[kIdxC[7]]
   547  		st[kIdxC[7]] = st[kIdxB[7]]
   548  		st[kIdxB[7]] = tA[7]
   549  
   550  		tA[0] = ((st[0] << 13) | (st[0] >> (32 - 13)))
   551  		tA[1] = ((st[1] << 13) | (st[1] >> (32 - 13)))
   552  		tA[2] = ((st[2] << 13) | (st[2] >> (32 - 13)))
   553  		tA[3] = ((st[3] << 13) | (st[3] >> (32 - 13)))
   554  		tA[4] = ((st[4] << 13) | (st[4] >> (32 - 13)))
   555  		tA[5] = ((st[5] << 13) | (st[5] >> (32 - 13)))
   556  		tA[6] = ((st[6] << 13) | (st[6] >> (32 - 13)))
   557  		tA[7] = ((st[7] << 13) | (st[7] >> (32 - 13)))
   558  
   559  		tp = uint32(st[kIdxD[0]] + sta[8] +
   560  			(((st[kIdxB[0]] ^ st[kIdxC[0]]) & st[kIdxA[0]]) ^ st[kIdxC[0]]))
   561  		st[kIdxA[0]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][0]]
   562  		st[kIdxD[0]] = st[kIdxC[0]]
   563  		st[kIdxC[0]] = st[kIdxB[0]]
   564  		st[kIdxB[0]] = tA[0]
   565  
   566  		tp = uint32(st[kIdxD[1]] + sta[9] +
   567  			(((st[kIdxB[1]] ^ st[kIdxC[1]]) & st[kIdxA[1]]) ^ st[kIdxC[1]]))
   568  		st[kIdxA[1]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][1]]
   569  		st[kIdxD[1]] = st[kIdxC[1]]
   570  		st[kIdxC[1]] = st[kIdxB[1]]
   571  		st[kIdxB[1]] = tA[1]
   572  
   573  		tp = uint32(st[kIdxD[2]] + sta[10] +
   574  			(((st[kIdxB[2]] ^ st[kIdxC[2]]) & st[kIdxA[2]]) ^ st[kIdxC[2]]))
   575  		st[kIdxA[2]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][2]]
   576  		st[kIdxD[2]] = st[kIdxC[2]]
   577  		st[kIdxC[2]] = st[kIdxB[2]]
   578  		st[kIdxB[2]] = tA[2]
   579  
   580  		tp = uint32(st[kIdxD[3]] + sta[11] +
   581  			(((st[kIdxB[3]] ^ st[kIdxC[3]]) & st[kIdxA[3]]) ^ st[kIdxC[3]]))
   582  		st[kIdxA[3]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][3]]
   583  		st[kIdxD[3]] = st[kIdxC[3]]
   584  		st[kIdxC[3]] = st[kIdxB[3]]
   585  		st[kIdxB[3]] = tA[3]
   586  
   587  		tp = uint32(st[kIdxD[4]] + sta[12] +
   588  			(((st[kIdxB[4]] ^ st[kIdxC[4]]) & st[kIdxA[4]]) ^ st[kIdxC[4]]))
   589  		st[kIdxA[4]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][4]]
   590  		st[kIdxD[4]] = st[kIdxC[4]]
   591  		st[kIdxC[4]] = st[kIdxB[4]]
   592  		st[kIdxB[4]] = tA[4]
   593  
   594  		tp = uint32(st[kIdxD[5]] + sta[13] +
   595  			(((st[kIdxB[5]] ^ st[kIdxC[5]]) & st[kIdxA[5]]) ^ st[kIdxC[5]]))
   596  		st[kIdxA[5]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][5]]
   597  		st[kIdxD[5]] = st[kIdxC[5]]
   598  		st[kIdxC[5]] = st[kIdxB[5]]
   599  		st[kIdxB[5]] = tA[5]
   600  
   601  		tp = uint32(st[kIdxD[6]] + sta[14] +
   602  			(((st[kIdxB[6]] ^ st[kIdxC[6]]) & st[kIdxA[6]]) ^ st[kIdxC[6]]))
   603  		st[kIdxA[6]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][6]]
   604  		st[kIdxD[6]] = st[kIdxC[6]]
   605  		st[kIdxC[6]] = st[kIdxB[6]]
   606  		st[kIdxB[6]] = tA[6]
   607  
   608  		tp = uint32(st[kIdxD[7]] + sta[15] +
   609  			(((st[kIdxB[7]] ^ st[kIdxC[7]]) & st[kIdxA[7]]) ^ st[kIdxC[7]]))
   610  		st[kIdxA[7]] = ((tp << 10) | (tp >> (32 - 10))) + tA[kPrem[5][7]]
   611  		st[kIdxD[7]] = st[kIdxC[7]]
   612  		st[kIdxC[7]] = st[kIdxB[7]]
   613  		st[kIdxB[7]] = tA[7]
   614  
   615  		tA[0] = ((st[0] << 10) | (st[0] >> (32 - 10)))
   616  		tA[1] = ((st[1] << 10) | (st[1] >> (32 - 10)))
   617  		tA[2] = ((st[2] << 10) | (st[2] >> (32 - 10)))
   618  		tA[3] = ((st[3] << 10) | (st[3] >> (32 - 10)))
   619  		tA[4] = ((st[4] << 10) | (st[4] >> (32 - 10)))
   620  		tA[5] = ((st[5] << 10) | (st[5] >> (32 - 10)))
   621  		tA[6] = ((st[6] << 10) | (st[6] >> (32 - 10)))
   622  		tA[7] = ((st[7] << 10) | (st[7] >> (32 - 10)))
   623  
   624  		tp = uint32(st[kIdxD[0]] + sta[16] +
   625  			(((st[kIdxB[0]] ^ st[kIdxC[0]]) & st[kIdxA[0]]) ^ st[kIdxC[0]]))
   626  		st[kIdxA[0]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][0]]
   627  		st[kIdxD[0]] = st[kIdxC[0]]
   628  		st[kIdxC[0]] = st[kIdxB[0]]
   629  		st[kIdxB[0]] = tA[0]
   630  
   631  		tp = uint32(st[kIdxD[1]] + sta[17] +
   632  			(((st[kIdxB[1]] ^ st[kIdxC[1]]) & st[kIdxA[1]]) ^ st[kIdxC[1]]))
   633  		st[kIdxA[1]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][1]]
   634  		st[kIdxD[1]] = st[kIdxC[1]]
   635  		st[kIdxC[1]] = st[kIdxB[1]]
   636  		st[kIdxB[1]] = tA[1]
   637  
   638  		tp = uint32(st[kIdxD[2]] + sta[18] +
   639  			(((st[kIdxB[2]] ^ st[kIdxC[2]]) & st[kIdxA[2]]) ^ st[kIdxC[2]]))
   640  		st[kIdxA[2]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][2]]
   641  		st[kIdxD[2]] = st[kIdxC[2]]
   642  		st[kIdxC[2]] = st[kIdxB[2]]
   643  		st[kIdxB[2]] = tA[2]
   644  
   645  		tp = uint32(st[kIdxD[3]] + sta[19] +
   646  			(((st[kIdxB[3]] ^ st[kIdxC[3]]) & st[kIdxA[3]]) ^ st[kIdxC[3]]))
   647  		st[kIdxA[3]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][3]]
   648  		st[kIdxD[3]] = st[kIdxC[3]]
   649  		st[kIdxC[3]] = st[kIdxB[3]]
   650  		st[kIdxB[3]] = tA[3]
   651  
   652  		tp = uint32(st[kIdxD[4]] + sta[20] +
   653  			(((st[kIdxB[4]] ^ st[kIdxC[4]]) & st[kIdxA[4]]) ^ st[kIdxC[4]]))
   654  		st[kIdxA[4]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][4]]
   655  		st[kIdxD[4]] = st[kIdxC[4]]
   656  		st[kIdxC[4]] = st[kIdxB[4]]
   657  		st[kIdxB[4]] = tA[4]
   658  
   659  		tp = uint32(st[kIdxD[5]] + sta[21] +
   660  			(((st[kIdxB[5]] ^ st[kIdxC[5]]) & st[kIdxA[5]]) ^ st[kIdxC[5]]))
   661  		st[kIdxA[5]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][5]]
   662  		st[kIdxD[5]] = st[kIdxC[5]]
   663  		st[kIdxC[5]] = st[kIdxB[5]]
   664  		st[kIdxB[5]] = tA[5]
   665  
   666  		tp = uint32(st[kIdxD[6]] + sta[22] +
   667  			(((st[kIdxB[6]] ^ st[kIdxC[6]]) & st[kIdxA[6]]) ^ st[kIdxC[6]]))
   668  		st[kIdxA[6]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][6]]
   669  		st[kIdxD[6]] = st[kIdxC[6]]
   670  		st[kIdxC[6]] = st[kIdxB[6]]
   671  		st[kIdxB[6]] = tA[6]
   672  
   673  		tp = uint32(st[kIdxD[7]] + sta[23] +
   674  			(((st[kIdxB[7]] ^ st[kIdxC[7]]) & st[kIdxA[7]]) ^ st[kIdxC[7]]))
   675  		st[kIdxA[7]] = ((tp << 25) | (tp >> (32 - 25))) + tA[kPrem[6][7]]
   676  		st[kIdxD[7]] = st[kIdxC[7]]
   677  		st[kIdxC[7]] = st[kIdxB[7]]
   678  		st[kIdxB[7]] = tA[7]
   679  
   680  		tA[0] = ((st[0] << 25) | (st[0] >> (32 - 25)))
   681  		tA[1] = ((st[1] << 25) | (st[1] >> (32 - 25)))
   682  		tA[2] = ((st[2] << 25) | (st[2] >> (32 - 25)))
   683  		tA[3] = ((st[3] << 25) | (st[3] >> (32 - 25)))
   684  		tA[4] = ((st[4] << 25) | (st[4] >> (32 - 25)))
   685  		tA[5] = ((st[5] << 25) | (st[5] >> (32 - 25)))
   686  		tA[6] = ((st[6] << 25) | (st[6] >> (32 - 25)))
   687  		tA[7] = ((st[7] << 25) | (st[7] >> (32 - 25)))
   688  
   689  		tp = uint32(st[kIdxD[0]] + sta[24] +
   690  			(((st[kIdxB[0]] ^ st[kIdxC[0]]) & st[kIdxA[0]]) ^ st[kIdxC[0]]))
   691  		st[kIdxA[0]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][0]]
   692  		st[kIdxD[0]] = st[kIdxC[0]]
   693  		st[kIdxC[0]] = st[kIdxB[0]]
   694  		st[kIdxB[0]] = tA[0]
   695  
   696  		tp = uint32(st[kIdxD[1]] + sta[25] +
   697  			(((st[kIdxB[1]] ^ st[kIdxC[1]]) & st[kIdxA[1]]) ^ st[kIdxC[1]]))
   698  		st[kIdxA[1]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][1]]
   699  		st[kIdxD[1]] = st[kIdxC[1]]
   700  		st[kIdxC[1]] = st[kIdxB[1]]
   701  		st[kIdxB[1]] = tA[1]
   702  
   703  		tp = uint32(st[kIdxD[2]] + sta[26] +
   704  			(((st[kIdxB[2]] ^ st[kIdxC[2]]) & st[kIdxA[2]]) ^ st[kIdxC[2]]))
   705  		st[kIdxA[2]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][2]]
   706  		st[kIdxD[2]] = st[kIdxC[2]]
   707  		st[kIdxC[2]] = st[kIdxB[2]]
   708  		st[kIdxB[2]] = tA[2]
   709  
   710  		tp = uint32(st[kIdxD[3]] + sta[27] +
   711  			(((st[kIdxB[3]] ^ st[kIdxC[3]]) & st[kIdxA[3]]) ^ st[kIdxC[3]]))
   712  		st[kIdxA[3]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][3]]
   713  		st[kIdxD[3]] = st[kIdxC[3]]
   714  		st[kIdxC[3]] = st[kIdxB[3]]
   715  		st[kIdxB[3]] = tA[3]
   716  
   717  		tp = uint32(st[kIdxD[4]] + sta[28] +
   718  			(((st[kIdxB[4]] ^ st[kIdxC[4]]) & st[kIdxA[4]]) ^ st[kIdxC[4]]))
   719  		st[kIdxA[4]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][4]]
   720  		st[kIdxD[4]] = st[kIdxC[4]]
   721  		st[kIdxC[4]] = st[kIdxB[4]]
   722  		st[kIdxB[4]] = tA[4]
   723  
   724  		tp = uint32(st[kIdxD[5]] + sta[29] +
   725  			(((st[kIdxB[5]] ^ st[kIdxC[5]]) & st[kIdxA[5]]) ^ st[kIdxC[5]]))
   726  		st[kIdxA[5]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][5]]
   727  		st[kIdxD[5]] = st[kIdxC[5]]
   728  		st[kIdxC[5]] = st[kIdxB[5]]
   729  		st[kIdxB[5]] = tA[5]
   730  
   731  		tp = uint32(st[kIdxD[6]] + sta[30] +
   732  			(((st[kIdxB[6]] ^ st[kIdxC[6]]) & st[kIdxA[6]]) ^ st[kIdxC[6]]))
   733  		st[kIdxA[6]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][6]]
   734  		st[kIdxD[6]] = st[kIdxC[6]]
   735  		st[kIdxC[6]] = st[kIdxB[6]]
   736  		st[kIdxB[6]] = tA[6]
   737  
   738  		tp = uint32(st[kIdxD[7]] + sta[31] +
   739  			(((st[kIdxB[7]] ^ st[kIdxC[7]]) & st[kIdxA[7]]) ^ st[kIdxC[7]]))
   740  		st[kIdxA[7]] = ((tp << 4) | (tp >> (32 - 4))) + tA[kPrem[0][7]]
   741  		st[kIdxD[7]] = st[kIdxC[7]]
   742  		st[kIdxC[7]] = st[kIdxB[7]]
   743  		st[kIdxB[7]] = tA[7]
   744  	}
   745  
   746  	copy(ref.h[:], st[:])
   747  }
   748  
   749  func mixoutRound(x []uint8, q []int32, xt uintptr) {
   750  	var tx int32
   751  	var d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7 int32
   752  	var d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7 int32
   753  
   754  	xd := xt << 1
   755  
   756  	{
   757  		var sa, sb uintptr
   758  		var x0, x1, x2, x3 int32
   759  		var a0, a1, a2, a3 int32
   760  		var b0, b1, b2, b3 int32
   761  
   762  		sb = xd << 2
   763  
   764  		x0 = int32(x[0])
   765  		x1 = int32(x[sb])
   766  		x2 = int32(x[2*sb])
   767  		x3 = int32(x[3*sb])
   768  
   769  		a0 = x0 + x2
   770  		a1 = x0 + (x2 << 4)
   771  		a2 = x0 - x2
   772  		a3 = x0 - (x2 << 4)
   773  
   774  		b0 = x1 + x3
   775  		tx = ((x1 << 2) + (x3 << 6))
   776  		b1 = ((tx & 0xFF) - (tx >> 8))
   777  		b2 = (x1 << 4) - (x3 << 4)
   778  		tx = (x1 << 6) + (x3 << 2)
   779  		b3 = ((tx & 0xFF) - (tx >> 8))
   780  
   781  		d1_0 = a0 + b0
   782  		d1_1 = a1 + b1
   783  		d1_2 = a2 + b2
   784  		d1_3 = a3 + b3
   785  		d1_4 = a0 - b0
   786  		d1_5 = a1 - b1
   787  		d1_6 = a2 - b2
   788  		d1_7 = a3 - b3
   789  
   790  		sa = xd << 1
   791  		sb = xd << 2
   792  
   793  		x0 = int32(x[sa])
   794  		x1 = int32(x[sa+sb])
   795  		x2 = int32(x[sa+2*sb])
   796  		x3 = int32(x[sa+3*sb])
   797  
   798  		a0 = x0 + x2
   799  		a1 = x0 + (x2 << 4)
   800  		a2 = x0 - x2
   801  		a3 = x0 - (x2 << 4)
   802  
   803  		b0 = x1 + x3
   804  		tx = ((x1 << 2) + (x3 << 6))
   805  		b1 = ((tx & 0xFF) - (tx >> 8))
   806  		b2 = (x1 << 4) - (x3 << 4)
   807  		tx = (x1 << 6) + (x3 << 2)
   808  		b3 = ((tx & 0xFF) - (tx >> 8))
   809  
   810  		d2_0 = a0 + b0
   811  		d2_1 = a1 + b1
   812  		d2_2 = a2 + b2
   813  		d2_3 = a3 + b3
   814  		d2_4 = a0 - b0
   815  		d2_5 = a1 - b1
   816  		d2_6 = a2 - b2
   817  		d2_7 = a3 - b3
   818  	}
   819  
   820  	q[0] = d1_0 + d2_0
   821  	q[1] = d1_1 + (d2_1 << 1)
   822  	q[2] = d1_2 + (d2_2 << 2)
   823  	q[3] = d1_3 + (d2_3 << 3)
   824  	q[4] = d1_4 + (d2_4 << 4)
   825  	q[5] = d1_5 + (d2_5 << 5)
   826  	q[6] = d1_6 + (d2_6 << 6)
   827  	q[7] = d1_7 + (d2_7 << 7)
   828  	q[8] = d1_0 - d2_0
   829  	q[9] = d1_1 - (d2_1 << 1)
   830  	q[10] = d1_2 - (d2_2 << 2)
   831  	q[11] = d1_3 - (d2_3 << 3)
   832  	q[12] = d1_4 - (d2_4 << 4)
   833  	q[13] = d1_5 - (d2_5 << 5)
   834  	q[14] = d1_6 - (d2_6 << 6)
   835  	q[15] = d1_7 - (d2_7 << 7)
   836  
   837  	{
   838  		var sa, sb uintptr
   839  		var x0, x1, x2, x3 int32
   840  		var a0, a1, a2, a3 int32
   841  		var b0, b1, b2, b3 int32
   842  
   843  		sb = xd << 2
   844  
   845  		x0 = int32(x[(xd)])
   846  		x1 = int32(x[(xd)+sb])
   847  		x2 = int32(x[(xd)+2*sb])
   848  		x3 = int32(x[(xd)+3*sb])
   849  
   850  		a0 = x0 + x2
   851  		a1 = x0 + (x2 << 4)
   852  		a2 = x0 - x2
   853  		a3 = x0 - (x2 << 4)
   854  
   855  		b0 = x1 + x3
   856  		tx = ((x1 << 2) + (x3 << 6))
   857  		b1 = ((tx & 0xFF) - (tx >> 8))
   858  		b2 = (x1 << 4) - (x3 << 4)
   859  		tx = (x1 << 6) + (x3 << 2)
   860  		b3 = ((tx & 0xFF) - (tx >> 8))
   861  
   862  		d1_0 = a0 + b0
   863  		d1_1 = a1 + b1
   864  		d1_2 = a2 + b2
   865  		d1_3 = a3 + b3
   866  		d1_4 = a0 - b0
   867  		d1_5 = a1 - b1
   868  		d1_6 = a2 - b2
   869  		d1_7 = a3 - b3
   870  
   871  		sa = xd + (xd << 1)
   872  		sb = xd << 2
   873  
   874  		x0 = int32(x[sa])
   875  		x1 = int32(x[sa+sb])
   876  		x2 = int32(x[sa+2*sb])
   877  		x3 = int32(x[sa+3*sb])
   878  
   879  		a0 = x0 + x2
   880  		a1 = x0 + (x2 << 4)
   881  		a2 = x0 - x2
   882  		a3 = x0 - (x2 << 4)
   883  
   884  		b0 = x1 + x3
   885  		tx = ((x1 << 2) + (x3 << 6))
   886  		b1 = ((tx & 0xFF) - (tx >> 8))
   887  		b2 = (x1 << 4) - (x3 << 4)
   888  		tx = (x1 << 6) + (x3 << 2)
   889  		b3 = ((tx & 0xFF) - (tx >> 8))
   890  
   891  		d2_0 = a0 + b0
   892  		d2_1 = a1 + b1
   893  		d2_2 = a2 + b2
   894  		d2_3 = a3 + b3
   895  		d2_4 = a0 - b0
   896  		d2_5 = a1 - b1
   897  		d2_6 = a2 - b2
   898  		d2_7 = a3 - b3
   899  	}
   900  
   901  	q[16+0] = d1_0 + d2_0
   902  	q[16+1] = d1_1 + (d2_1 << 1)
   903  	q[16+2] = d1_2 + (d2_2 << 2)
   904  	q[16+3] = d1_3 + (d2_3 << 3)
   905  	q[16+4] = d1_4 + (d2_4 << 4)
   906  	q[16+5] = d1_5 + (d2_5 << 5)
   907  	q[16+6] = d1_6 + (d2_6 << 6)
   908  	q[16+7] = d1_7 + (d2_7 << 7)
   909  	q[16+8] = d1_0 - d2_0
   910  	q[16+9] = d1_1 - (d2_1 << 1)
   911  	q[16+10] = d1_2 - (d2_2 << 2)
   912  	q[16+11] = d1_3 - (d2_3 << 3)
   913  	q[16+12] = d1_4 - (d2_4 << 4)
   914  	q[16+13] = d1_5 - (d2_5 << 5)
   915  	q[16+14] = d1_6 - (d2_6 << 6)
   916  	q[16+15] = d1_7 - (d2_7 << 7)
   917  
   918  	{
   919  		var u, v uintptr
   920  
   921  		m := q[0]
   922  		n := q[16]
   923  		q[0] = m + n
   924  		q[16] = m - n
   925  
   926  		m = q[u+1]
   927  		n = q[u+1+16]
   928  		tx = (n * kAlphaTab[v+1*8])
   929  		tx = ((tx & 0xFFFF) + (tx >> 16))
   930  		q[u+1] = m + tx
   931  		q[u+1+16] = m - tx
   932  		m = q[u+2]
   933  		n = q[u+2+16]
   934  		tx = (n * kAlphaTab[v+2*8])
   935  		tx = ((tx & 0xFFFF) + (tx >> 16))
   936  		q[u+2] = m + tx
   937  		q[u+2+16] = m - tx
   938  		m = q[u+3]
   939  		n = q[u+3+16]
   940  		tx = (n * kAlphaTab[v+3*8])
   941  		tx = ((tx & 0xFFFF) + (tx >> 16))
   942  		q[u+3] = m + tx
   943  		q[u+3+16] = m - tx
   944  
   945  		for u < 16 {
   946  			u += 4
   947  			v += 4 * 8
   948  
   949  			m = q[u+0]
   950  			n = q[u+0+16]
   951  			tx = (n * kAlphaTab[v+0*8])
   952  			tx = ((tx & 0xFFFF) + (tx >> 16))
   953  			q[u+0] = m + tx
   954  			q[u+0+16] = m - tx
   955  
   956  			m = q[u+1]
   957  			n = q[u+1+16]
   958  			tx = (n * kAlphaTab[v+1*8])
   959  			tx = ((tx & 0xFFFF) + (tx >> 16))
   960  			q[u+1] = m + tx
   961  			q[u+1+16] = m - tx
   962  			m = q[u+2]
   963  			n = q[u+2+16]
   964  			tx = (n * kAlphaTab[v+2*8])
   965  			tx = ((tx & 0xFFFF) + (tx >> 16))
   966  			q[u+2] = m + tx
   967  			q[u+2+16] = m - tx
   968  			m = q[u+3]
   969  			n = q[u+3+16]
   970  			tx = (n * kAlphaTab[v+3*8])
   971  			tx = ((tx & 0xFFFF) + (tx >> 16))
   972  			q[u+3] = m + tx
   973  			q[u+3+16] = m - tx
   974  		}
   975  	}
   976  
   977  	{
   978  		var sa, sb uintptr
   979  		var x0, x1, x2, x3 int32
   980  		var a0, a1, a2, a3 int32
   981  		var b0, b1, b2, b3 int32
   982  
   983  		sb = uintptr(xd << 2)
   984  
   985  		x0 = int32(x[xt])
   986  		x1 = int32(x[xt+sb])
   987  		x2 = int32(x[xt+2*sb])
   988  		x3 = int32(x[xt+3*sb])
   989  
   990  		a0 = x0 + x2
   991  		a1 = x0 + (x2 << 4)
   992  		a2 = x0 - x2
   993  		a3 = x0 - (x2 << 4)
   994  
   995  		b0 = x1 + x3
   996  		tx = ((x1 << 2) + (x3 << 6))
   997  		b1 = ((tx & 0xFF) - (tx >> 8))
   998  		b2 = (x1 << 4) - (x3 << 4)
   999  		tx = (x1 << 6) + (x3 << 2)
  1000  		b3 = ((tx & 0xFF) - (tx >> 8))
  1001  
  1002  		d1_0 = a0 + b0
  1003  		d1_1 = a1 + b1
  1004  		d1_2 = a2 + b2
  1005  		d1_3 = a3 + b3
  1006  		d1_4 = a0 - b0
  1007  		d1_5 = a1 - b1
  1008  		d1_6 = a2 - b2
  1009  		d1_7 = a3 - b3
  1010  
  1011  		sa = xt + (xd << 1)
  1012  		sb = xd << 2
  1013  
  1014  		x0 = int32(x[sa])
  1015  		x1 = int32(x[sa+sb])
  1016  		x2 = int32(x[sa+2*sb])
  1017  		x3 = int32(x[sa+3*sb])
  1018  
  1019  		a0 = x0 + x2
  1020  		a1 = x0 + (x2 << 4)
  1021  		a2 = x0 - x2
  1022  		a3 = x0 - (x2 << 4)
  1023  
  1024  		b0 = x1 + x3
  1025  		tx = ((x1 << 2) + (x3 << 6))
  1026  		b1 = ((tx & 0xFF) - (tx >> 8))
  1027  		b2 = (x1 << 4) - (x3 << 4)
  1028  		tx = (x1 << 6) + (x3 << 2)
  1029  		b3 = ((tx & 0xFF) - (tx >> 8))
  1030  
  1031  		d2_0 = a0 + b0
  1032  		d2_1 = a1 + b1
  1033  		d2_2 = a2 + b2
  1034  		d2_3 = a3 + b3
  1035  		d2_4 = a0 - b0
  1036  		d2_5 = a1 - b1
  1037  		d2_6 = a2 - b2
  1038  		d2_7 = a3 - b3
  1039  	}
  1040  
  1041  	q[32+0] = d1_0 + d2_0
  1042  	q[32+1] = d1_1 + (d2_1 << 1)
  1043  	q[32+2] = d1_2 + (d2_2 << 2)
  1044  	q[32+3] = d1_3 + (d2_3 << 3)
  1045  	q[32+4] = d1_4 + (d2_4 << 4)
  1046  	q[32+5] = d1_5 + (d2_5 << 5)
  1047  	q[32+6] = d1_6 + (d2_6 << 6)
  1048  	q[32+7] = d1_7 + (d2_7 << 7)
  1049  	q[32+8] = d1_0 - d2_0
  1050  	q[32+9] = d1_1 - (d2_1 << 1)
  1051  	q[32+10] = d1_2 - (d2_2 << 2)
  1052  	q[32+11] = d1_3 - (d2_3 << 3)
  1053  	q[32+12] = d1_4 - (d2_4 << 4)
  1054  	q[32+13] = d1_5 - (d2_5 << 5)
  1055  	q[32+14] = d1_6 - (d2_6 << 6)
  1056  	q[32+15] = d1_7 - (d2_7 << 7)
  1057  
  1058  	{
  1059  		var sa, sb uintptr
  1060  		var x0, x1, x2, x3 int32
  1061  		var a0, a1, a2, a3 int32
  1062  		var b0, b1, b2, b3 int32
  1063  
  1064  		sa = (xt) + (xd)
  1065  		sb = xd << 2
  1066  
  1067  		x0 = int32(x[sa])
  1068  		x1 = int32(x[sa+sb])
  1069  		x2 = int32(x[sa+2*sb])
  1070  		x3 = int32(x[sa+3*sb])
  1071  
  1072  		a0 = x0 + x2
  1073  		a1 = x0 + (x2 << 4)
  1074  		a2 = x0 - x2
  1075  		a3 = x0 - (x2 << 4)
  1076  
  1077  		b0 = x1 + x3
  1078  		tx = ((x1 << 2) + (x3 << 6))
  1079  		b1 = ((tx & 0xFF) - (tx >> 8))
  1080  		b2 = (x1 << 4) - (x3 << 4)
  1081  		tx = (x1 << 6) + (x3 << 2)
  1082  		b3 = ((tx & 0xFF) - (tx >> 8))
  1083  
  1084  		d1_0 = a0 + b0
  1085  		d1_1 = a1 + b1
  1086  		d1_2 = a2 + b2
  1087  		d1_3 = a3 + b3
  1088  		d1_4 = a0 - b0
  1089  		d1_5 = a1 - b1
  1090  		d1_6 = a2 - b2
  1091  		d1_7 = a3 - b3
  1092  
  1093  		sa = (xt + xd) + (xd << 1)
  1094  		sb = xd << 2
  1095  
  1096  		x0 = int32(x[sa])
  1097  		x1 = int32(x[sa+sb])
  1098  		x2 = int32(x[sa+2*sb])
  1099  		x3 = int32(x[sa+3*sb])
  1100  
  1101  		a0 = x0 + x2
  1102  		a1 = x0 + (x2 << 4)
  1103  		a2 = x0 - x2
  1104  		a3 = x0 - (x2 << 4)
  1105  
  1106  		b0 = x1 + x3
  1107  		tx = ((x1 << 2) + (x3 << 6))
  1108  		b1 = ((tx & 0xFF) - (tx >> 8))
  1109  		b2 = (x1 << 4) - (x3 << 4)
  1110  		tx = (x1 << 6) + (x3 << 2)
  1111  		b3 = ((tx & 0xFF) - (tx >> 8))
  1112  
  1113  		d2_0 = a0 + b0
  1114  		d2_1 = a1 + b1
  1115  		d2_2 = a2 + b2
  1116  		d2_3 = a3 + b3
  1117  		d2_4 = a0 - b0
  1118  		d2_5 = a1 - b1
  1119  		d2_6 = a2 - b2
  1120  		d2_7 = a3 - b3
  1121  	}
  1122  
  1123  	q[48+0] = d1_0 + d2_0
  1124  	q[48+1] = d1_1 + (d2_1 << 1)
  1125  	q[48+2] = d1_2 + (d2_2 << 2)
  1126  	q[48+3] = d1_3 + (d2_3 << 3)
  1127  	q[48+4] = d1_4 + (d2_4 << 4)
  1128  	q[48+5] = d1_5 + (d2_5 << 5)
  1129  	q[48+6] = d1_6 + (d2_6 << 6)
  1130  	q[48+7] = d1_7 + (d2_7 << 7)
  1131  	q[48+8] = d1_0 - d2_0
  1132  	q[48+9] = d1_1 - (d2_1 << 1)
  1133  	q[48+10] = d1_2 - (d2_2 << 2)
  1134  	q[48+11] = d1_3 - (d2_3 << 3)
  1135  	q[48+12] = d1_4 - (d2_4 << 4)
  1136  	q[48+13] = d1_5 - (d2_5 << 5)
  1137  	q[48+14] = d1_6 - (d2_6 << 6)
  1138  	q[48+15] = d1_7 - (d2_7 << 7)
  1139  
  1140  	{
  1141  		var u, v uintptr
  1142  
  1143  		m := q[(32)]
  1144  		n := q[(32)+(16)]
  1145  		q[(32)] = m + n
  1146  		q[(32)+(16)] = m - n
  1147  
  1148  		m = q[(32)+u+1]
  1149  		n = q[(32)+u+1+(16)]
  1150  		tx = (n * kAlphaTab[v+1*(8)])
  1151  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1152  		q[(32)+u+1] = m + tx
  1153  		q[(32)+u+1+(16)] = m - tx
  1154  		m = q[(32)+u+2]
  1155  		n = q[(32)+u+2+(16)]
  1156  		tx = (n * kAlphaTab[v+2*(8)])
  1157  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1158  		q[(32)+u+2] = m + tx
  1159  		q[(32)+u+2+(16)] = m - tx
  1160  		m = q[(32)+u+3]
  1161  		n = q[(32)+u+3+(16)]
  1162  		tx = (n * kAlphaTab[v+3*(8)])
  1163  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1164  		q[(32)+u+3] = m + tx
  1165  		q[(32)+u+3+(16)] = m - tx
  1166  
  1167  		u = 4
  1168  		v = 4 * (8)
  1169  		for u < 16 {
  1170  			m = q[(32)+u]
  1171  			n = q[(32)+u+(16)]
  1172  			tx = (n * kAlphaTab[v+0*(8)])
  1173  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1174  			q[(32)+u+0] = m + tx
  1175  			q[(32)+u+0+(16)] = m - tx
  1176  
  1177  			m = q[(32)+u+1]
  1178  			n = q[(32)+u+1+(16)]
  1179  			tx = (n * kAlphaTab[v+1*(8)])
  1180  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1181  			q[(32)+u+1] = m + tx
  1182  			q[(32)+u+1+(16)] = m - tx
  1183  			m = q[(32)+u+2]
  1184  			n = q[(32)+u+2+(16)]
  1185  			tx = (n * kAlphaTab[v+2*(8)])
  1186  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1187  			q[(32)+u+2] = m + tx
  1188  			q[(32)+u+2+(16)] = m - tx
  1189  			m = q[(32)+u+3]
  1190  			n = q[(32)+u+3+(16)]
  1191  			tx = (n * kAlphaTab[v+3*(8)])
  1192  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1193  			q[(32)+u+3] = m + tx
  1194  			q[(32)+u+3+(16)] = m - tx
  1195  
  1196  			u += 4
  1197  			v += 4 * (8)
  1198  		}
  1199  	}
  1200  
  1201  	{
  1202  		var u, v uintptr
  1203  
  1204  		m := q[0]
  1205  		n := q[32]
  1206  		q[0] = m + n
  1207  		q[32] = m - n
  1208  
  1209  		m = q[u+1]
  1210  		n = q[u+1+32]
  1211  		tx = (n * kAlphaTab[v+1*4])
  1212  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1213  		q[u+1] = m + tx
  1214  		q[u+1+32] = m - tx
  1215  		m = q[u+2]
  1216  		n = q[u+2+32]
  1217  		tx = (n * kAlphaTab[v+2*4])
  1218  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1219  		q[u+2] = m + tx
  1220  		q[u+2+32] = m - tx
  1221  		m = q[u+3]
  1222  		n = q[u+3+32]
  1223  		tx = (n * kAlphaTab[v+3*4])
  1224  		tx = ((tx & 0xFFFF) + (tx >> 16))
  1225  		q[u+3] = m + tx
  1226  		q[u+3+32] = m - tx
  1227  
  1228  		u = 4
  1229  		v = 4 * 4
  1230  		for u < 32 {
  1231  			m = q[u]
  1232  			n = q[u+32]
  1233  			tx = (n * kAlphaTab[v+0*4])
  1234  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1235  			q[u] = m + tx
  1236  			q[u+(32)] = m - tx
  1237  
  1238  			m = q[u+1]
  1239  			n = q[u+1+32]
  1240  			tx = (n * kAlphaTab[v+1*4])
  1241  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1242  			q[u+1] = m + tx
  1243  			q[u+1+32] = m - tx
  1244  			m = q[u+2]
  1245  			n = q[u+2+32]
  1246  			tx = (n * kAlphaTab[v+2*4])
  1247  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1248  			q[u+2] = m + tx
  1249  			q[u+2+32] = m - tx
  1250  			m = q[u+3]
  1251  			n = q[u+3+32]
  1252  			tx = (n * kAlphaTab[v+3*4])
  1253  			tx = ((tx & 0xFFFF) + (tx >> 16))
  1254  			q[u+3] = m + tx
  1255  			q[u+3+32] = m - tx
  1256  
  1257  			u += 4
  1258  			v += 4 * 4
  1259  		}
  1260  	}
  1261  }
  1262  
  1263  func mixinRound(h, w []uint32, isp, p0, p1, p2, p3 uint32) {
  1264  	var tA [8]uint32
  1265  	var tp uint32
  1266  
  1267  	tA[0] = ((h[0] << p0) | (h[0] >> (32 - p0)))
  1268  	tA[1] = ((h[1] << p0) | (h[1] >> (32 - p0)))
  1269  	tA[2] = ((h[2] << p0) | (h[2] >> (32 - p0)))
  1270  	tA[3] = ((h[3] << p0) | (h[3] >> (32 - p0)))
  1271  	tA[4] = ((h[4] << p0) | (h[4] >> (32 - p0)))
  1272  	tA[5] = ((h[5] << p0) | (h[5] >> (32 - p0)))
  1273  	tA[6] = ((h[6] << p0) | (h[6] >> (32 - p0)))
  1274  	tA[7] = ((h[7] << p0) | (h[7] >> (32 - p0)))
  1275  
  1276  	tp = uint32(h[kIdxD[0]] + w[0] +
  1277  		(((h[kIdxB[0]] ^ h[kIdxC[0]]) & h[kIdxA[0]]) ^ h[kIdxC[0]]))
  1278  	h[kIdxA[0]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]]
  1279  	h[kIdxD[0]] = h[kIdxC[0]]
  1280  	h[kIdxC[0]] = h[kIdxB[0]]
  1281  	h[kIdxB[0]] = tA[0]
  1282  
  1283  	tp = uint32(h[kIdxD[1]] + w[1] +
  1284  		(((h[kIdxB[1]] ^ h[kIdxC[1]]) & h[kIdxA[1]]) ^ h[kIdxC[1]]))
  1285  	h[kIdxA[1]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^1]
  1286  	h[kIdxD[1]] = h[kIdxC[1]]
  1287  	h[kIdxC[1]] = h[kIdxB[1]]
  1288  	h[kIdxB[1]] = tA[1]
  1289  
  1290  	tp = uint32(h[kIdxD[2]] + w[2] +
  1291  		(((h[kIdxB[2]] ^ h[kIdxC[2]]) & h[kIdxA[2]]) ^ h[kIdxC[2]]))
  1292  	h[kIdxA[2]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^2]
  1293  	h[kIdxD[2]] = h[kIdxC[2]]
  1294  	h[kIdxC[2]] = h[kIdxB[2]]
  1295  	h[kIdxB[2]] = tA[2]
  1296  
  1297  	tp = uint32(h[kIdxD[3]] + w[3] +
  1298  		(((h[kIdxB[3]] ^ h[kIdxC[3]]) & h[kIdxA[3]]) ^ h[kIdxC[3]]))
  1299  	h[kIdxA[3]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^3]
  1300  	h[kIdxD[3]] = h[kIdxC[3]]
  1301  	h[kIdxC[3]] = h[kIdxB[3]]
  1302  	h[kIdxB[3]] = tA[3]
  1303  
  1304  	tp = uint32(h[kIdxD[4]] + w[4] +
  1305  		(((h[kIdxB[4]] ^ h[kIdxC[4]]) & h[kIdxA[4]]) ^ h[kIdxC[4]]))
  1306  	h[kIdxA[4]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^4]
  1307  	h[kIdxD[4]] = h[kIdxC[4]]
  1308  	h[kIdxC[4]] = h[kIdxB[4]]
  1309  	h[kIdxB[4]] = tA[4]
  1310  
  1311  	tp = uint32(h[kIdxD[5]] + w[5] +
  1312  		(((h[kIdxB[5]] ^ h[kIdxC[5]]) & h[kIdxA[5]]) ^ h[kIdxC[5]]))
  1313  	h[kIdxA[5]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^5]
  1314  	h[kIdxD[5]] = h[kIdxC[5]]
  1315  	h[kIdxC[5]] = h[kIdxB[5]]
  1316  	h[kIdxB[5]] = tA[5]
  1317  
  1318  	tp = uint32(h[kIdxD[6]] + w[6] +
  1319  		(((h[kIdxB[6]] ^ h[kIdxC[6]]) & h[kIdxA[6]]) ^ h[kIdxC[6]]))
  1320  	h[kIdxA[6]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^6]
  1321  	h[kIdxD[6]] = h[kIdxC[6]]
  1322  	h[kIdxC[6]] = h[kIdxB[6]]
  1323  	h[kIdxB[6]] = tA[6]
  1324  
  1325  	tp = uint32(h[kIdxD[7]] + w[7] +
  1326  		(((h[kIdxB[7]] ^ h[kIdxC[7]]) & h[kIdxA[7]]) ^ h[kIdxC[7]]))
  1327  	h[kIdxA[7]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp]^7]
  1328  	h[kIdxD[7]] = h[kIdxC[7]]
  1329  	h[kIdxC[7]] = h[kIdxB[7]]
  1330  	h[kIdxB[7]] = tA[7]
  1331  
  1332  	tA[0] = ((h[0] << p1) | (h[0] >> (32 - p1)))
  1333  	tA[1] = ((h[1] << p1) | (h[1] >> (32 - p1)))
  1334  	tA[2] = ((h[2] << p1) | (h[2] >> (32 - p1)))
  1335  	tA[3] = ((h[3] << p1) | (h[3] >> (32 - p1)))
  1336  	tA[4] = ((h[4] << p1) | (h[4] >> (32 - p1)))
  1337  	tA[5] = ((h[5] << p1) | (h[5] >> (32 - p1)))
  1338  	tA[6] = ((h[6] << p1) | (h[6] >> (32 - p1)))
  1339  	tA[7] = ((h[7] << p1) | (h[7] >> (32 - p1)))
  1340  
  1341  	tp = uint32(h[kIdxD[0]] + w[8] +
  1342  		(((h[kIdxB[0]] ^ h[kIdxC[0]]) & h[kIdxA[0]]) ^ h[kIdxC[0]]))
  1343  	h[kIdxA[0]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]]
  1344  	h[kIdxD[0]] = h[kIdxC[0]]
  1345  	h[kIdxC[0]] = h[kIdxB[0]]
  1346  	h[kIdxB[0]] = tA[0]
  1347  
  1348  	tp = uint32(h[kIdxD[1]] + w[9] +
  1349  		(((h[kIdxB[1]] ^ h[kIdxC[1]]) & h[kIdxA[1]]) ^ h[kIdxC[1]]))
  1350  	h[kIdxA[1]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^1]
  1351  	h[kIdxD[1]] = h[kIdxC[1]]
  1352  	h[kIdxC[1]] = h[kIdxB[1]]
  1353  	h[kIdxB[1]] = tA[1]
  1354  
  1355  	tp = uint32(h[kIdxD[2]] + w[10] +
  1356  		(((h[kIdxB[2]] ^ h[kIdxC[2]]) & h[kIdxA[2]]) ^ h[kIdxC[2]]))
  1357  	h[kIdxA[2]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^2]
  1358  	h[kIdxD[2]] = h[kIdxC[2]]
  1359  	h[kIdxC[2]] = h[kIdxB[2]]
  1360  	h[kIdxB[2]] = tA[2]
  1361  
  1362  	tp = uint32(h[kIdxD[3]] + w[11] +
  1363  		(((h[kIdxB[3]] ^ h[kIdxC[3]]) & h[kIdxA[3]]) ^ h[kIdxC[3]]))
  1364  	h[kIdxA[3]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^3]
  1365  	h[kIdxD[3]] = h[kIdxC[3]]
  1366  	h[kIdxC[3]] = h[kIdxB[3]]
  1367  	h[kIdxB[3]] = tA[3]
  1368  
  1369  	tp = uint32(h[kIdxD[4]] + w[12] +
  1370  		(((h[kIdxB[4]] ^ h[kIdxC[4]]) & h[kIdxA[4]]) ^ h[kIdxC[4]]))
  1371  	h[kIdxA[4]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^4]
  1372  	h[kIdxD[4]] = h[kIdxC[4]]
  1373  	h[kIdxC[4]] = h[kIdxB[4]]
  1374  	h[kIdxB[4]] = tA[4]
  1375  
  1376  	tp = uint32(h[kIdxD[5]] + w[13] +
  1377  		(((h[kIdxB[5]] ^ h[kIdxC[5]]) & h[kIdxA[5]]) ^ h[kIdxC[5]]))
  1378  	h[kIdxA[5]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^5]
  1379  	h[kIdxD[5]] = h[kIdxC[5]]
  1380  	h[kIdxC[5]] = h[kIdxB[5]]
  1381  	h[kIdxB[5]] = tA[5]
  1382  
  1383  	tp = uint32(h[kIdxD[6]] + w[14] +
  1384  		(((h[kIdxB[6]] ^ h[kIdxC[6]]) & h[kIdxA[6]]) ^ h[kIdxC[6]]))
  1385  	h[kIdxA[6]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^6]
  1386  	h[kIdxD[6]] = h[kIdxC[6]]
  1387  	h[kIdxC[6]] = h[kIdxB[6]]
  1388  	h[kIdxB[6]] = tA[6]
  1389  
  1390  	tp = uint32(h[kIdxD[7]] + w[15] +
  1391  		(((h[kIdxB[7]] ^ h[kIdxC[7]]) & h[kIdxA[7]]) ^ h[kIdxC[7]]))
  1392  	h[kIdxA[7]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+1]^7]
  1393  	h[kIdxD[7]] = h[kIdxC[7]]
  1394  	h[kIdxC[7]] = h[kIdxB[7]]
  1395  	h[kIdxB[7]] = tA[7]
  1396  
  1397  	tA[0] = ((h[0] << p2) | (h[0] >> (32 - p2)))
  1398  	tA[1] = ((h[1] << p2) | (h[1] >> (32 - p2)))
  1399  	tA[2] = ((h[2] << p2) | (h[2] >> (32 - p2)))
  1400  	tA[3] = ((h[3] << p2) | (h[3] >> (32 - p2)))
  1401  	tA[4] = ((h[4] << p2) | (h[4] >> (32 - p2)))
  1402  	tA[5] = ((h[5] << p2) | (h[5] >> (32 - p2)))
  1403  	tA[6] = ((h[6] << p2) | (h[6] >> (32 - p2)))
  1404  	tA[7] = ((h[7] << p2) | (h[7] >> (32 - p2)))
  1405  
  1406  	tp = uint32(h[kIdxD[0]] + w[16] +
  1407  		(((h[kIdxB[0]] ^ h[kIdxC[0]]) & h[kIdxA[0]]) ^ h[kIdxC[0]]))
  1408  	h[kIdxA[0]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]]
  1409  	h[kIdxD[0]] = h[kIdxC[0]]
  1410  	h[kIdxC[0]] = h[kIdxB[0]]
  1411  	h[kIdxB[0]] = tA[0]
  1412  
  1413  	tp = uint32(h[kIdxD[1]] + w[17] +
  1414  		(((h[kIdxB[1]] ^ h[kIdxC[1]]) & h[kIdxA[1]]) ^ h[kIdxC[1]]))
  1415  	h[kIdxA[1]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^1]
  1416  	h[kIdxD[1]] = h[kIdxC[1]]
  1417  	h[kIdxC[1]] = h[kIdxB[1]]
  1418  	h[kIdxB[1]] = tA[1]
  1419  
  1420  	tp = uint32(h[kIdxD[2]] + w[18] +
  1421  		(((h[kIdxB[2]] ^ h[kIdxC[2]]) & h[kIdxA[2]]) ^ h[kIdxC[2]]))
  1422  	h[kIdxA[2]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^2]
  1423  	h[kIdxD[2]] = h[kIdxC[2]]
  1424  	h[kIdxC[2]] = h[kIdxB[2]]
  1425  	h[kIdxB[2]] = tA[2]
  1426  
  1427  	tp = uint32(h[kIdxD[3]] + w[19] +
  1428  		(((h[kIdxB[3]] ^ h[kIdxC[3]]) & h[kIdxA[3]]) ^ h[kIdxC[3]]))
  1429  	h[kIdxA[3]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^3]
  1430  	h[kIdxD[3]] = h[kIdxC[3]]
  1431  	h[kIdxC[3]] = h[kIdxB[3]]
  1432  	h[kIdxB[3]] = tA[3]
  1433  
  1434  	tp = uint32(h[kIdxD[4]] + w[20] +
  1435  		(((h[kIdxB[4]] ^ h[kIdxC[4]]) & h[kIdxA[4]]) ^ h[kIdxC[4]]))
  1436  	h[kIdxA[4]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^4]
  1437  	h[kIdxD[4]] = h[kIdxC[4]]
  1438  	h[kIdxC[4]] = h[kIdxB[4]]
  1439  	h[kIdxB[4]] = tA[4]
  1440  
  1441  	tp = uint32(h[kIdxD[5]] + w[21] +
  1442  		(((h[kIdxB[5]] ^ h[kIdxC[5]]) & h[kIdxA[5]]) ^ h[kIdxC[5]]))
  1443  	h[kIdxA[5]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^5]
  1444  	h[kIdxD[5]] = h[kIdxC[5]]
  1445  	h[kIdxC[5]] = h[kIdxB[5]]
  1446  	h[kIdxB[5]] = tA[5]
  1447  
  1448  	tp = uint32(h[kIdxD[6]] + w[22] +
  1449  		(((h[kIdxB[6]] ^ h[kIdxC[6]]) & h[kIdxA[6]]) ^ h[kIdxC[6]]))
  1450  	h[kIdxA[6]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^6]
  1451  	h[kIdxD[6]] = h[kIdxC[6]]
  1452  	h[kIdxC[6]] = h[kIdxB[6]]
  1453  	h[kIdxB[6]] = tA[6]
  1454  
  1455  	tp = uint32(h[kIdxD[7]] + w[23] +
  1456  		(((h[kIdxB[7]] ^ h[kIdxC[7]]) & h[kIdxA[7]]) ^ h[kIdxC[7]]))
  1457  	h[kIdxA[7]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+2]^7]
  1458  	h[kIdxD[7]] = h[kIdxC[7]]
  1459  	h[kIdxC[7]] = h[kIdxB[7]]
  1460  	h[kIdxB[7]] = tA[7]
  1461  
  1462  	tA[0] = ((h[0] << p3) | (h[0] >> (32 - p3)))
  1463  	tA[1] = ((h[1] << p3) | (h[1] >> (32 - p3)))
  1464  	tA[2] = ((h[2] << p3) | (h[2] >> (32 - p3)))
  1465  	tA[3] = ((h[3] << p3) | (h[3] >> (32 - p3)))
  1466  	tA[4] = ((h[4] << p3) | (h[4] >> (32 - p3)))
  1467  	tA[5] = ((h[5] << p3) | (h[5] >> (32 - p3)))
  1468  	tA[6] = ((h[6] << p3) | (h[6] >> (32 - p3)))
  1469  	tA[7] = ((h[7] << p3) | (h[7] >> (32 - p3)))
  1470  
  1471  	tp = uint32(h[kIdxD[0]] + w[24] +
  1472  		(((h[kIdxB[0]] ^ h[kIdxC[0]]) & h[kIdxA[0]]) ^ h[kIdxC[0]]))
  1473  	h[kIdxA[0]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]]
  1474  	h[kIdxD[0]] = h[kIdxC[0]]
  1475  	h[kIdxC[0]] = h[kIdxB[0]]
  1476  	h[kIdxB[0]] = tA[0]
  1477  
  1478  	tp = uint32(h[kIdxD[1]] + w[25] +
  1479  		(((h[kIdxB[1]] ^ h[kIdxC[1]]) & h[kIdxA[1]]) ^ h[kIdxC[1]]))
  1480  	h[kIdxA[1]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^1]
  1481  	h[kIdxD[1]] = h[kIdxC[1]]
  1482  	h[kIdxC[1]] = h[kIdxB[1]]
  1483  	h[kIdxB[1]] = tA[1]
  1484  
  1485  	tp = uint32(h[kIdxD[2]] + w[26] +
  1486  		(((h[kIdxB[2]] ^ h[kIdxC[2]]) & h[kIdxA[2]]) ^ h[kIdxC[2]]))
  1487  	h[kIdxA[2]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^2]
  1488  	h[kIdxD[2]] = h[kIdxC[2]]
  1489  	h[kIdxC[2]] = h[kIdxB[2]]
  1490  	h[kIdxB[2]] = tA[2]
  1491  
  1492  	tp = uint32(h[kIdxD[3]] + w[27] +
  1493  		(((h[kIdxB[3]] ^ h[kIdxC[3]]) & h[kIdxA[3]]) ^ h[kIdxC[3]]))
  1494  	h[kIdxA[3]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^3]
  1495  	h[kIdxD[3]] = h[kIdxC[3]]
  1496  	h[kIdxC[3]] = h[kIdxB[3]]
  1497  	h[kIdxB[3]] = tA[3]
  1498  
  1499  	tp = uint32(h[kIdxD[4]] + w[28] +
  1500  		(((h[kIdxB[4]] ^ h[kIdxC[4]]) & h[kIdxA[4]]) ^ h[kIdxC[4]]))
  1501  	h[kIdxA[4]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^4]
  1502  	h[kIdxD[4]] = h[kIdxC[4]]
  1503  	h[kIdxC[4]] = h[kIdxB[4]]
  1504  	h[kIdxB[4]] = tA[4]
  1505  
  1506  	tp = uint32(h[kIdxD[5]] + w[29] +
  1507  		(((h[kIdxB[5]] ^ h[kIdxC[5]]) & h[kIdxA[5]]) ^ h[kIdxC[5]]))
  1508  	h[kIdxA[5]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^5]
  1509  	h[kIdxD[5]] = h[kIdxC[5]]
  1510  	h[kIdxC[5]] = h[kIdxB[5]]
  1511  	h[kIdxB[5]] = tA[5]
  1512  
  1513  	tp = uint32(h[kIdxD[6]] + w[30] +
  1514  		(((h[kIdxB[6]] ^ h[kIdxC[6]]) & h[kIdxA[6]]) ^ h[kIdxC[6]]))
  1515  	h[kIdxA[6]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^6]
  1516  	h[kIdxD[6]] = h[kIdxC[6]]
  1517  	h[kIdxC[6]] = h[kIdxB[6]]
  1518  	h[kIdxB[6]] = tA[6]
  1519  
  1520  	tp = uint32(h[kIdxD[7]] + w[31] +
  1521  		(((h[kIdxB[7]] ^ h[kIdxC[7]]) & h[kIdxA[7]]) ^ h[kIdxC[7]]))
  1522  	h[kIdxA[7]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+3]^7]
  1523  	h[kIdxD[7]] = h[kIdxC[7]]
  1524  	h[kIdxC[7]] = h[kIdxB[7]]
  1525  	h[kIdxB[7]] = tA[7]
  1526  
  1527  	tA[0] = ((h[0] << p0) | (h[0] >> (32 - p0)))
  1528  	tA[1] = ((h[1] << p0) | (h[1] >> (32 - p0)))
  1529  	tA[2] = ((h[2] << p0) | (h[2] >> (32 - p0)))
  1530  	tA[3] = ((h[3] << p0) | (h[3] >> (32 - p0)))
  1531  	tA[4] = ((h[4] << p0) | (h[4] >> (32 - p0)))
  1532  	tA[5] = ((h[5] << p0) | (h[5] >> (32 - p0)))
  1533  	tA[6] = ((h[6] << p0) | (h[6] >> (32 - p0)))
  1534  	tA[7] = ((h[7] << p0) | (h[7] >> (32 - p0)))
  1535  
  1536  	tp = uint32(h[kIdxD[0]] + w[32] +
  1537  		((h[kIdxA[0]] & h[kIdxB[0]]) | ((h[kIdxA[0]] | h[kIdxB[0]]) & h[kIdxC[0]])))
  1538  	h[kIdxA[0]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]]
  1539  	h[kIdxD[0]] = h[kIdxC[0]]
  1540  	h[kIdxC[0]] = h[kIdxB[0]]
  1541  	h[kIdxB[0]] = tA[0]
  1542  
  1543  	tp = uint32(h[kIdxD[1]] + w[33] +
  1544  		((h[kIdxA[1]] & h[kIdxB[1]]) | ((h[kIdxA[1]] | h[kIdxB[1]]) & h[kIdxC[1]])))
  1545  	h[kIdxA[1]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^1]
  1546  	h[kIdxD[1]] = h[kIdxC[1]]
  1547  	h[kIdxC[1]] = h[kIdxB[1]]
  1548  	h[kIdxB[1]] = tA[1]
  1549  
  1550  	tp = uint32(h[kIdxD[2]] + w[34] +
  1551  		((h[kIdxA[2]] & h[kIdxB[2]]) | ((h[kIdxA[2]] | h[kIdxB[2]]) & h[kIdxC[2]])))
  1552  	h[kIdxA[2]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^2]
  1553  	h[kIdxD[2]] = h[kIdxC[2]]
  1554  	h[kIdxC[2]] = h[kIdxB[2]]
  1555  	h[kIdxB[2]] = tA[2]
  1556  
  1557  	tp = uint32(h[kIdxD[3]] + w[35] +
  1558  		((h[kIdxA[3]] & h[kIdxB[3]]) | ((h[kIdxA[3]] | h[kIdxB[3]]) & h[kIdxC[3]])))
  1559  	h[kIdxA[3]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^3]
  1560  	h[kIdxD[3]] = h[kIdxC[3]]
  1561  	h[kIdxC[3]] = h[kIdxB[3]]
  1562  	h[kIdxB[3]] = tA[3]
  1563  
  1564  	tp = uint32(h[kIdxD[4]] + w[36] +
  1565  		((h[kIdxA[4]] & h[kIdxB[4]]) | ((h[kIdxA[4]] | h[kIdxB[4]]) & h[kIdxC[4]])))
  1566  	h[kIdxA[4]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^4]
  1567  	h[kIdxD[4]] = h[kIdxC[4]]
  1568  	h[kIdxC[4]] = h[kIdxB[4]]
  1569  	h[kIdxB[4]] = tA[4]
  1570  
  1571  	tp = uint32(h[kIdxD[5]] + w[37] +
  1572  		((h[kIdxA[5]] & h[kIdxB[5]]) | ((h[kIdxA[5]] | h[kIdxB[5]]) & h[kIdxC[5]])))
  1573  	h[kIdxA[5]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^5]
  1574  	h[kIdxD[5]] = h[kIdxC[5]]
  1575  	h[kIdxC[5]] = h[kIdxB[5]]
  1576  	h[kIdxB[5]] = tA[5]
  1577  
  1578  	tp = uint32(h[kIdxD[6]] + w[38] +
  1579  		((h[kIdxA[6]] & h[kIdxB[6]]) | ((h[kIdxA[6]] | h[kIdxB[6]]) & h[kIdxC[6]])))
  1580  	h[kIdxA[6]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^6]
  1581  	h[kIdxD[6]] = h[kIdxC[6]]
  1582  	h[kIdxC[6]] = h[kIdxB[6]]
  1583  	h[kIdxB[6]] = tA[6]
  1584  
  1585  	tp = uint32(h[kIdxD[7]] + w[39] +
  1586  		((h[kIdxA[7]] & h[kIdxB[7]]) | ((h[kIdxA[7]] | h[kIdxB[7]]) & h[kIdxC[7]])))
  1587  	h[kIdxA[7]] = ((tp << p1) | (tp >> (32 - p1))) + tA[kPrems[isp+4]^7]
  1588  	h[kIdxD[7]] = h[kIdxC[7]]
  1589  	h[kIdxC[7]] = h[kIdxB[7]]
  1590  	h[kIdxB[7]] = tA[7]
  1591  
  1592  	tA[0] = ((h[0] << p1) | (h[0] >> (32 - p1)))
  1593  	tA[1] = ((h[1] << p1) | (h[1] >> (32 - p1)))
  1594  	tA[2] = ((h[2] << p1) | (h[2] >> (32 - p1)))
  1595  	tA[3] = ((h[3] << p1) | (h[3] >> (32 - p1)))
  1596  	tA[4] = ((h[4] << p1) | (h[4] >> (32 - p1)))
  1597  	tA[5] = ((h[5] << p1) | (h[5] >> (32 - p1)))
  1598  	tA[6] = ((h[6] << p1) | (h[6] >> (32 - p1)))
  1599  	tA[7] = ((h[7] << p1) | (h[7] >> (32 - p1)))
  1600  
  1601  	tp = uint32(h[kIdxD[0]] + w[40] +
  1602  		((h[kIdxA[0]] & h[kIdxB[0]]) | ((h[kIdxA[0]] | h[kIdxB[0]]) & h[kIdxC[0]])))
  1603  	h[kIdxA[0]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]]
  1604  	h[kIdxD[0]] = h[kIdxC[0]]
  1605  	h[kIdxC[0]] = h[kIdxB[0]]
  1606  	h[kIdxB[0]] = tA[0]
  1607  
  1608  	tp = uint32(h[kIdxD[1]] + w[41] +
  1609  		((h[kIdxA[1]] & h[kIdxB[1]]) | ((h[kIdxA[1]] | h[kIdxB[1]]) & h[kIdxC[1]])))
  1610  	h[kIdxA[1]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^1]
  1611  	h[kIdxD[1]] = h[kIdxC[1]]
  1612  	h[kIdxC[1]] = h[kIdxB[1]]
  1613  	h[kIdxB[1]] = tA[1]
  1614  
  1615  	tp = uint32(h[kIdxD[2]] + w[42] +
  1616  		((h[kIdxA[2]] & h[kIdxB[2]]) | ((h[kIdxA[2]] | h[kIdxB[2]]) & h[kIdxC[2]])))
  1617  	h[kIdxA[2]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^2]
  1618  	h[kIdxD[2]] = h[kIdxC[2]]
  1619  	h[kIdxC[2]] = h[kIdxB[2]]
  1620  	h[kIdxB[2]] = tA[2]
  1621  
  1622  	tp = uint32(h[kIdxD[3]] + w[43] +
  1623  		((h[kIdxA[3]] & h[kIdxB[3]]) | ((h[kIdxA[3]] | h[kIdxB[3]]) & h[kIdxC[3]])))
  1624  	h[kIdxA[3]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^3]
  1625  	h[kIdxD[3]] = h[kIdxC[3]]
  1626  	h[kIdxC[3]] = h[kIdxB[3]]
  1627  	h[kIdxB[3]] = tA[3]
  1628  
  1629  	tp = uint32(h[kIdxD[4]] + w[44] +
  1630  		((h[kIdxA[4]] & h[kIdxB[4]]) | ((h[kIdxA[4]] | h[kIdxB[4]]) & h[kIdxC[4]])))
  1631  	h[kIdxA[4]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^4]
  1632  	h[kIdxD[4]] = h[kIdxC[4]]
  1633  	h[kIdxC[4]] = h[kIdxB[4]]
  1634  	h[kIdxB[4]] = tA[4]
  1635  
  1636  	tp = uint32(h[kIdxD[5]] + w[45] +
  1637  		((h[kIdxA[5]] & h[kIdxB[5]]) | ((h[kIdxA[5]] | h[kIdxB[5]]) & h[kIdxC[5]])))
  1638  	h[kIdxA[5]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^5]
  1639  	h[kIdxD[5]] = h[kIdxC[5]]
  1640  	h[kIdxC[5]] = h[kIdxB[5]]
  1641  	h[kIdxB[5]] = tA[5]
  1642  
  1643  	tp = uint32(h[kIdxD[6]] + w[46] +
  1644  		((h[kIdxA[6]] & h[kIdxB[6]]) | ((h[kIdxA[6]] | h[kIdxB[6]]) & h[kIdxC[6]])))
  1645  	h[kIdxA[6]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^6]
  1646  	h[kIdxD[6]] = h[kIdxC[6]]
  1647  	h[kIdxC[6]] = h[kIdxB[6]]
  1648  	h[kIdxB[6]] = tA[6]
  1649  
  1650  	tp = uint32(h[kIdxD[7]] + w[47] +
  1651  		((h[kIdxA[7]] & h[kIdxB[7]]) | ((h[kIdxA[7]] | h[kIdxB[7]]) & h[kIdxC[7]])))
  1652  	h[kIdxA[7]] = ((tp << p2) | (tp >> (32 - p2))) + tA[kPrems[isp+5]^7]
  1653  	h[kIdxD[7]] = h[kIdxC[7]]
  1654  	h[kIdxC[7]] = h[kIdxB[7]]
  1655  	h[kIdxB[7]] = tA[7]
  1656  
  1657  	tA[0] = ((h[0] << p2) | (h[0] >> (32 - p2)))
  1658  	tA[1] = ((h[1] << p2) | (h[1] >> (32 - p2)))
  1659  	tA[2] = ((h[2] << p2) | (h[2] >> (32 - p2)))
  1660  	tA[3] = ((h[3] << p2) | (h[3] >> (32 - p2)))
  1661  	tA[4] = ((h[4] << p2) | (h[4] >> (32 - p2)))
  1662  	tA[5] = ((h[5] << p2) | (h[5] >> (32 - p2)))
  1663  	tA[6] = ((h[6] << p2) | (h[6] >> (32 - p2)))
  1664  	tA[7] = ((h[7] << p2) | (h[7] >> (32 - p2)))
  1665  
  1666  	tp = uint32(h[kIdxD[0]] + w[48] +
  1667  		((h[kIdxA[0]] & h[kIdxB[0]]) | ((h[kIdxA[0]] | h[kIdxB[0]]) & h[kIdxC[0]])))
  1668  	h[kIdxA[0]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]]
  1669  	h[kIdxD[0]] = h[kIdxC[0]]
  1670  	h[kIdxC[0]] = h[kIdxB[0]]
  1671  	h[kIdxB[0]] = tA[0]
  1672  
  1673  	tp = uint32(h[kIdxD[1]] + w[49] +
  1674  		((h[kIdxA[1]] & h[kIdxB[1]]) | ((h[kIdxA[1]] | h[kIdxB[1]]) & h[kIdxC[1]])))
  1675  	h[kIdxA[1]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^1]
  1676  	h[kIdxD[1]] = h[kIdxC[1]]
  1677  	h[kIdxC[1]] = h[kIdxB[1]]
  1678  	h[kIdxB[1]] = tA[1]
  1679  
  1680  	tp = uint32(h[kIdxD[2]] + w[50] +
  1681  		((h[kIdxA[2]] & h[kIdxB[2]]) | ((h[kIdxA[2]] | h[kIdxB[2]]) & h[kIdxC[2]])))
  1682  	h[kIdxA[2]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^2]
  1683  	h[kIdxD[2]] = h[kIdxC[2]]
  1684  	h[kIdxC[2]] = h[kIdxB[2]]
  1685  	h[kIdxB[2]] = tA[2]
  1686  
  1687  	tp = uint32(h[kIdxD[3]] + w[51] +
  1688  		((h[kIdxA[3]] & h[kIdxB[3]]) | ((h[kIdxA[3]] | h[kIdxB[3]]) & h[kIdxC[3]])))
  1689  	h[kIdxA[3]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^3]
  1690  	h[kIdxD[3]] = h[kIdxC[3]]
  1691  	h[kIdxC[3]] = h[kIdxB[3]]
  1692  	h[kIdxB[3]] = tA[3]
  1693  
  1694  	tp = uint32(h[kIdxD[4]] + w[52] +
  1695  		((h[kIdxA[4]] & h[kIdxB[4]]) | ((h[kIdxA[4]] | h[kIdxB[4]]) & h[kIdxC[4]])))
  1696  	h[kIdxA[4]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^4]
  1697  	h[kIdxD[4]] = h[kIdxC[4]]
  1698  	h[kIdxC[4]] = h[kIdxB[4]]
  1699  	h[kIdxB[4]] = tA[4]
  1700  
  1701  	tp = uint32(h[kIdxD[5]] + w[53] +
  1702  		((h[kIdxA[5]] & h[kIdxB[5]]) | ((h[kIdxA[5]] | h[kIdxB[5]]) & h[kIdxC[5]])))
  1703  	h[kIdxA[5]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^5]
  1704  	h[kIdxD[5]] = h[kIdxC[5]]
  1705  	h[kIdxC[5]] = h[kIdxB[5]]
  1706  	h[kIdxB[5]] = tA[5]
  1707  
  1708  	tp = uint32(h[kIdxD[6]] + w[54] +
  1709  		((h[kIdxA[6]] & h[kIdxB[6]]) | ((h[kIdxA[6]] | h[kIdxB[6]]) & h[kIdxC[6]])))
  1710  	h[kIdxA[6]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^6]
  1711  	h[kIdxD[6]] = h[kIdxC[6]]
  1712  	h[kIdxC[6]] = h[kIdxB[6]]
  1713  	h[kIdxB[6]] = tA[6]
  1714  
  1715  	tp = uint32(h[kIdxD[7]] + w[55] +
  1716  		((h[kIdxA[7]] & h[kIdxB[7]]) | ((h[kIdxA[7]] | h[kIdxB[7]]) & h[kIdxC[7]])))
  1717  	h[kIdxA[7]] = ((tp << p3) | (tp >> (32 - p3))) + tA[kPrems[isp+6]^7]
  1718  	h[kIdxD[7]] = h[kIdxC[7]]
  1719  	h[kIdxC[7]] = h[kIdxB[7]]
  1720  	h[kIdxB[7]] = tA[7]
  1721  
  1722  	tA[0] = ((h[0] << p3) | (h[0] >> (32 - p3)))
  1723  	tA[1] = ((h[1] << p3) | (h[1] >> (32 - p3)))
  1724  	tA[2] = ((h[2] << p3) | (h[2] >> (32 - p3)))
  1725  	tA[3] = ((h[3] << p3) | (h[3] >> (32 - p3)))
  1726  	tA[4] = ((h[4] << p3) | (h[4] >> (32 - p3)))
  1727  	tA[5] = ((h[5] << p3) | (h[5] >> (32 - p3)))
  1728  	tA[6] = ((h[6] << p3) | (h[6] >> (32 - p3)))
  1729  	tA[7] = ((h[7] << p3) | (h[7] >> (32 - p3)))
  1730  
  1731  	tp = uint32(h[kIdxD[0]] + w[56] +
  1732  		((h[kIdxA[0]] & h[kIdxB[0]]) | ((h[kIdxA[0]] | h[kIdxB[0]]) & h[kIdxC[0]])))
  1733  	h[kIdxA[0]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]]
  1734  	h[kIdxD[0]] = h[kIdxC[0]]
  1735  	h[kIdxC[0]] = h[kIdxB[0]]
  1736  	h[kIdxB[0]] = tA[0]
  1737  
  1738  	tp = uint32(h[kIdxD[1]] + w[57] +
  1739  		((h[kIdxA[1]] & h[kIdxB[1]]) | ((h[kIdxA[1]] | h[kIdxB[1]]) & h[kIdxC[1]])))
  1740  	h[kIdxA[1]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^1]
  1741  	h[kIdxD[1]] = h[kIdxC[1]]
  1742  	h[kIdxC[1]] = h[kIdxB[1]]
  1743  	h[kIdxB[1]] = tA[1]
  1744  
  1745  	tp = uint32(h[kIdxD[2]] + w[58] +
  1746  		((h[kIdxA[2]] & h[kIdxB[2]]) | ((h[kIdxA[2]] | h[kIdxB[2]]) & h[kIdxC[2]])))
  1747  	h[kIdxA[2]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^2]
  1748  	h[kIdxD[2]] = h[kIdxC[2]]
  1749  	h[kIdxC[2]] = h[kIdxB[2]]
  1750  	h[kIdxB[2]] = tA[2]
  1751  
  1752  	tp = uint32(h[kIdxD[3]] + w[59] +
  1753  		((h[kIdxA[3]] & h[kIdxB[3]]) | ((h[kIdxA[3]] | h[kIdxB[3]]) & h[kIdxC[3]])))
  1754  	h[kIdxA[3]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^3]
  1755  	h[kIdxD[3]] = h[kIdxC[3]]
  1756  	h[kIdxC[3]] = h[kIdxB[3]]
  1757  	h[kIdxB[3]] = tA[3]
  1758  
  1759  	tp = uint32(h[kIdxD[4]] + w[60] +
  1760  		((h[kIdxA[4]] & h[kIdxB[4]]) | ((h[kIdxA[4]] | h[kIdxB[4]]) & h[kIdxC[4]])))
  1761  	h[kIdxA[4]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^4]
  1762  	h[kIdxD[4]] = h[kIdxC[4]]
  1763  	h[kIdxC[4]] = h[kIdxB[4]]
  1764  	h[kIdxB[4]] = tA[4]
  1765  
  1766  	tp = uint32(h[kIdxD[5]] + w[61] +
  1767  		((h[kIdxA[5]] & h[kIdxB[5]]) | ((h[kIdxA[5]] | h[kIdxB[5]]) & h[kIdxC[5]])))
  1768  	h[kIdxA[5]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^5]
  1769  	h[kIdxD[5]] = h[kIdxC[5]]
  1770  	h[kIdxC[5]] = h[kIdxB[5]]
  1771  	h[kIdxB[5]] = tA[5]
  1772  
  1773  	tp = uint32(h[kIdxD[6]] + w[62] +
  1774  		((h[kIdxA[6]] & h[kIdxB[6]]) | ((h[kIdxA[6]] | h[kIdxB[6]]) & h[kIdxC[6]])))
  1775  	h[kIdxA[6]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^6]
  1776  	h[kIdxD[6]] = h[kIdxC[6]]
  1777  	h[kIdxC[6]] = h[kIdxB[6]]
  1778  	h[kIdxB[6]] = tA[6]
  1779  
  1780  	tp = uint32(h[kIdxD[7]] + w[63] +
  1781  		((h[kIdxA[7]] & h[kIdxB[7]]) | ((h[kIdxA[7]] | h[kIdxB[7]]) & h[kIdxC[7]])))
  1782  	h[kIdxA[7]] = ((tp << p0) | (tp >> (32 - p0))) + tA[kPrems[isp+7]^7]
  1783  	h[kIdxD[7]] = h[kIdxC[7]]
  1784  	h[kIdxC[7]] = h[kIdxB[7]]
  1785  	h[kIdxB[7]] = tA[7]
  1786  }
  1787  
  1788  ////////////////
  1789  
  1790  var kInit = []uint32{
  1791  	uint32(0x0BA16B95), uint32(0x72F999AD),
  1792  	uint32(0x9FECC2AE), uint32(0xBA3264FC),
  1793  	uint32(0x5E894929), uint32(0x8E9F30E5),
  1794  	uint32(0x2F1DAA37), uint32(0xF0F2C558),
  1795  	uint32(0xAC506643), uint32(0xA90635A5),
  1796  	uint32(0xE25B878B), uint32(0xAAB7878F),
  1797  	uint32(0x88817F7A), uint32(0x0A02892B),
  1798  	uint32(0x559A7550), uint32(0x598F657E),
  1799  	uint32(0x7EEF60A1), uint32(0x6B70E3E8),
  1800  	uint32(0x9C1714D1), uint32(0xB958E2A8),
  1801  	uint32(0xAB02675E), uint32(0xED1C014F),
  1802  	uint32(0xCD8D65BB), uint32(0xFDB7A257),
  1803  	uint32(0x09254899), uint32(0xD699C7BC),
  1804  	uint32(0x9019B6DC), uint32(0x2B9022E4),
  1805  	uint32(0x8FA14956), uint32(0x21BF9BD3),
  1806  	uint32(0xB94D0943), uint32(0x6FFDDC22),
  1807  }
  1808  
  1809  var kIdxA = [8]uint8{
  1810  	0, 1, 2, 3, 4, 5, 6, 7,
  1811  }
  1812  var kIdxB = [8]uint8{
  1813  	8, 9, 10, 11, 12, 13, 14, 15,
  1814  }
  1815  var kIdxC = [8]uint8{
  1816  	16, 17, 18, 19, 20, 21, 22, 23,
  1817  }
  1818  var kIdxD = [8]uint8{
  1819  	24, 25, 26, 27, 28, 29, 30, 31,
  1820  }
  1821  
  1822  var kPrems = []uint8{
  1823  	1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3,
  1824  }
  1825  
  1826  var kPrem = [7][8]uint8{
  1827  	{1, 0, 3, 2, 5, 4, 7, 6},
  1828  	{6, 7, 4, 5, 2, 3, 0, 1},
  1829  	{2, 3, 0, 1, 6, 7, 4, 5},
  1830  	{3, 2, 1, 0, 7, 6, 5, 4},
  1831  	{5, 4, 7, 6, 1, 0, 3, 2},
  1832  	{7, 6, 5, 4, 3, 2, 1, 0},
  1833  	{4, 5, 6, 7, 0, 1, 2, 3},
  1834  }
  1835  
  1836  var wbp = [32]uintptr{
  1837  	4 << 4, 6 << 4, 0 << 4, 2 << 4,
  1838  	7 << 4, 5 << 4, 3 << 4, 1 << 4,
  1839  	15 << 4, 11 << 4, 12 << 4, 8 << 4,
  1840  	9 << 4, 13 << 4, 10 << 4, 14 << 4,
  1841  	17 << 4, 18 << 4, 23 << 4, 20 << 4,
  1842  	22 << 4, 21 << 4, 16 << 4, 19 << 4,
  1843  	30 << 4, 24 << 4, 25 << 4, 31 << 4,
  1844  	27 << 4, 29 << 4, 28 << 4, 26 << 4,
  1845  }
  1846  
  1847  var kAlphaTab = []int32{
  1848  	1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130,
  1849  	190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28,
  1850  	120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180,
  1851  	184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
  1852  	8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12,
  1853  	235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224,
  1854  	189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155,
  1855  	187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
  1856  	64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96,
  1857  	81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250,
  1858  	227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212,
  1859  	211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
  1860  	255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254,
  1861  	134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201,
  1862  	17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154,
  1863  	146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
  1864  	241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233,
  1865  	44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66,
  1866  	136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204,
  1867  	140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
  1868  	129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65,
  1869  	95, 40, 98, 163,
  1870  }
  1871  
  1872  var kYOffA = []int32{
  1873  	1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
  1874  	23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
  1875  	15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
  1876  	88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
  1877  	225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
  1878  	35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
  1879  	34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
  1880  	11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
  1881  	253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
  1882  	165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
  1883  	197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
  1884  	162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
  1885  	128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
  1886  	117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
  1887  	121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
  1888  	213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
  1889  	16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
  1890  	111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
  1891  	240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
  1892  	123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
  1893  	2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
  1894  	46, 45, 139, 41,
  1895  }
  1896  
  1897  var kYOffB = []int32{
  1898  	2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20,
  1899  	111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89,
  1900  	49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239,
  1901  	253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79,
  1902  	96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226,
  1903  	248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115,
  1904  	17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208,
  1905  	57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40,
  1906  	189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45,
  1907  	187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107,
  1908  	77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210,
  1909  	139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6,
  1910  	160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190,
  1911  	106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208,
  1912  	181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127,
  1913  	96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193,
  1914  	0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44,
  1915  	245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9,
  1916  	210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94,
  1917  	53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185,
  1918  	10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156,
  1919  	236, 192, 108, 86,
  1920  }