github.com/go-asm/go@v1.21.1-0.20240213172139-40c5ead50c48/chacha8rand/chacha8_generic.go (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // ChaCha8 is ChaCha with 8 rounds.
     6  // See https://cr.yp.to/chacha/chacha-20080128.pdf.
     7  //
     8  // ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
     9  //
    10  //	const1 const2 const3 const4
    11  //	seed   seed   seed   seed
    12  //	seed   seed   seed   seed
    13  //	counter64     0      0
    14  //
    15  // We use the same constants as ChaCha20 does, a random seed,
    16  // and a counter. Running ChaCha8 on this input produces
    17  // a 4x4 matrix of pseudo-random values with as much entropy
    18  // as the seed.
    19  //
    20  // Given SIMD registers that can hold N uint32s, it is possible
    21  // to run N ChaCha8 block transformations in parallel by filling
    22  // the first register with the N copies of const1, the second
    23  // with N copies of const2, and so on, and then running the operations.
    24  //
    25  // Each iteration of ChaCha8Rand operates over 32 bytes of input and
    26  // produces 992 bytes of RNG output, plus 32 bytes of input for the next
    27  // iteration.
    28  //
    29  // The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
    30  // produce 1024 bytes of output (16 blocks, with counters 0 to 15).
    31  // First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
    32  // 0x6b206574 are subtracted from the 32-bit little-endian words at
    33  // position 0, 1, 2, and 3 respectively, and an increasing counter
    34  // starting at zero is subtracted from each word at position 12. Then,
    35  // this stream is permuted such that for each sequence of four blocks,
    36  // first we output the first four bytes of each block, then the next four
    37  // bytes of each block, and so on. Finally, the last 32 bytes of output
    38  // are used as the input of the next iteration, and the remaining 992
    39  // bytes are the RNG output.
    40  //
    41  // See https://c2sp.org/chacha8rand for additional details.
    42  //
    43  // Normal ChaCha20 implementations for encryption use this same
    44  // parallelism but then have to deinterlace the results so that
    45  // it appears the blocks were generated separately. For the purposes
    46  // of generating random numbers, the interlacing is fine.
    47  // We are simply locked in to preserving the 4-way interlacing
    48  // in any future optimizations.
    49  package chacha8rand
    50  
    51  import (
    52  	"unsafe"
    53  
    54  	"github.com/go-asm/go/goarch"
    55  )
    56  
    57  // setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
    58  // Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
    59  // the same way they would be in a 4-way SIMD implementations.
    60  func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
    61  	// Convert to uint64 to do half as many stores to memory.
    62  	b := (*[16][2]uint64)(unsafe.Pointer(b32))
    63  
    64  	// Constants; same as in ChaCha20: "expand 32-byte k"
    65  	b[0][0] = 0x61707865_61707865
    66  	b[0][1] = 0x61707865_61707865
    67  
    68  	b[1][0] = 0x3320646e_3320646e
    69  	b[1][1] = 0x3320646e_3320646e
    70  
    71  	b[2][0] = 0x79622d32_79622d32
    72  	b[2][1] = 0x79622d32_79622d32
    73  
    74  	b[3][0] = 0x6b206574_6b206574
    75  	b[3][1] = 0x6b206574_6b206574
    76  
    77  	// Seed values.
    78  	var x64 uint64
    79  	var x uint32
    80  
    81  	x = uint32(seed[0])
    82  	x64 = uint64(x)<<32 | uint64(x)
    83  	b[4][0] = x64
    84  	b[4][1] = x64
    85  
    86  	x = uint32(seed[0] >> 32)
    87  	x64 = uint64(x)<<32 | uint64(x)
    88  	b[5][0] = x64
    89  	b[5][1] = x64
    90  
    91  	x = uint32(seed[1])
    92  	x64 = uint64(x)<<32 | uint64(x)
    93  	b[6][0] = x64
    94  	b[6][1] = x64
    95  
    96  	x = uint32(seed[1] >> 32)
    97  	x64 = uint64(x)<<32 | uint64(x)
    98  	b[7][0] = x64
    99  	b[7][1] = x64
   100  
   101  	x = uint32(seed[2])
   102  	x64 = uint64(x)<<32 | uint64(x)
   103  	b[8][0] = x64
   104  	b[8][1] = x64
   105  
   106  	x = uint32(seed[2] >> 32)
   107  	x64 = uint64(x)<<32 | uint64(x)
   108  	b[9][0] = x64
   109  	b[9][1] = x64
   110  
   111  	x = uint32(seed[3])
   112  	x64 = uint64(x)<<32 | uint64(x)
   113  	b[10][0] = x64
   114  	b[10][1] = x64
   115  
   116  	x = uint32(seed[3] >> 32)
   117  	x64 = uint64(x)<<32 | uint64(x)
   118  	b[11][0] = x64
   119  	b[11][1] = x64
   120  
   121  	// Counters.
   122  	if goarch.BigEndian {
   123  		b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
   124  		b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
   125  	} else {
   126  		b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
   127  		b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
   128  	}
   129  
   130  	// Zeros.
   131  	b[13][0] = 0
   132  	b[13][1] = 0
   133  	b[14][0] = 0
   134  	b[14][1] = 0
   135  
   136  	b[15][0] = 0
   137  	b[15][1] = 0
   138  }
   139  
   140  func _() {
   141  	// block and block_generic must have same type
   142  	x := block
   143  	x = block_generic
   144  	_ = x
   145  }
   146  
   147  // block_generic is the non-assembly block implementation,
   148  // for use on systems without special assembly.
   149  // Even on such systems, it is quite fast: on GOOS=386,
   150  // ChaCha8 using this code generates random values faster than PCG-DXSM.
   151  func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
   152  	b := (*[16][4]uint32)(unsafe.Pointer(buf))
   153  
   154  	setup(seed, b, counter)
   155  
   156  	for i := range b[0] {
   157  		// Load block i from b[*][i] into local variables.
   158  		b0 := b[0][i]
   159  		b1 := b[1][i]
   160  		b2 := b[2][i]
   161  		b3 := b[3][i]
   162  		b4 := b[4][i]
   163  		b5 := b[5][i]
   164  		b6 := b[6][i]
   165  		b7 := b[7][i]
   166  		b8 := b[8][i]
   167  		b9 := b[9][i]
   168  		b10 := b[10][i]
   169  		b11 := b[11][i]
   170  		b12 := b[12][i]
   171  		b13 := b[13][i]
   172  		b14 := b[14][i]
   173  		b15 := b[15][i]
   174  
   175  		// 4 iterations of eight quarter-rounds each is 8 rounds
   176  		for round := 0; round < 4; round++ {
   177  			b0, b4, b8, b12 = qr(b0, b4, b8, b12)
   178  			b1, b5, b9, b13 = qr(b1, b5, b9, b13)
   179  			b2, b6, b10, b14 = qr(b2, b6, b10, b14)
   180  			b3, b7, b11, b15 = qr(b3, b7, b11, b15)
   181  
   182  			b0, b5, b10, b15 = qr(b0, b5, b10, b15)
   183  			b1, b6, b11, b12 = qr(b1, b6, b11, b12)
   184  			b2, b7, b8, b13 = qr(b2, b7, b8, b13)
   185  			b3, b4, b9, b14 = qr(b3, b4, b9, b14)
   186  		}
   187  
   188  		// Store block i back into b[*][i].
   189  		// Add b4..b11 back to the original key material,
   190  		// like in ChaCha20, to avoid trivial invertibility.
   191  		// There is no entropy in b0..b3 and b12..b15
   192  		// so we can skip the additions and save some time.
   193  		b[0][i] = b0
   194  		b[1][i] = b1
   195  		b[2][i] = b2
   196  		b[3][i] = b3
   197  		b[4][i] += b4
   198  		b[5][i] += b5
   199  		b[6][i] += b6
   200  		b[7][i] += b7
   201  		b[8][i] += b8
   202  		b[9][i] += b9
   203  		b[10][i] += b10
   204  		b[11][i] += b11
   205  		b[12][i] = b12
   206  		b[13][i] = b13
   207  		b[14][i] = b14
   208  		b[15][i] = b15
   209  	}
   210  
   211  	if goarch.BigEndian {
   212  		// On a big-endian system, reading the uint32 pairs as uint64s
   213  		// will word-swap them compared to little-endian, so we word-swap
   214  		// them here first to make the next swap get the right answer.
   215  		for i, x := range buf {
   216  			buf[i] = x>>32 | x<<32
   217  		}
   218  	}
   219  }
   220  
   221  // qr is the (inlinable) ChaCha8 quarter round.
   222  func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
   223  	a += b
   224  	d ^= a
   225  	d = d<<16 | d>>16
   226  	c += d
   227  	b ^= c
   228  	b = b<<12 | b>>20
   229  	a += b
   230  	d ^= a
   231  	d = d<<8 | d>>24
   232  	c += d
   233  	b ^= c
   234  	b = b<<7 | b>>25
   235  	return a, b, c, d
   236  }