// Source: github.com/go-asm/go@v1.21.1-0.20240213172139-40c5ead50c48/chacha8rand/chacha8_generic.go

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// ChaCha8 is ChaCha with 8 rounds.
// See https://cr.yp.to/chacha/chacha-20080128.pdf.
//
// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
//
//	const1 const2 const3 const4
//	seed   seed   seed   seed
//	seed   seed   seed   seed
//	counter64     0      0
//
// We use the same constants as ChaCha20 does, a random seed,
// and a counter. Running ChaCha8 on this input produces
// a 4x4 matrix of pseudo-random values with as much entropy
// as the seed.
//
// Given SIMD registers that can hold N uint32s, it is possible
// to run N ChaCha8 block transformations in parallel by filling
// the first register with N copies of const1, the second
// with N copies of const2, and so on, and then running the operations.
//
// Each iteration of ChaCha8Rand operates over 32 bytes of input and
// produces 992 bytes of RNG output, plus 32 bytes of input for the next
// iteration.
//
// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
// produce 1024 bytes of output (16 blocks, with counters 0 to 15).
// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
// 0x6b206574 are subtracted from the 32-bit little-endian words at
// position 0, 1, 2, and 3 respectively, and an increasing counter
// starting at zero is subtracted from each word at position 12. Then,
// this stream is permuted such that for each sequence of four blocks,
// first we output the first four bytes of each block, then the next four
// bytes of each block, and so on. Finally, the last 32 bytes of output
// are used as the input of the next iteration, and the remaining 992
// bytes are the RNG output.
40 // 41 // See https://c2sp.org/chacha8rand for additional details. 42 // 43 // Normal ChaCha20 implementations for encryption use this same 44 // parallelism but then have to deinterlace the results so that 45 // it appears the blocks were generated separately. For the purposes 46 // of generating random numbers, the interlacing is fine. 47 // We are simply locked in to preserving the 4-way interlacing 48 // in any future optimizations. 49 package chacha8rand 50 51 import ( 52 "unsafe" 53 54 "github.com/go-asm/go/goarch" 55 ) 56 57 // setup sets up 4 ChaCha8 blocks in b32 with the counter and seed. 58 // Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced 59 // the same way they would be in a 4-way SIMD implementations. 60 func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) { 61 // Convert to uint64 to do half as many stores to memory. 62 b := (*[16][2]uint64)(unsafe.Pointer(b32)) 63 64 // Constants; same as in ChaCha20: "expand 32-byte k" 65 b[0][0] = 0x61707865_61707865 66 b[0][1] = 0x61707865_61707865 67 68 b[1][0] = 0x3320646e_3320646e 69 b[1][1] = 0x3320646e_3320646e 70 71 b[2][0] = 0x79622d32_79622d32 72 b[2][1] = 0x79622d32_79622d32 73 74 b[3][0] = 0x6b206574_6b206574 75 b[3][1] = 0x6b206574_6b206574 76 77 // Seed values. 
78 var x64 uint64 79 var x uint32 80 81 x = uint32(seed[0]) 82 x64 = uint64(x)<<32 | uint64(x) 83 b[4][0] = x64 84 b[4][1] = x64 85 86 x = uint32(seed[0] >> 32) 87 x64 = uint64(x)<<32 | uint64(x) 88 b[5][0] = x64 89 b[5][1] = x64 90 91 x = uint32(seed[1]) 92 x64 = uint64(x)<<32 | uint64(x) 93 b[6][0] = x64 94 b[6][1] = x64 95 96 x = uint32(seed[1] >> 32) 97 x64 = uint64(x)<<32 | uint64(x) 98 b[7][0] = x64 99 b[7][1] = x64 100 101 x = uint32(seed[2]) 102 x64 = uint64(x)<<32 | uint64(x) 103 b[8][0] = x64 104 b[8][1] = x64 105 106 x = uint32(seed[2] >> 32) 107 x64 = uint64(x)<<32 | uint64(x) 108 b[9][0] = x64 109 b[9][1] = x64 110 111 x = uint32(seed[3]) 112 x64 = uint64(x)<<32 | uint64(x) 113 b[10][0] = x64 114 b[10][1] = x64 115 116 x = uint32(seed[3] >> 32) 117 x64 = uint64(x)<<32 | uint64(x) 118 b[11][0] = x64 119 b[11][1] = x64 120 121 // Counters. 122 if goarch.BigEndian { 123 b[12][0] = uint64(counter+0)<<32 | uint64(counter+1) 124 b[12][1] = uint64(counter+2)<<32 | uint64(counter+3) 125 } else { 126 b[12][0] = uint64(counter+0) | uint64(counter+1)<<32 127 b[12][1] = uint64(counter+2) | uint64(counter+3)<<32 128 } 129 130 // Zeros. 131 b[13][0] = 0 132 b[13][1] = 0 133 b[14][0] = 0 134 b[14][1] = 0 135 136 b[15][0] = 0 137 b[15][1] = 0 138 } 139 140 func _() { 141 // block and block_generic must have same type 142 x := block 143 x = block_generic 144 _ = x 145 } 146 147 // block_generic is the non-assembly block implementation, 148 // for use on systems without special assembly. 149 // Even on such systems, it is quite fast: on GOOS=386, 150 // ChaCha8 using this code generates random values faster than PCG-DXSM. 151 func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) { 152 b := (*[16][4]uint32)(unsafe.Pointer(buf)) 153 154 setup(seed, b, counter) 155 156 for i := range b[0] { 157 // Load block i from b[*][i] into local variables. 
158 b0 := b[0][i] 159 b1 := b[1][i] 160 b2 := b[2][i] 161 b3 := b[3][i] 162 b4 := b[4][i] 163 b5 := b[5][i] 164 b6 := b[6][i] 165 b7 := b[7][i] 166 b8 := b[8][i] 167 b9 := b[9][i] 168 b10 := b[10][i] 169 b11 := b[11][i] 170 b12 := b[12][i] 171 b13 := b[13][i] 172 b14 := b[14][i] 173 b15 := b[15][i] 174 175 // 4 iterations of eight quarter-rounds each is 8 rounds 176 for round := 0; round < 4; round++ { 177 b0, b4, b8, b12 = qr(b0, b4, b8, b12) 178 b1, b5, b9, b13 = qr(b1, b5, b9, b13) 179 b2, b6, b10, b14 = qr(b2, b6, b10, b14) 180 b3, b7, b11, b15 = qr(b3, b7, b11, b15) 181 182 b0, b5, b10, b15 = qr(b0, b5, b10, b15) 183 b1, b6, b11, b12 = qr(b1, b6, b11, b12) 184 b2, b7, b8, b13 = qr(b2, b7, b8, b13) 185 b3, b4, b9, b14 = qr(b3, b4, b9, b14) 186 } 187 188 // Store block i back into b[*][i]. 189 // Add b4..b11 back to the original key material, 190 // like in ChaCha20, to avoid trivial invertibility. 191 // There is no entropy in b0..b3 and b12..b15 192 // so we can skip the additions and save some time. 193 b[0][i] = b0 194 b[1][i] = b1 195 b[2][i] = b2 196 b[3][i] = b3 197 b[4][i] += b4 198 b[5][i] += b5 199 b[6][i] += b6 200 b[7][i] += b7 201 b[8][i] += b8 202 b[9][i] += b9 203 b[10][i] += b10 204 b[11][i] += b11 205 b[12][i] = b12 206 b[13][i] = b13 207 b[14][i] = b14 208 b[15][i] = b15 209 } 210 211 if goarch.BigEndian { 212 // On a big-endian system, reading the uint32 pairs as uint64s 213 // will word-swap them compared to little-endian, so we word-swap 214 // them here first to make the next swap get the right answer. 215 for i, x := range buf { 216 buf[i] = x>>32 | x<<32 217 } 218 } 219 } 220 221 // qr is the (inlinable) ChaCha8 quarter round. 222 func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) { 223 a += b 224 d ^= a 225 d = d<<16 | d>>16 226 c += d 227 b ^= c 228 b = b<<12 | b>>20 229 a += b 230 d ^= a 231 d = d<<8 | d>>24 232 c += d 233 b ^= c 234 b = b<<7 | b>>25 235 return a, b, c, d 236 }