github.com/cloudflare/circl@v1.5.0/simd/keccakf1600/f1600x.go (about) 1 // Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel. 2 // 3 // Keccak-f[1600] is the permutation underlying several algorithms such as 4 // Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is 5 // useful in some scenarios like in hash-based signatures. 6 // 7 // # Limitations 8 // 9 // Note that not all the architectures support SIMD instructions. This package 10 // uses AVX2 instructions that are available in some AMD64 architectures 11 // and NEON instructions that are available in some ARM64 architectures. 12 // 13 // For those systems not supporting these, the package still provides the 14 // expected functionality by means of a generic and slow implementation. 15 // The recommendation is to beforehand verify IsEnabledX4() and IsEnabledX2() 16 // to determine if the current system supports the SIMD implementation. 17 package keccakf1600 18 19 import ( 20 "runtime" 21 "unsafe" 22 23 "github.com/cloudflare/circl/internal/sha3" 24 "golang.org/x/sys/cpu" 25 ) 26 27 // StateX4 contains state for the four-way permutation including the four 28 // interleaved [25]uint64 buffers. Call Initialize() before use to initialize 29 // and get a pointer to the interleaved buffer. 30 type StateX4 struct { 31 // Go guarantees a to be aligned on 8 bytes, whereas we need it to be 32 // aligned on 32 bytes for bet performance. Thus we leave some headroom 33 // to be able to move the start of the state. 34 35 // 4 x 25 uint64s for the interleaved states and three uint64s headroom 36 // to fix alignment. 37 a [103]uint64 38 39 // Offset into a that is 32 byte aligned. 40 offset int 41 42 // If true, permute will use 12-round keccak instead of 24-round keccak 43 turbo bool 44 } 45 46 // StateX2 contains state for the two-way permutation including the two 47 // interleaved [25]uint64 buffers. Call Initialize() before use to initialize 48 // and get a pointer to the interleaved buffer. 49 type StateX2 struct { 50 // Go guarantees a to be aligned on 8 bytes, whereas we need it to be 51 // aligned on 32 bytes for bet performance. Thus we leave some headroom 52 // to be able to move the start of the state. 53 54 // 2 x 25 uint64s for the interleaved states and three uint64s headroom 55 // to fix alignment. 56 a [53]uint64 57 58 // Offset into a that is 32 byte aligned. 59 offset int 60 61 // If true, permute will use 12-round keccak instead of 24-round keccak 62 turbo bool 63 } 64 65 // IsEnabledX4 returns true if the architecture supports a four-way SIMD 66 // implementation provided in this package. 67 func IsEnabledX4() bool { return cpu.X86.HasAVX2 } 68 69 // IsEnabledX2 returns true if the architecture supports a two-way SIMD 70 // implementation provided in this package. 71 func IsEnabledX2() bool { return enabledX2 } 72 73 // Initialize the state and returns the buffer on which the four permutations 74 // will act: a uint64 slice of length 100. The first permutation will act 75 // on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc. 76 // If turbo is true, applies 12-round variant instead of the usual 24. 77 func (s *StateX4) Initialize(turbo bool) []uint64 { 78 s.turbo = turbo 79 rp := unsafe.Pointer(&s.a[0]) 80 81 // uint64s are always aligned by a multiple of 8. Compute the remainder 82 // of the address modulo 32 divided by 8. 83 rem := (int(uintptr(rp)&31) >> 3) 84 85 if rem != 0 { 86 s.offset = 4 - rem 87 } 88 89 // The slice we return will be aligned on 32 byte boundary. 90 return s.a[s.offset : s.offset+100] 91 } 92 93 // Initialize the state and returns the buffer on which the two permutations 94 // will act: a uint64 slice of length 50. The first permutation will act 95 // on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}. 96 // If turbo is true, applies 12-round variant instead of the usual 24. 97 func (s *StateX2) Initialize(turbo bool) []uint64 { 98 s.turbo = turbo 99 rp := unsafe.Pointer(&s.a[0]) 100 101 // uint64s are always aligned by a multiple of 8. Compute the remainder 102 // of the address modulo 32 divided by 8. 103 rem := (int(uintptr(rp)&31) >> 3) 104 105 if rem != 0 { 106 s.offset = 4 - rem 107 } 108 109 // The slice we return will be aligned on 32 byte boundary. 110 return s.a[s.offset : s.offset+50] 111 } 112 113 // Permute performs the four parallel Keccak-f[1600]s interleaved on the slice 114 // returned from Initialize(). 115 func (s *StateX4) Permute() { 116 if IsEnabledX4() { 117 permuteSIMDx4(s.a[s.offset:], s.turbo) 118 } else { 119 permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation. 120 } 121 } 122 123 // Permute performs the two parallel Keccak-f[1600]s interleaved on the slice 124 // returned from Initialize(). 125 func (s *StateX2) Permute() { 126 if IsEnabledX2() { 127 permuteSIMDx2(s.a[s.offset:], s.turbo) 128 } else { 129 permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation. 130 } 131 } 132 133 func permuteScalarX4(a []uint64, turbo bool) { 134 var buf [25]uint64 135 for i := 0; i < 4; i++ { 136 for j := 0; j < 25; j++ { 137 buf[j] = a[4*j+i] 138 } 139 sha3.KeccakF1600(&buf, turbo) 140 for j := 0; j < 25; j++ { 141 a[4*j+i] = buf[j] 142 } 143 } 144 } 145 146 func permuteScalarX2(a []uint64, turbo bool) { 147 var buf [25]uint64 148 for i := 0; i < 2; i++ { 149 for j := 0; j < 25; j++ { 150 buf[j] = a[2*j+i] 151 } 152 sha3.KeccakF1600(&buf, turbo) 153 for j := 0; j < 25; j++ { 154 a[2*j+i] = buf[j] 155 } 156 } 157 } 158 159 var enabledX2 bool 160 161 func init() { 162 enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin" 163 }