github.com/cloudflare/circl@v1.5.0/simd/keccakf1600/f1600x.go (about)

     1  // Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel.
     2  //
     3  // Keccak-f[1600] is the permutation underlying several algorithms such as
     4  // Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
     5  // useful in some scenarios like in hash-based signatures.
     6  //
     7  // # Limitations
     8  //
     9  // Note that not all architectures support SIMD instructions. This package
    10  // uses AVX2 instructions, which are available on some AMD64 processors,
    11  // and NEON instructions, which are available on some ARM64 processors.
    12  //
    13  // On systems that support neither, the package still provides the expected
    14  // functionality by means of a slower generic implementation.  Callers are
    15  // advised to check IsEnabledX4() and IsEnabledX2() beforehand to determine
    16  // whether the current system supports the SIMD implementation.
    17  package keccakf1600
    18  
    19  import (
    20  	"runtime"
    21  	"unsafe"
    22  
    23  	"github.com/cloudflare/circl/internal/sha3"
    24  	"golang.org/x/sys/cpu"
    25  )
    26  
    27  // StateX4 contains state for the four-way permutation including the four
    28  // interleaved [25]uint64 buffers. Call Initialize() before use to initialize
    29  // and get a pointer to the interleaved buffer.
    30  type StateX4 struct {
    31  	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
    32  	// aligned on 32 bytes for bet performance.  Thus we leave some headroom
    33  	// to be able to move the start of the state.
    34  
    35  	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
    36  	// to fix alignment.
    37  	a [103]uint64
    38  
    39  	// Offset into a that is 32 byte aligned.
    40  	offset int
    41  
    42  	// If true, permute will use 12-round keccak instead of 24-round keccak
    43  	turbo bool
    44  }
    45  
    46  // StateX2 contains state for the two-way permutation including the two
    47  // interleaved [25]uint64 buffers. Call Initialize() before use to initialize
    48  // and get a pointer to the interleaved buffer.
    49  type StateX2 struct {
    50  	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
    51  	// aligned on 32 bytes for bet performance.  Thus we leave some headroom
    52  	// to be able to move the start of the state.
    53  
    54  	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
    55  	// to fix alignment.
    56  	a [53]uint64
    57  
    58  	// Offset into a that is 32 byte aligned.
    59  	offset int
    60  
    61  	// If true, permute will use 12-round keccak instead of 24-round keccak
    62  	turbo bool
    63  }
    64  
    65  // IsEnabledX4 returns true if the architecture supports a four-way SIMD
    66  // implementation provided in this package.
    67  func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
    68  
    69  // IsEnabledX2 returns true if the architecture supports a two-way SIMD
    70  // implementation provided in this package.
    71  func IsEnabledX2() bool { return enabledX2 }
    72  
    73  // Initialize the state and returns the buffer on which the four permutations
    74  // will act: a uint64 slice of length 100.  The first permutation will act
    75  // on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
    76  // If turbo is true, applies 12-round variant instead of the usual 24.
    77  func (s *StateX4) Initialize(turbo bool) []uint64 {
    78  	s.turbo = turbo
    79  	rp := unsafe.Pointer(&s.a[0])
    80  
    81  	// uint64s are always aligned by a multiple of 8.  Compute the remainder
    82  	// of the address modulo 32 divided by 8.
    83  	rem := (int(uintptr(rp)&31) >> 3)
    84  
    85  	if rem != 0 {
    86  		s.offset = 4 - rem
    87  	}
    88  
    89  	// The slice we return will be aligned on 32 byte boundary.
    90  	return s.a[s.offset : s.offset+100]
    91  }
    92  
    93  // Initialize the state and returns the buffer on which the two permutations
    94  // will act: a uint64 slice of length 50.  The first permutation will act
    95  // on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
    96  // If turbo is true, applies 12-round variant instead of the usual 24.
    97  func (s *StateX2) Initialize(turbo bool) []uint64 {
    98  	s.turbo = turbo
    99  	rp := unsafe.Pointer(&s.a[0])
   100  
   101  	// uint64s are always aligned by a multiple of 8.  Compute the remainder
   102  	// of the address modulo 32 divided by 8.
   103  	rem := (int(uintptr(rp)&31) >> 3)
   104  
   105  	if rem != 0 {
   106  		s.offset = 4 - rem
   107  	}
   108  
   109  	// The slice we return will be aligned on 32 byte boundary.
   110  	return s.a[s.offset : s.offset+50]
   111  }
   112  
   113  // Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
   114  // returned from Initialize().
   115  func (s *StateX4) Permute() {
   116  	if IsEnabledX4() {
   117  		permuteSIMDx4(s.a[s.offset:], s.turbo)
   118  	} else {
   119  		permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation.
   120  	}
   121  }
   122  
   123  // Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
   124  // returned from Initialize().
   125  func (s *StateX2) Permute() {
   126  	if IsEnabledX2() {
   127  		permuteSIMDx2(s.a[s.offset:], s.turbo)
   128  	} else {
   129  		permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation.
   130  	}
   131  }
   132  
   133  func permuteScalarX4(a []uint64, turbo bool) {
   134  	var buf [25]uint64
   135  	for i := 0; i < 4; i++ {
   136  		for j := 0; j < 25; j++ {
   137  			buf[j] = a[4*j+i]
   138  		}
   139  		sha3.KeccakF1600(&buf, turbo)
   140  		for j := 0; j < 25; j++ {
   141  			a[4*j+i] = buf[j]
   142  		}
   143  	}
   144  }
   145  
   146  func permuteScalarX2(a []uint64, turbo bool) {
   147  	var buf [25]uint64
   148  	for i := 0; i < 2; i++ {
   149  		for j := 0; j < 25; j++ {
   150  			buf[j] = a[2*j+i]
   151  		}
   152  		sha3.KeccakF1600(&buf, turbo)
   153  		for j := 0; j < 25; j++ {
   154  			a[2*j+i] = buf[j]
   155  		}
   156  	}
   157  }
   158  
   159  var enabledX2 bool
   160  
   161  func init() {
   162  	enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin"
   163  }