github.com/consensys/gnark-crypto@v0.14.0/ecc/bls12-377/fr/fft/bitreverse.go (about)

     1  // Copyright 2020 Consensys Software Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Code generated by consensys/gnark-crypto DO NOT EDIT
    16  
    17  package fft
    18  
    19  import (
    20  	"math/bits"
    21  	"runtime"
    22  
    23  	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
    24  )
    25  
    26  // BitReverse applies the bit-reversal permutation to v.
    27  // len(v) must be a power of 2
    28  func BitReverse(v []fr.Element) {
    29  	n := uint64(len(v))
    30  	if bits.OnesCount64(n) != 1 {
    31  		panic("len(a) must be a power of 2")
    32  	}
    33  
    34  	if runtime.GOARCH == "arm64" {
    35  		bitReverseNaive(v)
    36  	} else {
    37  		bitReverseCobra(v)
    38  	}
    39  }
    40  
    41  // bitReverseNaive applies the bit-reversal permutation to v.
    42  // len(v) must be a power of 2
    43  func bitReverseNaive(v []fr.Element) {
    44  	n := uint64(len(v))
    45  	nn := uint64(64 - bits.TrailingZeros64(n))
    46  
    47  	for i := uint64(0); i < n; i++ {
    48  		iRev := bits.Reverse64(i) >> nn
    49  		if iRev > i {
    50  			v[i], v[iRev] = v[iRev], v[i]
    51  		}
    52  	}
    53  }
    54  
    55  // bitReverseCobraInPlace applies the bit-reversal permutation to v.
    56  // len(v) must be a power of 2
    57  // This is derived from:
    58  //
    59  //   - Towards an Optimal Bit-Reversal Permutation Program
    60  //     Larry Carter and Kang Su Gatlin, 1998
    61  //     https://csaws.cs.technion.ac.il/~itai/Courses/Cache/bit.pdf
    62  //
    63  //   - Practically efficient methods for performing bit-reversed
    64  //     permutation in C++11 on the x86-64 architecture
    65  //     Knauth, Adas, Whitfield, Wang, Ickler, Conrad, Serang, 2017
    66  //     https://arxiv.org/pdf/1708.01873.pdf
    67  //
    68  //   - and more specifically, constantine implementation:
    69  //     https://github.com/mratsim/constantine/blob/d51699248db04e29c7b1ad97e0bafa1499db00b5/constantine/math/polynomials/fft.nim#L205
    70  //     by Mamy Ratsimbazafy (@mratsim).
    71  func bitReverseCobraInPlace(v []fr.Element) {
    72  	logN := uint64(bits.Len64(uint64(len(v))) - 1)
    73  	logTileSize := deriveLogTileSize(logN)
    74  	logBLen := logN - 2*logTileSize
    75  	bLen := uint64(1) << logBLen
    76  	bShift := logBLen + logTileSize
    77  	tileSize := uint64(1) << logTileSize
    78  
    79  	// rough idea;
    80  	// bit reversal permutation naive implementation may have some cache associativity issues,
    81  	// since we are accessing elements by strides of powers of 2.
    82  	// on large inputs, this is noticeable and can be improved by using a t buffer.
    83  	// idea is for t buffer to be small enough to fit in cache.
    84  	// in the first inner loop, we copy the elements of v into t in a bit-reversed order.
    85  	// in the subsequent inner loops, accesses have much better cache locality than the naive implementation.
    86  	// hence even if we apparently do more work (swaps / copies), we are faster.
    87  	//
    88  	// on arm64 (and particularly on M1 macs), this is not noticeable, and the naive implementation is faster,
    89  	// in most cases.
    90  	// on x86 (and particularly on aws hpc6a) this is noticeable, and the t buffer implementation is faster (up to 3x).
    91  	//
    92  	// optimal choice for the tile size is cache dependent; in theory, we want the t buffer to fit in the L1 cache;
    93  	// in practice, a common size for L1 is 64kb, a field element is 32bytes or more.
    94  	// hence we can fit 2k elements in the L1 cache, which corresponds to a tile size of 2**5 with some margin for cache conflicts.
    95  	//
    96  	// for most sizes of interest, this tile size choice doesn't yield good results;
    97  	// we find that a tile size of 2**9 gives best results for input sizes from 2**21 up to 2**27+.
    98  	t := make([]fr.Element, tileSize*tileSize)
    99  
   100  	// see https://csaws.cs.technion.ac.il/~itai/Courses/Cache/bit.pdf
   101  	// for a detailed explanation of the algorithm.
   102  	for b := uint64(0); b < bLen; b++ {
   103  
   104  		for a := uint64(0); a < tileSize; a++ {
   105  			aRev := (bits.Reverse64(a) >> (64 - logTileSize)) << logTileSize
   106  			for c := uint64(0); c < tileSize; c++ {
   107  				idx := (a << bShift) | (b << logTileSize) | c
   108  				t[aRev|c] = v[idx]
   109  			}
   110  		}
   111  
   112  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   113  
   114  		for c := uint64(0); c < tileSize; c++ {
   115  			cRev := ((bits.Reverse64(c) >> (64 - logTileSize)) << bShift) | bRev
   116  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   117  				a := bits.Reverse64(aRev) >> (64 - logTileSize)
   118  				idx := (a << bShift) | (b << logTileSize) | c
   119  				idxRev := cRev | aRev
   120  				if idx < idxRev {
   121  					tIdx := (aRev << logTileSize) | c
   122  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   123  				}
   124  			}
   125  		}
   126  
   127  		for a := uint64(0); a < tileSize; a++ {
   128  			aRev := bits.Reverse64(a) >> (64 - logTileSize)
   129  			for c := uint64(0); c < tileSize; c++ {
   130  				cRev := (bits.Reverse64(c) >> (64 - logTileSize)) << bShift
   131  				idx := (a << bShift) | (b << logTileSize) | c
   132  				idxRev := cRev | bRev | aRev
   133  				if idx < idxRev {
   134  					tIdx := (aRev << logTileSize) | c
   135  					v[idx], t[tIdx] = t[tIdx], v[idx]
   136  				}
   137  			}
   138  		}
   139  	}
   140  }
   141  
   142  func bitReverseCobra(v []fr.Element) {
   143  	switch len(v) {
   144  	case 1 << 21:
   145  		bitReverseCobraInPlace_9_21(v)
   146  	case 1 << 22:
   147  		bitReverseCobraInPlace_9_22(v)
   148  	case 1 << 23:
   149  		bitReverseCobraInPlace_9_23(v)
   150  	case 1 << 24:
   151  		bitReverseCobraInPlace_9_24(v)
   152  	case 1 << 25:
   153  		bitReverseCobraInPlace_9_25(v)
   154  	case 1 << 26:
   155  		bitReverseCobraInPlace_9_26(v)
   156  	case 1 << 27:
   157  		bitReverseCobraInPlace_9_27(v)
   158  	default:
   159  		if len(v) > 1<<27 {
   160  			bitReverseCobraInPlace(v)
   161  		} else {
   162  			bitReverseNaive(v)
   163  		}
   164  	}
   165  }
   166  
   167  func deriveLogTileSize(logN uint64) uint64 {
   168  	q := uint64(9) // see bitReverseCobraInPlace for more details
   169  
   170  	for int(logN)-int(2*q) <= 0 {
   171  		q--
   172  	}
   173  
   174  	return q
   175  }
   176  
   177  // bitReverseCobraInPlace_9_21 applies the bit-reversal permutation to v.
   178  // len(v) must be 1 << 21.
   179  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   180  // as it declares the t buffer and various constants statically for performance.
   181  func bitReverseCobraInPlace_9_21(v []fr.Element) {
   182  	const (
   183  		logTileSize = uint64(9)
   184  		tileSize    = uint64(1) << logTileSize
   185  		logN        = 21
   186  		logBLen     = logN - 2*logTileSize
   187  		bShift      = logBLen + logTileSize
   188  		bLen        = uint64(1) << logBLen
   189  	)
   190  
   191  	var t [tileSize * tileSize]fr.Element
   192  
   193  	for b := uint64(0); b < bLen; b++ {
   194  
   195  		for a := uint64(0); a < tileSize; a++ {
   196  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   197  			for c := uint64(0); c < tileSize; c++ {
   198  				idx := (a << bShift) | (b << logTileSize) | c
   199  				t[aRev|c] = v[idx]
   200  			}
   201  		}
   202  
   203  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   204  
   205  		for c := uint64(0); c < tileSize; c++ {
   206  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   207  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   208  				a := bits.Reverse64(aRev) >> 55
   209  				idx := (a << bShift) | (b << logTileSize) | c
   210  				idxRev := cRev | aRev
   211  				if idx < idxRev {
   212  					tIdx := (aRev << logTileSize) | c
   213  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   214  				}
   215  			}
   216  		}
   217  
   218  		for a := uint64(0); a < tileSize; a++ {
   219  			aRev := bits.Reverse64(a) >> 55
   220  			for c := uint64(0); c < tileSize; c++ {
   221  				cRev := (bits.Reverse64(c) >> 55) << bShift
   222  				idx := (a << bShift) | (b << logTileSize) | c
   223  				idxRev := cRev | bRev | aRev
   224  				if idx < idxRev {
   225  					tIdx := (aRev << logTileSize) | c
   226  					v[idx], t[tIdx] = t[tIdx], v[idx]
   227  				}
   228  			}
   229  		}
   230  	}
   231  
   232  }
   233  
   234  // bitReverseCobraInPlace_9_22 applies the bit-reversal permutation to v.
   235  // len(v) must be 1 << 22.
   236  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   237  // as it declares the t buffer and various constants statically for performance.
   238  func bitReverseCobraInPlace_9_22(v []fr.Element) {
   239  	const (
   240  		logTileSize = uint64(9)
   241  		tileSize    = uint64(1) << logTileSize
   242  		logN        = 22
   243  		logBLen     = logN - 2*logTileSize
   244  		bShift      = logBLen + logTileSize
   245  		bLen        = uint64(1) << logBLen
   246  	)
   247  
   248  	var t [tileSize * tileSize]fr.Element
   249  
   250  	for b := uint64(0); b < bLen; b++ {
   251  
   252  		for a := uint64(0); a < tileSize; a++ {
   253  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   254  			for c := uint64(0); c < tileSize; c++ {
   255  				idx := (a << bShift) | (b << logTileSize) | c
   256  				t[aRev|c] = v[idx]
   257  			}
   258  		}
   259  
   260  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   261  
   262  		for c := uint64(0); c < tileSize; c++ {
   263  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   264  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   265  				a := bits.Reverse64(aRev) >> 55
   266  				idx := (a << bShift) | (b << logTileSize) | c
   267  				idxRev := cRev | aRev
   268  				if idx < idxRev {
   269  					tIdx := (aRev << logTileSize) | c
   270  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   271  				}
   272  			}
   273  		}
   274  
   275  		for a := uint64(0); a < tileSize; a++ {
   276  			aRev := bits.Reverse64(a) >> 55
   277  			for c := uint64(0); c < tileSize; c++ {
   278  				cRev := (bits.Reverse64(c) >> 55) << bShift
   279  				idx := (a << bShift) | (b << logTileSize) | c
   280  				idxRev := cRev | bRev | aRev
   281  				if idx < idxRev {
   282  					tIdx := (aRev << logTileSize) | c
   283  					v[idx], t[tIdx] = t[tIdx], v[idx]
   284  				}
   285  			}
   286  		}
   287  	}
   288  
   289  }
   290  
   291  // bitReverseCobraInPlace_9_23 applies the bit-reversal permutation to v.
   292  // len(v) must be 1 << 23.
   293  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   294  // as it declares the t buffer and various constants statically for performance.
   295  func bitReverseCobraInPlace_9_23(v []fr.Element) {
   296  	const (
   297  		logTileSize = uint64(9)
   298  		tileSize    = uint64(1) << logTileSize
   299  		logN        = 23
   300  		logBLen     = logN - 2*logTileSize
   301  		bShift      = logBLen + logTileSize
   302  		bLen        = uint64(1) << logBLen
   303  	)
   304  
   305  	var t [tileSize * tileSize]fr.Element
   306  
   307  	for b := uint64(0); b < bLen; b++ {
   308  
   309  		for a := uint64(0); a < tileSize; a++ {
   310  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   311  			for c := uint64(0); c < tileSize; c++ {
   312  				idx := (a << bShift) | (b << logTileSize) | c
   313  				t[aRev|c] = v[idx]
   314  			}
   315  		}
   316  
   317  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   318  
   319  		for c := uint64(0); c < tileSize; c++ {
   320  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   321  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   322  				a := bits.Reverse64(aRev) >> 55
   323  				idx := (a << bShift) | (b << logTileSize) | c
   324  				idxRev := cRev | aRev
   325  				if idx < idxRev {
   326  					tIdx := (aRev << logTileSize) | c
   327  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   328  				}
   329  			}
   330  		}
   331  
   332  		for a := uint64(0); a < tileSize; a++ {
   333  			aRev := bits.Reverse64(a) >> 55
   334  			for c := uint64(0); c < tileSize; c++ {
   335  				cRev := (bits.Reverse64(c) >> 55) << bShift
   336  				idx := (a << bShift) | (b << logTileSize) | c
   337  				idxRev := cRev | bRev | aRev
   338  				if idx < idxRev {
   339  					tIdx := (aRev << logTileSize) | c
   340  					v[idx], t[tIdx] = t[tIdx], v[idx]
   341  				}
   342  			}
   343  		}
   344  	}
   345  
   346  }
   347  
   348  // bitReverseCobraInPlace_9_24 applies the bit-reversal permutation to v.
   349  // len(v) must be 1 << 24.
   350  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   351  // as it declares the t buffer and various constants statically for performance.
   352  func bitReverseCobraInPlace_9_24(v []fr.Element) {
   353  	const (
   354  		logTileSize = uint64(9)
   355  		tileSize    = uint64(1) << logTileSize
   356  		logN        = 24
   357  		logBLen     = logN - 2*logTileSize
   358  		bShift      = logBLen + logTileSize
   359  		bLen        = uint64(1) << logBLen
   360  	)
   361  
   362  	var t [tileSize * tileSize]fr.Element
   363  
   364  	for b := uint64(0); b < bLen; b++ {
   365  
   366  		for a := uint64(0); a < tileSize; a++ {
   367  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   368  			for c := uint64(0); c < tileSize; c++ {
   369  				idx := (a << bShift) | (b << logTileSize) | c
   370  				t[aRev|c] = v[idx]
   371  			}
   372  		}
   373  
   374  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   375  
   376  		for c := uint64(0); c < tileSize; c++ {
   377  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   378  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   379  				a := bits.Reverse64(aRev) >> 55
   380  				idx := (a << bShift) | (b << logTileSize) | c
   381  				idxRev := cRev | aRev
   382  				if idx < idxRev {
   383  					tIdx := (aRev << logTileSize) | c
   384  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   385  				}
   386  			}
   387  		}
   388  
   389  		for a := uint64(0); a < tileSize; a++ {
   390  			aRev := bits.Reverse64(a) >> 55
   391  			for c := uint64(0); c < tileSize; c++ {
   392  				cRev := (bits.Reverse64(c) >> 55) << bShift
   393  				idx := (a << bShift) | (b << logTileSize) | c
   394  				idxRev := cRev | bRev | aRev
   395  				if idx < idxRev {
   396  					tIdx := (aRev << logTileSize) | c
   397  					v[idx], t[tIdx] = t[tIdx], v[idx]
   398  				}
   399  			}
   400  		}
   401  	}
   402  
   403  }
   404  
   405  // bitReverseCobraInPlace_9_25 applies the bit-reversal permutation to v.
   406  // len(v) must be 1 << 25.
   407  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   408  // as it declares the t buffer and various constants statically for performance.
   409  func bitReverseCobraInPlace_9_25(v []fr.Element) {
   410  	const (
   411  		logTileSize = uint64(9)
   412  		tileSize    = uint64(1) << logTileSize
   413  		logN        = 25
   414  		logBLen     = logN - 2*logTileSize
   415  		bShift      = logBLen + logTileSize
   416  		bLen        = uint64(1) << logBLen
   417  	)
   418  
   419  	var t [tileSize * tileSize]fr.Element
   420  
   421  	for b := uint64(0); b < bLen; b++ {
   422  
   423  		for a := uint64(0); a < tileSize; a++ {
   424  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   425  			for c := uint64(0); c < tileSize; c++ {
   426  				idx := (a << bShift) | (b << logTileSize) | c
   427  				t[aRev|c] = v[idx]
   428  			}
   429  		}
   430  
   431  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   432  
   433  		for c := uint64(0); c < tileSize; c++ {
   434  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   435  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   436  				a := bits.Reverse64(aRev) >> 55
   437  				idx := (a << bShift) | (b << logTileSize) | c
   438  				idxRev := cRev | aRev
   439  				if idx < idxRev {
   440  					tIdx := (aRev << logTileSize) | c
   441  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   442  				}
   443  			}
   444  		}
   445  
   446  		for a := uint64(0); a < tileSize; a++ {
   447  			aRev := bits.Reverse64(a) >> 55
   448  			for c := uint64(0); c < tileSize; c++ {
   449  				cRev := (bits.Reverse64(c) >> 55) << bShift
   450  				idx := (a << bShift) | (b << logTileSize) | c
   451  				idxRev := cRev | bRev | aRev
   452  				if idx < idxRev {
   453  					tIdx := (aRev << logTileSize) | c
   454  					v[idx], t[tIdx] = t[tIdx], v[idx]
   455  				}
   456  			}
   457  		}
   458  	}
   459  
   460  }
   461  
   462  // bitReverseCobraInPlace_9_26 applies the bit-reversal permutation to v.
   463  // len(v) must be 1 << 26.
   464  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   465  // as it declares the t buffer and various constants statically for performance.
   466  func bitReverseCobraInPlace_9_26(v []fr.Element) {
   467  	const (
   468  		logTileSize = uint64(9)
   469  		tileSize    = uint64(1) << logTileSize
   470  		logN        = 26
   471  		logBLen     = logN - 2*logTileSize
   472  		bShift      = logBLen + logTileSize
   473  		bLen        = uint64(1) << logBLen
   474  	)
   475  
   476  	var t [tileSize * tileSize]fr.Element
   477  
   478  	for b := uint64(0); b < bLen; b++ {
   479  
   480  		for a := uint64(0); a < tileSize; a++ {
   481  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   482  			for c := uint64(0); c < tileSize; c++ {
   483  				idx := (a << bShift) | (b << logTileSize) | c
   484  				t[aRev|c] = v[idx]
   485  			}
   486  		}
   487  
   488  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   489  
   490  		for c := uint64(0); c < tileSize; c++ {
   491  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   492  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   493  				a := bits.Reverse64(aRev) >> 55
   494  				idx := (a << bShift) | (b << logTileSize) | c
   495  				idxRev := cRev | aRev
   496  				if idx < idxRev {
   497  					tIdx := (aRev << logTileSize) | c
   498  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   499  				}
   500  			}
   501  		}
   502  
   503  		for a := uint64(0); a < tileSize; a++ {
   504  			aRev := bits.Reverse64(a) >> 55
   505  			for c := uint64(0); c < tileSize; c++ {
   506  				cRev := (bits.Reverse64(c) >> 55) << bShift
   507  				idx := (a << bShift) | (b << logTileSize) | c
   508  				idxRev := cRev | bRev | aRev
   509  				if idx < idxRev {
   510  					tIdx := (aRev << logTileSize) | c
   511  					v[idx], t[tIdx] = t[tIdx], v[idx]
   512  				}
   513  			}
   514  		}
   515  	}
   516  
   517  }
   518  
   519  // bitReverseCobraInPlace_9_27 applies the bit-reversal permutation to v.
   520  // len(v) must be 1 << 27.
   521  // see bitReverseCobraInPlace for more details; this function is specialized for 9,
   522  // as it declares the t buffer and various constants statically for performance.
   523  func bitReverseCobraInPlace_9_27(v []fr.Element) {
   524  	const (
   525  		logTileSize = uint64(9)
   526  		tileSize    = uint64(1) << logTileSize
   527  		logN        = 27
   528  		logBLen     = logN - 2*logTileSize
   529  		bShift      = logBLen + logTileSize
   530  		bLen        = uint64(1) << logBLen
   531  	)
   532  
   533  	var t [tileSize * tileSize]fr.Element
   534  
   535  	for b := uint64(0); b < bLen; b++ {
   536  
   537  		for a := uint64(0); a < tileSize; a++ {
   538  			aRev := (bits.Reverse64(a) >> 55) << logTileSize
   539  			for c := uint64(0); c < tileSize; c++ {
   540  				idx := (a << bShift) | (b << logTileSize) | c
   541  				t[aRev|c] = v[idx]
   542  			}
   543  		}
   544  
   545  		bRev := (bits.Reverse64(b) >> (64 - logBLen)) << logTileSize
   546  
   547  		for c := uint64(0); c < tileSize; c++ {
   548  			cRev := ((bits.Reverse64(c) >> 55) << bShift) | bRev
   549  			for aRev := uint64(0); aRev < tileSize; aRev++ {
   550  				a := bits.Reverse64(aRev) >> 55
   551  				idx := (a << bShift) | (b << logTileSize) | c
   552  				idxRev := cRev | aRev
   553  				if idx < idxRev {
   554  					tIdx := (aRev << logTileSize) | c
   555  					v[idxRev], t[tIdx] = t[tIdx], v[idxRev]
   556  				}
   557  			}
   558  		}
   559  
   560  		for a := uint64(0); a < tileSize; a++ {
   561  			aRev := bits.Reverse64(a) >> 55
   562  			for c := uint64(0); c < tileSize; c++ {
   563  				cRev := (bits.Reverse64(c) >> 55) << bShift
   564  				idx := (a << bShift) | (b << logTileSize) | c
   565  				idxRev := cRev | bRev | aRev
   566  				if idx < idxRev {
   567  					tIdx := (aRev << logTileSize) | c
   568  					v[idx], t[tIdx] = t[tIdx], v[idx]
   569  				}
   570  			}
   571  		}
   572  	}
   573  
   574  }