git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/internal/blake3/compress_amd64.go

package blake3

import "unsafe"

//go:generate go run avo/gen.go -out blake3_amd64.s

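// The following functions are implemented in assembly (blake3_amd64.s,
// generated from avo/gen.go via the go:generate directive above).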
//go:noescape
func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

//go:noescape
func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

//go:noescape
func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

//go:noescape
func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

//go:noescape
func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)

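// compressNode compresses a node into a 16-word output.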
func compressNode(n node) (out [16]uint32) {
	compressNodeGeneric(&out, n)
	return
}

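// compressBufferAVX512 compresses buf with the 16-way AVX-512 kernel. The
// chaining value of a trailing partial chunk is recomputed with the portable
// compressChunk, and the first numChunks chaining values are then merged into
// a single node.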
func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	var cvs [maxSIMD][8]uint32
	compressChunksAVX512(&cvs, buf, key, counter, flags)
	numChunks := uint64(buflen / chunkSize)
	if buflen%chunkSize != 0 {
		// use non-asm for remainder
		partialChunk := buf[buflen-buflen%chunkSize : buflen]
		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
		numChunks++
	}
	return mergeSubtrees(&cvs, numChunks, key, flags)
}

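// compressBufferAVX2 compresses buf with the 8-way AVX2 kernel, one half of
// the buffer at a time (the second half only when more than 8 full chunks are
// present). A trailing partial chunk is handled with the portable
// compressChunk, and the chaining values are merged into a single node.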
func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	var cvs [maxSIMD][8]uint32
	cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
	bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf))
	compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
	numChunks := uint64(buflen / chunkSize)
	if numChunks > 8 {
		compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
	}
	if buflen%chunkSize != 0 {
		// use non-asm for remainder
		partialChunk := buf[buflen-buflen%chunkSize : buflen]
		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
		numChunks++
	}
	return mergeSubtrees(&cvs, numChunks, key, flags)
}

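// compressBuffer dispatches to the fastest available implementation; the SIMD
// paths are used only when the CPU supports them and buf holds at least two
// full chunks.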
func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	switch {
	case haveAVX512 && buflen >= chunkSize*2:
		return compressBufferAVX512(buf, buflen, key, counter, flags)
	case haveAVX2 && buflen >= chunkSize*2:
		return compressBufferAVX2(buf, buflen, key, counter, flags)
	default:
		return compressBufferGeneric(buf, buflen, key, counter, flags)
	}
}

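// compressChunk compresses a single chunk, returning its final (uncompressed)
// node.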
func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
	n := node{
		cv:       *key,
		counter:  counter,
		blockLen: blockSize,
		flags:    flags | flagChunkStart,
	}
	blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:]
	for len(chunk) > blockSize {
		copy(blockBytes, chunk)
		chunk = chunk[blockSize:]
		n.cv = chainingValue(n)
		n.flags &^= flagChunkStart
	}
	// pad last block with zeros
	n.block = [16]uint32{}
	copy(blockBytes, chunk)
	n.blockLen = uint32(len(chunk))
	n.flags |= flagChunkEnd
	return n
}

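// hashBlock hashes a single block of input (at most 64 bytes) as a root node,
// writing the 64-byte result to out.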
func hashBlock(out *[64]byte, buf []byte) {
	var block [16]uint32
	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
	compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{
		cv:       iv,
		block:    block,
		blockLen: uint32(len(buf)),
		flags:    flagChunkStart | flagChunkEnd | flagRoot,
	})
}

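// compressBlocks compresses maxSIMD copies of n with successive counter
// values, writing the results to out.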
func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
	switch {
	case haveAVX512:
		compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
	case haveAVX2:
		outs := (*[2][512]byte)(unsafe.Pointer(out))
		compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags)
		compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags)
	default:
		outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out))
		compressBlocksGeneric(outs, n)
	}
}

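// mergeSubtrees merges the first numCVs chaining values in cvs into a single
// parent node. When AVX2 is available, pairs are merged 8 at a time; an odd
// chaining value is carried up to the next level unmerged.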
func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
	if !haveAVX2 {
		return mergeSubtreesGeneric(cvs, numCVs, key, flags)
	}
	for numCVs > 2 {
		if numCVs%2 == 0 {
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
		} else {
			keep := cvs[numCVs-1]
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
			cvs[numCVs/2] = keep
			numCVs++
		}
		numCVs /= 2
	}
	return parentNode(cvs[0], cvs[1], *key, flags)
}

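// wordsToBytes reinterprets a 16-word block as its 64-byte little-endian
// encoding (amd64 is little-endian, so no byte swapping is needed).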
func wordsToBytes(words [16]uint32, block *[64]byte) {
	*block = *(*[64]byte)(unsafe.Pointer(&words))
}

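// bytesToCV loads an 8-word chaining value from the first 32 bytes of b.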
func bytesToCV(b []byte) [8]uint32 {
	return *(*[8]uint32)(unsafe.Pointer(&b[0]))
}

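// cvToBytes reinterprets a chaining value as its 32-byte encoding.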
func cvToBytes(cv *[8]uint32) *[32]byte {
	return (*[32]byte)(unsafe.Pointer(cv))
}