git.sr.ht/~pingoo/stdx@v0.0.0-20240218134121-094174641f6e/crypto/internal/blake3/compress_amd64.go

package blake3

import "unsafe"

//go:generate go run avo/gen.go -out blake3_amd64.s

// The functions below are implemented in assembly (blake3_amd64.s, generated
// by avo via the go:generate directive above).

//go:noescape
func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

//go:noescape
func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)

//go:noescape
func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

//go:noescape
func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)

//go:noescape
func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)

// compressNode runs the generic compression function on n and returns all 16
// output words.
func compressNode(n node) (out [16]uint32) {
	compressNodeGeneric(&out, n)
	return
}

// compressBufferAVX512 hashes up to 16 chunks with the AVX-512 kernel,
// compresses any trailing partial chunk with the portable code, and merges
// the resulting chaining values into a single node.
func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	var cvs [maxSIMD][8]uint32
	compressChunksAVX512(&cvs, buf, key, counter, flags)
	numChunks := uint64(buflen / chunkSize)
	if buflen%chunkSize != 0 {
		// use non-asm for remainder
		partialChunk := buf[buflen-buflen%chunkSize : buflen]
		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
		numChunks++
	}
	return mergeSubtrees(&cvs, numChunks, key, flags)
}

// compressBufferAVX2 hashes the buffer 8 chunks at a time with the AVX2
// kernel, compresses any trailing partial chunk with the portable code, and
// merges the resulting chaining values into a single node.
func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	var cvs [maxSIMD][8]uint32
	cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
	bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf))
	compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
	numChunks := uint64(buflen / chunkSize)
	if numChunks > 8 {
		compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
	}
	if buflen%chunkSize != 0 {
		// use non-asm for remainder
		partialChunk := buf[buflen-buflen%chunkSize : buflen]
		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
		numChunks++
	}
	return mergeSubtrees(&cvs, numChunks, key, flags)
}

// compressBuffer reduces a buffer of up to maxSIMD chunks to a single node,
// dispatching to the fastest implementation the CPU supports and falling back
// to the portable code for small inputs.
func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
	switch {
	case haveAVX512 && buflen >= chunkSize*2:
		return compressBufferAVX512(buf, buflen, key, counter, flags)
	case haveAVX2 && buflen >= chunkSize*2:
		return compressBufferAVX2(buf, buflen, key, counter, flags)
	default:
		return compressBufferGeneric(buf, buflen, key, counter, flags)
	}
}

// compressChunk compresses a chunk of up to chunkSize bytes, returning the
// node of its final (possibly partial) block with flagChunkEnd set.
func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
	n := node{
		cv:       *key,
		counter:  counter,
		blockLen: blockSize,
		flags:    flags | flagChunkStart,
	}
	blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:]
	for len(chunk) > blockSize {
		copy(blockBytes, chunk)
		chunk = chunk[blockSize:]
		n.cv = chainingValue(n)
		n.flags &^= flagChunkStart
	}
	// pad last block with zeros
	n.block = [16]uint32{}
	copy(blockBytes, chunk)
	n.blockLen = uint32(len(chunk))
	n.flags |= flagChunkEnd
	return n
}

// hashBlock hashes a single block of at most 64 bytes as a standalone root
// node keyed with the default IV, writing the 64-byte output to out.
func hashBlock(out *[64]byte, buf []byte) {
	var block [16]uint32
	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
	compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{
		cv:       iv,
		block:    block,
		blockLen: uint32(len(buf)),
		flags:    flagChunkStart | flagChunkEnd | flagRoot,
	})
}

// compressBlocks writes maxSIMD consecutive 64-byte outputs of n (counters
// n.counter through n.counter+maxSIMD-1) into out.
func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
	switch {
	case haveAVX512:
		compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
	case haveAVX2:
		outs := (*[2][512]byte)(unsafe.Pointer(out))
		compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags)
		compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags)
	default:
		outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out))
		compressBlocksGeneric(outs, n)
	}
}

// mergeSubtrees repeatedly combines pairs of chaining values into parent
// chaining values (8 pairs at a time with AVX2) until only two remain, then
// returns their parent node. An odd trailing CV is carried up to the next
// level unchanged.
func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
	if !haveAVX2 {
		return mergeSubtreesGeneric(cvs, numCVs, key, flags)
	}
	for numCVs > 2 {
		if numCVs%2 == 0 {
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
		} else {
			keep := cvs[numCVs-1]
			compressParentsAVX2((*[8][8]uint32)(unsafe.Pointer(cvs)), cvs, key, flags)
			cvs[numCVs/2] = keep
			numCVs++
		}
		numCVs /= 2
	}
	return parentNode(cvs[0], cvs[1], *key, flags)
}

// wordsToBytes copies 16 output words into a 64-byte block via an unsafe
// little-endian reinterpretation (valid on amd64).
func wordsToBytes(words [16]uint32, block *[64]byte) {
	*block = *(*[64]byte)(unsafe.Pointer(&words))
}

// bytesToCV reinterprets the first 32 bytes of b as a chaining value.
func bytesToCV(b []byte) [8]uint32 {
	return *(*[8]uint32)(unsafe.Pointer(&b[0]))
}

// cvToBytes reinterprets a chaining value as its 32-byte little-endian
// encoding.
func cvToBytes(cv *[8]uint32) *[32]byte {
	return (*[32]byte)(unsafe.Pointer(cv))
}
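
// Usage sketch (illustrative only; the buffer management shown here is an
// assumption about the calling code, not part of this file). A caller that
// has filled a SIMD buffer can reduce it to a digest roughly as follows,
// using only identifiers declared in this package:
//
//	var buf [maxSIMD * chunkSize]byte
//	n := copy(buf[:], data)                    // data assumed to fit in one buffer
//	root := compressBuffer(&buf, n, &iv, 0, 0) // unkeyed hash: key = iv, flags = 0
//	root.flags |= flagRoot                     // the caller finalizes the root node
//	words := compressNode(root)                // first 8 words form the 256-bit digest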