github.com/grailbio/base@v0.0.11/compress/libdeflate/libdeflate.go (about) 1 // Copyright 2018 GRAIL, Inc. All rights reserved. 2 // Use of this source code is governed by the Apache-2.0 3 // license that can be found in the LICENSE file. 4 5 package libdeflate 6 7 import ( 8 "compress/gzip" 9 "encoding/binary" 10 "errors" 11 "fmt" 12 "hash/crc32" 13 "io" 14 ) 15 16 // This is a slightly modified version of klauspost/compress/gzip/gzip.go , and 17 // is designed to be a drop-in replacement for it in bgzf writer code. It may 18 // eventually go into its own sub-package, but for now I'll keep it here since 19 // bgzf is our only use case (zstd is generally a better choice for new custom 20 // formats). 21 22 // These constants are copied from klauspost/compress/gzip. The 23 // BestCompression value is technically a lie for libdeflate--it goes all the 24 // way up to 12, not 9--so I've defined an extra constant for the real highest 25 // setting. 26 const ( 27 BestSpeed = gzip.BestSpeed 28 BestCompression = gzip.BestCompression 29 BestestCompression = 12 30 DefaultCompression = gzip.DefaultCompression 31 32 // NoCompression and ConstantCompression/HuffmanOnly are not supported by 33 // this package. 34 35 DefaultBufCap = 0x10000 36 37 gzipID1 = 0x1f 38 gzipID2 = 0x8b 39 gzipDeflate = 8 40 ) 41 42 // A Writer is an io.WriteCloser. 43 // Writes to a Writer are compressed and written to w. 44 type Writer struct { 45 gzip.Header 46 w io.Writer 47 level int 48 compressor Compressor 49 digest uint32 // CRC-32, IEEE polynomial (section 8) 50 size uint32 // Uncompressed size (section 2.3.1) 51 closed bool 52 buf []byte 53 54 // Next two fields are specific to libdeflate. 55 // NewWriter{Level}() can't accept a bufCap argument without breaking 56 // klauspost/compress/gzip compatibility, so we have bufCap default to the 57 // 65536 value needed by bgzf, export a function that changes it, and 58 // lazy-initialize buf[] so that we don't allocate a size-65536 block only to 59 // immediately throw it away if the user wants a different capacity. 60 // bufCap must be large enough to fit the entire gzip block, not just the 61 // compressed portion. 62 bufCap int 63 bufPos int 64 65 // wroteHeader has been removed since it's equivalent to (bufPos != 0). 66 67 err error 68 } 69 70 // NewWriter returns a new Writer. 71 // Writes to the returned writer are compressed and written to w. 72 // 73 // It is the caller's responsibility to call Close on the WriteCloser when 74 // done. 75 // Writes may be buffered and not flushed until Close. 76 // 77 // Callers that wish to set the fields in Writer.Header must do so before the 78 // first call to Write, Flush, or Close. 79 func NewWriter(w io.Writer) *Writer { 80 z, _ := NewWriterLevel(w, DefaultCompression) 81 return z 82 } 83 84 // NewWriterLevel is like NewWriter but specifies the compression level instead 85 // of assuming DefaultCompression. 86 // 87 // The compression level can be DefaultCompression, or any integer value 88 // between 1 and BestCompression inclusive. The error returned will be nil if 89 // the level is valid. 90 func NewWriterLevel(w io.Writer, level int) (*Writer, error) { 91 if (level < DefaultCompression) || (level == 0) || (level > BestestCompression) { 92 return nil, fmt.Errorf("libdeflate: invalid compression level: %d", level) 93 } 94 z := new(Writer) 95 96 z.bufCap = DefaultBufCap 97 98 z.init(w, level) 99 return z, nil 100 } 101 102 func (z *Writer) init(w io.Writer, level int) { 103 // compressor is now lazy-(re)initialized later. 104 105 buf := z.buf 106 bufCap := z.bufCap 107 108 *z = Writer{ 109 Header: gzip.Header{ 110 OS: 255, // unknown 111 }, 112 w: w, 113 level: level, 114 buf: buf, 115 bufCap: bufCap, 116 } 117 } 118 119 // Reset discards the Writer z's state and makes it equivalent to the result of 120 // its original state from NewWriter or NewWriterLevel, but writing to w 121 // instead. This permits reusing a Writer rather than allocating a new one. 122 // 123 // It is safe to call Reset() without Close(). In this case, *no* bytes from 124 // the previous block are written. 125 func (z *Writer) Reset(w io.Writer) { 126 z.init(w, z.level) 127 } 128 129 // SetCap changes the capacity of the final write buffer, which must fit the 130 // entire gzip block (header and footer bytes included). (libdeflate requires 131 // this value to be declared in advance.) 132 // If this function isn't called, the capacity is set to DefaultBufCap == 133 // 0x10000. 134 func (z *Writer) SetCap(newCap int) error { 135 if z.bufPos != 0 { 136 return errors.New("libdeflate.SetCap: invalid call (must immediately follow initialization/reset)") 137 } 138 if newCap < 18 { 139 // guarantee enough space for always-present header and footer bytes, so we 140 // can be slightly less paranoid with bounds-checks 141 return errors.New("libdeflate.SetCap: capacity too low") 142 } 143 z.bufCap = newCap 144 return nil 145 } 146 147 var le = binary.LittleEndian 148 149 // appendBytes appends a length-prefixed byte slice to z.buf. 150 func (z *Writer) appendBytes(b []byte) error { 151 if len(b) > 0xffff { 152 return errors.New("libdeflate.Write: extra data is too large") 153 } 154 midPos := z.bufPos + 2 155 endPos := midPos + len(b) 156 if endPos > z.bufCap { 157 return errors.New("libdeflate.Write: out of buffer space") 158 } 159 le.PutUint16(z.buf[z.bufPos:], uint16(len(b))) 160 copy(z.buf[midPos:], b) 161 z.bufPos = endPos 162 return nil 163 } 164 165 // appendString appends a UTF-8 string s in GZIP's format to z.buf. 166 // GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 167 // (Latin-1). 168 func (z *Writer) appendString(s string) (err error) { 169 // GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII. 170 needconv := false 171 for _, v := range s { 172 if v == 0 || v > 0xff { 173 return errors.New("libdeflate.Write: non-Latin-1 header string") 174 } 175 if v > 0x7f { 176 needconv = true 177 } 178 } 179 nulPos := z.bufPos 180 if needconv { 181 b := make([]byte, 0, len(s)) 182 for _, v := range s { 183 b = append(b, byte(v)) 184 } 185 nulPos += len(b) 186 if nulPos >= z.bufCap { 187 return errors.New("libdeflate.Write: out of buffer space") 188 } 189 copy(z.buf[z.bufPos:], b) 190 } else { 191 nulPos += len(s) 192 if nulPos >= z.bufCap { 193 return errors.New("libdeflate.Write: out of buffer space") 194 } 195 copy(z.buf[z.bufPos:], s) 196 } 197 // GZIP strings are NUL-terminated. 198 z.buf[nulPos] = 0 199 z.bufPos = nulPos + 1 200 return nil 201 } 202 203 // Write writes a compressed form of p to the underlying io.Writer. The 204 // compressed bytes are not necessarily flushed until the Writer is closed. 205 func (z *Writer) Write(p []byte) (int, error) { 206 if z.err != nil { 207 return 0, z.err 208 } 209 // Enforce the libdeflate constraint. 210 if z.bufPos != 0 { 211 z.err = errors.New("libdeflate.Write: only one Write operation permitted per block") 212 return 0, z.err 213 } 214 // 'Write' the GZIP header lazily. 215 if len(z.buf) < z.bufCap { 216 if cap(z.buf) < z.bufCap { 217 // Impossible to avoid zero-initialization here in Go. 218 z.buf = make([]byte, z.bufCap) 219 } else { 220 // No need to zero-reinitialize. 221 z.buf = z.buf[:z.bufCap] 222 } 223 } else if len(z.buf) > z.bufCap { 224 // Likely to be irrelevant, but may as well maintain this invariant 225 z.buf = z.buf[:z.bufCap] 226 } 227 z.buf[0] = gzipID1 228 z.buf[1] = gzipID2 229 z.buf[2] = gzipDeflate 230 z.buf[3] = 0 231 if z.Extra != nil { 232 z.buf[3] |= 0x04 233 } 234 if z.Name != "" { 235 z.buf[3] |= 0x08 236 } 237 if z.Comment != "" { 238 z.buf[3] |= 0x10 239 } 240 le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix())) 241 if z.level >= BestCompression { 242 // Reasonable to set this for any level in 9..12. 243 z.buf[8] = 2 244 } else if z.level == BestSpeed { 245 z.buf[8] = 4 246 } else { 247 z.buf[8] = 0 248 } 249 z.buf[9] = z.OS 250 z.bufPos = 10 251 if z.Extra != nil { 252 z.err = z.appendBytes(z.Extra) 253 if z.err != nil { 254 return 0, z.err 255 } 256 } 257 if z.Name != "" { 258 z.err = z.appendString(z.Name) 259 if z.err != nil { 260 return 0, z.err 261 } 262 } 263 if z.Comment != "" { 264 z.err = z.appendString(z.Comment) 265 if z.err != nil { 266 return 0, z.err 267 } 268 } 269 z.err = z.compressor.Init(z.level) 270 if z.err != nil { 271 return 0, z.err 272 } 273 z.size += uint32(len(p)) 274 z.digest = crc32.Update(z.digest, crc32.IEEETable, p) 275 276 n := z.compressor.Compress(z.buf[z.bufPos:z.bufCap-8], p) 277 z.bufPos += n 278 if n == 0 { 279 z.err = errors.New("libdeflate.Write: out of buffer space") 280 } 281 return n, z.err 282 } 283 284 // Flush() has been removed for now. 285 286 // Close closes the Writer, flushing any unwritten data to the underlying 287 // io.Writer, but does not close the underlying io.Writer. 288 func (z *Writer) Close() error { 289 if z.err != nil { 290 return z.err 291 } 292 if z.closed { 293 return nil 294 } 295 z.closed = true 296 if z.bufPos == 0 { 297 _, z.err = z.Write(nil) 298 if z.err != nil { 299 return z.err 300 } 301 } 302 // a bit inefficient to keep calling this, but given the current interface we 303 // have no choice. 304 z.compressor.Cleanup() 305 midPos := z.bufPos + 4 306 endPos := midPos + 4 307 if endPos > z.bufCap { 308 z.err = errors.New("libdeflate.Write: out of buffer space") 309 } 310 le.PutUint32(z.buf[z.bufPos:], z.digest) 311 le.PutUint32(z.buf[midPos:], z.size) 312 _, z.err = z.w.Write(z.buf[:endPos]) 313 return z.err 314 }