github.com/geraldss/go/src@v0.0.0-20210511222824-ac7d0ebfc235/compress/lzw/reader.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package lzw implements the Lempel-Ziv-Welch compressed data format, 6 // described in T. A. Welch, ``A Technique for High-Performance Data 7 // Compression'', Computer, 17(6) (June 1984), pp 8-19. 8 // 9 // In particular, it implements LZW as used by the GIF and PDF file 10 // formats, which means variable-width codes up to 12 bits and the first 11 // two non-literal codes are a clear code and an EOF code. 12 // 13 // The TIFF file format uses a similar but incompatible version of the LZW 14 // algorithm. See the golang.org/x/image/tiff/lzw package for an 15 // implementation. 16 package lzw 17 18 // TODO(nigeltao): check that PDF uses LZW in the same way as GIF, 19 // modulo LSB/MSB packing order. 20 21 import ( 22 "bufio" 23 "errors" 24 "fmt" 25 "io" 26 ) 27 28 // Order specifies the bit ordering in an LZW data stream. 29 type Order int 30 31 const ( 32 // LSB means Least Significant Bits first, as used in the GIF file format. 33 LSB Order = iota 34 // MSB means Most Significant Bits first, as used in the TIFF and PDF 35 // file formats. 36 MSB 37 ) 38 39 const ( 40 maxWidth = 12 41 decoderInvalidCode = 0xffff 42 flushBuffer = 1 << maxWidth 43 ) 44 45 // decoder is the state from which the readXxx method converts a byte 46 // stream into a code stream. 47 type decoder struct { 48 r io.ByteReader 49 bits uint32 50 nBits uint 51 width uint 52 read func(*decoder) (uint16, error) // readLSB or readMSB 53 litWidth int // width in bits of literal codes 54 err error 55 56 // The first 1<<litWidth codes are literal codes. 57 // The next two codes mean clear and EOF. 58 // Other valid codes are in the range [lo, hi] where lo := clear + 2, 59 // with the upper bound incrementing on each code seen. 60 // 61 // overflow is the code at which hi overflows the code width. It always 62 // equals 1 << width. 63 // 64 // last is the most recently seen code, or decoderInvalidCode. 65 // 66 // An invariant is that hi < overflow. 67 clear, eof, hi, overflow, last uint16 68 69 // Each code c in [lo, hi] expands to two or more bytes. For c != hi: 70 // suffix[c] is the last of these bytes. 71 // prefix[c] is the code for all but the last byte. 72 // This code can either be a literal code or another code in [lo, c). 73 // The c == hi case is a special case. 74 suffix [1 << maxWidth]uint8 75 prefix [1 << maxWidth]uint16 76 77 // output is the temporary output buffer. 78 // Literal codes are accumulated from the start of the buffer. 79 // Non-literal codes decode to a sequence of suffixes that are first 80 // written right-to-left from the end of the buffer before being copied 81 // to the start of the buffer. 82 // It is flushed when it contains >= 1<<maxWidth bytes, 83 // so that there is always room to decode an entire code. 84 output [2 * 1 << maxWidth]byte 85 o int // write index into output 86 toRead []byte // bytes to return from Read 87 } 88 89 // readLSB returns the next code for "Least Significant Bits first" data. 90 func (d *decoder) readLSB() (uint16, error) { 91 for d.nBits < d.width { 92 x, err := d.r.ReadByte() 93 if err != nil { 94 return 0, err 95 } 96 d.bits |= uint32(x) << d.nBits 97 d.nBits += 8 98 } 99 code := uint16(d.bits & (1<<d.width - 1)) 100 d.bits >>= d.width 101 d.nBits -= d.width 102 return code, nil 103 } 104 105 // readMSB returns the next code for "Most Significant Bits first" data. 106 func (d *decoder) readMSB() (uint16, error) { 107 for d.nBits < d.width { 108 x, err := d.r.ReadByte() 109 if err != nil { 110 return 0, err 111 } 112 d.bits |= uint32(x) << (24 - d.nBits) 113 d.nBits += 8 114 } 115 code := uint16(d.bits >> (32 - d.width)) 116 d.bits <<= d.width 117 d.nBits -= d.width 118 return code, nil 119 } 120 121 func (d *decoder) Read(b []byte) (int, error) { 122 for { 123 if len(d.toRead) > 0 { 124 n := copy(b, d.toRead) 125 d.toRead = d.toRead[n:] 126 return n, nil 127 } 128 if d.err != nil { 129 return 0, d.err 130 } 131 d.decode() 132 } 133 } 134 135 // decode decompresses bytes from r and leaves them in d.toRead. 136 // read specifies how to decode bytes into codes. 137 // litWidth is the width in bits of literal codes. 138 func (d *decoder) decode() { 139 // Loop over the code stream, converting codes into decompressed bytes. 140 loop: 141 for { 142 code, err := d.read(d) 143 if err != nil { 144 if err == io.EOF { 145 err = io.ErrUnexpectedEOF 146 } 147 d.err = err 148 break 149 } 150 switch { 151 case code < d.clear: 152 // We have a literal code. 153 d.output[d.o] = uint8(code) 154 d.o++ 155 if d.last != decoderInvalidCode { 156 // Save what the hi code expands to. 157 d.suffix[d.hi] = uint8(code) 158 d.prefix[d.hi] = d.last 159 } 160 case code == d.clear: 161 d.width = 1 + uint(d.litWidth) 162 d.hi = d.eof 163 d.overflow = 1 << d.width 164 d.last = decoderInvalidCode 165 continue 166 case code == d.eof: 167 d.err = io.EOF 168 break loop 169 case code <= d.hi: 170 c, i := code, len(d.output)-1 171 if code == d.hi && d.last != decoderInvalidCode { 172 // code == hi is a special case which expands to the last expansion 173 // followed by the head of the last expansion. To find the head, we walk 174 // the prefix chain until we find a literal code. 175 c = d.last 176 for c >= d.clear { 177 c = d.prefix[c] 178 } 179 d.output[i] = uint8(c) 180 i-- 181 c = d.last 182 } 183 // Copy the suffix chain into output and then write that to w. 184 for c >= d.clear { 185 d.output[i] = d.suffix[c] 186 i-- 187 c = d.prefix[c] 188 } 189 d.output[i] = uint8(c) 190 d.o += copy(d.output[d.o:], d.output[i:]) 191 if d.last != decoderInvalidCode { 192 // Save what the hi code expands to. 193 d.suffix[d.hi] = uint8(c) 194 d.prefix[d.hi] = d.last 195 } 196 default: 197 d.err = errors.New("lzw: invalid code") 198 break loop 199 } 200 d.last, d.hi = code, d.hi+1 201 if d.hi >= d.overflow { 202 if d.hi > d.overflow { 203 panic("unreachable") 204 } 205 if d.width == maxWidth { 206 d.last = decoderInvalidCode 207 // Undo the d.hi++ a few lines above, so that (1) we maintain 208 // the invariant that d.hi < d.overflow, and (2) d.hi does not 209 // eventually overflow a uint16. 210 d.hi-- 211 } else { 212 d.width++ 213 d.overflow = 1 << d.width 214 } 215 } 216 if d.o >= flushBuffer { 217 break 218 } 219 } 220 // Flush pending output. 221 d.toRead = d.output[:d.o] 222 d.o = 0 223 } 224 225 var errClosed = errors.New("lzw: reader/writer is closed") 226 227 func (d *decoder) Close() error { 228 d.err = errClosed // in case any Reads come along 229 return nil 230 } 231 232 // NewReader creates a new io.ReadCloser. 233 // Reads from the returned io.ReadCloser read and decompress data from r. 234 // If r does not also implement io.ByteReader, 235 // the decompressor may read more data than necessary from r. 236 // It is the caller's responsibility to call Close on the ReadCloser when 237 // finished reading. 238 // The number of bits to use for literal codes, litWidth, must be in the 239 // range [2,8] and is typically 8. It must equal the litWidth 240 // used during compression. 241 func NewReader(r io.Reader, order Order, litWidth int) io.ReadCloser { 242 d := new(decoder) 243 switch order { 244 case LSB: 245 d.read = (*decoder).readLSB 246 case MSB: 247 d.read = (*decoder).readMSB 248 default: 249 d.err = errors.New("lzw: unknown order") 250 return d 251 } 252 if litWidth < 2 || 8 < litWidth { 253 d.err = fmt.Errorf("lzw: litWidth %d out of range", litWidth) 254 return d 255 } 256 if br, ok := r.(io.ByteReader); ok { 257 d.r = br 258 } else { 259 d.r = bufio.NewReader(r) 260 } 261 d.litWidth = litWidth 262 d.width = 1 + uint(litWidth) 263 d.clear = uint16(1) << uint(litWidth) 264 d.eof, d.hi = d.clear+1, d.clear+1 265 d.overflow = uint16(1) << d.width 266 d.last = decoderInvalidCode 267 268 return d 269 }