/*
 * deflate_decompress.c - a decompressor for DEFLATE
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * ---------------------------------------------------------------------------
 *
 * This is a highly optimized DEFLATE decompressor.  When compiled with gcc on
 * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
 * instructions are available).  On other architectures it should still be
 * significantly faster than zlib, but the difference may be smaller.
33 * 34 * Why this is faster than zlib's implementation: 35 * 36 * - Word accesses rather than byte accesses when reading input 37 * - Word accesses rather than byte accesses when copying matches 38 * - Faster Huffman decoding combined with various DEFLATE-specific tricks 39 * - Larger bitbuffer variable that doesn't need to be filled as often 40 * - Other optimizations to remove unnecessary branches 41 * - Only full-buffer decompression is supported, so the code doesn't need to 42 * support stopping and resuming decompression. 43 * - On x86_64, compile a version of the decompression routine using BMI2 44 * instructions and use it automatically at runtime when supported. 45 */ 46 47 #include <stdlib.h> 48 #include <string.h> 49 50 #include "deflate_constants.h" 51 #include "unaligned.h" 52 53 #include "libdeflate.h" 54 55 /* 56 * If the expression passed to SAFETY_CHECK() evaluates to false, then the 57 * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the 58 * compressed data is invalid. 59 * 60 * Theoretically, these checks could be disabled for specialized applications 61 * where all input to the decompressor will be trusted. 62 */ 63 #if 0 64 # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") 65 # define SAFETY_CHECK(expr) (void)(expr) 66 #else 67 # define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA 68 #endif 69 70 /* 71 * Each TABLEBITS number is the base-2 logarithm of the number of entries in the 72 * main portion of the corresponding decode table. Each number should be large 73 * enough to ensure that for typical data, the vast majority of symbols can be 74 * decoded by a direct lookup of the next TABLEBITS bits of compressed data. 75 * However, this must be balanced against the fact that a larger table requires 76 * more memory and requires more time to fill. 
77 * 78 * Note: you cannot change a TABLEBITS number without also changing the 79 * corresponding ENOUGH number! 80 */ 81 #define PRECODE_TABLEBITS 7 82 #define LITLEN_TABLEBITS 10 83 #define OFFSET_TABLEBITS 8 84 85 /* 86 * Each ENOUGH number is the maximum number of decode table entries that may be 87 * required for the corresponding Huffman code, including the main table and all 88 * subtables. Each number depends on three parameters: 89 * 90 * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMBOLS) 91 * (2) the number of main table bits (the TABLEBITS numbers defined above) 92 * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) 93 * 94 * The ENOUGH numbers were computed using the utility program 'enough' from 95 * zlib. This program enumerates all possible relevant Huffman codes to find 96 * the worst-case usage of decode table entries. 97 */ 98 #define PRECODE_ENOUGH 128 /* enough 19 7 7 */ 99 #define LITLEN_ENOUGH 1334 /* enough 288 10 15 */ 100 #define OFFSET_ENOUGH 402 /* enough 32 8 15 */ 101 102 /* 103 * Type for codeword lengths. 104 */ 105 typedef u8 len_t; 106 107 /* 108 * The main DEFLATE decompressor structure. Since this implementation only 109 * supports full buffer decompression, this structure does not store the entire 110 * decompression state, but rather only some arrays that are too large to 111 * comfortably allocate on the stack. 112 */ 113 struct libdeflate_decompressor { 114 115 /* 116 * The arrays aren't all needed at the same time. 'precode_lens' and 117 * 'precode_decode_table' are unneeded after 'lens' has been filled. 118 * Furthermore, 'lens' need not be retained after building the litlen 119 * and offset decode tables. In fact, 'lens' can be in union with 120 * 'litlen_decode_table' provided that 'offset_decode_table' is separate 121 * and is built first. 
122 */ 123 124 union { 125 len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; 126 127 struct { 128 len_t lens[DEFLATE_NUM_LITLEN_SYMS + 129 DEFLATE_NUM_OFFSET_SYMS + 130 DEFLATE_MAX_LENS_OVERRUN]; 131 132 u32 precode_decode_table[PRECODE_ENOUGH]; 133 } l; 134 135 u32 litlen_decode_table[LITLEN_ENOUGH]; 136 } u; 137 138 u32 offset_decode_table[OFFSET_ENOUGH]; 139 140 u16 working_space[2 * (DEFLATE_MAX_CODEWORD_LEN + 1) + 141 DEFLATE_MAX_NUM_SYMS]; 142 }; 143 144 /***************************************************************************** 145 * Input bitstream * 146 *****************************************************************************/ 147 148 /* 149 * The state of the "input bitstream" consists of the following variables: 150 * 151 * - in_next: pointer to the next unread byte in the input buffer 152 * 153 * - in_end: pointer just past the end of the input buffer 154 * 155 * - bitbuf: a word-sized variable containing bits that have been read from 156 * the input buffer. The buffered bits are right-aligned 157 * (they're the low-order bits). 158 * 159 * - bitsleft: number of bits in 'bitbuf' that are valid. 160 * 161 * To make it easier for the compiler to optimize the code by keeping variables 162 * in registers, these are declared as normal variables and manipulated using 163 * macros. 164 */ 165 166 /* 167 * The type for the bitbuffer variable ('bitbuf' described above). For best 168 * performance, this should have size equal to a machine word. 169 * 170 * 64-bit platforms have a significant advantage: they get a bigger bitbuffer 171 * which they have to fill less often. 172 */ 173 typedef machine_word_t bitbuf_t; 174 175 /* 176 * Number of bits the bitbuffer variable can hold. 177 */ 178 #define BITBUF_NBITS (8 * sizeof(bitbuf_t)) 179 180 /* 181 * The maximum number of bits that can be requested to be in the bitbuffer 182 * variable. This is the maximum value of 'n' that can be passed 183 * ENSURE_BITS(n). 
184 * 185 * This not equal to BITBUF_NBITS because we never read less than one byte at a 186 * time. If the bitbuffer variable contains more than (BITBUF_NBITS - 8) bits, 187 * then we can't read another byte without first consuming some bits. So the 188 * maximum count we can ensure is (BITBUF_NBITS - 7). 189 */ 190 #define MAX_ENSURE (BITBUF_NBITS - 7) 191 192 /* 193 * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if 194 * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile 195 * time constant, then this expression will be a compile-type constant. 196 * Therefore, CAN_ENSURE() can be used choose between alternative 197 * implementations at compile time. 198 */ 199 #define CAN_ENSURE(n) ((n) <= MAX_ENSURE) 200 201 /* 202 * Fill the bitbuffer variable, reading one byte at a time. 203 * 204 * If we would overread the input buffer, we just don't read anything, leaving 205 * the bits zeroed but marking them filled. This simplifies the decompressor 206 * because it removes the need to distinguish between real overreads and 207 * overreads that occur only because of the decompressor's own lookahead. 208 * 209 * The disadvantage is that real overreads are not detected immediately. 210 * However, this is safe because the decompressor is still guaranteed to make 211 * forward progress when presented never-ending 0 bits. In an existing block 212 * output will be getting generated, whereas new blocks can only be uncompressed 213 * (since the type code for uncompressed blocks is 0), for which we check for 214 * previous overread. But even if we didn't check, uncompressed blocks would 215 * fail to validate because LEN would not equal ~NLEN. So the decompressor will 216 * eventually either detect that the output buffer is full, or detect invalid 217 * input, or finish the final block. 
218 */ 219 #define FILL_BITS_BYTEWISE() \ 220 do { \ 221 if (likely(in_next != in_end)) \ 222 bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \ 223 else \ 224 overrun_count++; \ 225 bitsleft += 8; \ 226 } while (bitsleft <= BITBUF_NBITS - 8) 227 228 /* 229 * Fill the bitbuffer variable by reading the next word from the input buffer. 230 * This can be significantly faster than FILL_BITS_BYTEWISE(). However, for 231 * this to work correctly, the word must be interpreted in little-endian format. 232 * In addition, the memory access may be unaligned. Therefore, this method is 233 * most efficient on little-endian architectures that support fast unaligned 234 * access, such as x86 and x86_64. 235 */ 236 #define FILL_BITS_WORDWISE() \ 237 do { \ 238 bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ 239 in_next += (BITBUF_NBITS - bitsleft) >> 3; \ 240 bitsleft += (BITBUF_NBITS - bitsleft) & ~7; \ 241 } while (0) 242 243 /* 244 * Does the bitbuffer variable currently contain at least 'n' bits? 245 */ 246 #define HAVE_BITS(n) (bitsleft >= (n)) 247 248 /* 249 * Load more bits from the input buffer until the specified number of bits is 250 * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE 251 * and CAN_ENSURE(). 252 */ 253 #define ENSURE_BITS(n) \ 254 if (!HAVE_BITS(n)) { \ 255 if (CPU_IS_LITTLE_ENDIAN() && \ 256 UNALIGNED_ACCESS_IS_FAST && \ 257 likely(in_end - in_next >= sizeof(bitbuf_t))) \ 258 FILL_BITS_WORDWISE(); \ 259 else \ 260 FILL_BITS_BYTEWISE(); \ 261 } 262 263 /* 264 * Return the next 'n' bits from the bitbuffer variable without removing them. 265 */ 266 #define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1)) 267 268 /* 269 * Remove the next 'n' bits from the bitbuffer variable. 270 */ 271 #define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n)) 272 273 /* 274 * Remove and return the next 'n' bits from the bitbuffer variable. 
275 */ 276 #define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32) 277 278 /* 279 * Verify that the input buffer hasn't been overread, then align the input to 280 * the next byte boundary, discarding any remaining bits in the current byte. 281 * 282 * Note that if the bitbuffer variable currently contains more than 7 bits, then 283 * we must rewind 'in_next', effectively putting those bits back. Only the bits 284 * in what would be the "current" byte if we were reading one byte at a time can 285 * be actually discarded. 286 */ 287 #define ALIGN_INPUT() \ 288 do { \ 289 SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \ 290 in_next -= (bitsleft >> 3) - overrun_count; \ 291 overrun_count = 0; \ 292 bitbuf = 0; \ 293 bitsleft = 0; \ 294 } while(0) 295 296 /* 297 * Read a 16-bit value from the input. This must have been preceded by a call 298 * to ALIGN_INPUT(), and the caller must have already checked for overrun. 299 */ 300 #define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16) 301 302 /***************************************************************************** 303 * Huffman decoding * 304 *****************************************************************************/ 305 306 /* 307 * A decode table for order TABLEBITS consists of a main table of (1 << 308 * TABLEBITS) entries followed by a variable number of subtables. 309 * 310 * The decoding algorithm takes the next TABLEBITS bits of compressed data and 311 * uses them as an index into the decode table. The resulting entry is either a 312 * "direct entry", meaning that it contains the value desired, or a "subtable 313 * pointer", meaning that the entry references a subtable that must be indexed 314 * using more bits of the compressed data to decode the symbol. 315 * 316 * Each decode table (a main table along with with its subtables, if any) is 317 * associated with a Huffman code. 
Logically, the result of a decode table 318 * lookup is a symbol from the alphabet from which the corresponding Huffman 319 * code was constructed. A symbol with codeword length n <= TABLEBITS is 320 * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a 321 * symbol with codeword length n > TABLEBITS is associated with one or more 322 * subtable entries. 323 * 324 * On top of this basic design, we implement several optimizations: 325 * 326 * - We store the length of each codeword directly in each of its decode table 327 * entries. This allows the codeword length to be produced without indexing 328 * an additional table. 329 * 330 * - When beneficial, we don't store the Huffman symbol itself, but instead data 331 * generated from it. For example, when decoding an offset symbol in DEFLATE, 332 * it's more efficient if we can decode the offset base and number of extra 333 * offset bits directly rather than decoding the offset symbol and then 334 * looking up both of those values in an additional table or tables. 335 * 336 * The size of each decode table entry is 32 bits, which provides slightly 337 * better performance than 16-bit entries on 32 and 64 bit processers, provided 338 * that the table doesn't get so large that it takes up too much memory and 339 * starts generating cache misses. The bits of each decode table entry are 340 * defined as follows: 341 * 342 * - Bits 30 -- 31: flags (see below) 343 * - Bits 8 -- 29: decode result: a Huffman symbol or related data 344 * - Bits 0 -- 7: codeword length 345 */ 346 347 /* 348 * This flag is set in all main decode table entries that represent subtable 349 * pointers. 350 */ 351 #define HUFFDEC_SUBTABLE_POINTER 0x80000000 352 353 /* 354 * This flag is set in all entries in the litlen decode table that represent 355 * literals. 356 */ 357 #define HUFFDEC_LITERAL 0x40000000 358 359 /* Mask for extracting the codeword length from a decode table entry. 
*/ 360 #define HUFFDEC_LENGTH_MASK 0xFF 361 362 /* Shift to extract the decode result from a decode table entry. */ 363 #define HUFFDEC_RESULT_SHIFT 8 364 365 /* The decode result for each precode symbol. There is no special optimization 366 * for the precode; the decode result is simply the symbol value. */ 367 static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = { 368 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 369 }; 370 371 /* The decode result for each litlen symbol. For literals, this is the literal 372 * value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length 373 * base and the number of extra length bits. */ 374 static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = { 375 #define ENTRY(literal) ((HUFFDEC_LITERAL >> HUFFDEC_RESULT_SHIFT) | (literal)) 376 377 /* Literals */ 378 ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , 379 ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , 380 ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , 381 ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , 382 ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , 383 ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , 384 ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , 385 ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , 386 ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , 387 ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , 388 ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , 389 ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , 390 ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , 391 ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , 392 ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , 393 ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , 394 ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , 395 ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , 396 ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , 397 ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , 398 ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , 399 ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , 400 
ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , 401 ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , 402 ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , 403 ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , 404 ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , 405 ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , 406 ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , 407 ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , 408 ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , 409 ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , 410 ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , 411 ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , 412 ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , 413 ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , 414 ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , 415 ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , 416 ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , 417 ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , 418 ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , 419 ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , 420 ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , 421 ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , 422 ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , 423 ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , 424 ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , 425 ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , 426 ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , 427 ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , 428 ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , 429 ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , 430 ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , 431 ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , 432 ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , 433 ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , 434 ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , 435 ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , 
436 ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , 437 ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , 438 ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , 439 ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , 440 ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , 441 ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , 442 #undef ENTRY 443 444 #define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF 445 #define HUFFDEC_LENGTH_BASE_SHIFT 8 446 #define HUFFDEC_END_OF_BLOCK_LENGTH 0 447 448 #define ENTRY(length_base, num_extra_bits) \ 449 (((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits)) 450 451 /* End of block */ 452 ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), 453 454 /* Lengths */ 455 ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), 456 ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), 457 ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), 458 ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), 459 ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), 460 ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), 461 ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), 462 ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , 463 #undef ENTRY 464 }; 465 466 /* The decode result for each offset symbol. This is the offset base and the 467 * number of extra offset bits. 
*/ 468 static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = { 469 470 #define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 471 #define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) 472 473 #define ENTRY(offset_base, num_extra_bits) \ 474 ((offset_base) | ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT)) 475 ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , 476 ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , 477 ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , 478 ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , 479 ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , 480 ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , 481 ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , 482 ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , 483 #undef ENTRY 484 }; 485 486 /* Construct a decode table entry from a decode result and codeword length. */ 487 static forceinline u32 488 make_decode_table_entry(u32 result, u32 length) 489 { 490 return (result << HUFFDEC_RESULT_SHIFT) | length; 491 } 492 493 /* 494 * Build a table for fast decoding of symbols from a Huffman code. As input, 495 * this function takes the codeword length of each symbol which may be used in 496 * the code. As output, it produces a decode table for the canonical Huffman 497 * code described by the codeword lengths. The decode table is built with the 498 * assumption that it will be indexed with "bit-reversed" codewords, where the 499 * low-order bit is the first bit of the codeword. This format is used for all 500 * Huffman codes in DEFLATE. 501 * 502 * @decode_table 503 * The array in which the decode table will be generated. This array must 504 * have sufficient length; see the definition of the ENOUGH numbers. 
505 * @lens 506 * An array which provides, for each symbol, the length of the 507 * corresponding codeword in bits, or 0 if the symbol is unused. This may 508 * alias @decode_table, since nothing is written to @decode_table until all 509 * @lens have been consumed. All codeword lengths are assumed to be <= 510 * @max_codeword_len but are otherwise considered untrusted. If they do 511 * not form a valid Huffman code, then the decode table is not built and 512 * %false is returned. 513 * @num_syms 514 * The number of symbols in the code, including all unused symbols. 515 * @decode_results 516 * An array which provides, for each symbol, the actual value to store into 517 * the decode table. This value will be directly produced as the result of 518 * decoding that symbol, thereby moving the indirection out of the decode 519 * loop and into the table initialization. 520 * @table_bits 521 * The log base-2 of the number of main table entries to use. 522 * @max_codeword_len 523 * The maximum allowed codeword length for this Huffman code. 524 * @working_space 525 * A temporary array of length '2 * (@max_codeword_len + 1) + @num_syms'. 526 * 527 * Returns %true if successful; %false if the codeword lengths do not form a 528 * valid Huffman code. 
529 */ 530 static bool 531 build_decode_table(u32 decode_table[], 532 const len_t lens[], 533 const unsigned num_syms, 534 const u32 decode_results[], 535 const unsigned table_bits, 536 const unsigned max_codeword_len, 537 u16 working_space[]) 538 { 539 u16 * const len_counts = &working_space[0]; 540 u16 * const offsets = &working_space[1 * (max_codeword_len + 1)]; 541 u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)]; 542 unsigned len; 543 unsigned sym; 544 s32 remainder; 545 unsigned sym_idx; 546 unsigned codeword_len; 547 unsigned codeword_reversed = 0; 548 unsigned cur_codeword_prefix = -1; 549 unsigned cur_table_start = 0; 550 unsigned cur_table_bits = table_bits; 551 unsigned num_dropped_bits = 0; 552 const unsigned table_mask = (1U << table_bits) - 1; 553 554 /* Count how many symbols have each codeword length, including 0. */ 555 for (len = 0; len <= max_codeword_len; len++) 556 len_counts[len] = 0; 557 for (sym = 0; sym < num_syms; sym++) 558 len_counts[lens[sym]]++; 559 560 /* Sort the symbols primarily by increasing codeword length and 561 * secondarily by increasing symbol value. */ 562 563 /* Initialize 'offsets' so that offsets[len] is the number of codewords 564 * shorter than 'len' bits, including length 0. */ 565 offsets[0] = 0; 566 for (len = 0; len < max_codeword_len; len++) 567 offsets[len + 1] = offsets[len] + len_counts[len]; 568 569 /* Use the 'offsets' array to sort the symbols. */ 570 for (sym = 0; sym < num_syms; sym++) 571 sorted_syms[offsets[lens[sym]]++] = sym; 572 573 /* It is already guaranteed that all lengths are <= max_codeword_len, 574 * but it cannot be assumed they form a complete prefix code. A 575 * codeword of length n should require a proportion of the codespace 576 * equaling (1/2)^n. The code is complete if and only if, by this 577 * measure, the codespace is exactly filled by the lengths. 
*/ 578 remainder = 1; 579 for (len = 1; len <= max_codeword_len; len++) { 580 remainder <<= 1; 581 remainder -= len_counts[len]; 582 if (unlikely(remainder < 0)) { 583 /* The lengths overflow the codespace; that is, the code 584 * is over-subscribed. */ 585 return false; 586 } 587 } 588 589 if (unlikely(remainder != 0)) { 590 /* The lengths do not fill the codespace; that is, they form an 591 * incomplete code. */ 592 593 /* Initialize the table entries to default values. When 594 * decompressing a well-formed stream, these default values will 595 * never be used. But since a malformed stream might contain 596 * any bits at all, these entries need to be set anyway. */ 597 u32 entry = make_decode_table_entry(decode_results[0], 1); 598 for (sym = 0; sym < (1U << table_bits); sym++) 599 decode_table[sym] = entry; 600 601 /* A completely empty code is permitted. */ 602 if (remainder == (1U << max_codeword_len)) 603 return true; 604 605 /* The code is nonempty and incomplete. Proceed only if there 606 * is a single used symbol and its codeword has length 1. The 607 * DEFLATE RFC is somewhat unclear regarding this case. What 608 * zlib's decompressor does is permit this case for 609 * literal/length and offset codes and assume the codeword is 0 610 * rather than 1. We do the same except we allow this case for 611 * precodes too. */ 612 if (remainder != (1U << (max_codeword_len - 1)) || 613 len_counts[1] != 1) 614 return false; 615 } 616 617 /* Generate the decode table entries. Since we process codewords from 618 * shortest to longest, the main portion of the decode table is filled 619 * first; then the subtables are filled. Note that it's already been 620 * verified that the code is nonempty and not over-subscribed. */ 621 622 /* Start with the smallest codeword length and the smallest-valued 623 * symbol which has that codeword length. 
*/ 624 sym_idx = offsets[0]; 625 codeword_len = 1; 626 while (len_counts[codeword_len] == 0) 627 codeword_len++; 628 629 for (;;) { /* For each used symbol and its codeword... */ 630 unsigned sym; 631 u32 entry; 632 unsigned i; 633 unsigned end; 634 unsigned increment; 635 unsigned bit; 636 637 /* Get the next symbol. */ 638 sym = sorted_syms[sym_idx]; 639 640 /* Start a new subtable if the codeword is long enough to 641 * require a subtable, *and* the first 'table_bits' bits of the 642 * codeword don't match the prefix for the previous subtable if 643 * any. */ 644 if (codeword_len > table_bits && 645 (codeword_reversed & table_mask) != cur_codeword_prefix) { 646 647 cur_codeword_prefix = (codeword_reversed & table_mask); 648 649 cur_table_start += 1U << cur_table_bits; 650 651 /* Calculate the subtable length. If the codeword 652 * length exceeds 'table_bits' by n, the subtable needs 653 * at least 2**n entries. But it may need more; if 654 * there are fewer than 2**n codewords of length 655 * 'table_bits + n' remaining, then n will need to be 656 * incremented to bring in longer codewords until the 657 * subtable can be filled completely. Note that it 658 * always will, eventually, be possible to fill the 659 * subtable, since the only case where we may have an 660 * incomplete code is a single codeword of length 1, 661 * and that never requires any subtables. */ 662 cur_table_bits = codeword_len - table_bits; 663 remainder = (s32)1 << cur_table_bits; 664 for (;;) { 665 remainder -= len_counts[table_bits + 666 cur_table_bits]; 667 if (remainder <= 0) 668 break; 669 cur_table_bits++; 670 remainder <<= 1; 671 } 672 673 /* Create the entry that points from the main table to 674 * the subtable. This entry contains the index of the 675 * start of the subtable and the number of bits with 676 * which the subtable is indexed (the log base 2 of the 677 * number of entries it contains). 
*/ 678 decode_table[cur_codeword_prefix] = 679 HUFFDEC_SUBTABLE_POINTER | 680 make_decode_table_entry(cur_table_start, 681 cur_table_bits); 682 683 /* Now that we're filling a subtable, we need to drop 684 * the first 'table_bits' bits of the codewords. */ 685 num_dropped_bits = table_bits; 686 } 687 688 /* Create the decode table entry, which packs the decode result 689 * and the codeword length (minus 'table_bits' for subtables) 690 * together. */ 691 entry = make_decode_table_entry(decode_results[sym], 692 codeword_len - num_dropped_bits); 693 694 /* Fill in as many copies of the decode table entry as are 695 * needed. The number of entries to fill is a power of 2 and 696 * depends on the codeword length; it could be as few as 1 or as 697 * large as half the size of the table. Since the codewords are 698 * bit-reversed, the indices to fill are those with the codeword 699 * in its low bits; it's the high bits that vary. */ 700 i = cur_table_start + (codeword_reversed >> num_dropped_bits); 701 end = cur_table_start + (1U << cur_table_bits); 702 increment = 1U << (codeword_len - num_dropped_bits); 703 do { 704 decode_table[i] = entry; 705 i += increment; 706 } while (i < end); 707 708 /* Advance to the next codeword by incrementing it. But since 709 * our codewords are bit-reversed, we must manipulate the bits 710 * ourselves rather than simply adding 1. */ 711 bit = 1U << (codeword_len - 1); 712 while (codeword_reversed & bit) 713 bit >>= 1; 714 codeword_reversed &= bit - 1; 715 codeword_reversed |= bit; 716 717 /* Advance to the next symbol. This will either increase the 718 * codeword length, or keep the same codeword length but 719 * increase the symbol value. Note: since we are using 720 * bit-reversed codewords, we don't need to explicitly append 721 * zeroes to the codeword when the codeword length increases. 
*/ 722 if (++sym_idx == num_syms) 723 return true; 724 len_counts[codeword_len]--; 725 while (len_counts[codeword_len] == 0) 726 codeword_len++; 727 } 728 } 729 730 /* Build the decode table for the precode. */ 731 static bool 732 build_precode_decode_table(struct libdeflate_decompressor *d) 733 { 734 /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ 735 STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); 736 737 return build_decode_table(d->u.l.precode_decode_table, 738 d->u.precode_lens, 739 DEFLATE_NUM_PRECODE_SYMS, 740 precode_decode_results, 741 PRECODE_TABLEBITS, 742 DEFLATE_MAX_PRE_CODEWORD_LEN, 743 d->working_space); 744 } 745 746 /* Build the decode table for the literal/length code. */ 747 static bool 748 build_litlen_decode_table(struct libdeflate_decompressor *d, 749 unsigned num_litlen_syms, unsigned num_offset_syms) 750 { 751 /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ 752 STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334); 753 754 return build_decode_table(d->u.litlen_decode_table, 755 d->u.l.lens, 756 num_litlen_syms, 757 litlen_decode_results, 758 LITLEN_TABLEBITS, 759 DEFLATE_MAX_LITLEN_CODEWORD_LEN, 760 d->working_space); 761 } 762 763 /* Build the decode table for the offset code. */ 764 static bool 765 build_offset_decode_table(struct libdeflate_decompressor *d, 766 unsigned num_litlen_syms, unsigned num_offset_syms) 767 { 768 /* When you change TABLEBITS, you must change ENOUGH, and vice versa! 
*/ 769 STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); 770 771 return build_decode_table(d->offset_decode_table, 772 d->u.l.lens + num_litlen_syms, 773 num_offset_syms, 774 offset_decode_results, 775 OFFSET_TABLEBITS, 776 DEFLATE_MAX_OFFSET_CODEWORD_LEN, 777 d->working_space); 778 } 779 780 static forceinline machine_word_t 781 repeat_byte(u8 b) 782 { 783 machine_word_t v; 784 785 STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); 786 787 v = b; 788 v |= v << 8; 789 v |= v << 16; 790 v |= v << ((WORDBITS == 64) ? 32 : 0); 791 return v; 792 } 793 794 static forceinline void 795 copy_word_unaligned(const void *src, void *dst) 796 { 797 store_word_unaligned(load_word_unaligned(src), dst); 798 } 799 800 /***************************************************************************** 801 * Main decompression routine 802 *****************************************************************************/ 803 804 typedef enum libdeflate_result (*decompress_func_t) 805 (struct libdeflate_decompressor * restrict d, 806 const void * restrict in, size_t in_nbytes, 807 void * restrict out, size_t out_nbytes_avail, 808 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); 809 810 #undef DEFAULT_IMPL 811 #undef DISPATCH 812 #if defined(__i386__) || defined(__x86_64__) 813 # include "decompress_impl.h" 814 #endif 815 816 #ifndef DEFAULT_IMPL 817 # define FUNCNAME deflate_decompress_default 818 # define ATTRIBUTES 819 # include "decompress_template.h" 820 # define DEFAULT_IMPL deflate_decompress_default 821 #endif 822 823 #ifdef DISPATCH 824 static enum libdeflate_result 825 dispatch(struct libdeflate_decompressor * restrict d, 826 const void * restrict in, size_t in_nbytes, 827 void * restrict out, size_t out_nbytes_avail, 828 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); 829 830 static volatile decompress_func_t decompress_impl = dispatch; 831 832 /* Choose the fastest implementation at runtime */ 833 static enum libdeflate_result 834 dispatch(struct 
libdeflate_decompressor * restrict d, 835 const void * restrict in, size_t in_nbytes, 836 void * restrict out, size_t out_nbytes_avail, 837 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) 838 { 839 decompress_func_t f = arch_select_decompress_func(); 840 841 if (f == NULL) 842 f = DEFAULT_IMPL; 843 844 decompress_impl = f; 845 return (*f)(d, in, in_nbytes, out, out_nbytes_avail, 846 actual_in_nbytes_ret, actual_out_nbytes_ret); 847 } 848 #else 849 # define decompress_impl DEFAULT_IMPL /* only one implementation, use it */ 850 #endif 851 852 853 /* 854 * This is the main DEFLATE decompression routine. See libdeflate.h for the 855 * documentation. 856 * 857 * Note that the real code is in decompress_template.h. The part here just 858 * handles calling the appropriate implementation depending on the CPU features 859 * at runtime. 860 */ 861 LIBDEFLATEAPI enum libdeflate_result 862 libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d, 863 const void * restrict in, size_t in_nbytes, 864 void * restrict out, size_t out_nbytes_avail, 865 size_t *actual_in_nbytes_ret, 866 size_t *actual_out_nbytes_ret) 867 { 868 return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, 869 actual_in_nbytes_ret, actual_out_nbytes_ret); 870 } 871 872 LIBDEFLATEAPI enum libdeflate_result 873 libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d, 874 const void * restrict in, size_t in_nbytes, 875 void * restrict out, size_t out_nbytes_avail, 876 size_t *actual_out_nbytes_ret) 877 { 878 return libdeflate_deflate_decompress_ex(d, in, in_nbytes, 879 out, out_nbytes_avail, 880 NULL, actual_out_nbytes_ret); 881 } 882 883 LIBDEFLATEAPI struct libdeflate_decompressor * 884 libdeflate_alloc_decompressor(void) 885 { 886 return malloc(sizeof(struct libdeflate_decompressor)); 887 } 888 889 LIBDEFLATEAPI void 890 libdeflate_free_decompressor(struct libdeflate_decompressor *d) 891 { 892 free(d); 893 }