github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/deflate_decompress.c (about)

     1  /*
     2   * deflate_decompress.c - a decompressor for DEFLATE
     3   *
     4   * Copyright 2016 Eric Biggers
     5   *
     6   * Permission is hereby granted, free of charge, to any person
     7   * obtaining a copy of this software and associated documentation
     8   * files (the "Software"), to deal in the Software without
     9   * restriction, including without limitation the rights to use,
    10   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    11   * copies of the Software, and to permit persons to whom the
    12   * Software is furnished to do so, subject to the following
    13   * conditions:
    14   *
    15   * The above copyright notice and this permission notice shall be
    16   * included in all copies or substantial portions of the Software.
    17   *
    18   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    19   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    20   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    21   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    22   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    23   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    24   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    25   * OTHER DEALINGS IN THE SOFTWARE.
    26   *
    27   * ---------------------------------------------------------------------------
    28   *
    29   * This is a highly optimized DEFLATE decompressor.  When compiled with gcc on
    30   * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
    31   * instructions are available).  On other architectures it should still be
    32   * significantly faster than zlib, but the difference may be smaller.
    33   *
    34   * Why this is faster than zlib's implementation:
    35   *
    36   * - Word accesses rather than byte accesses when reading input
    37   * - Word accesses rather than byte accesses when copying matches
    38   * - Faster Huffman decoding combined with various DEFLATE-specific tricks
    39   * - Larger bitbuffer variable that doesn't need to be filled as often
    40   * - Other optimizations to remove unnecessary branches
    41   * - Only full-buffer decompression is supported, so the code doesn't need to
    42   *   support stopping and resuming decompression.
     43   * - On x86_64, a version of the decompression routine is compiled with BMI2
     44   *   instructions and used automatically at runtime when supported.
    45   */
    46  
    47  #include <stdlib.h>
    48  #include <string.h>
    49  
    50  #include "deflate_constants.h"
    51  #include "unaligned.h"
    52  
    53  #include "libdeflate.h"
    54  
    55  /*
    56   * If the expression passed to SAFETY_CHECK() evaluates to false, then the
    57   * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
    58   * compressed data is invalid.
    59   *
    60   * Theoretically, these checks could be disabled for specialized applications
    61   * where all input to the decompressor will be trusted.
    62   */
    63  #if 0
    64  #  pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
    65  #  define SAFETY_CHECK(expr)	(void)(expr)
    66  #else
    67  #  define SAFETY_CHECK(expr)	if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
    68  #endif
    69  
    70  /*
    71   * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
    72   * main portion of the corresponding decode table.  Each number should be large
    73   * enough to ensure that for typical data, the vast majority of symbols can be
    74   * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
    75   * However, this must be balanced against the fact that a larger table requires
    76   * more memory and requires more time to fill.
    77   *
    78   * Note: you cannot change a TABLEBITS number without also changing the
    79   * corresponding ENOUGH number!
    80   */
    81  #define PRECODE_TABLEBITS	7
    82  #define LITLEN_TABLEBITS	10
    83  #define OFFSET_TABLEBITS	8
    84  
    85  /*
    86   * Each ENOUGH number is the maximum number of decode table entries that may be
    87   * required for the corresponding Huffman code, including the main table and all
    88   * subtables.  Each number depends on three parameters:
    89   *
    90   *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMBOLS)
    91   *	(2) the number of main table bits (the TABLEBITS numbers defined above)
    92   *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
    93   *
    94   * The ENOUGH numbers were computed using the utility program 'enough' from
    95   * zlib.  This program enumerates all possible relevant Huffman codes to find
    96   * the worst-case usage of decode table entries.
    97   */
    98  #define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
    99  #define LITLEN_ENOUGH		1334	/* enough 288 10 15	*/
   100  #define OFFSET_ENOUGH		402	/* enough 32 8 15	*/
   101  
   102  /*
   103   * Type for codeword lengths.
   104   */
   105  typedef u8 len_t;
   106  
   107  /*
   108   * The main DEFLATE decompressor structure.  Since this implementation only
   109   * supports full buffer decompression, this structure does not store the entire
   110   * decompression state, but rather only some arrays that are too large to
   111   * comfortably allocate on the stack.
   112   */
   113  struct libdeflate_decompressor {
   114  
   115  	/*
   116  	 * The arrays aren't all needed at the same time.  'precode_lens' and
   117  	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
   118  	 * Furthermore, 'lens' need not be retained after building the litlen
   119  	 * and offset decode tables.  In fact, 'lens' can be in union with
   120  	 * 'litlen_decode_table' provided that 'offset_decode_table' is separate
   121  	 * and is built first.
   122  	 */
   123  
   124  	union {
   125  		len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS];
   126  
   127  		struct {
   128  			len_t lens[DEFLATE_NUM_LITLEN_SYMS +
   129  				   DEFLATE_NUM_OFFSET_SYMS +
   130  				   DEFLATE_MAX_LENS_OVERRUN];
   131  
   132  			u32 precode_decode_table[PRECODE_ENOUGH];
   133  		} l;
   134  
   135  		u32 litlen_decode_table[LITLEN_ENOUGH];
   136  	} u;
   137  
   138  	u32 offset_decode_table[OFFSET_ENOUGH];
   139  
   140  	u16 working_space[2 * (DEFLATE_MAX_CODEWORD_LEN + 1) +
   141  			  DEFLATE_MAX_NUM_SYMS];
   142  };
   143  
   144  /*****************************************************************************
   145   *				Input bitstream                              *
   146   *****************************************************************************/
   147  
   148  /*
   149   * The state of the "input bitstream" consists of the following variables:
   150   *
   151   *	- in_next: pointer to the next unread byte in the input buffer
   152   *
   153   *	- in_end: pointer just past the end of the input buffer
   154   *
   155   *	- bitbuf: a word-sized variable containing bits that have been read from
   156   *		  the input buffer.  The buffered bits are right-aligned
   157   *		  (they're the low-order bits).
   158   *
   159   *	- bitsleft: number of bits in 'bitbuf' that are valid.
   160   *
   161   * To make it easier for the compiler to optimize the code by keeping variables
   162   * in registers, these are declared as normal variables and manipulated using
   163   * macros.
   164   */
   165  
   166  /*
   167   * The type for the bitbuffer variable ('bitbuf' described above).  For best
   168   * performance, this should have size equal to a machine word.
   169   *
   170   * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
   171   * which they have to fill less often.
   172   */
   173  typedef machine_word_t bitbuf_t;
   174  
   175  /*
   176   * Number of bits the bitbuffer variable can hold.
   177   */
   178  #define BITBUF_NBITS	(8 * sizeof(bitbuf_t))
   179  
   180  /*
   181   * The maximum number of bits that can be requested to be in the bitbuffer
   182   * variable.  This is the maximum value of 'n' that can be passed
   183   * ENSURE_BITS(n).
   184   *
   185   * This not equal to BITBUF_NBITS because we never read less than one byte at a
   186   * time.  If the bitbuffer variable contains more than (BITBUF_NBITS - 8) bits,
   187   * then we can't read another byte without first consuming some bits.  So the
   188   * maximum count we can ensure is (BITBUF_NBITS - 7).
   189   */
   190  #define MAX_ENSURE	(BITBUF_NBITS - 7)
   191  
   192  /*
   193   * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
    194   * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a
    195   * compile-time constant, then this expression will be a compile-time constant.
    196   * Therefore, CAN_ENSURE() can be used to choose between alternative
    197   * implementations at compile time.
   198   */
   199  #define CAN_ENSURE(n)	((n) <= MAX_ENSURE)
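
         /*
          * For example, with a 64-bit bitbuf_t, BITBUF_NBITS is 64 and
          * MAX_ENSURE is 57, so CAN_ENSURE(n) holds for any n up to 57.
          * With a 32-bit bitbuf_t the corresponding limits are 32 and 25.
          */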
   200  
   201  /*
   202   * Fill the bitbuffer variable, reading one byte at a time.
   203   *
   204   * If we would overread the input buffer, we just don't read anything, leaving
   205   * the bits zeroed but marking them filled.  This simplifies the decompressor
   206   * because it removes the need to distinguish between real overreads and
   207   * overreads that occur only because of the decompressor's own lookahead.
   208   *
    209   * The disadvantage is that real overreads are not detected immediately.
    210   * However, this is safe because the decompressor is still guaranteed to make
    211   * forward progress when presented with never-ending 0 bits.  Within an
    212   * existing block, output keeps being generated, whereas any new block can only
    213   * be an uncompressed block (since the type code for uncompressed blocks is 0),
    214   * and we check for a previous overread before processing one.  Even without
    215   * that check, uncompressed blocks would fail to validate because LEN would not
    216   * equal ~NLEN.  So the decompressor will eventually either detect that the
    217   * output buffer is full, or detect invalid input, or finish the final block.
   218   */
   219  #define FILL_BITS_BYTEWISE()					\
   220  do {								\
   221  	if (likely(in_next != in_end))				\
   222  		bitbuf |= (bitbuf_t)*in_next++ << bitsleft;	\
   223  	else							\
   224  		overrun_count++;				\
   225  	bitsleft += 8;						\
   226  } while (bitsleft <= BITBUF_NBITS - 8)
   227  
   228  /*
   229   * Fill the bitbuffer variable by reading the next word from the input buffer.
   230   * This can be significantly faster than FILL_BITS_BYTEWISE().  However, for
   231   * this to work correctly, the word must be interpreted in little-endian format.
   232   * In addition, the memory access may be unaligned.  Therefore, this method is
   233   * most efficient on little-endian architectures that support fast unaligned
   234   * access, such as x86 and x86_64.
   235   */
   236  #define FILL_BITS_WORDWISE()					\
   237  do {								\
   238  	bitbuf |= get_unaligned_leword(in_next) << bitsleft;	\
   239  	in_next += (BITBUF_NBITS - bitsleft) >> 3;		\
   240  	bitsleft += (BITBUF_NBITS - bitsleft) & ~7;		\
   241  } while (0)
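
         /*
          * For example, on a 64-bit build with bitsleft = 5: in_next advances
          * by (64 - 5) >> 3 = 7 whole bytes, and bitsleft increases by
          * (64 - 5) & ~7 = 56, ending at 61.  Only whole bytes are ever
          * counted as consumed, matching the bytewise fill.
          */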
   242  
   243  /*
   244   * Does the bitbuffer variable currently contain at least 'n' bits?
   245   */
   246  #define HAVE_BITS(n) (bitsleft >= (n))
   247  
   248  /*
   249   * Load more bits from the input buffer until the specified number of bits is
   250   * present in the bitbuffer variable.  'n' cannot be too large; see MAX_ENSURE
   251   * and CAN_ENSURE().
   252   */
   253  #define ENSURE_BITS(n)						\
   254  if (!HAVE_BITS(n)) {						\
   255  	if (CPU_IS_LITTLE_ENDIAN() &&				\
   256  	    UNALIGNED_ACCESS_IS_FAST &&				\
   257  	    likely(in_end - in_next >= sizeof(bitbuf_t)))	\
   258  		FILL_BITS_WORDWISE();				\
   259  	else							\
   260  		FILL_BITS_BYTEWISE();				\
   261  }
   262  
   263  /*
   264   * Return the next 'n' bits from the bitbuffer variable without removing them.
   265   */
   266  #define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
   267  
   268  /*
   269   * Remove the next 'n' bits from the bitbuffer variable.
   270   */
   271  #define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
   272  
   273  /*
   274   * Remove and return the next 'n' bits from the bitbuffer variable.
   275   */
   276  #define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
   277  
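         /*
          * A minimal sketch of how these macros combine (illustrative only;
          * the real decode loop lives in decompress_template.h).  A DEFLATE
          * block header is a 1-bit BFINAL flag followed by a 2-bit BTYPE
          * field, so it could be read as:
          *
          *	ENSURE_BITS(3);
          *	is_final_block = POP_BITS(1);
          *	block_type = POP_BITS(2);
          */
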
   278  /*
   279   * Verify that the input buffer hasn't been overread, then align the input to
   280   * the next byte boundary, discarding any remaining bits in the current byte.
   281   *
   282   * Note that if the bitbuffer variable currently contains more than 7 bits, then
   283   * we must rewind 'in_next', effectively putting those bits back.  Only the bits
   284   * in what would be the "current" byte if we were reading one byte at a time can
   285   * be actually discarded.
   286   */
   287  #define ALIGN_INPUT()							\
   288  do {									\
   289  	SAFETY_CHECK(overrun_count <= (bitsleft >> 3));			\
   290  	in_next -= (bitsleft >> 3) - overrun_count;			\
   291  	overrun_count = 0;						\
   292  	bitbuf = 0;							\
   293  	bitsleft = 0;							\
    294  } while (0)
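
         /*
          * For example, if bitsleft is 10 and there was no overrun, then one
          * whole byte (10 >> 3 = 1) is put back by rewinding in_next, and the
          * 2 leftover bits of the current byte are discarded.
          */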
   295  
   296  /*
   297   * Read a 16-bit value from the input.  This must have been preceded by a call
   298   * to ALIGN_INPUT(), and the caller must have already checked for overrun.
   299   */
   300  #define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
   301  
   302  /*****************************************************************************
   303   *                              Huffman decoding                             *
   304   *****************************************************************************/
   305  
   306  /*
   307   * A decode table for order TABLEBITS consists of a main table of (1 <<
   308   * TABLEBITS) entries followed by a variable number of subtables.
   309   *
   310   * The decoding algorithm takes the next TABLEBITS bits of compressed data and
   311   * uses them as an index into the decode table.  The resulting entry is either a
   312   * "direct entry", meaning that it contains the value desired, or a "subtable
   313   * pointer", meaning that the entry references a subtable that must be indexed
   314   * using more bits of the compressed data to decode the symbol.
   315   *
    316   * Each decode table (a main table along with its subtables, if any) is
   317   * associated with a Huffman code.  Logically, the result of a decode table
   318   * lookup is a symbol from the alphabet from which the corresponding Huffman
   319   * code was constructed.  A symbol with codeword length n <= TABLEBITS is
   320   * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
   321   * symbol with codeword length n > TABLEBITS is associated with one or more
   322   * subtable entries.
   323   *
   324   * On top of this basic design, we implement several optimizations:
   325   *
   326   * - We store the length of each codeword directly in each of its decode table
   327   *   entries.  This allows the codeword length to be produced without indexing
   328   *   an additional table.
   329   *
   330   * - When beneficial, we don't store the Huffman symbol itself, but instead data
   331   *   generated from it.  For example, when decoding an offset symbol in DEFLATE,
   332   *   it's more efficient if we can decode the offset base and number of extra
   333   *   offset bits directly rather than decoding the offset symbol and then
   334   *   looking up both of those values in an additional table or tables.
   335   *
   336   * The size of each decode table entry is 32 bits, which provides slightly
    337   * better performance than 16-bit entries on 32- and 64-bit processors, provided
   338   * that the table doesn't get so large that it takes up too much memory and
   339   * starts generating cache misses.  The bits of each decode table entry are
   340   * defined as follows:
   341   *
   342   * - Bits 30 -- 31: flags (see below)
   343   * - Bits 8 -- 29: decode result: a Huffman symbol or related data
   344   * - Bits 0 -- 7: codeword length
   345   */
   346  
   347  /*
   348   * This flag is set in all main decode table entries that represent subtable
   349   * pointers.
   350   */
   351  #define HUFFDEC_SUBTABLE_POINTER	0x80000000
   352  
   353  /*
   354   * This flag is set in all entries in the litlen decode table that represent
   355   * literals.
   356   */
   357  #define HUFFDEC_LITERAL			0x40000000
   358  
   359  /* Mask for extracting the codeword length from a decode table entry.  */
   360  #define HUFFDEC_LENGTH_MASK		0xFF
   361  
   362  /* Shift to extract the decode result from a decode table entry.  */
   363  #define HUFFDEC_RESULT_SHIFT		8
   364  
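         /*
          * A minimal sketch of how a direct (non-subtable) entry could be
          * consumed, based on the layout above (illustrative only; 'out_next'
          * stands in for the output pointer, and the real decode loop lives
          * in decompress_template.h):
          *
          *	entry = litlen_decode_table[BITS(LITLEN_TABLEBITS)];
          *	if (!(entry & HUFFDEC_SUBTABLE_POINTER)) {
          *		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
          *		if (entry & HUFFDEC_LITERAL)
          *			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
          *	}
          */
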
   365  /* The decode result for each precode symbol.  There is no special optimization
   366   * for the precode; the decode result is simply the symbol value.  */
   367  static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
   368  	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
   369  };
   370  
   371  /* The decode result for each litlen symbol.  For literals, this is the literal
   372   * value itself and the HUFFDEC_LITERAL flag.  For lengths, this is the length
   373   * base and the number of extra length bits.  */
   374  static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
   375  #define ENTRY(literal)	((HUFFDEC_LITERAL >> HUFFDEC_RESULT_SHIFT) | (literal))
   376  
   377  	/* Literals  */
   378  	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
   379  	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
   380  	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
   381  	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
   382  	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
   383  	ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
   384  	ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
   385  	ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
   386  	ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
   387  	ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
   388  	ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
   389  	ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
   390  	ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
   391  	ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
   392  	ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
   393  	ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
   394  	ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
   395  	ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
   396  	ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
   397  	ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
   398  	ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
   399  	ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
   400  	ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
   401  	ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
   402  	ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
   403  	ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
   404  	ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
   405  	ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
   406  	ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
   407  	ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
   408  	ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
   409  	ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
   410  	ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
   411  	ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
   412  	ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
   413  	ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
   414  	ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
   415  	ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
   416  	ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
   417  	ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
   418  	ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
   419  	ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
   420  	ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
   421  	ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
   422  	ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
   423  	ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
   424  	ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
   425  	ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
   426  	ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
   427  	ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
   428  	ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
   429  	ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
   430  	ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
   431  	ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
   432  	ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
   433  	ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
   434  	ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
   435  	ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
   436  	ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
   437  	ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
   438  	ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
   439  	ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
   440  	ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
   441  	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
   442  #undef ENTRY
   443  
   444  #define HUFFDEC_EXTRA_LENGTH_BITS_MASK	0xFF
   445  #define HUFFDEC_LENGTH_BASE_SHIFT	8
   446  #define HUFFDEC_END_OF_BLOCK_LENGTH	0
   447  
   448  #define ENTRY(length_base, num_extra_bits) \
   449  	(((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
   450  
   451  	/* End of block  */
   452  	ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
   453  
   454  	/* Lengths  */
   455  	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
   456  	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
   457  	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
   458  	ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
   459  	ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
   460  	ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
   461  	ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
   462  	ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
   463  #undef ENTRY
   464  };
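
         /*
          * For example, litlen symbol 266 maps to ENTRY(13, 1): a decoded
          * length of 13 plus one extra bit read from the bitstream, i.e.
          * length 13 or 14, matching the length table in RFC 1951.
          */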
   465  
   466  /* The decode result for each offset symbol.  This is the offset base and the
   467   * number of extra offset bits.  */
   468  static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
   469  
   470  #define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
   471  #define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
   472  
   473  #define ENTRY(offset_base, num_extra_bits) \
   474  	((offset_base) | ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT))
   475  	ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
   476  	ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2) ,
   477  	ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
   478  	ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
   479  	ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
   480  	ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
   481  	ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
   482  	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
   483  #undef ENTRY
   484  };
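
         /*
          * For example, offset symbol 10 maps to ENTRY(33, 4): a match offset
          * of 33 plus a 4-bit extra value, i.e. offsets 33 through 48,
          * matching the distance table in RFC 1951.
          */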
   485  
   486  /* Construct a decode table entry from a decode result and codeword length.  */
   487  static forceinline u32
   488  make_decode_table_entry(u32 result, u32 length)
   489  {
   490  	return (result << HUFFDEC_RESULT_SHIFT) | length;
   491  }
   492  
   493  /*
   494   * Build a table for fast decoding of symbols from a Huffman code.  As input,
   495   * this function takes the codeword length of each symbol which may be used in
   496   * the code.  As output, it produces a decode table for the canonical Huffman
   497   * code described by the codeword lengths.  The decode table is built with the
   498   * assumption that it will be indexed with "bit-reversed" codewords, where the
   499   * low-order bit is the first bit of the codeword.  This format is used for all
   500   * Huffman codes in DEFLATE.
   501   *
   502   * @decode_table
   503   *	The array in which the decode table will be generated.  This array must
   504   *	have sufficient length; see the definition of the ENOUGH numbers.
   505   * @lens
   506   *	An array which provides, for each symbol, the length of the
   507   *	corresponding codeword in bits, or 0 if the symbol is unused.  This may
   508   *	alias @decode_table, since nothing is written to @decode_table until all
   509   *	@lens have been consumed.  All codeword lengths are assumed to be <=
   510   *	@max_codeword_len but are otherwise considered untrusted.  If they do
   511   *	not form a valid Huffman code, then the decode table is not built and
   512   *	%false is returned.
   513   * @num_syms
   514   *	The number of symbols in the code, including all unused symbols.
   515   * @decode_results
   516   *	An array which provides, for each symbol, the actual value to store into
   517   *	the decode table.  This value will be directly produced as the result of
   518   *	decoding that symbol, thereby moving the indirection out of the decode
   519   *	loop and into the table initialization.
   520   * @table_bits
   521   *	The log base-2 of the number of main table entries to use.
   522   * @max_codeword_len
   523   *	The maximum allowed codeword length for this Huffman code.
   524   * @working_space
   525   *	A temporary array of length '2 * (@max_codeword_len + 1) + @num_syms'.
   526   *
   527   * Returns %true if successful; %false if the codeword lengths do not form a
   528   * valid Huffman code.
   529   */
   530  static bool
   531  build_decode_table(u32 decode_table[],
   532  		   const len_t lens[],
   533  		   const unsigned num_syms,
   534  		   const u32 decode_results[],
   535  		   const unsigned table_bits,
   536  		   const unsigned max_codeword_len,
   537  		   u16 working_space[])
   538  {
   539  	u16 * const len_counts = &working_space[0];
   540  	u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
   541  	u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
   542  	unsigned len;
   543  	unsigned sym;
   544  	s32 remainder;
   545  	unsigned sym_idx;
   546  	unsigned codeword_len;
   547  	unsigned codeword_reversed = 0;
   548  	unsigned cur_codeword_prefix = -1;
   549  	unsigned cur_table_start = 0;
   550  	unsigned cur_table_bits = table_bits;
   551  	unsigned num_dropped_bits = 0;
   552  	const unsigned table_mask = (1U << table_bits) - 1;
   553  
   554  	/* Count how many symbols have each codeword length, including 0.  */
   555  	for (len = 0; len <= max_codeword_len; len++)
   556  		len_counts[len] = 0;
   557  	for (sym = 0; sym < num_syms; sym++)
   558  		len_counts[lens[sym]]++;
   559  
   560  	/* Sort the symbols primarily by increasing codeword length and
   561  	 * secondarily by increasing symbol value.  */
   562  
   563  	/* Initialize 'offsets' so that offsets[len] is the number of codewords
   564  	 * shorter than 'len' bits, including length 0.  */
   565  	offsets[0] = 0;
   566  	for (len = 0; len < max_codeword_len; len++)
   567  		offsets[len + 1] = offsets[len] + len_counts[len];
   568  
   569  	/* Use the 'offsets' array to sort the symbols.  */
   570  	for (sym = 0; sym < num_syms; sym++)
   571  		sorted_syms[offsets[lens[sym]]++] = sym;
   572  
   573  	/* It is already guaranteed that all lengths are <= max_codeword_len,
   574  	 * but it cannot be assumed they form a complete prefix code.  A
   575  	 * codeword of length n should require a proportion of the codespace
   576  	 * equaling (1/2)^n.  The code is complete if and only if, by this
   577  	 * measure, the codespace is exactly filled by the lengths.  */
   578  	remainder = 1;
   579  	for (len = 1; len <= max_codeword_len; len++) {
   580  		remainder <<= 1;
   581  		remainder -= len_counts[len];
   582  		if (unlikely(remainder < 0)) {
   583  			/* The lengths overflow the codespace; that is, the code
   584  			 * is over-subscribed.  */
   585  			return false;
   586  		}
   587  	}
   588  
   589  	if (unlikely(remainder != 0)) {
   590  		/* The lengths do not fill the codespace; that is, they form an
   591  		 * incomplete code.  */
   592  
   593  		/* Initialize the table entries to default values.  When
   594  		 * decompressing a well-formed stream, these default values will
   595  		 * never be used.  But since a malformed stream might contain
   596  		 * any bits at all, these entries need to be set anyway.  */
   597  		u32 entry = make_decode_table_entry(decode_results[0], 1);
   598  		for (sym = 0; sym < (1U << table_bits); sym++)
   599  			decode_table[sym] = entry;
   600  
   601  		/* A completely empty code is permitted.  */
   602  		if (remainder == (1U << max_codeword_len))
   603  			return true;
   604  
   605  		/* The code is nonempty and incomplete.  Proceed only if there
   606  		 * is a single used symbol and its codeword has length 1.  The
   607  		 * DEFLATE RFC is somewhat unclear regarding this case.  What
   608  		 * zlib's decompressor does is permit this case for
   609  		 * literal/length and offset codes and assume the codeword is 0
   610  		 * rather than 1.  We do the same except we allow this case for
   611  		 * precodes too.  */
   612  		if (remainder != (1U << (max_codeword_len - 1)) ||
   613  		    len_counts[1] != 1)
   614  			return false;
   615  	}
   616  
   617  	/* Generate the decode table entries.  Since we process codewords from
   618  	 * shortest to longest, the main portion of the decode table is filled
   619  	 * first; then the subtables are filled.  Note that it's already been
   620  	 * verified that the code is nonempty and not over-subscribed.  */
   621  
   622  	/* Start with the smallest codeword length and the smallest-valued
   623  	 * symbol which has that codeword length.  */
   624  	sym_idx = offsets[0];
   625  	codeword_len = 1;
   626  	while (len_counts[codeword_len] == 0)
   627  		codeword_len++;
   628  
   629  	for (;;) {  /* For each used symbol and its codeword...  */
   630  		unsigned sym;
   631  		u32 entry;
   632  		unsigned i;
   633  		unsigned end;
   634  		unsigned increment;
   635  		unsigned bit;
   636  
   637  		/* Get the next symbol.  */
   638  		sym = sorted_syms[sym_idx];
   639  
   640  		/* Start a new subtable if the codeword is long enough to
   641  		 * require a subtable, *and* the first 'table_bits' bits of the
   642  		 * codeword don't match the prefix for the previous subtable if
   643  		 * any.  */
   644  		if (codeword_len > table_bits &&
   645  		    (codeword_reversed & table_mask) != cur_codeword_prefix) {
   646  
   647  			cur_codeword_prefix = (codeword_reversed & table_mask);
   648  
   649  			cur_table_start += 1U << cur_table_bits;
   650  
   651  			/* Calculate the subtable length.  If the codeword
   652  			 * length exceeds 'table_bits' by n, the subtable needs
   653  			 * at least 2**n entries.  But it may need more; if
   654  			 * there are fewer than 2**n codewords of length
   655  			 * 'table_bits + n' remaining, then n will need to be
   656  			 * incremented to bring in longer codewords until the
   657  			 * subtable can be filled completely.  Note that it
   658  			 * always will, eventually, be possible to fill the
   659  			 * subtable, since the only case where we may have an
   660  			 * incomplete code is a single codeword of length 1,
   661  			 * and that never requires any subtables.  */
   662  			cur_table_bits = codeword_len - table_bits;
   663  			remainder = (s32)1 << cur_table_bits;
   664  			for (;;) {
   665  				remainder -= len_counts[table_bits +
   666  							cur_table_bits];
   667  				if (remainder <= 0)
   668  					break;
   669  				cur_table_bits++;
   670  				remainder <<= 1;
   671  			}
   672  
   673  			/* Create the entry that points from the main table to
   674  			 * the subtable.  This entry contains the index of the
   675  			 * start of the subtable and the number of bits with
   676  			 * which the subtable is indexed (the log base 2 of the
   677  			 * number of entries it contains).  */
   678  			decode_table[cur_codeword_prefix] =
   679  				HUFFDEC_SUBTABLE_POINTER |
   680  				make_decode_table_entry(cur_table_start,
   681  							cur_table_bits);
   682  
   683  			/* Now that we're filling a subtable, we need to drop
   684  			 * the first 'table_bits' bits of the codewords.  */
   685  			num_dropped_bits = table_bits;
   686  		}
   687  
   688  		/* Create the decode table entry, which packs the decode result
   689  		 * and the codeword length (minus 'table_bits' for subtables)
   690  		 * together.  */
   691  		entry = make_decode_table_entry(decode_results[sym],
   692  						codeword_len - num_dropped_bits);
   693  
   694  		/* Fill in as many copies of the decode table entry as are
   695  		 * needed.  The number of entries to fill is a power of 2 and
   696  		 * depends on the codeword length; it could be as few as 1 or as
   697  		 * large as half the size of the table.  Since the codewords are
   698  		 * bit-reversed, the indices to fill are those with the codeword
   699  		 * in its low bits; it's the high bits that vary.  */
   700  		i = cur_table_start + (codeword_reversed >> num_dropped_bits);
   701  		end = cur_table_start + (1U << cur_table_bits);
   702  		increment = 1U << (codeword_len - num_dropped_bits);
   703  		do {
   704  			decode_table[i] = entry;
   705  			i += increment;
   706  		} while (i < end);
   707  
   708  		/* Advance to the next codeword by incrementing it.  But since
   709  		 * our codewords are bit-reversed, we must manipulate the bits
   710  		 * ourselves rather than simply adding 1.  */
   711  		bit = 1U << (codeword_len - 1);
   712  		while (codeword_reversed & bit)
   713  			bit >>= 1;
   714  		codeword_reversed &= bit - 1;
   715  		codeword_reversed |= bit;
   716  
   717  		/* Advance to the next symbol.  This will either increase the
   718  		 * codeword length, or keep the same codeword length but
   719  		 * increase the symbol value.  Note: since we are using
   720  		 * bit-reversed codewords, we don't need to explicitly append
   721  		 * zeroes to the codeword when the codeword length increases. */
   722  		if (++sym_idx == num_syms)
   723  			return true;
   724  		len_counts[codeword_len]--;
   725  		while (len_counts[codeword_len] == 0)
   726  			codeword_len++;
   727  	}
   728  }
   729  
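         /*
          * A small worked example (illustrative, not tied to any particular
          * DEFLATE stream): suppose table_bits = 3 and the codeword lengths
          * are {1, 2, 3, 3} for symbols {0, 1, 2, 3}.  The canonical
          * codewords are 0, 10, 110 and 111.  Indexing with the next 3 input
          * bits, bit-reversed (first bit of the codeword in the low-order
          * position), the 8-entry main table becomes:
          *
          *	index:   0  1  2  3  4  5  6  7
          *	symbol:  0  1  0  2  0  1  0  3
          *	length:  1  2  1  3  1  2  1  3
          *
          * No subtables are needed since no codeword is longer than
          * table_bits.
          */
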
   730  /* Build the decode table for the precode.  */
   731  static bool
   732  build_precode_decode_table(struct libdeflate_decompressor *d)
   733  {
   734  	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
   735  	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
   736  
   737  	return build_decode_table(d->u.l.precode_decode_table,
   738  				  d->u.precode_lens,
   739  				  DEFLATE_NUM_PRECODE_SYMS,
   740  				  precode_decode_results,
   741  				  PRECODE_TABLEBITS,
   742  				  DEFLATE_MAX_PRE_CODEWORD_LEN,
   743  				  d->working_space);
   744  }
   745  
   746  /* Build the decode table for the literal/length code.  */
   747  static bool
   748  build_litlen_decode_table(struct libdeflate_decompressor *d,
   749  			  unsigned num_litlen_syms, unsigned num_offset_syms)
   750  {
   751  	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
   752  	STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
   753  
   754  	return build_decode_table(d->u.litlen_decode_table,
   755  				  d->u.l.lens,
   756  				  num_litlen_syms,
   757  				  litlen_decode_results,
   758  				  LITLEN_TABLEBITS,
   759  				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
   760  				  d->working_space);
   761  }
   762  
   763  /* Build the decode table for the offset code.  */
   764  static bool
   765  build_offset_decode_table(struct libdeflate_decompressor *d,
   766  			  unsigned num_litlen_syms, unsigned num_offset_syms)
   767  {
   768  	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
   769  	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
   770  
   771  	return build_decode_table(d->offset_decode_table,
   772  				  d->u.l.lens + num_litlen_syms,
   773  				  num_offset_syms,
   774  				  offset_decode_results,
   775  				  OFFSET_TABLEBITS,
   776  				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
   777  				  d->working_space);
   778  }
   779  
   780  static forceinline machine_word_t
   781  repeat_byte(u8 b)
   782  {
   783  	machine_word_t v;
   784  
   785  	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
   786  
   787  	v = b;
   788  	v |= v << 8;
   789  	v |= v << 16;
   790  	v |= v << ((WORDBITS == 64) ? 32 : 0);
   791  	return v;
   792  }
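
         /*
          * For example, repeat_byte(0xAB) yields 0xABABABAB on a 32-bit build
          * and 0xABABABABABABABAB on a 64-bit build.  This is useful for
          * expanding a single-byte match source into a full word when copying
          * matches word-at-a-time (e.g. a match with offset 1).
          */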
   793  
   794  static forceinline void
   795  copy_word_unaligned(const void *src, void *dst)
   796  {
   797  	store_word_unaligned(load_word_unaligned(src), dst);
   798  }
   799  
   800  /*****************************************************************************
   801   *                         Main decompression routine
   802   *****************************************************************************/
   803  
   804  typedef enum libdeflate_result (*decompress_func_t)
   805  	(struct libdeflate_decompressor * restrict d,
   806  	 const void * restrict in, size_t in_nbytes,
   807  	 void * restrict out, size_t out_nbytes_avail,
   808  	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
   809  
   810  #undef DEFAULT_IMPL
   811  #undef DISPATCH
   812  #if defined(__i386__) || defined(__x86_64__)
   813  #  include "decompress_impl.h"
   814  #endif
   815  
   816  #ifndef DEFAULT_IMPL
   817  #  define FUNCNAME deflate_decompress_default
   818  #  define ATTRIBUTES
   819  #  include "decompress_template.h"
   820  #  define DEFAULT_IMPL deflate_decompress_default
   821  #endif
   822  
   823  #ifdef DISPATCH
   824  static enum libdeflate_result
   825  dispatch(struct libdeflate_decompressor * restrict d,
   826  	 const void * restrict in, size_t in_nbytes,
   827  	 void * restrict out, size_t out_nbytes_avail,
   828  	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
   829  
   830  static volatile decompress_func_t decompress_impl = dispatch;
   831  
   832  /* Choose the fastest implementation at runtime */
   833  static enum libdeflate_result
   834  dispatch(struct libdeflate_decompressor * restrict d,
   835  	 const void * restrict in, size_t in_nbytes,
   836  	 void * restrict out, size_t out_nbytes_avail,
   837  	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
   838  {
   839  	decompress_func_t f = arch_select_decompress_func();
   840  
   841  	if (f == NULL)
   842  		f = DEFAULT_IMPL;
   843  
   844  	decompress_impl = f;
   845  	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
   846  		    actual_in_nbytes_ret, actual_out_nbytes_ret);
   847  }
   848  #else
   849  #  define decompress_impl DEFAULT_IMPL /* only one implementation, use it */
   850  #endif
   851  
   852  
   853  /*
   854   * This is the main DEFLATE decompression routine.  See libdeflate.h for the
   855   * documentation.
   856   *
   857   * Note that the real code is in decompress_template.h.  The part here just
   858   * handles calling the appropriate implementation depending on the CPU features
   859   * at runtime.
   860   */
   861  LIBDEFLATEAPI enum libdeflate_result
   862  libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
   863  				 const void * restrict in, size_t in_nbytes,
   864  				 void * restrict out, size_t out_nbytes_avail,
   865  				 size_t *actual_in_nbytes_ret,
   866  				 size_t *actual_out_nbytes_ret)
   867  {
   868  	return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
   869  			       actual_in_nbytes_ret, actual_out_nbytes_ret);
   870  }
   871  
   872  LIBDEFLATEAPI enum libdeflate_result
   873  libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
   874  			      const void * restrict in, size_t in_nbytes,
   875  			      void * restrict out, size_t out_nbytes_avail,
   876  			      size_t *actual_out_nbytes_ret)
   877  {
   878  	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
   879  						out, out_nbytes_avail,
   880  						NULL, actual_out_nbytes_ret);
   881  }
   882  
   883  LIBDEFLATEAPI struct libdeflate_decompressor *
   884  libdeflate_alloc_decompressor(void)
   885  {
   886  	return malloc(sizeof(struct libdeflate_decompressor));
   887  }
   888  
   889  LIBDEFLATEAPI void
   890  libdeflate_free_decompressor(struct libdeflate_decompressor *d)
   891  {
   892  	free(d);
   893  }
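
         /*
          * A minimal caller sketch (illustrative only; it assumes the
          * declarations in libdeflate.h, in particular LIBDEFLATE_SUCCESS,
          * and that out_bufsize is large enough for the whole output, since
          * only full-buffer decompression is supported):
          *
          *	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
          *	size_t actual_out_nbytes;
          *
          *	if (d != NULL) {
          *		if (libdeflate_deflate_decompress(d, in_buf, in_nbytes,
          *						  out_buf, out_bufsize,
          *						  &actual_out_nbytes)
          *		    == LIBDEFLATE_SUCCESS) {
          *			consume(out_buf, actual_out_nbytes);
          *		}
          *		libdeflate_free_decompressor(d);
          *	}
          *
          * Here in_buf/in_nbytes, out_buf/out_bufsize and consume() are
          * placeholders supplied by the caller.
          */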