github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/deflate_compress.c 1 /* 2 * deflate_compress.c - a compressor for DEFLATE 3 * 4 * Originally public domain; changes after 2016-09-07 are copyrighted. 5 * 6 * Copyright 2016 Eric Biggers 7 * 8 * Permission is hereby granted, free of charge, to any person 9 * obtaining a copy of this software and associated documentation 10 * files (the "Software"), to deal in the Software without 11 * restriction, including without limitation the rights to use, 12 * copy, modify, merge, publish, distribute, sublicense, and/or sell 13 * copies of the Software, and to permit persons to whom the 14 * Software is furnished to do so, subject to the following 15 * conditions: 16 * 17 * The above copyright notice and this permission notice shall be 18 * included in all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27 * OTHER DEALINGS IN THE SOFTWARE. 28 */ 29 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include "aligned_malloc.h" 34 #include "deflate_compress.h" 35 #include "deflate_constants.h" 36 #include "unaligned.h" 37 38 #include "libdeflate.h" 39 40 /* 41 * By default, the near-optimal parsing algorithm is enabled at compression 42 * level 8 and above. The near-optimal parsing algorithm produces a compression 43 * ratio significantly better than the greedy and lazy algorithms implemented 44 * here, and also the algorithm used by zlib at level 9. However, it is slow. 45 */ 46 #define SUPPORT_NEAR_OPTIMAL_PARSING 1 47 48 /* 49 * Define to 1 to maintain the full map from match offsets to offset slots. 50 * This slightly speeds up translations of match offsets to offset slots, but it 51 * uses 32769 bytes of memory rather than the 512 bytes used by the condensed 52 * map. The speedup provided by the larger map is most helpful when the 53 * near-optimal parsing algorithm is being used. 54 */ 55 #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING 56 57 /* 58 * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters 59 * appropriately. 60 */ 61 #define MATCHFINDER_WINDOW_ORDER 15 62 63 #include "hc_matchfinder.h" 64 #if SUPPORT_NEAR_OPTIMAL_PARSING 65 # include "bt_matchfinder.h" 66 #endif 67 68 /* 69 * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, 70 * except if the last block has to be shorter. 71 */ 72 #define MIN_BLOCK_LENGTH 10000 73 74 /* 75 * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but 76 * the final length might be slightly longer due to matches extending beyond 77 * this limit. 78 */ 79 #define SOFT_MAX_BLOCK_LENGTH 300000 80 81 /* 82 * The number of observed matches or literals that represents sufficient data to 83 * decide whether the current block should be terminated or not. 84 */ 85 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 86 87 88 #if SUPPORT_NEAR_OPTIMAL_PARSING 89 /* Constants specific to the near-optimal parsing algorithm */ 90 91 /* 92 * The maximum number of matches the matchfinder can find at a single position.
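 * (For illustration: with DEFLATE_MIN_MATCH_LEN == 3 and DEFLATE_MAX_MATCH_LEN == 258,
 * the bound defined below works out to 258 - 3 + 1 == 256 matches.)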
93 * Since the matchfinder never finds more than one match for the same length, 94 * presuming one of each possible length is sufficient for an upper bound. 95 * (This says nothing about whether it is worthwhile to consider so many 96 * matches; this is just defining the worst case.) 97 */ 98 # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) 99 100 /* 101 * The number of lz_match structures in the match cache, excluding the extra 102 * "overflow" entries. This value should be high enough so that nearly the 103 * time, all matches found in a given block can fit in the match cache. 104 * However, fallback behavior (immediately terminating the block) on cache 105 * overflow is still required. 106 */ 107 # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) 108 109 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 110 111 /* 112 * These are the compressor-side limits on the codeword lengths for each Huffman 113 * code. To make outputting bits slightly faster, some of these limits are 114 * lower than the limits defined by the DEFLATE format. This does not 115 * significantly affect the compression ratio, at least for the block lengths we 116 * use. 117 */ 118 #define MAX_LITLEN_CODEWORD_LEN 14 119 #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN 120 #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN 121 122 /* Table: length slot => length slot base value */ 123 static const unsigned deflate_length_slot_base[] = { 124 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 125 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , 126 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , 127 131 , 163 , 195 , 227 , 258 , 128 }; 129 130 /* Table: length slot => number of extra length bits */ 131 static const u8 deflate_extra_length_bits[] = { 132 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 133 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 134 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 135 5 , 5 , 5 , 5 , 0 , 136 }; 137 138 /* Table: offset slot => offset slot base value */ 139 static const unsigned deflate_offset_slot_base[] = { 140 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , 141 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , 142 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , 143 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , 144 }; 145 146 /* Table: offset slot => number of extra offset bits */ 147 static const u8 deflate_extra_offset_bits[] = { 148 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , 149 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , 150 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , 151 11 , 11 , 12 , 12 , 13 , 13 , 152 }; 153 154 /* Table: length => length slot */ 155 static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 156 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 157 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 158 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 159 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 160 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 161 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 162 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 163 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 164 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 165 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 166 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 167 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 168 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 
27, 27, 27, 27, 27, 27, 169 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 170 27, 27, 28, 171 }; 172 173 /* The order in which precode codeword lengths are stored */ 174 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 175 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 176 }; 177 178 /* Codewords for the DEFLATE Huffman codes. */ 179 struct deflate_codewords { 180 u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; 181 u32 offset[DEFLATE_NUM_OFFSET_SYMS]; 182 }; 183 184 /* Codeword lengths (in bits) for the DEFLATE Huffman codes. 185 * A zero length means the corresponding symbol had zero frequency. */ 186 struct deflate_lens { 187 u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; 188 u8 offset[DEFLATE_NUM_OFFSET_SYMS]; 189 }; 190 191 /* Codewords and lengths for the DEFLATE Huffman codes. */ 192 struct deflate_codes { 193 struct deflate_codewords codewords; 194 struct deflate_lens lens; 195 }; 196 197 /* Symbol frequency counters for the DEFLATE Huffman codes. */ 198 struct deflate_freqs { 199 u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; 200 u32 offset[DEFLATE_NUM_OFFSET_SYMS]; 201 }; 202 203 #if SUPPORT_NEAR_OPTIMAL_PARSING 204 205 /* Costs for the near-optimal parsing algorithm. */ 206 struct deflate_costs { 207 208 /* The cost to output each possible literal. */ 209 u32 literal[DEFLATE_NUM_LITERALS]; 210 211 /* The cost to output each possible match length. */ 212 u32 length[DEFLATE_MAX_MATCH_LEN + 1]; 213 214 /* The cost to output a match offset of each possible offset slot. */ 215 u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; 216 }; 217 218 /* 219 * COST_SHIFT is a scaling factor that makes it possible to consider fractional 220 * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. 221 * 222 * Note: this is only useful as a statistical trick for when the true costs are 223 * unknown. In reality, each token in DEFLATE requires a whole number of bits 224 * to output. 225 */ 226 #define COST_SHIFT 3 227 228 /* 229 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to 230 * be needed to output a symbol that was unused in the previous optimization 231 * pass. Assigning a default cost allows the symbol to be used in the next 232 * optimization pass. However, the cost should be relatively high because the 233 * symbol probably won't be used very many times (if at all). 234 */ 235 #define LITERAL_NOSTAT_BITS 13 236 #define LENGTH_NOSTAT_BITS 13 237 #define OFFSET_NOSTAT_BITS 10 238 239 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 240 241 /* 242 * Represents a run of literals followed by a match or end-of-block. This 243 * struct is needed to temporarily store items chosen by the parser, since items 244 * cannot be written until all items for the block have been chosen and the 245 * block's Huffman codes have been computed. 246 */ 247 struct deflate_sequence { 248 249 /* Bits 0..22: the number of literals in this run. This may be 0 and 250 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not 251 * stored explicitly in this structure; instead, they are read directly 252 * from the uncompressed data. 253 * 254 * Bits 23..31: the length of the match which follows the literals, or 0 255 * if this literal run was the last in the block, so there is no match 256 * which follows it. */ 257 u32 litrunlen_and_length; 258 259 /* If 'length' doesn't indicate end-of-block, then this is the offset of 260 * the match which follows the literals. 
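 * DEFLATE match offsets range from 1 to 32768, so a u16 is wide enough to
 * hold this field.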
*/ 261 u16 offset; 262 263 /* If 'length' doesn't indicate end-of-block, then this is the offset 264 * symbol of the match which follows the literals. */ 265 u8 offset_symbol; 266 267 /* If 'length' doesn't indicate end-of-block, then this is the length 268 * slot of the match which follows the literals. */ 269 u8 length_slot; 270 }; 271 272 #if SUPPORT_NEAR_OPTIMAL_PARSING 273 274 /* 275 * This structure represents a byte position in the input data and a node in the 276 * graph of possible match/literal choices for the current block. 277 * 278 * Logically, each incoming edge to this node is labeled with a literal or a 279 * match that can be taken to reach this position from an earlier position; and 280 * each outgoing edge from this node is labeled with a literal or a match that 281 * can be taken to advance from this position to a later position. 282 * 283 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we 284 * associate with each node just two pieces of information: 285 * 286 * 'cost_to_end' is the minimum cost to reach the end of the block from 287 * this position. 288 * 289 * 'item' represents the literal or match that must be chosen from here to 290 * reach the end of the block with the minimum cost. Equivalently, this 291 * can be interpreted as the label of the outgoing edge on the minimum-cost 292 * path to the "end of block" node from this node. 293 */ 294 struct deflate_optimum_node { 295 296 u32 cost_to_end; 297 298 /* 299 * Notes on the match/literal representation used here: 300 * 301 * The low bits of 'item' are the length: 1 if this is a literal, 302 * or the match length if this is a match. 303 * 304 * The high bits of 'item' are the actual literal byte if this is a 305 * literal, or the match offset if this is a match. 306 */ 307 #define OPTIMUM_OFFSET_SHIFT 9 308 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) 309 u32 item; 310 311 }; 312 313 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 314 315 /* Block split statistics. See "Block splitting algorithm" below. */ 316 #define NUM_LITERAL_OBSERVATION_TYPES 8 317 #define NUM_MATCH_OBSERVATION_TYPES 2 318 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) 319 struct block_split_stats { 320 u32 new_observations[NUM_OBSERVATION_TYPES]; 321 u32 observations[NUM_OBSERVATION_TYPES]; 322 u32 num_new_observations; 323 u32 num_observations; 324 }; 325 326 /* The main DEFLATE compressor structure */ 327 struct libdeflate_compressor { 328 329 /* Pointer to the compress() implementation chosen at allocation time */ 330 size_t (*impl)(struct libdeflate_compressor *, 331 const u8 *, size_t, u8 *, size_t); 332 333 /* Frequency counters for the current block */ 334 struct deflate_freqs freqs; 335 336 /* Dynamic Huffman codes for the current block */ 337 struct deflate_codes codes; 338 339 /* Static Huffman codes */ 340 struct deflate_codes static_codes; 341 342 /* Block split statistics for the currently pending block */ 343 struct block_split_stats split_stats; 344 345 /* A table for fast lookups of offset slot by match offset. 346 * 347 * If the full table is being used, it is a direct mapping from offset 348 * to offset slot. 349 * 350 * If the condensed table is being used, the first 256 entries map 351 * directly to the offset slots of offsets 1 through 256. The next 256 352 * entries map to the offset slots for the remaining offsets, stepping 353 * through the offsets with a stride of 128. 
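 * For illustration, an offset of 1000 is looked up at index
 * 256 + ((1000 - 1) >> 7) == 263, which is how deflate_get_offset_slot()
 * indexes the condensed table below.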
This relies on the fact 354 * that each of the remaining offset slots contains at least 128 offsets 355 * and has an offset base that is a multiple of 128. */ 356 #if USE_FULL_OFFSET_SLOT_FAST 357 u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; 358 #else 359 u8 offset_slot_fast[512]; 360 #endif 361 362 /* The "nice" match length: if a match of this length is found, choose 363 * it immediately without further consideration. */ 364 unsigned nice_match_length; 365 366 /* The maximum search depth: consider at most this many potential 367 * matches at each position. */ 368 unsigned max_search_depth; 369 370 /* The compression level with which this compressor was created. */ 371 unsigned compression_level; 372 373 /* Temporary space for Huffman code output */ 374 u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; 375 u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; 376 u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; 377 unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; 378 unsigned num_litlen_syms; 379 unsigned num_offset_syms; 380 unsigned num_explicit_lens; 381 unsigned num_precode_items; 382 383 union { 384 /* Data for greedy or lazy parsing */ 385 struct { 386 /* Hash chain matchfinder */ 387 struct hc_matchfinder hc_mf; 388 389 /* The matches and literals that the parser has chosen 390 * for the current block. The required length of this 391 * array is limited by the maximum number of matches 392 * that can ever be chosen for a single block, plus one 393 * for the special entry at the end. */ 394 struct deflate_sequence sequences[ 395 DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, 396 DEFLATE_MIN_MATCH_LEN) + 1]; 397 } g; /* (g)reedy */ 398 399 #if SUPPORT_NEAR_OPTIMAL_PARSING 400 /* Data for near-optimal parsing */ 401 struct { 402 403 /* Binary tree matchfinder */ 404 struct bt_matchfinder bt_mf; 405 406 /* 407 * Cached matches for the current block. This array 408 * contains the matches that were found at each position 409 * in the block. Specifically, for each position, there 410 * is a list of matches found at that position, if any, 411 * sorted by strictly increasing length. In addition, 412 * following the matches for each position, there is a 413 * special 'struct lz_match' whose 'length' member 414 * contains the number of matches found at that 415 * position, and whose 'offset' member contains the 416 * literal at that position. 417 * 418 * Note: in rare cases, there will be a very high number 419 * of matches in the block and this array will overflow. 420 * If this happens, we force the end of the current 421 * block. CACHE_LENGTH is the length at which we 422 * actually check for overflow. The extra slots beyond 423 * this are enough to absorb the worst case overflow, 424 * which occurs if starting at &match_cache[CACHE_LENGTH 425 * - 1], we write MAX_MATCHES_PER_POS matches and a 426 * match count header, then skip searching for matches 427 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write 428 * the match count header for each. 429 */ 430 struct lz_match match_cache[CACHE_LENGTH + 431 MAX_MATCHES_PER_POS + 432 DEFLATE_MAX_MATCH_LEN - 1]; 433 434 /* 435 * Array of nodes, one per position, for running the 436 * minimum-cost path algorithm. 437 * 438 * This array must be large enough to accommodate the 439 * worst-case number of nodes, which occurs if we find a 440 * match of length DEFLATE_MAX_MATCH_LEN at position 441 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of 442 * length SOFT_MAX_BLOCK_LENGTH - 1 + 443 * DEFLATE_MAX_MATCH_LEN. 
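 * (Numerically, that worst case is 300000 - 1 + 258 == 300257 positions.)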
Add one for the end-of-block 444 * node. 445 */ 446 struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + 447 DEFLATE_MAX_MATCH_LEN + 1]; 448 449 /* The current cost model being used. */ 450 struct deflate_costs costs; 451 452 unsigned num_optim_passes; 453 } n; /* (n)ear-optimal */ 454 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 455 456 } p; /* (p)arser */ 457 }; 458 459 /* 460 * The type for the bitbuffer variable, which temporarily holds bits that are 461 * being packed into bytes and written to the output buffer. For best 462 * performance, this should have size equal to a machine word. 463 */ 464 typedef machine_word_t bitbuf_t; 465 #define BITBUF_NBITS (8 * sizeof(bitbuf_t)) 466 467 /* Can the specified number of bits always be added to 'bitbuf' after any 468 * pending bytes have been flushed? */ 469 #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) 470 471 /* 472 * Structure to keep track of the current state of sending bits to the 473 * compressed output buffer. 474 */ 475 struct deflate_output_bitstream { 476 477 /* Bits that haven't yet been written to the output buffer. */ 478 bitbuf_t bitbuf; 479 480 /* Number of bits currently held in @bitbuf. */ 481 unsigned bitcount; 482 483 /* Pointer to the beginning of the output buffer. */ 484 u8 *begin; 485 486 /* Pointer to the position in the output buffer at which the next byte 487 * should be written. */ 488 u8 *next; 489 490 /* Pointer just past the end of the output buffer. */ 491 u8 *end; 492 }; 493 494 /* 495 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be 496 * present following os->end, in order to not overrun the buffer when generating 497 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) 498 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, 499 * to make the compression algorithm produce the same result on all CPU 500 * architectures (which is sometimes desirable), we have to unconditionally use 501 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. 502 */ 503 #define OUTPUT_END_PADDING 8 504 505 /* Initialize the output bitstream. 'size' is assumed to be at least 506 * OUTPUT_END_PADDING. */ 507 static void 508 deflate_init_output(struct deflate_output_bitstream *os, 509 void *buffer, size_t size) 510 { 511 os->bitbuf = 0; 512 os->bitcount = 0; 513 os->begin = buffer; 514 os->next = os->begin; 515 os->end = os->begin + size - OUTPUT_END_PADDING; 516 } 517 518 /* Add some bits to the bitbuffer variable of the output bitstream. The caller 519 * must make sure there is enough room. */ 520 static forceinline void 521 deflate_add_bits(struct deflate_output_bitstream *os, 522 const bitbuf_t bits, const unsigned num_bits) 523 { 524 os->bitbuf |= bits << os->bitcount; 525 os->bitcount += num_bits; 526 } 527 528 /* Flush bits from the bitbuffer variable to the output buffer. */ 529 static forceinline void 530 deflate_flush_bits(struct deflate_output_bitstream *os) 531 { 532 if (UNALIGNED_ACCESS_IS_FAST) { 533 /* Flush a whole word (branchlessly). */ 534 put_unaligned_leword(os->bitbuf, os->next); 535 os->bitbuf >>= os->bitcount & ~7; 536 os->next += MIN(os->end - os->next, os->bitcount >> 3); 537 os->bitcount &= 7; 538 } else { 539 /* Flush a byte at a time. */ 540 while (os->bitcount >= 8) { 541 *os->next = os->bitbuf; 542 if (os->next != os->end) 543 os->next++; 544 os->bitcount -= 8; 545 os->bitbuf >>= 8; 546 } 547 } 548 } 549 550 /* Align the bitstream on a byte boundary. 
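 * The expression '-bitcount & 7' below gives the number of padding bits
 * needed to reach the next byte boundary; e.g. a bitcount of 13 yields
 * -13 & 7 == 3, so three zero bits are appended before flushing.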
*/ 551 static forceinline void 552 deflate_align_bitstream(struct deflate_output_bitstream *os) 553 { 554 os->bitcount += -os->bitcount & 7; 555 deflate_flush_bits(os); 556 } 557 558 /* 559 * Flush any remaining bits to the output buffer if needed. Return the total 560 * number of bytes written to the output buffer, or 0 if an overflow occurred. 561 */ 562 static u32 563 deflate_flush_output(struct deflate_output_bitstream *os) 564 { 565 if (os->next == os->end) /* overflow? */ 566 return 0; 567 568 while ((int)os->bitcount > 0) { 569 *os->next++ = os->bitbuf; 570 os->bitcount -= 8; 571 os->bitbuf >>= 8; 572 } 573 574 return os->next - os->begin; 575 } 576 577 /* Given the binary tree node A[subtree_idx] whose children already 578 * satisfy the maxheap property, swap the node with its greater child 579 * until it is greater than both its children, so that the maxheap 580 * property is satisfied in the subtree rooted at A[subtree_idx]. */ 581 static void 582 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) 583 { 584 unsigned parent_idx; 585 unsigned child_idx; 586 u32 v; 587 588 v = A[subtree_idx]; 589 parent_idx = subtree_idx; 590 while ((child_idx = parent_idx * 2) <= length) { 591 if (child_idx < length && A[child_idx + 1] > A[child_idx]) 592 child_idx++; 593 if (v >= A[child_idx]) 594 break; 595 A[parent_idx] = A[child_idx]; 596 parent_idx = child_idx; 597 } 598 A[parent_idx] = v; 599 } 600 601 /* Rearrange the array 'A' so that it satisfies the maxheap property. 602 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. 603 */ 604 static void 605 heapify_array(u32 A[], unsigned length) 606 { 607 unsigned subtree_idx; 608 609 for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) 610 heapify_subtree(A, length, subtree_idx); 611 } 612 613 /* 614 * Sort the array 'A', which contains 'length' unsigned 32-bit integers. 615 * 616 * Note: name this function heap_sort() instead of heapsort() to avoid colliding 617 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't 618 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. 619 */ 620 static void 621 heap_sort(u32 A[], unsigned length) 622 { 623 A--; /* Use 1-based indices */ 624 625 heapify_array(A, length); 626 627 while (length >= 2) { 628 u32 tmp = A[length]; 629 A[length] = A[1]; 630 A[1] = tmp; 631 length--; 632 heapify_subtree(A, length, 1); 633 } 634 } 635 636 #define NUM_SYMBOL_BITS 10 637 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) 638 639 #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) 640 /* 641 * Sort the symbols primarily by frequency and secondarily by symbol 642 * value. Discard symbols with zero frequency and fill in an array with 643 * the remaining symbols, along with their frequencies. The low 644 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol 645 * value, and the remaining bits will contain the frequency. 646 * 647 * @num_syms 648 * Number of symbols in the alphabet. 649 * Can't be greater than (1 << NUM_SYMBOL_BITS). 650 * 651 * @freqs[num_syms] 652 * The frequency of each symbol. 653 * 654 * @lens[num_syms] 655 * An array that eventually will hold the length of each codeword. 656 * This function only fills in the codeword lengths for symbols that 657 * have zero frequency, which are not well defined per se but will 658 * be set to 0. 659 * 660 * @symout[num_syms] 661 * The output array, described above. 662 * 663 * Returns the number of entries in 'symout' that were filled. 
This is 664 * the number of symbols that have nonzero frequency. 665 */ 666 static unsigned 667 sort_symbols(unsigned num_syms, const u32 freqs[restrict], 668 u8 lens[restrict], u32 symout[restrict]) 669 { 670 unsigned sym; 671 unsigned i; 672 unsigned num_used_syms; 673 unsigned num_counters; 674 unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; 675 676 /* We rely on heapsort, but with an added optimization. Since 677 * it's common for most symbol frequencies to be low, we first do 678 * a count sort using a limited number of counters. High 679 * frequencies will be counted in the last counter, and only they 680 * will be sorted with heapsort. 681 * 682 * Note: with more symbols, it is generally beneficial to have more 683 * counters. About 1 counter per 4 symbols seems fast. 684 * 685 * Note: I also tested radix sort, but even for large symbol 686 * counts (> 255) and frequencies bounded at 16 bits (enabling 687 * radix sort by just two base-256 digits), it didn't seem any 688 * faster than the method implemented here. 689 * 690 * Note: I tested the optimized quicksort implementation from 691 * glibc (with indirection overhead removed), but it was only 692 * marginally faster than the simple heapsort implemented here. 693 * 694 * Tests were done with building the codes for LZX. Results may 695 * vary for different compression algorithms...! */ 696 697 num_counters = GET_NUM_COUNTERS(num_syms); 698 699 memset(counters, 0, num_counters * sizeof(counters[0])); 700 701 /* Count the frequencies. */ 702 for (sym = 0; sym < num_syms; sym++) 703 counters[MIN(freqs[sym], num_counters - 1)]++; 704 705 /* Make the counters cumulative, ignoring the zero-th, which 706 * counted symbols with zero frequency. As a side effect, this 707 * calculates the number of symbols with nonzero frequency. */ 708 num_used_syms = 0; 709 for (i = 1; i < num_counters; i++) { 710 unsigned count = counters[i]; 711 counters[i] = num_used_syms; 712 num_used_syms += count; 713 } 714 715 /* Sort nonzero-frequency symbols using the counters. At the 716 * same time, set the codeword lengths of zero-frequency symbols 717 * to 0. */ 718 for (sym = 0; sym < num_syms; sym++) { 719 u32 freq = freqs[sym]; 720 if (freq != 0) { 721 symout[counters[MIN(freq, num_counters - 1)]++] = 722 sym | (freq << NUM_SYMBOL_BITS); 723 } else { 724 lens[sym] = 0; 725 } 726 } 727 728 /* Sort the symbols counted in the last counter. */ 729 heap_sort(symout + counters[num_counters - 2], 730 counters[num_counters - 1] - counters[num_counters - 2]); 731 732 return num_used_syms; 733 } 734 735 /* 736 * Build the Huffman tree. 737 * 738 * This is an optimized implementation that 739 * (a) takes advantage of the frequencies being already sorted; 740 * (b) only generates non-leaf nodes, since the non-leaf nodes of a 741 * Huffman tree are sufficient to generate a canonical code; 742 * (c) Only stores parent pointers, not child pointers; 743 * (d) Produces the nodes in the same memory used for input 744 * frequency information. 745 * 746 * Array 'A', which contains 'sym_count' entries, is used for both input 747 * and output. For this function, 'sym_count' must be at least 2. 748 * 749 * For input, the array must contain the frequencies of the symbols, 750 * sorted in increasing order. Specifically, each entry must contain a 751 * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low 752 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. 
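 * (For illustration, with NUM_SYMBOL_BITS == 10, symbol 7 with frequency 20
 * arrives as (20 << 10) | 7 == 20487.)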
753 * Although these bits will, in fact, contain the symbols that correspond 754 * to the frequencies, this function is concerned with frequencies only 755 * and keeps the symbols as-is. 756 * 757 * For output, this function will produce the non-leaf nodes of the 758 * Huffman tree. These nodes will be stored in the first (sym_count - 1) 759 * entries of the array. Entry A[sym_count - 2] will represent the root 760 * node. Each other node will contain the zero-based index of its parent 761 * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low 762 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, 763 * note that although these low bits will, in fact, contain a symbol 764 * value, this symbol will have *no relationship* with the Huffman tree 765 * node that happens to occupy the same slot. This is because this 766 * implementation only generates the non-leaf nodes of the tree. 767 */ 768 static void 769 build_tree(u32 A[], unsigned sym_count) 770 { 771 /* Index, in 'A', of next lowest frequency symbol that has not 772 * yet been processed. */ 773 unsigned i = 0; 774 775 /* Index, in 'A', of next lowest frequency parentless non-leaf 776 * node; or, if equal to 'e', then no such node exists yet. */ 777 unsigned b = 0; 778 779 /* Index, in 'A', of next node to allocate as a non-leaf. */ 780 unsigned e = 0; 781 782 do { 783 unsigned m, n; 784 u32 freq_shifted; 785 786 /* Choose the two next lowest frequency entries. */ 787 788 if (i != sym_count && 789 (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) 790 m = i++; 791 else 792 m = b++; 793 794 if (i != sym_count && 795 (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) 796 n = i++; 797 else 798 n = b++; 799 800 /* Allocate a non-leaf node and link the entries to it. 801 * 802 * If we link an entry that we're visiting for the first 803 * time (via index 'i'), then we're actually linking a 804 * leaf node and it will have no effect, since the leaf 805 * will be overwritten with a non-leaf when index 'e' 806 * catches up to it. But it's not any slower to 807 * unconditionally set the parent index. 808 * 809 * We also compute the frequency of the non-leaf node as 810 * the sum of its two children's frequencies. */ 811 812 freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); 813 814 A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); 815 A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); 816 A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; 817 e++; 818 } while (sym_count - e > 1); 819 /* When just one entry remains, it is a "leaf" that was 820 * linked to some other node. We ignore it, since the 821 * rest of the array contains the non-leaves which we 822 * need. (Note that we're assuming the cases with 0 or 1 823 * symbols were handled separately.) */ 824 } 825 826 /* 827 * Given the stripped-down Huffman tree constructed by build_tree(), 828 * determine the number of codewords that should be assigned each 829 * possible length, taking into account the length-limited constraint. 830 * 831 * @A 832 * The array produced by build_tree(), containing parent index 833 * information for the non-leaf nodes of the Huffman tree. Each 834 * entry in this array is a node; a node's parent always has a 835 * greater index than that node itself. This function will 836 * overwrite the parent index information in this array, so 837 * essentially it will destroy the tree. However, the data in the 838 * low NUM_SYMBOL_BITS of each entry will be preserved. 
839 * 840 * @root_idx 841 * The 0-based index of the root node in 'A', and consequently one 842 * less than the number of tree node entries in 'A'. (Or, really 2 843 * less than the actual length of 'A'.) 844 * 845 * @len_counts 846 * An array of length ('max_codeword_len' + 1) in which the number of 847 * codewords having each length <= max_codeword_len will be 848 * returned. 849 * 850 * @max_codeword_len 851 * The maximum permissible codeword length. 852 */ 853 static void 854 compute_length_counts(u32 A[restrict], unsigned root_idx, 855 unsigned len_counts[restrict], unsigned max_codeword_len) 856 { 857 unsigned len; 858 int node; 859 860 /* The key observations are: 861 * 862 * (1) We can traverse the non-leaf nodes of the tree, always 863 * visiting a parent before its children, by simply iterating 864 * through the array in reverse order. Consequently, we can 865 * compute the depth of each node in one pass, overwriting the 866 * parent indices with depths. 867 * 868 * (2) We can initially assume that in the real Huffman tree, 869 * both children of the root are leaves. This corresponds to two 870 * codewords of length 1. Then, whenever we visit a (non-leaf) 871 * node during the traversal, we modify this assumption to 872 * account for the current node *not* being a leaf, but rather 873 * its two children being leaves. This causes the loss of one 874 * codeword for the current depth and the addition of two 875 * codewords for the current depth plus one. 876 * 877 * (3) We can handle the length-limited constraint fairly easily 878 * by simply using the largest length available when a depth 879 * exceeds max_codeword_len. 880 */ 881 882 for (len = 0; len <= max_codeword_len; len++) 883 len_counts[len] = 0; 884 len_counts[1] = 2; 885 886 /* Set the root node's depth to 0. */ 887 A[root_idx] &= SYMBOL_MASK; 888 889 for (node = root_idx - 1; node >= 0; node--) { 890 891 /* Calculate the depth of this node. */ 892 893 unsigned parent = A[node] >> NUM_SYMBOL_BITS; 894 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; 895 unsigned depth = parent_depth + 1; 896 unsigned len = depth; 897 898 /* Set the depth of this node so that it is available 899 * when its children (if any) are processed. */ 900 901 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); 902 903 /* If needed, decrease the length to meet the 904 * length-limited constraint. This is not the optimal 905 * method for generating length-limited Huffman codes! 906 * But it should be good enough. */ 907 if (len >= max_codeword_len) { 908 len = max_codeword_len; 909 do { 910 len--; 911 } while (len_counts[len] == 0); 912 } 913 914 /* Account for the fact that we have a non-leaf node at 915 * the current depth. */ 916 len_counts[len]--; 917 len_counts[len + 1] += 2; 918 } 919 } 920 921 /* 922 * Generate the codewords for a canonical Huffman code. 923 * 924 * @A 925 * The output array for codewords. In addition, initially this 926 * array must contain the symbols, sorted primarily by frequency and 927 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of 928 * each entry. 929 * 930 * @len 931 * Output array for codeword lengths. 932 * 933 * @len_counts 934 * An array that provides the number of codewords that will have 935 * each possible length <= max_codeword_len. 936 * 937 * @max_codeword_len 938 * Maximum length, in bits, of each codeword. 939 * 940 * @num_syms 941 * Number of symbols in the alphabet, including symbols with zero 942 * frequency. This is the length of the 'A' and 'len' arrays. 
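 *
 * A worked example: with len_counts giving one codeword of length 1, one of
 * length 2 and two of length 3, the loop below computes
 * next_codewords[1] == 0, next_codewords[2] == (0 + 1) << 1 == 2 and
 * next_codewords[3] == (2 + 1) << 1 == 6, so the four codewords generated
 * are 0, 10, 110 and 111 in binary.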
943 */ 944 static void 945 gen_codewords(u32 A[restrict], u8 lens[restrict], 946 const unsigned len_counts[restrict], 947 unsigned max_codeword_len, unsigned num_syms) 948 { 949 u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; 950 unsigned i; 951 unsigned len; 952 unsigned sym; 953 954 /* Given the number of codewords that will have each length, 955 * assign codeword lengths to symbols. We do this by assigning 956 * the lengths in decreasing order to the symbols sorted 957 * primarily by increasing frequency and secondarily by 958 * increasing symbol value. */ 959 for (i = 0, len = max_codeword_len; len >= 1; len--) { 960 unsigned count = len_counts[len]; 961 while (count--) 962 lens[A[i++] & SYMBOL_MASK] = len; 963 } 964 965 /* Generate the codewords themselves. We initialize the 966 * 'next_codewords' array to provide the lexicographically first 967 * codeword of each length, then assign codewords in symbol 968 * order. This produces a canonical code. */ 969 next_codewords[0] = 0; 970 next_codewords[1] = 0; 971 for (len = 2; len <= max_codeword_len; len++) 972 next_codewords[len] = 973 (next_codewords[len - 1] + len_counts[len - 1]) << 1; 974 975 for (sym = 0; sym < num_syms; sym++) 976 A[sym] = next_codewords[lens[sym]]++; 977 } 978 979 /* 980 * --------------------------------------------------------------------- 981 * make_canonical_huffman_code() 982 * --------------------------------------------------------------------- 983 * 984 * Given an alphabet and the frequency of each symbol in it, construct a 985 * length-limited canonical Huffman code. 986 * 987 * @num_syms 988 * The number of symbols in the alphabet. The symbols are the 989 * integers in the range [0, num_syms - 1]. This parameter must be 990 * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). 991 * 992 * @max_codeword_len 993 * The maximum permissible codeword length. 994 * 995 * @freqs 996 * An array of @num_syms entries, each of which specifies the 997 * frequency of the corresponding symbol. It is valid for some, 998 * none, or all of the frequencies to be 0. 999 * 1000 * @lens 1001 * An array of @num_syms entries in which this function will return 1002 * the length, in bits, of the codeword assigned to each symbol. 1003 * Symbols with 0 frequency will not have codewords per se, but 1004 * their entries in this array will be set to 0. No lengths greater 1005 * than @max_codeword_len will be assigned. 1006 * 1007 * @codewords 1008 * An array of @num_syms entries in which this function will return 1009 * the codeword for each symbol, right-justified and padded on the 1010 * left with zeroes. Codewords for symbols with 0 frequency will be 1011 * undefined. 1012 * 1013 * --------------------------------------------------------------------- 1014 * 1015 * This function builds a length-limited canonical Huffman code. 1016 * 1017 * A length-limited Huffman code contains no codewords longer than some 1018 * specified length, and has exactly (with some algorithms) or 1019 * approximately (with the algorithm used here) the minimum weighted path 1020 * length from the root, given this constraint. 1021 * 1022 * A canonical Huffman code satisfies the properties that a longer 1023 * codeword never lexicographically precedes a shorter codeword, and the 1024 * lexicographic ordering of codewords of the same length is the same as 1025 * the lexicographic ordering of the corresponding symbols. 
A canonical 1026 * Huffman code, or more generally a canonical prefix code, can be 1027 * reconstructed from only a list containing the codeword length of each 1028 * symbol. 1029 * 1030 * The classic algorithm to generate a Huffman code creates a node for 1031 * each symbol, then inserts these nodes into a min-heap keyed by symbol 1032 * frequency. Then, repeatedly, the two lowest-frequency nodes are 1033 * removed from the min-heap and added as the children of a new node 1034 * having frequency equal to the sum of its two children, which is then 1035 * inserted into the min-heap. When only a single node remains in the 1036 * min-heap, it is the root of the Huffman tree. The codeword for each 1037 * symbol is determined by the path needed to reach the corresponding 1038 * node from the root. Descending to the left child appends a 0 bit, 1039 * whereas descending to the right child appends a 1 bit. 1040 * 1041 * The classic algorithm is relatively easy to understand, but it is 1042 * subject to a number of inefficiencies. In practice, it is fastest to 1043 * first sort the symbols by frequency. (This itself can be subject to 1044 * an optimization based on the fact that most frequencies tend to be 1045 * low.) At the same time, we sort secondarily by symbol value, which 1046 * aids the process of generating a canonical code. Then, during tree 1047 * construction, no heap is necessary because both the leaf nodes and the 1048 * unparented non-leaf nodes can be easily maintained in sorted order. 1049 * Consequently, there can never be more than two possibilities for the 1050 * next-lowest-frequency node. 1051 * 1052 * In addition, because we're generating a canonical code, we actually 1053 * don't need the leaf nodes of the tree at all, only the non-leaf nodes. 1054 * This is because for canonical code generation we don't need to know 1055 * where the symbols are in the tree. Rather, we only need to know how 1056 * many leaf nodes have each depth (codeword length). And this 1057 * information can, in fact, be quickly generated from the tree of 1058 * non-leaves only. 1059 * 1060 * Furthermore, we can build this stripped-down Huffman tree directly in 1061 * the array in which the codewords are to be generated, provided that 1062 * these array slots are large enough to hold a symbol and frequency 1063 * value. 1064 * 1065 * Still furthermore, we don't even need to maintain explicit child 1066 * pointers. We only need the parent pointers, and even those can be 1067 * overwritten in-place with depth information as part of the process of 1068 * extracting codeword lengths from the tree. So in summary, we do NOT 1069 * need a big structure like: 1070 * 1071 * struct huffman_tree_node { 1072 * unsigned int symbol; 1073 * unsigned int frequency; 1074 * unsigned int depth; 1075 * struct huffman_tree_node *left_child; 1076 * struct huffman_tree_node *right_child; 1077 * }; 1078 * 1079 * 1080 * ... which often gets used in "naive" implementations of Huffman code 1081 * generation. 1082 * 1083 * Many of these optimizations are based on the implementation in 7-Zip 1084 * (source file: C/HuffEnc.c), which has been placed in the public domain 1085 * by Igor Pavlov. 
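 *
 * A small illustrative call with made-up frequencies: num_syms == 4,
 * max_codeword_len == 15 and freqs == {5, 0, 3, 2} produces
 * lens == {1, 0, 2, 2}; the canonical (not yet bit-reversed) codewords are
 * 0 for symbol 0 and binary 10 and 11 for symbols 2 and 3, while the
 * codeword for the unused symbol 1 is left undefined.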
1086 */ 1087 static void 1088 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, 1089 const u32 freqs[restrict], 1090 u8 lens[restrict], u32 codewords[restrict]) 1091 { 1092 u32 *A = codewords; 1093 unsigned num_used_syms; 1094 1095 STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); 1096 1097 /* We begin by sorting the symbols primarily by frequency and 1098 * secondarily by symbol value. As an optimization, the array 1099 * used for this purpose ('A') shares storage with the space in 1100 * which we will eventually return the codewords. */ 1101 1102 num_used_syms = sort_symbols(num_syms, freqs, lens, A); 1103 1104 /* 'num_used_syms' is the number of symbols with nonzero 1105 * frequency. This may be less than @num_syms. 'num_used_syms' 1106 * is also the number of entries in 'A' that are valid. Each 1107 * entry consists of a distinct symbol and a nonzero frequency 1108 * packed into a 32-bit integer. */ 1109 1110 /* Handle special cases where only 0 or 1 symbols were used (had 1111 * nonzero frequency). */ 1112 1113 if (unlikely(num_used_syms == 0)) { 1114 /* Code is empty. sort_symbols() already set all lengths 1115 * to 0, so there is nothing more to do. */ 1116 return; 1117 } 1118 1119 if (unlikely(num_used_syms == 1)) { 1120 /* Only one symbol was used, so we only need one 1121 * codeword. But two codewords are needed to form the 1122 * smallest complete Huffman code, which uses codewords 0 1123 * and 1. Therefore, we choose another symbol to which 1124 * to assign a codeword. We use 0 (if the used symbol is 1125 * not 0) or 1 (if the used symbol is 0). In either 1126 * case, the lesser-valued symbol must be assigned 1127 * codeword 0 so that the resulting code is canonical. */ 1128 1129 unsigned sym = A[0] & SYMBOL_MASK; 1130 unsigned nonzero_idx = sym ? sym : 1; 1131 1132 codewords[0] = 0; 1133 lens[0] = 1; 1134 codewords[nonzero_idx] = 1; 1135 lens[nonzero_idx] = 1; 1136 return; 1137 } 1138 1139 /* Build a stripped-down version of the Huffman tree, sharing the 1140 * array 'A' with the symbol values. Then extract length counts 1141 * from the tree and use them to generate the final codewords. */ 1142 1143 build_tree(A, num_used_syms); 1144 1145 { 1146 unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; 1147 1148 compute_length_counts(A, num_used_syms - 2, 1149 len_counts, max_codeword_len); 1150 1151 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); 1152 } 1153 } 1154 1155 /* 1156 * Clear the Huffman symbol frequency counters. 1157 * This must be called when starting a new DEFLATE block. 1158 */ 1159 static void 1160 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) 1161 { 1162 memset(&c->freqs, 0, sizeof(c->freqs)); 1163 } 1164 1165 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ 1166 static u32 1167 deflate_reverse_codeword(u32 codeword, u8 len) 1168 { 1169 /* The following branchless algorithm is faster than going bit by bit. 1170 * Note: since no codewords are longer than 16 bits, we only need to 1171 * reverse the low 16 bits of the 'u32'. 
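 * For example, the 4-bit codeword 1011 (binary) reverses to 1101: the steps
 * below reverse all 16 low bits, giving 0xD000, and the final shift by
 * 16 - 4 == 12 recovers 0b1101.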
*/ 1172 STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); 1173 1174 /* Flip adjacent 1-bit fields */ 1175 codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); 1176 1177 /* Flip adjacent 2-bit fields */ 1178 codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); 1179 1180 /* Flip adjacent 4-bit fields */ 1181 codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); 1182 1183 /* Flip adjacent 8-bit fields */ 1184 codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); 1185 1186 /* Return the high 'len' bits of the bit-reversed 16 bit value. */ 1187 return codeword >> (16 - len); 1188 } 1189 1190 /* Make a canonical Huffman code with bit-reversed codewords. */ 1191 static void 1192 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, 1193 const u32 freqs[], u8 lens[], u32 codewords[]) 1194 { 1195 unsigned sym; 1196 1197 make_canonical_huffman_code(num_syms, max_codeword_len, 1198 freqs, lens, codewords); 1199 1200 for (sym = 0; sym < num_syms; sym++) 1201 codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); 1202 } 1203 1204 /* 1205 * Build the literal/length and offset Huffman codes for a DEFLATE block. 1206 * 1207 * This takes as input the frequency tables for each code and produces as output 1208 * a set of tables that map symbols to codewords and codeword lengths. 1209 */ 1210 static void 1211 deflate_make_huffman_codes(const struct deflate_freqs *freqs, 1212 struct deflate_codes *codes) 1213 { 1214 STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); 1215 STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); 1216 1217 deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, 1218 MAX_LITLEN_CODEWORD_LEN, 1219 freqs->litlen, 1220 codes->lens.litlen, 1221 codes->codewords.litlen); 1222 1223 deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, 1224 MAX_OFFSET_CODEWORD_LEN, 1225 freqs->offset, 1226 codes->lens.offset, 1227 codes->codewords.offset); 1228 } 1229 1230 /* Initialize c->static_codes. */ 1231 static void 1232 deflate_init_static_codes(struct libdeflate_compressor *c) 1233 { 1234 unsigned i; 1235 1236 for (i = 0; i < 144; i++) 1237 c->freqs.litlen[i] = 1 << (9 - 8); 1238 for (; i < 256; i++) 1239 c->freqs.litlen[i] = 1 << (9 - 9); 1240 for (; i < 280; i++) 1241 c->freqs.litlen[i] = 1 << (9 - 7); 1242 for (; i < 288; i++) 1243 c->freqs.litlen[i] = 1 << (9 - 8); 1244 1245 for (i = 0; i < 32; i++) 1246 c->freqs.offset[i] = 1 << (5 - 5); 1247 1248 deflate_make_huffman_codes(&c->freqs, &c->static_codes); 1249 } 1250 1251 /* Return the offset slot for the specified match offset. */ 1252 static forceinline unsigned 1253 deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) 1254 { 1255 #if USE_FULL_OFFSET_SLOT_FAST 1256 return c->offset_slot_fast[offset]; 1257 #else 1258 if (offset <= 256) 1259 return c->offset_slot_fast[offset - 1]; 1260 else 1261 return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; 1262 #endif 1263 } 1264 1265 /* Write the header fields common to all DEFLATE block types. 
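 * These fields are the 1-bit BFINAL flag followed by the 2-bit BTYPE field,
 * packed into the bitstream least-significant-bit first.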
*/ 1266 static void 1267 deflate_write_block_header(struct deflate_output_bitstream *os, 1268 bool is_final_block, unsigned block_type) 1269 { 1270 deflate_add_bits(os, is_final_block, 1); 1271 deflate_add_bits(os, block_type, 2); 1272 deflate_flush_bits(os); 1273 } 1274 1275 static unsigned 1276 deflate_compute_precode_items(const u8 lens[restrict], 1277 const unsigned num_lens, 1278 u32 precode_freqs[restrict], 1279 unsigned precode_items[restrict]) 1280 { 1281 unsigned *itemptr; 1282 unsigned run_start; 1283 unsigned run_end; 1284 unsigned extra_bits; 1285 u8 len; 1286 1287 memset(precode_freqs, 0, 1288 DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); 1289 1290 itemptr = precode_items; 1291 run_start = 0; 1292 do { 1293 /* Find the next run of codeword lengths. */ 1294 1295 /* len = the length being repeated */ 1296 len = lens[run_start]; 1297 1298 /* Extend the run. */ 1299 run_end = run_start; 1300 do { 1301 run_end++; 1302 } while (run_end != num_lens && len == lens[run_end]); 1303 1304 if (len == 0) { 1305 /* Run of zeroes. */ 1306 1307 /* Symbol 18: RLE 11 to 138 zeroes at a time. */ 1308 while ((run_end - run_start) >= 11) { 1309 extra_bits = MIN((run_end - run_start) - 11, 0x7F); 1310 precode_freqs[18]++; 1311 *itemptr++ = 18 | (extra_bits << 5); 1312 run_start += 11 + extra_bits; 1313 } 1314 1315 /* Symbol 17: RLE 3 to 10 zeroes at a time. */ 1316 if ((run_end - run_start) >= 3) { 1317 extra_bits = MIN((run_end - run_start) - 3, 0x7); 1318 precode_freqs[17]++; 1319 *itemptr++ = 17 | (extra_bits << 5); 1320 run_start += 3 + extra_bits; 1321 } 1322 } else { 1323 1324 /* A run of nonzero lengths. */ 1325 1326 /* Symbol 16: RLE 3 to 6 of the previous length. */ 1327 if ((run_end - run_start) >= 4) { 1328 precode_freqs[len]++; 1329 *itemptr++ = len; 1330 run_start++; 1331 do { 1332 extra_bits = MIN((run_end - run_start) - 3, 0x3); 1333 precode_freqs[16]++; 1334 *itemptr++ = 16 | (extra_bits << 5); 1335 run_start += 3 + extra_bits; 1336 } while ((run_end - run_start) >= 3); 1337 } 1338 } 1339 1340 /* Output any remaining lengths without RLE. */ 1341 while (run_start != run_end) { 1342 precode_freqs[len]++; 1343 *itemptr++ = len; 1344 run_start++; 1345 } 1346 } while (run_start != num_lens); 1347 1348 return itemptr - precode_items; 1349 } 1350 1351 /* 1352 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a 1353 * separate Huffman code, the "precode", which contains a symbol for each 1354 * possible codeword length in the larger code as well as several special 1355 * symbols to represent repeated codeword lengths (a form of run-length 1356 * encoding). The precode is itself constructed in canonical form, and its 1357 * codeword lengths are represented literally in 19 3-bit fields that 1358 * immediately precede the compressed codeword lengths of the larger code. 1359 */ 1360 1361 /* Precompute the information needed to output Huffman codes. */ 1362 static void 1363 deflate_precompute_huffman_header(struct libdeflate_compressor *c) 1364 { 1365 /* Compute how many litlen and offset symbols are needed. 
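 * DEFLATE always transmits at least 257 literal/length codeword lengths and
 * at least 1 offset codeword length, which is why the loops below never
 * reduce the counts past those minimums.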
*/ 1366 1367 for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; 1368 c->num_litlen_syms > 257; 1369 c->num_litlen_syms--) 1370 if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) 1371 break; 1372 1373 for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; 1374 c->num_offset_syms > 1; 1375 c->num_offset_syms--) 1376 if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) 1377 break; 1378 1379 /* If we're not using the full set of literal/length codeword lengths, 1380 * then temporarily move the offset codeword lengths over so that the 1381 * literal/length and offset codeword lengths are contiguous. */ 1382 1383 STATIC_ASSERT(offsetof(struct deflate_lens, offset) == 1384 DEFLATE_NUM_LITLEN_SYMS); 1385 1386 if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { 1387 memmove((u8 *)&c->codes.lens + c->num_litlen_syms, 1388 (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, 1389 c->num_offset_syms); 1390 } 1391 1392 /* Compute the "items" (RLE / literal tokens and extra bits) with which 1393 * the codeword lengths in the larger code will be output. */ 1394 c->num_precode_items = 1395 deflate_compute_precode_items((u8 *)&c->codes.lens, 1396 c->num_litlen_syms + 1397 c->num_offset_syms, 1398 c->precode_freqs, 1399 c->precode_items); 1400 1401 /* Build the precode. */ 1402 STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); 1403 deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, 1404 MAX_PRE_CODEWORD_LEN, 1405 c->precode_freqs, c->precode_lens, 1406 c->precode_codewords); 1407 1408 /* Count how many precode lengths we actually need to output. */ 1409 for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; 1410 c->num_explicit_lens > 4; 1411 c->num_explicit_lens--) 1412 if (c->precode_lens[deflate_precode_lens_permutation[ 1413 c->num_explicit_lens - 1]] != 0) 1414 break; 1415 1416 /* Restore the offset codeword lengths if needed. */ 1417 if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { 1418 memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, 1419 (u8 *)&c->codes.lens + c->num_litlen_syms, 1420 c->num_offset_syms); 1421 } 1422 } 1423 1424 /* Output the Huffman codes. */ 1425 static void 1426 deflate_write_huffman_header(struct libdeflate_compressor *c, 1427 struct deflate_output_bitstream *os) 1428 { 1429 unsigned i; 1430 1431 deflate_add_bits(os, c->num_litlen_syms - 257, 5); 1432 deflate_add_bits(os, c->num_offset_syms - 1, 5); 1433 deflate_add_bits(os, c->num_explicit_lens - 4, 4); 1434 deflate_flush_bits(os); 1435 1436 /* Output the lengths of the codewords in the precode. */ 1437 for (i = 0; i < c->num_explicit_lens; i++) { 1438 deflate_add_bits(os, c->precode_lens[ 1439 deflate_precode_lens_permutation[i]], 3); 1440 deflate_flush_bits(os); 1441 } 1442 1443 /* Output the encoded lengths of the codewords in the larger code. 
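 * Each precode item packs the precode symbol into its low 5 bits and the
 * associated extra-bits value above them; symbols 16, 17 and 18 are followed
 * by 2, 3 and 7 extra bits respectively, as the branches below emit.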
*/ 1444 for (i = 0; i < c->num_precode_items; i++) { 1445 unsigned precode_item = c->precode_items[i]; 1446 unsigned precode_sym = precode_item & 0x1F; 1447 deflate_add_bits(os, c->precode_codewords[precode_sym], 1448 c->precode_lens[precode_sym]); 1449 if (precode_sym >= 16) { 1450 if (precode_sym == 16) 1451 deflate_add_bits(os, precode_item >> 5, 2); 1452 else if (precode_sym == 17) 1453 deflate_add_bits(os, precode_item >> 5, 3); 1454 else 1455 deflate_add_bits(os, precode_item >> 5, 7); 1456 } 1457 STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7)); 1458 deflate_flush_bits(os); 1459 } 1460 } 1461 1462 static void 1463 deflate_write_sequences(struct deflate_output_bitstream * restrict os, 1464 const struct deflate_codes * restrict codes, 1465 const struct deflate_sequence sequences[restrict], 1466 const u8 * restrict in_next) 1467 { 1468 const struct deflate_sequence *seq = sequences; 1469 1470 for (;;) { 1471 u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; 1472 unsigned length = seq->litrunlen_and_length >> 23; 1473 unsigned length_slot; 1474 unsigned litlen_symbol; 1475 unsigned offset_symbol; 1476 1477 if (litrunlen) { 1478 #if 1 1479 while (litrunlen >= 4) { 1480 unsigned lit0 = in_next[0]; 1481 unsigned lit1 = in_next[1]; 1482 unsigned lit2 = in_next[2]; 1483 unsigned lit3 = in_next[3]; 1484 1485 deflate_add_bits(os, codes->codewords.litlen[lit0], 1486 codes->lens.litlen[lit0]); 1487 if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) 1488 deflate_flush_bits(os); 1489 1490 deflate_add_bits(os, codes->codewords.litlen[lit1], 1491 codes->lens.litlen[lit1]); 1492 if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) 1493 deflate_flush_bits(os); 1494 1495 deflate_add_bits(os, codes->codewords.litlen[lit2], 1496 codes->lens.litlen[lit2]); 1497 if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) 1498 deflate_flush_bits(os); 1499 1500 deflate_add_bits(os, codes->codewords.litlen[lit3], 1501 codes->lens.litlen[lit3]); 1502 deflate_flush_bits(os); 1503 in_next += 4; 1504 litrunlen -= 4; 1505 } 1506 if (litrunlen-- != 0) { 1507 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1508 codes->lens.litlen[*in_next]); 1509 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1510 deflate_flush_bits(os); 1511 in_next++; 1512 if (litrunlen-- != 0) { 1513 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1514 codes->lens.litlen[*in_next]); 1515 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1516 deflate_flush_bits(os); 1517 in_next++; 1518 if (litrunlen-- != 0) { 1519 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1520 codes->lens.litlen[*in_next]); 1521 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1522 deflate_flush_bits(os); 1523 in_next++; 1524 } 1525 } 1526 if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1527 deflate_flush_bits(os); 1528 } 1529 #else 1530 do { 1531 unsigned lit = *in_next++; 1532 deflate_add_bits(os, codes->codewords.litlen[lit], 1533 codes->lens.litlen[lit]); 1534 deflate_flush_bits(os); 1535 } while (--litrunlen); 1536 #endif 1537 } 1538 1539 if (length == 0) 1540 return; 1541 1542 in_next += length; 1543 1544 length_slot = seq->length_slot; 1545 litlen_symbol = 257 + length_slot; 1546 1547 /* Litlen symbol */ 1548 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1549 codes->lens.litlen[litlen_symbol]); 1550 1551 /* Extra length bits */ 1552 STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1553 DEFLATE_MAX_EXTRA_LENGTH_BITS)); 1554 deflate_add_bits(os, length - deflate_length_slot_base[length_slot], 1555 deflate_extra_length_bits[length_slot]); 1556 1557 if 
(!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1558 DEFLATE_MAX_EXTRA_LENGTH_BITS + 1559 MAX_OFFSET_CODEWORD_LEN + 1560 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1561 deflate_flush_bits(os); 1562 1563 /* Offset symbol */ 1564 offset_symbol = seq->offset_symbol; 1565 deflate_add_bits(os, codes->codewords.offset[offset_symbol], 1566 codes->lens.offset[offset_symbol]); 1567 1568 if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + 1569 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1570 deflate_flush_bits(os); 1571 1572 /* Extra offset bits */ 1573 deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], 1574 deflate_extra_offset_bits[offset_symbol]); 1575 1576 deflate_flush_bits(os); 1577 1578 seq++; 1579 } 1580 } 1581 1582 #if SUPPORT_NEAR_OPTIMAL_PARSING 1583 /* 1584 * Follow the minimum-cost path in the graph of possible match/literal choices 1585 * for the current block and write out the matches/literals using the specified 1586 * Huffman codes. 1587 * 1588 * Note: this is slightly duplicated with deflate_write_sequences(), the reason 1589 * being that we don't want to waste time translating between intermediate 1590 * match/literal representations. 1591 */ 1592 static void 1593 deflate_write_item_list(struct deflate_output_bitstream *os, 1594 const struct deflate_codes *codes, 1595 struct libdeflate_compressor *c, 1596 u32 block_length) 1597 { 1598 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; 1599 struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; 1600 do { 1601 unsigned length = cur_node->item & OPTIMUM_LEN_MASK; 1602 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; 1603 unsigned litlen_symbol; 1604 unsigned length_slot; 1605 unsigned offset_slot; 1606 1607 if (length == 1) { 1608 /* Literal */ 1609 litlen_symbol = offset; 1610 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1611 codes->lens.litlen[litlen_symbol]); 1612 deflate_flush_bits(os); 1613 } else { 1614 /* Match length */ 1615 length_slot = deflate_length_slot[length]; 1616 litlen_symbol = 257 + length_slot; 1617 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1618 codes->lens.litlen[litlen_symbol]); 1619 1620 deflate_add_bits(os, length - deflate_length_slot_base[length_slot], 1621 deflate_extra_length_bits[length_slot]); 1622 1623 if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1624 DEFLATE_MAX_EXTRA_LENGTH_BITS + 1625 MAX_OFFSET_CODEWORD_LEN + 1626 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1627 deflate_flush_bits(os); 1628 1629 1630 /* Match offset */ 1631 offset_slot = deflate_get_offset_slot(c, offset); 1632 deflate_add_bits(os, codes->codewords.offset[offset_slot], 1633 codes->lens.offset[offset_slot]); 1634 1635 if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + 1636 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1637 deflate_flush_bits(os); 1638 1639 deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], 1640 deflate_extra_offset_bits[offset_slot]); 1641 1642 deflate_flush_bits(os); 1643 } 1644 cur_node += length; 1645 } while (cur_node != end_node); 1646 } 1647 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 1648 1649 /* Output the end-of-block symbol. 
*/ 1650 static void 1651 deflate_write_end_of_block(struct deflate_output_bitstream *os, 1652 const struct deflate_codes *codes) 1653 { 1654 deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], 1655 codes->lens.litlen[DEFLATE_END_OF_BLOCK]); 1656 deflate_flush_bits(os); 1657 } 1658 1659 static void 1660 deflate_write_uncompressed_block(struct deflate_output_bitstream *os, 1661 const u8 *data, u16 len, 1662 bool is_final_block) 1663 { 1664 deflate_write_block_header(os, is_final_block, 1665 DEFLATE_BLOCKTYPE_UNCOMPRESSED); 1666 deflate_align_bitstream(os); 1667 1668 if (4 + (u32)len >= os->end - os->next) { 1669 os->next = os->end; 1670 return; 1671 } 1672 1673 put_unaligned_le16(len, os->next); 1674 os->next += 2; 1675 put_unaligned_le16(~len, os->next); 1676 os->next += 2; 1677 memcpy(os->next, data, len); 1678 os->next += len; 1679 } 1680 1681 static void 1682 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, 1683 const u8 *data, u32 data_length, 1684 bool is_final_block) 1685 { 1686 do { 1687 u16 len = MIN(data_length, UINT16_MAX); 1688 1689 deflate_write_uncompressed_block(os, data, len, 1690 is_final_block && len == data_length); 1691 data += len; 1692 data_length -= len; 1693 } while (data_length != 0); 1694 } 1695 1696 /* 1697 * Choose the best type of block to use (dynamic Huffman, static Huffman, or 1698 * uncompressed), then output it. 1699 */ 1700 static void 1701 deflate_flush_block(struct libdeflate_compressor * restrict c, 1702 struct deflate_output_bitstream * restrict os, 1703 const u8 * restrict block_begin, u32 block_length, 1704 bool is_final_block, bool use_item_list) 1705 { 1706 static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { 1707 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, 1708 }; 1709 1710 /* Costs are measured in bits */ 1711 u32 dynamic_cost = 0; 1712 u32 static_cost = 0; 1713 u32 uncompressed_cost = 0; 1714 struct deflate_codes *codes; 1715 int block_type; 1716 unsigned sym; 1717 1718 /* Tally the end-of-block symbol. */ 1719 c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; 1720 1721 /* Build dynamic Huffman codes. */ 1722 deflate_make_huffman_codes(&c->freqs, &c->codes); 1723 1724 /* Account for the cost of sending dynamic Huffman codes. */ 1725 deflate_precompute_huffman_header(c); 1726 dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); 1727 for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { 1728 u32 extra = deflate_extra_precode_bits[sym]; 1729 dynamic_cost += c->precode_freqs[sym] * 1730 (extra + c->precode_lens[sym]); 1731 } 1732 1733 /* Account for the cost of encoding literals. */ 1734 for (sym = 0; sym < 256; sym++) { 1735 dynamic_cost += c->freqs.litlen[sym] * 1736 c->codes.lens.litlen[sym]; 1737 } 1738 for (sym = 0; sym < 144; sym++) 1739 static_cost += c->freqs.litlen[sym] * 8; 1740 for (; sym < 256; sym++) 1741 static_cost += c->freqs.litlen[sym] * 9; 1742 1743 /* Account for the cost of encoding the end-of-block symbol. */ 1744 dynamic_cost += c->codes.lens.litlen[256]; 1745 static_cost += 7; 1746 1747 /* Account for the cost of encoding lengths. */ 1748 for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) { 1749 u32 extra = deflate_extra_length_bits[sym - 257]; 1750 dynamic_cost += c->freqs.litlen[sym] * 1751 (extra + c->codes.lens.litlen[sym]); 1752 static_cost += c->freqs.litlen[sym] * 1753 (extra + c->static_codes.lens.litlen[sym]); 1754 } 1755 1756 /* Account for the cost of encoding offsets. 
*/ 1757 for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { 1758 u32 extra = deflate_extra_offset_bits[sym]; 1759 dynamic_cost += c->freqs.offset[sym] * 1760 (extra + c->codes.lens.offset[sym]); 1761 static_cost += c->freqs.offset[sym] * (extra + 5); 1762 } 1763 1764 /* Compute the cost of using uncompressed blocks. */ 1765 uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + 1766 (40 * (DIV_ROUND_UP(block_length, 1767 UINT16_MAX) - 1)) + 1768 (8 * block_length); 1769 1770 /* Choose the cheapest block type. */ 1771 if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { 1772 block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; 1773 codes = &c->codes; 1774 } else if (static_cost < uncompressed_cost) { 1775 block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; 1776 codes = &c->static_codes; 1777 } else { 1778 block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; 1779 } 1780 1781 /* Now actually output the block. */ 1782 1783 if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { 1784 /* Note: the length being flushed may exceed the maximum length 1785 * of an uncompressed block (65535 bytes). Therefore, more than 1786 * one uncompressed block might be needed. */ 1787 deflate_write_uncompressed_blocks(os, block_begin, block_length, 1788 is_final_block); 1789 } else { 1790 /* Output the block header. */ 1791 deflate_write_block_header(os, is_final_block, block_type); 1792 1793 /* Output the Huffman codes (dynamic Huffman blocks only). */ 1794 if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) 1795 deflate_write_huffman_header(c, os); 1796 1797 /* Output the literals, matches, and end-of-block symbol. */ 1798 #if SUPPORT_NEAR_OPTIMAL_PARSING 1799 if (use_item_list) 1800 deflate_write_item_list(os, codes, c, block_length); 1801 else 1802 #endif 1803 deflate_write_sequences(os, codes, c->p.g.sequences, 1804 block_begin); 1805 deflate_write_end_of_block(os, codes); 1806 } 1807 } 1808 1809 static forceinline void 1810 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, 1811 u32 *litrunlen_p) 1812 { 1813 c->freqs.litlen[literal]++; 1814 ++*litrunlen_p; 1815 } 1816 1817 static forceinline void 1818 deflate_choose_match(struct libdeflate_compressor *c, 1819 unsigned length, unsigned offset, 1820 u32 *litrunlen_p, struct deflate_sequence **next_seq_p) 1821 { 1822 struct deflate_sequence *seq = *next_seq_p; 1823 unsigned length_slot = deflate_length_slot[length]; 1824 unsigned offset_slot = deflate_get_offset_slot(c, offset); 1825 1826 c->freqs.litlen[257 + length_slot]++; 1827 c->freqs.offset[offset_slot]++; 1828 1829 seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p; 1830 seq->offset = offset; 1831 seq->length_slot = length_slot; 1832 seq->offset_symbol = offset_slot; 1833 1834 *litrunlen_p = 0; 1835 *next_seq_p = seq + 1; 1836 } 1837 1838 static forceinline void 1839 deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen) 1840 { 1841 seq->litrunlen_and_length = litrunlen; /* length = 0 */ 1842 } 1843 1844 /******************************************************************************/ 1845 1846 /* 1847 * Block splitting algorithm. The problem is to decide when it is worthwhile to 1848 * start a new block with new Huffman codes. There is a theoretically optimal 1849 * solution: recursively consider every possible block split, considering the 1850 * exact cost of each block, and choose the minimum cost approach. But this is 1851 * far too slow. 
 * Instead, as an approximation, we can count symbols and, after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution. If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type". For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols. For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough. We also add a number
 * proportional to the block length so that the algorithm is more likely to end
 * long blocks than short blocks. This reflects the general expectation that it
 * will become increasingly beneficial to start a new block as the current
 * block grows longer.
 *
 * Finally, for an approximation, it is not strictly necessary that the exact
 * symbols being used are considered. With "near-optimal parsing", for example,
 * the actual symbols that will be used are unknown until after the block
 * boundary is chosen and the block has been optimized. Since the final choices
 * cannot be used, we can use preliminary "greedy" choices instead.
 */

/* Initialize the block split statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
        int i;

        for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
                stats->new_observations[i] = 0;
                stats->observations[i] = 0;
        }
        stats->num_new_observations = 0;
        stats->num_observations = 0;
}

/* Literal observation. Heuristic: use the top 2 bits and the lowest bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
        stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
        stats->num_new_observations++;
}

/* Match observation. Heuristic: use one observation type for "short match" and
 * one observation type for "long match". */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
        stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
        stats->num_new_observations++;
}

static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
        int i;

        if (stats->num_observations > 0) {

                /* Note: to avoid slow divisions, we do not divide by
                 * 'num_observations', but rather do all math with the numbers
                 * multiplied by 'num_observations'.
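                 *
                 * For example (hypothetical counts): with num_observations =
                 * 512 and num_new_observations = 512, an observation type seen
                 * 100 times in the old data but only 50 times in the new data
                 * contributes |50*512 - 100*512| = 25600 to 'total_delta'. The
                 * right-hand side of the check below works out to 200 * 512 =
                 * 102400, and the block-length term is added on the left, so
                 * the raw per-type differences must sum to at least
                 * 200 - block_length/4096 out of the 512 new observations for
                 * the block to end here.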
*/ 1924 u32 total_delta = 0; 1925 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { 1926 u32 expected = stats->observations[i] * stats->num_new_observations; 1927 u32 actual = stats->new_observations[i] * stats->num_observations; 1928 u32 delta = (actual > expected) ? actual - expected : 1929 expected - actual; 1930 total_delta += delta; 1931 } 1932 1933 /* Ready to end the block? */ 1934 if (total_delta + (block_length / 4096) * stats->num_observations >= 1935 NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations) 1936 return true; 1937 } 1938 1939 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { 1940 stats->num_observations += stats->new_observations[i]; 1941 stats->observations[i] += stats->new_observations[i]; 1942 stats->new_observations[i] = 0; 1943 } 1944 stats->num_new_observations = 0; 1945 return false; 1946 } 1947 1948 static forceinline bool 1949 should_end_block(struct block_split_stats *stats, 1950 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) 1951 { 1952 /* Ready to check block split statistics? */ 1953 if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || 1954 in_next - in_block_begin < MIN_BLOCK_LENGTH || 1955 in_end - in_next < MIN_BLOCK_LENGTH) 1956 return false; 1957 1958 return do_end_block_check(stats, in_next - in_block_begin); 1959 } 1960 1961 /******************************************************************************/ 1962 1963 /* 1964 * This is the "greedy" DEFLATE compressor. It always chooses the longest match. 1965 */ 1966 static size_t 1967 deflate_compress_greedy(struct libdeflate_compressor * restrict c, 1968 const u8 * restrict in, size_t in_nbytes, 1969 u8 * restrict out, size_t out_nbytes_avail) 1970 { 1971 const u8 *in_next = in; 1972 const u8 *in_end = in_next + in_nbytes; 1973 struct deflate_output_bitstream os; 1974 const u8 *in_cur_base = in_next; 1975 unsigned max_len = DEFLATE_MAX_MATCH_LEN; 1976 unsigned nice_len = MIN(c->nice_match_length, max_len); 1977 u32 next_hashes[2] = {0, 0}; 1978 1979 deflate_init_output(&os, out, out_nbytes_avail); 1980 hc_matchfinder_init(&c->p.g.hc_mf); 1981 1982 do { 1983 /* Starting a new DEFLATE block. */ 1984 1985 const u8 * const in_block_begin = in_next; 1986 const u8 * const in_max_block_end = 1987 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); 1988 u32 litrunlen = 0; 1989 struct deflate_sequence *next_seq = c->p.g.sequences; 1990 1991 init_block_split_stats(&c->split_stats); 1992 deflate_reset_symbol_frequencies(c); 1993 1994 do { 1995 u32 length; 1996 u32 offset; 1997 1998 /* Decrease the maximum and nice match lengths if we're 1999 * approaching the end of the input buffer. */ 2000 if (unlikely(max_len > in_end - in_next)) { 2001 max_len = in_end - in_next; 2002 nice_len = MIN(nice_len, max_len); 2003 } 2004 2005 length = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2006 &in_cur_base, 2007 in_next, 2008 DEFLATE_MIN_MATCH_LEN - 1, 2009 max_len, 2010 nice_len, 2011 c->max_search_depth, 2012 next_hashes, 2013 &offset); 2014 2015 if (length >= DEFLATE_MIN_MATCH_LEN) { 2016 /* Match found. */ 2017 deflate_choose_match(c, length, offset, 2018 &litrunlen, &next_seq); 2019 observe_match(&c->split_stats, length); 2020 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2021 &in_cur_base, 2022 in_next + 1, 2023 in_end, 2024 length - 1, 2025 next_hashes); 2026 } else { 2027 /* No match found. 
*/ 2028 deflate_choose_literal(c, *in_next, &litrunlen); 2029 observe_literal(&c->split_stats, *in_next); 2030 in_next++; 2031 } 2032 2033 /* Check if it's time to output another block. */ 2034 } while (in_next < in_max_block_end && 2035 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2036 2037 deflate_finish_sequence(next_seq, litrunlen); 2038 deflate_flush_block(c, &os, in_block_begin, 2039 in_next - in_block_begin, 2040 in_next == in_end, false); 2041 } while (in_next != in_end); 2042 2043 return deflate_flush_output(&os); 2044 } 2045 2046 /* 2047 * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to 2048 * see if there's a longer match at the next position. If yes, it outputs a 2049 * literal and continues to the next position. If no, it outputs the match. 2050 */ 2051 static size_t 2052 deflate_compress_lazy(struct libdeflate_compressor * restrict c, 2053 const u8 * restrict in, size_t in_nbytes, 2054 u8 * restrict out, size_t out_nbytes_avail) 2055 { 2056 const u8 *in_next = in; 2057 const u8 *in_end = in_next + in_nbytes; 2058 struct deflate_output_bitstream os; 2059 const u8 *in_cur_base = in_next; 2060 unsigned max_len = DEFLATE_MAX_MATCH_LEN; 2061 unsigned nice_len = MIN(c->nice_match_length, max_len); 2062 u32 next_hashes[2] = {0, 0}; 2063 2064 deflate_init_output(&os, out, out_nbytes_avail); 2065 hc_matchfinder_init(&c->p.g.hc_mf); 2066 2067 do { 2068 /* Starting a new DEFLATE block. */ 2069 2070 const u8 * const in_block_begin = in_next; 2071 const u8 * const in_max_block_end = 2072 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); 2073 u32 litrunlen = 0; 2074 struct deflate_sequence *next_seq = c->p.g.sequences; 2075 2076 init_block_split_stats(&c->split_stats); 2077 deflate_reset_symbol_frequencies(c); 2078 2079 do { 2080 unsigned cur_len; 2081 unsigned cur_offset; 2082 unsigned next_len; 2083 unsigned next_offset; 2084 2085 if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { 2086 max_len = in_end - in_next; 2087 nice_len = MIN(nice_len, max_len); 2088 } 2089 2090 /* Find the longest match at the current position. */ 2091 cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2092 &in_cur_base, 2093 in_next, 2094 DEFLATE_MIN_MATCH_LEN - 1, 2095 max_len, 2096 nice_len, 2097 c->max_search_depth, 2098 next_hashes, 2099 &cur_offset); 2100 in_next += 1; 2101 2102 if (cur_len < DEFLATE_MIN_MATCH_LEN) { 2103 /* No match found. Choose a literal. */ 2104 deflate_choose_literal(c, *(in_next - 1), &litrunlen); 2105 observe_literal(&c->split_stats, *(in_next - 1)); 2106 continue; 2107 } 2108 2109 have_cur_match: 2110 observe_match(&c->split_stats, cur_len); 2111 2112 /* We have a match at the current position. */ 2113 2114 /* If the current match is very long, choose it 2115 * immediately. */ 2116 if (cur_len >= nice_len) { 2117 deflate_choose_match(c, cur_len, cur_offset, 2118 &litrunlen, &next_seq); 2119 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2120 &in_cur_base, 2121 in_next, 2122 in_end, 2123 cur_len - 1, 2124 next_hashes); 2125 continue; 2126 } 2127 2128 /* 2129 * Try to find a match at the next position. 2130 * 2131 * Note: since we already have a match at the *current* 2132 * position, we use only half the 'max_search_depth' 2133 * when checking the *next* position. This is a useful 2134 * trade-off because it's more worthwhile to use a 2135 * greater search depth on the initial match. 
2136 * 2137 * Note: it's possible to structure the code such that 2138 * there's only one call to longest_match(), which 2139 * handles both the "find the initial match" and "try to 2140 * find a longer match" cases. However, it is faster to 2141 * have two call sites, with longest_match() inlined at 2142 * each. 2143 */ 2144 if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { 2145 max_len = in_end - in_next; 2146 nice_len = MIN(nice_len, max_len); 2147 } 2148 next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2149 &in_cur_base, 2150 in_next, 2151 cur_len, 2152 max_len, 2153 nice_len, 2154 c->max_search_depth / 2, 2155 next_hashes, 2156 &next_offset); 2157 in_next += 1; 2158 2159 if (next_len > cur_len) { 2160 /* Found a longer match at the next position. 2161 * Output a literal. Then the next match 2162 * becomes the current match. */ 2163 deflate_choose_literal(c, *(in_next - 2), &litrunlen); 2164 cur_len = next_len; 2165 cur_offset = next_offset; 2166 goto have_cur_match; 2167 } 2168 2169 /* No longer match at the next position. 2170 * Output the current match. */ 2171 deflate_choose_match(c, cur_len, cur_offset, 2172 &litrunlen, &next_seq); 2173 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2174 &in_cur_base, 2175 in_next, 2176 in_end, 2177 cur_len - 2, 2178 next_hashes); 2179 2180 /* Check if it's time to output another block. */ 2181 } while (in_next < in_max_block_end && 2182 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2183 2184 deflate_finish_sequence(next_seq, litrunlen); 2185 deflate_flush_block(c, &os, in_block_begin, 2186 in_next - in_block_begin, 2187 in_next == in_end, false); 2188 } while (in_next != in_end); 2189 2190 return deflate_flush_output(&os); 2191 } 2192 2193 #if SUPPORT_NEAR_OPTIMAL_PARSING 2194 2195 /* 2196 * Follow the minimum-cost path in the graph of possible match/literal choices 2197 * for the current block and compute the frequencies of the Huffman symbols that 2198 * would be needed to output those matches and literals. 2199 */ 2200 static void 2201 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) 2202 { 2203 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; 2204 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; 2205 do { 2206 unsigned length = cur_node->item & OPTIMUM_LEN_MASK; 2207 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; 2208 2209 if (length == 1) { 2210 /* Literal */ 2211 c->freqs.litlen[offset]++; 2212 } else { 2213 /* Match */ 2214 c->freqs.litlen[257 + deflate_length_slot[length]]++; 2215 c->freqs.offset[deflate_get_offset_slot(c, offset)]++; 2216 } 2217 cur_node += length; 2218 } while (cur_node != end_node); 2219 } 2220 2221 /* Set the current cost model from the codeword lengths specified in @lens. */ 2222 static void 2223 deflate_set_costs_from_codes(struct libdeflate_compressor *c, 2224 const struct deflate_lens *lens) 2225 { 2226 unsigned i; 2227 2228 /* Literals */ 2229 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { 2230 u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); 2231 c->p.n.costs.literal[i] = bits << COST_SHIFT; 2232 } 2233 2234 /* Lengths */ 2235 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { 2236 unsigned length_slot = deflate_length_slot[i]; 2237 unsigned litlen_sym = 257 + length_slot; 2238 u32 bits = (lens->litlen[litlen_sym] ? 
lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); 2239 bits += deflate_extra_length_bits[length_slot]; 2240 c->p.n.costs.length[i] = bits << COST_SHIFT; 2241 } 2242 2243 /* Offset slots */ 2244 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { 2245 u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); 2246 bits += deflate_extra_offset_bits[i]; 2247 c->p.n.costs.offset_slot[i] = bits << COST_SHIFT; 2248 } 2249 } 2250 2251 static forceinline u32 2252 deflate_default_literal_cost(unsigned literal) 2253 { 2254 STATIC_ASSERT(COST_SHIFT == 3); 2255 /* 66 is 8.25 bits/symbol */ 2256 return 66; 2257 } 2258 2259 static forceinline u32 2260 deflate_default_length_slot_cost(unsigned length_slot) 2261 { 2262 STATIC_ASSERT(COST_SHIFT == 3); 2263 /* 60 is 7.5 bits/symbol */ 2264 return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); 2265 } 2266 2267 static forceinline u32 2268 deflate_default_offset_slot_cost(unsigned offset_slot) 2269 { 2270 STATIC_ASSERT(COST_SHIFT == 3); 2271 /* 39 is 4.875 bits/symbol */ 2272 return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); 2273 } 2274 2275 /* 2276 * Set default symbol costs for the first block's first optimization pass. 2277 * 2278 * It works well to assume that each symbol is equally probable. This results 2279 * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << 2280 * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding 2281 * alphabet. However, we intentionally bias the parse towards matches rather 2282 * than literals by using a slightly lower default cost for length symbols than 2283 * for literals. This often improves the compression ratio slightly. 2284 */ 2285 static void 2286 deflate_set_default_costs(struct libdeflate_compressor *c) 2287 { 2288 unsigned i; 2289 2290 /* Literals */ 2291 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) 2292 c->p.n.costs.literal[i] = deflate_default_literal_cost(i); 2293 2294 /* Lengths */ 2295 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) 2296 c->p.n.costs.length[i] = deflate_default_length_slot_cost( 2297 deflate_length_slot[i]); 2298 2299 /* Offset slots */ 2300 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) 2301 c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); 2302 } 2303 2304 static forceinline void 2305 deflate_adjust_cost(u32 *cost_p, u32 default_cost) 2306 { 2307 *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; 2308 } 2309 2310 /* 2311 * Adjust the costs when beginning a new block. 2312 * 2313 * Since the current costs have been optimized for the data, it's undesirable to 2314 * throw them away and start over with the default costs. At the same time, we 2315 * don't want to bias the parse by assuming that the next block will be similar 2316 * to the current block. As a compromise, make the costs closer to the 2317 * defaults, but don't simply set them to the defaults. 
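 *
 * For example (illustrative numbers): a literal whose cost had converged to
 * 40 (5 bits at COST_SHIFT == 3), with a default cost of 66, is moved by
 * deflate_adjust_cost() to 40 + ((66 - 40) >> 1) = 53, i.e. roughly 6.6 bits.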
2318 */ 2319 static void 2320 deflate_adjust_costs(struct libdeflate_compressor *c) 2321 { 2322 unsigned i; 2323 2324 /* Literals */ 2325 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) 2326 deflate_adjust_cost(&c->p.n.costs.literal[i], 2327 deflate_default_literal_cost(i)); 2328 2329 /* Lengths */ 2330 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) 2331 deflate_adjust_cost(&c->p.n.costs.length[i], 2332 deflate_default_length_slot_cost( 2333 deflate_length_slot[i])); 2334 2335 /* Offset slots */ 2336 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) 2337 deflate_adjust_cost(&c->p.n.costs.offset_slot[i], 2338 deflate_default_offset_slot_cost(i)); 2339 } 2340 2341 /* 2342 * Find the minimum-cost path through the graph of possible match/literal 2343 * choices for this block. 2344 * 2345 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which 2346 * represents the node at the beginning of the block, to 2347 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of 2348 * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. 2349 * 2350 * The algorithm works backwards, starting at the end node and proceeding 2351 * backwards one node at a time. At each node, the minimum cost to reach the 2352 * end node is computed and the match/literal choice that begins that path is 2353 * saved. 2354 */ 2355 static void 2356 deflate_find_min_cost_path(struct libdeflate_compressor *c, 2357 const u32 block_length, 2358 const struct lz_match *cache_ptr) 2359 { 2360 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; 2361 struct deflate_optimum_node *cur_node = end_node; 2362 2363 cur_node->cost_to_end = 0; 2364 do { 2365 unsigned num_matches; 2366 unsigned literal; 2367 u32 best_cost_to_end; 2368 2369 cur_node--; 2370 cache_ptr--; 2371 2372 num_matches = cache_ptr->length; 2373 literal = cache_ptr->offset; 2374 2375 /* It's always possible to choose a literal. */ 2376 best_cost_to_end = c->p.n.costs.literal[literal] + 2377 (cur_node + 1)->cost_to_end; 2378 cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; 2379 2380 /* Also consider matches if there are any. */ 2381 if (num_matches) { 2382 const struct lz_match *match; 2383 unsigned len; 2384 unsigned offset; 2385 unsigned offset_slot; 2386 u32 offset_cost; 2387 u32 cost_to_end; 2388 2389 /* 2390 * Consider each length from the minimum 2391 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest 2392 * match found at this position. For each length, we 2393 * consider only the smallest offset for which that 2394 * length is available. Although this is not guaranteed 2395 * to be optimal due to the possibility of a larger 2396 * offset costing less than a smaller offset to code, 2397 * this is a very useful heuristic. 
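                         *
                         * For example (hypothetical matches): if the cache
                         * holds {length 6, offset 30} followed by {length 10,
                         * offset 100} for this position, then lengths 3..6 are
                         * costed using offset 30 and lengths 7..10 using
                         * offset 100 in the loop below.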
2398 */ 2399 match = cache_ptr - num_matches; 2400 len = DEFLATE_MIN_MATCH_LEN; 2401 do { 2402 offset = match->offset; 2403 offset_slot = deflate_get_offset_slot(c, offset); 2404 offset_cost = c->p.n.costs.offset_slot[offset_slot]; 2405 do { 2406 cost_to_end = offset_cost + 2407 c->p.n.costs.length[len] + 2408 (cur_node + len)->cost_to_end; 2409 if (cost_to_end < best_cost_to_end) { 2410 best_cost_to_end = cost_to_end; 2411 cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; 2412 } 2413 } while (++len <= match->length); 2414 } while (++match != cache_ptr); 2415 cache_ptr -= num_matches; 2416 } 2417 cur_node->cost_to_end = best_cost_to_end; 2418 } while (cur_node != &c->p.n.optimum_nodes[0]); 2419 } 2420 2421 /* 2422 * Choose the literal/match sequence to use for the current block. The basic 2423 * algorithm finds a minimum-cost path through the block's graph of 2424 * literal/match choices, given a cost model. However, the cost of each symbol 2425 * is unknown until the Huffman codes have been built, but at the same time the 2426 * Huffman codes depend on the frequencies of chosen symbols. Consequently, 2427 * multiple passes must be used to try to approximate an optimal solution. The 2428 * first pass uses default costs, mixed with the costs from the previous block 2429 * if any. Later passes use the Huffman codeword lengths from the previous pass 2430 * as the costs. 2431 */ 2432 static void 2433 deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length, 2434 const struct lz_match *cache_ptr, bool is_first_block) 2435 { 2436 unsigned num_passes_remaining = c->p.n.num_optim_passes; 2437 u32 i; 2438 2439 /* Force the block to really end at the desired length, even if some 2440 * matches extend beyond it. */ 2441 for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, 2442 ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) 2443 c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; 2444 2445 /* Set the initial costs. */ 2446 if (is_first_block) 2447 deflate_set_default_costs(c); 2448 else 2449 deflate_adjust_costs(c); 2450 2451 for (;;) { 2452 /* Find the minimum cost path for this pass. */ 2453 deflate_find_min_cost_path(c, block_length, cache_ptr); 2454 2455 /* Compute frequencies of the chosen symbols. */ 2456 deflate_reset_symbol_frequencies(c); 2457 deflate_tally_item_list(c, block_length); 2458 2459 if (--num_passes_remaining == 0) 2460 break; 2461 2462 /* At least one optimization pass remains; update the costs. */ 2463 deflate_make_huffman_codes(&c->freqs, &c->codes); 2464 deflate_set_costs_from_codes(c, &c->codes.lens); 2465 } 2466 } 2467 2468 /* 2469 * This is the "near-optimal" DEFLATE compressor. It computes the optimal 2470 * representation of each DEFLATE block using a minimum-cost path search over 2471 * the graph of possible match/literal choices for that block, assuming a 2472 * certain cost for each Huffman symbol. 
 *
 * For several reasons, the end result is not guaranteed to be optimal:
 *
 * - Nonoptimal choice of blocks
 * - Heuristic limitations on which matches are actually considered
 * - Symbol costs are unknown until the symbols have already been chosen
 *   (so iterative optimization must be used)
 */
static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
                              const u8 * restrict in, size_t in_nbytes,
                              u8 * restrict out, size_t out_nbytes_avail)
{
        const u8 *in_next = in;
        const u8 *in_end = in_next + in_nbytes;
        struct deflate_output_bitstream os;
        const u8 *in_cur_base = in_next;
        const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
        unsigned max_len = DEFLATE_MAX_MATCH_LEN;
        unsigned nice_len = MIN(c->nice_match_length, max_len);
        u32 next_hashes[2] = {0, 0};

        deflate_init_output(&os, out, out_nbytes_avail);
        bt_matchfinder_init(&c->p.n.bt_mf);

        do {
                /* Starting a new DEFLATE block. */

                struct lz_match *cache_ptr = c->p.n.match_cache;
                const u8 * const in_block_begin = in_next;
                const u8 * const in_max_block_end =
                        in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
                const u8 *next_observation = in_next;

                init_block_split_stats(&c->split_stats);

                /*
                 * Find matches until we decide to end the block. We end the
                 * block if any of the following is true:
                 *
                 * (1) Maximum block length has been reached.
                 * (2) Match cache may overflow.
                 * (3) Block split heuristic says to split now.
                 */
                do {
                        struct lz_match *matches;
                        unsigned best_len;

                        /* Slide the window forward if needed. */
                        if (in_next == in_next_slide) {
                                bt_matchfinder_slide_window(&c->p.n.bt_mf);
                                in_cur_base = in_next;
                                in_next_slide = in_next + MIN(in_end - in_next,
                                                              MATCHFINDER_WINDOW_SIZE);
                        }

                        /* Decrease the maximum and nice match lengths if we're
                         * approaching the end of the input buffer. */
                        if (unlikely(max_len > in_end - in_next)) {
                                max_len = in_end - in_next;
                                nice_len = MIN(nice_len, max_len);
                        }

                        /*
                         * Find matches at the current position using the
                         * binary tree matchfinder and save them in
                         * 'match_cache'.
                         *
                         * Note: the binary tree matchfinder is more suited for
                         * optimal parsing than the hash chain matchfinder. The
                         * reasons for this include:
                         *
                         * - The binary tree matchfinder can find more matches
                         *   in the same number of steps.
                         * - One of the major advantages of hash chains is that
                         *   skipping positions (not searching for matches at
                         *   them) is faster; however, with optimal parsing we
                         *   search for matches at almost all positions, so this
                         *   advantage of hash chains is negated.
2552 */ 2553 matches = cache_ptr; 2554 best_len = 0; 2555 if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { 2556 cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf, 2557 in_cur_base, 2558 in_next - in_cur_base, 2559 max_len, 2560 nice_len, 2561 c->max_search_depth, 2562 next_hashes, 2563 &best_len, 2564 matches); 2565 } 2566 2567 if (in_next >= next_observation) { 2568 if (best_len >= 4) { 2569 observe_match(&c->split_stats, best_len); 2570 next_observation = in_next + best_len; 2571 } else { 2572 observe_literal(&c->split_stats, *in_next); 2573 next_observation = in_next + 1; 2574 } 2575 } 2576 2577 cache_ptr->length = cache_ptr - matches; 2578 cache_ptr->offset = *in_next; 2579 in_next++; 2580 cache_ptr++; 2581 2582 /* 2583 * If there was a very long match found, don't cache any 2584 * matches for the bytes covered by that match. This 2585 * avoids degenerate behavior when compressing highly 2586 * redundant data, where the number of matches can be 2587 * very large. 2588 * 2589 * This heuristic doesn't actually hurt the compression 2590 * ratio very much. If there's a long match, then the 2591 * data must be highly compressible, so it doesn't 2592 * matter much what we do. 2593 */ 2594 if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) { 2595 --best_len; 2596 do { 2597 if (in_next == in_next_slide) { 2598 bt_matchfinder_slide_window(&c->p.n.bt_mf); 2599 in_cur_base = in_next; 2600 in_next_slide = in_next + MIN(in_end - in_next, 2601 MATCHFINDER_WINDOW_SIZE); 2602 } 2603 if (unlikely(max_len > in_end - in_next)) { 2604 max_len = in_end - in_next; 2605 nice_len = MIN(nice_len, max_len); 2606 } 2607 if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { 2608 bt_matchfinder_skip_position(&c->p.n.bt_mf, 2609 in_cur_base, 2610 in_next - in_cur_base, 2611 nice_len, 2612 c->max_search_depth, 2613 next_hashes); 2614 } 2615 cache_ptr->length = 0; 2616 cache_ptr->offset = *in_next; 2617 in_next++; 2618 cache_ptr++; 2619 } while (--best_len); 2620 } 2621 } while (in_next < in_max_block_end && 2622 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] && 2623 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2624 2625 /* All the matches for this block have been cached. Now choose 2626 * the sequence of items to output and flush the block. */ 2627 deflate_optimize_block(c, in_next - in_block_begin, cache_ptr, 2628 in_block_begin == in); 2629 deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin, 2630 in_next == in_end, true); 2631 } while (in_next != in_end); 2632 2633 return deflate_flush_output(&os); 2634 } 2635 2636 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 2637 2638 /* Initialize c->offset_slot_fast. 
*/ 2639 static void 2640 deflate_init_offset_slot_fast(struct libdeflate_compressor *c) 2641 { 2642 unsigned offset_slot; 2643 unsigned offset; 2644 unsigned offset_end; 2645 2646 for (offset_slot = 0; 2647 offset_slot < ARRAY_LEN(deflate_offset_slot_base); 2648 offset_slot++) 2649 { 2650 offset = deflate_offset_slot_base[offset_slot]; 2651 #if USE_FULL_OFFSET_SLOT_FAST 2652 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2653 do { 2654 c->offset_slot_fast[offset] = offset_slot; 2655 } while (++offset != offset_end); 2656 #else 2657 if (offset <= 256) { 2658 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2659 do { 2660 c->offset_slot_fast[offset - 1] = offset_slot; 2661 } while (++offset != offset_end); 2662 } else { 2663 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2664 do { 2665 c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; 2666 } while ((offset += (1 << 7)) != offset_end); 2667 } 2668 #endif 2669 } 2670 } 2671 2672 LIBDEFLATEAPI struct libdeflate_compressor * 2673 libdeflate_alloc_compressor(int compression_level) 2674 { 2675 struct libdeflate_compressor *c; 2676 size_t size; 2677 2678 #if SUPPORT_NEAR_OPTIMAL_PARSING 2679 if (compression_level >= 8) 2680 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n); 2681 else 2682 #endif 2683 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g); 2684 2685 c = aligned_malloc(MATCHFINDER_ALIGNMENT, size); 2686 if (!c) 2687 return NULL; 2688 2689 switch (compression_level) { 2690 case 1: 2691 c->impl = deflate_compress_greedy; 2692 c->max_search_depth = 2; 2693 c->nice_match_length = 8; 2694 break; 2695 case 2: 2696 c->impl = deflate_compress_greedy; 2697 c->max_search_depth = 6; 2698 c->nice_match_length = 10; 2699 break; 2700 case 3: 2701 c->impl = deflate_compress_greedy; 2702 c->max_search_depth = 12; 2703 c->nice_match_length = 14; 2704 break; 2705 case 4: 2706 c->impl = deflate_compress_greedy; 2707 c->max_search_depth = 24; 2708 c->nice_match_length = 24; 2709 break; 2710 case 5: 2711 c->impl = deflate_compress_lazy; 2712 c->max_search_depth = 20; 2713 c->nice_match_length = 30; 2714 break; 2715 case 6: 2716 c->impl = deflate_compress_lazy; 2717 c->max_search_depth = 40; 2718 c->nice_match_length = 65; 2719 break; 2720 case 7: 2721 c->impl = deflate_compress_lazy; 2722 c->max_search_depth = 100; 2723 c->nice_match_length = 130; 2724 break; 2725 #if SUPPORT_NEAR_OPTIMAL_PARSING 2726 case 8: 2727 c->impl = deflate_compress_near_optimal; 2728 c->max_search_depth = 12; 2729 c->nice_match_length = 20; 2730 c->p.n.num_optim_passes = 1; 2731 break; 2732 case 9: 2733 c->impl = deflate_compress_near_optimal; 2734 c->max_search_depth = 16; 2735 c->nice_match_length = 26; 2736 c->p.n.num_optim_passes = 2; 2737 break; 2738 case 10: 2739 c->impl = deflate_compress_near_optimal; 2740 c->max_search_depth = 30; 2741 c->nice_match_length = 50; 2742 c->p.n.num_optim_passes = 2; 2743 break; 2744 case 11: 2745 c->impl = deflate_compress_near_optimal; 2746 c->max_search_depth = 60; 2747 c->nice_match_length = 80; 2748 c->p.n.num_optim_passes = 3; 2749 break; 2750 case 12: 2751 c->impl = deflate_compress_near_optimal; 2752 c->max_search_depth = 100; 2753 c->nice_match_length = 133; 2754 c->p.n.num_optim_passes = 4; 2755 break; 2756 #else 2757 case 8: 2758 c->impl = deflate_compress_lazy; 2759 c->max_search_depth = 150; 2760 c->nice_match_length = 200; 2761 break; 2762 case 9: 2763 c->impl = deflate_compress_lazy; 2764 c->max_search_depth = 200; 2765 
c->nice_match_length = DEFLATE_MAX_MATCH_LEN; 2766 break; 2767 #endif 2768 default: 2769 aligned_free(c); 2770 return NULL; 2771 } 2772 2773 c->compression_level = compression_level; 2774 2775 deflate_init_offset_slot_fast(c); 2776 deflate_init_static_codes(c); 2777 2778 return c; 2779 } 2780 2781 LIBDEFLATEAPI size_t 2782 libdeflate_deflate_compress(struct libdeflate_compressor *c, 2783 const void *in, size_t in_nbytes, 2784 void *out, size_t out_nbytes_avail) 2785 { 2786 if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) 2787 return 0; 2788 2789 /* For extremely small inputs just use a single uncompressed block. */ 2790 if (unlikely(in_nbytes < 16)) { 2791 struct deflate_output_bitstream os; 2792 deflate_init_output(&os, out, out_nbytes_avail); 2793 if (in_nbytes == 0) 2794 in = &os; /* Avoid passing NULL to memcpy() */ 2795 deflate_write_uncompressed_block(&os, in, in_nbytes, true); 2796 return deflate_flush_output(&os); 2797 } 2798 2799 return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); 2800 } 2801 2802 LIBDEFLATEAPI void 2803 libdeflate_free_compressor(struct libdeflate_compressor *c) 2804 { 2805 aligned_free(c); 2806 } 2807 2808 unsigned int 2809 deflate_get_compression_level(struct libdeflate_compressor *c) 2810 { 2811 return c->compression_level; 2812 } 2813 2814 LIBDEFLATEAPI size_t 2815 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, 2816 size_t in_nbytes) 2817 { 2818 /* 2819 * The worst case is all uncompressed blocks where one block has length 2820 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. 2821 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, 2822 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. 2823 */ 2824 size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); 2825 return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING; 2826 }
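
/*
 * Illustrative usage sketch (not part of this file; names other than the
 * libdeflate_* functions are hypothetical). A typical caller sizes the output
 * buffer with libdeflate_deflate_compress_bound() so that compression cannot
 * run out of space:
 *
 *	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
 *	if (c != NULL) {
 *		size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *		void *out = malloc(bound);
 *		size_t out_nbytes = 0;
 *		if (out != NULL)
 *			out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
 *								 out, bound);
 *		...the first 'out_nbytes' bytes of 'out' are the DEFLATE stream;
 *		a return value of 0 would mean the output did not fit, which
 *		should not occur with a 'bound'-sized buffer...
 *		libdeflate_free_compressor(c);
 *	}
 */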