github.com/Schaudge/grailbase@v0.0.0-20240223061707-44c758a471c0/compress/libdeflate/deflate_compress.c 1 /* 2 * deflate_compress.c - a compressor for DEFLATE 3 * 4 * Originally public domain; changes after 2016-09-07 are copyrighted. 5 * 6 * Copyright 2016 Eric Biggers 7 * 8 * Permission is hereby granted, free of charge, to any person 9 * obtaining a copy of this software and associated documentation 10 * files (the "Software"), to deal in the Software without 11 * restriction, including without limitation the rights to use, 12 * copy, modify, merge, publish, distribute, sublicense, and/or sell 13 * copies of the Software, and to permit persons to whom the 14 * Software is furnished to do so, subject to the following 15 * conditions: 16 * 17 * The above copyright notice and this permission notice shall be 18 * included in all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27 * OTHER DEALINGS IN THE SOFTWARE. 28 */ 29 30 #include <stdlib.h> 31 #include <string.h> 32 33 #include "aligned_malloc.h" 34 #include "deflate_compress.h" 35 #include "deflate_constants.h" 36 #include "unaligned.h" 37 38 #include "libdeflate.h" 39 40 /* 41 * By default, the near-optimal parsing algorithm is enabled at compression 42 * level 8 and above. The near-optimal parsing algorithm produces a compression 43 * ratio significantly better than the greedy and lazy algorithms implemented 44 * here, and also the algorithm used by zlib at level 9. However, it is slow. 45 */ 46 #define SUPPORT_NEAR_OPTIMAL_PARSING 1 47 48 /* 49 * Define to 1 to maintain the full map from match offsets to offset slots. 50 * This slightly speeds up translations of match offsets to offset slots, but it 51 * uses 32769 bytes of memory rather than the 512 bytes used by the condensed 52 * map. The speedup provided by the larger map is most helpful when the 53 * near-optimal parsing algorithm is being used. 54 */ 55 #define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING 56 57 /* 58 * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters 59 * appropriately. 60 */ 61 #define MATCHFINDER_WINDOW_ORDER 15 62 63 #include "hc_matchfinder.h" 64 #if SUPPORT_NEAR_OPTIMAL_PARSING 65 # include "bt_matchfinder.h" 66 #endif 67 68 /* 69 * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, 70 * except if the last block has to be shorter. 71 */ 72 #define MIN_BLOCK_LENGTH 10000 73 74 /* 75 * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but 76 * the final length might be slightly longer due to matches extending beyond 77 * this limit. 78 */ 79 #define SOFT_MAX_BLOCK_LENGTH 300000 80 81 /* 82 * The number of observed matches or literals that represents sufficient data to 83 * decide whether the current block should be terminated or not. 84 */ 85 #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 86 87 88 #if SUPPORT_NEAR_OPTIMAL_PARSING 89 /* Constants specific to the near-optimal parsing algorithm */ 90 91 /* 92 * The maximum number of matches the matchfinder can find at a single position.
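 * (For illustration: with DEFLATE_MIN_MATCH_LEN == 3 and DEFLATE_MAX_MATCH_LEN == 258,
 * the bound defined below works out to 258 - 3 + 1 == 256 matches.)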
93 * Since the matchfinder never finds more than one match for the same length, 94 * presuming one of each possible length is sufficient for an upper bound. 95 * (This says nothing about whether it is worthwhile to consider so many 96 * matches; this is just defining the worst case.) 97 */ 98 # define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) 99 100 /* 101 * The number of lz_match structures in the match cache, excluding the extra 102 * "overflow" entries. This value should be high enough so that nearly the 103 * time, all matches found in a given block can fit in the match cache. 104 * However, fallback behavior (immediately terminating the block) on cache 105 * overflow is still required. 106 */ 107 # define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) 108 109 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 110 111 /* 112 * These are the compressor-side limits on the codeword lengths for each Huffman 113 * code. To make outputting bits slightly faster, some of these limits are 114 * lower than the limits defined by the DEFLATE format. This does not 115 * significantly affect the compression ratio, at least for the block lengths we 116 * use. 117 */ 118 #define MAX_LITLEN_CODEWORD_LEN 14 119 #define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN 120 #define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN 121 122 /* Table: length slot => length slot base value */ 123 static const unsigned deflate_length_slot_base[] = { 124 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 125 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , 126 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , 127 131 , 163 , 195 , 227 , 258 , 128 }; 129 130 /* Table: length slot => number of extra length bits */ 131 static const u8 deflate_extra_length_bits[] = { 132 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 133 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 134 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , 135 5 , 5 , 5 , 5 , 0 , 136 }; 137 138 /* Table: offset slot => offset slot base value */ 139 static const unsigned deflate_offset_slot_base[] = { 140 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , 141 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , 142 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , 143 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , 144 }; 145 146 /* Table: offset slot => number of extra offset bits */ 147 static const u8 deflate_extra_offset_bits[] = { 148 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , 149 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , 150 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , 151 11 , 11 , 12 , 12 , 13 , 13 , 152 }; 153 154 /* Table: length => length slot */ 155 static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { 156 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 157 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 158 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 159 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 160 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 161 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 162 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 163 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 164 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 165 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 166 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 167 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 168 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 
27, 27, 27, 27, 27, 27, 169 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 170 27, 27, 28, 171 }; 172 173 /* The order in which precode codeword lengths are stored */ 174 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 175 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 176 }; 177 178 /* Codewords for the DEFLATE Huffman codes. */ 179 struct deflate_codewords { 180 u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; 181 u32 offset[DEFLATE_NUM_OFFSET_SYMS]; 182 }; 183 184 /* Codeword lengths (in bits) for the DEFLATE Huffman codes. 185 * A zero length means the corresponding symbol had zero frequency. */ 186 struct deflate_lens { 187 u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; 188 u8 offset[DEFLATE_NUM_OFFSET_SYMS]; 189 }; 190 191 /* Codewords and lengths for the DEFLATE Huffman codes. */ 192 struct deflate_codes { 193 struct deflate_codewords codewords; 194 struct deflate_lens lens; 195 }; 196 197 /* Symbol frequency counters for the DEFLATE Huffman codes. */ 198 struct deflate_freqs { 199 u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; 200 u32 offset[DEFLATE_NUM_OFFSET_SYMS]; 201 }; 202 203 #if SUPPORT_NEAR_OPTIMAL_PARSING 204 205 /* Costs for the near-optimal parsing algorithm. */ 206 struct deflate_costs { 207 208 /* The cost to output each possible literal. */ 209 u32 literal[DEFLATE_NUM_LITERALS]; 210 211 /* The cost to output each possible match length. */ 212 u32 length[DEFLATE_MAX_MATCH_LEN + 1]; 213 214 /* The cost to output a match offset of each possible offset slot. */ 215 u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; 216 }; 217 218 /* 219 * COST_SHIFT is a scaling factor that makes it possible to consider fractional 220 * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. 221 * 222 * Note: this is only useful as a statistical trick for when the true costs are 223 * unknown. In reality, each token in DEFLATE requires a whole number of bits 224 * to output. 225 */ 226 #define COST_SHIFT 3 227 228 /* 229 * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to 230 * be needed to output a symbol that was unused in the previous optimization 231 * pass. Assigning a default cost allows the symbol to be used in the next 232 * optimization pass. However, the cost should be relatively high because the 233 * symbol probably won't be used very many times (if at all). 234 */ 235 #define LITERAL_NOSTAT_BITS 13 236 #define LENGTH_NOSTAT_BITS 13 237 #define OFFSET_NOSTAT_BITS 10 238 239 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 240 241 /* 242 * Represents a run of literals followed by a match or end-of-block. This 243 * struct is needed to temporarily store items chosen by the parser, since items 244 * cannot be written until all items for the block have been chosen and the 245 * block's Huffman codes have been computed. 246 */ 247 struct deflate_sequence { 248 249 /* Bits 0..22: the number of literals in this run. This may be 0 and 250 * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not 251 * stored explicitly in this structure; instead, they are read directly 252 * from the uncompressed data. 253 * 254 * Bits 23..31: the length of the match which follows the literals, or 0 255 * if this literal run was the last in the block, so there is no match 256 * which follows it. */ 257 u32 litrunlen_and_length; 258 259 /* If 'length' doesn't indicate end-of-block, then this is the offset of 260 * the match which follows the literals. 
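 * DEFLATE match offsets range from 1 to 32768, so a u16 is wide enough to
 * hold this field.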
*/ 261 u16 offset; 262 263 /* If 'length' doesn't indicate end-of-block, then this is the offset 264 * symbol of the match which follows the literals. */ 265 u8 offset_symbol; 266 267 /* If 'length' doesn't indicate end-of-block, then this is the length 268 * slot of the match which follows the literals. */ 269 u8 length_slot; 270 }; 271 272 #if SUPPORT_NEAR_OPTIMAL_PARSING 273 274 /* 275 * This structure represents a byte position in the input data and a node in the 276 * graph of possible match/literal choices for the current block. 277 * 278 * Logically, each incoming edge to this node is labeled with a literal or a 279 * match that can be taken to reach this position from an earlier position; and 280 * each outgoing edge from this node is labeled with a literal or a match that 281 * can be taken to advance from this position to a later position. 282 * 283 * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we 284 * associate with each node just two pieces of information: 285 * 286 * 'cost_to_end' is the minimum cost to reach the end of the block from 287 * this position. 288 * 289 * 'item' represents the literal or match that must be chosen from here to 290 * reach the end of the block with the minimum cost. Equivalently, this 291 * can be interpreted as the label of the outgoing edge on the minimum-cost 292 * path to the "end of block" node from this node. 293 */ 294 struct deflate_optimum_node { 295 296 u32 cost_to_end; 297 298 /* 299 * Notes on the match/literal representation used here: 300 * 301 * The low bits of 'item' are the length: 1 if this is a literal, 302 * or the match length if this is a match. 303 * 304 * The high bits of 'item' are the actual literal byte if this is a 305 * literal, or the match offset if this is a match. 306 */ 307 #define OPTIMUM_OFFSET_SHIFT 9 308 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) 309 u32 item; 310 311 }; 312 313 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 314 315 /* Block split statistics. See "Block splitting algorithm" below. */ 316 #define NUM_LITERAL_OBSERVATION_TYPES 8 317 #define NUM_MATCH_OBSERVATION_TYPES 2 318 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) 319 struct block_split_stats { 320 u32 new_observations[NUM_OBSERVATION_TYPES]; 321 u32 observations[NUM_OBSERVATION_TYPES]; 322 u32 num_new_observations; 323 u32 num_observations; 324 }; 325 326 /* The main DEFLATE compressor structure */ 327 struct libdeflate_compressor { 328 329 /* Pointer to the compress() implementation chosen at allocation time */ 330 size_t (*impl)(struct libdeflate_compressor *, 331 const u8 *, size_t, u8 *, size_t); 332 333 /* Frequency counters for the current block */ 334 struct deflate_freqs freqs; 335 336 /* Dynamic Huffman codes for the current block */ 337 struct deflate_codes codes; 338 339 /* Static Huffman codes */ 340 struct deflate_codes static_codes; 341 342 /* Block split statistics for the currently pending block */ 343 struct block_split_stats split_stats; 344 345 /* A table for fast lookups of offset slot by match offset. 346 * 347 * If the full table is being used, it is a direct mapping from offset 348 * to offset slot. 349 * 350 * If the condensed table is being used, the first 256 entries map 351 * directly to the offset slots of offsets 1 through 256. The next 256 352 * entries map to the offset slots for the remaining offsets, stepping 353 * through the offsets with a stride of 128. 
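 * For illustration, an offset of 1000 is looked up at index
 * 256 + ((1000 - 1) >> 7) == 263, which is how deflate_get_offset_slot()
 * indexes the condensed table below.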
This relies on the fact 354 * that each of the remaining offset slots contains at least 128 offsets 355 * and has an offset base that is a multiple of 128. */ 356 #if USE_FULL_OFFSET_SLOT_FAST 357 u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; 358 #else 359 u8 offset_slot_fast[512]; 360 #endif 361 362 /* The "nice" match length: if a match of this length is found, choose 363 * it immediately without further consideration. */ 364 unsigned nice_match_length; 365 366 /* The maximum search depth: consider at most this many potential 367 * matches at each position. */ 368 unsigned max_search_depth; 369 370 /* The compression level with which this compressor was created. */ 371 unsigned compression_level; 372 373 /* Temporary space for Huffman code output */ 374 u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; 375 u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; 376 u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; 377 unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; 378 unsigned num_litlen_syms; 379 unsigned num_offset_syms; 380 unsigned num_explicit_lens; 381 unsigned num_precode_items; 382 383 union { 384 /* Data for greedy or lazy parsing */ 385 struct { 386 /* Hash chain matchfinder */ 387 struct hc_matchfinder hc_mf; 388 389 /* The matches and literals that the parser has chosen 390 * for the current block. The required length of this 391 * array is limited by the maximum number of matches 392 * that can ever be chosen for a single block, plus one 393 * for the special entry at the end. */ 394 struct deflate_sequence sequences[ 395 DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, 396 DEFLATE_MIN_MATCH_LEN) + 1]; 397 } g; /* (g)reedy */ 398 399 #if SUPPORT_NEAR_OPTIMAL_PARSING 400 /* Data for near-optimal parsing */ 401 struct { 402 403 /* Binary tree matchfinder */ 404 struct bt_matchfinder bt_mf; 405 406 /* 407 * Cached matches for the current block. This array 408 * contains the matches that were found at each position 409 * in the block. Specifically, for each position, there 410 * is a list of matches found at that position, if any, 411 * sorted by strictly increasing length. In addition, 412 * following the matches for each position, there is a 413 * special 'struct lz_match' whose 'length' member 414 * contains the number of matches found at that 415 * position, and whose 'offset' member contains the 416 * literal at that position. 417 * 418 * Note: in rare cases, there will be a very high number 419 * of matches in the block and this array will overflow. 420 * If this happens, we force the end of the current 421 * block. CACHE_LENGTH is the length at which we 422 * actually check for overflow. The extra slots beyond 423 * this are enough to absorb the worst case overflow, 424 * which occurs if starting at &match_cache[CACHE_LENGTH 425 * - 1], we write MAX_MATCHES_PER_POS matches and a 426 * match count header, then skip searching for matches 427 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write 428 * the match count header for each. 429 */ 430 struct lz_match match_cache[CACHE_LENGTH + 431 MAX_MATCHES_PER_POS + 432 DEFLATE_MAX_MATCH_LEN - 1]; 433 434 /* 435 * Array of nodes, one per position, for running the 436 * minimum-cost path algorithm. 437 * 438 * This array must be large enough to accommodate the 439 * worst-case number of nodes, which occurs if we find a 440 * match of length DEFLATE_MAX_MATCH_LEN at position 441 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of 442 * length SOFT_MAX_BLOCK_LENGTH - 1 + 443 * DEFLATE_MAX_MATCH_LEN. 
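 * (Numerically, that worst case is 300000 - 1 + 258 == 300257 positions.)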
Add one for the end-of-block 444 * node. 445 */ 446 struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + 447 DEFLATE_MAX_MATCH_LEN + 1]; 448 449 /* The current cost model being used. */ 450 struct deflate_costs costs; 451 452 unsigned num_optim_passes; 453 } n; /* (n)ear-optimal */ 454 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 455 456 } p; /* (p)arser */ 457 }; 458 459 /* 460 * The type for the bitbuffer variable, which temporarily holds bits that are 461 * being packed into bytes and written to the output buffer. For best 462 * performance, this should have size equal to a machine word. 463 */ 464 typedef machine_word_t bitbuf_t; 465 #define BITBUF_NBITS (8 * sizeof(bitbuf_t)) 466 467 /* Can the specified number of bits always be added to 'bitbuf' after any 468 * pending bytes have been flushed? */ 469 #define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) 470 471 /* 472 * Structure to keep track of the current state of sending bits to the 473 * compressed output buffer. 474 */ 475 struct deflate_output_bitstream { 476 477 /* Bits that haven't yet been written to the output buffer. */ 478 bitbuf_t bitbuf; 479 480 /* Number of bits currently held in @bitbuf. */ 481 unsigned bitcount; 482 483 /* Pointer to the beginning of the output buffer. */ 484 u8 *begin; 485 486 /* Pointer to the position in the output buffer at which the next byte 487 * should be written. */ 488 u8 *next; 489 490 /* Pointer just past the end of the output buffer. */ 491 u8 *end; 492 }; 493 494 /* 495 * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be 496 * present following os->end, in order to not overrun the buffer when generating 497 * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) 498 * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However, 499 * to make the compression algorithm produce the same result on all CPU 500 * architectures (which is sometimes desirable), we have to unconditionally use 501 * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. 502 */ 503 #define OUTPUT_END_PADDING 8 504 505 /* Initialize the output bitstream. 'size' is assumed to be at least 506 * OUTPUT_END_PADDING. */ 507 static void 508 deflate_init_output(struct deflate_output_bitstream *os, 509 void *buffer, size_t size) 510 { 511 os->bitbuf = 0; 512 os->bitcount = 0; 513 os->begin = buffer; 514 os->next = os->begin; 515 os->end = os->begin + size - OUTPUT_END_PADDING; 516 } 517 518 /* Add some bits to the bitbuffer variable of the output bitstream. The caller 519 * must make sure there is enough room. */ 520 static forceinline void 521 deflate_add_bits(struct deflate_output_bitstream *os, 522 const bitbuf_t bits, const unsigned num_bits) 523 { 524 os->bitbuf |= bits << os->bitcount; 525 os->bitcount += num_bits; 526 } 527 528 /* Flush bits from the bitbuffer variable to the output buffer. */ 529 static forceinline void 530 deflate_flush_bits(struct deflate_output_bitstream *os) 531 { 532 if (UNALIGNED_ACCESS_IS_FAST) { 533 /* Flush a whole word (branchlessly). */ 534 put_unaligned_leword(os->bitbuf, os->next); 535 os->bitbuf >>= os->bitcount & ~7; 536 os->next += MIN(os->end - os->next, os->bitcount >> 3); 537 os->bitcount &= 7; 538 } else { 539 /* Flush a byte at a time. */ 540 while (os->bitcount >= 8) { 541 *os->next = os->bitbuf; 542 if (os->next != os->end) 543 os->next++; 544 os->bitcount -= 8; 545 os->bitbuf >>= 8; 546 } 547 } 548 } 549 550 /* Align the bitstream on a byte boundary. 
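 * The expression '-bitcount & 7' below gives the number of padding bits
 * needed to reach the next byte boundary; e.g. a bitcount of 13 yields
 * -13 & 7 == 3, so three zero bits are appended before flushing.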
*/ 551 static forceinline void 552 deflate_align_bitstream(struct deflate_output_bitstream *os) 553 { 554 os->bitcount += -os->bitcount & 7; 555 deflate_flush_bits(os); 556 } 557 558 /* 559 * Flush any remaining bits to the output buffer if needed. Return the total 560 * number of bytes written to the output buffer, or 0 if an overflow occurred. 561 */ 562 static u32 563 deflate_flush_output(struct deflate_output_bitstream *os) 564 { 565 if (os->next == os->end) /* overflow? */ 566 return 0; 567 568 while ((int)os->bitcount > 0) { 569 *os->next++ = os->bitbuf; 570 os->bitcount -= 8; 571 os->bitbuf >>= 8; 572 } 573 574 return os->next - os->begin; 575 } 576 577 /* Given the binary tree node A[subtree_idx] whose children already 578 * satisfy the maxheap property, swap the node with its greater child 579 * until it is greater than both its children, so that the maxheap 580 * property is satisfied in the subtree rooted at A[subtree_idx]. */ 581 static void 582 heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) 583 { 584 unsigned parent_idx; 585 unsigned child_idx; 586 u32 v; 587 588 v = A[subtree_idx]; 589 parent_idx = subtree_idx; 590 while ((child_idx = parent_idx * 2) <= length) { 591 if (child_idx < length && A[child_idx + 1] > A[child_idx]) 592 child_idx++; 593 if (v >= A[child_idx]) 594 break; 595 A[parent_idx] = A[child_idx]; 596 parent_idx = child_idx; 597 } 598 A[parent_idx] = v; 599 } 600 601 /* Rearrange the array 'A' so that it satisfies the maxheap property. 602 * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. 603 */ 604 static void 605 heapify_array(u32 A[], unsigned length) 606 { 607 unsigned subtree_idx; 608 609 for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) 610 heapify_subtree(A, length, subtree_idx); 611 } 612 613 /* 614 * Sort the array 'A', which contains 'length' unsigned 32-bit integers. 615 * 616 * Note: name this function heap_sort() instead of heapsort() to avoid colliding 617 * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't 618 * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. 619 */ 620 static void 621 heap_sort(u32 A[], unsigned length) 622 { 623 A--; /* Use 1-based indices */ 624 625 heapify_array(A, length); 626 627 while (length >= 2) { 628 u32 tmp = A[length]; 629 A[length] = A[1]; 630 A[1] = tmp; 631 length--; 632 heapify_subtree(A, length, 1); 633 } 634 } 635 636 #define NUM_SYMBOL_BITS 10 637 #define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) 638 639 #define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) 640 /* 641 * Sort the symbols primarily by frequency and secondarily by symbol 642 * value. Discard symbols with zero frequency and fill in an array with 643 * the remaining symbols, along with their frequencies. The low 644 * NUM_SYMBOL_BITS bits of each array entry will contain the symbol 645 * value, and the remaining bits will contain the frequency. 646 * 647 * @num_syms 648 * Number of symbols in the alphabet. 649 * Can't be greater than (1 << NUM_SYMBOL_BITS). 650 * 651 * @freqs[num_syms] 652 * The frequency of each symbol. 653 * 654 * @lens[num_syms] 655 * An array that eventually will hold the length of each codeword. 656 * This function only fills in the codeword lengths for symbols that 657 * have zero frequency, which are not well defined per se but will 658 * be set to 0. 659 * 660 * @symout[num_syms] 661 * The output array, described above. 662 * 663 * Returns the number of entries in 'symout' that were filled. 
This is 664 * the number of symbols that have nonzero frequency. 665 */ 666 static unsigned 667 sort_symbols(unsigned num_syms, const u32 freqs[restrict], 668 u8 lens[restrict], u32 symout[restrict]) 669 { 670 unsigned sym; 671 unsigned i; 672 unsigned num_used_syms; 673 unsigned num_counters; 674 unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; 675 676 /* We rely on heapsort, but with an added optimization. Since 677 * it's common for most symbol frequencies to be low, we first do 678 * a count sort using a limited number of counters. High 679 * frequencies will be counted in the last counter, and only they 680 * will be sorted with heapsort. 681 * 682 * Note: with more symbols, it is generally beneficial to have more 683 * counters. About 1 counter per 4 symbols seems fast. 684 * 685 * Note: I also tested radix sort, but even for large symbol 686 * counts (> 255) and frequencies bounded at 16 bits (enabling 687 * radix sort by just two base-256 digits), it didn't seem any 688 * faster than the method implemented here. 689 * 690 * Note: I tested the optimized quicksort implementation from 691 * glibc (with indirection overhead removed), but it was only 692 * marginally faster than the simple heapsort implemented here. 693 * 694 * Tests were done with building the codes for LZX. Results may 695 * vary for different compression algorithms...! */ 696 697 num_counters = GET_NUM_COUNTERS(num_syms); 698 699 memset(counters, 0, num_counters * sizeof(counters[0])); 700 701 /* Count the frequencies. */ 702 for (sym = 0; sym < num_syms; sym++) 703 counters[MIN(freqs[sym], num_counters - 1)]++; 704 705 /* Make the counters cumulative, ignoring the zero-th, which 706 * counted symbols with zero frequency. As a side effect, this 707 * calculates the number of symbols with nonzero frequency. */ 708 num_used_syms = 0; 709 for (i = 1; i < num_counters; i++) { 710 unsigned count = counters[i]; 711 counters[i] = num_used_syms; 712 num_used_syms += count; 713 } 714 715 /* Sort nonzero-frequency symbols using the counters. At the 716 * same time, set the codeword lengths of zero-frequency symbols 717 * to 0. */ 718 for (sym = 0; sym < num_syms; sym++) { 719 u32 freq = freqs[sym]; 720 if (freq != 0) { 721 symout[counters[MIN(freq, num_counters - 1)]++] = 722 sym | (freq << NUM_SYMBOL_BITS); 723 } else { 724 lens[sym] = 0; 725 } 726 } 727 728 /* Sort the symbols counted in the last counter. */ 729 heap_sort(symout + counters[num_counters - 2], 730 counters[num_counters - 1] - counters[num_counters - 2]); 731 732 return num_used_syms; 733 } 734 735 /* 736 * Build the Huffman tree. 737 * 738 * This is an optimized implementation that 739 * (a) takes advantage of the frequencies being already sorted; 740 * (b) only generates non-leaf nodes, since the non-leaf nodes of a 741 * Huffman tree are sufficient to generate a canonical code; 742 * (c) Only stores parent pointers, not child pointers; 743 * (d) Produces the nodes in the same memory used for input 744 * frequency information. 745 * 746 * Array 'A', which contains 'sym_count' entries, is used for both input 747 * and output. For this function, 'sym_count' must be at least 2. 748 * 749 * For input, the array must contain the frequencies of the symbols, 750 * sorted in increasing order. Specifically, each entry must contain a 751 * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low 752 * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. 
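 * (For illustration, with NUM_SYMBOL_BITS == 10, symbol 7 with frequency 20
 * arrives as (20 << 10) | 7 == 20487.)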
753 * Although these bits will, in fact, contain the symbols that correspond 754 * to the frequencies, this function is concerned with frequencies only 755 * and keeps the symbols as-is. 756 * 757 * For output, this function will produce the non-leaf nodes of the 758 * Huffman tree. These nodes will be stored in the first (sym_count - 1) 759 * entries of the array. Entry A[sym_count - 2] will represent the root 760 * node. Each other node will contain the zero-based index of its parent 761 * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low 762 * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, 763 * note that although these low bits will, in fact, contain a symbol 764 * value, this symbol will have *no relationship* with the Huffman tree 765 * node that happens to occupy the same slot. This is because this 766 * implementation only generates the non-leaf nodes of the tree. 767 */ 768 static void 769 build_tree(u32 A[], unsigned sym_count) 770 { 771 /* Index, in 'A', of next lowest frequency symbol that has not 772 * yet been processed. */ 773 unsigned i = 0; 774 775 /* Index, in 'A', of next lowest frequency parentless non-leaf 776 * node; or, if equal to 'e', then no such node exists yet. */ 777 unsigned b = 0; 778 779 /* Index, in 'A', of next node to allocate as a non-leaf. */ 780 unsigned e = 0; 781 782 do { 783 unsigned m, n; 784 u32 freq_shifted; 785 786 /* Choose the two next lowest frequency entries. */ 787 788 if (i != sym_count && 789 (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) 790 m = i++; 791 else 792 m = b++; 793 794 if (i != sym_count && 795 (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) 796 n = i++; 797 else 798 n = b++; 799 800 /* Allocate a non-leaf node and link the entries to it. 801 * 802 * If we link an entry that we're visiting for the first 803 * time (via index 'i'), then we're actually linking a 804 * leaf node and it will have no effect, since the leaf 805 * will be overwritten with a non-leaf when index 'e' 806 * catches up to it. But it's not any slower to 807 * unconditionally set the parent index. 808 * 809 * We also compute the frequency of the non-leaf node as 810 * the sum of its two children's frequencies. */ 811 812 freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); 813 814 A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); 815 A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); 816 A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; 817 e++; 818 } while (sym_count - e > 1); 819 /* When just one entry remains, it is a "leaf" that was 820 * linked to some other node. We ignore it, since the 821 * rest of the array contains the non-leaves which we 822 * need. (Note that we're assuming the cases with 0 or 1 823 * symbols were handled separately.) */ 824 } 825 826 /* 827 * Given the stripped-down Huffman tree constructed by build_tree(), 828 * determine the number of codewords that should be assigned each 829 * possible length, taking into account the length-limited constraint. 830 * 831 * @A 832 * The array produced by build_tree(), containing parent index 833 * information for the non-leaf nodes of the Huffman tree. Each 834 * entry in this array is a node; a node's parent always has a 835 * greater index than that node itself. This function will 836 * overwrite the parent index information in this array, so 837 * essentially it will destroy the tree. However, the data in the 838 * low NUM_SYMBOL_BITS of each entry will be preserved. 
839 * 840 * @root_idx 841 * The 0-based index of the root node in 'A', and consequently one 842 * less than the number of tree node entries in 'A'. (Or, really 2 843 * less than the actual length of 'A'.) 844 * 845 * @len_counts 846 * An array of length ('max_codeword_len' + 1) in which the number of 847 * codewords having each length <= max_codeword_len will be 848 * returned. 849 * 850 * @max_codeword_len 851 * The maximum permissible codeword length. 852 */ 853 static void 854 compute_length_counts(u32 A[restrict], unsigned root_idx, 855 unsigned len_counts[restrict], unsigned max_codeword_len) 856 { 857 unsigned len; 858 int node; 859 860 /* The key observations are: 861 * 862 * (1) We can traverse the non-leaf nodes of the tree, always 863 * visiting a parent before its children, by simply iterating 864 * through the array in reverse order. Consequently, we can 865 * compute the depth of each node in one pass, overwriting the 866 * parent indices with depths. 867 * 868 * (2) We can initially assume that in the real Huffman tree, 869 * both children of the root are leaves. This corresponds to two 870 * codewords of length 1. Then, whenever we visit a (non-leaf) 871 * node during the traversal, we modify this assumption to 872 * account for the current node *not* being a leaf, but rather 873 * its two children being leaves. This causes the loss of one 874 * codeword for the current depth and the addition of two 875 * codewords for the current depth plus one. 876 * 877 * (3) We can handle the length-limited constraint fairly easily 878 * by simply using the largest length available when a depth 879 * exceeds max_codeword_len. 880 */ 881 882 for (len = 0; len <= max_codeword_len; len++) 883 len_counts[len] = 0; 884 len_counts[1] = 2; 885 886 /* Set the root node's depth to 0. */ 887 A[root_idx] &= SYMBOL_MASK; 888 889 for (node = root_idx - 1; node >= 0; node--) { 890 891 /* Calculate the depth of this node. */ 892 893 unsigned parent = A[node] >> NUM_SYMBOL_BITS; 894 unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; 895 unsigned depth = parent_depth + 1; 896 unsigned len = depth; 897 898 /* Set the depth of this node so that it is available 899 * when its children (if any) are processed. */ 900 901 A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); 902 903 /* If needed, decrease the length to meet the 904 * length-limited constraint. This is not the optimal 905 * method for generating length-limited Huffman codes! 906 * But it should be good enough. */ 907 if (len >= max_codeword_len) { 908 len = max_codeword_len; 909 do { 910 len--; 911 } while (len_counts[len] == 0); 912 } 913 914 /* Account for the fact that we have a non-leaf node at 915 * the current depth. */ 916 len_counts[len]--; 917 len_counts[len + 1] += 2; 918 } 919 } 920 921 /* 922 * Generate the codewords for a canonical Huffman code. 923 * 924 * @A 925 * The output array for codewords. In addition, initially this 926 * array must contain the symbols, sorted primarily by frequency and 927 * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of 928 * each entry. 929 * 930 * @len 931 * Output array for codeword lengths. 932 * 933 * @len_counts 934 * An array that provides the number of codewords that will have 935 * each possible length <= max_codeword_len. 936 * 937 * @max_codeword_len 938 * Maximum length, in bits, of each codeword. 939 * 940 * @num_syms 941 * Number of symbols in the alphabet, including symbols with zero 942 * frequency. This is the length of the 'A' and 'len' arrays. 
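 *
 * A worked example: with len_counts giving one codeword of length 1, one of
 * length 2 and two of length 3, the loop below computes
 * next_codewords[1] == 0, next_codewords[2] == (0 + 1) << 1 == 2 and
 * next_codewords[3] == (2 + 1) << 1 == 6, so the four codewords generated
 * are 0, 10, 110 and 111 in binary.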
943 */ 944 static void 945 gen_codewords(u32 A[restrict], u8 lens[restrict], 946 const unsigned len_counts[restrict], 947 unsigned max_codeword_len, unsigned num_syms) 948 { 949 u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; 950 unsigned i; 951 unsigned len; 952 unsigned sym; 953 954 /* Given the number of codewords that will have each length, 955 * assign codeword lengths to symbols. We do this by assigning 956 * the lengths in decreasing order to the symbols sorted 957 * primarily by increasing frequency and secondarily by 958 * increasing symbol value. */ 959 for (i = 0, len = max_codeword_len; len >= 1; len--) { 960 unsigned count = len_counts[len]; 961 while (count--) 962 lens[A[i++] & SYMBOL_MASK] = len; 963 } 964 965 /* Generate the codewords themselves. We initialize the 966 * 'next_codewords' array to provide the lexicographically first 967 * codeword of each length, then assign codewords in symbol 968 * order. This produces a canonical code. */ 969 next_codewords[0] = 0; 970 next_codewords[1] = 0; 971 for (len = 2; len <= max_codeword_len; len++) 972 next_codewords[len] = 973 (next_codewords[len - 1] + len_counts[len - 1]) << 1; 974 975 for (sym = 0; sym < num_syms; sym++) 976 A[sym] = next_codewords[lens[sym]]++; 977 } 978 979 /* 980 * --------------------------------------------------------------------- 981 * make_canonical_huffman_code() 982 * --------------------------------------------------------------------- 983 * 984 * Given an alphabet and the frequency of each symbol in it, construct a 985 * length-limited canonical Huffman code. 986 * 987 * @num_syms 988 * The number of symbols in the alphabet. The symbols are the 989 * integers in the range [0, num_syms - 1]. This parameter must be 990 * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). 991 * 992 * @max_codeword_len 993 * The maximum permissible codeword length. 994 * 995 * @freqs 996 * An array of @num_syms entries, each of which specifies the 997 * frequency of the corresponding symbol. It is valid for some, 998 * none, or all of the frequencies to be 0. 999 * 1000 * @lens 1001 * An array of @num_syms entries in which this function will return 1002 * the length, in bits, of the codeword assigned to each symbol. 1003 * Symbols with 0 frequency will not have codewords per se, but 1004 * their entries in this array will be set to 0. No lengths greater 1005 * than @max_codeword_len will be assigned. 1006 * 1007 * @codewords 1008 * An array of @num_syms entries in which this function will return 1009 * the codeword for each symbol, right-justified and padded on the 1010 * left with zeroes. Codewords for symbols with 0 frequency will be 1011 * undefined. 1012 * 1013 * --------------------------------------------------------------------- 1014 * 1015 * This function builds a length-limited canonical Huffman code. 1016 * 1017 * A length-limited Huffman code contains no codewords longer than some 1018 * specified length, and has exactly (with some algorithms) or 1019 * approximately (with the algorithm used here) the minimum weighted path 1020 * length from the root, given this constraint. 1021 * 1022 * A canonical Huffman code satisfies the properties that a longer 1023 * codeword never lexicographically precedes a shorter codeword, and the 1024 * lexicographic ordering of codewords of the same length is the same as 1025 * the lexicographic ordering of the corresponding symbols. 
A canonical 1026 * Huffman code, or more generally a canonical prefix code, can be 1027 * reconstructed from only a list containing the codeword length of each 1028 * symbol. 1029 * 1030 * The classic algorithm to generate a Huffman code creates a node for 1031 * each symbol, then inserts these nodes into a min-heap keyed by symbol 1032 * frequency. Then, repeatedly, the two lowest-frequency nodes are 1033 * removed from the min-heap and added as the children of a new node 1034 * having frequency equal to the sum of its two children, which is then 1035 * inserted into the min-heap. When only a single node remains in the 1036 * min-heap, it is the root of the Huffman tree. The codeword for each 1037 * symbol is determined by the path needed to reach the corresponding 1038 * node from the root. Descending to the left child appends a 0 bit, 1039 * whereas descending to the right child appends a 1 bit. 1040 * 1041 * The classic algorithm is relatively easy to understand, but it is 1042 * subject to a number of inefficiencies. In practice, it is fastest to 1043 * first sort the symbols by frequency. (This itself can be subject to 1044 * an optimization based on the fact that most frequencies tend to be 1045 * low.) At the same time, we sort secondarily by symbol value, which 1046 * aids the process of generating a canonical code. Then, during tree 1047 * construction, no heap is necessary because both the leaf nodes and the 1048 * unparented non-leaf nodes can be easily maintained in sorted order. 1049 * Consequently, there can never be more than two possibilities for the 1050 * next-lowest-frequency node. 1051 * 1052 * In addition, because we're generating a canonical code, we actually 1053 * don't need the leaf nodes of the tree at all, only the non-leaf nodes. 1054 * This is because for canonical code generation we don't need to know 1055 * where the symbols are in the tree. Rather, we only need to know how 1056 * many leaf nodes have each depth (codeword length). And this 1057 * information can, in fact, be quickly generated from the tree of 1058 * non-leaves only. 1059 * 1060 * Furthermore, we can build this stripped-down Huffman tree directly in 1061 * the array in which the codewords are to be generated, provided that 1062 * these array slots are large enough to hold a symbol and frequency 1063 * value. 1064 * 1065 * Still furthermore, we don't even need to maintain explicit child 1066 * pointers. We only need the parent pointers, and even those can be 1067 * overwritten in-place with depth information as part of the process of 1068 * extracting codeword lengths from the tree. So in summary, we do NOT 1069 * need a big structure like: 1070 * 1071 * struct huffman_tree_node { 1072 * unsigned int symbol; 1073 * unsigned int frequency; 1074 * unsigned int depth; 1075 * struct huffman_tree_node *left_child; 1076 * struct huffman_tree_node *right_child; 1077 * }; 1078 * 1079 * 1080 * ... which often gets used in "naive" implementations of Huffman code 1081 * generation. 1082 * 1083 * Many of these optimizations are based on the implementation in 7-Zip 1084 * (source file: C/HuffEnc.c), which has been placed in the public domain 1085 * by Igor Pavlov. 
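 *
 * A small illustrative call with made-up frequencies: num_syms == 4,
 * max_codeword_len == 15 and freqs == {5, 0, 3, 2} produces
 * lens == {1, 0, 2, 2}; the canonical (not yet bit-reversed) codewords are
 * 0 for symbol 0 and binary 10 and 11 for symbols 2 and 3, while the
 * codeword for the unused symbol 1 is left undefined.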
1086 */ 1087 static void 1088 make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, 1089 const u32 freqs[restrict], 1090 u8 lens[restrict], u32 codewords[restrict]) 1091 { 1092 u32 *A = codewords; 1093 unsigned num_used_syms; 1094 1095 STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); 1096 1097 /* We begin by sorting the symbols primarily by frequency and 1098 * secondarily by symbol value. As an optimization, the array 1099 * used for this purpose ('A') shares storage with the space in 1100 * which we will eventually return the codewords. */ 1101 1102 num_used_syms = sort_symbols(num_syms, freqs, lens, A); 1103 1104 /* 'num_used_syms' is the number of symbols with nonzero 1105 * frequency. This may be less than @num_syms. 'num_used_syms' 1106 * is also the number of entries in 'A' that are valid. Each 1107 * entry consists of a distinct symbol and a nonzero frequency 1108 * packed into a 32-bit integer. */ 1109 1110 /* Handle special cases where only 0 or 1 symbols were used (had 1111 * nonzero frequency). */ 1112 1113 if (unlikely(num_used_syms == 0)) { 1114 /* Code is empty. sort_symbols() already set all lengths 1115 * to 0, so there is nothing more to do. */ 1116 return; 1117 } 1118 1119 if (unlikely(num_used_syms == 1)) { 1120 /* Only one symbol was used, so we only need one 1121 * codeword. But two codewords are needed to form the 1122 * smallest complete Huffman code, which uses codewords 0 1123 * and 1. Therefore, we choose another symbol to which 1124 * to assign a codeword. We use 0 (if the used symbol is 1125 * not 0) or 1 (if the used symbol is 0). In either 1126 * case, the lesser-valued symbol must be assigned 1127 * codeword 0 so that the resulting code is canonical. */ 1128 1129 unsigned sym = A[0] & SYMBOL_MASK; 1130 unsigned nonzero_idx = sym ? sym : 1; 1131 1132 codewords[0] = 0; 1133 lens[0] = 1; 1134 codewords[nonzero_idx] = 1; 1135 lens[nonzero_idx] = 1; 1136 return; 1137 } 1138 1139 /* Build a stripped-down version of the Huffman tree, sharing the 1140 * array 'A' with the symbol values. Then extract length counts 1141 * from the tree and use them to generate the final codewords. */ 1142 1143 build_tree(A, num_used_syms); 1144 1145 { 1146 unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; 1147 1148 compute_length_counts(A, num_used_syms - 2, 1149 len_counts, max_codeword_len); 1150 1151 gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); 1152 } 1153 } 1154 1155 /* 1156 * Clear the Huffman symbol frequency counters. 1157 * This must be called when starting a new DEFLATE block. 1158 */ 1159 static void 1160 deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) 1161 { 1162 memset(&c->freqs, 0, sizeof(c->freqs)); 1163 } 1164 1165 /* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ 1166 static u32 1167 deflate_reverse_codeword(u32 codeword, u8 len) 1168 { 1169 /* The following branchless algorithm is faster than going bit by bit. 1170 * Note: since no codewords are longer than 16 bits, we only need to 1171 * reverse the low 16 bits of the 'u32'. 
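 * For example, the 4-bit codeword 1011 (binary) reverses to 1101: the steps
 * below reverse all 16 low bits, giving 0xD000, and the final shift by
 * 16 - 4 == 12 recovers 0b1101.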
*/ 1172 STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); 1173 1174 /* Flip adjacent 1-bit fields */ 1175 codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); 1176 1177 /* Flip adjacent 2-bit fields */ 1178 codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); 1179 1180 /* Flip adjacent 4-bit fields */ 1181 codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); 1182 1183 /* Flip adjacent 8-bit fields */ 1184 codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); 1185 1186 /* Return the high 'len' bits of the bit-reversed 16 bit value. */ 1187 return codeword >> (16 - len); 1188 } 1189 1190 /* Make a canonical Huffman code with bit-reversed codewords. */ 1191 static void 1192 deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, 1193 const u32 freqs[], u8 lens[], u32 codewords[]) 1194 { 1195 unsigned sym; 1196 1197 make_canonical_huffman_code(num_syms, max_codeword_len, 1198 freqs, lens, codewords); 1199 1200 for (sym = 0; sym < num_syms; sym++) 1201 codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); 1202 } 1203 1204 /* 1205 * Build the literal/length and offset Huffman codes for a DEFLATE block. 1206 * 1207 * This takes as input the frequency tables for each code and produces as output 1208 * a set of tables that map symbols to codewords and codeword lengths. 1209 */ 1210 static void 1211 deflate_make_huffman_codes(const struct deflate_freqs *freqs, 1212 struct deflate_codes *codes) 1213 { 1214 STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); 1215 STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); 1216 1217 deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, 1218 MAX_LITLEN_CODEWORD_LEN, 1219 freqs->litlen, 1220 codes->lens.litlen, 1221 codes->codewords.litlen); 1222 1223 deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, 1224 MAX_OFFSET_CODEWORD_LEN, 1225 freqs->offset, 1226 codes->lens.offset, 1227 codes->codewords.offset); 1228 } 1229 1230 /* Initialize c->static_codes. */ 1231 static void 1232 deflate_init_static_codes(struct libdeflate_compressor *c) 1233 { 1234 unsigned i; 1235 1236 for (i = 0; i < 144; i++) 1237 c->freqs.litlen[i] = 1 << (9 - 8); 1238 for (; i < 256; i++) 1239 c->freqs.litlen[i] = 1 << (9 - 9); 1240 for (; i < 280; i++) 1241 c->freqs.litlen[i] = 1 << (9 - 7); 1242 for (; i < 288; i++) 1243 c->freqs.litlen[i] = 1 << (9 - 8); 1244 1245 for (i = 0; i < 32; i++) 1246 c->freqs.offset[i] = 1 << (5 - 5); 1247 1248 deflate_make_huffman_codes(&c->freqs, &c->static_codes); 1249 } 1250 1251 /* Return the offset slot for the specified match offset. */ 1252 static forceinline unsigned 1253 deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) 1254 { 1255 #if USE_FULL_OFFSET_SLOT_FAST 1256 return c->offset_slot_fast[offset]; 1257 #else 1258 if (offset <= 256) 1259 return c->offset_slot_fast[offset - 1]; 1260 else 1261 return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; 1262 #endif 1263 } 1264 1265 /* Write the header fields common to all DEFLATE block types. 
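 * These fields are the 1-bit BFINAL flag followed by the 2-bit BTYPE field,
 * packed into the bitstream least-significant-bit first.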
*/ 1266 static void 1267 deflate_write_block_header(struct deflate_output_bitstream *os, 1268 bool is_final_block, unsigned block_type) 1269 { 1270 deflate_add_bits(os, is_final_block, 1); 1271 deflate_add_bits(os, block_type, 2); 1272 deflate_flush_bits(os); 1273 } 1274 1275 static unsigned 1276 deflate_compute_precode_items(const u8 lens[restrict], 1277 const unsigned num_lens, 1278 u32 precode_freqs[restrict], 1279 unsigned precode_items[restrict]) 1280 { 1281 unsigned *itemptr; 1282 unsigned run_start; 1283 unsigned run_end; 1284 unsigned extra_bits; 1285 u8 len; 1286 1287 memset(precode_freqs, 0, 1288 DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); 1289 1290 itemptr = precode_items; 1291 run_start = 0; 1292 do { 1293 /* Find the next run of codeword lengths. */ 1294 1295 /* len = the length being repeated */ 1296 len = lens[run_start]; 1297 1298 /* Extend the run. */ 1299 run_end = run_start; 1300 do { 1301 run_end++; 1302 } while (run_end != num_lens && len == lens[run_end]); 1303 1304 if (len == 0) { 1305 /* Run of zeroes. */ 1306 1307 /* Symbol 18: RLE 11 to 138 zeroes at a time. */ 1308 while ((run_end - run_start) >= 11) { 1309 extra_bits = MIN((run_end - run_start) - 11, 0x7F); 1310 precode_freqs[18]++; 1311 *itemptr++ = 18 | (extra_bits << 5); 1312 run_start += 11 + extra_bits; 1313 } 1314 1315 /* Symbol 17: RLE 3 to 10 zeroes at a time. */ 1316 if ((run_end - run_start) >= 3) { 1317 extra_bits = MIN((run_end - run_start) - 3, 0x7); 1318 precode_freqs[17]++; 1319 *itemptr++ = 17 | (extra_bits << 5); 1320 run_start += 3 + extra_bits; 1321 } 1322 } else { 1323 1324 /* A run of nonzero lengths. */ 1325 1326 /* Symbol 16: RLE 3 to 6 of the previous length. */ 1327 if ((run_end - run_start) >= 4) { 1328 precode_freqs[len]++; 1329 *itemptr++ = len; 1330 run_start++; 1331 do { 1332 extra_bits = MIN((run_end - run_start) - 3, 0x3); 1333 precode_freqs[16]++; 1334 *itemptr++ = 16 | (extra_bits << 5); 1335 run_start += 3 + extra_bits; 1336 } while ((run_end - run_start) >= 3); 1337 } 1338 } 1339 1340 /* Output any remaining lengths without RLE. */ 1341 while (run_start != run_end) { 1342 precode_freqs[len]++; 1343 *itemptr++ = len; 1344 run_start++; 1345 } 1346 } while (run_start != num_lens); 1347 1348 return itemptr - precode_items; 1349 } 1350 1351 /* 1352 * Huffman codeword lengths for dynamic Huffman blocks are compressed using a 1353 * separate Huffman code, the "precode", which contains a symbol for each 1354 * possible codeword length in the larger code as well as several special 1355 * symbols to represent repeated codeword lengths (a form of run-length 1356 * encoding). The precode is itself constructed in canonical form, and its 1357 * codeword lengths are represented literally in 19 3-bit fields that 1358 * immediately precede the compressed codeword lengths of the larger code. 1359 */ 1360 1361 /* Precompute the information needed to output Huffman codes. */ 1362 static void 1363 deflate_precompute_huffman_header(struct libdeflate_compressor *c) 1364 { 1365 /* Compute how many litlen and offset symbols are needed. 
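 * DEFLATE always transmits at least 257 literal/length codeword lengths and
 * at least 1 offset codeword length, which is why the loops below never
 * reduce the counts past those minimums.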
*/ 1366 1367 for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; 1368 c->num_litlen_syms > 257; 1369 c->num_litlen_syms--) 1370 if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) 1371 break; 1372 1373 for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; 1374 c->num_offset_syms > 1; 1375 c->num_offset_syms--) 1376 if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) 1377 break; 1378 1379 /* If we're not using the full set of literal/length codeword lengths, 1380 * then temporarily move the offset codeword lengths over so that the 1381 * literal/length and offset codeword lengths are contiguous. */ 1382 1383 STATIC_ASSERT(offsetof(struct deflate_lens, offset) == 1384 DEFLATE_NUM_LITLEN_SYMS); 1385 1386 if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { 1387 memmove((u8 *)&c->codes.lens + c->num_litlen_syms, 1388 (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, 1389 c->num_offset_syms); 1390 } 1391 1392 /* Compute the "items" (RLE / literal tokens and extra bits) with which 1393 * the codeword lengths in the larger code will be output. */ 1394 c->num_precode_items = 1395 deflate_compute_precode_items((u8 *)&c->codes.lens, 1396 c->num_litlen_syms + 1397 c->num_offset_syms, 1398 c->precode_freqs, 1399 c->precode_items); 1400 1401 /* Build the precode. */ 1402 STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); 1403 deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, 1404 MAX_PRE_CODEWORD_LEN, 1405 c->precode_freqs, c->precode_lens, 1406 c->precode_codewords); 1407 1408 /* Count how many precode lengths we actually need to output. */ 1409 for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; 1410 c->num_explicit_lens > 4; 1411 c->num_explicit_lens--) 1412 if (c->precode_lens[deflate_precode_lens_permutation[ 1413 c->num_explicit_lens - 1]] != 0) 1414 break; 1415 1416 /* Restore the offset codeword lengths if needed. */ 1417 if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { 1418 memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, 1419 (u8 *)&c->codes.lens + c->num_litlen_syms, 1420 c->num_offset_syms); 1421 } 1422 } 1423 1424 /* Output the Huffman codes. */ 1425 static void 1426 deflate_write_huffman_header(struct libdeflate_compressor *c, 1427 struct deflate_output_bitstream *os) 1428 { 1429 unsigned i; 1430 1431 deflate_add_bits(os, c->num_litlen_syms - 257, 5); 1432 deflate_add_bits(os, c->num_offset_syms - 1, 5); 1433 deflate_add_bits(os, c->num_explicit_lens - 4, 4); 1434 deflate_flush_bits(os); 1435 1436 /* Output the lengths of the codewords in the precode. */ 1437 for (i = 0; i < c->num_explicit_lens; i++) { 1438 deflate_add_bits(os, c->precode_lens[ 1439 deflate_precode_lens_permutation[i]], 3); 1440 deflate_flush_bits(os); 1441 } 1442 1443 /* Output the encoded lengths of the codewords in the larger code. 
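 * Each precode item packs the precode symbol into its low 5 bits and the
 * associated extra-bits value above them; symbols 16, 17 and 18 are followed
 * by 2, 3 and 7 extra bits respectively, as the branches below emit.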
*/ 1444 for (i = 0; i < c->num_precode_items; i++) { 1445 unsigned precode_item = c->precode_items[i]; 1446 unsigned precode_sym = precode_item & 0x1F; 1447 deflate_add_bits(os, c->precode_codewords[precode_sym], 1448 c->precode_lens[precode_sym]); 1449 if (precode_sym >= 16) { 1450 if (precode_sym == 16) 1451 deflate_add_bits(os, precode_item >> 5, 2); 1452 else if (precode_sym == 17) 1453 deflate_add_bits(os, precode_item >> 5, 3); 1454 else 1455 deflate_add_bits(os, precode_item >> 5, 7); 1456 } 1457 STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7)); 1458 deflate_flush_bits(os); 1459 } 1460 } 1461 1462 static void 1463 deflate_write_sequences(struct deflate_output_bitstream * restrict os, 1464 const struct deflate_codes * restrict codes, 1465 const struct deflate_sequence sequences[restrict], 1466 const u8 * restrict in_next) 1467 { 1468 const struct deflate_sequence *seq = sequences; 1469 1470 for (;;) { 1471 u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; 1472 unsigned length = seq->litrunlen_and_length >> 23; 1473 unsigned length_slot; 1474 unsigned litlen_symbol; 1475 unsigned offset_symbol; 1476 1477 if (litrunlen) { 1478 #if 1 1479 while (litrunlen >= 4) { 1480 unsigned lit0 = in_next[0]; 1481 unsigned lit1 = in_next[1]; 1482 unsigned lit2 = in_next[2]; 1483 unsigned lit3 = in_next[3]; 1484 1485 deflate_add_bits(os, codes->codewords.litlen[lit0], 1486 codes->lens.litlen[lit0]); 1487 if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) 1488 deflate_flush_bits(os); 1489 1490 deflate_add_bits(os, codes->codewords.litlen[lit1], 1491 codes->lens.litlen[lit1]); 1492 if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) 1493 deflate_flush_bits(os); 1494 1495 deflate_add_bits(os, codes->codewords.litlen[lit2], 1496 codes->lens.litlen[lit2]); 1497 if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) 1498 deflate_flush_bits(os); 1499 1500 deflate_add_bits(os, codes->codewords.litlen[lit3], 1501 codes->lens.litlen[lit3]); 1502 deflate_flush_bits(os); 1503 in_next += 4; 1504 litrunlen -= 4; 1505 } 1506 if (litrunlen-- != 0) { 1507 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1508 codes->lens.litlen[*in_next]); 1509 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1510 deflate_flush_bits(os); 1511 in_next++; 1512 if (litrunlen-- != 0) { 1513 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1514 codes->lens.litlen[*in_next]); 1515 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1516 deflate_flush_bits(os); 1517 in_next++; 1518 if (litrunlen-- != 0) { 1519 deflate_add_bits(os, codes->codewords.litlen[*in_next], 1520 codes->lens.litlen[*in_next]); 1521 if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1522 deflate_flush_bits(os); 1523 in_next++; 1524 } 1525 } 1526 if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) 1527 deflate_flush_bits(os); 1528 } 1529 #else 1530 do { 1531 unsigned lit = *in_next++; 1532 deflate_add_bits(os, codes->codewords.litlen[lit], 1533 codes->lens.litlen[lit]); 1534 deflate_flush_bits(os); 1535 } while (--litrunlen); 1536 #endif 1537 } 1538 1539 if (length == 0) 1540 return; 1541 1542 in_next += length; 1543 1544 length_slot = seq->length_slot; 1545 litlen_symbol = 257 + length_slot; 1546 1547 /* Litlen symbol */ 1548 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1549 codes->lens.litlen[litlen_symbol]); 1550 1551 /* Extra length bits */ 1552 STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1553 DEFLATE_MAX_EXTRA_LENGTH_BITS)); 1554 deflate_add_bits(os, length - deflate_length_slot_base[length_slot], 1555 deflate_extra_length_bits[length_slot]); 1556 1557 if 
(!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1558 DEFLATE_MAX_EXTRA_LENGTH_BITS + 1559 MAX_OFFSET_CODEWORD_LEN + 1560 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1561 deflate_flush_bits(os); 1562 1563 /* Offset symbol */ 1564 offset_symbol = seq->offset_symbol; 1565 deflate_add_bits(os, codes->codewords.offset[offset_symbol], 1566 codes->lens.offset[offset_symbol]); 1567 1568 if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + 1569 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1570 deflate_flush_bits(os); 1571 1572 /* Extra offset bits */ 1573 deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], 1574 deflate_extra_offset_bits[offset_symbol]); 1575 1576 deflate_flush_bits(os); 1577 1578 seq++; 1579 } 1580 } 1581 1582 #if SUPPORT_NEAR_OPTIMAL_PARSING 1583 /* 1584 * Follow the minimum-cost path in the graph of possible match/literal choices 1585 * for the current block and write out the matches/literals using the specified 1586 * Huffman codes. 1587 * 1588 * Note: this is slightly duplicated with deflate_write_sequences(), the reason 1589 * being that we don't want to waste time translating between intermediate 1590 * match/literal representations. 1591 */ 1592 static void 1593 deflate_write_item_list(struct deflate_output_bitstream *os, 1594 const struct deflate_codes *codes, 1595 struct libdeflate_compressor *c, 1596 u32 block_length) 1597 { 1598 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; 1599 struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; 1600 do { 1601 unsigned length = cur_node->item & OPTIMUM_LEN_MASK; 1602 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; 1603 unsigned litlen_symbol; 1604 unsigned length_slot; 1605 unsigned offset_slot; 1606 1607 if (length == 1) { 1608 /* Literal */ 1609 litlen_symbol = offset; 1610 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1611 codes->lens.litlen[litlen_symbol]); 1612 deflate_flush_bits(os); 1613 } else { 1614 /* Match length */ 1615 length_slot = deflate_length_slot[length]; 1616 litlen_symbol = 257 + length_slot; 1617 deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], 1618 codes->lens.litlen[litlen_symbol]); 1619 1620 deflate_add_bits(os, length - deflate_length_slot_base[length_slot], 1621 deflate_extra_length_bits[length_slot]); 1622 1623 if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + 1624 DEFLATE_MAX_EXTRA_LENGTH_BITS + 1625 MAX_OFFSET_CODEWORD_LEN + 1626 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1627 deflate_flush_bits(os); 1628 1629 1630 /* Match offset */ 1631 offset_slot = deflate_get_offset_slot(c, offset); 1632 deflate_add_bits(os, codes->codewords.offset[offset_slot], 1633 codes->lens.offset[offset_slot]); 1634 1635 if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + 1636 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 1637 deflate_flush_bits(os); 1638 1639 deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], 1640 deflate_extra_offset_bits[offset_slot]); 1641 1642 deflate_flush_bits(os); 1643 } 1644 cur_node += length; 1645 } while (cur_node != end_node); 1646 } 1647 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 1648 1649 /* Output the end-of-block symbol. 
*/ 1650 static void 1651 deflate_write_end_of_block(struct deflate_output_bitstream *os, 1652 const struct deflate_codes *codes) 1653 { 1654 deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], 1655 codes->lens.litlen[DEFLATE_END_OF_BLOCK]); 1656 deflate_flush_bits(os); 1657 } 1658 1659 static void 1660 deflate_write_uncompressed_block(struct deflate_output_bitstream *os, 1661 const u8 *data, u16 len, 1662 bool is_final_block) 1663 { 1664 deflate_write_block_header(os, is_final_block, 1665 DEFLATE_BLOCKTYPE_UNCOMPRESSED); 1666 deflate_align_bitstream(os); 1667 1668 if (4 + (u32)len >= os->end - os->next) { 1669 os->next = os->end; 1670 return; 1671 } 1672 1673 put_unaligned_le16(len, os->next); 1674 os->next += 2; 1675 put_unaligned_le16(~len, os->next); 1676 os->next += 2; 1677 memcpy(os->next, data, len); 1678 os->next += len; 1679 } 1680 1681 static void 1682 deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, 1683 const u8 *data, u32 data_length, 1684 bool is_final_block) 1685 { 1686 do { 1687 u16 len = MIN(data_length, UINT16_MAX); 1688 1689 deflate_write_uncompressed_block(os, data, len, 1690 is_final_block && len == data_length); 1691 data += len; 1692 data_length -= len; 1693 } while (data_length != 0); 1694 } 1695 1696 /* 1697 * Choose the best type of block to use (dynamic Huffman, static Huffman, or 1698 * uncompressed), then output it. 1699 */ 1700 static void 1701 deflate_flush_block(struct libdeflate_compressor * restrict c, 1702 struct deflate_output_bitstream * restrict os, 1703 const u8 * restrict block_begin, u32 block_length, 1704 bool is_final_block, bool use_item_list) 1705 { 1706 static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { 1707 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, 1708 }; 1709 1710 /* Costs are measured in bits */ 1711 u32 dynamic_cost = 0; 1712 u32 static_cost = 0; 1713 u32 uncompressed_cost = 0; 1714 struct deflate_codes *codes; 1715 int block_type; 1716 unsigned sym; 1717 1718 /* Tally the end-of-block symbol. */ 1719 c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; 1720 1721 /* Build dynamic Huffman codes. */ 1722 deflate_make_huffman_codes(&c->freqs, &c->codes); 1723 1724 /* Account for the cost of sending dynamic Huffman codes. */ 1725 deflate_precompute_huffman_header(c); 1726 dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); 1727 for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { 1728 u32 extra = deflate_extra_precode_bits[sym]; 1729 dynamic_cost += c->precode_freqs[sym] * 1730 (extra + c->precode_lens[sym]); 1731 } 1732 1733 /* Account for the cost of encoding literals. */ 1734 for (sym = 0; sym < 256; sym++) { 1735 dynamic_cost += c->freqs.litlen[sym] * 1736 c->codes.lens.litlen[sym]; 1737 } 1738 for (sym = 0; sym < 144; sym++) 1739 static_cost += c->freqs.litlen[sym] * 8; 1740 for (; sym < 256; sym++) 1741 static_cost += c->freqs.litlen[sym] * 9; 1742 1743 /* Account for the cost of encoding the end-of-block symbol. */ 1744 dynamic_cost += c->codes.lens.litlen[256]; 1745 static_cost += 7; 1746 1747 /* Account for the cost of encoding lengths. */ 1748 for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) { 1749 u32 extra = deflate_extra_length_bits[sym - 257]; 1750 dynamic_cost += c->freqs.litlen[sym] * 1751 (extra + c->codes.lens.litlen[sym]); 1752 static_cost += c->freqs.litlen[sym] * 1753 (extra + c->static_codes.lens.litlen[sym]); 1754 } 1755 1756 /* Account for the cost of encoding offsets. 
*/ 1757 for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { 1758 u32 extra = deflate_extra_offset_bits[sym]; 1759 dynamic_cost += c->freqs.offset[sym] * 1760 (extra + c->codes.lens.offset[sym]); 1761 static_cost += c->freqs.offset[sym] * (extra + 5); 1762 } 1763 1764 /* Compute the cost of using uncompressed blocks. */ 1765 uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + 1766 (40 * (DIV_ROUND_UP(block_length, 1767 UINT16_MAX) - 1)) + 1768 (8 * block_length); 1769 1770 /* Choose the cheapest block type. */ 1771 if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { 1772 block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; 1773 codes = &c->codes; 1774 } else if (static_cost < uncompressed_cost) { 1775 block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; 1776 codes = &c->static_codes; 1777 } else { 1778 block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; 1779 } 1780 1781 /* Now actually output the block. */ 1782 1783 if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { 1784 /* Note: the length being flushed may exceed the maximum length 1785 * of an uncompressed block (65535 bytes). Therefore, more than 1786 * one uncompressed block might be needed. */ 1787 deflate_write_uncompressed_blocks(os, block_begin, block_length, 1788 is_final_block); 1789 } else { 1790 /* Output the block header. */ 1791 deflate_write_block_header(os, is_final_block, block_type); 1792 1793 /* Output the Huffman codes (dynamic Huffman blocks only). */ 1794 if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) 1795 deflate_write_huffman_header(c, os); 1796 1797 /* Output the literals, matches, and end-of-block symbol. */ 1798 #if SUPPORT_NEAR_OPTIMAL_PARSING 1799 if (use_item_list) 1800 deflate_write_item_list(os, codes, c, block_length); 1801 else 1802 #endif 1803 deflate_write_sequences(os, codes, c->p.g.sequences, 1804 block_begin); 1805 deflate_write_end_of_block(os, codes); 1806 } 1807 } 1808 1809 static forceinline void 1810 deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, 1811 u32 *litrunlen_p) 1812 { 1813 c->freqs.litlen[literal]++; 1814 ++*litrunlen_p; 1815 } 1816 1817 static forceinline void 1818 deflate_choose_match(struct libdeflate_compressor *c, 1819 unsigned length, unsigned offset, 1820 u32 *litrunlen_p, struct deflate_sequence **next_seq_p) 1821 { 1822 struct deflate_sequence *seq = *next_seq_p; 1823 unsigned length_slot = deflate_length_slot[length]; 1824 unsigned offset_slot = deflate_get_offset_slot(c, offset); 1825 1826 c->freqs.litlen[257 + length_slot]++; 1827 c->freqs.offset[offset_slot]++; 1828 1829 seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p; 1830 seq->offset = offset; 1831 seq->length_slot = length_slot; 1832 seq->offset_symbol = offset_slot; 1833 1834 *litrunlen_p = 0; 1835 *next_seq_p = seq + 1; 1836 } 1837 1838 static forceinline void 1839 deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen) 1840 { 1841 seq->litrunlen_and_length = litrunlen; /* length = 0 */ 1842 } 1843 1844 /******************************************************************************/ 1845 1846 /* 1847 * Block splitting algorithm. The problem is to decide when it is worthwhile to 1848 * start a new block with new Huffman codes. There is a theoretically optimal 1849 * solution: recursively consider every possible block split, considering the 1850 * exact cost of each block, and choose the minimum cost approach. But this is 1851 * far too slow. 
 * Instead, as an approximation, we can count symbols and, after
 * every N symbols, compare the expected distribution of symbols based on the
 * previous data with the actual distribution. If they differ "by enough", then
 * start a new block.
 *
 * As an optimization and heuristic, we don't distinguish between every symbol
 * but rather we combine many symbols into a single "observation type". For
 * literals we only look at the high bits and low bits, and for matches we only
 * look at whether the match is long or not. The assumption is that for typical
 * "real" data, places that are good block boundaries will tend to be noticeable
 * based only on changes in these aggregate frequencies, without looking for
 * subtle differences in individual symbols. For example, a change from ASCII
 * bytes to non-ASCII bytes, or from few matches (generally less compressible)
 * to many matches (generally more compressible), would be easily noticed based
 * on the aggregates.
 *
 * For determining whether the frequency distributions are "different enough" to
 * start a new block, the simple heuristic of splitting when the sum of absolute
 * differences exceeds a constant seems to be good enough. We also add a number
 * proportional to the block length so that the algorithm is more likely to end
 * long blocks than short blocks. This reflects the general expectation that it
 * will become increasingly beneficial to start a new block as the current
 * block grows longer.
 *
 * Finally, for an approximation, it is not strictly necessary that the exact
 * symbols being used are considered. With "near-optimal parsing", for example,
 * the actual symbols that will be used are unknown until after the block
 * boundary is chosen and the block has been optimized. Since the final choices
 * cannot be used, we can use preliminary "greedy" choices instead.
 */

/* Initialize the block split statistics when starting a new block. */
static void
init_block_split_stats(struct block_split_stats *stats)
{
        int i;

        for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
                stats->new_observations[i] = 0;
                stats->observations[i] = 0;
        }
        stats->num_new_observations = 0;
        stats->num_observations = 0;
}

/* Literal observation. Heuristic: use the top 2 bits and the lowest bit of the
 * literal, for 8 possible literal observation types. */
static forceinline void
observe_literal(struct block_split_stats *stats, u8 lit)
{
        stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
        stats->num_new_observations++;
}

/* Match observation. Heuristic: use one observation type for "short match" and
 * one observation type for "long match". */
static forceinline void
observe_match(struct block_split_stats *stats, unsigned length)
{
        stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
        stats->num_new_observations++;
}

static bool
do_end_block_check(struct block_split_stats *stats, u32 block_length)
{
        int i;

        if (stats->num_observations > 0) {

                /* Note: to avoid slow divisions, we do not divide by
                 * 'num_observations', but rather do all math with the numbers
                 * multiplied by 'num_observations'.
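                 *
                 * For example (hypothetical counts): with num_observations =
                 * 512 and num_new_observations = 512, an observation type seen
                 * 100 times in the old data but only 50 times in the new data
                 * contributes |50*512 - 100*512| = 25600 to 'total_delta'. The
                 * right-hand side of the check below works out to 200 * 512 =
                 * 102400, and the block-length term is added on the left, so
                 * the raw per-type differences must sum to at least
                 * 200 - block_length/4096 out of the 512 new observations for
                 * the block to end here.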
*/ 1924 u32 total_delta = 0; 1925 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { 1926 u32 expected = stats->observations[i] * stats->num_new_observations; 1927 u32 actual = stats->new_observations[i] * stats->num_observations; 1928 u32 delta = (actual > expected) ? actual - expected : 1929 expected - actual; 1930 total_delta += delta; 1931 } 1932 1933 /* Ready to end the block? */ 1934 if (total_delta + (block_length / 4096) * stats->num_observations >= 1935 NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations) 1936 return true; 1937 } 1938 1939 for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { 1940 stats->num_observations += stats->new_observations[i]; 1941 stats->observations[i] += stats->new_observations[i]; 1942 stats->new_observations[i] = 0; 1943 } 1944 stats->num_new_observations = 0; 1945 return false; 1946 } 1947 1948 static forceinline bool 1949 should_end_block(struct block_split_stats *stats, 1950 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) 1951 { 1952 /* Ready to check block split statistics? */ 1953 if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || 1954 in_next - in_block_begin < MIN_BLOCK_LENGTH || 1955 in_end - in_next < MIN_BLOCK_LENGTH) 1956 return false; 1957 1958 return do_end_block_check(stats, in_next - in_block_begin); 1959 } 1960 1961 /******************************************************************************/ 1962 1963 /* 1964 * This is the "greedy" DEFLATE compressor. It always chooses the longest match. 1965 */ 1966 static size_t 1967 deflate_compress_greedy(struct libdeflate_compressor * restrict c, 1968 const u8 * restrict in, size_t in_nbytes, 1969 u8 * restrict out, size_t out_nbytes_avail) 1970 { 1971 const u8 *in_next = in; 1972 const u8 *in_end = in_next + in_nbytes; 1973 struct deflate_output_bitstream os; 1974 const u8 *in_cur_base = in_next; 1975 unsigned max_len = DEFLATE_MAX_MATCH_LEN; 1976 unsigned nice_len = MIN(c->nice_match_length, max_len); 1977 u32 next_hashes[2] = {0, 0}; 1978 1979 deflate_init_output(&os, out, out_nbytes_avail); 1980 hc_matchfinder_init(&c->p.g.hc_mf); 1981 1982 do { 1983 /* Starting a new DEFLATE block. */ 1984 1985 const u8 * const in_block_begin = in_next; 1986 const u8 * const in_max_block_end = 1987 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); 1988 u32 litrunlen = 0; 1989 struct deflate_sequence *next_seq = c->p.g.sequences; 1990 1991 init_block_split_stats(&c->split_stats); 1992 deflate_reset_symbol_frequencies(c); 1993 1994 do { 1995 u32 length; 1996 u32 offset; 1997 1998 /* Decrease the maximum and nice match lengths if we're 1999 * approaching the end of the input buffer. */ 2000 if (unlikely(max_len > in_end - in_next)) { 2001 max_len = in_end - in_next; 2002 nice_len = MIN(nice_len, max_len); 2003 } 2004 2005 length = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2006 &in_cur_base, 2007 in_next, 2008 DEFLATE_MIN_MATCH_LEN - 1, 2009 max_len, 2010 nice_len, 2011 c->max_search_depth, 2012 next_hashes, 2013 &offset); 2014 2015 if (length >= DEFLATE_MIN_MATCH_LEN) { 2016 /* Match found. */ 2017 deflate_choose_match(c, length, offset, 2018 &litrunlen, &next_seq); 2019 observe_match(&c->split_stats, length); 2020 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2021 &in_cur_base, 2022 in_next + 1, 2023 in_end, 2024 length - 1, 2025 next_hashes); 2026 } else { 2027 /* No match found. 
*/ 2028 deflate_choose_literal(c, *in_next, &litrunlen); 2029 observe_literal(&c->split_stats, *in_next); 2030 in_next++; 2031 } 2032 2033 /* Check if it's time to output another block. */ 2034 } while (in_next < in_max_block_end && 2035 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2036 2037 deflate_finish_sequence(next_seq, litrunlen); 2038 deflate_flush_block(c, &os, in_block_begin, 2039 in_next - in_block_begin, 2040 in_next == in_end, false); 2041 } while (in_next != in_end); 2042 2043 return deflate_flush_output(&os); 2044 } 2045 2046 /* 2047 * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to 2048 * see if there's a longer match at the next position. If yes, it outputs a 2049 * literal and continues to the next position. If no, it outputs the match. 2050 */ 2051 static size_t 2052 deflate_compress_lazy(struct libdeflate_compressor * restrict c, 2053 const u8 * restrict in, size_t in_nbytes, 2054 u8 * restrict out, size_t out_nbytes_avail) 2055 { 2056 const u8 *in_next = in; 2057 const u8 *in_end = in_next + in_nbytes; 2058 struct deflate_output_bitstream os; 2059 const u8 *in_cur_base = in_next; 2060 unsigned max_len = DEFLATE_MAX_MATCH_LEN; 2061 unsigned nice_len = MIN(c->nice_match_length, max_len); 2062 u32 next_hashes[2] = {0, 0}; 2063 2064 deflate_init_output(&os, out, out_nbytes_avail); 2065 hc_matchfinder_init(&c->p.g.hc_mf); 2066 2067 do { 2068 /* Starting a new DEFLATE block. */ 2069 2070 const u8 * const in_block_begin = in_next; 2071 const u8 * const in_max_block_end = 2072 in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); 2073 u32 litrunlen = 0; 2074 struct deflate_sequence *next_seq = c->p.g.sequences; 2075 2076 init_block_split_stats(&c->split_stats); 2077 deflate_reset_symbol_frequencies(c); 2078 2079 do { 2080 unsigned cur_len; 2081 unsigned cur_offset; 2082 unsigned next_len; 2083 unsigned next_offset; 2084 2085 if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { 2086 max_len = in_end - in_next; 2087 nice_len = MIN(nice_len, max_len); 2088 } 2089 2090 /* Find the longest match at the current position. */ 2091 cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2092 &in_cur_base, 2093 in_next, 2094 DEFLATE_MIN_MATCH_LEN - 1, 2095 max_len, 2096 nice_len, 2097 c->max_search_depth, 2098 next_hashes, 2099 &cur_offset); 2100 in_next += 1; 2101 2102 if (cur_len < DEFLATE_MIN_MATCH_LEN) { 2103 /* No match found. Choose a literal. */ 2104 deflate_choose_literal(c, *(in_next - 1), &litrunlen); 2105 observe_literal(&c->split_stats, *(in_next - 1)); 2106 continue; 2107 } 2108 2109 have_cur_match: 2110 observe_match(&c->split_stats, cur_len); 2111 2112 /* We have a match at the current position. */ 2113 2114 /* If the current match is very long, choose it 2115 * immediately. */ 2116 if (cur_len >= nice_len) { 2117 deflate_choose_match(c, cur_len, cur_offset, 2118 &litrunlen, &next_seq); 2119 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2120 &in_cur_base, 2121 in_next, 2122 in_end, 2123 cur_len - 1, 2124 next_hashes); 2125 continue; 2126 } 2127 2128 /* 2129 * Try to find a match at the next position. 2130 * 2131 * Note: since we already have a match at the *current* 2132 * position, we use only half the 'max_search_depth' 2133 * when checking the *next* position. This is a useful 2134 * trade-off because it's more worthwhile to use a 2135 * greater search depth on the initial match. 
2136 * 2137 * Note: it's possible to structure the code such that 2138 * there's only one call to longest_match(), which 2139 * handles both the "find the initial match" and "try to 2140 * find a longer match" cases. However, it is faster to 2141 * have two call sites, with longest_match() inlined at 2142 * each. 2143 */ 2144 if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { 2145 max_len = in_end - in_next; 2146 nice_len = MIN(nice_len, max_len); 2147 } 2148 next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, 2149 &in_cur_base, 2150 in_next, 2151 cur_len, 2152 max_len, 2153 nice_len, 2154 c->max_search_depth / 2, 2155 next_hashes, 2156 &next_offset); 2157 in_next += 1; 2158 2159 if (next_len > cur_len) { 2160 /* Found a longer match at the next position. 2161 * Output a literal. Then the next match 2162 * becomes the current match. */ 2163 deflate_choose_literal(c, *(in_next - 2), &litrunlen); 2164 cur_len = next_len; 2165 cur_offset = next_offset; 2166 goto have_cur_match; 2167 } 2168 2169 /* No longer match at the next position. 2170 * Output the current match. */ 2171 deflate_choose_match(c, cur_len, cur_offset, 2172 &litrunlen, &next_seq); 2173 in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, 2174 &in_cur_base, 2175 in_next, 2176 in_end, 2177 cur_len - 2, 2178 next_hashes); 2179 2180 /* Check if it's time to output another block. */ 2181 } while (in_next < in_max_block_end && 2182 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2183 2184 deflate_finish_sequence(next_seq, litrunlen); 2185 deflate_flush_block(c, &os, in_block_begin, 2186 in_next - in_block_begin, 2187 in_next == in_end, false); 2188 } while (in_next != in_end); 2189 2190 return deflate_flush_output(&os); 2191 } 2192 2193 #if SUPPORT_NEAR_OPTIMAL_PARSING 2194 2195 /* 2196 * Follow the minimum-cost path in the graph of possible match/literal choices 2197 * for the current block and compute the frequencies of the Huffman symbols that 2198 * would be needed to output those matches and literals. 2199 */ 2200 static void 2201 deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) 2202 { 2203 struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; 2204 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; 2205 do { 2206 unsigned length = cur_node->item & OPTIMUM_LEN_MASK; 2207 unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; 2208 2209 if (length == 1) { 2210 /* Literal */ 2211 c->freqs.litlen[offset]++; 2212 } else { 2213 /* Match */ 2214 c->freqs.litlen[257 + deflate_length_slot[length]]++; 2215 c->freqs.offset[deflate_get_offset_slot(c, offset)]++; 2216 } 2217 cur_node += length; 2218 } while (cur_node != end_node); 2219 } 2220 2221 /* Set the current cost model from the codeword lengths specified in @lens. */ 2222 static void 2223 deflate_set_costs_from_codes(struct libdeflate_compressor *c, 2224 const struct deflate_lens *lens) 2225 { 2226 unsigned i; 2227 2228 /* Literals */ 2229 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { 2230 u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); 2231 c->p.n.costs.literal[i] = bits << COST_SHIFT; 2232 } 2233 2234 /* Lengths */ 2235 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { 2236 unsigned length_slot = deflate_length_slot[i]; 2237 unsigned litlen_sym = 257 + length_slot; 2238 u32 bits = (lens->litlen[litlen_sym] ? 
lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); 2239 bits += deflate_extra_length_bits[length_slot]; 2240 c->p.n.costs.length[i] = bits << COST_SHIFT; 2241 } 2242 2243 /* Offset slots */ 2244 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { 2245 u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); 2246 bits += deflate_extra_offset_bits[i]; 2247 c->p.n.costs.offset_slot[i] = bits << COST_SHIFT; 2248 } 2249 } 2250 2251 static forceinline u32 2252 deflate_default_literal_cost(unsigned literal) 2253 { 2254 STATIC_ASSERT(COST_SHIFT == 3); 2255 /* 66 is 8.25 bits/symbol */ 2256 return 66; 2257 } 2258 2259 static forceinline u32 2260 deflate_default_length_slot_cost(unsigned length_slot) 2261 { 2262 STATIC_ASSERT(COST_SHIFT == 3); 2263 /* 60 is 7.5 bits/symbol */ 2264 return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); 2265 } 2266 2267 static forceinline u32 2268 deflate_default_offset_slot_cost(unsigned offset_slot) 2269 { 2270 STATIC_ASSERT(COST_SHIFT == 3); 2271 /* 39 is 4.875 bits/symbol */ 2272 return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); 2273 } 2274 2275 /* 2276 * Set default symbol costs for the first block's first optimization pass. 2277 * 2278 * It works well to assume that each symbol is equally probable. This results 2279 * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << 2280 * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding 2281 * alphabet. However, we intentionally bias the parse towards matches rather 2282 * than literals by using a slightly lower default cost for length symbols than 2283 * for literals. This often improves the compression ratio slightly. 2284 */ 2285 static void 2286 deflate_set_default_costs(struct libdeflate_compressor *c) 2287 { 2288 unsigned i; 2289 2290 /* Literals */ 2291 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) 2292 c->p.n.costs.literal[i] = deflate_default_literal_cost(i); 2293 2294 /* Lengths */ 2295 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) 2296 c->p.n.costs.length[i] = deflate_default_length_slot_cost( 2297 deflate_length_slot[i]); 2298 2299 /* Offset slots */ 2300 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) 2301 c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); 2302 } 2303 2304 static forceinline void 2305 deflate_adjust_cost(u32 *cost_p, u32 default_cost) 2306 { 2307 *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; 2308 } 2309 2310 /* 2311 * Adjust the costs when beginning a new block. 2312 * 2313 * Since the current costs have been optimized for the data, it's undesirable to 2314 * throw them away and start over with the default costs. At the same time, we 2315 * don't want to bias the parse by assuming that the next block will be similar 2316 * to the current block. As a compromise, make the costs closer to the 2317 * defaults, but don't simply set them to the defaults. 
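 *
 * For example (illustrative numbers): a literal whose cost had converged to
 * 40 (5 bits at COST_SHIFT == 3), with a default cost of 66, is moved by
 * deflate_adjust_cost() to 40 + ((66 - 40) >> 1) = 53, i.e. roughly 6.6 bits.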
2318 */ 2319 static void 2320 deflate_adjust_costs(struct libdeflate_compressor *c) 2321 { 2322 unsigned i; 2323 2324 /* Literals */ 2325 for (i = 0; i < DEFLATE_NUM_LITERALS; i++) 2326 deflate_adjust_cost(&c->p.n.costs.literal[i], 2327 deflate_default_literal_cost(i)); 2328 2329 /* Lengths */ 2330 for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) 2331 deflate_adjust_cost(&c->p.n.costs.length[i], 2332 deflate_default_length_slot_cost( 2333 deflate_length_slot[i])); 2334 2335 /* Offset slots */ 2336 for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) 2337 deflate_adjust_cost(&c->p.n.costs.offset_slot[i], 2338 deflate_default_offset_slot_cost(i)); 2339 } 2340 2341 /* 2342 * Find the minimum-cost path through the graph of possible match/literal 2343 * choices for this block. 2344 * 2345 * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which 2346 * represents the node at the beginning of the block, to 2347 * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of 2348 * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. 2349 * 2350 * The algorithm works backwards, starting at the end node and proceeding 2351 * backwards one node at a time. At each node, the minimum cost to reach the 2352 * end node is computed and the match/literal choice that begins that path is 2353 * saved. 2354 */ 2355 static void 2356 deflate_find_min_cost_path(struct libdeflate_compressor *c, 2357 const u32 block_length, 2358 const struct lz_match *cache_ptr) 2359 { 2360 struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; 2361 struct deflate_optimum_node *cur_node = end_node; 2362 2363 cur_node->cost_to_end = 0; 2364 do { 2365 unsigned num_matches; 2366 unsigned literal; 2367 u32 best_cost_to_end; 2368 2369 cur_node--; 2370 cache_ptr--; 2371 2372 num_matches = cache_ptr->length; 2373 literal = cache_ptr->offset; 2374 2375 /* It's always possible to choose a literal. */ 2376 best_cost_to_end = c->p.n.costs.literal[literal] + 2377 (cur_node + 1)->cost_to_end; 2378 cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; 2379 2380 /* Also consider matches if there are any. */ 2381 if (num_matches) { 2382 const struct lz_match *match; 2383 unsigned len; 2384 unsigned offset; 2385 unsigned offset_slot; 2386 u32 offset_cost; 2387 u32 cost_to_end; 2388 2389 /* 2390 * Consider each length from the minimum 2391 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest 2392 * match found at this position. For each length, we 2393 * consider only the smallest offset for which that 2394 * length is available. Although this is not guaranteed 2395 * to be optimal due to the possibility of a larger 2396 * offset costing less than a smaller offset to code, 2397 * this is a very useful heuristic. 
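                         *
                         * For example (hypothetical matches): if the cache
                         * holds {length 6, offset 30} followed by {length 10,
                         * offset 100} for this position, then lengths 3..6 are
                         * costed using offset 30 and lengths 7..10 using
                         * offset 100 in the loop below.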
2398 */ 2399 match = cache_ptr - num_matches; 2400 len = DEFLATE_MIN_MATCH_LEN; 2401 do { 2402 offset = match->offset; 2403 offset_slot = deflate_get_offset_slot(c, offset); 2404 offset_cost = c->p.n.costs.offset_slot[offset_slot]; 2405 do { 2406 cost_to_end = offset_cost + 2407 c->p.n.costs.length[len] + 2408 (cur_node + len)->cost_to_end; 2409 if (cost_to_end < best_cost_to_end) { 2410 best_cost_to_end = cost_to_end; 2411 cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; 2412 } 2413 } while (++len <= match->length); 2414 } while (++match != cache_ptr); 2415 cache_ptr -= num_matches; 2416 } 2417 cur_node->cost_to_end = best_cost_to_end; 2418 } while (cur_node != &c->p.n.optimum_nodes[0]); 2419 } 2420 2421 /* 2422 * Choose the literal/match sequence to use for the current block. The basic 2423 * algorithm finds a minimum-cost path through the block's graph of 2424 * literal/match choices, given a cost model. However, the cost of each symbol 2425 * is unknown until the Huffman codes have been built, but at the same time the 2426 * Huffman codes depend on the frequencies of chosen symbols. Consequently, 2427 * multiple passes must be used to try to approximate an optimal solution. The 2428 * first pass uses default costs, mixed with the costs from the previous block 2429 * if any. Later passes use the Huffman codeword lengths from the previous pass 2430 * as the costs. 2431 */ 2432 static void 2433 deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length, 2434 const struct lz_match *cache_ptr, bool is_first_block) 2435 { 2436 unsigned num_passes_remaining = c->p.n.num_optim_passes; 2437 u32 i; 2438 2439 /* Force the block to really end at the desired length, even if some 2440 * matches extend beyond it. */ 2441 for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, 2442 ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) 2443 c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; 2444 2445 /* Set the initial costs. */ 2446 if (is_first_block) 2447 deflate_set_default_costs(c); 2448 else 2449 deflate_adjust_costs(c); 2450 2451 for (;;) { 2452 /* Find the minimum cost path for this pass. */ 2453 deflate_find_min_cost_path(c, block_length, cache_ptr); 2454 2455 /* Compute frequencies of the chosen symbols. */ 2456 deflate_reset_symbol_frequencies(c); 2457 deflate_tally_item_list(c, block_length); 2458 2459 if (--num_passes_remaining == 0) 2460 break; 2461 2462 /* At least one optimization pass remains; update the costs. */ 2463 deflate_make_huffman_codes(&c->freqs, &c->codes); 2464 deflate_set_costs_from_codes(c, &c->codes.lens); 2465 } 2466 } 2467 2468 /* 2469 * This is the "near-optimal" DEFLATE compressor. It computes the optimal 2470 * representation of each DEFLATE block using a minimum-cost path search over 2471 * the graph of possible match/literal choices for that block, assuming a 2472 * certain cost for each Huffman symbol. 
 *
 * For several reasons, the end result is not guaranteed to be optimal:
 *
 * - Nonoptimal choice of blocks
 * - Heuristic limitations on which matches are actually considered
 * - Symbol costs are unknown until the symbols have already been chosen
 *   (so iterative optimization must be used)
 */
static size_t
deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
                              const u8 * restrict in, size_t in_nbytes,
                              u8 * restrict out, size_t out_nbytes_avail)
{
        const u8 *in_next = in;
        const u8 *in_end = in_next + in_nbytes;
        struct deflate_output_bitstream os;
        const u8 *in_cur_base = in_next;
        const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
        unsigned max_len = DEFLATE_MAX_MATCH_LEN;
        unsigned nice_len = MIN(c->nice_match_length, max_len);
        u32 next_hashes[2] = {0, 0};

        deflate_init_output(&os, out, out_nbytes_avail);
        bt_matchfinder_init(&c->p.n.bt_mf);

        do {
                /* Starting a new DEFLATE block. */

                struct lz_match *cache_ptr = c->p.n.match_cache;
                const u8 * const in_block_begin = in_next;
                const u8 * const in_max_block_end =
                        in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
                const u8 *next_observation = in_next;

                init_block_split_stats(&c->split_stats);

                /*
                 * Find matches until we decide to end the block. We end the
                 * block if any of the following is true:
                 *
                 * (1) Maximum block length has been reached.
                 * (2) Match cache may overflow.
                 * (3) Block split heuristic says to split now.
                 */
                do {
                        struct lz_match *matches;
                        unsigned best_len;

                        /* Slide the window forward if needed. */
                        if (in_next == in_next_slide) {
                                bt_matchfinder_slide_window(&c->p.n.bt_mf);
                                in_cur_base = in_next;
                                in_next_slide = in_next + MIN(in_end - in_next,
                                                              MATCHFINDER_WINDOW_SIZE);
                        }

                        /* Decrease the maximum and nice match lengths if we're
                         * approaching the end of the input buffer. */
                        if (unlikely(max_len > in_end - in_next)) {
                                max_len = in_end - in_next;
                                nice_len = MIN(nice_len, max_len);
                        }

                        /*
                         * Find matches at the current position using the
                         * binary tree matchfinder and save them in
                         * 'match_cache'.
                         *
                         * Note: the binary tree matchfinder is more suited for
                         * optimal parsing than the hash chain matchfinder. The
                         * reasons for this include:
                         *
                         * - The binary tree matchfinder can find more matches
                         *   in the same number of steps.
                         * - One of the major advantages of hash chains is that
                         *   skipping positions (not searching for matches at
                         *   them) is faster; however, with optimal parsing we
                         *   search for matches at almost all positions, so this
                         *   advantage of hash chains is negated.
2552 */ 2553 matches = cache_ptr; 2554 best_len = 0; 2555 if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { 2556 cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf, 2557 in_cur_base, 2558 in_next - in_cur_base, 2559 max_len, 2560 nice_len, 2561 c->max_search_depth, 2562 next_hashes, 2563 &best_len, 2564 matches); 2565 } 2566 2567 if (in_next >= next_observation) { 2568 if (best_len >= 4) { 2569 observe_match(&c->split_stats, best_len); 2570 next_observation = in_next + best_len; 2571 } else { 2572 observe_literal(&c->split_stats, *in_next); 2573 next_observation = in_next + 1; 2574 } 2575 } 2576 2577 cache_ptr->length = cache_ptr - matches; 2578 cache_ptr->offset = *in_next; 2579 in_next++; 2580 cache_ptr++; 2581 2582 /* 2583 * If there was a very long match found, don't cache any 2584 * matches for the bytes covered by that match. This 2585 * avoids degenerate behavior when compressing highly 2586 * redundant data, where the number of matches can be 2587 * very large. 2588 * 2589 * This heuristic doesn't actually hurt the compression 2590 * ratio very much. If there's a long match, then the 2591 * data must be highly compressible, so it doesn't 2592 * matter much what we do. 2593 */ 2594 if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) { 2595 --best_len; 2596 do { 2597 if (in_next == in_next_slide) { 2598 bt_matchfinder_slide_window(&c->p.n.bt_mf); 2599 in_cur_base = in_next; 2600 in_next_slide = in_next + MIN(in_end - in_next, 2601 MATCHFINDER_WINDOW_SIZE); 2602 } 2603 if (unlikely(max_len > in_end - in_next)) { 2604 max_len = in_end - in_next; 2605 nice_len = MIN(nice_len, max_len); 2606 } 2607 if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { 2608 bt_matchfinder_skip_position(&c->p.n.bt_mf, 2609 in_cur_base, 2610 in_next - in_cur_base, 2611 nice_len, 2612 c->max_search_depth, 2613 next_hashes); 2614 } 2615 cache_ptr->length = 0; 2616 cache_ptr->offset = *in_next; 2617 in_next++; 2618 cache_ptr++; 2619 } while (--best_len); 2620 } 2621 } while (in_next < in_max_block_end && 2622 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] && 2623 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); 2624 2625 /* All the matches for this block have been cached. Now choose 2626 * the sequence of items to output and flush the block. */ 2627 deflate_optimize_block(c, in_next - in_block_begin, cache_ptr, 2628 in_block_begin == in); 2629 deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin, 2630 in_next == in_end, true); 2631 } while (in_next != in_end); 2632 2633 return deflate_flush_output(&os); 2634 } 2635 2636 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ 2637 2638 /* Initialize c->offset_slot_fast. 
*/ 2639 static void 2640 deflate_init_offset_slot_fast(struct libdeflate_compressor *c) 2641 { 2642 unsigned offset_slot; 2643 unsigned offset; 2644 unsigned offset_end; 2645 2646 for (offset_slot = 0; 2647 offset_slot < ARRAY_LEN(deflate_offset_slot_base); 2648 offset_slot++) 2649 { 2650 offset = deflate_offset_slot_base[offset_slot]; 2651 #if USE_FULL_OFFSET_SLOT_FAST 2652 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2653 do { 2654 c->offset_slot_fast[offset] = offset_slot; 2655 } while (++offset != offset_end); 2656 #else 2657 if (offset <= 256) { 2658 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2659 do { 2660 c->offset_slot_fast[offset - 1] = offset_slot; 2661 } while (++offset != offset_end); 2662 } else { 2663 offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); 2664 do { 2665 c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; 2666 } while ((offset += (1 << 7)) != offset_end); 2667 } 2668 #endif 2669 } 2670 } 2671 2672 LIBDEFLATEAPI struct libdeflate_compressor * 2673 libdeflate_alloc_compressor(int compression_level) 2674 { 2675 struct libdeflate_compressor *c; 2676 size_t size; 2677 2678 #if SUPPORT_NEAR_OPTIMAL_PARSING 2679 if (compression_level >= 8) 2680 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n); 2681 else 2682 #endif 2683 size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g); 2684 2685 c = aligned_malloc(MATCHFINDER_ALIGNMENT, size); 2686 if (!c) 2687 return NULL; 2688 2689 switch (compression_level) { 2690 case 1: 2691 c->impl = deflate_compress_greedy; 2692 c->max_search_depth = 2; 2693 c->nice_match_length = 8; 2694 break; 2695 case 2: 2696 c->impl = deflate_compress_greedy; 2697 c->max_search_depth = 6; 2698 c->nice_match_length = 10; 2699 break; 2700 case 3: 2701 c->impl = deflate_compress_greedy; 2702 c->max_search_depth = 12; 2703 c->nice_match_length = 14; 2704 break; 2705 case 4: 2706 c->impl = deflate_compress_greedy; 2707 c->max_search_depth = 24; 2708 c->nice_match_length = 24; 2709 break; 2710 case 5: 2711 c->impl = deflate_compress_lazy; 2712 c->max_search_depth = 20; 2713 c->nice_match_length = 30; 2714 break; 2715 case 6: 2716 c->impl = deflate_compress_lazy; 2717 c->max_search_depth = 40; 2718 c->nice_match_length = 65; 2719 break; 2720 case 7: 2721 c->impl = deflate_compress_lazy; 2722 c->max_search_depth = 100; 2723 c->nice_match_length = 130; 2724 break; 2725 #if SUPPORT_NEAR_OPTIMAL_PARSING 2726 case 8: 2727 c->impl = deflate_compress_near_optimal; 2728 c->max_search_depth = 12; 2729 c->nice_match_length = 20; 2730 c->p.n.num_optim_passes = 1; 2731 break; 2732 case 9: 2733 c->impl = deflate_compress_near_optimal; 2734 c->max_search_depth = 16; 2735 c->nice_match_length = 26; 2736 c->p.n.num_optim_passes = 2; 2737 break; 2738 case 10: 2739 c->impl = deflate_compress_near_optimal; 2740 c->max_search_depth = 30; 2741 c->nice_match_length = 50; 2742 c->p.n.num_optim_passes = 2; 2743 break; 2744 case 11: 2745 c->impl = deflate_compress_near_optimal; 2746 c->max_search_depth = 60; 2747 c->nice_match_length = 80; 2748 c->p.n.num_optim_passes = 3; 2749 break; 2750 case 12: 2751 c->impl = deflate_compress_near_optimal; 2752 c->max_search_depth = 100; 2753 c->nice_match_length = 133; 2754 c->p.n.num_optim_passes = 4; 2755 break; 2756 #else 2757 case 8: 2758 c->impl = deflate_compress_lazy; 2759 c->max_search_depth = 150; 2760 c->nice_match_length = 200; 2761 break; 2762 case 9: 2763 c->impl = deflate_compress_lazy; 2764 c->max_search_depth = 200; 2765 
c->nice_match_length = DEFLATE_MAX_MATCH_LEN; 2766 break; 2767 #endif 2768 default: 2769 aligned_free(c); 2770 return NULL; 2771 } 2772 2773 c->compression_level = compression_level; 2774 2775 deflate_init_offset_slot_fast(c); 2776 deflate_init_static_codes(c); 2777 2778 return c; 2779 } 2780 2781 LIBDEFLATEAPI size_t 2782 libdeflate_deflate_compress(struct libdeflate_compressor *c, 2783 const void *in, size_t in_nbytes, 2784 void *out, size_t out_nbytes_avail) 2785 { 2786 if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) 2787 return 0; 2788 2789 /* For extremely small inputs just use a single uncompressed block. */ 2790 if (unlikely(in_nbytes < 16)) { 2791 struct deflate_output_bitstream os; 2792 deflate_init_output(&os, out, out_nbytes_avail); 2793 if (in_nbytes == 0) 2794 in = &os; /* Avoid passing NULL to memcpy() */ 2795 deflate_write_uncompressed_block(&os, in, in_nbytes, true); 2796 return deflate_flush_output(&os); 2797 } 2798 2799 return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); 2800 } 2801 2802 LIBDEFLATEAPI void 2803 libdeflate_free_compressor(struct libdeflate_compressor *c) 2804 { 2805 aligned_free(c); 2806 } 2807 2808 unsigned int 2809 deflate_get_compression_level(struct libdeflate_compressor *c) 2810 { 2811 return c->compression_level; 2812 } 2813 2814 LIBDEFLATEAPI size_t 2815 libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, 2816 size_t in_nbytes) 2817 { 2818 /* 2819 * The worst case is all uncompressed blocks where one block has length 2820 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. 2821 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, 2822 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. 2823 */ 2824 size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); 2825 return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING; 2826 }
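
/*
 * Illustrative usage sketch (not part of this file; names other than the
 * libdeflate_* functions are hypothetical). A typical caller sizes the output
 * buffer with libdeflate_deflate_compress_bound() so that compression cannot
 * run out of space:
 *
 *	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
 *	if (c != NULL) {
 *		size_t bound = libdeflate_deflate_compress_bound(c, in_nbytes);
 *		void *out = malloc(bound);
 *		size_t out_nbytes = 0;
 *		if (out != NULL)
 *			out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes,
 *								 out, bound);
 *		...the first 'out_nbytes' bytes of 'out' are the DEFLATE stream;
 *		a return value of 0 would mean the output did not fit, which
 *		should not occur with a 'bound'-sized buffer...
 *		libdeflate_free_compressor(c);
 *	}
 */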