github.com/grailbio/base@v0.0.11/compress/libdeflate/decompress_template.h

github.com/grailbio/base@v0.0.11/compress/libdeflate/decompress_template.h (about)

     1  // NOLINT(build/header_guard)
     2  /*
     3   * decompress_template.h
     4   *
     5   * Copyright 2016 Eric Biggers
     6   *
     7   * Permission is hereby granted, free of charge, to any person
     8   * obtaining a copy of this software and associated documentation
     9   * files (the "Software"), to deal in the Software without
    10   * restriction, including without limitation the rights to use,
    11   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    12   * copies of the Software, and to permit persons to whom the
    13   * Software is furnished to do so, subject to the following
    14   * conditions:
    15   *
    16   * The above copyright notice and this permission notice shall be
    17   * included in all copies or substantial portions of the Software.
    18   *
    19   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    20   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    21   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    22   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    23   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    24   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    25   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    26   * OTHER DEALINGS IN THE SOFTWARE.
    27   */
    28  
    29  /*
    30   * This is the actual DEFLATE decompression routine, lifted out of
    31   * deflate_decompress.c so that it can be compiled multiple times with different
    32   * target instruction sets.
    33   */
    34  
    35  static enum libdeflate_result ATTRIBUTES	/* result code: see the 'return' statements in the body */
    36  FUNCNAME(struct libdeflate_decompressor * restrict d,	/* decoder state: codeword lengths and decode tables */
    37  	 const void * restrict in, size_t in_nbytes,	/* compressed input and its size in bytes */
    38  	 void * restrict out, size_t out_nbytes_avail,	/* output buffer and its capacity in bytes */
    39  	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)	/* optional results; each may be NULL */
    40  {
    41  	u8 *out_next = out;	/* next output byte to write */
    42  	u8 * const out_end = out_next + out_nbytes_avail;	/* one past the end of the output buffer */
    43  	const u8 *in_next = in;	/* next input byte to read */
    44  	const u8 * const in_end = in_next + in_nbytes;	/* one past the end of the input */
    45  	bitbuf_t bitbuf = 0;	/* bitbuf, bitsleft and overrun_count are not named again in this file; */
    46  	unsigned bitsleft = 0;	/* presumably they are the bit-reader state used by the ENSURE_BITS, */
    47  	size_t overrun_count = 0;	/* POP_BITS and ALIGN_INPUT macros -- confirm in deflate_decompress.c */
    48  	unsigned i;	/* general-purpose index */
    49  	unsigned is_final_block;	/* BFINAL bit of the current block */
    50  	unsigned block_type;	/* BTYPE field of the current block */
    51  	u16 len;	/* LEN field of an uncompressed block */
    52  	u16 nlen;	/* NLEN field of an uncompressed block */
    53  	unsigned num_litlen_syms;	/* number of literal/length codeword lengths in this block */
    54  	unsigned num_offset_syms;	/* number of offset codeword lengths in this block */
    55  	u16 tmp16;	/* tmp16 and tmp32 are not named again in this file; presumably scratch */
    56  	u32 tmp32;	/* space for macros such as READ_U16 -- confirm in deflate_decompress.c */
    57  
    58  next_block:
    59  	/* Starting to read the next block.  */
    60  	;
    61  
    62  	STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
    63  	ENSURE_BITS(1 + 2 + 5 + 5 + 4);	/* BFINAL + BTYPE + the three counts of a dynamic block */
    64  
    65  	/* BFINAL: 1 bit  */
    66  	is_final_block = POP_BITS(1);
    67  
    68  	/* BTYPE: 2 bits  */
    69  	block_type = POP_BITS(2);
    70  
    71  	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
    72  
    73  		/* Dynamic Huffman block.  */
    74  
    75  		/* The order in which precode lengths are stored.  */
    76  		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
    77  			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
    78  		};
    79  
    80  		unsigned num_explicit_precode_lens;	/* how many precode lengths the block stores */
    81  
    82  		/* Read the codeword length counts.  */
    83  
    84  		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
    85  		num_litlen_syms = POP_BITS(5) + 257;	/* 5-bit field (HLIT in RFC 1951): 257..288 */
    86  
    87  		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
    88  		num_offset_syms = POP_BITS(5) + 1;	/* 5-bit field (HDIST in RFC 1951): 1..32 */
    89  
    90  		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
    91  		num_explicit_precode_lens = POP_BITS(4) + 4;	/* 4-bit field (HCLEN in RFC 1951): 4..19 */
    92  
    93  		/* Read the precode codeword lengths: 3 bits each, stored in the permuted order above.  */
    94  		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
    95  		if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) {	/* presumably: can all of them be buffered at once? */
    96  
    97  			ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3);	/* a single request covers every length */
    98  
    99  			for (i = 0; i < num_explicit_precode_lens; i++)
   100  				d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
   101  		} else {	/* otherwise request bits before each 3-bit length */
   102  			for (i = 0; i < num_explicit_precode_lens; i++) {
   103  				ENSURE_BITS(3);
   104  				d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
   105  			}
   106  		}
   107  
   108  		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)	/* lengths the block did not store are zero */
   109  			d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
   110  
   111  		/* Build the decode table for the precode.  */
   112  		SAFETY_CHECK(build_precode_decode_table(d));	/* SAFETY_CHECK: defined elsewhere; presumably bails out when false -- confirm */
   113  
   114  		/* Expand the literal/length and offset codeword lengths, as one run, into d->u.l.lens[].  */
   115  		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
   116  			u32 entry;	/* precode decode table entry */
   117  			unsigned presym;	/* decoded precode symbol */
   118  			u8 rep_val;	/* the length repeated by presym 16 */
   119  			unsigned rep_count;	/* how many lengths the run really covers */
   120  
   121  			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);	/* one precode codeword plus at most 7 extra bits */
   122  
   123  			/* (The code below assumes that the precode decode table
   124  			 * does not have any subtables.)  */
   125  			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
   126  
   127  			/* Read the next precode symbol.  */
   128  			entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
   129  			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);	/* masked field: number of bits to consume */
   130  			presym = entry >> HUFFDEC_RESULT_SHIFT;	/* shifted field: the symbol itself */
   131  
   132  			if (presym < 16) {
   133  				/* Explicit codeword length  */
   134  				d->u.l.lens[i++] = presym;
   135  				continue;
   136  			}
   137  
   138  			/* Run-length encoded codeword lengths  */
   139  
   140  			/* Note: we don't need to verify that the repeat count
   141  			 * doesn't overflow the number of elements, since we
   142  			 * have enough extra spaces to allow for the worst-case
   143  			 * overflow (138 zeroes when only 1 length was
   144  			 * remaining).
   145  			 *
   146  			 * In the case of the small repeat counts (presyms 16
   147  			 * and 17), it is fastest to always write the maximum
   148  			 * number of entries.  That gets rid of branches that
   149  			 * would otherwise be required.
   150  			 *
   151  			 * It is not just because of the numerical order that
   152  			 * our checks go in the order 'presym < 16', 'presym ==
   153  			 * 16', and 'presym == 17'.  For typical data this is
   154  			 * ordered from most frequent to least frequent case.
   155  			 */
   156  			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
   157  
   158  			if (presym == 16) {
   159  				/* Repeat the previous length 3 - 6 times  */
   160  				SAFETY_CHECK(i != 0);	/* there must be a previous length to repeat */
   161  				rep_val = d->u.l.lens[i - 1];
   162  				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
   163  				rep_count = 3 + POP_BITS(2);
   164  				d->u.l.lens[i + 0] = rep_val;
   165  				d->u.l.lens[i + 1] = rep_val;
   166  				d->u.l.lens[i + 2] = rep_val;
   167  				d->u.l.lens[i + 3] = rep_val;
   168  				d->u.l.lens[i + 4] = rep_val;
   169  				d->u.l.lens[i + 5] = rep_val;
   170  				i += rep_count;	/* all 6 slots were written; advance by the real count only */
   171  			} else if (presym == 17) {
   172  				/* Repeat zero 3 - 10 times  */
   173  				STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
   174  				rep_count = 3 + POP_BITS(3);
   175  				d->u.l.lens[i + 0] = 0;
   176  				d->u.l.lens[i + 1] = 0;
   177  				d->u.l.lens[i + 2] = 0;
   178  				d->u.l.lens[i + 3] = 0;
   179  				d->u.l.lens[i + 4] = 0;
   180  				d->u.l.lens[i + 5] = 0;
   181  				d->u.l.lens[i + 6] = 0;
   182  				d->u.l.lens[i + 7] = 0;
   183  				d->u.l.lens[i + 8] = 0;
   184  				d->u.l.lens[i + 9] = 0;
   185  				i += rep_count;	/* all 10 slots were written; advance by the real count only */
   186  			} else {
   187  				/* Repeat zero 11 - 138 times  */
   188  				STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
   189  				rep_count = 11 + POP_BITS(7);
   190  				memset(&d->u.l.lens[i], 0,
   191  				       rep_count * sizeof(d->u.l.lens[i]));
   192  				i += rep_count;
   193  			}
   194  		}
   195  	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
   196  
   197  		/* Uncompressed block: copy 'len' bytes literally from the input
   198  		 * buffer to the output buffer.  */
   199  
   200  		ALIGN_INPUT();	/* LEN, NLEN and the data are then read as whole bytes at in_next */
   201  
   202  		SAFETY_CHECK(in_end - in_next >= 4);	/* room for the two 16-bit fields read next */
   203  
   204  		len = READ_U16();
   205  		nlen = READ_U16();
   206  
   207  		SAFETY_CHECK(len == (u16)~nlen);	/* NLEN must be the one's complement of LEN */
   208  		if (unlikely(len > out_end - out_next))
   209  			return LIBDEFLATE_INSUFFICIENT_SPACE;	/* the block does not fit in the output buffer */
   210  		SAFETY_CHECK(len <= in_end - in_next);	/* the input must really contain 'len' more bytes */
   211  
   212  		memcpy(out_next, in_next, len);
   213  		in_next += len;
   214  		out_next += len;
   215  
   216  		goto block_done;	/* no Huffman decoding for this block type */
   217  
   218  	} else {
   219  		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);	/* no other BTYPE value is accepted here */
   220  
   221  		/* Static Huffman block: set the static Huffman codeword
   222  		 * lengths.  Then the remainder is the same as decompressing a
   223  		 * dynamic Huffman block.  */
   224  
   225  		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
   226  		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
   227  
   228  		for (i = 0; i < 144; i++)	/* litlen symbols 0..143: 8 bits */
   229  			d->u.l.lens[i] = 8;
   230  		for (; i < 256; i++)	/* litlen symbols 144..255: 9 bits */
   231  			d->u.l.lens[i] = 9;
   232  		for (; i < 280; i++)	/* litlen symbols 256..279: 7 bits */
   233  			d->u.l.lens[i] = 7;
   234  		for (; i < 288; i++)	/* litlen symbols 280..287: 8 bits */
   235  			d->u.l.lens[i] = 8;
   236  
   237  		for (; i < 288 + 32; i++)	/* the 32 offset symbols follow: 5 bits each */
   238  			d->u.l.lens[i] = 5;
   239  
   240  		num_litlen_syms = 288;
   241  		num_offset_syms = 32;
   242  
   243  	}
   244  
   245  	/* Decompressing a Huffman block (either dynamic or static)  */
   246  
   247  	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
   248  	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
   249  
   250  	/* The main DEFLATE decode loop: one iteration per literal or match  */
   251  	for (;;) {
   252  		u32 entry;	/* current decode table entry */
   253  		u32 length;	/* match length; 0 marks end-of-block */
   254  		u32 offset;	/* match distance back from out_next */
   255  
   256  		/* Decode a litlen symbol.  */
   257  		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
   258  		entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
   259  		if (entry & HUFFDEC_SUBTABLE_POINTER) {
   260  			/* Litlen subtable required (uncommon case)  */
   261  			REMOVE_BITS(LITLEN_TABLEBITS);
   262  			entry = d->u.litlen_decode_table[
   263  				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
   264  				BITS(entry & HUFFDEC_LENGTH_MASK)];
   265  		}
   266  		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);	/* consume the bits this entry accounts for */
   267  		if (entry & HUFFDEC_LITERAL) {
   268  			/* Literal  */
   269  			if (unlikely(out_next == out_end))
   270  				return LIBDEFLATE_INSUFFICIENT_SPACE;	/* no room left for even one byte */
   271  			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
   272  			continue;
   273  		}
   274  
   275  		/* Match or end-of-block  */
   276  
   277  		entry >>= HUFFDEC_RESULT_SHIFT;
   278  		ENSURE_BITS(MAX_ENSURE);	/* request bits for the extra length bits and the offset codeword */
   279  
   280  		/* Pop the extra length bits and add them to the length base to
   281  		 * produce the full length.  */
   282  		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
   283  			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
   284  
   285  		/* The match destination must not end after the end of the
   286  		 * output buffer.  For efficiency, combine this check with the
   287  		 * end-of-block check.  We're using 0 for the special
   288  		 * end-of-block length, so subtracting 1 turns it into
   289  		 * SIZE_MAX.  */
   290  		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
   291  		if (unlikely((size_t)length - 1 >= out_end - out_next)) {
   292  			if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
   293  				return LIBDEFLATE_INSUFFICIENT_SPACE;	/* a real match that does not fit */
   294  			goto block_done;	/* length == 0: end of block */
   295  		}
   296  
   297  		/* Decode the match offset.  */
   298  
   299  		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
   300  		if (entry & HUFFDEC_SUBTABLE_POINTER) {
   301  			/* Offset subtable required (uncommon case)  */
   302  			REMOVE_BITS(OFFSET_TABLEBITS);
   303  			entry = d->offset_decode_table[
   304  				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
   305  				BITS(entry & HUFFDEC_LENGTH_MASK)];
   306  		}
   307  		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
   308  		entry >>= HUFFDEC_RESULT_SHIFT;
   309  
   310  		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
   311  					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
   312  			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
   313  		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
   314  				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
   315  				DEFLATE_MAX_EXTRA_OFFSET_BITS))
   316  			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);	/* only when one request cannot cover all three fields */
   317  
   318  		/* Pop the extra offset bits and add them to the offset base to
   319  		 * produce the full offset.  */
   320  		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
   321  			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
   322  
   323  		/* The match source must not begin before the beginning of the
   324  		 * output buffer.  */
   325  		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
   326  
   327  		/* Copy the match: 'length' bytes at 'out_next - offset' to
   328  		 * 'out_next'.  */
   329  
   330  		if (UNALIGNED_ACCESS_IS_FAST &&
   331  		    length <= (3 * WORDBYTES) &&
   332  		    offset >= WORDBYTES &&
   333  		    length + (3 * WORDBYTES) <= out_end - out_next)
   334  		{
   335  			/* Fast case: short length, no overlaps if we copy one word at a time,
   336  			 * and we aren't getting too close to the end of the output array.  Three
   337  			 * words are always copied; out_next still advances by 'length' only.  */
   338  			copy_word_unaligned(out_next - offset + (0 * WORDBYTES),
   339  					    out_next + (0 * WORDBYTES));
   340  			copy_word_unaligned(out_next - offset + (1 * WORDBYTES),
   341  					    out_next + (1 * WORDBYTES));
   342  			copy_word_unaligned(out_next - offset + (2 * WORDBYTES),
   343  					    out_next + (2 * WORDBYTES));
   344  		} else {
   345  			const u8 *src = out_next - offset;	/* start of the match source */
   346  			u8 *dst = out_next;	/* start of the match destination */
   347  			u8 *end = out_next + length;	/* one past the last destination byte */
   348  
   349  			if (UNALIGNED_ACCESS_IS_FAST &&
   350  			    likely(out_end - end >= WORDBYTES - 1)) {	/* slack for a word store that overshoots 'end' */
   351  				if (offset >= WORDBYTES) {	/* a source word never overlaps its destination word */
   352  					copy_word_unaligned(src, dst);
   353  					src += WORDBYTES;
   354  					dst += WORDBYTES;
   355  					if (dst < end) {
   356  						do {
   357  							copy_word_unaligned(src, dst);
   358  							src += WORDBYTES;
   359  							dst += WORDBYTES;
   360  						} while (dst < end);
   361  					}
   362  				} else if (offset == 1) {	/* the match is a run of the previous output byte */
   363  					machine_word_t v = repeat_byte(*(dst - 1));	/* presumably that byte in every byte of a word */
   364  					do {
   365  						store_word_unaligned(v, dst);
   366  						src += WORDBYTES;	/* src is not read in this branch */
   367  						dst += WORDBYTES;
   368  					} while (dst < end);
   369  				} else {	/* 1 < offset < WORDBYTES: go byte by byte */
   370  					*dst++ = *src++;
   371  					*dst++ = *src++;
   372  					do {	/* together with the two copies above, at least 3 bytes are copied */
   373  						*dst++ = *src++;
   374  					} while (dst < end);
   375  				}
   376  			} else {	/* unaligned access is not fast, or 'end' is too close to out_end */
   377  				*dst++ = *src++;
   378  				*dst++ = *src++;
   379  				do {	/* together with the two copies above, at least 3 bytes are copied */
   380  					*dst++ = *src++;
   381  				} while (dst < end);
   382  			}
   383  		}
   384  
   385  		out_next += length;	/* whatever was over-copied past this point gets overwritten later */
   386  	}
   387  
   388  block_done:
   389  	/* Finished decoding a block.  */
   390  
   391  	if (!is_final_block)
   392  		goto next_block;
   393  
   394  	/* That was the last block.  */
   395  
   396  	/* Discard any readahead bits and check for excessive overread */
   397  	ALIGN_INPUT();
   398  
   399  	/* Optionally return the actual number of bytes read */
   400  	if (actual_in_nbytes_ret)
   401  		*actual_in_nbytes_ret = in_next - (u8 *)in;
   402  
   403  	/* Optionally return the actual number of bytes written */
   404  	if (actual_out_nbytes_ret) {
   405  		*actual_out_nbytes_ret = out_next - (u8 *)out;
   406  	} else {
   407  		if (out_next != out_end)	/* no size requested: the output must fill the buffer exactly */
   408  			return LIBDEFLATE_SHORT_OUTPUT;
   409  	}
   410  	return LIBDEFLATE_SUCCESS;
   411  }
   412  
   413  #undef FUNCNAME	/* presumably so the including file can define it afresh before including this template again */
   414  #undef ATTRIBUTES	/* likewise for the function attributes */