github.com/grailbio/base@v0.0.11/compress/libdeflate/decompress_template.h (about) 1 // NOLINT(build/header_guard) 2 /* 3 * decompress_template.h 4 * 5 * Copyright 2016 Eric Biggers 6 * 7 * Permission is hereby granted, free of charge, to any person 8 * obtaining a copy of this software and associated documentation 9 * files (the "Software"), to deal in the Software without 10 * restriction, including without limitation the rights to use, 11 * copy, modify, merge, publish, distribute, sublicense, and/or sell 12 * copies of the Software, and to permit persons to whom the 13 * Software is furnished to do so, subject to the following 14 * conditions: 15 * 16 * The above copyright notice and this permission notice shall be 17 * included in all copies or substantial portions of the Software. 18 * 19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 21 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 23 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 24 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 25 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 26 * OTHER DEALINGS IN THE SOFTWARE. 27 */ 28 29 /* 30 * This is the actual DEFLATE decompression routine, lifted out of 31 * deflate_decompress.c so that it can be compiled multiple times with different 32 * target instruction sets. 33 */ 34 35 static enum libdeflate_result ATTRIBUTES 36 FUNCNAME(struct libdeflate_decompressor * restrict d, 37 const void * restrict in, size_t in_nbytes, 38 void * restrict out, size_t out_nbytes_avail, 39 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) 40 { 41 u8 *out_next = out; 42 u8 * const out_end = out_next + out_nbytes_avail; 43 const u8 *in_next = in; 44 const u8 * const in_end = in_next + in_nbytes; 45 bitbuf_t bitbuf = 0; 46 unsigned bitsleft = 0; 47 size_t overrun_count = 0; 48 unsigned i; 49 unsigned is_final_block; 50 unsigned block_type; 51 u16 len; 52 u16 nlen; 53 unsigned num_litlen_syms; 54 unsigned num_offset_syms; 55 u16 tmp16; 56 u32 tmp32; 57 58 next_block: 59 /* Starting to read the next block. */ 60 ; 61 62 STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4)); 63 ENSURE_BITS(1 + 2 + 5 + 5 + 4); 64 65 /* BFINAL: 1 bit */ 66 is_final_block = POP_BITS(1); 67 68 /* BTYPE: 2 bits */ 69 block_type = POP_BITS(2); 70 71 if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { 72 73 /* Dynamic Huffman block. */ 74 75 /* The order in which precode lengths are stored. */ 76 static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { 77 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 78 }; 79 80 unsigned num_explicit_precode_lens; 81 82 /* Read the codeword length counts. */ 83 84 STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); 85 num_litlen_syms = POP_BITS(5) + 257; 86 87 STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); 88 num_offset_syms = POP_BITS(5) + 1; 89 90 STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); 91 num_explicit_precode_lens = POP_BITS(4) + 4; 92 93 /* Read the precode codeword lengths. */ 94 STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); 95 if (CAN_ENSURE(DEFLATE_NUM_PRECODE_SYMS * 3)) { 96 97 ENSURE_BITS(DEFLATE_NUM_PRECODE_SYMS * 3); 98 99 for (i = 0; i < num_explicit_precode_lens; i++) 100 d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); 101 } else { 102 for (i = 0; i < num_explicit_precode_lens; i++) { 103 ENSURE_BITS(3); 104 d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); 105 } 106 } 107 108 for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) 109 d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; 110 111 /* Build the decode table for the precode. */ 112 SAFETY_CHECK(build_precode_decode_table(d)); 113 114 /* Expand the literal/length and offset codeword lengths. */ 115 for (i = 0; i < num_litlen_syms + num_offset_syms; ) { 116 u32 entry; 117 unsigned presym; 118 u8 rep_val; 119 unsigned rep_count; 120 121 ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); 122 123 /* (The code below assumes that the precode decode table 124 * does not have any subtables.) */ 125 STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); 126 127 /* Read the next precode symbol. */ 128 entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; 129 REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); 130 presym = entry >> HUFFDEC_RESULT_SHIFT; 131 132 if (presym < 16) { 133 /* Explicit codeword length */ 134 d->u.l.lens[i++] = presym; 135 continue; 136 } 137 138 /* Run-length encoded codeword lengths */ 139 140 /* Note: we don't need verify that the repeat count 141 * doesn't overflow the number of elements, since we 142 * have enough extra spaces to allow for the worst-case 143 * overflow (138 zeroes when only 1 length was 144 * remaining). 145 * 146 * In the case of the small repeat counts (presyms 16 147 * and 17), it is fastest to always write the maximum 148 * number of entries. That gets rid of branches that 149 * would otherwise be required. 150 * 151 * It is not just because of the numerical order that 152 * our checks go in the order 'presym < 16', 'presym == 153 * 16', and 'presym == 17'. For typical data this is 154 * ordered from most frequent to least frequent case. 155 */ 156 STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); 157 158 if (presym == 16) { 159 /* Repeat the previous length 3 - 6 times */ 160 SAFETY_CHECK(i != 0); 161 rep_val = d->u.l.lens[i - 1]; 162 STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); 163 rep_count = 3 + POP_BITS(2); 164 d->u.l.lens[i + 0] = rep_val; 165 d->u.l.lens[i + 1] = rep_val; 166 d->u.l.lens[i + 2] = rep_val; 167 d->u.l.lens[i + 3] = rep_val; 168 d->u.l.lens[i + 4] = rep_val; 169 d->u.l.lens[i + 5] = rep_val; 170 i += rep_count; 171 } else if (presym == 17) { 172 /* Repeat zero 3 - 10 times */ 173 STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); 174 rep_count = 3 + POP_BITS(3); 175 d->u.l.lens[i + 0] = 0; 176 d->u.l.lens[i + 1] = 0; 177 d->u.l.lens[i + 2] = 0; 178 d->u.l.lens[i + 3] = 0; 179 d->u.l.lens[i + 4] = 0; 180 d->u.l.lens[i + 5] = 0; 181 d->u.l.lens[i + 6] = 0; 182 d->u.l.lens[i + 7] = 0; 183 d->u.l.lens[i + 8] = 0; 184 d->u.l.lens[i + 9] = 0; 185 i += rep_count; 186 } else { 187 /* Repeat zero 11 - 138 times */ 188 STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); 189 rep_count = 11 + POP_BITS(7); 190 memset(&d->u.l.lens[i], 0, 191 rep_count * sizeof(d->u.l.lens[i])); 192 i += rep_count; 193 } 194 } 195 } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { 196 197 /* Uncompressed block: copy 'len' bytes literally from the input 198 * buffer to the output buffer. */ 199 200 ALIGN_INPUT(); 201 202 SAFETY_CHECK(in_end - in_next >= 4); 203 204 len = READ_U16(); 205 nlen = READ_U16(); 206 207 SAFETY_CHECK(len == (u16)~nlen); 208 if (unlikely(len > out_end - out_next)) 209 return LIBDEFLATE_INSUFFICIENT_SPACE; 210 SAFETY_CHECK(len <= in_end - in_next); 211 212 memcpy(out_next, in_next, len); 213 in_next += len; 214 out_next += len; 215 216 goto block_done; 217 218 } else { 219 SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); 220 221 /* Static Huffman block: set the static Huffman codeword 222 * lengths. Then the remainder is the same as decompressing a 223 * dynamic Huffman block. */ 224 225 STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); 226 STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); 227 228 for (i = 0; i < 144; i++) 229 d->u.l.lens[i] = 8; 230 for (; i < 256; i++) 231 d->u.l.lens[i] = 9; 232 for (; i < 280; i++) 233 d->u.l.lens[i] = 7; 234 for (; i < 288; i++) 235 d->u.l.lens[i] = 8; 236 237 for (; i < 288 + 32; i++) 238 d->u.l.lens[i] = 5; 239 240 num_litlen_syms = 288; 241 num_offset_syms = 32; 242 243 } 244 245 /* Decompressing a Huffman block (either dynamic or static) */ 246 247 SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); 248 SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); 249 250 /* The main DEFLATE decode loop */ 251 for (;;) { 252 u32 entry; 253 u32 length; 254 u32 offset; 255 256 /* Decode a litlen symbol. */ 257 ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); 258 entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)]; 259 if (entry & HUFFDEC_SUBTABLE_POINTER) { 260 /* Litlen subtable required (uncommon case) */ 261 REMOVE_BITS(LITLEN_TABLEBITS); 262 entry = d->u.litlen_decode_table[ 263 ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + 264 BITS(entry & HUFFDEC_LENGTH_MASK)]; 265 } 266 REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); 267 if (entry & HUFFDEC_LITERAL) { 268 /* Literal */ 269 if (unlikely(out_next == out_end)) 270 return LIBDEFLATE_INSUFFICIENT_SPACE; 271 *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); 272 continue; 273 } 274 275 /* Match or end-of-block */ 276 277 entry >>= HUFFDEC_RESULT_SHIFT; 278 ENSURE_BITS(MAX_ENSURE); 279 280 /* Pop the extra length bits and add them to the length base to 281 * produce the full length. */ 282 length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + 283 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); 284 285 /* The match destination must not end after the end of the 286 * output buffer. For efficiency, combine this check with the 287 * end-of-block check. We're using 0 for the special 288 * end-of-block length, so subtract 1 and it turn it into 289 * SIZE_MAX. */ 290 STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); 291 if (unlikely((size_t)length - 1 >= out_end - out_next)) { 292 if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH)) 293 return LIBDEFLATE_INSUFFICIENT_SPACE; 294 goto block_done; 295 } 296 297 /* Decode the match offset. */ 298 299 entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; 300 if (entry & HUFFDEC_SUBTABLE_POINTER) { 301 /* Offset subtable required (uncommon case) */ 302 REMOVE_BITS(OFFSET_TABLEBITS); 303 entry = d->offset_decode_table[ 304 ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + 305 BITS(entry & HUFFDEC_LENGTH_MASK)]; 306 } 307 REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); 308 entry >>= HUFFDEC_RESULT_SHIFT; 309 310 STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + 311 DEFLATE_MAX_OFFSET_CODEWORD_LEN) && 312 CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS)); 313 if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + 314 DEFLATE_MAX_OFFSET_CODEWORD_LEN + 315 DEFLATE_MAX_EXTRA_OFFSET_BITS)) 316 ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); 317 318 /* Pop the extra offset bits and add them to the offset base to 319 * produce the full offset. */ 320 offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + 321 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); 322 323 /* The match source must not begin before the beginning of the 324 * output buffer. */ 325 SAFETY_CHECK(offset <= out_next - (const u8 *)out); 326 327 /* Copy the match: 'length' bytes at 'out_next - offset' to 328 * 'out_next'. */ 329 330 if (UNALIGNED_ACCESS_IS_FAST && 331 length <= (3 * WORDBYTES) && 332 offset >= WORDBYTES && 333 length + (3 * WORDBYTES) <= out_end - out_next) 334 { 335 /* Fast case: short length, no overlaps if we copy one 336 * word at a time, and we aren't getting too close to 337 * the end of the output array. */ 338 copy_word_unaligned(out_next - offset + (0 * WORDBYTES), 339 out_next + (0 * WORDBYTES)); 340 copy_word_unaligned(out_next - offset + (1 * WORDBYTES), 341 out_next + (1 * WORDBYTES)); 342 copy_word_unaligned(out_next - offset + (2 * WORDBYTES), 343 out_next + (2 * WORDBYTES)); 344 } else { 345 const u8 *src = out_next - offset; 346 u8 *dst = out_next; 347 u8 *end = out_next + length; 348 349 if (UNALIGNED_ACCESS_IS_FAST && 350 likely(out_end - end >= WORDBYTES - 1)) { 351 if (offset >= WORDBYTES) { 352 copy_word_unaligned(src, dst); 353 src += WORDBYTES; 354 dst += WORDBYTES; 355 if (dst < end) { 356 do { 357 copy_word_unaligned(src, dst); 358 src += WORDBYTES; 359 dst += WORDBYTES; 360 } while (dst < end); 361 } 362 } else if (offset == 1) { 363 machine_word_t v = repeat_byte(*(dst - 1)); 364 do { 365 store_word_unaligned(v, dst); 366 src += WORDBYTES; 367 dst += WORDBYTES; 368 } while (dst < end); 369 } else { 370 *dst++ = *src++; 371 *dst++ = *src++; 372 do { 373 *dst++ = *src++; 374 } while (dst < end); 375 } 376 } else { 377 *dst++ = *src++; 378 *dst++ = *src++; 379 do { 380 *dst++ = *src++; 381 } while (dst < end); 382 } 383 } 384 385 out_next += length; 386 } 387 388 block_done: 389 /* Finished decoding a block. */ 390 391 if (!is_final_block) 392 goto next_block; 393 394 /* That was the last block. */ 395 396 /* Discard any readahead bits and check for excessive overread */ 397 ALIGN_INPUT(); 398 399 /* Optionally return the actual number of bytes read */ 400 if (actual_in_nbytes_ret) 401 *actual_in_nbytes_ret = in_next - (u8 *)in; 402 403 /* Optionally return the actual number of bytes written */ 404 if (actual_out_nbytes_ret) { 405 *actual_out_nbytes_ret = out_next - (u8 *)out; 406 } else { 407 if (out_next != out_end) 408 return LIBDEFLATE_SHORT_OUTPUT; 409 } 410 return LIBDEFLATE_SUCCESS; 411 } 412 413 #undef FUNCNAME 414 #undef ATTRIBUTES