github.com/grailbio/base@v0.0.11/compress/libdeflate/hc_matchfinder.h

github.com/grailbio/base@v0.0.11/compress/libdeflate/hc_matchfinder.h (about)

     1  #ifndef GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_HC_MATCHFINDER_H_
     2  #define GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_HC_MATCHFINDER_H_
     3  /*
     4   * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
     5   *
     6   * Originally public domain; changes after 2016-09-07 are copyrighted.
     7   *
     8   * Copyright 2016 Eric Biggers
     9   *
    10   * Permission is hereby granted, free of charge, to any person
    11   * obtaining a copy of this software and associated documentation
    12   * files (the "Software"), to deal in the Software without
    13   * restriction, including without limitation the rights to use,
    14   * copy, modify, merge, publish, distribute, sublicense, and/or sell
    15   * copies of the Software, and to permit persons to whom the
    16   * Software is furnished to do so, subject to the following
    17   * conditions:
    18   *
    19   * The above copyright notice and this permission notice shall be
    20   * included in all copies or substantial portions of the Software.
    21   *
    22   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    23   * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    24   * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    25   * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    26   * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    27   * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    28   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    29   * OTHER DEALINGS IN THE SOFTWARE.
    30   *
    31   * ---------------------------------------------------------------------------
    32   *
    33   *				   Algorithm
    34   *
    35   * This is a Hash Chains (hc) based matchfinder.
    36   *
    37   * The main data structure is a hash table where each hash bucket contains a
    38   * linked list (or "chain") of sequences whose first 4 bytes share the same hash
    39   * code.  Each sequence is identified by its starting position in the input
    40   * buffer.
    41   *
    42   * The algorithm processes the input buffer sequentially.  At each byte
    43   * position, the hash code of the first 4 bytes of the sequence beginning at
    44   * that position (the sequence being matched against) is computed.  This
    45   * identifies the hash bucket to use for that position.  Then, this hash
    46   * bucket's linked list is searched for matches.  Then, a new linked list node
    47   * is created to represent the current sequence and is prepended to the list.
    48   *
    49   * This algorithm has several useful properties:
    50   *
    51   * - It only finds true Lempel-Ziv matches; i.e., those where the matching
    52   *   sequence occurs prior to the sequence being matched against.
    53   *
    54   * - The sequences in each linked list are always sorted by decreasing starting
    55   *   position.  Therefore, the closest (smallest offset) matches are found
    56   *   first, which in many compression formats tend to be the cheapest to encode.
    57   *
    58   * - Although fast running time is not guaranteed due to the possibility of the
    59   *   lists getting very long, the worst degenerate behavior can be easily
    60   *   prevented by capping the number of nodes searched at each position.
    61   *
    62   * - If the compressor decides not to search for matches at a certain position,
    63   *   then that position can be quickly inserted without searching the list.
    64   *
    65   * - The algorithm is adaptable to sliding windows: just store the positions
    66   *   relative to a "base" value that is updated from time to time, and stop
    67   *   searching each list when the sequences get too far away.
    68   *
    69   * ----------------------------------------------------------------------------
    70   *
    71   *				 Optimizations
    72   *
    73   * The main hash table and chains handle length 4+ matches.  Length 3 matches
    74   * are handled by a separate hash table with no chains.  This works well for
    75   * typical "greedy" or "lazy"-style compressors, where length 3 matches are
    76   * often only helpful if they have small offsets.  Instead of searching a full
    77   * chain for length 3+ matches, the algorithm just checks for one close length 3
    78   * match, then focuses on finding length 4+ matches.
    79   *
    80   * The longest_match() and skip_positions() functions are inlined into the
    81   * compressors that use them.  This isn't just about saving the overhead of a
    82   * function call.  These functions are intended to be called from the inner
    83   * loops of compressors, where giving the compiler more control over register
    84   * allocation is very helpful.  There is also significant benefit to be gained
    85   * from allowing the CPU to predict branches independently at each call site.
    86   * For example, "lazy"-style compressors can be written with two calls to
    87   * longest_match(), each of which starts with a different 'best_len' and
    88   * therefore has significantly different performance characteristics.
    89   *
    90   * Although any hash function can be used, a multiplicative hash is fast and
    91   * works well.
    92   *
    93   * On some processors, it is significantly faster to extend matches by whole
    94   * words (32 or 64 bits) instead of by individual bytes.  For this to be the
    95   * case, the processor must implement unaligned memory accesses efficiently and
    96   * must have either a fast "find first set bit" instruction or a fast "find last
    97   * set bit" instruction, depending on the processor's endianness.
    98   *
    99   * The code uses one loop for finding the first match and one loop for finding a
   100   * longer match.  Each of these loops is tuned for its respective task and in
   101   * combination are faster than a single generalized loop that handles both
   102   * tasks.
   103   *
   104   * The code also uses a tight inner loop that only compares the last and first
   105   * bytes of a potential match.  It is only when these bytes match that a full
   106   * match extension is attempted.
   107   *
   108   * ----------------------------------------------------------------------------
   109   */
   110  
   111  #include "matchfinder_common.h"
   112  
/* log2 of the number of entries in the length 3 hash table ('hash3_tab').  */
#define HC_MATCHFINDER_HASH3_ORDER	15
/* log2 of the number of entries in the length 4+ hash table ('hash4_tab').  */
#define HC_MATCHFINDER_HASH4_ORDER	16

/* Combined number of entries in 'hash3_tab' and 'hash4_tab'.  This is the
 * length that hc_matchfinder_init() passes to matchfinder_init(); it does not
 * include 'next_tab'.  */
#define HC_MATCHFINDER_TOTAL_HASH_LENGTH		\
	((1UL << HC_MATCHFINDER_HASH3_ORDER) +		\
	 (1UL << HC_MATCHFINDER_HASH4_ORDER))
   119  
/*
 * State of the hash chains matchfinder.
 *
 * Layout note: hc_matchfinder_init() and hc_matchfinder_slide_window() both
 * cast a pointer to this structure to 'mf_pos_t *' and treat it as one flat
 * array.  Initialization covers the first HC_MATCHFINDER_TOTAL_HASH_LENGTH
 * entries, so the two hash tables must remain the first members, in this
 * order, and every member must be an array of mf_pos_t.
 */
struct hc_matchfinder {

	/* The hash table for finding length 3 matches  */
	mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];

	/* The hash table which contains the first nodes of the linked lists for
	 * finding length 4+ matches  */
	mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];

	/* The "next node" references for the linked lists.  The "next node" of
	 * the node for the sequence with position 'pos' is 'next_tab[pos]'.  */
	mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];

}
/* Apply the matchfinder alignment only when the build defines
 * _aligned_attribute.  */
#ifdef _aligned_attribute
  _aligned_attribute(MATCHFINDER_ALIGNMENT)
#endif
;
   138  
   139  /* Prepare the matchfinder for a new input buffer.  */
   140  static forceinline void
   141  hc_matchfinder_init(struct hc_matchfinder *mf)
   142  {
   143  	matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_LENGTH);
   144  }
   145  
   146  static forceinline void
   147  hc_matchfinder_slide_window(struct hc_matchfinder *mf)
   148  {
   149  	matchfinder_rebase((mf_pos_t *)mf,
   150  			   sizeof(struct hc_matchfinder) / sizeof(mf_pos_t));
   151  }
   152  
/*
 * Find the longest match longer than 'best_len' bytes.
 *
 * @mf
 *	The matchfinder structure.
 * @in_base_p
 *	Location of a pointer which points to the place in the input data the
 *	matchfinder currently stores positions relative to.  When the current
 *	position equals MATCHFINDER_WINDOW_SIZE, the window is slid and this
 *	pointer is advanced by MATCHFINDER_WINDOW_SIZE.
 * @in_next
 *	Pointer to the sequence being matched against.  The current position
 *	is computed as @in_next - *@in_base_p.
 * @best_len
 *	Require a match longer than this length.
 * @max_len
 *	The maximum permissible match length at this position.  If this is
 *	less than 5, the current position is not inserted into the hash tables,
 *	@next_hashes is left unchanged, and no search is performed.
 * @nice_len
 *	Stop searching if a match of at least this length is found.
 *	Must be <= @max_len.
 * @max_search_depth
 *	Limit on the number of potential matches to consider.  Must be >= 1.
 * @next_hashes
 *	The precomputed hash codes for the sequence beginning at @in_next:
 *	element 0 is the length 3 hash, element 1 is the length 4 hash.
 *	These will be used and then updated with the precomputed hashcodes for
 *	the sequence beginning at @in_next + 1.
 * @offset_ret
 *	The offset of the best match (@in_next minus the match pointer) is
 *	stored in this location.  It is always written; if no match was found
 *	by this call, 0 is stored.
 *
 * Return the length of the match found, or 'best_len' if no match longer than
 * 'best_len' was found.
 */
static forceinline u32
hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
			     const u8 ** const restrict in_base_p,
			     const u8 * const restrict in_next,
			     u32 best_len,
			     const u32 max_len,
			     const u32 nice_len,
			     const u32 max_search_depth,
			     u32 * const restrict next_hashes,
			     u32 * const restrict offset_ret)
{
	u32 depth_remaining = max_search_depth;
	/* Starts at 'in_next' so that the offset reported at 'out' is 0 when no
	 * match is found.  */
	const u8 *best_matchptr = in_next;
	mf_pos_t cur_node3, cur_node4;
	u32 hash3, hash4;
	u32 next_hashseq;
	u32 seq4;
	const u8 *matchptr;
	u32 len;
	u32 cur_pos = in_next - *in_base_p;
	const u8 *in_base;
	mf_pos_t cutoff;

	/* Reached the end of the current window: slide the stored positions and
	 * move the base pointer forward to match.  */
	if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
		hc_matchfinder_slide_window(mf);
		*in_base_p += MATCHFINDER_WINDOW_SIZE;
		cur_pos = 0;
	}

	in_base = *in_base_p;
	/* Any node value <= cutoff ends the search (see the checks below).  */
	cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;

	if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
		goto out;

	/* Get the precomputed hash codes.  */
	hash3 = next_hashes[0];
	hash4 = next_hashes[1];

	/* From the hash buckets, get the first node of each linked list.  */
	cur_node3 = mf->hash3_tab[hash3];
	cur_node4 = mf->hash4_tab[hash4];

	/* Update for length 3 matches.  This replaces the singleton node in the
	 * 'hash3' bucket with the node for the current sequence.  */
	mf->hash3_tab[hash3] = cur_pos;

	/* Update for length 4 matches.  This prepends the node for the current
	 * sequence to the linked list in the 'hash4' bucket.  */
	mf->hash4_tab[hash4] = cur_pos;
	mf->next_tab[cur_pos] = cur_node4;

	/* Compute the next hash codes.  */
	next_hashseq = get_unaligned_le32(in_next + 1);
	next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
	next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
	prefetchw(&mf->hash3_tab[next_hashes[0]]);
	prefetchw(&mf->hash4_tab[next_hashes[1]]);

	if (best_len < 4) {  /* No match of length >= 4 found yet?  */

		/* Check for a length 3 match if needed.  */

		/* Note: when the 'hash3' node is at or below the cutoff, the
		 * length 4 chain is not searched either.  */
		if (cur_node3 <= cutoff)
			goto out;

		seq4 = load_u32_unaligned(in_next);

		if (best_len < 3) {
			matchptr = &in_base[cur_node3];
			if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
				best_len = 3;
				best_matchptr = matchptr;
			}
		}

		/* Check for a length 4 match.  */

		if (cur_node4 <= cutoff)
			goto out;

		for (;;) {
			/* No length 4 match found yet.  Check the first 4 bytes.  */
			matchptr = &in_base[cur_node4];

			if (load_u32_unaligned(matchptr) == seq4)
				break;

			/* The first 4 bytes did not match.  Keep trying.  */
			/* The node is masked before indexing 'next_tab'
			 * (presumably MATCHFINDER_WINDOW_SIZE is a power of
			 * two; confirm in matchfinder_common.h).  */
			cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
			if (cur_node4 <= cutoff || !--depth_remaining)
				goto out;
		}

		/* Found a match of length >= 4.  Extend it to its full length.  */
		best_matchptr = matchptr;
		best_len = lz_extend(in_next, best_matchptr, 4, max_len);
		if (best_len >= nice_len)
			goto out;
		cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
		if (cur_node4 <= cutoff || !--depth_remaining)
			goto out;
	} else {
		/* The caller already has a match of length >= 4; only search if
		 * the chain is non-empty and the match is not yet "nice".  */
		if (cur_node4 <= cutoff || best_len >= nice_len)
			goto out;
	}

	/* Check for matches of length >= 5.  */

	for (;;) {
		for (;;) {
			matchptr = &in_base[cur_node4];

			/* Already found a length 4 match.  Try for a longer
			 * match; start by checking either the last 4 bytes and
			 * the first 4 bytes, or the last byte.  (The last byte,
			 * the one which would extend the match length by 1, is
			 * the most important.)  */
		#if UNALIGNED_ACCESS_IS_FAST
			if ((load_u32_unaligned(matchptr + best_len - 3) ==
			     load_u32_unaligned(in_next + best_len - 3)) &&
			    (load_u32_unaligned(matchptr) ==
			     load_u32_unaligned(in_next)))
		#else
			if (matchptr[best_len] == in_next[best_len])
		#endif
				break;

			/* Continue to the next node in the list.  */
			cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
			if (cur_node4 <= cutoff || !--depth_remaining)
				goto out;
		}

		/* In the fast-unaligned case the first 4 bytes were just
		 * verified equal, so extension starts at 4.  Otherwise only the
		 * byte at 'best_len' was compared, so extension starts at 0.  */
	#if UNALIGNED_ACCESS_IS_FAST
		len = 4;
	#else
		len = 0;
	#endif
		len = lz_extend(in_next, matchptr, len, max_len);
		if (len > best_len) {
			/* This is the new longest match.  */
			best_len = len;
			best_matchptr = matchptr;
			if (best_len >= nice_len)
				goto out;
		}

		/* Continue to the next node in the list.  */
		cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
		if (cur_node4 <= cutoff || !--depth_remaining)
			goto out;
	}
out:
	/* Always report the offset; it is 0 if 'best_matchptr' still equals
	 * 'in_next'.  */
	*offset_ret = in_next - best_matchptr;
	return best_len;
}
   341  
   342  /*
   343   * Advance the matchfinder, but don't search for matches.
   344   *
   345   * @mf
   346   *	The matchfinder structure.
   347   * @in_base_p
   348   *	Location of a pointer which points to the place in the input data the
   349   *	matchfinder currently stores positions relative to.  This may be updated
   350   *	by this function.
   351   * @cur_pos
   352   *	The current position in the input buffer relative to @in_base.
   353   * @end_pos
   354   *	The end position of the input buffer, relative to @in_base.
   355   * @next_hashes
   356   *	The precomputed hash codes for the sequence beginning at @in_next.
   357   *	These will be used and then updated with the precomputed hashcodes for
   358   *	the sequence beginning at @in_next + @count.
   359   * @count
   360   *	The number of bytes to advance.  Must be > 0.
   361   *
   362   * Returns @in_next + @count.
   363   */
   364  static forceinline const u8 *
   365  hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
   366  			      const u8 ** const restrict in_base_p,
   367  			      const u8 *in_next,
   368  			      const u8 * const in_end,
   369  			      const u32 count,
   370  			      u32 * const restrict next_hashes)
   371  {
   372  	u32 cur_pos;
   373  	u32 hash3, hash4;
   374  	u32 next_hashseq;
   375  	u32 remaining = count;
   376  
   377  	if (unlikely(count + 5 > in_end - in_next))
   378  		return &in_next[count];
   379  
   380  	cur_pos = in_next - *in_base_p;
   381  	hash3 = next_hashes[0];
   382  	hash4 = next_hashes[1];
   383  	do {
   384  		if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
   385  			hc_matchfinder_slide_window(mf);
   386  			*in_base_p += MATCHFINDER_WINDOW_SIZE;
   387  			cur_pos = 0;
   388  		}
   389  		mf->hash3_tab[hash3] = cur_pos;
   390  		mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
   391  		mf->hash4_tab[hash4] = cur_pos;
   392  
   393  		next_hashseq = get_unaligned_le32(++in_next);
   394  		hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
   395  		hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
   396  		cur_pos++;
   397  	} while (--remaining);
   398  
   399  	prefetchw(&mf->hash3_tab[hash3]);
   400  	prefetchw(&mf->hash4_tab[hash4]);
   401  	next_hashes[0] = hash3;
   402  	next_hashes[1] = hash4;
   403  
   404  	return in_next;
   405  }
   406  
   407  #endif  // GO_SRC_GITHUB_COM_GRAILBIO_BASE_COMPRESS_LIBDEFLATE_HC_MATCHFINDER_H_