github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/sloppy/sloppy.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2017 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package sloppy
    23  
    24  import (
    25  	"github.com/dolthub/dolt/go/store/d"
    26  )
    27  
    28  const (
    29  	maxOffsetPOT = uint16(12)
    30  	maxTableSize = 1 << 14
    31  	maxLength    = 1<<12 - 1
    32  	tableMask    = maxTableSize - 1
    33  	shift        = uint32(20)
    34  )
    35  
    36  // TODO: Make this configurable
    37  var maxOffset = int(1<<maxOffsetPOT - 1)
    38  
    39  // Sloppy is a logical variant of Snappy. Its purpose to provide a kind of
    40  // estimate of how a given byte sequence *will be* compressed by Snappy. It is
    41  // useful when a byte stream is fed into a rolling hash with the goal of
    42  // achieving a given average chunk byte length *after compression*. Sloppy is
    43  // logically similar to snappy, but prefers "copies" which are closer to the
    44  // repeated byte sequence (snappy prefers to refer to the *first* instance of a
    45  // repeated byte sequence). This is important for mitigating the likelihood that
    46  // altering any byte in an input stream will cause chunk boundaries to be
    47  // redrawn downstream.
    48  //
    49  // The high-level approach is to maintain a logical mapping between four-byte
    50  // sequences which have been observed in the stream so-far and the integer
    51  // offset of observed sequence (the mapping is done with a "cheap" hash-function
    52  // which permits false-positives because they can be trivial filtered out). In
    53  // the non-matched state, for each new byte consumed, a uint32 is computed from
    54  // the next 4 bytes and then a look-up is performed to check for a matching 4
    55  // bytes earlier in the stream. Snappy and sloppy behave roughly identical thus
    56  // far.
    57  //
    58  // When in the "matched state" (attempting to extend the current match), Snappy
    59  // does not re-index new 4-byte sequences, but Sloppy does. The reason for this
    60  // is that Sloppy would like match the most recent occurence as it moves
    61  // forward.
    62  //
    63  // Lastly, Sloppy adds two novel heuritics, both aimed at further mitigating
    64  // the chance of chunk boundaries being redrawn because of byte value changes:
    65  //
    66  // 1) During the first 2 bytes of match, it *continues* to look for closer
    67  // matches (effectively prefering a closer but shorter copy to a further but
    68  // longer one). The reason for this is that when sequences repeat frequently in
    69  // a byte stream, randomness provides for a good chance that a one or two byte
    70  // prefix on a repeated sequence will match "far away". E.g.
    71  //
    72  // "23hello my friend, 12hello my friend, 01hello my friend, 23hello my friend"
    73  //
    74  // In the above sequence, sloppy would prefer to copy the final
    75  // "hello my friend" 19 bytes backwards rather than "23hello my friend" quite a
    76  // bit further.
    77  //
    78  // 2) Sloppy will only emit copies which are "worth it". I.e. The longer the
    79  // reference back, the longer the length of the copy must be.
    80  type Sloppy struct {
    81  	enc                      encoder
    82  	idx                      int
    83  	matching                 bool
    84  	matchOffset, matchLength int
    85  	table                    [maxTableSize]uint32
    86  }
    87  
    88  // New returns a new sloppy encoder which will encode to |f|. If |f| ever
    89  // returns false, then encoding ends immediately. |f| is a callback because
    90  // the primary use is that the "encoded" byte stream is fed byte-by-byte
    91  // into a rolling hash function.
    92  func New(f func(b byte) bool) *Sloppy {
    93  	return &Sloppy{
    94  		binaryEncoder{f},
    95  		0,
    96  		false,
    97  		0, 0,
    98  		[maxTableSize]uint32{},
    99  	}
   100  }
   101  
   102  // Update continues the encoding of a given input stream. The caller is expected
   103  // to call update after having (ONLY) appended bytes to |src|. When |Update|
   104  // returns, sloppy will have emitted 0 or more literals or copies by calling
   105  // the |sf.f|. Note that sloppy will ALWAYS buffer the final three bytes of
   106  // input.
   107  func (sl *Sloppy) Update(src []byte) {
   108  	// Only consume up to the point that a "look-ahead" can include 4 bytes.
   109  	for ; sl.idx < len(src)-3; sl.idx++ {
   110  		nextHash := fbhash(load32(src, sl.idx))
   111  
   112  		if sl.matching && (sl.matchLength > maxLength || src[sl.idx] != src[sl.matchOffset+sl.matchLength]) {
   113  			// End Match
   114  			if sl.maybeCopy(src) {
   115  				return // terminate if consumer has "closed"
   116  			}
   117  		}
   118  
   119  		// Look for a match if we are beyond the first byte AND either there is no
   120  		// match yet, OR we are matching, but fewer than 3 bytes have been
   121  		// matched. The later condition allows for giving up to 2 bytes of a copy
   122  		// in order to reference a "closer" sequence. Empirical tests on
   123  		// structured data, suggests this reduces the average offset by ~2/3.
   124  		if sl.idx > 0 && (!sl.matching || sl.matchLength < 3) {
   125  			matchPos := int(sl.table[nextHash&tableMask])
   126  
   127  			if sl.idx > matchPos &&
   128  				src[sl.idx] == src[matchPos] && // filter false positives
   129  				sl.idx-matchPos <= maxOffset && // don't refer back beyond maxOffset
   130  				(!sl.matching || matchPos >= sl.matchOffset+4) { // if we are "rematching", ensure the new match is at least 4 bytes closer
   131  
   132  				if sl.matching {
   133  					// We are dropping an existing match for a closer one. Emit the
   134  					// matched bytes as literals
   135  					if sl.dontCopy(src, sl.idx-sl.matchLength, sl.idx) {
   136  						return // terminate if consumer has "closed"
   137  					}
   138  				}
   139  
   140  				// Begin a new match
   141  				sl.matching = true
   142  				sl.matchOffset = matchPos
   143  				sl.matchLength = 0
   144  			}
   145  		}
   146  
   147  		// Store new hashed offset
   148  		sl.table[nextHash&tableMask] = uint32(sl.idx)
   149  
   150  		if sl.matching {
   151  			sl.matchLength++
   152  		} else {
   153  			if sl.enc.emitLiteral(src[sl.idx]) {
   154  				return // terminate if consumer has "closed"
   155  			}
   156  		}
   157  	}
   158  }
   159  
   160  func (sl *Sloppy) Reset() {
   161  	sl.idx = 0
   162  	sl.matching = false
   163  	sl.matchOffset = 0
   164  	sl.matchLength = 0
   165  	sl.table = [maxTableSize]uint32{}
   166  }
   167  
   168  // len >= 2^(2 + log2(maxOffset) - log2(maxOffset-off)). IOW, for the first 1/2
   169  // of the maxOffset, a copy must be >= 4. For 1/2 of what remains, a copy must
   170  // be >= 8, etc...
   171  func copyLongEnough(off, len uint16) bool {
   172  	d.PanicIfTrue(off == 0)
   173  
   174  	p := uint16(0)
   175  	x := (1 << maxOffsetPOT) - off
   176  	for x > 0 {
   177  		x = x >> 1
   178  		p++
   179  	}
   180  
   181  	i := maxOffsetPOT - p
   182  	min := 4
   183  	for i > 0 {
   184  		min = min << 1
   185  		i--
   186  	}
   187  
   188  	return int(len) >= min
   189  }
   190  
   191  // Emit matches bytes as literals.
   192  func (sl *Sloppy) dontCopy(src []byte, from, to int) bool {
   193  	for ; from < to; from++ {
   194  		if sl.enc.emitLiteral(src[from]) {
   195  			return true
   196  		}
   197  	}
   198  	return false
   199  }
   200  
   201  // Emit a copy if the length is sufficient for a given offset
   202  func (sl *Sloppy) maybeCopy(src []byte) bool {
   203  	off, len := uint16(sl.idx-(sl.matchOffset+sl.matchLength)), uint16(sl.matchLength)
   204  	sl.matching = false
   205  	sl.matchOffset = 0
   206  	sl.matchLength = 0
   207  
   208  	if !copyLongEnough(off, len) {
   209  		return sl.dontCopy(src, sl.idx-int(len), sl.idx)
   210  	}
   211  
   212  	return sl.enc.emitCopy(off, len)
   213  }
   214  
   215  type encoder interface {
   216  	emitLiteral(b byte) bool
   217  	emitCopy(offset, length uint16) bool
   218  }
   219  
   220  type binaryEncoder struct {
   221  	f func(b byte) bool
   222  }
   223  
   224  func (be binaryEncoder) emitLiteral(b byte) bool {
   225  	return be.f(b)
   226  }
   227  
   228  func (be binaryEncoder) emitCopy(offset, length uint16) bool {
   229  	// all copies are encoded as 3 bytes.
   230  	// 12 bits for offset and 12 bits for length
   231  
   232  	// 8 MSBits of offset
   233  	if be.f(byte(offset >> 4)) {
   234  		return true
   235  	}
   236  
   237  	// 4 LSBits offset | 4 MSBits length
   238  	if be.f(byte(offset<<4) | byte(length>>4)) {
   239  		return true
   240  	}
   241  
   242  	// 8 LSBits of length
   243  	if be.f(byte(length)) {
   244  		return true
   245  	}
   246  
   247  	return false
   248  }
   249  
   250  func fbhash(u uint32) uint32 {
   251  	return (u * 0x1e35a7bd) >> shift
   252  }
   253  
   254  func load32(b []byte, i int) uint32 {
   255  	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
   256  	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
   257  }