github.com/ndau/noms@v1.0.5/go/sloppy/sloppy.go (about)

     1  // Copyright 2017 Attic Labs, Inc. All rights reserved.
     2  // Licensed under the Apache License, version 2.0:
     3  // http://www.apache.org/licenses/LICENSE-2.0
     4  
     5  package sloppy
     6  
     7  import (
     8  	"github.com/ndau/noms/go/d"
     9  )
    10  
    11  const (
    12  	maxOffsetPOT = uint16(12)
    13  	maxTableSize = 1 << 14
    14  	maxLength    = 1<<12 - 1
    15  	tableMask    = maxTableSize - 1
    16  	shift        = uint32(20)
    17  )
    18  
    19  // TODO: Make this configurable
    20  var maxOffset = int(1<<maxOffsetPOT - 1)
    21  
    22  // Sloppy is a logical variant of Snappy. Its purpose to provide a kind of
    23  // estimate of how a given byte sequence *will be* compressed by Snappy. It is
    24  // useful when a byte stream is fed into a rolling hash with the goal of
    25  // achieving a given average chunk byte length *after compression*. Sloppy is
    26  // logically similar to snappy, but prefers "copies" which are closer to the
    27  // repeated byte sequence (snappy prefers to refer to the *first* instance of a
    28  // repeated byte sequence). This is important for mitigating the likelihood that
    29  // altering any byte in an input stream will cause chunk boundaries to be
    30  // redrawn downstream.
    31  //
    32  // The high-level approach is to maintain a logical mapping between four-byte
    33  // sequences which have been observed in the stream so-far and the integer
    34  // offset of observed sequence (the mapping is done with a "cheap" hash-function
    35  // which permits false-positives because they can be trivial filtered out). In
    36  // the non-matched state, for each new byte consumed, a uint32 is computed from
    37  // the next 4 bytes and then a look-up is performed to check for a matching 4
    38  // bytes earlier in the stream. Snappy and sloppy behave roughly identical thus
    39  // far.
    40  //
    41  // When in the "matched state" (attempting to extend the current match), Snappy
    42  // does not re-index new 4-byte sequences, but Sloppy does. The reason for this
    43  // is that Sloppy would like match the most recent occurence as it moves
    44  // forward.
    45  //
    46  // Lastly, Sloppy adds two novel heuritics, both aimed at further mitigating
    47  // the chance of chunk boundaries being redrawn because of byte value changes:
    48  //
    49  // 1) During the first 2 bytes of match, it *continues* to look for closer
    50  // matches (effectively prefering a closer but shorter copy to a further but
    51  // longer one). The reason for this is that when sequences repeat frequently in
    52  // a byte stream, randomness provides for a good chance that a one or two byte
    53  // prefix on a repeated sequence will match "far away". E.g.
    54  //
    55  // "23hello my friend, 12hello my friend, 01hello my friend, 23hello my friend"
    56  //
    57  // In the above sequence, sloppy would prefer to copy the final
    58  // "hello my friend" 19 bytes backwards rather than "23hello my friend" quite a
    59  // bit further.
    60  //
    61  // 2) Sloppy will only emit copies which are "worth it". I.e. The longer the
    62  // reference back, the longer the length of the copy must be.
    63  type Sloppy struct {
    64  	enc                      encoder
    65  	idx                      int
    66  	matching                 bool
    67  	matchOffset, matchLength int
    68  	table                    [maxTableSize]uint32
    69  }
    70  
    71  // New returns a new sloppy encoder which will encode to |f|. If |f| ever
    72  // returns false, then encoding ends immediately. |f| is a callback because
    73  // the primary use is that the "encoded" byte stream is fed byte-by-byte
    74  // into a rolling hash function.
    75  func New(f func(b byte) bool) *Sloppy {
    76  	return &Sloppy{
    77  		binaryEncoder{f},
    78  		0,
    79  		false,
    80  		0, 0,
    81  		[maxTableSize]uint32{},
    82  	}
    83  }
    84  
    85  // Update continues the encoding of a given input stream. The caller is expected
    86  // to call update after having (ONLY) appended bytes to |src|. When |Update|
    87  // returns, sloppy will have emitted 0 or more literals or copies by calling
    88  // the |sf.f|. Note that sloppy will ALWAYS buffer the final three bytes of
    89  // input.
    90  func (sl *Sloppy) Update(src []byte) {
    91  	// Only consume up to the point that a "look-ahead" can include 4 bytes.
    92  	for ; sl.idx < len(src)-3; sl.idx++ {
    93  		nextHash := fbhash(load32(src, sl.idx))
    94  
    95  		if sl.matching && (sl.matchLength > maxLength || src[sl.idx] != src[sl.matchOffset+sl.matchLength]) {
    96  			// End Match
    97  			if sl.maybeCopy(src) {
    98  				return // terminate if consumer has "closed"
    99  			}
   100  		}
   101  
   102  		// Look for a match if we are beyond the first byte AND either there is no
   103  		// match yet, OR we are matching, but fewer than 3 bytes have been
   104  		// matched. The later condition allows for giving up to 2 bytes of a copy
   105  		// in order to reference a "closer" sequence. Empirical tests on
   106  		// structured data, suggests this reduces the average offset by ~2/3.
   107  		if sl.idx > 0 && (!sl.matching || sl.matchLength < 3) {
   108  			matchPos := int(sl.table[nextHash&tableMask])
   109  
   110  			if sl.idx > matchPos &&
   111  				src[sl.idx] == src[matchPos] && // filter false positives
   112  				sl.idx-matchPos <= maxOffset && // don't refer back beyond maxOffset
   113  				(!sl.matching || matchPos >= sl.matchOffset+4) { // if we are "rematching", ensure the new match is at least 4 bytes closer
   114  
   115  				if sl.matching {
   116  					// We are dropping an existing match for a closer one. Emit the
   117  					// matched bytes as literals
   118  					if sl.dontCopy(src, sl.idx-sl.matchLength, sl.idx) {
   119  						return // terminate if consumer has "closed"
   120  					}
   121  				}
   122  
   123  				// Begin a new match
   124  				sl.matching = true
   125  				sl.matchOffset = matchPos
   126  				sl.matchLength = 0
   127  			}
   128  		}
   129  
   130  		// Store new hashed offset
   131  		sl.table[nextHash&tableMask] = uint32(sl.idx)
   132  
   133  		if sl.matching {
   134  			sl.matchLength++
   135  		} else {
   136  			if sl.enc.emitLiteral(src[sl.idx]) {
   137  				return // terminate if consumer has "closed"
   138  			}
   139  		}
   140  	}
   141  }
   142  
   143  func (sl *Sloppy) Reset() {
   144  	sl.idx = 0
   145  	sl.matching = false
   146  	sl.matchOffset = 0
   147  	sl.matchLength = 0
   148  	sl.table = [maxTableSize]uint32{}
   149  }
   150  
   151  // len >= 2^(2 + log2(maxOffset) - log2(maxOffset-off)). IOW, for the first 1/2
   152  // of the maxOffset, a copy must be >= 4. For 1/2 of what remains, a copy must
   153  // be >= 8, etc...
   154  func copyLongEnough(off, len uint16) bool {
   155  	d.PanicIfTrue(off == 0)
   156  
   157  	p := uint16(0)
   158  	x := (1 << maxOffsetPOT) - off
   159  	for x > 0 {
   160  		x = x >> 1
   161  		p++
   162  	}
   163  
   164  	i := maxOffsetPOT - p
   165  	min := 4
   166  	for i > 0 {
   167  		min = min << 1
   168  		i--
   169  	}
   170  
   171  	return int(len) >= min
   172  }
   173  
   174  // Emit matches bytes as literals.
   175  func (sl *Sloppy) dontCopy(src []byte, from, to int) bool {
   176  	for ; from < to; from++ {
   177  		if sl.enc.emitLiteral(src[from]) {
   178  			return true
   179  		}
   180  	}
   181  	return false
   182  }
   183  
   184  // Emit a copy if the length is sufficient for a given offset
   185  func (sl *Sloppy) maybeCopy(src []byte) bool {
   186  	off, len := uint16(sl.idx-(sl.matchOffset+sl.matchLength)), uint16(sl.matchLength)
   187  	sl.matching = false
   188  	sl.matchOffset = 0
   189  	sl.matchLength = 0
   190  
   191  	if !copyLongEnough(off, len) {
   192  		return sl.dontCopy(src, sl.idx-int(len), sl.idx)
   193  	}
   194  
   195  	return sl.enc.emitCopy(off, len)
   196  }
   197  
   198  type encoder interface {
   199  	emitLiteral(b byte) bool
   200  	emitCopy(offset, length uint16) bool
   201  }
   202  
   203  type binaryEncoder struct {
   204  	f func(b byte) bool
   205  }
   206  
   207  func (be binaryEncoder) emitLiteral(b byte) bool {
   208  	return be.f(b)
   209  }
   210  
   211  func (be binaryEncoder) emitCopy(offset, length uint16) bool {
   212  	// all copies are encoded as 3 bytes.
   213  	// 12 bits for offset and 12 bits for length
   214  
   215  	// 8 MSBits of offset
   216  	if be.f(byte(offset >> 4)) {
   217  		return true
   218  	}
   219  
   220  	// 4 LSBits offset | 4 MSBits length
   221  	if be.f(byte(offset<<4) | byte(length>>4)) {
   222  		return true
   223  	}
   224  
   225  	// 8 LSBits of length
   226  	if be.f(byte(length)) {
   227  		return true
   228  	}
   229  
   230  	return false
   231  }
   232  
   233  func fbhash(u uint32) uint32 {
   234  	return (u * 0x1e35a7bd) >> shift
   235  }
   236  
   237  func load32(b []byte, i int) uint32 {
   238  	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
   239  	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
   240  }