github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/lz4c.go (about)

     1  package compressing
     2  
     3  import (
     4  	"github.com/balzaczyy/golucene/core/util/packed"
     5  )
     6  
     7  // codecs/compressing/LZ4.java#compress
     8  
const (
	// MEMORY_USAGE is the base-2 logarithm of the hash table memory budget
	// in bytes. LZ4HashTable.reset allocates 2^(MEMORY_USAGE+3-k) slots,
	// where 2^k is the slot width in bits rounded up to a power of two, so
	// the table stays within about 2^MEMORY_USAGE bytes (16KB for 14).
	MEMORY_USAGE = 14
)
    12  
// DataOutput is the sink the LZ4 encoder writes compressed data to. Every
// method reports a failed write through its error result.
//
// The encoder functions visible in this file only call WriteByte and
// WriteBytes; the remaining methods are presumably used by other writers in
// the package (NOTE(review): confirm against callers).
type DataOutput interface {
	// WriteByte appends a single byte.
	WriteByte(b byte) error
	// WriteBytes appends all bytes of buf.
	WriteBytes(buf []byte) error
	// WriteInt appends a 32-bit integer.
	WriteInt(i int32) error
	// WriteVInt appends a 32-bit integer, presumably in a variable-length
	// encoding (NOTE(review): confirm against the implementation).
	WriteVInt(i int32) error
	// WriteString appends a string.
	WriteString(string) error
}
    20  
    21  func hash(i, hashBits int) int {
    22  	assert(hashBits >= 0 && hashBits <= 32)
    23  	return int(uint32(int32(i)*-1640531535) >> uint(32-hashBits))
    24  }
    25  
    26  func readInt(buf []byte, i int) int {
    27  	return ((int(buf[i]) & 0xFF) << 24) | ((int(buf[i+1]) & 0xFF) << 16) |
    28  		((int(buf[i+2]) & 0xFF) << 8) | (int(buf[i+3]) & 0xFF)
    29  }
    30  
    31  func commonBytes(b1, b2 []byte) int {
    32  	assert(len(b1) > len(b2))
    33  	count, limit := 0, len(b2)
    34  	for count < limit && b1[count] == b2[count] {
    35  		count++
    36  	}
    37  	return count
    38  }
    39  
    40  func encodeLen(l int, out DataOutput) error {
    41  	var err error
    42  	for l >= 0xFF && err == nil {
    43  		err = out.WriteByte(byte(0xFF))
    44  		l -= 0xFF
    45  	}
    46  	if err == nil {
    47  		err = out.WriteByte(byte(l))
    48  	}
    49  	return err
    50  }
    51  
    52  func encodeLiterals(bytes []byte, token byte, out DataOutput) error {
    53  	err := out.WriteByte(token)
    54  	if err != nil {
    55  		return err
    56  	}
    57  
    58  	// encode literal length
    59  	if len(bytes) >= 0x0F {
    60  		err = encodeLen(len(bytes)-0x0F, out)
    61  		if err != nil {
    62  			return err
    63  		}
    64  	}
    65  
    66  	// encode literals
    67  	return out.WriteBytes(bytes)
    68  }
    69  
    70  func encodeLastLiterals(bytes []byte, out DataOutput) error {
    71  	token := byte(min(len(bytes), 0x0F) << 4)
    72  	return encodeLiterals(bytes, token, out)
    73  }
    74  
    75  func encodeSequence(bytes []byte, matchDec, matchLen int, out DataOutput) error {
    76  	literalLen := len(bytes)
    77  	assert(matchLen >= 4)
    78  	// encode token
    79  	token := byte((min(literalLen, 0x0F) << 4) | min(matchLen-4, 0x0F))
    80  	err := encodeLiterals(bytes, token, out)
    81  	if err != nil {
    82  		return err
    83  	}
    84  
    85  	// encode match dec
    86  	assert(matchDec > 0 && matchDec < 1<<16)
    87  	err = out.WriteByte(byte(matchDec))
    88  	if err == nil {
    89  		err = out.WriteByte(byte(uint(matchDec) >> 8))
    90  	}
    91  	if err != nil {
    92  		return err
    93  	}
    94  
    95  	// encode match len
    96  	if matchLen >= MIN_MATCH+0x0F {
    97  		return encodeLen(matchLen-0x0F-MIN_MATCH, out)
    98  	}
    99  	return nil
   100  }
   101  
   102  func min(a, b int) int {
   103  	if a < b {
   104  		return a
   105  	}
   106  	return b
   107  }
   108  
// LZ4HashTable is the scratch table LZ4Compress uses to find earlier
// occurrences of a 4-byte sequence. It is (re)initialised by reset before
// each compression run.
type LZ4HashTable struct {
	// hashLog is the number of hash bits; the table holds at least
	// 1<<hashLog slots.
	hashLog int
	// hashTable maps a hash value to the most recently recorded input
	// offset that produced it.
	hashTable packed.Mutable
}
   113  
   114  func (h *LZ4HashTable) reset(length int) {
   115  	bitsPerOffset := packed.BitsRequired(int64(length - LAST_LITERALS))
   116  	bitsPerOffsetLog := ceilLog2(bitsPerOffset)
   117  	h.hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog
   118  	assert(h.hashLog > 0)
   119  	if h.hashTable == nil || h.hashTable.Size() < (1<<uint(h.hashLog)) || h.hashTable.BitsPerValue() < bitsPerOffset {
   120  		h.hashTable = packed.MutableFor(1<<uint(h.hashLog), bitsPerOffset, packed.PackedInts.DEFAULT)
   121  	} else {
   122  		h.hashTable.Clear()
   123  	}
   124  }
   125  
   126  // 32 - leadingZero(n-1)
   127  func ceilLog2(n int) int {
   128  	assert(n >= 1)
   129  	if n == 1 {
   130  		return 0
   131  	}
   132  	n--
   133  	ans := 0
   134  	for n > 0 {
   135  		n >>= 1
   136  		ans++
   137  	}
   138  	return ans
   139  }
   140  
   141  /*
   142  Compress bytes into out using at most 16KB of memory. ht shouldn't be
   143  shared across threads but can safely be reused.
   144  */
   145  func LZ4Compress(bytes []byte, out DataOutput, ht *LZ4HashTable) error {
   146  	offset, length := 0, len(bytes)
   147  	base, end := offset, offset+length
   148  
   149  	anchor := offset
   150  	offset++
   151  
   152  	if length > LAST_LITERALS+MIN_MATCH {
   153  		limit := end - LAST_LITERALS
   154  		matchLimit := limit - MIN_MATCH
   155  		ht.reset(length)
   156  		hashLog := ht.hashLog
   157  		hashTable := ht.hashTable
   158  
   159  		for offset <= limit {
   160  			// find a match
   161  			var ref int
   162  			var hasMore = offset < matchLimit
   163  			for hasMore {
   164  				v := readInt(bytes, offset)
   165  				h := hash(v, hashLog)
   166  				ref = base + int(hashTable.Get(h))
   167  				assert(packed.BitsRequired(int64(offset-base)) <= hashTable.BitsPerValue())
   168  				hashTable.Set(h, int64(offset-base))
   169  				if offset-ref < MAX_DISTANCE && readInt(bytes, ref) == v {
   170  					break
   171  				}
   172  				offset++
   173  				hasMore = offset < matchLimit
   174  			}
   175  			if !hasMore {
   176  				break
   177  			}
   178  
   179  			// compute match length
   180  			matchLen := MIN_MATCH + commonBytes(
   181  				bytes[ref+MIN_MATCH:limit],
   182  				bytes[offset+MIN_MATCH:limit])
   183  
   184  			err := encodeSequence(bytes[anchor:offset], offset-ref, matchLen, out)
   185  			if err != nil {
   186  				return err
   187  			}
   188  			offset += matchLen
   189  			anchor = offset
   190  		}
   191  	}
   192  
   193  	// last literals
   194  	literalLen := end - anchor
   195  	assert(literalLen >= LAST_LITERALS || literalLen == length)
   196  	return encodeLastLiterals(bytes[anchor:], out)
   197  }