github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/compressing/lz4c.go (about) 1 package compressing 2 3 import ( 4 "github.com/balzaczyy/golucene/core/util/packed" 5 ) 6 7 // codecs/compressing/LZ4.java#compress 8 9 const ( 10 MEMORY_USAGE = 14 11 ) 12 13 type DataOutput interface { 14 WriteByte(b byte) error 15 WriteBytes(buf []byte) error 16 WriteInt(i int32) error 17 WriteVInt(i int32) error 18 WriteString(string) error 19 } 20 21 func hash(i, hashBits int) int { 22 assert(hashBits >= 0 && hashBits <= 32) 23 return int(uint32(int32(i)*-1640531535) >> uint(32-hashBits)) 24 } 25 26 func readInt(buf []byte, i int) int { 27 return ((int(buf[i]) & 0xFF) << 24) | ((int(buf[i+1]) & 0xFF) << 16) | 28 ((int(buf[i+2]) & 0xFF) << 8) | (int(buf[i+3]) & 0xFF) 29 } 30 31 func commonBytes(b1, b2 []byte) int { 32 assert(len(b1) > len(b2)) 33 count, limit := 0, len(b2) 34 for count < limit && b1[count] == b2[count] { 35 count++ 36 } 37 return count 38 } 39 40 func encodeLen(l int, out DataOutput) error { 41 var err error 42 for l >= 0xFF && err == nil { 43 err = out.WriteByte(byte(0xFF)) 44 l -= 0xFF 45 } 46 if err == nil { 47 err = out.WriteByte(byte(l)) 48 } 49 return err 50 } 51 52 func encodeLiterals(bytes []byte, token byte, out DataOutput) error { 53 err := out.WriteByte(token) 54 if err != nil { 55 return err 56 } 57 58 // encode literal length 59 if len(bytes) >= 0x0F { 60 err = encodeLen(len(bytes)-0x0F, out) 61 if err != nil { 62 return err 63 } 64 } 65 66 // encode literals 67 return out.WriteBytes(bytes) 68 } 69 70 func encodeLastLiterals(bytes []byte, out DataOutput) error { 71 token := byte(min(len(bytes), 0x0F) << 4) 72 return encodeLiterals(bytes, token, out) 73 } 74 75 func encodeSequence(bytes []byte, matchDec, matchLen int, out DataOutput) error { 76 literalLen := len(bytes) 77 assert(matchLen >= 4) 78 // encode token 79 token := byte((min(literalLen, 0x0F) << 4) | min(matchLen-4, 0x0F)) 80 err := encodeLiterals(bytes, token, out) 81 if err != nil { 82 return err 83 } 84 85 // encode match dec 86 assert(matchDec > 0 && matchDec < 1<<16) 87 err = out.WriteByte(byte(matchDec)) 88 if err == nil { 89 err = out.WriteByte(byte(uint(matchDec) >> 8)) 90 } 91 if err != nil { 92 return err 93 } 94 95 // encode match len 96 if matchLen >= MIN_MATCH+0x0F { 97 return encodeLen(matchLen-0x0F-MIN_MATCH, out) 98 } 99 return nil 100 } 101 102 func min(a, b int) int { 103 if a < b { 104 return a 105 } 106 return b 107 } 108 109 type LZ4HashTable struct { 110 hashLog int 111 hashTable packed.Mutable 112 } 113 114 func (h *LZ4HashTable) reset(length int) { 115 bitsPerOffset := packed.BitsRequired(int64(length - LAST_LITERALS)) 116 bitsPerOffsetLog := ceilLog2(bitsPerOffset) 117 h.hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog 118 assert(h.hashLog > 0) 119 if h.hashTable == nil || h.hashTable.Size() < (1<<uint(h.hashLog)) || h.hashTable.BitsPerValue() < bitsPerOffset { 120 h.hashTable = packed.MutableFor(1<<uint(h.hashLog), bitsPerOffset, packed.PackedInts.DEFAULT) 121 } else { 122 h.hashTable.Clear() 123 } 124 } 125 126 // 32 - leadingZero(n-1) 127 func ceilLog2(n int) int { 128 assert(n >= 1) 129 if n == 1 { 130 return 0 131 } 132 n-- 133 ans := 0 134 for n > 0 { 135 n >>= 1 136 ans++ 137 } 138 return ans 139 } 140 141 /* 142 Compress bytes into out using at most 16KB of memory. ht shouldn't be 143 shared across threads but can safely be reused. 144 */ 145 func LZ4Compress(bytes []byte, out DataOutput, ht *LZ4HashTable) error { 146 offset, length := 0, len(bytes) 147 base, end := offset, offset+length 148 149 anchor := offset 150 offset++ 151 152 if length > LAST_LITERALS+MIN_MATCH { 153 limit := end - LAST_LITERALS 154 matchLimit := limit - MIN_MATCH 155 ht.reset(length) 156 hashLog := ht.hashLog 157 hashTable := ht.hashTable 158 159 for offset <= limit { 160 // find a match 161 var ref int 162 var hasMore = offset < matchLimit 163 for hasMore { 164 v := readInt(bytes, offset) 165 h := hash(v, hashLog) 166 ref = base + int(hashTable.Get(h)) 167 assert(packed.BitsRequired(int64(offset-base)) <= hashTable.BitsPerValue()) 168 hashTable.Set(h, int64(offset-base)) 169 if offset-ref < MAX_DISTANCE && readInt(bytes, ref) == v { 170 break 171 } 172 offset++ 173 hasMore = offset < matchLimit 174 } 175 if !hasMore { 176 break 177 } 178 179 // compute match length 180 matchLen := MIN_MATCH + commonBytes( 181 bytes[ref+MIN_MATCH:limit], 182 bytes[offset+MIN_MATCH:limit]) 183 184 err := encodeSequence(bytes[anchor:offset], offset-ref, matchLen, out) 185 if err != nil { 186 return err 187 } 188 offset += matchLen 189 anchor = offset 190 } 191 } 192 193 // last literals 194 literalLen := end - anchor 195 assert(literalLen >= LAST_LITERALS || literalLen == length) 196 return encodeLastLiterals(bytes[anchor:], out) 197 }