github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/codec/codec.go (about) 1 /* 2 * Copyright 2018 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package codec 18 19 import ( 20 "bytes" 21 "math" 22 "sort" 23 24 "github.com/dgraph-io/dgraph/protos/pb" 25 "github.com/dgryski/go-groupvarint" 26 ) 27 28 type seekPos int 29 30 const ( 31 // SeekStart is used with Seek() to search relative to the Uid, returning it in the results. 32 SeekStart seekPos = iota 33 // SeekCurrent to Seek() a Uid using it as offset, not as part of the results. 34 SeekCurrent 35 ) 36 37 var ( 38 bitMask uint64 = 0xffffffff00000000 39 ) 40 41 // Encoder is used to convert a list of UIDs into a pb.UidPack object. 42 type Encoder struct { 43 BlockSize int 44 pack *pb.UidPack 45 uids []uint64 46 } 47 48 func (e *Encoder) packBlock() { 49 if len(e.uids) == 0 { 50 return 51 } 52 block := &pb.UidBlock{Base: e.uids[0], NumUids: uint32(len(e.uids))} 53 last := e.uids[0] 54 e.uids = e.uids[1:] 55 56 var out bytes.Buffer 57 buf := make([]byte, 17) 58 tmpUids := make([]uint32, 4) 59 for { 60 for i := 0; i < 4; i++ { 61 if i >= len(e.uids) { 62 // Padding with '0' because Encode4 encodes only in batch of 4. 63 tmpUids[i] = 0 64 } else { 65 tmpUids[i] = uint32(e.uids[i] - last) 66 last = e.uids[i] 67 } 68 } 69 70 data := groupvarint.Encode4(buf, tmpUids) 71 out.Write(data) 72 73 // e.uids has ended and we have padded tmpUids with 0s 74 if len(e.uids) <= 4 { 75 e.uids = e.uids[:0] 76 break 77 } 78 e.uids = e.uids[4:] 79 } 80 81 block.Deltas = out.Bytes() 82 e.pack.Blocks = append(e.pack.Blocks, block) 83 } 84 85 // Add takes an uid and adds it to the list of UIDs to be encoded. 86 func (e *Encoder) Add(uid uint64) { 87 if e.pack == nil { 88 e.pack = &pb.UidPack{BlockSize: uint32(e.BlockSize)} 89 } 90 91 size := len(e.uids) 92 if size > 0 && !match32MSB(e.uids[size-1], uid) { 93 e.packBlock() 94 e.uids = e.uids[:0] 95 } 96 97 e.uids = append(e.uids, uid) 98 if len(e.uids) >= e.BlockSize { 99 e.packBlock() 100 e.uids = e.uids[:0] 101 } 102 } 103 104 // Done returns the final output of the encoder. 105 func (e *Encoder) Done() *pb.UidPack { 106 e.packBlock() 107 return e.pack 108 } 109 110 // Decoder is used to read a pb.UidPack object back into a list of UIDs. 111 type Decoder struct { 112 Pack *pb.UidPack 113 blockIdx int 114 uids []uint64 115 } 116 117 func (d *Decoder) unpackBlock() []uint64 { 118 if len(d.uids) > 0 { 119 // We were previously preallocating the d.uids slice to block size. This caused slowdown 120 // because many blocks are small and only contain a few ints, causing wastage while still 121 // paying cost of allocation. 122 d.uids = d.uids[:0] 123 } 124 125 if d.blockIdx >= len(d.Pack.Blocks) { 126 return d.uids 127 } 128 block := d.Pack.Blocks[d.blockIdx] 129 130 last := block.Base 131 d.uids = append(d.uids, last) 132 133 tmpUids := make([]uint32, 4) 134 var sum uint64 135 encData := block.Deltas 136 137 for uint32(len(d.uids)) < block.NumUids { 138 if len(encData) < 17 { 139 // Decode4 decodes 4 uids from encData. It moves slice(encData) forward while 140 // decoding and expects it to be of length >= 4 at all the stages. Padding 141 // with zero to make sure lenght is always >= 4. 142 encData = append(encData, 0, 0, 0) 143 } 144 145 groupvarint.Decode4(tmpUids, encData) 146 encData = encData[groupvarint.BytesUsed[encData[0]]:] 147 for i := 0; i < 4; i++ { 148 sum = last + uint64(tmpUids[i]) 149 d.uids = append(d.uids, sum) 150 last = sum 151 } 152 } 153 154 d.uids = d.uids[:block.NumUids] 155 return d.uids 156 } 157 158 // ApproxLen returns the approximate number of UIDs in the pb.UidPack object. 159 func (d *Decoder) ApproxLen() int { 160 return int(d.Pack.BlockSize) * (len(d.Pack.Blocks) - d.blockIdx) 161 } 162 163 type searchFunc func(int) bool 164 165 // Seek will search for uid in a packed block using the specified whence position. 166 // The value of whence must be one of the predefined values SeekStart or SeekCurrent. 167 // SeekStart searches uid and includes it as part of the results. 168 // SeekCurrent searches uid but only as offset, it won't be included with results. 169 // 170 // Returns a slice of all uids whence the position, or an empty slice if none found. 171 func (d *Decoder) Seek(uid uint64, whence seekPos) []uint64 { 172 if d.Pack == nil { 173 return []uint64{} 174 } 175 d.blockIdx = 0 176 if uid == 0 { 177 return d.unpackBlock() 178 } 179 180 pack := d.Pack 181 blocksFunc := func() searchFunc { 182 var f searchFunc 183 switch whence { 184 case SeekStart: 185 f = func(i int) bool { return pack.Blocks[i].Base >= uid } 186 case SeekCurrent: 187 f = func(i int) bool { return pack.Blocks[i].Base > uid } 188 } 189 return f 190 } 191 192 idx := sort.Search(len(pack.Blocks), blocksFunc()) 193 // The first block.Base >= uid. 194 if idx == 0 { 195 return d.unpackBlock() 196 } 197 // The uid is the first entry in the block. 198 if idx < len(pack.Blocks) && pack.Blocks[idx].Base == uid { 199 d.blockIdx = idx 200 return d.unpackBlock() 201 } 202 203 // Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or, 204 // we found the first block index whose base is greater than uid. In these cases, go to the 205 // previous block and search there. 206 d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it. 207 d.unpackBlock() // And get all their uids. 208 209 uidsFunc := func() searchFunc { 210 var f searchFunc 211 switch whence { 212 case SeekStart: 213 f = func(i int) bool { return d.uids[i] >= uid } 214 case SeekCurrent: 215 f = func(i int) bool { return d.uids[i] > uid } 216 } 217 return f 218 } 219 220 // uidx points to the first uid in the uid list, which is >= uid. 221 uidx := sort.Search(len(d.uids), uidsFunc()) 222 if uidx < len(d.uids) { // Found an entry in uids, which >= uid. 223 d.uids = d.uids[uidx:] 224 return d.uids 225 } 226 // Could not find any uid in the block, which is >= uid. The next block might still have valid 227 // entries > uid. 228 return d.Next() 229 } 230 231 // Uids returns all the uids in the pb.UidPack object as an array of integers. 232 // uids are owned by the Decoder, and the slice contents would be changed on the next call. They 233 // should be copied if passed around. 234 func (d *Decoder) Uids() []uint64 { 235 return d.uids 236 } 237 238 // LinearSeek returns uids of the last block whose base is less than seek. 239 // If there are no such blocks i.e. seek < base of first block, it returns uids of first 240 // block. LinearSeek is used to get closest uids which are >= seek. 241 func (d *Decoder) LinearSeek(seek uint64) []uint64 { 242 for { 243 v := d.PeekNextBase() 244 if seek < v { 245 break 246 } 247 d.blockIdx++ 248 } 249 250 return d.unpackBlock() 251 } 252 253 // PeekNextBase returns the base of the next block without advancing the decoder. 254 func (d *Decoder) PeekNextBase() uint64 { 255 bidx := d.blockIdx + 1 256 if bidx < len(d.Pack.Blocks) { 257 return d.Pack.Blocks[bidx].Base 258 } 259 return math.MaxUint64 260 } 261 262 // Valid returns true if the decoder has not reached the end of the packed data. 263 func (d *Decoder) Valid() bool { 264 return d.blockIdx < len(d.Pack.Blocks) 265 } 266 267 // Next moves the decoder on to the next block. 268 func (d *Decoder) Next() []uint64 { 269 d.blockIdx++ 270 return d.unpackBlock() 271 } 272 273 // Encode takes in a list of uids and a block size. It would pack these uids into blocks of the 274 // given size, with the last block having fewer uids. Within each block, it stores the first uid as 275 // base. For each next uid, a delta = uids[i] - uids[i-1] is stored. Protobuf uses Varint encoding, 276 // as mentioned here: https://developers.google.com/protocol-buffers/docs/encoding . This ensures 277 // that the deltas being considerably smaller than the original uids are nicely packed in fewer 278 // bytes. Our benchmarks on artificial data show compressed size to be 13% of the original. This 279 // mechanism is a LOT simpler to understand and if needed, debug. 280 func Encode(uids []uint64, blockSize int) *pb.UidPack { 281 enc := Encoder{BlockSize: blockSize} 282 for _, uid := range uids { 283 enc.Add(uid) 284 } 285 return enc.Done() 286 } 287 288 // ApproxLen would indicate the total number of UIDs in the pack. Can be used for int slice 289 // allocations. 290 func ApproxLen(pack *pb.UidPack) int { 291 if pack == nil { 292 return 0 293 } 294 return len(pack.Blocks) * int(pack.BlockSize) 295 } 296 297 // ExactLen would calculate the total number of UIDs. Instead of using a UidPack, it accepts blocks, 298 // so we can calculate the number of uids after a seek. 299 func ExactLen(pack *pb.UidPack) int { 300 if pack == nil { 301 return 0 302 } 303 sz := len(pack.Blocks) 304 if sz == 0 { 305 return 0 306 } 307 num := 0 308 for _, b := range pack.Blocks { 309 num += int(b.NumUids) // NumUids includes the base UID. 310 } 311 return num 312 } 313 314 // Decode decodes the UidPack back into the list of uids. This is a stop-gap function, Decode would 315 // need to do more specific things than just return the list back. 316 func Decode(pack *pb.UidPack, seek uint64) []uint64 { 317 uids := make([]uint64, 0, ApproxLen(pack)) 318 dec := Decoder{Pack: pack} 319 320 for block := dec.Seek(seek, SeekStart); len(block) > 0; block = dec.Next() { 321 uids = append(uids, block...) 322 } 323 return uids 324 } 325 326 func match32MSB(num1, num2 uint64) bool { 327 return (num1 & bitMask) == (num2 & bitMask) 328 }