github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/codec/codec.go (about)

     1  /*
     2   * Copyright 2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package codec
    18  
    19  import (
    20  	"bytes"
    21  	"math"
    22  	"sort"
    23  
    24  	"github.com/dgraph-io/dgraph/protos/pb"
    25  	"github.com/dgryski/go-groupvarint"
    26  )
    27  
    28  type seekPos int
    29  
    30  const (
    31  	// SeekStart is used with Seek() to search relative to the Uid, returning it in the results.
    32  	SeekStart seekPos = iota
    33  	// SeekCurrent to Seek() a Uid using it as offset, not as part of the results.
    34  	SeekCurrent
    35  )
    36  
    37  var (
    38  	bitMask uint64 = 0xffffffff00000000
    39  )
    40  
// Encoder is used to convert a list of UIDs into a pb.UidPack object. UIDs are fed in one at a
// time through Add, and the finished pack is obtained from Done.
//
// BlockSize is the number of buffered UIDs at which Add closes the current block; it is also
// recorded as the BlockSize of the produced pack. pack is the output under construction and is
// created lazily by the first call to Add. uids buffers the UIDs that have been added but not yet
// packed into a block.
type Encoder struct {
	BlockSize int
	pack      *pb.UidPack
	uids      []uint64
}
    47  
    48  func (e *Encoder) packBlock() {
    49  	if len(e.uids) == 0 {
    50  		return
    51  	}
    52  	block := &pb.UidBlock{Base: e.uids[0], NumUids: uint32(len(e.uids))}
    53  	last := e.uids[0]
    54  	e.uids = e.uids[1:]
    55  
    56  	var out bytes.Buffer
    57  	buf := make([]byte, 17)
    58  	tmpUids := make([]uint32, 4)
    59  	for {
    60  		for i := 0; i < 4; i++ {
    61  			if i >= len(e.uids) {
    62  				// Padding with '0' because Encode4 encodes only in batch of 4.
    63  				tmpUids[i] = 0
    64  			} else {
    65  				tmpUids[i] = uint32(e.uids[i] - last)
    66  				last = e.uids[i]
    67  			}
    68  		}
    69  
    70  		data := groupvarint.Encode4(buf, tmpUids)
    71  		out.Write(data)
    72  
    73  		// e.uids has ended and we have padded tmpUids with 0s
    74  		if len(e.uids) <= 4 {
    75  			e.uids = e.uids[:0]
    76  			break
    77  		}
    78  		e.uids = e.uids[4:]
    79  	}
    80  
    81  	block.Deltas = out.Bytes()
    82  	e.pack.Blocks = append(e.pack.Blocks, block)
    83  }
    84  
    85  // Add takes an uid and adds it to the list of UIDs to be encoded.
    86  func (e *Encoder) Add(uid uint64) {
    87  	if e.pack == nil {
    88  		e.pack = &pb.UidPack{BlockSize: uint32(e.BlockSize)}
    89  	}
    90  
    91  	size := len(e.uids)
    92  	if size > 0 && !match32MSB(e.uids[size-1], uid) {
    93  		e.packBlock()
    94  		e.uids = e.uids[:0]
    95  	}
    96  
    97  	e.uids = append(e.uids, uid)
    98  	if len(e.uids) >= e.BlockSize {
    99  		e.packBlock()
   100  		e.uids = e.uids[:0]
   101  	}
   102  }
   103  
   104  // Done returns the final output of the encoder.
   105  func (e *Encoder) Done() *pb.UidPack {
   106  	e.packBlock()
   107  	return e.pack
   108  }
   109  
// Decoder is used to read a pb.UidPack object back into a list of UIDs, one block at a time.
//
// Pack is the packed input. blockIdx is the index of the block the decoder is currently
// positioned on. uids holds the UIDs produced by the most recent unpack and is reused between
// calls.
type Decoder struct {
	Pack     *pb.UidPack
	blockIdx int
	uids     []uint64
}
   116  
   117  func (d *Decoder) unpackBlock() []uint64 {
   118  	if len(d.uids) > 0 {
   119  		// We were previously preallocating the d.uids slice to block size. This caused slowdown
   120  		// because many blocks are small and only contain a few ints, causing wastage while still
   121  		// paying cost of allocation.
   122  		d.uids = d.uids[:0]
   123  	}
   124  
   125  	if d.blockIdx >= len(d.Pack.Blocks) {
   126  		return d.uids
   127  	}
   128  	block := d.Pack.Blocks[d.blockIdx]
   129  
   130  	last := block.Base
   131  	d.uids = append(d.uids, last)
   132  
   133  	tmpUids := make([]uint32, 4)
   134  	var sum uint64
   135  	encData := block.Deltas
   136  
   137  	for uint32(len(d.uids)) < block.NumUids {
   138  		if len(encData) < 17 {
   139  			// Decode4 decodes 4 uids from encData. It moves slice(encData) forward while
   140  			// decoding and expects it to be of length >= 4 at all the stages. Padding
   141  			// with zero to make sure lenght is always >= 4.
   142  			encData = append(encData, 0, 0, 0)
   143  		}
   144  
   145  		groupvarint.Decode4(tmpUids, encData)
   146  		encData = encData[groupvarint.BytesUsed[encData[0]]:]
   147  		for i := 0; i < 4; i++ {
   148  			sum = last + uint64(tmpUids[i])
   149  			d.uids = append(d.uids, sum)
   150  			last = sum
   151  		}
   152  	}
   153  
   154  	d.uids = d.uids[:block.NumUids]
   155  	return d.uids
   156  }
   157  
   158  // ApproxLen returns the approximate number of UIDs in the pb.UidPack object.
   159  func (d *Decoder) ApproxLen() int {
   160  	return int(d.Pack.BlockSize) * (len(d.Pack.Blocks) - d.blockIdx)
   161  }
   162  
   163  type searchFunc func(int) bool
   164  
   165  // Seek will search for uid in a packed block using the specified whence position.
   166  // The value of whence must be one of the predefined values SeekStart or SeekCurrent.
   167  // SeekStart searches uid and includes it as part of the results.
   168  // SeekCurrent searches uid but only as offset, it won't be included with results.
   169  //
   170  // Returns a slice of all uids whence the position, or an empty slice if none found.
   171  func (d *Decoder) Seek(uid uint64, whence seekPos) []uint64 {
   172  	if d.Pack == nil {
   173  		return []uint64{}
   174  	}
   175  	d.blockIdx = 0
   176  	if uid == 0 {
   177  		return d.unpackBlock()
   178  	}
   179  
   180  	pack := d.Pack
   181  	blocksFunc := func() searchFunc {
   182  		var f searchFunc
   183  		switch whence {
   184  		case SeekStart:
   185  			f = func(i int) bool { return pack.Blocks[i].Base >= uid }
   186  		case SeekCurrent:
   187  			f = func(i int) bool { return pack.Blocks[i].Base > uid }
   188  		}
   189  		return f
   190  	}
   191  
   192  	idx := sort.Search(len(pack.Blocks), blocksFunc())
   193  	// The first block.Base >= uid.
   194  	if idx == 0 {
   195  		return d.unpackBlock()
   196  	}
   197  	// The uid is the first entry in the block.
   198  	if idx < len(pack.Blocks) && pack.Blocks[idx].Base == uid {
   199  		d.blockIdx = idx
   200  		return d.unpackBlock()
   201  	}
   202  
   203  	// Either the idx = len(pack.Blocks) that means it wasn't found in any of the block's base. Or,
   204  	// we found the first block index whose base is greater than uid. In these cases, go to the
   205  	// previous block and search there.
   206  	d.blockIdx = idx - 1 // Move to the previous block. If blockIdx<0, unpack will deal with it.
   207  	d.unpackBlock()      // And get all their uids.
   208  
   209  	uidsFunc := func() searchFunc {
   210  		var f searchFunc
   211  		switch whence {
   212  		case SeekStart:
   213  			f = func(i int) bool { return d.uids[i] >= uid }
   214  		case SeekCurrent:
   215  			f = func(i int) bool { return d.uids[i] > uid }
   216  		}
   217  		return f
   218  	}
   219  
   220  	// uidx points to the first uid in the uid list, which is >= uid.
   221  	uidx := sort.Search(len(d.uids), uidsFunc())
   222  	if uidx < len(d.uids) { // Found an entry in uids, which >= uid.
   223  		d.uids = d.uids[uidx:]
   224  		return d.uids
   225  	}
   226  	// Could not find any uid in the block, which is >= uid. The next block might still have valid
   227  	// entries > uid.
   228  	return d.Next()
   229  }
   230  
// Uids returns the uids currently held by the decoder, i.e. whatever the most recent Seek,
// LinearSeek or Next call left in its internal buffer. It does not decode the whole pb.UidPack.
// The slice is owned by the Decoder, and its contents would be changed by the next call that
// unpacks a block. It should be copied if passed around.
func (d *Decoder) Uids() []uint64 {
	return d.uids
}
   237  
   238  // LinearSeek returns uids of the last block whose base is less than seek.
   239  // If there are no such blocks i.e. seek < base of first block, it returns uids of first
   240  // block. LinearSeek is used to get closest uids which are >= seek.
   241  func (d *Decoder) LinearSeek(seek uint64) []uint64 {
   242  	for {
   243  		v := d.PeekNextBase()
   244  		if seek < v {
   245  			break
   246  		}
   247  		d.blockIdx++
   248  	}
   249  
   250  	return d.unpackBlock()
   251  }
   252  
   253  // PeekNextBase returns the base of the next block without advancing the decoder.
   254  func (d *Decoder) PeekNextBase() uint64 {
   255  	bidx := d.blockIdx + 1
   256  	if bidx < len(d.Pack.Blocks) {
   257  		return d.Pack.Blocks[bidx].Base
   258  	}
   259  	return math.MaxUint64
   260  }
   261  
   262  // Valid returns true if the decoder has not reached the end of the packed data.
   263  func (d *Decoder) Valid() bool {
   264  	return d.blockIdx < len(d.Pack.Blocks)
   265  }
   266  
   267  // Next moves the decoder on to the next block.
   268  func (d *Decoder) Next() []uint64 {
   269  	d.blockIdx++
   270  	return d.unpackBlock()
   271  }
   272  
   273  // Encode takes in a list of uids and a block size. It would pack these uids into blocks of the
   274  // given size, with the last block having fewer uids. Within each block, it stores the first uid as
   275  // base. For each next uid, a delta = uids[i] - uids[i-1] is stored. Protobuf uses Varint encoding,
   276  // as mentioned here: https://developers.google.com/protocol-buffers/docs/encoding . This ensures
   277  // that the deltas being considerably smaller than the original uids are nicely packed in fewer
   278  // bytes. Our benchmarks on artificial data show compressed size to be 13% of the original. This
   279  // mechanism is a LOT simpler to understand and if needed, debug.
   280  func Encode(uids []uint64, blockSize int) *pb.UidPack {
   281  	enc := Encoder{BlockSize: blockSize}
   282  	for _, uid := range uids {
   283  		enc.Add(uid)
   284  	}
   285  	return enc.Done()
   286  }
   287  
   288  // ApproxLen would indicate the total number of UIDs in the pack. Can be used for int slice
   289  // allocations.
   290  func ApproxLen(pack *pb.UidPack) int {
   291  	if pack == nil {
   292  		return 0
   293  	}
   294  	return len(pack.Blocks) * int(pack.BlockSize)
   295  }
   296  
   297  // ExactLen would calculate the total number of UIDs. Instead of using a UidPack, it accepts blocks,
   298  // so we can calculate the number of uids after a seek.
   299  func ExactLen(pack *pb.UidPack) int {
   300  	if pack == nil {
   301  		return 0
   302  	}
   303  	sz := len(pack.Blocks)
   304  	if sz == 0 {
   305  		return 0
   306  	}
   307  	num := 0
   308  	for _, b := range pack.Blocks {
   309  		num += int(b.NumUids) // NumUids includes the base UID.
   310  	}
   311  	return num
   312  }
   313  
   314  // Decode decodes the UidPack back into the list of uids. This is a stop-gap function, Decode would
   315  // need to do more specific things than just return the list back.
   316  func Decode(pack *pb.UidPack, seek uint64) []uint64 {
   317  	uids := make([]uint64, 0, ApproxLen(pack))
   318  	dec := Decoder{Pack: pack}
   319  
   320  	for block := dec.Seek(seek, SeekStart); len(block) > 0; block = dec.Next() {
   321  		uids = append(uids, block...)
   322  	}
   323  	return uids
   324  }
   325  
   326  func match32MSB(num1, num2 uint64) bool {
   327  	return (num1 & bitMask) == (num2 & bitMask)
   328  }