github.com/wrgl/wrgl@v0.14.0/pkg/index/ordered_hash_set.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright © 2022 Wrangle Ltd
     3  
     4  package index
     5  
import (
	"bytes"
	"io"
	"sort"
)
    10  
// OrderedHashSetWriter collects the 16-byte hash prefix of every row, sorts
// the hashes ascending, and serializes them to w. It implements
// sort.Interface so that hashes and offsets are permuted together.
type OrderedHashSetWriter struct {
	// fanout is the 256-entry table filled in by computeFanoutTable from the
	// sorted hashes; Flush writes it first.
	// NOTE(review): presumably indexed by the first byte of a hash — confirm
	// against computeFanoutTable. The reader treats entry 255 as the total
	// number of hashes.
	fanout [256]uint32
	// offsets[i] is the position that hashes[i] had in the rows slice passed
	// to the constructor.
	offsets []uint32
	// hashes holds one 16-byte copy per input row, kept parallel to offsets.
	hashes [][]byte
	// w is the destination that Flush writes to.
	w io.Writer
}
    17  
    18  func NewOrderedHashSetWriter(w io.Writer, rows [][]byte) *OrderedHashSetWriter {
    19  	n := len(rows)
    20  	hashes := make([][]byte, n)
    21  	offsets := make([]uint32, n)
    22  	for i, row := range rows {
    23  		hashes[i] = make([]byte, 16)
    24  		copy(hashes[i], row[:16])
    25  		offsets[i] = uint32(i)
    26  	}
    27  	iw := &OrderedHashSetWriter{
    28  		hashes:  hashes,
    29  		offsets: offsets,
    30  		w:       w,
    31  	}
    32  	sort.Sort(iw)
    33  	computeFanoutTable(&iw.fanout, iw.hashes)
    34  	return iw
    35  }
    36  
    37  func (w *OrderedHashSetWriter) Len() int {
    38  	return len(w.offsets)
    39  }
    40  
    41  func (w *OrderedHashSetWriter) Less(a, b int) bool {
    42  	for i := 0; i < 16; i++ {
    43  		if w.hashes[a][i] == w.hashes[b][i] {
    44  			continue
    45  		}
    46  		return w.hashes[a][i] < w.hashes[b][i]
    47  	}
    48  	return false
    49  }
    50  
    51  func (w *OrderedHashSetWriter) Swap(a, b int) {
    52  	w.hashes[a], w.hashes[b] = w.hashes[b], w.hashes[a]
    53  	w.offsets[a], w.offsets[b] = w.offsets[b], w.offsets[a]
    54  }
    55  
    56  func (w *OrderedHashSetWriter) Flush() error {
    57  	err := writeUint32s(w.w, w.fanout[:])
    58  	if err != nil {
    59  		return err
    60  	}
    61  	for _, b := range w.hashes {
    62  		_, err := w.w.Write(b)
    63  		if err != nil {
    64  			return err
    65  		}
    66  	}
    67  	return writeUint32s(w.w, w.offsets)
    68  }
    69  
// OrderedHashSet reads a set serialized by OrderedHashSetWriter from a
// seekable source.
type OrderedHashSet struct {
	// size is the value of fanout entry 255, read at construction; readOffset
	// uses it as the number of 16-byte hashes preceding the offsets section.
	size uint32
	// r is the underlying source; Close closes it.
	r io.ReadSeekCloser
	// buf is a 16-byte scratch buffer handed to the read helpers.
	buf []byte
}
    75  
    76  func NewOrderedHashSet(r io.ReadSeekCloser) (s *OrderedHashSet, err error) {
    77  	s = &OrderedHashSet{
    78  		r:   r,
    79  		buf: make([]byte, 16),
    80  	}
    81  	s.size, err = s.readFanout(255)
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  	return
    86  }
    87  
    88  func (s *OrderedHashSet) Close() error {
    89  	return s.r.Close()
    90  }
    91  
    92  func (s *OrderedHashSet) readFanout(off byte) (uint32, error) {
    93  	return readUint32(s.r, s.buf, 0, int(off))
    94  }
    95  
    96  func (s *OrderedHashSet) readOffset(ind uint32) (off int, err error) {
    97  	u, err := readUint32(s.r, s.buf, 1024+int64(s.size)*16, int(ind))
    98  	return int(u), err
    99  }
   100  
   101  func (s *OrderedHashSet) IndexOf(b []byte) (off int, err error) {
   102  	pos, err := indexOf(s.r, s.buf, b)
   103  	if err != nil {
   104  		return
   105  	}
   106  	if pos == -1 {
   107  		return -1, nil
   108  	}
   109  	return s.readOffset(uint32(pos))
   110  }