github.com/wrgl/wrgl@v0.14.0/pkg/index/ordered_hash_set.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright © 2022 Wrangle Ltd 3 4 package index 5 6 import ( 7 "io" 8 "sort" 9 ) 10 11 type OrderedHashSetWriter struct { 12 fanout [256]uint32 13 offsets []uint32 14 hashes [][]byte 15 w io.Writer 16 } 17 18 func NewOrderedHashSetWriter(w io.Writer, rows [][]byte) *OrderedHashSetWriter { 19 n := len(rows) 20 hashes := make([][]byte, n) 21 offsets := make([]uint32, n) 22 for i, row := range rows { 23 hashes[i] = make([]byte, 16) 24 copy(hashes[i], row[:16]) 25 offsets[i] = uint32(i) 26 } 27 iw := &OrderedHashSetWriter{ 28 hashes: hashes, 29 offsets: offsets, 30 w: w, 31 } 32 sort.Sort(iw) 33 computeFanoutTable(&iw.fanout, iw.hashes) 34 return iw 35 } 36 37 func (w *OrderedHashSetWriter) Len() int { 38 return len(w.offsets) 39 } 40 41 func (w *OrderedHashSetWriter) Less(a, b int) bool { 42 for i := 0; i < 16; i++ { 43 if w.hashes[a][i] == w.hashes[b][i] { 44 continue 45 } 46 return w.hashes[a][i] < w.hashes[b][i] 47 } 48 return false 49 } 50 51 func (w *OrderedHashSetWriter) Swap(a, b int) { 52 w.hashes[a], w.hashes[b] = w.hashes[b], w.hashes[a] 53 w.offsets[a], w.offsets[b] = w.offsets[b], w.offsets[a] 54 } 55 56 func (w *OrderedHashSetWriter) Flush() error { 57 err := writeUint32s(w.w, w.fanout[:]) 58 if err != nil { 59 return err 60 } 61 for _, b := range w.hashes { 62 _, err := w.w.Write(b) 63 if err != nil { 64 return err 65 } 66 } 67 return writeUint32s(w.w, w.offsets) 68 } 69 70 type OrderedHashSet struct { 71 size uint32 72 r io.ReadSeekCloser 73 buf []byte 74 } 75 76 func NewOrderedHashSet(r io.ReadSeekCloser) (s *OrderedHashSet, err error) { 77 s = &OrderedHashSet{ 78 r: r, 79 buf: make([]byte, 16), 80 } 81 s.size, err = s.readFanout(255) 82 if err != nil { 83 return nil, err 84 } 85 return 86 } 87 88 func (s *OrderedHashSet) Close() error { 89 return s.r.Close() 90 } 91 92 func (s *OrderedHashSet) readFanout(off byte) (uint32, error) { 93 return readUint32(s.r, s.buf, 0, int(off)) 94 } 95 96 func (s *OrderedHashSet) readOffset(ind uint32) (off int, err error) { 97 u, err := readUint32(s.r, s.buf, 1024+int64(s.size)*16, int(ind)) 98 return int(u), err 99 } 100 101 func (s *OrderedHashSet) IndexOf(b []byte) (off int, err error) { 102 pos, err := indexOf(s.r, s.buf, b) 103 if err != nil { 104 return 105 } 106 if pos == -1 { 107 return -1, nil 108 } 109 return s.readOffset(uint32(pos)) 110 }