github.com/wrgl/wrgl@v0.14.0/pkg/objects/str_list.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright © 2022 Wrangle Ltd
     3  
     4  package objects
     5  
     6  import (
     7  	"bytes"
     8  	"encoding/binary"
     9  	"errors"
    10  	"fmt"
    11  	"io"
    12  	"sort"
    13  )
    14  
// StrListEncoder encodes a string slice into its binary form: a 4-byte
// big-endian string count followed, for each string, by a 2-byte big-endian
// length and the string's bytes. Because the length prefix is 2 bytes wide, a
// single string can be at most 65535 bytes long.
type StrListEncoder struct {
	buf          []byte // scratch buffer that Encode writes into; kept between calls
	reuseRecords bool   // when true, Encode returns buf itself instead of a copy
}
    20  
    21  func NewStrListEncoder(reuseRecords bool) *StrListEncoder {
    22  	return &StrListEncoder{
    23  		buf:          make([]byte, 0, 256),
    24  		reuseRecords: reuseRecords,
    25  	}
    26  }
    27  
    28  func (e *StrListEncoder) Encode(sl []string) []byte {
    29  	bufLen := 4
    30  	for _, s := range sl {
    31  		bufLen += len(s) + 2
    32  	}
    33  	if bufLen > cap(e.buf) {
    34  		e.buf = make([]byte, bufLen)
    35  	} else {
    36  		e.buf = e.buf[:bufLen]
    37  	}
    38  	if len(sl) > maxUint32 {
    39  		panic(fmt.Errorf("slice length is too long (%d > 4294967296)", len(sl)))
    40  	}
    41  	binary.BigEndian.PutUint32(e.buf, uint32(len(sl)))
    42  	var offset uint16 = 4
    43  	for _, s := range sl {
    44  		if len(s) > 65536 {
    45  			panic(fmt.Errorf("cell value %q is too long (%d > 65536)", s[:40]+"...", len(s)))
    46  		}
    47  		l := uint16(len(s))
    48  		binary.BigEndian.PutUint16(e.buf[offset:], l)
    49  		offset += 2
    50  		copy(e.buf[offset:], s)
    51  		offset += l
    52  	}
    53  	b := e.buf
    54  	if !e.reuseRecords {
    55  		b = make([]byte, len(e.buf))
    56  		copy(b, e.buf)
    57  	}
    58  	return b
    59  }
    60  
// StrListDecoder decodes string slices from their encoded form, either from a
// byte slice (Decode) or from an io.Reader (Read, ReadBytes).
type StrListDecoder struct {
	strs         []string // reusable result slice; allocated by NewStrListDecoder only when reuseRecords is set
	buf          []byte   // scratch buffer for length prefixes and string bytes
	reuseRecords bool     // when true, returned slices may be overwritten by the next call
	pos          int      // number of bytes consumed so far by the current Read call
}
    68  
    69  func NewStrListDecoder(reuseRecords bool) *StrListDecoder {
    70  	d := &StrListDecoder{
    71  		buf:          make([]byte, 4),
    72  		reuseRecords: reuseRecords,
    73  	}
    74  	if reuseRecords {
    75  		d.strs = make([]string, 0, 256)
    76  	}
    77  	return d
    78  }
    79  
    80  func (d *StrListDecoder) strSlice(n uint32) []string {
    81  	if d.strs != nil {
    82  		if n > uint32(cap(d.strs)) {
    83  			d.strs = make([]string, 0, n)
    84  		}
    85  		return d.strs[:0]
    86  	}
    87  	return make([]string, 0, n)
    88  }
    89  
    90  func (d *StrListDecoder) Decode(b []byte) []string {
    91  	count := binary.BigEndian.Uint32(b)
    92  	sl := d.strSlice(count)
    93  	var offset uint16 = 4
    94  	var i uint32
    95  	for i = 0; i < count; i++ {
    96  		l := binary.BigEndian.Uint16(b[offset:])
    97  		offset += 2
    98  		if l == 0 {
    99  			sl = append(sl, "")
   100  			continue
   101  		}
   102  		d.ensureBufSize(int(l))
   103  		copy(d.buf[:l], b[offset:])
   104  		offset += l
   105  		sl = append(sl, string(d.buf[:l]))
   106  	}
   107  	return sl
   108  }
   109  
   110  func ValidateStrListBytes(b []byte) (int, error) {
   111  	count := int(binary.BigEndian.Uint32(b))
   112  	offset := 4
   113  	n := len(b)
   114  	for i := 0; i < count; i++ {
   115  		l := binary.BigEndian.Uint16(b[offset:])
   116  		offset += 2 + int(l)
   117  		if offset > n {
   118  			return 0, fmt.Errorf("invalid strList")
   119  		}
   120  	}
   121  	return offset, nil
   122  }
   123  
   124  func (d *StrListDecoder) ensureBufSize(n int) {
   125  	for n > cap(d.buf) {
   126  		b := make([]byte, cap(d.buf)*2)
   127  		copy(b, d.buf)
   128  		d.buf = b
   129  	}
   130  }
   131  
   132  func (d *StrListDecoder) readUint16(r io.Reader) (uint16, error) {
   133  	d.buf[0], d.buf[1] = 0, 0
   134  	b := d.buf[:2]
   135  	n, err := io.ReadFull(r, b)
   136  	d.pos += n
   137  	return binary.BigEndian.Uint16(b), err
   138  }
   139  
   140  func (d *StrListDecoder) readUint32(r io.Reader) (uint32, error) {
   141  	b := d.buf[:4]
   142  	n, err := io.ReadFull(r, b)
   143  	if err != nil {
   144  		return 0, err
   145  	}
   146  	d.pos += n
   147  	return binary.BigEndian.Uint32(b), nil
   148  }
   149  
   150  func (d *StrListDecoder) Read(r io.Reader) (int64, []string, error) {
   151  	d.pos = 0
   152  	count, err := d.readUint32(r)
   153  	if err != nil {
   154  		return 0, nil, err
   155  	}
   156  	sl := d.strSlice(count)
   157  	var i uint32
   158  	for i = 0; i < count; i++ {
   159  		l, err := d.readUint16(r)
   160  		if err != nil {
   161  			return 0, nil, err
   162  		}
   163  		if l == 0 {
   164  			sl = append(sl, "")
   165  			continue
   166  		}
   167  		d.ensureBufSize(int(l))
   168  		n, err := io.ReadFull(r, d.buf[:l])
   169  		d.pos += n
   170  		sl = append(sl, string(d.buf[:n]))
   171  		if errors.Is(err, io.EOF) && i == count-1 {
   172  			break
   173  		}
   174  		if err != nil {
   175  			return 0, nil, err
   176  		}
   177  	}
   178  	return int64(d.pos), sl, nil
   179  }
   180  
   181  // ReadBytes returns the number of bytes and the actual bytes of encoded StrList
   182  func (d *StrListDecoder) ReadBytes(r io.Reader) (n int, b []byte, err error) {
   183  	// read number of strings in the list
   184  	_, err = io.ReadFull(r, d.buf[:4])
   185  	if err != nil {
   186  		err = fmt.Errorf("error reading number of strings: %w", err)
   187  		return
   188  	}
   189  	count := binary.BigEndian.Uint32(d.buf)
   190  
   191  	n = 4
   192  	var i uint32
   193  	var m int
   194  	for i = 0; i < count; i++ {
   195  		d.ensureBufSize(n + 2)
   196  		_, err = io.ReadFull(r, d.buf[n:n+2])
   197  		if err != nil {
   198  			err = fmt.Errorf("error reading string length (2 bytes): %w", err)
   199  			return
   200  		}
   201  		l := binary.BigEndian.Uint16(d.buf[n:])
   202  
   203  		n += 2
   204  		d.ensureBufSize(n + int(l))
   205  		m, err = io.ReadFull(r, d.buf[n:n+int(l)])
   206  		n += m
   207  		if errors.Is(err, io.EOF) && i == count-1 {
   208  			break
   209  		}
   210  		if err != nil {
   211  			err = fmt.Errorf("error reading string (%d bytes): %w", l, err)
   212  			return
   213  		}
   214  	}
   215  	if !d.reuseRecords {
   216  		b = make([]byte, n)
   217  		copy(b, d.buf[:n])
   218  		return n, b, nil
   219  	}
   220  	return n, d.buf[:n], nil
   221  }
   222  
// StrList is the encoded form of a string slice: a 4-byte big-endian string
// count followed, for each string, by a 2-byte big-endian length and the
// string's bytes. Its methods read individual columns without decoding the
// whole list.
type StrList []byte
   224  
   225  func (b StrList) seekColumnOffset(u uint32) (off, n int) {
   226  	var i uint32
   227  	l := len(b)
   228  	c := binary.BigEndian.Uint32(b)
   229  	if u >= c {
   230  		panic(fmt.Errorf("column out of bound: %d >= %d", u, c))
   231  	}
   232  	off = 4
   233  	for i = 0; off < l; i++ {
   234  		n = int(binary.BigEndian.Uint16(b[off : off+2]))
   235  		off += 2
   236  		if i == u {
   237  			return
   238  		}
   239  		off += n
   240  	}
   241  	panic(fmt.Errorf("corrupted strList bytes"))
   242  }
   243  
   244  func (b StrList) seekColumn(u uint32) []byte {
   245  	off, n := b.seekColumnOffset(u)
   246  	return b[off : off+n]
   247  }
   248  
   249  func (b StrList) ReadColumns(columns []uint32) []string {
   250  	sl := make([]string, len(columns))
   251  	for i, u := range columns {
   252  		sl[i] = string(b.seekColumn(u))
   253  	}
   254  	return sl
   255  }
   256  
   257  func StringSliceIsLess(pk []uint32, a, b []string) bool {
   258  	if len(pk) == 0 {
   259  		for i, s := range a {
   260  			if s < b[i] {
   261  				return true
   262  			} else if s > b[i] {
   263  				return false
   264  			}
   265  		}
   266  		return false
   267  	}
   268  	for _, u := range pk {
   269  		if a[u] < b[u] {
   270  			return true
   271  		} else if a[u] > b[u] {
   272  			return false
   273  		}
   274  	}
   275  	return false
   276  }
   277  
   278  // LessThan returns true if a is less than b based on given column indices
   279  func (b StrList) LessThan(columns []uint32, c StrList) bool {
   280  	if len(columns) == 0 {
   281  		n := binary.BigEndian.Uint32(b)
   282  		var i uint32
   283  		for i = 0; i < n; i++ {
   284  			if v := bytes.Compare(b.seekColumn(i), c.seekColumn(i)); v == 1 {
   285  				return false
   286  			} else if v == -1 {
   287  				return true
   288  			}
   289  		}
   290  		return false
   291  	}
   292  	for _, u := range columns {
   293  		if v := bytes.Compare(b.seekColumn(u), c.seekColumn(u)); v == 1 {
   294  			return false
   295  		} else if v == -1 {
   296  			return true
   297  		}
   298  	}
   299  	return false
   300  }
   301  
// StrListEditor can either remove certain columns from an encoded StrList or
// keep only certain columns. It is built to minimize allocations: RemoveFrom
// edits the given bytes in place, and PickFrom writes into a destination
// buffer supplied by the caller.
type StrListEditor struct {
	sortedColumns []uint32 // the requested columns in ascending order
	colIndices    []int    // for each requested column, in the caller's order, its index in sortedColumns
	offsets       []int    // per sorted column: byte offset of its length prefix (filled by findOffsets)
	lens          []int    // per sorted column: encoded size, 2-byte prefix included (filled by findOffsets)
}
   311  
   312  func NewStrListEditor(columns []uint32) *StrListEditor {
   313  	n := len(columns)
   314  	r := &StrListEditor{
   315  		colIndices:    make([]int, n),
   316  		sortedColumns: make([]uint32, n),
   317  		offsets:       make([]int, n),
   318  		lens:          make([]int, n),
   319  	}
   320  	copy(r.sortedColumns, columns)
   321  	sort.Slice(r.sortedColumns, func(i, j int) bool {
   322  		return r.sortedColumns[i] < r.sortedColumns[j]
   323  	})
   324  	m := map[uint32]int{}
   325  	for i, j := range r.sortedColumns {
   326  		m[j] = i
   327  	}
   328  	for i := range r.colIndices {
   329  		r.colIndices[i] = m[columns[i]]
   330  	}
   331  	return r
   332  }
   333  
   334  func (r *StrListEditor) findOffsets(b []byte) (origLen uint32) {
   335  	var j uint32
   336  	l := len(b)
   337  	c := binary.BigEndian.Uint32(b)
   338  	off := 4
   339  	var n int
   340  mainLoop:
   341  	for i, u := range r.sortedColumns {
   342  		if u >= c {
   343  			panic(fmt.Errorf("column out of bound: %d >= %d", u, c))
   344  		}
   345  		for off < l {
   346  			n = int(binary.BigEndian.Uint16(b[off:]))
   347  			if j == u {
   348  				r.offsets[i] = off
   349  				r.lens[i] = n + 2
   350  			}
   351  			off += 2 + n
   352  			j++
   353  			if j-1 == u {
   354  				continue mainLoop
   355  			}
   356  		}
   357  		panic(fmt.Errorf("corrupted strList bytes"))
   358  	}
   359  	return c
   360  }
   361  
   362  func (r *StrListEditor) RemoveFrom(b []byte) []byte {
   363  	l := r.findOffsets(b)
   364  	binary.BigEndian.PutUint32(b, l-uint32(len(r.offsets)))
   365  	for i := len(r.offsets) - 1; i >= 0; i-- {
   366  		b = append(b[:r.offsets[i]], b[r.offsets[i]+r.lens[i]:]...)
   367  	}
   368  	return b
   369  }
   370  
   371  func (r *StrListEditor) ensureLength(b []byte, n int) []byte {
   372  	if n > cap(b) {
   373  		c := make([]byte, n)
   374  		copy(c, b)
   375  		b = c
   376  	} else {
   377  		b = b[:n]
   378  	}
   379  	return b
   380  }
   381  
   382  func (r *StrListEditor) PickFrom(dst, src []byte) []byte {
   383  	r.findOffsets(src)
   384  	total := 0
   385  	for _, n := range r.lens {
   386  		total += n
   387  	}
   388  	dst = r.ensureLength(dst, 4+total)
   389  	binary.BigEndian.PutUint32(dst, uint32(len(r.sortedColumns)))
   390  	off := 4
   391  	for _, i := range r.colIndices {
   392  		copy(dst[off:], src[r.offsets[i]:r.offsets[i]+r.lens[i]])
   393  		off += r.lens[i]
   394  	}
   395  	return dst
   396  }