github.com/wrgl/wrgl@v0.14.0/pkg/objects/str_list.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright © 2022 Wrangle Ltd 3 4 package objects 5 6 import ( 7 "bytes" 8 "encoding/binary" 9 "errors" 10 "fmt" 11 "io" 12 "sort" 13 ) 14 15 // StrListEncoder encodes string slice. Max bytes size for each string is 65536 bytes 16 type StrListEncoder struct { 17 buf []byte 18 reuseRecords bool 19 } 20 21 func NewStrListEncoder(reuseRecords bool) *StrListEncoder { 22 return &StrListEncoder{ 23 buf: make([]byte, 0, 256), 24 reuseRecords: reuseRecords, 25 } 26 } 27 28 func (e *StrListEncoder) Encode(sl []string) []byte { 29 bufLen := 4 30 for _, s := range sl { 31 bufLen += len(s) + 2 32 } 33 if bufLen > cap(e.buf) { 34 e.buf = make([]byte, bufLen) 35 } else { 36 e.buf = e.buf[:bufLen] 37 } 38 if len(sl) > maxUint32 { 39 panic(fmt.Errorf("slice length is too long (%d > 4294967296)", len(sl))) 40 } 41 binary.BigEndian.PutUint32(e.buf, uint32(len(sl))) 42 var offset uint16 = 4 43 for _, s := range sl { 44 if len(s) > 65536 { 45 panic(fmt.Errorf("cell value %q is too long (%d > 65536)", s[:40]+"...", len(s))) 46 } 47 l := uint16(len(s)) 48 binary.BigEndian.PutUint16(e.buf[offset:], l) 49 offset += 2 50 copy(e.buf[offset:], s) 51 offset += l 52 } 53 b := e.buf 54 if !e.reuseRecords { 55 b = make([]byte, len(e.buf)) 56 copy(b, e.buf) 57 } 58 return b 59 } 60 61 // StrListDecoder decodes string slice. 
type StrListDecoder struct {
	// strs is the reusable result slice; it is non-nil only when the decoder
	// was created with reuseRecords set.
	strs []string
	// buf is scratch space for length prefixes and string bytes.
	buf          []byte
	reuseRecords bool
	// pos counts the bytes consumed from the reader during the current Read.
	pos int
}

// NewStrListDecoder creates a decoder. When reuseRecords is true, the string
// slices returned by Decode/Read and the bytes returned by ReadBytes share
// storage owned by the decoder and are overwritten by subsequent calls.
func NewStrListDecoder(reuseRecords bool) *StrListDecoder {
	d := &StrListDecoder{
		buf:          make([]byte, 4),
		reuseRecords: reuseRecords,
	}
	if reuseRecords {
		d.strs = make([]string, 0, 256)
	}
	return d
}

// strSlice returns an empty slice with room for n strings, reusing d.strs
// when the decoder is in reuse mode.
func (d *StrListDecoder) strSlice(n uint32) []string {
	if d.strs != nil {
		if n > uint32(cap(d.strs)) {
			d.strs = make([]string, 0, n)
		}
		return d.strs[:0]
	}
	return make([]string, 0, n)
}

// Decode parses an encoded string list (4-byte big-endian count, then for
// each string a 2-byte big-endian length followed by the bytes) and returns
// the strings. It panics if b is too short to hold a length prefix it needs
// to read.
func (d *StrListDecoder) Decode(b []byte) []string {
	count := binary.BigEndian.Uint32(b)
	sl := d.strSlice(count)
	// The cursor is a plain int: a uint16 cursor would wrap around once the
	// encoded list is larger than 65535 bytes and read from the wrong place.
	offset := 4
	for i := uint32(0); i < count; i++ {
		l := int(binary.BigEndian.Uint16(b[offset:]))
		offset += 2
		if l == 0 {
			sl = append(sl, "")
			continue
		}
		d.ensureBufSize(l)
		copy(d.buf[:l], b[offset:])
		offset += l
		sl = append(sl, string(d.buf[:l]))
	}
	return sl
}

// ValidateStrListBytes checks that b starts with a well-formed encoded string
// list and returns the number of bytes that list occupies. Input that is too
// short for the count, for a length prefix, or for a string body yields an
// error instead of a panic.
func ValidateStrListBytes(b []byte) (int, error) {
	n := len(b)
	if n < 4 {
		return 0, fmt.Errorf("invalid strList")
	}
	count := int(binary.BigEndian.Uint32(b))
	offset := 4
	for i := 0; i < count; i++ {
		// Make sure the 2-byte length prefix is actually present before
		// reading it.
		if offset+2 > n {
			return 0, fmt.Errorf("invalid strList")
		}
		l := binary.BigEndian.Uint16(b[offset:])
		offset += 2 + int(l)
		if offset > n {
			return 0, fmt.Errorf("invalid strList")
		}
	}
	return offset, nil
}

// ensureBufSize grows d.buf, by repeated doubling of its capacity, until it
// can hold at least n bytes. Existing content is preserved.
func (d *StrListDecoder) ensureBufSize(n int) {
	if n <= cap(d.buf) {
		return
	}
	size := cap(d.buf)
	if size == 0 {
		// Doubling zero never terminates; start from the size the
		// constructor uses.
		size = 4
	}
	for size < n {
		size *= 2
	}
	b := make([]byte, size)
	copy(b, d.buf)
	d.buf = b
}

// readUint16 reads a big-endian uint16 from r. The bytes read are added to
// d.pos even when the read fails.
func (d *StrListDecoder) readUint16(r io.Reader) (uint16, error) {
	// Clear the two bytes first so a failed read does not expose stale data.
	d.buf[0], d.buf[1] = 0, 0
	b := d.buf[:2]
	n, err := io.ReadFull(r, b)
	d.pos += n
	return binary.BigEndian.Uint16(b), err
}

// readUint32 reads a big-endian uint32 from r and advances d.pos on success.
func (d *StrListDecoder) readUint32(r io.Reader) (uint32, error) {
	b := d.buf[:4]
	n, err := io.ReadFull(r, b)
	if err != nil {
		return 0, err
	}
	d.pos += n
	return binary.BigEndian.Uint32(b), nil
}

// Read decodes one string list from r. It returns the number of bytes
// consumed, the decoded strings and any read error.
func (d *StrListDecoder) Read(r io.Reader) (int64, []string, error) {
	d.pos = 0
	count, err := d.readUint32(r)
	if err != nil {
		return 0, nil, err
	}
	sl := d.strSlice(count)
	var i uint32
	for i = 0; i < count; i++ {
		l, err := d.readUint16(r)
		if err != nil {
			return 0, nil, err
		}
		if l == 0 {
			sl = append(sl, "")
			continue
		}
		d.ensureBufSize(int(l))
		n, err := io.ReadFull(r, d.buf[:l])
		d.pos += n
		sl = append(sl, string(d.buf[:n]))
		// io.ReadFull reports io.EOF only when it read nothing, so a stream
		// that ends right after the last length prefix is accepted and the
		// final string is recorded as empty.
		// NOTE(review): confirm this leniency is intentional.
		if errors.Is(err, io.EOF) && i == count-1 {
			break
		}
		if err != nil {
			return 0, nil, err
		}
	}
	return int64(d.pos), sl, nil
}

// ReadBytes returns the number of bytes and the actual bytes of encoded StrList
// read from r, without decoding the individual strings. In reuse mode the
// returned slice aliases the decoder's internal buffer.
func (d *StrListDecoder) ReadBytes(r io.Reader) (n int, b []byte, err error) {
	// read number of strings in the list
	if _, err = io.ReadFull(r, d.buf[:4]); err != nil {
		return n, nil, fmt.Errorf("error reading number of strings: %w", err)
	}
	count := binary.BigEndian.Uint32(d.buf)

	n = 4
	var i uint32
	var m int
	for i = 0; i < count; i++ {
		d.ensureBufSize(n + 2)
		if _, err = io.ReadFull(r, d.buf[n:n+2]); err != nil {
			return n, nil, fmt.Errorf("error reading string length (2 bytes): %w", err)
		}
		l := binary.BigEndian.Uint16(d.buf[n:])

		n += 2
		d.ensureBufSize(n + int(l))
		m, err = io.ReadFull(r, d.buf[n:n+int(l)])
		n += m
		// Same tolerance as Read: a stream ending right after the last
		// length prefix is not treated as an error.
		if errors.Is(err, io.EOF) && i == count-1 {
			break
		}
		if err != nil {
			return n, nil, fmt.Errorf("error reading string (%d bytes): %w", l, err)
		}
	}
	if !d.reuseRecords {
		b = make([]byte, n)
		copy(b, d.buf[:n])
		return n, b, nil
	}
	return n, d.buf[:n], nil
}

// StrList is an encoded string list that can be queried without decoding
// every string.
type StrList []byte

// seekColumnOffset locates string number u. It returns the offset of the
// string's first byte (just past its length prefix) and its length n. It
// panics if u is not smaller than the stored count or if the bytes end before
// the string is reached.
func (b StrList) seekColumnOffset(u uint32) (off, n int) {
	var i uint32
	l := len(b)
	c := binary.BigEndian.Uint32(b)
	if u >= c {
		panic(fmt.Errorf("column out of bound: %d >= %d", u, c))
	}
	off = 4
	// Require a complete 2-byte prefix so that a dangling byte ends in the
	// "corrupted" panic below rather than a slice-bounds runtime error.
	for i = 0; off+2 <= l; i++ {
		n = int(binary.BigEndian.Uint16(b[off : off+2]))
		off += 2
		if i == u {
			return off, n
		}
		off += n
	}
	panic(fmt.Errorf("corrupted strList bytes"))
}

// seekColumn returns the raw bytes of string number u. The result aliases b.
func (b StrList) seekColumn(u uint32) []byte {
	off, n := b.seekColumnOffset(u)
	return b[off : off+n]
}

// ReadColumns returns the strings at the given indices, in the order the
// indices are listed.
func (b StrList) ReadColumns(columns []uint32) []string {
	sl := make([]string, len(columns))
	for i, u := range columns {
		sl[i] = string(b.seekColumn(u))
	}
	return sl
}

// StringSliceIsLess reports whether a sorts before b. Only the positions
// listed in pk are compared, in that order; when pk is empty every position
// of a is compared in order. Equal slices are not "less".
func StringSliceIsLess(pk []uint32, a, b []string) bool {
	if len(pk) == 0 {
		for i, s := range a {
			if s != b[i] {
				return s < b[i]
			}
		}
		return false
	}
	for _, u := range pk {
		if a[u] != b[u] {
			return a[u] < b[u]
		}
	}
	return false
}

// LessThan returns true if b is less than c based on given column indices.
// When columns is empty, all columns of b are compared in order.
func (b StrList) LessThan(columns []uint32, c StrList) bool {
	if len(columns) == 0 {
		n := binary.BigEndian.Uint32(b)
		for i := uint32(0); i < n; i++ {
			if v := bytes.Compare(b.seekColumn(i), c.seekColumn(i)); v != 0 {
				return v < 0
			}
		}
		return false
	}
	for _, u := range columns {
		if v := bytes.Compare(b.seekColumn(u), c.seekColumn(u)); v != 0 {
			return v < 0
		}
	}
	return false
}

// StrListEditor can either remove certain columns from StrList or
// remove everything except certain columns. It is built to minimize
// allocations so given StrList will always be edited in place.
type StrListEditor struct {
	// sortedColumns holds the requested column indices in ascending order.
	sortedColumns []uint32
	// colIndices maps each requested column, in the caller's order, to its
	// position within sortedColumns.
	colIndices []int
	// offsets and lens are filled by findOffsets: for every entry of
	// sortedColumns, where its length prefix starts and how many bytes the
	// prefix plus the string occupy.
	offsets []int
	lens    []int
}

// NewStrListEditor prepares an editor for the given column indices. The
// caller's ordering is remembered so PickFrom can emit columns in that order.
func NewStrListEditor(columns []uint32) *StrListEditor {
	size := len(columns)

	sorted := make([]uint32, size)
	copy(sorted, columns)
	sort.Slice(sorted, func(a, b int) bool {
		return sorted[a] < sorted[b]
	})

	// rank of every column inside the sorted order
	rank := map[uint32]int{}
	for pos, col := range sorted {
		rank[col] = pos
	}
	indices := make([]int, size)
	for i, col := range columns {
		indices[i] = rank[col]
	}

	return &StrListEditor{
		sortedColumns: sorted,
		colIndices:    indices,
		offsets:       make([]int, size),
		lens:          make([]int, size),
	}
}

// findOffsets walks b once and records, for each sorted column, the offset of
// its length prefix and its total encoded size. It returns the string count
// stored in b and panics when a column index is out of range or the bytes end
// before a requested column is found.
func (r *StrListEditor) findOffsets(b []byte) (origLen uint32) {
	total := binary.BigEndian.Uint32(b)
	size := len(b)
	cursor := 4
	var col uint32 // index of the column that starts at cursor

	for i, want := range r.sortedColumns {
		if want >= total {
			panic(fmt.Errorf("column out of bound: %d >= %d", want, total))
		}
		found := false
		for !found && cursor < size {
			width := 2 + int(binary.BigEndian.Uint16(b[cursor:]))
			if col == want {
				r.offsets[i] = cursor
				r.lens[i] = width
				found = true
			}
			cursor += width
			col++
		}
		if !found {
			panic(fmt.Errorf("corrupted strList bytes"))
		}
	}
	return total
}

// RemoveFrom deletes the editor's columns from b in place and returns the
// shortened slice.
func (r *StrListEditor) RemoveFrom(b []byte) []byte {
	count := r.findOffsets(b)
	binary.BigEndian.PutUint32(b, count-uint32(len(r.offsets)))
	// Cut from the back so earlier offsets stay valid.
	for i := len(r.offsets) - 1; i >= 0; i-- {
		start := r.offsets[i]
		end := start + r.lens[i]
		b = append(b[:start], b[end:]...)
	}
	return b
}

// ensureLength returns a slice of length n, reusing b's storage when its
// capacity allows and otherwise copying b into a new slice.
func (r *StrListEditor) ensureLength(b []byte, n int) []byte {
	if n <= cap(b) {
		return b[:n]
	}
	grown := make([]byte, n)
	copy(grown, b)
	return grown
}

// PickFrom writes into dst an encoded list made of only the editor's columns
// of src, in the order they were given to NewStrListEditor, and returns dst
// (possibly reallocated).
func (r *StrListEditor) PickFrom(dst, src []byte) []byte {
	r.findOffsets(src)

	size := 4
	for _, n := range r.lens {
		size += n
	}
	dst = r.ensureLength(dst, size)
	binary.BigEndian.PutUint32(dst, uint32(len(r.sortedColumns)))

	cursor := 4
	for _, idx := range r.colIndices {
		start, width := r.offsets[idx], r.lens[idx]
		copy(dst[cursor:], src[start:start+width])
		cursor += width
	}
	return dst
}