github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/m3ninx/index/segment/fst/writer.go (about) 1 // Copyright (c) 2018 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 
package fst

import (
	"fmt"
	"io"

	"github.com/m3db/m3/src/m3ninx/generated/proto/fswriter"
	sgmt "github.com/m3db/m3/src/m3ninx/index/segment"
	"github.com/m3db/m3/src/m3ninx/index/segment/fst/encoding"
	"github.com/m3db/m3/src/m3ninx/postings"
	"github.com/m3db/m3/src/m3ninx/postings/pilosa"
	"github.com/m3db/m3/src/m3ninx/x"

	"github.com/golang/protobuf/proto"
)

// Default initial sizes. The postings-offsets, FST-terms-offsets and
// int-encoder sizes are used by newWriterWithVersion to pre-size the writer's
// buffers; the remaining values are not referenced by the writer code in this
// file (NOTE(review): presumably used elsewhere in the package — confirm).
var (
	defaultInitialPostingsOffsetsSize    = 1024
	defaultInitialFSTTermsOffsetsSize    = 1024
	defaultInitialDocOffsetsSize         = 1024
	defaultInitialPostingsNeedsUnionSize = 1024
	defaultInitialIntEncoderSize         = 128
	defaultPilosaRoaringMaxContainerSize = 128
)

// writer serializes the contents of a segment builder (documents, postings
// lists, per-field term FSTs and the fields FST) to caller-supplied
// io.Writers. The Write* methods must be called in a fixed order, which is
// enforced through the *Written flags below.
type writer struct {
	// version is the file-format version this writer emits.
	version Version
	// builder is the source of documents, fields and terms; set by Reset.
	builder sgmt.Builder
	// size is the number of documents in builder, recorded by Reset and
	// passed to the documents writer as a size hint.
	size int64

	// Reusable encoders/writers, reset between uses.
	intEncoder      *encoding.Encoder
	postingsEncoder *pilosa.Encoder
	fstWriter       *fstWriter
	docsWriter      *DocumentsWriter

	// metadata is the marshalled segment metadata produced by Reset.
	metadata []byte
	// The flags below record which files have been written so that
	// dependent files can refuse to be written out of order.
	docsDataFileWritten bool
	postingsFileWritten bool
	fstTermsFileWritten bool
	// fstTermsOffsets holds one offset per field, recorded by WriteFSTTerms
	// and consumed by WriteFSTFields.
	fstTermsOffsets []uint64
	// termPostingsOffsets holds one offset per (field, term), recorded by
	// WritePostingsOffsets and consumed by WriteFSTTerms.
	termPostingsOffsets []uint64

	// only used by versions >= 1.1
	// fieldPostingsOffsets holds one offset per field-level postings list.
	fieldPostingsOffsets []uint64
	// fieldData and fieldBuffer are reused to marshal per-field metadata.
	fieldData   *fswriter.FieldData
	fieldBuffer proto.Buffer
}

// WriterOptions is a set of options used when writing an FST.
type WriterOptions struct {
	// DisableRegistry disables the FST builder node registry cache, which
	// de-duplicates transitions that are an exact match of each other during
	// a final compilation phase. The registry helps compress the FST by a
	// significant amount (e.g. 2x). You can, however, disable it to avoid the
	// high fixed-cost lookups incurred while building the FST.
	DisableRegistry bool
}

// NewWriter returns a new writer configured with opts. It delegates to
// newWriterWithVersion without a version override.
func NewWriter(opts WriterOptions) (Writer, error) {
	return newWriterWithVersion(opts, nil)
}

// newWriterWithVersion is a constructor used by tests to override version.
85 func newWriterWithVersion(opts WriterOptions, vers *Version) (Writer, error) { 86 v := CurrentVersion 87 if vers != nil { 88 v = *vers 89 } 90 if err := v.Supported(); err != nil { 91 return nil, err 92 } 93 94 docsWriter, err := NewDocumentsWriter() 95 if err != nil { 96 return nil, err 97 } 98 99 return &writer{ 100 version: v, 101 intEncoder: encoding.NewEncoder(defaultInitialIntEncoderSize), 102 postingsEncoder: pilosa.NewEncoder(), 103 fstWriter: newFSTWriter(opts), 104 docsWriter: docsWriter, 105 fstTermsOffsets: make([]uint64, 0, defaultInitialFSTTermsOffsetsSize), 106 termPostingsOffsets: make([]uint64, 0, defaultInitialPostingsOffsetsSize), 107 108 fieldPostingsOffsets: make([]uint64, 0, defaultInitialPostingsOffsetsSize), 109 fieldData: &fswriter.FieldData{}, 110 }, nil 111 } 112 113 func (w *writer) clear() { 114 w.builder = nil 115 116 w.fstWriter.Reset(nil) 117 w.intEncoder.Reset() 118 w.postingsEncoder.Reset() 119 w.docsWriter.Reset(DocumentsWriterOptions{}) 120 121 w.metadata = nil 122 w.docsDataFileWritten = false 123 w.postingsFileWritten = false 124 w.fstTermsFileWritten = false 125 w.fstTermsOffsets = w.fstTermsOffsets[:0] 126 w.termPostingsOffsets = w.termPostingsOffsets[:0] 127 128 w.fieldPostingsOffsets = w.fieldPostingsOffsets[:0] 129 w.fieldData.Reset() 130 w.fieldBuffer.Reset() 131 } 132 133 func (w *writer) Reset(b sgmt.Builder) error { 134 w.clear() 135 136 if b == nil { 137 return nil 138 } 139 140 numDocs := len(b.Docs()) 141 metadata := defaultV1Metadata() 142 metadata.NumDocs = int64(numDocs) 143 metadataBytes, err := metadata.Marshal() 144 if err != nil { 145 return err 146 } 147 148 w.metadata = metadataBytes 149 w.builder = b 150 w.size = int64(numDocs) 151 return nil 152 } 153 154 func (w *writer) MajorVersion() int { 155 return w.version.Major 156 } 157 158 func (w *writer) MinorVersion() int { 159 return w.version.Minor 160 } 161 162 func (w *writer) Metadata() []byte { 163 return w.metadata 164 } 165 166 func (w *writer) 
WriteDocumentsData(iow io.Writer) error { 167 iter, err := w.builder.AllDocs() 168 closer := x.NewSafeCloser(iter) 169 defer closer.Close() 170 if err != nil { 171 return err 172 } 173 174 w.docsWriter.Reset(DocumentsWriterOptions{ 175 Iter: iter, 176 SizeHint: int(w.size), 177 }) 178 if err := w.docsWriter.WriteDocumentsData(iow); err != nil { 179 return err 180 } 181 182 w.docsDataFileWritten = true 183 return closer.Close() 184 } 185 186 func (w *writer) WriteDocumentsIndex(iow io.Writer) error { 187 if !w.docsDataFileWritten { 188 return fmt.Errorf("documents data file has to be written before documents index file") 189 } 190 191 return w.docsWriter.WriteDocumentsIndex(iow) 192 } 193 194 func (w *writer) WritePostingsOffsets(iow io.Writer) error { 195 var ( 196 writeFieldsPostingList = w.version.supportsFieldPostingsList() 197 currentOffset = uint64(0) 198 ) 199 writePL := func(pl postings.List) (uint64, error) { // helper method 200 // serialize the postings list 201 w.postingsEncoder.Reset() 202 postingsBytes, err := w.postingsEncoder.Encode(pl) 203 if err != nil { 204 return 0, err 205 } 206 return w.writePayloadAndSizeAndMagicNumber(iow, postingsBytes) 207 } 208 209 // retrieve known fields 210 fields, err := w.builder.FieldsPostingsList() 211 if err != nil { 212 return err 213 } 214 215 // for each known field 216 for fields.Next() { 217 f, fieldPostingsList := fields.Current() 218 // retrieve known terms for current field 219 terms, err := w.builder.Terms(f) 220 if err != nil { 221 return err 222 } 223 224 // for each term corresponding to the current field 225 for terms.Next() { 226 _, pl := terms.Current() 227 // write the postings list 228 n, err := writePL(pl) 229 if err != nil { 230 return err 231 } 232 // update offset with the number of bytes we've written 233 currentOffset += n 234 // track current offset as the offset for the current field/term 235 w.termPostingsOffsets = append(w.termPostingsOffsets, currentOffset) 236 } 237 238 // write the 
field level postings list 239 if writeFieldsPostingList { 240 // Write the unioned postings list out. 241 n, err := writePL(fieldPostingsList) 242 if err != nil { 243 return err 244 } 245 // update offset with the number of bytes we've written 246 currentOffset += n 247 // track current offset as the offset for the current field 248 w.fieldPostingsOffsets = append(w.fieldPostingsOffsets, currentOffset) 249 } 250 251 if err := terms.Err(); err != nil { 252 return err 253 } 254 255 if err := terms.Close(); err != nil { 256 return err 257 } 258 } 259 260 if err := fields.Err(); err != nil { 261 return err 262 } 263 264 if err := fields.Close(); err != nil { 265 return err 266 } 267 268 w.postingsFileWritten = true 269 return nil 270 } 271 272 func (w *writer) WriteFSTTerms(iow io.Writer) error { 273 if !w.postingsFileWritten { 274 return fmt.Errorf("postings offsets have to be written before fst terms can be written") 275 } 276 277 var ( 278 writeFieldsPostingList = w.version.supportsFieldPostingsList() 279 currentOffset = uint64(0) // track offset of writes into `iow`. 
280 ) 281 282 // retrieve all known fields 283 fields, err := w.builder.FieldsPostingsList() 284 if err != nil { 285 return err 286 } 287 288 // iterate term|field postings offsets 289 var ( 290 termOffsets = w.termPostingsOffsets 291 fieldOffsets = w.fieldPostingsOffsets 292 ) 293 294 // build a fst for each field's terms 295 for fields.Next() { 296 f, _ := fields.Current() 297 298 // write fields level postings list if required 299 if writeFieldsPostingList { 300 po := fieldOffsets[0] 301 fieldOffsets = fieldOffsets[1:] 302 md, err := w.fieldsMetadata(po) 303 if err != nil { 304 return err 305 } 306 if _, err := iow.Write(md); err != nil { 307 return err 308 } 309 numBytesMD := uint64(len(md)) 310 numBytesMDSize, err := w.writeUint64(iow, numBytesMD) 311 if err != nil { 312 return err 313 } 314 currentOffset += numBytesMD + numBytesMDSize 315 } 316 317 // reset writer for this field's fst 318 if err := w.fstWriter.Reset(iow); err != nil { 319 return err 320 } 321 322 // retrieve all terms for this field 323 terms, err := w.builder.Terms(f) 324 if err != nil { 325 return err 326 } 327 328 // for each term corresponding to this field 329 for terms.Next() { 330 t, _ := terms.Current() 331 332 // retieve postsings offset for the current field,term 333 if len(termOffsets) == 0 { 334 return fmt.Errorf("postings offset not found for: field=%s, term=%s", f, t) 335 } 336 337 po := termOffsets[0] 338 termOffsets = termOffsets[1:] 339 340 // add the term -> posting offset into the term's fst 341 if err := w.fstWriter.Add(t, po); err != nil { 342 return err 343 } 344 } 345 if err := terms.Err(); err != nil { 346 return err 347 } 348 349 if err := terms.Close(); err != nil { 350 return err 351 } 352 353 // retrieve a serialized representation of the field's fst 354 numBytesFST, err := w.fstWriter.Close() 355 if err != nil { 356 return err 357 } 358 359 // serialize the size of the fst 360 n, err := w.writeSizeAndMagicNumber(iow, numBytesFST) 361 if err != nil { 362 return err 
363 } 364 // update offset with the number of bytes we've written 365 currentOffset += numBytesFST + n 366 367 // track current offset as the offset for the current field's fst 368 w.fstTermsOffsets = append(w.fstTermsOffsets, currentOffset) 369 } 370 371 if err := fields.Err(); err != nil { 372 return err 373 } 374 375 if err := fields.Close(); err != nil { 376 return err 377 } 378 379 // make sure we consumed all the postings offsets 380 if len(termOffsets) != 0 { 381 return fmt.Errorf("term postings offsets remain at end of terms: remaining=%d", 382 len(termOffsets)) 383 } 384 385 // make sure we consumed all the postings offsets 386 if len(fieldOffsets) != 0 { 387 return fmt.Errorf("field postings offsets remain at end of terms: remaining=%d", 388 len(fieldOffsets)) 389 } 390 391 // all good! 392 w.fstTermsFileWritten = true 393 return nil 394 } 395 396 func (w *writer) fieldsMetadata(fieldPostingsOffset uint64) ([]byte, error) { 397 w.fieldBuffer.Reset() 398 w.fieldData.FieldPostingsListOffset = fieldPostingsOffset 399 if err := w.fieldBuffer.Marshal(w.fieldData); err != nil { 400 return nil, err 401 } 402 return w.fieldBuffer.Bytes(), nil 403 } 404 405 func (w *writer) WriteFSTFields(iow io.Writer) error { 406 if !w.fstTermsFileWritten { 407 return fmt.Errorf("fst terms files have to be written before fst fields can be written") 408 } 409 410 // reset fst writer 411 if err := w.fstWriter.Reset(iow); err != nil { 412 return err 413 } 414 415 // iterate field offsets 416 offsets := w.fstTermsOffsets 417 418 // retrieve all known fields 419 fields, err := w.builder.FieldsPostingsList() 420 if err != nil { 421 return err 422 } 423 424 // insert each field into fst 425 for fields.Next() { 426 f, _ := fields.Current() 427 428 // get offset for this field's term fst 429 if len(offsets) == 0 { 430 return fmt.Errorf("fst field offset not found for: field=%s", f) 431 } 432 433 offset := offsets[0] 434 offsets = offsets[1:] 435 436 // add field, offset into fst 437 if 
err := w.fstWriter.Add(f, offset); err != nil { 438 return err 439 } 440 } 441 442 if err := fields.Err(); err != nil { 443 return err 444 } 445 446 if err := fields.Close(); err != nil { 447 return err 448 } 449 450 // flush the fst writer 451 _, err = w.fstWriter.Close() 452 453 // make sure we consumed all the postings offsets 454 if len(offsets) != 0 { 455 return fmt.Errorf("field offsets remain at end of fields: remaining=%d", 456 len(offsets)) 457 } 458 459 return err 460 } 461 462 // given a payload []byte, and io.Writer; this method writes the following data out to the writer 463 // | payload - len(payload) bytes | 8 bytes for uint64 (size of payload) | 8 bytes for `magicNumber` | 464 func (w *writer) writePayloadAndSizeAndMagicNumber(iow io.Writer, payload []byte) (uint64, error) { 465 numBytesWritten := uint64(0) 466 size, err := iow.Write(payload) 467 if err != nil { 468 return 0, err 469 } 470 numBytesWritten += uint64(size) 471 n, err := w.writeSizeAndMagicNumber(iow, uint64(size)) 472 if err != nil { 473 return 0, err 474 } 475 numBytesWritten += n 476 return numBytesWritten, nil 477 } 478 479 func (w *writer) writeUint64(iow io.Writer, x uint64) (uint64, error) { 480 // serialize the size, magicNumber 481 w.intEncoder.Reset() 482 w.intEncoder.PutUint64(x) 483 xBytes := w.intEncoder.Bytes() 484 485 // write out the size 486 n, err := iow.Write(xBytes) 487 if err != nil { 488 return 0, err 489 } 490 return uint64(n), nil 491 } 492 493 func (w *writer) writeSizeAndMagicNumber(iow io.Writer, size uint64) (uint64, error) { 494 // serialize the size, magicNumber 495 w.intEncoder.Reset() 496 w.intEncoder.PutUint64(size) 497 w.intEncoder.PutUint64(uint64(magicNumber)) 498 sizeBytes := w.intEncoder.Bytes() 499 500 // write out the size 501 n, err := iow.Write(sizeBytes) 502 if err != nil { 503 return 0, err 504 } 505 return uint64(n), nil 506 } 507 508 func defaultV1Metadata() fswriter.Metadata { 509 return fswriter.Metadata{ 510 PostingsFormat: 
fswriter.PostingsFormat_PILOSAV1_POSTINGS_FORMAT, 511 } 512 } 513 514 type docOffset struct { 515 postings.ID 516 offset uint64 517 }