github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/termsReader.go

package blocktree

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
)

// BlockTreeTermsReader.java

const (
	BTT_OUTPUT_FLAGS_NUM_BITS = 2
	BTT_OUTPUT_FLAG_IS_FLOOR  = 1
	BTT_OUTPUT_FLAG_HAS_TERMS = 2

	// BTT_INDEX_EXTENSION = "tip"
	// BTT_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX"
	// BTT_INDEX_VERSION_START = 0
	// BTT_INDEX_VERSION_APPEND_ONLY = 1
	// BTT_INDEX_VERSION_CURRENT = BTT_INDEX_VERSION_APPEND_ONLY
)

/* A block-based terms index and dictionary that assigns
terms to variable-length blocks according to how they
share prefixes. The terms index is a prefix trie
whose leaves are term blocks. The advantage of this
approach is that seekExact is often able to
determine that a term cannot exist without doing any IO, and
intersection with Automata is very fast. Note that this
terms dictionary has its own fixed terms index (i.e., it
does not support a pluggable terms index
implementation).

NOTE: this terms dictionary does not support
index divisor when opening an IndexReader. Instead, you
can change the min/maxItemsPerBlock during indexing.

The data structure used by this implementation is very
similar to a [burst trie]
(http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
but with added logic to break up too-large blocks of all
terms sharing a given prefix into smaller ones.

Use CheckIndex with the -verbose
option to see summary statistics on the blocks in the
dictionary. */
type BlockTreeTermsReader struct {
	// Open input to the main terms dict file (_X.tib)
	in store.IndexInput
	// Reads the terms dict entries, to gather state to
	// produce DocsEnum on demand
	postingsReader PostingsReaderBase
	fields         map[string]FieldReader
	// File offset where the directory starts in the terms file.
	dirOffset int64
	// File offset where the directory starts in the index file.
	indexDirOffset int64
	segment        string
	version        int
}
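
// Hypothetical usage sketch (not part of the original file; the function name
// and argument values are illustrative only): a caller that already holds an
// open Directory, the segment's FieldInfos/SegmentInfo, a PostingsReaderBase
// and an IOContext obtains a FieldsProducer through the constructor below.
// Per that constructor, an indexDivisor of -1 skips opening the terms index
// file; the segmentSuffix and indexDivisor used here are placeholders.
func openBlockTreeTermsSketch(dir store.Directory, infos FieldInfos,
	si *SegmentInfo, pr PostingsReaderBase,
	ctx store.IOContext) (FieldsProducer, error) {
	return NewBlockTreeTermsReader(dir, infos, si, pr, ctx, "", 1)
}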

func NewBlockTreeTermsReader(dir store.Directory,
	fieldInfos FieldInfos, info *SegmentInfo,
	postingsReader PostingsReaderBase, ctx store.IOContext,
	segmentSuffix string, indexDivisor int) (p FieldsProducer, err error) {

	// log.Print("Initializing BlockTreeTermsReader...")
	fp := &BlockTreeTermsReader{
		postingsReader: postingsReader,
		fields:         make(map[string]FieldReader),
		segment:        info.Name,
	}
	fp.in, err = dir.OpenInput(util.SegmentFileName(info.Name, segmentSuffix, TERMS_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}

	success := false
	var indexIn store.IndexInput
	defer func() {
		if !success {
			fmt.Println("Failed to initialize BlockTreeTermsReader.")
			if err != nil {
				fmt.Println("DEBUG ", err)
			}
			// this.close() will close in:
			util.CloseWhileSuppressingError(indexIn, fp)
		}
	}()

	fp.version, err = fp.readHeader(fp.in)
	if err != nil {
		return nil, err
	}
	// log.Printf("Version: %v", fp.version)

	if indexDivisor != -1 {
		filename := util.SegmentFileName(info.Name, segmentSuffix, TERMS_INDEX_EXTENSION)
		indexIn, err = dir.OpenInput(filename, ctx)
		if err != nil {
			return nil, err
		}

		indexVersion, err := fp.readIndexHeader(indexIn)
		if err != nil {
			return nil, err
		}
		// log.Printf("Index version: %v", indexVersion)
		if indexVersion != fp.version {
			return nil, fmt.Errorf("mismatched version files: %v=%v,%v=%v", fp.in, fp.version, indexIn, indexVersion)
		}
	}

	// verify
	if indexIn != nil && fp.version >= TERMS_VERSION_CURRENT {
		if _, err = store.ChecksumEntireFile(indexIn); err != nil {
			return nil, err
		}
	}

	// Have PostingsReader init itself
	postingsReader.Init(fp.in)

	if fp.version >= TERMS_VERSION_CHECKSUM {
		// NOTE: the data file is too costly to verify checksum against all the
		// bytes on open, but for now we at least verify proper structure
		// of the checksum footer, which looks for FOOTER_MAGIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file truncation.
		if _, err = codec.RetrieveChecksum(fp.in); err != nil {
			return nil, err
		}
	}

	// Read per-field details
	if err = fp.seekDir(fp.in, fp.dirOffset); err != nil {
		return nil, err
	}
	if indexDivisor != -1 {
		if err = fp.seekDir(indexIn, fp.indexDirOffset); err != nil {
			return nil, err
		}
	}

	numFields, err := fp.in.ReadVInt()
	if err != nil {
		return nil, err
	}
	// log.Printf("Fields number: %v", numFields)
	if numFields < 0 {
		return nil, fmt.Errorf("invalid numFields: %v (resource=%v)", numFields, fp.in)
	}

	for i := int32(0); i < numFields; i++ {
		// log.Printf("Next field...")
		field, err := fp.in.ReadVInt()
		if err != nil {
			return nil, err
		}
		// log.Printf("Field: %v", field)

		numTerms, err := fp.in.ReadVLong()
		if err != nil {
			return nil, err
		}
		assert2(numTerms > 0,
			"Illegal numTerms for field number: %v (resource=%v)", field, fp.in)
		// log.Printf("Terms number: %v", numTerms)

		numBytes, err := fp.in.ReadVInt()
		if err != nil {
			return nil, err
		}
		assert2(numBytes >= 0,
			"invalid rootCode for field number: %v, numBytes=%v (resource=%v)",
			field, numBytes, fp.in)
		// log.Printf("Bytes number: %v", numBytes)

		rootCode := make([]byte, numBytes)
		err = fp.in.ReadBytes(rootCode)
		if err != nil {
			return nil, err
		}
		fieldInfo := fieldInfos.FieldInfoByNumber(int(field))
		assert2(fieldInfo != nil, "invalid field number: %v (resource=%v)", field, fp.in)
		var sumTotalTermFreq int64
		if fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY {
			sumTotalTermFreq = -1
		} else {
			sumTotalTermFreq, err = fp.in.ReadVLong()
			if err != nil {
				return nil, err
			}
		}
		sumDocFreq, err := fp.in.ReadVLong()
		if err != nil {
			return nil, err
		}
		var docCount int
		if docCount, err = asInt(fp.in.ReadVInt()); err != nil {
			return nil, err
		}
		// fmt.Printf("DocCount: %v\n", docCount)
		var longsSize int
		if fp.version >= TERMS_VERSION_META_ARRAY {
			if longsSize, err = asInt(fp.in.ReadVInt()); err != nil {
				return nil, err
			}
		}
		assert2(longsSize >= 0,
			"invalid longsSize for field: %v, longsSize=%v (resource=%v)",
			fieldInfo.Name, longsSize, fp.in)
		var minTerm, maxTerm []byte
		if fp.version >= TERMS_VERSION_MIN_MAX_TERMS {
			if minTerm, err = readBytesRef(fp.in); err != nil {
				return nil, err
			}
			if maxTerm, err = readBytesRef(fp.in); err != nil {
				return nil, err
			}
		}
		if docCount < 0 || docCount > info.DocCount() { // #docs with field must be <= #docs
			return nil, fmt.Errorf(
				"invalid docCount: %v maxDoc: %v (resource=%v)",
				docCount, info.DocCount(), fp.in)
		}
		if sumDocFreq < int64(docCount) { // #postings must be >= #docs with field
			return nil, fmt.Errorf(
				"invalid sumDocFreq: %v docCount: %v (resource=%v)",
				sumDocFreq, docCount, fp.in)
		}
		if sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq { // #positions must be >= #postings
			return nil, fmt.Errorf(
				"invalid sumTotalTermFreq: %v sumDocFreq: %v (resource=%v)",
				sumTotalTermFreq, sumDocFreq, fp.in)
		}

		var indexStartFP int64
		if indexDivisor != -1 {
			if indexStartFP, err = indexIn.ReadVLong(); err != nil {
				return nil, err
			}
		}
		// log.Printf("indexStartFP: %v", indexStartFP)
		if _, ok := fp.fields[fieldInfo.Name]; ok {
			return nil, fmt.Errorf(
				"duplicate field: %v (resource=%v)", fieldInfo.Name, fp.in)
		}
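		// Added note: the checks above enforce a consistent per-field summary:
		// 0 <= docCount <= maxDoc, docCount <= sumDocFreq, and (when term
		// frequencies are indexed) sumDocFreq <= sumTotalTermFreq. For example,
		// a field present in 3 docs with 5 postings and 9 total occurrences
		// satisfies 3 <= 5 <= 9.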
		if fp.fields[fieldInfo.Name], err = newFieldReader(fp,
			fieldInfo, numTerms, rootCode, sumTotalTermFreq,
			sumDocFreq, docCount, indexStartFP, longsSize,
			indexIn, minTerm, maxTerm); err != nil {
			return nil, err
		}
	}

	if indexDivisor != -1 {
		if err = indexIn.Close(); err != nil {
			return nil, err
		}
	}

	success = true

	return fp, nil
}

func asInt(n int32, err error) (n2 int, err2 error) {
	return int(n), err
}

func readBytesRef(in store.IndexInput) ([]byte, error) {
	length, err := asInt(in.ReadVInt())
	if err != nil {
		return nil, err
	}
	bytes := make([]byte, length)
	if err = in.ReadBytes(bytes); err != nil {
		return nil, err
	}
	return bytes, nil
}

func (r *BlockTreeTermsReader) readHeader(input store.IndexInput) (version int, err error) {
	version, err = asInt(codec.CheckHeader(input, TERMS_CODEC_NAME, TERMS_VERSION_START, TERMS_VERSION_CURRENT))
	if err != nil {
		return version, err
	}
	if version < TERMS_VERSION_APPEND_ONLY {
		r.dirOffset, err = input.ReadLong()
		if err != nil {
			return version, err
		}
	}
	return version, nil
}

func (r *BlockTreeTermsReader) readIndexHeader(input store.IndexInput) (version int, err error) {
	version, err = asInt(codec.CheckHeader(input, TERMS_INDEX_CODEC_NAME, TERMS_VERSION_START, TERMS_VERSION_CURRENT))
	if err != nil {
		return version, err
	}
	if version < TERMS_VERSION_APPEND_ONLY {
		r.indexDirOffset, err = input.ReadLong()
		if err != nil {
			return version, err
		}
	}
	return version, nil
}

func (r *BlockTreeTermsReader) seekDir(input store.IndexInput, dirOffset int64) (err error) {
	// log.Printf("Seeking to: %v", dirOffset)
	if r.version >= TERMS_VERSION_CHECKSUM {
		if err = input.Seek(input.Length() - codec.FOOTER_LENGTH - 8); err != nil {
			return
		}
		if dirOffset, err = input.ReadLong(); err != nil {
			return
		}
	} else if r.version >= TERMS_VERSION_APPEND_ONLY {
		if err = input.Seek(input.Length() - 8); err != nil {
			return
		}
		if dirOffset, err = input.ReadLong(); err != nil {
			return
		}
	}
	return input.Seek(dirOffset)
}

func (r *BlockTreeTermsReader) Terms(field string) Terms {
	ans := r.fields[field]
	return &ans
}

func (r *BlockTreeTermsReader) Close() error {
	defer func() {
		// Clear so refs to the terms index are GC-able even if
		// the app hangs onto us:
		r.fields = make(map[string]FieldReader)
	}()
	return util.Close(r.in, r.postingsReader)
}
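
// Hypothetical usage sketch (not part of the original file; the function name
// is illustrative only): once constructed, per-field term dictionaries are
// looked up by name via Terms, and the reader is closed when done. Close also
// clears the fields map so the terms index can be collected even if the
// caller keeps a reference to the reader.
func readerUsageSketch(r *BlockTreeTermsReader, field string) error {
	_ = r.Terms(field) // per-field Terms view for the caller to enumerate
	return r.Close()
}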