github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/termsReader.go

     1  package blocktree
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/codec"
     7  	. "github.com/balzaczyy/golucene/core/codec/spi"
     8  	. "github.com/balzaczyy/golucene/core/index/model"
     9  	"github.com/balzaczyy/golucene/core/store"
    10  	"github.com/balzaczyy/golucene/core/util"
    11  )
    12  
    13  // BlockTreeTermsReader.java
    14  
    15  const (
    16  	BTT_OUTPUT_FLAGS_NUM_BITS = 2
    17  	BTT_OUTPUT_FLAG_IS_FLOOR  = 1
    18  	BTT_OUTPUT_FLAG_HAS_TERMS = 2
    19  
    20  	// BTT_INDEX_EXTENSION           = "tip"
    21  	// BTT_INDEX_CODEC_NAME          = "BLOCK_TREE_TERMS_INDEX"
    22  	// BTT_INDEX_VERSION_START       = 0
    23  	// BTT_INDEX_VERSION_APPEND_ONLY = 1
    24  	// BTT_INDEX_VERSION_CURRENT     = BTT_INDEX_VERSION_APPEND_ONLY
    25  )
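
        // Sketch (not part of the original file): per upstream Lucene's BlockTree
        // format, the two flag bits above are packed into the low bits of each FST
        // output next to the block's on-disk file pointer. The helper below only
        // illustrates that decoding; its name is made up for this example and it is
        // not part of the reader's API.
        func exampleDecodeOutput(output int64) (fp int64, hasTerms, isFloor bool) {
        	fp = output >> BTT_OUTPUT_FLAGS_NUM_BITS         // drop the two flag bits
        	hasTerms = output&BTT_OUTPUT_FLAG_HAS_TERMS != 0 // block holds terms itself
        	isFloor = output&BTT_OUTPUT_FLAG_IS_FLOOR != 0   // block is a floor block
        	return fp, hasTerms, isFloor
        }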
    26  
    27  /* A block-based terms index and dictionary that assigns
    28  terms to variable-length blocks according to how they
    29  share prefixes. The terms index is a prefix trie
    30  whose leaves are term blocks. The advantage of this
    31  approach is that seekExact is often able to
    32  determine that a term cannot exist without doing any IO, and
    33  intersection with Automata is very fast. Note that this
    34  terms dictionary has its own fixed terms index (i.e., it
    35  does not support a pluggable terms index
    36  implementation).
    37  
    38  NOTE: this terms dictionary does not support
    39  index divisor when opening an IndexReader. Instead, you
    40  can change the min/maxItemsPerBlock during indexing.
    41  
    42  The data structure used by this implementation is very
    43  similar to a [burst trie]
    44  (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499),
    45  but with added logic to break up too-large blocks of all
    46  terms sharing a given prefix into smaller ones.
    47  
    48  Use CheckIndex with the -verbose
    49  option to see summary statistics on the blocks in the
    50  dictionary. */
    51  type BlockTreeTermsReader struct {
    52  	// Open input to the main terms dict file (_X.tim)
    53  	in store.IndexInput
    54  	// Reads the terms dict entries, to gather state to
    55  	// produce DocsEnum on demand
    56  	postingsReader PostingsReaderBase
    57  	fields         map[string]FieldReader
    58  	// File offset where the directory starts in the terms file.
    59  	dirOffset int64
    60  	// File offset where the directory starts in the index file.
    61  	indexDirOffset int64
    62  	segment        string
    63  	version        int
    64  }
    65  
    66  func NewBlockTreeTermsReader(dir store.Directory,
    67  	fieldInfos FieldInfos, info *SegmentInfo,
    68  	postingsReader PostingsReaderBase, ctx store.IOContext,
    69  	segmentSuffix string, indexDivisor int) (p FieldsProducer, err error) {
    70  
    71  	// log.Print("Initializing BlockTreeTermsReader...")
    72  	fp := &BlockTreeTermsReader{
    73  		postingsReader: postingsReader,
    74  		fields:         make(map[string]FieldReader),
    75  		segment:        info.Name,
    76  	}
    77  	fp.in, err = dir.OpenInput(util.SegmentFileName(info.Name, segmentSuffix, TERMS_EXTENSION), ctx)
    78  	if err != nil {
    79  		return nil, err
    80  	}
    81  
    82  	success := false
    83  	var indexIn store.IndexInput
    84  	defer func() {
    85  		if !success {
    86  			fmt.Println("Failed to initialize BlockTreeTermsReader.")
    87  			if err != nil {
    88  				fmt.Println("DEBUG ", err)
    89  			}
    90  			// closing fp will also close fp.in:
    91  			util.CloseWhileSuppressingError(indexIn, fp)
    92  		}
    93  	}()
    94  
    95  	fp.version, err = fp.readHeader(fp.in)
    96  	if err != nil {
    97  		return nil, err
    98  	}
    99  	// log.Printf("Version: %v", fp.version)
   100  
   101  	if indexDivisor != -1 {
   102  		filename := util.SegmentFileName(info.Name, segmentSuffix, TERMS_INDEX_EXTENSION)
   103  		indexIn, err = dir.OpenInput(filename, ctx)
   104  		if err != nil {
   105  			return nil, err
   106  		}
   107  
   108  		indexVersion, err := fp.readIndexHeader(indexIn)
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  		// log.Printf("Index version: %v", indexVersion)
   113  		if indexVersion != fp.version {
   114  			return nil, errors.New(fmt.Sprintf("mismatched version files: %v=%v, %v=%v", fp.in, fp.version, indexIn, indexVersion))
   115  		}
   116  	}
   117  
   118  	// verify
   119  	if indexIn != nil && fp.version >= TERMS_VERSION_CURRENT {
   120  		if _, err = store.ChecksumEntireFile(indexIn); err != nil {
   121  			return nil, err
   122  		}
   123  	}
   124  
   125  	// Have PostingsReader init itself
   126  	postingsReader.Init(fp.in)
   127  
   128  	if fp.version >= TERMS_VERSION_CHECKSUM {
   129  		// NOTE: data file is too costly to verify checksum against all the
   130  		// bytes on open, but for now we at least verify proper structure
   131  		// of the checksum footer: which looks for FOOTER_MAGIC +
   132  		// algorithmID. This is cheap and can detect some forms of
   133  		// corruption such as file truncation.
   134  		if _, err = codec.RetrieveChecksum(fp.in); err != nil {
   135  			return nil, err
   136  		}
   137  	}
   138  
   139  	// Read per-field details
   140  	fp.seekDir(fp.in, fp.dirOffset)
   141  	if indexDivisor != -1 {
   142  		fp.seekDir(indexIn, fp.indexDirOffset)
   143  	}
   144  
   145  	numFields, err := fp.in.ReadVInt()
   146  	if err != nil {
   147  		return nil, err
   148  	}
   149  	// log.Printf("Fields number: %v", numFields)
   150  	if numFields < 0 {
   151  		return nil, errors.New(fmt.Sprintf("invalid numFields: %v (resource=%v)", numFields, fp.in))
   152  	}
   153  
   154  	for i := int32(0); i < numFields; i++ {
   155  		// log.Printf("Next field...")
   156  		field, err := fp.in.ReadVInt()
   157  		if err != nil {
   158  			return nil, err
   159  		}
   160  		// log.Printf("Field: %v", field)
   161  
   162  		numTerms, err := fp.in.ReadVLong()
   163  		if err != nil {
   164  			return nil, err
   165  		}
   166  		assert2(numTerms > 0,
   167  			"Illegal numTerms for field number: %v (resource=%v)", field, fp.in)
   168  		// log.Printf("Terms number: %v", numTerms)
   169  
   170  		numBytes, err := fp.in.ReadVInt()
   171  		if err != nil {
   172  			return nil, err
   173  		}
   174  		assert2(numBytes >= 0,
   175  			"invalid rootCode for field number: %v, numBytes=%v (resource=%v)",
   176  			field, numBytes, fp.in)
   177  		// log.Printf("Bytes number: %v", numBytes)
   178  
   179  		rootCode := make([]byte, numBytes)
   180  		err = fp.in.ReadBytes(rootCode)
   181  		if err != nil {
   182  			return nil, err
   183  		}
   184  		fieldInfo := fieldInfos.FieldInfoByNumber(int(field))
   185  		assert2(fieldInfo != nil, "invalid field number: %v (resource=%v)", field, fp.in)
   186  		var sumTotalTermFreq int64
   187  		if fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY {
   188  			sumTotalTermFreq = -1
   189  		} else {
   190  			sumTotalTermFreq, err = fp.in.ReadVLong()
   191  			if err != nil {
   192  				return nil, err
   193  			}
   194  		}
   195  		sumDocFreq, err := fp.in.ReadVLong()
   196  		if err != nil {
   197  			return nil, err
   198  		}
   199  		var docCount int
   200  		if docCount, err = asInt(fp.in.ReadVInt()); err != nil {
   201  			return nil, err
   202  		}
   203  		// fmt.Printf("DocCount: %v\n", docCount)
   204  		var longsSize int
   205  		if fp.version >= TERMS_VERSION_META_ARRAY {
   206  			if longsSize, err = asInt(fp.in.ReadVInt()); err != nil {
   207  				return nil, err
   208  			}
   209  		}
   210  		assert2(longsSize >= 0,
   211  			"invalid longsSize for field: %v, longsSize=%v (resource=%v)",
   212  			fieldInfo.Name, longsSize, fp.in)
   213  		var minTerm, maxTerm []byte
   214  		if fp.version >= TERMS_VERSION_MIN_MAX_TERMS {
   215  			if minTerm, err = readBytesRef(fp.in); err != nil {
   216  				return nil, err
   217  			}
   218  			if maxTerm, err = readBytesRef(fp.in); err != nil {
   219  				return nil, err
   220  			}
   221  		}
   222  		if docCount < 0 || docCount > info.DocCount() { // #docs with field must be <= #docs
   223  			return nil, errors.New(fmt.Sprintf(
   224  				"invalid docCount: %v maxDoc: %v (resource=%v)",
   225  				docCount, info.DocCount(), fp.in))
   226  		}
   227  		if sumDocFreq < int64(docCount) { // #postings must be >= #docs with field
   228  			return nil, errors.New(fmt.Sprintf(
   229  				"invalid sumDocFreq: %v docCount: %v (resource=%v)",
   230  				sumDocFreq, docCount, fp.in))
   231  		}
   232  		if sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq { // #positions must be >= #postings
   233  			return nil, errors.New(fmt.Sprintf(
   234  				"invalid sumTotalTermFreq: %v sumDocFreq: %v (resource=%v)",
   235  				sumTotalTermFreq, sumDocFreq, fp.in))
   236  		}
   237  
   238  		var indexStartFP int64
   239  		if indexDivisor != -1 {
   240  			if indexStartFP, err = indexIn.ReadVLong(); err != nil {
   241  				return nil, err
   242  			}
   243  		}
   244  		// log.Printf("indexStartFP: %v", indexStartFP)
   245  		if _, ok := fp.fields[fieldInfo.Name]; ok {
   246  			return nil, errors.New(fmt.Sprintf(
   247  				"duplicate field: %v (resource=%v)", fieldInfo.Name, fp.in))
   248  		}
   249  		if fp.fields[fieldInfo.Name], err = newFieldReader(fp,
   250  			fieldInfo, numTerms, rootCode, sumTotalTermFreq,
   251  			sumDocFreq, docCount, indexStartFP, longsSize,
   252  			indexIn, minTerm, maxTerm); err != nil {
   253  			return nil, err
   254  		}
   255  	}
   256  
   257  	if indexDivisor != -1 {
   258  		if err = indexIn.Close(); err != nil {
   259  			return nil, err
   260  		}
   261  	}
   262  
   263  	success = true
   264  
   265  	return fp, nil
   266  }
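
        // Illustrative only: a hedged sketch of how segment-opening code might call
        // the constructor above. The function name is invented for this example, and
        // the Directory, FieldInfos, SegmentInfo, PostingsReaderBase and IOContext
        // arguments are assumed to come from the caller's existing segment-reading
        // code. Passing -1 as indexDivisor would skip loading the terms index file.
        func exampleOpenTermsDict(dir store.Directory, fieldInfos FieldInfos,
        	info *SegmentInfo, postings PostingsReaderBase,
        	ctx store.IOContext) (FieldsProducer, error) {
        	// Empty segment suffix; indexDivisor 1 loads the full terms index.
        	return NewBlockTreeTermsReader(dir, fieldInfos, info, postings, ctx, "", 1)
        }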
   267  
   268  func asInt(n int32, err error) (int, error) {
   269  	return int(n), err
   270  }
   271  
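        // readBytesRef reads a length-prefixed byte slice: a VInt length followed by
        // that many raw bytes. The field summary's minTerm/maxTerm values are stored
        // in this form.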
   272  func readBytesRef(in store.IndexInput) ([]byte, error) {
   273  	length, err := asInt(in.ReadVInt())
   274  	if err != nil {
   275  		return nil, err
   276  	}
   277  	bytes := make([]byte, length)
   278  	if err = in.ReadBytes(bytes); err != nil {
   279  		return nil, err
   280  	}
   281  	return bytes, nil
   282  }
   283  
   284  func (r *BlockTreeTermsReader) readHeader(input store.IndexInput) (version int, err error) {
   285  	version, err = asInt(codec.CheckHeader(input, TERMS_CODEC_NAME, TERMS_VERSION_START, TERMS_VERSION_CURRENT))
   286  	if err != nil {
   287  		return version, err
   288  	}
   289  	if version < TERMS_VERSION_APPEND_ONLY {
   290  		r.dirOffset, err = input.ReadLong()
   291  		if err != nil {
   292  			return version, err
   293  		}
   294  	}
   295  	return version, nil
   296  }
   297  
   298  func (r *BlockTreeTermsReader) readIndexHeader(input store.IndexInput) (version int, err error) {
   299  	version, err = asInt(codec.CheckHeader(input, TERMS_INDEX_CODEC_NAME, TERMS_VERSION_START, TERMS_VERSION_CURRENT))
   300  	if err != nil {
   301  		return version, err
   302  	}
   303  	if version < TERMS_VERSION_APPEND_ONLY {
   304  		r.indexDirOffset, err = input.ReadLong()
   305  		if err != nil {
   306  			return version, err
   307  		}
   308  	}
   309  	return version, nil
   310  }
   311  
   312  func (r *BlockTreeTermsReader) seekDir(input store.IndexInput, dirOffset int64) (err error) {
   313  	// log.Printf("Seeking to: %v", dirOffset)
   314  	if r.version >= TERMS_VERSION_CHECKSUM {
   315  		if err = input.Seek(input.Length() - codec.FOOTER_LENGTH - 8); err != nil {
   316  			return
   317  		}
   318  		if dirOffset, err = input.ReadLong(); err != nil {
   319  			return
   320  		}
   321  	} else if r.version >= TERMS_VERSION_APPEND_ONLY {
   322  		if err = input.Seek(input.Length() - 8); err != nil {
   323  			return
   324  		}
   325  		if dirOffset, err = input.ReadLong(); err != nil {
   326  			return
   327  		}
   328  	}
   329  	return input.Seek(dirOffset)
   330  }
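
        // A sketch inferred from seekDir above (not part of the original file): in
        // checksum-era files the directory offset is an 8-byte long written just
        // before the codec footer, append-only files keep it as the last 8 bytes of
        // the file, and still older files record it in the header instead.
        //
        //	... per-field summaries ... | dirOffset (8 bytes) | footer (magic + algorithm ID + checksum)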
   331  
   332  func (r *BlockTreeTermsReader) Terms(field string) Terms {
        	// Return nil for a field that has no entry in the terms dict,
        	// rather than a pointer to an empty FieldReader.
        	ans, ok := r.fields[field]
        	if !ok {
        		return nil
        	}
        	return &ans
   335  }
   336  
   337  func (r *BlockTreeTermsReader) Close() error {
   338  	defer func() {
   339  		// Clear so refs to terms index are GCable even if
   340  		// app hangs onto us:
   341  		r.fields = make(map[string]FieldReader)
   342  	}()
   343  	return util.Close(r.in, r.postingsReader)
   344  }