// github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/search/search.go

package search

import (
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	"github.com/balzaczyy/golucene/core/index"
	"github.com/balzaczyy/golucene/core/util"
	"log"
	"math"
)

/* Defines the search services that can be overridden. */
type IndexSearcherSPI interface {
	CreateNormalizedWeight(Query) (Weight, error)
	Rewrite(Query) (Query, error)
	WrapFilter(Query, Filter) Query
	SearchLWC([]*index.AtomicReaderContext, Weight, Collector) error
}

// IndexSearcher
type IndexSearcher struct {
	spi           IndexSearcherSPI
	reader        index.IndexReader
	readerContext index.IndexReaderContext
	leafContexts  []*index.AtomicReaderContext
	similarity    Similarity
}

func NewIndexSearcher(r index.IndexReader) *IndexSearcher {
	// log.Print("Initializing IndexSearcher from IndexReader: ", r)
	return NewIndexSearcherFromContext(r.Context())
}

func NewIndexSearcherFromContext(context index.IndexReaderContext) *IndexSearcher {
	// assert2(context.isTopLevel, "IndexSearcher's ReaderContext must be topLevel for reader %v", context.reader())
	defaultSimilarity := NewDefaultSimilarity()
	ss := &IndexSearcher{nil, context.Reader(), context, context.Leaves(), defaultSimilarity}
	ss.spi = ss
	return ss
}
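
// The spi field pointing back at the searcher itself emulates virtual
// dispatch: a wrapping type can substitute its own implementation of any
// IndexSearcherSPI method and the core search loop will call it. A minimal
// sketch (hypothetical wrapper; spi is unexported, so it would have to live
// in this package):
//
//	type tracingSearcher struct{ *IndexSearcher }
//
//	func (ts *tracingSearcher) Rewrite(q Query) (Query, error) {
//		log.Printf("rewrite: %v", q)
//		return ts.IndexSearcher.Rewrite(q)
//	}
//
//	// ss := NewIndexSearcher(r); ss.spi = &tracingSearcher{ss}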

/* Expert: set the similarity implementation used by this IndexSearcher. */
func (ss *IndexSearcher) SetSimilarity(similarity Similarity) {
	ss.similarity = similarity
}

func (ss *IndexSearcher) SearchTop(q Query, n int) (topDocs TopDocs, err error) {
	return ss.Search(q, nil, n)
}

func (ss *IndexSearcher) Search(q Query, f Filter, n int) (topDocs TopDocs, err error) {
	w, err := ss.spi.CreateNormalizedWeight(ss.spi.WrapFilter(q, f))
	if err != nil {
		return TopDocs{}, err
	}
	return ss.searchWSI(w, nil, n)
}
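
// Typical usage, as a hedged sketch: assume an index.IndexReader r opened
// elsewhere and a Query q built by the caller; the TopDocs/ScoreDoc field
// names follow the usual conventions of this Lucene port.
//
//	searcher := NewIndexSearcher(r)
//	topDocs, err := searcher.SearchTop(q, 10) // top 10 hits, no filter
//	if err != nil {
//		log.Fatal(err)
//	}
//	for _, sd := range topDocs.ScoreDocs {
//		fmt.Printf("doc=%v score=%v\n", sd.Doc, sd.Score)
//	}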

/** Expert: Low-level search implementation.  Finds the top <code>n</code>
 * hits for the given weight, which already incorporates any filter.
 *
 * <p>Applications should usually call {@link IndexSearcher#search(Query,int)} or
 * {@link IndexSearcher#search(Query,Filter,int)} instead.
 * @throws BooleanQuery.TooManyClauses If a query would exceed
 *         {@link BooleanQuery#getMaxClauseCount()} clauses.
 */
func (ss *IndexSearcher) searchWSI(w Weight, after *ScoreDoc, nDocs int) (TopDocs, error) {
	// TODO support concurrent search
	return ss.searchLWSI(ss.leafContexts, w, after, nDocs)
}

/** Expert: Low-level search implementation.  Finds the top <code>n</code>
 * hits for <code>query</code>.
 *
 * <p>Applications should usually call {@link IndexSearcher#search(Query,int)} or
 * {@link IndexSearcher#search(Query,Filter,int)} instead.
 * @throws BooleanQuery.TooManyClauses If a query would exceed
 *         {@link BooleanQuery#getMaxClauseCount()} clauses.
 */
func (ss *IndexSearcher) searchLWSI(leaves []*index.AtomicReaderContext,
	w Weight, after *ScoreDoc, nDocs int) (TopDocs, error) {
	// single thread
	limit := ss.reader.MaxDoc()
	if limit == 0 {
		limit = 1
	}
	if nDocs > limit {
		nDocs = limit
	}
	collector := NewTopScoreDocCollector(nDocs, after, !w.IsScoresDocsOutOfOrder())
	if err := ss.spi.SearchLWC(leaves, w, collector); err != nil {
		return TopDocs{}, err
	}
	return collector.TopDocs(), nil
}

func (ss *IndexSearcher) SearchLWC(leaves []*index.AtomicReaderContext, w Weight, c Collector) error {
	// TODO: should we make this
	// threaded...?  the Collector could be sync'd?
	// always use single thread:
	for _, ctx := range leaves { // search each subreader
		// TODO catch CollectionTerminatedException
		c.SetNextReader(ctx)

		scorer, err := w.BulkScorer(ctx, !c.AcceptsDocsOutOfOrder(),
			ctx.Reader().(index.AtomicReader).LiveDocs())
		if err != nil {
			return err
		}
		if scorer != nil {
			// TODO catch CollectionTerminatedException
			if err = scorer.ScoreAndCollect(c); err != nil {
				return err
			}
		}
	}
	return nil
}

func (ss *IndexSearcher) WrapFilter(q Query, f Filter) Query {
	if f == nil {
		return q
	}
	panic("FilteredQuery not supported yet")
}
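
// Once FilteredQuery is ported, the non-nil branch would presumably return
// something like NewFilteredQuery(q, f) (a hypothetical constructor mirroring
// Lucene's FilteredQuery); until then only the nil-filter path works.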

/*
Returns an Explanation that describes how doc scored against query.

This is intended to be used in developing Similarity implementations, and, for
good performance, should not be displayed with every hit. Computing an
explanation is as expensive as executing the query over the entire index.
*/
func (ss *IndexSearcher) Explain(query Query, doc int) (exp Explanation, err error) {
	w, err := ss.spi.CreateNormalizedWeight(query)
	if err == nil {
		return ss.explain(w, doc)
	}
	return
}

/*
Expert: low-level implementation method
Returns an Explanation that describes how doc scored against weight.

This is intended to be used in developing Similarity implementations, and, for
good performance, should not be displayed with every hit. Computing an
explanation is as expensive as executing the query over the entire index.

Applications should call Explain(Query, int) instead.
*/
func (ss *IndexSearcher) explain(weight Weight, doc int) (exp Explanation, err error) {
	n := index.SubIndex(doc, ss.leafContexts)
	ctx := ss.leafContexts[n]
	deBasedDoc := doc - ctx.DocBase
	return weight.Explain(ctx, deBasedDoc)
}
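
// For example, in an index with two segments of 100 and 50 documents, global
// doc 120 falls in the second leaf (DocBase=100), so Weight.Explain receives
// the segment-local id 120-100 = 20.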

func (ss *IndexSearcher) CreateNormalizedWeight(q Query) (w Weight, err error) {
	q, err = ss.spi.Rewrite(q)
	if err != nil {
		return nil, err
	}
	log.Printf("After rewrite: %v", q)
	w, err = q.CreateWeight(ss)
	if err != nil {
		return nil, err
	}
	v := w.ValueForNormalization()
	norm := ss.similarity.QueryNorm(v)
	if math.IsInf(float64(norm), 1) || math.IsNaN(float64(norm)) {
		norm = 1.0
	}
	w.Normalize(norm, 1.0)
	return w, nil
}
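
// Normalization follows Lucene's two-pass scheme: the weight reports its raw
// value v (for the TF-IDF weights below, the squared query weight), then is
// scaled by queryNorm. A worked example under DefaultSimilarity: a one-term
// query with idf=2.0 and boost=1.0 yields v = 2.0*2.0 = 4.0, so
// QueryNorm(4.0) = 1/sqrt(4) = 0.5 and the normalized query weight becomes
// 2.0*0.5 = 1.0.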

func (ss *IndexSearcher) Rewrite(q Query) (Query, error) {
	log.Printf("Rewriting '%v'...", q)
	after := q.Rewrite(ss.reader)
	for after != q {
		q = after
		after = q.Rewrite(ss.reader)
	}
	return q, nil
}
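
// Rewriting runs to a fixed point: multi-term queries such as prefix or
// wildcard queries typically rewrite to a BooleanQuery over concrete terms,
// which then rewrites to itself and terminates the loop.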

// Returns this searcher's top-level IndexReaderContext.
func (ss *IndexSearcher) TopReaderContext() index.IndexReaderContext {
	return ss.readerContext
}

func (ss *IndexSearcher) String() string {
	return fmt.Sprintf("IndexSearcher(%v)", ss.reader)
}

func (ss *IndexSearcher) TermStatistics(term *index.Term, context *index.TermContext) TermStatistics {
	return NewTermStatistics(term.Bytes, int64(context.DocFreq), context.TotalTermFreq)
}

func (ss *IndexSearcher) CollectionStatistics(field string) CollectionStatistics {
	terms := index.GetMultiTerms(ss.reader, field)
	if terms == nil {
		return NewCollectionStatistics(field, int64(ss.reader.MaxDoc()), 0, 0, 0)
	}
	return NewCollectionStatistics(field, int64(ss.reader.MaxDoc()), int64(terms.DocCount()), terms.SumTotalTermFreq(), terms.SumDocFreq())
}
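
// A worked example of these statistics: for field "body" in a 3-doc index
// where doc1 is "a a b", doc2 is "b c", and doc3 has no body field:
// maxDoc=3, docCount=2 (docs with the field), sumDocFreq=4 (term-document
// postings a:1+b:2+c:1), and sumTotalTermFreq=5 (total occurrences 2+1+1+1).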

type TermStatistics struct {
	Term                   []byte
	DocFreq, TotalTermFreq int64
}

func NewTermStatistics(term []byte, docFreq, totalTermFreq int64) TermStatistics {
	// assert docFreq >= 0;
	// assert totalTermFreq == -1 || totalTermFreq >= docFreq; // #positions must be >= #postings
	return TermStatistics{term, docFreq, totalTermFreq}
}

type CollectionStatistics struct {
	field                                          string
	maxDoc, docCount, sumTotalTermFreq, sumDocFreq int64
}

func NewCollectionStatistics(field string, maxDoc, docCount, sumTotalTermFreq, sumDocFreq int64) CollectionStatistics {
	// assert maxDoc >= 0;
	// assert docCount >= -1 && docCount <= maxDoc; // #docs with field must be <= #docs
	// assert sumDocFreq == -1 || sumDocFreq >= docCount; // #postings must be >= #docs with field
	// assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq; // #positions must be >= #postings
	return CollectionStatistics{field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq}
}

/**
 * API for scoring "sloppy" queries such as {@link TermQuery},
 * {@link SpanQuery}, and {@link PhraseQuery}.
 * <p>
 * Frequencies are floating-point values: an approximate
 * within-document frequency adjusted for "sloppiness" by
 * {@link SimScorer#computeSlopFactor(int)}.
 */
type SimScorer interface {
	/**
	 * Score a single document
	 * @param doc document id within the inverted index segment
	 * @param freq sloppy term frequency
	 * @return document's score
	 */
	Score(doc int, freq float32) float32
	// Explain the score for a single document
	explain(int, Explanation) Explanation
}

type SimWeight interface {
	ValueForNormalization() float32
	Normalize(norm float32, topLevelBoost float32)
}

// search/similarities/TFIDFSimilarity.java

type ITFIDFSimilarity interface {
	/** Computes a score factor based on a term or phrase's frequency in a
	 * document.  This value is multiplied by the {@link #idf(long, long)}
	 * factor for each term in the query and these products are then summed to
	 * form the initial score for a document.
	 *
	 * <p>Terms and phrases repeated in a document indicate the topic of the
	 * document, so implementations of this method usually return larger values
	 * when <code>freq</code> is large, and smaller values when <code>freq</code>
	 * is small.
	 *
	 * @param freq the frequency of a term within a document
	 * @return a score factor based on a term's within-document frequency
	 */
	tf(freq float32) float32
	/** Computes a score factor based on a term's document frequency (the number
	 * of documents which contain the term).  This value is multiplied by the
	 * {@link #tf(float)} factor for each term in the query and these products are
	 * then summed to form the initial score for a document.
	 *
	 * <p>Terms that occur in fewer documents are better indicators of topic, so
	 * implementations of this method usually return larger values for rare terms,
	 * and smaller values for common terms.
	 *
	 * @param docFreq the number of documents which contain the term
	 * @param numDocs the total number of documents in the collection
	 * @return a score factor based on the term's document frequency
	 */
	idf(docFreq int64, numDocs int64) float32
	// Compute an index-time normalization value for this field instance.
	//
	// This value will be stored in a single byte lossy representation
	// by encodeNormValue().
	lengthNorm(*index.FieldInvertState) float32
	// Decodes a normalization factor stored in an index.
	decodeNormValue(norm int64) float32
	// Encodes a normalization factor for storage in an index.
	encodeNormValue(float32) int64
}
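
// For the DefaultSimilarity implementation below, tf(freq) = sqrt(freq) and
// idf(docFreq, numDocs) = ln(numDocs/(docFreq+1)) + 1. E.g. in a 1000-doc
// collection, a term that appears in 9 documents gets idf = ln(1000/10)+1
// ≈ 5.61, and a within-document frequency of 4 gets tf = 2.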

type TFIDFSimilarity struct {
	spi ITFIDFSimilarity
}

func newTFIDFSimilarity(spi ITFIDFSimilarity) *TFIDFSimilarity {
	return &TFIDFSimilarity{spi}
}

func (ts *TFIDFSimilarity) idfExplainTerm(collectionStats CollectionStatistics, termStats TermStatistics) Explanation {
	df, max := termStats.DocFreq, collectionStats.maxDoc
	idf := ts.spi.idf(df, max)
	return newExplanation(idf, fmt.Sprintf("idf(docFreq=%v, maxDocs=%v)", df, max))
}

func (ts *TFIDFSimilarity) idfExplainPhrase(collectionStats CollectionStatistics, termStats []TermStatistics) Explanation {
	details := make([]Explanation, len(termStats))
	var idf float32 = 0
	for i, stat := range termStats {
		details[i] = ts.idfExplainTerm(collectionStats, stat)
		idf += details[i].(*ExplanationImpl).value
	}
	ans := newExplanation(idf, "idf(), sum of:")
	ans.details = details
	return ans
}

func (ts *TFIDFSimilarity) ComputeNorm(state *index.FieldInvertState) int64 {
	return ts.spi.encodeNormValue(ts.spi.lengthNorm(state))
}

func (ts *TFIDFSimilarity) computeWeight(queryBoost float32, collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight {
	var idf Explanation
	if len(termStats) == 1 {
		idf = ts.idfExplainTerm(collectionStats, termStats[0])
	} else {
		idf = ts.idfExplainPhrase(collectionStats, termStats)
	}
	return newIDFStats(collectionStats.field, idf, queryBoost)
}

func (ts *TFIDFSimilarity) simScorer(stats SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error) {
	idfstats := stats.(*idfStats)
	ndv, err := ctx.Reader().(index.AtomicReader).NormValues(idfstats.field)
	if err != nil {
		return nil, err
	}
	return newTFIDFSimScorer(ts, idfstats, ndv), nil
}

type tfIDFSimScorer struct {
	owner       *TFIDFSimilarity
	stats       *idfStats
	weightValue float32
	norms       NumericDocValues
}

func newTFIDFSimScorer(owner *TFIDFSimilarity, stats *idfStats, norms NumericDocValues) *tfIDFSimScorer {
	return &tfIDFSimScorer{owner, stats, stats.value, norms}
}

func (ss *tfIDFSimScorer) Score(doc int, freq float32) float32 {
	raw := ss.owner.spi.tf(freq) * ss.weightValue // compute tf(f)*weight
	if ss.norms == nil {
		return raw
	}
	return raw * ss.owner.spi.decodeNormValue(ss.norms(doc)) // normalize for field
}
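
// E.g. under DefaultSimilarity, with weightValue=1.0, freq=4 and a decoded
// field norm of 0.5, Score returns sqrt(4)*1.0*0.5 = 1.0.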

func (ss *tfIDFSimScorer) explain(doc int, freq Explanation) Explanation {
	return ss.owner.explainScore(doc, freq, ss.stats, ss.norms)
}

/** Collection statistics for the TF-IDF model. The only statistic of interest
 * to this model is idf. */
type idfStats struct {
	field string
	/** The idf and its explanation */
	idf         Explanation
	queryNorm   float32
	queryWeight float32
	queryBoost  float32
	value       float32
}

func newIDFStats(field string, idf Explanation, queryBoost float32) *idfStats {
	// TODO: validate?
	return &idfStats{
		field:       field,
		idf:         idf,
		queryBoost:  queryBoost,
		queryWeight: idf.(*ExplanationImpl).value * queryBoost, // compute query weight
	}
}

func (stats *idfStats) ValueForNormalization() float32 {
	// TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
	return stats.queryWeight * stats.queryWeight // sum of squared weights
}

func (stats *idfStats) Normalize(queryNorm float32, topLevelBoost float32) {
	stats.queryNorm = queryNorm * topLevelBoost
	stats.queryWeight *= stats.queryNorm                                 // normalize query weight
	stats.value = stats.queryWeight * stats.idf.(*ExplanationImpl).value // idf for document
}
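
// A worked pass through the two phases: with idf=2.0 and queryBoost=1.0,
// queryWeight starts at 2.0 and ValueForNormalization reports 4.0; after
// Normalize(0.5, 1.0), queryWeight = 2.0*0.5 = 1.0 and the per-document
// multiplier stats.value = 1.0*2.0 = 2.0.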

func (ss *TFIDFSimilarity) explainScore(doc int, freq Explanation,
	stats *idfStats, norms NumericDocValues) Explanation {

	// explain query weight
	boostExpl := newExplanation(stats.queryBoost, "boost")
	queryNormExpl := newExplanation(stats.queryNorm, "queryNorm")
	queryExpl := newExplanation(
		boostExpl.value*stats.idf.Value()*queryNormExpl.value,
		"queryWeight, product of:")
	if stats.queryBoost != 1 {
		queryExpl.addDetail(boostExpl)
	}
	queryExpl.addDetail(stats.idf)
	queryExpl.addDetail(queryNormExpl)

	// explain field weight
	tfExplanation := newExplanation(ss.spi.tf(freq.Value()),
		fmt.Sprintf("tf(freq=%v), with freq of:", freq.Value()))
	tfExplanation.addDetail(freq)
	fieldNorm := float32(1)
	if norms != nil {
		fieldNorm = ss.spi.decodeNormValue(norms(doc))
	}
	fieldNormExpl := newExplanation(fieldNorm, fmt.Sprintf("fieldNorm(doc=%v)", doc))
	fieldExpl := newExplanation(
		tfExplanation.value*stats.idf.Value()*fieldNormExpl.value,
		fmt.Sprintf("fieldWeight in %v, product of:", doc))
	fieldExpl.addDetail(tfExplanation)
	fieldExpl.addDetail(stats.idf)
	fieldExpl.addDetail(fieldNormExpl)

	if queryExpl.value == 1 {
		return fieldExpl
	}

	// combine them
	ans := newExplanation(queryExpl.value*fieldExpl.value,
		fmt.Sprintf("score(doc=%v,freq=%v), product of:", doc, freq))
	ans.addDetail(queryExpl)
	ans.addDetail(fieldExpl)
	return ans
}

// search/similarities/DefaultSimilarity.java

/** Cache of decoded bytes. */
var NORM_TABLE []float32 = buildNormTable()

func buildNormTable() []float32 {
	table := make([]float32, 256)
	for i := range table {
		table[i] = util.Byte315ToFloat(byte(i))
	}
	return table
}

type DefaultSimilarity struct {
	*TFIDFSimilarity
	discountOverlaps bool
}

func NewDefaultSimilarity() *DefaultSimilarity {
	ans := &DefaultSimilarity{discountOverlaps: true}
	ans.TFIDFSimilarity = newTFIDFSimilarity(ans)
	return ans
}

func (ds *DefaultSimilarity) Coord(overlap, maxOverlap int) float32 {
	return float32(overlap) / float32(maxOverlap)
}

func (ds *DefaultSimilarity) QueryNorm(sumOfSquaredWeights float32) float32 {
	return float32(1.0 / math.Sqrt(float64(sumOfSquaredWeights)))
}

/*
Encodes a normalization factor for storage in an index.

The encoding uses a three-bit mantissa, a five-bit exponent, and the
zero-exponent point at 15, thus representing values from around
7x10^9 to 2x10^-9 with about one significant decimal digit of
accuracy. Zero is also represented. Negative numbers are rounded up
to zero. Values too large to represent are rounded down to the
largest representable value. Positive values too small to represent
are rounded up to the smallest positive representable value.
*/
func (ds *DefaultSimilarity) encodeNormValue(f float32) int64 {
	return int64(util.FloatToByte315(f))
}

func (ds *DefaultSimilarity) decodeNormValue(norm int64) float32 {
	return NORM_TABLE[int(norm&0xff)] // & 0xFF maps negative bytes to positive above 127
}
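
// The encoding is lossy in one direction only: decoding any byte and
// re-encoding it should return the same byte, while encode-then-decode of an
// arbitrary float only approximates it to about one significant decimal
// digit. A sketch (the exact truncated value is an assumption about
// util.FloatToByte315, not verified here):
//
//	b := ds.encodeNormValue(0.89)
//	f := ds.decodeNormValue(b) // ≈ 0.875, not exactly 0.89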

/*
Implemented as state.boost() * lengthNorm(numTerms), where numTerms
is FieldInvertState.length() if setDiscountOverlaps() is false, else
it's FieldInvertState.length() - FieldInvertState.numOverlap().
*/
func (ds *DefaultSimilarity) lengthNorm(state *index.FieldInvertState) float32 {
	var numTerms int
	if ds.discountOverlaps {
		numTerms = state.Length() - state.NumOverlap()
	} else {
		numTerms = state.Length()
	}
	return state.Boost() * float32(1.0/math.Sqrt(float64(numTerms)))
}
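
// E.g. a field with 100 indexed terms (no overlaps) and boost 1.0 gets
// lengthNorm = 1/sqrt(100) = 0.1, before the lossy byte encoding above.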

func (ds *DefaultSimilarity) tf(freq float32) float32 {
	return float32(math.Sqrt(float64(freq)))
}

func (ds *DefaultSimilarity) idf(docFreq int64, numDocs int64) float32 {
	return float32(math.Log(float64(numDocs)/float64(docFreq+1))) + 1.0
}

func (ds *DefaultSimilarity) String() string {
	return "DefaultSimilarity"
}