github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/search/similarities.go

package search

import (
	"github.com/balzaczyy/golucene/core/index"
)

// search/similarities/Similarity.java

/*
Similarity defines the components of Lucene scoring.

Expert: Scoring API.

This is a low-level API; you should only extend it if you want to
implement an information retrieval model. If you are instead looking
for a convenient way to alter Lucene's scoring, consider extending a
high-level implementation such as TFIDFSimilarity, which implements
the vector space model with this API, or just tweaking the default
implementation: DefaultSimilarity.

Similarity determines how Lucene weights terms, and Lucene interacts
with this class at both index-time and query-time.

######Index-time

At indexing time, the indexer calls ComputeNorm(), allowing the
Similarity implementation to set a per-document value for the field
that will later be accessible via AtomicReader.NormValues(). Lucene
makes no assumption about what is in this norm, but it is most useful
for encoding length normalization information.

Implementations should carefully consider how the normalization is
encoded: while Lucene's classical TFIDFSimilarity encodes a
combination of index-time boost and length normalization information
with SmallFloat into a single byte, this might not be suitable for
all purposes.

Many formulas require the use of average document length, which can
be computed via a combination of CollectionStatistics.SumTotalTermFreq()
and CollectionStatistics.MaxDoc() or CollectionStatistics.DocCount(),
depending upon whether the average should reflect field sparsity.

Additional scoring factors can be stored in named NumericDocValuesFields
and accessed at query-time with AtomicReader.NumericDocValues().

Finally, using index-time boosts (either via folding into the
normalization byte or via DocValues) is an inefficient way to boost
the scores of different fields if the boost will be the same for
every document; instead, the Similarity can simply take a constant
boost parameter C, and PerFieldSimilarityWrapper can return different
instances with different boosts depending upon field name.

######Query-time

At query-time, Queries interact with the Similarity via these steps:

1. The computeWeight() method is called a single time, allowing the
implementation to compute any statistics (such as IDF, average
document length, etc) across the entire collection. The TermStatistics
and CollectionStatistics passed in already contain all of the raw
statistics involved, so a Similarity can freely use any combination
of statistics without causing any additional I/O. Lucene makes no
assumption about what is stored in the returned SimWeight object.
2. The query normalization process occurs a single time:
SimWeight.ValueForNormalization() is called for each query leaf node,
QueryNorm() is called for the top-level query, and finally
SimWeight.Normalize() passes down the normalization value and any
top-level boosts (e.g. from enclosing BooleanQuerys).
3. For each segment in the index, the Query creates a SimScorer. The
score() method is called for each matching document.

######Explain-time

When IndexSearcher.explain() is called, queries consult the
Similarity's DocScorer for an explanation of how it computed its
score. The query passes in the document id and an explanation of how
the frequency was computed.
*/
type Similarity interface {
	// Computes a score factor based on the fraction of all query terms
	// that a document contains, given the number of overlapping query
	// terms and the total number of terms in the query.
	Coord(int, int) float32
	// Computes the normalization value for a query given the sum of
	// the normalized weights SimWeight.ValueForNormalization of each
	// of the query terms. This value is passed back to the weight
	// (SimWeight.Normalize()) of each query term, to provide a hook
	// to attempt to make scores from different queries comparable.
	QueryNorm(valueForNormalization float32) float32
	/*
		Computes the normalization value for a field, given the
		accumulated state of term processing for this field (see
		FieldInvertState).

		Matches in longer fields are less precise, so implementations
		of this method usually set smaller values when state.Length() is
		larger, and larger values when state.Length() is smaller.
	*/
	ComputeNorm(state *index.FieldInvertState) int64
	// Computes any collection-level weight (e.g. IDF, average document
	// length, etc) needed for scoring a query.
	computeWeight(queryBoost float32, collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight
	// Creates a new SimScorer to score matching documents from a
	// segment of the inverted index.
	simScorer(w SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error)
}
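// The helper below is an illustrative sketch, not part of the original
// port: it spells out the query-time call sequence described in the doc
// comment above, using only the interfaces defined in this file. The
// function name and the single-leaf simplification are hypothetical; in
// the real code path the Weight implementations drive these calls and
// sum ValueForNormalization() over every query leaf before computing
// the query norm.
func exampleQueryTimeFlow(sim Similarity, queryBoost, topLevelBoost float32,
	collectionStats CollectionStatistics, ctx *index.AtomicReaderContext,
	termStats ...TermStatistics) (SimScorer, error) {

	// Step 1: collection-level statistics are folded into a SimWeight once.
	w := sim.computeWeight(queryBoost, collectionStats, termStats...)

	// Step 2: query normalization. With a single leaf, its own
	// ValueForNormalization() is the whole sum; the resulting norm and
	// any top-level boost are passed back down via Normalize().
	norm := sim.QueryNorm(w.ValueForNormalization())
	w.Normalize(norm, topLevelBoost)

	// Step 3: one SimScorer per segment; its score() method is then
	// called for each matching document in that segment.
	return sim.simScorer(w, ctx)
}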
// similarities/PerFieldSimilarityWrapper

type PerFieldSimilarityWrapperSPI interface {
	Get(name string) Similarity
}

/*
Provides the ability to use a different Similarity for different
fields.

Implementations of PerFieldSimilarityWrapperSPI should return an
appropriate Similarity (for example, using field-specific parameter
values) from Get() for the field.
*/
type PerFieldSimilarityWrapper struct {
	spi PerFieldSimilarityWrapperSPI
}

func NewPerFieldSimilarityWrapper(spi PerFieldSimilarityWrapperSPI) *PerFieldSimilarityWrapper {
	return &PerFieldSimilarityWrapper{spi: spi}
}

func (wrapper *PerFieldSimilarityWrapper) ComputeNorm(state *index.FieldInvertState) int64 {
	return wrapper.spi.Get(state.Name()).ComputeNorm(state)
}

func (wrapper *PerFieldSimilarityWrapper) computeWeight(queryBoost float32,
	collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight {
	sim := wrapper.spi.Get(collectionStats.field)
	return &PerFieldSimWeight{sim, sim.computeWeight(queryBoost, collectionStats, termStats...)}
}

func (wrapper *PerFieldSimilarityWrapper) simScorer(w SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error) {
	// Delegate to the field-specific Similarity captured in the
	// PerFieldSimWeight built by computeWeight above.
	weight := w.(*PerFieldSimWeight)
	return weight.delegate.simScorer(weight.delegateWeight, ctx)
}

// PerFieldSimWeight pairs the field-specific Similarity with the
// SimWeight it produced, so normalization and scoring can be delegated
// to the right implementation.
type PerFieldSimWeight struct {
	delegate       Similarity
	delegateWeight SimWeight
}

func (w *PerFieldSimWeight) ValueForNormalization() float32 {
	return w.delegateWeight.ValueForNormalization()
}

func (w *PerFieldSimWeight) Normalize(queryNorm, topLevelBoost float32) {
	w.delegateWeight.Normalize(queryNorm, topLevelBoost)
}
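// The type below is an illustrative sketch, not part of the original
// port: it shows one way to satisfy PerFieldSimilarityWrapperSPI so that
// NewPerFieldSimilarityWrapper can hand out different Similarity
// instances per field. The field name "title" and both struct fields are
// hypothetical placeholders; any Similarity implementations could be
// plugged in.
type exampleFieldSPI struct {
	defaultSim Similarity // used for every field without a special rule
	titleSim   Similarity // e.g. a variant built with a larger constant boost
}

// Get returns the Similarity registered for the given field name,
// falling back to the default when no field-specific rule applies.
func (spi *exampleFieldSPI) Get(name string) Similarity {
	if name == "title" { // hypothetical field name
		return spi.titleSim
	}
	return spi.defaultSim
}

// Usage sketch:
//
//	wrapper := NewPerFieldSimilarityWrapper(&exampleFieldSPI{defaultSim: d, titleSim: t})
//	// wrapper then routes ComputeNorm/computeWeight to the per-field Similarity.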