github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/search/similarities.go

package search

import (
	"github.com/balzaczyy/golucene/core/index"
)

// search/similarities/Similarity.java

/*
Similarity defines the components of Lucene scoring.

Expert: Scoring API.

This is a low-level API; you should only extend it if you want to
implement an information retrieval model. If you are instead looking
for a convenient way to alter Lucene's scoring, consider extending a
high-level implementation such as TFIDFSimilarity, which implements
the vector space model with this API, or just tweaking the default
implementation: DefaultSimilarity.

Similarity determines how Lucene weights terms, and Lucene interacts
with this class at both index-time and query-time.

######Index-time

At indexing time, the indexer calls ComputeNorm(), allowing the
Similarity implementation to set a per-document value for the field
that will later be accessible via AtomicReader.NormValues(). Lucene
makes no assumption about what is in this norm, but it is most useful
for encoding length normalization information.

Implementations should carefully consider how the normalization is
encoded: while Lucene's classical TFIDFSimilarity encodes a
combination of index-time boost and length normalization information
with SmallFloat into a single byte, this might not be suitable for
all purposes.

Many formulas require the use of average document length, which can
be computed via a combination of CollectionStatistics.SumTotalTermFreq()
and CollectionStatistics.MaxDoc() or CollectionStatistics.DocCount(),
depending upon whether the average should reflect field sparsity.

Additional scoring factors can be stored in named NumericDocValuesFields
and accessed at query-time with AtomicReader.NumericDocValues().

Finally, using index-time boosts (either via folding into the
normalization byte or via DocValues) is an inefficient way to boost
the scores of different fields if the boost will be the same for
every document; instead, the Similarity can simply take a constant
boost parameter C, and PerFieldSimilarityWrapper can return different
instances with different boosts depending upon field name.

######Query-time

At query-time, Queries interact with the Similarity via these steps:

1. The computeWeight() method is called a single time, allowing the
implementation to compute any statistics (such as IDF, average
document length, etc) across the entire collection. The TermStatistics
and CollectionStatistics passed in already contain all of the raw
statistics involved, so a Similarity can freely use any combination
of statistics without causing any additional I/O. Lucene makes no
assumption about what is stored in the returned SimWeight object.
2. The query normalization process occurs a single time:
SimWeight.ValueForNormalization() is called for each query leaf node,
QueryNorm() is called for the top-level query, and finally
SimWeight.Normalize() passes down the normalization value and any
top-level boosts (e.g. from enclosing BooleanQuerys).
3. For each segment in the index, the Query creates a SimScorer. The
score() method is called for each matching document.

######Explain-time

When IndexSearcher.explain() is called, queries consult the
Similarity's DocScorer for an explanation of how it computed its
score. The query passes in the document id and an explanation of how
the frequency was computed.
*/
type Similarity interface {
	// Computes a score factor based on the fraction of all query terms
	// that a document contains, given the number of overlapping query
	// terms and the total number of terms in the query.
	Coord(int, int) float32
	// Computes the normalization value for a query given the sum of
	// the normalized weights SimWeight.ValueForNormalization of each
	// of the query terms. This value is passed back to the weight
	// (SimWeight.Normalize()) of each query term, to provide a hook
	// to attempt to make scores from different queries comparable.
	QueryNorm(valueForNormalization float32) float32
	/*
		Computes the normalization value for a field, given the
		accumulated state of term processing for this field (see
		FieldInvertState).

		Matches in longer fields are less precise, so implementations
		of this method usually set smaller values when state.Length() is
		larger, and larger values when state.Length() is smaller.
	*/
	ComputeNorm(state *index.FieldInvertState) int64
	// Computes any collection-level weight (e.g. IDF, average document
	// length, etc) needed for scoring a query.
	computeWeight(queryBoost float32, collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight
	// Creates a new SimScorer to score matching documents from a
	// segment of the inverted index.
	simScorer(w SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error)
}
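// The helper below is an illustrative sketch, not part of the original
// port: it spells out the query-time call sequence described in the doc
// comment above, using only the interfaces defined in this file. The
// function name and the single-leaf simplification are hypothetical; in
// the real code path the Weight implementations drive these calls and
// sum ValueForNormalization() over every query leaf before computing
// the query norm.
func exampleQueryTimeFlow(sim Similarity, queryBoost, topLevelBoost float32,
	collectionStats CollectionStatistics, ctx *index.AtomicReaderContext,
	termStats ...TermStatistics) (SimScorer, error) {

	// Step 1: collection-level statistics are folded into a SimWeight once.
	w := sim.computeWeight(queryBoost, collectionStats, termStats...)

	// Step 2: query normalization. With a single leaf, its own
	// ValueForNormalization() is the whole sum; the resulting norm and
	// any top-level boost are passed back down via Normalize().
	norm := sim.QueryNorm(w.ValueForNormalization())
	w.Normalize(norm, topLevelBoost)

	// Step 3: one SimScorer per segment; its score() method is then
	// called for each matching document in that segment.
	return sim.simScorer(w, ctx)
}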
// similarities/PerFieldSimilarityWrapper

type PerFieldSimilarityWrapperSPI interface {
	Get(name string) Similarity
}

/*
Provides the ability to use a different Similarity for different
fields.

Implementations of PerFieldSimilarityWrapperSPI should return an
appropriate Similarity (for example, using field-specific parameter
values) from Get() for the field.
*/
type PerFieldSimilarityWrapper struct {
	spi PerFieldSimilarityWrapperSPI
}

func NewPerFieldSimilarityWrapper(spi PerFieldSimilarityWrapperSPI) *PerFieldSimilarityWrapper {
	return &PerFieldSimilarityWrapper{spi: spi}
}

func (wrapper *PerFieldSimilarityWrapper) ComputeNorm(state *index.FieldInvertState) int64 {
	return wrapper.spi.Get(state.Name()).ComputeNorm(state)
}

func (wrapper *PerFieldSimilarityWrapper) computeWeight(queryBoost float32,
	collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight {
	sim := wrapper.spi.Get(collectionStats.field)
	return &PerFieldSimWeight{sim, sim.computeWeight(queryBoost, collectionStats, termStats...)}
}

func (wrapper *PerFieldSimilarityWrapper) simScorer(w SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error) {
	// Delegate to the field-specific Similarity captured in the
	// PerFieldSimWeight built by computeWeight above.
	weight := w.(*PerFieldSimWeight)
	return weight.delegate.simScorer(weight.delegateWeight, ctx)
}

// PerFieldSimWeight pairs the field-specific Similarity with the
// SimWeight it produced, so normalization and scoring can be delegated
// to the right implementation.
type PerFieldSimWeight struct {
	delegate       Similarity
	delegateWeight SimWeight
}

func (w *PerFieldSimWeight) ValueForNormalization() float32 {
	return w.delegateWeight.ValueForNormalization()
}

func (w *PerFieldSimWeight) Normalize(queryNorm, topLevelBoost float32) {
	w.delegateWeight.Normalize(queryNorm, topLevelBoost)
}
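// The type below is an illustrative sketch, not part of the original
// port: it shows one way to satisfy PerFieldSimilarityWrapperSPI so that
// NewPerFieldSimilarityWrapper can hand out different Similarity
// instances per field. The field name "title" and both struct fields are
// hypothetical placeholders; any Similarity implementations could be
// plugged in.
type exampleFieldSPI struct {
	defaultSim Similarity // used for every field without a special rule
	titleSim   Similarity // e.g. a variant built with a larger constant boost
}

// Get returns the Similarity registered for the given field name,
// falling back to the default when no field-specific rule applies.
func (spi *exampleFieldSPI) Get(name string) Similarity {
	if name == "title" { // hypothetical field name
		return spi.titleSim
	}
	return spi.defaultSim
}

// Usage sketch:
//
//	wrapper := NewPerFieldSimilarityWrapper(&exampleFieldSPI{defaultSim: d, titleSim: t})
//	// wrapper then routes ComputeNorm/computeWeight to the per-field Similarity.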