// github.com/balzaczyy/golucene/core/search/search.go

package search

import (
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	"github.com/balzaczyy/golucene/core/index"
	"github.com/balzaczyy/golucene/core/util"
	"log"
	"math"
)

/* Defines services that can be overridden. */
type IndexSearcherSPI interface {
	CreateNormalizedWeight(Query) (Weight, error)
	Rewrite(Query) (Query, error)
	WrapFilter(Query, Filter) Query
	SearchLWC([]*index.AtomicReaderContext, Weight, Collector) error
}

// IndexSearcher implements search over a single IndexReader.
type IndexSearcher struct {
	spi           IndexSearcherSPI
	reader        index.IndexReader
	readerContext index.IndexReaderContext
	leafContexts  []*index.AtomicReaderContext
	similarity    Similarity
}

func NewIndexSearcher(r index.IndexReader) *IndexSearcher {
	// log.Print("Initializing IndexSearcher from IndexReader: ", r)
	return NewIndexSearcherFromContext(r.Context())
}

func NewIndexSearcherFromContext(context index.IndexReaderContext) *IndexSearcher {
	// assert2(context.isTopLevel, "IndexSearcher's ReaderContext must be topLevel for reader %v", context.reader())
	defaultSimilarity := NewDefaultSimilarity()
	ss := &IndexSearcher{nil, context.Reader(), context, context.Leaves(), defaultSimilarity}
	ss.spi = ss
	return ss
}

/* Expert: set the similarity implementation used by this IndexSearcher. */
func (ss *IndexSearcher) SetSimilarity(similarity Similarity) {
	ss.similarity = similarity
}

func (ss *IndexSearcher) SearchTop(q Query, n int) (topDocs TopDocs, err error) {
	return ss.Search(q, nil, n)
}

func (ss *IndexSearcher) Search(q Query, f Filter, n int) (topDocs TopDocs, err error) {
	w, err := ss.spi.CreateNormalizedWeight(ss.spi.WrapFilter(q, f))
	if err != nil {
		return TopDocs{}, err
	}
	return ss.searchWSI(w, nil, n)
}

/*
Expert: low-level search implementation. Finds the top nDocs hits for a
query, applying a filter if non-nil.

Applications should usually call SearchTop(Query, int) or
Search(Query, Filter, int) instead. May fail with
BooleanQuery.TooManyClauses if a query would exceed BooleanQuery's maximum
clause count.
*/
func (ss *IndexSearcher) searchWSI(w Weight, after *ScoreDoc, nDocs int) (TopDocs, error) {
	// TODO support concurrent search
	return ss.searchLWSI(ss.leafContexts, w, after, nDocs)
}

/*
Expert: low-level search implementation. Finds the top nDocs hits for a
query.

Applications should usually call SearchTop(Query, int) or
Search(Query, Filter, int) instead. May fail with
BooleanQuery.TooManyClauses if a query would exceed BooleanQuery's maximum
clause count.
*/
func (ss *IndexSearcher) searchLWSI(leaves []*index.AtomicReaderContext,
	w Weight, after *ScoreDoc, nDocs int) (TopDocs, error) {
	// single thread
	limit := ss.reader.MaxDoc()
	if limit == 0 {
		limit = 1
	}
	if nDocs > limit {
		nDocs = limit
	}
	collector := NewTopScoreDocCollector(nDocs, after, !w.IsScoresDocsOutOfOrder())
	if err := ss.spi.SearchLWC(leaves, w, collector); err != nil {
		return TopDocs{}, err
	}
	return collector.TopDocs(), nil
}
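
// exampleSearchTop is an illustrative sketch added for documentation; it is
// not part of the original source. It shows the typical call sequence: wrap
// an open IndexReader, then ask for the top-n hits. Building the Query
// (e.g. a TermQuery) is assumed to happen elsewhere.
func exampleSearchTop(r index.IndexReader, q Query) (TopDocs, error) {
	ss := NewIndexSearcher(r)  // uses DefaultSimilarity unless SetSimilarity is called
	return ss.SearchTop(q, 10) // top 10 hits, no filter
}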

func (ss *IndexSearcher) SearchLWC(leaves []*index.AtomicReaderContext, w Weight, c Collector) (err error) {
	// TODO: should we make this
	// threaded...? the Collector could be sync'd?
	// always use single thread:
	for _, ctx := range leaves { // search each subreader
		// TODO catch CollectionTerminatedException
		c.SetNextReader(ctx)

		scorer, err := w.BulkScorer(ctx, !c.AcceptsDocsOutOfOrder(),
			ctx.Reader().(index.AtomicReader).LiveDocs())
		if err != nil {
			return err
		}
		if scorer != nil {
			// TODO catch CollectionTerminatedException
			if err = scorer.ScoreAndCollect(c); err != nil {
				return err
			}
		}
	}
	return nil
}

func (ss *IndexSearcher) WrapFilter(q Query, f Filter) Query {
	if f == nil {
		return q
	}
	panic("FilteredQuery not supported yet")
}

/*
Returns an Explanation that describes how doc scored against query.

This is intended to be used in developing Similarity implementations, and,
for good performance, should not be displayed with every hit. Computing an
explanation is as expensive as executing the query over the entire index.
*/
func (ss *IndexSearcher) Explain(query Query, doc int) (exp Explanation, err error) {
	w, err := ss.spi.CreateNormalizedWeight(query)
	if err != nil {
		return
	}
	return ss.explain(w, doc)
}

/*
Expert: low-level implementation method.
Returns an Explanation that describes how doc scored against weight.

This is intended to be used in developing Similarity implementations, and,
for good performance, should not be displayed with every hit. Computing an
explanation is as expensive as executing the query over the entire index.

Applications should call Explain(Query, int).
*/
func (ss *IndexSearcher) explain(weight Weight, doc int) (exp Explanation, err error) {
	n := index.SubIndex(doc, ss.leafContexts)
	ctx := ss.leafContexts[n]
	deBasedDoc := doc - ctx.DocBase // doc id relative to its segment
	return weight.Explain(ctx, deBasedDoc)
}
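
// exampleExplain is an illustrative sketch added for documentation; it is
// not part of the original source. As the doc comment on Explain notes,
// computing an explanation is as expensive as running the query, so it is
// meant for debugging individual hits, not for every result.
func exampleExplain(ss *IndexSearcher, q Query, doc int) {
	if exp, err := ss.Explain(q, doc); err == nil {
		fmt.Printf("%v\n", exp) // human-readable scoring breakdown
	}
}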

func (ss *IndexSearcher) CreateNormalizedWeight(q Query) (w Weight, err error) {
	q, err = ss.spi.Rewrite(q)
	if err != nil {
		return nil, err
	}
	log.Printf("After rewrite: %v", q)
	w, err = q.CreateWeight(ss)
	if err != nil {
		return nil, err
	}
	v := w.ValueForNormalization()
	norm := ss.similarity.QueryNorm(v)
	if math.IsInf(float64(norm), 1) || math.IsNaN(float64(norm)) {
		norm = 1.0
	}
	w.Normalize(norm, 1.0)
	return w, nil
}

func (ss *IndexSearcher) Rewrite(q Query) (Query, error) {
	log.Printf("Rewriting '%v'...", q)
	after := q.Rewrite(ss.reader)
	for after != q { // rewrite until the query no longer changes
		q = after
		after = q.Rewrite(ss.reader)
	}
	return q, nil
}

// TopReaderContext returns this searcher's top-level IndexReaderContext.
func (ss *IndexSearcher) TopReaderContext() index.IndexReaderContext {
	return ss.readerContext
}

func (ss *IndexSearcher) String() string {
	return fmt.Sprintf("IndexSearcher(%v)", ss.reader)
}

func (ss *IndexSearcher) TermStatistics(term *index.Term, context *index.TermContext) TermStatistics {
	return NewTermStatistics(term.Bytes, int64(context.DocFreq), context.TotalTermFreq)
}

func (ss *IndexSearcher) CollectionStatistics(field string) CollectionStatistics {
	terms := index.GetMultiTerms(ss.reader, field)
	if terms == nil {
		return NewCollectionStatistics(field, int64(ss.reader.MaxDoc()), 0, 0, 0)
	}
	return NewCollectionStatistics(field, int64(ss.reader.MaxDoc()), int64(terms.DocCount()), terms.SumTotalTermFreq(), terms.SumDocFreq())
}

type TermStatistics struct {
	Term                   []byte
	DocFreq, TotalTermFreq int64
}

func NewTermStatistics(term []byte, docFreq, totalTermFreq int64) TermStatistics {
	// assert docFreq >= 0
	// assert totalTermFreq == -1 || totalTermFreq >= docFreq // #positions must be >= #postings
	return TermStatistics{term, docFreq, totalTermFreq}
}

type CollectionStatistics struct {
	field                                          string
	maxDoc, docCount, sumTotalTermFreq, sumDocFreq int64
}

func NewCollectionStatistics(field string, maxDoc, docCount, sumTotalTermFreq, sumDocFreq int64) CollectionStatistics {
	// assert maxDoc >= 0
	// assert docCount >= -1 && docCount <= maxDoc // #docs with field must be <= #docs
	// assert sumDocFreq == -1 || sumDocFreq >= docCount // #postings must be >= #docs with field
	// assert sumTotalTermFreq == -1 || sumTotalTermFreq >= sumDocFreq // #positions must be >= #postings
	return CollectionStatistics{field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq}
}

/*
API for scoring "sloppy" queries such as TermQuery, SpanQuery, and
PhraseQuery.

Frequencies are floating-point values: an approximate within-document
frequency adjusted for "sloppiness" by SimScorer.computeSlopFactor(int).
*/
type SimScorer interface {
	// Score scores a single document. doc is the document id within the
	// inverted index segment, freq the sloppy term frequency; it returns
	// the document's score.
	Score(doc int, freq float32) float32
	// Explain the score for a single document.
	explain(int, Explanation) Explanation
}

type SimWeight interface {
	ValueForNormalization() float32
	Normalize(norm float32, topLevelBoost float32)
}

// search/similarities/TFIDFSimilarity.java

type ITFIDFSimilarity interface {
	// tf computes a score factor based on a term or phrase's frequency in
	// a document. This value is multiplied by the idf(int64, int64) factor
	// for each term in the query, and these products are then summed to
	// form the initial score for a document.
	//
	// Terms and phrases repeated in a document indicate the topic of the
	// document, so implementations of this method usually return larger
	// values when freq is large, and smaller values when freq is small.
	tf(freq float32) float32
	// idf computes a score factor based on a term's document frequency
	// (the number of documents that contain the term, docFreq, out of
	// numDocs documents in the collection). This value is multiplied by
	// the tf(float32) factor for each term in the query, and these
	// products are then summed to form the initial score for a document.
	//
	// Terms that occur in fewer documents are better indicators of topic,
	// so implementations of this method usually return larger values for
	// rare terms, and smaller values for common terms.
	idf(docFreq int64, numDocs int64) float32
	// lengthNorm computes an index-time normalization value for this field
	// instance. The value will be stored in a single-byte lossy
	// representation by encodeNormValue().
	lengthNorm(*index.FieldInvertState) float32
	// Decodes a normalization factor stored in an index.
	decodeNormValue(norm int64) float32
	// Encodes a normalization factor for storage in an index.
	encodeNormValue(float32) int64
}
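
// Note (added commentary, not in the original source): TFIDFSimilarity below
// emulates Java's abstract-class pattern. A concrete similarity such as
// DefaultSimilarity embeds *TFIDFSimilarity and passes itself in as the
// ITFIDFSimilarity "spi", so the shared scoring code here can call back into
// the concrete type's tf/idf/lengthNorm/encodeNormValue overrides.
// IndexSearcher uses the same trick with IndexSearcherSPI above.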

type TFIDFSimilarity struct {
	spi ITFIDFSimilarity
}

func newTFIDFSimilarity(spi ITFIDFSimilarity) *TFIDFSimilarity {
	return &TFIDFSimilarity{spi}
}

func (ts *TFIDFSimilarity) idfExplainTerm(collectionStats CollectionStatistics, termStats TermStatistics) Explanation {
	df, max := termStats.DocFreq, collectionStats.maxDoc
	idf := ts.spi.idf(df, max)
	return newExplanation(idf, fmt.Sprintf("idf(docFreq=%v, maxDocs=%v)", df, max))
}

func (ts *TFIDFSimilarity) idfExplainPhrase(collectionStats CollectionStatistics, termStats []TermStatistics) Explanation {
	details := make([]Explanation, len(termStats))
	var idf float32
	for i, stat := range termStats {
		details[i] = ts.idfExplainTerm(collectionStats, stat)
		idf += details[i].(*ExplanationImpl).value
	}
	ans := newExplanation(idf, "idf(), sum of:")
	ans.details = details
	return ans
}

func (ts *TFIDFSimilarity) ComputeNorm(state *index.FieldInvertState) int64 {
	return ts.spi.encodeNormValue(ts.spi.lengthNorm(state))
}

func (ts *TFIDFSimilarity) computeWeight(queryBoost float32, collectionStats CollectionStatistics, termStats ...TermStatistics) SimWeight {
	var idf Explanation
	if len(termStats) == 1 {
		idf = ts.idfExplainTerm(collectionStats, termStats[0])
	} else {
		idf = ts.idfExplainPhrase(collectionStats, termStats)
	}
	return newIDFStats(collectionStats.field, idf, queryBoost)
}

func (ts *TFIDFSimilarity) simScorer(stats SimWeight, ctx *index.AtomicReaderContext) (ss SimScorer, err error) {
	idfstats := stats.(*idfStats)
	ndv, err := ctx.Reader().(index.AtomicReader).NormValues(idfstats.field)
	if err != nil {
		return nil, err
	}
	return newTFIDFSimScorer(ts, idfstats, ndv), nil
}

type tfIDFSimScorer struct {
	owner       *TFIDFSimilarity
	stats       *idfStats
	weightValue float32
	norms       NumericDocValues
}

func newTFIDFSimScorer(owner *TFIDFSimilarity, stats *idfStats, norms NumericDocValues) *tfIDFSimScorer {
	return &tfIDFSimScorer{owner, stats, stats.value, norms}
}

func (ss *tfIDFSimScorer) Score(doc int, freq float32) float32 {
	raw := ss.owner.spi.tf(freq) * ss.weightValue // compute tf(f)*weight
	if ss.norms == nil {
		return raw
	}
	return raw * ss.owner.spi.decodeNormValue(ss.norms(doc)) // normalize for field
}

func (ss *tfIDFSimScorer) explain(doc int, freq Explanation) Explanation {
	return ss.owner.explainScore(doc, freq, ss.stats, ss.norms)
}

/*
Collection statistics for the TF-IDF model. The only statistic of interest
to this model is idf.
*/
type idfStats struct {
	field string
	// The idf and its explanation
	idf         Explanation
	queryNorm   float32
	queryWeight float32
	queryBoost  float32
	value       float32
}

func newIDFStats(field string, idf Explanation, queryBoost float32) *idfStats {
	// TODO: validate?
	return &idfStats{
		field:       field,
		idf:         idf,
		queryBoost:  queryBoost,
		queryWeight: idf.(*ExplanationImpl).value * queryBoost, // compute query weight
	}
}

func (stats *idfStats) ValueForNormalization() float32 {
	// TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
	return stats.queryWeight * stats.queryWeight // sum of squared weights
}

func (stats *idfStats) Normalize(queryNorm float32, topLevelBoost float32) {
	stats.queryNorm = queryNorm * topLevelBoost
	stats.queryWeight *= stats.queryNorm                                 // normalize query weight
	stats.value = stats.queryWeight * stats.idf.(*ExplanationImpl).value // idf for document
}
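
// exampleTFIDFScore is an illustrative sketch added for documentation; it is
// not part of the original source. It walks the normalization handshake for
// a single-term query with the default similarity: the raw query weight is
// the idf, ValueForNormalization reports its square, QueryNorm takes 1/sqrt
// of that, and after Normalize the per-hit score is
// tf(freq) * value * fieldNorm.
func exampleTFIDFScore() float32 {
	sim := NewDefaultSimilarity()
	idf := newExplanation(sim.idf(9, 1000), "idf") // ln(1000/(9+1))+1 ~= 5.605
	stats := newIDFStats("body", idf, 1.0)         // queryBoost = 1
	norm := sim.QueryNorm(stats.ValueForNormalization())
	stats.Normalize(norm, 1.0) // queryWeight becomes 1, value becomes the idf
	scorer := newTFIDFSimScorer(sim.TFIDFSimilarity, stats, nil)
	return scorer.Score(0, 4) // sqrt(4) * 5.605 ~= 11.21 (nil norms: no field norm)
}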

func (ss *TFIDFSimilarity) explainScore(doc int, freq Explanation,
	stats *idfStats, norms NumericDocValues) Explanation {

	// explain query weight
	boostExpl := newExplanation(stats.queryBoost, "boost")
	queryNormExpl := newExplanation(stats.queryNorm, "queryNorm")
	queryExpl := newExplanation(
		boostExpl.value*stats.idf.Value()*queryNormExpl.value,
		"queryWeight, product of:")
	if stats.queryBoost != 1 {
		queryExpl.addDetail(boostExpl)
	}
	queryExpl.addDetail(stats.idf)
	queryExpl.addDetail(queryNormExpl)

	// explain field weight
	tfExplanation := newExplanation(ss.spi.tf(freq.Value()),
		fmt.Sprintf("tf(freq=%v), with freq of:", freq.Value()))
	tfExplanation.addDetail(freq)
	fieldNorm := float32(1)
	if norms != nil {
		fieldNorm = ss.spi.decodeNormValue(norms(doc))
	}
	fieldNormExpl := newExplanation(fieldNorm, fmt.Sprintf("fieldNorm(doc=%v)", doc))
	fieldExpl := newExplanation(
		tfExplanation.value*stats.idf.Value()*fieldNormExpl.value,
		fmt.Sprintf("fieldWeight in %v, product of:", doc))
	fieldExpl.addDetail(tfExplanation)
	fieldExpl.addDetail(stats.idf)
	fieldExpl.addDetail(fieldNormExpl)

	if queryExpl.value == 1 {
		return fieldExpl
	}

	// combine them
	ans := newExplanation(queryExpl.value*fieldExpl.value,
		fmt.Sprintf("score(doc=%v,freq=%v), product of:", doc, freq))
	ans.addDetail(queryExpl)
	ans.addDetail(fieldExpl)
	return ans
}

// search/similarities/DefaultSimilarity.java

// Cache of decoded bytes.
var NORM_TABLE = buildNormTable()

func buildNormTable() []float32 {
	table := make([]float32, 256)
	for i := range table {
		table[i] = util.Byte315ToFloat(byte(i))
	}
	return table
}

type DefaultSimilarity struct {
	*TFIDFSimilarity
	discountOverlaps bool
}

func NewDefaultSimilarity() *DefaultSimilarity {
	ans := &DefaultSimilarity{discountOverlaps: true}
	ans.TFIDFSimilarity = newTFIDFSimilarity(ans)
	return ans
}

// Coord computes a score factor based on the fraction of all query terms
// that a document contains, e.g. 2.0/3.0 for a document matching two of
// three optional clauses.
func (ds *DefaultSimilarity) Coord(overlap, maxOverlap int) float32 {
	return float32(overlap) / float32(maxOverlap)
}

// QueryNorm is the 1/sqrt(sumOfSquaredWeights) factor applied in
// CreateNormalizedWeight; it makes scores comparable across queries without
// changing their ranking.
func (ds *DefaultSimilarity) QueryNorm(sumOfSquaredWeights float32) float32 {
	return float32(1.0 / math.Sqrt(float64(sumOfSquaredWeights)))
}

/*
Encodes a normalization factor for storage in an index.

The encoding uses a three-bit mantissa, a five-bit exponent, and the
zero-exponent point at 15, thus representing values from around 7x10^9 to
2x10^-9 with about one significant decimal digit of accuracy. Zero is also
represented. Negative numbers are rounded up to zero. Values too large to
represent are rounded down to the largest representable value. Positive
values too small to represent are rounded up to the smallest positive
representable value.
*/
func (ds *DefaultSimilarity) encodeNormValue(f float32) int64 {
	return int64(util.FloatToByte315(f))
}

func (ds *DefaultSimilarity) decodeNormValue(norm int64) float32 {
	return NORM_TABLE[int(norm&0xff)] // &0xff maps negative bytes to positive above 127
}
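
// exampleNormQuantization is an illustrative sketch added for documentation;
// it is not part of the original source. It demonstrates the lossy one-byte
// encoding described above: 0.5 is exactly representable with a three-bit
// mantissa, while 0.52 falls between representable steps and collapses to
// the same byte, so both decode to 0.5.
func exampleNormQuantization() (float32, float32) {
	ds := NewDefaultSimilarity()
	a := ds.decodeNormValue(ds.encodeNormValue(0.50)) // 0.5: exact
	b := ds.decodeNormValue(ds.encodeNormValue(0.52)) // also 0.5: quantized
	return a, b
}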

/*
Implemented as state.Boost() * 1/sqrt(numTerms), where numTerms is
state.Length() if discountOverlaps is false, else
state.Length() - state.NumOverlap().
*/
func (ds *DefaultSimilarity) lengthNorm(state *index.FieldInvertState) float32 {
	var numTerms int
	if ds.discountOverlaps {
		numTerms = state.Length() - state.NumOverlap()
	} else {
		numTerms = state.Length()
	}
	return state.Boost() * float32(1.0/math.Sqrt(float64(numTerms)))
}

/* Implemented as sqrt(freq). */
func (ds *DefaultSimilarity) tf(freq float32) float32 {
	return float32(math.Sqrt(float64(freq)))
}

/* Implemented as log(numDocs/(docFreq+1)) + 1. */
func (ds *DefaultSimilarity) idf(docFreq int64, numDocs int64) float32 {
	return float32(math.Log(float64(numDocs)/float64(docFreq+1))) + 1.0
}

func (ds *DefaultSimilarity) String() string {
	return "DefaultSimilarity"
}
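
// Worked figures for the formulas above (added commentary, not in the
// original source): a 100-term field with boost 1 gets
// lengthNorm = 1/sqrt(100) = 0.1; a term occurring 4 times in a document
// contributes tf = sqrt(4) = 2; and a term present in 9 of 1000 documents
// gets idf = ln(1000/(9+1)) + 1 ~= 5.605.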