github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/search/collect.go (about) 1 package search 2 3 import ( 4 "container/heap" 5 "fmt" 6 "github.com/balzaczyy/golucene/core/index" 7 "math" 8 ) 9 10 /** Holds one hit in {@link TopDocs}. */ 11 type ScoreDoc struct { 12 /** The score of this document for the query. */ 13 Score float32 14 /** A hit document's number. 15 * @see IndexSearcher#doc(int) */ 16 Doc int 17 /** Only set by {@link TopDocs#merge} */ 18 shardIndex int 19 } 20 21 func newScoreDoc(doc int, score float32) *ScoreDoc { 22 return newShardedScoreDoc(doc, score, -1) 23 } 24 25 func newShardedScoreDoc(doc int, score float32, shardIndex int) *ScoreDoc { 26 return &ScoreDoc{score, doc, shardIndex} 27 } 28 29 func (d *ScoreDoc) String() string { 30 return fmt.Sprintf("doc=%v score=%v shardIndex=%v", d.Doc, d.Score, d.shardIndex) 31 } 32 33 type PriorityQueue struct { 34 items []interface{} 35 less func(i, j int) bool 36 } 37 38 func (pq PriorityQueue) Len() int { return len(pq.items) } 39 func (pq PriorityQueue) Less(i, j int) bool { return pq.less(i, j) } 40 func (pq PriorityQueue) Swap(i, j int) { pq.items[i], pq.items[j] = pq.items[j], pq.items[i] } 41 func (pq *PriorityQueue) Push(x interface{}) { pq.items = append(pq.items, x) } 42 func (pq *PriorityQueue) Pop() interface{} { 43 n := pq.Len() 44 ans := pq.items[n-1] 45 pq.items = pq.items[0 : n-1] 46 return ans 47 } 48 func (pq *PriorityQueue) updateTop() interface{} { 49 heap.Fix(pq, 0) 50 return pq.items[0] 51 } 52 53 type TopDocs struct { 54 TotalHits int 55 ScoreDocs []*ScoreDoc 56 maxScore float64 57 } 58 59 type Collector interface { 60 SetScorer(s Scorer) 61 Collect(doc int) error 62 SetNextReader(ctx *index.AtomicReaderContext) 63 AcceptsDocsOutOfOrder() bool 64 } 65 66 // search/TopDocsCollector.java 67 /** 68 * A base class for all collectors that return a {@link TopDocs} output. This 69 * collector allows easy extension by providing a single constructor which 70 * accepts a {@link PriorityQueue} as well as protected members for that 71 * priority queue and a counter of the number of total hits.<br> 72 * Extending classes can override any of the methods to provide their own 73 * implementation, as well as avoid the use of the priority queue entirely by 74 * passing null to {@link #TopDocsCollector(PriorityQueue)}. In that case 75 * however, you might want to consider overriding all methods, in order to avoid 76 * a NullPointerException. 77 */ 78 type TopDocsCollector interface { 79 Collector 80 /** Returns the top docs that were collected by this collector. */ 81 TopDocs() TopDocs 82 /** 83 * Returns the documents in the rage [start .. start+howMany) that were 84 * collected by this collector. Note that if start >= pq.size(), an empty 85 * TopDocs is returned, and if pq.size() - start < howMany, then only the 86 * available documents in [start .. pq.size()) are returned.<br> 87 * This method is useful to call in case pagination of search results is 88 * allowed by the search application, as well as it attempts to optimize the 89 * memory used by allocating only as much as requested by howMany.<br> 90 * <b>NOTE:</b> you cannot call this method more than once for each search 91 * execution. If you need to call it more than once, passing each time a 92 * different range, you should call {@link #topDocs()} and work with the 93 * returned {@link TopDocs} object, which will contain all the results this 94 * search execution collected. 95 */ 96 TopDocsRange(start, howMany int) TopDocs 97 } 98 99 type TopDocsCreator interface { 100 /** 101 * Populates the results array with the ScoreDoc instances. This can be 102 * overridden in case a different ScoreDoc type should be returned. 103 */ 104 populateResults(results []*ScoreDoc, howMany int) 105 /** 106 * Returns a {@link TopDocs} instance containing the given results. If 107 * <code>results</code> is null it means there are no results to return, 108 * either because there were 0 calls to collect() or because the arguments to 109 * topDocs were invalid. 110 */ 111 newTopDocs(results []*ScoreDoc, start int) TopDocs 112 /** The number of valid PQ entries */ 113 topDocsSize() int 114 } 115 116 type abstractTopDocsCollector struct { 117 Collector 118 TopDocsCreator 119 pq *PriorityQueue // PriorityQueue 120 TotalHits int 121 } 122 123 func newTopDocsCollector(self interface{}, pq *PriorityQueue) *abstractTopDocsCollector { 124 return &abstractTopDocsCollector{ 125 Collector: self.(Collector), 126 TopDocsCreator: self.(TopDocsCreator), 127 pq: pq, 128 } 129 } 130 131 func (c *abstractTopDocsCollector) AcceptsDocsOutOfOrder() bool { 132 return false 133 } 134 135 func (c *abstractTopDocsCollector) populateResults(results []*ScoreDoc, howMany int) { 136 for i := howMany - 1; i >= 0; i-- { 137 results[i] = heap.Pop(c.pq).(*ScoreDoc) 138 } 139 } 140 141 func (c *abstractTopDocsCollector) topDocsSize() int { 142 // In case pq was populated with sentinel values, there might be less 143 // results than pq.size(). Therefore return all results until either 144 // pq.size() or totalHits. 145 if n := c.pq.Len(); c.TotalHits >= n { 146 return n 147 } 148 return c.TotalHits 149 } 150 151 func (c *abstractTopDocsCollector) TopDocs() TopDocs { 152 // In case pq was populated with sentinel values, there might be less 153 // results than pq.size(). Therefore return all results until either 154 // pq.size() or totalHits. 155 return c.TopDocsRange(0, c.topDocsSize()) 156 } 157 158 func (c *abstractTopDocsCollector) TopDocsRange(start, howMany int) TopDocs { 159 // In case pq was populated with sentinel values, there might be less 160 // results than pq.size(). Therefore return all results until either 161 // pq.size() or totalHits. 162 size := c.topDocsSize() 163 164 // Don't bother to throw an exception, just return an empty TopDocs in case 165 // the parameters are invalid or out of range. 166 // TODO: shouldn't we throw IAE if apps give bad params here so they dont 167 // have sneaky silent bugs? 168 if start < 0 || start >= size || howMany <= 0 { 169 return c.newTopDocs(nil, start) 170 } 171 172 // We know that start < pqsize, so just fix howMany. 173 if size-start < howMany { 174 howMany = size - start 175 } 176 results := make([]*ScoreDoc, howMany) 177 178 // pq's pop() returns the 'least' element in the queue, therefore need 179 // to discard the first ones, until we reach the requested range. 180 // Note that this loop will usually not be executed, since the common usage 181 // should be that the caller asks for the last howMany results. However it's 182 // needed here for completeness. 183 for i := c.pq.Len() - start - howMany; i > 0; i-- { 184 heap.Pop(c.pq) 185 } 186 187 // Get the requested results from pq. 188 c.populateResults(results, howMany) 189 190 return c.newTopDocs(results, start) 191 } 192 193 type TopScoreDocCollector struct { 194 *abstractTopDocsCollector 195 pqTop *ScoreDoc 196 docBase int 197 scorer Scorer 198 } 199 200 func newTocScoreDocCollector(numHits int) *TopScoreDocCollector { 201 docs := make([]interface{}, numHits) 202 for i, _ := range docs { 203 docs[i] = newScoreDoc(math.MaxInt32, -math.MaxFloat32) 204 } 205 pq := &PriorityQueue{items: docs} 206 pq.less = func(i, j int) bool { 207 hitA := pq.items[i].(*ScoreDoc) 208 hitB := pq.items[j].(*ScoreDoc) 209 if hitA.Score == hitB.Score { 210 return hitA.Doc > hitB.Doc 211 } 212 return hitA.Score < hitB.Score 213 } 214 heap.Init(pq) 215 216 pqTop := heap.Pop(pq).(*ScoreDoc) 217 heap.Push(pq, pqTop) 218 c := &TopScoreDocCollector{pqTop: pqTop} 219 c.abstractTopDocsCollector = newTopDocsCollector(c, pq) 220 return c 221 } 222 223 func (c *TopScoreDocCollector) newTopDocs(results []*ScoreDoc, start int) TopDocs { 224 if results == nil { 225 return TopDocs{0, []*ScoreDoc{}, math.NaN()} 226 } 227 228 // We need to compute maxScore in order to set it in TopDocs. If start == 0, 229 // it means the largest element is already in results, use its score as 230 // maxScore. Otherwise pop everything else, until the largest element is 231 // extracted and use its score as maxScore. 232 maxScore := math.NaN() 233 if start == 0 { 234 maxScore = float64(results[0].Score) 235 } else { 236 pq := c.pq 237 for i := pq.Len(); i > 1; i-- { 238 heap.Pop(pq) 239 } 240 maxScore = float64(heap.Pop(pq).(ScoreDoc).Score) 241 } 242 243 return TopDocs{c.TotalHits, results, maxScore} 244 } 245 246 func (c *TopScoreDocCollector) SetNextReader(ctx *index.AtomicReaderContext) { 247 c.docBase = ctx.DocBase 248 } 249 250 func (c *TopScoreDocCollector) SetScorer(scorer Scorer) { 251 c.scorer = scorer 252 } 253 254 func NewTopScoreDocCollector(numHits int, after *ScoreDoc, docsScoredInOrder bool) TopDocsCollector { 255 if numHits < 0 { 256 panic("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count") 257 } 258 259 if docsScoredInOrder { 260 if after == nil { 261 return newInOrderTopScoreDocCollector(numHits) 262 } 263 panic("not implemented yet") 264 // TODO support paging 265 } else { 266 if after == nil { 267 return newOutOfOrderTopScoreDocCollector(numHits) 268 } 269 panic("not implemented yet") 270 } 271 } 272 273 // Assumes docs are scored in order. 274 type InOrderTopScoreDocCollector struct { 275 *TopScoreDocCollector 276 } 277 278 func newInOrderTopScoreDocCollector(numHits int) *InOrderTopScoreDocCollector { 279 return &InOrderTopScoreDocCollector{newTocScoreDocCollector(numHits)} 280 } 281 282 func (c *InOrderTopScoreDocCollector) Collect(doc int) (err error) { 283 score, err := c.scorer.Score() 284 if err != nil { 285 return err 286 } 287 288 // This collector cannot handle these scores: 289 assert(score != -math.MaxFloat32) 290 assert(!math.IsNaN(float64(score))) 291 292 c.TotalHits++ 293 if score <= c.pqTop.Score { 294 // Since docs are returned in-order (i.e., increasing doc Id), a document 295 // with equal score to pqTop.score cannot compete since HitQueue favors 296 // documents with lower doc Ids. Therefore reject those docs too. 297 return 298 } 299 c.pqTop.Doc = doc + c.docBase 300 c.pqTop.Score = float32(score) 301 c.pqTop = c.pq.updateTop().(*ScoreDoc) 302 return 303 } 304 305 func (c *InOrderTopScoreDocCollector) AcceptsDocsOutOfOrder() bool { 306 return false 307 } 308 309 type OutOfOrderTopScoreDocCollector struct { 310 *TopScoreDocCollector 311 } 312 313 func newOutOfOrderTopScoreDocCollector(numHits int) *OutOfOrderTopScoreDocCollector { 314 return &OutOfOrderTopScoreDocCollector{ 315 TopScoreDocCollector: newTocScoreDocCollector(numHits), 316 } 317 } 318 319 func (c *OutOfOrderTopScoreDocCollector) Collect(doc int) (err error) { 320 var score float32 321 if score, err = c.scorer.Score(); err != nil { 322 return err 323 } 324 325 // This collector cannot handle NaN 326 assert(!math.IsNaN(float64(score))) 327 328 c.TotalHits++ 329 if score < c.pqTop.Score { 330 // Doesn't compete w/ bottom entry in queue 331 return nil 332 } 333 doc += c.docBase 334 if score == c.pqTop.Score && doc > c.pqTop.Doc { 335 // Break tie in score by doc ID: 336 return nil 337 } 338 c.pqTop.Doc = doc 339 c.pqTop.Score = score 340 c.pqTop = c.pq.updateTop().(*ScoreDoc) 341 return nil 342 } 343 344 func (c *OutOfOrderTopScoreDocCollector) AcceptsDocsOutOfOrder() bool { 345 return true 346 }