github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/search/collect.go (about)

     1  package search
     2  
     3  import (
     4  	"container/heap"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/index"
     7  	"math"
     8  )
     9  
    10  /** Holds one hit in {@link TopDocs}. */
    11  type ScoreDoc struct {
    12  	/** The score of this document for the query. */
    13  	Score float32
    14  	/** A hit document's number.
    15  	 * @see IndexSearcher#doc(int) */
    16  	Doc int
    17  	/** Only set by {@link TopDocs#merge} */
    18  	shardIndex int
    19  }
    20  
    21  func newScoreDoc(doc int, score float32) *ScoreDoc {
    22  	return newShardedScoreDoc(doc, score, -1)
    23  }
    24  
    25  func newShardedScoreDoc(doc int, score float32, shardIndex int) *ScoreDoc {
    26  	return &ScoreDoc{score, doc, shardIndex}
    27  }
    28  
    29  func (d *ScoreDoc) String() string {
    30  	return fmt.Sprintf("doc=%v score=%v shardIndex=%v", d.Doc, d.Score, d.shardIndex)
    31  }
    32  
    33  type PriorityQueue struct {
    34  	items []interface{}
    35  	less  func(i, j int) bool
    36  }
    37  
    38  func (pq PriorityQueue) Len() int            { return len(pq.items) }
    39  func (pq PriorityQueue) Less(i, j int) bool  { return pq.less(i, j) }
    40  func (pq PriorityQueue) Swap(i, j int)       { pq.items[i], pq.items[j] = pq.items[j], pq.items[i] }
    41  func (pq *PriorityQueue) Push(x interface{}) { pq.items = append(pq.items, x) }
    42  func (pq *PriorityQueue) Pop() interface{} {
    43  	n := pq.Len()
    44  	ans := pq.items[n-1]
    45  	pq.items = pq.items[0 : n-1]
    46  	return ans
    47  }
    48  func (pq *PriorityQueue) updateTop() interface{} {
    49  	heap.Fix(pq, 0)
    50  	return pq.items[0]
    51  }
    52  
// TopDocs is the result value produced by a TopDocsCollector.
//
// TotalHits is filled from the collector's hit counter, ScoreDocs holds the
// returned hits, and maxScore is the highest score seen (NaN when there are
// no results; see TopScoreDocCollector.newTopDocs). The field order matters:
// newTopDocs builds this struct with an unkeyed literal.
type TopDocs struct {
	TotalHits int
	ScoreDocs []*ScoreDoc
	maxScore  float64
}
    58  
// Collector receives the hits of a search. The notes below describe how the
// implementations in this file use each method; the full calling protocol is
// defined by the caller, which is not part of this file.
type Collector interface {
	// SetScorer supplies the Scorer whose Score() is read in Collect.
	SetScorer(s Scorer)
	// Collect is called with a document number; the implementations in this
	// file add the current reader's DocBase to it before storing the hit.
	Collect(doc int) error
	// SetNextReader announces the reader context that subsequent documents
	// belong to.
	SetNextReader(ctx *index.AtomicReaderContext)
	// AcceptsDocsOutOfOrder reports whether Collect may be called with
	// document numbers that are not increasing.
	AcceptsDocsOutOfOrder() bool
}
    65  
// search/TopDocsCollector.java
/**
 * TopDocsCollector is implemented by all collectors that return a TopDocs
 * output. In this file the shared behaviour lives in
 * abstractTopDocsCollector, which is built around a PriorityQueue and a
 * counter of the total number of hits.<br>
 * Note that abstractTopDocsCollector reads its priority queue
 * unconditionally (see topDocsSize), so a collector that embeds it must
 * supply a non-nil queue or override every method that touches it.
 */
type TopDocsCollector interface {
	Collector
	/** Returns the top docs that were collected by this collector. */
	TopDocs() TopDocs
	/**
	 * Returns the documents in the range [start .. start+howMany) that were
	 * collected by this collector. Note that if start >= the number of
	 * collected results, an empty TopDocs is returned, and if fewer than
	 * howMany results remain after start, then only the available documents
	 * are returned.<br>
	 * This method is useful when the search application paginates its
	 * results; it allocates only as many entries as requested by howMany.<br>
	 * <b>NOTE:</b> you cannot call this method more than once for each search
	 * execution, because the implementation in this file pops the hits off
	 * the priority queue. If you need more than one range, call TopDocs() and
	 * work with the returned TopDocs value, which contains all the results
	 * this search execution collected.
	 */
	TopDocsRange(start, howMany int) TopDocs
}
    98  
// TopDocsCreator groups the hooks abstractTopDocsCollector uses to turn the
// contents of its priority queue into a TopDocs value.
type TopDocsCreator interface {
	/**
	 * Populates the results slice with the ScoreDoc instances. This can be
	 * provided by the concrete collector in case a different ScoreDoc type
	 * should be returned.
	 */
	populateResults(results []*ScoreDoc, howMany int)
	/**
	 * Returns a TopDocs instance containing the given results. If
	 * results is nil it means there are no results to return, either
	 * because there were 0 calls to Collect() or because the arguments to
	 * TopDocsRange were invalid.
	 */
	newTopDocs(results []*ScoreDoc, start int) TopDocs
	/** The number of valid PQ entries */
	topDocsSize() int
}
   115  
// abstractTopDocsCollector carries the state and behaviour shared by the
// TopDocs collectors in this file. The embedded Collector and TopDocsCreator
// interfaces are set to the concrete collector by newTopDocsCollector, so
// calls such as newTopDocs reach the concrete type's implementation.
// TotalHits is incremented by the concrete collectors' Collect methods.
type abstractTopDocsCollector struct {
	Collector
	TopDocsCreator
	pq        *PriorityQueue // heap holding the candidate hits
	TotalHits int
}
   122  
   123  func newTopDocsCollector(self interface{}, pq *PriorityQueue) *abstractTopDocsCollector {
   124  	return &abstractTopDocsCollector{
   125  		Collector:      self.(Collector),
   126  		TopDocsCreator: self.(TopDocsCreator),
   127  		pq:             pq,
   128  	}
   129  }
   130  
// AcceptsDocsOutOfOrder always reports false for the base collector.
// Concrete collectors in this file define their own version of this method.
func (c *abstractTopDocsCollector) AcceptsDocsOutOfOrder() bool {
	return false
}
   134  
   135  func (c *abstractTopDocsCollector) populateResults(results []*ScoreDoc, howMany int) {
   136  	for i := howMany - 1; i >= 0; i-- {
   137  		results[i] = heap.Pop(c.pq).(*ScoreDoc)
   138  	}
   139  }
   140  
   141  func (c *abstractTopDocsCollector) topDocsSize() int {
   142  	// In case pq was populated with sentinel values, there might be less
   143  	// results than pq.size(). Therefore return all results until either
   144  	// pq.size() or totalHits.
   145  	if n := c.pq.Len(); c.TotalHits >= n {
   146  		return n
   147  	}
   148  	return c.TotalHits
   149  }
   150  
   151  func (c *abstractTopDocsCollector) TopDocs() TopDocs {
   152  	// In case pq was populated with sentinel values, there might be less
   153  	// results than pq.size(). Therefore return all results until either
   154  	// pq.size() or totalHits.
   155  	return c.TopDocsRange(0, c.topDocsSize())
   156  }
   157  
   158  func (c *abstractTopDocsCollector) TopDocsRange(start, howMany int) TopDocs {
   159  	// In case pq was populated with sentinel values, there might be less
   160  	// results than pq.size(). Therefore return all results until either
   161  	// pq.size() or totalHits.
   162  	size := c.topDocsSize()
   163  
   164  	// Don't bother to throw an exception, just return an empty TopDocs in case
   165  	// the parameters are invalid or out of range.
   166  	// TODO: shouldn't we throw IAE if apps give bad params here so they dont
   167  	// have sneaky silent bugs?
   168  	if start < 0 || start >= size || howMany <= 0 {
   169  		return c.newTopDocs(nil, start)
   170  	}
   171  
   172  	// We know that start < pqsize, so just fix howMany.
   173  	if size-start < howMany {
   174  		howMany = size - start
   175  	}
   176  	results := make([]*ScoreDoc, howMany)
   177  
   178  	// pq's pop() returns the 'least' element in the queue, therefore need
   179  	// to discard the first ones, until we reach the requested range.
   180  	// Note that this loop will usually not be executed, since the common usage
   181  	// should be that the caller asks for the last howMany results. However it's
   182  	// needed here for completeness.
   183  	for i := c.pq.Len() - start - howMany; i > 0; i-- {
   184  		heap.Pop(c.pq)
   185  	}
   186  
   187  	// Get the requested results from pq.
   188  	c.populateResults(results, howMany)
   189  
   190  	return c.newTopDocs(results, start)
   191  }
   192  
// TopScoreDocCollector is the common base of the score-ordered collectors in
// this file. pqTop is the queue entry that Collect overwrites when a new hit
// qualifies, docBase is the DocBase of the reader set by SetNextReader (added
// to every collected document number), and scorer is the Scorer set by
// SetScorer.
type TopScoreDocCollector struct {
	*abstractTopDocsCollector
	pqTop   *ScoreDoc
	docBase int
	scorer  Scorer
}
   199  
   200  func newTocScoreDocCollector(numHits int) *TopScoreDocCollector {
   201  	docs := make([]interface{}, numHits)
   202  	for i, _ := range docs {
   203  		docs[i] = newScoreDoc(math.MaxInt32, -math.MaxFloat32)
   204  	}
   205  	pq := &PriorityQueue{items: docs}
   206  	pq.less = func(i, j int) bool {
   207  		hitA := pq.items[i].(*ScoreDoc)
   208  		hitB := pq.items[j].(*ScoreDoc)
   209  		if hitA.Score == hitB.Score {
   210  			return hitA.Doc > hitB.Doc
   211  		}
   212  		return hitA.Score < hitB.Score
   213  	}
   214  	heap.Init(pq)
   215  
   216  	pqTop := heap.Pop(pq).(*ScoreDoc)
   217  	heap.Push(pq, pqTop)
   218  	c := &TopScoreDocCollector{pqTop: pqTop}
   219  	c.abstractTopDocsCollector = newTopDocsCollector(c, pq)
   220  	return c
   221  }
   222  
   223  func (c *TopScoreDocCollector) newTopDocs(results []*ScoreDoc, start int) TopDocs {
   224  	if results == nil {
   225  		return TopDocs{0, []*ScoreDoc{}, math.NaN()}
   226  	}
   227  
   228  	// We need to compute maxScore in order to set it in TopDocs. If start == 0,
   229  	// it means the largest element is already in results, use its score as
   230  	// maxScore. Otherwise pop everything else, until the largest element is
   231  	// extracted and use its score as maxScore.
   232  	maxScore := math.NaN()
   233  	if start == 0 {
   234  		maxScore = float64(results[0].Score)
   235  	} else {
   236  		pq := c.pq
   237  		for i := pq.Len(); i > 1; i-- {
   238  			heap.Pop(pq)
   239  		}
   240  		maxScore = float64(heap.Pop(pq).(ScoreDoc).Score)
   241  	}
   242  
   243  	return TopDocs{c.TotalHits, results, maxScore}
   244  }
   245  
// SetNextReader records the DocBase of the reader context that subsequent
// Collect calls refer to; Collect adds it to each document number it stores.
// ctx must not be nil.
func (c *TopScoreDocCollector) SetNextReader(ctx *index.AtomicReaderContext) {
	c.docBase = ctx.DocBase
}
   249  
// SetScorer stores the Scorer whose Score() the Collect methods in this file
// call for every collected document.
func (c *TopScoreDocCollector) SetScorer(scorer Scorer) {
	c.scorer = scorer
}
   253  
   254  func NewTopScoreDocCollector(numHits int, after *ScoreDoc, docsScoredInOrder bool) TopDocsCollector {
   255  	if numHits < 0 {
   256  		panic("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count")
   257  	}
   258  
   259  	if docsScoredInOrder {
   260  		if after == nil {
   261  			return newInOrderTopScoreDocCollector(numHits)
   262  		}
   263  		panic("not implemented yet")
   264  		// TODO support paging
   265  	} else {
   266  		if after == nil {
   267  			return newOutOfOrderTopScoreDocCollector(numHits)
   268  		}
   269  		panic("not implemented yet")
   270  	}
   271  }
   272  
// InOrderTopScoreDocCollector is the TopScoreDocCollector variant that
// assumes docs are scored in order (increasing doc number). Its Collect
// relies on that to reject hits whose score merely equals the current
// bottom entry of the queue.
type InOrderTopScoreDocCollector struct {
	*TopScoreDocCollector
}
   277  
   278  func newInOrderTopScoreDocCollector(numHits int) *InOrderTopScoreDocCollector {
   279  	return &InOrderTopScoreDocCollector{newTocScoreDocCollector(numHits)}
   280  }
   281  
   282  func (c *InOrderTopScoreDocCollector) Collect(doc int) (err error) {
   283  	score, err := c.scorer.Score()
   284  	if err != nil {
   285  		return err
   286  	}
   287  
   288  	// This collector cannot handle these scores:
   289  	assert(score != -math.MaxFloat32)
   290  	assert(!math.IsNaN(float64(score)))
   291  
   292  	c.TotalHits++
   293  	if score <= c.pqTop.Score {
   294  		// Since docs are returned in-order (i.e., increasing doc Id), a document
   295  		// with equal score to pqTop.score cannot compete since HitQueue favors
   296  		// documents with lower doc Ids. Therefore reject those docs too.
   297  		return
   298  	}
   299  	c.pqTop.Doc = doc + c.docBase
   300  	c.pqTop.Score = float32(score)
   301  	c.pqTop = c.pq.updateTop().(*ScoreDoc)
   302  	return
   303  }
   304  
// AcceptsDocsOutOfOrder reports false: this collector's Collect assumes
// documents arrive in increasing doc order.
func (c *InOrderTopScoreDocCollector) AcceptsDocsOutOfOrder() bool {
	return false
}
   308  
// OutOfOrderTopScoreDocCollector is the TopScoreDocCollector variant whose
// Collect does not assume any document order: score ties are resolved by
// comparing doc numbers explicitly.
type OutOfOrderTopScoreDocCollector struct {
	*TopScoreDocCollector
}
   312  
   313  func newOutOfOrderTopScoreDocCollector(numHits int) *OutOfOrderTopScoreDocCollector {
   314  	return &OutOfOrderTopScoreDocCollector{
   315  		TopScoreDocCollector: newTocScoreDocCollector(numHits),
   316  	}
   317  }
   318  
   319  func (c *OutOfOrderTopScoreDocCollector) Collect(doc int) (err error) {
   320  	var score float32
   321  	if score, err = c.scorer.Score(); err != nil {
   322  		return err
   323  	}
   324  
   325  	// This collector cannot handle NaN
   326  	assert(!math.IsNaN(float64(score)))
   327  
   328  	c.TotalHits++
   329  	if score < c.pqTop.Score {
   330  		// Doesn't compete w/ bottom entry in queue
   331  		return nil
   332  	}
   333  	doc += c.docBase
   334  	if score == c.pqTop.Score && doc > c.pqTop.Doc {
   335  		// Break tie in score by doc ID:
   336  		return nil
   337  	}
   338  	c.pqTop.Doc = doc
   339  	c.pqTop.Score = score
   340  	c.pqTop = c.pq.updateTop().(*ScoreDoc)
   341  	return nil
   342  }
   343  
// AcceptsDocsOutOfOrder reports true: this collector's Collect breaks score
// ties by doc number itself and does not depend on the arrival order.
func (c *OutOfOrderTopScoreDocCollector) AcceptsDocsOutOfOrder() bool {
	return true
}