github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/deleteStream.go (about)

     1  package index
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	. "github.com/balzaczyy/golucene/core/codec/spi"
     7  	"github.com/balzaczyy/golucene/core/store"
     8  	"github.com/balzaczyy/golucene/core/util"
     9  	"log"
    10  	"math"
    11  	"sort"
    12  	"sync"
    13  	"sync/atomic"
    14  	"time"
    15  )
    16  
    17  // index/BufferedUpdatesStream.java
    18  
// ApplyDeletesResult reports the outcome of one applyDeletesAndUpdates
// pass over a set of segments.
type ApplyDeletesResult struct {
	// True if any actual deletes took place:
	anyDeletes bool

	// Current gen, for the merged segment:
	gen int64

	// If non-nil, contains segments that are 100% deleted
	allDeleted []*SegmentCommitInfo
}
    29  
// SegInfoByDelGen orders segments by ascending BufferedUpdatesGen so
// delete packets can be matched against segments generation by generation.
// It implements sort.Interface.
type SegInfoByDelGen []*SegmentCommitInfo

func (a SegInfoByDelGen) Len() int           { return len(a) }
func (a SegInfoByDelGen) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
func (a SegInfoByDelGen) Less(i, j int) bool { return a[i].BufferedUpdatesGen < a[j].BufferedUpdatesGen }
    35  
// Query is a placeholder for delete-by-query keys; the concrete query
// types are not yet ported, so any value may serve as a key.
type Query interface{}

// QueryAndLimit pairs a delete-by-query with the doc-ID limit up to
// which it applies. NOTE(review): fields not yet ported from Lucene's
// BufferedUpdatesStream.QueryAndLimit.
type QueryAndLimit struct {
}
    40  
// index/CoalescedUpdates.java

// CoalescedUpdates accumulates term/query deletes and doc-values updates
// from multiple frozen packets so they can be applied to a segment in a
// single pass (see applyDeletesAndUpdates).
type CoalescedUpdates struct {
	// _queries maps each delete-by-query to an int limit.
	// NOTE(review): semantics of the int value presumed to be the docID
	// bound, per Lucene's CoalescedUpdates — confirm once queries() is ported.
	_queries         map[Query]int
	numericDVUpdates []*DocValuesUpdate
	binaryDVUpdates  []*DocValuesUpdate
}
    48  
    49  func newCoalescedUpdates() *CoalescedUpdates {
    50  	return &CoalescedUpdates{
    51  		_queries: make(map[Query]int),
    52  	}
    53  }
    54  
// String implements fmt.Stringer for debugging output.
// Not yet ported from Lucene's CoalescedUpdates.toString.
func (cd *CoalescedUpdates) String() string {
	panic("not implemented yet")
}
    58  
// update folds the deletes and doc-values updates of the given frozen
// packet into this accumulator (called while walking packets in
// applyDeletesAndUpdates). Not yet implemented.
func (cd *CoalescedUpdates) update(in *FrozenBufferedUpdates) {
	panic("not implemented yet")
}
    62  
// terms returns the accumulated delete-by-term keys, consumed by
// _applyTermDeletes. Not yet implemented.
func (cd *CoalescedUpdates) terms() []*Term {
	panic("not implemented yet")
}
    66  
// queries returns the accumulated delete-by-query entries, consumed by
// applyQueryDeletes. Not yet implemented.
func (cd *CoalescedUpdates) queries() []*QueryAndLimit {
	panic("not implemented yet")
}
    70  
/*
Tracks the stream of BufferedUpdates. When DocumentsWriterPerThread
flushes, its buffered deletes and updates are appended to this stream.
We later apply them (resolve them to the actual docIDs, per segment)
when a merge is started (only to the to-be-merged segments). We also
apply to all segments when NRT reader is pulled, commit/close is
called, or when too many deletes or updates are buffered and must be
flushed (by RAM usage or by count).

Each packet is assigned a generation, and each flushed or merged
segment is also assigned a generation, so we can track which
BufferedUpdates packets to apply to any given segment.
*/
type BufferedUpdatesStream struct {
	sync.Locker
	// TODO: maybe linked list?
	// Pending packets, ordered by ascending gen; pruned from the front
	// once every segment has advanced past a packet's gen.
	updates []*FrozenBufferedUpdates

	// Starts at 1 so that SegmentInfos that have never had deletes
	// applied (whose bufferedDelGen defaults to 0) will be correct:
	nextGen int64

	// used only by assert
	lastDeleteTerm *Term

	infoStream util.InfoStream
	bytesUsed  int64 // atomic; sum of bytesUsed over all pending packets
	numTerms   int32 // atomic; sum of numTermDeletes over all pending packets
}
   100  
   101  func newBufferedUpdatesStream(infoStream util.InfoStream) *BufferedUpdatesStream {
   102  	return &BufferedUpdatesStream{
   103  		Locker:     &sync.Mutex{},
   104  		updates:    make([]*FrozenBufferedUpdates, 0),
   105  		nextGen:    1,
   106  		infoStream: infoStream,
   107  	}
   108  }
   109  
   110  /* Appends a new packet of buffered deletes to the stream, setting its generation: */
   111  func (s *BufferedUpdatesStream) push(packet *FrozenBufferedUpdates) int64 {
   112  	panic("not implemented yet")
   113  }
   114  
   115  func (ds *BufferedUpdatesStream) clear() {
   116  	ds.Lock()
   117  	defer ds.Unlock()
   118  
   119  	ds.updates = nil
   120  	ds.nextGen = 1
   121  	atomic.StoreInt32(&ds.numTerms, 0)
   122  	atomic.StoreInt64(&ds.bytesUsed, 0)
   123  }
   124  
// any reports whether any deletes/updates are still buffered. bytesUsed
// is maintained alongside the packet list (see pruneUpdates), so a
// non-zero value means at least one packet is pending. Lock-free.
func (ds *BufferedUpdatesStream) any() bool {
	return atomic.LoadInt64(&ds.bytesUsed) != 0
}
   128  
// RamBytesUsed returns the bytes currently consumed by buffered
// delete/update packets. Read atomically; safe without holding the lock.
func (ds *BufferedUpdatesStream) RamBytesUsed() int64 {
	return atomic.LoadInt64(&ds.bytesUsed)
}
   132  
/*
Resolves the buffered deleted Term/Query/docIDs, into actual deleted
docIDs in the liveDocs MutableBits for each SegmentReader.

Walks segments (sorted by BufferedUpdatesGen) and pending packets in
lockstep from the newest end: packets newer than a segment are coalesced
and applied to it; a segment-private packet with a matching gen is
applied exactly to its own segment. Every visited segment is stamped
with this pass's generation.
*/
func (ds *BufferedUpdatesStream) applyDeletesAndUpdates(readerPool *ReaderPool, infos []*SegmentCommitInfo) (*ApplyDeletesResult, error) {
	ds.Lock()
	defer ds.Unlock()

	// No segments to apply to; still burn a generation so gens stay
	// monotonically increasing for callers.
	if len(infos) == 0 {
		ds.nextGen++
		return &ApplyDeletesResult{false, ds.nextGen - 1, nil}, nil
	}

	t0 := time.Now()
	ds.assertDeleteStats()
	if !ds.any() {
		// Nothing buffered (bytesUsed == 0): skip the whole pass.
		if ds.infoStream.IsEnabled("BD") {
			ds.infoStream.Message("BD", "applyDeletes: no deletes; skipping")
		}
		ds.nextGen++
		return &ApplyDeletesResult{false, ds.nextGen - 1, nil}, nil
	}

	if ds.infoStream.IsEnabled("BD") {
		ds.infoStream.Message("BD", "applyDeletes: infos=%v packetCount=%v", infos, len(ds.updates))
	}

	// Generation assigned to every segment processed in this pass.
	gen := ds.nextGen
	ds.nextGen++

	// Sort a copy (callers rely on infos' order) by buffered-updates gen.
	infos2 := make([]*SegmentCommitInfo, len(infos))
	copy(infos2, infos)
	sort.Sort(SegInfoByDelGen(infos2))

	var coalescedUpdates *CoalescedUpdates
	var anyNewDeletes bool

	// Two cursors walking from the highest gen backwards.
	infosIDX := len(infos2) - 1
	delIDX := len(ds.updates) - 1

	var allDeleted []*SegmentCommitInfo

	for infosIDX >= 0 {
		// NOTE(review): this and the other log/fmt prints below bypass
		// infoStream and write unconditionally — presumably leftover
		// debug tracing; confirm whether they should be removed or
		// routed through infoStream.
		log.Printf("BD: cycle delIDX=%v infoIDX=%v", delIDX, infosIDX)

		var packet *FrozenBufferedUpdates
		if delIDX >= 0 {
			packet = ds.updates[delIDX]
		}
		info := infos2[infosIDX]
		segGen := info.BufferedUpdatesGen

		if packet != nil && segGen < packet.gen {
			// Packet is newer than this segment: fold it into the
			// coalesced set to be applied to this and older segments.
			log.Println("  coalesce")
			if coalescedUpdates == nil {
				coalescedUpdates = newCoalescedUpdates()
			}
			if !packet.isSegmentPrivate {
				// Only coalesce if we are NOT on a segment private del
				// packet: the segment private del packet must only be
				// applied to segments with the same delGen. yet, if a
				// segment is already deleted from the SI since it had no
				// more documents remaining after some del packets younger
				// than its segPrivate packet (higher delGen) have been
				// applied, the segPrivate packet has not been removed.
				coalescedUpdates.update(packet)
			}
			delIDX--

		} else if packet != nil && segGen == packet.gen {
			// Exact gen match: must be this segment's private packet.
			assertn(packet.isSegmentPrivate,
				"Packet and Segments deletegen can only match on a segment private del packet gen=%v",
				segGen)
			log.Println("  eq")

			// Lock order: IW -> BD -> RP
			assert(readerPool.infoIsLive(info))
			rld := readerPool.get(info, true)
			reader, err := rld.reader(store.IO_CONTEXT_READ)
			if err != nil {
				return nil, err
			}
			// Closure scopes the reader/rld lifetime: both are released
			// in the defer regardless of which step fails, and release
			// errors are merged into err.
			delCount, segAllDeletes, err := func() (delCount int64, segAllDeletes bool, err error) {
				defer func() {
					err = mergeError(err, rld.release(reader))
					err = mergeError(err, readerPool.release(rld))
				}()
				dvUpdates := newDocValuesFieldUpdatesContainer()
				// First apply everything coalesced from newer packets...
				if coalescedUpdates != nil {
					fmt.Println("    del coalesced")
					var delta int64
					delta, err = ds._applyTermDeletes(coalescedUpdates.terms(), rld, reader)
					if err == nil {
						delCount += delta
						delta, err = applyQueryDeletes(coalescedUpdates.queries(), rld, reader)
						if err == nil {
							delCount += delta
							err = ds.applyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates)
							if err == nil {
								err = ds.applyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates)
							}
						}
					}
					if err != nil {
						return
					}
				}
				// ...then the segment-private packet itself.
				fmt.Println("    del exact")
				// Don't delete by Term here; DWPT already did that on flush:
				var delta int64
				delta, err = applyQueryDeletes(packet.queries(), rld, reader)
				if err == nil {
					delCount += delta
					err = ds.applyDocValuesUpdates(packet.numericDVUpdates, rld, reader, dvUpdates)
					if err == nil {
						err = ds.applyDocValuesUpdates(packet.binaryDVUpdates, rld, reader, dvUpdates)
						if err == nil && dvUpdates.any() {
							err = rld.writeFieldUpdates(info.Info.Dir, dvUpdates)
						}
					}
				}
				if err != nil {
					return
				}
				// Segment is 100% deleted when committed + pending
				// deletes cover every doc.
				fullDelCount := rld.info.DelCount() + rld.pendingDeleteCount()
				infoDocCount := rld.info.Info.DocCount()
				assert(fullDelCount <= infoDocCount)
				return delCount, fullDelCount == infoDocCount, nil
			}()
			if err != nil {
				return nil, err
			}
			anyNewDeletes = anyNewDeletes || (delCount > 0)

			if segAllDeletes {
				allDeleted = append(allDeleted, info)
			}

			if ds.infoStream.IsEnabled("BD") {
				var suffix string
				if segAllDeletes {
					// NOTE(review): suffix is passed as a %v argument,
					// not a format string, so "%%" prints literally as
					// two percent signs — confirm intended output.
					suffix = " 100%% deleted"
				}
				ds.infoStream.Message("BD", "Seg=%v segGen=%v segDeletes=[%v]; coalesced deletes=[%v] newDelCount=%v%v",
					info, segGen, packet, coalescedUpdates, delCount, suffix)
			}

			if coalescedUpdates == nil {
				coalescedUpdates = newCoalescedUpdates()
			}

			// Since we are on a segment private del packet we must not
			// update the CoalescedUpdates here! We can simply advance to
			// the next packet and seginfo.
			delIDX--
			infosIDX--
			info.SetBufferedUpdatesGen(gen)

		} else {
			// Segment's gen is newer than the current packet (or no
			// packets remain): only the coalesced set applies here.
			log.Println("  gt")

			if coalescedUpdates != nil {
				// Lock order: IW -> BD -> RP
				assert(readerPool.infoIsLive(info))
				rld := readerPool.get(info, true)
				reader, err := rld.reader(store.IO_CONTEXT_READ)
				if err != nil {
					return nil, err
				}
				delCount, segAllDeletes, err := func() (delCount int64, segAllDeletes bool, err error) {
					defer func() {
						err = mergeError(err, rld.release(reader))
						err = mergeError(err, readerPool.release(rld))
					}()
					var delta int64
					delta, err = ds._applyTermDeletes(coalescedUpdates.terms(), rld, reader)
					if err == nil {
						delCount += delta
						delta, err = applyQueryDeletes(coalescedUpdates.queries(), rld, reader)
						if err == nil {
							delCount += delta
							dvUpdates := newDocValuesFieldUpdatesContainer()
							err = ds.applyDocValuesUpdates(coalescedUpdates.numericDVUpdates, rld, reader, dvUpdates)
							if err == nil {
								err = ds.applyDocValuesUpdates(coalescedUpdates.binaryDVUpdates, rld, reader, dvUpdates)
								if err == nil && dvUpdates.any() {
									err = rld.writeFieldUpdates(info.Info.Dir, dvUpdates)
								}
							}
						}
					}
					if err != nil {
						return
					}

					fullDelCount := rld.info.DelCount() + rld.pendingDeleteCount()
					infoDocCount := rld.info.Info.DocCount()
					assert(fullDelCount <= infoDocCount)
					return delCount, fullDelCount == infoDocCount, nil
				}()
				if err != nil {
					return nil, err
				}
				anyNewDeletes = anyNewDeletes || (delCount > 0)

				if segAllDeletes {
					allDeleted = append(allDeleted, info)
				}

				if ds.infoStream.IsEnabled("BD") {
					var suffix string
					if segAllDeletes {
						suffix = " 100%% deleted"
					}
					ds.infoStream.Message("BD", "Seg=%v segGen=%v coalesced deletes=[%v] newDelCount=%v%v",
						info, segGen, coalescedUpdates, delCount, suffix)
				}
			}
			info.SetBufferedUpdatesGen(gen)

			infosIDX--
		}
	}

	ds.assertDeleteStats()
	if ds.infoStream.IsEnabled("BD") {
		ds.infoStream.Message("BD", "applyDeletes took %v", time.Now().Sub(t0))
	}

	return &ApplyDeletesResult{anyNewDeletes, gen, allDeleted}, nil
}
   364  
   365  func mergeError(err, err2 error) error {
   366  	if err == nil {
   367  		return err2
   368  	} else {
   369  		return errors.New(fmt.Sprintf("%v\n  %v", err, err2))
   370  	}
   371  }
   372  
   373  // Lock order IW -> BD
   374  /*
   375  Removes any BufferedUpdates that we no longer need to store because
   376  all segments in the index have had the deletes applied.
   377  */
   378  func (ds *BufferedUpdatesStream) prune(infos *SegmentInfos) {
   379  	ds.assertDeleteStats()
   380  	var minGen int64 = math.MaxInt64
   381  	for _, info := range infos.Segments {
   382  		if info.BufferedUpdatesGen < minGen {
   383  			minGen = info.BufferedUpdatesGen
   384  		}
   385  	}
   386  
   387  	if ds.infoStream.IsEnabled("BD") {
   388  		var dir store.Directory
   389  		if len(infos.Segments) > 0 {
   390  			dir = infos.Segments[0].Info.Dir
   391  		}
   392  		ds.infoStream.Message("BD", "prune sis=%v minGen=%v packetCount=%v",
   393  			infos.toString(dir), minGen, len(ds.updates))
   394  	}
   395  	for delIDX, update := range ds.updates {
   396  		if update.gen >= minGen {
   397  			ds.pruneUpdates(delIDX)
   398  			ds.assertDeleteStats()
   399  			return
   400  		}
   401  	}
   402  
   403  	// All deletes pruned
   404  	ds.pruneUpdates(len(ds.updates))
   405  	assert(!ds.any())
   406  	ds.assertDeleteStats()
   407  }
   408  
   409  func (ds *BufferedUpdatesStream) pruneUpdates(count int) {
   410  	if count > 0 {
   411  		if ds.infoStream.IsEnabled("BD") {
   412  			ds.infoStream.Message("BD", "pruneDeletes: prune %v packets; %v packets remain",
   413  				count, len(ds.updates)-count)
   414  		}
   415  		for delIDX := 0; delIDX < count; delIDX++ {
   416  			packet := ds.updates[delIDX]
   417  			n := atomic.AddInt32(&ds.numTerms, -int32(packet.numTermDeletes))
   418  			assert(n >= 0)
   419  			n2 := atomic.AddInt64(&ds.bytesUsed, -int64(packet.bytesUsed))
   420  			assert(n2 >= 0)
   421  			ds.updates[delIDX] = nil
   422  		}
   423  		ds.updates = ds.updates[count:]
   424  	}
   425  }
   426  
/* Delete by term */
// _applyTermDeletes resolves the given terms against reader and records
// the deletes via rld; callers add the returned count into their running
// delete total. Not yet implemented.
func (ds *BufferedUpdatesStream) _applyTermDeletes(terms []*Term,
	rld *ReadersAndUpdates, reader *SegmentReader) (int64, error) {
	panic("not implemented yet")
}
   432  
/* DocValues updates */
// applyDocValuesUpdates resolves the given doc-values updates against
// reader and accumulates them into dvUpdatesCntainer for a later
// writeFieldUpdates call. Not yet implemented.
func (ds *BufferedUpdatesStream) applyDocValuesUpdates(updates []*DocValuesUpdate,
	rld *ReadersAndUpdates, reader *SegmentReader,
	dvUpdatesCntainer *DocValuesFieldUpdatesContainer) error {
	panic("not implemented yet")
}
   439  
/* Delete by query */
// applyQueryDeletes resolves the given query/limit pairs against reader
// and records the deletes via rld; callers add the returned count into
// their running delete total. Not yet implemented.
func applyQueryDeletes(queries []*QueryAndLimit,
	rld *ReadersAndUpdates, reader *SegmentReader) (int64, error) {
	panic("not implemented yet")
}
   445  
   446  func (ds *BufferedUpdatesStream) assertDeleteStats() {
   447  	var numTerms2 int
   448  	var bytesUsed2 int64
   449  	for _, packet := range ds.updates {
   450  		numTerms2 += packet.numTermDeletes
   451  		bytesUsed2 += int64(packet.bytesUsed)
   452  	}
   453  	n1 := int(atomic.LoadInt32(&ds.numTerms))
   454  	assertn(numTerms2 == n1, "numTerms2=%v vs %v", numTerms2, n1)
   455  	n2 := int64(atomic.LoadInt64(&ds.bytesUsed))
   456  	assertn(bytesUsed2 == n2, "bytesUsed2=%v vs %v", bytesUsed2, n2)
   457  }