github.com/grailbio/base@v0.0.11/intervalmap/intervalmap.go

github.com/grailbio/base@v0.0.11/intervalmap/intervalmap.go (about)

     1  // Package intervalmap stores a set of (potentially overlapping) intervals.  It
     2  // supports searching for intervals that overlap user-provided interval.
     3  //
     4  // The implementation uses an 1-D version of Kd tree with randomized
     5  // surface-area heuristic
     6  // (http://www.sci.utah.edu/~wald/Publications/2007/ParallelBVHBuild/fastbuild.pdf).
     7  package intervalmap
     8  
     9  //go:generate ../gtl/generate_randomized_freepool.py --output=search_freepool --prefix=searcher --PREFIX=searcher -DELEM=*searcher --package=intervalmap
    10  
    11  import (
    12  	"bytes"
    13  	"encoding/gob"
    14  	"fmt"
    15  	"math"
    16  	"math/rand"
    17  	"runtime"
    18  	"unsafe"
    19  
    20  	"github.com/grailbio/base/log"
    21  	"github.com/grailbio/base/must"
    22  )
    23  
    24  // Key is the type for interval boundaries.
    25  type Key = int64
    26  
    27  // Interval defines an half-open interval, [Start, Limit).
    28  type Interval struct {
    29  	// Start is included
    30  	Start Key
    31  	// Limit is excluded.
    32  	Limit Key
    33  }
    34  
    35  var emptyInterval = Interval{math.MaxInt64, math.MinInt64}
    36  
    37  func min(x, y Key) Key {
    38  	if x < y {
    39  		return x
    40  	}
    41  	return y
    42  }
    43  
    44  func max(x, y Key) Key {
    45  	if x < y {
    46  		return y
    47  	}
    48  	return x
    49  }
    50  
    51  // Intersects checks if (i∩j) != ∅
    52  func (i Interval) Intersects(j Interval) bool {
    53  	return i.Limit > j.Start && j.Limit > i.Start
    54  }
    55  
    56  // Intersect computes i ∩ j.
    57  func (i Interval) Intersect(j Interval) Interval {
    58  	minKey := max(i.Start, j.Start)
    59  	maxKey := min(i.Limit, j.Limit)
    60  	return Interval{minKey, maxKey}
    61  }
    62  
    63  // Empty checks if the interval is empty.
    64  func (i Interval) Empty() bool { return i.Start >= i.Limit }
    65  
    66  // Span computes a minimal interval that spans over both i and j.  If either i
    67  // or j is an empty set, this function returns the other set.
    68  func (i Interval) Span(j Interval) Interval {
    69  	switch {
    70  	case i.Empty():
    71  		return j
    72  	case j.Empty():
    73  		return i
    74  	default:
    75  		return Interval{min(i.Start, j.Start), max(i.Limit, j.Limit)}
    76  	}
    77  }
    78  
    79  const (
    80  	maxEntsInNode = 16 // max size of node.ents.
    81  )
    82  
    83  // Entry represents one interval.
    84  type Entry struct {
    85  	// Interval defines a half-open interval, [Start,Limit)
    86  	Interval Interval
    87  	// Data is an arbitrary user-defined payload
    88  	Data interface{}
    89  }
    90  
    91  type entry struct {
    92  	Entry
    93  	id int // dense sequence number 0, 1, 2, ...
    94  }
    95  
    96  // node represents one node in Kdtree.
    97  type node struct {
    98  	bounds      Interval // interval covered by this node.
    99  	left, right *node    // children. Maybe nil.
   100  	ents        []*entry // Nonempty iff. left=nil&&right=nil.
   101  	label       string   // for debugging only.
   102  }
   103  
   104  // TreeStats shows tree-wide stats.
   105  type TreeStats struct {
   106  	// Nodes is the total number of tree nodes.
   107  	Nodes int
   108  	// Nodes is the total number of leaf nodes.
   109  	//
   110  	// Invariant: LeafNodes < Nodes
   111  	LeafNodes int
   112  	// MaxDepth is the max depth of the tree.
   113  	MaxDepth int
   114  	// MaxLeafNodeSize is the maximum len(node.ents) of all nodes in the tree.
   115  	MaxLeafNodeSize int
   116  	// TotalLeafDepth is the sum of depth of all leaf nodes.
   117  	TotalLeafDepth int
   118  	// TotalLeafDepth is the sum of len(node.ents) of all leaf nodes.
   119  	TotalLeafNodeSize int
   120  }
   121  
   122  // T represents the intervalmap. It must be created using New().
   123  type T struct {
   124  	root  node
   125  	stats TreeStats
   126  	pool  *searcherFreePool
   127  }
   128  
   129  // New creates a new tree with the given set of entries.  The intervals may
   130  // overlap, and they need not be sorted.
   131  func New(ents []Entry) *T {
   132  	entsCopy := make([]entry, len(ents))
   133  	for i := range ents {
   134  		entsCopy[i] = entry{Entry: ents[i], id: i}
   135  	}
   136  	ients := make([]*entry, len(ents))
   137  	for i := range entsCopy {
   138  		ients[i] = &entsCopy[i]
   139  	}
   140  	r := rand.New(rand.NewSource(0))
   141  	t := &T{}
   142  	t.stats.MaxDepth = -1
   143  	t.stats.MaxLeafNodeSize = -1
   144  	t.root.init("", ients, keyRange(ients), r, &t.stats)
   145  	t.pool = newSearcherFreePool(t, len(ents))
   146  	return t
   147  }
   148  
   149  func newSearcherFreePool(t *T, nEnt int) *searcherFreePool {
   150  	return NewsearcherFreePool(func() *searcher {
   151  		return &searcher{
   152  			tree: t,
   153  			hits: make([]uint32, nEnt),
   154  		}
   155  	}, runtime.NumCPU()*2)
   156  }
   157  
   158  // searcher keeps state needed during one search episode.  It is owned by one
   159  // goroutine.
   160  type searcher struct {
   161  	tree     *T
   162  	searchID uint32   // increments on every search
   163  	hits     []uint32 // hits[i] == searchID if the i'th entry has already been visited
   164  }
   165  
   166  func (s *searcher) visit(i int) bool {
   167  	if s.hits[i] != s.searchID {
   168  		s.hits[i] = s.searchID
   169  		return true
   170  	}
   171  	return false
   172  }
   173  
   174  // Stats returns tree-wide stats.
   175  func (t *T) Stats() TreeStats { return t.stats }
   176  
   177  // Get finds all the entries that intersect the given interval and return them
   178  // in *ents.
   179  func (t *T) Get(interval Interval, ents *[]*Entry) {
   180  	s := t.pool.Get()
   181  	s.searchID++
   182  	*ents = (*ents)[:0]
   183  	t.root.get(interval, ents, s)
   184  	if s.searchID < math.MaxUint32 {
   185  		t.pool.Put(s)
   186  	}
   187  }
   188  
   189  // Any checks if any of the entries intersect the given interval.
   190  func (t *T) Any(interval Interval) bool {
   191  	s := t.pool.Get()
   192  	s.searchID++
   193  	found := t.root.any(interval, s)
   194  	if s.searchID < math.MaxUint32 {
   195  		t.pool.Put(s)
   196  	}
   197  	return found
   198  }
   199  
   200  func keyRange(ents []*entry) Interval {
   201  	i := emptyInterval
   202  	for _, e := range ents {
   203  		i = i.Span(e.Interval)
   204  	}
   205  	return i
   206  }
   207  
   208  const maxSample = 8
   209  
   210  // randomSample picks maxSample random elements from ents[]. It shuffles ents[]
   211  // in place.
   212  func randomSample(ents []*entry, r *rand.Rand) []*entry {
   213  	if len(ents) <= maxSample {
   214  		return ents
   215  	}
   216  	shuffleFirstN := func(n int) { // Fisher-Yates shuffle
   217  		for i := 0; i < n-1; i++ {
   218  			j := i + r.Intn(len(ents)-i)
   219  			ents[i], ents[j] = ents[j], ents[i]
   220  		}
   221  	}
   222  	n := maxSample
   223  	if len(ents)-n < n {
   224  		// When maxSample < len(n) < maxSample*2, it's faster to compute the
   225  		// complement set.
   226  		n = len(ents) - n
   227  		shuffleFirstN(len(ents) - n)
   228  		return ents[n:]
   229  	}
   230  	shuffleFirstN(n)
   231  	return ents[:n]
   232  }
   233  
   234  // This function splits interval "bounds" into two balanced subintervals,
   235  // [bounds.Start, mid) and [mid, bounds.Limit). left (right) will store a subset
   236  // of ents[] that fits in the first (second, resp) subinterval. Note that an
   237  // entry in ents[] may belong to both left and right, if the entry spans over
   238  // the midpoint.
   239  //
   240  // Ok=false if this function fails to find a good split point.
   241  func split(label string, ents []*entry, bounds Interval, r *rand.Rand) (mid Key, left []*entry, right []*entry, ok bool) {
   242  	// A good interval split point is guaranteed to be at one of the interval
   243  	// endpoints.  To bound the compute time, we sample up to 16 intervals in
   244  	// ents[], and examine their endpoints one by one.
   245  	sample := randomSample(ents, r)
   246  	sampleRange := keyRange(sample).Intersect(bounds)
   247  	log.Debug.Printf("%s: Split %+v, %d ents", label, sampleRange, len(ents))
   248  	if sampleRange.Empty() {
   249  		panic(sample)
   250  	}
   251  	var (
   252  		candidates [maxSample * 2]Key
   253  		nCandidate int
   254  	)
   255  	for i, e := range sample {
   256  		candidates[i*2] = e.Interval.Start
   257  		candidates[i*2+1] = e.Interval.Limit
   258  		nCandidate += 2
   259  	}
   260  
   261  	// splitAt splits ents[] into two subsets, assuming bounds is split at mid.
   262  	splitAt := func(ents []*entry, mid Key, left, right *[]*entry) {
   263  		*left = (*left)[:0]
   264  		*right = (*right)[:0]
   265  		for _, e := range ents {
   266  			if e.Interval.Intersects(Interval{bounds.Start, mid}) {
   267  				*left = append(*left, e)
   268  			}
   269  			if e.Interval.Intersects(Interval{mid, bounds.Limit}) {
   270  				*right = append(*right, e)
   271  			}
   272  		}
   273  	}
   274  
   275  	// Compute the cost of splitting at each of candidates[].
   276  	// We use the surface-area heuristics. The best explanation is in
   277  	// the following paper:
   278  	//
   279  	// Ingo Wald, Realtime ray tracing and interactive global illumination,
   280  	// http://www.sci.utah.edu/~wald/Publications/2004/PhD/phd.pdf
   281  	//
   282  	// The basic idea is the following:
   283  	//
   284  	// - Assume we split the parent interval [s, e) into two intervals
   285  	//   [s,m) and [m,e)
   286  	//
   287  	// - The cost C(x) of searching a subinterval x is roughly
   288  	//    C(x) = (length of x) * (# of entries that intersect x).
   289  	//
   290  	//    The first term is the probability that a query hits the subinterval, and
   291  	//    the 2nd term is the cost of searching inside the subinterval.
   292  	//
   293  	//    This assumes that a query is distributed uniformly over the domain (in
   294  	//    our case, [-maxint32, maxint32].
   295  	//
   296  	// - The best split point is m that minimizes C([s,m)) + C([m,e))
   297  	minCost := math.MaxFloat64
   298  	var minMid Key
   299  	var minLeft, minRight []*entry
   300  	var tmpLeft, tmpRight []*entry
   301  
   302  	for _, mid := range candidates[:nCandidate] {
   303  		splitAt(ents, mid, &tmpLeft, &tmpRight)
   304  		if len(tmpLeft) == 0 || len(tmpRight) == 0 {
   305  			continue
   306  		}
   307  		cost := float64(len(tmpLeft))*float64(mid-sampleRange.Start) +
   308  			float64(len(tmpRight))*float64(sampleRange.Limit-mid)
   309  		if cost < minCost {
   310  			minMid = mid
   311  			minLeft, tmpLeft = tmpLeft, minLeft
   312  			minRight, tmpRight = tmpRight, minRight
   313  			minCost = cost
   314  		}
   315  	}
   316  	if minCost == math.MaxFloat64 || len(minLeft) == len(ents) || len(minRight) == len(ents) {
   317  		return
   318  	}
   319  	mid = minMid
   320  	left = minLeft
   321  	right = minRight
   322  	ok = true
   323  	return
   324  }
   325  
   326  func (n *node) init(label string, ents []*entry, bounds Interval, r *rand.Rand, stats *TreeStats) {
   327  	defer func() {
   328  		// Update the stats.
   329  		stats.Nodes++
   330  		depth := len(n.label)
   331  		if depth > stats.MaxDepth {
   332  			stats.MaxDepth = depth
   333  		}
   334  		if e := len(n.ents); e > 0 { // Leaf node
   335  			stats.LeafNodes++
   336  			stats.TotalLeafNodeSize += e
   337  			stats.TotalLeafDepth += depth
   338  			if e > stats.MaxLeafNodeSize {
   339  				stats.MaxLeafNodeSize = e
   340  			}
   341  		}
   342  	}()
   343  
   344  	n.label = label
   345  	n.bounds = bounds
   346  	if len(ents) <= maxEntsInNode {
   347  		n.ents = ents
   348  		return
   349  	}
   350  	mid, left, right, ok := split(n.label, ents, bounds, r)
   351  	if !ok {
   352  		n.ents = ents
   353  		return
   354  	}
   355  	n.left = &node{}
   356  
   357  	leftInterval := Interval{n.bounds.Start, mid}
   358  	leftKR := keyRange(left)
   359  	log.Debug.Printf("%v (bounds %v): left %v %v %v", n.label, n.bounds, leftKR, leftInterval, leftKR.Intersect(leftInterval))
   360  	n.left.init(label+"L", left, leftKR.Intersect(leftInterval), r, stats)
   361  	n.right = &node{}
   362  	n.right.init(label+"R", right, keyRange(right).Intersect(Interval{mid, n.bounds.Limit}), r, stats)
   363  }
   364  
   365  func addEntry(ents *[]*Entry, e *entry, s *searcher) {
   366  	if s.visit(e.id) {
   367  		*ents = append(*ents, (*Entry)(unsafe.Pointer(e)))
   368  	}
   369  }
   370  
   371  func (n *node) get(interval Interval, ents *[]*Entry, s *searcher) {
   372  	interval = interval.Intersect(n.bounds)
   373  	if interval.Empty() {
   374  		return
   375  	}
   376  	if len(n.ents) > 0 { // Leaf node
   377  		for _, e := range n.ents {
   378  			if interval.Intersects(e.Interval) {
   379  				addEntry(ents, e, s)
   380  			}
   381  		}
   382  		return
   383  	}
   384  	n.left.get(interval, ents, s)
   385  	n.right.get(interval, ents, s)
   386  }
   387  
   388  func (n *node) any(interval Interval, s *searcher) bool {
   389  	interval = interval.Intersect(n.bounds)
   390  	if interval.Empty() {
   391  		return false
   392  	}
   393  	if len(n.ents) > 0 { // Leaf node
   394  		for _, e := range n.ents {
   395  			if interval.Intersects(e.Interval) {
   396  				return true
   397  			}
   398  		}
   399  		return false
   400  	}
   401  	found := n.left.any(interval, s)
   402  	if !found {
   403  		found = n.right.any(interval, s)
   404  	}
   405  	return found
   406  }
   407  
   408  // GOB support
   409  
   410  const gobFormatVersion = 1
   411  
   412  // MarshalBinary implements encoding.BinaryMarshaler interface.  It allows T to
   413  // be encoded and decoded using Gob.
   414  func (t *T) MarshalBinary() (data []byte, err error) {
   415  	buf := bytes.Buffer{}
   416  	e := gob.NewEncoder(&buf)
   417  	must.Nil(e.Encode(gobFormatVersion))
   418  	marshalNode(e, &t.root)
   419  	must.Nil(e.Encode(t.stats))
   420  	return buf.Bytes(), nil
   421  }
   422  
   423  func marshalNode(e *gob.Encoder, n *node) {
   424  	if n == nil {
   425  		must.Nil(e.Encode(false))
   426  		return
   427  	}
   428  	must.Nil(e.Encode(true))
   429  	must.Nil(e.Encode(n.bounds))
   430  	marshalNode(e, n.left)
   431  	marshalNode(e, n.right)
   432  	must.Nil(e.Encode(len(n.ents)))
   433  	for _, ent := range n.ents {
   434  		must.Nil(e.Encode(ent.Entry))
   435  		must.Nil(e.Encode(ent.id))
   436  	}
   437  	must.Nil(e.Encode(n.label))
   438  }
   439  
   440  // UnmarshalBinary implements encoding.BinaryUnmarshaler interface.
   441  // It allows T to be encoded and decoded using Gob.
   442  func (t *T) UnmarshalBinary(data []byte) error {
   443  	buf := bytes.NewReader(data)
   444  	d := gob.NewDecoder(buf)
   445  	var version int
   446  	if err := d.Decode(&version); err != nil {
   447  		return err
   448  	}
   449  	if version != gobFormatVersion {
   450  		return fmt.Errorf("gob decode: got version %d, want %d", version, gobFormatVersion)
   451  	}
   452  	var (
   453  		maxid = -1
   454  		err   error
   455  		root  *node
   456  	)
   457  	if root, err = unmarshalNode(d, &maxid); err != nil {
   458  		return err
   459  	}
   460  	t.root = *root
   461  	if err := d.Decode(&t.stats); err != nil {
   462  		return err
   463  	}
   464  	t.pool = newSearcherFreePool(t, maxid+1)
   465  	return nil
   466  }
   467  
   468  func unmarshalNode(d *gob.Decoder, maxid *int) (*node, error) {
   469  	var (
   470  		exist bool
   471  		err   error
   472  	)
   473  	if err = d.Decode(&exist); err != nil {
   474  		return nil, err
   475  	}
   476  	if !exist {
   477  		return nil, nil
   478  	}
   479  	n := &node{}
   480  	if err := d.Decode(&n.bounds); err != nil {
   481  		return nil, err
   482  	}
   483  	if n.left, err = unmarshalNode(d, maxid); err != nil {
   484  		return nil, err
   485  	}
   486  	if n.right, err = unmarshalNode(d, maxid); err != nil {
   487  		return nil, err
   488  	}
   489  	var nEnt int
   490  	if err := d.Decode(&nEnt); err != nil {
   491  		return nil, err
   492  	}
   493  	n.ents = make([]*entry, nEnt)
   494  	for i := 0; i < nEnt; i++ {
   495  		n.ents[i] = &entry{}
   496  		if err := d.Decode(&n.ents[i].Entry); err != nil {
   497  			return nil, err
   498  		}
   499  		if err := d.Decode(&n.ents[i].id); err != nil {
   500  			return nil, err
   501  		}
   502  		if n.ents[i].id > *maxid {
   503  			*maxid = n.ents[i].id
   504  		}
   505  	}
   506  	if err := d.Decode(&n.label); err != nil {
   507  		return nil, err
   508  	}
   509  	return n, nil
   510  }