github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/geo/geoindex/index.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package geoindex
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"strings"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/geo"
    20  	"github.com/golang/geo/s2"
    21  )
    22  
    23  // Interfaces for accelerating certain spatial operations, by allowing the
    24  // caller to use an externally stored index.
    25  //
    26  // An index interface has methods that specify what to write or read from the
    27  // stored index. It is the caller's responsibility to do the actual writes and
    28  // reads. Index keys are represented using uint64 which implies that the index
    29  // is transforming the 2D space (spherical or planar) to 1D space, say using a
    30  // space-filling curve. The index can return false positives so query
    31  // execution must use an exact filtering step after using the index. The
    32  // current implementations use the S2 geometry library.
    33  //
    34  // The externally stored index must support key-value semantics and range
    35  // queries over keys. The return values from Covers/CoveredBy/Intersects are
    36  // specialized to the computation that needs to be performed:
    37  // - Intersects needs to union (a) all the ranges corresponding to subtrees
    38  //   rooted at the covering of g, (b) all the parent nodes of the covering
    39  //   of g. The individual entries in (b) are representable as ranges of
    40  //   length 1. All of this is represented as UnionKeySpans. Covers, which
    41  //   is the shape that g covers, currently delegates to Interects so
    42  //   returns the same.
    43  //
    44  // - CoveredBy, which are the shapes that g is covered-by, needs to compute
    45  //   on the paths from each covering node to the root. For example, consider
    46  //   a quad-tree approach to dividing the space, where the nodes/cells
    47  //   covering g are c53, c61, c64. The corresponding ancestor path sets are
    48  //   {c53, c13, c3, c0}, {c61, c15, c3, c0}, {c64, c15, c3, c0}. Let I(c53)
    49  //   represent the index entries for c53. Then,
    50  //   I(c53) \union I(c13) \union I(c3) \union I(c0)
    51  //   represents all the shapes that cover cell c53. Similar union expressions
    52  //   can be constructed for all these paths. The computation needs to
    53  //   intersect these unions since all these cells need to be covered by a
    54  //   shape that covers g. One can extract the common sub-expressions to give
    55  //   I(c0) \union I(c3) \union
    56  //    ((I(c13) \union I(c53)) \intersection
    57  //     (I(c15) \union (I(c61) \intersection I(c64)))
    58  //   CoveredBy returns this factored expression in Reverse Polish notation.
    59  
    60  // GeographyIndex is an index over the unit sphere.
    61  type GeographyIndex interface {
    62  	// InvertedIndexKeys returns the keys to store this object under when adding
    63  	// it to the index.
    64  	InvertedIndexKeys(c context.Context, g *geo.Geography) ([]Key, error)
    65  
    66  	// Acceleration for topological relationships (see
    67  	// https://postgis.net/docs/reference.html#Spatial_Relationships). Distance
    68  	// relationships can be accelerated by adjusting g before calling these
    69  	// functions. Bounding box operators are not accelerated since we do not
    70  	// index the bounding box -- bounding box queries are an implementation
    71  	// detail of a particular indexing approach in PostGIS and are not part of
    72  	// the OGC or SQL/MM specs.
    73  
    74  	// Covers returns the index spans to read and union for the relationship
    75  	// ST_Covers(g, x), where x are the indexed geometries.
    76  	Covers(c context.Context, g *geo.Geography) (UnionKeySpans, error)
    77  
    78  	// CoveredBy returns the index entries to read and the expression to compute
    79  	// for ST_CoveredBy(g, x), where x are the indexed geometries.
    80  	CoveredBy(c context.Context, g *geo.Geography) (RPKeyExpr, error)
    81  
    82  	// Intersects returns the index spans to read and union for the relationship
    83  	// ST_Intersects(g, x), where x are the indexed geometries.
    84  	Intersects(c context.Context, g *geo.Geography) (UnionKeySpans, error)
    85  
    86  	// TestingInnerCovering returns an inner covering of g.
    87  	TestingInnerCovering(g *geo.Geography) s2.CellUnion
    88  }
    89  
    90  // GeometryIndex is an index over 2D cartesian coordinates.
    91  type GeometryIndex interface {
    92  	// InvertedIndexKeys returns the keys to store this object under when adding
    93  	// it to the index.
    94  	InvertedIndexKeys(c context.Context, g *geo.Geometry) ([]Key, error)
    95  
    96  	// Acceleration for topological relationships (see
    97  	// https://postgis.net/docs/reference.html#Spatial_Relationships). Distance
    98  	// relationships can be accelerated by adjusting g before calling these
    99  	// functions. Bounding box operators are not accelerated since we do not
   100  	// index the bounding box -- bounding box queries are an implementation
   101  	// detail of a particular indexing approach in PostGIS and are not part of
   102  	// the OGC or SQL/MM specs.
   103  
   104  	// Covers returns the index spans to read and union for the relationship
   105  	// ST_Covers(g, x), where x are the indexed geometries.
   106  	Covers(c context.Context, g *geo.Geometry) (UnionKeySpans, error)
   107  
   108  	// CoveredBy returns the index entries to read and the expression to compute
   109  	// for ST_CoveredBy(g, x), where x are the indexed geometries.
   110  	CoveredBy(c context.Context, g *geo.Geometry) (RPKeyExpr, error)
   111  
   112  	// Intersects returns the index spans to read and union for the relationship
   113  	// ST_Intersects(g, x), where x are the indexed geometries.
   114  	Intersects(c context.Context, g *geo.Geometry) (UnionKeySpans, error)
   115  
   116  	// TestingInnerCovering returns an inner covering of g.
   117  	TestingInnerCovering(g *geo.Geometry) s2.CellUnion
   118  }
   119  
   120  // RelationshipType stores a type of geospatial relationship query that can
   121  // be accelerated using an index.
   122  type RelationshipType uint8
   123  
   124  const (
   125  	// Covers corresponds to the relationship in which one geospatial object
   126  	// covers another geospatial object.
   127  	Covers RelationshipType = (1 << iota)
   128  
   129  	// CoveredBy corresponds to the relationship in which one geospatial object
   130  	// is covered by another geospatial object.
   131  	CoveredBy
   132  
   133  	// Intersects corresponds to the relationship in which one geospatial object
   134  	// intersects another geospatial object.
   135  	Intersects
   136  )
   137  
   138  var geoRelationshipTypeStr = map[RelationshipType]string{
   139  	Covers:     "covers",
   140  	CoveredBy:  "covered by",
   141  	Intersects: "intersects",
   142  }
   143  
   144  func (gr RelationshipType) String() string {
   145  	return geoRelationshipTypeStr[gr]
   146  }
   147  
   148  // IsEmptyConfig returns whether the given config contains a geospatial index
   149  // configuration.
   150  func IsEmptyConfig(cfg *Config) bool {
   151  	if cfg == nil {
   152  		return true
   153  	}
   154  	return cfg.S2Geography == nil && cfg.S2Geometry == nil
   155  }
   156  
   157  // IsGeographyConfig returns whether the config is a geography geospatial
   158  // index configuration.
   159  func IsGeographyConfig(cfg *Config) bool {
   160  	if cfg == nil {
   161  		return false
   162  	}
   163  	return cfg.S2Geography != nil
   164  }
   165  
   166  // IsGeometryConfig returns whether the config is a geometry geospatial
   167  // index configuration.
   168  func IsGeometryConfig(cfg *Config) bool {
   169  	if cfg == nil {
   170  		return false
   171  	}
   172  	return cfg.S2Geometry != nil
   173  }
   174  
   175  // Key is one entry under which a geospatial shape is stored on behalf of an
   176  // Index. The index is of the form (Key, Primary Key).
   177  type Key uint64
   178  
   179  // rpExprElement implements the RPExprElement interface.
   180  func (k Key) rpExprElement() {}
   181  
   182  func (k Key) String() string {
   183  	c := s2.CellID(k)
   184  	if !c.IsValid() {
   185  		return "spilled"
   186  	}
   187  	var b strings.Builder
   188  	b.WriteByte('F')
   189  	b.WriteByte("012345"[c.Face()])
   190  	fmt.Fprintf(&b, "/L%d/", c.Level())
   191  	for level := 1; level <= c.Level(); level++ {
   192  		b.WriteByte("0123"[c.ChildPosition(level)])
   193  	}
   194  	return b.String()
   195  }
   196  
   197  // KeySpan represents a range of Keys.
   198  type KeySpan struct {
   199  	// Both Start and End are inclusive, i.e., [Start, End].
   200  	Start, End Key
   201  }
   202  
   203  // UnionKeySpans is the set of indexed spans to retrieve and combine via set
   204  // union. The spans are guaranteed to be non-overlapping. Duplicate primary
   205  // keys will not be retrieved by any individual key, but they may be present
   206  // if more than one key is retrieved (including duplicates in a single span
   207  // where End - Start > 1).
   208  type UnionKeySpans []KeySpan
   209  
   210  func (s UnionKeySpans) String() string {
   211  	return s.toString(math.MaxInt32)
   212  }
   213  
   214  func (s UnionKeySpans) toString(wrap int) string {
   215  	b := newStringBuilderWithWrap(&strings.Builder{}, wrap)
   216  	for i, span := range s {
   217  		if span.Start == span.End {
   218  			fmt.Fprintf(b, "%s", span.Start)
   219  		} else {
   220  			fmt.Fprintf(b, "[%s, %s]", span.Start, span.End)
   221  		}
   222  		if i != len(s)-1 {
   223  			b.WriteString(", ")
   224  		}
   225  		b.tryWrap()
   226  	}
   227  	return b.String()
   228  }
   229  
   230  // RPExprElement is an element in the Reverse Polish notation expression.
   231  // It is implemented by Key and RPSetOperator.
   232  type RPExprElement interface {
   233  	rpExprElement()
   234  }
   235  
   236  // RPSetOperator is a set operator in the Reverse Polish notation expression.
   237  type RPSetOperator int
   238  
   239  const (
   240  	// RPSetUnion is the union operator.
   241  	RPSetUnion RPSetOperator = iota + 1
   242  
   243  	// RPSetIntersection is the intersection operator.
   244  	RPSetIntersection
   245  )
   246  
   247  // rpExprElement implements the RPExprElement interface.
   248  func (o RPSetOperator) rpExprElement() {}
   249  
   250  // RPKeyExpr is an expression to evaluate over primary keys retrieved for
   251  // index keys. If we view each index key as a posting list of primary keys,
   252  // the expression involves union and intersection over the sets represented by
   253  // each posting list. For S2, this expression represents an intersection of
   254  // ancestors of different keys (cell ids) and is likely to contain many common
   255  // keys. This special structure allows us to efficiently and easily eliminate
   256  // common sub-expressions, hence the interface presents the factored
   257  // expression. The expression is represented in Reverse Polish notation.
   258  type RPKeyExpr []RPExprElement
   259  
   260  func (x RPKeyExpr) String() string {
   261  	var elements []string
   262  	for _, e := range x {
   263  		switch elem := e.(type) {
   264  		case Key:
   265  			elements = append(elements, elem.String())
   266  		case RPSetOperator:
   267  			switch elem {
   268  			case RPSetUnion:
   269  				elements = append(elements, `\U`)
   270  			case RPSetIntersection:
   271  				elements = append(elements, `\I`)
   272  			}
   273  		}
   274  	}
   275  	return strings.Join(elements, " ")
   276  }
   277  
   278  // Helper functions for index implementations that use the S2 geometry
   279  // library.
   280  
   281  func covering(rc *s2.RegionCoverer, regions []s2.Region) s2.CellUnion {
   282  	// TODO(sumeer): Add a max cells constraint for the whole covering,
   283  	// to respect the index configuration.
   284  	var u s2.CellUnion
   285  	for _, r := range regions {
   286  		u = append(u, rc.Covering(r)...)
   287  	}
   288  	// Ensure the cells are non-overlapping.
   289  	u.Normalize()
   290  	return u
   291  }
   292  
   293  // The "inner covering", for shape s, represented by the regions parameter, is
   294  // used to find shapes that contain shape s. A regular covering that includes
   295  // a cell c not completely covered by shape s could result in false negatives,
   296  // since shape x that covers shape s could use a finer cell covering (using
   297  // cells below c). For example, consider a portion of the cell quad-tree
   298  // below:
   299  //
   300  //                      c0
   301  //                      |
   302  //                      c3
   303  //                      |
   304  //                  +---+---+
   305  //                  |       |
   306  //                 c13      c15
   307  //                  |       |
   308  //                 c53   +--+--+
   309  //                       |     |
   310  //                      c61    c64
   311  //
   312  // Shape s could have a regular covering c15, c53, where c15 has 4 child cells
   313  // c61..c64, and shape s only intersects wit c61, c64. A different shape x
   314  // that covers shape s may have a covering c61, c64, c53. That is, it has used
   315  // the finer cells c61, c64. If we used both regular coverings it is hard to
   316  // know that x covers g. Hence, we compute the "inner covering" of g (defined
   317  // below).
   318  //
   319  // The interior covering of shape s includes only cells covered by s. This is
   320  // computed by RegionCoverer.InteriorCovering() and is intuitively what we
   321  // need. But the interior covering is naturally empty for points and lines
   322  // (and can be empty for polygons too), and an empty covering is not useful
   323  // for constraining index lookups. We observe that leaf cells that intersect
   324  // shape s can be used in the covering, since the covering of shape x must
   325  // also cover these cells. This allows us to compute non-empty coverings for
   326  // all shapes. Since this is not technically the interior covering, we use the
   327  // term "inner covering".
   328  func innerCovering(rc *s2.RegionCoverer, regions []s2.Region) s2.CellUnion {
   329  	var u s2.CellUnion
   330  	for _, r := range regions {
   331  		switch region := r.(type) {
   332  		case s2.Point:
   333  			cellID := s2.CellFromPoint(region).ID()
   334  			if !cellID.IsLeaf() {
   335  				panic("bug in S2")
   336  			}
   337  			u = append(u, cellID)
   338  		case *s2.Polyline:
   339  			// TODO(sumeer): for long lines could also pick some intermediate
   340  			// points along the line. Decide based on experiments.
   341  			for _, p := range *region {
   342  				cellID := s2.CellFromPoint(p).ID()
   343  				u = append(u, cellID)
   344  			}
   345  		case *s2.Polygon:
   346  			// Iterate over all exterior points
   347  			if region.NumLoops() > 0 {
   348  				loop := region.Loop(0)
   349  				for _, p := range loop.Vertices() {
   350  					cellID := s2.CellFromPoint(p).ID()
   351  					u = append(u, cellID)
   352  				}
   353  				// Arbitrary threshold value. This is to avoid computing an expensive
   354  				// region covering for regions with small area.
   355  				// TODO(sumeer): Improve this heuristic:
   356  				// - Area() may be expensive.
   357  				// - For large area regions, put an upper bound on the
   358  				//   level used for cells.
   359  				// Decide based on experiments.
   360  				const smallPolygonLevelThreshold = 25
   361  				if region.Area() > s2.AvgAreaMetric.Value(smallPolygonLevelThreshold) {
   362  					u = append(u, rc.InteriorCovering(region)...)
   363  				}
   364  			}
   365  		default:
   366  			panic("bug: code should not be producing unhandled Region type")
   367  		}
   368  	}
   369  	// Ensure the cells are non-overlapping.
   370  	u.Normalize()
   371  
   372  	// TODO(sumeer): if the number of cells is too many, make the list sparser.
   373  	// u[len(u)-1] - u[0] / len(u) is the mean distance between cells. Pick a
   374  	// target distance based on the goal to reduce to k cells: target_distance
   375  	// := mean_distance * k / len(u) Then iterate over u and for every sequence
   376  	// of cells that are within target_distance, replace by median cell or by
   377  	// largest cell. Decide based on experiments.
   378  
   379  	return u
   380  }
   381  
   382  // ancestorCells returns the set of cells containing these cells, not
   383  // including the given cells.
   384  //
   385  // TODO(sumeer): use the MinLevel and LevelMod of the RegionCoverer used
   386  // for the index to constrain the ancestors set.
   387  func ancestorCells(cells []s2.CellID) []s2.CellID {
   388  	var ancestors []s2.CellID
   389  	var seen map[s2.CellID]struct{}
   390  	if len(cells) > 1 {
   391  		seen = make(map[s2.CellID]struct{})
   392  	}
   393  	for _, c := range cells {
   394  		for l := c.Level() - 1; l >= 0; l-- {
   395  			p := c.Parent(l)
   396  			if seen != nil {
   397  				if _, ok := seen[p]; ok {
   398  					break
   399  				}
   400  				seen[p] = struct{}{}
   401  			}
   402  			ancestors = append(ancestors, p)
   403  		}
   404  	}
   405  	return ancestors
   406  }
   407  
   408  // Helper for InvertedIndexKeys.
   409  func invertedIndexKeys(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) []Key {
   410  	covering := covering(rc, r)
   411  	keys := make([]Key, len(covering))
   412  	for i, cid := range covering {
   413  		keys[i] = Key(cid)
   414  	}
   415  	return keys
   416  }
   417  
   418  // TODO(sumeer): examine RegionCoverer carefully to see if we can strengthen
   419  // the covering invariant, which would increase the efficiency of covers() and
   420  // remove the need for TestingInnerCovering().
   421  //
   422  // Helper for Covers.
   423  func covers(c context.Context, rc *s2.RegionCoverer, r []s2.Region) UnionKeySpans {
   424  	// We use intersects since geometries covered by r may have been indexed
   425  	// using cells that are ancestors of the covering of r. We could avoid
   426  	// reading ancestors if we had a stronger covering invariant, such as by
   427  	// indexing inner coverings.
   428  	return intersects(c, rc, r)
   429  }
   430  
   431  // Helper for Intersects. Returns spans in sorted order for convenience of
   432  // scans.
   433  func intersects(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) UnionKeySpans {
   434  	covering := covering(rc, r)
   435  	querySpans := make([]KeySpan, len(covering))
   436  	for i, cid := range covering {
   437  		querySpans[i] = KeySpan{Start: Key(cid.RangeMin()), End: Key(cid.RangeMax())}
   438  	}
   439  	for _, cid := range ancestorCells(covering) {
   440  		querySpans = append(querySpans, KeySpan{Start: Key(cid), End: Key(cid)})
   441  	}
   442  	return querySpans
   443  }
   444  
   445  // Helper for CoveredBy.
   446  func coveredBy(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) RPKeyExpr {
   447  	covering := innerCovering(rc, r)
   448  	ancestors := ancestorCells(covering)
   449  
   450  	// The covering is normalized so no 2 cells are such that one is an ancestor
   451  	// of another. Arrange these cells and their ancestors in a quad-tree. Any cell
   452  	// with more than one child needs to be unioned with the intersection of the
   453  	// expressions corresponding to each child. See the detailed comment in
   454  	// generateRPExprForTree().
   455  
   456  	// It is sufficient to represent the tree(s) using presentCells since the ids
   457  	// of all possible 4 children of a cell can be computed and checked for
   458  	// presence in the map.
   459  	presentCells := make(map[s2.CellID]struct{}, len(covering)+len(ancestors))
   460  	for _, c := range covering {
   461  		presentCells[c] = struct{}{}
   462  	}
   463  	for _, c := range ancestors {
   464  		presentCells[c] = struct{}{}
   465  	}
   466  
   467  	// Construct the reverse polish expression. Note that there are up to 6
   468  	// trees corresponding to the 6 faces in S2. The expressions for the
   469  	// trees need to be intersected with each other.
   470  	expr := make([]RPExprElement, 0, len(presentCells)*2)
   471  	numFaces := 0
   472  	for face := 0; face < 6; face++ {
   473  		rootID := s2.CellIDFromFace(face)
   474  		if _, ok := presentCells[rootID]; !ok {
   475  			continue
   476  		}
   477  		expr = append(expr, generateRPExprForTree(rootID, presentCells)...)
   478  		numFaces++
   479  		if numFaces > 1 {
   480  			expr = append(expr, RPSetIntersection)
   481  		}
   482  	}
   483  	return expr
   484  }
   485  
   486  // The quad-trees stored in presentCells together represent a set expression.
   487  // This expression specifies:
   488  // - the path for each leaf to the root of that quad-tree. The index entries
   489  //   on each such path represent the shapes that cover that leaf. Hence these
   490  //   index entries for a single path need to be unioned to give the shapes
   491  //   that cover the leaf.
   492  // - The full expression specifies the shapes that cover all the leaves, so
   493  //   the union expressions for the paths must be intersected with each other.
   494  //
   495  // Reusing an example from earlier in this file, say the quad-tree is:
   496  //                      c0
   497  //                      |
   498  //                      c3
   499  //                      |
   500  //                  +---+---+
   501  //                  |       |
   502  //                 c13      c15
   503  //                  |       |
   504  //                 c53   +--+--+
   505  //                       |     |
   506  //                      c61    c64
   507  //
   508  // This tree represents the following expression (where I(c) are the index
   509  // entries stored at cell c):
   510  //  (I(c64) \union I(c15) \union I(c3) \union I(c0)) \intersection
   511  //  (I(c61) \union I(c15) \union I(c3) \union I(c0)) \intersection
   512  //  (I(c53) \union I(c13) \union I(c3) \union I(c0))
   513  // In this example all the union sub-expressions have the same number of terms
   514  // but that does not need to be true.
   515  //
   516  // The above expression can be factored to eliminate repetition of the
   517  // same cell. The factored expression for this example is:
   518  //   I(c0) \union I(c3) \union
   519  //    ((I(c13) \union I(c53)) \intersection
   520  //     (I(c15) \union (I(c61) \intersection I(c64)))
   521  //
   522  // This function generates this factored expression represented in reverse
   523  // polish notation.
   524  //
   525  // One can generate the factored expression in reverse polish notation using
   526  // a post-order traversal of this tree:
   527  // Step A. append the expression for the subtree rooted at c3
   528  // Step B. append c0 and the union operator
   529  // For Step A:
   530  // - append the expression for the subtree rooted at c13
   531  // - append the expression for the subtree rooted at c15
   532  // - append the intersection operator
   533  // - append c13
   534  // - append the union operator
   535  func generateRPExprForTree(rootID s2.CellID, presentCells map[s2.CellID]struct{}) []RPExprElement {
   536  	expr := []RPExprElement{Key(rootID)}
   537  	if rootID.IsLeaf() {
   538  		return expr
   539  	}
   540  	numChildren := 0
   541  	for _, childCellID := range rootID.Children() {
   542  		if _, ok := presentCells[childCellID]; !ok {
   543  			continue
   544  		}
   545  		expr = append(expr, generateRPExprForTree(childCellID, presentCells)...)
   546  		numChildren++
   547  		if numChildren > 1 {
   548  			expr = append(expr, RPSetIntersection)
   549  		}
   550  	}
   551  	if numChildren > 0 {
   552  		expr = append(expr, RPSetUnion)
   553  	}
   554  	return expr
   555  }
   556  
   557  // stringBuilderWithWrap is a strings.Builder that approximately wraps at a
   558  // certain number of characters. Newline characters should only be
   559  // written using tryWrap and doWrap.
   560  type stringBuilderWithWrap struct {
   561  	*strings.Builder
   562  	wrap     int
   563  	lastWrap int
   564  }
   565  
   566  func newStringBuilderWithWrap(b *strings.Builder, wrap int) *stringBuilderWithWrap {
   567  	return &stringBuilderWithWrap{
   568  		Builder:  b,
   569  		wrap:     wrap,
   570  		lastWrap: b.Len(),
   571  	}
   572  }
   573  
   574  func (b *stringBuilderWithWrap) tryWrap() {
   575  	if b.Len()-b.lastWrap > b.wrap {
   576  		b.doWrap()
   577  	}
   578  }
   579  
   580  func (b *stringBuilderWithWrap) doWrap() {
   581  	fmt.Fprintln(b)
   582  	b.lastWrap = b.Len()
   583  }
   584  
   585  func defaultS2Config() *S2Config {
   586  	return &S2Config{
   587  		MinLevel: 0,
   588  		MaxLevel: 30,
   589  		LevelMod: 1,
   590  		MaxCells: 4,
   591  	}
   592  }