github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/geo/geoindex/index.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package geoindex 12 13 import ( 14 "context" 15 "fmt" 16 "math" 17 "strings" 18 19 "github.com/cockroachdb/cockroach/pkg/geo" 20 "github.com/golang/geo/s2" 21 ) 22 23 // Interfaces for accelerating certain spatial operations, by allowing the 24 // caller to use an externally stored index. 25 // 26 // An index interface has methods that specify what to write or read from the 27 // stored index. It is the caller's responsibility to do the actual writes and 28 // reads. Index keys are represented using uint64 which implies that the index 29 // is transforming the 2D space (spherical or planar) to 1D space, say using a 30 // space-filling curve. The index can return false positives so query 31 // execution must use an exact filtering step after using the index. The 32 // current implementations use the S2 geometry library. 33 // 34 // The externally stored index must support key-value semantics and range 35 // queries over keys. The return values from Covers/CoveredBy/Intersects are 36 // specialized to the computation that needs to be performed: 37 // - Intersects needs to union (a) all the ranges corresponding to subtrees 38 // rooted at the covering of g, (b) all the parent nodes of the covering 39 // of g. The individual entries in (b) are representable as ranges of 40 // length 1. All of this is represented as UnionKeySpans. Covers, which 41 // is the shape that g covers, currently delegates to Interects so 42 // returns the same. 43 // 44 // - CoveredBy, which are the shapes that g is covered-by, needs to compute 45 // on the paths from each covering node to the root. For example, consider 46 // a quad-tree approach to dividing the space, where the nodes/cells 47 // covering g are c53, c61, c64. The corresponding ancestor path sets are 48 // {c53, c13, c3, c0}, {c61, c15, c3, c0}, {c64, c15, c3, c0}. Let I(c53) 49 // represent the index entries for c53. Then, 50 // I(c53) \union I(c13) \union I(c3) \union I(c0) 51 // represents all the shapes that cover cell c53. Similar union expressions 52 // can be constructed for all these paths. The computation needs to 53 // intersect these unions since all these cells need to be covered by a 54 // shape that covers g. One can extract the common sub-expressions to give 55 // I(c0) \union I(c3) \union 56 // ((I(c13) \union I(c53)) \intersection 57 // (I(c15) \union (I(c61) \intersection I(c64))) 58 // CoveredBy returns this factored expression in Reverse Polish notation. 59 60 // GeographyIndex is an index over the unit sphere. 61 type GeographyIndex interface { 62 // InvertedIndexKeys returns the keys to store this object under when adding 63 // it to the index. 64 InvertedIndexKeys(c context.Context, g *geo.Geography) ([]Key, error) 65 66 // Acceleration for topological relationships (see 67 // https://postgis.net/docs/reference.html#Spatial_Relationships). Distance 68 // relationships can be accelerated by adjusting g before calling these 69 // functions. Bounding box operators are not accelerated since we do not 70 // index the bounding box -- bounding box queries are an implementation 71 // detail of a particular indexing approach in PostGIS and are not part of 72 // the OGC or SQL/MM specs. 73 74 // Covers returns the index spans to read and union for the relationship 75 // ST_Covers(g, x), where x are the indexed geometries. 76 Covers(c context.Context, g *geo.Geography) (UnionKeySpans, error) 77 78 // CoveredBy returns the index entries to read and the expression to compute 79 // for ST_CoveredBy(g, x), where x are the indexed geometries. 80 CoveredBy(c context.Context, g *geo.Geography) (RPKeyExpr, error) 81 82 // Intersects returns the index spans to read and union for the relationship 83 // ST_Intersects(g, x), where x are the indexed geometries. 84 Intersects(c context.Context, g *geo.Geography) (UnionKeySpans, error) 85 86 // TestingInnerCovering returns an inner covering of g. 87 TestingInnerCovering(g *geo.Geography) s2.CellUnion 88 } 89 90 // GeometryIndex is an index over 2D cartesian coordinates. 91 type GeometryIndex interface { 92 // InvertedIndexKeys returns the keys to store this object under when adding 93 // it to the index. 94 InvertedIndexKeys(c context.Context, g *geo.Geometry) ([]Key, error) 95 96 // Acceleration for topological relationships (see 97 // https://postgis.net/docs/reference.html#Spatial_Relationships). Distance 98 // relationships can be accelerated by adjusting g before calling these 99 // functions. Bounding box operators are not accelerated since we do not 100 // index the bounding box -- bounding box queries are an implementation 101 // detail of a particular indexing approach in PostGIS and are not part of 102 // the OGC or SQL/MM specs. 103 104 // Covers returns the index spans to read and union for the relationship 105 // ST_Covers(g, x), where x are the indexed geometries. 106 Covers(c context.Context, g *geo.Geometry) (UnionKeySpans, error) 107 108 // CoveredBy returns the index entries to read and the expression to compute 109 // for ST_CoveredBy(g, x), where x are the indexed geometries. 110 CoveredBy(c context.Context, g *geo.Geometry) (RPKeyExpr, error) 111 112 // Intersects returns the index spans to read and union for the relationship 113 // ST_Intersects(g, x), where x are the indexed geometries. 114 Intersects(c context.Context, g *geo.Geometry) (UnionKeySpans, error) 115 116 // TestingInnerCovering returns an inner covering of g. 117 TestingInnerCovering(g *geo.Geometry) s2.CellUnion 118 } 119 120 // RelationshipType stores a type of geospatial relationship query that can 121 // be accelerated using an index. 122 type RelationshipType uint8 123 124 const ( 125 // Covers corresponds to the relationship in which one geospatial object 126 // covers another geospatial object. 127 Covers RelationshipType = (1 << iota) 128 129 // CoveredBy corresponds to the relationship in which one geospatial object 130 // is covered by another geospatial object. 131 CoveredBy 132 133 // Intersects corresponds to the relationship in which one geospatial object 134 // intersects another geospatial object. 135 Intersects 136 ) 137 138 var geoRelationshipTypeStr = map[RelationshipType]string{ 139 Covers: "covers", 140 CoveredBy: "covered by", 141 Intersects: "intersects", 142 } 143 144 func (gr RelationshipType) String() string { 145 return geoRelationshipTypeStr[gr] 146 } 147 148 // IsEmptyConfig returns whether the given config contains a geospatial index 149 // configuration. 150 func IsEmptyConfig(cfg *Config) bool { 151 if cfg == nil { 152 return true 153 } 154 return cfg.S2Geography == nil && cfg.S2Geometry == nil 155 } 156 157 // IsGeographyConfig returns whether the config is a geography geospatial 158 // index configuration. 159 func IsGeographyConfig(cfg *Config) bool { 160 if cfg == nil { 161 return false 162 } 163 return cfg.S2Geography != nil 164 } 165 166 // IsGeometryConfig returns whether the config is a geometry geospatial 167 // index configuration. 168 func IsGeometryConfig(cfg *Config) bool { 169 if cfg == nil { 170 return false 171 } 172 return cfg.S2Geometry != nil 173 } 174 175 // Key is one entry under which a geospatial shape is stored on behalf of an 176 // Index. The index is of the form (Key, Primary Key). 177 type Key uint64 178 179 // rpExprElement implements the RPExprElement interface. 180 func (k Key) rpExprElement() {} 181 182 func (k Key) String() string { 183 c := s2.CellID(k) 184 if !c.IsValid() { 185 return "spilled" 186 } 187 var b strings.Builder 188 b.WriteByte('F') 189 b.WriteByte("012345"[c.Face()]) 190 fmt.Fprintf(&b, "/L%d/", c.Level()) 191 for level := 1; level <= c.Level(); level++ { 192 b.WriteByte("0123"[c.ChildPosition(level)]) 193 } 194 return b.String() 195 } 196 197 // KeySpan represents a range of Keys. 198 type KeySpan struct { 199 // Both Start and End are inclusive, i.e., [Start, End]. 200 Start, End Key 201 } 202 203 // UnionKeySpans is the set of indexed spans to retrieve and combine via set 204 // union. The spans are guaranteed to be non-overlapping. Duplicate primary 205 // keys will not be retrieved by any individual key, but they may be present 206 // if more than one key is retrieved (including duplicates in a single span 207 // where End - Start > 1). 208 type UnionKeySpans []KeySpan 209 210 func (s UnionKeySpans) String() string { 211 return s.toString(math.MaxInt32) 212 } 213 214 func (s UnionKeySpans) toString(wrap int) string { 215 b := newStringBuilderWithWrap(&strings.Builder{}, wrap) 216 for i, span := range s { 217 if span.Start == span.End { 218 fmt.Fprintf(b, "%s", span.Start) 219 } else { 220 fmt.Fprintf(b, "[%s, %s]", span.Start, span.End) 221 } 222 if i != len(s)-1 { 223 b.WriteString(", ") 224 } 225 b.tryWrap() 226 } 227 return b.String() 228 } 229 230 // RPExprElement is an element in the Reverse Polish notation expression. 231 // It is implemented by Key and RPSetOperator. 232 type RPExprElement interface { 233 rpExprElement() 234 } 235 236 // RPSetOperator is a set operator in the Reverse Polish notation expression. 237 type RPSetOperator int 238 239 const ( 240 // RPSetUnion is the union operator. 241 RPSetUnion RPSetOperator = iota + 1 242 243 // RPSetIntersection is the intersection operator. 244 RPSetIntersection 245 ) 246 247 // rpExprElement implements the RPExprElement interface. 248 func (o RPSetOperator) rpExprElement() {} 249 250 // RPKeyExpr is an expression to evaluate over primary keys retrieved for 251 // index keys. If we view each index key as a posting list of primary keys, 252 // the expression involves union and intersection over the sets represented by 253 // each posting list. For S2, this expression represents an intersection of 254 // ancestors of different keys (cell ids) and is likely to contain many common 255 // keys. This special structure allows us to efficiently and easily eliminate 256 // common sub-expressions, hence the interface presents the factored 257 // expression. The expression is represented in Reverse Polish notation. 258 type RPKeyExpr []RPExprElement 259 260 func (x RPKeyExpr) String() string { 261 var elements []string 262 for _, e := range x { 263 switch elem := e.(type) { 264 case Key: 265 elements = append(elements, elem.String()) 266 case RPSetOperator: 267 switch elem { 268 case RPSetUnion: 269 elements = append(elements, `\U`) 270 case RPSetIntersection: 271 elements = append(elements, `\I`) 272 } 273 } 274 } 275 return strings.Join(elements, " ") 276 } 277 278 // Helper functions for index implementations that use the S2 geometry 279 // library. 280 281 func covering(rc *s2.RegionCoverer, regions []s2.Region) s2.CellUnion { 282 // TODO(sumeer): Add a max cells constraint for the whole covering, 283 // to respect the index configuration. 284 var u s2.CellUnion 285 for _, r := range regions { 286 u = append(u, rc.Covering(r)...) 287 } 288 // Ensure the cells are non-overlapping. 289 u.Normalize() 290 return u 291 } 292 293 // The "inner covering", for shape s, represented by the regions parameter, is 294 // used to find shapes that contain shape s. A regular covering that includes 295 // a cell c not completely covered by shape s could result in false negatives, 296 // since shape x that covers shape s could use a finer cell covering (using 297 // cells below c). For example, consider a portion of the cell quad-tree 298 // below: 299 // 300 // c0 301 // | 302 // c3 303 // | 304 // +---+---+ 305 // | | 306 // c13 c15 307 // | | 308 // c53 +--+--+ 309 // | | 310 // c61 c64 311 // 312 // Shape s could have a regular covering c15, c53, where c15 has 4 child cells 313 // c61..c64, and shape s only intersects wit c61, c64. A different shape x 314 // that covers shape s may have a covering c61, c64, c53. That is, it has used 315 // the finer cells c61, c64. If we used both regular coverings it is hard to 316 // know that x covers g. Hence, we compute the "inner covering" of g (defined 317 // below). 318 // 319 // The interior covering of shape s includes only cells covered by s. This is 320 // computed by RegionCoverer.InteriorCovering() and is intuitively what we 321 // need. But the interior covering is naturally empty for points and lines 322 // (and can be empty for polygons too), and an empty covering is not useful 323 // for constraining index lookups. We observe that leaf cells that intersect 324 // shape s can be used in the covering, since the covering of shape x must 325 // also cover these cells. This allows us to compute non-empty coverings for 326 // all shapes. Since this is not technically the interior covering, we use the 327 // term "inner covering". 328 func innerCovering(rc *s2.RegionCoverer, regions []s2.Region) s2.CellUnion { 329 var u s2.CellUnion 330 for _, r := range regions { 331 switch region := r.(type) { 332 case s2.Point: 333 cellID := s2.CellFromPoint(region).ID() 334 if !cellID.IsLeaf() { 335 panic("bug in S2") 336 } 337 u = append(u, cellID) 338 case *s2.Polyline: 339 // TODO(sumeer): for long lines could also pick some intermediate 340 // points along the line. Decide based on experiments. 341 for _, p := range *region { 342 cellID := s2.CellFromPoint(p).ID() 343 u = append(u, cellID) 344 } 345 case *s2.Polygon: 346 // Iterate over all exterior points 347 if region.NumLoops() > 0 { 348 loop := region.Loop(0) 349 for _, p := range loop.Vertices() { 350 cellID := s2.CellFromPoint(p).ID() 351 u = append(u, cellID) 352 } 353 // Arbitrary threshold value. This is to avoid computing an expensive 354 // region covering for regions with small area. 355 // TODO(sumeer): Improve this heuristic: 356 // - Area() may be expensive. 357 // - For large area regions, put an upper bound on the 358 // level used for cells. 359 // Decide based on experiments. 360 const smallPolygonLevelThreshold = 25 361 if region.Area() > s2.AvgAreaMetric.Value(smallPolygonLevelThreshold) { 362 u = append(u, rc.InteriorCovering(region)...) 363 } 364 } 365 default: 366 panic("bug: code should not be producing unhandled Region type") 367 } 368 } 369 // Ensure the cells are non-overlapping. 370 u.Normalize() 371 372 // TODO(sumeer): if the number of cells is too many, make the list sparser. 373 // u[len(u)-1] - u[0] / len(u) is the mean distance between cells. Pick a 374 // target distance based on the goal to reduce to k cells: target_distance 375 // := mean_distance * k / len(u) Then iterate over u and for every sequence 376 // of cells that are within target_distance, replace by median cell or by 377 // largest cell. Decide based on experiments. 378 379 return u 380 } 381 382 // ancestorCells returns the set of cells containing these cells, not 383 // including the given cells. 384 // 385 // TODO(sumeer): use the MinLevel and LevelMod of the RegionCoverer used 386 // for the index to constrain the ancestors set. 387 func ancestorCells(cells []s2.CellID) []s2.CellID { 388 var ancestors []s2.CellID 389 var seen map[s2.CellID]struct{} 390 if len(cells) > 1 { 391 seen = make(map[s2.CellID]struct{}) 392 } 393 for _, c := range cells { 394 for l := c.Level() - 1; l >= 0; l-- { 395 p := c.Parent(l) 396 if seen != nil { 397 if _, ok := seen[p]; ok { 398 break 399 } 400 seen[p] = struct{}{} 401 } 402 ancestors = append(ancestors, p) 403 } 404 } 405 return ancestors 406 } 407 408 // Helper for InvertedIndexKeys. 409 func invertedIndexKeys(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) []Key { 410 covering := covering(rc, r) 411 keys := make([]Key, len(covering)) 412 for i, cid := range covering { 413 keys[i] = Key(cid) 414 } 415 return keys 416 } 417 418 // TODO(sumeer): examine RegionCoverer carefully to see if we can strengthen 419 // the covering invariant, which would increase the efficiency of covers() and 420 // remove the need for TestingInnerCovering(). 421 // 422 // Helper for Covers. 423 func covers(c context.Context, rc *s2.RegionCoverer, r []s2.Region) UnionKeySpans { 424 // We use intersects since geometries covered by r may have been indexed 425 // using cells that are ancestors of the covering of r. We could avoid 426 // reading ancestors if we had a stronger covering invariant, such as by 427 // indexing inner coverings. 428 return intersects(c, rc, r) 429 } 430 431 // Helper for Intersects. Returns spans in sorted order for convenience of 432 // scans. 433 func intersects(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) UnionKeySpans { 434 covering := covering(rc, r) 435 querySpans := make([]KeySpan, len(covering)) 436 for i, cid := range covering { 437 querySpans[i] = KeySpan{Start: Key(cid.RangeMin()), End: Key(cid.RangeMax())} 438 } 439 for _, cid := range ancestorCells(covering) { 440 querySpans = append(querySpans, KeySpan{Start: Key(cid), End: Key(cid)}) 441 } 442 return querySpans 443 } 444 445 // Helper for CoveredBy. 446 func coveredBy(_ context.Context, rc *s2.RegionCoverer, r []s2.Region) RPKeyExpr { 447 covering := innerCovering(rc, r) 448 ancestors := ancestorCells(covering) 449 450 // The covering is normalized so no 2 cells are such that one is an ancestor 451 // of another. Arrange these cells and their ancestors in a quad-tree. Any cell 452 // with more than one child needs to be unioned with the intersection of the 453 // expressions corresponding to each child. See the detailed comment in 454 // generateRPExprForTree(). 455 456 // It is sufficient to represent the tree(s) using presentCells since the ids 457 // of all possible 4 children of a cell can be computed and checked for 458 // presence in the map. 459 presentCells := make(map[s2.CellID]struct{}, len(covering)+len(ancestors)) 460 for _, c := range covering { 461 presentCells[c] = struct{}{} 462 } 463 for _, c := range ancestors { 464 presentCells[c] = struct{}{} 465 } 466 467 // Construct the reverse polish expression. Note that there are up to 6 468 // trees corresponding to the 6 faces in S2. The expressions for the 469 // trees need to be intersected with each other. 470 expr := make([]RPExprElement, 0, len(presentCells)*2) 471 numFaces := 0 472 for face := 0; face < 6; face++ { 473 rootID := s2.CellIDFromFace(face) 474 if _, ok := presentCells[rootID]; !ok { 475 continue 476 } 477 expr = append(expr, generateRPExprForTree(rootID, presentCells)...) 478 numFaces++ 479 if numFaces > 1 { 480 expr = append(expr, RPSetIntersection) 481 } 482 } 483 return expr 484 } 485 486 // The quad-trees stored in presentCells together represent a set expression. 487 // This expression specifies: 488 // - the path for each leaf to the root of that quad-tree. The index entries 489 // on each such path represent the shapes that cover that leaf. Hence these 490 // index entries for a single path need to be unioned to give the shapes 491 // that cover the leaf. 492 // - The full expression specifies the shapes that cover all the leaves, so 493 // the union expressions for the paths must be intersected with each other. 494 // 495 // Reusing an example from earlier in this file, say the quad-tree is: 496 // c0 497 // | 498 // c3 499 // | 500 // +---+---+ 501 // | | 502 // c13 c15 503 // | | 504 // c53 +--+--+ 505 // | | 506 // c61 c64 507 // 508 // This tree represents the following expression (where I(c) are the index 509 // entries stored at cell c): 510 // (I(c64) \union I(c15) \union I(c3) \union I(c0)) \intersection 511 // (I(c61) \union I(c15) \union I(c3) \union I(c0)) \intersection 512 // (I(c53) \union I(c13) \union I(c3) \union I(c0)) 513 // In this example all the union sub-expressions have the same number of terms 514 // but that does not need to be true. 515 // 516 // The above expression can be factored to eliminate repetition of the 517 // same cell. The factored expression for this example is: 518 // I(c0) \union I(c3) \union 519 // ((I(c13) \union I(c53)) \intersection 520 // (I(c15) \union (I(c61) \intersection I(c64))) 521 // 522 // This function generates this factored expression represented in reverse 523 // polish notation. 524 // 525 // One can generate the factored expression in reverse polish notation using 526 // a post-order traversal of this tree: 527 // Step A. append the expression for the subtree rooted at c3 528 // Step B. append c0 and the union operator 529 // For Step A: 530 // - append the expression for the subtree rooted at c13 531 // - append the expression for the subtree rooted at c15 532 // - append the intersection operator 533 // - append c13 534 // - append the union operator 535 func generateRPExprForTree(rootID s2.CellID, presentCells map[s2.CellID]struct{}) []RPExprElement { 536 expr := []RPExprElement{Key(rootID)} 537 if rootID.IsLeaf() { 538 return expr 539 } 540 numChildren := 0 541 for _, childCellID := range rootID.Children() { 542 if _, ok := presentCells[childCellID]; !ok { 543 continue 544 } 545 expr = append(expr, generateRPExprForTree(childCellID, presentCells)...) 546 numChildren++ 547 if numChildren > 1 { 548 expr = append(expr, RPSetIntersection) 549 } 550 } 551 if numChildren > 0 { 552 expr = append(expr, RPSetUnion) 553 } 554 return expr 555 } 556 557 // stringBuilderWithWrap is a strings.Builder that approximately wraps at a 558 // certain number of characters. Newline characters should only be 559 // written using tryWrap and doWrap. 560 type stringBuilderWithWrap struct { 561 *strings.Builder 562 wrap int 563 lastWrap int 564 } 565 566 func newStringBuilderWithWrap(b *strings.Builder, wrap int) *stringBuilderWithWrap { 567 return &stringBuilderWithWrap{ 568 Builder: b, 569 wrap: wrap, 570 lastWrap: b.Len(), 571 } 572 } 573 574 func (b *stringBuilderWithWrap) tryWrap() { 575 if b.Len()-b.lastWrap > b.wrap { 576 b.doWrap() 577 } 578 } 579 580 func (b *stringBuilderWithWrap) doWrap() { 581 fmt.Fprintln(b) 582 b.lastWrap = b.Len() 583 } 584 585 func defaultS2Config() *S2Config { 586 return &S2Config{ 587 MinLevel: 0, 588 MaxLevel: 30, 589 LevelMod: 1, 590 MaxCells: 4, 591 } 592 }