// github.com/cayleygraph/cayley@v0.7.7/graph/iterator/and_optimize.go

// Copyright 2014 The Cayley Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package iterator

import (
	"context"
	"sort"

	"github.com/cayleygraph/cayley/clog"
	"github.com/cayleygraph/cayley/graph"
)

// Perhaps the most tricky file in this entire module. Really a method on the
// And, but important enough to deserve its own file.
//
// Calling Optimize() on an And iterator, like any iterator, requires that we
// preserve the underlying meaning. However, the And has many choices, namely,
// which one of its subiterators will be the branch that does the Next()ing,
// and which ordering of the remaining iterators is the most efficient. In
// short, this is where a lot of the query optimization happens, and there are
// many wins to be had here, as well as many bad bugs. The worst class of bug
// changes the meaning of the query. The second worst class makes things really
// slow.
//
// The good news is this: If Optimize() is never called (turned off, perhaps) we can
// be sure the results are as good as the query language called for.
//
// In short, tread lightly.

// Optimizes the And, by picking the most efficient way to Next() and
// Contains() its subiterators. For SQL fans, this is equivalent to JOIN.
44 func (it *and) Optimize(ctx context.Context) (graph.IteratorShape, bool) { 45 // First, let's get the slice of iterators, in order (first one is Next()ed, 46 // the rest are Contains()ed) 47 old := it.sub 48 if len(old) == 0 { 49 return newNull(), true 50 } 51 52 // And call Optimize() on our subtree, replacing each one in the order we 53 // found them. it_list is the newly optimized versions of these, and changed 54 // is another list, of only the ones that have returned replacements and 55 // changed. 56 its := optimizeSubIterators2(ctx, old) 57 58 // If we can find only one subiterator which is equivalent to this whole and, 59 // we can replace the And... 60 if out := optimizeReplacement(its); out != nil && len(it.opt) == 0 { 61 // ...And return it. 62 return out, true 63 } 64 65 // And now, without changing any of the iterators, we reorder them. it_list is 66 // now a permutation of itself, but the contents are unchanged. 67 its = optimizeOrder(ctx, its) 68 69 its, _ = materializeIts(ctx, its) 70 71 // Okay! At this point we have an optimized order. 72 73 // The easiest thing to do at this point is merely to create a new And iterator 74 // and replace ourselves with our (reordered, optimized) clone. 75 // Add the subiterators in order. 76 newAnd := newAnd(its...) 77 78 opt := optimizeSubIterators2(ctx, it.opt) 79 for _, sub := range opt { 80 newAnd.AddOptionalIterator(sub) 81 } 82 83 _ = newAnd.optimizeContains(ctx) 84 if clog.V(3) { 85 clog.Infof("%p become %p", it, newAnd) 86 } 87 return newAnd, true 88 } 89 90 // Find if there is a single subiterator which is a valid replacement for this 91 // And. 92 func optimizeReplacement(its []graph.IteratorShape) graph.IteratorShape { 93 // If we were created with no SubIterators, we're as good as Null. 94 if len(its) == 0 { 95 return newNull() 96 } 97 if len(its) == 1 { 98 // When there's only one iterator, there's only one choice. 
99 return its[0] 100 } 101 // If any of our subiterators, post-optimization, are also Null, then 102 // there's no point in continuing the branch, we will have no results 103 // and we are null as well. 104 if hasAnyNullIterators(its) { 105 return newNull() 106 } 107 return nil 108 } 109 110 // optimizeOrder(l) takes a list and returns a list, containing the same contents 111 // but with a new ordering, however it wishes. 112 func optimizeOrder(ctx context.Context, its []graph.IteratorShape) []graph.IteratorShape { 113 var ( 114 // bad contains iterators that can't be (efficiently) nexted, such as 115 // graph.Optional or graph.Not. Separate them out and tack them on at the end. 116 bad []graph.IteratorShape 117 best graph.IteratorShape 118 bestCost = int64(1 << 62) 119 ) 120 121 // Find the iterator with the projected "best" total cost. 122 // Total cost is defined as The Next()ed iterator's cost to Next() out 123 // all of it's contents, and to Contains() each of those against everyone 124 // else. 125 for _, root := range its { 126 rootStats, _ := root.Stats(ctx) 127 cost := rootStats.NextCost 128 for _, f := range its { 129 if f == root { 130 continue 131 } 132 stats, _ := f.Stats(ctx) 133 cost += stats.ContainsCost * (1 + (rootStats.Size.Size / (stats.Size.Size + 1))) 134 } 135 cost *= rootStats.Size.Size 136 if clog.V(3) { 137 clog.Infof("And: Root: %p Total Cost: %v Best: %v", root, cost, bestCost) 138 } 139 if cost < bestCost { 140 best = root 141 bestCost = cost 142 } 143 } 144 if clog.V(3) { 145 clog.Infof("And: Choosing: %p Best: %v", best, bestCost) 146 } 147 148 // TODO(barakmich): Optimization of order need not stop here. Picking a smart 149 // Contains() order based on probability of getting a false Contains() first is 150 // useful (fail faster). 151 152 var out []graph.IteratorShape 153 // Put the best iterator (the one we wish to Next()) at the front... 154 if best != nil { 155 out = append(out, best) 156 } 157 158 // ... 
push everyone else after... 159 for _, it := range its { 160 if it != best { 161 out = append(out, it) 162 } 163 } 164 165 // ...and finally, the difficult children on the end. 166 return append(out, bad...) 167 } 168 169 func sortByContainsCost(ctx context.Context, arr []graph.IteratorShape) error { 170 cost := make([]graph.IteratorCosts, 0, len(arr)) 171 var last error 172 for _, s := range arr { 173 c, err := s.Stats(ctx) 174 if err != nil { 175 last = err 176 } 177 cost = append(cost, c) 178 } 179 sort.Sort(byCost{ 180 list: arr, 181 cost: cost, 182 }) 183 return last 184 } 185 186 // TODO(dennwc): store stats slice once 187 type byCost struct { 188 list []graph.IteratorShape 189 cost []graph.IteratorCosts 190 } 191 192 func (c byCost) Len() int { return len(c.list) } 193 func (c byCost) Less(i, j int) bool { 194 return c.cost[i].ContainsCost < c.cost[j].ContainsCost 195 } 196 func (c byCost) Swap(i, j int) { 197 c.list[i], c.list[j] = c.list[j], c.list[i] 198 c.cost[i], c.cost[j] = c.cost[j], c.cost[i] 199 } 200 201 // optimizeContains() creates an alternate check list, containing the same contents 202 // but with a new ordering, however it wishes. 203 func (it *and) optimizeContains(ctx context.Context) error { 204 // GetSubIterators allocates, so this is currently safe. 205 // TODO(kortschak) Reuse it.checkList if possible. 206 // This involves providing GetSubIterators with a slice to fill. 207 // Generally this is a worthwhile thing to do in other places as well. 208 it.checkList = append([]graph.IteratorShape{}, it.sub...) 209 return sortByContainsCost(ctx, it.checkList) 210 } 211 212 // optimizeSubIterators(l) takes a list of iterators and calls Optimize() on all 213 // of them. It returns two lists -- the first contains the same list as l, where 214 // any replacements are made by Optimize() and the second contains the originals 215 // which were replaced. 
216 func optimizeSubIterators(its []graph.Iterator) []graph.Iterator { 217 out := make([]graph.Iterator, 0, len(its)) 218 for _, it := range its { 219 o, _ := it.Optimize() 220 out = append(out, o) 221 } 222 return out 223 } 224 225 // optimizeSubIterators(l) takes a list of iterators and calls Optimize() on all 226 // of them. It returns two lists -- the first contains the same list as l, where 227 // any replacements are made by Optimize() and the second contains the originals 228 // which were replaced. 229 func optimizeSubIterators2(ctx context.Context, its []graph.IteratorShape) []graph.IteratorShape { 230 out := make([]graph.IteratorShape, 0, len(its)) 231 for _, it := range its { 232 o, _ := it.Optimize(ctx) 233 out = append(out, o) 234 } 235 return out 236 } 237 238 // Check a list of iterators for any Null iterators. 239 func hasAnyNullIterators(its []graph.IteratorShape) bool { 240 for _, it := range its { 241 if IsNull2(it) { 242 return true 243 } 244 } 245 return false 246 } 247 248 func materializeIts(ctx context.Context, its []graph.IteratorShape) ([]graph.IteratorShape, error) { 249 var out []graph.IteratorShape 250 251 allStats, stats, err := getStatsForSlice(ctx, its, nil) 252 out = append(out, its[0]) 253 for i, it := range its[1:] { 254 st := stats[i+1] 255 if st.Size.Size*st.NextCost < (st.ContainsCost * (1 + (st.Size.Size / (allStats.Size.Size + 1)))) { 256 if graph.Height(graph.AsLegacy(it), func(it graph.Iterator) bool { 257 _, ok := it.(*Materialize) 258 return !ok 259 }) > 10 { 260 out = append(out, newMaterialize(it)) 261 continue 262 } 263 } 264 out = append(out, it) 265 } 266 return out, err 267 } 268 269 func getStatsForSlice(ctx context.Context, its, opt []graph.IteratorShape) (graph.IteratorCosts, []graph.IteratorCosts, error) { 270 if len(its) == 0 { 271 return graph.IteratorCosts{}, nil, nil 272 } 273 274 arr := make([]graph.IteratorCosts, 0, len(its)) 275 276 primaryStats, _ := its[0].Stats(ctx) 277 arr = append(arr, primaryStats) 
278 279 containsCost := primaryStats.ContainsCost 280 nextCost := primaryStats.NextCost 281 size := primaryStats.Size.Size 282 exact := primaryStats.Size.Exact 283 284 var last error 285 for _, sub := range its[1:] { 286 stats, err := sub.Stats(ctx) 287 if err != nil { 288 last = err 289 } 290 arr = append(arr, stats) 291 nextCost += stats.ContainsCost * (1 + (primaryStats.Size.Size / (stats.Size.Size + 1))) 292 containsCost += stats.ContainsCost 293 if size > stats.Size.Size { 294 size = stats.Size.Size 295 exact = stats.Size.Exact 296 } 297 } 298 for _, sub := range opt { 299 stats, _ := sub.Stats(ctx) 300 nextCost += stats.ContainsCost * (1 + (primaryStats.Size.Size / (stats.Size.Size + 1))) 301 containsCost += stats.ContainsCost 302 } 303 return graph.IteratorCosts{ 304 ContainsCost: containsCost, 305 NextCost: nextCost, 306 Size: graph.Size{ 307 Size: size, 308 Exact: exact, 309 }, 310 }, arr, last 311 } 312 313 // and.Stats() lives here in and-iterator-optimize.go because it may 314 // in the future return different statistics based on how it is optimized. 315 // For now, however, it's pretty static. 316 // 317 // Returns the approximate size of the And iterator. Because we're dealing 318 // with an intersection, we know that the largest we can be is the size of the 319 // smallest iterator. This is the heuristic we shall follow. Better heuristics 320 // welcome. 321 func (it *and) Stats(ctx context.Context) (graph.IteratorCosts, error) { 322 stats, _, err := getStatsForSlice(ctx, it.sub, it.opt) 323 return stats, err 324 }