github.com/cayleygraph/cayley@v0.7.7/graph/iterator/and_optimize.go (about)

     1  // Copyright 2014 The Cayley Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package iterator
    16  
    17  import (
    18  	"context"
    19  	"sort"
    20  
    21  	"github.com/cayleygraph/cayley/clog"
    22  	"github.com/cayleygraph/cayley/graph"
    23  )
    24  
    25  // Perhaps the most tricky file in this entire module. Really a method on the
    26  // And, but important enough to deserve its own file.
    27  //
    28  // Calling Optimize() on an And iterator, like any iterator, requires that we
    29  // preserve the underlying meaning. However, the And has many choices, namely,
    30  // which one of its subiterators will be the branch that does the Next()ing,
    31  // and which ordering of the remaining iterators is the most efficient. In
    32  // short, this is where a lot of the query optimization happens, and there are
    33  // many wins to be had here, as well as many bad bugs. The worst class of bug
    34  // changes the meaning of the query. The second worst class makes things really
    35  // slow.
    36  //
    37  // The good news is this: If Optimize() is never called (turned off, perhaps) we can
    38  // be sure the results are as good as the query language called for.
    39  //
    40  // In short, tread lightly.
    41  
    42  // Optimizes the And, by picking the most efficient way to Next() and
    43  // Contains() its subiterators. For SQL fans, this is equivalent to JOIN.
    44  func (it *and) Optimize(ctx context.Context) (graph.IteratorShape, bool) {
    45  	// First, let's get the slice of iterators, in order (first one is Next()ed,
    46  	// the rest are Contains()ed)
    47  	old := it.sub
    48  	if len(old) == 0 {
    49  		return newNull(), true
    50  	}
    51  
    52  	// And call Optimize() on our subtree, replacing each one in the order we
    53  	// found them. it_list is the newly optimized versions of these, and changed
    54  	// is another list, of only the ones that have returned replacements and
    55  	// changed.
    56  	its := optimizeSubIterators2(ctx, old)
    57  
    58  	// If we can find only one subiterator which is equivalent to this whole and,
    59  	// we can replace the And...
    60  	if out := optimizeReplacement(its); out != nil && len(it.opt) == 0 {
    61  		// ...And return it.
    62  		return out, true
    63  	}
    64  
    65  	// And now, without changing any of the iterators, we reorder them. it_list is
    66  	// now a permutation of itself, but the contents are unchanged.
    67  	its = optimizeOrder(ctx, its)
    68  
    69  	its, _ = materializeIts(ctx, its)
    70  
    71  	// Okay! At this point we have an optimized order.
    72  
    73  	// The easiest thing to do at this point is merely to create a new And iterator
    74  	// and replace ourselves with our (reordered, optimized) clone.
    75  	// Add the subiterators in order.
    76  	newAnd := newAnd(its...)
    77  
    78  	opt := optimizeSubIterators2(ctx, it.opt)
    79  	for _, sub := range opt {
    80  		newAnd.AddOptionalIterator(sub)
    81  	}
    82  
    83  	_ = newAnd.optimizeContains(ctx)
    84  	if clog.V(3) {
    85  		clog.Infof("%p become %p", it, newAnd)
    86  	}
    87  	return newAnd, true
    88  }
    89  
    90  // Find if there is a single subiterator which is a valid replacement for this
    91  // And.
    92  func optimizeReplacement(its []graph.IteratorShape) graph.IteratorShape {
    93  	// If we were created with no SubIterators, we're as good as Null.
    94  	if len(its) == 0 {
    95  		return newNull()
    96  	}
    97  	if len(its) == 1 {
    98  		// When there's only one iterator, there's only one choice.
    99  		return its[0]
   100  	}
   101  	// If any of our subiterators, post-optimization, are also Null, then
   102  	// there's no point in continuing the branch, we will have no results
   103  	// and we are null as well.
   104  	if hasAnyNullIterators(its) {
   105  		return newNull()
   106  	}
   107  	return nil
   108  }
   109  
   110  // optimizeOrder(l) takes a list and returns a list, containing the same contents
   111  // but with a new ordering, however it wishes.
   112  func optimizeOrder(ctx context.Context, its []graph.IteratorShape) []graph.IteratorShape {
   113  	var (
   114  		// bad contains iterators that can't be (efficiently) nexted, such as
   115  		// graph.Optional or graph.Not. Separate them out and tack them on at the end.
   116  		bad      []graph.IteratorShape
   117  		best     graph.IteratorShape
   118  		bestCost = int64(1 << 62)
   119  	)
   120  
   121  	// Find the iterator with the projected "best" total cost.
   122  	// Total cost is defined as The Next()ed iterator's cost to Next() out
   123  	// all of it's contents, and to Contains() each of those against everyone
   124  	// else.
   125  	for _, root := range its {
   126  		rootStats, _ := root.Stats(ctx)
   127  		cost := rootStats.NextCost
   128  		for _, f := range its {
   129  			if f == root {
   130  				continue
   131  			}
   132  			stats, _ := f.Stats(ctx)
   133  			cost += stats.ContainsCost * (1 + (rootStats.Size.Size / (stats.Size.Size + 1)))
   134  		}
   135  		cost *= rootStats.Size.Size
   136  		if clog.V(3) {
   137  			clog.Infof("And: Root: %p Total Cost: %v Best: %v", root, cost, bestCost)
   138  		}
   139  		if cost < bestCost {
   140  			best = root
   141  			bestCost = cost
   142  		}
   143  	}
   144  	if clog.V(3) {
   145  		clog.Infof("And: Choosing: %p Best: %v", best, bestCost)
   146  	}
   147  
   148  	// TODO(barakmich): Optimization of order need not stop here. Picking a smart
   149  	// Contains() order based on probability of getting a false Contains() first is
   150  	// useful (fail faster).
   151  
   152  	var out []graph.IteratorShape
   153  	// Put the best iterator (the one we wish to Next()) at the front...
   154  	if best != nil {
   155  		out = append(out, best)
   156  	}
   157  
   158  	// ... push everyone else after...
   159  	for _, it := range its {
   160  		if it != best {
   161  			out = append(out, it)
   162  		}
   163  	}
   164  
   165  	// ...and finally, the difficult children on the end.
   166  	return append(out, bad...)
   167  }
   168  
   169  func sortByContainsCost(ctx context.Context, arr []graph.IteratorShape) error {
   170  	cost := make([]graph.IteratorCosts, 0, len(arr))
   171  	var last error
   172  	for _, s := range arr {
   173  		c, err := s.Stats(ctx)
   174  		if err != nil {
   175  			last = err
   176  		}
   177  		cost = append(cost, c)
   178  	}
   179  	sort.Sort(byCost{
   180  		list: arr,
   181  		cost: cost,
   182  	})
   183  	return last
   184  }
   185  
   186  // TODO(dennwc): store stats slice once
   187  type byCost struct {
   188  	list []graph.IteratorShape
   189  	cost []graph.IteratorCosts
   190  }
   191  
   192  func (c byCost) Len() int { return len(c.list) }
   193  func (c byCost) Less(i, j int) bool {
   194  	return c.cost[i].ContainsCost < c.cost[j].ContainsCost
   195  }
   196  func (c byCost) Swap(i, j int) {
   197  	c.list[i], c.list[j] = c.list[j], c.list[i]
   198  	c.cost[i], c.cost[j] = c.cost[j], c.cost[i]
   199  }
   200  
   201  // optimizeContains() creates an alternate check list, containing the same contents
   202  // but with a new ordering, however it wishes.
   203  func (it *and) optimizeContains(ctx context.Context) error {
   204  	// GetSubIterators allocates, so this is currently safe.
   205  	// TODO(kortschak) Reuse it.checkList if possible.
   206  	// This involves providing GetSubIterators with a slice to fill.
   207  	// Generally this is a worthwhile thing to do in other places as well.
   208  	it.checkList = append([]graph.IteratorShape{}, it.sub...)
   209  	return sortByContainsCost(ctx, it.checkList)
   210  }
   211  
   212  // optimizeSubIterators(l) takes a list of iterators and calls Optimize() on all
   213  // of them. It returns two lists -- the first contains the same list as l, where
   214  // any replacements are made by Optimize() and the second contains the originals
   215  // which were replaced.
   216  func optimizeSubIterators(its []graph.Iterator) []graph.Iterator {
   217  	out := make([]graph.Iterator, 0, len(its))
   218  	for _, it := range its {
   219  		o, _ := it.Optimize()
   220  		out = append(out, o)
   221  	}
   222  	return out
   223  }
   224  
   225  // optimizeSubIterators(l) takes a list of iterators and calls Optimize() on all
   226  // of them. It returns two lists -- the first contains the same list as l, where
   227  // any replacements are made by Optimize() and the second contains the originals
   228  // which were replaced.
   229  func optimizeSubIterators2(ctx context.Context, its []graph.IteratorShape) []graph.IteratorShape {
   230  	out := make([]graph.IteratorShape, 0, len(its))
   231  	for _, it := range its {
   232  		o, _ := it.Optimize(ctx)
   233  		out = append(out, o)
   234  	}
   235  	return out
   236  }
   237  
   238  // Check a list of iterators for any Null iterators.
   239  func hasAnyNullIterators(its []graph.IteratorShape) bool {
   240  	for _, it := range its {
   241  		if IsNull2(it) {
   242  			return true
   243  		}
   244  	}
   245  	return false
   246  }
   247  
   248  func materializeIts(ctx context.Context, its []graph.IteratorShape) ([]graph.IteratorShape, error) {
   249  	var out []graph.IteratorShape
   250  
   251  	allStats, stats, err := getStatsForSlice(ctx, its, nil)
   252  	out = append(out, its[0])
   253  	for i, it := range its[1:] {
   254  		st := stats[i+1]
   255  		if st.Size.Size*st.NextCost < (st.ContainsCost * (1 + (st.Size.Size / (allStats.Size.Size + 1)))) {
   256  			if graph.Height(graph.AsLegacy(it), func(it graph.Iterator) bool {
   257  				_, ok := it.(*Materialize)
   258  				return !ok
   259  			}) > 10 {
   260  				out = append(out, newMaterialize(it))
   261  				continue
   262  			}
   263  		}
   264  		out = append(out, it)
   265  	}
   266  	return out, err
   267  }
   268  
   269  func getStatsForSlice(ctx context.Context, its, opt []graph.IteratorShape) (graph.IteratorCosts, []graph.IteratorCosts, error) {
   270  	if len(its) == 0 {
   271  		return graph.IteratorCosts{}, nil, nil
   272  	}
   273  
   274  	arr := make([]graph.IteratorCosts, 0, len(its))
   275  
   276  	primaryStats, _ := its[0].Stats(ctx)
   277  	arr = append(arr, primaryStats)
   278  
   279  	containsCost := primaryStats.ContainsCost
   280  	nextCost := primaryStats.NextCost
   281  	size := primaryStats.Size.Size
   282  	exact := primaryStats.Size.Exact
   283  
   284  	var last error
   285  	for _, sub := range its[1:] {
   286  		stats, err := sub.Stats(ctx)
   287  		if err != nil {
   288  			last = err
   289  		}
   290  		arr = append(arr, stats)
   291  		nextCost += stats.ContainsCost * (1 + (primaryStats.Size.Size / (stats.Size.Size + 1)))
   292  		containsCost += stats.ContainsCost
   293  		if size > stats.Size.Size {
   294  			size = stats.Size.Size
   295  			exact = stats.Size.Exact
   296  		}
   297  	}
   298  	for _, sub := range opt {
   299  		stats, _ := sub.Stats(ctx)
   300  		nextCost += stats.ContainsCost * (1 + (primaryStats.Size.Size / (stats.Size.Size + 1)))
   301  		containsCost += stats.ContainsCost
   302  	}
   303  	return graph.IteratorCosts{
   304  		ContainsCost: containsCost,
   305  		NextCost:     nextCost,
   306  		Size: graph.Size{
   307  			Size:  size,
   308  			Exact: exact,
   309  		},
   310  	}, arr, last
   311  }
   312  
   313  // and.Stats() lives here in and-iterator-optimize.go because it may
   314  // in the future return different statistics based on how it is optimized.
   315  // For now, however, it's pretty static.
   316  //
   317  // Returns the approximate size of the And iterator. Because we're dealing
   318  // with an intersection, we know that the largest we can be is the size of the
   319  // smallest iterator. This is the heuristic we shall follow. Better heuristics
   320  // welcome.
   321  func (it *and) Stats(ctx context.Context) (graph.IteratorCosts, error) {
   322  	stats, _, err := getStatsForSlice(ctx, it.sub, it.opt)
   323  	return stats, err
   324  }