github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/memo/memo.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package memo
    12  
    13  import (
    14  	"context"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    17  	"github.com/cockroachdb/cockroach/pkg/sql/opt/cat"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/opt/props"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/opt/props/physical"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
    22  	"github.com/cockroachdb/cockroach/pkg/util/log"
    23  	"github.com/cockroachdb/errors"
    24  )
    25  
// Memo is a data structure for efficiently storing a forest of query plans.
// Conceptually, the memo is composed of a numbered set of equivalency classes
// called groups where each group contains a set of logically equivalent
// expressions. Two expressions are considered logically equivalent if:
//
//   1. They return the same number and data type of columns. However, order and
//      naming of columns doesn't matter.
//   2. They return the same number of rows, with the same values in each row.
//      However, order of rows doesn't matter.
//
// The different expressions in a single group are called memo expressions
// (memo-ized expressions). The children of a memo expression can themselves be
// part of memo groups. Therefore, the memo forest is composed of every possible
// combination of parent expression with its possible child expressions,
// recursively applied.
//
// Memo expressions can be relational (e.g. join) or scalar (e.g. <). Operators
// are always both logical (specify results) and physical (specify results and
// a particular implementation). This means that even a "raw" unoptimized
// expression tree can be executed (naively). Both relational and scalar
// operators are uniformly represented as nodes in memo expression trees, which
// facilitates tree pattern matching and replacement. However, because scalar
// expression memo groups never have more than one expression, scalar
// expressions can use a simpler representation.
//
// Because memo groups contain logically equivalent expressions, all the memo
// expressions in a group share the same logical properties. However, it's
// possible for two logically equivalent expressions to be placed in different
// memo groups. This occurs because determining logical equivalency of two
// relational expressions is too complex to perform 100% correctly. A
// correctness failure (i.e. considering two expressions logically equivalent
// when they are not) results in invalid transformations and invalid plans.
// But placing two logically equivalent expressions in different groups has a
// much gentler failure mode: the memo and transformations are less efficient.
// Expressions within the memo may have different physical properties. For
// example, a memo group might contain both hash join and merge join
// expressions which produce the same set of output rows, but produce them in
// different orders.
//
// Expressions are inserted into the memo by the factory, which ensures that
// expressions have been fully normalized before insertion (see the comment in
// factory.go for more details). A new group is created only when unique
// normalized expressions are created by the factory during construction or
// rewrite of the tree. Uniqueness is determined by "interning" each expression,
// which means that multiple equivalent expressions are mapped to a single
// in-memory instance. This allows interned expressions to be checked for
// equivalence by simple pointer comparison. For example:
//
//   SELECT * FROM a, b WHERE a.x = b.x
//
// After insertion into the memo, the memo would contain these six groups, with
// numbers substituted for pointers to the normalized expression in each group:
//
//   G6: [inner-join [G1 G2 G5]]
//   G5: [eq [G3 G4]]
//   G4: [variable b.x]
//   G3: [variable a.x]
//   G2: [scan b]
//   G1: [scan a]
//
// Each leaf expression is interned by hashing its operator type and any
// private field values. Expressions higher in the tree can then rely on the
// fact that all children have been interned, and include their pointer values
// in their own hash values. Therefore, the memo need only hash the expression's
// fields in order to determine whether the expression already exists in the
// memo. Walking the subtree is not necessary.
//
// The normalizing factory will never add more than one expression to a memo
// group. But the explorer does add denormalized expressions to existing memo
// groups, since oftentimes one of these equivalent, but denormalized
// expressions will have a lower cost than the initial normalized expression
// added by the factory. For example, the join commutativity transformation
// expands the memo like this:
//
//   G6: [inner-join [G1 G2 G5]] [inner-join [G2 G1 G5]]
//   G5: [eq [G3 G4]]
//   G4: [variable b.x]
//   G3: [variable a.x]
//   G2: [scan b]
//   G1: [scan a]
//
// See the comments in explorer.go for more details.
type Memo struct {
	// metadata provides information about the columns and tables used in this
	// particular query.
	metadata opt.Metadata

	// interner interns all expressions in the memo, ensuring that there is at
	// most one instance of each expression in the memo.
	interner interner

	// logPropsBuilder is inlined in the memo so that it can be reused each time
	// scalar or relational properties need to be built.
	logPropsBuilder logicalPropsBuilder

	// rootExpr is the root expression of the memo expression forest. It is set
	// via a call to SetRoot. After optimization, it is set to be the root of the
	// lowest cost tree in the forest.
	rootExpr opt.Expr

	// rootProps are the physical properties required of the root memo expression.
	// It is set via a call to SetRoot.
	rootProps *physical.Required

	// memEstimate is the approximate memory usage of the memo, in bytes.
	memEstimate int64

	// The following are selected fields from SessionData which can affect
	// planning. We need to cross-check these before reusing a cached memo.
	dataConversion      sessiondata.DataConversionConfig
	reorderJoinsLimit   int
	zigzagJoinEnabled   bool
	optimizerFKChecks   bool
	optimizerFKCascades bool
	useHistograms       bool
	useMultiColStats    bool
	safeUpdates         bool
	saveTablesPrefix    string
	insertFastPath      bool

	// curID is the highest currently in-use scalar expression ID.
	curID opt.ScalarID

	// curWithID is the highest currently in-use WITH ID.
	curWithID opt.WithID

	// newGroupFn is the callback registered via NotifyOnNewGroup; it is nil
	// until that method is called.
	// NOTE(review): Init does not reset this field, so a callback survives
	// memo reuse -- confirm whether that is intentional.
	newGroupFn func(opt.Expr)

	// WARNING: if you add more members, add initialization code in Init.
}
   156  
   157  // Init initializes a new empty memo instance, or resets existing state so it
   158  // can be reused. It must be called before use (or reuse). The memo collects
   159  // information about the context in which it is compiled from the evalContext
   160  // argument. If any of that changes, then the memo must be invalidated (see the
   161  // IsStale method for more details).
   162  func (m *Memo) Init(evalCtx *tree.EvalContext) {
   163  	m.metadata.Init()
   164  	m.interner.Clear()
   165  	m.logPropsBuilder.init(evalCtx, m)
   166  
   167  	m.rootExpr = nil
   168  	m.rootProps = nil
   169  	m.memEstimate = 0
   170  
   171  	m.dataConversion = evalCtx.SessionData.DataConversion
   172  	m.reorderJoinsLimit = evalCtx.SessionData.ReorderJoinsLimit
   173  	m.zigzagJoinEnabled = evalCtx.SessionData.ZigzagJoinEnabled
   174  	m.optimizerFKChecks = evalCtx.SessionData.OptimizerFKChecks
   175  	m.optimizerFKCascades = evalCtx.SessionData.OptimizerFKCascades
   176  	m.useHistograms = evalCtx.SessionData.OptimizerUseHistograms
   177  	m.useMultiColStats = evalCtx.SessionData.OptimizerUseMultiColStats
   178  	m.safeUpdates = evalCtx.SessionData.SafeUpdates
   179  	m.saveTablesPrefix = evalCtx.SessionData.SaveTablesPrefix
   180  	m.insertFastPath = evalCtx.SessionData.InsertFastPath
   181  
   182  	m.curID = 0
   183  	m.curWithID = 0
   184  }
   185  
// NotifyOnNewGroup sets a callback function which is invoked each time we
// create a new memo group. The callback is stored in the memo's newGroupFn
// field, replacing any previously registered callback.
func (m *Memo) NotifyOnNewGroup(fn func(opt.Expr)) {
	m.newGroupFn = fn
}
   191  
   192  // IsEmpty returns true if there are no expressions in the memo.
   193  func (m *Memo) IsEmpty() bool {
   194  	// Root expression can be nil before optimization and interner is empty after
   195  	// exploration, so check both.
   196  	return m.interner.Count() == 0 && m.rootExpr == nil
   197  }
   198  
   199  // MemoryEstimate returns a rough estimate of the memo's memory usage, in bytes.
   200  // It only includes memory usage that is proportional to the size and complexity
   201  // of the query, rather than constant overhead bytes.
   202  func (m *Memo) MemoryEstimate() int64 {
   203  	// Multiply by 2 to take rough account of allocation fragmentation, private
   204  	// data, list overhead, properties, etc.
   205  	return m.memEstimate * 2
   206  }
   207  
// Metadata returns the metadata instance associated with the memo. The
// returned pointer refers to the memo's own embedded metadata field, not to a
// copy.
func (m *Memo) Metadata() *opt.Metadata {
	return &m.metadata
}
   212  
// RootExpr returns the root memo expression previously set via a call to
// SetRoot or SetScalarRoot. It is nil after Init until one of those methods
// is called.
func (m *Memo) RootExpr() opt.Expr {
	return m.rootExpr
}
   218  
// RootProps returns the physical properties required of the root memo group,
// previously set via a call to SetRoot. It is nil after Init until SetRoot is
// called.
func (m *Memo) RootProps() *physical.Required {
	return m.rootProps
}
   224  
   225  // SetRoot stores the root memo expression when it is a relational expression,
   226  // and also stores the physical properties required of the root group.
   227  func (m *Memo) SetRoot(e RelExpr, phys *physical.Required) {
   228  	m.rootExpr = e
   229  	if m.rootProps != phys {
   230  		m.rootProps = m.InternPhysicalProps(phys)
   231  	}
   232  
   233  	// Once memo is optimized, release reference to the eval context and free up
   234  	// the memory used by the interner.
   235  	if m.IsOptimized() {
   236  		m.logPropsBuilder.clear()
   237  		m.interner.Clear()
   238  	}
   239  }
   240  
// SetScalarRoot stores the root memo expression when it is a scalar expression.
// Used only for testing. Unlike SetRoot, it leaves the root physical
// properties untouched.
func (m *Memo) SetScalarRoot(scalar opt.ScalarExpr) {
	m.rootExpr = scalar
}
   246  
   247  // HasPlaceholders returns true if the memo contains at least one placeholder
   248  // operator.
   249  func (m *Memo) HasPlaceholders() bool {
   250  	rel, ok := m.rootExpr.(RelExpr)
   251  	if !ok {
   252  		panic(errors.AssertionFailedf("placeholders only supported when memo root is relational"))
   253  	}
   254  
   255  	return rel.Relational().HasPlaceholder
   256  }
   257  
   258  // IsStale returns true if the memo has been invalidated by changes to any of
   259  // its dependencies. Once a memo is known to be stale, it must be ejected from
   260  // any query cache or prepared statement and replaced with a recompiled memo
   261  // that takes into account the changes. IsStale checks the following
   262  // dependencies:
   263  //
   264  //   1. Current database: this can change name resolution.
   265  //   2. Current search path: this can change name resolution.
   266  //   3. Current location: this determines time zone, and can change how time-
   267  //      related types are constructed and compared.
   268  //   4. Data source schema: this determines most aspects of how the query is
   269  //      compiled.
   270  //   5. Data source privileges: current user may no longer have access to one or
   271  //      more data sources.
   272  //
   273  // This function cannot swallow errors and return only a boolean, as it may
   274  // perform KV operations on behalf of the transaction associated with the
   275  // provided catalog, and those errors are required to be propagated.
   276  func (m *Memo) IsStale(
   277  	ctx context.Context, evalCtx *tree.EvalContext, catalog cat.Catalog,
   278  ) (bool, error) {
   279  	// Memo is stale if fields from SessionData that can affect planning have
   280  	// changed.
   281  	if !m.dataConversion.Equals(&evalCtx.SessionData.DataConversion) ||
   282  		m.reorderJoinsLimit != evalCtx.SessionData.ReorderJoinsLimit ||
   283  		m.zigzagJoinEnabled != evalCtx.SessionData.ZigzagJoinEnabled ||
   284  		m.optimizerFKChecks != evalCtx.SessionData.OptimizerFKChecks ||
   285  		m.optimizerFKCascades != evalCtx.SessionData.OptimizerFKCascades ||
   286  		m.useHistograms != evalCtx.SessionData.OptimizerUseHistograms ||
   287  		m.useMultiColStats != evalCtx.SessionData.OptimizerUseMultiColStats ||
   288  		m.safeUpdates != evalCtx.SessionData.SafeUpdates ||
   289  		m.saveTablesPrefix != evalCtx.SessionData.SaveTablesPrefix ||
   290  		m.insertFastPath != evalCtx.SessionData.InsertFastPath {
   291  		return true, nil
   292  	}
   293  
   294  	// Memo is stale if the fingerprint of any object in the memo's metadata has
   295  	// changed, or if the current user no longer has sufficient privilege to
   296  	// access the object.
   297  	if depsUpToDate, err := m.Metadata().CheckDependencies(ctx, catalog); err != nil {
   298  		return true, err
   299  	} else if !depsUpToDate {
   300  		return true, nil
   301  	}
   302  	return false, nil
   303  }
   304  
   305  // InternPhysicalProps adds the given physical props to the memo if they haven't
   306  // yet been added. If the same props was added previously, then return a pointer
   307  // to the previously added props. This allows interned physical props to be
   308  // compared for equality using simple pointer comparison.
   309  func (m *Memo) InternPhysicalProps(phys *physical.Required) *physical.Required {
   310  	// Special case physical properties that require nothing of operator.
   311  	if !phys.Defined() {
   312  		return physical.MinRequired
   313  	}
   314  	return m.interner.InternPhysicalProps(phys)
   315  }
   316  
   317  // SetBestProps updates the physical properties, provided ordering, and cost of
   318  // a relational expression's memo group (see the relevant methods of RelExpr).
   319  // It is called by the optimizer once it determines the expression in the group
   320  // that is part of the lowest cost tree (for the overall query).
   321  func (m *Memo) SetBestProps(
   322  	e RelExpr, required *physical.Required, provided *physical.Provided, cost Cost,
   323  ) {
   324  	if e.RequiredPhysical() != nil {
   325  		if e.RequiredPhysical() != required ||
   326  			!e.ProvidedPhysical().Equals(provided) ||
   327  			e.Cost() != cost {
   328  			panic(errors.AssertionFailedf(
   329  				"cannot overwrite %s / %s (%.9g) with %s / %s (%.9g)",
   330  				e.RequiredPhysical(),
   331  				e.ProvidedPhysical(),
   332  				log.Safe(e.Cost()),
   333  				required.String(),
   334  				provided.String(), // Call String() so provided doesn't escape.
   335  				cost,
   336  			))
   337  		}
   338  		return
   339  	}
   340  	bp := e.bestProps()
   341  	bp.required = required
   342  	bp.provided = *provided
   343  	bp.cost = cost
   344  }
   345  
// ResetCost updates the cost of a relational expression's memo group. It
// should *only* be called by Optimizer.RecomputeCost() for testing purposes.
// Unlike SetBestProps, it overwrites the stored cost unconditionally and
// leaves the required and provided properties untouched.
func (m *Memo) ResetCost(e RelExpr, cost Cost) {
	e.bestProps().cost = cost
}
   351  
   352  // IsOptimized returns true if the memo has been fully optimized.
   353  func (m *Memo) IsOptimized() bool {
   354  	// The memo is optimized once the root expression has its physical properties
   355  	// assigned.
   356  	rel, ok := m.rootExpr.(RelExpr)
   357  	return ok && rel.RequiredPhysical() != nil
   358  }
   359  
// NextID returns a new unique ScalarID to number expressions with. It
// increments the memo's curID counter and returns the new value, so the first
// ID handed out after Init is 1.
func (m *Memo) NextID() opt.ScalarID {
	m.curID++
	return m.curID
}
   365  
   366  // RequestColStat calculates and returns the column statistic calculated on the
   367  // relational expression.
   368  func (m *Memo) RequestColStat(
   369  	expr RelExpr, cols opt.ColSet,
   370  ) (colStat *props.ColumnStatistic, ok bool) {
   371  	// When SetRoot is called, the statistics builder may have been cleared.
   372  	// If this happens, we can't serve the request anymore.
   373  	if m.logPropsBuilder.sb.md != nil {
   374  		return m.logPropsBuilder.sb.colStat(cols, expr), true
   375  	}
   376  	return nil, false
   377  }
   378  
   379  // RowsProcessed calculates and returns the number of rows processed by the
   380  // relational expression. It is currently only supported for joins.
   381  func (m *Memo) RowsProcessed(expr RelExpr) (_ float64, ok bool) {
   382  	// When SetRoot is called, the statistics builder may have been cleared.
   383  	// If this happens, we can't serve the request anymore.
   384  	if m.logPropsBuilder.sb.md != nil {
   385  		return m.logPropsBuilder.sb.rowsProcessed(expr), true
   386  	}
   387  	return 0, false
   388  }
   389  
// NextWithID returns a not-yet-assigned identifier for a WITH expression. It
// increments the memo's curWithID counter and returns the new value, so the
// first ID handed out after Init is 1.
func (m *Memo) NextWithID() opt.WithID {
	m.curWithID++
	return m.curWithID
}
   395  
   396  // ClearColStats clears all column statistics from every relational expression
   397  // in the memo. This is used to free up the potentially large amount of memory
   398  // used by histograms.
   399  func (m *Memo) ClearColStats(parent opt.Expr) {
   400  	for i, n := 0, parent.ChildCount(); i < n; i++ {
   401  		child := parent.Child(i)
   402  		m.ClearColStats(child)
   403  	}
   404  
   405  	switch t := parent.(type) {
   406  	case RelExpr:
   407  		t.Relational().Stats.ColStats = props.ColStatsMap{}
   408  	}
   409  }