github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/func_dep.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/props/func_dep.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package props
    12  
    13  import (
    14  	"fmt"
    15  	"strings"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/sql/opt"
    18  	"github.com/cockroachdb/cockroach/pkg/util/log"
    19  	"github.com/cockroachdb/errors"
    20  )
    21  
    22  // FuncDepSet is a set of functional dependencies (FDs) that encode useful
    23  // relationships between columns in a base or derived relation. Given two sets
    24  // of columns A and B, a functional dependency A-->B holds if A fully determines
    25  // B. In other words, if two different rows have equal values for columns in A,
    26  // then those two rows will also have equal values for columns in B. For
    27  // example, where columns (a1, a2) are in set A, and column (b1) is in set B:
    28  //
    29  //   a1 a2 b1
    30  //   --------
    31  //   1  2  5
    32  //   1  2  5
    33  //   3  4  6
    34  //   3  4  6
    35  //
    36  // The left side of a functional dependency is called the "determinant", and
    37  // the right side is called the "dependant". Each side can contain zero or more
    38  // columns, though the FuncDepSet library will fold away certain combinations
    39  // that don't provide useful information, like A-->A and A-->(), since every
    40  // column trivially determines itself, as well as the empty set.
    41  //
    42  // When a dependant contains multiple columns, it is equivalent to splitting
    43  // the single FD into multiple FDs, each with a single column dependant:
    44  //
    45  //   (a)-->(b,c)
    46  //
    47  // is equivalent to these two FDs:
    48  //
    49  //   (a)-->(b)
    50  //   (a)-->(c)
    51  //
    52  // When a determinant contains zero columns, as in ()-->A, then A is fully
    53  // determined without reference to any other columns. An equivalent statement is
    54  // that any arbitrary combination of determinant columns trivially determines A.
    55  // And both of these statements are just another way of saying that columns in A
    56  // are constant:
    57  //
    58  //   a1 a2    b1 c1
    59  //   ----------------
    60  //   1  NULL  3  3
    61  //   1  NULL  3  NULL
    62  //   1  NULL  4  NULL
    63  //
    64  // When a determinant contains multiple columns, then the functional dependency
    65  // holds for the *composite* value of those columns. For example:
    66  //
    67  //   a1 a2 b1
    68  //   --------
    69  //   1  2  5
    70  //   1  2  5
    71  //   1  3  4
    72  //
    73  // These are valid values, even though a1 has the same values for all three
    74  // rows, because it is only the combination of (a1,a2) that determines (b1).
    75  //
    76  // Multiple FDs can be transitively applied in order to compute the "closure" of
    77  // a set of input columns. The closure includes the input columns plus all
    78  // columns that are functionally dependent on those columns, either directly or
    79  // indirectly. Consider this set of FD's:
    80  //
    81  //   (a)-->(b,c,d)
    82  //   (b,c,e)-->(f)
    83  //   (d)-->(e)
    84  //
    85  // The transitive closure of (a) is (a,b,c,d,e,f). To start, (a) determines
    86  // (b,c,d). From there, (d) transitively determines (e). And now that (b,c,e)
    87  // have been determined, they in turn determine (f). Because (a) determines all
    88  // other columns, if two rows have the same value for (a), then the rows will be
    89  // duplicates, since all other columns will be equal. And if there are no
    90  // duplicate rows, then (a) is a key for the relation.
    91  //
    92  // Deriving FD Sets
    93  //
    94  // Base table primary keys can be trivially mapped into an FD set, since the
    95  // primary key always uniquely determines the other columns:
    96  //
    97  //   CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT)
    98  //   (a)-->(b,c)
    99  //
   100  // Each SQL relational operator derives its own FD set from the FD sets of its
   101  // inputs. For example, the Select operator augments the FD set of its input,
   102  // based on its filter condition:
   103  //
   104  //   SELECT * FROM t WHERE a=1
   105  //
   106  // Equating a column to a constant value constructs a new FD with an empty
   107  // determinant, so that the augmented FD set becomes:
   108  //
   109  //   (a)-->(b,c)
   110  //   ()-->(a)
   111  //
   112  // Since the value of column "a" is always the same, and since "a" functionally
   113  // determines "b" and "c", the values of all columns are constants. Furthermore,
   114  // because "a" is known to be a key, the result set can have at most one row.
   115  //
   116  // This is but one example of how FDs can assist the optimizer in proving useful
   117  // properties about query results. This information powers many optimizations,
   118  // including eliminating unnecessary DISTINCT operators, simplifying ORDER BY
   119  // columns, removing Max1Row operators, and mapping semi-joins to inner-joins.
   120  //
   121  // NULL Values
   122  //
   123  // FDs become more complex when the possibility of NULL values is introduced.
   124  // SQL semantics often treat a NULL value as an "unknown" value that is not
   125  // equal to any other value, including another NULL value. For example, SQL
   126  // unique indexes exhibit this behavior:
   127  //
   128  //   CREATE TABLE t (a INT PRIMARY KEY, b INT, c INT, UNIQUE (b))
   129  //
   130  // Here, "b" column values are unique...except for the case of multiple NULL
   131  // values, which are allowed because each NULL is treated as if it was a
   132  // different value. Contrast this with the different NULL handling rules used
   133  // by SQL's GROUP BY and DISTINCT operators. Those operators treat multiple NULL
   134  // values as duplicates, because each NULL is treated as if it was the same
   135  // value.
   136  //
   137  // The functional dependencies described up until now always use the "NULLs are
   138  // equal" semantics (denoted as NULL= hereafter) in order to answer the question
   139  // "are these two columns equal". The semantics are identical to what this SQL
   140  // expression returns:
   141  //
   142  //   ((c1 = c2) OR (c1 IS NULL AND c2 IS NULL)) IS True
   143  //
   144  // And here are some examples:
   145  //
   146  //   c1    c2    NULL=
   147  //   -----------------
   148  //   1     1     true
   149  //   NULL  NULL  true
   150  //   1     2     false
   151  //   1     NULL  false
   152  //   NULL  1     false
   153  //
   154  // So now for the definition of A-->B that incorporates NULL values:
   155  //
   156  //   for any two rows r1 and r2 in the relation:
   157  //   A(r1) NULL= A(r2) ==> B(r1) NULL= B(r2)
   158  //
   159  // Intuitively, if two different rows have equal values for A using "NULLs are
   160  // equal" semantics, then those rows will also have equal values for B using
   161  // those same semantics. As an example, the following sets of rows would be
   162  // valid for the dependency (b)-->(c):
   163  //
   164  //   b     c
   165  //   ----------
   166  //   1     NULL
   167  //   1     NULL
   168  //   NULL  1
   169  //   NULL  1
   170  //   2     3
   171  //   2     3
   172  //
   173  //   b     c
   174  //   ----------
   175  //   NULL  NULL
   176  //   NULL  NULL
   177  //
   178  // but these sets of rows would be invalid:
   179  //
   180  //   b     c
   181  //   ----------
   182  //   NULL  1
   183  //   NULL  NULL
   184  //
   185  //   b     c
   186  //   ----------
   187  //   NULL  1
   188  //   NULL  2
   189  //
   190  // Unique constraints allow the latter cases, however, and therefore it is
   191  // desirable to somehow encode these weaker dependencies as FDs, because they
   192  // can be strengthened later on if NULL values are filtered from determinant
   193  // columns (more on that below).
   194  //
   195  // The solution is to store an extra "strict" bit on each FD. If true, then the
   196  // FD is a "strict" dependency, and behaves as described above. However, if
   197  // false, then the FD is a "lax" dependency. Lax dependencies use "squiggly"
   198  // arrow notation to differentiate them from the strict variant:
   199  //
   200  //   A~~>B
   201  //
   202  // In contrast to strict dependencies, lax dependencies treat NULLs on
   203  // determinant columns as distinct from one another, with equality semantics
   204  // identical to this SQL expression:
   205  //
   206  //   (c1 = c2) IS True
   207  //
   208  // In other words, if either c1 or c2 is NULL, or both are NULL, then c1 is
   209  // considered not equal to c2. The definition for A~~>B follows from that:
   210  //
   211  //   for any two rows r1 and r2 in the relation:
   212  //   (A(r1) = A(r2)) IS True ==> B(r1) NULL= B(r2)
   213  //
   214  // In other words, if two different non-NULL rows have equal values for A, then
   215  // those rows will also have equal values for B using NULL= semantics. Note that
   216  // both strict and lax equality definitions collapse to the same semantics when
   217  // the columns of A are not-NULL. The example row sets shown above that were
   218  // invalid for a strict dependency are valid for a lax dependency:
   219  //
   220  //   b     c
   221  //   ----------
   222  //   NULL  1
   223  //   NULL  NULL
   224  //
   225  //   b     c
   226  //   ----------
   227  //   NULL  1
   228  //   NULL  2
   229  //
   230  // To continue the CREATE TABLE example shown above, another FD can now be
   231  // derived from that statement, in addition to the primary key FD:
   232  //
   233  //   (a)-->(b,c)
   234  //   (b)~~>(a,c)
   235  //
   236  // Lax dependencies are *not* transitive, and have limited usefulness as-is.
   237  // However, some operators (like Select) can "reject" NULL values, which means
   238  // that they filter out rows containing the troublesome NULL values. That makes
   239  // it possible for the operator to "upgrade" a lax dependency to a strict
   240  // dependency (recall that both have identical semantics when NULLs are not
   241  // present), as in this example:
   242  //
   243  //   SELECT * FROM t WHERE b>5
   244  //
   245  // The ">" operator rejects NULL values, which means that the Select operator
   246  // can convert the lax dependency to a strict dependency:
   247  //
   248  //   (a)-->(b,c)
   249  //   (b)-->(a,c)
   250  //
   251  // Now, either the "a" or "b" column determines the values of all other columns,
   252  // and both are keys for the relation.
   253  //
   254  // Another thing to note is that a lax dependency with an empty determinant is
   255  // the same as the corresponding strict dependency:
   256  //
   257  //   ()~~>(a,b)
   258  //   ()-->(a,b)
   259  //
   260  // As described above, a strict dependency differs from a lax dependency only in
   261  // terms of what values are allowed in determinant columns. Since the
   262  // determinant has no columns in these cases, the semantics will be identical.
   263  // For that reason, this library automatically maps lax constant dependencies to
   264  // strict constant dependencies.
   265  //
   266  // Keys
   267  //
   268  // A key is a set of columns that have a unique composite value for every row in
   269  // the relation. There are two kinds of keys, strict and lax, that parallel the
   270  // two kinds of functional dependencies. Strict keys treat NULL values in key
   271  // columns as equal to one another:
   272  //
   273  //   b     c
   274  //   --------
   275  //   1     10
   276  //   2     20
   277  //   NULL  30
   278  //
   279  // Here, "b" is a key for the relation, even though it contains a NULL value,
   280  // because there is only one such value. Multiple NULL values would violate the
   281  // strict key, because they would compare as equal, and therefore would be
   282  // considered duplicates. The SQL GROUP BY operator uses the same semantics for
   283  // grouping (it's no coincidence that the definition for strict keys follows
   284  // that lead).
   285  //
   286  // By contrast, lax keys treat NULL values in key columns as distinct from one
   287  // another, and so consider column "b" as unique in the following example:
   288  //
   289  //   b     c
   290  //   --------
   291  //   1     10
   292  //   2     20
   293  //   NULL  30
   294  //   NULL  40
   295  //
   296  // Note that both strict and lax keys treat non-NULL values identically; values
   297  // from two different rows must never compare equal to one another. In addition,
   298  // the presence of a strict or lax key always implies a functional dependency
   299  // with the key as determinant and all other columns in the relation as
   300  // dependants. Here is an example assuming a table with columns (a,b,c,d):
   301  //
   302  //   lax-key(a,b)    => (a,b)~~>(c,d)
   303  //   strict-key(a,b) => (a,b)-->(c,d)
   304  //
   305  // The "empty key" is a special key that has zero columns. It is used when the
   306  // relation is guaranteed to have at most one row. In this special case, every
   307  // column is constant. Every combination of columns is a trivial key for the
   308  // relation and determines every other column. Because the lax and strict key
   309  // designations are equivalent when there is a single row, empty keys are always
   310  // normalized to be strict for convenience.
   311  //
   312  // FuncDepSet tracks whether at least one key (whether it be strict or lax)
   313  // exists for the relation. If this is true, then all possible keys for the
   314  // relation can be enumerated using the FD set. This is because any subset of
   315  // columns forms a key if its FD closure contains every column in the relation.
   316  // Therefore, all keys can be brute force enumerated by checking the closure of
   317  // each combination in the power set. Again, this is only possible when the
   318  // relation is known to have a key; otherwise, knowing the closure contains all
   319  // columns is not a sufficient condition to identify a key, because of the
   320  // possibility of duplicate rows.
   321  //
   322  // In practice, it is never necessary to enumerate all possible keys (fortunate,
   323  // since there can be O(2**N) of them), since the vast majority of them turn out
   324  // to have redundant columns that can be functionally determined from other
   325  // columns in the key. Of more value is the set of "candidate keys", which are
   326  // keys that contain no redundant columns. Removing any column from such a key
   327  // causes it to no longer be a key. It is possible to enumerate the set of
   328  // candidate keys in polynomial rather than exponential time (see Wikipedia
   329  // "Candidate key" entry).
   330  //
   331  // However, since even polynomial time can be problematic, this library tries to
   332  // avoid enumerating keys by storing and maintaining a single candidate key for
   333  // the relation. And while it is not always successful, the library tries to
   334  // keep the candidate key that has the fewest number of columns. In most cases,
   335  // this single key is enough to satisfy the requirements of the optimizer. But
   336  // when it is not enough, or the existing key is no longer valid, then a new
   337  // candidate key can always be generated.
   338  //
   339  // It turns out that the most common key-related question that must be answered
   340  // is not "what is the list of keys for this relation?", but instead, "does
   341  // this set of columns contain a key for the relation?". The latter question can
   342  // be easily answered by computing the closure of the columns, and checking
   343  // whether the closure contains the key maintained by FuncDepSet. And when a
   344  // relatively short key is needed (e.g. during decorrelation), FuncDepSet has
   345  // one ready to go.
   346  //
   347  // Equivalent Columns
   348  //
   349  // FD sets encode "equivalent columns", which are pairs of columns that always
   350  // have equal values using the SQL equality operator with NULL= semantics. Two
   351  // columns a and b are equivalent if the following expression returns true:
   352  //
   353  //   ((a = b) OR (a IS NULL AND b IS NULL)) IS True
   354  //
   355  // Equivalent columns are typically derived from a Select filter condition, and
   356  // are represented as two FDs with each column acting as both determinant and
   357  // dependant:
   358  //
   359  //   SELECT * FROM t WHERE b=c
   360  //   (a)-->(b,c)
   361  //   (b)~~>(a,c)
   362  //   (b)==(c)
   363  //   (c)==(b)
   364  //
   365  // In the common case shown above, the WHERE clause rejects NULL values, so the
   366  // equivalency will always be strict, which means it retains all the same
   367  // properties of a strict dependency. While lax equivalencies are theoretically
   368  // possible, the library currently maps them into regular lax dependencies to
   369  // simplify implementation.
   370  //
   371  // Theory to Practice
   372  //
   373  // For a more rigorous examination of functional dependencies and their
   374  // interaction with various SQL operators, see the following Master's Thesis:
   375  //
   376  //   Norman Paulley, Glenn. (2000).
   377  //   Exploiting Functional Dependence in Query Optimization.
   378  //   https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf
   379  //
   380  // While this paper served as the inspiration for this library, a number of
   381  // details differ, including (but not limited to):
   382  //
   383  //   1. Most importantly, the definition of "lax" used in the paper differs from
   384  //      the definition used by this library. For a lax dependency A~~>B, the
   385  //      paper allows this set of rows:
   386  //
   387  //        a  b
   388  //        -------
   389  //        1  1
   390  //        1  NULL
   391  //
   392  //      This library disallows that, since it requires that if the determinant
   393  //      of a lax dependency is not-null, then it is equivalent to a strict
   394  //      dependency. This alternate definition is briefly covered in section
   395  //      2.5.3.2 of the paper (see definition 2.19). The reason for this change
   396  //      is to allow a lax dependency to be upgraded to a strict dependency more
   397  //      readily, needing only the determinant columns to be not-null rather than
   398  //      both determinant and dependant columns.
   399  //
   400  //   2. The paper simplifies FD sets so that dependants never contain more than
   401  //      one column. This library allows multiple dependent columns, since they
   402  //      can be so efficiently stored and processed as ColSets.
   403  //
   404  //   3. The paper deliberately avoids all simplifications when a SQL operator
   405  //      adds new FDs to an existing FD set, in order to avoid unneeded work and
   406  //      expensive reductions. This library performs quite a few simplifications
   407  //      in order to keep the FD set more manageable and understandable.
   408  //
   409  //   4. The paper "colors" columns black when they are no longer part of a
   410  //      derived relation. Rather than marking removed columns, this library
   411  //      actually removes them from the FD set.
   412  //
   413  //   5. In order to ensure a unique key for every relation, the paper uses a
   414  //      special "tuple identifier" that acts like a virtual column and can be
   415  //      both a determinant and a dependant. If the transitive closure of any set
   416  //      of columns includes the tuple identifier column, then that set of
   417  //      columns is a super key for the relation. As described in the Keys
   418  //      section above, this library takes a simplified approach so that it
   419  //      doesn't need to allocate virtual columns in property derivation code.
   420  //
   421  type FuncDepSet struct {
   422  	// deps contains the functional dependencies that have a non-trivial
   423  	// determinant and dependant (i.e. not empty, with no overlapping columns):
   424  	//
   425  	//   (a)-->(b,c)
   426  	//   (b,c)~~>(a,d)
   427  	//   (d)==(e)
   428  	//   (e)==(d)
   429  	//
   430  	// See the above comments for more details.
   431  	//
   432  	// This slice is owned by this FuncDepSet and shouldn't be shared unless
   433  	// all referencing sets are treated as immutable.
   434  	deps []funcDep
   435  
   436  	// hasKey is:
   437  	//  - strictKey if the relation has no duplicate rows, which means at least
   438  	//    one subset of its columns form a key (all columns, if no other subset).
   439  	//    The key field contains one such key. See the "Keys" section above for
   440  	//    more details. A strict key can be empty.
   441  	//  - laxKey if there is a at least one subset of columns that form a lax key.
   442  	//    The key field contains one such key. A lax key cannot be empty.
   443  	//
   444  	// See the "Keys" section above for more details.
   445  	hasKey keyType
   446  
   447  	// key contains a set of columns that form a key or a lax key for the
   448  	// relation, depending on hasKey; empty if hasKey is noKey.
   449  	//
   450  	// There is no guarantee that the key has the minimum possible number of
   451  	// columns, but a best effort is made to keep it as short as possible.
   452  	//
   453  	// See the "Keys" section above for more details.
   454  	//
   455  	// This set is immutable; to update it, replace it with a different set
   456  	// containing the desired columns.
   457  	key opt.ColSet
   458  }
   459  
   460  type keyType int8
   461  
   462  const (
   463  	noKey keyType = iota
   464  	laxKey
   465  	strictKey
   466  )
   467  
   468  // funcDep stores a single functional dependency. See the comment for FuncDepSet
   469  // for more details.
   470  type funcDep struct {
   471  	// from is the determinant of the functional dependency (easier to read the
   472  	// code when "from" is used rather than "determinant").
   473  	//
   474  	// This set is immutable; to update it, replace it with a different set
   475  	// containing the desired columns.
   476  	from opt.ColSet
   477  
   478  	// to is the dependant of the functional dependency (easier to read the code
   479  	// when "to" is used rather than "dependant").
   480  	//
   481  	// This set is immutable; to update it, replace it with a different set
   482  	// containing the desired columns.
   483  	to opt.ColSet
   484  
   485  	// strict is true if NULL values in the determinant are treated as if they are
   486  	// equal to other NULL values. Every NULL determinant must therefore map to
   487  	// the same dependant value. If strict is false, then two NULL determinants
   488  	// can map to different dependant values. See the NULL Values section in the
   489  	// FuncDeps comment for more details.
   490  	strict bool
   491  
   492  	// equiv is true if the value of the determinant equals the value of each of
   493  	// the dependant columns, and false if there's no known equality relationship.
   494  	// If equiv is true, the determinant may only consist of a single column.
   495  	equiv bool
   496  }
   497  
   498  // StrictKey returns a strict key for the relation, if there is one.
   499  // A best effort is made to return a candidate key that has the fewest columns.
   500  func (f *FuncDepSet) StrictKey() (_ opt.ColSet, ok bool) {
   501  	if f.hasKey == strictKey {
   502  		return f.key, true
   503  	}
   504  	return opt.ColSet{}, false
   505  }
   506  
   507  // LaxKey returns a lax key for the relation, if there is one.
   508  // Note that strict keys are implicitly also lax keys, so if the relation has a
   509  // strict key, this returns the same key as StrictKey().
   510  // A best effort is made to return a lax key that has the fewest columns.
   511  func (f *FuncDepSet) LaxKey() (_ opt.ColSet, ok bool) {
   512  	if f.hasKey != noKey {
   513  		return f.key, true
   514  	}
   515  	return opt.ColSet{}, false
   516  }
   517  
   518  // Empty is true if the set contains no FDs and no key.
   519  func (f *FuncDepSet) Empty() bool {
   520  	return len(f.deps) == 0 && f.hasKey == noKey
   521  }
   522  
   523  // ColSet returns all columns referenced by the FD set.
   524  func (f *FuncDepSet) ColSet() opt.ColSet {
   525  	var cols opt.ColSet
   526  	for i := 0; i < len(f.deps); i++ {
   527  		fd := &f.deps[i]
   528  		cols.UnionWith(fd.from)
   529  		cols.UnionWith(fd.to)
   530  	}
   531  	if f.hasKey != noKey {
   532  		// There are cases where key columns don't show up in FDs. For example:
   533  		//   lax-key(2,3); ()-->(1)
   534  		cols.UnionWith(f.key)
   535  	}
   536  	return cols
   537  }
   538  
   539  // HasMax1Row returns true if the relation has zero or one rows.
   540  func (f *FuncDepSet) HasMax1Row() bool {
   541  	return f.hasKey == strictKey && f.key.Empty()
   542  }
   543  
   544  // CopyFrom copies the given FD into this FD, replacing any existing data.
   545  func (f *FuncDepSet) CopyFrom(fdset *FuncDepSet) {
   546  	// Make certain to copy FDs to the slice owned by this set.
   547  	f.deps = f.deps[:0]
   548  	f.deps = append(f.deps, fdset.deps...)
   549  	f.key = fdset.key
   550  	f.hasKey = fdset.hasKey
   551  }
   552  
   553  // ColsAreStrictKey returns true if the given columns contain a strict key for the
   554  // relation. This means that any two rows in the relation will never have the
   555  // same values for this set of columns. If the columns are nullable, then at
   556  // most one row could have NULL values for all of the columns. For example,
   557  // (a,b) is a strict key for the following relation, but (a) is not (because
   558  // there are multiple rows where a=1 and a=NULL):
   559  //
   560  //   a     b     c
   561  //   ----------------
   562  //   NULL  NULL  NULL
   563  //   NULL  1     1
   564  //   1     NULL  1
   565  //   1     1     1
   566  //
   567  func (f *FuncDepSet) ColsAreStrictKey(cols opt.ColSet) bool {
   568  	return f.colsAreKey(cols, strictKey)
   569  }
   570  
   571  // ColsAreLaxKey returns true if the given columns contain a lax key for the
   572  // relation. This means that any two rows in the relation will never have the
   573  // same values for this set of columns, except potentially in the case where at
   574  // least one of the columns is NULL. For example, (a,b) is a lax key for the
   575  // following relation, but (a) is not (because there are multiple rows where
   576  // a=1):
   577  //
   578  //   a     b     c
   579  //   ----------------
   580  //   NULL  NULL  NULL
   581  //   NULL  NULL  1
   582  //   NULL  NULL  2
   583  //   NULL  1     1
   584  //   NULL  1     2
   585  //   1     NULL  1
   586  //   1     NULL  2
   587  //   1     1     1
   588  //
   589  func (f *FuncDepSet) ColsAreLaxKey(cols opt.ColSet) bool {
   590  	return f.colsAreKey(cols, laxKey)
   591  }
   592  
   593  // ConstantCols returns the set of columns that will always have the same value
   594  // for all rows in the relation.
   595  func (f *FuncDepSet) ConstantCols() opt.ColSet {
   596  	if len(f.deps) > 0 && f.deps[0].isConstant() {
   597  		return f.deps[0].to
   598  	}
   599  	return opt.ColSet{}
   600  }
   601  
   602  // ReduceCols removes redundant columns from the given set. Redundant columns
   603  // can be functionally determined from the remaining columns. If the columns
   604  // contain a key for the relation, then the reduced columns will form a
   605  // candidate key for the relation.
   606  //
   607  // The reduction algorithm removes one column at a time (in an arbitrary order),
   608  // and then tests to see if the closure still includes the removed column. If
   609  // so, then the column is redundant. This algorithm has decent running time, but
   610  // will not necessarily find the candidate key with the fewest columns.
   611  func (f *FuncDepSet) ReduceCols(cols opt.ColSet) opt.ColSet {
   612  	var removed opt.ColSet
   613  	cols = cols.Copy()
   614  	for i, ok := cols.Next(0); ok; i, ok = cols.Next(i + 1) {
   615  		cols.Remove(i)
   616  		removed.Add(i)
   617  		if !f.inClosureOf(removed, cols, true /* strict */) {
   618  			// The column is not functionally determined by the other columns, so
   619  			// retain it in the set.
   620  			cols.Add(i)
   621  		}
   622  		removed.Remove(i)
   623  	}
   624  	return cols
   625  }
   626  
   627  // InClosureOf returns true if the given columns are functionally determined by
   628  // the "in" column set.
   629  func (f *FuncDepSet) InClosureOf(cols, in opt.ColSet) bool {
   630  	return f.inClosureOf(cols, in, true /* strict */)
   631  }
   632  
   633  // ComputeClosure returns the strict closure of the given columns. The closure
   634  // includes the input columns plus all columns that are functionally dependent
   635  // on those columns, either directly or indirectly. Consider this set of FD's:
   636  //
   637  //   (a)-->(b,c,d)
   638  //   (b,c,e)-->(f)
   639  //   (d)-->(e)
   640  //
   641  // The strict closure of (a) is (a,b,c,d,e,f), because (a) determines all other
   642  // columns. Therefore, if two rows have the same value for (a), then the rows
   643  // will be duplicates, since all other columns will be equal.
   644  func (f *FuncDepSet) ComputeClosure(cols opt.ColSet) opt.ColSet {
   645  	cols = cols.Copy()
   646  	for i := 0; i < len(f.deps); i++ {
   647  		fd := &f.deps[i]
   648  
   649  		if fd.strict && fd.from.SubsetOf(cols) && !fd.to.SubsetOf(cols) {
   650  			cols.UnionWith(fd.to)
   651  
   652  			// Restart iteration to get transitive closure.
   653  			i = -1
   654  		}
   655  	}
   656  	return cols
   657  }
   658  
   659  // ComputeEquivClosure returns the equivalence closure of the given columns. The
   660  // closure includes the input columns plus all columns that are equivalent to
   661  // any of these columns, either directly or indirectly. For example:
   662  //
   663  //   (a)==(b)
   664  //   (b)==(c)
   665  //   (a)==(d)
   666  //
   667  // The equivalence closure for (a) is (a,b,c,d) because (a) is transitively
   668  // equivalent to all other columns. Therefore, all columns must have equal
   669  // non-NULL values, or else all must be NULL (see definition for NULL= in the
   670  // comment for FuncDepSet).
   671  func (f *FuncDepSet) ComputeEquivClosure(cols opt.ColSet) opt.ColSet {
   672  	// Don't need to get transitive closure, because equivalence closures are
   673  	// already maintained for every column.
   674  	cols = cols.Copy()
   675  	for i := 0; i < len(f.deps); i++ {
   676  		fd := &f.deps[i]
   677  		if fd.equiv && fd.from.SubsetOf(cols) && !fd.to.SubsetOf(cols) {
   678  			cols.UnionWith(fd.to)
   679  		}
   680  	}
   681  	return cols
   682  }
   683  
   684  // AddStrictKey adds an FD for a new key. The given key columns are reduced to a
   685  // candidate key, and that becomes the determinant for the allCols column set.
   686  // The resulting FD is strict, meaning that a NULL key value always maps to the
   687  // same set of values in the rest of the relation's columns. For key columns
   688  // (a,b) and relation columns (a,b,c,d), an FD like this is created:
   689  //
   690  //   (a,b)-->(c,d)
   691  //
   692  // If the resulting candidate key has fewer columns than the current key, then
   693  // the new key is adopted in its place.
   694  func (f *FuncDepSet) AddStrictKey(keyCols, allCols opt.ColSet) {
   695  	if !keyCols.SubsetOf(allCols) {
   696  		panic(errors.AssertionFailedf("allCols does not include keyCols"))
   697  	}
   698  
   699  	// Ensure we have candidate key (i.e. has no columns that are functionally
   700  	// determined by other columns).
   701  	keyCols = f.ReduceCols(keyCols)
   702  	f.addDependency(keyCols, allCols, true /* strict */, false /* equiv */)
   703  
   704  	// Try to use the new FD to reduce any existing key first.
   705  	f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
   706  
   707  	if f.hasKey != strictKey || keyCols.Len() < f.key.Len() {
   708  		f.setKey(keyCols, strictKey)
   709  	}
   710  }
   711  
   712  // AddLaxKey is similar to AddStrictKey, except that it creates a lax FD rather
   713  // than a strict FD. This means that two rows with NULL key values might not
   714  // have the same values in other non-key columns. For key columns (a,b) and
   715  // relation columns (a,b,c,d), and FD like this is created:
   716  //
   717  //   (a,b)~~>(c,d)
   718  //
   719  func (f *FuncDepSet) AddLaxKey(keyCols, allCols opt.ColSet) {
   720  	if !keyCols.SubsetOf(allCols) {
   721  		panic(errors.AssertionFailedf("allCols does not include keyCols"))
   722  	}
   723  	if keyCols.Empty() {
   724  		panic(errors.AssertionFailedf("lax key cannot be empty"))
   725  	}
   726  
   727  	// Ensure we have candidate key (i.e. has no columns that are functionally
   728  	// determined by other columns).
   729  	f.addDependency(keyCols, allCols, false /* strict */, false /* equiv */)
   730  
   731  	// TODO(radu): without null column information, we cannot reduce lax keys (see
   732  	// tryToReduceKey). Consider passing that information (or storing it with the
   733  	// FDs to begin with). In that case we would need to reduce both the given key
   734  	// and the existing key, similar to AddStrictKey.
   735  
   736  	if f.hasKey == noKey || (f.hasKey == laxKey && keyCols.Len() < f.key.Len()) {
   737  		f.setKey(keyCols, laxKey)
   738  	}
   739  }
   740  
   741  // MakeMax1Row initializes the FD set for a relation containing either zero or
   742  // one rows, and with the given columns. In this special case, the value of
   743  // every column is trivially considered a constant, and the key is the empty
   744  // set, because no columns are required to ensure uniqueness of rows. This
   745  // special case may seem trivial, but it is quite important to detect during
   746  // optimization. For a relation with columns (a, b), the following FD is
   747  // created in the set:
   748  //
   749  //   ()-->(a,b)
   750  //
   751  func (f *FuncDepSet) MakeMax1Row(cols opt.ColSet) {
   752  	f.deps = f.deps[:0]
   753  	if !cols.Empty() {
   754  		f.deps = append(f.deps, funcDep{to: cols, strict: true})
   755  	}
   756  	f.setKey(opt.ColSet{}, strictKey)
   757  }
   758  
   759  // MakeNotNull modifies the FD set based on which columns cannot contain NULL
   760  // values. This often allows upgrading lax dependencies to strict dependencies,
   761  // and lax keys to strict keys.
   762  //
   763  // Note: this function should be called with all known null columns; it won't do
   764  // as good of a job if it's called multiple times with different subsets.
   765  func (f *FuncDepSet) MakeNotNull(notNullCols opt.ColSet) {
   766  	// We have to collect all the FDs that can be made strict. We avoid allocation
   767  	// for the case where there is at most one such FD.
   768  	var firstLaxFD *funcDep
   769  	var otherLaxFDs []funcDep
   770  	for i := range f.deps {
   771  		fd := &f.deps[i]
   772  		if fd.strict {
   773  			continue
   774  		}
   775  
   776  		// FD can be made strict if all determinant columns are not null.
   777  		if fd.from.SubsetOf(notNullCols) {
   778  			if firstLaxFD == nil {
   779  				firstLaxFD = fd
   780  			} else {
   781  				otherLaxFDs = append(otherLaxFDs, *fd)
   782  			}
   783  		}
   784  	}
   785  
   786  	if firstLaxFD != nil {
   787  		f.addDependency(firstLaxFD.from, firstLaxFD.to, true /* strict */, false /* equiv */)
   788  		for i := range otherLaxFDs {
   789  			f.addDependency(otherLaxFDs[i].from, otherLaxFDs[i].to, true /* strict */, false /* equiv */)
   790  		}
   791  	}
   792  
   793  	f.tryToReduceKey(notNullCols)
   794  }
   795  
   796  // AddEquivalency adds two FDs to the set that establish a strict equivalence
   797  // between the given columns. Either "a" equals "b" according to SQL equality
   798  // semantics, or else "a" is NULL and "b" is NULL. The following FDs are
   799  // created in the set:
   800  //
   801  //   (a)==(b)
   802  //   (b)==(a)
   803  //
   804  func (f *FuncDepSet) AddEquivalency(a, b opt.ColumnID) {
   805  	if a == b {
   806  		return
   807  	}
   808  
   809  	var equiv opt.ColSet
   810  	equiv.Add(a)
   811  	equiv.Add(b)
   812  	f.addEquivalency(equiv)
   813  }
   814  
// AddConstants adds a strict FD to the set that declares the given columns as
// having the same constant value for all rows. If a column is nullable, then
// its value may be NULL, but then the column must be NULL for all rows. For
// column "a", the FD looks like this:
//
//   ()-->(a)
//
// Since it is a constant, any set of determinant columns (including the empty
// set) trivially determines the value of "a".
//
// The constant FD is kept as the first FD in the set. Other FDs are simplified
// or removed when the new constants make them redundant, and the key is then
// given a chance to be reduced. Calling AddConstants with an empty column set
// is a no-op.
func (f *FuncDepSet) AddConstants(cols opt.ColSet) {
	if cols.Empty() {
		return
	}

	// Determine complete set of constants by computing closure.
	cols = f.ComputeClosure(cols)

	// Ensure that first FD in the set is a constant FD and make sure the
	// constants are part of it.
	if len(f.deps) == 0 || !f.deps[0].isConstant() {
		// There is no constant FD yet: build a new slice with the constant FD in
		// slot 0 and all existing FDs shifted one position to the right.
		deps := make([]funcDep, len(f.deps)+1)
		deps[0] = funcDep{to: cols, strict: true}
		copy(deps[1:], f.deps)
		f.deps = deps
	} else {
		// Update existing constant FD to include all constant columns in the set.
		// NOTE(review): this relies on the closure computed above already
		// containing the previously known constants — confirm against isConstant.
		f.deps[0].to = cols
	}

	// Remove any other FDs made redundant by adding the constants. The slice is
	// compacted in place: n is the next write position for a retained FD, and
	// slot 0 (the constant FD) is always retained.
	n := 1
	for i := 1; i < len(f.deps); i++ {
		fd := &f.deps[i]

		// Always retain equivalency information, even for constants.
		if !fd.equiv {
			if fd.strict {
				// Constant columns can be removed from the determinant of a strict
				// FD. If all determinant columns are constant, then the entire FD
				// can be removed, since this means that the dependant columns must
				// also be constant (and were part of constant closure added to the
				// constant FD above).
				if !fd.removeFromCols(cols) {
					continue
				}
			}

			// Dependant constants are redundant, so remove them.
			// NOTE(review): removeToCols presumably returns false when no
			// dependant columns remain, in which case the FD is dropped — confirm
			// against its definition.
			if !fd.removeToCols(cols) {
				continue
			}
		}

		// Retain this FD, moving it down only if earlier FDs were dropped.
		if n != i {
			f.deps[n] = f.deps[i]
		}
		n++
	}
	f.deps = f.deps[:n]

	f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
}
   877  
   878  // AddSynthesizedCol adds an FD to the set that is derived from a synthesized
   879  // column in a projection list. The synthesized column is often derived from
   880  // other columns, in which case AddSynthesizedCol creates a new FD like this:
   881  //
   882  //   (a,b)-->(c)
   883  //
   884  // Or it may be a constant column, like this:
   885  //
   886  //   ()-->(c)
   887  //
   888  func (f *FuncDepSet) AddSynthesizedCol(from opt.ColSet, col opt.ColumnID) {
   889  	if from.Contains(col) {
   890  		panic(errors.AssertionFailedf("synthesized column cannot depend upon itself"))
   891  	}
   892  
   893  	var colSet opt.ColSet
   894  	colSet.Add(col)
   895  	f.addDependency(from, colSet, true /* strict */, false /* equiv */)
   896  
   897  	f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
   898  }
   899  
// ProjectCols removes all columns that are not in the given set. It does this
// by replacing any un-projected dependants by their closures, and then removing
// all FDs containing un-projected columns. While this algorithm may cause some
// loss of information in edge cases, it does a good job of preserving the most
// important dependency information.
//
// The steps, in order, are:
//
//   1. Re-derive the key so that it only uses projected columns (or clear it).
//   2. Short-circuit to MakeMax1Row if the key is the empty strict key.
//   3. First pass: expand dependants to their closures and collect constant,
//      determinant and equivalent column information.
//   4. Re-normalize constants, then (second pass) drop or remap every FD that
//      still references an un-projected column.
//   5. Ensure the key's closure still covers the projected columns.
func (f *FuncDepSet) ProjectCols(cols opt.ColSet) {
	// Ensure that any existing key contains only projected columns. Do this
	// before removing any FDs from the set, in order to take advantage of all
	// existing transitive relationships.
	if f.hasKey != noKey && !f.key.SubsetOf(cols) {
		// Derive new candidate key (or key is no longer possible).
		if f.hasKey == strictKey && f.ColsAreStrictKey(cols) {
			f.setKey(cols, strictKey)
			f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
		} else if f.ColsAreLaxKey(cols) {
			f.setKey(cols, laxKey)
			f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
		} else {
			f.clearKey()
		}
	}

	// Special case of <= 1 row.
	if f.hasKey == strictKey && f.key.Empty() {
		f.MakeMax1Row(cols)
		return
	}

	// During first pass, add closures of un-projected columns in dependants.
	// This will ensure that transitive relationships between remaining columns
	// won't be lost. Also, track list of un-projected columns that are part of
	// non-equivalent determinants. It's possible these can be mapped to
	// equivalent columns.
	var constCols, detCols, equivCols opt.ColSet
	for i := range f.deps {
		fd := &f.deps[i]

		// Remember constant columns.
		if fd.isConstant() {
			constCols = fd.to
		}

		// Add closures to dependants containing un-projected columns.
		if !fd.to.SubsetOf(cols) {
			// Equivalence dependencies already maintain closure, so skip them.
			if !fd.equiv {
				fd.to = f.ComputeClosure(fd.to)
			}
		}

		// Track list of un-projected columns that can possibly be mapped to
		// equivalent columns.
		if !fd.equiv && !fd.from.SubsetOf(cols) {
			detCols.UnionWith(fd.from)
			detCols.DifferenceWith(cols)
		}

		// Track all columns that have equivalent alternates that are part of the
		// projection.
		if fd.equiv && fd.to.Intersects(cols) {
			equivCols.UnionWith(fd.from)
		}
	}

	// Construct equivalence map that supports substitution of an equivalent
	// column in place of a removed column. Only removed determinant columns
	// that actually have a projected equivalent are passed to makeEquivMap.
	detCols.IntersectionWith(equivCols)
	equivMap := f.makeEquivMap(detCols, cols)

	// If constants were found, then normalize constants to preserve FDs in a
	// case like this where (2) is removed:
	//
	//   ()-->(2), (2,3)-->(4)
	//
	// Rather than removing both FDs, the second FD should be preserved in this
	// form:
	//
	//   (3)-->(4)
	//
	if !constCols.Empty() {
		f.AddConstants(constCols)
	}

	// During second pass, remove all dependencies with un-projected columns.
	// Retained FDs are compacted in place (n is the next write position);
	// remapped FDs are collected in newFDs and re-added after the loop, so
	// that addDependency never runs while the slice is being compacted.
	var newFDs []funcDep
	n := 0
	for i := range f.deps {
		fd := &f.deps[i]

		// Subtract out un-projected columns from dependants. Also subtract strict
		// constant columns from dependants for nicer presentation.
		if !fd.to.SubsetOf(cols) {
			fd.to = fd.to.Intersection(cols)
			if !fd.isConstant() {
				fd.to.DifferenceWith(constCols)
			}
			// NOTE(review): removeToCols presumably returns false when no
			// dependant columns remain, dropping the FD — confirm against its
			// definition.
			if !fd.removeToCols(fd.from) {
				continue
			}
		}

		// Try to substitute equivalent columns for removed determinant columns.
		if !fd.from.SubsetOf(cols) {
			if fd.equiv {
				// Always discard equivalency with removed determinant, since other
				// equivalencies will already include this column.
				continue
			}

			// Start with "before" list of columns that need to be mapped, and try
			// to find an "after" list containing equivalent columns.
			var afterCols opt.ColSet
			beforeCols := fd.from.Difference(cols)
			foundAll := true
			for c, ok := beforeCols.Next(0); ok; c, ok = beforeCols.Next(c + 1) {
				var id opt.ColumnID
				// foundAll becomes false as soon as one removed column has no
				// entry in the equivalence map; the FD is then dropped entirely.
				if id, foundAll = equivMap[c]; !foundAll {
					break
				}
				afterCols.Add(id)
			}
			if foundAll {
				// Dependency can be remapped using equivalencies.
				from := fd.from.Union(afterCols)
				from.DifferenceWith(beforeCols)
				newFDs = append(newFDs, funcDep{from: from, to: fd.to, strict: fd.strict, equiv: fd.equiv})
			}
			// The original FD is never retained in place in this branch.
			continue
		}

		// Retain this FD, moving it down only if earlier FDs were dropped.
		if n != i {
			f.deps[n] = f.deps[i]
		}
		n++
	}
	f.deps = f.deps[:n]

	// Re-add the remapped FDs now that compaction is complete.
	for i := range newFDs {
		fd := &newFDs[i]
		f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
	}

	// Ensure that key still determines all other columns.
	f.ensureKeyClosure(cols)
}
  1045  
  1046  // AddFrom merges two FD sets by adding each FD from the given set to this set.
  1047  // While this requires O(N**2) time, it's useful when the two FD sets may
  1048  // overlap one another and substantial simplifications are possible (as with
  1049  // IndexJoin). It is up to the caller to ensure that the two FD sets are
  1050  // "compatible", meaning that they operate on the same relations, with the same
  1051  // keys, same columns, etc.
  1052  func (f *FuncDepSet) AddFrom(fdset *FuncDepSet) {
  1053  	for i := range fdset.deps {
  1054  		fd := &fdset.deps[i]
  1055  		f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
  1056  	}
  1057  }
  1058  
  1059  // AddEquivFrom is similar to AddFrom, except that it only adds equivalence
  1060  // dependencies from the given set to this set.
  1061  func (f *FuncDepSet) AddEquivFrom(fdset *FuncDepSet) {
  1062  	for i := range fdset.deps {
  1063  		fd := &fdset.deps[i]
  1064  		if fd.equiv {
  1065  			f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
  1066  		}
  1067  	}
  1068  }
  1069  
  1070  // MakeProduct modifies the FD set to reflect the impact of a cartesian product
  1071  // operation between this set and the given set. The result is a union of the
  1072  // FDs from each set, as well as a union of their keys. The two FD sets are
  1073  // expected to operate on disjoint columns, so the FDs from each are simply
  1074  // concatenated, rather than simplified via calls to addDependency (except for
  1075  // case of constant columns).
  1076  func (f *FuncDepSet) MakeProduct(inner *FuncDepSet) {
  1077  	for i := range inner.deps {
  1078  		fd := &inner.deps[i]
  1079  		if fd.isConstant() {
  1080  			f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
  1081  		} else {
  1082  			f.deps = append(f.deps, *fd)
  1083  		}
  1084  	}
  1085  
  1086  	if f.hasKey != noKey && inner.hasKey != noKey {
  1087  		// If both sides have a strict key, the union of keys is a strict key.
  1088  		// If one side has a lax key and the other has a lax or strict key, the
  1089  		// union is a lax key.
  1090  		typ := laxKey
  1091  		if f.hasKey == strictKey && inner.hasKey == strictKey {
  1092  			typ = strictKey
  1093  		}
  1094  		f.setKey(f.key.Union(inner.key), typ)
  1095  	} else {
  1096  		f.clearKey()
  1097  	}
  1098  }
  1099  
  1100  // MakeApply modifies the FD set to reflect the impact of an apply join. This
  1101  // FD set reflects the properties of the outer query, and the given FD set
  1102  // reflects the properties of the inner query. Constant FDs from inner set no
  1103  // longer hold and some other dependencies need to be augmented in order to be
  1104  // valid for the apply join operator. Consider this example:
  1105  //
  1106  //   SELECT *
  1107  //   FROM a
  1108  //   INNER JOIN LATERAL (SELECT * FROM b WHERE b.y=a.y)
  1109  //   ON True
  1110  //
  1111  // 1. The constant dependency created from the outer column reference b.y=a.y
  1112  //    does not hold for the Apply operator, since b.y is no longer constant at
  1113  //    this level. In general, constant dependencies cannot be retained, because
  1114  //    they may have been generated from outer column equivalencies.
  1115  // 2. If a strict dependency (b.x,b.y)-->(b.z) held, it would have been reduced
  1116  //    to (b.x)-->(b.z) because (b.y) is constant in the inner query. However,
  1117  //    (b.x)-->(b.z) does not hold for the Apply operator, since (b.y) is not
  1118  //    constant in that case. However, the dependency *does* hold as long as its
  1119  //    determinant is augmented by the left input's key columns (if key exists).
  1120  // 3. Lax dependencies follow the same rules as #2.
  1121  // 4. Equivalence dependencies in the inner query still hold for the Apply
  1122  //    operator.
  1123  // 5. If both the outer and inner inputs of the apply join have keys, then the
  1124  //    concatenation of those keys is a key on the apply join result.
  1125  //
  1126  func (f *FuncDepSet) MakeApply(inner *FuncDepSet) {
  1127  	for i := range inner.deps {
  1128  		fd := &inner.deps[i]
  1129  		if fd.equiv {
  1130  			f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
  1131  		} else if !fd.isConstant() && f.hasKey == strictKey {
  1132  			f.addDependency(f.key.Union(fd.from), fd.to, fd.strict, fd.equiv)
  1133  		}
  1134  		// TODO(radu): can we use a laxKey here?
  1135  	}
  1136  
  1137  	if f.hasKey == strictKey && inner.hasKey == strictKey {
  1138  		f.setKey(f.key.Union(inner.key), strictKey)
  1139  		f.ensureKeyClosure(inner.ColSet())
  1140  	} else {
  1141  		// TODO(radu): can we use a laxKey here?
  1142  		f.clearKey()
  1143  	}
  1144  }
  1145  
  1146  // MakeLeftOuter modifies the cartesian product FD set to reflect the impact of
  1147  // adding NULL-extended rows to the results of an inner join. An inner join can
  1148  // be modeled as a cartesian product + ON filtering, and an outer join is
  1149  // modeled as an inner join + union of NULL-extended rows. MakeLeftOuter enacts
  1150  // the filtering and null-extension of the cartesian product. If it is possible
  1151  // to prove that there is a key over the join result that consists only of
  1152  // columns from the left input, that key will be used.
  1153  //
  1154  // This same logic applies for right joins as well (by reversing sides).
  1155  //
  1156  // See the "Left outer join" section on page 84 of the Master's Thesis for the
  1157  // impact of outer joins on FDs.
  1158  func (f *FuncDepSet) MakeLeftOuter(
  1159  	leftFDs, filtersFDs *FuncDepSet, leftCols, rightCols, notNullInputCols opt.ColSet,
  1160  ) {
  1161  	// The columns from the left input form a key over the result of the LeftJoin
  1162  	// if the following conditions are met:
  1163  	//
  1164  	//  1. The left input has a strict key.
  1165  	//
  1166  	//  2. The left columns form a strict key over the filtered cartesian product.
  1167  	//     (In other words, the left columns would form a key over an inner join
  1168  	//      with the same filters).
  1169  	//
  1170  	// The above conditions are sufficient because a strict key (over the filtered
  1171  	// cartesian product) that only contains columns from the left side implies
  1172  	// that no left rows were duplicated. This is because even a single duplicated
  1173  	// row would prohibit a strict key containing only those columns. And if there
  1174  	// was already a strict key in the original left input, adding back filtered
  1175  	// left rows will not create any duplicates. This means that the LeftJoin will
  1176  	// not duplicate any of the left rows. Therefore, a key over the left input
  1177  	// must also be a key over the result of the join.
  1178  	//
  1179  	// If the conditions are not met, a key over the unfiltered cartesian product
  1180  	// (if one exists) is used. Why is this key valid to use?
  1181  	//
  1182  	//   * A left join can filter rows and null-extend rows from the cartesian
  1183  	//     product.
  1184  	//   * Filtering rows does not affect the presence of a key.
  1185  	//   * Null-extending rows does not affect the presence of a key because the
  1186  	//     cartesian product could only have a key if the left and right inputs
  1187  	//     also had keys (see FuncDepSet.MakeProduct). Therefore, null-extended
  1188  	//     left rows that are added back by the left join must be unique.
  1189  	//
  1190  	// As an example, consider this data and this query:
  1191  	//
  1192  	//   a      b
  1193  	//   -      -
  1194  	//   1      1
  1195  	//   2      2
  1196  	//   3
  1197  	//   4
  1198  	//
  1199  	//   SELECT * FROM a_tab LEFT JOIN b_tab ON a < 3
  1200  	//
  1201  	// Both tables a and b have a strict key. If we take their cartesian product,
  1202  	// we get something like this:
  1203  	//
  1204  	//   a  b
  1205  	//   ----
  1206  	//   1  1
  1207  	//   1  2
  1208  	//   2  1
  1209  	//   2  2
  1210  	//   3  1
  1211  	//   3  2
  1212  	//   4  1
  1213  	//   4  2
  1214  	//
  1215  	// Now, columns a and b together form a strict key over the cartesian product.
  1216  	// If either a or b had duplicate rows to begin with, a key over the cartesian
  1217  	// product would not be possible. Now, the left join's "a < 3" on condition is
  1218  	// applied:
  1219  	//
  1220  	//   a  b
  1221  	//   ----
  1222  	//   1  1
  1223  	//   1  2
  1224  	//   2  1
  1225  	//   2  2
  1226  	//
  1227  	// Finally, the left join adds back the rows of a, null-extending b:
  1228  	//
  1229  	//   a  b
  1230  	//   ----
  1231  	//   1  1
  1232  	//   1  2
  1233  	//   2  1
  1234  	//   2  2
  1235  	//   3  NULL
  1236  	//   4  NULL
  1237  	//
  1238  	// Since a had a key to begin with, the "3" and "4" rows that are added back
  1239  	// are unique. Therefore, a and b are a strict key for the left join.
  1240  	//
  1241  	// TODO(drewk): support for lax keys/dependencies from the right input can be
  1242  	//  added if it turns out to be useful.
  1243  
  1244  	// Save a strict key from the left input, if one exists.
  1245  	leftKey, leftHasKey := leftFDs.StrictKey()
  1246  
  1247  	// Save a key from the unfiltered cartesian product, if one exists.
  1248  	oldKey := f.key
  1249  	oldKeyType := f.hasKey
  1250  
  1251  	// If the left input has a key, add the FDs from the join filters to a copy of
  1252  	// the cartesian product FD set. Next, check whether the columns of the left
  1253  	// input form a strict key over the result of applying the join filters to the
  1254  	// cartesian product.
  1255  	//
  1256  	// We have to apply the filters to a copy because filter FDs are often not
  1257  	// valid after null-extended rows are added. For example:
  1258  	//
  1259  	//   a  b  c     d     e
  1260  	//   ----------------------
  1261  	//   1  1  1     NULL  1
  1262  	//   1  2  NULL  NULL  NULL
  1263  	//   2  1  NULL  NULL  NULL
  1264  	//
  1265  	// Let's say this table is the result of a join between 'ab' and 'cde'. The
  1266  	// join condition might have included e = 1, but it would not be correct to
  1267  	// add the corresponding constant FD to the final join FD set because the e
  1268  	// column has been null extended, and therefore the condition doesn't hold for
  1269  	// the final outer join result.
  1270  	leftColsAreInnerJoinKey := false
  1271  	if leftHasKey {
  1272  		c := FuncDepSet{}
  1273  		c.CopyFrom(f)
  1274  		c.AddFrom(filtersFDs)
  1275  		leftColsAreInnerJoinKey = c.ColsAreStrictKey(leftCols)
  1276  	}
  1277  
  1278  	// Modify the cartesian product FD set to reflect the impact of adding
  1279  	// NULL-extended rows to the results of the filtered cartesian product (or, in
  1280  	// other words, the results of an inner join).
  1281  	f.nullExtendRightRows(rightCols, notNullInputCols)
  1282  
  1283  	// If the conditions have been met, use the key from the left side. Otherwise,
  1284  	// use the key from the cartesian product.
  1285  	if leftHasKey && leftColsAreInnerJoinKey {
  1286  		f.setKey(leftKey, strictKey)
  1287  	} else {
  1288  		// See the comment at the top of the function for why it is valid to use the
  1289  		// key from the cartesian product.
  1290  		f.setKey(oldKey, oldKeyType)
  1291  		// Call tryToReduceKey with only the left columns from notNullInputCols
  1292  		// because the right columns may have been null-extended.
  1293  		f.tryToReduceKey(leftCols.Intersection(notNullInputCols))
  1294  	}
  1295  	// ensureKeyClosure must be called when oldKey is used as well as the new
  1296  	// leftKey because nullExtendRightRows can remove FDs, such that the closure
  1297  	// of oldKey ends up missing some columns from the right.
  1298  	f.ensureKeyClosure(leftCols.Union(rightCols))
  1299  }
  1300  
  1301  // MakeFullOuter modifies the cartesian product FD set to reflect the impact of
  1302  // adding NULL-extended rows to the results of an inner join. An inner join can
  1303  // be modeled as a cartesian product + ON filtering, and an outer join is
  1304  // modeled as an inner join + union of NULL-extended rows. MakeFullOuter
  1305  // performs the final step for a full join, given the set of columns on each
  1306  // side, as well as the set of input columns from both sides of the join that
  1307  // are not null.
  1308  func (f *FuncDepSet) MakeFullOuter(leftCols, rightCols, notNullInputCols opt.ColSet) {
  1309  	if f.hasKey == strictKey {
  1310  		if f.key.Empty() {
  1311  			// The cartesian product has an empty key when both sides have an empty key;
  1312  			// but the outer join can have two rows so the empty key doesn't hold.
  1313  			f.hasKey = noKey
  1314  			f.key = opt.ColSet{}
  1315  		} else if !f.key.Intersects(notNullInputCols) {
  1316  			// If the cartesian product has a strict key, the key holds on the full
  1317  			// outer result only if one of the key columns is known to be not-null in
  1318  			// the input. Otherwise, a row where all the key columns are NULL can
  1319  			// "conflict" with a row where these columns are NULL because of
  1320  			// null-extension. For example:
  1321  			//   -- t1 and t2 each have one row containing NULL for column x.
  1322  			//   SELECT * FROM t1 FULL JOIN t2 ON t1.x=t2.x
  1323  			//
  1324  			//   t1.x  t2.x
  1325  			//   ----------
  1326  			//   NULL  NULL
  1327  			//   NULL  NULL
  1328  			f.hasKey = laxKey
  1329  		}
  1330  	}
  1331  	f.nullExtendRightRows(leftCols, notNullInputCols)
  1332  	f.nullExtendRightRows(rightCols, notNullInputCols)
  1333  	f.ensureKeyClosure(leftCols.Union(rightCols))
  1334  }
  1335  
// nullExtendRightRows is used by MakeLeftOuter and MakeFullOuter to modify the
// cartesian product FD set to reflect the impact of adding NULL-extended rows
// to the results of an inner join. See the MakeLeftOuter comment for more
// information.
//
// rightCols are the columns of the null-supplying side, i.e. the columns that
// may be null-extended. Despite the name, this is not always the right input:
// MakeFullOuter calls this function once with the left columns and once with
// the right columns. notNullInputCols are the input columns known to be not
// null; they are consulted by rule #2 below.
//
// FDs that remain valid are kept (compacted in place). FDs that must be
// weakened are collected and re-added as lax FDs after the compaction loop.
func (f *FuncDepSet) nullExtendRightRows(rightCols, notNullInputCols opt.ColSet) {
	// FDs to be re-added via addDependency once compaction is done.
	var newFDs []funcDep

	// n is the next write position for a retained FD.
	n := 0
	for i := range f.deps {
		fd := &f.deps[i]

		if fd.isConstant() {
			// Null-extended constant columns are no longer constant, because they
			// now may contain NULL values.
			if fd.to.Intersects(rightCols) {
				constCols := fd.to.Intersection(rightCols)
				// NOTE(review): removeToCols presumably returns false when no
				// dependant columns remain, dropping the FD — confirm against its
				// definition.
				if !fd.removeToCols(constCols) {
					continue
				}
			}
		} else {
			// The next several rules depend on whether the dependency's determinant
			// and dependants are on the null-supplying or row-supplying sides of
			// the join (or both). The rules will use the following join and set of
			// result rows to give examples:
			//
			//   CREATE TABLE ab (a INT, b INT, PRIMARY KEY(a, b))
			//   CREATE TABLE cde (c INT PRIMARY KEY, d INT, e INT)
			//   SELECT * FROM ab LEFT OUTER JOIN cde ON a=c AND b=1
			//
			//   a  b  c     d     e
			//   ----------------------
			//   1  1  1     NULL  1
			//   1  2  NULL  NULL  NULL
			//   2  1  NULL  NULL  NULL
			//
			// Here are the rules:
			//
			// 1. A strict dependency with determinant on the null-supplying side of
			//    the join becomes lax for any dependants on the row-supplying side
			//    of the join. In the example above, null-extending the (c) column
			//    violates the (a)==(c) equivalence dependency. Even the strict
			//    (a)-->(c) and (c)-->(a) dependencies no longer hold. The only
			//    dependency that still holds is (c)~~>(a), and even that is only
			//    one way, since (a)~~>(c) is not valid.
			//
			// 2. A strict dependency with both determinant and dependants on the
			//    null-supplying side of join becomes lax if all determinant columns
			//    are nullable. In the example above, null-extending the (c,d,e)
			//    columns violates a strict (d)-->(e) dependency, because the NULL
			//    "d" value now maps to both 1 and NULL. So it must be weakened to
			//    a lax dependency. But if at least one non-NULL column is part of
			//    the determinant, such as (c,d)-->(e), then the (NULL,NULL)
			//    determinant will be unique, thus preserving a strict FD.
			//
			// 3. A dependency with determinant columns drawn from both sides of
			//    the join is discarded, unless the determinant is a key for the
			//    relation. Null-extending one side of the join does not disturb
			//    the relation's keys, and keys always determine all other columns.
			//
			if fd.from.Intersects(rightCols) {
				if !fd.from.SubsetOf(rightCols) {
					// Rule #3, described above.
					if !f.ColsAreStrictKey(fd.from) {
						continue
					}
				} else {
					// Rule #1, described above (determinant is on null-supplying side).
					if !fd.to.SubsetOf(rightCols) {
						// Split the dependants by which side of the join they're on.
						// The strict and equiv fields of the new FD are left at their
						// zero values, so it is re-added as a plain lax FD.
						laxCols := fd.to.Difference(rightCols)
						newFDs = append(newFDs, funcDep{from: fd.from, to: laxCols})
						if !fd.removeToCols(laxCols) {
							continue
						}
					}

					// Rule #2, described above. Note that this rule does not apply to
					// equivalence FDs, which remain valid.
					if fd.strict && !fd.equiv && !fd.from.Intersects(notNullInputCols) {
						// Replace the strict FD with a lax one over the same columns.
						newFDs = append(newFDs, funcDep{from: fd.from, to: fd.to})
						continue
					}
				}
			} else {
				// Rule #1, described above (determinant is on row-supplying side).
				if !fd.removeToCols(rightCols) {
					continue
				}
			}
		}

		// Retain this FD, moving it down only if earlier FDs were dropped.
		if n != i {
			f.deps[n] = f.deps[i]
		}
		n++
	}
	f.deps = f.deps[:n]

	// Re-add the weakened FDs now that compaction is complete.
	for i := range newFDs {
		fd := &newFDs[i]
		f.addDependency(fd.from, fd.to, fd.strict, fd.equiv)
	}
}
  1440  
  1441  // EquivReps returns one "representative" column from each equivalency group in
  1442  // the FD set. ComputeEquivGroup can be called to obtain the remaining columns
  1443  // from each equivalency group.
  1444  func (f *FuncDepSet) EquivReps() opt.ColSet {
  1445  	var reps opt.ColSet
  1446  
  1447  	// Equivalence closures are already maintained for every column.
  1448  	for i := 0; i < len(f.deps); i++ {
  1449  		fd := &f.deps[i]
  1450  		if fd.equiv && !fd.to.Intersects(reps) {
  1451  			reps.UnionWith(fd.from)
  1452  		}
  1453  	}
  1454  	return reps
  1455  }
  1456  
  1457  // ComputeEquivGroup returns the group of columns that are equivalent to the
  1458  // given column. See ComputeEquivClosure for more details.
  1459  func (f *FuncDepSet) ComputeEquivGroup(rep opt.ColumnID) opt.ColSet {
  1460  	return f.ComputeEquivClosure(opt.MakeColSet(rep))
  1461  }
  1462  
  1463  // ensureKeyClosure checks whether the closure for this FD set's key (if there
  1464  // is one) includes the given columns. If not, then it adds a dependency so that
  1465  // the key determines the columns.
  1466  func (f *FuncDepSet) ensureKeyClosure(cols opt.ColSet) {
  1467  	if f.hasKey != noKey {
  1468  		closure := f.ComputeClosure(f.key)
  1469  		if !cols.SubsetOf(closure) {
  1470  			cols = cols.Difference(closure)
  1471  
  1472  			// If we have a strict key, we add a strict dependency; otherwise we add a
  1473  			// lax dependency.
  1474  			strict := f.hasKey == strictKey
  1475  			f.addDependency(f.key, cols, strict, false /* equiv */)
  1476  		}
  1477  	}
  1478  }
  1479  
  1480  // Verify runs consistency checks against the FD set, in order to ensure that it
  1481  // conforms to several invariants:
  1482  //
  1483  //   1. An FD determinant should not intersect its dependants.
  1484  //   2. If a constant FD is present, it's the first FD in the set.
  1485  //   3. A constant FD must be strict.
  1486  //   4. Lax equivalencies should be reduced to lax dependencies.
  1487  //   5. Equivalence determinant should be exactly one column.
  1488  //   6. The dependants of an equivalence is always its closure.
  1489  //   7. If FD set has a key, it should be a candidate key (already reduced).
  1490  //   8. Closure of key should include all known columns in the FD set.
  1491  //   9. If FD set has no key then key columns should be empty.
  1492  //
  1493  func (f *FuncDepSet) Verify() {
  1494  	for i := range f.deps {
  1495  		fd := &f.deps[i]
  1496  
  1497  		if fd.from.Intersects(fd.to) {
  1498  			panic(errors.AssertionFailedf("expected FD determinant and dependants to be disjoint: %s (%d)", log.Safe(f), log.Safe(i)))
  1499  		}
  1500  
  1501  		if fd.isConstant() {
  1502  			if i != 0 {
  1503  				panic(errors.AssertionFailedf("expected constant FD to be first FD in set: %s (%d)", log.Safe(f), log.Safe(i)))
  1504  			}
  1505  			if !fd.strict {
  1506  				panic(errors.AssertionFailedf("expected constant FD to be strict: %s", log.Safe(f)))
  1507  			}
  1508  		}
  1509  
  1510  		if fd.equiv {
  1511  			if !fd.strict {
  1512  				panic(errors.AssertionFailedf("expected equivalency to be strict: %s (%d)", f, i))
  1513  			}
  1514  
  1515  			if fd.from.Len() != 1 {
  1516  				panic(errors.AssertionFailedf("expected equivalence determinant to be single col: %s (%d)", log.Safe(f), log.Safe(i)))
  1517  			}
  1518  
  1519  			if !f.ComputeEquivClosure(fd.from).Equals(fd.from.Union(fd.to)) {
  1520  				panic(errors.AssertionFailedf("expected equivalence dependants to be its closure: %s (%d)", log.Safe(f), log.Safe(i)))
  1521  			}
  1522  		}
  1523  	}
  1524  
  1525  	if f.hasKey != noKey {
  1526  		if f.hasKey == strictKey {
  1527  			if reduced := f.ReduceCols(f.key); !reduced.Equals(f.key) {
  1528  				panic(errors.AssertionFailedf("expected FD to have candidate key %s: %s", reduced, f))
  1529  			}
  1530  
  1531  			allCols := f.ColSet()
  1532  			allCols.UnionWith(f.key)
  1533  			if !f.ComputeClosure(f.key).Equals(allCols) {
  1534  				panic(errors.AssertionFailedf("expected closure of FD key to include all known cols: %s", log.Safe(f)))
  1535  			}
  1536  		}
  1537  
  1538  		if f.hasKey == laxKey && f.key.Empty() {
  1539  			panic(errors.AssertionFailedf("expected lax key to be not empty"))
  1540  		}
  1541  	} else {
  1542  		if !f.key.Empty() {
  1543  			panic(errors.AssertionFailedf("expected empty key columns since no key: %s", f))
  1544  		}
  1545  	}
  1546  }
  1547  
  1548  // StringOnlyFDs returns a string representation of the FDs (without the key
  1549  // information).
  1550  func (f FuncDepSet) StringOnlyFDs() string {
  1551  	var b strings.Builder
  1552  	f.formatFDs(&b)
  1553  	return b.String()
  1554  }
  1555  
  1556  func (f FuncDepSet) String() string {
  1557  	var b strings.Builder
  1558  
  1559  	if f.hasKey != noKey {
  1560  		// The key shows up as key(1,2) or lax-key(1,2).
  1561  		if f.hasKey == laxKey {
  1562  			b.WriteString("lax-")
  1563  		}
  1564  		fmt.Fprintf(&b, "key%s", f.key)
  1565  		if len(f.deps) > 0 {
  1566  			b.WriteString("; ")
  1567  		}
  1568  	}
  1569  
  1570  	f.formatFDs(&b)
  1571  	return b.String()
  1572  }
  1573  
  1574  func (f FuncDepSet) formatFDs(b *strings.Builder) {
  1575  	for i := range f.deps {
  1576  		if i != 0 {
  1577  			b.WriteString(", ")
  1578  		}
  1579  		f.deps[i].format(b)
  1580  	}
  1581  }
  1582  
  1583  // colsAreKey returns true if the given columns contain a strict or lax key for
  1584  // the relation.
  1585  func (f *FuncDepSet) colsAreKey(cols opt.ColSet, typ keyType) bool {
  1586  	switch f.hasKey {
  1587  	case strictKey:
  1588  		// Determine whether the key is in the closure of the given columns. The
  1589  		// closure is necessary in the general case since it's possible that the
  1590  		// columns form a different key. For example:
  1591  		//
  1592  		//   f.key = (a)
  1593  		//   cols  = (b,c)
  1594  		//
  1595  		// and yet both column sets form keys for the relation.
  1596  		return f.inClosureOf(f.key, cols, typ == strictKey)
  1597  
  1598  	case laxKey:
  1599  		if typ == strictKey {
  1600  			// We have a lax key but we need a strict key.
  1601  			return false
  1602  		}
  1603  
  1604  		// For a lax key, we cannot use the strict closure, because the columns we
  1605  		// bring in from the closure might be null. For example, say that
  1606  		//   - column a is constant but (always) null: ()-->(a)
  1607  		//   - (a,b) is the known lax key.
  1608  		// The strict closure of (b) is the lax key (a,b), but (b) is not a lax
  1609  		// key.
  1610  		//
  1611  		// We can however use the equivalent closure, because those columns are null
  1612  		// only if one of the initial cols is null.
  1613  		//
  1614  		// Note: if we had information, we could use just the not-null columns from
  1615  		// the strict closure.
  1616  		return f.key.SubsetOf(f.ComputeEquivClosure(cols))
  1617  
  1618  	default:
  1619  		return false
  1620  	}
  1621  }
  1622  
  1623  // inClosureOf computes the strict or lax closure of the "in" column set, and
  1624  // returns true if the "cols" columns are all contained in the resulting
  1625  // closure.
  1626  func (f *FuncDepSet) inClosureOf(cols, in opt.ColSet, strict bool) bool {
  1627  	// Short-circuit if the "in" set already contains all the columns.
  1628  	if cols.SubsetOf(in) {
  1629  		return true
  1630  	}
  1631  
  1632  	in = in.Copy()
  1633  
  1634  	// Lax dependencies are not transitive (see figure 2.1 in the paper for
  1635  	// properties that hold for lax dependencies), so only include them if they
  1636  	// are reachable in a single lax dependency step from the input set.
  1637  	if !strict {
  1638  		// Keep track of all columns reached through a lax or strict dependency.
  1639  		laxIn := in.Copy()
  1640  		for i := 0; i < len(f.deps); i++ {
  1641  			fd := &f.deps[i]
  1642  			if fd.from.SubsetOf(in) && !fd.to.SubsetOf(in) {
  1643  				laxIn.UnionWith(fd.to)
  1644  
  1645  				// Equivalencies are always transitive.
  1646  				if fd.equiv {
  1647  					in.UnionWith(fd.to)
  1648  
  1649  					// Restart iteration to get transitive closure.
  1650  					i = -1
  1651  				}
  1652  
  1653  				// Short-circuit if the "laxIn" set now contains all the columns.
  1654  				if cols.SubsetOf(laxIn) {
  1655  					return true
  1656  				}
  1657  			}
  1658  		}
  1659  
  1660  		// Use the set that includes columns reached via lax dependencies.
  1661  		in = laxIn
  1662  	}
  1663  
  1664  	// Now continue with full transitive closure of strict dependencies.
  1665  	for i := 0; i < len(f.deps); i++ {
  1666  		fd := &f.deps[i]
  1667  
  1668  		if fd.strict && fd.from.SubsetOf(in) && !fd.to.SubsetOf(in) {
  1669  			in.UnionWith(fd.to)
  1670  
  1671  			// Short-circuit if the "in" set now contains all the columns.
  1672  			if cols.SubsetOf(in) {
  1673  				return true
  1674  			}
  1675  
  1676  			// Restart iteration to get transitive closure.
  1677  			i = -1
  1678  		}
  1679  	}
  1680  	return false
  1681  }
  1682  
  1683  // addDependency adds a new dependency into the set. If another FD implies the
  1684  // new FD, then it's not added. If it can be merged with an existing FD, that is
  1685  // done. Otherwise, a brand new FD is added to the set.
  1686  func (f *FuncDepSet) addDependency(from, to opt.ColSet, strict, equiv bool) {
  1687  	// Fast-path for trivial no-op dependency.
  1688  	if to.SubsetOf(from) {
  1689  		return
  1690  	}
  1691  
  1692  	// Delegate equivalence dependency.
  1693  	if equiv {
  1694  		f.addEquivalency(from.Union(to))
  1695  		return
  1696  	}
  1697  
  1698  	// Delegate constant dependency.
  1699  	if from.Empty() {
  1700  		if !strict {
  1701  			panic(errors.AssertionFailedf("expected constant FD to be strict: %s", log.Safe(f)))
  1702  		}
  1703  		f.AddConstants(to)
  1704  		return
  1705  	}
  1706  
  1707  	// Any column in the "from" set is already an implied "to" column, so no
  1708  	// need to include it.
  1709  	if to.Intersects(from) {
  1710  		to = to.Difference(from)
  1711  	}
  1712  
  1713  	newFD := funcDep{from: from, to: to, strict: strict, equiv: equiv}
  1714  
  1715  	// Merge the new dependency into the existing set.
  1716  	n := 0
  1717  	added := false
  1718  	for i := range f.deps {
  1719  		fd := &f.deps[i]
  1720  
  1721  		if newFD.implies(fd) {
  1722  			// The new FD is >= the existing FD, so can replace it.
  1723  			if added {
  1724  				// New FD is already part of the set, so discard this existing FD.
  1725  				continue
  1726  			}
  1727  
  1728  			// Update the existing FD.
  1729  			fd.from = from
  1730  			fd.to = to
  1731  			fd.strict = strict
  1732  			fd.equiv = equiv
  1733  
  1734  			// Keep searching, in case there's another implied FD.
  1735  			added = true
  1736  		} else if !added {
  1737  			if fd.implies(&newFD) {
  1738  				// The new FD does not add any additional information.
  1739  				added = true
  1740  			} else if fd.strict == strict && fd.equiv == equiv && fd.from.Equals(from) {
  1741  				// The new FD can at least add its determinant to an existing FD.
  1742  				fd.to = fd.to.Union(to)
  1743  				added = true
  1744  			}
  1745  		}
  1746  
  1747  		if n != i {
  1748  			f.deps[n] = f.deps[i]
  1749  		}
  1750  		n++
  1751  	}
  1752  
  1753  	f.deps = f.deps[:n]
  1754  
  1755  	if !added {
  1756  		// Add a new FD.
  1757  		f.deps = append(f.deps, newFD)
  1758  	}
  1759  }
  1760  
  1761  func (f *FuncDepSet) addEquivalency(equiv opt.ColSet) {
  1762  	var addConst bool
  1763  	var found opt.ColSet
  1764  
  1765  	// Start by finding complete set of all columns that are equivalent to the
  1766  	// given set.
  1767  	equiv = f.ComputeEquivClosure(equiv)
  1768  
  1769  	n := 0
  1770  	for i := 0; i < len(f.deps); i++ {
  1771  		fd := &f.deps[i]
  1772  
  1773  		if fd.isConstant() {
  1774  			// If any equivalent column is a constant, then all are constants.
  1775  			if fd.to.Intersects(equiv) && !equiv.SubsetOf(fd.to) {
  1776  				addConst = true
  1777  			}
  1778  		} else if fd.from.SubsetOf(equiv) {
  1779  			// All determinant columns are equivalent to one another.
  1780  			if fd.equiv {
  1781  				// Ensure that each equivalent column directly maps to all other
  1782  				// columns in the group.
  1783  				fd.to = fd.to.Union(equiv)
  1784  				fd.to.DifferenceWith(fd.from)
  1785  				found.UnionWith(fd.from)
  1786  			} else {
  1787  				// Remove dependant columns that are equivalent, because equivalence
  1788  				// is a stronger relationship than a strict or lax dependency.
  1789  				if !fd.removeToCols(equiv) {
  1790  					continue
  1791  				}
  1792  			}
  1793  		}
  1794  
  1795  		if n != i {
  1796  			f.deps[n] = f.deps[i]
  1797  		}
  1798  		n++
  1799  	}
  1800  	f.deps = f.deps[:n]
  1801  
  1802  	if addConst {
  1803  		// Ensure that all equivalent columns are marked as constant.
  1804  		f.AddConstants(equiv)
  1805  	}
  1806  
  1807  	if !equiv.SubsetOf(found) {
  1808  		add := equiv.Difference(found)
  1809  		deps := make([]funcDep, 0, len(f.deps)+add.Len())
  1810  		deps = append(deps, f.deps...)
  1811  
  1812  		for id, ok := add.Next(0); ok; id, ok = add.Next(id + 1) {
  1813  			fd := funcDep{strict: true, equiv: true}
  1814  			fd.from.Add(id)
  1815  			fd.to = equiv.Copy()
  1816  			fd.to.Remove(id)
  1817  			deps = append(deps, fd)
  1818  		}
  1819  		f.deps = deps
  1820  	}
  1821  
  1822  	f.tryToReduceKey(opt.ColSet{} /* notNullCols */)
  1823  }
  1824  
  1825  // setKey updates the key that the set is currently maintaining.
  1826  func (f *FuncDepSet) setKey(key opt.ColSet, typ keyType) {
  1827  	f.hasKey = typ
  1828  	f.key = key
  1829  	if f.hasKey == laxKey && f.key.Empty() {
  1830  		// An empty lax key is by definition equivalent to an empty strict key; we
  1831  		// normalize it to be strict.
  1832  		f.hasKey = strictKey
  1833  	}
  1834  }
  1835  
  1836  // clearKey removes any strict or lax key.
  1837  func (f *FuncDepSet) clearKey() {
  1838  	f.setKey(opt.ColSet{}, noKey)
  1839  }
  1840  
  1841  // tryToReduceKey tries to reduce any set key, used after new FDs are added.
  1842  func (f *FuncDepSet) tryToReduceKey(notNullCols opt.ColSet) {
  1843  	switch f.hasKey {
  1844  	case laxKey:
  1845  		if !notNullCols.Empty() {
  1846  			// We can only remove columns from a lax key if we know they are
  1847  			// not null; other columns must be retained.
  1848  			nullableKeyCols := f.key.Difference(notNullCols)
  1849  			if nullableKeyCols.Empty() {
  1850  				// All key columns are not-null; we can upgrade the key to strict.
  1851  				f.AddStrictKey(f.key, f.ColSet())
  1852  			} else {
  1853  				reduced := f.ReduceCols(f.key)
  1854  				reduced.UnionWith(nullableKeyCols)
  1855  				f.key = reduced
  1856  			}
  1857  		}
  1858  
  1859  	case strictKey:
  1860  		f.key = f.ReduceCols(f.key)
  1861  	}
  1862  }
  1863  
  1864  // makeEquivMap constructs a map with an entry for each column in the "from" set
  1865  // that is equivalent to a column in the "to" set. When there are multiple
  1866  // equivalent columns, then makeEquivMap arbitrarily chooses one of the
  1867  // alternatives. Note that some from columns may not have a mapping. If none of
  1868  // them do, then makeEquivMap returns nil.
  1869  func (f *FuncDepSet) makeEquivMap(from, to opt.ColSet) map[opt.ColumnID]opt.ColumnID {
  1870  	var equivMap map[opt.ColumnID]opt.ColumnID
  1871  	for i, ok := from.Next(0); ok; i, ok = from.Next(i + 1) {
  1872  		var oneCol opt.ColSet
  1873  		oneCol.Add(i)
  1874  		closure := f.ComputeEquivClosure(oneCol)
  1875  		closure.IntersectionWith(to)
  1876  		if !closure.Empty() {
  1877  			if equivMap == nil {
  1878  				equivMap = make(map[opt.ColumnID]opt.ColumnID)
  1879  			}
  1880  			id, _ := closure.Next(0)
  1881  			equivMap[i] = id
  1882  		}
  1883  	}
  1884  	return equivMap
  1885  }
  1886  
  1887  // isConstant returns true if this FD contains the set of constant columns. If
  1888  // it exists, it must always be the first FD in the set.
  1889  func (f *funcDep) isConstant() bool {
  1890  	return f.from.Empty()
  1891  }
  1892  
  1893  // implies returns true if this FD is at least as strong as the given FD. This
  1894  // is true when:
  1895  //   - the determinant is a subset of the given FD's determinant
  1896  //   - the dependant is a superset of the given FD's dependant
  1897  //   - the FD is at least as strict and equivalent as the given FD
  1898  func (f *funcDep) implies(fd *funcDep) bool {
  1899  	if f.from.SubsetOf(fd.from) && fd.to.SubsetOf(f.to) {
  1900  		if (f.strict || !fd.strict) && (f.equiv || !fd.equiv) {
  1901  			return true
  1902  		}
  1903  	}
  1904  	return false
  1905  }
  1906  
  1907  // removeFromCols removes columns in the given set from this FD's determinant.
  1908  // If removing columns results in an empty determinant, then removeFromCols
  1909  // returns false.
  1910  func (f *funcDep) removeFromCols(remove opt.ColSet) bool {
  1911  	if f.from.Intersects(remove) {
  1912  		f.from = f.from.Difference(remove)
  1913  	}
  1914  	return !f.isConstant()
  1915  
  1916  }
  1917  
  1918  // removeToCols removes columns in the given set from this FD's dependant set.
  1919  // If removing columns results in an empty dependant set, then removeToCols
  1920  // returns false.
  1921  func (f *funcDep) removeToCols(remove opt.ColSet) bool {
  1922  	if f.to.Intersects(remove) {
  1923  		f.to = f.to.Difference(remove)
  1924  	}
  1925  	return !f.to.Empty()
  1926  }
  1927  
  1928  func (f *funcDep) format(b *strings.Builder) {
  1929  	if f.equiv {
  1930  		if !f.strict {
  1931  			panic(errors.AssertionFailedf("lax equivalent columns are not supported"))
  1932  		}
  1933  		fmt.Fprintf(b, "%s==%s", f.from, f.to)
  1934  	} else {
  1935  		if f.strict {
  1936  			fmt.Fprintf(b, "%s-->%s", f.from, f.to)
  1937  		} else {
  1938  			fmt.Fprintf(b, "%s~~>%s", f.from, f.to)
  1939  		}
  1940  	}
  1941  }
  1942  
  1943  func (f *funcDep) String() string {
  1944  	var b strings.Builder
  1945  	f.format(&b)
  1946  	return b.String()
  1947  }