github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/dtables/statistics_table.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package dtables
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/dolthub/go-mysql-server/sql"
    21  	"github.com/dolthub/go-mysql-server/sql/stats"
    22  
    23  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
    24  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    25  	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/dsess"
    26  	"github.com/dolthub/dolt/go/libraries/doltcore/sqle/index"
    27  )
    28  
// StatisticsTable is a sql.Table implementation of the system table that exposes the statistics held by the session's
// statistics provider for the tables of a database. dbName is the database whose tables are inspected, branch is the
// branch the statistics are requested for (empty unless the table was created with a string asOf), and ddb is the
// DoltDB handle supplied at construction.
type StatisticsTable struct {
	dbName string
	branch string
	ddb    *doltdb.DoltDB
}

// Compile-time checks that *StatisticsTable satisfies the interfaces it is used through.
var _ sql.Table = (*StatisticsTable)(nil)
var _ sql.StatisticsTable = (*StatisticsTable)(nil)
    38  
    39  // NewStatisticsTable creates a StatisticsTable
    40  func NewStatisticsTable(_ *sql.Context, dbName string, ddb *doltdb.DoltDB, asOf interface{}) sql.Table {
    41  	ret := &StatisticsTable{dbName: dbName, ddb: ddb}
    42  	if branch, ok := asOf.(string); ok {
    43  		ret.branch = branch
    44  	}
    45  	return ret
    46  }
    47  
    48  // DataLength implements sql.StatisticsTable
    49  func (st *StatisticsTable) DataLength(ctx *sql.Context) (uint64, error) {
    50  	numBytesPerRow := schema.SchemaAvgLength(schema.StatsTableSqlSchema(st.dbName).Schema)
    51  	numRows, _, err := st.RowCount(ctx)
    52  	if err != nil {
    53  		return 0, err
    54  	}
    55  
    56  	// maxSize is the upper bound for how much space a table takes up on disk. It will typically
    57  	// greatly overestimate the actual size of the table on disk because it does not take into
    58  	// account that the data on disk is compressed and it assumes that every variable length
    59  	// field is fully used. Because of this, maxSize can easily be several orders of magnitude
    60  	// larger than the actual space used by the table on disk.
    61  	maxSize := numBytesPerRow * numRows
    62  
    63  	// To return a more realistic estimate of the size of the table on disk, we multiply maxSize by
    64  	// compressionFactor. This will still not give an accurate size of the table on disk, but it
    65  	// will generally be much closer than maxSize. This value comes from quickly testing some dbs
    66  	// with only columns that have a fixed length (e.g. int) and some with only columns that have
    67  	// a variable length (e.g. TEXT). 0.002 was between the two sets of values. Ultimately, having
    68  	// accurate table statistics is a better long term solution for this.
    69  	// https://github.com/dolthub/dolt/issues/6624
    70  	const compressionFactor = 0.002
    71  	estimatedSize := float64(maxSize) * compressionFactor
    72  	return uint64(estimatedSize), nil
    73  }
    74  
// BranchStatsProvider is the capability StatisticsTable needs from the session's statistics provider. The methods of
// StatisticsTable type-assert the provider to this interface before looking up statistics.
type BranchStatsProvider interface {
	// GetTableDoltStats returns the statistics known for one table, addressed by branch, database and table name.
	// Callers in this file pass the StatisticsTable's branch, which may be the empty string.
	GetTableDoltStats(ctx *sql.Context, branch, db, table string) ([]sql.Statistic, error)
}
    78  
    79  // RowCount implements sql.StatisticsTable
    80  func (st *StatisticsTable) RowCount(ctx *sql.Context) (uint64, bool, error) {
    81  	dSess := dsess.DSessFromSess(ctx.Session)
    82  	prov := dSess.Provider()
    83  
    84  	sqlDb, err := prov.Database(ctx, st.dbName)
    85  	if err != nil {
    86  		return 0, false, err
    87  	}
    88  
    89  	tables, err := sqlDb.GetTableNames(ctx)
    90  	if err != nil {
    91  		return 0, false, err
    92  	}
    93  
    94  	var cnt int
    95  	for _, table := range tables {
    96  		// only Dolt-specific provider has branch support
    97  		dbStats, err := dSess.StatsProvider().(BranchStatsProvider).GetTableDoltStats(ctx, st.branch, st.dbName, table)
    98  		if err != nil {
    99  
   100  		}
   101  		for _, dbStat := range dbStats {
   102  			cnt += len(dbStat.Histogram())
   103  		}
   104  	}
   105  
   106  	return uint64(cnt), true, nil
   107  }
   108  
// Name implements the sql.Table interface. The table's name is fixed: it is always the constant
// doltdb.StatisticsTableName.
func (st *StatisticsTable) Name() string {
	return doltdb.StatisticsTableName
}
   114  
   115  // String is a sql.Table interface function which returns the name of the table which is defined by the constant
   116  // StatisticsTableName
   117  func (st *StatisticsTable) String() string {
   118  	return doltdb.StatisticsTableName
   119  }
   120  
   121  // Schema is a sql.Table interface function that gets the sql.Schema of the log system table.
   122  func (st *StatisticsTable) Schema() sql.Schema {
   123  	return schema.StatsTableSqlSchema(st.dbName).Schema
   124  }
   125  
// Collation implements the sql.Table interface. The statistics table always reports sql.Collation_Default.
func (st *StatisticsTable) Collation() sql.CollationID {
	return sql.Collation_Default
}
   130  
// Partitions implements the sql.Table interface. The data is currently unpartitioned: the iterator is built by
// index.SinglePartitionIterFromNomsMap from a nil map, and PartitionRows ignores the partition it is handed.
func (st *StatisticsTable) Partitions(*sql.Context) (sql.PartitionIter, error) {
	return index.SinglePartitionIterFromNomsMap(nil), nil
}
   135  
   136  // PartitionRows is a sql.Table interface function that gets a row iterator for a partition
   137  func (st *StatisticsTable) PartitionRows(ctx *sql.Context, _ sql.Partition) (sql.RowIter, error) {
   138  	dSess := dsess.DSessFromSess(ctx.Session)
   139  	prov := dSess.Provider()
   140  
   141  	var sqlDb sql.Database
   142  	var err error
   143  	if st.branch != "" {
   144  		sqlDb, err = prov.Database(ctx, fmt.Sprintf("%s/%s", st.dbName, st.branch))
   145  	} else {
   146  		sqlDb, err = prov.Database(ctx, st.dbName)
   147  	}
   148  	if err != nil {
   149  		return nil, err
   150  	}
   151  
   152  	tables, err := sqlDb.GetTableNames(ctx)
   153  	if err != nil {
   154  		return nil, err
   155  	}
   156  
   157  	statsPro := dSess.StatsProvider().(BranchStatsProvider)
   158  	var dStats []sql.Statistic
   159  	for _, table := range tables {
   160  		dbStats, err := statsPro.GetTableDoltStats(ctx, st.branch, st.dbName, table)
   161  		if err != nil {
   162  			return nil, err
   163  		}
   164  		dStats = append(dStats, dbStats...)
   165  	}
   166  	return stats.NewStatsIter(ctx, dStats...)
   167  }
   168  
// PreciseMatch implements sql.IndexAddressable. It unconditionally reports true.
func (st *StatisticsTable) PreciseMatch() bool {
	return true
}