github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/metastore/ms_client.go (about)

     1  package metastore
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"strings"
     8  
     9  	"github.com/treeverse/lakefs/pkg/catalog"
    10  	"github.com/treeverse/lakefs/pkg/logging"
    11  	mserrors "github.com/treeverse/lakefs/pkg/metastore/errors"
    12  )
    13  
    14  const dbfsPrefix = "dbfs:/"
    15  
    16  type ReadClient interface {
    17  	GetTable(ctx context.Context, dbName string, tableName string) (r *Table, err error)
    18  	HasTable(ctx context.Context, dbName string, tableName string) (hasTable bool, err error)
    19  	GetPartitions(ctx context.Context, dbName string, tableName string) (r []*Partition, err error)
    20  	GetPartition(ctx context.Context, dbName string, tableName string, values []string) (r *Partition, err error)
    21  	GetDatabase(ctx context.Context, name string) (r *Database, err error)
    22  	GetDatabases(ctx context.Context, pattern string) (databases []*Database, err error)
    23  	GetTables(ctx context.Context, dbName string, pattern string) (tables []*Table, err error)
    24  }
    25  
    26  type WriteClient interface {
    27  	CreateTable(ctx context.Context, tbl *Table) error
    28  	AlterTable(ctx context.Context, dbName string, tableName string, newTable *Table) error
    29  	AddPartitions(ctx context.Context, tableName string, dbName string, newParts []*Partition) error
    30  	AlterPartitions(ctx context.Context, dbName string, tableName string, newPartitions []*Partition) error
    31  	AlterPartition(ctx context.Context, dbName string, tableName string, partition *Partition) error
    32  	AddPartition(ctx context.Context, tableName string, dbName string, newPartition *Partition) error
    33  	DropPartition(ctx context.Context, dbName string, tableName string, values []string) error
    34  	CreateDatabase(ctx context.Context, database *Database) error
    35  	NormalizeDBName(name string) string // NormalizeDBName changes the db name to be a valid name for the client
    36  	GetDBLocation(dbName string) string // getDBLocation returns the expected locationURI of the database
    37  }
    38  
    39  type Client interface {
    40  	ReadClient
    41  	WriteClient
    42  }
    43  
    44  func CopyOrMerge(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, toBranch, serde string, partition []string, fixSparkPlaceHolder bool, dbfsLocation string) error {
    45  	transformLocation := func(location string) (string, error) {
    46  		location = HandleDBFSLocation(ctx, location, dbfsLocation)
    47  		transformedLocation, err := ReplaceBranchName(location, toBranch)
    48  		if err != nil {
    49  			return "", fmt.Errorf("failed to replace branch name with location: '%s' and branch: '%s': %w", location, toBranch, err)
    50  		}
    51  		return transformedLocation, nil
    52  	}
    53  	return copyOrMergeWithTransformLocation(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, serde, false, partition, transformLocation, fixSparkPlaceHolder)
    54  }
    55  
    56  func CopyDB(ctx context.Context, fromClient, toClient Client, fromDB, toDB, toBranch string, dbfsLocation string) error {
    57  	transformLocation := func(location string) (string, error) {
    58  		if location == "" {
    59  			return "", nil
    60  		}
    61  		location = HandleDBFSLocation(ctx, location, dbfsLocation)
    62  		transformedLocation, err := ReplaceBranchName(location, toBranch)
    63  		if err != nil {
    64  			return "", fmt.Errorf("failed to replace branch name with location: '%s' and branch: '%s': %w", location, toBranch, err)
    65  		}
    66  		return transformedLocation, nil
    67  	}
    68  	return copyDBWithTransformLocation(ctx, fromClient, toClient, fromDB, toDB, transformLocation)
    69  }
    70  
    71  func copyDBWithTransformLocation(ctx context.Context, fromClient, toClient Client, fromDB string, toDB string, transformLocation func(location string) (string, error)) error {
    72  	schema, err := fromClient.GetDatabase(ctx, fromDB)
    73  	if err != nil {
    74  		return fmt.Errorf("failed to get database on copy from '%s': %w", fromDB, err)
    75  	}
    76  	schema.Name = toDB
    77  	schema.LocationURI, err = transformLocation(schema.LocationURI)
    78  	if err != nil {
    79  		return err
    80  	}
    81  	err = toClient.CreateDatabase(ctx, schema)
    82  	if err != nil {
    83  		return fmt.Errorf("failed to create database with name '%s' and location '%s': %w", schema.Name, schema.LocationURI, err)
    84  	}
    85  	return nil
    86  }
    87  
    88  func copyOrMergeWithTransformLocation(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, serde string, setSymlink bool, partition []string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error {
    89  	log := logging.FromContext(ctx).WithFields(logging.Fields{
    90  		"from_db":       fromDB,
    91  		"from_table":    fromTable,
    92  		"to_db":         toDB,
    93  		"to_table":      toTable,
    94  		"set_symlink":   setSymlink,
    95  		"serde":         serde,
    96  		"partition_len": len(partition),
    97  	})
    98  	if len(partition) > 0 {
    99  		log.Debug("CopyPartition")
   100  		return CopyPartition(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, serde, setSymlink, partition, transformLocation, fixSparkPlaceHolder)
   101  	}
   102  	hasTable, err := toClient.HasTable(ctx, toDB, toTable)
   103  	if err != nil {
   104  		return err
   105  	}
   106  	if !hasTable {
   107  		log.Debug("Copy")
   108  		table, err := fromClient.GetTable(ctx, fromDB, fromTable)
   109  		if err != nil {
   110  			return err
   111  		}
   112  		partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable)
   113  		if err != nil {
   114  			return err
   115  		}
   116  		return Copy(ctx, table, partitions, toDB, toTable, serde, setSymlink, toClient, transformLocation, fixSparkPlaceHolder)
   117  	}
   118  	log.Debug("Merge")
   119  	table, err := fromClient.GetTable(ctx, fromDB, fromTable)
   120  	if err != nil {
   121  		return err
   122  	}
   123  	partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable)
   124  	if err != nil {
   125  		return err
   126  	}
   127  	partitionCollection := NewPartitionCollection(partitions)
   128  	return Merge(ctx, table, partitionCollection, toDB, toTable, serde, setSymlink, toClient, transformLocation, fixSparkPlaceHolder)
   129  }
   130  
   131  func CopyOrMergeFromValues(ctx context.Context, fromClient Client, fTable *Table, toClient Client, fromDB, fromTable, toDB, toTable, serde string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error {
   132  	hasTable, err := toClient.HasTable(ctx, toDB, toTable)
   133  	if err != nil {
   134  		return err
   135  	}
   136  	partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable)
   137  	if err != nil {
   138  		return err
   139  	}
   140  	if !hasTable {
   141  		return Copy(ctx, fTable, partitions, toDB, toTable, serde, false, toClient, transformLocation, fixSparkPlaceHolder)
   142  	}
   143  	partitionCollection := NewPartitionCollection(partitions)
   144  	return Merge(ctx, fTable, partitionCollection, toDB, toTable, serde, false, toClient, transformLocation, fixSparkPlaceHolder)
   145  }
   146  
   147  func CopyOrMergeAll(ctx context.Context, fromClient, toClient Client, schemaFilter, tableFilter, toBranch string, continueOnError, fixSparkPlaceHolder bool, dbfsLocation string) error {
   148  	databases, err := fromClient.GetDatabases(ctx, schemaFilter)
   149  	if err != nil {
   150  		return err
   151  	}
   152  	transformLocation := func(location string) (string, error) {
   153  		location = HandleDBFSLocation(ctx, location, dbfsLocation)
   154  		return ReplaceBranchName(location, toBranch)
   155  	}
   156  	return applyAll(ctx, fromClient, toClient, databases, tableFilter, transformLocation, fixSparkPlaceHolder, continueOnError)
   157  }
   158  
   159  // HandleDBFSLocation translates Data Bricks File system path to the S3 path using the dbfsLocation
   160  func HandleDBFSLocation(ctx context.Context, location string, dbfsLocation string) string {
   161  	l := location
   162  	if dbfsLocation != "" && strings.HasPrefix(location, dbfsPrefix) {
   163  		l = strings.Replace(location, dbfsPrefix, dbfsLocation, 1)
   164  	}
   165  	logging.FromContext(ctx).WithFields(logging.Fields{"dbfsLocation": dbfsLocation, "location": location, "new_location": l}).Info("translate databricks file system path to s3 path")
   166  	return l
   167  }
   168  
   169  func ImportAll(ctx context.Context, fromClient, toClient Client, schemaFilter, tableFilter, repo, toBranch string, continueOnError, fixSparkPlaceHolder bool, dbfsLocation string) error {
   170  	databases, err := fromClient.GetDatabases(ctx, schemaFilter)
   171  	if err != nil {
   172  		return err
   173  	}
   174  	transformLocation := func(location string) (string, error) {
   175  		location = HandleDBFSLocation(ctx, location, dbfsLocation)
   176  		return ReplaceExternalToLakeFSImported(location, repo, toBranch)
   177  	}
   178  	return applyAll(ctx, fromClient, toClient, databases, tableFilter, transformLocation, fixSparkPlaceHolder, continueOnError)
   179  }
   180  
   181  func applyAll(ctx context.Context, fromClient Client, toClient Client, databases []*Database, tableFilter string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool, continueOnError bool) error {
   182  	for _, database := range databases {
   183  		fromDBName := database.Name
   184  		toDBName := toClient.NormalizeDBName(database.Name)
   185  		err := copyDBWithTransformLocation(ctx, fromClient, toClient, fromDBName, toDBName, transformLocation)
   186  		if err != nil && !errors.Is(err, mserrors.ErrSchemaExists) {
   187  			return err
   188  		}
   189  		tables, err := fromClient.GetTables(ctx, fromDBName, tableFilter)
   190  		if err != nil {
   191  			return err
   192  		}
   193  		for _, table := range tables {
   194  			tableName := table.TableName
   195  			fmt.Printf("table %s.%s -> %s.%s\n", fromDBName, tableName, toDBName, tableName)
   196  			err = CopyOrMergeFromValues(ctx, fromClient, table, toClient, fromDBName, tableName, toDBName, tableName, tableName, transformLocation, fixSparkPlaceHolder)
   197  			if err != nil {
   198  				if !continueOnError {
   199  					return err
   200  				}
   201  				fmt.Println(err)
   202  			}
   203  		}
   204  	}
   205  	return nil
   206  }
   207  
   208  func Copy(ctx context.Context, fromTable *Table, partitions []*Partition, toDB, toTable, serde string, setSymlink bool, toClient WriteClient, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error {
   209  	isSparkSQLTable := fromTable.isSparkSQLTable()
   210  	err := fromTable.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder)
   211  	if err != nil {
   212  		return err
   213  	}
   214  	for _, partition := range partitions {
   215  		err := partition.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder)
   216  		if err != nil {
   217  			return err
   218  		}
   219  	}
   220  	err = toClient.CreateTable(ctx, fromTable)
   221  	if err != nil {
   222  		return err
   223  	}
   224  	err = toClient.AddPartitions(ctx, toTable, toDB, partitions)
   225  	return err
   226  }
   227  
   228  func Merge(ctx context.Context, table *Table, partitionIter Collection, toDB, toTable, serde string, setSymlink bool, toClient Client, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error {
   229  	isSparkSQLTable := table.isSparkSQLTable()
   230  	err := table.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder)
   231  	if err != nil {
   232  		return err
   233  	}
   234  	toPartitions, err := toClient.GetPartitions(ctx, toDB, toTable)
   235  	if err != nil {
   236  		return err
   237  	}
   238  	toPartitionIter := NewPartitionCollection(toPartitions)
   239  	var addPartitions, removePartitions, alterPartitions []*Partition
   240  	err = DiffIterable(partitionIter, toPartitionIter, func(difference catalog.DifferenceType, value interface{}, _ string) error {
   241  		partition, ok := value.(*Partition)
   242  		if !ok {
   243  			return fmt.Errorf("%w at diffIterable, got %T while expected  *Partition", mserrors.ErrExpectedType, value)
   244  		}
   245  		err = partition.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder)
   246  		if err != nil {
   247  			return err
   248  		}
   249  		switch difference {
   250  		case catalog.DifferenceTypeRemoved:
   251  			removePartitions = append(removePartitions, partition)
   252  		case catalog.DifferenceTypeAdded:
   253  			addPartitions = append(addPartitions, partition)
   254  		default:
   255  			alterPartitions = append(alterPartitions, partition)
   256  		}
   257  		return nil
   258  	})
   259  	if err != nil {
   260  		return err
   261  	}
   262  
   263  	err = toClient.AlterTable(ctx, toDB, toTable, table)
   264  	if err != nil {
   265  		return err
   266  	}
   267  	err = toClient.AddPartitions(ctx, toTable, toDB, addPartitions)
   268  	if err != nil {
   269  		return err
   270  	}
   271  	err = toClient.AlterPartitions(ctx, toDB, toTable, alterPartitions)
   272  	if err != nil {
   273  		return err
   274  	}
   275  	// drop one by one
   276  	for _, partition := range removePartitions {
   277  		values := partition.Values
   278  		err = toClient.DropPartition(ctx, toDB, toTable, values)
   279  		if err != nil {
   280  			return err
   281  		}
   282  	}
   283  	return nil
   284  }
   285  
   286  func CopyPartition(ctx context.Context, fromClient ReadClient, toClient Client, fromDB, fromTable, toDB, toTable, serde string, setSymlink bool, partition []string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error {
   287  	t1, err := fromClient.GetTable(ctx, fromDB, fromTable)
   288  	if err != nil {
   289  		return err
   290  	}
   291  	p1, err := fromClient.GetPartition(ctx, fromDB, fromTable, partition)
   292  	if err != nil {
   293  		return err
   294  	}
   295  	p2, err := toClient.GetPartition(ctx, toDB, toTable, partition)
   296  	if err != nil {
   297  		return err
   298  	}
   299  	err = p1.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, t1.isSparkSQLTable(), fixSparkPlaceHolder)
   300  	if err != nil {
   301  		return err
   302  	}
   303  	if p2 == nil {
   304  		err = toClient.AddPartition(ctx, "", "", p1)
   305  	} else {
   306  		err = toClient.AlterPartition(ctx, toDB, toTable, p1)
   307  	}
   308  	return err
   309  }
   310  
   311  func GetDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB, fromTable, toDB, toTable string) (*MetaDiff, error) {
   312  	diffColumns, err := getColumnDiff(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable)
   313  	if err != nil {
   314  		return nil, err
   315  	}
   316  	partitionDiff, err := getPartitionsDiff(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable)
   317  	if err != nil {
   318  		return nil, err
   319  	}
   320  	return &MetaDiff{
   321  		PartitionDiff: partitionDiff,
   322  		ColumnsDiff:   diffColumns,
   323  	}, nil
   324  }
   325  
   326  func getPartitionsDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB string, fromTable string, toDB string, toTable string) (catalog.Differences, error) {
   327  	partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable)
   328  	if err != nil {
   329  		return nil, err
   330  	}
   331  	partitionIter := NewPartitionCollection(partitions)
   332  	toPartitions, err := toClient.GetPartitions(ctx, toDB, toTable)
   333  	if err != nil {
   334  		return nil, err
   335  	}
   336  	toPartitionIter := NewPartitionCollection(toPartitions)
   337  	return Diff(partitionIter, toPartitionIter)
   338  }
   339  
   340  func getColumnDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB, fromTable, toDB, toTable string) (catalog.Differences, error) {
   341  	table, err := fromClient.GetTable(ctx, fromDB, fromTable)
   342  	if err != nil {
   343  		return nil, err
   344  	}
   345  	colsIter := NewColumnCollection(table.Sd.Cols)
   346  
   347  	toTbl, err := toClient.GetTable(ctx, toDB, toTable)
   348  	if err != nil {
   349  		return nil, err
   350  	}
   351  	toColumns := toTbl.Sd.Cols // TODO(Guys): change name
   352  	toColsIter := NewColumnCollection(toColumns)
   353  
   354  	return Diff(colsIter, toColsIter)
   355  }
   356  
   357  func CopyOrMergeToSymlink(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, locationPrefix string, fixSparkPlaceHolder bool) error {
   358  	transformLocation := func(location string) (string, error) {
   359  		return GetSymlinkLocation(location, locationPrefix)
   360  	}
   361  	return copyOrMergeWithTransformLocation(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, "", true, nil, transformLocation, fixSparkPlaceHolder)
   362  }