github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/metastore/ms_client.go (about) 1 package metastore 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "strings" 8 9 "github.com/treeverse/lakefs/pkg/catalog" 10 "github.com/treeverse/lakefs/pkg/logging" 11 mserrors "github.com/treeverse/lakefs/pkg/metastore/errors" 12 ) 13 14 const dbfsPrefix = "dbfs:/" 15 16 type ReadClient interface { 17 GetTable(ctx context.Context, dbName string, tableName string) (r *Table, err error) 18 HasTable(ctx context.Context, dbName string, tableName string) (hasTable bool, err error) 19 GetPartitions(ctx context.Context, dbName string, tableName string) (r []*Partition, err error) 20 GetPartition(ctx context.Context, dbName string, tableName string, values []string) (r *Partition, err error) 21 GetDatabase(ctx context.Context, name string) (r *Database, err error) 22 GetDatabases(ctx context.Context, pattern string) (databases []*Database, err error) 23 GetTables(ctx context.Context, dbName string, pattern string) (tables []*Table, err error) 24 } 25 26 type WriteClient interface { 27 CreateTable(ctx context.Context, tbl *Table) error 28 AlterTable(ctx context.Context, dbName string, tableName string, newTable *Table) error 29 AddPartitions(ctx context.Context, tableName string, dbName string, newParts []*Partition) error 30 AlterPartitions(ctx context.Context, dbName string, tableName string, newPartitions []*Partition) error 31 AlterPartition(ctx context.Context, dbName string, tableName string, partition *Partition) error 32 AddPartition(ctx context.Context, tableName string, dbName string, newPartition *Partition) error 33 DropPartition(ctx context.Context, dbName string, tableName string, values []string) error 34 CreateDatabase(ctx context.Context, database *Database) error 35 NormalizeDBName(name string) string // NormalizeDBName changes the db name to be a valid name for the client 36 GetDBLocation(dbName string) string // getDBLocation returns the expected locationURI of the database 37 } 38 39 type Client interface { 40 ReadClient 41 WriteClient 42 } 43 44 func CopyOrMerge(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, toBranch, serde string, partition []string, fixSparkPlaceHolder bool, dbfsLocation string) error { 45 transformLocation := func(location string) (string, error) { 46 location = HandleDBFSLocation(ctx, location, dbfsLocation) 47 transformedLocation, err := ReplaceBranchName(location, toBranch) 48 if err != nil { 49 return "", fmt.Errorf("failed to replace branch name with location: '%s' and branch: '%s': %w", location, toBranch, err) 50 } 51 return transformedLocation, nil 52 } 53 return copyOrMergeWithTransformLocation(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, serde, false, partition, transformLocation, fixSparkPlaceHolder) 54 } 55 56 func CopyDB(ctx context.Context, fromClient, toClient Client, fromDB, toDB, toBranch string, dbfsLocation string) error { 57 transformLocation := func(location string) (string, error) { 58 if location == "" { 59 return "", nil 60 } 61 location = HandleDBFSLocation(ctx, location, dbfsLocation) 62 transformedLocation, err := ReplaceBranchName(location, toBranch) 63 if err != nil { 64 return "", fmt.Errorf("failed to replace branch name with location: '%s' and branch: '%s': %w", location, toBranch, err) 65 } 66 return transformedLocation, nil 67 } 68 return copyDBWithTransformLocation(ctx, fromClient, toClient, fromDB, toDB, transformLocation) 69 } 70 71 func copyDBWithTransformLocation(ctx context.Context, fromClient, toClient Client, fromDB string, toDB string, transformLocation func(location string) (string, error)) error { 72 schema, err := fromClient.GetDatabase(ctx, fromDB) 73 if err != nil { 74 return fmt.Errorf("failed to get database on copy from '%s': %w", fromDB, err) 75 } 76 schema.Name = toDB 77 schema.LocationURI, err = transformLocation(schema.LocationURI) 78 if err != nil { 79 return err 80 } 81 err = toClient.CreateDatabase(ctx, schema) 82 if err != nil { 83 return fmt.Errorf("failed to create database with name '%s' and location '%s': %w", schema.Name, schema.LocationURI, err) 84 } 85 return nil 86 } 87 88 func copyOrMergeWithTransformLocation(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, serde string, setSymlink bool, partition []string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error { 89 log := logging.FromContext(ctx).WithFields(logging.Fields{ 90 "from_db": fromDB, 91 "from_table": fromTable, 92 "to_db": toDB, 93 "to_table": toTable, 94 "set_symlink": setSymlink, 95 "serde": serde, 96 "partition_len": len(partition), 97 }) 98 if len(partition) > 0 { 99 log.Debug("CopyPartition") 100 return CopyPartition(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, serde, setSymlink, partition, transformLocation, fixSparkPlaceHolder) 101 } 102 hasTable, err := toClient.HasTable(ctx, toDB, toTable) 103 if err != nil { 104 return err 105 } 106 if !hasTable { 107 log.Debug("Copy") 108 table, err := fromClient.GetTable(ctx, fromDB, fromTable) 109 if err != nil { 110 return err 111 } 112 partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable) 113 if err != nil { 114 return err 115 } 116 return Copy(ctx, table, partitions, toDB, toTable, serde, setSymlink, toClient, transformLocation, fixSparkPlaceHolder) 117 } 118 log.Debug("Merge") 119 table, err := fromClient.GetTable(ctx, fromDB, fromTable) 120 if err != nil { 121 return err 122 } 123 partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable) 124 if err != nil { 125 return err 126 } 127 partitionCollection := NewPartitionCollection(partitions) 128 return Merge(ctx, table, partitionCollection, toDB, toTable, serde, setSymlink, toClient, transformLocation, fixSparkPlaceHolder) 129 } 130 131 func CopyOrMergeFromValues(ctx context.Context, fromClient Client, fTable *Table, toClient Client, fromDB, fromTable, toDB, toTable, serde string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error { 132 hasTable, err := toClient.HasTable(ctx, toDB, toTable) 133 if err != nil { 134 return err 135 } 136 partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable) 137 if err != nil { 138 return err 139 } 140 if !hasTable { 141 return Copy(ctx, fTable, partitions, toDB, toTable, serde, false, toClient, transformLocation, fixSparkPlaceHolder) 142 } 143 partitionCollection := NewPartitionCollection(partitions) 144 return Merge(ctx, fTable, partitionCollection, toDB, toTable, serde, false, toClient, transformLocation, fixSparkPlaceHolder) 145 } 146 147 func CopyOrMergeAll(ctx context.Context, fromClient, toClient Client, schemaFilter, tableFilter, toBranch string, continueOnError, fixSparkPlaceHolder bool, dbfsLocation string) error { 148 databases, err := fromClient.GetDatabases(ctx, schemaFilter) 149 if err != nil { 150 return err 151 } 152 transformLocation := func(location string) (string, error) { 153 location = HandleDBFSLocation(ctx, location, dbfsLocation) 154 return ReplaceBranchName(location, toBranch) 155 } 156 return applyAll(ctx, fromClient, toClient, databases, tableFilter, transformLocation, fixSparkPlaceHolder, continueOnError) 157 } 158 159 // HandleDBFSLocation translates Data Bricks File system path to the S3 path using the dbfsLocation 160 func HandleDBFSLocation(ctx context.Context, location string, dbfsLocation string) string { 161 l := location 162 if dbfsLocation != "" && strings.HasPrefix(location, dbfsPrefix) { 163 l = strings.Replace(location, dbfsPrefix, dbfsLocation, 1) 164 } 165 logging.FromContext(ctx).WithFields(logging.Fields{"dbfsLocation": dbfsLocation, "location": location, "new_location": l}).Info("translate databricks file system path to s3 path") 166 return l 167 } 168 169 func ImportAll(ctx context.Context, fromClient, toClient Client, schemaFilter, tableFilter, repo, toBranch string, continueOnError, fixSparkPlaceHolder bool, dbfsLocation string) error { 170 databases, err := fromClient.GetDatabases(ctx, schemaFilter) 171 if err != nil { 172 return err 173 } 174 transformLocation := func(location string) (string, error) { 175 location = HandleDBFSLocation(ctx, location, dbfsLocation) 176 return ReplaceExternalToLakeFSImported(location, repo, toBranch) 177 } 178 return applyAll(ctx, fromClient, toClient, databases, tableFilter, transformLocation, fixSparkPlaceHolder, continueOnError) 179 } 180 181 func applyAll(ctx context.Context, fromClient Client, toClient Client, databases []*Database, tableFilter string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool, continueOnError bool) error { 182 for _, database := range databases { 183 fromDBName := database.Name 184 toDBName := toClient.NormalizeDBName(database.Name) 185 err := copyDBWithTransformLocation(ctx, fromClient, toClient, fromDBName, toDBName, transformLocation) 186 if err != nil && !errors.Is(err, mserrors.ErrSchemaExists) { 187 return err 188 } 189 tables, err := fromClient.GetTables(ctx, fromDBName, tableFilter) 190 if err != nil { 191 return err 192 } 193 for _, table := range tables { 194 tableName := table.TableName 195 fmt.Printf("table %s.%s -> %s.%s\n", fromDBName, tableName, toDBName, tableName) 196 err = CopyOrMergeFromValues(ctx, fromClient, table, toClient, fromDBName, tableName, toDBName, tableName, tableName, transformLocation, fixSparkPlaceHolder) 197 if err != nil { 198 if !continueOnError { 199 return err 200 } 201 fmt.Println(err) 202 } 203 } 204 } 205 return nil 206 } 207 208 func Copy(ctx context.Context, fromTable *Table, partitions []*Partition, toDB, toTable, serde string, setSymlink bool, toClient WriteClient, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error { 209 isSparkSQLTable := fromTable.isSparkSQLTable() 210 err := fromTable.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder) 211 if err != nil { 212 return err 213 } 214 for _, partition := range partitions { 215 err := partition.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder) 216 if err != nil { 217 return err 218 } 219 } 220 err = toClient.CreateTable(ctx, fromTable) 221 if err != nil { 222 return err 223 } 224 err = toClient.AddPartitions(ctx, toTable, toDB, partitions) 225 return err 226 } 227 228 func Merge(ctx context.Context, table *Table, partitionIter Collection, toDB, toTable, serde string, setSymlink bool, toClient Client, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error { 229 isSparkSQLTable := table.isSparkSQLTable() 230 err := table.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder) 231 if err != nil { 232 return err 233 } 234 toPartitions, err := toClient.GetPartitions(ctx, toDB, toTable) 235 if err != nil { 236 return err 237 } 238 toPartitionIter := NewPartitionCollection(toPartitions) 239 var addPartitions, removePartitions, alterPartitions []*Partition 240 err = DiffIterable(partitionIter, toPartitionIter, func(difference catalog.DifferenceType, value interface{}, _ string) error { 241 partition, ok := value.(*Partition) 242 if !ok { 243 return fmt.Errorf("%w at diffIterable, got %T while expected *Partition", mserrors.ErrExpectedType, value) 244 } 245 err = partition.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, isSparkSQLTable, fixSparkPlaceHolder) 246 if err != nil { 247 return err 248 } 249 switch difference { 250 case catalog.DifferenceTypeRemoved: 251 removePartitions = append(removePartitions, partition) 252 case catalog.DifferenceTypeAdded: 253 addPartitions = append(addPartitions, partition) 254 default: 255 alterPartitions = append(alterPartitions, partition) 256 } 257 return nil 258 }) 259 if err != nil { 260 return err 261 } 262 263 err = toClient.AlterTable(ctx, toDB, toTable, table) 264 if err != nil { 265 return err 266 } 267 err = toClient.AddPartitions(ctx, toTable, toDB, addPartitions) 268 if err != nil { 269 return err 270 } 271 err = toClient.AlterPartitions(ctx, toDB, toTable, alterPartitions) 272 if err != nil { 273 return err 274 } 275 // drop one by one 276 for _, partition := range removePartitions { 277 values := partition.Values 278 err = toClient.DropPartition(ctx, toDB, toTable, values) 279 if err != nil { 280 return err 281 } 282 } 283 return nil 284 } 285 286 func CopyPartition(ctx context.Context, fromClient ReadClient, toClient Client, fromDB, fromTable, toDB, toTable, serde string, setSymlink bool, partition []string, transformLocation func(location string) (string, error), fixSparkPlaceHolder bool) error { 287 t1, err := fromClient.GetTable(ctx, fromDB, fromTable) 288 if err != nil { 289 return err 290 } 291 p1, err := fromClient.GetPartition(ctx, fromDB, fromTable, partition) 292 if err != nil { 293 return err 294 } 295 p2, err := toClient.GetPartition(ctx, toDB, toTable, partition) 296 if err != nil { 297 return err 298 } 299 err = p1.Update(ctx, toDB, toTable, serde, setSymlink, transformLocation, t1.isSparkSQLTable(), fixSparkPlaceHolder) 300 if err != nil { 301 return err 302 } 303 if p2 == nil { 304 err = toClient.AddPartition(ctx, "", "", p1) 305 } else { 306 err = toClient.AlterPartition(ctx, toDB, toTable, p1) 307 } 308 return err 309 } 310 311 func GetDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB, fromTable, toDB, toTable string) (*MetaDiff, error) { 312 diffColumns, err := getColumnDiff(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable) 313 if err != nil { 314 return nil, err 315 } 316 partitionDiff, err := getPartitionsDiff(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable) 317 if err != nil { 318 return nil, err 319 } 320 return &MetaDiff{ 321 PartitionDiff: partitionDiff, 322 ColumnsDiff: diffColumns, 323 }, nil 324 } 325 326 func getPartitionsDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB string, fromTable string, toDB string, toTable string) (catalog.Differences, error) { 327 partitions, err := fromClient.GetPartitions(ctx, fromDB, fromTable) 328 if err != nil { 329 return nil, err 330 } 331 partitionIter := NewPartitionCollection(partitions) 332 toPartitions, err := toClient.GetPartitions(ctx, toDB, toTable) 333 if err != nil { 334 return nil, err 335 } 336 toPartitionIter := NewPartitionCollection(toPartitions) 337 return Diff(partitionIter, toPartitionIter) 338 } 339 340 func getColumnDiff(ctx context.Context, fromClient, toClient ReadClient, fromDB, fromTable, toDB, toTable string) (catalog.Differences, error) { 341 table, err := fromClient.GetTable(ctx, fromDB, fromTable) 342 if err != nil { 343 return nil, err 344 } 345 colsIter := NewColumnCollection(table.Sd.Cols) 346 347 toTbl, err := toClient.GetTable(ctx, toDB, toTable) 348 if err != nil { 349 return nil, err 350 } 351 toColumns := toTbl.Sd.Cols // TODO(Guys): change name 352 toColsIter := NewColumnCollection(toColumns) 353 354 return Diff(colsIter, toColsIter) 355 } 356 357 func CopyOrMergeToSymlink(ctx context.Context, fromClient, toClient Client, fromDB, fromTable, toDB, toTable, locationPrefix string, fixSparkPlaceHolder bool) error { 358 transformLocation := func(location string) (string, error) { 359 return GetSymlinkLocation(location, locationPrefix) 360 } 361 return copyOrMergeWithTransformLocation(ctx, fromClient, toClient, fromDB, fromTable, toDB, toTable, "", true, nil, transformLocation, fixSparkPlaceHolder) 362 }