github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/metastore/glue/metastore_client.go (about)

     1  package glue
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"regexp"
     8  	"strings"
     9  
    10  	"github.com/aws/aws-sdk-go-v2/aws"
    11  	"github.com/aws/aws-sdk-go-v2/service/glue"
    12  	"github.com/aws/aws-sdk-go-v2/service/glue/types"
    13  	"github.com/treeverse/lakefs/pkg/logging"
    14  	"github.com/treeverse/lakefs/pkg/metastore"
    15  	mserrors "github.com/treeverse/lakefs/pkg/metastore/errors"
    16  )
    17  
// MaxParts is the page size (MaxResults) this file requests from Glue when
// paging through databases, tables and partitions.
// NOTE(review): the value is sent unchanged to GetDatabases, GetTables and
// GetPartitions; confirm each of these APIs accepts a page size this large.
const MaxParts = 1000 // max possible 1000
    19  
// MSClient is a metastore client that talks to the AWS Glue catalog through
// the AWS SDK Glue client.
type MSClient struct {
	// client is the SDK client used for every Glue request in this file.
	client          *glue.Client
	// catalogID is sent as CatalogId on every Glue request.
	catalogID       string
	// baseLocationURI is the prefix used by GetDBLocation; NewMSClient strips
	// trailing "/" characters before storing it.
	baseLocationURI string
}
    25  
    26  func (g *MSClient) GetDBLocation(dbName string) string {
    27  	return fmt.Sprintf("%s/%s", g.baseLocationURI, dbName)
    28  }
    29  
// NormalizeDBName returns db unchanged; this client applies no normalization
// to database names.
func (g *MSClient) NormalizeDBName(db string) string {
	return db
}
    33  
    34  func NewMSClient(client *glue.Client, catalogID, baselLocationURI string) (metastore.Client, error) {
    35  	if catalogID == "" {
    36  		logging.ContextUnavailable().Warn("Glue catalog id is empty")
    37  	}
    38  	return &MSClient{
    39  		client:          client,
    40  		catalogID:       catalogID,
    41  		baseLocationURI: strings.TrimRight(baselLocationURI, "/"),
    42  	}, nil
    43  }
    44  
    45  func (g *MSClient) HasTable(ctx context.Context, dbName string, tableName string) (bool, error) {
    46  	table, err := g.GetTable(ctx, dbName, tableName)
    47  	var errEnityNotFound *types.EntityNotFoundException // TODO(Guys): validate this one
    48  	if err != nil && !errors.As(err, &errEnityNotFound) {
    49  		return false, err
    50  	}
    51  	return table != nil, nil
    52  }
    53  
    54  func (g *MSClient) GetDatabase(ctx context.Context, name string) (*metastore.Database, error) {
    55  	db, err := g.client.GetDatabase(ctx, &glue.GetDatabaseInput{
    56  		CatalogId: aws.String(g.catalogID),
    57  		Name:      aws.String(name),
    58  	})
    59  	if err != nil {
    60  		return nil, err
    61  	}
    62  	return DatabaseGlueToLocal(db.Database), nil
    63  }
    64  
    65  func (g *MSClient) getDatabaseFromGlue(ctx context.Context, token *string, parts int) (*glue.GetDatabasesOutput, error) {
    66  	return g.client.GetDatabases(ctx, &glue.GetDatabasesInput{
    67  		CatalogId:  aws.String(g.catalogID),
    68  		MaxResults: aws.Int32(int32(parts)),
    69  		NextToken:  token,
    70  	})
    71  }
    72  
    73  func (g *MSClient) GetDatabases(ctx context.Context, pattern string) ([]*metastore.Database, error) {
    74  	var nextToken *string
    75  	var allDatabases []*metastore.Database
    76  
    77  	for {
    78  		getDatabasesOutput, err := g.getDatabaseFromGlue(ctx, nextToken, MaxParts)
    79  		if err != nil {
    80  			return nil, err
    81  		}
    82  		nextToken = getDatabasesOutput.NextToken
    83  		filteredDatabases, err := filterDatabases(getDatabasesOutput.DatabaseList, pattern)
    84  		if err != nil {
    85  			return nil, err
    86  		}
    87  		databases := DatabasesGlueToLocal(filteredDatabases)
    88  		allDatabases = append(allDatabases, databases...)
    89  		if nextToken == nil {
    90  			break
    91  		}
    92  	}
    93  	return allDatabases, nil
    94  }
    95  
    96  func filterDatabases(databases []types.Database, pattern string) ([]types.Database, error) {
    97  	if pattern == "" {
    98  		return databases, nil
    99  	}
   100  	r, err := regexp.Compile(pattern)
   101  	if err != nil {
   102  		return nil, err
   103  	}
   104  	res := make([]types.Database, 0)
   105  	for _, database := range databases {
   106  		if r.MatchString(aws.ToString(database.Name)) {
   107  			res = append(res, database)
   108  		}
   109  	}
   110  	return res, nil
   111  }
   112  
   113  func (g *MSClient) GetTables(ctx context.Context, dbName string, pattern string) ([]*metastore.Table, error) {
   114  	var nextToken *string
   115  	allTables := make([]*metastore.Table, 0)
   116  	for {
   117  		getTablesOutput, err := g.client.GetTables(ctx, &glue.GetTablesInput{
   118  			CatalogId:    aws.String(g.catalogID),
   119  			DatabaseName: aws.String(dbName),
   120  			Expression:   aws.String(pattern),
   121  			MaxResults:   aws.Int32(MaxParts),
   122  			NextToken:    nextToken,
   123  		})
   124  		if err != nil {
   125  			return nil, err
   126  		}
   127  		nextToken = getTablesOutput.NextToken
   128  
   129  		tables := TablesGlueToLocal(getTablesOutput.TableList)
   130  		allTables = append(allTables, tables...)
   131  		if nextToken == nil {
   132  			break
   133  		}
   134  	}
   135  	return allTables, nil
   136  }
   137  
   138  func (g *MSClient) AlterTable(ctx context.Context, dbName string, _ string, newTable *metastore.Table) error {
   139  	table := TableLocalToGlue(newTable)
   140  	_, err := g.client.UpdateTable(ctx, &glue.UpdateTableInput{
   141  		CatalogId:    aws.String(g.catalogID),
   142  		DatabaseName: aws.String(dbName),
   143  		SkipArchive:  aws.Bool(false), // UpdateTable always creates an archived version of the table before updating it. However, if skipArchive is set to true, UpdateTable does not create the archived version.
   144  		TableInput:   table,
   145  	})
   146  	return err
   147  }
   148  
   149  func (g *MSClient) DropPartition(ctx context.Context, dbName string, tableName string, values []string) error {
   150  	_, err := g.client.DeletePartition(ctx, &glue.DeletePartitionInput{
   151  		CatalogId:       aws.String(g.catalogID),
   152  		DatabaseName:    aws.String(dbName),
   153  		PartitionValues: values,
   154  		TableName:       aws.String(tableName),
   155  	})
   156  	return err
   157  }
   158  
   159  func (g *MSClient) CreateDatabase(ctx context.Context, database *metastore.Database) error {
   160  	databaseInput := DatabaseLocalToGlue(database)
   161  	_, err := g.client.CreateDatabase(ctx, &glue.CreateDatabaseInput{
   162  		CatalogId:     aws.String(g.catalogID),
   163  		DatabaseInput: databaseInput,
   164  	})
   165  	var errExists *types.AlreadyExistsException
   166  	if errors.As(err, &errExists) {
   167  		return mserrors.ErrSchemaExists
   168  	}
   169  	return err
   170  }
   171  
   172  func (g *MSClient) getTableData(ctx context.Context, dbName string, tblName string) (*types.Table, error) {
   173  	table, err := g.client.GetTable(ctx,
   174  		&glue.GetTableInput{
   175  			CatalogId:    aws.String(g.catalogID),
   176  			DatabaseName: aws.String(dbName),
   177  			Name:         aws.String(tblName),
   178  		})
   179  	if err != nil {
   180  		return nil, err
   181  	}
   182  	return table.Table, nil
   183  }
   184  
   185  func (g *MSClient) GetTable(ctx context.Context, dbName string, tableName string) (*metastore.Table, error) {
   186  	table, err := g.getTableData(ctx, dbName, tableName)
   187  	if err != nil {
   188  		return nil, err
   189  	}
   190  	return TableGlueToLocal(table), nil
   191  }
   192  
   193  func (g *MSClient) CreateTable(ctx context.Context, tbl *metastore.Table) error {
   194  	table := TableLocalToGlue(tbl)
   195  	dbName := tbl.DBName
   196  	_, err := g.client.CreateTable(ctx,
   197  		&glue.CreateTableInput{
   198  			CatalogId:    aws.String(g.catalogID),
   199  			DatabaseName: aws.String(dbName),
   200  			TableInput:   table,
   201  		})
   202  	return err
   203  }
   204  
   205  func (g *MSClient) GetPartition(ctx context.Context, dbName string, tableName string, values []string) (*metastore.Partition, error) {
   206  	output, err := g.client.GetPartition(ctx,
   207  		&glue.GetPartitionInput{
   208  			CatalogId:       aws.String(g.catalogID),
   209  			DatabaseName:    aws.String(dbName),
   210  			PartitionValues: values,
   211  			TableName:       aws.String(tableName),
   212  		})
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216  	return PartitionGlueToLocal(output.Partition), nil
   217  }
   218  
   219  func (g *MSClient) GetPartitions(ctx context.Context, dbName string, tableName string) ([]*metastore.Partition, error) {
   220  	partitions, err := g.GetAllPartitions(ctx, dbName, tableName)
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  
   225  	return PartitionsGlueToLocal(partitions), nil
   226  }
   227  
   228  func (g *MSClient) getPartitionsFromGlue(ctx context.Context, dbName, tableName string, nextToken *string, maxParts int16) (*glue.GetPartitionsOutput, error) {
   229  	return g.client.GetPartitions(ctx,
   230  		&glue.GetPartitionsInput{
   231  			CatalogId:    aws.String(g.catalogID),
   232  			DatabaseName: aws.String(dbName),
   233  			MaxResults:   aws.Int32(int32(maxParts)),
   234  			NextToken:    nextToken,
   235  			TableName:    aws.String(tableName),
   236  		})
   237  }
   238  
   239  func (g *MSClient) GetAllPartitions(ctx context.Context, dbName, tableName string) ([]types.Partition, error) {
   240  	var nextToken *string
   241  	var allPartitions []types.Partition
   242  	for {
   243  		getPartitionsOutput, err := g.getPartitionsFromGlue(ctx, dbName, tableName, nextToken, MaxParts)
   244  		if err != nil {
   245  			return nil, err
   246  		}
   247  		nextToken = getPartitionsOutput.NextToken
   248  		partitions := getPartitionsOutput.Partitions
   249  		allPartitions = append(allPartitions, partitions...)
   250  		if nextToken == nil {
   251  			break
   252  		}
   253  	}
   254  	return allPartitions, nil
   255  }
   256  
   257  func (g *MSClient) AddPartition(ctx context.Context, tableName string, dbName string, newPartition *metastore.Partition) error {
   258  	gluePartition := PartitionLocalToGlue(newPartition)
   259  	_, err := g.client.CreatePartition(ctx,
   260  		&glue.CreatePartitionInput{
   261  			CatalogId:      aws.String(g.catalogID),
   262  			DatabaseName:   aws.String(dbName),
   263  			PartitionInput: gluePartition,
   264  			TableName:      aws.String(tableName),
   265  		})
   266  	return err
   267  }
   268  
   269  func (g *MSClient) AddPartitions(ctx context.Context, tableName string, dbName string, newParts []*metastore.Partition) error {
   270  	gluePartitions := PartitionsLocalToGlue(newParts)
   271  
   272  	partitionList := make([]types.PartitionInput, 0, len(gluePartitions))
   273  	for _, partition := range gluePartitions {
   274  		partitionList = append(partitionList, types.PartitionInput{
   275  			LastAccessTime:    partition.LastAccessTime,
   276  			LastAnalyzedTime:  partition.LastAnalyzedTime,
   277  			Parameters:        partition.Parameters,
   278  			StorageDescriptor: partition.StorageDescriptor,
   279  			Values:            partition.Values,
   280  		})
   281  	}
   282  	_, err := g.client.BatchCreatePartition(ctx,
   283  		&glue.BatchCreatePartitionInput{
   284  			CatalogId:          aws.String(g.catalogID),
   285  			DatabaseName:       aws.String(dbName),
   286  			PartitionInputList: partitionList,
   287  			TableName:          aws.String(tableName),
   288  		})
   289  
   290  	return err
   291  }
   292  
   293  func (g *MSClient) AlterPartition(ctx context.Context, dbName string, tableName string, partition *metastore.Partition) error {
   294  	// No batch alter partitions we will need to do it one by one
   295  	gluePartition := PartitionLocalToGlue(partition)
   296  
   297  	_, err := g.client.UpdatePartition(ctx,
   298  		&glue.UpdatePartitionInput{
   299  			CatalogId:          aws.String(g.catalogID),
   300  			DatabaseName:       aws.String(dbName),
   301  			PartitionInput:     gluePartition,
   302  			PartitionValueList: gluePartition.Values,
   303  			TableName:          aws.String(tableName),
   304  		})
   305  	return err
   306  }
   307  
   308  func (g *MSClient) AlterPartitions(ctx context.Context, dbName string, tableName string, newPartitions []*metastore.Partition) error {
   309  	// No batch alter partitions we will need to do it one by one
   310  	for _, partition := range newPartitions {
   311  		err := g.AlterPartition(ctx, dbName, tableName, partition)
   312  		if err != nil {
   313  			return err
   314  		}
   315  	}
   316  	return nil
   317  }