github.com/treeverse/lakefs@v1.24.1-0.20240520134607-95648127bfb0/pkg/actions/lua/formats/delta.go (about)

     1  package formats
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"net/url"
     8  	"regexp"
     9  
    10  	"github.com/Shopify/go-lua"
    11  	"github.com/aws/aws-sdk-go-v2/aws"
    12  	delta "github.com/csimplestring/delta-go"
    13  	"github.com/csimplestring/delta-go/action"
    14  	"github.com/csimplestring/delta-go/storage"
    15  	deltaStore "github.com/csimplestring/delta-go/store"
    16  	luautil "github.com/treeverse/lakefs/pkg/actions/lua/util"
    17  )
    18  
    19  type storageType string
    20  
    21  const (
    22  	s3StorageType storageType = "s3"
    23  )
    24  
// errUnimplementedProvided is returned when the access properties handed to
// the client do not belong to any storage provider this file knows how to
// read a Delta table from.
var errUnimplementedProvided = errors.New("unimplemented provider")
    26  
// DeltaClient fetches the version log and metadata of Delta Lake tables so
// they can be handed to Lua scripts.
type DeltaClient struct {
	// accessProvider supplies the storage access properties that decide which
	// backend is used when a table is fetched.
	accessProvider AccessProvider
	// ctx is the context captured when the client was constructed.
	// NOTE(review): no code visible in this file reads it.
	ctx            context.Context
}
    31  
    32  func newDeltaTableMetadata(meta *action.Metadata) map[string]any {
    33  	return map[string]any{
    34  		"description":       meta.Description,
    35  		"id":                meta.ID,
    36  		"name":              meta.Name,
    37  		"schema_string":     meta.SchemaString,
    38  		"partition_columns": meta.PartitionColumns,
    39  		"configuration":     meta.Configuration,
    40  		"created_time":      *meta.CreatedTime,
    41  	}
    42  }
    43  
    44  func (dc *DeltaClient) fetchS3Table(repo, ref, prefix string, awsProps *storage.AWSProperties) (map[int64][]string, map[string]any, error) {
    45  	table, err := dc.getS3DeltaTable(repo, ref, prefix, awsProps)
    46  	if err != nil {
    47  		return nil, nil, err
    48  	}
    49  	log, err := dc.buildLog(table)
    50  	if err != nil {
    51  		return nil, nil, err
    52  	}
    53  	meta, err := dc.getTableMetadata(table)
    54  	if err != nil {
    55  		return nil, nil, err
    56  	}
    57  	return log, meta, nil
    58  }
    59  
    60  func (dc *DeltaClient) getTableMetadata(log delta.Log) (map[string]any, error) {
    61  	s, err := log.Snapshot()
    62  	if err != nil {
    63  		return nil, err
    64  	}
    65  	m, err := s.Metadata()
    66  	if err != nil {
    67  		return nil, err
    68  	}
    69  	return newDeltaTableMetadata(m), nil
    70  }
    71  
    72  func (dc *DeltaClient) getS3DeltaTable(repo, ref, prefix string, awsProps *storage.AWSProperties) (delta.Log, error) {
    73  	config := delta.Config{StoreType: string(s3StorageType)}
    74  	u := fmt.Sprintf("lakefs://%s/%s/%s", repo, ref, prefix)
    75  	parsedURL, err := url.Parse(u)
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  	s3LogStore, err := deltaStore.NewS3CompatLogStore(awsProps, parsedURL)
    80  	if err != nil {
    81  		return nil, err
    82  	}
    83  	store := deltaStore.Store(s3LogStore)
    84  	return delta.ForTableWithStore(u, config, &delta.SystemClock{}, &store)
    85  }
    86  
    87  func (dc *DeltaClient) buildLog(table delta.Log) (map[int64][]string, error) {
    88  	s, err := table.Snapshot()
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  	version, err := s.EarliestVersion()
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  	versionLog, err := table.Changes(version, false)
    97  	if err != nil {
    98  		return nil, err
    99  	}
   100  
   101  	entries := make(map[int64][]string)
   102  	for entry, err := versionLog.Next(); err == nil; entry, err = versionLog.Next() {
   103  		strLog := make([]string, 0)
   104  		entryVersion := entry.Version()
   105  		actions, aErr := entry.Actions()
   106  		if aErr != nil {
   107  			return nil, aErr
   108  		}
   109  		for _, a := range actions {
   110  			aj, _ := a.Json()
   111  			strLog = append(strLog, aj)
   112  		}
   113  		entries[entryVersion] = strLog
   114  	}
   115  	return entries, nil
   116  }
   117  
   118  func (dc *DeltaClient) fetchTableLog(repo, ref, prefix string) (map[int64][]string, map[string]any, error) {
   119  	ap, _ := dc.accessProvider.GetAccessProperties()
   120  	switch access := ap.(type) {
   121  	case AWSInfo:
   122  		return dc.fetchS3Table(repo, ref, prefix, &access.AWSProps)
   123  	default:
   124  		return nil, nil, errUnimplementedProvided
   125  	}
   126  }
   127  
   128  func getTable(client *DeltaClient) lua.Function {
   129  	return func(l *lua.State) int {
   130  		repo := lua.CheckString(l, 1)
   131  		ref := lua.CheckString(l, 2)
   132  		prefix := lua.CheckString(l, 3)
   133  		tableLog, metadata, err := client.fetchTableLog(repo, ref, prefix)
   134  		if err != nil {
   135  			lua.Errorf(l, "%s", err.Error())
   136  			panic("failed fetching table log")
   137  		}
   138  		luautil.DeepPush(l, tableLog)
   139  		luautil.DeepPush(l, metadata)
   140  		return 2
   141  	}
   142  }
   143  
// functions maps each Lua-visible function name to a constructor that binds
// the function to a specific DeltaClient.
var functions = map[string]func(client *DeltaClient) lua.Function{
	"get_table": getTable,
}
   147  
// AccessProvider supplies the access properties a storage provider expects.
// The concrete type of the returned value identifies the provider; callers
// type-switch on it to pick the matching backend.
type AccessProvider interface {
	// GetAccessProperties returns the provider-specific access properties,
	// or an error if they cannot be obtained.
	GetAccessProperties() (interface{}, error)
}
   152  
// AWSInfo is the AccessProvider for S3-compatible storage.
type AWSInfo struct {
	// AWSProps holds the delta-go storage properties used to open the table's
	// log store.
	AWSProps storage.AWSProperties
}
   156  
   157  func (awsI AWSInfo) GetAccessProperties() (interface{}, error) {
   158  	return awsI, nil
   159  }
   160  
   161  // newDelta is a factory function to create server/cloud specific Delta Lake client
   162  // lakeFSAddr is the domain or "authority:port" of the running lakeFS server
   163  func newDelta(ctx context.Context, lakeFSAddr string) lua.Function {
   164  	if regexp.MustCompile(`^:\d+`).MatchString(lakeFSAddr) {
   165  		// workaround in case we listen on all interfaces without specifying ip
   166  		lakeFSAddr = fmt.Sprintf("localhost%s", lakeFSAddr)
   167  	}
   168  	lakeFSAddr = fmt.Sprintf("http://%s", lakeFSAddr)
   169  	return func(l *lua.State) int {
   170  		client := newS3DeltaClient(l, ctx, lakeFSAddr)
   171  		l.NewTable()
   172  		for name, goFn := range functions {
   173  			l.PushGoFunction(goFn(client))
   174  			l.SetField(-2, name)
   175  		}
   176  		return 1
   177  	}
   178  }
   179  
   180  func newS3DeltaClient(l *lua.State, ctx context.Context, lakeFSAddr string) *DeltaClient {
   181  	accessKeyID := lua.CheckString(l, 1)
   182  	secretAccessKey := lua.CheckString(l, 2)
   183  	awsProps := storage.AWSProperties{
   184  		ForcePathStyle: true,
   185  		CredsProvider: aws.CredentialsProviderFunc(func(context.Context) (aws.Credentials, error) {
   186  			return aws.Credentials{
   187  				AccessKeyID:     accessKeyID,
   188  				SecretAccessKey: secretAccessKey,
   189  			}, nil
   190  		}),
   191  		Endpoint: lakeFSAddr,
   192  	}
   193  	if !l.IsNone(3) {
   194  		awsProps.Region = lua.CheckString(l, 3)
   195  	}
   196  
   197  	storage.RegisterS3CompatBucketURLOpener("lakefs", &awsProps)
   198  
   199  	return &DeltaClient{accessProvider: AWSInfo{AWSProps: awsProps}, ctx: ctx}
   200  }