github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/mvdata/file_data_loc.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mvdata
    16  
    17  import (
    18  	"context"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"strings"
    24  
    25  	"github.com/dolthub/dolt/go/libraries/doltcore/env"
    26  
    27  	"github.com/dolthub/dolt/go/libraries/doltcore/doltdb"
    28  	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
    29  	"github.com/dolthub/dolt/go/libraries/doltcore/table"
    30  	"github.com/dolthub/dolt/go/libraries/doltcore/table/editor"
    31  	"github.com/dolthub/dolt/go/libraries/doltcore/table/typed/json"
    32  	"github.com/dolthub/dolt/go/libraries/doltcore/table/typed/parquet"
    33  	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/csv"
    34  	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/sqlexport"
    35  	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped/xlsx"
    36  	"github.com/dolthub/dolt/go/libraries/utils/filesys"
    37  )
    38  
    39  // DFFromString returns a data object from a string.
    40  func DFFromString(dfStr string) DataFormat {
    41  	switch strings.ToLower(dfStr) {
    42  	case "csv", ".csv":
    43  		return CsvFile
    44  	case "psv", ".psv":
    45  		return PsvFile
    46  	case "xlsx", ".xlsx":
    47  		return XlsxFile
    48  	case "json", ".json":
    49  		return JsonFile
    50  	case "sql", ".sql":
    51  		return SqlFile
    52  	case "parquet", ".parquet":
    53  		return ParquetFile
    54  	default:
    55  		return InvalidDataFormat
    56  	}
    57  }
    58  
// FileDataLocation is a file that can be imported from or exported to.
type FileDataLocation struct {
	// Path is the path of the file on the filesystem.
	Path string

	// Format is the DataFormat of the file; it selects which reader or writer
	// implementation is used for the file.
	Format DataFormat
}
    67  
    68  // String returns a string representation of the data location.
    69  func (dl FileDataLocation) String() string {
    70  	return dl.Format.ReadableStr() + ":" + dl.Path
    71  }
    72  
    73  // Exists returns true if the DataLocation already exists
    74  func (dl FileDataLocation) Exists(ctx context.Context, root doltdb.RootValue, fs filesys.ReadableFS) (bool, error) {
    75  	exists, _ := fs.Exists(dl.Path)
    76  	return exists, nil
    77  }
    78  
    79  // NewReader creates a TableReadCloser for the DataLocation
    80  func (dl FileDataLocation) NewReader(ctx context.Context, dEnv *env.DoltEnv, opts interface{}) (rdCl table.SqlRowReader, sorted bool, err error) {
    81  	fs := dEnv.FS
    82  	root, err := dEnv.WorkingRoot(ctx)
    83  	if err != nil {
    84  		return nil, false, err
    85  	}
    86  
    87  	exists, isDir := fs.Exists(dl.Path)
    88  
    89  	if !exists {
    90  		return nil, false, os.ErrNotExist
    91  	} else if isDir {
    92  		return nil, false, filesys.ErrIsDir
    93  	}
    94  
    95  	switch dl.Format {
    96  	case CsvFile:
    97  		delim := ","
    98  
    99  		if opts != nil {
   100  			csvOpts, _ := opts.(CsvOptions)
   101  
   102  			if len(csvOpts.Delim) != 0 {
   103  				delim = csvOpts.Delim
   104  			}
   105  		}
   106  
   107  		rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim(delim))
   108  
   109  		return rd, false, err
   110  
   111  	case PsvFile:
   112  		rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim("|"))
   113  		return rd, false, err
   114  
   115  	case XlsxFile:
   116  		xlsxOpts := opts.(XlsxOptions)
   117  		rd, err := xlsx.OpenXLSXReader(ctx, root.VRW(), dl.Path, fs, &xlsx.XLSXFileInfo{SheetName: xlsxOpts.SheetName})
   118  		return rd, false, err
   119  
   120  	case JsonFile:
   121  		var sch schema.Schema
   122  		jsonOpts, _ := opts.(JSONOptions)
   123  		if jsonOpts.SchFile != "" {
   124  			tn, s, err := SchAndTableNameFromFile(ctx, jsonOpts.SchFile, dEnv)
   125  			if err != nil {
   126  				return nil, false, err
   127  			}
   128  			if tn != jsonOpts.TableName {
   129  				return nil, false, fmt.Errorf("table name '%s' from schema file %s does not match table arg '%s'", tn, jsonOpts.SchFile, jsonOpts.TableName)
   130  			}
   131  			sch = s
   132  		} else {
   133  			if opts == nil {
   134  				return nil, false, errors.New("Unable to determine table name on JSON import")
   135  			}
   136  			tbl, exists, err := root.GetTable(context.TODO(), doltdb.TableName{Name: jsonOpts.TableName})
   137  			if !exists {
   138  				return nil, false, fmt.Errorf("The following table could not be found:\n%v", jsonOpts.TableName)
   139  			}
   140  			if err != nil {
   141  				return nil, false, fmt.Errorf("An error occurred attempting to read the table:\n%v", err.Error())
   142  			}
   143  			sch, err = tbl.GetSchema(context.TODO())
   144  			if err != nil {
   145  				return nil, false, fmt.Errorf("An error occurred attempting to read the table schema:\n%v", err.Error())
   146  			}
   147  		}
   148  
   149  		rd, err := json.OpenJSONReader(root.VRW(), dl.Path, fs, sch)
   150  		return rd, false, err
   151  
   152  	case ParquetFile:
   153  		var tableSch schema.Schema
   154  		parquetOpts, _ := opts.(ParquetOptions)
   155  		if parquetOpts.SchFile != "" {
   156  			tn, s, tnErr := SchAndTableNameFromFile(ctx, parquetOpts.SchFile, dEnv)
   157  			if tnErr != nil {
   158  				return nil, false, tnErr
   159  			}
   160  			if tn != parquetOpts.TableName {
   161  				return nil, false, fmt.Errorf("table name '%s' from schema file %s does not match table arg '%s'", tn, parquetOpts.SchFile, parquetOpts.TableName)
   162  			}
   163  			tableSch = s
   164  		} else {
   165  			if opts == nil {
   166  				return nil, false, errors.New("Unable to determine table name on JSON import")
   167  			}
   168  			tbl, tableExists, tErr := root.GetTable(context.TODO(), doltdb.TableName{Name: parquetOpts.TableName})
   169  			if !tableExists {
   170  				return nil, false, fmt.Errorf("The following table could not be found:\n%v", parquetOpts.TableName)
   171  			}
   172  			if tErr != nil {
   173  				return nil, false, fmt.Errorf("An error occurred attempting to read the table:\n%v", err.Error())
   174  			}
   175  			tableSch, err = tbl.GetSchema(context.TODO())
   176  			if err != nil {
   177  				return nil, false, fmt.Errorf("An error occurred attempting to read the table schema:\n%v", err.Error())
   178  			}
   179  		}
   180  		rd, rErr := parquet.OpenParquetReader(root.VRW(), dl.Path, tableSch)
   181  		return rd, false, rErr
   182  	}
   183  
   184  	return nil, false, errors.New("unsupported format")
   185  }
   186  
   187  // NewCreatingWriter will create a TableWriteCloser for a DataLocation that will create a new table, or overwrite
   188  // an existing table.
   189  func (dl FileDataLocation) NewCreatingWriter(ctx context.Context, mvOpts DataMoverOptions, root doltdb.RootValue, outSch schema.Schema, opts editor.Options, wr io.WriteCloser) (table.SqlRowWriter, error) {
   190  	switch dl.Format {
   191  	case CsvFile:
   192  		return csv.NewCSVWriter(wr, outSch, csv.NewCSVInfo())
   193  	case PsvFile:
   194  		return csv.NewCSVWriter(wr, outSch, csv.NewCSVInfo().SetDelim("|"))
   195  	case XlsxFile:
   196  		panic("writing to xlsx files is not supported yet")
   197  	case JsonFile:
   198  		return json.NewJSONWriter(wr, outSch)
   199  	case SqlFile:
   200  		if mvOpts.IsBatched() {
   201  			return sqlexport.OpenBatchedSQLExportWriter(ctx, wr, root, mvOpts.SrcName(), mvOpts.IsAutocommitOff(), outSch, opts)
   202  		} else {
   203  			return sqlexport.OpenSQLExportWriter(ctx, wr, root, mvOpts.SrcName(), mvOpts.IsAutocommitOff(), outSch, opts)
   204  		}
   205  	case ParquetFile:
   206  		return parquet.NewParquetRowWriterForFile(outSch, mvOpts.DestName())
   207  	}
   208  
   209  	panic("Invalid Data Format." + string(dl.Format))
   210  }