github.com/fraugster/parquet-go@v0.12.0/cmd/parquet-tool/cmds/split.go

package cmds

import (
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"

	goparquet "github.com/fraugster/parquet-go"
	"github.com/fraugster/parquet-go/parquet"
	"github.com/spf13/cobra"
)

var (
	partSize          *string
	targetFolder      *string
	rowGroupSize      *string
	compressionMethod *string
)

func init() {
	partSize = splitFile.PersistentFlags().StringP("file-size", "s", "100MB", "Target size of each output parquet file; the actual output size is approximate")
	targetFolder = splitFile.PersistentFlags().StringP("target-folder", "t", "", "Target folder to write the files; use the source file folder if it's empty")
	rowGroupSize = splitFile.PersistentFlags().StringP("row-group-size", "r", "128MB", "Uncompressed row group size")
	compressionMethod = splitFile.PersistentFlags().StringP("compression", "c", "Snappy", "Compression method; valid values are Snappy, Gzip, None")
	rootCmd.AddCommand(splitFile)
}

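// splitFile is the "split" sub-command: it reads a single parquet file and
// rewrites its rows into numbered part files (part_1.parquet, part_2.parquet,
// ...) of roughly the requested size. Example invocation, assuming the
// parquet-tool binary built from this repository:
//
//	parquet-tool split -s 250MB -c Gzip -t ./parts input.parquet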
var splitFile = &cobra.Command{
	Use:   "split file-name.parquet",
	Short: "Split a parquet file into multiple smaller parquet files",
	Run: func(cmd *cobra.Command, args []string) {
		if len(args) != 1 {
			_ = cmd.Usage()
			os.Exit(1)
		}

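		// humanToByte is a helper defined elsewhere in this package; it parses
		// human-readable sizes such as "128MB" into a byte count.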
		rgSize, err := humanToByte(*rowGroupSize)
		if err != nil {
			log.Fatalf("Invalid row group size: %q", *rowGroupSize)
		}

		pSize, err := humanToByte(*partSize)
		if err != nil {
			log.Fatalf("Invalid file size: %q", *partSize)
		}

		comp := parquet.CompressionCodec_UNCOMPRESSED
		switch strings.ToUpper(*compressionMethod) {
		case "SNAPPY":
			comp = parquet.CompressionCodec_SNAPPY
		case "GZIP":
			comp = parquet.CompressionCodec_GZIP
		case "NONE":
			comp = parquet.CompressionCodec_UNCOMPRESSED
		default:
			log.Fatalf("Invalid compression codec: %q", *compressionMethod)
		}

		fl, err := os.Open(args[0])
		if err != nil {
			log.Fatalf("Cannot open the file: %q", err)
		}
		defer fl.Close()

		reader, err := goparquet.NewFileReader(fl)
		if err != nil {
			log.Fatalf("Could not create parquet reader: %q", err)
		}

		opts := []goparquet.FileWriterOption{
			goparquet.WithSchemaDefinition(reader.GetSchemaDefinition()),
			goparquet.WithCompressionCodec(comp),
			goparquet.WithMaxRowGroupSize(rgSize),
		}

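		// Each iteration writes one part file; copyData reports true once the
		// reader has been fully drained, which ends the loop.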
		for i := 1; ; i++ {
			path := filepath.Join(*targetFolder, fmt.Sprintf("part_%d.parquet", i))
			ok, err := copyData(reader, path, pSize, opts...)
			if err != nil {
				log.Fatalf("Writing part failed: %q", err)
			}

			if ok {
				break
			}
		}
	},
}

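// copyData streams rows from reader into a new parquet file at path. It
// returns true when the reader is exhausted (the input is fully copied) and
// false when the output file reached the requested size and another part
// is still needed.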
func copyData(reader *goparquet.FileReader, path string, size int64, opts ...goparquet.FileWriterOption) (bool, error) {
	fl, err := os.Create(path)
	if err != nil {
		return false, err
	}
	defer fl.Close()

	writer := goparquet.NewFileWriter(fl, opts...)
	for {
		row, err := reader.NextRow()
		if err == io.EOF {
			return true, writer.Close()
		}
		if err != nil {
			return false, err
		}
		if err := writer.AddData(row); err != nil {
			return false, err
		}

		if writer.CurrentFileSize() >= size {
			return false, writer.Close()
		}
	}
}
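
// countRows is an illustrative sketch, not part of the original tool: it
// counts the rows in a single parquet file using the same goparquet reader
// API used above. Summing the counts of all part_N.parquet files should
// match the row count of the source file that was split.
func countRows(path string) (int64, error) {
	fl, err := os.Open(path)
	if err != nil {
		return 0, err
	}
	defer fl.Close()

	reader, err := goparquet.NewFileReader(fl)
	if err != nil {
		return 0, err
	}

	var n int64
	for {
		_, err := reader.NextRow()
		if err == io.EOF {
			return n, nil
		}
		if err != nil {
			return n, err
		}
		n++
	}
}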