github.com/fraugster/parquet-go@v0.12.0/cmd/parquet-tool/cmds/split.go (about) 1 package cmds 2 3 import ( 4 "fmt" 5 "io" 6 "log" 7 "os" 8 "path/filepath" 9 "strings" 10 11 goparquet "github.com/fraugster/parquet-go" 12 "github.com/fraugster/parquet-go/parquet" 13 "github.com/spf13/cobra" 14 ) 15 16 var ( 17 partSize *string 18 targetFolder *string 19 rowGroupSize *string 20 compressionMethod *string 21 ) 22 23 func init() { 24 partSize = splitFile.PersistentFlags().StringP("file-size", "s", "100MB", "The target size of parquet files, it is not the *exact* size on the output") 25 targetFolder = splitFile.PersistentFlags().StringP("target-folder", "t", "", "Target folder to write the files, use the source file folder if it's empty") 26 rowGroupSize = splitFile.PersistentFlags().StringP("row-group-size", "r", "128MB", "Uncompressed row group size") 27 compressionMethod = splitFile.PersistentFlags().StringP("compression", "c", "Snappy", "Compression method, valid values are Snappy, Gzip, None") 28 rootCmd.AddCommand(splitFile) 29 } 30 31 var splitFile = &cobra.Command{ 32 Use: "split file-name.parquet", 33 Short: "Split the parquet file into multiple parquet files", 34 Run: func(cmd *cobra.Command, args []string) { 35 if len(args) != 1 { 36 _ = cmd.Usage() 37 os.Exit(1) 38 } 39 40 rgSize, err := humanToByte(*rowGroupSize) 41 if err != nil { 42 log.Fatalf("Invalid row group size: %q", *rowGroupSize) 43 } 44 45 pSize, err := humanToByte(*partSize) 46 if err != nil { 47 log.Fatalf("Invalid file size: %q", *partSize) 48 } 49 50 comp := parquet.CompressionCodec_UNCOMPRESSED 51 switch strings.ToUpper(*compressionMethod) { 52 case "SNAPPY": 53 comp = parquet.CompressionCodec_SNAPPY 54 case "GZIP": 55 comp = parquet.CompressionCodec_GZIP 56 case "NONE": 57 comp = parquet.CompressionCodec_UNCOMPRESSED 58 default: 59 log.Fatalf("Invalid compression codec: %q", *rowGroupSize) 60 } 61 62 fl, err := os.Open(args[0]) 63 if err != nil { 64 log.Fatalf("Can not open the file: %q", err) 65 } 66 defer fl.Close() 67 68 reader, err := goparquet.NewFileReader(fl) 69 if err != nil { 70 log.Fatalf("could not create parquet reader: %q", err) 71 } 72 73 opts := []goparquet.FileWriterOption{ 74 goparquet.WithSchemaDefinition(reader.GetSchemaDefinition()), 75 goparquet.WithCompressionCodec(comp), 76 goparquet.WithMaxRowGroupSize(rgSize), 77 } 78 79 for i := 1; ; i++ { 80 path := filepath.Join(*targetFolder, fmt.Sprintf("part_%d.parquet", i)) 81 ok, err := copyData(reader, path, pSize, opts...) 82 if err != nil { 83 log.Fatalf("Writing part failed: %q", err) 84 } 85 86 if ok { 87 break 88 } 89 } 90 }, 91 } 92 93 func copyData(reader *goparquet.FileReader, path string, size int64, opts ...goparquet.FileWriterOption) (bool, error) { 94 fl, err := os.Create(path) 95 if err != nil { 96 return false, err 97 } 98 defer fl.Close() 99 100 writer := goparquet.NewFileWriter(fl, opts...) 101 for { 102 row, err := reader.NextRow() 103 if err == io.EOF { 104 return true, writer.Close() 105 } 106 if err != nil { 107 return false, err 108 } 109 if err := writer.AddData(row); err != nil { 110 return false, err 111 } 112 113 if writer.CurrentFileSize() >= size { 114 return false, writer.Close() 115 } 116 } 117 }