github.com/apache/arrow/go/v10@v10.0.1/parquet/cmd/parquet_reader/main.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package main
    18  
    19  import (
    20  	"bufio"
    21  	"encoding/json"
    22  	"fmt"
    23  	"io"
    24  	"log"
    25  	"os"
    26  	"strconv"
    27  	"strings"
    28  
    29  	"github.com/apache/arrow/go/v10/parquet"
    30  	"github.com/apache/arrow/go/v10/parquet/file"
    31  	"github.com/apache/arrow/go/v10/parquet/metadata"
    32  	"github.com/apache/arrow/go/v10/parquet/schema"
    33  	"github.com/docopt/docopt-go"
    34  )
    35  
    36  var version = ""
    37  var usage = `Parquet Reader (version ` + version + `)
    38  Usage:
    39    parquet_reader -h | --help
    40    parquet_reader [--only-metadata] [--no-metadata] [--no-memory-map] [--json] [--csv] [--output=FILE]
    41                   [--print-key-value-metadata] [--int96-timestamp] [--columns=COLUMNS] <file>
    42  Options:
    43    -h --help                     Show this screen.
    44    --print-key-value-metadata    Print out the key-value metadata. [default: false]
    45    --only-metadata               Stop after printing metadata, no values.
    46    --no-metadata                 Do not print metadata.
    47    --output=FILE                 Specify output file for data. [default: -]
    48    --no-memory-map               Disable memory mapping the file.
    49    --int96-timestamp             Parse INT96 as TIMESTAMP for legacy support.
    50    --json                        Format output as JSON instead of text.
    51    --csv                         Format output as CSV instead of text.
    52    --columns=COLUMNS             Specify a subset of columns to print, comma delimited indexes.`
    53  
    54  func main() {
    55  	opts, _ := docopt.ParseDoc(usage)
    56  	var config struct {
    57  		PrintKeyValueMetadata bool
    58  		OnlyMetadata          bool
    59  		NoMetadata            bool
    60  		Output                string
    61  		NoMemoryMap           bool
    62  		JSON                  bool `docopt:"--json"`
    63  		CSV                   bool `docopt:"--csv"`
    64  		ParseInt96AsTimestamp bool `docopt:"--int96-timestamp"`
    65  		Columns               string
    66  		File                  string
    67  	}
    68  	opts.Bind(&config)
    69  
    70  	parseInt96AsTimestamp = config.ParseInt96AsTimestamp
    71  
    72  	var dataOut io.Writer
    73  	dataOut = os.Stdout
    74  	if config.Output != "-" {
    75  		var err error
    76  		fileOut, err := os.Create(config.Output)
    77  		if err != nil {
    78  			fmt.Fprintf(os.Stderr, "error: --output %q cannot be created, %s\n", config.Output, err)
    79  			os.Exit(1)
    80  		}
    81  		bufOut := bufio.NewWriter(fileOut)
    82  		defer func() {
    83  			bufOut.Flush()
    84  			fileOut.Close()
    85  		}()
    86  		dataOut = bufOut
    87  	}
    88  
    89  	if config.CSV && config.JSON {
    90  		fmt.Fprintln(os.Stderr, "error: both --json and --csv outputs selected.")
    91  		os.Exit(1)
    92  	}
    93  
    94  	selectedColumns := []int{}
    95  	if config.Columns != "" {
    96  		for _, c := range strings.Split(config.Columns, ",") {
    97  			cval, err := strconv.Atoi(c)
    98  			if err != nil {
    99  				fmt.Fprintln(os.Stderr, "error: --columns needs to be comma-delimited integers")
   100  				os.Exit(1)
   101  			}
   102  			selectedColumns = append(selectedColumns, cval)
   103  		}
   104  	}
   105  
   106  	rdr, err := file.OpenParquetFile(config.File, !config.NoMemoryMap)
   107  	if err != nil {
   108  		fmt.Fprintln(os.Stderr, "error opening parquet file: ", err)
   109  		os.Exit(1)
   110  	}
   111  
   112  	fileMetadata := rdr.MetaData()
   113  
   114  	if !config.NoMetadata {
   115  		fmt.Println("File name:", config.File)
   116  		fmt.Println("Version:", fileMetadata.Version())
   117  		fmt.Println("Created By:", fileMetadata.GetCreatedBy())
   118  		fmt.Println("Num Rows:", rdr.NumRows())
   119  
   120  		keyvaluemeta := fileMetadata.KeyValueMetadata()
   121  		if config.PrintKeyValueMetadata && keyvaluemeta != nil {
   122  			fmt.Println("Key Value File Metadata:", keyvaluemeta.Len(), "entries")
   123  			keys := keyvaluemeta.Keys()
   124  			values := keyvaluemeta.Values()
   125  			for i := 0; i < keyvaluemeta.Len(); i++ {
   126  				fmt.Printf("Key nr %d %s: %s\n", i, keys[i], values[i])
   127  			}
   128  		}
   129  
   130  		fmt.Println("Number of RowGroups:", rdr.NumRowGroups())
   131  		fmt.Println("Number of Real Columns:", fileMetadata.Schema.Root().NumFields())
   132  		fmt.Println("Number of Columns:", fileMetadata.Schema.NumColumns())
   133  	}
   134  
   135  	if len(selectedColumns) == 0 {
   136  		for i := 0; i < fileMetadata.Schema.NumColumns(); i++ {
   137  			selectedColumns = append(selectedColumns, i)
   138  		}
   139  	} else {
   140  		for _, c := range selectedColumns {
   141  			if c < 0 || c >= fileMetadata.Schema.NumColumns() {
   142  				fmt.Fprintln(os.Stderr, "selected column is out of range")
   143  				os.Exit(1)
   144  			}
   145  		}
   146  	}
   147  
   148  	if !config.NoMetadata {
   149  		fmt.Println("Number of Selected Columns:", len(selectedColumns))
   150  		for _, c := range selectedColumns {
   151  			descr := fileMetadata.Schema.Column(c)
   152  			fmt.Printf("Column %d: %s (%s", c, descr.Path(), descr.PhysicalType())
   153  			if descr.ConvertedType() != schema.ConvertedTypes.None {
   154  				fmt.Printf("/%s", descr.ConvertedType())
   155  				if descr.ConvertedType() == schema.ConvertedTypes.Decimal {
   156  					dec := descr.LogicalType().(*schema.DecimalLogicalType)
   157  					fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale())
   158  				}
   159  			}
   160  			fmt.Print(")\n")
   161  		}
   162  	}
   163  
   164  	for r := 0; r < rdr.NumRowGroups(); r++ {
   165  		if !config.NoMetadata {
   166  			fmt.Println("--- Row Group:", r, " ---")
   167  		}
   168  
   169  		rgr := rdr.RowGroup(r)
   170  		rowGroupMeta := rgr.MetaData()
   171  		if !config.NoMetadata {
   172  			fmt.Println("--- Total Bytes:", rowGroupMeta.TotalByteSize(), " ---")
   173  			fmt.Println("--- Rows:", rgr.NumRows(), " ---")
   174  		}
   175  
   176  		for _, c := range selectedColumns {
   177  			chunkMeta, err := rowGroupMeta.ColumnChunk(c)
   178  			if err != nil {
   179  				log.Fatal(err)
   180  			}
   181  
   182  			if !config.NoMetadata {
   183  				fmt.Println("Column", c)
   184  				if set, _ := chunkMeta.StatsSet(); set {
   185  					stats, err := chunkMeta.Statistics()
   186  					if err != nil {
   187  						log.Fatal(err)
   188  					}
   189  					fmt.Printf(" Values: %d", chunkMeta.NumValues())
   190  					if stats.HasMinMax() {
   191  						fmt.Printf(", Min: %v, Max: %v",
   192  							metadata.GetStatValue(stats.Type(), stats.EncodeMin()),
   193  							metadata.GetStatValue(stats.Type(), stats.EncodeMax()))
   194  					}
   195  					if stats.HasNullCount() {
   196  						fmt.Printf(", Null Values: %d", stats.NullCount())
   197  					}
   198  					if stats.HasDistinctCount() {
   199  						fmt.Printf(", Distinct Values: %d", stats.DistinctCount())
   200  					}
   201  					fmt.Println()
   202  				} else {
   203  					fmt.Println(" Values:", chunkMeta.NumValues(), "Statistics Not Set")
   204  				}
   205  
   206  				fmt.Print(" Compression: ", chunkMeta.Compression())
   207  				fmt.Print(", Encodings:")
   208  				for _, enc := range chunkMeta.Encodings() {
   209  					fmt.Print(" ", enc)
   210  				}
   211  				fmt.Println()
   212  
   213  				fmt.Print(" Uncompressed Size: ", chunkMeta.TotalUncompressedSize())
   214  				fmt.Println(", Compressed Size:", chunkMeta.TotalCompressedSize())
   215  			}
   216  		}
   217  
   218  		if config.OnlyMetadata {
   219  			continue
   220  		}
   221  
   222  		if !config.NoMetadata {
   223  			fmt.Println("--- Values ---")
   224  		}
   225  
   226  		switch {
   227  		case config.JSON:
   228  			fmt.Fprint(dataOut, "[")
   229  
   230  			scanners := make([]*Dumper, len(selectedColumns))
   231  			fields := make([]string, len(selectedColumns))
   232  			for idx, c := range selectedColumns {
   233  				col, err := rgr.Column(c)
   234  				if err != nil {
   235  					log.Fatalf("unable to fetch column=%d err=%s", c, err)
   236  				}
   237  				scanners[idx] = createDumper(col)
   238  				fields[idx] = col.Descriptor().Path()
   239  			}
   240  
   241  			var line string
   242  			for {
   243  				if line == "" {
   244  					line = "\n  {"
   245  				} else {
   246  					line = ",\n  {"
   247  				}
   248  
   249  				data := false
   250  				first := true
   251  				for idx, s := range scanners {
   252  					if val, ok := s.Next(); ok {
   253  						if !data {
   254  							fmt.Fprint(dataOut, line)
   255  						}
   256  						data = true
   257  						if val == nil {
   258  							continue
   259  						}
   260  						if !first {
   261  							fmt.Fprint(dataOut, ",")
   262  						}
   263  						first = false
   264  						switch val.(type) {
   265  						case bool, int32, int64, float32, float64:
   266  						default:
   267  							val = s.FormatValue(val, 0)
   268  						}
   269  						jsonVal, err := json.Marshal(val)
   270  						if err != nil {
   271  							fmt.Fprintf(os.Stderr, "error: marshalling json for %+v, %s\n", val, err)
   272  							os.Exit(1)
   273  						}
   274  						fmt.Fprintf(dataOut, "\n    %q: %s", fields[idx], jsonVal)
   275  					}
   276  				}
   277  				if !data {
   278  					break
   279  				}
   280  				fmt.Fprint(dataOut, "\n  }")
   281  			}
   282  
   283  			fmt.Fprintln(dataOut, "\n]")
   284  		case config.CSV:
   285  			scanners := make([]*Dumper, len(selectedColumns))
   286  			for idx, c := range selectedColumns {
   287  				if idx > 0 {
   288  					fmt.Fprint(dataOut, ",")
   289  				}
   290  				col, err := rgr.Column(c)
   291  				if err != nil {
   292  					log.Fatalf("unable to fetch col=%d err=%s", c, err)
   293  				}
   294  				scanners[idx] = createDumper(col)
   295  				fmt.Fprintf(dataOut, "%q", col.Descriptor().Path())
   296  			}
   297  			fmt.Fprintln(dataOut)
   298  
   299  			var line string
   300  			for {
   301  				data := false
   302  				for idx, s := range scanners {
   303  					if idx > 0 {
   304  						if data {
   305  							fmt.Fprint(dataOut, ",")
   306  						} else {
   307  							line += ","
   308  						}
   309  					}
   310  					if val, ok := s.Next(); ok {
   311  						if !data {
   312  							fmt.Fprint(dataOut, line)
   313  						}
   314  						data = true
   315  						if val == nil {
   316  							fmt.Fprint(dataOut, "")
   317  							continue
   318  						}
   319  						switch val.(type) {
   320  						case bool, int32, int64, parquet.Int96, float32, float64:
   321  							fmt.Fprintf(dataOut, "%v", val)
   322  						default:
   323  							fmt.Fprintf(dataOut, "%q", s.FormatValue(val, 0))
   324  						}
   325  					} else {
   326  						if data {
   327  							fmt.Fprint(dataOut, ",")
   328  						} else {
   329  							line += ","
   330  						}
   331  					}
   332  				}
   333  				if !data {
   334  					break
   335  				}
   336  				fmt.Fprintln(dataOut)
   337  				line = ""
   338  			}
   339  			fmt.Fprintln(dataOut)
   340  		default:
   341  			const colwidth = 18
   342  
   343  			scanners := make([]*Dumper, len(selectedColumns))
   344  			for idx, c := range selectedColumns {
   345  				col, err := rgr.Column(c)
   346  				if err != nil {
   347  					log.Fatalf("unable to fetch column=%d err=%s", c, err)
   348  				}
   349  				scanners[idx] = createDumper(col)
   350  				fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), col.Descriptor().Name())
   351  			}
   352  			fmt.Fprintln(dataOut)
   353  
   354  			var line string
   355  			for {
   356  				data := false
   357  				for _, s := range scanners {
   358  					if val, ok := s.Next(); ok {
   359  						if !data {
   360  							fmt.Fprint(dataOut, line)
   361  						}
   362  						fmt.Fprint(dataOut, s.FormatValue(val, colwidth), "|")
   363  						data = true
   364  					} else {
   365  						if data {
   366  							fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), "")
   367  						} else {
   368  							line += fmt.Sprintf(fmt.Sprintf("%%-%ds|", colwidth), "")
   369  						}
   370  					}
   371  				}
   372  				if !data {
   373  					break
   374  				}
   375  				fmt.Fprintln(dataOut)
   376  				line = ""
   377  			}
   378  			fmt.Fprintln(dataOut)
   379  		}
   380  	}
   381  }