github.com/apache/arrow/go/v7@v7.0.1/parquet/cmd/parquet_reader/main.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

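// Command parquet_reader prints the metadata and column values of a Parquet
// file to standard output.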
package main

import (
	"fmt"
	"log"
	"os"
	"strconv"
	"strings"

	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/metadata"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"github.com/docopt/docopt-go"
)

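// usage is the docopt grammar for the command: docopt derives the flag
// parsing and the --help output from this text.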
const usage = `Parquet Reader.
Usage:
  parquet_reader -h | --help
  parquet_reader [--only-metadata] [--no-memory-map] [--json]
                 [--print-key-value-metadata] [--columns=COLUMNS] <file>
Options:
  -h --help                     Show this screen.
  --print-key-value-metadata    Print out the key-value metadata [default: false]
  --only-metadata               Stop after printing metadata, no values.
  --no-memory-map               Disable memory mapping the file.
  --json                        Format output as JSON instead of text.
  --columns=COLUMNS             Specify a subset of columns to print, comma delimited indexes.`

func main() {
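	// Parse the arguments described by usage and bind the matched options
	// onto a typed config struct.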
	opts, _ := docopt.ParseDoc(usage)
	var config struct {
		PrintKeyValueMetadata bool
		OnlyMetadata          bool
		NoMemoryMap           bool
		JSON                  bool `docopt:"--json"`
		Columns               string
		File                  string
	}
	if err := opts.Bind(&config); err != nil {
		log.Fatal(err)
	}

	if config.JSON {
		fmt.Fprintln(os.Stderr, "error: json output not implemented yet! falling back to regular")
	}

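	// --columns is a comma-delimited list of column indices to print;
	// anything non-numeric is rejected up front.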
	selectedColumns := []int{}
	if config.Columns != "" {
		for _, c := range strings.Split(config.Columns, ",") {
			cval, err := strconv.Atoi(c)
			if err != nil {
				fmt.Fprintln(os.Stderr, "error: --columns needs to be comma-delimited integers")
				os.Exit(1)
			}
			selectedColumns = append(selectedColumns, cval)
		}
	}

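	// Open the Parquet file, memory-mapping it unless --no-memory-map was given.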
	rdr, err := file.OpenParquetFile(config.File, !config.NoMemoryMap, nil, nil)
	if err != nil {
		fmt.Fprintln(os.Stderr, "error opening parquet file:", err)
		os.Exit(1)
	}
	defer rdr.Close()

	fileMetadata := rdr.MetaData()

	fmt.Println("File name:", config.File)
	fmt.Println("Version:", fileMetadata.Version())
	fmt.Println("Created By:", fileMetadata.GetCreatedBy())
	fmt.Println("Num Rows:", rdr.NumRows())

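	// Dump the file-level key/value metadata when --print-key-value-metadata is set.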
	keyvaluemeta := fileMetadata.KeyValueMetadata()
	if config.PrintKeyValueMetadata && keyvaluemeta != nil {
		fmt.Println("Key Value File Metadata:", keyvaluemeta.Len(), "entries")
		keys := keyvaluemeta.Keys()
		values := keyvaluemeta.Values()
		for i := 0; i < keyvaluemeta.Len(); i++ {
			fmt.Printf("Key nr %d %s: %s\n", i, keys[i], values[i])
		}
	}

	fmt.Println("Number of RowGroups:", rdr.NumRowGroups())
	fmt.Println("Number of Real Columns:", fileMetadata.Schema.Root().NumFields())
	fmt.Println("Number of Columns:", fileMetadata.Schema.NumColumns())

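	// With no --columns flag, select every column; otherwise reject indices
	// that fall outside the schema.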
	if len(selectedColumns) == 0 {
		for i := 0; i < fileMetadata.Schema.NumColumns(); i++ {
			selectedColumns = append(selectedColumns, i)
		}
	} else {
		for _, c := range selectedColumns {
			if c < 0 || c >= fileMetadata.Schema.NumColumns() {
				fmt.Fprintln(os.Stderr, "selected column is out of range")
				os.Exit(1)
			}
		}
	}

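	// Describe each selected column: its dotted path, physical type, and any
	// converted type (with precision and scale for decimals).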
	fmt.Println("Number of Selected Columns:", len(selectedColumns))
	for _, c := range selectedColumns {
		descr := fileMetadata.Schema.Column(c)
		fmt.Printf("Column %d: %s (%s", c, descr.Path(), descr.PhysicalType())
		if descr.ConvertedType() != schema.ConvertedTypes.None {
			fmt.Printf("/%s", descr.ConvertedType())
			if descr.ConvertedType() == schema.ConvertedTypes.Decimal {
				dec := descr.LogicalType().(*schema.DecimalLogicalType)
				fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale())
			}
		}
		fmt.Print(")\n")
	}

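	// Walk every row group: print its column-chunk metadata and then, unless
	// --only-metadata was given, dump the values themselves.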
	for r := 0; r < rdr.NumRowGroups(); r++ {
		fmt.Println("--- Row Group:", r, " ---")

		rgr := rdr.RowGroup(r)
		rowGroupMeta := rgr.MetaData()
		fmt.Println("--- Total Bytes:", rowGroupMeta.TotalByteSize(), " ---")
		fmt.Println("--- Rows:", rgr.NumRows(), " ---")

		for _, c := range selectedColumns {
			chunkMeta, err := rowGroupMeta.ColumnChunk(c)
			if err != nil {
				log.Fatal(err)
			}

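			// Print the chunk's value count plus its statistics (min/max,
			// null count, distinct count) when the writer recorded them.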
			fmt.Println("Column", c)
			if set, _ := chunkMeta.StatsSet(); set {
				stats, err := chunkMeta.Statistics()
				if err != nil {
					log.Fatal(err)
				}
				fmt.Printf(" Values: %d", chunkMeta.NumValues())
				if stats.HasMinMax() {
					fmt.Printf(", Min: %v, Max: %v",
						metadata.GetStatValue(stats.Type(), stats.EncodeMin()),
						metadata.GetStatValue(stats.Type(), stats.EncodeMax()))
				}
				if stats.HasNullCount() {
					fmt.Printf(", Null Values: %d", stats.NullCount())
				}
				if stats.HasDistinctCount() {
					fmt.Printf(", Distinct Values: %d", stats.DistinctCount())
				}
				fmt.Println()
			} else {
				fmt.Println(" Values:", chunkMeta.NumValues(), "Statistics Not Set")
			}

			fmt.Print(" Compression: ", chunkMeta.Compression())
			fmt.Print(", Encodings:")
			for _, enc := range chunkMeta.Encodings() {
				fmt.Print(" ", enc)
			}
			fmt.Println()

			fmt.Print(" Uncompressed Size: ", chunkMeta.TotalUncompressedSize())
			fmt.Println(", Compressed Size:", chunkMeta.TotalCompressedSize())
		}

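		// --only-metadata stops after the per-chunk metadata above.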
		if config.OnlyMetadata {
			continue
		}

		fmt.Println("--- Values ---")

		const colwidth = 18

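		// Dumper (defined elsewhere in this package) iterates over a single
		// column chunk's values; print a fixed-width header of column names first.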
		scanners := make([]*Dumper, len(selectedColumns))
		for idx, c := range selectedColumns {
			col := rgr.Column(c)
			scanners[idx] = createDumper(col)
			fmt.Printf("%-*s|", colwidth, col.Descriptor().Name())
		}
		fmt.Println()

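		// Emit one row per pass across all dumpers; data records whether any
		// column still produced a value, so the loop stops once all are drained.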
		for {
			data := false
			for _, s := range scanners {
				if val, ok := s.Next(); ok {
					fmt.Print(s.FormatValue(val, colwidth), "|")
					data = true
				} else {
					fmt.Printf("%-*s|", colwidth, "")
				}
			}
			fmt.Println()
			if !data {
				break
			}
		}
		fmt.Println()
	}
}