github.com/apache/arrow/go/v7@v7.0.1/parquet/cmd/parquet_reader/main.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package main 18 19 import ( 20 "fmt" 21 "log" 22 "os" 23 "strconv" 24 "strings" 25 26 "github.com/apache/arrow/go/v7/parquet/file" 27 "github.com/apache/arrow/go/v7/parquet/metadata" 28 "github.com/apache/arrow/go/v7/parquet/schema" 29 "github.com/docopt/docopt-go" 30 ) 31 32 const usage = `Parquet Reader. 33 Usage: 34 parquet_reader -h | --help 35 parquet_reader [--only-metadata] [--no-memory-map] [--json] 36 [--print-key-value-metadata] [--columns=COLUMNS] <file> 37 Options: 38 -h --help Show this screen. 39 --print-key-value-metadata Print out the key-value metadata [default: false] 40 --only-metadata Stop after printing metadata, no values. 41 --no-memory-map Disable memory mapping the file. 42 --json Format output as JSON instead of text. 43 --columns=COLUMNS Specify a subset of columns to print, comma delimited indexes.` 44 45 func main() { 46 opts, _ := docopt.ParseDoc(usage) 47 var config struct { 48 PrintKeyValueMetadata bool 49 OnlyMetadata bool 50 NoMemoryMap bool 51 JSON bool `docopt:"--json"` 52 Columns string 53 File string 54 } 55 opts.Bind(&config) 56 57 if config.JSON { 58 fmt.Fprintln(os.Stderr, "error: json output not implemented yet! falling back to regular") 59 } 60 61 selectedColumns := []int{} 62 if config.Columns != "" { 63 for _, c := range strings.Split(config.Columns, ",") { 64 cval, err := strconv.Atoi(c) 65 if err != nil { 66 fmt.Fprintln(os.Stderr, "error: --columns needs to be comma-delimited integers") 67 os.Exit(1) 68 } 69 selectedColumns = append(selectedColumns, cval) 70 } 71 } 72 73 rdr, err := file.OpenParquetFile(config.File, !config.NoMemoryMap, nil, nil) 74 if err != nil { 75 fmt.Fprintln(os.Stderr, "error opening parquet file: ", err) 76 os.Exit(1) 77 } 78 79 fileMetadata := rdr.MetaData() 80 81 fmt.Println("File name:", config.File) 82 fmt.Println("Version:", fileMetadata.Version()) 83 fmt.Println("Created By:", fileMetadata.GetCreatedBy()) 84 fmt.Println("Num Rows:", rdr.NumRows()) 85 86 keyvaluemeta := fileMetadata.KeyValueMetadata() 87 if config.PrintKeyValueMetadata && keyvaluemeta != nil { 88 fmt.Println("Key Value File Metadata:", keyvaluemeta.Len(), "entries") 89 keys := keyvaluemeta.Keys() 90 values := keyvaluemeta.Values() 91 for i := 0; i < keyvaluemeta.Len(); i++ { 92 fmt.Printf("Key nr %d %s: %s\n", i, keys[i], values[i]) 93 } 94 } 95 96 fmt.Println("Number of RowGroups:", rdr.NumRowGroups()) 97 fmt.Println("Number of Real Columns:", fileMetadata.Schema.Root().NumFields()) 98 fmt.Println("Number of Columns:", fileMetadata.Schema.NumColumns()) 99 100 if len(selectedColumns) == 0 { 101 for i := 0; i < fileMetadata.Schema.NumColumns(); i++ { 102 selectedColumns = append(selectedColumns, i) 103 } 104 } else { 105 for _, c := range selectedColumns { 106 if c < 0 || c >= fileMetadata.Schema.NumColumns() { 107 fmt.Fprintln(os.Stderr, "selected column is out of range") 108 os.Exit(1) 109 } 110 } 111 } 112 113 fmt.Println("Number of Selected Columns:", len(selectedColumns)) 114 for _, c := range selectedColumns { 115 descr := fileMetadata.Schema.Column(c) 116 fmt.Printf("Column %d: %s (%s", c, descr.Path(), descr.PhysicalType()) 117 if descr.ConvertedType() != schema.ConvertedTypes.None { 118 fmt.Printf("/%s", descr.ConvertedType()) 119 if descr.ConvertedType() == schema.ConvertedTypes.Decimal { 120 dec := descr.LogicalType().(*schema.DecimalLogicalType) 121 fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale()) 122 } 123 } 124 fmt.Print(")\n") 125 } 126 127 for r := 0; r < rdr.NumRowGroups(); r++ { 128 fmt.Println("--- Row Group:", r, " ---") 129 130 rgr := rdr.RowGroup(r) 131 rowGroupMeta := rgr.MetaData() 132 fmt.Println("--- Total Bytes:", rowGroupMeta.TotalByteSize(), " ---") 133 fmt.Println("--- Rows:", rgr.NumRows(), " ---") 134 135 for _, c := range selectedColumns { 136 chunkMeta, err := rowGroupMeta.ColumnChunk(c) 137 if err != nil { 138 log.Fatal(err) 139 } 140 141 fmt.Println("Column", c) 142 if set, _ := chunkMeta.StatsSet(); set { 143 stats, err := chunkMeta.Statistics() 144 if err != nil { 145 log.Fatal(err) 146 } 147 fmt.Printf(" Values: %d", chunkMeta.NumValues()) 148 if stats.HasMinMax() { 149 fmt.Printf(", Min: %v, Max: %v", 150 metadata.GetStatValue(stats.Type(), stats.EncodeMin()), 151 metadata.GetStatValue(stats.Type(), stats.EncodeMax())) 152 } 153 if stats.HasNullCount() { 154 fmt.Printf(", Null Values: %d", stats.NullCount()) 155 } 156 if stats.HasDistinctCount() { 157 fmt.Printf(", Distinct Values: %d", stats.DistinctCount()) 158 } 159 fmt.Println() 160 } else { 161 fmt.Println(" Values:", chunkMeta.NumValues(), "Statistics Not Set") 162 } 163 164 fmt.Print(" Compression: ", chunkMeta.Compression()) 165 fmt.Print(", Encodings:") 166 for _, enc := range chunkMeta.Encodings() { 167 fmt.Print(" ", enc) 168 } 169 fmt.Println() 170 171 fmt.Print(" Uncompressed Size: ", chunkMeta.TotalUncompressedSize()) 172 fmt.Println(", Compressed Size:", chunkMeta.TotalCompressedSize()) 173 } 174 175 if config.OnlyMetadata { 176 continue 177 } 178 179 fmt.Println("--- Values ---") 180 181 const colwidth = 18 182 183 scanners := make([]*Dumper, len(selectedColumns)) 184 for idx, c := range selectedColumns { 185 scanners[idx] = createDumper(rgr.Column(c)) 186 fmt.Printf(fmt.Sprintf("%%-%ds|", colwidth), rgr.Column(c).Descriptor().Name()) 187 } 188 fmt.Println() 189 190 for { 191 data := false 192 for _, s := range scanners { 193 if val, ok := s.Next(); ok { 194 fmt.Print(s.FormatValue(val, colwidth), "|") 195 data = true 196 } else { 197 fmt.Printf(fmt.Sprintf("%%-%ds|", colwidth), "") 198 } 199 } 200 fmt.Println() 201 if !data { 202 break 203 } 204 } 205 fmt.Println() 206 } 207 }