github.com/apache/arrow/go/v10@v10.0.1/parquet/cmd/parquet_reader/main.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package main 18 19 import ( 20 "bufio" 21 "encoding/json" 22 "fmt" 23 "io" 24 "log" 25 "os" 26 "strconv" 27 "strings" 28 29 "github.com/apache/arrow/go/v10/parquet" 30 "github.com/apache/arrow/go/v10/parquet/file" 31 "github.com/apache/arrow/go/v10/parquet/metadata" 32 "github.com/apache/arrow/go/v10/parquet/schema" 33 "github.com/docopt/docopt-go" 34 ) 35 36 var version = "" 37 var usage = `Parquet Reader (version ` + version + `) 38 Usage: 39 parquet_reader -h | --help 40 parquet_reader [--only-metadata] [--no-metadata] [--no-memory-map] [--json] [--csv] [--output=FILE] 41 [--print-key-value-metadata] [--int96-timestamp] [--columns=COLUMNS] <file> 42 Options: 43 -h --help Show this screen. 44 --print-key-value-metadata Print out the key-value metadata. [default: false] 45 --only-metadata Stop after printing metadata, no values. 46 --no-metadata Do not print metadata. 47 --output=FILE Specify output file for data. [default: -] 48 --no-memory-map Disable memory mapping the file. 49 --int96-timestamp Parse INT96 as TIMESTAMP for legacy support. 50 --json Format output as JSON instead of text. 51 --csv Format output as CSV instead of text. 52 --columns=COLUMNS Specify a subset of columns to print, comma delimited indexes.` 53 54 func main() { 55 opts, _ := docopt.ParseDoc(usage) 56 var config struct { 57 PrintKeyValueMetadata bool 58 OnlyMetadata bool 59 NoMetadata bool 60 Output string 61 NoMemoryMap bool 62 JSON bool `docopt:"--json"` 63 CSV bool `docopt:"--csv"` 64 ParseInt96AsTimestamp bool `docopt:"--int96-timestamp"` 65 Columns string 66 File string 67 } 68 opts.Bind(&config) 69 70 parseInt96AsTimestamp = config.ParseInt96AsTimestamp 71 72 var dataOut io.Writer 73 dataOut = os.Stdout 74 if config.Output != "-" { 75 var err error 76 fileOut, err := os.Create(config.Output) 77 if err != nil { 78 fmt.Fprintf(os.Stderr, "error: --output %q cannot be created, %s\n", config.Output, err) 79 os.Exit(1) 80 } 81 bufOut := bufio.NewWriter(fileOut) 82 defer func() { 83 bufOut.Flush() 84 fileOut.Close() 85 }() 86 dataOut = bufOut 87 } 88 89 if config.CSV && config.JSON { 90 fmt.Fprintln(os.Stderr, "error: both --json and --csv outputs selected.") 91 os.Exit(1) 92 } 93 94 selectedColumns := []int{} 95 if config.Columns != "" { 96 for _, c := range strings.Split(config.Columns, ",") { 97 cval, err := strconv.Atoi(c) 98 if err != nil { 99 fmt.Fprintln(os.Stderr, "error: --columns needs to be comma-delimited integers") 100 os.Exit(1) 101 } 102 selectedColumns = append(selectedColumns, cval) 103 } 104 } 105 106 rdr, err := file.OpenParquetFile(config.File, !config.NoMemoryMap) 107 if err != nil { 108 fmt.Fprintln(os.Stderr, "error opening parquet file: ", err) 109 os.Exit(1) 110 } 111 112 fileMetadata := rdr.MetaData() 113 114 if !config.NoMetadata { 115 fmt.Println("File name:", config.File) 116 fmt.Println("Version:", fileMetadata.Version()) 117 fmt.Println("Created By:", fileMetadata.GetCreatedBy()) 118 fmt.Println("Num Rows:", rdr.NumRows()) 119 120 keyvaluemeta := fileMetadata.KeyValueMetadata() 121 if config.PrintKeyValueMetadata && keyvaluemeta != nil { 122 fmt.Println("Key Value File Metadata:", keyvaluemeta.Len(), "entries") 123 keys := keyvaluemeta.Keys() 124 values := keyvaluemeta.Values() 125 for i := 0; i < keyvaluemeta.Len(); i++ { 126 fmt.Printf("Key nr %d %s: %s\n", i, keys[i], values[i]) 127 } 128 } 129 130 fmt.Println("Number of RowGroups:", rdr.NumRowGroups()) 131 fmt.Println("Number of Real Columns:", fileMetadata.Schema.Root().NumFields()) 132 fmt.Println("Number of Columns:", fileMetadata.Schema.NumColumns()) 133 } 134 135 if len(selectedColumns) == 0 { 136 for i := 0; i < fileMetadata.Schema.NumColumns(); i++ { 137 selectedColumns = append(selectedColumns, i) 138 } 139 } else { 140 for _, c := range selectedColumns { 141 if c < 0 || c >= fileMetadata.Schema.NumColumns() { 142 fmt.Fprintln(os.Stderr, "selected column is out of range") 143 os.Exit(1) 144 } 145 } 146 } 147 148 if !config.NoMetadata { 149 fmt.Println("Number of Selected Columns:", len(selectedColumns)) 150 for _, c := range selectedColumns { 151 descr := fileMetadata.Schema.Column(c) 152 fmt.Printf("Column %d: %s (%s", c, descr.Path(), descr.PhysicalType()) 153 if descr.ConvertedType() != schema.ConvertedTypes.None { 154 fmt.Printf("/%s", descr.ConvertedType()) 155 if descr.ConvertedType() == schema.ConvertedTypes.Decimal { 156 dec := descr.LogicalType().(*schema.DecimalLogicalType) 157 fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale()) 158 } 159 } 160 fmt.Print(")\n") 161 } 162 } 163 164 for r := 0; r < rdr.NumRowGroups(); r++ { 165 if !config.NoMetadata { 166 fmt.Println("--- Row Group:", r, " ---") 167 } 168 169 rgr := rdr.RowGroup(r) 170 rowGroupMeta := rgr.MetaData() 171 if !config.NoMetadata { 172 fmt.Println("--- Total Bytes:", rowGroupMeta.TotalByteSize(), " ---") 173 fmt.Println("--- Rows:", rgr.NumRows(), " ---") 174 } 175 176 for _, c := range selectedColumns { 177 chunkMeta, err := rowGroupMeta.ColumnChunk(c) 178 if err != nil { 179 log.Fatal(err) 180 } 181 182 if !config.NoMetadata { 183 fmt.Println("Column", c) 184 if set, _ := chunkMeta.StatsSet(); set { 185 stats, err := chunkMeta.Statistics() 186 if err != nil { 187 log.Fatal(err) 188 } 189 fmt.Printf(" Values: %d", chunkMeta.NumValues()) 190 if stats.HasMinMax() { 191 fmt.Printf(", Min: %v, Max: %v", 192 metadata.GetStatValue(stats.Type(), stats.EncodeMin()), 193 metadata.GetStatValue(stats.Type(), stats.EncodeMax())) 194 } 195 if stats.HasNullCount() { 196 fmt.Printf(", Null Values: %d", stats.NullCount()) 197 } 198 if stats.HasDistinctCount() { 199 fmt.Printf(", Distinct Values: %d", stats.DistinctCount()) 200 } 201 fmt.Println() 202 } else { 203 fmt.Println(" Values:", chunkMeta.NumValues(), "Statistics Not Set") 204 } 205 206 fmt.Print(" Compression: ", chunkMeta.Compression()) 207 fmt.Print(", Encodings:") 208 for _, enc := range chunkMeta.Encodings() { 209 fmt.Print(" ", enc) 210 } 211 fmt.Println() 212 213 fmt.Print(" Uncompressed Size: ", chunkMeta.TotalUncompressedSize()) 214 fmt.Println(", Compressed Size:", chunkMeta.TotalCompressedSize()) 215 } 216 } 217 218 if config.OnlyMetadata { 219 continue 220 } 221 222 if !config.NoMetadata { 223 fmt.Println("--- Values ---") 224 } 225 226 switch { 227 case config.JSON: 228 fmt.Fprint(dataOut, "[") 229 230 scanners := make([]*Dumper, len(selectedColumns)) 231 fields := make([]string, len(selectedColumns)) 232 for idx, c := range selectedColumns { 233 col, err := rgr.Column(c) 234 if err != nil { 235 log.Fatalf("unable to fetch column=%d err=%s", c, err) 236 } 237 scanners[idx] = createDumper(col) 238 fields[idx] = col.Descriptor().Path() 239 } 240 241 var line string 242 for { 243 if line == "" { 244 line = "\n {" 245 } else { 246 line = ",\n {" 247 } 248 249 data := false 250 first := true 251 for idx, s := range scanners { 252 if val, ok := s.Next(); ok { 253 if !data { 254 fmt.Fprint(dataOut, line) 255 } 256 data = true 257 if val == nil { 258 continue 259 } 260 if !first { 261 fmt.Fprint(dataOut, ",") 262 } 263 first = false 264 switch val.(type) { 265 case bool, int32, int64, float32, float64: 266 default: 267 val = s.FormatValue(val, 0) 268 } 269 jsonVal, err := json.Marshal(val) 270 if err != nil { 271 fmt.Fprintf(os.Stderr, "error: marshalling json for %+v, %s\n", val, err) 272 os.Exit(1) 273 } 274 fmt.Fprintf(dataOut, "\n %q: %s", fields[idx], jsonVal) 275 } 276 } 277 if !data { 278 break 279 } 280 fmt.Fprint(dataOut, "\n }") 281 } 282 283 fmt.Fprintln(dataOut, "\n]") 284 case config.CSV: 285 scanners := make([]*Dumper, len(selectedColumns)) 286 for idx, c := range selectedColumns { 287 if idx > 0 { 288 fmt.Fprint(dataOut, ",") 289 } 290 col, err := rgr.Column(c) 291 if err != nil { 292 log.Fatalf("unable to fetch col=%d err=%s", c, err) 293 } 294 scanners[idx] = createDumper(col) 295 fmt.Fprintf(dataOut, "%q", col.Descriptor().Path()) 296 } 297 fmt.Fprintln(dataOut) 298 299 var line string 300 for { 301 data := false 302 for idx, s := range scanners { 303 if idx > 0 { 304 if data { 305 fmt.Fprint(dataOut, ",") 306 } else { 307 line += "," 308 } 309 } 310 if val, ok := s.Next(); ok { 311 if !data { 312 fmt.Fprint(dataOut, line) 313 } 314 data = true 315 if val == nil { 316 fmt.Fprint(dataOut, "") 317 continue 318 } 319 switch val.(type) { 320 case bool, int32, int64, parquet.Int96, float32, float64: 321 fmt.Fprintf(dataOut, "%v", val) 322 default: 323 fmt.Fprintf(dataOut, "%q", s.FormatValue(val, 0)) 324 } 325 } else { 326 if data { 327 fmt.Fprint(dataOut, ",") 328 } else { 329 line += "," 330 } 331 } 332 } 333 if !data { 334 break 335 } 336 fmt.Fprintln(dataOut) 337 line = "" 338 } 339 fmt.Fprintln(dataOut) 340 default: 341 const colwidth = 18 342 343 scanners := make([]*Dumper, len(selectedColumns)) 344 for idx, c := range selectedColumns { 345 col, err := rgr.Column(c) 346 if err != nil { 347 log.Fatalf("unable to fetch column=%d err=%s", c, err) 348 } 349 scanners[idx] = createDumper(col) 350 fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), col.Descriptor().Name()) 351 } 352 fmt.Fprintln(dataOut) 353 354 var line string 355 for { 356 data := false 357 for _, s := range scanners { 358 if val, ok := s.Next(); ok { 359 if !data { 360 fmt.Fprint(dataOut, line) 361 } 362 fmt.Fprint(dataOut, s.FormatValue(val, colwidth), "|") 363 data = true 364 } else { 365 if data { 366 fmt.Fprintf(dataOut, fmt.Sprintf("%%-%ds|", colwidth), "") 367 } else { 368 line += fmt.Sprintf(fmt.Sprintf("%%-%ds|", colwidth), "") 369 } 370 } 371 } 372 if !data { 373 break 374 } 375 fmt.Fprintln(dataOut) 376 line = "" 377 } 378 fmt.Fprintln(dataOut) 379 } 380 } 381 }