github.com/fraugster/parquet-go@v0.12.0/cmd/parquet-tool/cmds/readfile.go (about) 1 package cmds 2 3 import ( 4 "fmt" 5 "io" 6 "log" 7 "os" 8 "sort" 9 "strings" 10 "text/tabwriter" 11 12 goparquet "github.com/fraugster/parquet-go" 13 "github.com/fraugster/parquet-go/parquetschema" 14 ) 15 16 func catFile(w io.Writer, address string, n int) error { 17 fl, err := os.Open(address) 18 if err != nil { 19 return fmt.Errorf("can not open the file: %q", err) 20 } 21 defer fl.Close() 22 23 reader, err := goparquet.NewFileReader(fl) 24 if err != nil { 25 return fmt.Errorf("failed to read the parquet header: %q", err) 26 } 27 28 columnOrder := getColumnOrder(reader.GetSchemaDefinition()) 29 30 for i := 0; (n == -1) || i < n; i++ { 31 data, err := reader.NextRow() 32 if err == io.EOF { 33 return nil 34 } 35 if err != nil { 36 log.Printf("Reading data failed with error, skip current row group: %q", err) 37 continue 38 } 39 40 printData(w, data, "", columnOrder) 41 fmt.Println() 42 } 43 44 return nil 45 } 46 47 func getColumnOrder(schemaDef *parquetschema.SchemaDefinition) map[string]int { 48 cols := getColumnList(schemaDef.RootColumn.Children, "") 49 50 colOrder := map[string]int{} 51 52 for idx, colName := range cols { 53 colOrder[colName] = idx 54 } 55 56 return colOrder 57 } 58 59 func getColumnList(colDefs []*parquetschema.ColumnDefinition, prefix string) []string { 60 cols := []string{} 61 for _, col := range colDefs { 62 cols = append(cols, prefix+col.SchemaElement.Name) 63 if col.Children != nil { 64 cols = append(cols, getColumnList(col.Children, col.SchemaElement.Name+".")...) 65 } 66 } 67 return cols 68 } 69 70 func printPrimitive(w io.Writer, ident, name string, v interface{}) { 71 _, _ = fmt.Fprintln(w, ident+name+" = "+fmt.Sprint(v)) 72 } 73 74 func printData(w io.Writer, m map[string]interface{}, ident string, columnOrder map[string]int) { 75 cols := []string{} 76 77 for colName := range m { 78 cols = append(cols, colName) 79 } 80 81 sort.Slice(cols, func(i, j int) bool { 82 return columnOrder[ident+cols[i]] < columnOrder[ident+cols[j]] 83 }) 84 85 for _, colName := range cols { 86 switch t := m[colName].(type) { 87 case map[string]interface{}: 88 _, _ = fmt.Fprintln(w, ident+colName+":") 89 printData(w, t, ident+".", columnOrder) 90 case []map[string]interface{}: 91 for j := range t { 92 _, _ = fmt.Fprintln(w, ident+colName+":") 93 printData(w, t[j], ident+".", columnOrder) 94 } 95 case []byte: 96 _, _ = fmt.Fprintln(w, ident+colName+" = "+string(t)) 97 case [][]byte: 98 for j := range t { 99 _, _ = fmt.Fprintln(w, ident+colName+" = "+string(t[j])) 100 } 101 case []interface{}: 102 for j := range t { 103 _, _ = fmt.Fprintln(w, ident+colName+" = "+fmt.Sprint(t[j])) 104 } 105 default: 106 printPrimitive(w, ident, colName, t) 107 } 108 } 109 } 110 111 func metaFile(w io.Writer, address string) error { 112 fl, err := os.Open(address) 113 if err != nil { 114 return fmt.Errorf("can not open the file: %q", err) 115 } 116 defer fl.Close() 117 118 reader, err := goparquet.NewFileReader(fl) 119 if err != nil { 120 return fmt.Errorf("failed to read the parquet header: %q", err) 121 } 122 123 cols := reader.Columns() 124 writer := tabwriter.NewWriter(w, 8, 8, 0, '\t', 0) 125 printFlatSchema(writer, cols, 0) 126 return writer.Flush() 127 } 128 129 func printFlatSchema(w io.Writer, cols []*goparquet.Column, lvl int) { 130 dot := strings.Repeat(".", lvl) 131 for _, column := range cols { 132 _, _ = fmt.Fprintf(w, "%s%s:\t\t", dot, column.Name()) 133 _, _ = fmt.Fprintf(w, "%s ", column.RepetitionType().String()) 134 if column.DataColumn() { 135 _, _ = fmt.Fprintf(w, "%s R:%d D:%d\n", column.Type().String(), column.MaxRepetitionLevel(), column.MaxDefinitionLevel()) 136 continue 137 } else { 138 _, _ = fmt.Fprintf(w, "F:%d\n", column.ChildrenCount()) 139 printFlatSchema(w, column.Children(), lvl+1) 140 } 141 } 142 }