github.com/fraugster/parquet-go@v0.12.0/cmd/parquet-tool/cmds/readfile.go (about)

     1  package cmds
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"log"
     7  	"os"
     8  	"sort"
     9  	"strings"
    10  	"text/tabwriter"
    11  
    12  	goparquet "github.com/fraugster/parquet-go"
    13  	"github.com/fraugster/parquet-go/parquetschema"
    14  )
    15  
    16  func catFile(w io.Writer, address string, n int) error {
    17  	fl, err := os.Open(address)
    18  	if err != nil {
    19  		return fmt.Errorf("can not open the file: %q", err)
    20  	}
    21  	defer fl.Close()
    22  
    23  	reader, err := goparquet.NewFileReader(fl)
    24  	if err != nil {
    25  		return fmt.Errorf("failed to read the parquet header: %q", err)
    26  	}
    27  
    28  	columnOrder := getColumnOrder(reader.GetSchemaDefinition())
    29  
    30  	for i := 0; (n == -1) || i < n; i++ {
    31  		data, err := reader.NextRow()
    32  		if err == io.EOF {
    33  			return nil
    34  		}
    35  		if err != nil {
    36  			log.Printf("Reading data failed with error, skip current row group: %q", err)
    37  			continue
    38  		}
    39  
    40  		printData(w, data, "", columnOrder)
    41  		fmt.Println()
    42  	}
    43  
    44  	return nil
    45  }
    46  
    47  func getColumnOrder(schemaDef *parquetschema.SchemaDefinition) map[string]int {
    48  	cols := getColumnList(schemaDef.RootColumn.Children, "")
    49  
    50  	colOrder := map[string]int{}
    51  
    52  	for idx, colName := range cols {
    53  		colOrder[colName] = idx
    54  	}
    55  
    56  	return colOrder
    57  }
    58  
    59  func getColumnList(colDefs []*parquetschema.ColumnDefinition, prefix string) []string {
    60  	cols := []string{}
    61  	for _, col := range colDefs {
    62  		cols = append(cols, prefix+col.SchemaElement.Name)
    63  		if col.Children != nil {
    64  			cols = append(cols, getColumnList(col.Children, col.SchemaElement.Name+".")...)
    65  		}
    66  	}
    67  	return cols
    68  }
    69  
    70  func printPrimitive(w io.Writer, ident, name string, v interface{}) {
    71  	_, _ = fmt.Fprintln(w, ident+name+" = "+fmt.Sprint(v))
    72  }
    73  
    74  func printData(w io.Writer, m map[string]interface{}, ident string, columnOrder map[string]int) {
    75  	cols := []string{}
    76  
    77  	for colName := range m {
    78  		cols = append(cols, colName)
    79  	}
    80  
    81  	sort.Slice(cols, func(i, j int) bool {
    82  		return columnOrder[ident+cols[i]] < columnOrder[ident+cols[j]]
    83  	})
    84  
    85  	for _, colName := range cols {
    86  		switch t := m[colName].(type) {
    87  		case map[string]interface{}:
    88  			_, _ = fmt.Fprintln(w, ident+colName+":")
    89  			printData(w, t, ident+".", columnOrder)
    90  		case []map[string]interface{}:
    91  			for j := range t {
    92  				_, _ = fmt.Fprintln(w, ident+colName+":")
    93  				printData(w, t[j], ident+".", columnOrder)
    94  			}
    95  		case []byte:
    96  			_, _ = fmt.Fprintln(w, ident+colName+" = "+string(t))
    97  		case [][]byte:
    98  			for j := range t {
    99  				_, _ = fmt.Fprintln(w, ident+colName+" = "+string(t[j]))
   100  			}
   101  		case []interface{}:
   102  			for j := range t {
   103  				_, _ = fmt.Fprintln(w, ident+colName+" = "+fmt.Sprint(t[j]))
   104  			}
   105  		default:
   106  			printPrimitive(w, ident, colName, t)
   107  		}
   108  	}
   109  }
   110  
   111  func metaFile(w io.Writer, address string) error {
   112  	fl, err := os.Open(address)
   113  	if err != nil {
   114  		return fmt.Errorf("can not open the file: %q", err)
   115  	}
   116  	defer fl.Close()
   117  
   118  	reader, err := goparquet.NewFileReader(fl)
   119  	if err != nil {
   120  		return fmt.Errorf("failed to read the parquet header: %q", err)
   121  	}
   122  
   123  	cols := reader.Columns()
   124  	writer := tabwriter.NewWriter(w, 8, 8, 0, '\t', 0)
   125  	printFlatSchema(writer, cols, 0)
   126  	return writer.Flush()
   127  }
   128  
   129  func printFlatSchema(w io.Writer, cols []*goparquet.Column, lvl int) {
   130  	dot := strings.Repeat(".", lvl)
   131  	for _, column := range cols {
   132  		_, _ = fmt.Fprintf(w, "%s%s:\t\t", dot, column.Name())
   133  		_, _ = fmt.Fprintf(w, "%s ", column.RepetitionType().String())
   134  		if column.DataColumn() {
   135  			_, _ = fmt.Fprintf(w, "%s R:%d D:%d\n", column.Type().String(), column.MaxRepetitionLevel(), column.MaxDefinitionLevel())
   136  			continue
   137  		} else {
   138  			_, _ = fmt.Fprintf(w, "F:%d\n", column.ChildrenCount())
   139  			printFlatSchema(w, column.Children(), lvl+1)
   140  		}
   141  	}
   142  }