github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/file_test.go (about) 1 package parquet_test 2 3 import ( 4 "io" 5 "os" 6 "path/filepath" 7 "strings" 8 "testing" 9 10 "github.com/segmentio/parquet-go" 11 ) 12 13 var testdataFiles []string 14 15 func init() { 16 entries, _ := os.ReadDir("testdata") 17 for _, e := range entries { 18 testdataFiles = append(testdataFiles, filepath.Join("testdata", e.Name())) 19 } 20 } 21 22 func TestOpenFile(t *testing.T) { 23 for _, path := range testdataFiles { 24 t.Run(path, func(t *testing.T) { 25 f, err := os.Open(path) 26 if err != nil { 27 t.Fatal(err) 28 } 29 defer f.Close() 30 31 s, err := f.Stat() 32 if err != nil { 33 t.Fatal(err) 34 } 35 36 p, err := parquet.OpenFile(f, s.Size()) 37 if err != nil { 38 t.Fatal(err) 39 } 40 41 if size := p.Size(); size != s.Size() { 42 t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size) 43 } 44 45 root := p.Root() 46 b := new(strings.Builder) 47 parquet.PrintSchema(b, root.Name(), root) 48 t.Log(b) 49 50 printColumns(t, p.Root(), "") 51 }) 52 } 53 } 54 55 func printColumns(t *testing.T, col *parquet.Column, indent string) { 56 if t.Failed() { 57 return 58 } 59 60 path := strings.Join(col.Path(), ".") 61 if col.Leaf() { 62 t.Logf("%s%s %v %v", indent, path, col.Encoding(), col.Compression()) 63 } else { 64 t.Logf("%s%s", indent, path) 65 } 66 indent += ". " 67 68 buffer := make([]parquet.Value, 42) 69 pages := col.Pages() 70 defer pages.Close() 71 for { 72 p, err := pages.ReadPage() 73 if err != nil { 74 if err != io.EOF { 75 t.Error(err) 76 } 77 break 78 } 79 80 values := p.Values() 81 numValues := int64(0) 82 nullCount := int64(0) 83 84 for { 85 n, err := values.ReadValues(buffer) 86 for _, v := range buffer[:n] { 87 if v.Column() != col.Index() { 88 t.Errorf("value read from page of column %d says it belongs to column %d", col.Index(), v.Column()) 89 return 90 } 91 if v.IsNull() { 92 nullCount++ 93 } 94 } 95 numValues += int64(n) 96 if err != nil { 97 if err != io.EOF { 98 t.Error(err) 99 return 100 } 101 break 102 } 103 } 104 105 if numValues != p.NumValues() { 106 t.Errorf("page of column %d declared %d values but %d were read", col.Index(), p.NumValues(), numValues) 107 return 108 } 109 110 if nullCount != p.NumNulls() { 111 t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), p.NumNulls(), nullCount) 112 return 113 } 114 115 parquet.Release(p) 116 } 117 118 for _, child := range col.Columns() { 119 printColumns(t, child, indent) 120 } 121 } 122 123 func TestFileKeyValueMetadata(t *testing.T) { 124 type Row struct { 125 Name string 126 } 127 128 f, err := createParquetFile( 129 makeRows([]Row{{Name: "A"}, {Name: "B"}, {Name: "C"}}), 130 parquet.KeyValueMetadata("hello", "ignore this one"), 131 parquet.KeyValueMetadata("hello", "world"), 132 parquet.KeyValueMetadata("answer", "42"), 133 ) 134 if err != nil { 135 t.Fatal(err) 136 } 137 138 for _, want := range [][2]string{ 139 {"hello", "world"}, 140 {"answer", "42"}, 141 } { 142 key, value := want[0], want[1] 143 if found, ok := f.Lookup(key); !ok || found != value { 144 t.Errorf("key/value metadata mismatch: want %q=%q but got %q=%q (found=%t)", key, value, key, found, ok) 145 } 146 } 147 }