github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/file_test.go (about) 1 package parquet_test 2 3 import ( 4 "errors" 5 "io" 6 "os" 7 "path/filepath" 8 "strings" 9 "testing" 10 11 "github.com/parquet-go/parquet-go" 12 ) 13 14 var testdataFiles []string 15 16 func init() { 17 entries, _ := os.ReadDir("testdata") 18 for _, e := range entries { 19 testdataFiles = append(testdataFiles, filepath.Join("testdata", e.Name())) 20 } 21 } 22 23 func TestOpenFile(t *testing.T) { 24 for _, path := range testdataFiles { 25 t.Run(path, func(t *testing.T) { 26 f, err := os.Open(path) 27 if err != nil { 28 t.Fatal(err) 29 } 30 defer f.Close() 31 32 s, err := f.Stat() 33 if err != nil { 34 t.Fatal(err) 35 } 36 37 p, err := parquet.OpenFile(f, s.Size()) 38 if err != nil { 39 t.Fatal(err) 40 } 41 42 if size := p.Size(); size != s.Size() { 43 t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size) 44 } 45 46 root := p.Root() 47 b := new(strings.Builder) 48 parquet.PrintSchema(b, root.Name(), root) 49 t.Log(b) 50 51 printColumns(t, p.Root(), "") 52 }) 53 } 54 } 55 56 func TestOpenFileWithoutPageIndex(t *testing.T) { 57 for _, path := range testdataFiles { 58 t.Run(path, func(t *testing.T) { 59 f, err := os.Open(path) 60 if err != nil { 61 t.Fatal(err) 62 } 63 defer f.Close() 64 65 s, err := f.Stat() 66 if err != nil { 67 t.Fatal(err) 68 } 69 70 fileWithIndex, err := parquet.OpenFile(f, s.Size()) 71 if err != nil { 72 t.Fatal(err) 73 } 74 fileWithoutIndex, err := parquet.OpenFile(f, s.Size(), parquet.SkipPageIndex(true)) 75 if err != nil { 76 t.Fatal(err) 77 } 78 79 if size := fileWithoutIndex.Size(); size != s.Size() { 80 t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size) 81 } 82 83 for iRowGroup, rowGroup := range fileWithoutIndex.RowGroups() { 84 for iChunk, chunk := range rowGroup.ColumnChunks() { 85 chunkMeta := fileWithoutIndex.Metadata().RowGroups[iRowGroup].Columns[iChunk].MetaData 86 87 preloadedColumnIndex, pErr := fileWithIndex.RowGroups()[iRowGroup].ColumnChunks()[iChunk].ColumnIndex() 88 if errors.Is(pErr, parquet.ErrMissingColumnIndex) && chunkMeta.IndexPageOffset != 0 { 89 t.Errorf("get column index for %s: %s", chunkMeta.PathInSchema[0], pErr) 90 } 91 columnIndex, err := chunk.ColumnIndex() 92 if errors.Is(err, parquet.ErrMissingColumnIndex) && chunkMeta.IndexPageOffset != 0 { 93 t.Errorf("get column index for %s: %s", chunkMeta.PathInSchema[0], err) 94 } 95 if !errors.Is(err, pErr) { 96 t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup) 97 } 98 if preloadedColumnIndex == nil && columnIndex != nil || preloadedColumnIndex != nil && columnIndex == nil { 99 t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup) 100 } 101 102 preloadedOffsetIndex, pErr := fileWithIndex.RowGroups()[iRowGroup].ColumnChunks()[iChunk].OffsetIndex() 103 if errors.Is(pErr, parquet.ErrMissingOffsetIndex) && chunkMeta.IndexPageOffset != 0 { 104 t.Errorf("get offset index for %s: %s", chunkMeta.PathInSchema[0], pErr) 105 } 106 offsetIndex, err := chunk.OffsetIndex() 107 if errors.Is(err, parquet.ErrMissingOffsetIndex) && chunkMeta.IndexPageOffset != 0 { 108 t.Errorf("get offset index for %s: %s", chunkMeta.PathInSchema[0], err) 109 } 110 if !errors.Is(err, pErr) { 111 t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup) 112 } 113 if preloadedOffsetIndex == nil && offsetIndex != nil || preloadedOffsetIndex != nil && offsetIndex == nil { 114 t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup) 115 } 116 } 117 } 118 }) 119 } 120 } 121 122 func printColumns(t *testing.T, col *parquet.Column, indent string) { 123 if t.Failed() { 124 return 125 } 126 127 path := strings.Join(col.Path(), ".") 128 if col.Leaf() { 129 t.Logf("%s%s %v %v", indent, path, col.Encoding(), col.Compression()) 130 } else { 131 t.Logf("%s%s", indent, path) 132 } 133 indent += ". " 134 135 buffer := make([]parquet.Value, 42) 136 pages := col.Pages() 137 defer pages.Close() 138 for { 139 p, err := pages.ReadPage() 140 if err != nil { 141 if err != io.EOF { 142 t.Error(err) 143 } 144 break 145 } 146 147 values := p.Values() 148 numValues := int64(0) 149 nullCount := int64(0) 150 151 for { 152 n, err := values.ReadValues(buffer) 153 for _, v := range buffer[:n] { 154 if v.Column() != col.Index() { 155 t.Errorf("value read from page of column %d says it belongs to column %d", col.Index(), v.Column()) 156 return 157 } 158 if v.IsNull() { 159 nullCount++ 160 } 161 } 162 numValues += int64(n) 163 if err != nil { 164 if err != io.EOF { 165 t.Error(err) 166 return 167 } 168 break 169 } 170 } 171 172 if numValues != p.NumValues() { 173 t.Errorf("page of column %d declared %d values but %d were read", col.Index(), p.NumValues(), numValues) 174 return 175 } 176 177 if nullCount != p.NumNulls() { 178 t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), p.NumNulls(), nullCount) 179 return 180 } 181 182 parquet.Release(p) 183 } 184 185 for _, child := range col.Columns() { 186 printColumns(t, child, indent) 187 } 188 } 189 190 func TestFileKeyValueMetadata(t *testing.T) { 191 type Row struct { 192 Name string 193 } 194 195 f, err := createParquetFile( 196 makeRows([]Row{{Name: "A"}, {Name: "B"}, {Name: "C"}}), 197 parquet.KeyValueMetadata("hello", "ignore this one"), 198 parquet.KeyValueMetadata("hello", "world"), 199 parquet.KeyValueMetadata("answer", "42"), 200 ) 201 if err != nil { 202 t.Fatal(err) 203 } 204 205 for _, want := range [][2]string{ 206 {"hello", "world"}, 207 {"answer", "42"}, 208 } { 209 key, value := want[0], want[1] 210 if found, ok := f.Lookup(key); !ok || found != value { 211 t.Errorf("key/value metadata mismatch: want %q=%q but got %q=%q (found=%t)", key, value, key, found, ok) 212 } 213 } 214 }