github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/file_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"errors"
     5  	"io"
     6  	"os"
     7  	"path/filepath"
     8  	"strings"
     9  	"testing"
    10  
    11  	"github.com/parquet-go/parquet-go"
    12  )
    13  
    14  var testdataFiles []string
    15  
    16  func init() {
    17  	entries, _ := os.ReadDir("testdata")
    18  	for _, e := range entries {
    19  		testdataFiles = append(testdataFiles, filepath.Join("testdata", e.Name()))
    20  	}
    21  }
    22  
    23  func TestOpenFile(t *testing.T) {
    24  	for _, path := range testdataFiles {
    25  		t.Run(path, func(t *testing.T) {
    26  			f, err := os.Open(path)
    27  			if err != nil {
    28  				t.Fatal(err)
    29  			}
    30  			defer f.Close()
    31  
    32  			s, err := f.Stat()
    33  			if err != nil {
    34  				t.Fatal(err)
    35  			}
    36  
    37  			p, err := parquet.OpenFile(f, s.Size())
    38  			if err != nil {
    39  				t.Fatal(err)
    40  			}
    41  
    42  			if size := p.Size(); size != s.Size() {
    43  				t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size)
    44  			}
    45  
    46  			root := p.Root()
    47  			b := new(strings.Builder)
    48  			parquet.PrintSchema(b, root.Name(), root)
    49  			t.Log(b)
    50  
    51  			printColumns(t, p.Root(), "")
    52  		})
    53  	}
    54  }
    55  
    56  func TestOpenFileWithoutPageIndex(t *testing.T) {
    57  	for _, path := range testdataFiles {
    58  		t.Run(path, func(t *testing.T) {
    59  			f, err := os.Open(path)
    60  			if err != nil {
    61  				t.Fatal(err)
    62  			}
    63  			defer f.Close()
    64  
    65  			s, err := f.Stat()
    66  			if err != nil {
    67  				t.Fatal(err)
    68  			}
    69  
    70  			fileWithIndex, err := parquet.OpenFile(f, s.Size())
    71  			if err != nil {
    72  				t.Fatal(err)
    73  			}
    74  			fileWithoutIndex, err := parquet.OpenFile(f, s.Size(), parquet.SkipPageIndex(true))
    75  			if err != nil {
    76  				t.Fatal(err)
    77  			}
    78  
    79  			if size := fileWithoutIndex.Size(); size != s.Size() {
    80  				t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size)
    81  			}
    82  
    83  			for iRowGroup, rowGroup := range fileWithoutIndex.RowGroups() {
    84  				for iChunk, chunk := range rowGroup.ColumnChunks() {
    85  					chunkMeta := fileWithoutIndex.Metadata().RowGroups[iRowGroup].Columns[iChunk].MetaData
    86  
    87  					preloadedColumnIndex, pErr := fileWithIndex.RowGroups()[iRowGroup].ColumnChunks()[iChunk].ColumnIndex()
    88  					if errors.Is(pErr, parquet.ErrMissingColumnIndex) && chunkMeta.IndexPageOffset != 0 {
    89  						t.Errorf("get column index for %s: %s", chunkMeta.PathInSchema[0], pErr)
    90  					}
    91  					columnIndex, err := chunk.ColumnIndex()
    92  					if errors.Is(err, parquet.ErrMissingColumnIndex) && chunkMeta.IndexPageOffset != 0 {
    93  						t.Errorf("get column index for %s: %s", chunkMeta.PathInSchema[0], err)
    94  					}
    95  					if !errors.Is(err, pErr) {
    96  						t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup)
    97  					}
    98  					if preloadedColumnIndex == nil && columnIndex != nil || preloadedColumnIndex != nil && columnIndex == nil {
    99  						t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup)
   100  					}
   101  
   102  					preloadedOffsetIndex, pErr := fileWithIndex.RowGroups()[iRowGroup].ColumnChunks()[iChunk].OffsetIndex()
   103  					if errors.Is(pErr, parquet.ErrMissingOffsetIndex) && chunkMeta.IndexPageOffset != 0 {
   104  						t.Errorf("get offset index for %s: %s", chunkMeta.PathInSchema[0], pErr)
   105  					}
   106  					offsetIndex, err := chunk.OffsetIndex()
   107  					if errors.Is(err, parquet.ErrMissingOffsetIndex) && chunkMeta.IndexPageOffset != 0 {
   108  						t.Errorf("get offset index for %s: %s", chunkMeta.PathInSchema[0], err)
   109  					}
   110  					if !errors.Is(err, pErr) {
   111  						t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup)
   112  					}
   113  					if preloadedOffsetIndex == nil && offsetIndex != nil || preloadedOffsetIndex != nil && offsetIndex == nil {
   114  						t.Errorf("mismatch when opening file with and without index, chunk=%d, row group=%d", iChunk, iRowGroup)
   115  					}
   116  				}
   117  			}
   118  		})
   119  	}
   120  }
   121  
   122  func printColumns(t *testing.T, col *parquet.Column, indent string) {
   123  	if t.Failed() {
   124  		return
   125  	}
   126  
   127  	path := strings.Join(col.Path(), ".")
   128  	if col.Leaf() {
   129  		t.Logf("%s%s %v %v", indent, path, col.Encoding(), col.Compression())
   130  	} else {
   131  		t.Logf("%s%s", indent, path)
   132  	}
   133  	indent += ". "
   134  
   135  	buffer := make([]parquet.Value, 42)
   136  	pages := col.Pages()
   137  	defer pages.Close()
   138  	for {
   139  		p, err := pages.ReadPage()
   140  		if err != nil {
   141  			if err != io.EOF {
   142  				t.Error(err)
   143  			}
   144  			break
   145  		}
   146  
   147  		values := p.Values()
   148  		numValues := int64(0)
   149  		nullCount := int64(0)
   150  
   151  		for {
   152  			n, err := values.ReadValues(buffer)
   153  			for _, v := range buffer[:n] {
   154  				if v.Column() != col.Index() {
   155  					t.Errorf("value read from page of column %d says it belongs to column %d", col.Index(), v.Column())
   156  					return
   157  				}
   158  				if v.IsNull() {
   159  					nullCount++
   160  				}
   161  			}
   162  			numValues += int64(n)
   163  			if err != nil {
   164  				if err != io.EOF {
   165  					t.Error(err)
   166  					return
   167  				}
   168  				break
   169  			}
   170  		}
   171  
   172  		if numValues != p.NumValues() {
   173  			t.Errorf("page of column %d declared %d values but %d were read", col.Index(), p.NumValues(), numValues)
   174  			return
   175  		}
   176  
   177  		if nullCount != p.NumNulls() {
   178  			t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), p.NumNulls(), nullCount)
   179  			return
   180  		}
   181  
   182  		parquet.Release(p)
   183  	}
   184  
   185  	for _, child := range col.Columns() {
   186  		printColumns(t, child, indent)
   187  	}
   188  }
   189  
   190  func TestFileKeyValueMetadata(t *testing.T) {
   191  	type Row struct {
   192  		Name string
   193  	}
   194  
   195  	f, err := createParquetFile(
   196  		makeRows([]Row{{Name: "A"}, {Name: "B"}, {Name: "C"}}),
   197  		parquet.KeyValueMetadata("hello", "ignore this one"),
   198  		parquet.KeyValueMetadata("hello", "world"),
   199  		parquet.KeyValueMetadata("answer", "42"),
   200  	)
   201  	if err != nil {
   202  		t.Fatal(err)
   203  	}
   204  
   205  	for _, want := range [][2]string{
   206  		{"hello", "world"},
   207  		{"answer", "42"},
   208  	} {
   209  		key, value := want[0], want[1]
   210  		if found, ok := f.Lookup(key); !ok || found != value {
   211  			t.Errorf("key/value metadata mismatch: want %q=%q but got %q=%q (found=%t)", key, value, key, found, ok)
   212  		}
   213  	}
   214  }