github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/file_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"io"
     5  	"os"
     6  	"path/filepath"
     7  	"strings"
     8  	"testing"
     9  
    10  	"github.com/segmentio/parquet-go"
    11  )
    12  
    13  var testdataFiles []string
    14  
    15  func init() {
    16  	entries, _ := os.ReadDir("testdata")
    17  	for _, e := range entries {
    18  		testdataFiles = append(testdataFiles, filepath.Join("testdata", e.Name()))
    19  	}
    20  }
    21  
    22  func TestOpenFile(t *testing.T) {
    23  	for _, path := range testdataFiles {
    24  		t.Run(path, func(t *testing.T) {
    25  			f, err := os.Open(path)
    26  			if err != nil {
    27  				t.Fatal(err)
    28  			}
    29  			defer f.Close()
    30  
    31  			s, err := f.Stat()
    32  			if err != nil {
    33  				t.Fatal(err)
    34  			}
    35  
    36  			p, err := parquet.OpenFile(f, s.Size())
    37  			if err != nil {
    38  				t.Fatal(err)
    39  			}
    40  
    41  			if size := p.Size(); size != s.Size() {
    42  				t.Errorf("file size mismatch: want=%d got=%d", s.Size(), size)
    43  			}
    44  
    45  			root := p.Root()
    46  			b := new(strings.Builder)
    47  			parquet.PrintSchema(b, root.Name(), root)
    48  			t.Log(b)
    49  
    50  			printColumns(t, p.Root(), "")
    51  		})
    52  	}
    53  }
    54  
    55  func printColumns(t *testing.T, col *parquet.Column, indent string) {
    56  	if t.Failed() {
    57  		return
    58  	}
    59  
    60  	path := strings.Join(col.Path(), ".")
    61  	if col.Leaf() {
    62  		t.Logf("%s%s %v %v", indent, path, col.Encoding(), col.Compression())
    63  	} else {
    64  		t.Logf("%s%s", indent, path)
    65  	}
    66  	indent += ". "
    67  
    68  	buffer := make([]parquet.Value, 42)
    69  	pages := col.Pages()
    70  	defer pages.Close()
    71  	for {
    72  		p, err := pages.ReadPage()
    73  		if err != nil {
    74  			if err != io.EOF {
    75  				t.Error(err)
    76  			}
    77  			break
    78  		}
    79  
    80  		values := p.Values()
    81  		numValues := int64(0)
    82  		nullCount := int64(0)
    83  
    84  		for {
    85  			n, err := values.ReadValues(buffer)
    86  			for _, v := range buffer[:n] {
    87  				if v.Column() != col.Index() {
    88  					t.Errorf("value read from page of column %d says it belongs to column %d", col.Index(), v.Column())
    89  					return
    90  				}
    91  				if v.IsNull() {
    92  					nullCount++
    93  				}
    94  			}
    95  			numValues += int64(n)
    96  			if err != nil {
    97  				if err != io.EOF {
    98  					t.Error(err)
    99  					return
   100  				}
   101  				break
   102  			}
   103  		}
   104  
   105  		if numValues != p.NumValues() {
   106  			t.Errorf("page of column %d declared %d values but %d were read", col.Index(), p.NumValues(), numValues)
   107  			return
   108  		}
   109  
   110  		if nullCount != p.NumNulls() {
   111  			t.Errorf("page of column %d declared %d nulls but %d were read", col.Index(), p.NumNulls(), nullCount)
   112  			return
   113  		}
   114  
   115  		parquet.Release(p)
   116  	}
   117  
   118  	for _, child := range col.Columns() {
   119  		printColumns(t, child, indent)
   120  	}
   121  }
   122  
   123  func TestFileKeyValueMetadata(t *testing.T) {
   124  	type Row struct {
   125  		Name string
   126  	}
   127  
   128  	f, err := createParquetFile(
   129  		makeRows([]Row{{Name: "A"}, {Name: "B"}, {Name: "C"}}),
   130  		parquet.KeyValueMetadata("hello", "ignore this one"),
   131  		parquet.KeyValueMetadata("hello", "world"),
   132  		parquet.KeyValueMetadata("answer", "42"),
   133  	)
   134  	if err != nil {
   135  		t.Fatal(err)
   136  	}
   137  
   138  	for _, want := range [][2]string{
   139  		{"hello", "world"},
   140  		{"answer", "42"},
   141  	} {
   142  		key, value := want[0], want[1]
   143  		if found, ok := f.Lookup(key); !ok || found != value {
   144  			t.Errorf("key/value metadata mismatch: want %q=%q but got %q=%q (found=%t)", key, value, key, found, ok)
   145  		}
   146  	}
   147  }