github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/writer_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  	"strings"
     9  	"testing"
    10  
    11  	"github.com/google/uuid"
    12  	"github.com/hexops/gotextdiff"
    13  	"github.com/hexops/gotextdiff/myers"
    14  	"github.com/hexops/gotextdiff/span"
    15  
    16  	"github.com/segmentio/parquet-go"
    17  	"github.com/segmentio/parquet-go/compress"
    18  )
    19  
    20  const (
    21  	v1 = 1
    22  	v2 = 2
    23  )
    24  
    25  func scanParquetFile(f *os.File) error {
    26  	s, err := f.Stat()
    27  	if err != nil {
    28  		return err
    29  	}
    30  
    31  	p, err := parquet.OpenFile(f, s.Size())
    32  	if err != nil {
    33  		return err
    34  	}
    35  
    36  	return scanParquetValues(p.Root())
    37  }
    38  
    39  func scanParquetValues(col *parquet.Column) error {
    40  	return forEachColumnValue(col, func(leaf *parquet.Column, value parquet.Value) error {
    41  		fmt.Printf("%s > %+v\n", strings.Join(leaf.Path(), "."), value)
    42  		return nil
    43  	})
    44  }
    45  
    46  func generateParquetFile(rows rows, options ...parquet.WriterOption) ([]byte, error) {
    47  	tmp, err := os.CreateTemp("/tmp", "*.parquet")
    48  	if err != nil {
    49  		return nil, err
    50  	}
    51  	defer tmp.Close()
    52  	path := tmp.Name()
    53  	defer os.Remove(path)
    54  	// fmt.Println(path)
    55  
    56  	writerOptions := []parquet.WriterOption{parquet.PageBufferSize(20)}
    57  	writerOptions = append(writerOptions, options...)
    58  
    59  	if err := writeParquetFile(tmp, rows, writerOptions...); err != nil {
    60  		return nil, err
    61  	}
    62  
    63  	if err := scanParquetFile(tmp); err != nil {
    64  		return nil, err
    65  	}
    66  
    67  	return parquetTools("dump", path)
    68  }
    69  
    70  type firstAndLastName struct {
    71  	FirstName string `parquet:"first_name,dict,zstd"`
    72  	LastName  string `parquet:"last_name,delta,zstd"`
    73  }
    74  
    75  type timeseries struct {
    76  	Name      string  `parquet:"name,dict"`
    77  	Timestamp int64   `parquet:"timestamp,delta"`
    78  	Value     float64 `parquet:"value"`
    79  }
    80  
    81  type event struct {
    82  	Name     string  `parquet:"name,dict"`
    83  	Type     string  `parquet:"-"`
    84  	Value    float64 `parquet:"value"`
    85  	Category string  `parquet:"-"`
    86  }
    87  
// writerTests is the table driving TestWriter. Each entry describes the rows
// to write, the writer configuration, and the exact text that
// `parquet-tools dump` is expected to print for the resulting file.
var writerTests = []struct {
	// scenario is the sub-test name passed to t.Run.
	scenario string
	// version is the data page version given to parquet.DataPageVersion.
	version int
	// codec is given to parquet.Compression; entries that do not set it pass
	// the zero value.
	codec compress.Codec
	// rows are the values converted with makeRows and written to the file.
	rows []interface{}
	// dump is the expected parquet-tools output, compared byte for byte after
	// parquetTools has stripped trailing spaces from every line.
	dump string
}{
	{
		scenario: "page v1 with dictionary encoding",
		version:  v1,
		rows: []interface{}{
			&firstAndLastName{FirstName: "Han", LastName: "Solo"},
			&firstAndLastName{FirstName: "Leia", LastName: "Skywalker"},
			&firstAndLastName{FirstName: "Luke", LastName: "Skywalker"},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
first_name:  BINARY ZSTD DO:4 FPO:55 SZ:90/72/0.80 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[min: Han, max: Luke, num_nulls not defined]
last_name:   BINARY ZSTD DO:0 FPO:94 SZ:127/121/0.95 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined]

    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:7 VC:3

    last_name TV=3 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:56 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1

BINARY first_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Han
value 2: R:0 D:0 V:Leia
value 3: R:0 D:0 V:Luke

BINARY last_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Solo
value 2: R:0 D:0 V:Skywalker
value 3: R:0 D:0 V:Skywalker
`,
	},

	{ // same as the previous test but uses page v2 where data pages aren't compressed
		scenario: "page v2 with dictionary encoding",
		version:  v2,
		rows: []interface{}{
			&firstAndLastName{FirstName: "Han", LastName: "Solo"},
			&firstAndLastName{FirstName: "Leia", LastName: "Skywalker"},
			&firstAndLastName{FirstName: "Luke", LastName: "Skywalker"},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
first_name:  BINARY ZSTD DO:4 FPO:55 SZ:86/77/0.90 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[min: Han, max: Luke, num_nulls not defined]
last_name:   BINARY ZSTD DO:0 FPO:90 SZ:137/131/0.96 VC:3 ENC:DELTA_BYTE_ARRAY ST:[min: Skywalker, max: Solo, num_nulls not defined]

    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:7 VC:3

    last_name TV=3 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1

BINARY first_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Han
value 2: R:0 D:0 V:Leia
value 3: R:0 D:0 V:Luke

BINARY last_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Solo
value 2: R:0 D:0 V:Skywalker
value 3: R:0 D:0 V:Skywalker
`,
	},

	// Ten data points of one series, written with an explicit gzip codec.
	{
		scenario: "timeseries with delta encoding",
		version:  v2,
		codec:    &parquet.Gzip,
		rows: []interface{}{
			timeseries{Name: "http_request_total", Timestamp: 1639444033, Value: 100},
			timeseries{Name: "http_request_total", Timestamp: 1639444058, Value: 0},
			timeseries{Name: "http_request_total", Timestamp: 1639444085, Value: 42},
			timeseries{Name: "http_request_total", Timestamp: 1639444093, Value: 1},
			timeseries{Name: "http_request_total", Timestamp: 1639444101, Value: 2},
			timeseries{Name: "http_request_total", Timestamp: 1639444108, Value: 5},
			timeseries{Name: "http_request_total", Timestamp: 1639444133, Value: 4},
			timeseries{Name: "http_request_total", Timestamp: 1639444137, Value: 5},
			timeseries{Name: "http_request_total", Timestamp: 1639444141, Value: 6},
			timeseries{Name: "http_request_total", Timestamp: 1639444144, Value: 10},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
name:       BINARY GZIP DO:4 FPO:70 SZ:126/101/0.80 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[min: http_request_total, max: http_request_total, num_nulls not defined]
timestamp:  INT64 GZIP DO:0 FPO:130 SZ:299/550/1.84 VC:10 ENC:DELTA_BINARY_PACKED ST:[min: 1639444033, max: 1639444144, num_nulls not defined]
value:      DOUBLE GZIP DO:0 FPO:429 SZ:292/192/0.66 VC:10 ENC:PLAIN ST:[min: -0.0, max: 100.0, num_nulls not defined]

    name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5
    page 1:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:5

    timestamp TV=10 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 1:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 2:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 3:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:9 VC:1

    value TV=10 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 1:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 2:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 3:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:8 VC:1

BINARY name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:http_request_total
value 2:  R:0 D:0 V:http_request_total
value 3:  R:0 D:0 V:http_request_total
value 4:  R:0 D:0 V:http_request_total
value 5:  R:0 D:0 V:http_request_total
value 6:  R:0 D:0 V:http_request_total
value 7:  R:0 D:0 V:http_request_total
value 8:  R:0 D:0 V:http_request_total
value 9:  R:0 D:0 V:http_request_total
value 10: R:0 D:0 V:http_request_total

INT64 timestamp
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:1639444033
value 2:  R:0 D:0 V:1639444058
value 3:  R:0 D:0 V:1639444085
value 4:  R:0 D:0 V:1639444093
value 5:  R:0 D:0 V:1639444101
value 6:  R:0 D:0 V:1639444108
value 7:  R:0 D:0 V:1639444133
value 8:  R:0 D:0 V:1639444137
value 9:  R:0 D:0 V:1639444141
value 10: R:0 D:0 V:1639444144

DOUBLE value
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:100.0
value 2:  R:0 D:0 V:0.0
value 3:  R:0 D:0 V:42.0
value 4:  R:0 D:0 V:1.0
value 5:  R:0 D:0 V:2.0
value 6:  R:0 D:0 V:5.0
value 7:  R:0 D:0 V:4.0
value 8:  R:0 D:0 V:5.0
value 9:  R:0 D:0 V:6.0
value 10: R:0 D:0 V:10.0
`,
	},

	// Two address books, the second one without phone numbers or contacts, so
	// the expected dump contains <null> values for the repeated columns.
	{
		scenario: "example from the twitter blog (v1)",
		version:  v1,
		rows: []interface{}{
			AddressBook{
				Owner: "Julien Le Dem",
				OwnerPhoneNumbers: []string{
					"555 123 4567",
					"555 666 1337",
				},
				Contacts: []Contact{
					{
						Name:        "Dmitriy Ryaboy",
						PhoneNumber: "555 987 6543",
					},
					{
						Name: "Chris Aniszczyk",
					},
				},
			},
			AddressBook{
				Owner:             "A. Nonymous",
				OwnerPhoneNumbers: nil,
			},
		},

		dump: `row group 0
--------------------------------------------------------------------------------
owner:              BINARY ZSTD DO:0 FPO:4 SZ:81/73/0.90 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined]
ownerPhoneNumbers:  BINARY GZIP DO:0 FPO:85 SZ:179/129/0.72 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1]
contacts:
.name:              BINARY UNCOMPRESSED DO:0 FPO:264 SZ:138/138/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1]
.phoneNumber:       BINARY ZSTD DO:0 FPO:402 SZ:113/95/0.84 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2]

    owner TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:50 VC:2

    ownerPhoneNumbers TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:64 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1

    contacts.name TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:73 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1

    contacts.phoneNumber TV=3 RL=1 DL=2
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1

BINARY owner
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:Julien Le Dem
value 2: R:0 D:0 V:A. Nonymous

BINARY ownerPhoneNumbers
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:555 123 4567
value 2: R:1 D:1 V:555 666 1337
value 3: R:0 D:0 V:<null>

BINARY contacts.name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:Dmitriy Ryaboy
value 2: R:1 D:1 V:Chris Aniszczyk
value 3: R:0 D:0 V:<null>

BINARY contacts.phoneNumber
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:2 V:555 987 6543
value 2: R:1 D:1 V:<null>
value 3: R:0 D:0 V:<null>
`,
	},

	// Same rows as the v1 address book scenario, written with data page v2.
	{
		scenario: "example from the twitter blog (v2)",
		version:  v2,
		rows: []interface{}{
			AddressBook{
				Owner: "Julien Le Dem",
				OwnerPhoneNumbers: []string{
					"555 123 4567",
					"555 666 1337",
				},
				Contacts: []Contact{
					{
						Name:        "Dmitriy Ryaboy",
						PhoneNumber: "555 987 6543",
					},
					{
						Name: "Chris Aniszczyk",
					},
				},
			},
			AddressBook{
				Owner:             "A. Nonymous",
				OwnerPhoneNumbers: nil,
			},
		},

		dump: `row group 0
--------------------------------------------------------------------------------
owner:              BINARY ZSTD DO:0 FPO:4 SZ:86/78/0.91 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[min: A. Nonymous, max: Julien Le Dem, num_nulls not defined]
ownerPhoneNumbers:  BINARY GZIP DO:0 FPO:90 SZ:172/122/0.71 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 123 4567, max: 555 666 1337, num_nulls: 1]
contacts:
.name:              BINARY UNCOMPRESSED DO:0 FPO:262 SZ:132/132/1.00 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: Chris Aniszczyk, max: Dmitriy Ryaboy, num_nulls: 1]
.phoneNumber:       BINARY ZSTD DO:0 FPO:394 SZ:108/90/0.83 VC:3 ENC:RLE,DELTA_LENGTH_BYTE_ARRAY ST:[min: 555 987 6543, max: 555 987 6543, num_nulls: 2]

    owner TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:50 VC:2

    ownerPhoneNumbers TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

    contacts.name TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:65 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

    contacts.phoneNumber TV=3 RL=1 DL=2
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

BINARY owner
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:Julien Le Dem
value 2: R:0 D:0 V:A. Nonymous

BINARY ownerPhoneNumbers
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:555 123 4567
value 2: R:1 D:1 V:555 666 1337
value 3: R:0 D:0 V:<null>

BINARY contacts.name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:Dmitriy Ryaboy
value 2: R:1 D:1 V:Chris Aniszczyk
value 3: R:0 D:0 V:<null>

BINARY contacts.phoneNumber
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:2 V:555 987 6543
value 2: R:1 D:1 V:<null>
value 3: R:0 D:0 V:<null>
`,
	},

	// The Type and Category fields of event are tagged "-"; the expected dump
	// lists only the name and value columns.
	{
		scenario: "omit `-` fields",
		version:  v1,
		rows: []interface{}{
			&event{Name: "customer1", Type: "request", Value: 42.0},
			&event{Name: "customer2", Type: "access", Value: 1.0},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
name:   BINARY UNCOMPRESSED DO:4 FPO:49 SZ:73/73/1.00 VC:2 ENC:RLE_DICTIONARY,PLAIN ST:[min: customer1, max: customer2, num_nulls not defined]
value:  DOUBLE UNCOMPRESSED DO:0 FPO:77 SZ:39/39/1.00 VC:2 ENC:PLAIN ST:[min: 1.0, max: 42.0, num_nulls not defined]

    name TV=2 RL=0 DL=0 DS: 2 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                  DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[verified] SZ:5 VC:2

    value TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                  DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] CRC:[verified] SZ:16 VC:2

BINARY name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:customer1
value 2: R:0 D:0 V:customer2

DOUBLE value
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:42.0
value 2: R:0 D:0 V:1.0
`,
	},
}
   454  
   455  func TestWriter(t *testing.T) {
   456  	if !hasParquetTools() {
   457  		t.Skip("Skipping TestWriter writerTests because parquet-tools are not installed in Github CI. FIXME.") // TODO
   458  	}
   459  
   460  	for _, test := range writerTests {
   461  		dataPageVersion := test.version
   462  		codec := test.codec
   463  		rows := test.rows
   464  		dump := test.dump
   465  
   466  		t.Run(test.scenario, func(t *testing.T) {
   467  			t.Parallel()
   468  
   469  			b, err := generateParquetFile(makeRows(rows),
   470  				parquet.DataPageVersion(dataPageVersion),
   471  				parquet.Compression(codec),
   472  			)
   473  			if err != nil {
   474  				t.Logf("\n%s", string(b))
   475  				t.Fatal(err)
   476  			}
   477  
   478  			if string(b) != dump {
   479  				edits := myers.ComputeEdits(span.URIFromPath("want.txt"), dump, string(b))
   480  				diff := fmt.Sprint(gotextdiff.ToUnified("want.txt", "got.txt", dump, edits))
   481  				t.Errorf("\n%s", diff)
   482  			}
   483  		})
   484  	}
   485  }
   486  
   487  func hasParquetTools() bool {
   488  	_, err := exec.LookPath("parquet-tools")
   489  	return err == nil
   490  }
   491  
   492  func parquetTools(cmd, path string) ([]byte, error) {
   493  	p := exec.Command("parquet-tools", cmd, "--debug", "--disable-crop", path)
   494  
   495  	output, err := p.CombinedOutput()
   496  	if err != nil {
   497  		return output, err
   498  	}
   499  
   500  	// parquet-tools has trailing spaces on some lines
   501  	lines := bytes.Split(output, []byte("\n"))
   502  
   503  	for i, line := range lines {
   504  		lines[i] = bytes.TrimRight(line, " ")
   505  	}
   506  
   507  	return bytes.Join(lines, []byte("\n")), nil
   508  }
   509  
   510  func TestWriterGenerateBloomFilters(t *testing.T) {
   511  	type Person struct {
   512  		FirstName utf8string `parquet:"first_name"`
   513  		LastName  utf8string `parquet:"last_name"`
   514  	}
   515  
   516  	err := quickCheck(func(rows []Person) bool {
   517  		if len(rows) == 0 { // TODO: support writing files with no rows
   518  			return true
   519  		}
   520  
   521  		buffer := new(bytes.Buffer)
   522  		writer := parquet.NewWriter(buffer,
   523  			parquet.BloomFilters(
   524  				parquet.SplitBlockFilter(10, "last_name"),
   525  			),
   526  		)
   527  		for i := range rows {
   528  			if err := writer.Write(&rows[i]); err != nil {
   529  				t.Error(err)
   530  				return false
   531  			}
   532  		}
   533  		if err := writer.Close(); err != nil {
   534  			t.Error(err)
   535  			return false
   536  		}
   537  
   538  		reader := bytes.NewReader(buffer.Bytes())
   539  		f, err := parquet.OpenFile(reader, reader.Size())
   540  		if err != nil {
   541  			t.Error(err)
   542  			return false
   543  		}
   544  		rowGroup := f.RowGroups()[0]
   545  		columns := rowGroup.ColumnChunks()
   546  		firstName := columns[0]
   547  		lastName := columns[1]
   548  
   549  		if firstName.BloomFilter() != nil {
   550  			t.Errorf(`"first_name" column has a bloom filter even though none were configured`)
   551  			return false
   552  		}
   553  
   554  		bloomFilter := lastName.BloomFilter()
   555  		if bloomFilter == nil {
   556  			t.Error(`"last_name" column has no bloom filter despite being configured to have one`)
   557  			return false
   558  		}
   559  
   560  		for i, row := range rows {
   561  			if ok, err := bloomFilter.Check(parquet.ValueOf(row.LastName)); err != nil {
   562  				t.Errorf("unexpected error checking bloom filter: %v", err)
   563  				return false
   564  			} else if !ok {
   565  				t.Errorf("bloom filter does not contain value %q of row %d", row.LastName, i)
   566  				return false
   567  			}
   568  		}
   569  
   570  		return true
   571  	})
   572  	if err != nil {
   573  		t.Error(err)
   574  	}
   575  }
   576  
   577  func TestBloomFilterForDict(t *testing.T) {
   578  	type testStruct struct {
   579  		A string `parquet:"a,dict"`
   580  	}
   581  
   582  	schema := parquet.SchemaOf(&testStruct{})
   583  
   584  	b := bytes.NewBuffer(nil)
   585  	w := parquet.NewWriter(
   586  		b,
   587  		schema,
   588  		parquet.BloomFilters(parquet.SplitBlockFilter(10, "a")),
   589  	)
   590  
   591  	err := w.Write(&testStruct{A: "test"})
   592  	if err != nil {
   593  		t.Fatal(err)
   594  	}
   595  
   596  	err = w.Close()
   597  	if err != nil {
   598  		t.Fatal(err)
   599  	}
   600  
   601  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   602  	if err != nil {
   603  		t.Fatal(err)
   604  	}
   605  
   606  	ok, err := f.RowGroups()[0].ColumnChunks()[0].BloomFilter().Check(parquet.ValueOf("test"))
   607  	if err != nil {
   608  		t.Fatal(err)
   609  	}
   610  	if !ok {
   611  		t.Error("bloom filter should have contained 'test'")
   612  	}
   613  }
   614  
   615  func TestWriterRepeatedUUIDDict(t *testing.T) {
   616  	inputID := uuid.MustParse("123456ab-0000-0000-0000-000000000000")
   617  	records := []struct {
   618  		List []uuid.UUID `parquet:"list,dict"`
   619  	}{{
   620  		[]uuid.UUID{inputID},
   621  	}}
   622  	schema := parquet.SchemaOf(&records[0])
   623  	b := bytes.NewBuffer(nil)
   624  	w := parquet.NewWriter(b, schema)
   625  	if err := w.Write(records[0]); err != nil {
   626  		t.Fatal(err)
   627  	}
   628  	if err := w.Close(); err != nil {
   629  		t.Fatal(err)
   630  	}
   631  
   632  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   633  	if err != nil {
   634  		t.Fatal(err)
   635  	}
   636  
   637  	rowbuf := make([]parquet.Row, 1)
   638  	rows := f.RowGroups()[0].Rows()
   639  	defer rows.Close()
   640  	n, err := rows.ReadRows(rowbuf)
   641  	if n == 0 {
   642  		t.Fatalf("reading row from parquet file: %v", err)
   643  	}
   644  	if len(rowbuf[0]) != 1 {
   645  		t.Errorf("expected 1 value in row, got %d", len(rowbuf[0]))
   646  	}
   647  	if !bytes.Equal(inputID[:], rowbuf[0][0].Bytes()) {
   648  		t.Errorf("expected to get UUID %q back out, got %q", inputID, rowbuf[0][0].Bytes())
   649  	}
   650  }
   651  
   652  func TestWriterResetWithBloomFilters(t *testing.T) {
   653  	type Test struct {
   654  		Value string `parquet:"value,dict"`
   655  	}
   656  
   657  	writer := parquet.NewWriter(new(bytes.Buffer),
   658  		parquet.BloomFilters(
   659  			parquet.SplitBlockFilter(10, "value"),
   660  		),
   661  	)
   662  
   663  	if err := writer.Write(&Test{Value: "foo"}); err != nil {
   664  		t.Fatal(err)
   665  	}
   666  
   667  	if err := writer.Close(); err != nil {
   668  		t.Fatal(err)
   669  	}
   670  
   671  	writer.Reset(new(bytes.Buffer))
   672  
   673  	if err := writer.Write(&Test{Value: "bar"}); err != nil {
   674  		t.Fatal(err)
   675  	}
   676  
   677  	if err := writer.Close(); err != nil {
   678  		t.Fatal(err)
   679  	}
   680  }
   681  
   682  func TestWriterMaxRowsPerRowGroup(t *testing.T) {
   683  	output := new(bytes.Buffer)
   684  	writer := parquet.NewWriter(output, parquet.MaxRowsPerRowGroup(10))
   685  
   686  	for i := 0; i < 100; i++ {
   687  		err := writer.Write(struct{ FirstName, LastName string }{
   688  			FirstName: "0123456789"[i%10 : i%10+1],
   689  			LastName:  "foo",
   690  		})
   691  		if err != nil {
   692  			t.Fatal(err)
   693  		}
   694  	}
   695  
   696  	if err := writer.Close(); err != nil {
   697  		t.Fatal(err)
   698  	}
   699  
   700  	f, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len()))
   701  	if err != nil {
   702  		t.Fatal(err)
   703  	}
   704  
   705  	rowGroups := f.RowGroups()
   706  	if len(rowGroups) != 10 {
   707  		t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups))
   708  	}
   709  }
   710  
   711  func TestSetKeyValueMetadata(t *testing.T) {
   712  	testKey := "test-key"
   713  	testValue := "test-value"
   714  
   715  	type testStruct struct {
   716  		A string `parquet:"a,dict"`
   717  	}
   718  
   719  	schema := parquet.SchemaOf(&testStruct{})
   720  
   721  	b := bytes.NewBuffer(nil)
   722  	w := parquet.NewWriter(
   723  		b,
   724  		schema,
   725  	)
   726  
   727  	err := w.Write(&testStruct{A: "test"})
   728  	if err != nil {
   729  		t.Fatal(err)
   730  	}
   731  
   732  	w.SetKeyValueMetadata(testKey, testValue)
   733  
   734  	err = w.Close()
   735  	if err != nil {
   736  		t.Fatal(err)
   737  	}
   738  
   739  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   740  	if err != nil {
   741  		t.Fatal(err)
   742  	}
   743  
   744  	value, ok := f.Lookup(testKey)
   745  	if !ok {
   746  		t.Fatalf("key/value metadata should have included %q", testKey)
   747  	}
   748  	if value != testValue {
   749  		t.Errorf("expected %q, got %q", testValue, value)
   750  	}
   751  }
   752  
   753  func TestSetKeyValueMetadataOverwritesExisting(t *testing.T) {
   754  	testKey := "test-key"
   755  	testValue := "test-value"
   756  
   757  	type testStruct struct {
   758  		A string `parquet:"a,dict"`
   759  	}
   760  
   761  	schema := parquet.SchemaOf(&testStruct{})
   762  
   763  	b := bytes.NewBuffer(nil)
   764  	w := parquet.NewWriter(
   765  		b,
   766  		schema,
   767  		parquet.KeyValueMetadata(testKey, "original-value"),
   768  	)
   769  
   770  	err := w.Write(&testStruct{A: "test"})
   771  	if err != nil {
   772  		t.Fatal(err)
   773  	}
   774  
   775  	w.SetKeyValueMetadata(testKey, testValue)
   776  
   777  	err = w.Close()
   778  	if err != nil {
   779  		t.Fatal(err)
   780  	}
   781  
   782  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   783  	if err != nil {
   784  		t.Fatal(err)
   785  	}
   786  
   787  	value, ok := f.Lookup(testKey)
   788  	if !ok {
   789  		t.Fatalf("key/value metadata should have included %q", testKey)
   790  	}
   791  	if value != testValue {
   792  		t.Errorf("expected %q, got %q", testValue, value)
   793  	}
   794  }