github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/writer_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  	"strings"
     9  	"testing"
    10  
    11  	"github.com/google/uuid"
    12  	"github.com/hexops/gotextdiff"
    13  	"github.com/hexops/gotextdiff/myers"
    14  	"github.com/hexops/gotextdiff/span"
    15  	"github.com/vc42/parquet-go"
    16  	"github.com/vc42/parquet-go/compress"
    17  )
    18  
    19  const (
    20  	v1 = 1
    21  	v2 = 2
    22  )
    23  
    24  func scanParquetFile(f *os.File) error {
    25  	s, err := f.Stat()
    26  	if err != nil {
    27  		return err
    28  	}
    29  
    30  	p, err := parquet.OpenFile(f, s.Size())
    31  	if err != nil {
    32  		return err
    33  	}
    34  
    35  	return scanParquetValues(p.Root())
    36  }
    37  
    38  func scanParquetValues(col *parquet.Column) error {
    39  	return forEachColumnValue(col, func(leaf *parquet.Column, value parquet.Value) error {
    40  		fmt.Printf("%s > %+v\n", strings.Join(leaf.Path(), "."), value)
    41  		return nil
    42  	})
    43  }
    44  
    45  func generateParquetFile(rows rows, options ...parquet.WriterOption) ([]byte, error) {
    46  	tmp, err := os.CreateTemp("/tmp", "*.parquet")
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  	defer tmp.Close()
    51  	path := tmp.Name()
    52  	defer os.Remove(path)
    53  	//fmt.Println(path)
    54  
    55  	writerOptions := []parquet.WriterOption{parquet.PageBufferSize(20)}
    56  	writerOptions = append(writerOptions, options...)
    57  
    58  	if err := writeParquetFile(tmp, rows, writerOptions...); err != nil {
    59  		return nil, err
    60  	}
    61  
    62  	if err := scanParquetFile(tmp); err != nil {
    63  		return nil, err
    64  	}
    65  
    66  	return parquetTools("dump", path)
    67  }
    68  
    69  type firstAndLastName struct {
    70  	FirstName string `parquet:"first_name,dict,zstd"`
    71  	LastName  string `parquet:"last_name,delta,zstd"`
    72  }
    73  
    74  type timeseries struct {
    75  	Name      string  `parquet:"name,dict"`
    76  	Timestamp int64   `parquet:"timestamp,delta"`
    77  	Value     float64 `parquet:"value"`
    78  }
    79  
    80  type event struct {
    81  	Name     string  `parquet:"name,dict"`
    82  	Type     string  `parquet:"-"`
    83  	Value    float64 `parquet:"value"`
    84  	Category string  `parquet:"-"`
    85  }
    86  
// writerTests is the table driving TestWriter. Each entry describes the rows
// to write, the writer configuration, and the exact text that
// `parquet-tools dump` is expected to print for the resulting file. The dumps
// are compared byte for byte (after trailing spaces are trimmed by
// parquetTools), so the raw strings below must not be reformatted.
var writerTests = []struct {
	// scenario is the subtest name passed to t.Run.
	scenario string
	// version is the data page version (v1 or v2) passed to
	// parquet.DataPageVersion.
	version  int
	// codec is passed to parquet.Compression; it is left nil by most entries.
	codec    compress.Codec
	// rows are the values written to the file, converted with makeRows.
	rows     []interface{}
	// dump is the expected output of `parquet-tools dump` for the file.
	dump     string
}{
	// Dictionary- and delta-encoded string columns written with v1 data pages.
	{
		scenario: "page v1 with dictionary encoding",
		version:  v1,
		rows: []interface{}{
			&firstAndLastName{FirstName: "Han", LastName: "Solo"},
			&firstAndLastName{FirstName: "Leia", LastName: "Skywalker"},
			&firstAndLastName{FirstName: "Luke", LastName: "Skywalker"},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
first_name:  BINARY ZSTD DO:4 FPO:55 SZ:123/96/0.78 VC:3 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column]
last_name:   BINARY ZSTD DO:0 FPO:127 SZ:127/121/0.95 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column]

    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:5 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:3 VC:1

    last_name TV=3 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:56 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:19 VC:1

BINARY first_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Han
value 2: R:0 D:0 V:Leia
value 3: R:0 D:0 V:Luke

BINARY last_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Solo
value 2: R:0 D:0 V:Skywalker
value 3: R:0 D:0 V:Skywalker
`,
	},

	{ // same as the previous test but uses page v2 where data pages aren't compressed
		scenario: "page v2 with dictionary encoding",
		version:  v2,
		rows: []interface{}{
			&firstAndLastName{FirstName: "Han", LastName: "Solo"},
			&firstAndLastName{FirstName: "Leia", LastName: "Skywalker"},
			&firstAndLastName{FirstName: "Luke", LastName: "Skywalker"},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
first_name:  BINARY ZSTD DO:4 FPO:55 SZ:115/106/0.92 VC:3 ENC:RLE_DICTIONARY,PLAIN ST:[no stats for this column]
last_name:   BINARY ZSTD DO:0 FPO:119 SZ:137/131/0.96 VC:3 ENC:DELTA_BYTE_ARRAY ST:[no stats for this column]

    first_name TV=3 RL=0 DL=0 DS: 3 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:5 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:3 VC:1

    last_name TV=3 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2
    page 1:                        DLE:RLE RLE:RLE VLE:DELTA_BYTE_ARRAY ST:[no stats for this column] SZ:19 VC:1

BINARY first_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Han
value 2: R:0 D:0 V:Leia
value 3: R:0 D:0 V:Luke

BINARY last_name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:0 V:Solo
value 2: R:0 D:0 V:Skywalker
value 3: R:0 D:0 V:Skywalker
`,
	},

	// Ten samples of a single metric, written with v2 data pages and an
	// explicit gzip codec passed through the codec field.
	{
		scenario: "timeseries with delta encoding",
		version:  v2,
		codec:    &parquet.Gzip,
		rows: []interface{}{
			timeseries{Name: "http_request_total", Timestamp: 1639444033, Value: 100},
			timeseries{Name: "http_request_total", Timestamp: 1639444058, Value: 0},
			timeseries{Name: "http_request_total", Timestamp: 1639444085, Value: 42},
			timeseries{Name: "http_request_total", Timestamp: 1639444093, Value: 1},
			timeseries{Name: "http_request_total", Timestamp: 1639444101, Value: 2},
			timeseries{Name: "http_request_total", Timestamp: 1639444108, Value: 5},
			timeseries{Name: "http_request_total", Timestamp: 1639444133, Value: 4},
			timeseries{Name: "http_request_total", Timestamp: 1639444137, Value: 5},
			timeseries{Name: "http_request_total", Timestamp: 1639444141, Value: 6},
			timeseries{Name: "http_request_total", Timestamp: 1639444144, Value: 10},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
name:       BINARY GZIP DO:4 FPO:70 SZ:216/191/0.88 VC:10 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column]
timestamp:  INT64 GZIP DO:0 FPO:220 SZ:299/550/1.84 VC:10 ENC:DELTA_BINARY_PACKED ST:[no stats for this column]
value:      DOUBLE GZIP DO:0 FPO:519 SZ:292/192/0.66 VC:10 ENC:PLAIN ST:[no stats for this column]

    name TV=10 RL=0 DL=0 DS: 1 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2
    page 1:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2
    page 2:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2
    page 3:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2
    page 4:                   DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] SZ:2 VC:2

    timestamp TV=10 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 1:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 2:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:142 VC:3
    page 3:                   DLE:RLE RLE:RLE VLE:DELTA_BINARY_PACKED ST:[no stats for this column] SZ:9 VC:1

    value TV=10 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 1:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 2:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:24 VC:3
    page 3:                   DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] SZ:8 VC:1

BINARY name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:http_request_total
value 2:  R:0 D:0 V:http_request_total
value 3:  R:0 D:0 V:http_request_total
value 4:  R:0 D:0 V:http_request_total
value 5:  R:0 D:0 V:http_request_total
value 6:  R:0 D:0 V:http_request_total
value 7:  R:0 D:0 V:http_request_total
value 8:  R:0 D:0 V:http_request_total
value 9:  R:0 D:0 V:http_request_total
value 10: R:0 D:0 V:http_request_total

INT64 timestamp
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:1639444033
value 2:  R:0 D:0 V:1639444058
value 3:  R:0 D:0 V:1639444085
value 4:  R:0 D:0 V:1639444093
value 5:  R:0 D:0 V:1639444101
value 6:  R:0 D:0 V:1639444108
value 7:  R:0 D:0 V:1639444133
value 8:  R:0 D:0 V:1639444137
value 9:  R:0 D:0 V:1639444141
value 10: R:0 D:0 V:1639444144

DOUBLE value
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 10 ***
value 1:  R:0 D:0 V:100.0
value 2:  R:0 D:0 V:0.0
value 3:  R:0 D:0 V:42.0
value 4:  R:0 D:0 V:1.0
value 5:  R:0 D:0 V:2.0
value 6:  R:0 D:0 V:5.0
value 7:  R:0 D:0 V:4.0
value 8:  R:0 D:0 V:5.0
value 9:  R:0 D:0 V:6.0
value 10: R:0 D:0 V:10.0
`,
	},

	// Nested AddressBook rows with repeated and missing values, written with
	// v1 data pages; the dump shows the repetition (R) and definition (D)
	// levels expected for each value.
	{
		scenario: "example from the twitter blog (v1)",
		version:  v1,
		rows: []interface{}{
			AddressBook{
				Owner: "Julien Le Dem",
				OwnerPhoneNumbers: []string{
					"555 123 4567",
					"555 666 1337",
				},
				Contacts: []Contact{
					{
						Name:        "Dmitriy Ryaboy",
						PhoneNumber: "555 987 6543",
					},
					{
						Name: "Chris Aniszczyk",
					},
				},
			},
			AddressBook{
				Owner:             "A. Nonymous",
				OwnerPhoneNumbers: nil,
			},
		},

		dump: `row group 0
--------------------------------------------------------------------------------
owner:              BINARY ZSTD DO:0 FPO:4 SZ:81/73/0.90 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column]
ownerPhoneNumbers:  BINARY GZIP DO:0 FPO:85 SZ:179/129/0.72 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]
contacts:
.name:              BINARY UNCOMPRESSED DO:0 FPO:264 SZ:138/138/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]
.phoneNumber:       BINARY ZSTD DO:0 FPO:402 SZ:113/95/0.84 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]

    owner TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:50 VC:2

    ownerPhoneNumbers TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:64 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1

    contacts.name TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:73 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[verified] SZ:17 VC:1

    contacts.phoneNumber TV=3 RL=1 DL=2
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:33 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] CRC:[PAGE CORRUPT] SZ:17 VC:1

BINARY owner
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:Julien Le Dem
value 2: R:0 D:0 V:A. Nonymous

BINARY ownerPhoneNumbers
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:555 123 4567
value 2: R:1 D:1 V:555 666 1337
value 3: R:0 D:0 V:<null>

BINARY contacts.name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:Dmitriy Ryaboy
value 2: R:1 D:1 V:Chris Aniszczyk
value 3: R:0 D:0 V:<null>

BINARY contacts.phoneNumber
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:2 V:555 987 6543
value 2: R:1 D:1 V:<null>
value 3: R:0 D:0 V:<null>
`,
	},

	// Same AddressBook rows as the previous entry, written with v2 data pages.
	{
		scenario: "example from the twitter blog (v2)",
		version:  v2,
		rows: []interface{}{
			AddressBook{
				Owner: "Julien Le Dem",
				OwnerPhoneNumbers: []string{
					"555 123 4567",
					"555 666 1337",
				},
				Contacts: []Contact{
					{
						Name:        "Dmitriy Ryaboy",
						PhoneNumber: "555 987 6543",
					},
					{
						Name: "Chris Aniszczyk",
					},
				},
			},
			AddressBook{
				Owner:             "A. Nonymous",
				OwnerPhoneNumbers: nil,
			},
		},

		dump: `row group 0
--------------------------------------------------------------------------------
owner:              BINARY ZSTD DO:0 FPO:4 SZ:86/78/0.91 VC:2 ENC:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column]
ownerPhoneNumbers:  BINARY GZIP DO:0 FPO:90 SZ:172/122/0.71 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]
contacts:
.name:              BINARY UNCOMPRESSED DO:0 FPO:262 SZ:132/132/1.00 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]
.phoneNumber:       BINARY ZSTD DO:0 FPO:394 SZ:108/90/0.83 VC:3 ENC:DELTA_LENGTH_BYTE_ARRAY,RLE ST:[no stats for this column]

    owner TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:50 VC:2

    ownerPhoneNumbers TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:56 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

    contacts.name TV=3 RL=1 DL=1
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:65 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

    contacts.phoneNumber TV=3 RL=1 DL=2
    ----------------------------------------------------------------------------
    page 0:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:25 VC:2
    page 1:  DLE:RLE RLE:RLE VLE:DELTA_LENGTH_BYTE_ARRAY ST:[no stats for this column] SZ:9 VC:1

BINARY owner
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:Julien Le Dem
value 2: R:0 D:0 V:A. Nonymous

BINARY ownerPhoneNumbers
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:555 123 4567
value 2: R:1 D:1 V:555 666 1337
value 3: R:0 D:0 V:<null>

BINARY contacts.name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:1 V:Dmitriy Ryaboy
value 2: R:1 D:1 V:Chris Aniszczyk
value 3: R:0 D:0 V:<null>

BINARY contacts.phoneNumber
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 3 ***
value 1: R:0 D:2 V:555 987 6543
value 2: R:1 D:1 V:<null>
value 3: R:0 D:0 V:<null>
`,
	},

	// Struct fields tagged `parquet:"-"` (Type and Category on event) must not
	// produce columns: the expected dump only contains "name" and "value".
	{
		scenario: "omit `-` fields",
		version:  v1,
		rows: []interface{}{
			&event{Name: "customer1", Type: "request", Value: 42.0},
			&event{Name: "customer2", Type: "access", Value: 1.0},
		},
		dump: `row group 0
--------------------------------------------------------------------------------
name:   BINARY UNCOMPRESSED DO:4 FPO:49 SZ:73/73/1.00 VC:2 ENC:PLAIN,RLE_DICTIONARY ST:[no stats for this column]
value:  DOUBLE UNCOMPRESSED DO:0 FPO:77 SZ:39/39/1.00 VC:2 ENC:PLAIN ST:[no stats for this column]

    name TV=2 RL=0 DL=0 DS: 2 DE:PLAIN
    ----------------------------------------------------------------------------
    page 0:                  DLE:RLE RLE:RLE VLE:RLE_DICTIONARY ST:[no stats for this column] CRC:[verified] SZ:5 VC:2

    value TV=2 RL=0 DL=0
    ----------------------------------------------------------------------------
    page 0:                  DLE:RLE RLE:RLE VLE:PLAIN ST:[no stats for this column] CRC:[verified] SZ:16 VC:2

BINARY name
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:customer1
value 2: R:0 D:0 V:customer2

DOUBLE value
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 2 ***
value 1: R:0 D:0 V:42.0
value 2: R:0 D:0 V:1.0
`,
	},
}
   458  
   459  func TestWriter(t *testing.T) {
   460  	if !hasParquetTools() {
   461  		t.Skip("parquet-tools are not installed")
   462  	}
   463  
   464  	for _, test := range writerTests {
   465  		dataPageVersion := test.version
   466  		codec := test.codec
   467  		rows := test.rows
   468  		dump := test.dump
   469  
   470  		t.Run(test.scenario, func(t *testing.T) {
   471  			t.Parallel()
   472  
   473  			b, err := generateParquetFile(makeRows(rows),
   474  				parquet.DataPageVersion(dataPageVersion),
   475  				parquet.Compression(codec),
   476  			)
   477  			if err != nil {
   478  				t.Logf("\n%s", string(b))
   479  				t.Fatal(err)
   480  			}
   481  
   482  			if string(b) != dump {
   483  				edits := myers.ComputeEdits(span.URIFromPath("want.txt"), dump, string(b))
   484  				diff := fmt.Sprint(gotextdiff.ToUnified("want.txt", "got.txt", dump, edits))
   485  				t.Errorf("\n%s", diff)
   486  			}
   487  		})
   488  	}
   489  }
   490  
   491  func hasParquetTools() bool {
   492  	_, err := exec.LookPath("parquet-tools")
   493  	return err == nil
   494  }
   495  
   496  func parquetTools(cmd, path string) ([]byte, error) {
   497  	p := exec.Command("parquet-tools", cmd, "--debug", "--disable-crop", path)
   498  
   499  	output, err := p.CombinedOutput()
   500  	if err != nil {
   501  		return output, err
   502  	}
   503  
   504  	// parquet-tools has trailing spaces on some lines
   505  	lines := bytes.Split(output, []byte("\n"))
   506  
   507  	for i, line := range lines {
   508  		lines[i] = bytes.TrimRight(line, " ")
   509  	}
   510  
   511  	return bytes.Join(lines, []byte("\n")), nil
   512  }
   513  
   514  func TestWriterGenerateBloomFilters(t *testing.T) {
   515  	type Person struct {
   516  		FirstName utf8string `parquet:"first_name"`
   517  		LastName  utf8string `parquet:"last_name"`
   518  	}
   519  
   520  	err := quickCheck(func(rows []Person) bool {
   521  		if len(rows) == 0 { // TODO: support writing files with no rows
   522  			return true
   523  		}
   524  
   525  		buffer := new(bytes.Buffer)
   526  		writer := parquet.NewWriter(buffer,
   527  			parquet.BloomFilters(
   528  				parquet.SplitBlockFilter("last_name"),
   529  			),
   530  		)
   531  		for i := range rows {
   532  			if err := writer.Write(&rows[i]); err != nil {
   533  				t.Error(err)
   534  				return false
   535  			}
   536  		}
   537  		if err := writer.Close(); err != nil {
   538  			t.Error(err)
   539  			return false
   540  		}
   541  
   542  		reader := bytes.NewReader(buffer.Bytes())
   543  		f, err := parquet.OpenFile(reader, reader.Size())
   544  		if err != nil {
   545  			t.Error(err)
   546  			return false
   547  		}
   548  		rowGroup := f.RowGroups()[0]
   549  		columns := rowGroup.ColumnChunks()
   550  		firstName := columns[0]
   551  		lastName := columns[1]
   552  
   553  		if firstName.BloomFilter() != nil {
   554  			t.Errorf(`"first_name" column has a bloom filter even though none were configured`)
   555  			return false
   556  		}
   557  
   558  		bloomFilter := lastName.BloomFilter()
   559  		if bloomFilter == nil {
   560  			t.Error(`"last_name" column has no bloom filter despite being configured to have one`)
   561  			return false
   562  		}
   563  
   564  		for i, row := range rows {
   565  			if ok, err := bloomFilter.Check(parquet.ValueOf(row.LastName)); err != nil {
   566  				t.Errorf("unexpected error checking bloom filter: %v", err)
   567  				return false
   568  			} else if !ok {
   569  				t.Errorf("bloom filter does not contain value %q of row %d", row.LastName, i)
   570  				return false
   571  			}
   572  		}
   573  
   574  		return true
   575  	})
   576  	if err != nil {
   577  		t.Error(err)
   578  	}
   579  }
   580  
   581  func TestBloomFilterForDict(t *testing.T) {
   582  	type testStruct struct {
   583  		A string `parquet:"a,dict"`
   584  	}
   585  
   586  	schema := parquet.SchemaOf(&testStruct{})
   587  
   588  	b := bytes.NewBuffer(nil)
   589  	w := parquet.NewWriter(
   590  		b,
   591  		schema,
   592  		parquet.BloomFilters(parquet.SplitBlockFilter("a")),
   593  	)
   594  
   595  	err := w.Write(&testStruct{A: "test"})
   596  	if err != nil {
   597  		t.Fatal(err)
   598  	}
   599  
   600  	err = w.Close()
   601  	if err != nil {
   602  		t.Fatal(err)
   603  	}
   604  
   605  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   606  	if err != nil {
   607  		t.Fatal(err)
   608  	}
   609  
   610  	ok, err := f.RowGroups()[0].ColumnChunks()[0].BloomFilter().Check(parquet.ValueOf("test"))
   611  	if err != nil {
   612  		t.Fatal(err)
   613  	}
   614  	if !ok {
   615  		t.Error("bloom filter should have contained 'test'")
   616  	}
   617  }
   618  
   619  func TestWriterRepeatedUUIDDict(t *testing.T) {
   620  	inputID := uuid.MustParse("123456ab-0000-0000-0000-000000000000")
   621  	records := []struct {
   622  		List []uuid.UUID `parquet:"list,dict"`
   623  	}{{
   624  		[]uuid.UUID{inputID},
   625  	}}
   626  	schema := parquet.SchemaOf(&records[0])
   627  	b := bytes.NewBuffer(nil)
   628  	w := parquet.NewWriter(b, schema)
   629  	if err := w.Write(records[0]); err != nil {
   630  		t.Fatal(err)
   631  	}
   632  	if err := w.Close(); err != nil {
   633  		t.Fatal(err)
   634  	}
   635  
   636  	f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
   637  	if err != nil {
   638  		t.Fatal(err)
   639  	}
   640  
   641  	rowbuf := make([]parquet.Row, 1)
   642  	rows := f.RowGroups()[0].Rows()
   643  	defer rows.Close()
   644  	n, err := rows.ReadRows(rowbuf)
   645  	if n == 0 {
   646  		t.Fatalf("reading row from parquet file: %v", err)
   647  	}
   648  	if len(rowbuf[0]) != 1 {
   649  		t.Errorf("expected 1 value in row, got %d", len(rowbuf[0]))
   650  	}
   651  	if !bytes.Equal(inputID[:], rowbuf[0][0].Bytes()) {
   652  		t.Errorf("expected to get UUID %q back out, got %q", inputID, rowbuf[0][0].Bytes())
   653  	}
   654  }