github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/reader_go18_test.go (about) 1 //go:build go1.18 2 3 package parquet_test 4 5 import ( 6 "bytes" 7 "errors" 8 "fmt" 9 "io" 10 "math/rand" 11 "os" 12 "reflect" 13 "testing" 14 15 "github.com/segmentio/parquet-go" 16 ) 17 18 func TestGenericReader(t *testing.T) { 19 testGenericReader[booleanColumn](t) 20 testGenericReader[int32Column](t) 21 testGenericReader[int64Column](t) 22 testGenericReader[int96Column](t) 23 testGenericReader[floatColumn](t) 24 testGenericReader[doubleColumn](t) 25 testGenericReader[byteArrayColumn](t) 26 testGenericReader[fixedLenByteArrayColumn](t) 27 testGenericReader[stringColumn](t) 28 testGenericReader[indexedStringColumn](t) 29 testGenericReader[uuidColumn](t) 30 testGenericReader[timeColumn](t) 31 testGenericReader[timeInMillisColumn](t) 32 testGenericReader[mapColumn](t) 33 testGenericReader[decimalColumn](t) 34 testGenericReader[addressBook](t) 35 testGenericReader[contact](t) 36 testGenericReader[listColumn2](t) 37 testGenericReader[listColumn1](t) 38 testGenericReader[listColumn0](t) 39 testGenericReader[nestedListColumn1](t) 40 testGenericReader[nestedListColumn](t) 41 testGenericReader[*contact](t) 42 testGenericReader[paddedBooleanColumn](t) 43 testGenericReader[optionalInt32Column](t) 44 testGenericReader[repeatedInt32Column](t) 45 } 46 47 func testGenericReader[Row any](t *testing.T) { 48 var model Row 49 t.Run(reflect.TypeOf(model).Name(), func(t *testing.T) { 50 err := quickCheck(func(rows []Row) bool { 51 if len(rows) == 0 { 52 return true // TODO: fix support for parquet files with zero rows 53 } 54 if err := testGenericReaderRows(rows); err != nil { 55 t.Error(err) 56 return false 57 } 58 return true 59 }) 60 if err != nil { 61 t.Error(err) 62 } 63 }) 64 } 65 66 func testGenericReaderRows[Row any](rows []Row) error { 67 setNullPointers(rows) 68 buffer := new(bytes.Buffer) 69 writer := parquet.NewGenericWriter[Row](buffer) 70 _, err := writer.Write(rows) 71 if err != nil { 72 return err 73 } 74 if err := writer.Close(); err != nil { 75 return err 76 } 77 reader := parquet.NewGenericReader[Row](bytes.NewReader(buffer.Bytes())) 78 result := make([]Row, len(rows)) 79 n, err := reader.Read(result) 80 if err != nil && !errors.Is(err, io.EOF) { 81 return err 82 } 83 if n < len(rows) { 84 return fmt.Errorf("not enough values were read: want=%d got=%d", len(rows), n) 85 } 86 if !reflect.DeepEqual(rows, result) { 87 return fmt.Errorf("rows mismatch:\nwant: %+v\ngot: %+v", rows, result) 88 } 89 return nil 90 } 91 92 func TestIssue400(t *testing.T) { 93 type B struct { 94 Name string 95 } 96 type A struct { 97 B []B `parquet:",optional"` 98 } 99 100 b := new(bytes.Buffer) 101 w := parquet.NewGenericWriter[A](b) 102 expect := []A{ 103 { 104 B: []B{ 105 { 106 // 32 bytes random so we can see in the binary parquet if we 107 // actually wrote the value 108 Name: "9e7eb1f0-bbcc-43ec-bfad-a9fac1bb0feb", 109 }, 110 }, 111 }, 112 } 113 _, err := w.Write(expect) 114 if err != nil { 115 t.Fatal(err) 116 } 117 if err = w.Close(); err != nil { 118 t.Fatal(err) 119 } 120 121 r := parquet.NewGenericReader[A](bytes.NewReader(b.Bytes())) 122 values := make([]A, 1) 123 _, err = r.Read(values) 124 if err != nil { 125 t.Fatal(err) 126 } 127 if !reflect.DeepEqual(expect[0], values[0]) { 128 t.Errorf("want %q got %q", values[0], expect[0]) 129 } 130 } 131 132 func TestReadMinPageSize(t *testing.T) { 133 // NOTE: min page size is 307 for MyRow schema 134 t.Run("test read less than min page size", func(t *testing.T) { testReadMinPageSize(128, t) }) 135 t.Run("test read equal to min page size", func(t *testing.T) { testReadMinPageSize(307, t) }) 136 t.Run("test read more than min page size", func(t *testing.T) { testReadMinPageSize(384, t) }) 137 // NOTE: num rows is 20,000 138 t.Run("test read equal to num rows", func(t *testing.T) { testReadMinPageSize(20_000, t) }) 139 t.Run("test read more than num rows", func(t *testing.T) { testReadMinPageSize(25_000, t) }) 140 } 141 142 func testReadMinPageSize(readSize int, t *testing.T) { 143 type MyRow struct { 144 ID [16]byte `parquet:"id,delta,uuid"` 145 File string `parquet:"file,dict,zstd"` 146 Index int64 `parquet:"index,delta,zstd"` 147 } 148 149 numRows := 20_000 150 maxPageBytes := 5000 151 152 tmp, err := os.CreateTemp("/tmp", "*.parquet") 153 if err != nil { 154 t.Fatal("os.CreateTemp: ", err) 155 } 156 path := tmp.Name() 157 defer os.Remove(path) 158 t.Log("file:", path) 159 160 // The page buffer size ensures we get multiple pages out of this example. 161 w := parquet.NewGenericWriter[MyRow](tmp, parquet.PageBufferSize(maxPageBytes)) 162 // Need to write 1 row at a time here as writing many at once disregards PageBufferSize option. 163 for i := 0; i < numRows; i++ { 164 row := MyRow{ 165 ID: [16]byte{15: byte(i)}, 166 File: "hi" + fmt.Sprint(i), 167 Index: int64(i), 168 } 169 _, err := w.Write([]MyRow{row}) 170 if err != nil { 171 t.Fatal("w.Write: ", err) 172 } 173 // Flush writes rows as row group. 4 total (20k/5k) in this file. 174 if (i+1)%maxPageBytes == 0 { 175 err = w.Flush() 176 if err != nil { 177 t.Fatal("w.Flush: ", err) 178 } 179 } 180 } 181 err = w.Close() 182 if err != nil { 183 t.Fatal("w.Close: ", err) 184 } 185 err = tmp.Close() 186 if err != nil { 187 t.Fatal("tmp.Close: ", err) 188 } 189 190 file, err := os.Open(path) 191 if err != nil { 192 t.Fatal("os.Open", err) 193 } 194 reader := parquet.NewGenericReader[MyRow](file) 195 read := int64(0) 196 nRows := reader.NumRows() 197 rows := make([]MyRow, 0, nRows) 198 buf := make([]MyRow, readSize) // NOTE: min page size is 307 for MyRow schema 199 200 for read < nRows { 201 num, err := reader.Read(buf) 202 read += int64(num) 203 if err != nil && !errors.Is(err, io.EOF) { 204 t.Fatal("Read:", err) 205 } 206 rows = append(rows, buf...) 207 } 208 209 if err := reader.Close(); err != nil { 210 t.Fatal("Close", err) 211 } 212 213 if len(rows) < numRows { 214 t.Fatalf("not enough values were read: want=%d got=%d", len(rows), numRows) 215 } 216 for i, row := range rows[:numRows] { 217 id := [16]byte{15: byte(i)} 218 file := "hi" + fmt.Sprint(i) 219 index := int64(i) 220 221 if row.ID != id || row.File != file || row.Index != index { 222 t.Fatalf("rows mismatch at index: %d got: %+v", i, row) 223 } 224 } 225 } 226 227 func BenchmarkGenericReader(b *testing.B) { 228 benchmarkGenericReader[benchmarkRowType](b) 229 benchmarkGenericReader[booleanColumn](b) 230 benchmarkGenericReader[int32Column](b) 231 benchmarkGenericReader[int64Column](b) 232 benchmarkGenericReader[floatColumn](b) 233 benchmarkGenericReader[doubleColumn](b) 234 benchmarkGenericReader[byteArrayColumn](b) 235 benchmarkGenericReader[fixedLenByteArrayColumn](b) 236 benchmarkGenericReader[stringColumn](b) 237 benchmarkGenericReader[indexedStringColumn](b) 238 benchmarkGenericReader[uuidColumn](b) 239 benchmarkGenericReader[timeColumn](b) 240 benchmarkGenericReader[timeInMillisColumn](b) 241 benchmarkGenericReader[mapColumn](b) 242 benchmarkGenericReader[decimalColumn](b) 243 benchmarkGenericReader[contact](b) 244 benchmarkGenericReader[paddedBooleanColumn](b) 245 benchmarkGenericReader[optionalInt32Column](b) 246 } 247 248 func benchmarkGenericReader[Row generator[Row]](b *testing.B) { 249 var model Row 250 b.Run(reflect.TypeOf(model).Name(), func(b *testing.B) { 251 prng := rand.New(rand.NewSource(0)) 252 rows := make([]Row, benchmarkNumRows) 253 for i := range rows { 254 rows[i] = rows[i].generate(prng) 255 } 256 257 rowbuf := make([]Row, benchmarkRowsPerStep) 258 buffer := parquet.NewGenericBuffer[Row]() 259 buffer.Write(rows) 260 261 b.Run("go1.17", func(b *testing.B) { 262 reader := parquet.NewRowGroupReader(buffer) 263 benchmarkRowsPerSecond(b, func() int { 264 for i := range rowbuf { 265 if err := reader.Read(&rowbuf[i]); err != nil { 266 if err != io.EOF { 267 b.Fatal(err) 268 } else { 269 reader.Reset() 270 } 271 } 272 } 273 return len(rowbuf) 274 }) 275 }) 276 277 b.Run("go1.18", func(b *testing.B) { 278 reader := parquet.NewGenericRowGroupReader[Row](buffer) 279 benchmarkRowsPerSecond(b, func() int { 280 n, err := reader.Read(rowbuf) 281 if err != nil { 282 if err != io.EOF { 283 b.Fatal(err) 284 } else { 285 reader.Reset() 286 } 287 } 288 return n 289 }) 290 }) 291 }) 292 }