github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/row_builder_test.go (about) 1 package parquet_test 2 3 import ( 4 "fmt" 5 "testing" 6 7 "github.com/segmentio/parquet-go" 8 ) 9 10 func ExampleRowBuilder() { 11 builder := parquet.NewRowBuilder(parquet.Group{ 12 "birth_date": parquet.Optional(parquet.Date()), 13 "first_name": parquet.String(), 14 "last_name": parquet.String(), 15 }) 16 17 builder.Add(1, parquet.ByteArrayValue([]byte("Luke"))) 18 builder.Add(2, parquet.ByteArrayValue([]byte("Skywalker"))) 19 20 row := builder.Row() 21 row.Range(func(columnIndex int, columnValues []parquet.Value) bool { 22 fmt.Printf("%+v\n", columnValues[0]) 23 return true 24 }) 25 26 // Output: 27 // C:0 D:0 R:0 V:<null> 28 // C:1 D:0 R:0 V:Luke 29 // C:2 D:0 R:0 V:Skywalker 30 } 31 32 func TestRowBuilder(t *testing.T) { 33 type ( 34 operation = func(*parquet.RowBuilder) 35 operations = []operation 36 ) 37 38 add := func(columnIndex int, columnValue parquet.Value) operation { 39 return func(b *parquet.RowBuilder) { b.Add(columnIndex, columnValue) } 40 } 41 42 next := func(columnIndex int) operation { 43 return func(b *parquet.RowBuilder) { b.Next(columnIndex) } 44 } 45 46 tests := []struct { 47 scenario string 48 operations operations 49 want parquet.Row 50 schema parquet.Node 51 }{ 52 { 53 scenario: "add missing required column value", 54 want: parquet.Row{ 55 parquet.Int64Value(0).Level(0, 0, 0), 56 }, 57 schema: parquet.Group{ 58 "id": parquet.Int(64), 59 }, 60 }, 61 62 { 63 scenario: "set required column value", 64 operations: operations{ 65 add(0, parquet.Int64Value(1)), 66 }, 67 want: parquet.Row{ 68 parquet.Int64Value(1).Level(0, 0, 0), 69 }, 70 schema: parquet.Group{ 71 "id": parquet.Int(64), 72 }, 73 }, 74 75 { 76 scenario: "set repeated column values", 77 operations: operations{ 78 add(0, parquet.Int64Value(1)), 79 add(1, parquet.ByteArrayValue([]byte(`1`))), 80 add(1, parquet.ByteArrayValue([]byte(`2`))), 81 add(1, parquet.ByteArrayValue([]byte(`3`))), 82 }, 83 want: parquet.Row{ 84 parquet.Int64Value(1).Level(0, 0, 0), 85 parquet.ByteArrayValue([]byte(`1`)).Level(0, 1, 1), 86 parquet.ByteArrayValue([]byte(`2`)).Level(1, 1, 1), 87 parquet.ByteArrayValue([]byte(`3`)).Level(1, 1, 1), 88 }, 89 schema: parquet.Group{ 90 "id": parquet.Int(64), 91 "names": parquet.Repeated(parquet.String()), 92 }, 93 }, 94 95 { 96 scenario: "add missing repeated column value", 97 operations: operations{ 98 add(0, parquet.Int64Value(1)), 99 }, 100 want: parquet.Row{ 101 parquet.Int64Value(1).Level(0, 0, 0), 102 parquet.NullValue().Level(0, 0, 1), 103 }, 104 schema: parquet.Group{ 105 "id": parquet.Int(64), 106 "names": parquet.Repeated(parquet.String()), 107 }, 108 }, 109 110 { 111 scenario: "add missing optional column value", 112 operations: operations{ 113 add(0, parquet.Int64Value(1)), 114 }, 115 want: parquet.Row{ 116 parquet.Int64Value(1).Level(0, 0, 0), 117 parquet.NullValue().Level(0, 0, 1), 118 }, 119 schema: parquet.Group{ 120 "id": parquet.Int(64), 121 "name": parquet.Optional(parquet.String()), 122 }, 123 }, 124 125 { 126 scenario: "add missing nested column values", 127 operations: operations{ 128 add(0, parquet.Int64Value(1)), 129 }, 130 want: parquet.Row{ 131 parquet.Int64Value(1).Level(0, 0, 0), 132 parquet.NullValue().Level(0, 0, 1), 133 parquet.ByteArrayValue(nil).Level(0, 0, 2), 134 parquet.ByteArrayValue(nil).Level(0, 0, 3), 135 }, 136 schema: parquet.Group{ 137 "id": parquet.Int(64), 138 "profile": parquet.Group{ 139 "first_name": parquet.String(), 140 "last_name": parquet.String(), 141 "birth_date": parquet.Optional(parquet.Date()), 142 }, 143 }, 144 }, 145 146 { 147 scenario: "add missing repeated column group", 148 operations: operations{ 149 add(0, parquet.Int64Value(1)), 150 add(2, parquet.ByteArrayValue([]byte(`me`))), 151 add(1, parquet.Int32Value(0)), 152 add(1, parquet.Int32Value(123456)), 153 add(2, parquet.ByteArrayValue([]byte(`you`))), 154 }, 155 want: parquet.Row{ 156 parquet.Int64Value(1).Level(0, 0, 0), 157 158 parquet.Int32Value(0).Level(0, 2, 1), 159 parquet.Int32Value(123456).Level(1, 2, 1), 160 161 parquet.ByteArrayValue([]byte(`me`)).Level(0, 1, 2), 162 parquet.ByteArrayValue([]byte(`you`)).Level(1, 1, 2), 163 164 parquet.NullValue().Level(0, 1, 3), 165 parquet.NullValue().Level(1, 1, 3), 166 }, 167 schema: parquet.Group{ 168 "id": parquet.Int(64), 169 "profiles": parquet.Repeated(parquet.Group{ 170 "first_name": parquet.String(), 171 "last_name": parquet.String(), 172 "birth_date": parquet.Optional(parquet.Date()), 173 }), 174 }, 175 }, 176 177 { 178 scenario: "empty map", 179 want: parquet.Row{ 180 parquet.Value{}.Level(0, 0, 0), 181 parquet.Value{}.Level(0, 0, 1), 182 }, 183 schema: parquet.Group{ 184 "map": parquet.Repeated(parquet.Group{ 185 "key_value": parquet.Group{ 186 "key": parquet.String(), 187 "value": parquet.Optional(parquet.String()), 188 }, 189 }), 190 }, 191 }, 192 193 { 194 scenario: "one nested maps", 195 operations: operations{ 196 add(0, parquet.ByteArrayValue([]byte(`A`))), 197 add(1, parquet.ByteArrayValue([]byte(`1`))), 198 add(0, parquet.ByteArrayValue([]byte(`B`))), 199 add(1, parquet.ByteArrayValue([]byte(`2`))), 200 }, 201 want: parquet.Row{ 202 // objects.attributes.key_value.key 203 parquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 0), 204 parquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0), 205 // objects.attributes.key_value.value 206 parquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1), 207 parquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1), 208 }, 209 schema: parquet.Group{ 210 "objects": parquet.Repeated(parquet.Group{ 211 "attributes": parquet.Repeated(parquet.Group{ 212 "key_value": parquet.Group{ 213 "key": parquet.String(), 214 "value": parquet.Optional(parquet.String()), 215 }, 216 }), 217 }), 218 }, 219 }, 220 221 { 222 scenario: "multiple nested maps", 223 operations: operations{ 224 add(0, parquet.ByteArrayValue([]byte(`A`))), 225 add(1, parquet.ByteArrayValue([]byte(`1`))), 226 add(0, parquet.ByteArrayValue([]byte(`B`))), 227 add(1, parquet.ByteArrayValue([]byte(`2`))), 228 next(1), // same as next(0) because the columns are in the same group 229 add(0, parquet.ByteArrayValue([]byte(`C`))), 230 add(1, parquet.ByteArrayValue([]byte(`3`))), 231 }, 232 want: parquet.Row{ 233 // objects.attributes.key_value.key 234 parquet.ByteArrayValue([]byte(`A`)).Level(0, 2, 0), 235 parquet.ByteArrayValue([]byte(`B`)).Level(2, 2, 0), 236 parquet.ByteArrayValue([]byte(`C`)).Level(1, 2, 0), 237 // objects.attributes.key_value.value 238 parquet.ByteArrayValue([]byte(`1`)).Level(0, 3, 1), 239 parquet.ByteArrayValue([]byte(`2`)).Level(2, 3, 1), 240 parquet.ByteArrayValue([]byte(`3`)).Level(1, 3, 1), 241 }, 242 schema: parquet.Group{ 243 "objects": parquet.Repeated(parquet.Group{ 244 "attributes": parquet.Repeated(parquet.Group{ 245 "key_value": parquet.Group{ 246 "key": parquet.String(), 247 "value": parquet.Optional(parquet.String()), 248 }, 249 }), 250 }), 251 }, 252 }, 253 } 254 255 for _, test := range tests { 256 t.Run(test.scenario, func(t *testing.T) { 257 b := parquet.NewRowBuilder(test.schema) 258 259 for i := 0; i < 2; i++ { 260 for _, op := range test.operations { 261 op(b) 262 } 263 264 if got := b.Row(); !got.Equal(test.want) { 265 t.Fatalf("test %d: rows are not equal\nwant = %+v\ngot = %+v", i+1, test.want, got) 266 } 267 268 b.Reset() 269 } 270 }) 271 } 272 } 273 274 func BenchmarkRowBuilderAdd(b *testing.B) { 275 builder := parquet.NewRowBuilder(parquet.Group{ 276 "ids": parquet.Repeated(parquet.Int(64)), 277 }) 278 279 for i := 0; i < b.N; i++ { 280 builder.Add(0, parquet.Int64Value(int64(i))) 281 282 if (i % 128) == 0 { 283 builder.Reset() // so don't run out of memory ;) 284 } 285 } 286 }