github.com/apache/arrow/go/v14@v14.0.2/parquet/schema/schema_test.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package schema_test 18 19 import ( 20 "os" 21 "testing" 22 23 "github.com/apache/arrow/go/v14/parquet" 24 format "github.com/apache/arrow/go/v14/parquet/internal/gen-go/parquet" 25 "github.com/apache/arrow/go/v14/parquet/schema" 26 "github.com/apache/thrift/lib/go/thrift" 27 "github.com/stretchr/testify/assert" 28 "github.com/stretchr/testify/suite" 29 ) 30 31 func TestColumnPath(t *testing.T) { 32 p := parquet.ColumnPath([]string{"toplevel", "leaf"}) 33 assert.Equal(t, "toplevel.leaf", p.String()) 34 35 p2 := parquet.ColumnPathFromString("toplevel.leaf") 36 assert.Equal(t, "toplevel.leaf", p2.String()) 37 38 extend := p2.Extend("anotherlevel") 39 assert.Equal(t, "toplevel.leaf.anotherlevel", extend.String()) 40 } 41 42 func NewPrimitive(name string, repetition format.FieldRepetitionType, typ format.Type, fieldID int32) *format.SchemaElement { 43 ret := &format.SchemaElement{ 44 Name: name, 45 RepetitionType: format.FieldRepetitionTypePtr(repetition), 46 Type: format.TypePtr(typ), 47 } 48 if fieldID >= 0 { 49 ret.FieldID = &fieldID 50 } 51 return ret 52 } 53 54 func NewGroup(name string, repetition format.FieldRepetitionType, numChildren, fieldID int32) *format.SchemaElement { 55 ret := &format.SchemaElement{ 56 Name: name, 57 RepetitionType: format.FieldRepetitionTypePtr(repetition), 58 NumChildren: &numChildren, 59 } 60 if fieldID >= 0 { 61 ret.FieldID = &fieldID 62 } 63 return ret 64 } 65 66 func TestSchemaNodes(t *testing.T) { 67 suite.Run(t, new(PrimitiveNodeTestSuite)) 68 suite.Run(t, new(GroupNodeTestSuite)) 69 suite.Run(t, new(SchemaConverterSuite)) 70 } 71 72 type PrimitiveNodeTestSuite struct { 73 suite.Suite 74 75 name string 76 fieldID int32 77 node schema.Node 78 } 79 80 func (p *PrimitiveNodeTestSuite) SetupTest() { 81 p.name = "name" 82 p.fieldID = 5 83 } 84 85 func (p *PrimitiveNodeTestSuite) convert(elt *format.SchemaElement) { 86 p.node = schema.MustPrimitive(schema.PrimitiveNodeFromThrift(elt)) 87 p.IsType(&schema.PrimitiveNode{}, p.node) 88 } 89 90 func (p *PrimitiveNodeTestSuite) TestAttrs() { 91 node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 92 node2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("bar" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray, 93 schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */)) 94 95 p.Equal("foo", node1.Name()) 96 p.Equal(schema.Primitive, node1.Type()) 97 p.Equal(schema.Primitive, node2.Type()) 98 99 p.Equal(parquet.Repetitions.Repeated, node1.RepetitionType()) 100 p.Equal(parquet.Repetitions.Optional, node2.RepetitionType()) 101 102 p.Equal(parquet.Types.Int32, node1.PhysicalType()) 103 p.Equal(parquet.Types.ByteArray, node2.PhysicalType()) 104 105 p.Equal(schema.ConvertedTypes.None, node1.ConvertedType()) 106 p.Equal(schema.ConvertedTypes.UTF8, node2.ConvertedType()) 107 } 108 109 func (p *PrimitiveNodeTestSuite) TestFromParquet() { 110 p.Run("Optional Int32", func() { 111 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_INT32, p.fieldID) 112 p.convert(elt) 113 114 p.Equal(p.name, p.node.Name()) 115 p.Equal(p.fieldID, p.node.FieldID()) 116 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType()) 117 p.Equal(parquet.Types.Int32, p.node.(*schema.PrimitiveNode).PhysicalType()) 118 p.Equal(schema.ConvertedTypes.None, p.node.ConvertedType()) 119 }) 120 121 p.Run("LogicalType", func() { 122 elt := NewPrimitive(p.name, format.FieldRepetitionType_REQUIRED, format.Type_BYTE_ARRAY, p.fieldID) 123 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_UTF8) 124 p.convert(elt) 125 126 p.Equal(parquet.Repetitions.Required, p.node.RepetitionType()) 127 p.Equal(parquet.Types.ByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 128 p.Equal(schema.ConvertedTypes.UTF8, p.node.ConvertedType()) 129 }) 130 131 p.Run("FixedLenByteArray", func() { 132 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID) 133 elt.TypeLength = thrift.Int32Ptr(16) 134 p.convert(elt) 135 136 p.Equal(p.name, p.node.Name()) 137 p.Equal(p.fieldID, p.node.FieldID()) 138 p.Equal(parquet.Repetitions.Optional, p.node.RepetitionType()) 139 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 140 p.Equal(16, p.node.(*schema.PrimitiveNode).TypeLength()) 141 }) 142 143 p.Run("convertedtype::decimal", func() { 144 elt := NewPrimitive(p.name, format.FieldRepetitionType_OPTIONAL, format.Type_FIXED_LEN_BYTE_ARRAY, p.fieldID) 145 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_DECIMAL) 146 elt.TypeLength = thrift.Int32Ptr(6) 147 elt.Scale = thrift.Int32Ptr(2) 148 elt.Precision = thrift.Int32Ptr(12) 149 150 p.convert(elt) 151 p.Equal(parquet.Types.FixedLenByteArray, p.node.(*schema.PrimitiveNode).PhysicalType()) 152 p.Equal(schema.ConvertedTypes.Decimal, p.node.ConvertedType()) 153 p.Equal(6, p.node.(*schema.PrimitiveNode).TypeLength()) 154 p.EqualValues(2, p.node.(*schema.PrimitiveNode).DecimalMetadata().Scale) 155 p.EqualValues(12, p.node.(*schema.PrimitiveNode).DecimalMetadata().Precision) 156 }) 157 } 158 159 func (p *PrimitiveNodeTestSuite) TestEquals() { 160 const fieldID = -1 161 node1 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 162 node2 := schema.NewInt64Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 163 node3 := schema.NewInt32Node("bar" /* name */, parquet.Repetitions.Required, fieldID) 164 node4 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Optional, fieldID) 165 node5 := schema.NewInt32Node("foo" /* name */, parquet.Repetitions.Required, fieldID) 166 167 p.True(node1.Equals(node1)) 168 p.False(node1.Equals(node2)) 169 p.False(node1.Equals(node3)) 170 p.False(node1.Equals(node4)) 171 p.True(node1.Equals(node5)) 172 173 flba1 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 174 schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 175 flba2 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 176 schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 177 flba2.SetTypeLength(12) 178 179 flba3 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 180 schema.ConvertedTypes.Decimal, 1 /* type len */, 4 /* precision */, 2 /* scale */, fieldID)) 181 flba3.SetTypeLength(16) 182 183 flba4 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 184 schema.ConvertedTypes.Decimal, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID)) 185 flba5 := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, parquet.Types.FixedLenByteArray, 186 schema.ConvertedTypes.None, 12 /* type len */, 4 /* precision */, 0 /* scale */, fieldID)) 187 188 p.True(flba1.Equals(flba2)) 189 p.False(flba1.Equals(flba3)) 190 p.False(flba1.Equals(flba4)) 191 p.False(flba1.Equals(flba5)) 192 } 193 194 func (p *PrimitiveNodeTestSuite) TestPhysicalLogicalMapping() { 195 tests := []struct { 196 typ parquet.Type 197 cnv schema.ConvertedType 198 typLen int 199 precision int 200 scale int 201 shouldErr bool 202 }{ 203 {parquet.Types.Int32, schema.ConvertedTypes.Int32, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 204 {parquet.Types.ByteArray, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 205 {parquet.Types.Int32, schema.ConvertedTypes.JSON, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 206 {parquet.Types.Int64, schema.ConvertedTypes.TimestampMillis, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 207 {parquet.Types.Int32, schema.ConvertedTypes.Int64, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 208 {parquet.Types.ByteArray, schema.ConvertedTypes.Int8, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 209 {parquet.Types.ByteArray, schema.ConvertedTypes.Interval, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 210 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, true}, 211 {parquet.Types.ByteArray, schema.ConvertedTypes.Enum, 0 /* type len */, 0 /* precision */, 0 /* scale */, false}, 212 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true}, 213 {parquet.Types.Float, schema.ConvertedTypes.Decimal, 0 /* type len */, 2 /* precision */, 4 /* scale */, true}, 214 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 0 /* type len */, 4 /* precision */, 0 /* scale */, true}, 215 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 4 /* precision */, -1 /* scale */, true}, 216 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 2 /* precision */, 4 /* scale */, true}, 217 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 10 /* type len */, 6 /* precision */, 4 /* scale */, false}, 218 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 12 /* type len */, 0 /* precision */, 0 /* scale */, false}, 219 {parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Interval, 10 /* type len */, 0 /* precision */, 0 /* scale */, true}, 220 } 221 for _, tt := range tests { 222 p.Run(tt.typ.String(), func() { 223 _, err := schema.NewPrimitiveNodeConverted("foo" /* name */, parquet.Repetitions.Required, tt.typ, tt.cnv, tt.typLen, tt.precision, tt.scale, -1 /* fieldID */) 224 if tt.shouldErr { 225 p.Error(err) 226 } else { 227 p.NoError(err) 228 } 229 }) 230 } 231 } 232 233 type GroupNodeTestSuite struct { 234 suite.Suite 235 } 236 237 func (g *GroupNodeTestSuite) fields1() []schema.Node { 238 return schema.FieldList{ 239 schema.NewInt32Node("one" /* name */, parquet.Repetitions.Required, -1 /* fieldID */), 240 schema.NewInt64Node("two" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 241 schema.NewFloat64Node("three" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 242 } 243 } 244 245 func (g *GroupNodeTestSuite) fields2() []schema.Node { 246 return schema.FieldList{ 247 schema.NewInt32Node("duplicate" /* name */, parquet.Repetitions.Required, -1 /* fieldID */), 248 schema.NewInt64Node("unique" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 249 schema.NewFloat64Node("duplicate" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 250 } 251 } 252 253 func (g *GroupNodeTestSuite) TestAttrs() { 254 fields := g.fields1() 255 256 node1 := schema.MustGroup(schema.NewGroupNode("foo" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 257 node2 := schema.MustGroup(schema.NewGroupNodeConverted("bar" /* name */, parquet.Repetitions.Optional, fields, schema.ConvertedTypes.List, -1 /* fieldID */)) 258 259 g.Equal("foo", node1.Name()) 260 g.Equal(schema.Group, node1.Type()) 261 g.Equal(len(fields), node1.NumFields()) 262 g.Equal(parquet.Repetitions.Repeated, node1.RepetitionType()) 263 g.Equal(parquet.Repetitions.Optional, node2.RepetitionType()) 264 265 g.Equal(schema.ConvertedTypes.None, node1.ConvertedType()) 266 g.Equal(schema.ConvertedTypes.List, node2.ConvertedType()) 267 } 268 269 func (g *GroupNodeTestSuite) TestEquals() { 270 f1 := g.fields1() 271 f2 := g.fields1() 272 273 group1 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f1, -1 /* fieldID */)) 274 group2 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 275 group3 := schema.Must(schema.NewGroupNode("group2" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 276 277 f2 = append(f2, schema.NewFloat32Node("four" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */)) 278 group4 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, f2, -1 /* fieldID */)) 279 group5 := schema.Must(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Repeated, g.fields1(), -1 /* fieldID */)) 280 281 g.True(group1.Equals(group1)) 282 g.True(group1.Equals(group2)) 283 g.False(group1.Equals(group3)) 284 g.False(group1.Equals(group4)) 285 g.False(group5.Equals(group4)) 286 } 287 288 func (g *GroupNodeTestSuite) TestFieldIndex() { 289 fields := g.fields1() 290 group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */)) 291 for idx, field := range fields { 292 f := group.Field(idx) 293 g.Same(field, f) 294 g.Equal(idx, group.FieldIndexByField(f)) 295 g.Equal(idx, group.FieldIndexByName(field.Name())) 296 } 297 298 // Non field nodes 299 nonFieldAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 300 nonFieldFamiliar := schema.NewInt32Node("one" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 301 g.Less(group.FieldIndexByField(nonFieldAlien), 0) 302 g.Less(group.FieldIndexByField(nonFieldFamiliar), 0) 303 } 304 305 func (g *GroupNodeTestSuite) TestFieldIndexDuplicateName() { 306 fields := g.fields2() 307 group := schema.MustGroup(schema.NewGroupNode("group" /* name */, parquet.Repetitions.Required, fields, -1 /* fieldID */)) 308 for idx, field := range fields { 309 f := group.Field(idx) 310 g.Same(f, field) 311 g.Equal(idx, group.FieldIndexByField(f)) 312 } 313 } 314 315 type SchemaConverterSuite struct { 316 suite.Suite 317 318 name string 319 node schema.Node 320 } 321 322 func (s *SchemaConverterSuite) SetupSuite() { 323 s.name = "parquet_schema" 324 } 325 326 func (s *SchemaConverterSuite) convert(elems []*format.SchemaElement) { 327 s.node = schema.Must(schema.FromParquet(elems)) 328 s.Equal(schema.Group, s.node.Type()) 329 } 330 331 func (s *SchemaConverterSuite) checkParentConsistency(groupRoot *schema.GroupNode) bool { 332 // each node should have the group as parent 333 for i := 0; i < groupRoot.NumFields(); i++ { 334 field := groupRoot.Field(i) 335 if field.Parent() != groupRoot { 336 return false 337 } 338 if field.Type() == schema.Group { 339 if !s.checkParentConsistency(field.(*schema.GroupNode)) { 340 return false 341 } 342 } 343 } 344 return true 345 } 346 347 func (s *SchemaConverterSuite) TestNestedExample() { 348 elements := make([]*format.SchemaElement, 0) 349 elements = append(elements, 350 NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */), 351 NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */), 352 NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 2 /* fieldID */)) 353 elt := NewGroup("b" /* name */, format.FieldRepetitionType_REPEATED, 1 /* numChildren */, 3 /* fieldID */) 354 elt.ConvertedType = format.ConvertedTypePtr(format.ConvertedType_LIST) 355 elements = append(elements, elt, NewPrimitive("item" /* name */, format.FieldRepetitionType_OPTIONAL, format.Type_INT64, 4 /* fieldID */)) 356 357 s.convert(elements) 358 359 // construct the expected schema 360 fields := make([]schema.Node, 0) 361 fields = append(fields, schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)) 362 363 // 3-level list encoding 364 item := schema.NewInt64Node("item" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */) 365 list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item}, schema.ConvertedTypes.List, 3 /* fieldID */)) 366 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */)) 367 fields = append(fields, bag) 368 369 sc := schema.MustGroup(schema.NewGroupNode(s.name, parquet.Repetitions.Repeated, fields, 0 /* fieldID */)) 370 s.True(sc.Equals(s.node)) 371 s.Nil(s.node.Parent()) 372 s.True(s.checkParentConsistency(s.node.(*schema.GroupNode))) 373 } 374 375 func (s *SchemaConverterSuite) TestZeroColumns() { 376 elements := []*format.SchemaElement{NewGroup("schema" /* name */, format.FieldRepetitionType_REPEATED, 0 /* numChildren */, 0 /* fieldID */)} 377 s.NotPanics(func() { s.convert(elements) }) 378 } 379 380 func (s *SchemaConverterSuite) TestInvalidRoot() { 381 // According to the Parquet spec, the first element in the list<SchemaElement> 382 // is a group whose children (and their descendants) contain all of the rest of 383 // the flattened schema elments. If the first element is not a group, it is malformed 384 elements := []*format.SchemaElement{NewPrimitive("not-a-group" /* name */, format.FieldRepetitionType_REQUIRED, 385 format.Type_INT32, 0 /* fieldID */), format.NewSchemaElement()} 386 s.Panics(func() { s.convert(elements) }) 387 388 // While the parquet spec indicates that the root group should have REPEATED 389 // repetition type, some implementations may return REQUIRED or OPTIONAL 390 // groups as the first element. These tests check that this is okay as a 391 // practicality matter 392 elements = []*format.SchemaElement{ 393 NewGroup("not-repeated" /* name */, format.FieldRepetitionType_REQUIRED, 1 /* numChildren */, 0 /* fieldID */), 394 NewPrimitive("a" /* name */, format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */)} 395 s.NotPanics(func() { s.convert(elements) }) 396 397 elements[0] = NewGroup("not-repeated" /* name */, format.FieldRepetitionType_OPTIONAL, 1 /* numChildren */, 0 /* fieldID */) 398 s.NotPanics(func() { s.convert(elements) }) 399 } 400 401 func (s *SchemaConverterSuite) TestNotEnoughChildren() { 402 s.Panics(func() { 403 s.convert([]*format.SchemaElement{NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* numChildren */, 0 /* fieldID */)}) 404 }) 405 } 406 407 func TestColumnDesc(t *testing.T) { 408 n := schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.ByteArray, 409 schema.ConvertedTypes.UTF8, 0 /* type len */, 0 /* precision */, 0 /* scale */, -1 /* fieldID */)) 410 descr := schema.NewColumn(n, 4, 1) 411 412 assert.Equal(t, "name", descr.Name()) 413 assert.EqualValues(t, 4, descr.MaxDefinitionLevel()) 414 assert.EqualValues(t, 1, descr.MaxRepetitionLevel()) 415 assert.Equal(t, parquet.Types.ByteArray, descr.PhysicalType()) 416 assert.Equal(t, -1, descr.TypeLength()) 417 418 expectedDesc := `column descriptor = { 419 name: name, 420 path: , 421 physical_type: BYTE_ARRAY, 422 converted_type: UTF8, 423 logical_type: String, 424 max_definition_level: 4, 425 max_repetition_level: 1, 426 }` 427 assert.Equal(t, expectedDesc, descr.String()) 428 429 n = schema.MustPrimitive(schema.NewPrimitiveNodeConverted("name" /* name */, parquet.Repetitions.Optional, parquet.Types.FixedLenByteArray, schema.ConvertedTypes.Decimal, 12 /* type len */, 10 /* precision */, 4 /* scale */, -1 /* fieldID */)) 430 descr2 := schema.NewColumn(n, 4, 1) 431 432 assert.Equal(t, parquet.Types.FixedLenByteArray, descr2.PhysicalType()) 433 assert.Equal(t, 12, descr2.TypeLength()) 434 435 expectedDesc = `column descriptor = { 436 name: name, 437 path: , 438 physical_type: FIXED_LEN_BYTE_ARRAY, 439 converted_type: DECIMAL, 440 logical_type: Decimal(precision=10, scale=4), 441 max_definition_level: 4, 442 max_repetition_level: 1, 443 length: 12, 444 precision: 10, 445 scale: 4, 446 }` 447 assert.Equal(t, expectedDesc, descr2.String()) 448 } 449 450 func TestSchemaDescriptor(t *testing.T) { 451 t.Run("Equals", func(t *testing.T) { 452 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 453 intb := schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 454 intb2 := schema.NewInt64Node("b2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 455 intc := schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 456 457 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 458 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 459 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 460 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 461 462 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 463 bag2 := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Required, schema.FieldList{list}, -1 /* fieldID */)) 464 465 descr1 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */))) 466 assert.True(t, descr1.Equals(descr1)) 467 468 descr2 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag2}, -1 /* fieldID */))) 469 assert.False(t, descr1.Equals(descr2)) 470 471 descr3 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb2, intc, bag}, -1 /* fieldID */))) 472 assert.False(t, descr1.Equals(descr3)) 473 474 descr4 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("SCHEMA" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag}, -1 /* fieldID */))) 475 assert.True(t, descr1.Equals(descr4)) 476 477 descr5 := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, schema.FieldList{inta, intb, intc, bag, intb2}, -1 /* fieldID */))) 478 assert.False(t, descr1.Equals(descr5)) 479 480 col1 := schema.NewColumn(inta, 5 /* maxDefLvl */, 1 /* maxRepLvl */) 481 col2 := schema.NewColumn(inta, 6 /* maxDefLvl */, 1 /* maxRepLvl */) 482 col3 := schema.NewColumn(inta, 5 /* maxDefLvl */, 2 /* maxRepLvl */) 483 484 assert.True(t, col1.Equals(col1)) 485 assert.False(t, col1.Equals(col2)) 486 assert.False(t, col2.Equals(col3)) 487 }) 488 489 t.Run("BuildTree", func(t *testing.T) { 490 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 491 fields := schema.FieldList{inta} 492 fields = append(fields, 493 schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 494 schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)) 495 496 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 497 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 498 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 499 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 500 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 501 fields = append(fields, bag) 502 503 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 504 descr := schema.NewSchema(sc) 505 506 const nleaves = 6 507 assert.Equal(t, nleaves, descr.NumColumns()) 508 509 // mdef mrep 510 // required int32 a 0 0 511 // optional int64 b 1 0 512 // repeated byte_array c 1 1 513 // optional group bag 1 0 514 // repeated group records 2 1 515 // required int64 item1 2 1 516 // optional boolean item2 3 1 517 // repeated int32 item3 3 2 518 var ( 519 exMaxDefLevels = [...]int16{0, 1, 1, 2, 3, 3} 520 exMaxRepLevels = [...]int16{0, 0, 1, 1, 1, 2} 521 ) 522 523 for i := 0; i < nleaves; i++ { 524 col := descr.Column(i) 525 assert.Equal(t, exMaxDefLevels[i], col.MaxDefinitionLevel()) 526 assert.Equal(t, exMaxRepLevels[i], col.MaxRepetitionLevel()) 527 } 528 529 assert.Equal(t, "a", descr.Column(0).Path()) 530 assert.Equal(t, "b", descr.Column(1).Path()) 531 assert.Equal(t, "c", descr.Column(2).Path()) 532 assert.Equal(t, "bag.records.item1", descr.Column(3).Path()) 533 assert.Equal(t, "bag.records.item2", descr.Column(4).Path()) 534 assert.Equal(t, "bag.records.item3", descr.Column(5).Path()) 535 536 for i := 0; i < nleaves; i++ { 537 col := descr.Column(i) 538 assert.Equal(t, i, descr.ColumnIndexByNode(col.SchemaNode())) 539 } 540 541 nonColumnAlien := schema.NewInt32Node("alien" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 542 nonColumnFamiliar := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 543 assert.Less(t, descr.ColumnIndexByNode(nonColumnAlien), 0) 544 assert.Less(t, descr.ColumnIndexByNode(nonColumnFamiliar), 0) 545 546 assert.Same(t, inta, descr.ColumnRoot(0)) 547 assert.Same(t, bag, descr.ColumnRoot(3)) 548 assert.Same(t, bag, descr.ColumnRoot(4)) 549 assert.Same(t, bag, descr.ColumnRoot(5)) 550 551 assert.Same(t, sc, descr.Root()) 552 }) 553 554 t.Run("HasRepeatedFields", func(t *testing.T) { 555 inta := schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 556 fields := schema.FieldList{inta} 557 fields = append(fields, 558 schema.NewInt64Node("b" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */), 559 schema.NewByteArrayNode("c" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */)) 560 561 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 562 descr := schema.NewSchema(sc) 563 assert.True(t, descr.HasRepeatedFields()) 564 565 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 566 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 567 item3 := schema.NewInt32Node("item3" /* name */, parquet.Repetitions.Repeated, -1 /* fieldID */) 568 list := schema.MustGroup(schema.NewGroupNodeConverted("records" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2, item3}, schema.ConvertedTypes.List, -1 /* fieldID */)) 569 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, -1 /* fieldID */)) 570 fields = append(fields, bag) 571 572 sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, -1 /* fieldID */)) 573 descr = schema.NewSchema(sc) 574 assert.True(t, descr.HasRepeatedFields()) 575 576 itemKey := schema.NewInt64Node("key" /* name */, parquet.Repetitions.Required, -1 /* fieldID */) 577 itemValue := schema.NewBooleanNode("value" /* name */, parquet.Repetitions.Optional, -1 /* fieldID */) 578 sc = schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, append(fields, schema.FieldList{ 579 schema.MustGroup(schema.NewGroupNode("my_map" /* name */, parquet.Repetitions.Optional, schema.FieldList{ 580 schema.MustGroup(schema.NewGroupNodeConverted("map" /* name */, parquet.Repetitions.Repeated, schema.FieldList{itemKey, itemValue}, schema.ConvertedTypes.Map, -1 /* fieldID */)), 581 }, -1 /* fieldID */)), 582 }...), -1 /* fieldID */)) 583 descr = schema.NewSchema(sc) 584 assert.True(t, descr.HasRepeatedFields()) 585 }) 586 } 587 588 func ExamplePrintSchema() { 589 fields := schema.FieldList{schema.NewInt32Node("a" /* name */, parquet.Repetitions.Required, 1 /* fieldID */)} 590 item1 := schema.NewInt64Node("item1" /* name */, parquet.Repetitions.Optional, 4 /* fieldID */) 591 item2 := schema.NewBooleanNode("item2" /* name */, parquet.Repetitions.Required, 5 /* fieldID */) 592 list := schema.MustGroup(schema.NewGroupNodeConverted("b" /* name */, parquet.Repetitions.Repeated, schema.FieldList{item1, item2}, schema.ConvertedTypes.List, 3 /* fieldID */)) 593 bag := schema.MustGroup(schema.NewGroupNode("bag" /* name */, parquet.Repetitions.Optional, schema.FieldList{list}, 2 /* fieldID */)) 594 fields = append(fields, bag) 595 596 fields = append(fields, 597 schema.MustPrimitive(schema.NewPrimitiveNodeConverted("c" /* name */, parquet.Repetitions.Required, parquet.Types.Int32, schema.ConvertedTypes.Decimal, 0 /* type len */, 3 /* precision */, 2 /* scale */, 6 /* fieldID */)), 598 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("d" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(10 /* precision */, 5 /* scale */), parquet.Types.Int64, -1 /* type len */, 7 /* fieldID */))) 599 600 sc := schema.MustGroup(schema.NewGroupNode("schema" /* name */, parquet.Repetitions.Repeated, fields, 0 /* fieldID */)) 601 schema.PrintSchema(sc, os.Stdout, 2) 602 603 // Output: 604 // repeated group field_id=0 schema { 605 // required int32 field_id=1 a; 606 // optional group field_id=2 bag { 607 // repeated group field_id=3 b (List) { 608 // optional int64 field_id=4 item1; 609 // required boolean field_id=5 item2; 610 // } 611 // } 612 // required int32 field_id=6 c (Decimal(precision=3, scale=2)); 613 // required int64 field_id=7 d (Decimal(precision=10, scale=5)); 614 // } 615 } 616 617 func TestPanicSchemaNodeCreation(t *testing.T) { 618 assert.Panics(t, func() { 619 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("map" /* name */, parquet.Repetitions.Required, schema.MapLogicalType{}, parquet.Types.Int64, -1 /* type len */, -1 /* fieldID */)) 620 }, "nested logical type on non-group node") 621 622 assert.Panics(t, func() { 623 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("string" /* name */, parquet.Repetitions.Required, schema.StringLogicalType{}, parquet.Types.Boolean, -1 /* type len */, -1 /* fieldID */)) 624 }, "incompatible primitive type") 625 626 assert.Panics(t, func() { 627 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("interval" /* name */, parquet.Repetitions.Required, schema.IntervalLogicalType{}, parquet.Types.FixedLenByteArray, 11 /* type len */, -1 /* fieldID */)) 628 }, "incompatible primitive length") 629 630 assert.Panics(t, func() { 631 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("decimal" /* name */, parquet.Repetitions.Required, schema.NewDecimalLogicalType(16, 6), parquet.Types.Int32, -1 /* type len */, -1 /* fieldID */)) 632 }, "primitive too small for given precision") 633 634 assert.Panics(t, func() { 635 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("uuid" /* name */, parquet.Repetitions.Required, schema.UUIDLogicalType{}, parquet.Types.FixedLenByteArray, 64 /* type len */, -1 /* fieldID */)) 636 }, "incompatible primitive length") 637 638 assert.Panics(t, func() { 639 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("negative_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, -16 /* type len */, -1 /* fieldID */)) 640 }, "non-positive length for fixed length binary") 641 642 assert.Panics(t, func() { 643 schema.MustPrimitive(schema.NewPrimitiveNodeLogical("zero_len" /* name */, parquet.Repetitions.Required, schema.NoLogicalType{}, parquet.Types.FixedLenByteArray, 0 /* type len */, -1 /* fieldID */)) 644 }, "non-positive length for fixed length binary") 645 646 assert.Panics(t, func() { 647 schema.MustGroup(schema.NewGroupNodeLogical("list" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, schema.JSONLogicalType{}, -1 /* fieldID */)) 648 }, "non-nested logical type on group node") 649 } 650 651 func TestNullLogicalConvertsToNone(t *testing.T) { 652 var ( 653 empty schema.LogicalType 654 n schema.Node 655 ) 656 assert.NotPanics(t, func() { 657 n = schema.MustPrimitive(schema.NewPrimitiveNodeLogical("value" /* name */, parquet.Repetitions.Required, empty, parquet.Types.Double, -1 /* type len */, -1 /* fieldID */)) 658 }) 659 assert.True(t, n.LogicalType().IsNone()) 660 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType()) 661 assert.NotPanics(t, func() { 662 n = schema.MustGroup(schema.NewGroupNodeLogical("items" /* name */, parquet.Repetitions.Repeated, schema.FieldList{}, empty, -1 /* fieldID */)) 663 }) 664 assert.True(t, n.LogicalType().IsNone()) 665 assert.Equal(t, schema.ConvertedTypes.None, n.ConvertedType()) 666 }