github.com/apache/arrow/go/v14@v14.0.1/parquet/file/level_conversion.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"math/bits"
    23  	"unsafe"
    24  
    25  	shared_utils "github.com/apache/arrow/go/v14/internal/utils"
    26  	"github.com/apache/arrow/go/v14/parquet"
    27  	"github.com/apache/arrow/go/v14/parquet/internal/bmi"
    28  	"github.com/apache/arrow/go/v14/parquet/internal/utils"
    29  	"github.com/apache/arrow/go/v14/parquet/schema"
    30  	"golang.org/x/xerrors"
    31  )
    32  
// LevelInfo holds the definition/repetition level thresholds that are needed
// to interpret the levels of a parquet column when rebuilding nested data.
type LevelInfo struct {
	// How many slots an undefined but present (i.e. null) element in
	// parquet consumes when decoding to Arrow.
	// "Slot" is used in the same context as the Arrow specification
	// (i.e. a value holder).
	// This is only ever >1 for descendants of FixedSizeList.
	NullSlotUsage int32
	// The definition level at which the value for the field
	// is considered not null (definition levels greater than
	// or equal to this value indicate a not-null
	// value for the field). For list fields definition levels
	// greater than or equal to this field indicate a present,
	// possibly null, child value.
	DefLevel int16
	// The repetition level corresponding to this element
	// or the closest repeated ancestor.  Any repetition
	// level less than this indicates either a new list OR
	// an empty list (which is determined in conjunction
	// with definition levels).
	RepLevel int16
	// The definition level indicating the level at which the closest
	// repeated ancestor is not empty.  This is used to discriminate
	// between a value less than DefLevel being null or excluded entirely.
	// For instance if we have an arrow schema like:
	// list(struct(f0: int)).  Then there are the following
	// definition levels:
	//   0 = null list
	//   1 = present but empty list.
	//   2 = a null value in the list
	//   3 = a non null struct but null integer.
	//   4 = a present integer.
	// When reconstructing, the struct and integer arrays'
	// RepeatedAncestorDefLevel would be 2.  Any
	// definition level < 2 indicates that there isn't a corresponding
	// child value in the list.
	// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
	// has the def levels [0, 1, 2, 3, 4].  The actual
	// struct array is only of length 3: [not-set, set, set] and
	// the int array is also of length 3: [N/A, null, 1].
	RepeatedAncestorDefLevel int16
}
    74  
    75  func (l *LevelInfo) Equal(rhs *LevelInfo) bool {
    76  	return l.NullSlotUsage == rhs.NullSlotUsage &&
    77  		l.DefLevel == rhs.DefLevel &&
    78  		l.RepLevel == rhs.RepLevel &&
    79  		l.RepeatedAncestorDefLevel == rhs.RepeatedAncestorDefLevel
    80  }
    81  
    82  func (l *LevelInfo) HasNullableValues() bool {
    83  	return l.RepeatedAncestorDefLevel < l.DefLevel
    84  }
    85  
// IncrementOptional accounts for one additional optional level by raising
// DefLevel by one. RepLevel and RepeatedAncestorDefLevel are left unchanged.
func (l *LevelInfo) IncrementOptional() {
	l.DefLevel++
}
    89  
    90  func (l *LevelInfo) IncrementRepeated() int16 {
    91  	lastRepAncestor := l.RepeatedAncestorDefLevel
    92  	// Repeated fields add both a repetition and definition level. This is used
    93  	// to distinguish between an empty list and a list with an item in it.
    94  	l.RepLevel++
    95  	l.DefLevel++
    96  
    97  	// For levels >= repeated_ancenstor_def_level it indicates the list was
    98  	// non-null and had at least one element.  This is important
    99  	// for later decoding because we need to add a slot for these
   100  	// values.  for levels < current_def_level no slots are added
   101  	// to arrays.
   102  	l.RepeatedAncestorDefLevel = l.DefLevel
   103  	return lastRepAncestor
   104  }
   105  
   106  func (l *LevelInfo) Increment(n schema.Node) {
   107  	switch n.RepetitionType() {
   108  	case parquet.Repetitions.Repeated:
   109  		l.IncrementRepeated()
   110  	case parquet.Repetitions.Optional:
   111  		l.IncrementOptional()
   112  	}
   113  }
   114  
// ValidityBitmapInputOutput is the input/output structure used when
// reconstructing validity bitmaps from definition and repetition levels.
type ValidityBitmapInputOutput struct {
	// Input only.
	// The maximum number of values expected to be read (the actual
	// number of values read must be less than or equal to this value).
	// Exceeding this limit indicates either a corrupt or incorrectly
	// written file. The conversion functions in this file check against
	// it and panic (DefLevelsToBitmap) or return an error
	// (DefRepLevelsToListInfo) when they detect that it has been exceeded.
	ReadUpperBound int64
	// Output only. The number of values produced by the conversion
	// (this is logically the count of the number of elements
	// for an Arrow array).
	Read int64
	// Input/Output. The number of nulls encountered.
	NullCount int64
	// Output only. The validity bitmap to populate. May be nil only
	// for DefRepLevelsToListInfo (if all that is needed is list offsets).
	ValidBits []byte
	// Input only, offset into ValidBits to start at.
	ValidBitsOffset int64
}
   136  
   137  // create a bitmap out of the definition Levels and return the number of non-null values
   138  func defLevelsBatchToBitmap(defLevels []int16, remainingUpperBound int64, info LevelInfo, wr utils.BitmapWriter, hasRepeatedParent bool) (count uint64) {
   139  	const maxbatch = 8 * int(unsafe.Sizeof(uint64(0)))
   140  
   141  	if !hasRepeatedParent && int64(len(defLevels)) > remainingUpperBound {
   142  		panic("values read exceed upper bound")
   143  	}
   144  
   145  	var batch []int16
   146  	for len(defLevels) > 0 {
   147  		batchSize := shared_utils.MinInt(maxbatch, len(defLevels))
   148  		batch, defLevels = defLevels[:batchSize], defLevels[batchSize:]
   149  		definedBitmap := bmi.GreaterThanBitmap(batch, info.DefLevel-1)
   150  
   151  		if hasRepeatedParent {
   152  			// Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
   153  			// repeated_ancestor_def_level
   154  			presentBitmap := bmi.GreaterThanBitmap(batch, info.RepeatedAncestorDefLevel-1)
   155  			selectedBits := bmi.ExtractBits(definedBitmap, presentBitmap)
   156  			selectedCount := int64(bits.OnesCount64(presentBitmap))
   157  			if selectedCount > remainingUpperBound {
   158  				panic("values read exceeded upper bound")
   159  			}
   160  			wr.AppendWord(selectedBits, selectedCount)
   161  			count += uint64(bits.OnesCount64(selectedBits))
   162  			continue
   163  		}
   164  
   165  		wr.AppendWord(definedBitmap, int64(len(batch)))
   166  		count += uint64(bits.OnesCount64(definedBitmap))
   167  	}
   168  	return
   169  }
   170  
   171  // create a bitmap out of the definition Levels
   172  func defLevelsToBitmapInternal(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, hasRepeatedParent bool) {
   173  	wr := utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, int64(out.ReadUpperBound))
   174  	defer wr.Finish()
   175  	setCount := defLevelsBatchToBitmap(defLevels, out.ReadUpperBound, info, wr, hasRepeatedParent)
   176  	out.Read = int64(wr.Pos())
   177  	out.NullCount += out.Read - int64(setCount)
   178  }
   179  
   180  // DefLevelsToBitmap creates a validitybitmap out of the passed in definition levels and info object.
   181  func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) {
   182  	hasRepeatedParent := false
   183  	if info.RepLevel > 0 {
   184  		hasRepeatedParent = true
   185  	}
   186  	defLevelsToBitmapInternal(defLevels, info, out, hasRepeatedParent)
   187  }
   188  
   189  // DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap
   190  // and properly handle nested lists and update the offsets for them.
   191  func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error {
   192  	var wr utils.BitmapWriter
   193  	if out.ValidBits != nil {
   194  		wr = utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, out.ReadUpperBound)
   195  		defer wr.Finish()
   196  	}
   197  	offsetPos := 0
   198  	for idx := range defLevels {
   199  		// skip items that belong to empty or null ancestor lists and further nested lists
   200  		if defLevels[idx] < info.RepeatedAncestorDefLevel || repLevels[idx] > info.RepLevel {
   201  			continue
   202  		}
   203  
   204  		if repLevels[idx] == info.RepLevel {
   205  			// continuation of an existing list.
   206  			// offsets can be null for structs with repeated children
   207  			if offsetPos < len(offsets) {
   208  				if offsets[offsetPos] == math.MaxInt32 {
   209  					return xerrors.New("list index overflow")
   210  				}
   211  				offsets[offsetPos]++
   212  			}
   213  		} else {
   214  			if (wr != nil && int64(wr.Pos()) >= out.ReadUpperBound) || (offsetPos >= int(out.ReadUpperBound)) {
   215  				return fmt.Errorf("definition levels exceeded upper bound: %d", out.ReadUpperBound)
   216  			}
   217  
   218  			// current_rep < list rep_level i.e. start of a list (ancestor empty lists
   219  			// are filtered out above)
   220  			// offsets can be null for structs with repeated children
   221  			if offsetPos+1 < len(offsets) {
   222  				offsetPos++
   223  				// use cumulative offsets because variable size lists are more common
   224  				// than fixed size lists so it should be cheaper to make these
   225  				// cumulative and subtract when validating fixed size lists
   226  				offsets[offsetPos] = offsets[offsetPos-1]
   227  				if defLevels[idx] >= info.DefLevel {
   228  					if offsets[offsetPos] == math.MaxInt32 {
   229  						return xerrors.New("list index overflow")
   230  					}
   231  					offsets[offsetPos]++
   232  				}
   233  			}
   234  
   235  			if wr != nil {
   236  				// the level info def level for lists reflects element present level
   237  				// the prior level distinguishes between empty lists
   238  				if defLevels[idx] >= info.DefLevel-1 {
   239  					wr.Set()
   240  				} else {
   241  					out.NullCount++
   242  					wr.Clear()
   243  				}
   244  				wr.Next()
   245  			}
   246  		}
   247  	}
   248  
   249  	if len(offsets) > 0 {
   250  		out.Read = int64(offsetPos)
   251  	} else if wr != nil {
   252  		out.Read = int64(wr.Pos())
   253  	}
   254  
   255  	if out.NullCount > 0 && info.NullSlotUsage > 1 {
   256  		return xerrors.New("null values with null_slot_usage > 1 not supported.")
   257  	}
   258  	return nil
   259  }
   260  
   261  // DefRepLevelsToBitmap constructs a full validitybitmap out of the definition and repetition levels
   262  // properly handling nested lists and parents.
   263  func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error {
   264  	info.RepLevel++
   265  	info.DefLevel++
   266  	return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil)
   267  }