github.com/apache/arrow/go/v7@v7.0.1/parquet/file/level_conversion.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"math"
    21  	"math/bits"
    22  	"unsafe"
    23  
    24  	"github.com/apache/arrow/go/v7/parquet"
    25  	"github.com/apache/arrow/go/v7/parquet/internal/bmi"
    26  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    27  	"github.com/apache/arrow/go/v7/parquet/schema"
    28  	"golang.org/x/xerrors"
    29  )
    30  
// LevelInfo holds the definition/repetition level thresholds for a single
// field, together with the number of slots a null consumes, as used by the
// level-to-bitmap and level-to-offset conversions in this file.
type LevelInfo struct {
	// How many slots an undefined but present (i.e. null) element in
	// parquet consumes when decoding to Arrow.
	// "Slot" is used in the same context as the Arrow specification
	// (i.e. a value holder).
	// This is only ever >1 for descendants of FixedSizeList.
	NullSlotUsage int32
	// The definition level at which the value for the field
	// is considered not null (definition levels greater than
	// or equal to this value indicate a not-null
	// value for the field). For list fields definition levels
	// greater than or equal to this field indicate a present,
	// possibly null, child value.
	DefLevel int16
	// The repetition level corresponding to this element
	// or the closest repeated ancestor.  Any repetition
	// level less than this indicates either a new list OR
	// an empty list (which is determined in conjunction
	// with definition levels).
	RepLevel int16
	// The definition level indicating the level at which the closest
	// repeated ancestor is not empty.  This is used to discriminate
	// between a value less than DefLevel being null or excluded entirely.
	// For instance if we have an arrow schema like:
	// list(struct(f0: int)), then there are the following
	// definition levels:
	//   0 = null list
	//   1 = present but empty list.
	//   2 = a null value in the list
	//   3 = a non null struct but null integer.
	//   4 = a present integer.
	// When reconstructing, the struct and integer arrays'
	// RepeatedAncestorDefLevel would be 2.  Any
	// definition level < 2 indicates that there isn't a corresponding
	// child value in the list.
	// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
	// has the def levels [0, 1, 2, 3, 4].  The actual
	// struct array is only of length 3: [not-set, set, set] and
	// the int array is also of length 3: [N/A, null, 1].
	RepeatedAncestorDefLevel int16
}
    72  
    73  func newDefaultLevelInfo() *LevelInfo {
    74  	return &LevelInfo{NullSlotUsage: 1}
    75  }
    76  
    77  func (l *LevelInfo) Equal(rhs *LevelInfo) bool {
    78  	return l.NullSlotUsage == rhs.NullSlotUsage &&
    79  		l.DefLevel == rhs.DefLevel &&
    80  		l.RepLevel == rhs.RepLevel &&
    81  		l.RepeatedAncestorDefLevel == rhs.RepeatedAncestorDefLevel
    82  }
    83  
    84  func (l *LevelInfo) HasNullableValues() bool {
    85  	return l.RepeatedAncestorDefLevel < l.DefLevel
    86  }
    87  
    88  func (l *LevelInfo) IncrementOptional() {
    89  	l.DefLevel++
    90  }
    91  
    92  func (l *LevelInfo) IncrementRepeated() int16 {
    93  	lastRepAncestor := l.RepeatedAncestorDefLevel
    94  	// Repeated fields add both a repetition and definition level. This is used
    95  	// to distinguish between an empty list and a list with an item in it.
    96  	l.RepLevel++
    97  	l.DefLevel++
    98  
    99  	// For levels >= repeated_ancenstor_def_level it indicates the list was
   100  	// non-null and had at least one element.  This is important
   101  	// for later decoding because we need to add a slot for these
   102  	// values.  for levels < current_def_level no slots are added
   103  	// to arrays.
   104  	l.RepeatedAncestorDefLevel = l.DefLevel
   105  	return lastRepAncestor
   106  }
   107  
   108  func (l *LevelInfo) Increment(n schema.Node) {
   109  	switch n.RepetitionType() {
   110  	case parquet.Repetitions.Repeated:
   111  		l.IncrementRepeated()
   112  	case parquet.Repetitions.Optional:
   113  		l.IncrementOptional()
   114  	}
   115  }
   116  
// ValidityBitmapInputOutput is the input/output structure used when
// reconstructing validity bitmaps from definition/repetition levels.
type ValidityBitmapInputOutput struct {
	// Input only.
	// The maximum number of values expected to be read (the actual
	// number of values read must be less than or equal to this value).
	// Exceeding this limit indicates either a corrupt or incorrectly
	// written file; the conversion functions in this file report it by
	// panicking or by returning an error.
	ReadUpperBound int64
	// Output only. The number of values produced by the conversion
	// (this is logically the count of the number of elements
	// for an Arrow array).
	Read int64
	// Input/Output. The number of nulls encountered.
	NullCount int64
	// Output only. The validity bitmap to populate. May be nil only
	// for DefRepLevelsToListInfo (if all that is needed is list offsets).
	ValidBits []byte
	// Input only, offset into ValidBits to start at.
	ValidBitsOffset int64
}
   138  
   139  // create a bitmap out of the definition Levels and return the number of non-null values
   140  func defLevelsBatchToBitmap(defLevels []int16, remainingUpperBound int64, info LevelInfo, wr utils.BitmapWriter, hasRepeatedParent bool) (count uint64) {
   141  	const maxbatch = 8 * int(unsafe.Sizeof(uint64(0)))
   142  
   143  	if !hasRepeatedParent && int64(len(defLevels)) > remainingUpperBound {
   144  		panic("values read exceed upper bound")
   145  	}
   146  
   147  	var batch []int16
   148  	for len(defLevels) > 0 {
   149  		batchSize := utils.MinInt(maxbatch, len(defLevels))
   150  		batch, defLevels = defLevels[:batchSize], defLevels[batchSize:]
   151  		definedBitmap := bmi.GreaterThanBitmap(batch, info.DefLevel-1)
   152  
   153  		if hasRepeatedParent {
   154  			// Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
   155  			// repeated_ancestor_def_level
   156  			presentBitmap := bmi.GreaterThanBitmap(batch, info.RepeatedAncestorDefLevel-1)
   157  			selectedBits := bmi.ExtractBits(definedBitmap, presentBitmap)
   158  			selectedCount := int64(bits.OnesCount64(presentBitmap))
   159  			if selectedCount > remainingUpperBound {
   160  				panic("values read exceeded upper bound")
   161  			}
   162  			wr.AppendWord(selectedBits, selectedCount)
   163  			count += uint64(bits.OnesCount64(selectedBits))
   164  			continue
   165  		}
   166  
   167  		wr.AppendWord(definedBitmap, int64(len(batch)))
   168  		count += uint64(bits.OnesCount64(definedBitmap))
   169  	}
   170  	return
   171  }
   172  
   173  // create a bitmap out of the definition Levels
   174  func defLevelsToBitmapInternal(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, hasRepeatedParent bool) {
   175  	wr := utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, int64(len(defLevels)))
   176  	defer wr.Finish()
   177  	setCount := defLevelsBatchToBitmap(defLevels, out.ReadUpperBound, info, wr, hasRepeatedParent)
   178  	out.Read = int64(wr.Pos())
   179  	out.NullCount += out.Read - int64(setCount)
   180  }
   181  
   182  // DefLevelsToBitmap creates a validitybitmap out of the passed in definition levels and info object.
   183  func DefLevelsToBitmap(defLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) {
   184  	hasRepeatedParent := false
   185  	if info.RepLevel > 0 {
   186  		hasRepeatedParent = true
   187  	}
   188  	defLevelsToBitmapInternal(defLevels, info, out, hasRepeatedParent)
   189  }
   190  
   191  // DefRepLevelsToListInfo takes in the definition and repetition levels in order to populate the validity bitmap
   192  // and properly handle nested lists and update the offsets for them.
   193  func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []int32) error {
   194  	var wr utils.BitmapWriter
   195  	if out.ValidBits != nil {
   196  		wr = utils.NewFirstTimeBitmapWriter(out.ValidBits, out.ValidBitsOffset, out.ReadUpperBound)
   197  		defer wr.Finish()
   198  	}
   199  	offsetPos := 0
   200  	for idx := range defLevels {
   201  		// skip items that belong to empty or null ancestor lists and further nested lists
   202  		if defLevels[idx] < info.RepeatedAncestorDefLevel || repLevels[idx] > info.RepLevel {
   203  			continue
   204  		}
   205  
   206  		if repLevels[idx] == info.RepLevel {
   207  			// continuation of an existing list.
   208  			// offsets can be null for structs with repeated children
   209  			if offsetPos < len(offsets) {
   210  				if offsets[offsetPos] == math.MaxInt32 {
   211  					return xerrors.New("list index overflow")
   212  				}
   213  				offsets[offsetPos]++
   214  			}
   215  		} else {
   216  			if (wr != nil && int64(wr.Pos()) >= out.ReadUpperBound) || (offsetPos >= int(out.ReadUpperBound)) {
   217  				return xerrors.Errorf("definition levels exceeded upper bound: %d", out.ReadUpperBound)
   218  			}
   219  
   220  			// current_rep < list rep_level i.e. start of a list (ancestor empty lists
   221  			// are filtered out above)
   222  			// offsets can be null for structs with repeated children
   223  			if offsetPos+1 < len(offsets) {
   224  				offsetPos++
   225  				// use cumulative offsets because variable size lists are more common
   226  				// than fixed size lists so it should be cheaper to make these
   227  				// cumulative and subtract when validating fixed size lists
   228  				offsets[offsetPos] = offsets[offsetPos-1]
   229  				if defLevels[idx] >= info.DefLevel {
   230  					if offsets[offsetPos] == math.MaxInt32 {
   231  						return xerrors.New("list index overflow")
   232  					}
   233  					offsets[offsetPos]++
   234  				}
   235  			}
   236  
   237  			if wr != nil {
   238  				// the level info def level for lists reflects element present level
   239  				// the prior level distinguishes between empty lists
   240  				if defLevels[idx] >= info.DefLevel-1 {
   241  					wr.Set()
   242  				} else {
   243  					out.NullCount++
   244  					wr.Clear()
   245  				}
   246  				wr.Next()
   247  			}
   248  		}
   249  	}
   250  
   251  	if len(offsets) > 0 {
   252  		out.Read = int64(offsetPos)
   253  	} else if wr != nil {
   254  		out.Read = int64(wr.Pos())
   255  	}
   256  
   257  	if out.NullCount > 0 && info.NullSlotUsage > 1 {
   258  		return xerrors.New("null values with null_slot_usage > 1 not supported.")
   259  	}
   260  	return nil
   261  }
   262  
   263  // DefRepLevelsToBitmap constructs a full validitybitmap out of the definition and repetition levels
   264  // properly handling nested lists and parents.
   265  func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out *ValidityBitmapInputOutput) error {
   266  	info.RepLevel++
   267  	info.DefLevel++
   268  	return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil)
   269  }