github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/encode_dict_compute.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  //go:build go1.18
    18  
    19  package pqarrow
    20  
    21  import (
    22  	"context"
    23  
    24  	"github.com/apache/arrow/go/v14/arrow"
    25  	"github.com/apache/arrow/go/v14/arrow/array"
    26  	"github.com/apache/arrow/go/v14/arrow/compute"
    27  	"github.com/apache/arrow/go/v14/arrow/memory"
    28  	"github.com/apache/arrow/go/v14/parquet"
    29  	"github.com/apache/arrow/go/v14/parquet/file"
    30  	"github.com/apache/arrow/go/v14/parquet/internal/debug"
    31  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    32  )
    33  
    34  func isDictEncoding(enc parquet.Encoding) bool {
    35  	return enc == parquet.Encodings.PlainDict
    36  }
    37  
    38  func dictionaryDirectWriteSupported(arr arrow.Array) bool {
    39  	debug.Assert(arr.DataType().ID() == arrow.DICTIONARY, "should only be called with dictionary type")
    40  	dt := arr.DataType().(*arrow.DictionaryType)
    41  	return arrow.IsPrimitive(dt.ValueType.ID()) || arrow.IsBaseBinary(dt.ValueType.ID())
    42  }
    43  
    44  func convertDictionaryToDense(mem memory.Allocator, arr arrow.Array) (arrow.Array, error) {
    45  	dt := arr.DataType().(*arrow.DictionaryType).ValueType
    46  	ctx := compute.WithAllocator(context.Background(), mem)
    47  	return compute.CastArray(ctx, arr, compute.SafeCastOptions(dt))
    48  }
    49  
    50  func writeDictionaryArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) {
    51  	// if this is the first time writing a dictionary array,
    52  	// then there's a few possible paths to take:
    53  	//
    54  	// - If dictionary encoding is not enabled, just convert to densely
    55  	//   encoded and call writeDenseArrow
    56  	// - Dictionary Encoding is enabled:
    57  	//   - If this is the first time this is called, then we
    58  	//     call PutDictionary into the encoder and PutIndices on each
    59  	//     chunk. We store the dictionary that was written so that
    60  	//     subsequent calls to this method can make sure the dictionary
    61  	//     hasn't changed.
    62  	//   - on subsequent calls, we have to check whether the dictionary
    63  	//     has changed. If it has, then we trigger the varying dictionary
    64  	//     path and materialize each chunk and call writeDenseArrow with that
    65  	writeDense := func() error {
    66  		denseArr, err := convertDictionaryToDense(ctx.props.mem, leafArr)
    67  		if err != nil {
    68  			return err
    69  		}
    70  		defer denseArr.Release()
    71  		return writeDenseArrow(ctx, cw, denseArr, defLevels, repLevels, maybeParentNulls)
    72  	}
    73  
    74  	if !isDictEncoding(cw.CurrentEncoder().Encoding()) || !dictionaryDirectWriteSupported(leafArr) {
    75  		// no longer dictionary-encoding for whatever reason, maybe we never were
    76  		// or we decided to stop. Note that writeArrowToColumn can be invoked multiple
    77  		// times with both dense and dictionary-encoded versions of the same data
    78  		// without a problem. Any dense data will be hashed to indices until the
    79  		// dictionary page limit is reached, at which everything (dict and dense)
    80  		// will fall back to plain encoding
    81  		return writeDense()
    82  	}
    83  
    84  	var (
    85  		dictEncoder = cw.CurrentEncoder().(encoding.DictEncoder)
    86  		data        = leafArr.(*array.Dictionary)
    87  		dict        = data.Dictionary()
    88  		indices     = data.Indices()
    89  		preserved   = dictEncoder.PreservedDictionary()
    90  		pageStats   = cw.PageStatistics()
    91  	)
    92  
    93  	updateStats := func() error {
    94  		var referencedDict arrow.Array
    95  
    96  		ctx := compute.WithAllocator(context.Background(), ctx.props.mem)
    97  		// if dictionary is the same dictionary we already have, just use that
    98  		if preserved != nil && preserved == dict {
    99  			referencedDict = preserved
   100  		} else {
   101  			referencedIndices, err := compute.UniqueArray(ctx, indices)
   102  			if err != nil {
   103  				return err
   104  			}
   105  
   106  			// on first run, we might be able to re-use the existing dict
   107  			if referencedIndices.Len() == dict.Len() {
   108  				referencedDict = dict
   109  			} else {
   110  				referencedDict, err = compute.TakeArrayOpts(ctx, dict, referencedIndices, compute.TakeOptions{BoundsCheck: false})
   111  				if err != nil {
   112  					return err
   113  				}
   114  				defer referencedDict.Release()
   115  			}
   116  			referencedIndices.Release()
   117  		}
   118  
   119  		nonNullCount := indices.Len() - indices.NullN()
   120  		pageStats.IncNulls(int64(len(defLevels) - nonNullCount))
   121  		pageStats.IncNumValues(int64(nonNullCount))
   122  		return pageStats.UpdateFromArrow(referencedDict, false)
   123  	}
   124  
   125  	switch {
   126  	case preserved == nil:
   127  		if err := dictEncoder.PutDictionary(dict); err != nil {
   128  			return err
   129  		}
   130  
   131  		// if there were duplicate values in the dictionary, the encoder's
   132  		// memo table will be out of sync with the indices in the arrow array
   133  		// the easiest solution for this uncommon case is to fallback to plain
   134  		// encoding
   135  		if dictEncoder.NumEntries() != dict.Len() {
   136  			cw.FallbackToPlain()
   137  			return writeDense()
   138  		}
   139  
   140  		if pageStats != nil {
   141  			if err := updateStats(); err != nil {
   142  				return err
   143  			}
   144  		}
   145  
   146  	case !array.Equal(dict, preserved):
   147  		// dictionary has changed
   148  		cw.FallbackToPlain()
   149  		return writeDense()
   150  	default:
   151  		// dictionary is the same but we need to update stats
   152  		if pageStats != nil {
   153  			if err := updateStats(); err != nil {
   154  				return err
   155  			}
   156  		}
   157  	}
   158  
   159  	return cw.WriteDictIndices(indices, defLevels, repLevels)
   160  }