github.com/apache/arrow/go/v14@v14.0.2/parquet/pqarrow/encode_dict_compute.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 //go:build go1.18 18 19 package pqarrow 20 21 import ( 22 "context" 23 24 "github.com/apache/arrow/go/v14/arrow" 25 "github.com/apache/arrow/go/v14/arrow/array" 26 "github.com/apache/arrow/go/v14/arrow/compute" 27 "github.com/apache/arrow/go/v14/arrow/memory" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/file" 30 "github.com/apache/arrow/go/v14/parquet/internal/debug" 31 "github.com/apache/arrow/go/v14/parquet/internal/encoding" 32 ) 33 34 func isDictEncoding(enc parquet.Encoding) bool { 35 return enc == parquet.Encodings.PlainDict 36 } 37 38 func dictionaryDirectWriteSupported(arr arrow.Array) bool { 39 debug.Assert(arr.DataType().ID() == arrow.DICTIONARY, "should only be called with dictionary type") 40 dt := arr.DataType().(*arrow.DictionaryType) 41 return arrow.IsPrimitive(dt.ValueType.ID()) || arrow.IsBaseBinary(dt.ValueType.ID()) 42 } 43 44 func convertDictionaryToDense(mem memory.Allocator, arr arrow.Array) (arrow.Array, error) { 45 dt := arr.DataType().(*arrow.DictionaryType).ValueType 46 ctx := compute.WithAllocator(context.Background(), mem) 47 return compute.CastArray(ctx, arr, compute.SafeCastOptions(dt)) 48 } 49 50 func writeDictionaryArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr arrow.Array, defLevels, repLevels []int16, maybeParentNulls bool) (err error) { 51 // if this is the first time writing a dictionary array, 52 // then there's a few possible paths to take: 53 // 54 // - If dictionary encoding is not enabled, just convert to densely 55 // encoded and call writeDenseArrow 56 // - Dictionary Encoding is enabled: 57 // - If this is the first time this is called, then we 58 // call PutDictionary into the encoder and PutIndices on each 59 // chunk. We store the dictionary that was written so that 60 // subsequent calls to this method can make sure the dictionary 61 // hasn't changed. 62 // - on subsequent calls, we have to check whether the dictionary 63 // has changed. If it has, then we trigger the varying dictionary 64 // path and materialize each chunk and call writeDenseArrow with that 65 writeDense := func() error { 66 denseArr, err := convertDictionaryToDense(ctx.props.mem, leafArr) 67 if err != nil { 68 return err 69 } 70 defer denseArr.Release() 71 return writeDenseArrow(ctx, cw, denseArr, defLevels, repLevels, maybeParentNulls) 72 } 73 74 if !isDictEncoding(cw.CurrentEncoder().Encoding()) || !dictionaryDirectWriteSupported(leafArr) { 75 // no longer dictionary-encoding for whatever reason, maybe we never were 76 // or we decided to stop. Note that writeArrowToColumn can be invoked multiple 77 // times with both dense and dictionary-encoded versions of the same data 78 // without a problem. Any dense data will be hashed to indices until the 79 // dictionary page limit is reached, at which everything (dict and dense) 80 // will fall back to plain encoding 81 return writeDense() 82 } 83 84 var ( 85 dictEncoder = cw.CurrentEncoder().(encoding.DictEncoder) 86 data = leafArr.(*array.Dictionary) 87 dict = data.Dictionary() 88 indices = data.Indices() 89 preserved = dictEncoder.PreservedDictionary() 90 pageStats = cw.PageStatistics() 91 ) 92 93 updateStats := func() error { 94 var referencedDict arrow.Array 95 96 ctx := compute.WithAllocator(context.Background(), ctx.props.mem) 97 // if dictionary is the same dictionary we already have, just use that 98 if preserved != nil && preserved == dict { 99 referencedDict = preserved 100 } else { 101 referencedIndices, err := compute.UniqueArray(ctx, indices) 102 if err != nil { 103 return err 104 } 105 106 // on first run, we might be able to re-use the existing dict 107 if referencedIndices.Len() == dict.Len() { 108 referencedDict = dict 109 } else { 110 referencedDict, err = compute.TakeArrayOpts(ctx, dict, referencedIndices, compute.TakeOptions{BoundsCheck: false}) 111 if err != nil { 112 return err 113 } 114 defer referencedDict.Release() 115 } 116 referencedIndices.Release() 117 } 118 119 nonNullCount := indices.Len() - indices.NullN() 120 pageStats.IncNulls(int64(len(defLevels) - nonNullCount)) 121 pageStats.IncNumValues(int64(nonNullCount)) 122 return pageStats.UpdateFromArrow(referencedDict, false) 123 } 124 125 switch { 126 case preserved == nil: 127 if err := dictEncoder.PutDictionary(dict); err != nil { 128 return err 129 } 130 131 // if there were duplicate values in the dictionary, the encoder's 132 // memo table will be out of sync with the indices in the arrow array 133 // the easiest solution for this uncommon case is to fallback to plain 134 // encoding 135 if dictEncoder.NumEntries() != dict.Len() { 136 cw.FallbackToPlain() 137 return writeDense() 138 } 139 140 if pageStats != nil { 141 if err := updateStats(); err != nil { 142 return err 143 } 144 } 145 146 case !array.Equal(dict, preserved): 147 // dictionary has changed 148 cw.FallbackToPlain() 149 return writeDense() 150 default: 151 // dictionary is the same but we need to update stats 152 if pageStats != nil { 153 if err := updateStats(); err != nil { 154 return err 155 } 156 } 157 } 158 159 return cw.WriteDictIndices(indices, defLevels, repLevels) 160 }