github.com/apache/arrow/go/v7@v7.0.1/parquet/cmd/parquet_reader/dumper.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package main 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "reflect" 23 24 "github.com/apache/arrow/go/v7/parquet" 25 "github.com/apache/arrow/go/v7/parquet/file" 26 "github.com/apache/arrow/go/v7/parquet/schema" 27 ) 28 29 const defaultBatchSize = 128 30 31 type Dumper struct { 32 reader file.ColumnChunkReader 33 batchSize int64 34 valueOffset int 35 valuesBuffered int 36 37 levelOffset int64 38 levelsBuffered int64 39 defLevels []int16 40 repLevels []int16 41 42 valueBuffer interface{} 43 } 44 45 func createDumper(reader file.ColumnChunkReader) *Dumper { 46 batchSize := defaultBatchSize 47 48 var valueBuffer interface{} 49 switch reader.(type) { 50 case *file.BooleanColumnChunkReader: 51 valueBuffer = make([]bool, batchSize) 52 case *file.Int32ColumnChunkReader: 53 valueBuffer = make([]int32, batchSize) 54 case *file.Int64ColumnChunkReader: 55 valueBuffer = make([]int64, batchSize) 56 case *file.Float32ColumnChunkReader: 57 valueBuffer = make([]float32, batchSize) 58 case *file.Float64ColumnChunkReader: 59 valueBuffer = make([]float64, batchSize) 60 case *file.Int96ColumnChunkReader: 61 valueBuffer = make([]parquet.Int96, batchSize) 62 case *file.ByteArrayColumnChunkReader: 63 valueBuffer = make([]parquet.ByteArray, batchSize) 64 case *file.FixedLenByteArrayColumnChunkReader: 65 valueBuffer = make([]parquet.FixedLenByteArray, batchSize) 66 } 67 68 return &Dumper{ 69 reader: reader, 70 batchSize: int64(batchSize), 71 defLevels: make([]int16, batchSize), 72 repLevels: make([]int16, batchSize), 73 valueBuffer: valueBuffer, 74 } 75 } 76 77 func (dump *Dumper) readNextBatch() { 78 switch reader := dump.reader.(type) { 79 case *file.BooleanColumnChunkReader: 80 values := dump.valueBuffer.([]bool) 81 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 82 case *file.Int32ColumnChunkReader: 83 values := dump.valueBuffer.([]int32) 84 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 85 case *file.Int64ColumnChunkReader: 86 values := dump.valueBuffer.([]int64) 87 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 88 case *file.Float32ColumnChunkReader: 89 values := dump.valueBuffer.([]float32) 90 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 91 case *file.Float64ColumnChunkReader: 92 values := dump.valueBuffer.([]float64) 93 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 94 case *file.Int96ColumnChunkReader: 95 values := dump.valueBuffer.([]parquet.Int96) 96 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 97 case *file.ByteArrayColumnChunkReader: 98 values := dump.valueBuffer.([]parquet.ByteArray) 99 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 100 case *file.FixedLenByteArrayColumnChunkReader: 101 values := dump.valueBuffer.([]parquet.FixedLenByteArray) 102 dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels) 103 } 104 105 dump.valueOffset = 0 106 dump.levelOffset = 0 107 } 108 109 func (dump *Dumper) hasNext() bool { 110 return dump.levelOffset < dump.levelsBuffered || dump.reader.HasNext() 111 } 112 113 func (dump *Dumper) FormatValue(val interface{}, width int) string { 114 fmtstring := fmt.Sprintf("-%d", width) 115 switch val := val.(type) { 116 case nil: 117 return fmt.Sprintf("%"+fmtstring+"s", "NULL") 118 case bool: 119 return fmt.Sprintf("%"+fmtstring+"t", val) 120 case int32: 121 return fmt.Sprintf("%"+fmtstring+"d", val) 122 case int64: 123 return fmt.Sprintf("%"+fmtstring+"d", val) 124 case float32: 125 return fmt.Sprintf("%"+fmtstring+"f", val) 126 case float64: 127 return fmt.Sprintf("%"+fmtstring+"f", val) 128 case parquet.Int96: 129 return fmt.Sprintf("%"+fmtstring+"s", 130 fmt.Sprintf("%d %d %d", 131 binary.LittleEndian.Uint32(val[:4]), 132 binary.LittleEndian.Uint32(val[4:]), 133 binary.LittleEndian.Uint32(val[8:]))) 134 case parquet.ByteArray: 135 if dump.reader.Descriptor().ConvertedType() == schema.ConvertedTypes.UTF8 { 136 return fmt.Sprintf("%"+fmtstring+"s", string(val)) 137 } 138 return fmt.Sprintf("% "+fmtstring+"X", val) 139 case parquet.FixedLenByteArray: 140 return fmt.Sprintf("% "+fmtstring+"X", val) 141 default: 142 return fmt.Sprintf("%"+fmtstring+"s", fmt.Sprintf("%v", val)) 143 } 144 } 145 146 func (dump *Dumper) Next() (interface{}, bool) { 147 if dump.levelOffset == dump.levelsBuffered { 148 if !dump.hasNext() { 149 return nil, false 150 } 151 dump.readNextBatch() 152 if dump.levelsBuffered == 0 { 153 return nil, false 154 } 155 } 156 157 defLevel := dump.defLevels[int(dump.levelOffset)] 158 // repLevel := dump.repLevels[int(dump.levelOffset)] 159 dump.levelOffset++ 160 161 if defLevel < dump.reader.Descriptor().MaxDefinitionLevel() { 162 return nil, true 163 } 164 165 vb := reflect.ValueOf(dump.valueBuffer) 166 v := vb.Index(dump.valueOffset).Interface() 167 dump.valueOffset++ 168 169 return v, true 170 }