github.com/apache/arrow/go/v7@v7.0.1/parquet/cmd/parquet_reader/dumper.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"encoding/binary"
	"fmt"
	"reflect"

	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/schema"
)

const defaultBatchSize = 128

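// Dumper reads a single column chunk in batches of batchSize values and
// hands them back one at a time through Next.
//
// A minimal usage sketch (how colReader, a file.ColumnChunkReader, is
// obtained from an open parquet file is assumed and not shown here):
//
//	dumper := createDumper(colReader)
//	for {
//		val, ok := dumper.Next()
//		if !ok {
//			break
//		}
//		fmt.Print(dumper.FormatValue(val, 20), " ")
//	}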
type Dumper struct {
	reader         file.ColumnChunkReader
	batchSize      int64 // number of values/levels requested per ReadBatch call
	valueOffset    int   // index of the next unread value in valueBuffer
	valuesBuffered int   // number of values returned by the last ReadBatch

	levelOffset    int64 // index of the next unread entry in defLevels/repLevels
	levelsBuffered int64 // number of levels returned by the last ReadBatch
	defLevels      []int16
	repLevels      []int16

	valueBuffer interface{} // typed slice matching the column's physical type
}

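// createDumper allocates a Dumper whose scratch value buffer matches the
// concrete type of the given column chunk reader; for reader types not
// covered by the switch, valueBuffer is left nil.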
func createDumper(reader file.ColumnChunkReader) *Dumper {
	batchSize := defaultBatchSize

	var valueBuffer interface{}
	switch reader.(type) {
	case *file.BooleanColumnChunkReader:
		valueBuffer = make([]bool, batchSize)
	case *file.Int32ColumnChunkReader:
		valueBuffer = make([]int32, batchSize)
	case *file.Int64ColumnChunkReader:
		valueBuffer = make([]int64, batchSize)
	case *file.Float32ColumnChunkReader:
		valueBuffer = make([]float32, batchSize)
	case *file.Float64ColumnChunkReader:
		valueBuffer = make([]float64, batchSize)
	case *file.Int96ColumnChunkReader:
		valueBuffer = make([]parquet.Int96, batchSize)
	case *file.ByteArrayColumnChunkReader:
		valueBuffer = make([]parquet.ByteArray, batchSize)
	case *file.FixedLenByteArrayColumnChunkReader:
		valueBuffer = make([]parquet.FixedLenByteArray, batchSize)
	}

	return &Dumper{
		reader:      reader,
		batchSize:   int64(batchSize),
		defLevels:   make([]int16, batchSize),
		repLevels:   make([]int16, batchSize),
		valueBuffer: valueBuffer,
	}
}

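// readNextBatch refills the level and value buffers from the underlying
// reader and resets both read offsets. Errors returned by ReadBatch are
// deliberately discarded in this dump tool.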
func (dump *Dumper) readNextBatch() {
	switch reader := dump.reader.(type) {
	case *file.BooleanColumnChunkReader:
		values := dump.valueBuffer.([]bool)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.Int32ColumnChunkReader:
		values := dump.valueBuffer.([]int32)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.Int64ColumnChunkReader:
		values := dump.valueBuffer.([]int64)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.Float32ColumnChunkReader:
		values := dump.valueBuffer.([]float32)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.Float64ColumnChunkReader:
		values := dump.valueBuffer.([]float64)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.Int96ColumnChunkReader:
		values := dump.valueBuffer.([]parquet.Int96)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.ByteArrayColumnChunkReader:
		values := dump.valueBuffer.([]parquet.ByteArray)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	case *file.FixedLenByteArrayColumnChunkReader:
		values := dump.valueBuffer.([]parquet.FixedLenByteArray)
		dump.levelsBuffered, dump.valuesBuffered, _ = reader.ReadBatch(dump.batchSize, values, dump.defLevels, dump.repLevels)
	}

	dump.valueOffset = 0
	dump.levelOffset = 0
}

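// hasNext reports whether buffered levels remain or the underlying reader
// still has values to deliver.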
func (dump *Dumper) hasNext() bool {
	return dump.levelOffset < dump.levelsBuffered || dump.reader.HasNext()
}

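// FormatValue renders val left-justified in a field of the given width:
// nil prints as NULL, Int96 as three little-endian uint32s, and byte
// arrays as text when the column's converted type is UTF8, hex otherwise.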
func (dump *Dumper) FormatValue(val interface{}, width int) string {
	fmtstring := fmt.Sprintf("-%d", width)
	switch val := val.(type) {
	case nil:
		return fmt.Sprintf("%"+fmtstring+"s", "NULL")
	case bool:
		return fmt.Sprintf("%"+fmtstring+"t", val)
	case int32:
		return fmt.Sprintf("%"+fmtstring+"d", val)
	case int64:
		return fmt.Sprintf("%"+fmtstring+"d", val)
	case float32:
		return fmt.Sprintf("%"+fmtstring+"f", val)
	case float64:
		return fmt.Sprintf("%"+fmtstring+"f", val)
	case parquet.Int96:
		return fmt.Sprintf("%"+fmtstring+"s",
			fmt.Sprintf("%d %d %d",
				binary.LittleEndian.Uint32(val[:4]),
				binary.LittleEndian.Uint32(val[4:]),
				binary.LittleEndian.Uint32(val[8:])))
	case parquet.ByteArray:
		if dump.reader.Descriptor().ConvertedType() == schema.ConvertedTypes.UTF8 {
			return fmt.Sprintf("%"+fmtstring+"s", string(val))
		}
		return fmt.Sprintf("% "+fmtstring+"X", val)
	case parquet.FixedLenByteArray:
		return fmt.Sprintf("% "+fmtstring+"X", val)
	default:
		return fmt.Sprintf("%"+fmtstring+"s", fmt.Sprintf("%v", val))
	}
}

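// Next returns the next value in the column, reading a new batch when the
// buffered levels are exhausted. It returns (nil, true) for a NULL value
// (definition level below the column's maximum) and (nil, false) once the
// column has no more values.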
func (dump *Dumper) Next() (interface{}, bool) {
	if dump.levelOffset == dump.levelsBuffered {
		if !dump.hasNext() {
			return nil, false
		}
		dump.readNextBatch()
		if dump.levelsBuffered == 0 {
			return nil, false
		}
	}

	defLevel := dump.defLevels[int(dump.levelOffset)]
	// repLevel := dump.repLevels[int(dump.levelOffset)]
	dump.levelOffset++

	if defLevel < dump.reader.Descriptor().MaxDefinitionLevel() {
		return nil, true
	}

	vb := reflect.ValueOf(dump.valueBuffer)
	v := vb.Index(dump.valueOffset).Interface()
	dump.valueOffset++

	return v, true
}