storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/tools/parquet2csv/parquet2csv.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2018 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package main
    18  
    19  import (
    20  	"encoding/csv"
    21  	"fmt"
    22  	"io"
    23  	"os"
    24  	"path"
    25  	"strings"
    26  
    27  	"github.com/minio/minio-go/v7/pkg/set"
    28  
    29  	parquet "storj.io/minio/pkg/s3select/internal/parquet-go"
    30  )
    31  
    32  func getReader(name string, offset int64, length int64) (io.ReadCloser, error) {
    33  	file, err := os.Open(name)
    34  	if err != nil {
    35  		return nil, err
    36  	}
    37  
    38  	fi, err := file.Stat()
    39  	if err != nil {
    40  		return nil, err
    41  	}
    42  
    43  	if offset < 0 {
    44  		offset = fi.Size() + offset
    45  	}
    46  
    47  	if _, err = file.Seek(offset, io.SeekStart); err != nil {
    48  		return nil, err
    49  	}
    50  
    51  	return file, nil
    52  }
    53  
    54  func printUsage() {
    55  	progName := path.Base(os.Args[0])
    56  	fmt.Printf("usage: %v PARQUET-FILE [COLUMN...]\n", progName)
    57  	fmt.Println()
    58  	fmt.Printf("examples:\n")
    59  	fmt.Printf("# Convert all columns to CSV\n")
    60  	fmt.Printf("$ %v example.parquet\n", progName)
    61  	fmt.Println()
    62  	fmt.Printf("# Convert specific columns to CSV\n")
    63  	fmt.Printf("$ %v example.par firstname dob\n", progName)
    64  	fmt.Println()
    65  }
    66  
    67  func main() {
    68  	if len(os.Args) < 2 {
    69  		printUsage()
    70  		os.Exit(-1)
    71  	}
    72  
    73  	name := os.Args[1]
    74  	ext := path.Ext(name)
    75  	csvFilename := name + ".csv"
    76  	if ext == ".parquet" || ext == ".par" {
    77  		csvFilename = strings.TrimSuffix(name, ext) + ".csv"
    78  	}
    79  
    80  	columns := set.CreateStringSet(os.Args[2:]...)
    81  	if len(columns) == 0 {
    82  		columns = nil
    83  	}
    84  
    85  	file, err := parquet.NewReader(
    86  		func(offset, length int64) (io.ReadCloser, error) {
    87  			return getReader(name, offset, length)
    88  		},
    89  		columns,
    90  	)
    91  	if err != nil {
    92  		fmt.Printf("%v: %v\n", name, err)
    93  		os.Exit(1)
    94  	}
    95  
    96  	defer file.Close()
    97  
    98  	csvFile, err := os.OpenFile(csvFilename, os.O_RDWR|os.O_CREATE, 0755)
    99  	if err != nil {
   100  		fmt.Printf("%v: %v\n", csvFilename, err)
   101  		os.Exit(1)
   102  	}
   103  
   104  	defer csvFile.Close()
   105  
   106  	csvWriter := csv.NewWriter(csvFile)
   107  	defer csvWriter.Flush()
   108  
   109  	headerWritten := false
   110  	for {
   111  		record, err := file.Read()
   112  		if err != nil {
   113  			if err != io.EOF {
   114  				fmt.Printf("%v: %v\n", name, err)
   115  				os.Exit(1)
   116  			}
   117  
   118  			break
   119  		}
   120  
   121  		if !headerWritten {
   122  			var csvRecord []string
   123  			record.Range(func(name string, value parquet.Value) bool {
   124  				csvRecord = append(csvRecord, name)
   125  				return true
   126  			})
   127  
   128  			if err = csvWriter.Write(csvRecord); err != nil {
   129  				fmt.Printf("%v: %v\n", csvFilename, err)
   130  				os.Exit(1)
   131  			}
   132  
   133  			headerWritten = true
   134  		}
   135  
   136  		var csvRecord []string
   137  		record.Range(func(name string, value parquet.Value) bool {
   138  			csvRecord = append(csvRecord, fmt.Sprintf("%v", value.Value))
   139  			return true
   140  		})
   141  
   142  		if err = csvWriter.Write(csvRecord); err != nil {
   143  			fmt.Printf("%v: %v\n", csvFilename, err)
   144  			os.Exit(1)
   145  		}
   146  	}
   147  }