github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/parquet.go (about)

     1  // Copyright 2022 Twilio Inc.
     2  
     3  // Package parquet is a library for working with parquet files. For an overview
     4  // of Parquet's qualities as a storage format, see this blog post:
     5  // https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet
     6  //
     7  // Or see the Parquet documentation: https://parquet.apache.org/docs/
     8  package parquet
     9  
    10  import (
    11  	"io"
    12  	"os"
    13  	"reflect"
    14  )
    15  
    16  // Read reads and returns rows from the parquet file in the given reader.
    17  //
    18  // The type T defines the type of rows read from r. T must be compatible with
    19  // the file's schema or an error will be returned. The row type might represent
    20  // a subset of the full schema, in which case only a subset of the columns will
    21  // be loaded from r.
    22  //
    23  // This function is provided for convenience to facilitate reading of parquet
    24  // files from arbitrary locations in cases where the data set fit in memory.
    25  func Read[T any](r io.ReaderAt, size int64, options ...ReaderOption) (rows []T, err error) {
    26  	config, err := NewReaderConfig(options...)
    27  	if err != nil {
    28  		return nil, err
    29  	}
    30  	file, err := OpenFile(r, size)
    31  	if err != nil {
    32  		return nil, err
    33  	}
    34  	rows = make([]T, file.NumRows())
    35  	reader := NewGenericReader[T](file, config)
    36  	n, err := reader.Read(rows)
    37  	if err == io.EOF {
    38  		err = nil
    39  	}
    40  	reader.Close()
    41  	return rows[:n], err
    42  }
    43  
    44  // ReadFile reads rows of the parquet file at the given path.
    45  //
    46  // The type T defines the type of rows read from r. T must be compatible with
    47  // the file's schema or an error will be returned. The row type might represent
    48  // a subset of the full schema, in which case only a subset of the columns will
    49  // be loaded from the file.
    50  //
    51  // This function is provided for convenience to facilitate reading of parquet
    52  // files from the file system in cases where the data set fit in memory.
    53  func ReadFile[T any](path string, options ...ReaderOption) (rows []T, err error) {
    54  	f, err := os.Open(path)
    55  	if err != nil {
    56  		return nil, err
    57  	}
    58  	defer f.Close()
    59  	s, err := f.Stat()
    60  	if err != nil {
    61  		return nil, err
    62  	}
    63  	return Read[T](f, s.Size())
    64  }
    65  
    66  // Write writes the given list of rows to a parquet file written to w.
    67  //
    68  // This function is provided for convenience to facilitate the creation of
    69  // parquet files.
    70  func Write[T any](w io.Writer, rows []T, options ...WriterOption) error {
    71  	config, err := NewWriterConfig(options...)
    72  	if err != nil {
    73  		return err
    74  	}
    75  	writer := NewGenericWriter[T](w, config)
    76  	if _, err := writer.Write(rows); err != nil {
    77  		return err
    78  	}
    79  	return writer.Close()
    80  }
    81  
    82  // Write writes the given list of rows to a parquet file written to w.
    83  //
    84  // This function is provided for convenience to facilitate writing parquet
    85  // files to the file system.
    86  func WriteFile[T any](path string, rows []T, options ...WriterOption) error {
    87  	f, err := os.Create(path)
    88  	if err != nil {
    89  		return err
    90  	}
    91  	defer f.Close()
    92  	return Write(f, rows, options...)
    93  }
    94  
    95  func atLeastOne(size int) int {
    96  	return atLeast(size, 1)
    97  }
    98  
    99  func atLeast(size, least int) int {
   100  	if size < least {
   101  		return least
   102  	}
   103  	return size
   104  }
   105  
   106  func min(a, b int) int {
   107  	if a < b {
   108  		return a
   109  	}
   110  	return b
   111  }
   112  
   113  func max(a, b int) int {
   114  	if a > b {
   115  		return a
   116  	}
   117  	return b
   118  }
   119  
   120  func typeNameOf(t reflect.Type) string {
   121  	s1 := t.String()
   122  	s2 := t.Kind().String()
   123  	if s1 == s2 {
   124  		return s1
   125  	}
   126  	return s1 + " (" + s2 + ")"
   127  }
   128  
   129  func isZero(b []byte) bool {
   130  	for _, c := range b {
   131  		if c != 0 {
   132  			return false
   133  		}
   134  	}
   135  	return true
   136  }