github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/parquet.go (about) 1 // Copyright 2022 Twilio Inc. 2 3 // Package parquet is a library for working with parquet files. For an overview 4 // of Parquet's qualities as a storage format, see this blog post: 5 // https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet 6 // 7 // Or see the Parquet documentation: https://parquet.apache.org/docs/ 8 package parquet 9 10 import ( 11 "io" 12 "os" 13 "reflect" 14 ) 15 16 // Read reads and returns rows from the parquet file in the given reader. 17 // 18 // The type T defines the type of rows read from r. T must be compatible with 19 // the file's schema or an error will be returned. The row type might represent 20 // a subset of the full schema, in which case only a subset of the columns will 21 // be loaded from r. 22 // 23 // This function is provided for convenience to facilitate reading of parquet 24 // files from arbitrary locations in cases where the data set fit in memory. 25 func Read[T any](r io.ReaderAt, size int64, options ...ReaderOption) (rows []T, err error) { 26 config, err := NewReaderConfig(options...) 27 if err != nil { 28 return nil, err 29 } 30 file, err := OpenFile(r, size) 31 if err != nil { 32 return nil, err 33 } 34 rows = make([]T, file.NumRows()) 35 reader := NewGenericReader[T](file, config) 36 n, err := reader.Read(rows) 37 if err == io.EOF { 38 err = nil 39 } 40 reader.Close() 41 return rows[:n], err 42 } 43 44 // ReadFile reads rows of the parquet file at the given path. 45 // 46 // The type T defines the type of rows read from r. T must be compatible with 47 // the file's schema or an error will be returned. The row type might represent 48 // a subset of the full schema, in which case only a subset of the columns will 49 // be loaded from the file. 50 // 51 // This function is provided for convenience to facilitate reading of parquet 52 // files from the file system in cases where the data set fit in memory. 53 func ReadFile[T any](path string, options ...ReaderOption) (rows []T, err error) { 54 f, err := os.Open(path) 55 if err != nil { 56 return nil, err 57 } 58 defer f.Close() 59 s, err := f.Stat() 60 if err != nil { 61 return nil, err 62 } 63 return Read[T](f, s.Size()) 64 } 65 66 // Write writes the given list of rows to a parquet file written to w. 67 // 68 // This function is provided for convenience to facilitate the creation of 69 // parquet files. 70 func Write[T any](w io.Writer, rows []T, options ...WriterOption) error { 71 config, err := NewWriterConfig(options...) 72 if err != nil { 73 return err 74 } 75 writer := NewGenericWriter[T](w, config) 76 if _, err := writer.Write(rows); err != nil { 77 return err 78 } 79 return writer.Close() 80 } 81 82 // Write writes the given list of rows to a parquet file written to w. 83 // 84 // This function is provided for convenience to facilitate writing parquet 85 // files to the file system. 86 func WriteFile[T any](path string, rows []T, options ...WriterOption) error { 87 f, err := os.Create(path) 88 if err != nil { 89 return err 90 } 91 defer f.Close() 92 return Write(f, rows, options...) 93 } 94 95 func atLeastOne(size int) int { 96 return atLeast(size, 1) 97 } 98 99 func atLeast(size, least int) int { 100 if size < least { 101 return least 102 } 103 return size 104 } 105 106 func min(a, b int) int { 107 if a < b { 108 return a 109 } 110 return b 111 } 112 113 func max(a, b int) int { 114 if a > b { 115 return a 116 } 117 return b 118 } 119 120 func typeNameOf(t reflect.Type) string { 121 s1 := t.String() 122 s2 := t.Kind().String() 123 if s1 == s2 { 124 return s1 125 } 126 return s1 + " (" + s2 + ")" 127 } 128 129 func isZero(b []byte) bool { 130 for _, c := range b { 131 if c != 0 { 132 return false 133 } 134 } 135 return true 136 }