github.com/grafana/pyroscope@v1.18.0/pkg/parquet/row_writer.go (about) 1 package parquet 2 3 import ( 4 "io" 5 6 "github.com/parquet-go/parquet-go" 7 ) 8 9 type RowWriterFlusher interface { 10 parquet.RowWriter 11 Flush() error 12 } 13 14 // CopyAsRowGroups copies row groups to dst from src and flush a rowgroup per rowGroupNumCount read. 15 // It returns the total number of rows copied and the number of row groups written. 16 // Flush is called to create a new row group. 17 func CopyAsRowGroups(dst RowWriterFlusher, src parquet.RowReader, rowGroupNumCount int) (total uint64, rowGroupCount uint64, err error) { 18 if rowGroupNumCount <= 0 { 19 panic("rowGroupNumCount must be positive") 20 } 21 bufferSize := defaultRowBufferSize 22 // We clamp the buffer to the rowGroupNumCount to avoid allocating a buffer that is too large. 23 if rowGroupNumCount < bufferSize { 24 bufferSize = rowGroupNumCount 25 } 26 var ( 27 buffer = make([]parquet.Row, bufferSize) 28 currentGroupCount int 29 ) 30 31 for { 32 n, readErr := src.ReadRows(buffer[:bufferSize]) 33 if readErr != nil && readErr != io.EOF { 34 return 0, 0, readErr 35 } 36 if n == 0 { 37 break 38 } 39 buffer := buffer[:n] 40 if currentGroupCount+n >= rowGroupNumCount { 41 batchSize := rowGroupNumCount - currentGroupCount 42 written, err := dst.WriteRows(buffer[:batchSize]) 43 if err != nil { 44 return 0, 0, err 45 } 46 buffer = buffer[batchSize:] 47 total += uint64(written) 48 if err := dst.Flush(); err != nil { 49 return 0, 0, err 50 } 51 rowGroupCount++ 52 currentGroupCount = 0 53 } 54 if len(buffer) == 0 { 55 if readErr == io.EOF { 56 break 57 } 58 continue 59 } 60 written, err := dst.WriteRows(buffer) 61 if err != nil { 62 return 0, 0, err 63 } 64 total += uint64(written) 65 currentGroupCount += written 66 if readErr == io.EOF { 67 break 68 } 69 } 70 if currentGroupCount > 0 { 71 if err := dst.Flush(); err != nil { 72 return 0, 0, err 73 } 74 rowGroupCount++ 75 } 76 return 77 }