github.com/grafana/pyroscope@v1.18.0/pkg/parquet/row_writer.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  
     6  	"github.com/parquet-go/parquet-go"
     7  )
     8  
// RowWriterFlusher is a parquet.RowWriter that can additionally be flushed.
// It is the destination type accepted by CopyAsRowGroups.
type RowWriterFlusher interface {
	parquet.RowWriter
	// Flush is invoked by CopyAsRowGroups each time a row group is complete,
	// so that the rows written since the previous flush form their own row
	// group.
	Flush() error
}
    13  
    14  // CopyAsRowGroups copies row groups to dst from src and flush a rowgroup per rowGroupNumCount read.
    15  // It returns the total number of rows copied and the number of row groups written.
    16  // Flush is called to create a new row group.
    17  func CopyAsRowGroups(dst RowWriterFlusher, src parquet.RowReader, rowGroupNumCount int) (total uint64, rowGroupCount uint64, err error) {
    18  	if rowGroupNumCount <= 0 {
    19  		panic("rowGroupNumCount must be positive")
    20  	}
    21  	bufferSize := defaultRowBufferSize
    22  	// We clamp the buffer to the rowGroupNumCount to avoid allocating a buffer that is too large.
    23  	if rowGroupNumCount < bufferSize {
    24  		bufferSize = rowGroupNumCount
    25  	}
    26  	var (
    27  		buffer            = make([]parquet.Row, bufferSize)
    28  		currentGroupCount int
    29  	)
    30  
    31  	for {
    32  		n, readErr := src.ReadRows(buffer[:bufferSize])
    33  		if readErr != nil && readErr != io.EOF {
    34  			return 0, 0, readErr
    35  		}
    36  		if n == 0 {
    37  			break
    38  		}
    39  		buffer := buffer[:n]
    40  		if currentGroupCount+n >= rowGroupNumCount {
    41  			batchSize := rowGroupNumCount - currentGroupCount
    42  			written, err := dst.WriteRows(buffer[:batchSize])
    43  			if err != nil {
    44  				return 0, 0, err
    45  			}
    46  			buffer = buffer[batchSize:]
    47  			total += uint64(written)
    48  			if err := dst.Flush(); err != nil {
    49  				return 0, 0, err
    50  			}
    51  			rowGroupCount++
    52  			currentGroupCount = 0
    53  		}
    54  		if len(buffer) == 0 {
    55  			if readErr == io.EOF {
    56  				break
    57  			}
    58  			continue
    59  		}
    60  		written, err := dst.WriteRows(buffer)
    61  		if err != nil {
    62  			return 0, 0, err
    63  		}
    64  		total += uint64(written)
    65  		currentGroupCount += written
    66  		if readErr == io.EOF {
    67  			break
    68  		}
    69  	}
    70  	if currentGroupCount > 0 {
    71  		if err := dst.Flush(); err != nil {
    72  			return 0, 0, err
    73  		}
    74  		rowGroupCount++
    75  	}
    76  	return
    77  }