github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/buffer_pool.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"os"
     7  	"path/filepath"
     8  	"sync"
     9  )
    10  
// BufferPool is an interface abstracting the underlying implementation of
// page buffer pools.
//
// The parquet-go package provides two implementations of this interface, one
// backed by in-memory buffers (on the Go heap), and the other using temporary
// files on disk.
//
// Applications which need finer grain control over the allocation and retention
// of page buffers may choose to provide their own implementation and install it
// via the parquet.ColumnPageBuffers writer option.
//
// BufferPool implementations must be safe to use concurrently from multiple
// goroutines.
type BufferPool interface {
	// GetBuffer is called when a parquet writer needs to acquire a new
	// page buffer from the pool.
	//
	// Buffers returned by the implementations in this package are empty
	// and positioned at offset zero.
	GetBuffer() io.ReadWriteSeeker

	// PutBuffer is called when a parquet writer releases a page buffer to
	// the pool.
	//
	// The parquet.Writer type guarantees that the buffers it calls this method
	// with were previously acquired by a call to GetBuffer on the same
	// pool, and that it will not use them anymore after the call.
	PutBuffer(io.ReadWriteSeeker)
}
    37  
    38  // NewBufferPool creates a new in-memory page buffer pool.
    39  //
    40  // The implementation is backed by sync.Pool and allocates memory buffers on the
    41  // Go heap.
    42  func NewBufferPool() BufferPool { return new(memoryBufferPool) }
    43  
    44  type memoryBuffer struct {
    45  	data []byte
    46  	off  int
    47  }
    48  
    49  func (p *memoryBuffer) Reset() {
    50  	p.data, p.off = p.data[:0], 0
    51  }
    52  
    53  func (p *memoryBuffer) Read(b []byte) (n int, err error) {
    54  	n = copy(b, p.data[p.off:])
    55  	p.off += n
    56  	if p.off == len(p.data) {
    57  		err = io.EOF
    58  	}
    59  	return n, err
    60  }
    61  
    62  func (p *memoryBuffer) Write(b []byte) (int, error) {
    63  	n := copy(p.data[p.off:cap(p.data)], b)
    64  	p.data = p.data[:p.off+n]
    65  
    66  	if n < len(b) {
    67  		p.data = append(p.data, b[n:]...)
    68  	}
    69  
    70  	p.off += len(b)
    71  	return len(b), nil
    72  }
    73  
    74  func (p *memoryBuffer) WriteTo(w io.Writer) (int64, error) {
    75  	n, err := w.Write(p.data[p.off:])
    76  	p.off += n
    77  	return int64(n), err
    78  }
    79  
    80  func (p *memoryBuffer) Seek(offset int64, whence int) (int64, error) {
    81  	switch whence {
    82  	case io.SeekCurrent:
    83  		offset += int64(p.off)
    84  	case io.SeekEnd:
    85  		offset += int64(len(p.data))
    86  	}
    87  	if offset < 0 {
    88  		return 0, fmt.Errorf("seek: negative offset: %d<0", offset)
    89  	}
    90  	if offset > int64(len(p.data)) {
    91  		offset = int64(len(p.data))
    92  	}
    93  	p.off = int(offset)
    94  	return offset, nil
    95  }
    96  
    97  type memoryBufferPool struct{ sync.Pool }
    98  
    99  func (pool *memoryBufferPool) GetBuffer() io.ReadWriteSeeker {
   100  	b, _ := pool.Get().(*memoryBuffer)
   101  	if b == nil {
   102  		b = new(memoryBuffer)
   103  	} else {
   104  		b.Reset()
   105  	}
   106  	return b
   107  }
   108  
   109  func (pool *memoryBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
   110  	if b, _ := buf.(*memoryBuffer); b != nil {
   111  		pool.Put(b)
   112  	}
   113  }
   114  
   115  // NewChunkBufferPool creates a new in-memory page buffer pool.
   116  //
   117  // The implementation is backed by sync.Pool and allocates memory buffers on the
   118  // Go heap in fixed-size chunks.
   119  func NewChunkBufferPool(chunkSize int) BufferPool {
   120  	return newChunkMemoryBufferPool(chunkSize)
   121  }
   122  
   123  func newChunkMemoryBufferPool(chunkSize int) *chunkMemoryBufferPool {
   124  	pool := &chunkMemoryBufferPool{}
   125  	pool.bytesPool.New = func() any {
   126  		return make([]byte, chunkSize)
   127  	}
   128  	return pool
   129  }
   130  
   131  // chunkMemoryBuffer implements an io.ReadWriteSeeker by storing a slice of fixed-size
   132  // buffers into which it copies data. (It uses a sync.Pool to reuse buffers across
   133  // instances.)
   134  type chunkMemoryBuffer struct {
   135  	bytesPool *sync.Pool
   136  
   137  	data [][]byte
   138  	idx  int
   139  	off  int
   140  }
   141  
   142  func (c *chunkMemoryBuffer) Reset() {
   143  	for i := range c.data {
   144  		c.bytesPool.Put(c.data[i])
   145  	}
   146  	for i := range c.data {
   147  		c.data[i] = nil
   148  	}
   149  	c.data, c.idx, c.off = c.data[:0], 0, 0
   150  }
   151  
   152  func (c *chunkMemoryBuffer) Read(b []byte) (n int, err error) {
   153  	if len(b) == 0 {
   154  		return 0, nil
   155  	}
   156  
   157  	if c.idx >= len(c.data) {
   158  		return 0, io.EOF
   159  	}
   160  
   161  	curData := c.data[c.idx]
   162  
   163  	if c.idx == len(c.data)-1 && c.off == len(curData) {
   164  		return 0, io.EOF
   165  	}
   166  
   167  	n = copy(b, curData[c.off:])
   168  	c.off += n
   169  
   170  	if c.off == cap(curData) {
   171  		c.idx++
   172  		c.off = 0
   173  	}
   174  
   175  	return n, err
   176  }
   177  
   178  func (c *chunkMemoryBuffer) Write(b []byte) (int, error) {
   179  	lenB := len(b)
   180  
   181  	if lenB == 0 {
   182  		return 0, nil
   183  	}
   184  
   185  	for len(b) > 0 {
   186  		if c.idx == len(c.data) {
   187  			c.data = append(c.data, c.bytesPool.Get().([]byte)[:0])
   188  		}
   189  		curData := c.data[c.idx]
   190  		n := copy(curData[c.off:cap(curData)], b)
   191  		c.data[c.idx] = curData[:c.off+n]
   192  		c.off += n
   193  		b = b[n:]
   194  		if c.off >= cap(curData) {
   195  			c.idx++
   196  			c.off = 0
   197  		}
   198  	}
   199  
   200  	return lenB, nil
   201  }
   202  
   203  func (c *chunkMemoryBuffer) WriteTo(w io.Writer) (int64, error) {
   204  	var numWritten int64
   205  	var err error
   206  	for err == nil {
   207  		curData := c.data[c.idx]
   208  		n, e := w.Write(curData[c.off:])
   209  		numWritten += int64(n)
   210  		err = e
   211  		if c.idx == len(c.data)-1 {
   212  			c.off = int(numWritten)
   213  			break
   214  		}
   215  		c.idx++
   216  		c.off = 0
   217  	}
   218  	return numWritten, err
   219  }
   220  
   221  func (c *chunkMemoryBuffer) Seek(offset int64, whence int) (int64, error) {
   222  	// Because this is the common case, we check it first to avoid computing endOff.
   223  	if offset == 0 && whence == io.SeekStart {
   224  		c.idx = 0
   225  		c.off = 0
   226  		return offset, nil
   227  	}
   228  	endOff := c.endOff()
   229  	switch whence {
   230  	case io.SeekCurrent:
   231  		offset += c.currentOff()
   232  	case io.SeekEnd:
   233  		offset += endOff
   234  	}
   235  	if offset < 0 {
   236  		return 0, fmt.Errorf("seek: negative offset: %d<0", offset)
   237  	}
   238  	if offset > endOff {
   239  		offset = endOff
   240  	}
   241  	// Repeat this case now that we know the absolute offset. This is a bit faster, but
   242  	// mainly protects us from an out-of-bounds if c.data is empty. (If the buffer is
   243  	// empty and the absolute offset isn't zero, we'd have errored (if negative) or
   244  	// clamped to zero (if positive) above.
   245  	if offset == 0 {
   246  		c.idx = 0
   247  		c.off = 0
   248  	} else {
   249  		stride := cap(c.data[0])
   250  		c.idx = int(offset) / stride
   251  		c.off = int(offset) % stride
   252  	}
   253  	return offset, nil
   254  }
   255  
   256  func (c *chunkMemoryBuffer) currentOff() int64 {
   257  	if c.idx == 0 {
   258  		return int64(c.off)
   259  	}
   260  	return int64((c.idx-1)*cap(c.data[0]) + c.off)
   261  }
   262  
   263  func (c *chunkMemoryBuffer) endOff() int64 {
   264  	if len(c.data) == 0 {
   265  		return 0
   266  	}
   267  	l := len(c.data)
   268  	last := c.data[l-1]
   269  	return int64(cap(last)*(l-1) + len(last))
   270  }
   271  
   272  type chunkMemoryBufferPool struct {
   273  	sync.Pool
   274  	bytesPool sync.Pool
   275  }
   276  
   277  func (pool *chunkMemoryBufferPool) GetBuffer() io.ReadWriteSeeker {
   278  	b, _ := pool.Get().(*chunkMemoryBuffer)
   279  	if b == nil {
   280  		b = &chunkMemoryBuffer{bytesPool: &pool.bytesPool}
   281  	} else {
   282  		b.Reset()
   283  	}
   284  	return b
   285  }
   286  
   287  func (pool *chunkMemoryBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
   288  	if b, _ := buf.(*chunkMemoryBuffer); b != nil {
   289  		for _, bytes := range b.data {
   290  			b.bytesPool.Put(bytes)
   291  		}
   292  		for i := range b.data {
   293  			b.data[i] = nil
   294  		}
   295  		b.data = b.data[:0]
   296  		pool.Put(b)
   297  	}
   298  }
   299  
// fileBufferPool is a BufferPool implementation which backs page buffers with
// temporary files created on disk.
type fileBufferPool struct {
	err     error  // error from resolving tempdir; reported by GetBuffer when non-nil
	tempdir string // absolute directory in which temporary files are created
	pattern string // file name pattern passed to os.CreateTemp
}
   305  
   306  // NewFileBufferPool creates a new on-disk page buffer pool.
   307  func NewFileBufferPool(tempdir, pattern string) BufferPool {
   308  	pool := &fileBufferPool{
   309  		tempdir: tempdir,
   310  		pattern: pattern,
   311  	}
   312  	pool.tempdir, pool.err = filepath.Abs(pool.tempdir)
   313  	return pool
   314  }
   315  
   316  func (pool *fileBufferPool) GetBuffer() io.ReadWriteSeeker {
   317  	if pool.err != nil {
   318  		return &errorBuffer{err: pool.err}
   319  	}
   320  	f, err := os.CreateTemp(pool.tempdir, pool.pattern)
   321  	if err != nil {
   322  		return &errorBuffer{err: err}
   323  	}
   324  	return f
   325  }
   326  
   327  func (pool *fileBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
   328  	if f, _ := buf.(*os.File); f != nil {
   329  		defer f.Close()
   330  		os.Remove(f.Name())
   331  	}
   332  }
   333  
   334  type errorBuffer struct{ err error }
   335  
   336  func (buf *errorBuffer) Read([]byte) (int, error)          { return 0, buf.err }
   337  func (buf *errorBuffer) Write([]byte) (int, error)         { return 0, buf.err }
   338  func (buf *errorBuffer) ReadFrom(io.Reader) (int64, error) { return 0, buf.err }
   339  func (buf *errorBuffer) WriteTo(io.Writer) (int64, error)  { return 0, buf.err }
   340  func (buf *errorBuffer) Seek(int64, int) (int64, error)    { return 0, buf.err }
   341  
var (
	// Package-level default pools. NOTE(review): the names suggest these
	// back column page buffering (256 KiB chunks) and sorting buffers for
	// writers configured without a custom BufferPool — confirm against the
	// writer configuration code elsewhere in the package.
	defaultColumnBufferPool  = *newChunkMemoryBufferPool(256 * 1024)
	defaultSortingBufferPool memoryBufferPool

	// Compile-time assertions that the buffer implementations satisfy the
	// I/O interfaces used by the rest of the package (io.Copy fast paths
	// rely on ReaderFrom/WriterTo being detected).
	_ io.ReaderFrom      = (*errorBuffer)(nil)
	_ io.WriterTo        = (*errorBuffer)(nil)
	_ io.ReadWriteSeeker = (*memoryBuffer)(nil)
	_ io.WriterTo        = (*memoryBuffer)(nil)
	_ io.ReadWriteSeeker = (*chunkMemoryBuffer)(nil)
	_ io.WriterTo        = (*chunkMemoryBuffer)(nil)
)
   353  
   354  type readerAt struct {
   355  	reader io.ReadSeeker
   356  	offset int64
   357  }
   358  
   359  func (r *readerAt) ReadAt(b []byte, off int64) (int, error) {
   360  	if r.offset < 0 || off != r.offset {
   361  		off, err := r.reader.Seek(off, io.SeekStart)
   362  		if err != nil {
   363  			return 0, err
   364  		}
   365  		r.offset = off
   366  	}
   367  	n, err := r.reader.Read(b)
   368  	r.offset += int64(n)
   369  	return n, err
   370  }
   371  
   372  func newReaderAt(r io.ReadSeeker) io.ReaderAt {
   373  	if rr, ok := r.(io.ReaderAt); ok {
   374  		return rr
   375  	}
   376  	return &readerAt{reader: r, offset: -1}
   377  }