github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/buffer_pool.go

package parquet

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"
)

// BufferPool is an interface abstracting the underlying implementation of
// page buffer pools.
//
// The parquet-go package provides two implementations of this interface, one
// backed by in-memory buffers (on the Go heap), and the other using temporary
// files on disk.
//
// Applications which need finer-grained control over the allocation and
// retention of page buffers may choose to provide their own implementation
// and install it via the parquet.ColumnPageBuffers writer option.
//
// BufferPool implementations must be safe to use concurrently from multiple
// goroutines.
type BufferPool interface {
	// GetBuffer is called when a parquet writer needs to acquire a new
	// page buffer from the pool.
	GetBuffer() io.ReadWriteSeeker

	// PutBuffer is called when a parquet writer releases a page buffer to
	// the pool.
	//
	// The parquet.Writer type guarantees that the buffers it calls this method
	// with were previously acquired by a call to GetBuffer on the same
	// pool, and that it will not use them anymore after the call.
	PutBuffer(io.ReadWriteSeeker)
}

// NewBufferPool creates a new in-memory page buffer pool.
//
// The implementation is backed by sync.Pool and allocates memory buffers on
// the Go heap.
func NewBufferPool() BufferPool { return new(memoryBufferPool) }

// memoryBuffer is an io.ReadWriteSeeker over a single heap-allocated byte
// slice. Reads and writes share one offset, so callers must seek back to the
// start before reading data they just wrote.
type memoryBuffer struct {
	data []byte
	off  int
}

func (p *memoryBuffer) Reset() {
	p.data, p.off = p.data[:0], 0
}

func (p *memoryBuffer) Read(b []byte) (n int, err error) {
	n = copy(b, p.data[p.off:])
	p.off += n
	if p.off == len(p.data) {
		err = io.EOF
	}
	return n, err
}

// Write copies b into the buffer at the current offset, reusing spare
// capacity when possible and growing the underlying slice otherwise. It is
// intended for sequential writes: writing before the end of the buffer
// truncates any previously written data past the end of the written region.
func (p *memoryBuffer) Write(b []byte) (int, error) {
	n := copy(p.data[p.off:cap(p.data)], b)
	p.data = p.data[:p.off+n]

	if n < len(b) {
		p.data = append(p.data, b[n:]...)
	}

	p.off += len(b)
	return len(b), nil
}

func (p *memoryBuffer) WriteTo(w io.Writer) (int64, error) {
	n, err := w.Write(p.data[p.off:])
	p.off += n
	return int64(n), err
}

// Seek implements io.Seeker. Offsets past the end of the buffer are clamped
// to the end rather than producing an error.
func (p *memoryBuffer) Seek(offset int64, whence int) (int64, error) {
	switch whence {
	case io.SeekCurrent:
		offset += int64(p.off)
	case io.SeekEnd:
		offset += int64(len(p.data))
	}
	if offset < 0 {
		return 0, fmt.Errorf("seek: negative offset: %d<0", offset)
	}
	if offset > int64(len(p.data)) {
		offset = int64(len(p.data))
	}
	p.off = int(offset)
	return offset, nil
}

type memoryBufferPool struct{ sync.Pool }

func (pool *memoryBufferPool) GetBuffer() io.ReadWriteSeeker {
	b, _ := pool.Get().(*memoryBuffer)
	if b == nil {
		b = new(memoryBuffer)
	} else {
		b.Reset()
	}
	return b
}

func (pool *memoryBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
	if b, _ := buf.(*memoryBuffer); b != nil {
		pool.Put(b)
	}
}
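// The sketch below is an illustrative addition, not part of the upstream
// file (the function name is hypothetical). It shows the round trip a writer
// performs against a pool: acquire a buffer, write page data, seek back to
// the start, read it out, and release the buffer to the pool.
func exampleBufferPoolRoundTrip() ([]byte, error) {
	pool := NewBufferPool()
	buf := pool.GetBuffer()
	defer pool.PutBuffer(buf)

	if _, err := buf.Write([]byte("page data")); err != nil {
		return nil, err
	}
	// Reads and writes share one offset, so rewind before reading back.
	if _, err := buf.Seek(0, io.SeekStart); err != nil {
		return nil, err
	}
	return io.ReadAll(buf)
}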
// NewChunkBufferPool creates a new in-memory page buffer pool.
//
// The implementation is backed by sync.Pool and allocates memory buffers on
// the Go heap in fixed-size chunks.
func NewChunkBufferPool(chunkSize int) BufferPool {
	return newChunkMemoryBufferPool(chunkSize)
}

func newChunkMemoryBufferPool(chunkSize int) *chunkMemoryBufferPool {
	pool := &chunkMemoryBufferPool{}
	pool.bytesPool.New = func() any {
		return make([]byte, chunkSize)
	}
	return pool
}
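// A minimal usage sketch (illustrative, not part of the upstream file; the
// function name is hypothetical): the chunked pool is installed on a writer
// through the parquet.ColumnPageBuffers option referenced in the BufferPool
// documentation above, here with 64 KiB chunks so page buffers grow in
// fixed-size steps instead of reallocating one contiguous slice. This
// assumes the NewWriter constructor and ColumnPageBuffers option defined
// elsewhere in this package.
func exampleInstallChunkPool(output io.Writer) *Writer {
	return NewWriter(output, ColumnPageBuffers(NewChunkBufferPool(64*1024)))
}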
// chunkMemoryBuffer implements an io.ReadWriteSeeker by storing a slice of
// fixed-size buffers into which it copies data. (It uses a sync.Pool to reuse
// buffers across instances.)
type chunkMemoryBuffer struct {
	bytesPool *sync.Pool

	data [][]byte
	idx  int
	off  int
}

func (c *chunkMemoryBuffer) Reset() {
	for i := range c.data {
		c.bytesPool.Put(c.data[i])
	}
	for i := range c.data {
		c.data[i] = nil
	}
	c.data, c.idx, c.off = c.data[:0], 0, 0
}

func (c *chunkMemoryBuffer) Read(b []byte) (n int, err error) {
	if len(b) == 0 {
		return 0, nil
	}

	if c.idx >= len(c.data) {
		return 0, io.EOF
	}

	curData := c.data[c.idx]

	if c.idx == len(c.data)-1 && c.off == len(curData) {
		return 0, io.EOF
	}

	n = copy(b, curData[c.off:])
	c.off += n

	if c.off == cap(curData) {
		c.idx++
		c.off = 0
	}

	return n, err
}

func (c *chunkMemoryBuffer) Write(b []byte) (int, error) {
	lenB := len(b)

	if lenB == 0 {
		return 0, nil
	}

	for len(b) > 0 {
		if c.idx == len(c.data) {
			c.data = append(c.data, c.bytesPool.Get().([]byte)[:0])
		}
		curData := c.data[c.idx]
		n := copy(curData[c.off:cap(curData)], b)
		c.data[c.idx] = curData[:c.off+n]
		c.off += n
		b = b[n:]
		if c.off >= cap(curData) {
			c.idx++
			c.off = 0
		}
	}

	return lenB, nil
}

func (c *chunkMemoryBuffer) WriteTo(w io.Writer) (int64, error) {
	var numWritten int64
	var err error
	// Advance (idx, off) chunk by chunk as data is written out, so the
	// buffer position stays consistent even on a short write. The loop guard
	// also protects against indexing into an empty buffer.
	for err == nil && c.idx < len(c.data) {
		curData := c.data[c.idx]
		n, e := w.Write(curData[c.off:])
		numWritten += int64(n)
		c.off += n
		err = e
		if c.idx == len(c.data)-1 {
			break
		}
		c.idx++
		c.off = 0
	}
	return numWritten, err
}

func (c *chunkMemoryBuffer) Seek(offset int64, whence int) (int64, error) {
	// Because this is the common case, we check it first to avoid computing
	// endOff.
	if offset == 0 && whence == io.SeekStart {
		c.idx = 0
		c.off = 0
		return offset, nil
	}
	endOff := c.endOff()
	switch whence {
	case io.SeekCurrent:
		offset += c.currentOff()
	case io.SeekEnd:
		offset += endOff
	}
	if offset < 0 {
		return 0, fmt.Errorf("seek: negative offset: %d<0", offset)
	}
	if offset > endOff {
		offset = endOff
	}
	// Repeat this case now that we know the absolute offset. This is a bit
	// faster, but mainly protects us from an out-of-bounds if c.data is
	// empty. (If the buffer is empty and the absolute offset isn't zero, we'd
	// have errored (if negative) or clamped to zero (if positive) above.)
	if offset == 0 {
		c.idx = 0
		c.off = 0
	} else {
		stride := cap(c.data[0])
		c.idx = int(offset) / stride
		c.off = int(offset) % stride
	}
	return offset, nil
}

// currentOff returns the absolute position, inverting the (idx, off) mapping
// used by Seek: each full chunk before idx contributes one stride of bytes.
func (c *chunkMemoryBuffer) currentOff() int64 {
	if c.idx == 0 {
		return int64(c.off)
	}
	return int64(c.idx*cap(c.data[0]) + c.off)
}

func (c *chunkMemoryBuffer) endOff() int64 {
	if len(c.data) == 0 {
		return 0
	}
	l := len(c.data)
	last := c.data[l-1]
	return int64(cap(last)*(l-1) + len(last))
}

type chunkMemoryBufferPool struct {
	sync.Pool
	bytesPool sync.Pool
}

func (pool *chunkMemoryBufferPool) GetBuffer() io.ReadWriteSeeker {
	b, _ := pool.Get().(*chunkMemoryBuffer)
	if b == nil {
		b = &chunkMemoryBuffer{bytesPool: &pool.bytesPool}
	} else {
		b.Reset()
	}
	return b
}

func (pool *chunkMemoryBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
	if b, _ := buf.(*chunkMemoryBuffer); b != nil {
		for _, bytes := range b.data {
			b.bytesPool.Put(bytes)
		}
		for i := range b.data {
			b.data[i] = nil
		}
		b.data = b.data[:0]
		pool.Put(b)
	}
}

type fileBufferPool struct {
	err     error
	tempdir string
	pattern string
}

// NewFileBufferPool creates a new on-disk page buffer pool.
func NewFileBufferPool(tempdir, pattern string) BufferPool {
	pool := &fileBufferPool{
		tempdir: tempdir,
		pattern: pattern,
	}
	pool.tempdir, pool.err = filepath.Abs(pool.tempdir)
	return pool
}

func (pool *fileBufferPool) GetBuffer() io.ReadWriteSeeker {
	if pool.err != nil {
		return &errorBuffer{err: pool.err}
	}
	f, err := os.CreateTemp(pool.tempdir, pool.pattern)
	if err != nil {
		return &errorBuffer{err: err}
	}
	return f
}

func (pool *fileBufferPool) PutBuffer(buf io.ReadWriteSeeker) {
	if f, _ := buf.(*os.File); f != nil {
		defer f.Close()
		os.Remove(f.Name())
	}
}

// errorBuffer is handed out by pools that cannot produce a usable buffer;
// every operation on it reports the stored error.
type errorBuffer struct{ err error }

func (buf *errorBuffer) Read([]byte) (int, error)          { return 0, buf.err }
func (buf *errorBuffer) Write([]byte) (int, error)         { return 0, buf.err }
func (buf *errorBuffer) ReadFrom(io.Reader) (int64, error) { return 0, buf.err }
func (buf *errorBuffer) WriteTo(io.Writer) (int64, error)  { return 0, buf.err }
func (buf *errorBuffer) Seek(int64, int) (int64, error)    { return 0, buf.err }

var (
	defaultColumnBufferPool  = *newChunkMemoryBufferPool(256 * 1024)
	defaultSortingBufferPool memoryBufferPool

	_ io.ReaderFrom      = (*errorBuffer)(nil)
	_ io.WriterTo        = (*errorBuffer)(nil)
	_ io.ReadWriteSeeker = (*memoryBuffer)(nil)
	_ io.WriterTo        = (*memoryBuffer)(nil)
	_ io.ReadWriteSeeker = (*chunkMemoryBuffer)(nil)
	_ io.WriterTo        = (*chunkMemoryBuffer)(nil)
)

// readerAt adapts an io.ReadSeeker to io.ReaderAt, seeking only when the
// requested offset differs from the last known position.
type readerAt struct {
	reader io.ReadSeeker
	offset int64
}

func (r *readerAt) ReadAt(b []byte, off int64) (int, error) {
	if r.offset < 0 || off != r.offset {
		off, err := r.reader.Seek(off, io.SeekStart)
		if err != nil {
			return 0, err
		}
		r.offset = off
	}
	n, err := r.reader.Read(b)
	r.offset += int64(n)
	return n, err
}

func newReaderAt(r io.ReadSeeker) io.ReaderAt {
	if rr, ok := r.(io.ReaderAt); ok {
		return rr
	}
	return &readerAt{reader: r, offset: -1}
}
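// A final sketch (illustrative, not part of the upstream file; the function
// name is hypothetical): spilling page buffers to disk with the file-backed
// pool. PutBuffer closes and removes the temporary file, so releasing the
// buffer reclaims the disk space.
func exampleFilePoolRoundTrip() error {
	pool := NewFileBufferPool(os.TempDir(), "parquet-pages-*")
	buf := pool.GetBuffer()
	defer pool.PutBuffer(buf)

	// Failures to create the temporary file surface here, on first use,
	// because GetBuffer returns an errorBuffer instead of failing outright.
	if _, err := buf.Write([]byte("spilled page")); err != nil {
		return err
	}
	_, err := buf.Seek(0, io.SeekStart)
	return err
}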