github.com/grailbio/base@v0.0.11/digest/digestreader.go (about)

     1  // Copyright 2017 GRAIL, Inc. All rights reserved.
     2  // Use of this source code is governed by the Apache 2.0
     3  // license that can be found in the LICENSE file.
     4  
     5  package digest
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"sync"
    11  	"sync/atomic"
    12  )
    13  
    14  // Reader can be used to calculate the digest of a file as it is being
    15  // read. It uses back pressure to stall reads when a block is missing.
    16  // This can cause deadlock if the application doesn't retry immediately.
    17  //
    18  // The s3manager uploader differentiates between two kinds of readers to
    19  // improve upload performance: Simple Readers and "ReaderSeekers" for
    20  // performance. This implementation creates either a simpleReaderAt or a
    21  // readerAtSeeker depending on the underlying ReaderAt.
    22  //
    23  // Expects the reads to be complete and non-overlapping.
    24  type Reader interface {
    25  	io.Reader
    26  	Digest() (Digest, error)
    27  }
    28  
    29  type readerWrap struct {
    30  	mu           sync.Mutex // GUARDS reader.
    31  	err          error
    32  	digestWriter Writer
    33  	source       io.Reader
    34  }
    35  
    36  // Digest returns the digest for the data that has been read.
    37  func (r *readerWrap) Digest() (Digest, error) {
    38  	r.mu.Lock()
    39  	defer r.mu.Unlock()
    40  
    41  	if r.err != nil {
    42  		return Digest{}, r.err
    43  	}
    44  
    45  	return r.digestWriter.Digest(), nil
    46  }
    47  
    48  // Read implements the io.Reader interface. It reads data from the file
    49  // and places it in p, returning the number of bytes placed in the slice as
    50  // well as any error.
    51  func (r *readerWrap) Read(p []byte) (int, error) {
    52  	r.mu.Lock()
    53  	defer r.mu.Unlock()
    54  
    55  	n, err := r.source.Read(p)
    56  	r.err = err
    57  
    58  	if r.err != nil {
    59  		return n, r.err
    60  	}
    61  
    62  	q := p[:n]
    63  	// todo(ysiato, schandra, pknudsgaard) this looks like another intentional no-error-check like digest.go:407
    64  	r.digestWriter.Write(q)
    65  
    66  	return n, r.err
    67  }
    68  
    69  type readerAtSeeker interface {
    70  	io.ReaderAt
    71  	io.ReadSeeker
    72  }
    73  
    74  type readerAtSeekerWrap struct {
    75  	mu           sync.Mutex
    76  	cond         *sync.Cond
    77  	pending      int64
    78  	err          error
    79  	current      int64
    80  	digestWriter Writer
    81  	source       readerAtSeeker
    82  }
    83  
    84  // Read is present to fulfill the io.Reader API, but should not be called.
    85  func (ras *readerAtSeekerWrap) Read(p []byte) (n int, err error) {
    86  	panic("Read should not be called on ReaderAt")
    87  }
    88  
    89  // ReadAt implements the ReaderAt interface.
    90  func (ras *readerAtSeekerWrap) ReadAt(p []byte, off int64) (int, error) {
    91  	// pending should be incremented, but concurrency for the source.ReadAt
    92  	// should be maintained. Using atomic means that we don't have to
    93  	// acquire/release/read/acquire/release.
    94  	for {
    95  		n := atomic.LoadInt64(&ras.pending)
    96  		if n < 0 {
    97  			panic("digest already called")
    98  		}
    99  		if atomic.CompareAndSwapInt64(&ras.pending, n, n+1) {
   100  			break
   101  		}
   102  	}
   103  	defer atomic.AddInt64(&ras.pending, -1)
   104  
   105  	n, err := ras.source.ReadAt(p, off)
   106  
   107  	ras.mu.Lock()
   108  	defer ras.mu.Unlock()
   109  
   110  	if ras.err != nil {
   111  		return 0, ras.err
   112  	}
   113  
   114  	ras.err = err
   115  
   116  	for ras.current != off && ras.err == nil {
   117  		ras.cond.Wait()
   118  	}
   119  
   120  	if ras.err != nil {
   121  		return 0, ras.err
   122  	}
   123  
   124  	q := p[:n]
   125  	ras.digestWriter.Write(q)
   126  
   127  	ras.current += int64(n)
   128  	ras.cond.Broadcast()
   129  
   130  	return n, ras.err
   131  }
   132  
   133  func (ras *readerAtSeekerWrap) Seek(offset int64, whence int) (int64, error) {
   134  	return ras.source.Seek(offset, whence)
   135  }
   136  
   137  // Digest returns the digest for the data. Digest cannot be called with pending
   138  // reads.
   139  func (ras *readerAtSeekerWrap) Digest() (Digest, error) {
   140  	ras.mu.Lock()
   141  	defer ras.mu.Unlock()
   142  
   143  	for {
   144  		n := atomic.LoadInt64(&ras.pending)
   145  		if n > 0 {
   146  			panic(fmt.Sprintf("Digest() called before all writes have completed, %d pending", ras.pending))
   147  		}
   148  		if n < 0 || atomic.CompareAndSwapInt64(&ras.pending, n, -1) {
   149  			break
   150  		}
   151  	}
   152  
   153  	if ras.err != nil {
   154  		return Digest{}, ras.err
   155  	}
   156  
   157  	return ras.digestWriter.Digest(), nil
   158  }
   159  
   160  // NewReader creates a new WriterAt.
   161  func (d Digester) NewReader(source io.Reader) Reader {
   162  	ras, ok := source.(readerAtSeeker)
   163  	if ok {
   164  		result := &readerAtSeekerWrap{
   165  			digestWriter: d.NewWriter(),
   166  			source:       ras,
   167  		}
   168  		result.cond = sync.NewCond(&result.mu)
   169  
   170  		return result
   171  	}
   172  
   173  	result := &readerWrap{
   174  		digestWriter: d.NewWriter(),
   175  		source:       source,
   176  	}
   177  
   178  	return result
   179  }