storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-decode.go

storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-decode.go (about)

     1  /*
     2   * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package cmd
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"io"
    23  	"sync"
    24  	"sync/atomic"
    25  
    26  	"storj.io/minio/cmd/logger"
    27  )
    28  
    29  // Reads in parallel from readers.
    30  type parallelReader struct {
    31  	readers       []io.ReaderAt
    32  	orgReaders    []io.ReaderAt
    33  	dataBlocks    int
    34  	errs          []error
    35  	offset        int64
    36  	shardSize     int64
    37  	shardFileSize int64
    38  	buf           [][]byte
    39  	readerToBuf   []int
    40  }
    41  
    42  // newParallelReader returns parallelReader.
    43  func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
    44  	r2b := make([]int, len(readers))
    45  	for i := range r2b {
    46  		r2b[i] = i
    47  	}
    48  	return &parallelReader{
    49  		readers:       readers,
    50  		orgReaders:    readers,
    51  		errs:          make([]error, len(readers)),
    52  		dataBlocks:    e.dataBlocks,
    53  		offset:        (offset / e.blockSize) * e.ShardSize(),
    54  		shardSize:     e.ShardSize(),
    55  		shardFileSize: e.ShardFileSize(totalLength),
    56  		buf:           make([][]byte, len(readers)),
    57  		readerToBuf:   r2b,
    58  	}
    59  }
    60  
    61  // preferReaders can mark readers as preferred.
    62  // These will be chosen before others.
    63  func (p *parallelReader) preferReaders(prefer []bool) {
    64  	if len(prefer) != len(p.orgReaders) {
    65  		return
    66  	}
    67  	// Copy so we don't change our input.
    68  	tmp := make([]io.ReaderAt, len(p.orgReaders))
    69  	copy(tmp, p.orgReaders)
    70  	p.readers = tmp
    71  	// next is the next non-preferred index.
    72  	next := 0
    73  	for i, ok := range prefer {
    74  		if !ok || p.readers[i] == nil {
    75  			continue
    76  		}
    77  		if i == next {
    78  			next++
    79  			continue
    80  		}
    81  		// Move reader with index i to index next.
    82  		// Do this by swapping next and i
    83  		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
    84  		p.readerToBuf[next] = i
    85  		p.readerToBuf[i] = next
    86  		next++
    87  	}
    88  }
    89  
    90  // Returns if buf can be erasure decoded.
    91  func (p *parallelReader) canDecode(buf [][]byte) bool {
    92  	bufCount := 0
    93  	for _, b := range buf {
    94  		if len(b) > 0 {
    95  			bufCount++
    96  		}
    97  	}
    98  	return bufCount >= p.dataBlocks
    99  }
   100  
   101  // Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
   102  func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
   103  	newBuf := dst
   104  	if len(dst) != len(p.readers) {
   105  		newBuf = make([][]byte, len(p.readers))
   106  	} else {
   107  		for i := range newBuf {
   108  			newBuf[i] = newBuf[i][:0]
   109  		}
   110  	}
   111  	var newBufLK sync.RWMutex
   112  
   113  	if p.offset+p.shardSize > p.shardFileSize {
   114  		p.shardSize = p.shardFileSize - p.offset
   115  	}
   116  	if p.shardSize == 0 {
   117  		return newBuf, nil
   118  	}
   119  
   120  	readTriggerCh := make(chan bool, len(p.readers))
   121  	for i := 0; i < p.dataBlocks; i++ {
   122  		// Setup read triggers for p.dataBlocks number of reads so that it reads in parallel.
   123  		readTriggerCh <- true
   124  	}
   125  
   126  	bitrotHeal := int32(0)       // Atomic bool flag.
   127  	missingPartsHeal := int32(0) // Atomic bool flag.
   128  	readerIndex := 0
   129  	var wg sync.WaitGroup
   130  	// if readTrigger is true, it implies next disk.ReadAt() should be tried
   131  	// if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need
   132  	// to try reading the next disk.
   133  	for readTrigger := range readTriggerCh {
   134  		newBufLK.RLock()
   135  		canDecode := p.canDecode(newBuf)
   136  		newBufLK.RUnlock()
   137  		if canDecode {
   138  			break
   139  		}
   140  		if readerIndex == len(p.readers) {
   141  			break
   142  		}
   143  		if !readTrigger {
   144  			continue
   145  		}
   146  		wg.Add(1)
   147  		go func(i int) {
   148  			defer wg.Done()
   149  			rr := p.readers[i]
   150  			if rr == nil {
   151  				// Since reader is nil, trigger another read.
   152  				readTriggerCh <- true
   153  				return
   154  			}
   155  			bufIdx := p.readerToBuf[i]
   156  			if p.buf[bufIdx] == nil {
   157  				// Reading first time on this disk, hence the buffer needs to be allocated.
   158  				// Subsequent reads will re-use this buffer.
   159  				p.buf[bufIdx] = make([]byte, p.shardSize)
   160  			}
   161  			// For the last shard, the shardsize might be less than previous shard sizes.
   162  			// Hence the following statement ensures that the buffer size is reset to the right size.
   163  			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
   164  			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
   165  			if err != nil {
   166  				if errors.Is(err, errFileNotFound) {
   167  					atomic.StoreInt32(&missingPartsHeal, 1)
   168  				} else if errors.Is(err, errFileCorrupt) {
   169  					atomic.StoreInt32(&bitrotHeal, 1)
   170  				}
   171  
   172  				// This will be communicated upstream.
   173  				p.orgReaders[bufIdx] = nil
   174  				p.readers[i] = nil
   175  				p.errs[i] = err
   176  
   177  				// Since ReadAt returned error, trigger another read.
   178  				readTriggerCh <- true
   179  				return
   180  			}
   181  			newBufLK.Lock()
   182  			newBuf[bufIdx] = p.buf[bufIdx][:n]
   183  			newBufLK.Unlock()
   184  			// Since ReadAt returned success, there is no need to trigger another read.
   185  			readTriggerCh <- false
   186  		}(readerIndex)
   187  		readerIndex++
   188  	}
   189  	wg.Wait()
   190  	if p.canDecode(newBuf) {
   191  		p.offset += p.shardSize
   192  		if atomic.LoadInt32(&missingPartsHeal) == 1 {
   193  			return newBuf, errFileNotFound
   194  		} else if atomic.LoadInt32(&bitrotHeal) == 1 {
   195  			return newBuf, errFileCorrupt
   196  		}
   197  		return newBuf, nil
   198  	}
   199  
   200  	return nil, reduceReadQuorumErrs(context.Background(), p.errs, objectOpIgnoredErrs, p.dataBlocks)
   201  }
   202  
   203  // Decode reads from readers, reconstructs data if needed and writes the data to the writer.
   204  // A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
   205  func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
   206  	if offset < 0 || length < 0 {
   207  		logger.LogIf(ctx, errInvalidArgument)
   208  		return -1, errInvalidArgument
   209  	}
   210  	if offset+length > totalLength {
   211  		logger.LogIf(ctx, errInvalidArgument)
   212  		return -1, errInvalidArgument
   213  	}
   214  
   215  	if length == 0 {
   216  		return 0, nil
   217  	}
   218  
   219  	reader := newParallelReader(readers, e, offset, totalLength)
   220  	if len(prefer) == len(readers) {
   221  		reader.preferReaders(prefer)
   222  	}
   223  
   224  	startBlock := offset / e.blockSize
   225  	endBlock := (offset + length) / e.blockSize
   226  
   227  	var bytesWritten int64
   228  	var bufs [][]byte
   229  	for block := startBlock; block <= endBlock; block++ {
   230  		var blockOffset, blockLength int64
   231  		switch {
   232  		case startBlock == endBlock:
   233  			blockOffset = offset % e.blockSize
   234  			blockLength = length
   235  		case block == startBlock:
   236  			blockOffset = offset % e.blockSize
   237  			blockLength = e.blockSize - blockOffset
   238  		case block == endBlock:
   239  			blockOffset = 0
   240  			blockLength = (offset + length) % e.blockSize
   241  		default:
   242  			blockOffset = 0
   243  			blockLength = e.blockSize
   244  		}
   245  		if blockLength == 0 {
   246  			break
   247  		}
   248  
   249  		var err error
   250  		bufs, err = reader.Read(bufs)
   251  		if len(bufs) > 0 {
   252  			// Set only if there are be enough data for reconstruction.
   253  			// and only for expected errors, also set once.
   254  			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
   255  				if derr == nil {
   256  					derr = err
   257  				}
   258  			}
   259  		} else if err != nil {
   260  			// For all errors that cannot be reconstructed fail the read operation.
   261  			return -1, err
   262  		}
   263  
   264  		if err = e.DecodeDataBlocks(bufs); err != nil {
   265  			logger.LogIf(ctx, err)
   266  			return -1, err
   267  		}
   268  
   269  		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
   270  		if err != nil {
   271  			return -1, err
   272  		}
   273  
   274  		bytesWritten += n
   275  	}
   276  
   277  	if bytesWritten != length {
   278  		logger.LogIf(ctx, errLessData)
   279  		return bytesWritten, errLessData
   280  	}
   281  
   282  	return bytesWritten, derr
   283  }