github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-decode.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	xioutil "github.com/minio/minio/internal/ioutil"
)

// parallelReader reads in parallel from readers.
type parallelReader struct {
	readers       []io.ReaderAt
	orgReaders    []io.ReaderAt // readers in their original order, before any preference reordering
	dataBlocks    int
	offset        int64
	shardSize     int64
	shardFileSize int64
	buf           [][]byte
	readerToBuf   []int // maps a (possibly reordered) reader index back to its buffer index
}

// newParallelReader returns a parallelReader that starts reading at the shard
// offset corresponding to the requested object offset.
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
	r2b := make([]int, len(readers))
	for i := range r2b {
		r2b[i] = i
	}
	return &parallelReader{
		readers:       readers,
		orgReaders:    readers,
		dataBlocks:    e.dataBlocks,
		offset:        (offset / e.blockSize) * e.ShardSize(),
		shardSize:     e.ShardSize(),
		shardFileSize: e.ShardFileSize(totalLength),
		buf:           make([][]byte, len(readers)),
		readerToBuf:   r2b,
	}
}

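// Illustrative example (hypothetical sizes): the offset arithmetic above maps
// an object offset to a shard offset. With a blockSize of 1 MiB and 4 data
// blocks, ShardSize() is 256 KiB, so an object offset of 3 MiB falls in block
// 3 and every shard read starts at 3 * 256 KiB = 768 KiB on its drive.
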
// preferReaders moves the preferred readers to the front, so they are
// chosen before the others.
func (p *parallelReader) preferReaders(prefer []bool) {
	if len(prefer) != len(p.orgReaders) {
		return
	}
	// Copy so we don't change our input.
	tmp := make([]io.ReaderAt, len(p.orgReaders))
	copy(tmp, p.orgReaders)
	p.readers = tmp
	// next is the next non-preferred index.
	next := 0
	for i, ok := range prefer {
		if !ok || p.readers[i] == nil {
			continue
		}
		if i == next {
			next++
			continue
		}
		// Move the reader at index i to index next by swapping the two entries,
		// keeping readerToBuf in sync so each reader keeps its original buffer slot.
		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
		p.readerToBuf[next], p.readerToBuf[i] = p.readerToBuf[i], p.readerToBuf[next]
		next++
	}
}

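// Illustrative example: with readers [r0, r1, r2, r3] and
// prefer = [false, false, true, true], the readers are reordered to
// [r2, r3, r0, r1] and readerToBuf becomes [2, 3, 0, 1], so each shard still
// lands in the buffer slot of its original drive position.
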
// canDecode returns whether buf holds enough non-empty shards to be erasure decoded.
func (p *parallelReader) canDecode(buf [][]byte) bool {
	bufCount := 0
	for _, b := range buf {
		if len(b) > 0 {
			bufCount++
		}
	}
	return bufCount >= p.dataBlocks
}

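// Illustrative example: in a hypothetical 4 data + 2 parity layout, any 4 of
// the 6 buffers being non-empty is enough to decode the block.
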
// Read reads from the readers in parallel. It returns one buffer per reader,
// with at least p.dataBlocks of them filled when decoding is possible.
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
	newBuf := dst
	if len(dst) != len(p.readers) {
		newBuf = make([][]byte, len(p.readers))
	} else {
		for i := range newBuf {
			newBuf[i] = newBuf[i][:0]
		}
	}
	var newBufLK sync.RWMutex

	if p.offset+p.shardSize > p.shardFileSize {
		p.shardSize = p.shardFileSize - p.offset
	}
	if p.shardSize == 0 {
		return newBuf, nil
	}

	readTriggerCh := make(chan bool, len(p.readers))
	defer xioutil.SafeClose(readTriggerCh) // close the channel upon return

	for i := 0; i < p.dataBlocks; i++ {
		// Set up read triggers for p.dataBlocks number of reads so that they happen in parallel.
		readTriggerCh <- true
	}

	disksNotFound := int32(0)
	bitrotHeal := int32(0)       // Atomic bool flag.
	missingPartsHeal := int32(0) // Atomic bool flag.
	readerIndex := 0
	var wg sync.WaitGroup
	// If readTrigger is true, the next disk.ReadAt() should be tried.
	// If readTrigger is false, the previous disk.ReadAt() succeeded and there
	// is no need to read from the next disk.
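	// For example, with 4 data blocks, 4 triggers are queued up front. If one
	// of those reads fails, its goroutine queues another true trigger and a
	// 5th reader is tried; once any 4 buffers are filled, canDecode ends the loop.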
	for readTrigger := range readTriggerCh {
		newBufLK.RLock()
		canDecode := p.canDecode(newBuf)
		newBufLK.RUnlock()
		if canDecode {
			break
		}
		if readerIndex == len(p.readers) {
			break
		}
		if !readTrigger {
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			rr := p.readers[i]
			if rr == nil {
				// Since the reader is nil, trigger another read.
				readTriggerCh <- true
				return
			}
			bufIdx := p.readerToBuf[i]
			if p.buf[bufIdx] == nil {
				// Reading for the first time from this disk, hence the buffer needs to be allocated.
				// Subsequent reads will reuse this buffer.
				p.buf[bufIdx] = make([]byte, p.shardSize)
			}
			// For the last shard, the shard size might be less than the previous shard sizes.
			// Hence the following statement ensures that the buffer is resliced to the right size.
			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
			if err != nil {
				switch {
				case errors.Is(err, errFileNotFound):
					atomic.StoreInt32(&missingPartsHeal, 1)
				case errors.Is(err, errFileCorrupt):
					atomic.StoreInt32(&bitrotHeal, 1)
				case errors.Is(err, errDiskNotFound):
					atomic.AddInt32(&disksNotFound, 1)
				}

				// This will be communicated upstream.
				p.orgReaders[bufIdx] = nil
				p.readers[i] = nil

				// Since ReadAt returned an error, trigger another read.
				readTriggerCh <- true
				return
			}
			newBufLK.Lock()
			newBuf[bufIdx] = p.buf[bufIdx][:n]
			newBufLK.Unlock()
			// Since ReadAt succeeded, there is no need to trigger another read.
			readTriggerCh <- false
		}(readerIndex)
		readerIndex++
	}
	wg.Wait()
	if p.canDecode(newBuf) {
		p.offset += p.shardSize
		if missingPartsHeal == 1 {
			return newBuf, errFileNotFound
		} else if bitrotHeal == 1 {
			return newBuf, errFileCorrupt
		}
		return newBuf, nil
	}

	// If we cannot decode, just return the read quorum error.
	return nil, fmt.Errorf("%w (offline-disks=%d/%d)", errErasureReadQuorum, disksNotFound, len(p.readers))
}

// Decode reads from the readers, reconstructs the data if needed and writes it to the writer.
// A set of preferred drives can be supplied; in that case they will be used and the data reconstructed.
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
	if offset < 0 || length < 0 {
		return -1, errInvalidArgument
	}
	if offset+length > totalLength {
		return -1, errInvalidArgument
	}

	if length == 0 {
		return 0, nil
	}

	reader := newParallelReader(readers, e, offset, totalLength)
	if len(prefer) == len(readers) {
		reader.preferReaders(prefer)
	}

	startBlock := offset / e.blockSize
	endBlock := (offset + length) / e.blockSize

	var bytesWritten int64
	var bufs [][]byte
	for block := startBlock; block <= endBlock; block++ {
		var blockOffset, blockLength int64
		switch {
		case startBlock == endBlock:
			blockOffset = offset % e.blockSize
			blockLength = length
		case block == startBlock:
			blockOffset = offset % e.blockSize
			blockLength = e.blockSize - blockOffset
		case block == endBlock:
			blockOffset = 0
			blockLength = (offset + length) % e.blockSize
		default:
			blockOffset = 0
			blockLength = e.blockSize
		}
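		// Illustrative example (hypothetical sizes): with blockSize = 1 MiB,
		// offset = 1.5 MiB and length = 2 MiB, blocks 1 through 3 are read:
		// block 1 from blockOffset 0.5 MiB for 0.5 MiB, block 2 in full, and
		// block 3 from blockOffset 0 for the remaining 0.5 MiB.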
		if blockLength == 0 {
			break
		}

		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set derr only if there is enough data for reconstruction,
			// only for expected errors, and only once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			// For all errors from which we cannot reconstruct, fail the read operation.
			return -1, err
		}

		if err = e.DecodeDataBlocks(bufs); err != nil {
			return -1, err
		}

		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
		if err != nil {
			return -1, err
		}

		bytesWritten += n
	}

	if bytesWritten != length {
		return bytesWritten, errLessData
	}

	return bytesWritten, derr
}

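// decodeRangeExample is an illustrative, hypothetical sketch (not part of the
// upstream file) showing how a caller might read a byte range through
// Erasure.Decode, assuming the erasure set, per-drive readers and destination
// writer have been prepared elsewhere. errFileNotFound and errFileCorrupt are
// returned alongside the data to signal that the object is readable but
// should be healed.
func decodeRangeExample(ctx context.Context, e Erasure, readers []io.ReaderAt, w io.Writer, offset, length, totalLength int64) (healNeeded bool, err error) {
	// No preferred drives: pass nil and Decode reads the drives in order.
	_, err = e.Decode(ctx, w, readers, offset, length, totalLength, nil)
	if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
		// The full range was written to w; report that healing is advisable.
		return true, nil
	}
	return false, err
}
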
// Heal reads from the readers, reconstructs the shards and writes the data to the writers.
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64, prefer []bool) (derr error) {
	if len(writers) != e.parityBlocks+e.dataBlocks {
		return errInvalidArgument
	}

	reader := newParallelReader(readers, e, 0, totalLength)
	if len(readers) == len(prefer) {
		reader.preferReaders(prefer)
	}

	startBlock := int64(0)
	endBlock := totalLength / e.blockSize
	if totalLength%e.blockSize != 0 {
		endBlock++
	}

	var bufs [][]byte
	for block := startBlock; block < endBlock; block++ {
		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set derr only if there is enough data for reconstruction,
			// only for expected errors, and only once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			return err
		}

		if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
			return err
		}

		w := parallelWriter{
			writers:     writers,
			writeQuorum: 1,
			errs:        make([]error, len(writers)),
		}

		if err = w.Write(ctx, bufs); err != nil {
			return err
		}
	}

	return derr
}
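
// healObjectExample is an illustrative, hypothetical sketch (not part of the
// upstream file) showing how Heal might be invoked: readers come from the
// drives that still hold the object, and writers must cover the full set of
// e.dataBlocks+e.parityBlocks shards. All names here are hypothetical.
func healObjectExample(ctx context.Context, e Erasure, readers []io.ReaderAt, writers []io.Writer, totalLength int64) error {
	// writers must cover the whole erasure set; Heal rejects any other length.
	if len(writers) != e.dataBlocks+e.parityBlocks {
		return errInvalidArgument
	}
	// No preferred drives: pass nil so the readers are used in their given order.
	return e.Heal(ctx, writers, readers, totalLength, nil)
}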