github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/cmd/erasure-decode.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package cmd

import (
	"context"
	"errors"
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	xioutil "github.com/minio/minio/internal/ioutil"
)

// Reads in parallel from readers.
type parallelReader struct {
	readers       []io.ReaderAt
	orgReaders    []io.ReaderAt
	dataBlocks    int
	offset        int64
	shardSize     int64
	shardFileSize int64
	buf           [][]byte
	readerToBuf   []int
}

// newParallelReader returns parallelReader.
func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader {
	r2b := make([]int, len(readers))
	for i := range r2b {
		r2b[i] = i
	}
	return &parallelReader{
		readers:       readers,
		orgReaders:    readers,
		dataBlocks:    e.dataBlocks,
		offset:        (offset / e.blockSize) * e.ShardSize(),
		shardSize:     e.ShardSize(),
		shardFileSize: e.ShardFileSize(totalLength),
		buf:           make([][]byte, len(readers)),
		readerToBuf:   r2b,
	}
}

// preferReaders can mark readers as preferred.
// These will be chosen before others.
func (p *parallelReader) preferReaders(prefer []bool) {
	if len(prefer) != len(p.orgReaders) {
		return
	}
	// Copy so we don't change our input.
	tmp := make([]io.ReaderAt, len(p.orgReaders))
	copy(tmp, p.orgReaders)
	p.readers = tmp
	// next is the next non-preferred index.
	next := 0
	for i, ok := range prefer {
		if !ok || p.readers[i] == nil {
			continue
		}
		if i == next {
			next++
			continue
		}
		// Move reader with index i to index next.
		// Do this by swapping next and i.
		p.readers[next], p.readers[i] = p.readers[i], p.readers[next]
		p.readerToBuf[next] = i
		p.readerToBuf[i] = next
		next++
	}
}

// Returns if buf can be erasure decoded.
func (p *parallelReader) canDecode(buf [][]byte) bool {
	bufCount := 0
	for _, b := range buf {
		if len(b) > 0 {
			bufCount++
		}
	}
	return bufCount >= p.dataBlocks
}

// Read reads from readers in parallel. Returns p.dataBlocks number of bufs.
func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) {
	newBuf := dst
	if len(dst) != len(p.readers) {
		newBuf = make([][]byte, len(p.readers))
	} else {
		for i := range newBuf {
			newBuf[i] = newBuf[i][:0]
		}
	}
	var newBufLK sync.RWMutex

	if p.offset+p.shardSize > p.shardFileSize {
		p.shardSize = p.shardFileSize - p.offset
	}
	if p.shardSize == 0 {
		return newBuf, nil
	}

	readTriggerCh := make(chan bool, len(p.readers))
	defer xioutil.SafeClose(readTriggerCh) // close the channel upon return

	for i := 0; i < p.dataBlocks; i++ {
		// Set up read triggers for p.dataBlocks number of reads so that they run in parallel.
		readTriggerCh <- true
	}

	disksNotFound := int32(0)
	bitrotHeal := int32(0)       // Atomic bool flag.
	missingPartsHeal := int32(0) // Atomic bool flag.
	readerIndex := 0
	var wg sync.WaitGroup
	// If readTrigger is true, the next disk.ReadAt() should be tried.
	// If readTrigger is false, the previous disk.ReadAt() was successful and there
	// is no need to try reading the next disk.
	for readTrigger := range readTriggerCh {
		newBufLK.RLock()
		canDecode := p.canDecode(newBuf)
		newBufLK.RUnlock()
		if canDecode {
			break
		}
		if readerIndex == len(p.readers) {
			break
		}
		if !readTrigger {
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			rr := p.readers[i]
			if rr == nil {
				// Since reader is nil, trigger another read.
				readTriggerCh <- true
				return
			}
			bufIdx := p.readerToBuf[i]
			if p.buf[bufIdx] == nil {
				// Reading for the first time on this disk, hence the buffer needs to be allocated.
				// Subsequent reads will reuse this buffer.
				p.buf[bufIdx] = make([]byte, p.shardSize)
			}
			// For the last shard, the shard size might be less than previous shard sizes.
			// Hence the following statement ensures that the buffer size is reset to the right size.
			p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize]
			n, err := rr.ReadAt(p.buf[bufIdx], p.offset)
			if err != nil {
				switch {
				case errors.Is(err, errFileNotFound):
					atomic.StoreInt32(&missingPartsHeal, 1)
				case errors.Is(err, errFileCorrupt):
					atomic.StoreInt32(&bitrotHeal, 1)
				case errors.Is(err, errDiskNotFound):
					atomic.AddInt32(&disksNotFound, 1)
				}

				// This will be communicated upstream.
				p.orgReaders[bufIdx] = nil
				p.readers[i] = nil

				// Since ReadAt returned an error, trigger another read.
				readTriggerCh <- true
				return
			}
			newBufLK.Lock()
			newBuf[bufIdx] = p.buf[bufIdx][:n]
			newBufLK.Unlock()
			// Since ReadAt returned success, there is no need to trigger another read.
			readTriggerCh <- false
		}(readerIndex)
		readerIndex++
	}
	wg.Wait()
	if p.canDecode(newBuf) {
		p.offset += p.shardSize
		if missingPartsHeal == 1 {
			return newBuf, errFileNotFound
		} else if bitrotHeal == 1 {
			return newBuf, errFileCorrupt
		}
		return newBuf, nil
	}

	// If we cannot decode, just return a read quorum error.
	return nil, fmt.Errorf("%w (offline-disks=%d/%d)", errErasureReadQuorum, disksNotFound, len(p.readers))
}

// Decode reads from readers, reconstructs data if needed and writes the data to the writer.
// A set of preferred drives can be supplied. In that case they will be used and the data reconstructed.
func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) {
	if offset < 0 || length < 0 {
		return -1, errInvalidArgument
	}
	if offset+length > totalLength {
		return -1, errInvalidArgument
	}

	if length == 0 {
		return 0, nil
	}

	reader := newParallelReader(readers, e, offset, totalLength)
	if len(prefer) == len(readers) {
		reader.preferReaders(prefer)
	}

	startBlock := offset / e.blockSize
	endBlock := (offset + length) / e.blockSize

	var bytesWritten int64
	var bufs [][]byte
	for block := startBlock; block <= endBlock; block++ {
		var blockOffset, blockLength int64
		switch {
		case startBlock == endBlock:
			blockOffset = offset % e.blockSize
			blockLength = length
		case block == startBlock:
			blockOffset = offset % e.blockSize
			blockLength = e.blockSize - blockOffset
		case block == endBlock:
			blockOffset = 0
			blockLength = (offset + length) % e.blockSize
		default:
			blockOffset = 0
			blockLength = e.blockSize
		}
		if blockLength == 0 {
			break
		}

		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			// Set only if there is enough data for reconstruction,
			// and only for expected errors; also set only once.
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			// For all errors that cannot be reconstructed, fail the read operation.
			return -1, err
		}

		if err = e.DecodeDataBlocks(bufs); err != nil {
			return -1, err
		}

		n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength)
		if err != nil {
			return -1, err
		}

		bytesWritten += n
	}

	if bytesWritten != length {
		return bytesWritten, errLessData
	}

	return bytesWritten, derr
}

// Heal reads from readers, reconstructs shards and writes the data to the writers.
func (e Erasure) Heal(ctx context.Context, writers []io.Writer, readers []io.ReaderAt, totalLength int64, prefer []bool) (derr error) {
	if len(writers) != e.parityBlocks+e.dataBlocks {
		return errInvalidArgument
	}

	reader := newParallelReader(readers, e, 0, totalLength)
	if len(readers) == len(prefer) {
		reader.preferReaders(prefer)
	}

	startBlock := int64(0)
	endBlock := totalLength / e.blockSize
	if totalLength%e.blockSize != 0 {
		endBlock++
	}

	var bufs [][]byte
	for block := startBlock; block < endBlock; block++ {
		var err error
		bufs, err = reader.Read(bufs)
		if len(bufs) > 0 {
			if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) {
				if derr == nil {
					derr = err
				}
			}
		} else if err != nil {
			return err
		}

		if err = e.DecodeDataAndParityBlocks(ctx, bufs); err != nil {
			return err
		}

		w := parallelWriter{
			writers:     writers,
			writeQuorum: 1,
			errs:        make([]error, len(writers)),
		}

		if err = w.Write(ctx, bufs); err != nil {
			return err
		}
	}

	return derr
}
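
// The sketch below is not part of the original file. It is an illustrative,
// self-contained restatement of the read-trigger pattern used by
// parallelReader.Read above, assuming plain io.ReaderAt sources and a
// required number of successful reads (required <= len(sources)). All names
// here (readTriggerSketch, sources, required) are invented for the example;
// error classification, buffer reuse, prefer ordering and heal signalling are
// omitted. The design point it isolates: seeding the channel with exactly
// `required` tokens keeps only that many reads in flight on the fast path,
// and a failed read hands its token back so one extra source is tried.
func readTriggerSketch(sources []io.ReaderAt, required int, shardSize, offset int64) [][]byte {
	bufs := make([][]byte, len(sources))
	var mu sync.Mutex
	var wg sync.WaitGroup

	// Buffered channel of read triggers: every received true value permits
	// one more read attempt.
	triggers := make(chan bool, len(sources))
	for i := 0; i < required; i++ {
		triggers <- true
	}

	// filled reports how many sources have produced data so far.
	filled := func() int {
		mu.Lock()
		defer mu.Unlock()
		n := 0
		for _, b := range bufs {
			if len(b) > 0 {
				n++
			}
		}
		return n
	}

	next := 0
	for retry := range triggers {
		if filled() >= required || next == len(sources) {
			break
		}
		if !retry {
			// A previous read succeeded; no replacement read is needed.
			continue
		}
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			buf := make([]byte, shardSize)
			n, err := sources[i].ReadAt(buf, offset)
			if err != nil {
				// Failed read: hand the token back as true so another
				// source gets tried.
				triggers <- true
				return
			}
			mu.Lock()
			bufs[i] = buf[:n]
			mu.Unlock()
			// Successful read: hand the token back as false.
			triggers <- false
		}(next)
		next++
	}
	// The channel capacity (len(sources)) is always enough for the handed-back
	// tokens, so no goroutine blocks on send; wait for stragglers and return.
	wg.Wait()
	return bufs
}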