storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/cmd/erasure-decode.go (about) 1 /* 2 * MinIO Cloud Storage, (C) 2016-2020 MinIO, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package cmd 18 19 import ( 20 "context" 21 "errors" 22 "io" 23 "sync" 24 "sync/atomic" 25 26 "storj.io/minio/cmd/logger" 27 ) 28 29 // Reads in parallel from readers. 30 type parallelReader struct { 31 readers []io.ReaderAt 32 orgReaders []io.ReaderAt 33 dataBlocks int 34 errs []error 35 offset int64 36 shardSize int64 37 shardFileSize int64 38 buf [][]byte 39 readerToBuf []int 40 } 41 42 // newParallelReader returns parallelReader. 43 func newParallelReader(readers []io.ReaderAt, e Erasure, offset, totalLength int64) *parallelReader { 44 r2b := make([]int, len(readers)) 45 for i := range r2b { 46 r2b[i] = i 47 } 48 return ¶llelReader{ 49 readers: readers, 50 orgReaders: readers, 51 errs: make([]error, len(readers)), 52 dataBlocks: e.dataBlocks, 53 offset: (offset / e.blockSize) * e.ShardSize(), 54 shardSize: e.ShardSize(), 55 shardFileSize: e.ShardFileSize(totalLength), 56 buf: make([][]byte, len(readers)), 57 readerToBuf: r2b, 58 } 59 } 60 61 // preferReaders can mark readers as preferred. 62 // These will be chosen before others. 63 func (p *parallelReader) preferReaders(prefer []bool) { 64 if len(prefer) != len(p.orgReaders) { 65 return 66 } 67 // Copy so we don't change our input. 68 tmp := make([]io.ReaderAt, len(p.orgReaders)) 69 copy(tmp, p.orgReaders) 70 p.readers = tmp 71 // next is the next non-preferred index. 72 next := 0 73 for i, ok := range prefer { 74 if !ok || p.readers[i] == nil { 75 continue 76 } 77 if i == next { 78 next++ 79 continue 80 } 81 // Move reader with index i to index next. 82 // Do this by swapping next and i 83 p.readers[next], p.readers[i] = p.readers[i], p.readers[next] 84 p.readerToBuf[next] = i 85 p.readerToBuf[i] = next 86 next++ 87 } 88 } 89 90 // Returns if buf can be erasure decoded. 91 func (p *parallelReader) canDecode(buf [][]byte) bool { 92 bufCount := 0 93 for _, b := range buf { 94 if len(b) > 0 { 95 bufCount++ 96 } 97 } 98 return bufCount >= p.dataBlocks 99 } 100 101 // Read reads from readers in parallel. Returns p.dataBlocks number of bufs. 102 func (p *parallelReader) Read(dst [][]byte) ([][]byte, error) { 103 newBuf := dst 104 if len(dst) != len(p.readers) { 105 newBuf = make([][]byte, len(p.readers)) 106 } else { 107 for i := range newBuf { 108 newBuf[i] = newBuf[i][:0] 109 } 110 } 111 var newBufLK sync.RWMutex 112 113 if p.offset+p.shardSize > p.shardFileSize { 114 p.shardSize = p.shardFileSize - p.offset 115 } 116 if p.shardSize == 0 { 117 return newBuf, nil 118 } 119 120 readTriggerCh := make(chan bool, len(p.readers)) 121 for i := 0; i < p.dataBlocks; i++ { 122 // Setup read triggers for p.dataBlocks number of reads so that it reads in parallel. 123 readTriggerCh <- true 124 } 125 126 bitrotHeal := int32(0) // Atomic bool flag. 127 missingPartsHeal := int32(0) // Atomic bool flag. 128 readerIndex := 0 129 var wg sync.WaitGroup 130 // if readTrigger is true, it implies next disk.ReadAt() should be tried 131 // if readTrigger is false, it implies previous disk.ReadAt() was successful and there is no need 132 // to try reading the next disk. 133 for readTrigger := range readTriggerCh { 134 newBufLK.RLock() 135 canDecode := p.canDecode(newBuf) 136 newBufLK.RUnlock() 137 if canDecode { 138 break 139 } 140 if readerIndex == len(p.readers) { 141 break 142 } 143 if !readTrigger { 144 continue 145 } 146 wg.Add(1) 147 go func(i int) { 148 defer wg.Done() 149 rr := p.readers[i] 150 if rr == nil { 151 // Since reader is nil, trigger another read. 152 readTriggerCh <- true 153 return 154 } 155 bufIdx := p.readerToBuf[i] 156 if p.buf[bufIdx] == nil { 157 // Reading first time on this disk, hence the buffer needs to be allocated. 158 // Subsequent reads will re-use this buffer. 159 p.buf[bufIdx] = make([]byte, p.shardSize) 160 } 161 // For the last shard, the shardsize might be less than previous shard sizes. 162 // Hence the following statement ensures that the buffer size is reset to the right size. 163 p.buf[bufIdx] = p.buf[bufIdx][:p.shardSize] 164 n, err := rr.ReadAt(p.buf[bufIdx], p.offset) 165 if err != nil { 166 if errors.Is(err, errFileNotFound) { 167 atomic.StoreInt32(&missingPartsHeal, 1) 168 } else if errors.Is(err, errFileCorrupt) { 169 atomic.StoreInt32(&bitrotHeal, 1) 170 } 171 172 // This will be communicated upstream. 173 p.orgReaders[bufIdx] = nil 174 p.readers[i] = nil 175 p.errs[i] = err 176 177 // Since ReadAt returned error, trigger another read. 178 readTriggerCh <- true 179 return 180 } 181 newBufLK.Lock() 182 newBuf[bufIdx] = p.buf[bufIdx][:n] 183 newBufLK.Unlock() 184 // Since ReadAt returned success, there is no need to trigger another read. 185 readTriggerCh <- false 186 }(readerIndex) 187 readerIndex++ 188 } 189 wg.Wait() 190 if p.canDecode(newBuf) { 191 p.offset += p.shardSize 192 if atomic.LoadInt32(&missingPartsHeal) == 1 { 193 return newBuf, errFileNotFound 194 } else if atomic.LoadInt32(&bitrotHeal) == 1 { 195 return newBuf, errFileCorrupt 196 } 197 return newBuf, nil 198 } 199 200 return nil, reduceReadQuorumErrs(context.Background(), p.errs, objectOpIgnoredErrs, p.dataBlocks) 201 } 202 203 // Decode reads from readers, reconstructs data if needed and writes the data to the writer. 204 // A set of preferred drives can be supplied. In that case they will be used and the data reconstructed. 205 func (e Erasure) Decode(ctx context.Context, writer io.Writer, readers []io.ReaderAt, offset, length, totalLength int64, prefer []bool) (written int64, derr error) { 206 if offset < 0 || length < 0 { 207 logger.LogIf(ctx, errInvalidArgument) 208 return -1, errInvalidArgument 209 } 210 if offset+length > totalLength { 211 logger.LogIf(ctx, errInvalidArgument) 212 return -1, errInvalidArgument 213 } 214 215 if length == 0 { 216 return 0, nil 217 } 218 219 reader := newParallelReader(readers, e, offset, totalLength) 220 if len(prefer) == len(readers) { 221 reader.preferReaders(prefer) 222 } 223 224 startBlock := offset / e.blockSize 225 endBlock := (offset + length) / e.blockSize 226 227 var bytesWritten int64 228 var bufs [][]byte 229 for block := startBlock; block <= endBlock; block++ { 230 var blockOffset, blockLength int64 231 switch { 232 case startBlock == endBlock: 233 blockOffset = offset % e.blockSize 234 blockLength = length 235 case block == startBlock: 236 blockOffset = offset % e.blockSize 237 blockLength = e.blockSize - blockOffset 238 case block == endBlock: 239 blockOffset = 0 240 blockLength = (offset + length) % e.blockSize 241 default: 242 blockOffset = 0 243 blockLength = e.blockSize 244 } 245 if blockLength == 0 { 246 break 247 } 248 249 var err error 250 bufs, err = reader.Read(bufs) 251 if len(bufs) > 0 { 252 // Set only if there are be enough data for reconstruction. 253 // and only for expected errors, also set once. 254 if errors.Is(err, errFileNotFound) || errors.Is(err, errFileCorrupt) { 255 if derr == nil { 256 derr = err 257 } 258 } 259 } else if err != nil { 260 // For all errors that cannot be reconstructed fail the read operation. 261 return -1, err 262 } 263 264 if err = e.DecodeDataBlocks(bufs); err != nil { 265 logger.LogIf(ctx, err) 266 return -1, err 267 } 268 269 n, err := writeDataBlocks(ctx, writer, bufs, e.dataBlocks, blockOffset, blockLength) 270 if err != nil { 271 return -1, err 272 } 273 274 bytesWritten += n 275 } 276 277 if bytesWritten != length { 278 logger.LogIf(ctx, errLessData) 279 return bytesWritten, errLessData 280 } 281 282 return bytesWritten, derr 283 }