go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/common/storage/archive/storage.go

// Copyright 2015 The LUCI Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package archive implements a storage.Storage instance that retrieves logs
// from a Google Storage archive.
//
// This is a special implementation of storage.Storage, and does not fully
// conform to the API expectations. Namely:
//   - It is read-only. Mutation methods will return storage.ErrReadOnly.
//   - Storage methods ignore the supplied Path argument, instead opting for
//     the archive configured in its Options.
package archive

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"sort"
	"sync/atomic"

	"go.chromium.org/luci/common/data/recordio"
	"go.chromium.org/luci/common/errors"
	"go.chromium.org/luci/common/gcloud/gs"
	"go.chromium.org/luci/common/iotools"
	log "go.chromium.org/luci/common/logging"
	"go.chromium.org/luci/logdog/api/logpb"
	"go.chromium.org/luci/logdog/common/storage"
	"go.chromium.org/luci/logdog/common/types"

	cloudStorage "cloud.google.com/go/storage"
	"github.com/golang/protobuf/proto"
)

const (
	// maxStreamRecordSize is the maximum record size we're willing to read from
	// our archived log stream. This will help prevent out-of-memory errors if
	// the archived log stream is malicious or corrupt.
	//
	// Make this twice as large as the maximum log entry size.
	maxStreamRecordSize = 2 * types.MaxLogEntryDataSize
)

// Options is the set of configuration options for this Storage instance.
//
// Unlike other Storage instances, this is bound to a single archived stream.
// Project and Path parameters in requests will be ignored in favor of the
// Google Storage URLs.
type Options struct {
	// Index is the Google Storage URL for the stream's index.
	Index gs.Path
	// Stream is the Google Storage URL for the stream's entries.
	Stream gs.Path

	// Client is the Google Storage client to use for fetching data.
	//
	// Closing this Storage instance does not close the underlying Client.
	Client gs.Client

	// Cache, if not nil, will be used to cache data.
	Cache storage.Cache
}

type storageImpl struct {
	*Options

	index atomic.Value
}
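// An illustrative construction sketch (the bucket/object paths below are
// hypothetical, and gsClient stands for any concrete gs.Client
// implementation supplied by the caller):
//
//	st, err := New(Options{
//		Index:  gs.Path("gs://example-bucket/logs/stream.index"),
//		Stream: gs.Path("gs://example-bucket/logs/stream.entries"),
//		Client: gsClient,
//	})
//	if err != nil {
//		// handle error
//	}
//	defer st.Close()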
// New instantiates a new Storage instance, bound to the supplied Options.
func New(o Options) (storage.Storage, error) {
	s := storageImpl{
		Options: &o,
	}

	if !s.Stream.IsFullPath() {
		return nil, fmt.Errorf("invalid stream URL: %q", s.Stream)
	}
	if s.Index != "" && !s.Index.IsFullPath() {
		return nil, fmt.Errorf("invalid index URL: %q", s.Index)
	}

	return &s, nil
}

func (s *storageImpl) Close() {}

func (s *storageImpl) Put(context.Context, storage.PutRequest) error { return storage.ErrReadOnly }

func (s *storageImpl) Expunge(context.Context, storage.ExpungeRequest) error {
	return storage.ErrReadOnly
}

func (s *storageImpl) Get(c context.Context, req storage.GetRequest, cb storage.GetCallback) error {
	idx, err := s.getIndex(c)
	if err != nil {
		return err
	}

	// Identify the byte offsets that we want to fetch from the entries stream.
	st := buildGetStrategy(&req, idx)
	if st == nil {
		// No more records to read.
		return nil
	}

	switch err := s.getLogEntriesIter(c, st, cb); errors.Unwrap(err) {
	case nil, io.EOF:
		// We hit the end of our log stream.
		return nil

	case cloudStorage.ErrObjectNotExist, cloudStorage.ErrBucketNotExist:
		return storage.ErrDoesNotExist

	default:
		return errors.Annotate(err, "failed to read log stream").Err()
	}
}
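// A sketch of iterating entries with Get (values hypothetical). The callback
// returns false to stop iteration early:
//
//	err := st.Get(ctx, storage.GetRequest{
//		Index: 42, // first entry to return
//		Limit: 10, // at most 10 entries
//	}, func(e *storage.Entry) bool {
//		// process e
//		return true
//	})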
// getLogEntriesIter retrieves log entries from the archive until complete.
func (s *storageImpl) getLogEntriesIter(c context.Context, st *getStrategy, cb storage.GetCallback) error {
	// Compute the byte range to read from the entries stream, then get an
	// archive reader for it.
	var (
		offset = st.startOffset
		length = st.length()
	)

	storageReader, err := s.Client.NewReader(s.Stream, int64(offset), length)
	if err != nil {
		log.WithError(err).Errorf(c, "Failed to create stream Reader.")
		return errors.Annotate(err, "failed to create stream Reader").Err()
	}
	defer func() {
		if tmpErr := storageReader.Close(); tmpErr != nil {
			// (Non-fatal)
			log.WithError(tmpErr).Warningf(c, "Error closing stream Reader.")
		}
	}()

	// Count how many bytes we've read.
	cr := iotools.CountingReader{Reader: storageReader}

	// Iteratively advance our start offset each time we read a complete frame.
	var (
		rio       = recordio.NewReader(&cr, maxStreamRecordSize)
		buf       bytes.Buffer
		remaining = st.count
	)
	for {
		// Reset the count so we know how much we read for this frame.
		cr.Count = 0

		sz, r, err := rio.ReadFrame()
		if err != nil {
			return errors.Annotate(err, "failed to read frame").Err()
		}

		buf.Reset()
		buf.Grow(int(sz))

		switch amt, err := buf.ReadFrom(r); {
		case err != nil:
			log.Fields{
				log.ErrorKey:  err,
				"frameOffset": offset,
				"frameSize":   sz,
			}.Errorf(c, "Failed to read frame data.")
			return errors.Annotate(err, "failed to read frame data").Err()

		case amt != sz:
			// If we didn't buffer the complete frame, we hit a premature EOF.
			return errors.Annotate(io.EOF, "incomplete frame read").Err()
		}

		// If we read from offset 0, the first frame will be the log stream's
		// descriptor, which we can discard.
		discardFrame := (offset == 0)
		offset += uint64(cr.Count)
		if discardFrame {
			continue
		}

		// Punt this log entry to our callback, if appropriate.
		entry := storage.MakeEntry(buf.Bytes(), -1)
		switch idx, err := entry.GetStreamIndex(); {
		case err != nil:
			log.Fields{
				log.ErrorKey:  err,
				"frameOffset": offset,
				"frameSize":   sz,
			}.Errorf(c, "Failed to get log entry index.")
			return errors.Annotate(err, "failed to get log entry index").Err()

		case idx < st.startIndex:
			// Skip this entry, as it's before the first requested entry.
			continue
		}

		// We want to punt this entry, but we also want to re-use our Buffer.
		// Clone its data so it is independent.
		entry.D = make([]byte, len(entry.D))
		copy(entry.D, buf.Bytes())
		if !cb(entry) {
			return nil
		}

		// Enforce our limit, if one is supplied.
		if remaining > 0 {
			remaining--
			if remaining == 0 {
				return nil
			}
		}
	}
}

func (s *storageImpl) Tail(c context.Context, project string, path types.StreamPath) (*storage.Entry, error) {
	idx, err := s.getIndex(c)
	if err != nil {
		return nil, err
	}

	// Get the offset that is as close to our tail record as possible. If we
	// know what that index is (from "idx"), we can request it directly.
	// Otherwise, we will get as close as possible and read forwards from
	// there.
	req := storage.GetRequest{}
	switch {
	case idx.LastStreamIndex > 0:
		req.Index = types.MessageIndex(idx.LastStreamIndex)
		req.Limit = 1

	case len(idx.Entries) > 0:
		req.Index = types.MessageIndex(idx.Entries[len(idx.Entries)-1].StreamIndex)
	}

	// Build a Get strategy for our closest-to-Tail index.
	st := buildGetStrategy(&req, idx)
	if st == nil {
		return nil, storage.ErrDoesNotExist
	}

	// Read forwards to EOF. Retain the last entry that we read.
	var lastEntry *storage.Entry
	err = s.Get(c, req, func(e *storage.Entry) bool {
		lastEntry = e

		// We can stop if we have the last stream index and this is that index.
		if idx.LastStreamIndex > 0 {
			// Get the index for this entry.
			//
			// We can ignore this error, since "Get" will have already resolved
			// the index successfully.
			if sidx, _ := e.GetStreamIndex(); sidx == types.MessageIndex(idx.LastStreamIndex) {
				return false
			}
		}
		return true
	})
	switch {
	case err != nil:
		return nil, err

	case lastEntry == nil:
		return nil, storage.ErrDoesNotExist

	default:
		return lastEntry, nil
	}
}
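// An illustrative Tail call (values hypothetical). Per the package comment,
// the project and path arguments are ignored; the archived stream configured
// in Options is always used:
//
//	entry, err := st.Tail(ctx, "my-project", "my-prefix/+/my-stream")
//	if err == storage.ErrDoesNotExist {
//		// The stream has no entries (or the archive objects are missing).
//	}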
// getIndex returns the cached log stream index, fetching it if necessary.
func (s *storageImpl) getIndex(c context.Context) (*logpb.LogIndex, error) {
	idx := s.index.Load()
	if idx != nil {
		return idx.(*logpb.LogIndex), nil
	}

	index, err := loadIndex(c, s.Client, s.Index, s.Cache)
	switch errors.Unwrap(err) {
	case nil:
		break

	case cloudStorage.ErrBucketNotExist, cloudStorage.ErrObjectNotExist:
		// Treat a missing index the same as an empty index.
		log.WithError(err).Warningf(c, "Index is missing, using empty index.")
		index = &logpb.LogIndex{}

	default:
		return nil, err
	}

	s.index.Store(index)
	return index, nil
}

func loadIndex(c context.Context, client gs.Client, path gs.Path, cache storage.Cache) (*logpb.LogIndex, error) {
	// If there is no path, then return an empty index.
	if path == "" {
		log.Infof(c, "No index path, using empty index.")
		return &logpb.LogIndex{}, nil
	}

	// If we have a cache, see if the index is cached.
	var (
		indexData []byte
		cached    bool
	)
	if cache != nil {
		var ok bool
		indexData, ok = getCachedLogIndexData(c, cache, path)
		if ok {
			cached = true
		}
	}

	if indexData == nil {
		// No cache, or no cached entry. Load from storage.
		r, err := client.NewReader(path, 0, -1)
		if err != nil {
			log.WithError(err).Errorf(c, "Failed to create index Reader.")
			return nil, errors.Annotate(err, "failed to create index Reader").Err()
		}
		defer func() {
			if err := r.Close(); err != nil {
				log.WithError(err).Warningf(c, "Error closing index Reader.")
			}
		}()

		if indexData, err = io.ReadAll(r); err != nil {
			log.WithError(err).Errorf(c, "Failed to read index.")
			return nil, errors.Annotate(err, "failed to read index").Err()
		}
	}

	index := logpb.LogIndex{}
	if err := proto.Unmarshal(indexData, &index); err != nil {
		log.WithError(err).Errorf(c, "Failed to unmarshal index.")
		return nil, errors.Annotate(err, "failed to unmarshal index").Err()
	}

	// If the index is valid but wasn't cached previously, cache it.
	if cache != nil && !cached {
		putCachedLogIndexData(c, cache, path, indexData)
	}

	return &index, nil
}

type getStrategy struct {
	// startIndex is the desired initial log entry index.
	startIndex types.MessageIndex

	// startOffset is the beginning byte offset of the log entry stream. This
	// may be lower than the offset of the starting record if the index is
	// sparse.
	startOffset uint64
	// endOffset is the ending byte offset of the log entry stream. This will
	// be 0 if an end offset is not known.
	endOffset uint64

	// count is the number of log entries that will be fetched. If 0, no upper
	// bound was calculated.
	count uint64
}

func (gs *getStrategy) length() int64 {
	if gs.startOffset < gs.endOffset {
		return int64(gs.endOffset - gs.startOffset)
	}
	return -1
}

// setCount sets the `count` field. If called multiple times, the smallest
// assigned value will be retained.
func (gs *getStrategy) setCount(v uint64) {
	if gs.count == 0 || gs.count > v {
		gs.count = v
	}
}
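// A worked sketch of how buildGetStrategy (below) fills these fields. Assume
// a sparse index with entries at stream indices {0, 64, 128, 192} and a
// request {Index: 100, Limit: 50}:
//
//	startIndex  = 100
//	startOffset = offset of the entry for index 64 (closest entry <= 100)
//	endOffset   = offset of the entry for index 192 (first entry after
//	              100 + 50 = 150)
//	count       = 50
//
// The reader then starts at startOffset and skips entries 64..99 before
// invoking the callback.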
func buildGetStrategy(req *storage.GetRequest, idx *logpb.LogIndex) *getStrategy {
	st := getStrategy{
		startIndex: req.Index,
	}

	// If the user has requested an index past the end of the stream, return
	// nil so no entries are read. This only works if the last stream index is
	// known.
	if idx.LastStreamIndex > 0 && req.Index > types.MessageIndex(idx.LastStreamIndex) {
		return nil
	}

	// Identify the closest index entry to the requested log.
	//
	// If the requested log starts before the first index entry, we must read
	// from record #0.
	startIndexEntry := indexEntryFor(idx.Entries, req.Index)
	if startIndexEntry >= 0 {
		st.startOffset = idx.Entries[startIndexEntry].Offset
	}

	// Determine an upper bound based on our limits.
	//
	// If we have a count limit, identify the maximum entry that can be loaded,
	// find the index entry closest to it, and use that to determine our upper
	// bound.
	if req.Limit > 0 {
		st.setCount(uint64(req.Limit))

		// Find the index entry for the stream entry AFTER the last one we are
		// going to return.
		entryAfterGetBlock := req.Index + types.MessageIndex(req.Limit)
		endIndexEntry := indexEntryFor(idx.Entries, entryAfterGetBlock)
		switch {
		case endIndexEntry < 0:
			// The last possible requested log entry is before the first index
			// entry. Read up to the first index entry.
			endIndexEntry = 0

		case endIndexEntry <= startIndexEntry:
			// The last possible requested log entry is closest to the start
			// index entry. Use the index entry immediately after it.
			endIndexEntry = startIndexEntry + 1

		default:
			// We have the index entry <= the stream entry after the last one
			// that we will return.
			//
			// If we're sparse, this could be the index at or before our last
			// entry. If this is the case, use the next index entry, which will
			// be after "entryAfterGetBlock" (EAGB).
			//
			//	START ------------ LIMIT    (LIMIT+1)
			//	  |      [IDX]       |        [IDX]
			//	index      |   entryAfterGetBlock |
			//	     endIndexEntry       (endIndexEntry+1)
			if types.MessageIndex(idx.Entries[endIndexEntry].StreamIndex) < entryAfterGetBlock {
				endIndexEntry++
			}
		}

		// If we're pointing to a valid index entry, set our upper bound.
		if endIndexEntry < len(idx.Entries) {
			st.endOffset = idx.Entries[endIndexEntry].Offset
		}
	}

	return &st
}

// indexEntryFor identifies the log index entry closest (<=) to the specified
// index.
//
// If the first index entry is greater than our search index, -1 will be
// returned. This should never happen in practice, though, since our index
// construction always indexes log entry #0.
//
// It does this by performing a binary search over the index entries.
func indexEntryFor(entries []*logpb.LogIndex_Entry, i types.MessageIndex) int {
	ui := uint64(i)
	s := sort.Search(len(entries), func(i int) bool {
		return entries[i].StreamIndex > ui
	})

	// The returned index is the one immediately after the entry that we want.
	// If our search returned 0, the first index entry is > our search entry,
	// and we will return -1.
	return s - 1
}
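// A behavior sketch (bracketed values are the StreamIndex fields of the
// entries slice; all values illustrative):
//
//	indexEntryFor([0, 64, 128], 70) == 1  // entry 64 is the closest <= 70
//	indexEntryFor([0, 64, 128], 64) == 1  // exact matches count
//	indexEntryFor([0, 64, 128], 10) == 0  // entry 0
//	indexEntryFor([64, 128], 10)    == -1 // search index precedes first entry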