go.chromium.org/luci@v0.0.0-20240309015107-7cdc2e660f33/logdog/server/archivist/archivist.go

     1  // Copyright 2016 The LUCI Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package archivist
    16  
    17  import (
    18  	"context"
    19  	"crypto/sha256"
    20  	"encoding/base64"
    21  	"io"
    22  	"regexp"
    23  
    24  	cl "cloud.google.com/go/logging"
    25  	"github.com/golang/protobuf/proto"
    26  	mrpb "google.golang.org/genproto/googleapis/api/monitoredres"
    27  
    28  	"go.chromium.org/luci/common/errors"
    29  	"go.chromium.org/luci/common/gcloud"
    30  	"go.chromium.org/luci/common/gcloud/gs"
    31  	"go.chromium.org/luci/common/logging"
    32  	"go.chromium.org/luci/common/retry/transient"
    33  	"go.chromium.org/luci/common/sync/parallel"
    34  	"go.chromium.org/luci/common/tsmon/distribution"
    35  	"go.chromium.org/luci/common/tsmon/field"
    36  	"go.chromium.org/luci/common/tsmon/metric"
    37  	tsmon_types "go.chromium.org/luci/common/tsmon/types"
    38  	"go.chromium.org/luci/config"
    39  
    40  	logdog "go.chromium.org/luci/logdog/api/endpoints/coordinator/services/v1"
    41  	"go.chromium.org/luci/logdog/api/logpb"
    42  	"go.chromium.org/luci/logdog/common/archive"
    43  	"go.chromium.org/luci/logdog/common/storage"
    44  	"go.chromium.org/luci/logdog/common/types"
    45  	"go.chromium.org/luci/logdog/common/viewer"
    46  )
    47  
    48  const (
    49  	tsEntriesField = "entries"
    50  	tsIndexField   = "index"
    51  )
    52  
    53  var logIDRe = regexp.MustCompile(`^[[:alnum:]._\-][[:alnum:]./_\-]{0,510}$`)
    54  
    55  // CLClient is a general interface for the Cloud Logging client, intended to
    56  // enable unit tests to stub out Cloud Logging.
    57  type CLClient interface {
    58  	Close() error
    59  	Logger(logID string, opts ...cl.LoggerOption) *cl.Logger
    60  	Ping(context.Context) error
    61  }
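
        // A minimal test-stub sketch (fakeCLClient is a hypothetical name, not part
        // of this package); such a stub could be returned from
        // Archivist.CLClientFactory in unit tests:
        //
        //	type fakeCLClient struct{}
        //
        //	func (fakeCLClient) Close() error                                 { return nil }
        //	func (fakeCLClient) Logger(string, ...cl.LoggerOption) *cl.Logger { return nil }
        //	func (fakeCLClient) Ping(context.Context) error                   { return nil }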
    62  
    63  var (
    64  	// tsCount counts the raw number of archival tasks that this instance has
    65  	// processed, regardless of success/failure.
    66  	tsCount = metric.NewCounter("logdog/archivist/archive/count",
    67  		"The number of archival tasks processed.",
    68  		nil,
    69  		field.String("project"),
    70  		field.Bool("successful"))
    71  
    72  	// tsSize tracks the archive binary file size distribution of completed
    73  	// archives.
    74  	//
    75  	// The "archive" field is the specific type of archive (entries, index, data)
    76  	// that is being tracked.
    77  	//
    78  	// The "stream" field is the type of log stream that is being archived.
    79  	tsSize = metric.NewCumulativeDistribution("logdog/archivist/archive/size",
    80  		"The size (in bytes) of each archive file.",
    81  		&tsmon_types.MetricMetadata{Units: tsmon_types.Bytes},
    82  		distribution.DefaultBucketer,
    83  		field.String("project"),
    84  		field.String("archive"),
    85  		field.String("stream"))
    86  
    87  	// tsTotalBytes tracks the cumulative total number of bytes that have
    88  	// been archived by this instance.
    89  	//
    90  	// The "archive" field is the specific type of archive (entries, index, data)
    91  	// that is being tracked.
    92  	//
    93  	// The "stream" field is the type of log stream that is being archived.
    94  	tsTotalBytes = metric.NewCounter("logdog/archivist/archive/total_bytes",
    95  		"The total number of archived bytes.",
    96  		&tsmon_types.MetricMetadata{Units: tsmon_types.Bytes},
    97  		field.String("project"),
    98  		field.String("archive"),
    99  		field.String("stream"))
   100  
   101  	// tsLogEntries tracks the number of log entries per individual
   102  	// archival.
   103  	//
   104  	// The "stream" field is the type of log stream that is being archived.
   105  	tsLogEntries = metric.NewCumulativeDistribution("logdog/archivist/archive/log_entries",
   106  		"The total number of log entries per archive.",
   107  		nil,
   108  		distribution.DefaultBucketer,
   109  		field.String("project"),
   110  		field.String("stream"))
   111  
   112  	// tsTotalLogEntries tracks the total number of log entries that have
   113  	// been archived by this instance.
   114  	//
   115  	// The "stream" field is the type of log stream that is being archived.
   116  	tsTotalLogEntries = metric.NewCounter("logdog/archivist/archive/total_log_entries",
   117  		"The total number of log entries.",
   118  		nil,
   119  		field.String("project"),
   120  		field.String("stream"))
   121  )
   122  
   123  // Settings defines the archival parameters for a specific archival operation.
   124  //
   125  // In practice, this will be formed from service and project settings.
   126  type Settings struct {
   127  	// GSBase is the base Google Storage path. This includes the bucket name
   128  	// and any associated path.
   129  	GSBase gs.Path
   130  	// GSStagingBase is the base Google Storage path for archive staging. This
   131  	// includes the bucket name and any associated path.
   132  	GSStagingBase gs.Path
   133  
   134  	// IndexStreamRange is the maximum number of stream indexes in between index
   135  	// entries. See archive.Manifest for more information.
   136  	IndexStreamRange int
   137  	// IndexPrefixRange is the maximum number of prefix indexes in between index
   138  	// entries. See archive.Manifest for more information.
   139  	IndexPrefixRange int
   140  	// IndexByteRange is the maximum number of stream data bytes in between index
   141  	// entries. See archive.Manifest for more information.
   142  	IndexByteRange int
   143  
   144  	// CloudLoggingProjectID is the ID of the Google Cloud Platform project to export
   145  	// logs to.
   146  	//
   147  	// May be empty, if no export is configured.
   148  	CloudLoggingProjectID string
   149  	// CloudLoggingBufferLimit is the maximum number of megabytes that the
   150  	// Cloud Logger will keep in memory per concurrent-task before flushing them
   151  	// out.
   152  	CloudLoggingBufferLimit int
   153  }
   154  
   155  // SettingsLoader returns archival Settings for a given project.
   156  type SettingsLoader func(ctx context.Context, project string) (*Settings, error)
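
        // A minimal static loader sketch, e.g. for tests. The bucket names and index
        // limits below are made-up example values, not defaults of this package:
        //
        //	loader := SettingsLoader(func(ctx context.Context, project string) (*Settings, error) {
        //		return &Settings{
        //			GSBase:           gs.Path("gs://example-archive"),
        //			GSStagingBase:    gs.Path("gs://example-archive-staging"),
        //			IndexStreamRange: 256,
        //			IndexPrefixRange: 256,
        //			IndexByteRange:   16 * 1024 * 1024,
        //		}, nil
        //	})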
   157  
   158  // Archivist is a stateless configuration capable of archiving individual log
   159  // streams.
   160  type Archivist struct {
   161  	// Service is the client to use to communicate with Coordinator's Services
   162  	// endpoint.
   163  	Service logdog.ServicesClient
   164  
   165  	// SettingsLoader loads archival settings for a specific project.
   166  	SettingsLoader SettingsLoader
   167  
   168  	// Storage is the archival source Storage instance.
   169  	Storage storage.Storage
   170  
   171  	// GSClientFactory obtains a Google Storage client for archive generation.
   172  	GSClientFactory func(ctx context.Context, luciProject string) (gs.Client, error)
   173  
   174  	// CLClientFactory obtains a Cloud Logging client for log exports.
   175  	// `luciProject` is the ID of the LUCI project to export logs from, and
   176  	// `clProject` is the ID of the Google Cloud project to export logs to.
   177  	CLClientFactory func(ctx context.Context, luciProject, clProject string, onError func(err error)) (CLClient, error)
   178  }
   179  
   180  // storageBufferSize is the size, in bytes, of the LogEntry buffer that is used
   181  // during archival. This should be greater than the maximum LogEntry size.
   182  const storageBufferSize = types.MaxLogEntryDataSize * 64
   183  
   184  // ArchiveTask processes and executes a single log stream archive task.
   185  //
   186  // If the supplied Context is Done, operation may terminate before completion,
   187  // returning the Context's error.
   188  func (a *Archivist) ArchiveTask(ctx context.Context, task *logdog.ArchiveTask) error {
   189  	err := a.archiveTaskImpl(ctx, task)
   190  
   191  	failure := isFailure(err)
   192  
   193  	// Add a result metric.
   194  	tsCount.Add(ctx, 1, task.Project, !failure)
   195  
   196  	return err
   197  }
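
        // A usage sketch, assuming an already-constructed Archivist `a` and a task
        // delivered by the surrounding service (task delivery is outside this file):
        //
        //	if err := a.ArchiveTask(ctx, task); err != nil {
        //		// A transient error should leave the task queued so it is retried
        //		// later; other outcomes are already handled or reported to the
        //		// Coordinator inside ArchiveTask.
        //		return err
        //	}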
   198  
   199  // archiveTaskImpl performs the actual task archival.
   200  //
   201  // Its error return value is used to indicate how the archive failed. isFailure
   202  // will be called to determine if the returned error value is a failure or a
   203  // status error.
   204  func (a *Archivist) archiveTaskImpl(ctx context.Context, task *logdog.ArchiveTask) error {
   205  	// Validate the project name.
   206  	if err := config.ValidateProjectName(task.Project); err != nil {
   207  		logging.WithError(err).Errorf(ctx, "invalid project name %q", task.Project)
   208  		return nil
   209  	}
   210  
   211  	// Load archival settings for this project.
   212  	settings, err := a.loadSettings(ctx, task.Project)
   213  	switch {
   214  	case err == config.ErrNoConfig:
   215  		logging.WithError(err).Errorf(ctx, "The project config doesn't exist; discarding the task.")
   216  		return nil
   217  	case transient.Tag.In(err):
   218  		// If this is a transient error, exit immediately and do not delete the
   219  		// archival task.
   220  		logging.WithError(err).Warningf(ctx, "TRANSIENT error during loading the project config.")
   221  		return err
   222  	case err != nil:
   223  		// This project has bad or no archival settings, this is non-transient,
   224  		// discard the task.
   225  		logging.WithError(err).Errorf(ctx, "Failed to load settings for project.")
   226  		return nil
   227  	}
   228  
   229  	// Load the log stream's current state. If it is already archived, we will
   230  	// return an immediate success.
   231  	ls, err := a.Service.LoadStream(ctx, &logdog.LoadStreamRequest{
   232  		Project: task.Project,
   233  		Id:      task.Id,
   234  		Desc:    true,
   235  	})
   236  	switch {
   237  	case err != nil:
   238  		logging.WithError(err).Errorf(ctx, "Failed to load log stream.")
   239  		return err
   240  
   241  	case ls.State == nil:
   242  		logging.Errorf(ctx, "Log stream did not include state.")
   243  		return errors.New("log stream did not include state")
   244  
   245  	case ls.State.Purged:
   246  		logging.Warningf(ctx, "Log stream is purged. Discarding archival request.")
   247  		a.expungeStorage(ctx, task.Project, ls.Desc, ls.State.TerminalIndex)
   248  		return nil
   249  
   250  	case ls.State.Archived:
   251  		logging.Infof(ctx, "Log stream is already archived. Discarding archival request.")
   252  		a.expungeStorage(ctx, task.Project, ls.Desc, ls.State.TerminalIndex)
   253  		return nil
   254  
   255  	case ls.State.ProtoVersion != logpb.Version:
   256  		logging.Fields{
   257  			"protoVersion":    ls.State.ProtoVersion,
   258  			"expectedVersion": logpb.Version,
   259  		}.Errorf(ctx, "Unsupported log stream protobuf version.")
   260  		return errors.New("unsupported log stream protobuf version")
   261  
   262  	case ls.Desc == nil:
   263  		logging.Errorf(ctx, "Log stream did not include a descriptor.")
   264  		return errors.New("log stream did not include a descriptor")
   265  	}
   266  
   267  	ar := logdog.ArchiveStreamRequest{
   268  		Project: task.Project,
   269  		Id:      task.Id,
   270  	}
   271  
   272  	// Build our staged archival plan. This doesn't actually do any archiving.
   273  	staged, err := a.makeStagedArchival(ctx, task.Project, task.Realm, settings, ls)
   274  	if err != nil {
   275  		logging.WithError(err).Errorf(ctx, "Failed to create staged archival plan.")
   276  		return err
   277  	}
   278  
   279  	// TODO(crbug.com/1164124) - handle the error from clClient.Close()
   280  	defer staged.Close()
   281  
   282  	// Archive to staging.
   283  	//
   284  	// If a non-transient failure occurs here, we will report it to the Archivist
   285  	// under the assumption that it will continue occurring.
   286  	//
   287  	// We will handle errors from creating and executing the plan in the same
   288  	// switch statement below.
   289  	switch err = staged.stage(); {
   290  	case transient.Tag.In(err):
   291  		// If this is a transient error, exit immediately and do not delete the
   292  		// archival task.
   293  		logging.WithError(err).Warningf(ctx, "TRANSIENT error during archival operation.")
   294  		return err
   295  
   296  	case err != nil:
   297  		// This is a non-transient error, so we are confident that any future
   298  		// Archival will also encounter this error. We will mark this archival
   299  		// as an error and report it to the Coordinator.
   300  		logging.WithError(err).Errorf(ctx, "Archival failed with non-transient error.")
   301  		ar.Error = err.Error()
   302  		if ar.Error == "" {
   303  			// This needs to be non-nil, so if our actual error has an empty string,
   304  			// fill in a generic message.
   305  			ar.Error = "archival error"
   306  		}
   307  
   308  	default:
   309  		// In case something fails, clean up our staged archival (best effort).
   310  		defer staged.cleanup()
   311  
   312  		// Finalize the archival.
   313  		if err := staged.finalize(&ar); err != nil {
   314  			logging.WithError(err).Errorf(ctx, "Failed to finalize archival.")
   315  			return err
   316  		}
   317  
   318  		// Add metrics for this successful archival.
   319  		streamType := staged.desc.StreamType.String()
   320  
   321  		staged.stream.addMetrics(ctx, task.Project, tsEntriesField, streamType)
   322  		staged.index.addMetrics(ctx, task.Project, tsIndexField, streamType)
   323  
   324  		tsLogEntries.Add(ctx, float64(staged.logEntryCount), task.Project, streamType)
   325  		tsTotalLogEntries.Add(ctx, staged.logEntryCount, task.Project, streamType)
   326  	}
   327  
   328  	if _, err := a.Service.ArchiveStream(ctx, &ar); err != nil {
   329  		logging.WithError(err).Errorf(ctx, "Failed to report archive state.")
   330  		return err
   331  	}
   332  	a.expungeStorage(ctx, task.Project, ls.Desc, ar.TerminalIndex)
   333  
   334  	return nil
   335  }
   336  
   337  // expungeStorage does a best-effort expunging of the intermediate storage
   338  // (BigTable) rows after successful archival.
   339  //
   340  // `desc` is a binary-encoded LogStreamDescriptor.
   341  //
   342  // `terminalIndex` should be the terminal index of the archived stream. If it
   343  // is <0 (an empty stream), the expunge is skipped.
   344  func (a *Archivist) expungeStorage(ctx context.Context, project string, desc []byte, terminalIndex int64) {
   345  	if terminalIndex < 0 {
   346  		// no log rows
   347  		return
   348  	}
   349  
   350  	if desc == nil {
   351  		logging.Warningf(ctx, "expungeStorage: nil desc")
   352  		return
   353  	}
   354  
   355  	var lsd logpb.LogStreamDescriptor
   356  	if err := proto.Unmarshal(desc, &lsd); err != nil {
   357  		logging.WithError(err).Warningf(ctx, "expungeStorage: decoding desc")
   358  		return
   359  	}
   360  
   361  	err := a.Storage.Expunge(ctx, storage.ExpungeRequest{
   362  		Path:    lsd.Path(),
   363  		Project: project,
   364  	})
   365  	if err != nil {
   366  		logging.WithError(err).Warningf(ctx, "expungeStorage: failed")
   367  	}
   368  }
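
        // Within this file, `desc` comes from LoadStreamResponse.Desc. A caller
        // sketch with hypothetical values would look like:
        //
        //	descBytes, err := proto.Marshal(&logpb.LogStreamDescriptor{
        //		Prefix: "example/prefix",
        //		Name:   "example/name",
        //	})
        //	if err == nil {
        //		a.expungeStorage(ctx, "example-project", descBytes, terminalIndex)
        //	}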
   369  
   370  // loadSettings loads and validates archival settings.
   371  func (a *Archivist) loadSettings(ctx context.Context, project string) (*Settings, error) {
   372  	if a.SettingsLoader == nil {
   373  		panic("no settings loader configured")
   374  	}
   375  
   376  	st, err := a.SettingsLoader(ctx, project)
   377  	switch {
   378  	case err != nil:
   379  		return nil, err
   380  
   381  	case st.GSBase.Bucket() == "":
   382  		logging.Fields{
   383  			logging.ErrorKey: err,
   384  			"gsBase":         st.GSBase,
   385  		}.Errorf(ctx, "Invalid storage base.")
   386  		return nil, errors.New("invalid storage base")
   387  
   388  	case st.GSStagingBase.Bucket() == "":
   389  		logging.Fields{
   390  			logging.ErrorKey: err,
   391  			"gsStagingBase":  st.GSStagingBase,
   392  		}.Errorf(ctx, "Invalid storage staging base.")
   393  		return nil, errors.New("invalid storage staging base")
   394  
   395  	default:
   396  		return st, nil
   397  	}
   398  }
   399  
   400  func (a *Archivist) makeStagedArchival(ctx context.Context, project string, realm string,
   401  	st *Settings, ls *logdog.LoadStreamResponse) (*stagedArchival, error) {
   402  
   403  	gsClient, err := a.GSClientFactory(ctx, project)
   404  	if err != nil {
   405  		logging.Fields{
   406  			logging.ErrorKey: err,
   407  			"protoVersion":   ls.State.ProtoVersion,
   408  		}.Errorf(ctx, "Failed to obtain GSClient.")
   409  		return nil, err
   410  	}
   411  
   412  	sa := stagedArchival{
   413  		Archivist: a,
   414  		Settings:  st,
   415  
   416  		ctx:      ctx,
   417  		project:  project,
   418  		realm:    realm,
   419  		gsclient: gsClient,
   420  
   421  		terminalIndex: types.MessageIndex(ls.State.TerminalIndex),
   422  	}
   423  
   424  	// Deserialize and validate the descriptor protobuf. If this fails, it is a
   425  	// non-transient error.
   426  	if err := proto.Unmarshal(ls.Desc, &sa.desc); err != nil {
   427  		logging.Fields{
   428  			logging.ErrorKey: err,
   429  			"protoVersion":   ls.State.ProtoVersion,
   430  		}.Errorf(ctx, "Failed to unmarshal descriptor protobuf.")
   431  		return nil, err
   432  	}
   433  	sa.path = sa.desc.Path()
   434  
   435  	// Construct staged archival paths sa.stream and sa.index. Each path length
   436  	// must not exceed 1024 bytes, which is the GCS object name limit.
   437  	if err = sa.makeStagingPaths(1024); err != nil {
   438  		return nil, err
   439  	}
   440  
   441  	// Construct a CloudLogging client, if the config is set and the input
   442  	// stream type is TEXT.
   443  	if st.CloudLoggingProjectID != "" && sa.desc.StreamType == logpb.StreamType_TEXT {
   444  		// Validate the project ID, and ping the project to verify the auth.
   445  		if err = gcloud.ValidateProjectID(st.CloudLoggingProjectID); err != nil {
   446  			return nil, errors.Annotate(err, "CloudLoggingProjectID %q", st.CloudLoggingProjectID).Err()
   447  		}
   448  		onError := func(err error) {
   449  			logging.Fields{
   450  				"luciProject":  project,
   451  				"cloudProject": st.CloudLoggingProjectID,
   452  				"path":         sa.path,
   453  			}.Errorf(ctx, "archiving log to Cloud Logging: %v", err)
   454  		}
   455  
   456  		clc, err := a.CLClientFactory(ctx, project, st.CloudLoggingProjectID, onError)
   457  		if err != nil {
   458  			logging.Fields{
   459  				logging.ErrorKey: err,
   460  				"protoVersion":   ls.State.ProtoVersion,
   461  			}.Errorf(ctx, "Failed to obtain CloudLogging client.")
   462  			return nil, err
   463  		}
   464  		if err = clc.Ping(ctx); err != nil {
   465  			return nil, errors.Annotate(
   466  				err, "failed to ping CloudProject %q for Cloud Logging export",
   467  				st.CloudLoggingProjectID).Err()
   468  		}
   469  		sa.clclient = clc
   470  	}
   471  
   472  	return &sa, nil
   473  }
   474  
   475  type stagedArchival struct {
   476  	*Archivist
   477  	*Settings
   478  
   479  	ctx     context.Context
   480  	project string
   481  	realm   string
   482  	path    types.StreamPath
   483  	desc    logpb.LogStreamDescriptor
   484  
   485  	stream stagingPaths
   486  	index  stagingPaths
   487  
   488  	terminalIndex types.MessageIndex
   489  	logEntryCount int64
   490  
   491  	gsclient gs.Client
   492  	clclient CLClient
   493  }
   494  
   495  func base64Hash(p types.StreamName) string {
   496  	hasher := sha256.New()
   497  	hasher.Write([]byte(p))
   498  	return base64.RawURLEncoding.EncodeToString(hasher.Sum(nil))
   499  }
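
        // For reference, the result is always 43 characters: a 32-byte SHA-256
        // digest encoded as unpadded URL-safe base64 (ceil(256/6) = 43).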
   500  
   501  // makeStagingPaths populates `staged` and `final` fields in sa.stream and
   502  // sa.index.
   503  //
   504  // It prefixes the staging GCS paths with a hash of the stream's LogDog prefix
   505  // to spread the load across the GCS namespace and avoid hotspotting its
   506  // metadata server.
   507  //
   508  // These paths may be shared between projects. To enforce an absence of
   509  // conflicts, we will insert the project name as part of the path.
   510  func (sa *stagedArchival) makeStagingPaths(maxGSFilenameLength int) error {
   511  	// "<prefix>/+/<name>" => (<prefix>, <name>).
   512  	prefix, name := sa.path.Split()
   513  	if name == "" {
   514  		return errors.Reason("got prefix-only path %q, don't know how to stage it", sa.path).Err()
   515  	}
   516  
   517  	// base64 encoded SHA256 hash of the prefix.
   518  	prefixHash := "p/" + base64Hash(prefix)
   519  
   520  	// GCS paths we need to generate are:
   521  	//   <GSStagingBase>/<project>/<prefixHash>/+/<name>/logstream.entries
   522  	//   <GSStagingBase>/<project>/<prefixHash>/+/<name>/logstream.index
   523  	//   <GSBase>/<project>/<prefix>/+/<name>/logstream.entries
   524  	//   <GSBase>/<project>/<prefix>/+/<name>/logstream.index
   525  	//
   526  	// Each path length must be less than maxGSFilenameLength bytes. And we want
   527  	// <name> component in all paths to be identical. If some path doesn't fit
   528  	// the limit, we replace <name> with "<name-prefix>-TRUNCATED-<hash>"
   529  	// everywhere, making it fit the limit.
   530  
   531  	// Note: len("logstream.entries") > len("logstream.index"), use it for max len.
   532  	maxStagingLen := len(sa.GSStagingBase.Concat(sa.project, prefixHash, "+", string(name), "logstream.entries").Filename())
   533  	maxFinalLen := len(sa.GSBase.Concat(sa.project, string(prefix), "+", string(name), "logstream.entries").Filename())
   534  
   535  	// See if we need to truncate <name> to fit GCS paths into limits.
   536  	//
   537  	// The sa.path is user-provided and its length is not limited. It has been
   538  	// known to exceed the maximum ID length (https://crbug.com/1138017).
   539  	// So, truncate it if needed, using a crypto hash to avoid collisions.
   540  	maxPathLen := maxStagingLen
   541  	if maxFinalLen > maxStagingLen {
   542  		maxPathLen = maxFinalLen
   543  	}
   544  	if bytesToCut := maxPathLen - maxGSFilenameLength; bytesToCut > 0 {
   545  		nameSuffix := types.StreamName("-TRUNCATED-" + base64Hash(name)[:16])
   546  		// Replace the last len(nameSuffix)+bytesToCut bytes with nameSuffix. This
   547  		// reduces the overall name size by `bytesToCut` bytes, as needed.
   548  		if len(nameSuffix)+bytesToCut > len(name) {
   549  			// There's not enough space even to fit nameSuffix: the prefix is too
   550  			// long. This should be rare; abort.
   551  			return errors.Reason("can't stage %q of project %q, prefix is too long", sa.path, sa.project).Err()
   552  		}
   553  		name = name[:len(name)-len(nameSuffix)-bytesToCut] + nameSuffix
   554  	}
   555  
   556  	// Everything should fit into the limits now.
   557  	nameMap := map[string]*stagingPaths{
   558  		"logstream.entries": &sa.stream,
   559  		"logstream.index":   &sa.index,
   560  	}
   561  	for file, spaths := range nameMap {
   562  		spaths.staged = sa.GSStagingBase.Concat(sa.project, prefixHash, "+", string(name), file)
   563  		spaths.final = sa.GSBase.Concat(sa.project, string(prefix), "+", string(name), file)
   564  	}
   565  	return nil
   566  }
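
        // For illustration (hypothetical values): with GSStagingBase "gs://staging",
        // GSBase "gs://archive", project "proj" and stream path "pfx/+/a/b", the
        // resulting paths are roughly:
        //
        //	staged: gs://staging/proj/p/<base64(sha256("pfx"))>/+/a/b/logstream.entries
        //	final:  gs://archive/proj/pfx/+/a/b/logstream.entries
        //
        // plus the matching "logstream.index" pair, with "a/b" truncated as described
        // above if either path would exceed the GCS limit.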
   567  
   568  // stage executes the archival process, archiving to the staged storage paths.
   569  //
   570  // If stage fails, it may return a transient error.
   571  func (sa *stagedArchival) stage() (err error) {
   572  	// Collect any transient errors that occur during cleanup. If we aren't
   573  	// already returning an error, return the collected "terr" tagged as transient.
   574  	var terr errors.MultiError
   575  	defer func() {
   576  		if err == nil && len(terr) > 0 {
   577  			logging.Errorf(sa.ctx, "Encountered transient errors: %s", terr)
   578  			err = transient.Tag.Apply(terr)
   579  		}
   580  	}()
   581  
   582  	// Close our writers on exit. If any of them fail to close, mark the archival
   583  	// as a transient failure.
   584  	closeWriter := func(closer io.Closer, path gs.Path) {
   585  		// Close the Writer. If this results in an error, append it to our transient
   586  		// error MultiError.
   587  		if ierr := closer.Close(); ierr != nil {
   588  			logging.Warningf(sa.ctx, "Error closing writer to %s: %s", path, ierr)
   589  			terr = append(terr, ierr)
   590  		}
   591  
   592  		// If we have an archival error, also delete the path associated with this
   593  		// stream. This is a non-fatal failure, since we've already hit a fatal
   594  		// one.
   595  		if err != nil || len(terr) > 0 {
   596  			logging.Warningf(sa.ctx, "Cleaning up %s after error", path)
   597  			if ierr := sa.gsclient.Delete(path); ierr != nil {
   598  				logging.Fields{
   599  					logging.ErrorKey: ierr,
   600  					"path":           path,
   601  				}.Warningf(sa.ctx, "Failed to delete stream on error.")
   602  			}
   603  		}
   604  	}
   605  
   606  	// createWriter is a shorthand function for creating a writer to a path and
   607  	// reporting an error if it failed.
   608  	createWriter := func(p gs.Path) (gs.Writer, error) {
   609  		w, ierr := sa.gsclient.NewWriter(p)
   610  		if ierr != nil {
   611  			logging.Fields{
   612  				logging.ErrorKey: ierr,
   613  				"path":           p,
   614  			}.Errorf(sa.ctx, "Failed to create writer.")
   615  			return nil, ierr
   616  		}
   617  		return w, nil
   618  	}
   619  
   620  	var streamWriter, indexWriter gs.Writer
   621  	if streamWriter, err = createWriter(sa.stream.staged); err != nil {
   622  		return err
   623  	}
   624  	defer closeWriter(streamWriter, sa.stream.staged)
   625  
   626  	if indexWriter, err = createWriter(sa.index.staged); err != nil {
   627  		return err
   628  	}
   629  	defer closeWriter(indexWriter, sa.index.staged)
   630  
   631  	// Read our log entries from intermediate storage.
   632  	ss := storageSource{
   633  		Context:       sa.ctx,
   634  		st:            sa.Storage,
   635  		project:       sa.project,
   636  		path:          sa.path,
   637  		terminalIndex: sa.terminalIndex,
   638  		lastIndex:     -1,
   639  	}
   640  
   641  	m := archive.Manifest{
   642  		LUCIProject:      sa.project,
   643  		Desc:             &sa.desc,
   644  		Source:           &ss,
   645  		LogWriter:        streamWriter,
   646  		IndexWriter:      indexWriter,
   647  		StreamIndexRange: sa.IndexStreamRange,
   648  		PrefixIndexRange: sa.IndexPrefixRange,
   649  		ByteRange:        sa.IndexByteRange,
   650  
   651  		Logger: logging.Get(sa.ctx),
   652  	}
   653  
   654  	if sa.clclient != nil {
   655  		logID := "luci-logs"
   656  		tags := sa.desc.GetTags()
   657  		if tags == nil {
   658  			tags = map[string]string{}
   659  		}
   660  		if sa.realm != "" {
   661  			tags["realm"] = sa.realm
   662  		}
   663  
   664  		// bbagent adds viewer.LogDogViewerURLTag to log streams for the
   665  		// "back to build" link in the UI.
   666  		//
   667  		// This URL isn't useful in the Cloud Logging UI and doesn't add any value
   668  		// to search capabilities, so remove it.
   669  		delete(tags, viewer.LogDogViewerURLTag)
   670  
   671  		switch val, ok := tags["luci.CloudLogExportID"]; {
   672  		case !ok, len(val) == 0: // skip
   673  
   674  		// len(LogID) must be < 512, and allows ./_- and alphanumerics.
   675  		// If CloudLogExportID is too long or contains unsupported chars, fall back to
   676  		// the default LogID.
   677  		case len(val) > 511:
   678  			logging.Errorf(sa.ctx, "CloudLogExportID: too long - %d", len(val))
   679  
   680  		case !logIDRe.MatchString(val):
   681  			logging.Errorf(sa.ctx, "CloudLogExportID(%s): does not match %s", val, logIDRe)
   682  
   683  		default:
   684  			logID = val
   685  		}
   686  
   687  		m.CloudLogger = sa.clclient.Logger(
   688  			logID,
   689  			cl.CommonLabels(tags),
   690  			cl.CommonResource(&mrpb.MonitoredResource{
   691  				Type: "generic_task",
   692  				Labels: map[string]string{
   693  					"project_id": sa.project,
   694  					"location":   sa.desc.GetName(),
   695  					"namespace":  sa.desc.GetPrefix(),
   696  					"job":        "cloud-logging-export",
   697  				},
   698  			}),
   699  			cl.BufferedByteLimit(sa.CloudLoggingBufferLimit*1024*1024),
   700  		)
   701  	}
   702  
   703  	if err = archive.Archive(m); err != nil {
   704  		logging.WithError(err).Errorf(sa.ctx, "Failed to archive log stream.")
   705  		return err
   706  	}
   707  
   708  	if ss.logEntryCount == 0 {
   709  		// No entries were read from intermediate storage, so nothing was archived.
   710  		logging.Warningf(sa.ctx, "No log entries were archived.")
   711  	}
   712  
   713  	// Update our state with archival results.
   714  	sa.terminalIndex = ss.lastIndex
   715  	sa.logEntryCount = ss.logEntryCount
   716  	sa.stream.bytesWritten = streamWriter.Count()
   717  	sa.index.bytesWritten = indexWriter.Count()
   718  	return nil
   719  }
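
        // For example, a stream carrying the tag "luci.CloudLogExportID": "my-log"
        // (a made-up value) would be exported under the Cloud Logging log ID
        // "my-log" instead of the default "luci-logs", provided the value is under
        // 512 characters and matches logIDRe; otherwise the default is kept.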
   720  
   721  type stagingPaths struct {
   722  	staged       gs.Path
   723  	final        gs.Path
   724  	bytesWritten int64
   725  }
   726  
   727  func (d *stagingPaths) clearStaged() { d.staged = "" }
   728  
   729  func (d *stagingPaths) enabled() bool { return d.final != "" }
   730  
   731  func (d *stagingPaths) addMetrics(ctx context.Context, projectField, archiveField, streamField string) {
   732  	tsSize.Add(ctx, float64(d.bytesWritten), projectField, archiveField, streamField)
   733  	tsTotalBytes.Add(ctx, d.bytesWritten, projectField, archiveField, streamField)
   734  }
   735  
   736  func (sa *stagedArchival) finalize(ar *logdog.ArchiveStreamRequest) error {
   737  	err := parallel.FanOutIn(func(taskC chan<- func() error) {
   738  		for _, d := range sa.getStagingPaths() {
   739  			d := d
   740  
   741  			// Don't finalize zero-sized streams.
   742  			if !d.enabled() || d.bytesWritten == 0 {
   743  				continue
   744  			}
   745  
   746  			taskC <- func() error {
   747  				if err := sa.gsclient.Rename(d.staged, d.final); err != nil {
   748  					logging.Fields{
   749  						logging.ErrorKey: err,
   750  						"stagedPath":     d.staged,
   751  						"finalPath":      d.final,
   752  					}.Errorf(sa.ctx, "Failed to rename GS object.")
   753  					return err
   754  				}
   755  
   756  				// Clear the staged value to indicate that it no longer exists.
   757  				d.clearStaged()
   758  				return nil
   759  			}
   760  		}
   761  	})
   762  	if err != nil {
   763  		return err
   764  	}
   765  
   766  	ar.TerminalIndex = int64(sa.terminalIndex)
   767  	ar.LogEntryCount = sa.logEntryCount
   768  	ar.StreamUrl = string(sa.stream.final)
   769  	ar.StreamSize = sa.stream.bytesWritten
   770  	ar.IndexUrl = string(sa.index.final)
   771  	ar.IndexSize = sa.index.bytesWritten
   772  	return nil
   773  }
   774  
   775  func (sa *stagedArchival) Close() error {
   776  	var clErr error
   777  	if sa.clclient != nil {
   778  		clErr = errors.Annotate(sa.clclient.Close(),
   779  			"while closing CloudLogging client for (%s/%s/+/%s)",
   780  			sa.project, sa.desc.GetPrefix(), sa.desc.GetName()).Err()
   781  	}
   782  	return errors.Flatten(errors.MultiError{sa.gsclient.Close(), clErr})
   783  }
   784  
   785  func (sa *stagedArchival) cleanup() {
   786  	for _, d := range sa.getStagingPaths() {
   787  		if d.staged == "" {
   788  			continue
   789  		}
   790  
   791  		logging.Warningf(sa.ctx, "Cleaning up staged path %s", d.staged)
   792  		if err := sa.gsclient.Delete(d.staged); err != nil {
   793  			logging.Fields{
   794  				logging.ErrorKey: err,
   795  				"path":           d.staged,
   796  			}.Warningf(sa.ctx, "Failed to clean up staged path.")
   797  		}
   798  
   799  		d.clearStaged()
   800  	}
   801  }
   802  
   803  func (sa *stagedArchival) getStagingPaths() []*stagingPaths {
   804  	return []*stagingPaths{
   805  		&sa.stream,
   806  		&sa.index,
   807  	}
   808  }
   809  
   810  // statusErrorWrapper is an error wrapper. It is detected by isFailure and used
   811  // to determine whether the supplied error represents a failure or just a status
   812  // error.
   813  type statusErrorWrapper struct {
   814  	inner error
   815  }
   816  
   817  var _ interface {
   818  	error
   819  	errors.Wrapped
   820  } = (*statusErrorWrapper)(nil)
   821  
   822  func (e *statusErrorWrapper) Error() string {
   823  	if e.inner != nil {
   824  		return e.inner.Error()
   825  	}
   826  	return ""
   827  }
   828  
   829  func (e *statusErrorWrapper) Unwrap() error {
   830  	return e.inner
   831  }
   832  
   833  func isFailure(err error) bool {
   834  	if err == nil {
   835  		return false
   836  	}
   837  	_, ok := err.(*statusErrorWrapper)
   838  	return !ok
   839  }
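
        // A minimal sketch of the intended distinction: wrapped status errors are
        // not failures, anything else is.
        //
        //	isFailure(nil)                                         // false
        //	isFailure(&statusErrorWrapper{inner: errors.New("x")}) // false
        //	isFailure(errors.New("boom"))                          // true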