github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/sink_cloudstorage.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package changefeedccl
    10  
    11  import (
    12  	"bytes"
    13  	"compress/gzip"
    14  	"context"
    15  	"fmt"
    16  	"io"
    17  	"net/url"
    18  	"path/filepath"
    19  	"strings"
    20  	"sync/atomic"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    27  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    28  	"github.com/cockroachdb/cockroach/pkg/util/log"
    29  	"github.com/cockroachdb/errors"
    30  	"github.com/google/btree"
    31  )
    32  
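        // isCloudStorageSink reports whether the sink URI uses one of the cloud storage
        // schemes handled by this file. As an illustrative (hypothetical) example, a
        // changefeed into `experimental-gs://bucket/path` is a cloud storage sink, while
        // one into `kafka://broker:9092` is not.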
    33  func isCloudStorageSink(u *url.URL) bool {
    34  	switch u.Scheme {
    35  	case `experimental-s3`, `experimental-gs`, `experimental-nodelocal`, `experimental-http`,
    36  		`experimental-https`, `experimental-azure`:
    37  		return true
    38  	default:
    39  		return false
    40  	}
    41  }
    42  
    43  // cloudStorageFormatTime formats times as YYYYMMDDHHMMSSNNNNNNNNNLLLLLLLLLL.
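        // The output is fixed width and zero padded so that lexical ordering of the
        // formatted strings matches timestamp ordering. As an illustrative example (the
        // specific values are made up), a timestamp whose wall clock reads
        // 2020-06-01 10:30:45.000000123 UTC with logical component 7 formats as
        //
        //	202006011030450000001230000000007
        //
        // i.e. `20200601103045` + `000000123` + `0000000007`.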
    44  func cloudStorageFormatTime(ts hlc.Timestamp) string {
    45  	// TODO(dan): This is an absurdly long way to print out this timestamp, but
    46  	// I kept hitting bugs while trying to do something clever to make it
    47  	// shorter. Revisit.
    48  	const f = `20060102150405`
    49  	t := ts.GoTime()
    50  	return fmt.Sprintf(`%s%09d%010d`, t.Format(f), t.Nanosecond(), ts.Logical)
    51  }
    52  
    53  type cloudStorageSinkFile struct {
    54  	cloudStorageSinkKey
    55  	codec   io.WriteCloser
    56  	rawSize int
    57  	buf     bytes.Buffer
    58  }
    59  
    60  var _ io.Writer = &cloudStorageSinkFile{}
    61  
    62  func (f *cloudStorageSinkFile) Write(p []byte) (int, error) {
    63  	f.rawSize += len(p)
    64  	if f.codec != nil {
    65  		return f.codec.Write(p)
    66  	}
    67  	return f.buf.Write(p)
    68  }
    69  
    70  // cloudStorageSink writes changefeed output to files in a cloud storage bucket
    71  // (S3/GCS/HTTP) maintaining CDC's ordering guarantees (see below) for each
    72  // row through lexicographical filename ordering.
    73  //
    74  // Changefeeds offer the following two ordering guarantees to external clients:
    75  //
    76  // 1. Rows are emitted with a timestamp. Individual rows are emitted in
    77  // timestamp order. There may be duplicates, but once a row is seen at a given
    78  // timestamp, no previously unseen version of that row will be emitted at a lower
    79  // (or equal) timestamp. For example, you may see 1 2 1 2, or even 1 2 1, but
    80  // never simply 2 1.
    81  // 2. Periodically, a resolved timestamp is emitted. This is a changefeed-wide
    82  // guarantee that no previously unseen row will later be seen with a timestamp
    83  // less than (or equal to) the resolved one. The cloud storage sink is structured as
    84  // a number of distsql processors that each emit some part of the total changefeed.
    85  // These processors only write files containing row data (initially only in ndjson
    86  // format in this cloudStorageSink). This mapping is stable for a given distsql
    87  // flow of a changefeed (meaning any given row is always emitted by the same
    88  // processor), but it's not stable across restarts (pause/unpause). Each of these
    89  // processors reports partial progress information to a central coordinator
    90  // (changeFrontier), which is responsible for writing the resolved timestamp files.
    91  //
    92  // In addition to the guarantees required of any changefeed, the cloud storage
    93  // sink adds some quality of life guarantees of its own.
    94  // 3. All rows in a file are from the same table. Further, all rows in a file are
    95  // from the same schema version of that table, and so all have the same schema.
    96  // 4. All files are partitioned into folders by the date part of the filename.
    97  //
    98  // Two methods of the cloudStorageSink are called on each data-emitting processor:
    99  // EmitRow is called with each row change, and Flush is called before
   100  // sending partial progress information to the coordinator. This happens with no
   101  // concurrency: all EmitRow and Flush calls for a sink are serialized.
   102  // EmitResolvedTimestamp is only called by the `changeFrontier`.
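        //
        // A minimal sketch of that call pattern, assuming a `sink` of this type, a table
        // descriptor `desc`, and key/value/timestamp variables produced by the changefeed
        // (all of these names are hypothetical):
        //
        //	_ = sink.EmitRow(ctx, desc, key, value, updated) // repeated, never concurrently
        //	_ = sink.Flush(ctx)                              // before reporting local progress
        //	// only the changeFrontier coordinator calls:
        //	_ = sink.EmitResolvedTimestamp(ctx, encoder, resolved)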
   103  //
   104  // The rows handed to EmitRow by the changefeed are guaranteed to satisfy
   105  // condition (1). Further, as long as the sink has written out every EmitRow it has
   106  // received before returning from Flush, condition (2) is upheld.
   107  //
   108  // The cloudStorageSink uses lexicographic filename ordering to provide a total
   109  // ordering for the output of this sink. Guarantees (1) and (2) depend on this
   110  // ordering. Specifically, at any given time, the order of the data written by
   111  // the sink is by lexicographic filename and then by order within the file.
   112  //
   113  // Batching these row updates into files is complicated because:
   114  // a) We need to pick a representative timestamp for the file. This is required
   115  // for comparison with resolved timestamp filenames as part of guarantee (2).
   116  // b) For each topic, the row ordering guarantees must be preserved.
   117  // One intuitive way of solving (b) is to ensure that filenames are emitted in
   118  // strictly lexically increasing order (see assertion in `flushFile()`). This
   119  // guarantees correctness as long as the underlying system is correct.
   120  //
   121  // Before the local progress is sent to the coordinator (this progress is called the
   122  // "local frontier", as opposed to the resolved timestamp, which is a "global frontier" or
   123  // "changefeed-level frontier"), all buffered data that preceded that update is flushed.
   124  // To accomplish (a), we need two invariants. (a1) is that once Flush is called, we can
   125  // never write a file with a timestamp that is less than or equal to the local frontier.
   126  // This is because the local progress update could indeed cause a resolved timestamp file
   127  // to be written with that timestamp. We cannot break this invariant because the client is
   128  // free to ignore any files with a lexically lesser filename. Additionally, because we
   129  // picked the resolved timestamp filename to sort after a data file with the same
   130  // timestamp, a data file can't even be emitted at the same timestamp, it must be emitted
   131  // at a timestamp that is strictly greater than the last globally resolved timestamp. Note
   132  // that the local frontier is a guarantee that the sink will never get an EmitRow with
   133  // that timestamp or lower. (a2) is that whenever Flush is called, all files written by
   134  // the sink must be named using timestamps less than or equal to the one for the local
   135  // frontier at the time Flush is called. This is again because our local progress update
   136  // could cause the global progress to be updated and we need everything written so far to
   137  // lexically compare as less than the new resolved timestamp.
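        //
        // As an illustrative example (abbreviated filenames, not real output): suppose the
        // local frontier at the time of a Flush is some timestamp T. By (a2), every data file
        // flushed so far is named `T'-...` for some T' <= T, so it sorts before the resolved
        // timestamp file `T.RESOLVED` that the coordinator may now write (`-` sorts before
        // `.`). By (a1), any data file written after that Flush uses a timestamp strictly
        // greater than T and therefore sorts after `T.RESOLVED`.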
   138  //
   139  // The data files written by this sink are named according to the pattern
   140  // `<timestamp>-<uniquer>-<topic_id>-<schema_id>.<ext>`, each component of which is as
   141  // follows:
   142  //
   143  // `<timestamp>` is the smallest resolved timestamp being tracked by this sink's
   144  // `changeAggregator`, as of the time the last `Flush()` call was made (or `StatementTime`
   145  // if `Flush()` hasn't been called yet). Intuitively, this can be thought of as an
   146  // inclusive lower bound on the timestamps of updates that can be seen in a given file.
   147  //
   148  // `<topic_id>` corresponds to one SQL table.
   149  //
   150  // `<schema_id>` changes whenever the SQL table schema changes, which allows us
   151  // to guarantee to users that _all entries in a given file have the same
   152  // schema_.
   153  //
   154  // `<uniquer>` is used to keep nodes in a cluster from overwriting each other's data and
   155  // should be ignored by external users. It also keeps a single node from overwriting its
   156  // own data if there are multiple changefeeds, or if a changefeed gets
   157  // canceled/restarted/zombied. Internally, it's generated by
   158  // `<session_id>-<node_id>-<sink_id>-<file_id>` where `<sink_id>` is a unique id for each
   159  // cloudStorageSink in a running process, `<file_id>` is a unique id for each file written
   160  // by a given `<sink_id>`, and `<session_id>` is a unique identifying string for the job
   161  // session running the `changeAggregator` that owns this sink.
   162  //
   163  // `<ext>` implies the format of the file: currently the only option is
   164  // `ndjson`, which means a text file conforming to the "Newline Delimited JSON"
   165  // spec.
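        //
        // Putting the pieces together, a hypothetical data file (every component value below
        // is made up for illustration) could be named
        //
        //	202006011030450000001230000000007-d0a8e2b5-1-2-00000003-users-4.ndjson
        //
        // and would be written under the date partition folder `2020-06-01/` (see guarantee
        // (4) above and `defaultPartitionFormat` in `makeCloudStorageSink`).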
   166  //
   167  // This naming convention of data files is carefully chosen in order to preserve
   168  // the external ordering guarantees of CDC. Naming output files in this fashion
   169  // provides monotonicity among files emitted by a given sink for a given table
   170  // name, table schema version pair within a given job session. This ensures that
   171  // all row updates for a given span are read in an order that preserves the CDC
   172  // ordering guarantees, even in the presence of job restarts (see proof below).
   173  // Each record in the data files is a value; keys are not written as separate records,
   174  // so the `key_in_value` option is required. Within a file, records are not
   175  // guaranteed to be sorted by timestamp. A duplicate of some records might exist
   176  // in a different file or even in the same file.
   177  //
   178  //
   179  // The resolved timestamp files are named `<timestamp>.RESOLVED`. This is
   180  // carefully done so that we can offer the following external guarantee: At any
   181  // given time, if the files are iterated in lexicographic filename order,
   182  // then encountering any filename containing `RESOLVED` means that everything
   183  // before it is finalized (and thus can be ingested into some other system and
   184  // deleted, included in Hive queries, etc.). A typical user of cloudStorageSink
   185  // would periodically do exactly this.
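        //
        // A minimal sketch of such a consumer, assuming the object names have already been
        // listed into a `filenames` slice (the helpers here are hypothetical, not part of
        // this package):
        //
        //	sort.Strings(filenames) // lexicographic order is the sink's output order
        //	for _, name := range filenames {
        //		if strings.Contains(name, `RESOLVED`) {
        //			// Everything buffered so far is finalized: ingest it downstream
        //			// and/or delete it.
        //			ingestAndClearBuffered() // hypothetical
        //			continue
        //		}
        //		bufferDataFile(name) // hypothetical
        //	}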
   186  //
   187  // Still TODO is writing out data schemas, Avro support, bounding memory usage.
   188  //
   189  // Now what follows is a proof of why the above is correct even in the presence
   190  // of multiple job restarts. We begin by establishing some terminology and by
   191  // formally (re)stating some invariants about the underlying system.
   192  //
   193  // Terminology
   194  // 1. Characters A,B...Z refer to job sessions.
   195  // 2. Ai, for i in Nat, refers to `the filename of the i'th data file
   196  // emitted by session A`. Note that because of the invariants we will state,
   197  // this can also be taken to mean "the filename of the lexically i'th data
   198  // file emitted by session A". This notation is used simply for convenience.
   199  // 3. Ae > Bf refers to a lexical comparison of Ae and Bf.
   200  // 4. ts(Xi) refers to the <timestamp> part of Xi.
   201  //
   202  // Invariants
   203  // 1. We assume that the ordering guarantee (1) stated at the beginning of this
   204  // comment blob is upheld by the underlying system. More specifically, this proof
   205  // only proves correctness of the cloudStorageSink, not the entire system.
   206  // To re-state, if the rows are read in the order they are emitted by the underlying
   207  // system, it is impossible to see a previously unseen timestamp that is lower
   208  // than some timestamp we've seen before.
   209  // 2. Data files emitted by a single session of a changefeed job are lexically
   210  // ordered exactly as they were emitted. X(i-1) lexically precedes Xi, for i in
   211  // Nat, for all job sessions X. The naming convention described above guarantees
   212  // this.
   213  // 3. Data files are named using the successor of the "local frontier" timestamp as of the
   214  // time the last `Flush()` call was made (or StatementTime in case `Flush()` hasn't been
   215  // called yet). Since all EmitRow calls are guaranteed to be for rows that equal or
   216  // succeed this timestamp, ts(Xi) is an inclusive lower bound for the rows contained
   217  // inside Xi.
   218  // 4. When a job restarts, the new job session starts with a catch-up scan
   219  // from the last globally resolved timestamp of the changefeed. This catch-up
   220  // scan replays all rows since this resolved timestamp, preserving invariant 1.
   221  //
   222  // Corollary 1: It is impossible to see a previously unseen timestamp that is
   223  // lower than any timestamp seen thus far, in a lexical ordering of files if the
   224  // files satisfy invariant 2 and the underlying system satisfies invariant 1.
   225  //
   226  // Note that correctness does not necessarily imply invariants 1 and 2.
   227  //
   228  // Lemma 1: Given two totally ordered sets of files X and Y that preserve CDC's ordering
   229  // guarantee along with invariants 3 and 4, their union produces a totally ordered set of
   230  // files that preserves this guarantee.
   231  // Proof of lemma: Let's refer to the data filenames emitted by these sessions as X1, X2, ...
   232  // and similarly for session Y. Additionally, let's refer to the last file ever emitted by
   233  // session X as Xn, for some n in Nat. Now, lexically speaking, there are 2 cases here: 1.
   234  // Y1 < Xn: For the sake of contradiction, let's assume there is a violation here. Since
   235  // there is a total lexical ordering among files in each set individually, we must have
   236  // read Y(e-1) before Ye, for all e in Nat. Similarly for X. Without loss of generality,
   237  // let's say there are 2 files Ye and Xf such that (1.1) Ye < Xf and Xf contains an unseen
   238  // timestamp that is lower than at least one timestamp seen in Ye. Call this timestamp t.
   239  // More explicitly, it must be the case that this timestamp t does not exist in any files
   240  // Y1...Ye. This must mean that timestamp t lies before the starting point of session Y's
   241  // catch-up scan (again from invariant 4). Thus it must be the case that (1.2) ts(Y1) > t.
   242  // Now, due to invariant 3, we know that we won't see any rows in a file Xi with a
   243  // timestamp that is lower than ts(Xi). This must mean that (1.3) ts(Xf) <= t. Statements
   244  // 1.1, 1.2 and 1.3 together give us a contradiction. 2. Y1 > Xn: This case means that all
   245  // data files of session Y lexically succeed all the data files of session X. This means
   246  // that all data files are ordered monotonically relative to when they were emitted, which
   247  // gives us invariant 2 (but for 2 sessions). Correctness follows from this and invariant
   248  // 1. Note that Y1 == Xn is not possible because sessions are assigned unique session IDs.
   249  // QED.
   250  //
   251  // Proof of correctness: It is impossible to see a previously unseen timestamp that is
   252  // lower than any timestamp seen thus far, across n job sessions for all n, n in Nat. We
   253  // do this by induction; let k be the number of job sessions a changefeed job goes
   254  // through:
   255  // Case k = 1: Correctness for this case follows from corollary 1.
   256  // Case k = 2: This follows from lemma 1 stated above.
   257  // Case k > 2 (induction case): Assume that the statement of the proof is true for the
   258  // output of a changefeed job with k sessions. We will show that it must also be true for
   259  // the output of a changefeed job with k+1 sessions. Let's refer to the first k sessions as
   260  // P1, P2, ..., Pk, and the (k+1)st session as Q. Now, since we assumed the statement is true for
   261  // P1, P2, ..., Pk, they must produce a totally ordered (lexical ordering) set of files that
   262  // satisfies the requirements of lemma 1. So we can consider these k sessions conceptually as one
   263  // session (call it P). Now, we're back to the case where k = 2 with sessions P and Q. Thus, by
   264  // induction we have the required proof.
   265  //
   266  type cloudStorageSink struct {
   267  	nodeID            roachpb.NodeID
   268  	sinkID            int64
   269  	targetMaxFileSize int64
   270  	settings          *cluster.Settings
   271  	partitionFormat   string
   272  
   273  	ext           string
   274  	recordDelimFn func(io.Writer) error
   275  
   276  	compression string
   277  
   278  	es cloud.ExternalStorage
   279  
   280  	// These are fields to track information needed to output files based on the naming
   281  	// convention described above. See comment on cloudStorageSink above for more details.
   282  	fileID int64
   283  	files  *btree.BTree // of *cloudStorageSinkFile
   284  
   285  	timestampOracle timestampLowerBoundOracle
   286  	jobSessionID    string
   287  	// We keep track of the successor of the least resolved timestamp in the local
   288  	// frontier as of the time of the last `Flush()` call. If `Flush()` hasn't been
   289  	// called, these fields are based on the statement time of the changefeed.
   290  	dataFileTs        string
   291  	dataFilePartition string
   292  	prevFilename      string
   293  }
   294  
   295  const sinkCompressionGzip = "gzip"
   296  
   297  var cloudStorageSinkIDAtomic int64
   298  
   299  func makeCloudStorageSink(
   300  	ctx context.Context,
   301  	baseURI string,
   302  	nodeID roachpb.NodeID,
   303  	targetMaxFileSize int64,
   304  	settings *cluster.Settings,
   305  	opts map[string]string,
   306  	timestampOracle timestampLowerBoundOracle,
   307  	makeExternalStorageFromURI cloud.ExternalStorageFromURIFactory,
   308  ) (Sink, error) {
   309  	// Date partitioning is pretty standard, so no override for now, but we could
   310  	// plumb one down if someone needs it.
   311  	const defaultPartitionFormat = `2006-01-02`
   312  
   313  	sinkID := atomic.AddInt64(&cloudStorageSinkIDAtomic, 1)
   314  	s := &cloudStorageSink{
   315  		nodeID:            nodeID,
   316  		sinkID:            sinkID,
   317  		settings:          settings,
   318  		targetMaxFileSize: targetMaxFileSize,
   319  		files:             btree.New(8),
   320  		partitionFormat:   defaultPartitionFormat,
   321  		timestampOracle:   timestampOracle,
   322  		// TODO(dan,ajwerner): Use the jobs framework's session ID once that's available.
   323  		jobSessionID: generateChangefeedSessionID(),
   324  	}
   325  	if timestampOracle != nil {
   326  		s.dataFileTs = cloudStorageFormatTime(timestampOracle.inclusiveLowerBoundTS())
   327  		s.dataFilePartition = timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
   328  	}
   329  
   330  	switch changefeedbase.FormatType(opts[changefeedbase.OptFormat]) {
   331  	case changefeedbase.OptFormatJSON:
   332  		// TODO(dan): It seems like these should be on the encoder, but that
   333  		// would require a bit of refactoring.
   334  		s.ext = `.ndjson`
   335  		s.recordDelimFn = func(w io.Writer) error {
   336  			_, err := w.Write([]byte{'\n'})
   337  			return err
   338  		}
   339  	default:
   340  		return nil, errors.Errorf(`this sink is incompatible with %s=%s`,
   341  			changefeedbase.OptFormat, opts[changefeedbase.OptFormat])
   342  	}
   343  
   344  	switch changefeedbase.EnvelopeType(opts[changefeedbase.OptEnvelope]) {
   345  	case changefeedbase.OptEnvelopeWrapped:
   346  	default:
   347  		return nil, errors.Errorf(`this sink is incompatible with %s=%s`,
   348  			changefeedbase.OptEnvelope, opts[changefeedbase.OptEnvelope])
   349  	}
   350  
   351  	if _, ok := opts[changefeedbase.OptKeyInValue]; !ok {
   352  		return nil, errors.Errorf(`this sink requires the WITH %s option`, changefeedbase.OptKeyInValue)
   353  	}
   354  
   355  	if codec, ok := opts[changefeedbase.OptCompression]; ok && codec != "" {
   356  		if strings.EqualFold(codec, "gzip") {
   357  			s.compression = sinkCompressionGzip
   358  			s.ext = s.ext + ".gz"
   359  		} else {
   360  			return nil, errors.Errorf(`unsupported compression codec %q`, codec)
   361  		}
   362  	}
   363  
   364  	var err error
   365  	if s.es, err = makeExternalStorageFromURI(ctx, baseURI); err != nil {
   366  		return nil, err
   367  	}
   368  
   369  	return s, nil
   370  }
   371  
   372  func (s *cloudStorageSink) getOrCreateFile(
   373  	topic string, schemaID sqlbase.DescriptorVersion,
   374  ) *cloudStorageSinkFile {
   375  	key := cloudStorageSinkKey{topic, schemaID}
   376  	if item := s.files.Get(key); item != nil {
   377  		return item.(*cloudStorageSinkFile)
   378  	}
   379  	f := &cloudStorageSinkFile{
   380  		cloudStorageSinkKey: key,
   381  	}
   382  	switch s.compression {
   383  	case sinkCompressionGzip:
   384  		f.codec = gzip.NewWriter(&f.buf)
   385  	}
   386  	s.files.ReplaceOrInsert(f)
   387  	return f
   388  }
   389  
   390  // EmitRow implements the Sink interface.
   391  func (s *cloudStorageSink) EmitRow(
   392  	ctx context.Context, table *sqlbase.TableDescriptor, _, value []byte, updated hlc.Timestamp,
   393  ) error {
   394  	if s.files == nil {
   395  		return errors.New(`cannot EmitRow on a closed sink`)
   396  	}
   397  
   398  	file := s.getOrCreateFile(table.Name, table.Version)
   399  
   400  	// TODO(dan): Memory monitoring for this
   401  	if _, err := file.Write(value); err != nil {
   402  		return err
   403  	}
   404  	if err := s.recordDelimFn(file); err != nil {
   405  		return err
   406  	}
   407  
   408  	if int64(file.buf.Len()) > s.targetMaxFileSize {
   409  		if err := s.flushTopicVersions(ctx, file.topic, file.schemaID); err != nil {
   410  			return err
   411  		}
   412  	}
   413  	return nil
   414  }
   415  
   416  // EmitResolvedTimestamp implements the Sink interface.
   417  func (s *cloudStorageSink) EmitResolvedTimestamp(
   418  	ctx context.Context, encoder Encoder, resolved hlc.Timestamp,
   419  ) error {
   420  	if s.files == nil {
   421  		return errors.New(`cannot EmitResolvedTimestamp on a closed sink`)
   422  	}
   423  
   424  	var noTopic string
   425  	payload, err := encoder.EncodeResolvedTimestamp(ctx, noTopic, resolved)
   426  	if err != nil {
   427  		return err
   428  	}
   429  	// Don't need to copy payload because we never buffer it anywhere.
   430  
   431  	part := resolved.GoTime().Format(s.partitionFormat)
   432  	filename := fmt.Sprintf(`%s.RESOLVED`, cloudStorageFormatTime(resolved))
   433  	if log.V(1) {
   434  		log.Infof(ctx, "writing file %s %s", filename, resolved.AsOfSystemTime())
   435  	}
   436  	return s.es.WriteFile(ctx, filepath.Join(part, filename), bytes.NewReader(payload))
   437  }
   438  
   439  // flushTopicVersions flushes all open files for the provided topic up to and
   440  // including maxVersionToFlush.
   441  //
   442  // To understand why we need to do this, consider the following example of what
   443  // could happen if we didn't have this logic:
   444  //
   445  //  1. The sink starts buffering a file for schema 1.
   446  //  2. It then starts buffering a file for schema 2.
   447  //  3. The newer, schema 2 file exceeds the file size threshold and thus gets
   448  //     flushed at timestamp x with fileID 0.
   449  //  4. The older, schema 1 file is also flushed at timestamp x and thus is
   450  //     assigned a fileID greater than 0.
   451  //
   452  // This would lead to the older file being lexically ordered after the newer,
   453  // schema 2 file, leading to a violation of our ordering guarantees (see comment
   454  // on cloudStorageSink).
   455  func (s *cloudStorageSink) flushTopicVersions(
   456  	ctx context.Context, topic string, maxVersionToFlush sqlbase.DescriptorVersion,
   457  ) (err error) {
   458  	var toRemoveAlloc [2]sqlbase.DescriptorVersion // generally avoid allocating
   459  	toRemove := toRemoveAlloc[:0]                  // schemaIDs of flushed files
   460  	gte := cloudStorageSinkKey{topic: topic}
   461  	lt := cloudStorageSinkKey{topic: topic, schemaID: maxVersionToFlush + 1}
   462  	s.files.AscendRange(gte, lt, func(i btree.Item) (wantMore bool) {
   463  		f := i.(*cloudStorageSinkFile)
   464  		if err = s.flushFile(ctx, f); err == nil {
   465  			toRemove = append(toRemove, f.schemaID)
   466  		}
   467  		return err == nil
   468  	})
   469  	for _, v := range toRemove {
   470  		s.files.Delete(cloudStorageSinkKey{topic: topic, schemaID: v})
   471  	}
   472  	return err
   473  }
   474  
   475  // Flush implements the Sink interface.
   476  func (s *cloudStorageSink) Flush(ctx context.Context) error {
   477  	if s.files == nil {
   478  		return errors.New(`cannot Flush on a closed sink`)
   479  	}
   480  
   481  	var err error
   482  	s.files.Ascend(func(i btree.Item) (wantMore bool) {
   483  		err = s.flushFile(ctx, i.(*cloudStorageSinkFile))
   484  		return err == nil
   485  	})
   486  	if err != nil {
   487  		return err
   488  	}
   489  	s.files.Clear(true /* addNodesToFreeList */)
   490  
   491  	// Record the least resolved timestamp being tracked in the frontier as of this point,
   492  	// to use for naming files until the next `Flush()`. See comment on cloudStorageSink
   493  	// for an overview of the naming convention and proof of correctness.
   494  	s.dataFileTs = cloudStorageFormatTime(s.timestampOracle.inclusiveLowerBoundTS())
   495  	s.dataFilePartition = s.timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
   496  	return nil
   497  }
   498  
   499  // file should not be used after flushing.
   500  func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSinkFile) error {
   501  	if file.rawSize == 0 {
   502  		// This method shouldn't be called with an empty file, but be defensive
   503  		// about not writing empty files anyway.
   504  		return nil
   505  	}
   506  
   507  	// If the file is written via compression codec, close the codec to ensure it
   508  	// has flushed to the underlying buffer.
   509  	if file.codec != nil {
   510  		if err := file.codec.Close(); err != nil {
   511  			return err
   512  		}
   513  	}
   514  
   515  	// We use this monotonically increasing fileID to ensure correct ordering
   516  	// among files emitted at the same timestamp during the same job session.
   517  	fileID := s.fileID
   518  	s.fileID++
   519  	// Pad file ID to maintain lexical ordering among files from the same sink.
   520  	// Note that we use `-` here to delimit the filename because we want
   521  	// `%d.RESOLVED` files to lexicographically succeed data files that have the
   522  // same timestamp. This works because ASCII `-` < ASCII `.`.
   523  	filename := fmt.Sprintf(`%s-%s-%d-%d-%08x-%s-%x%s`, s.dataFileTs,
   524  		s.jobSessionID, s.nodeID, s.sinkID, fileID, file.topic, file.schemaID, s.ext)
   525  	if s.prevFilename != "" && filename < s.prevFilename {
   526  		return errors.AssertionFailedf("detected a filename %s that lexically "+
   527  			"precedes a previously emitted file: %s", filename, s.prevFilename)
   528  	}
   529  	s.prevFilename = filename
   530  	return s.es.WriteFile(ctx, filepath.Join(s.dataFilePartition, filename), bytes.NewReader(file.buf.Bytes()))
   531  }
   532  
   533  // Close implements the Sink interface.
   534  func (s *cloudStorageSink) Close() error {
   535  	s.files = nil
   536  	return s.es.Close()
   537  }
   538  
   539  type cloudStorageSinkKey struct {
   540  	topic    string
   541  	schemaID sqlbase.DescriptorVersion
   542  }
   543  
   544  func (k cloudStorageSinkKey) Less(other btree.Item) bool {
   545  	switch other := other.(type) {
   546  	case *cloudStorageSinkFile:
   547  		return keyLess(k, other.cloudStorageSinkKey)
   548  	case cloudStorageSinkKey:
   549  		return keyLess(k, other)
   550  	default:
   551  		panic(errors.Errorf("unexpected item type %T", other))
   552  	}
   553  }
   554  
   555  func keyLess(a, b cloudStorageSinkKey) bool {
   556  	if a.topic == b.topic {
   557  		return a.schemaID < b.schemaID
   558  	}
   559  	return a.topic < b.topic
   560  }