github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/changefeedccl/sink_cloudstorage.go

// Copyright 2019 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"bytes"
	"compress/gzip"
	"context"
	"fmt"
	"io"
	"net/url"
	"path/filepath"
	"strings"
	"sync/atomic"

	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/errors"
	"github.com/google/btree"
)

func isCloudStorageSink(u *url.URL) bool {
	switch u.Scheme {
	case `experimental-s3`, `experimental-gs`, `experimental-nodelocal`, `experimental-http`,
		`experimental-https`, `experimental-azure`:
		return true
	default:
		return false
	}
}

// cloudStorageFormatTime formats times as YYYYMMDDHHMMSSNNNNNNNNNLLLLLLLLLL.
func cloudStorageFormatTime(ts hlc.Timestamp) string {
	// TODO(dan): This is an absurdly long way to print out this timestamp, but
	// I kept hitting bugs while trying to do something clever to make it
	// shorter. Revisit.
	const f = `20060102150405`
	t := ts.GoTime()
	return fmt.Sprintf(`%s%09d%010d`, t.Format(f), t.Nanosecond(), ts.Logical)
}

type cloudStorageSinkFile struct {
	cloudStorageSinkKey
	codec   io.WriteCloser
	rawSize int
	buf     bytes.Buffer
}

var _ io.Writer = &cloudStorageSinkFile{}

func (f *cloudStorageSinkFile) Write(p []byte) (int, error) {
	f.rawSize += len(p)
	if f.codec != nil {
		return f.codec.Write(p)
	}
	return f.buf.Write(p)
}

// cloudStorageSink writes changefeed output to files in a cloud storage bucket
// (S3/GCS/HTTP) maintaining CDC's ordering guarantees (see below) for each
// row through lexicographical filename ordering.
//
// Changefeeds offer the following two ordering guarantees to external clients:
//
// 1. Rows are emitted with a timestamp. Individual rows are emitted in
// timestamp order. There may be duplicates, but once a row is seen at a given
// timestamp no previously unseen version of that row will be emitted at a less
// (or equal) timestamp. For example, you may see 1 2 1 2, or even 1 2 1, but
// never simply 2 1.
// 2. Periodically, a resolved timestamp is emitted. This is a changefeed-wide
// guarantee that no previously unseen row will later be seen with a timestamp
// less (or equal) to the resolved one. The cloud storage sink is structured as
// a number of distsql processors that each emit some part of the total changefeed.
// These processors only write files containing row data (initially only in ndjson
// format in this cloudStorageSink). This mapping is stable for a given distsql
// flow of a changefeed (meaning any given row is always emitted by the same
// processor), but it's not stable across restarts (pause/unpause).
// Each of these processors reports partial progress information to a central
// coordinator (changeFrontier), which is responsible for writing the resolved
// timestamp files.
//
// In addition to the guarantees required of any changefeed, the cloud storage
// sink adds some quality of life guarantees of its own.
// 3. All rows in a file are from the same table. Further, all rows in a file are
// from the same schema version of that table, and so all have the same schema.
// 4. All files are partitioned into folders by the date part of the filename.
//
// Two methods of the cloudStorageSink on each data emitting processor are
// called. EmitRow is called with each row change and Flush is called before
// sending partial progress information to the coordinator. This happens with no
// concurrency; all EmitRow and Flush calls for a sink are serialized.
// EmitResolvedTimestamp is only called by the `changeFrontier`.
//
// The rows handed to EmitRow by the changefeed are guaranteed to satisfy
// condition (1). Further, as long as the sink has written every EmitRow it's
// gotten before returning from Flush, condition (2) is upheld.
//
// The cloudStorageSink uses lexicographic filename ordering to provide a total
// ordering for the output of this sink. Guarantees (1) and (2) depend on this
// ordering. Specifically, at any given time, the order of the data written by
// the sink is by lexicographic filename and then by order within the file.
//
// Batching these row updates into files is complicated because:
// a) We need to pick a representative timestamp for the file. This is required
// for comparison with resolved timestamp filenames as part of guarantee (2).
// b) For each topic, the row ordering guarantees must be preserved.
// One intuitive way of solving (b) is to ensure that filenames are emitted in
// strictly lexically increasing order (see assertion in `flushFile()`). This
// guarantees correctness as long as the underlying system is correct.
//
// Before the local progress is sent to the coordinator (called the "local frontier" as
// opposed to the resolved timestamp which is exactly a "global frontier" or
// "changefeed-level frontier"), all buffered data which preceded that update is flushed.
// To accomplish (a), we need two invariants. (a1) is that once Flush is called, we can
// never write a file with a timestamp that is less than or equal to the local frontier.
// This is because the local progress update could indeed cause a resolved timestamp file
// to be written with that timestamp. We cannot break this invariant because the client is
// free to ignore any files with a lexically lesser filename. Additionally, because we
// picked the resolved timestamp filename to sort after a data file with the same
// timestamp, a data file can't even be emitted at the same timestamp; it must be emitted
// at a timestamp that is strictly greater than the last globally resolved timestamp. Note
// that the local frontier is a guarantee that the sink will never get an EmitRow with
// that timestamp or lower. (a2) is that whenever Flush is called, all files written by
// the sink must be named using timestamps less than or equal to the one for the local
// frontier at the time Flush is called.
// This is again because our local progress update
// could cause the global progress to be updated and we need everything written so far to
// lexically compare as less than the new resolved timestamp.
//
// The data files written by this sink are named according to the pattern
// `<timestamp>-<uniquer>-<topic_id>-<schema_id>.<ext>`, each component of which is as
// follows:
//
// `<timestamp>` is the smallest resolved timestamp being tracked by this sink's
// `changeAggregator`, as of the time the last `Flush()` call was made (or `StatementTime`
// if `Flush()` hasn't been called yet). Intuitively, this can be thought of as an
// inclusive lower bound on the timestamps of updates that can be seen in a given file.
//
// `<topic_id>` corresponds to one SQL table.
//
// `<schema_id>` changes whenever the SQL table schema changes, which allows us
// to guarantee to users that _all entries in a given file have the same
// schema_.
//
// `<uniquer>` is used to keep nodes in a cluster from overwriting each other's data and
// should be ignored by external users. It also keeps a single node from overwriting its
// own data if there are multiple changefeeds, or if a changefeed gets
// canceled/restarted/zombied. Internally, it's generated by
// `<session_id>-<node_id>-<sink_id>-<file_id>` where `<sink_id>` is a unique id for each
// cloudStorageSink in a running process, `<file_id>` is a unique id for each file written
// by a given `<sink_id>`, and `<session_id>` is a unique identifying string for the job
// session running the `changeAggregator` that owns this sink.
//
// `<ext>` implies the format of the file: currently the only option is
// `ndjson`, which means a text file conforming to the "Newline Delimited JSON"
// spec.
//
// This naming convention of data files is carefully chosen in order to preserve
// the external ordering guarantees of CDC. Naming output files in this fashion
// provides monotonicity among files emitted by a given sink for a given table
// name, table schema version pair within a given job session. This ensures that
// all row updates for a given span are read in an order that preserves the CDC
// ordering guarantees, even in the presence of job restarts (see proof below).
// Each record in the data files is a value; keys are not included, so the
// `envelope` option must be set to `value_only`. Within a file, records are not
// guaranteed to be sorted by timestamp. A duplicate of some records might exist
// in a different file or even in the same file.
//
// The resolved timestamp files are named `<timestamp>.RESOLVED`. This is
// carefully done so that we can offer the following external guarantee: At any
// given time, if the files are iterated in lexicographic filename order,
// then encountering any filename containing `RESOLVED` means that everything
// before it is finalized (and thus can be ingested into some other system and
// deleted, included in hive queries, etc). A typical user of cloudStorageSink
// would periodically do exactly this.
//
// Still TODO is writing out data schemas, Avro support, bounding memory usage.
//
// Now what follows is a proof of why the above is correct even in the presence
// of multiple job restarts.
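// (Before diving into the proof, a purely hypothetical illustration of the
// layout described above, with every concrete value invented: a consumer
// listing the bucket in lexicographic order might see
//
//   2020-03-25/<33-char timestamp A>-<session>-1-1-00000000-users-1.ndjson
//   2020-03-25/<33-char timestamp A>-<session>-1-1-00000001-orders-1.ndjson
//   2020-03-25/<33-char timestamp B>.RESOLVED
//
// and, per the guarantee above, once the `.RESOLVED` entry is encountered,
// everything listed before it is finalized.)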
// We begin by establishing some terminology and by
// formally (re)stating some invariants about the underlying system.
//
// Terminology
// 1. Characters A,B...Z refer to job sessions.
// 2. Ai, for i in Nat, refers to `the filename of the i'th data file
// emitted by session A`. Note that because of the invariants we will state,
// this can also be taken to mean "the filename of lexically the i'th data
// file emitted by session A". This is a notation simply used for convenience.
// 3. Ae > Bf refers to a lexical comparison of Ae and Bf.
// 4. ts(Xi) refers to the <timestamp> part of Xi.
//
// Invariants
// 1. We assume that the ordering guarantee (1) stated at the beginning of this
// comment blob is upheld by the underlying system. More specifically, this proof
// only proves correctness of the cloudStorageSink, not the entire system.
// To re-state, if the rows are read in the order they are emitted by the underlying
// system, it is impossible to see a previously unseen timestamp that is lower
// than some timestamp we've seen before.
// 2. Data files emitted by a single session of a changefeed job are lexically
// ordered exactly as they were emitted. Xi lexically precedes X(i+1), for i in
// Nat, for all job sessions X. The naming convention described above guarantees
// this.
// 3. Data files are named using the successor of the "local frontier" timestamp as of the
// time the last `Flush()` call was made (or StatementTime in case `Flush()` hasn't been
// called yet). Since all EmitRow calls are guaranteed to be for rows that equal or
// succeed this timestamp, ts(Xi) is an inclusive lower bound for the rows contained
// inside Xi.
// 4. When a job restarts, the new job session starts with a catch-up scan
// from the last globally resolved timestamp of the changefeed. This catch-up
// scan replays all rows since this resolved timestamp preserving invariant 1.
//
// Corollary 1: It is impossible to see a previously unseen timestamp that is
// lower than any timestamp seen thus far, in a lexical ordering of files if the
// files satisfy invariant 2 and the underlying system satisfies invariant 1.
//
// Note that correctness does not necessarily imply invariants 1 and 2.
//
// Lemma 1: Given two totally ordered sets of files X and Y that preserve CDC's ordering
// guarantee along with invariants 3 and 4, their union produces a totally ordered set of
// files that preserves this guarantee.
// Proof of lemma: Let's refer to the data filenames emitted by these sessions as X1,X2...
// and similarly for session Y. Additionally, let's refer to the last file ever emitted by
// session X as Xn, for some n in Nat. Now lexically speaking there are 2 cases here:
// 1. Y1 < Xn: For the sake of contradiction, let's assume there is a violation here. Since
// there is a total lexical ordering among files in each set individually, we must have
// read Y(e-1) before Ye, for all e in Nat. Similarly for X. Without loss of generality,
// let's say there are 2 files Ye and Xf such that (1.1) Ye < Xf and Xf contains an unseen
// timestamp that is lower than at least one timestamp seen in Ye. Call this timestamp t.
// More explicitly, it must be the case that this timestamp t does not exist in any files
// Y1...Ye.
// This must mean that timestamp t lies before the starting point of session Y's
// catch-up scan (again from invariant 4). Thus it must be the case that (1.2) ts(Y1) > t.
// Now, due to invariant 3, we know that we won't see any rows in a file Xi with a
// timestamp that is lower than ts(Xi). This must mean that (1.3) ts(Xf) <= t. Statements
// 1.1, 1.2 and 1.3 together give us a contradiction.
// 2. Y1 > Xn: This case means that all data files of session Y lexically succeed all the
// data files of session X. This means that all data files are ordered monotonically
// relative to when they were emitted, which gives us invariant 2 (but for 2 sessions).
// Correctness follows from this and invariant 1. Note that Y1 == Xn is not possible
// because sessions are assigned unique session IDs.
// QED.
//
// Proof of correctness: It is impossible to see a previously unseen timestamp that is
// lower than any timestamp seen thus far, across n job sessions for all n, n in Nat. We
// do this by induction; let k be the number of job sessions a changefeed job goes
// through:
// Case k = 1: Correctness for this case follows from corollary 1.
// Case k = 2: This follows from lemma 1 stated above.
// Case k > 2 (induction case): Assume that the statement of the proof is true for the
// output of a changefeed job with k sessions. We will show that it must also be true for
// the output of a changefeed job with k+1 sessions. Let's refer to the first k jobs as
// P1,P2,..Pk, and the (k+1)st job as Q. Now, since we assumed the statement is true for
// P1,P2...Pk, it must produce a totally ordered (lexical ordering) set of files that
// satisfies the requirements of lemma 1. So we can consider these k jobs conceptually as
// one job (call it P). Now, we're back to the case where k = 2 with jobs P and Q. Thus,
// by induction we have the required proof.
type cloudStorageSink struct {
	nodeID            roachpb.NodeID
	sinkID            int64
	targetMaxFileSize int64
	settings          *cluster.Settings
	partitionFormat   string

	ext           string
	recordDelimFn func(io.Writer) error

	compression string

	es cloud.ExternalStorage

	// These are fields to track information needed to output files based on the naming
	// convention described above. See comment on cloudStorageSink above for more details.
	fileID int64
	files  *btree.BTree // of *cloudStorageSinkFile

	timestampOracle timestampLowerBoundOracle
	jobSessionID    string
	// We keep track of the successor of the least resolved timestamp in the local
	// frontier as of the time of the last `Flush()` call. If `Flush()` hasn't been
	// called, these fields are based on the statement time of the changefeed.
	dataFileTs        string
	dataFilePartition string
	prevFilename      string
}

const sinkCompressionGzip = "gzip"

var cloudStorageSinkIDAtomic int64

func makeCloudStorageSink(
	ctx context.Context,
	baseURI string,
	nodeID roachpb.NodeID,
	targetMaxFileSize int64,
	settings *cluster.Settings,
	opts map[string]string,
	timestampOracle timestampLowerBoundOracle,
	makeExternalStorageFromURI cloud.ExternalStorageFromURIFactory,
) (Sink, error) {
	// Date partitioning is pretty standard, so no override for now, but we could
	// plumb one down if someone needs it.
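	// (Illustrative note: the Go reference-time layout `2006-01-02` below produces
	// partition folder names like `2020-03-25`, so the date prefix of every path
	// sorts lexically in chronological order.)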
	const defaultPartitionFormat = `2006-01-02`

	sinkID := atomic.AddInt64(&cloudStorageSinkIDAtomic, 1)
	s := &cloudStorageSink{
		nodeID:            nodeID,
		sinkID:            sinkID,
		settings:          settings,
		targetMaxFileSize: targetMaxFileSize,
		files:             btree.New(8),
		partitionFormat:   defaultPartitionFormat,
		timestampOracle:   timestampOracle,
		// TODO(dan,ajwerner): Use the jobs framework's session ID once that's available.
		jobSessionID: generateChangefeedSessionID(),
	}
	if timestampOracle != nil {
		s.dataFileTs = cloudStorageFormatTime(timestampOracle.inclusiveLowerBoundTS())
		s.dataFilePartition = timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
	}

	switch changefeedbase.FormatType(opts[changefeedbase.OptFormat]) {
	case changefeedbase.OptFormatJSON:
		// TODO(dan): It seems like these should be on the encoder, but that
		// would require a bit of refactoring.
		s.ext = `.ndjson`
		s.recordDelimFn = func(w io.Writer) error {
			_, err := w.Write([]byte{'\n'})
			return err
		}
	default:
		return nil, errors.Errorf(`this sink is incompatible with %s=%s`,
			changefeedbase.OptFormat, opts[changefeedbase.OptFormat])
	}

	switch changefeedbase.EnvelopeType(opts[changefeedbase.OptEnvelope]) {
	case changefeedbase.OptEnvelopeWrapped:
	default:
		return nil, errors.Errorf(`this sink is incompatible with %s=%s`,
			changefeedbase.OptEnvelope, opts[changefeedbase.OptEnvelope])
	}

	if _, ok := opts[changefeedbase.OptKeyInValue]; !ok {
		return nil, errors.Errorf(`this sink requires the WITH %s option`, changefeedbase.OptKeyInValue)
	}

	if codec, ok := opts[changefeedbase.OptCompression]; ok && codec != "" {
		if strings.EqualFold(codec, "gzip") {
			s.compression = sinkCompressionGzip
			s.ext = s.ext + ".gz"
		} else {
			return nil, errors.Errorf(`unsupported compression codec %q`, codec)
		}
	}

	var err error
	if s.es, err = makeExternalStorageFromURI(ctx, baseURI); err != nil {
		return nil, err
	}

	return s, nil
}

func (s *cloudStorageSink) getOrCreateFile(
	topic string, schemaID sqlbase.DescriptorVersion,
) *cloudStorageSinkFile {
	key := cloudStorageSinkKey{topic, schemaID}
	if item := s.files.Get(key); item != nil {
		return item.(*cloudStorageSinkFile)
	}
	f := &cloudStorageSinkFile{
		cloudStorageSinkKey: key,
	}
	switch s.compression {
	case sinkCompressionGzip:
		f.codec = gzip.NewWriter(&f.buf)
	}
	s.files.ReplaceOrInsert(f)
	return f
}

// EmitRow implements the Sink interface.
func (s *cloudStorageSink) EmitRow(
	ctx context.Context, table *sqlbase.TableDescriptor, _, value []byte, updated hlc.Timestamp,
) error {
	if s.files == nil {
		return errors.New(`cannot EmitRow on a closed sink`)
	}

	file := s.getOrCreateFile(table.Name, table.Version)

	// TODO(dan): Memory monitoring for this
	if _, err := file.Write(value); err != nil {
		return err
	}
	if err := s.recordDelimFn(file); err != nil {
		return err
	}

	if int64(file.buf.Len()) > s.targetMaxFileSize {
		if err := s.flushTopicVersions(ctx, file.topic, file.schemaID); err != nil {
			return err
		}
	}
	return nil
}

// EmitResolvedTimestamp implements the Sink interface.
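// Per the naming convention above, the resolved timestamp is written as an
// object of the form `<date partition>/<formatted timestamp>.RESOLVED`, for
// example (hypothetical values) `2020-03-25/<33-char timestamp>.RESOLVED`.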
func (s *cloudStorageSink) EmitResolvedTimestamp(
	ctx context.Context, encoder Encoder, resolved hlc.Timestamp,
) error {
	if s.files == nil {
		return errors.New(`cannot EmitResolvedTimestamp on a closed sink`)
	}

	var noTopic string
	payload, err := encoder.EncodeResolvedTimestamp(ctx, noTopic, resolved)
	if err != nil {
		return err
	}
	// Don't need to copy payload because we never buffer it anywhere.

	part := resolved.GoTime().Format(s.partitionFormat)
	filename := fmt.Sprintf(`%s.RESOLVED`, cloudStorageFormatTime(resolved))
	if log.V(1) {
		log.Infof(ctx, "writing file %s %s", filename, resolved.AsOfSystemTime())
	}
	return s.es.WriteFile(ctx, filepath.Join(part, filename), bytes.NewReader(payload))
}

// flushTopicVersions flushes all open files for the provided topic up to and
// including maxVersionToFlush.
//
// To understand why we need to do this, consider the following example in case
// we didn't have this logic:
//
// 1. The sink starts buffering a file for schema 1.
// 2. It then starts buffering a file for schema 2.
// 3. The newer, schema 2 file exceeds the file size threshold and thus gets
//    flushed at timestamp x with fileID 0.
// 4. The older, schema 1 file is also flushed at timestamp x and thus is
//    assigned a fileID greater than 0.
//
// This would lead to the older file being lexically ordered after the newer,
// schema 2 file, leading to a violation of our ordering guarantees (see comment
// on cloudStorageSink).
func (s *cloudStorageSink) flushTopicVersions(
	ctx context.Context, topic string, maxVersionToFlush sqlbase.DescriptorVersion,
) (err error) {
	var toRemoveAlloc [2]sqlbase.DescriptorVersion // generally avoid allocating
	toRemove := toRemoveAlloc[:0]                  // schemaIDs of flushed files
	gte := cloudStorageSinkKey{topic: topic}
	lt := cloudStorageSinkKey{topic: topic, schemaID: maxVersionToFlush + 1}
	s.files.AscendRange(gte, lt, func(i btree.Item) (wantMore bool) {
		f := i.(*cloudStorageSinkFile)
		if err = s.flushFile(ctx, f); err == nil {
			toRemove = append(toRemove, f.schemaID)
		}
		return err == nil
	})
	for _, v := range toRemove {
		s.files.Delete(cloudStorageSinkKey{topic: topic, schemaID: v})
	}
	return err
}

// Flush implements the Sink interface.
func (s *cloudStorageSink) Flush(ctx context.Context) error {
	if s.files == nil {
		return errors.New(`cannot Flush on a closed sink`)
	}

	var err error
	s.files.Ascend(func(i btree.Item) (wantMore bool) {
		err = s.flushFile(ctx, i.(*cloudStorageSinkFile))
		return err == nil
	})
	if err != nil {
		return err
	}
	s.files.Clear(true /* addNodesToFreeList */)

	// Record the least resolved timestamp being tracked in the frontier as of this point,
	// to use for naming files until the next `Flush()`. See comment on cloudStorageSink
	// for an overview of the naming convention and proof of correctness.
	s.dataFileTs = cloudStorageFormatTime(s.timestampOracle.inclusiveLowerBoundTS())
	s.dataFilePartition = s.timestampOracle.inclusiveLowerBoundTS().GoTime().Format(s.partitionFormat)
	return nil
}

// file should not be used after flushing.
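//
// For reference, the object written below lands at (components per the naming
// convention above; concrete values are hypothetical):
//
//   <dataFilePartition>/<dataFileTs>-<jobSessionID>-<nodeID>-<sinkID>-<fileID as %08x>-<topic>-<schemaID as %x><ext>
//
// e.g. 2020-03-25/<33-char timestamp>-<session>-1-1-00000000-users-1.ndjson.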
func (s *cloudStorageSink) flushFile(ctx context.Context, file *cloudStorageSinkFile) error {
	if file.rawSize == 0 {
		// This method shouldn't be called with an empty file, but be defensive
		// about not writing empty files anyway.
		return nil
	}

	// If the file is written via compression codec, close the codec to ensure it
	// has flushed to the underlying buffer.
	if file.codec != nil {
		if err := file.codec.Close(); err != nil {
			return err
		}
	}

	// We use this monotonically increasing fileID to ensure correct ordering
	// among files emitted at the same timestamp during the same job session.
	fileID := s.fileID
	s.fileID++
	// Pad file ID to maintain lexical ordering among files from the same sink.
	// Note that we use `-` here to delimit the filename because we want
	// `%d.RESOLVED` files to lexicographically succeed data files that have the
	// same timestamp. This works because ASCII `-` < ASCII `.`.
	filename := fmt.Sprintf(`%s-%s-%d-%d-%08x-%s-%x%s`, s.dataFileTs,
		s.jobSessionID, s.nodeID, s.sinkID, fileID, file.topic, file.schemaID, s.ext)
	if s.prevFilename != "" && filename < s.prevFilename {
		return errors.AssertionFailedf("error: detected a filename %s that lexically "+
			"precedes a file emitted before: %s", filename, s.prevFilename)
	}
	s.prevFilename = filename
	return s.es.WriteFile(ctx, filepath.Join(s.dataFilePartition, filename), bytes.NewReader(file.buf.Bytes()))
}

// Close implements the Sink interface.
func (s *cloudStorageSink) Close() error {
	s.files = nil
	return s.es.Close()
}

type cloudStorageSinkKey struct {
	topic    string
	schemaID sqlbase.DescriptorVersion
}

func (k cloudStorageSinkKey) Less(other btree.Item) bool {
	switch other := other.(type) {
	case *cloudStorageSinkFile:
		return keyLess(k, other.cloudStorageSinkKey)
	case cloudStorageSinkKey:
		return keyLess(k, other)
	default:
		panic(errors.Errorf("unexpected item type %T", other))
	}
}

func keyLess(a, b cloudStorageSinkKey) bool {
	if a.topic == b.topic {
		return a.schemaID < b.schemaID
	}
	return a.topic < b.topic
}
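
// exampleFilenameOrdering is an illustrative sketch; the function is
// hypothetical, unused by the sink, and its concrete values are invented. It
// demonstrates the property flushFile relies on above: a data file and a
// `.RESOLVED` file that share the same formatted timestamp compare in the
// intended lexical order because ASCII `-` sorts before ASCII `.`.
func exampleFilenameOrdering() {
	// Format an arbitrary timestamp the same way both kinds of filenames do.
	ts := cloudStorageFormatTime(hlc.Timestamp{WallTime: 1585141200000000000, Logical: 0})
	// A hypothetical data filename and the resolved filename for the same timestamp.
	dataFile := ts + `-sessionid-1-1-00000000-users-1.ndjson`
	resolvedFile := ts + `.RESOLVED`
	fmt.Println(dataFile < resolvedFile) // prints true: the data file sorts first
}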