github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/workloadccl/fixture.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Licensed as a CockroachDB Enterprise file under the Cockroach Community 4 // License (the "License"); you may not use this file except in compliance with 5 // the License. You may obtain a copy of the License at 6 // 7 // https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt 8 9 package workloadccl 10 11 import ( 12 "bytes" 13 "context" 14 gosql "database/sql" 15 "database/sql/driver" 16 "encoding/json" 17 "fmt" 18 "net/url" 19 "path/filepath" 20 "strconv" 21 "strings" 22 "sync/atomic" 23 24 "cloud.google.com/go/storage" 25 "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" 26 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/cockroach/pkg/util/retry" 29 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 30 "github.com/cockroachdb/cockroach/pkg/workload" 31 "github.com/cockroachdb/errors" 32 "github.com/spf13/pflag" 33 "google.golang.org/api/iterator" 34 ) 35 36 const ( 37 fixtureGCSURIScheme = `gs` 38 ) 39 40 func init() { 41 workload.ImportDataLoader = ImportDataLoader{} 42 } 43 44 // FixtureConfig describes a storage place for fixtures. 45 type FixtureConfig struct { 46 // GCSBucket is a Google Cloud Storage bucket. 47 GCSBucket string 48 49 // GCSPrefix is a prefix to prepend to each Google Cloud Storage object 50 // path. 51 GCSPrefix string 52 53 // CSVServerURL is a url to a `./workload csv-server` to use as a source of 54 // CSV data. The url is anything accepted by our backup/restore. Notably, if 55 // you run a csv-server next to each CockroachDB node, 56 // `http://localhost:<port>` will work. 57 CSVServerURL string 58 59 // BillingProject if non-empty, is the Google Cloud project to bill for all 60 // storage requests. This is required to be set if using a "requestor pays" 61 // bucket. 62 BillingProject string 63 } 64 65 func (s FixtureConfig) objectPathToURI(folder string) string { 66 u := &url.URL{ 67 Scheme: fixtureGCSURIScheme, 68 Host: s.GCSBucket, 69 Path: folder, 70 } 71 if s.BillingProject != `` { 72 u.RawQuery = `GOOGLE_BILLING_PROJECT=` + url.QueryEscape(s.BillingProject) 73 } 74 return u.String() 75 } 76 77 // Fixture describes pre-computed data for a Generator, allowing quick 78 // initialization of large clusters. 79 type Fixture struct { 80 Config FixtureConfig 81 Generator workload.Generator 82 Tables []FixtureTable 83 } 84 85 // FixtureTable describes pre-computed data for a single table in a Generator, 86 // allowing quick initializaiton of large clusters. 87 type FixtureTable struct { 88 TableName string 89 BackupURI string 90 } 91 92 // serializeOptions deterministically represents the configuration of a 93 // Generator as a string. 94 func serializeOptions(gen workload.Generator) string { 95 f, ok := gen.(workload.Flagser) 96 if !ok { 97 return `` 98 } 99 // NB: VisitAll visits in a deterministic (alphabetical) order. 100 var buf bytes.Buffer 101 flags := f.Flags() 102 flags.VisitAll(func(f *pflag.Flag) { 103 if flags.Meta != nil && flags.Meta[f.Name].RuntimeOnly { 104 return 105 } 106 if buf.Len() > 0 { 107 buf.WriteString(`,`) 108 } 109 fmt.Fprintf(&buf, `%s=%s`, url.PathEscape(f.Name), url.PathEscape(f.Value.String())) 110 }) 111 return buf.String() 112 } 113 114 func generatorToGCSFolder(config FixtureConfig, gen workload.Generator) string { 115 meta := gen.Meta() 116 return filepath.Join( 117 config.GCSPrefix, 118 meta.Name, 119 fmt.Sprintf(`version=%s,%s`, meta.Version, serializeOptions(gen)), 120 ) 121 } 122 123 // FixtureURL returns the URL for pre-computed Generator data stored on GCS. 124 func FixtureURL(config FixtureConfig, gen workload.Generator) string { 125 return config.objectPathToURI(generatorToGCSFolder(config, gen)) 126 } 127 128 // GetFixture returns a handle for pre-computed Generator data stored on GCS. It 129 // is expected that the generator will have had Configure called on it. 130 func GetFixture( 131 ctx context.Context, gcs *storage.Client, config FixtureConfig, gen workload.Generator, 132 ) (Fixture, error) { 133 var fixture Fixture 134 var err error 135 var notFound bool 136 for r := retry.StartWithCtx(ctx, retry.Options{MaxRetries: 10}); r.Next(); { 137 err = func() error { 138 b := gcs.Bucket(config.GCSBucket) 139 if config.BillingProject != `` { 140 b = b.UserProject(config.BillingProject) 141 } 142 143 fixtureFolder := generatorToGCSFolder(config, gen) 144 _, err := b.Objects(ctx, &storage.Query{Prefix: fixtureFolder, Delimiter: `/`}).Next() 145 if errors.Is(err, iterator.Done) { 146 notFound = true 147 return errors.Errorf(`fixture not found: %s`, fixtureFolder) 148 } else if err != nil { 149 return err 150 } 151 152 fixture = Fixture{Config: config, Generator: gen} 153 for _, table := range gen.Tables() { 154 tableFolder := filepath.Join(fixtureFolder, table.Name) 155 _, err := b.Objects(ctx, &storage.Query{Prefix: tableFolder, Delimiter: `/`}).Next() 156 if errors.Is(err, iterator.Done) { 157 return errors.Errorf(`fixture table not found: %s`, tableFolder) 158 } else if err != nil { 159 return err 160 } 161 fixture.Tables = append(fixture.Tables, FixtureTable{ 162 TableName: table.Name, 163 BackupURI: config.objectPathToURI(tableFolder), 164 }) 165 } 166 return nil 167 }() 168 if err == nil || notFound { 169 break 170 } 171 } 172 return fixture, err 173 } 174 175 func csvServerPaths( 176 csvServerURL string, gen workload.Generator, table workload.Table, numNodes int, 177 ) []string { 178 if table.InitialRows.FillBatch == nil { 179 // Some workloads don't support initial table data. 180 return nil 181 } 182 183 // More files means more granularity in the progress tracking, but more 184 // files also means larger jobs table entries, so this is a balance. The 185 // IMPORT code round-robins the files in an import per node, so it's best to 186 // have some integer multiple of the number of nodes in the cluster, which 187 // will guarantee that the work is balanced across the cluster. In practice, 188 // even as few as 100 files caused jobs badness when creating tpcc fixtures, 189 // so our "integer multiple" is picked to be 1 to minimize this effect. Too 190 // bad about the progress tracking granularity. 191 numFiles := numNodes 192 rowStep := table.InitialRows.NumBatches / numFiles 193 if rowStep == 0 { 194 rowStep = 1 195 } 196 197 var paths []string 198 for rowIdx := 0; ; { 199 chunkRowStart, chunkRowEnd := rowIdx, rowIdx+rowStep 200 if chunkRowEnd > table.InitialRows.NumBatches { 201 chunkRowEnd = table.InitialRows.NumBatches 202 } 203 204 params := url.Values{ 205 `row-start`: []string{strconv.Itoa(chunkRowStart)}, 206 `row-end`: []string{strconv.Itoa(chunkRowEnd)}, 207 `version`: []string{gen.Meta().Version}, 208 } 209 if f, ok := gen.(workload.Flagser); ok { 210 flags := f.Flags() 211 flags.VisitAll(func(f *pflag.Flag) { 212 if flags.Meta[f.Name].RuntimeOnly { 213 return 214 } 215 params[f.Name] = append(params[f.Name], f.Value.String()) 216 }) 217 } 218 path := fmt.Sprintf(`%s/csv/%s/%s?%s`, 219 csvServerURL, gen.Meta().Name, table.Name, params.Encode()) 220 paths = append(paths, path) 221 222 rowIdx = chunkRowEnd 223 if rowIdx >= table.InitialRows.NumBatches { 224 break 225 } 226 } 227 return paths 228 } 229 230 // Specify an explicit empty prefix for crdb_internal to avoid an error if 231 // the database we're connected to does not exist. 232 const numNodesQuery = `SELECT count(node_id) FROM "".crdb_internal.gossip_liveness` 233 234 // MakeFixture regenerates a fixture, storing it to GCS. It is expected that the 235 // generator will have had Configure called on it. 236 // 237 // There's some ideal world in which we can generate backups (and thus 238 // fixtures) directly from a Generator, but for now, we use `IMPORT ... CSV 239 // DATA`. First a CSV file with the table data is written to GCS. `IMPORT 240 // ... CSV DATA` works by turning a set of CSV files for a single table into a 241 // backup file, then restoring that file into a cluster. The `transform` option 242 // gives us only the first half (which is all we want for fixture generation). 243 func MakeFixture( 244 ctx context.Context, 245 sqlDB *gosql.DB, 246 gcs *storage.Client, 247 config FixtureConfig, 248 gen workload.Generator, 249 filesPerNode int, 250 ) (Fixture, error) { 251 for _, t := range gen.Tables() { 252 if t.InitialRows.FillBatch == nil { 253 return Fixture{}, errors.Errorf( 254 `make fixture is not supported for workload %s`, gen.Meta().Name, 255 ) 256 } 257 } 258 259 fixtureFolder := generatorToGCSFolder(config, gen) 260 if _, err := GetFixture(ctx, gcs, config, gen); err == nil { 261 return Fixture{}, errors.Errorf( 262 `fixture %s already exists`, config.objectPathToURI(fixtureFolder)) 263 } 264 265 dbName := gen.Meta().Name 266 if _, err := sqlDB.Exec(`CREATE DATABASE IF NOT EXISTS ` + dbName); err != nil { 267 return Fixture{}, err 268 } 269 l := ImportDataLoader{ 270 FilesPerNode: filesPerNode, 271 } 272 // NB: Intentionally don't use workloadsql.Setup because it runs the PostLoad 273 // hooks (adding foreign keys, etc), but historically the BACKUPs created by 274 // `fixtures make` didn't have them. Instead they're added by `fixtures load`. 275 // Ideally, the PostLoad hooks would be idempotent and we could include them 276 // here (but still run them on load for old fixtures without them), but that 277 // yak will remain unshaved. 278 if _, err := l.InitialDataLoad(ctx, sqlDB, gen); err != nil { 279 return Fixture{}, err 280 } 281 282 g := ctxgroup.WithContext(ctx) 283 for _, t := range gen.Tables() { 284 t := t 285 g.Go(func() error { 286 q := fmt.Sprintf(`BACKUP "%s"."%s" TO $1`, dbName, t.Name) 287 output := config.objectPathToURI(filepath.Join(fixtureFolder, t.Name)) 288 log.Infof(ctx, "Backing %s up to %q...", t.Name, output) 289 _, err := sqlDB.Exec(q, output) 290 return err 291 }) 292 } 293 if err := g.Wait(); err != nil { 294 return Fixture{}, err 295 } 296 297 return GetFixture(ctx, gcs, config, gen) 298 } 299 300 // ImportDataLoader is an InitialDataLoader implementation that loads data with 301 // IMPORT. The zero-value gets some sane defaults for the tunable settings. 302 type ImportDataLoader struct { 303 FilesPerNode int 304 InjectStats bool 305 CSVServer string 306 } 307 308 // InitialDataLoad implements the InitialDataLoader interface. 309 func (l ImportDataLoader) InitialDataLoad( 310 ctx context.Context, db *gosql.DB, gen workload.Generator, 311 ) (int64, error) { 312 if l.FilesPerNode == 0 { 313 l.FilesPerNode = 1 314 } 315 316 log.Infof(ctx, "starting import of %d tables", len(gen.Tables())) 317 start := timeutil.Now() 318 const useConnectionDB = `` 319 bytes, err := ImportFixture( 320 ctx, db, gen, useConnectionDB, l.FilesPerNode, l.InjectStats, l.CSVServer) 321 if err != nil { 322 return 0, errors.Wrap(err, `importing fixture`) 323 } 324 elapsed := timeutil.Since(start) 325 log.Infof(ctx, "imported %s bytes in %d tables (took %s, %s)", 326 humanizeutil.IBytes(bytes), len(gen.Tables()), elapsed, humanizeutil.DataRate(bytes, elapsed)) 327 328 return bytes, nil 329 } 330 331 // ImportFixture works like MakeFixture, but instead of stopping halfway or 332 // writing a backup to cloud storage, it finishes ingesting the data. 333 // It also includes the option to inject pre-calculated table statistics if 334 // injectStats is true. 335 func ImportFixture( 336 ctx context.Context, 337 sqlDB *gosql.DB, 338 gen workload.Generator, 339 dbName string, 340 filesPerNode int, 341 injectStats bool, 342 csvServer string, 343 ) (int64, error) { 344 for _, t := range gen.Tables() { 345 if t.InitialRows.FillBatch == nil { 346 return 0, errors.Errorf( 347 `import fixture is not supported for workload %s`, gen.Meta().Name, 348 ) 349 } 350 } 351 352 var numNodes int 353 if err := sqlDB.QueryRow(numNodesQuery).Scan(&numNodes); err != nil { 354 return 0, err 355 } 356 357 var bytesAtomic int64 358 g := ctxgroup.WithContext(ctx) 359 tables := gen.Tables() 360 if injectStats && tablesHaveStats(tables) { 361 // Turn off automatic stats temporarily so we don't trigger stats creation 362 // after the IMPORT. We will inject stats inside importFixtureTable. 363 // TODO(rytaft): It would be better if the automatic statistics code would 364 // just trigger a no-op if there are new stats available so we wouldn't 365 // have to disable and re-enable automatic stats here. 366 enableFn := disableAutoStats(ctx, sqlDB) 367 defer enableFn() 368 } 369 370 pathPrefix := csvServer 371 if pathPrefix == `` { 372 pathPrefix = `workload://` 373 } 374 375 for _, t := range tables { 376 table := t 377 paths := csvServerPaths(pathPrefix, gen, table, numNodes*filesPerNode) 378 g.GoCtx(func(ctx context.Context) error { 379 tableBytes, err := importFixtureTable( 380 ctx, sqlDB, dbName, table, paths, `` /* output */, injectStats) 381 atomic.AddInt64(&bytesAtomic, tableBytes) 382 return errors.Wrapf(err, `importing table %s`, table.Name) 383 }) 384 } 385 if err := g.Wait(); err != nil { 386 return 0, err 387 } 388 return atomic.LoadInt64(&bytesAtomic), nil 389 } 390 391 func importFixtureTable( 392 ctx context.Context, 393 sqlDB *gosql.DB, 394 dbName string, 395 table workload.Table, 396 paths []string, 397 output string, 398 injectStats bool, 399 ) (int64, error) { 400 start := timeutil.Now() 401 var buf bytes.Buffer 402 var params []interface{} 403 qualifiedTableName := makeQualifiedTableName(dbName, &table) 404 fmt.Fprintf(&buf, `IMPORT TABLE %s %s CSV DATA (`, qualifiedTableName, table.Schema) 405 // Generate $1,...,$N-1, where N is the number of csv paths. 406 for _, path := range paths { 407 params = append(params, path) 408 if len(params) != 1 { 409 buf.WriteString(`,`) 410 } 411 fmt.Fprintf(&buf, `$%d`, len(params)) 412 } 413 buf.WriteString(`) WITH nullif='NULL'`) 414 if len(output) > 0 { 415 params = append(params, output) 416 fmt.Fprintf(&buf, `, transform=$%d`, len(params)) 417 } 418 var rows, index, tableBytes int64 419 var discard driver.Value 420 res, err := sqlDB.Query(buf.String(), params...) 421 if err != nil { 422 return 0, err 423 } 424 defer res.Close() 425 if !res.Next() { 426 return 0, gosql.ErrNoRows 427 } 428 resCols, err := res.Columns() 429 if err != nil { 430 return 0, err 431 } 432 if len(resCols) == 7 { 433 if err := res.Scan( 434 &discard, &discard, &discard, &rows, &index, &discard, &tableBytes, 435 ); err != nil { 436 return 0, err 437 } 438 } else { 439 if err := res.Scan( 440 &discard, &discard, &discard, &rows, &index, &tableBytes, 441 ); err != nil { 442 return 0, err 443 } 444 } 445 elapsed := timeutil.Since(start) 446 log.Infof(ctx, `imported %s in %s table (%d rows, %d index entries, took %s, %s)`, 447 humanizeutil.IBytes(tableBytes), table.Name, rows, index, elapsed, 448 humanizeutil.DataRate(tableBytes, elapsed)) 449 450 // Inject pre-calculated stats. 451 if injectStats && len(table.Stats) > 0 { 452 if err := injectStatistics(qualifiedTableName, &table, sqlDB); err != nil { 453 return 0, err 454 } 455 } 456 457 return tableBytes, nil 458 } 459 460 // tablesHaveStats returns whether any of the provided tables have associated 461 // table statistics to inject. 462 func tablesHaveStats(tables []workload.Table) bool { 463 for _, t := range tables { 464 if len(t.Stats) > 0 { 465 return true 466 } 467 } 468 return false 469 } 470 471 // disableAutoStats disables automatic stats if they are enabled and returns 472 // a function to re-enable them later. If automatic stats are already disabled, 473 // disableAutoStats does nothing and returns an empty function. 474 func disableAutoStats(ctx context.Context, sqlDB *gosql.DB) func() { 475 var autoStatsEnabled bool 476 err := sqlDB.QueryRow( 477 `SHOW CLUSTER SETTING sql.stats.automatic_collection.enabled`, 478 ).Scan(&autoStatsEnabled) 479 if err != nil { 480 log.Warningf(ctx, "error retrieving automatic stats cluster setting: %v", err) 481 return func() {} 482 } 483 484 if autoStatsEnabled { 485 _, err = sqlDB.Exec( 486 `SET CLUSTER SETTING sql.stats.automatic_collection.enabled=false`, 487 ) 488 if err != nil { 489 log.Warningf(ctx, "error disabling automatic stats: %v", err) 490 return func() {} 491 } 492 return func() { 493 _, err := sqlDB.Exec( 494 `SET CLUSTER SETTING sql.stats.automatic_collection.enabled=true`, 495 ) 496 if err != nil { 497 log.Warningf(ctx, "error enabling automatic stats: %v", err) 498 } 499 } 500 } 501 502 return func() {} 503 } 504 505 // injectStatistics injects pre-calculated statistics for the given table. 506 func injectStatistics(qualifiedTableName string, table *workload.Table, sqlDB *gosql.DB) error { 507 var encoded []byte 508 encoded, err := json.Marshal(table.Stats) 509 if err != nil { 510 return err 511 } 512 if _, err := sqlDB.Exec( 513 fmt.Sprintf(`ALTER TABLE %s INJECT STATISTICS '%s'`, qualifiedTableName, encoded), 514 ); err != nil { 515 if strings.Contains(err.Error(), "syntax error") { 516 // This syntax was added in v2.1, so ignore the syntax error 517 // if run against versions earlier than this. 518 return nil 519 } 520 return err 521 } 522 return nil 523 } 524 525 // makeQualifiedTableName constructs a qualified table name from the specified 526 // database name and table. 527 func makeQualifiedTableName(dbName string, table *workload.Table) string { 528 if dbName == "" { 529 return fmt.Sprintf(`"%s"`, table.Name) 530 } 531 return fmt.Sprintf(`"%s"."%s"`, dbName, table.Name) 532 } 533 534 // RestoreFixture loads a fixture into a CockroachDB cluster. An enterprise 535 // license is required to have been set in the cluster. 536 func RestoreFixture( 537 ctx context.Context, sqlDB *gosql.DB, fixture Fixture, database string, injectStats bool, 538 ) (int64, error) { 539 var bytesAtomic int64 540 g := ctxgroup.WithContext(ctx) 541 genName := fixture.Generator.Meta().Name 542 tables := fixture.Generator.Tables() 543 if injectStats && tablesHaveStats(tables) { 544 // Turn off automatic stats temporarily so we don't trigger stats creation 545 // after the RESTORE. 546 // TODO(rytaft): It would be better if the automatic statistics code would 547 // just trigger a no-op if there are new stats available so we wouldn't 548 // have to disable and re-enable automatic stats here. 549 enableFn := disableAutoStats(ctx, sqlDB) 550 defer enableFn() 551 } 552 for _, table := range fixture.Tables { 553 table := table 554 g.GoCtx(func(ctx context.Context) error { 555 start := timeutil.Now() 556 importStmt := fmt.Sprintf(`RESTORE %s.%s FROM $1 WITH into_db=$2`, genName, table.TableName) 557 var rows, index, tableBytes int64 558 var discard interface{} 559 res, err := sqlDB.Query(importStmt, table.BackupURI, database) 560 if err != nil { 561 return err 562 } 563 defer res.Close() 564 if !res.Next() { 565 return gosql.ErrNoRows 566 } 567 resCols, err := res.Columns() 568 if err != nil { 569 return err 570 } 571 if len(resCols) == 7 { 572 if err := res.Scan( 573 &discard, &discard, &discard, &rows, &index, &discard, &tableBytes, 574 ); err != nil { 575 return err 576 } 577 } else { 578 if err := res.Scan( 579 &discard, &discard, &discard, &rows, &index, &tableBytes, 580 ); err != nil { 581 return err 582 } 583 } 584 atomic.AddInt64(&bytesAtomic, tableBytes) 585 elapsed := timeutil.Since(start) 586 log.Infof(ctx, `loaded %s table %s in %s (%d rows, %d index entries, %s)`, 587 humanizeutil.IBytes(tableBytes), table.TableName, elapsed, rows, index, 588 humanizeutil.IBytes(int64(float64(tableBytes)/elapsed.Seconds()))) 589 return nil 590 }) 591 } 592 if err := g.Wait(); err != nil { 593 return 0, err 594 } 595 if injectStats { 596 for i := range tables { 597 t := &tables[i] 598 if len(t.Stats) > 0 { 599 qualifiedTableName := makeQualifiedTableName(genName, t) 600 if err := injectStatistics(qualifiedTableName, t, sqlDB); err != nil { 601 return 0, err 602 } 603 } 604 } 605 } 606 return atomic.LoadInt64(&bytesAtomic), nil 607 } 608 609 // ListFixtures returns the object paths to all fixtures stored in a FixtureConfig. 610 func ListFixtures( 611 ctx context.Context, gcs *storage.Client, config FixtureConfig, 612 ) ([]string, error) { 613 b := gcs.Bucket(config.GCSBucket) 614 if config.BillingProject != `` { 615 b = b.UserProject(config.BillingProject) 616 } 617 618 var fixtures []string 619 gensPrefix := config.GCSPrefix + `/` 620 for genIter := b.Objects(ctx, &storage.Query{Prefix: gensPrefix, Delimiter: `/`}); ; { 621 gen, err := genIter.Next() 622 if errors.Is(err, iterator.Done) { 623 break 624 } else if err != nil { 625 return nil, err 626 } 627 for genConfigIter := b.Objects(ctx, &storage.Query{Prefix: gen.Prefix, Delimiter: `/`}); ; { 628 genConfig, err := genConfigIter.Next() 629 if errors.Is(err, iterator.Done) { 630 break 631 } else if err != nil { 632 return nil, err 633 } 634 fixtures = append(fixtures, genConfig.Prefix) 635 } 636 } 637 return fixtures, nil 638 }