code.vegaprotocol.io/vega@v0.79.0/datanode/networkhistory/snapshot/service_create_snapshot.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package snapshot
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"os"
    24  	"path"
    25  	"sort"
    26  	"time"
    27  
    28  	"code.vegaprotocol.io/vega/datanode/metrics"
    29  	"code.vegaprotocol.io/vega/datanode/networkhistory/segment"
    30  	"code.vegaprotocol.io/vega/datanode/sqlstore"
    31  	"code.vegaprotocol.io/vega/libs/fs"
    32  	vio "code.vegaprotocol.io/vega/libs/io"
    33  	"code.vegaprotocol.io/vega/logging"
    34  
    35  	"github.com/georgysavva/scany/pgxscan"
    36  	"github.com/jackc/pgx/v4"
    37  	"golang.org/x/exp/maps"
    38  )
    39  
    40  var (
    41  	ErrSnapshotExists = errors.New("snapshot exists")
    42  	ErrNoLastSnapshot = errors.New("no last snapshot")
    43  )
    44  
    45  func (b *Service) CreateSnapshot(ctx context.Context, chainID string, toHeight int64) (segment.Unpublished, error) {
    46  	return b.createNewSnapshot(ctx, chainID, toHeight, false)
    47  }
    48  
    49  func (b *Service) CreateSnapshotAsynchronously(ctx context.Context, chainID string, toHeight int64) (segment.Unpublished, error) {
    50  	return b.createNewSnapshot(ctx, chainID, toHeight, true)
    51  }
    52  
// createNewSnapshot copies current-state and history table data for the span
// ending at toHeight into an on-disk unpublished segment. When async is true,
// the span and lock file are reserved synchronously but the data copy itself
// runs in a background goroutine; the returned segment describes the snapshot
// either way. Returns ErrSnapshotExists if a snapshot at toHeight already exists.
func (b *Service) createNewSnapshot(ctx context.Context, chainID string, toHeight int64,
	async bool,
) (segment.Unpublished, error) {
	var err error
	if len(chainID) == 0 {
		return segment.Unpublished{}, fmt.Errorf("chain id is required")
	}

	// Capture the database/schema version up front so the segment records
	// which database version produced it.
	dbMetaData, err := NewDatabaseMetaData(ctx, b.connPool)
	if err != nil {
		return segment.Unpublished{}, fmt.Errorf("failed to get data dump metadata: %w", err)
	}

	// Clean-up functions are appended as resources are acquired and executed
	// in reverse (LIFO) order, either on an error path here or after the data
	// copy completes (see snapshotData closure below).
	var cleanUp []func()
	ctxWithTimeout, cancelFn := context.WithTimeout(ctx, b.config.WaitForCreationLockTimeout.Duration)
	defer cancelFn()

	// This lock ensures snapshots cannot be created in parallel, during normal run this should never be an issue
	// as the time between snapshots is sufficiently large, however during event replay (and some testing/dev scenarios)
	// the time between snapshots can be sufficiently small to run the risk that snapshotting could overlap without this
	// lock.
	if !b.createSnapshotLock.Lock(ctxWithTimeout) {
		panic("context cancelled whilst waiting for create snapshot lock")
	}

	cleanUp = append(cleanUp, func() { b.createSnapshotLock.Unlock() })

	copyDataTx, err := b.connPool.Begin(ctx)
	if err != nil {
		runAllInReverseOrder(cleanUp)
		return segment.Unpublished{}, fmt.Errorf("failed to begin copy table data transaction: %w", err)
	}
	// Rolling back a committed transaction does nothing
	cleanUp = append(cleanUp, func() { _ = copyDataTx.Rollback(ctx) })

	// SERIALIZABLE isolation gives the copy a consistent view of all tables
	// for the duration of the transaction.
	// NOTE(review): the error message below misspells "serializable" — a
	// candidate fix, left untouched here as it is a runtime string.
	if _, err = copyDataTx.Exec(ctx, "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); err != nil {
		runAllInReverseOrder(cleanUp)
		return segment.Unpublished{}, fmt.Errorf("failed to set transaction isolation level to serilizable: %w", err)
	}

	// Determine (and persist) the height span this snapshot will cover.
	nextSpan, err := getNextSnapshotSpan(ctx, toHeight, copyDataTx)
	if err != nil {
		runAllInReverseOrder(cleanUp)
		if errors.Is(err, ErrSnapshotExists) {
			return segment.Unpublished{}, ErrSnapshotExists
		}
		return segment.Unpublished{}, fmt.Errorf("failed to get next snapshot span:%w", err)
	}

	s := segment.Unpublished{
		Base: segment.Base{
			HeightFrom:      nextSpan.FromHeight,
			HeightTo:        nextSpan.ToHeight,
			DatabaseVersion: dbMetaData.DatabaseVersion,
			ChainID:         chainID,
		},
		Directory: b.copyToPath,
	}

	b.log.Infof("creating snapshot for %+v", s)

	// The in-progress (lock) file marks this segment as incomplete; it is
	// removed by the cleanUp stack once the copy finishes, and its presence
	// causes GetUnpublishedSnapshots to skip the segment.
	if _, err = os.Create(s.InProgressFilePath()); err != nil {
		runAllInReverseOrder(cleanUp)
		return segment.Unpublished{}, fmt.Errorf("failed to create write lock file:%w", err)
	}
	cleanUp = append(cleanUp, func() { _ = os.Remove(s.InProgressFilePath()) })

	// To ensure reads are isolated from this point forward execute a read on last block
	_, err = sqlstore.GetLastBlockUsingConnection(ctx, copyDataTx)
	if err != nil {
		runAllInReverseOrder(cleanUp)
		return segment.Unpublished{}, fmt.Errorf("failed to get last block using connection: %w", err)
	}

	// Performs the data copy and then unwinds the cleanUp stack (unlock,
	// rollback, lock-file removal) regardless of outcome.
	// NOTE(review): in the async case this closure writes the outer `err`
	// from a goroutine after the function has returned — worth confirming no
	// other reader races on it.
	snapshotData := func() {
		defer func() { runAllInReverseOrder(cleanUp) }()
		err = b.snapshotData(ctx, copyDataTx, dbMetaData, s)
		if err != nil {
			b.log.Panic("failed to snapshot data", logging.Error(err))
		}
	}

	if async {
		go snapshotData()
	} else {
		snapshotData()
	}

	return s, nil
}
   143  
   144  func getNextSnapshotSpan(ctx context.Context, toHeight int64, copyDataTx pgx.Tx) (Span, error) {
   145  	lastSnapshotSpan, err := getLastSnapshotSpan(ctx, copyDataTx)
   146  
   147  	var nextSpan Span
   148  	if err != nil {
   149  		if errors.Is(err, ErrNoLastSnapshot) {
   150  			oldestHistoryBlock, err := sqlstore.GetOldestHistoryBlockUsingConnection(ctx, copyDataTx)
   151  			if err != nil {
   152  				return Span{}, fmt.Errorf("failed to get oldest history block:%w", err)
   153  			}
   154  			nextSpan = Span{
   155  				FromHeight: oldestHistoryBlock.Height,
   156  				ToHeight:   toHeight,
   157  			}
   158  		} else {
   159  			return nextSpan, fmt.Errorf("failed to get last snapshot span:%w", err)
   160  		}
   161  	} else {
   162  		if toHeight < lastSnapshotSpan.ToHeight {
   163  			return Span{}, fmt.Errorf("toHeight %d is less than last snapshot span %+v", toHeight, lastSnapshotSpan)
   164  		}
   165  
   166  		if toHeight == lastSnapshotSpan.ToHeight {
   167  			return Span{}, ErrSnapshotExists
   168  		}
   169  
   170  		nextSpan = Span{FromHeight: lastSnapshotSpan.ToHeight + 1, ToHeight: toHeight}
   171  	}
   172  
   173  	err = setLastSnapshotSpan(ctx, copyDataTx, nextSpan.FromHeight, nextSpan.ToHeight)
   174  	if err != nil {
   175  		return Span{}, fmt.Errorf("failed to set last snapshot span:%w", err)
   176  	}
   177  
   178  	return nextSpan, nil
   179  }
   180  
// Span is an inclusive range of block heights covered by a snapshot.
type Span struct {
	FromHeight int64 // first block height included in the snapshot
	ToHeight   int64 // last block height included in the snapshot
}
   185  
   186  func setLastSnapshotSpan(ctx context.Context, connection sqlstore.Connection, fromHeight, toHeight int64) error {
   187  	_, err := connection.Exec(ctx, `Insert into last_snapshot_span (from_height, to_height) VALUES($1, $2)
   188  	 on conflict(onerow_check) do update set from_height=EXCLUDED.from_height, to_height=EXCLUDED.to_height`,
   189  		fromHeight, toHeight)
   190  	if err != nil {
   191  		return fmt.Errorf("failed to update last_snapshot_span table:%w", err)
   192  	}
   193  	return nil
   194  }
   195  
   196  func getLastSnapshotSpan(ctx context.Context, connection sqlstore.Connection) (*Span, error) {
   197  	ls := &Span{}
   198  	err := pgxscan.Get(ctx, connection, ls,
   199  		`SELECT from_height, to_height
   200  		FROM last_snapshot_span`)
   201  
   202  	if errors.Is(err, pgx.ErrNoRows) {
   203  		return nil, ErrNoLastSnapshot
   204  	}
   205  
   206  	return ls, err
   207  }
   208  
   209  func runAllInReverseOrder(functions []func()) {
   210  	for i := len(functions) - 1; i >= 0; i-- {
   211  		functions[i]()
   212  	}
   213  }
   214  
// snapshotData copies all table data visible to tx into the segment's
// unpublished data directory: non-hypertable "current state" tables in full,
// and hypertable history restricted to the segment's span. It commits tx on
// success and records size/row-count/duration metrics.
func (b *Service) snapshotData(ctx context.Context, tx pgx.Tx, dbMetaData DatabaseMetadata, seg segment.Unpublished) error {
	defer func() {
		// Calling rollback on a committed transaction has no effect, hence we can rollback in defer to ensure
		// always rolled back if the transaction was not successfully committed
		_ = tx.Rollback(ctx)
	}()

	// Force UTC so timestamps in the CSV output are timezone-independent.
	if _, err := tx.Exec(ctx, "SET TIME ZONE 0"); err != nil {
		return fmt.Errorf("failed to set timezone to UTC:%w", err)
	}

	start := time.Now()
	b.log.Infof("copying all table data....")

	// Current-state and history data are written to separate subdirectories
	// of the segment's snapshot data directory.
	currentStateDir := path.Join(seg.UnpublishedSnapshotDataDirectory(), "currentstate")
	historyStateDir := path.Join(seg.UnpublishedSnapshotDataDirectory(), "history")

	err := os.MkdirAll(currentStateDir, os.ModePerm)
	if err != nil {
		return fmt.Errorf("failed to create current state directory:%w", err)
	}

	err = os.MkdirAll(historyStateDir, os.ModePerm)
	if err != nil {
		return fmt.Errorf("failed to create history state directory:%w", err)
	}

	// Write Current State
	currentSQL := currentStateCopySQL(dbMetaData)
	currentRowsCopied, currentStateBytesCopied, err := copyTablesData(ctx, tx, currentSQL, currentStateDir)
	if err != nil {
		return fmt.Errorf("failed to copy current state table data:%w", err)
	}

	// Write History
	historySQL := historyCopySQL(dbMetaData, seg)
	historyRowsCopied, historyBytesCopied, err := copyTablesData(ctx, tx, historySQL, historyStateDir)
	if err != nil {
		return fmt.Errorf("failed to copy history table data:%w", err)
	}

	// Commit ends the serializable read; the deferred rollback then no-ops.
	err = tx.Commit(ctx)
	if err != nil {
		return fmt.Errorf("failed to commit snapshot transaction:%w", err)
	}

	metrics.SetLastSnapshotRowcount(float64(currentRowsCopied + historyRowsCopied))
	metrics.SetLastSnapshotCurrentStateBytes(float64(currentStateBytesCopied))
	metrics.SetLastSnapshotHistoryBytes(float64(historyBytesCopied))
	metrics.SetLastSnapshotSeconds(time.Since(start).Seconds())

	b.log.Info("finished creating snapshot for chain", logging.String("chain", seg.ChainID),
		logging.Int64("from height", seg.HeightFrom),
		logging.Int64("to height", seg.HeightTo), logging.Duration("time taken", time.Since(start)),
		logging.Int64("rows copied", currentRowsCopied+historyRowsCopied),
		logging.Int64("current state data size", currentStateBytesCopied),
		logging.Int64("history data size", historyBytesCopied),
	)

	return nil
}
   276  
   277  func currentStateCopySQL(dbMetaData DatabaseMetadata) []TableCopySql {
   278  	var copySQL []TableCopySql
   279  	tablesNames := maps.Keys(dbMetaData.TableNameToMetaData)
   280  	sort.Strings(tablesNames)
   281  
   282  	for _, tableName := range tablesNames {
   283  		meta := dbMetaData.TableNameToMetaData[tableName]
   284  		if !dbMetaData.TableNameToMetaData[tableName].Hypertable {
   285  			tableCopySQL := fmt.Sprintf(`copy (select * from %s order by %s) TO STDOUT WITH (FORMAT csv, HEADER) `, tableName,
   286  				meta.SortOrder)
   287  			copySQL = append(copySQL, TableCopySql{meta, tableCopySQL})
   288  		}
   289  	}
   290  	return copySQL
   291  }
   292  
   293  func historyCopySQL(dbMetaData DatabaseMetadata, segment interface{ GetFromHeight() int64 }) []TableCopySql {
   294  	var copySQL []TableCopySql
   295  	tablesNames := maps.Keys(dbMetaData.TableNameToMetaData)
   296  	sort.Strings(tablesNames)
   297  
   298  	for _, tableName := range tablesNames {
   299  		meta := dbMetaData.TableNameToMetaData[tableName]
   300  		if dbMetaData.TableNameToMetaData[tableName].Hypertable {
   301  			partitionColumn := dbMetaData.TableNameToMetaData[tableName].PartitionColumn
   302  			hyperTableCopySQL := fmt.Sprintf(`copy (select * from %s where %s >= (SELECT vega_time from blocks where height = %d) order by %s) to STDOUT (FORMAT csv, HEADER)`,
   303  				tableName,
   304  				partitionColumn,
   305  				segment.GetFromHeight(),
   306  				meta.SortOrder)
   307  			copySQL = append(copySQL, TableCopySql{meta, hyperTableCopySQL})
   308  		}
   309  	}
   310  	return copySQL
   311  }
   312  
   313  func copyTablesData(ctx context.Context, tx pgx.Tx, copySQL []TableCopySql, toDir string) (int64, int64, error) {
   314  	var totalRowsCopied int64
   315  	var totalBytesCopied int64
   316  	for _, tableSql := range copySQL {
   317  		filePath := path.Join(toDir, tableSql.metaData.Name)
   318  		numRowsCopied, bytesCopied, err := writeTableToDataFile(ctx, tx, filePath, tableSql)
   319  		if err != nil {
   320  			return 0, 0, fmt.Errorf("failed to write table %s to file %s:%w", tableSql.metaData.Name, filePath, err)
   321  		}
   322  		totalRowsCopied += numRowsCopied
   323  		totalBytesCopied += bytesCopied
   324  	}
   325  
   326  	return totalRowsCopied, totalBytesCopied, nil
   327  }
   328  
   329  func writeTableToDataFile(ctx context.Context, tx pgx.Tx, filePath string, tableSql TableCopySql) (int64, int64, error) {
   330  	file, err := os.Create(filePath)
   331  	if err != nil {
   332  		return 0, 0, fmt.Errorf("failed to create file %s:%w", filePath, err)
   333  	}
   334  	defer file.Close()
   335  
   336  	fileWriter := vio.NewCountWriter(file)
   337  
   338  	numRowsCopied, err := executeCopy(ctx, tx, tableSql, fileWriter)
   339  	if err != nil {
   340  		return 0, 0, fmt.Errorf("failed to execute copy: %w", err)
   341  	}
   342  	return numRowsCopied, fileWriter.Count(), nil
   343  }
   344  
   345  func executeCopy(ctx context.Context, tx pgx.Tx, tableSql TableCopySql, w io.Writer) (int64, error) {
   346  	defer metrics.StartNetworkHistoryCopy(tableSql.metaData.Name)()
   347  
   348  	tag, err := tx.Conn().PgConn().CopyTo(ctx, w, tableSql.copySql)
   349  	if err != nil {
   350  		return 0, fmt.Errorf("failed to execute copy sql %s: %w", tableSql.copySql, err)
   351  	}
   352  
   353  	rowsCopied := tag.RowsAffected()
   354  	metrics.NetworkHistoryRowsCopied(tableSql.metaData.Name, rowsCopied)
   355  
   356  	return rowsCopied, nil
   357  }
   358  
   359  func (b *Service) GetUnpublishedSnapshots() ([]segment.Unpublished, error) {
   360  	files, err := os.ReadDir(b.copyToPath)
   361  	if err != nil {
   362  		return nil, fmt.Errorf("failed to get files in snapshot directory:%w", err)
   363  	}
   364  
   365  	segments := []segment.Unpublished{}
   366  	chainID := ""
   367  	for _, file := range files {
   368  		if file.IsDir() {
   369  			baseSegment, err := segment.NewFromSnapshotDataDirectory(file.Name())
   370  			if err != nil {
   371  				continue
   372  			}
   373  			segment := segment.Unpublished{
   374  				Base:      baseSegment,
   375  				Directory: b.copyToPath,
   376  			}
   377  
   378  			if len(chainID) == 0 {
   379  				chainID = segment.ChainID
   380  			}
   381  
   382  			if segment.ChainID != chainID {
   383  				return nil, fmt.Errorf("current state snapshots for multiple chain ids exist in snapshots directory %s", b.copyToPath)
   384  			}
   385  
   386  			lockFileExists, err := fs.FileExists(segment.InProgressFilePath())
   387  			if err != nil {
   388  				return nil, fmt.Errorf("failed to check for lock file:%w", err)
   389  			}
   390  
   391  			if lockFileExists {
   392  				continue
   393  			}
   394  			segments = append(segments, segment)
   395  		}
   396  	}
   397  
   398  	return segments, nil
   399  }
   400  
// TableCopySql pairs a table's metadata with the COPY statement that exports
// its snapshot data.
type TableCopySql struct {
	metaData TableMetadata // metadata for the table being copied
	copySql  string        // COPY ... TO STDOUT statement exporting the table's rows
}