code.vegaprotocol.io/vega@v0.79.0/datanode/networkhistory/snapshot/service_create_snapshot.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package snapshot 17 18 import ( 19 "context" 20 "errors" 21 "fmt" 22 "io" 23 "os" 24 "path" 25 "sort" 26 "time" 27 28 "code.vegaprotocol.io/vega/datanode/metrics" 29 "code.vegaprotocol.io/vega/datanode/networkhistory/segment" 30 "code.vegaprotocol.io/vega/datanode/sqlstore" 31 "code.vegaprotocol.io/vega/libs/fs" 32 vio "code.vegaprotocol.io/vega/libs/io" 33 "code.vegaprotocol.io/vega/logging" 34 35 "github.com/georgysavva/scany/pgxscan" 36 "github.com/jackc/pgx/v4" 37 "golang.org/x/exp/maps" 38 ) 39 40 var ( 41 ErrSnapshotExists = errors.New("snapshot exists") 42 ErrNoLastSnapshot = errors.New("no last snapshot") 43 ) 44 45 func (b *Service) CreateSnapshot(ctx context.Context, chainID string, toHeight int64) (segment.Unpublished, error) { 46 return b.createNewSnapshot(ctx, chainID, toHeight, false) 47 } 48 49 func (b *Service) CreateSnapshotAsynchronously(ctx context.Context, chainID string, toHeight int64) (segment.Unpublished, error) { 50 return b.createNewSnapshot(ctx, chainID, toHeight, true) 51 } 52 53 func (b *Service) createNewSnapshot(ctx context.Context, chainID string, toHeight int64, 54 async bool, 55 ) (segment.Unpublished, error) { 56 var err error 57 if len(chainID) == 0 { 58 return segment.Unpublished{}, fmt.Errorf("chain id is required") 59 } 60 61 dbMetaData, err := NewDatabaseMetaData(ctx, b.connPool) 62 if err != nil { 63 return segment.Unpublished{}, fmt.Errorf("failed to get data dump metadata: %w", err) 64 } 65 66 var cleanUp []func() 67 ctxWithTimeout, cancelFn := context.WithTimeout(ctx, b.config.WaitForCreationLockTimeout.Duration) 68 defer cancelFn() 69 70 // This lock ensures snapshots cannot be created in parallel, during normal run this should never be an issue 71 // as the time between snapshots is sufficiently large, however during event replay (and some testing/dev scenarios) 72 // the time between snapshots can be sufficiently small to run the risk that snapshotting could overlap without this 73 // lock. 74 if !b.createSnapshotLock.Lock(ctxWithTimeout) { 75 panic("context cancelled whilst waiting for create snapshot lock") 76 } 77 78 cleanUp = append(cleanUp, func() { b.createSnapshotLock.Unlock() }) 79 80 copyDataTx, err := b.connPool.Begin(ctx) 81 if err != nil { 82 runAllInReverseOrder(cleanUp) 83 return segment.Unpublished{}, fmt.Errorf("failed to begin copy table data transaction: %w", err) 84 } 85 // Rolling back a committed transaction does nothing 86 cleanUp = append(cleanUp, func() { _ = copyDataTx.Rollback(ctx) }) 87 88 if _, err = copyDataTx.Exec(ctx, "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE"); err != nil { 89 runAllInReverseOrder(cleanUp) 90 return segment.Unpublished{}, fmt.Errorf("failed to set transaction isolation level to serilizable: %w", err) 91 } 92 93 nextSpan, err := getNextSnapshotSpan(ctx, toHeight, copyDataTx) 94 if err != nil { 95 runAllInReverseOrder(cleanUp) 96 if errors.Is(err, ErrSnapshotExists) { 97 return segment.Unpublished{}, ErrSnapshotExists 98 } 99 return segment.Unpublished{}, fmt.Errorf("failed to get next snapshot span:%w", err) 100 } 101 102 s := segment.Unpublished{ 103 Base: segment.Base{ 104 HeightFrom: nextSpan.FromHeight, 105 HeightTo: nextSpan.ToHeight, 106 DatabaseVersion: dbMetaData.DatabaseVersion, 107 ChainID: chainID, 108 }, 109 Directory: b.copyToPath, 110 } 111 112 b.log.Infof("creating snapshot for %+v", s) 113 114 if _, err = os.Create(s.InProgressFilePath()); err != nil { 115 runAllInReverseOrder(cleanUp) 116 return segment.Unpublished{}, fmt.Errorf("failed to create write lock file:%w", err) 117 } 118 cleanUp = append(cleanUp, func() { _ = os.Remove(s.InProgressFilePath()) }) 119 120 // To ensure reads are isolated from this point forward execute a read on last block 121 _, err = sqlstore.GetLastBlockUsingConnection(ctx, copyDataTx) 122 if err != nil { 123 runAllInReverseOrder(cleanUp) 124 return segment.Unpublished{}, fmt.Errorf("failed to get last block using connection: %w", err) 125 } 126 127 snapshotData := func() { 128 defer func() { runAllInReverseOrder(cleanUp) }() 129 err = b.snapshotData(ctx, copyDataTx, dbMetaData, s) 130 if err != nil { 131 b.log.Panic("failed to snapshot data", logging.Error(err)) 132 } 133 } 134 135 if async { 136 go snapshotData() 137 } else { 138 snapshotData() 139 } 140 141 return s, nil 142 } 143 144 func getNextSnapshotSpan(ctx context.Context, toHeight int64, copyDataTx pgx.Tx) (Span, error) { 145 lastSnapshotSpan, err := getLastSnapshotSpan(ctx, copyDataTx) 146 147 var nextSpan Span 148 if err != nil { 149 if errors.Is(err, ErrNoLastSnapshot) { 150 oldestHistoryBlock, err := sqlstore.GetOldestHistoryBlockUsingConnection(ctx, copyDataTx) 151 if err != nil { 152 return Span{}, fmt.Errorf("failed to get oldest history block:%w", err) 153 } 154 nextSpan = Span{ 155 FromHeight: oldestHistoryBlock.Height, 156 ToHeight: toHeight, 157 } 158 } else { 159 return nextSpan, fmt.Errorf("failed to get last snapshot span:%w", err) 160 } 161 } else { 162 if toHeight < lastSnapshotSpan.ToHeight { 163 return Span{}, fmt.Errorf("toHeight %d is less than last snapshot span %+v", toHeight, lastSnapshotSpan) 164 } 165 166 if toHeight == lastSnapshotSpan.ToHeight { 167 return Span{}, ErrSnapshotExists 168 } 169 170 nextSpan = Span{FromHeight: lastSnapshotSpan.ToHeight + 1, ToHeight: toHeight} 171 } 172 173 err = setLastSnapshotSpan(ctx, copyDataTx, nextSpan.FromHeight, nextSpan.ToHeight) 174 if err != nil { 175 return Span{}, fmt.Errorf("failed to set last snapshot span:%w", err) 176 } 177 178 return nextSpan, nil 179 } 180 181 type Span struct { 182 FromHeight int64 183 ToHeight int64 184 } 185 186 func setLastSnapshotSpan(ctx context.Context, connection sqlstore.Connection, fromHeight, toHeight int64) error { 187 _, err := connection.Exec(ctx, `Insert into last_snapshot_span (from_height, to_height) VALUES($1, $2) 188 on conflict(onerow_check) do update set from_height=EXCLUDED.from_height, to_height=EXCLUDED.to_height`, 189 fromHeight, toHeight) 190 if err != nil { 191 return fmt.Errorf("failed to update last_snapshot_span table:%w", err) 192 } 193 return nil 194 } 195 196 func getLastSnapshotSpan(ctx context.Context, connection sqlstore.Connection) (*Span, error) { 197 ls := &Span{} 198 err := pgxscan.Get(ctx, connection, ls, 199 `SELECT from_height, to_height 200 FROM last_snapshot_span`) 201 202 if errors.Is(err, pgx.ErrNoRows) { 203 return nil, ErrNoLastSnapshot 204 } 205 206 return ls, err 207 } 208 209 func runAllInReverseOrder(functions []func()) { 210 for i := len(functions) - 1; i >= 0; i-- { 211 functions[i]() 212 } 213 } 214 215 func (b *Service) snapshotData(ctx context.Context, tx pgx.Tx, dbMetaData DatabaseMetadata, seg segment.Unpublished) error { 216 defer func() { 217 // Calling rollback on a committed transaction has no effect, hence we can rollback in defer to ensure 218 // always rolled back if the transaction was not successfully committed 219 _ = tx.Rollback(ctx) 220 }() 221 222 if _, err := tx.Exec(ctx, "SET TIME ZONE 0"); err != nil { 223 return fmt.Errorf("failed to set timezone to UTC:%w", err) 224 } 225 226 start := time.Now() 227 b.log.Infof("copying all table data....") 228 229 currentStateDir := path.Join(seg.UnpublishedSnapshotDataDirectory(), "currentstate") 230 historyStateDir := path.Join(seg.UnpublishedSnapshotDataDirectory(), "history") 231 232 err := os.MkdirAll(currentStateDir, os.ModePerm) 233 if err != nil { 234 return fmt.Errorf("failed to create current state directory:%w", err) 235 } 236 237 err = os.MkdirAll(historyStateDir, os.ModePerm) 238 if err != nil { 239 return fmt.Errorf("failed to create history state directory:%w", err) 240 } 241 242 // Write Current State 243 currentSQL := currentStateCopySQL(dbMetaData) 244 currentRowsCopied, currentStateBytesCopied, err := copyTablesData(ctx, tx, currentSQL, currentStateDir) 245 if err != nil { 246 return fmt.Errorf("failed to copy current state table data:%w", err) 247 } 248 249 // Write History 250 historySQL := historyCopySQL(dbMetaData, seg) 251 historyRowsCopied, historyBytesCopied, err := copyTablesData(ctx, tx, historySQL, historyStateDir) 252 if err != nil { 253 return fmt.Errorf("failed to copy history table data:%w", err) 254 } 255 256 err = tx.Commit(ctx) 257 if err != nil { 258 return fmt.Errorf("failed to commit snapshot transaction:%w", err) 259 } 260 261 metrics.SetLastSnapshotRowcount(float64(currentRowsCopied + historyRowsCopied)) 262 metrics.SetLastSnapshotCurrentStateBytes(float64(currentStateBytesCopied)) 263 metrics.SetLastSnapshotHistoryBytes(float64(historyBytesCopied)) 264 metrics.SetLastSnapshotSeconds(time.Since(start).Seconds()) 265 266 b.log.Info("finished creating snapshot for chain", logging.String("chain", seg.ChainID), 267 logging.Int64("from height", seg.HeightFrom), 268 logging.Int64("to height", seg.HeightTo), logging.Duration("time taken", time.Since(start)), 269 logging.Int64("rows copied", currentRowsCopied+historyRowsCopied), 270 logging.Int64("current state data size", currentStateBytesCopied), 271 logging.Int64("history data size", historyBytesCopied), 272 ) 273 274 return nil 275 } 276 277 func currentStateCopySQL(dbMetaData DatabaseMetadata) []TableCopySql { 278 var copySQL []TableCopySql 279 tablesNames := maps.Keys(dbMetaData.TableNameToMetaData) 280 sort.Strings(tablesNames) 281 282 for _, tableName := range tablesNames { 283 meta := dbMetaData.TableNameToMetaData[tableName] 284 if !dbMetaData.TableNameToMetaData[tableName].Hypertable { 285 tableCopySQL := fmt.Sprintf(`copy (select * from %s order by %s) TO STDOUT WITH (FORMAT csv, HEADER) `, tableName, 286 meta.SortOrder) 287 copySQL = append(copySQL, TableCopySql{meta, tableCopySQL}) 288 } 289 } 290 return copySQL 291 } 292 293 func historyCopySQL(dbMetaData DatabaseMetadata, segment interface{ GetFromHeight() int64 }) []TableCopySql { 294 var copySQL []TableCopySql 295 tablesNames := maps.Keys(dbMetaData.TableNameToMetaData) 296 sort.Strings(tablesNames) 297 298 for _, tableName := range tablesNames { 299 meta := dbMetaData.TableNameToMetaData[tableName] 300 if dbMetaData.TableNameToMetaData[tableName].Hypertable { 301 partitionColumn := dbMetaData.TableNameToMetaData[tableName].PartitionColumn 302 hyperTableCopySQL := fmt.Sprintf(`copy (select * from %s where %s >= (SELECT vega_time from blocks where height = %d) order by %s) to STDOUT (FORMAT csv, HEADER)`, 303 tableName, 304 partitionColumn, 305 segment.GetFromHeight(), 306 meta.SortOrder) 307 copySQL = append(copySQL, TableCopySql{meta, hyperTableCopySQL}) 308 } 309 } 310 return copySQL 311 } 312 313 func copyTablesData(ctx context.Context, tx pgx.Tx, copySQL []TableCopySql, toDir string) (int64, int64, error) { 314 var totalRowsCopied int64 315 var totalBytesCopied int64 316 for _, tableSql := range copySQL { 317 filePath := path.Join(toDir, tableSql.metaData.Name) 318 numRowsCopied, bytesCopied, err := writeTableToDataFile(ctx, tx, filePath, tableSql) 319 if err != nil { 320 return 0, 0, fmt.Errorf("failed to write table %s to file %s:%w", tableSql.metaData.Name, filePath, err) 321 } 322 totalRowsCopied += numRowsCopied 323 totalBytesCopied += bytesCopied 324 } 325 326 return totalRowsCopied, totalBytesCopied, nil 327 } 328 329 func writeTableToDataFile(ctx context.Context, tx pgx.Tx, filePath string, tableSql TableCopySql) (int64, int64, error) { 330 file, err := os.Create(filePath) 331 if err != nil { 332 return 0, 0, fmt.Errorf("failed to create file %s:%w", filePath, err) 333 } 334 defer file.Close() 335 336 fileWriter := vio.NewCountWriter(file) 337 338 numRowsCopied, err := executeCopy(ctx, tx, tableSql, fileWriter) 339 if err != nil { 340 return 0, 0, fmt.Errorf("failed to execute copy: %w", err) 341 } 342 return numRowsCopied, fileWriter.Count(), nil 343 } 344 345 func executeCopy(ctx context.Context, tx pgx.Tx, tableSql TableCopySql, w io.Writer) (int64, error) { 346 defer metrics.StartNetworkHistoryCopy(tableSql.metaData.Name)() 347 348 tag, err := tx.Conn().PgConn().CopyTo(ctx, w, tableSql.copySql) 349 if err != nil { 350 return 0, fmt.Errorf("failed to execute copy sql %s: %w", tableSql.copySql, err) 351 } 352 353 rowsCopied := tag.RowsAffected() 354 metrics.NetworkHistoryRowsCopied(tableSql.metaData.Name, rowsCopied) 355 356 return rowsCopied, nil 357 } 358 359 func (b *Service) GetUnpublishedSnapshots() ([]segment.Unpublished, error) { 360 files, err := os.ReadDir(b.copyToPath) 361 if err != nil { 362 return nil, fmt.Errorf("failed to get files in snapshot directory:%w", err) 363 } 364 365 segments := []segment.Unpublished{} 366 chainID := "" 367 for _, file := range files { 368 if file.IsDir() { 369 baseSegment, err := segment.NewFromSnapshotDataDirectory(file.Name()) 370 if err != nil { 371 continue 372 } 373 segment := segment.Unpublished{ 374 Base: baseSegment, 375 Directory: b.copyToPath, 376 } 377 378 if len(chainID) == 0 { 379 chainID = segment.ChainID 380 } 381 382 if segment.ChainID != chainID { 383 return nil, fmt.Errorf("current state snapshots for multiple chain ids exist in snapshots directory %s", b.copyToPath) 384 } 385 386 lockFileExists, err := fs.FileExists(segment.InProgressFilePath()) 387 if err != nil { 388 return nil, fmt.Errorf("failed to check for lock file:%w", err) 389 } 390 391 if lockFileExists { 392 continue 393 } 394 segments = append(segments, segment) 395 } 396 } 397 398 return segments, nil 399 } 400 401 type TableCopySql struct { 402 metaData TableMetadata 403 copySql string 404 }