code.vegaprotocol.io/vega@v0.79.0/datanode/networkhistory/service.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package networkhistory 17 18 import ( 19 "context" 20 "errors" 21 "fmt" 22 "io" 23 "sort" 24 "strings" 25 "sync" 26 "time" 27 28 "code.vegaprotocol.io/vega/datanode/networkhistory/segment" 29 "code.vegaprotocol.io/vega/datanode/networkhistory/snapshot" 30 "code.vegaprotocol.io/vega/datanode/networkhistory/store" 31 "code.vegaprotocol.io/vega/datanode/sqlstore" 32 "code.vegaprotocol.io/vega/logging" 33 v2 "code.vegaprotocol.io/vega/protos/data-node/api/v2" 34 35 "github.com/jackc/pgx/v4/pgxpool" 36 "github.com/multiformats/go-multiaddr" 37 ) 38 39 type Service struct { 40 cfg Config 41 42 log *logging.Logger 43 connPool *pgxpool.Pool 44 45 snapshotService *snapshot.Service 46 store *store.Store 47 48 chainID string 49 50 snapshotsCopyToPath string 51 52 datanodeGrpcAPIPort int 53 54 publishLock sync.Mutex 55 } 56 57 func New(ctx context.Context, log *logging.Logger, chainID string, cfg Config, connPool *pgxpool.Pool, 58 snapshotService *snapshot.Service, 59 networkHistoryStore *store.Store, datanodeGrpcAPIPort int, 60 snapshotsCopyToPath string, 61 ) (*Service, error) { 62 s := &Service{ 63 cfg: cfg, 64 log: log, 65 connPool: connPool, 66 snapshotService: snapshotService, 67 store: networkHistoryStore, 68 chainID: chainID, 69 snapshotsCopyToPath: snapshotsCopyToPath, 70 datanodeGrpcAPIPort: datanodeGrpcAPIPort, 71 } 72 73 if cfg.Publish { 74 var err error 75 go func() { 76 ticker := time.NewTicker(5 * time.Second) 77 for { 78 select { 79 case <-ctx.Done(): 80 return 81 case <-ticker.C: 82 err = s.PublishSegments(ctx) 83 if err != nil { 84 s.log.Errorf("failed to add all snapshot data to store:%s", err) 85 } 86 } 87 } 88 }() 89 } 90 91 return s, nil 92 } 93 94 func (d *Service) RollbackToHeight(ctx context.Context, log snapshot.LoadLog, height int64) error { 95 datanodeBlockSpan, err := sqlstore.GetDatanodeBlockSpan(ctx, d.connPool) 96 if err != nil { 97 return fmt.Errorf("failed to get data node block span: %w", err) 98 } 99 100 if height < datanodeBlockSpan.FromHeight || height >= datanodeBlockSpan.ToHeight { 101 return fmt.Errorf("rollback to height, %d, is not within the datanodes current block span, %d to %d", 102 height, datanodeBlockSpan.FromHeight, datanodeBlockSpan.ToHeight) 103 } 104 105 rollbackToSegment, err := d.store.GetSegmentForHeight(height) 106 if err != nil { 107 return fmt.Errorf("failed to get history segment for height %d: %w", height, err) 108 } 109 110 err = d.snapshotService.RollbackToSegment(ctx, log, rollbackToSegment) 111 112 if err != nil { 113 return fmt.Errorf("failed to rollback to segment: %w", err) 114 } 115 116 entries, err := d.store.ListAllIndexEntriesMostRecentFirst() 117 if err != nil { 118 return fmt.Errorf("failed to list all entries: %w", err) 119 } 120 121 var segmentsToRemove []segment.Full 122 for _, entry := range entries { 123 if entry.HeightTo > rollbackToSegment.HeightTo { 124 segmentsToRemove = append(segmentsToRemove, entry) 125 } else { 126 break 127 } 128 } 129 130 if err = d.store.RemoveSegments(ctx, segmentsToRemove); err != nil { 131 return fmt.Errorf("failed to remove segments: %w", err) 132 } 133 134 log.Infof("finished rolling back to height %d", height) 135 136 return nil 137 } 138 139 func (d *Service) GetHistorySegmentReader(ctx context.Context, historySegmentID string) (io.ReadSeekCloser, int64, error) { 140 return d.store.GetHistorySegmentReader(ctx, historySegmentID) 141 } 142 143 func (d *Service) CopyHistorySegmentToFile(ctx context.Context, historySegmentID string, outFile string) error { 144 return d.store.CopyHistorySegmentToFile(ctx, historySegmentID, outFile) 145 } 146 147 func (d *Service) GetHighestBlockHeightHistorySegment() (segment.Full, error) { 148 return d.store.GetHighestBlockHeightEntry() 149 } 150 151 func (d *Service) ListAllHistorySegments() (segment.Segments[segment.Full], error) { 152 return d.store.ListAllIndexEntriesOldestFirst() 153 } 154 155 func (d *Service) FetchHistorySegment(parentCtx context.Context, historySegmentID string) (segment.Full, error) { 156 // An IPFS fetch will hang on a reasonably frequent basis. Issuing a re-fetch resolves this 157 // most of the time. In the case where the fetch hangs, some of the blocks for the context 158 // will usually have been retrieved, such that subsequent fetch has fewer blocks to fetch. 159 // From experimentation, the very simple retry logic below seems to give a good trade off between average time 160 // taken to fetch a segment in the case where it hangs and ensuring that the segment is eventually fetched. 161 var err error 162 for retry := 1; retry <= d.cfg.FetchRetryMax; retry++ { 163 contextTimeout := d.cfg.RetryTimeout.Duration * time.Duration(retry) 164 d.log.Infof("fetching history segment %s (attempt %d, timeout %s)", historySegmentID, retry, contextTimeout) 165 ctx, cancelFn := context.WithTimeout(parentCtx, contextTimeout) 166 segment, err := d.store.FetchHistorySegment(ctx, historySegmentID) 167 cancelFn() 168 if err == nil { 169 return segment, nil 170 } 171 d.log.Warningf("failed to fetch segment: %s", err) 172 } 173 174 return segment.Full{}, fmt.Errorf("failed to fetch history segment %s after %d attempts: %w", historySegmentID, d.cfg.FetchRetryMax, err) 175 } 176 177 func (d *Service) CreateAndPublishSegment(ctx context.Context, chainID string, toHeight int64) error { 178 _, err := d.snapshotService.CreateSnapshot(ctx, chainID, toHeight) 179 if err != nil { 180 if !errors.Is(err, snapshot.ErrSnapshotExists) { 181 return fmt.Errorf("failed to create snapshot: %w", err) 182 } 183 } 184 185 if err = d.PublishSegments(ctx); err != nil { 186 return fmt.Errorf("failed to publish snapshots: %w", err) 187 } 188 189 return nil 190 } 191 192 func (d *Service) GetBootstrapPeers() []string { 193 return d.cfg.Store.BootstrapPeers 194 } 195 196 func (d *Service) GetSwarmKey() string { 197 return d.store.GetSwarmKey() 198 } 199 200 func (d *Service) GetIpfsAddress() (string, error) { 201 node, err := d.store.GetLocalNode() 202 if err != nil { 203 return "", fmt.Errorf("failed to load node: %w", err) 204 } 205 206 ipfsAddress, err := node.IpfsAddress() 207 if err != nil { 208 return "", fmt.Errorf("failed to get ipfs address: %w", err) 209 } 210 211 return ipfsAddress.String(), nil 212 } 213 214 func (d *Service) GetConnectedPeerAddresses() ([]string, error) { 215 connectedPeers := d.store.GetConnectedPeers() 216 217 addr := make([]string, 0, len(connectedPeers)) 218 for _, peer := range connectedPeers { 219 ipfsAddress, err := peer.Remote.IpfsAddress() 220 if err != nil { 221 return nil, fmt.Errorf("failed to get ipfs address of remote peer: %w", err) 222 } 223 addr = append(addr, ipfsAddress.String()) 224 } 225 226 return addr, nil 227 } 228 229 func (d *Service) GetActivePeerIPAddresses() []string { 230 ip4Protocol := multiaddr.ProtocolWithName("ip4") 231 ip6Protocol := multiaddr.ProtocolWithName("ip6") 232 var activePeerIPAddresses []string 233 234 activePeerIPAddresses = nil 235 connectedPeers := d.store.GetConnectedPeers() 236 237 for _, addr := range connectedPeers { 238 ipAddr, err := addr.Remote.Addr.ValueForProtocol(ip4Protocol.Code) 239 if err == nil { 240 activePeerIPAddresses = append(activePeerIPAddresses, ipAddr) 241 } 242 243 ipAddr, err = addr.Remote.Addr.ValueForProtocol(ip6Protocol.Code) 244 if err == nil { 245 activePeerIPAddresses = append(activePeerIPAddresses, ipAddr) 246 } 247 } 248 249 return activePeerIPAddresses 250 } 251 252 func (d *Service) GetSwarmKeySeed() string { 253 return d.store.GetSwarmKeySeed() 254 } 255 256 func (d *Service) LoadNetworkHistoryIntoDatanode(ctx context.Context, chunk segment.ContiguousHistory[segment.Full], 257 connConfig sqlstore.ConnectionConfig, withIndexesAndOrderTriggers, verbose bool, 258 ) (snapshot.LoadResult, error) { 259 return d.LoadNetworkHistoryIntoDatanodeWithLog(ctx, d.log, chunk, connConfig, withIndexesAndOrderTriggers, verbose) 260 } 261 262 func (d *Service) LoadNetworkHistoryIntoDatanodeWithLog(ctx context.Context, log snapshot.LoadLog, chunk segment.ContiguousHistory[segment.Full], 263 connConfig sqlstore.ConnectionConfig, withIndexesAndOrderTriggers, verbose bool, 264 ) (snapshot.LoadResult, error) { 265 maxRetries := 3 266 // the deadlock error that should trigger a retry 267 status := "deadlock detected (SQLSTATE 40P01)" 268 datanodeBlockSpan, err := sqlstore.GetDatanodeBlockSpan(ctx, d.connPool) 269 if err != nil { 270 return snapshot.LoadResult{}, fmt.Errorf("failed to get data node block span: %w", err) 271 } 272 273 log.Info("loading network history into the datanode", logging.Int64("fromHeight", chunk.HeightFrom), 274 logging.Int64("toHeight", chunk.HeightFrom), logging.Int64("currentDatanodeFromHeight", datanodeBlockSpan.FromHeight), 275 logging.Int64("currentDatanodeToHeight", datanodeBlockSpan.ToHeight), logging.Bool("withIndexesAndOrderTriggers", withIndexesAndOrderTriggers)) 276 277 start := time.Now() 278 279 var rErr error // return error 280 chunks := chunk.Slice(datanodeBlockSpan.ToHeight+1, chunk.HeightTo) 281 for retries := 0; retries < maxRetries; retries++ { 282 loadResult, err := d.snapshotService.LoadSnapshotData(ctx, log, chunks, connConfig, withIndexesAndOrderTriggers, verbose) 283 if err == nil { 284 log.Info("loaded all available data into datanode", 285 logging.String("result", fmt.Sprintf("%+v", loadResult)), 286 logging.Duration("time taken", time.Since(start)), 287 logging.Int("retry-count", retries), 288 ) 289 return loadResult, nil 290 } 291 // keep track of the last error 292 rErr = err 293 if !strings.Contains(err.Error(), status) { 294 // some error other than 40P01 encountered 295 break 296 } 297 } 298 // retries still ended up failing 299 return snapshot.LoadResult{}, fmt.Errorf("failed to load snapshot data:%w", rErr) 300 } 301 302 func (d *Service) GetMostRecentHistorySegmentFromBootstrapPeers(ctx context.Context, 303 grpcAPIPorts []int, 304 ) (*PeerResponse, map[string]*v2.GetMostRecentNetworkHistorySegmentResponse, error) { 305 bootstrapPeers := d.GetBootstrapPeers() 306 if len(bootstrapPeers) == 0 { 307 return nil, nil, errors.New("no bootstrap peers found") 308 } 309 310 ip4Protocol := multiaddr.ProtocolWithName("ip4") 311 ip6Protocol := multiaddr.ProtocolWithName("ip6") 312 dnsProtocol := multiaddr.ProtocolWithName("dns") 313 314 bootstrapPeerAddresses := make([]string, 0, len(bootstrapPeers)) 315 316 for _, bootstrapPeer := range bootstrapPeers { 317 addr, err := multiaddr.NewMultiaddr(bootstrapPeer) 318 if err != nil { 319 return nil, nil, fmt.Errorf("failed to parse bootstrap peer address %s: %w", bootstrapPeer, err) 320 } 321 322 ipAddr, err := addr.ValueForProtocol(ip4Protocol.Code) 323 if err == nil { 324 bootstrapPeerAddresses = append(bootstrapPeerAddresses, ipAddr) 325 } 326 327 ipAddr, err = addr.ValueForProtocol(ip6Protocol.Code) 328 if err == nil { 329 bootstrapPeerAddresses = append(bootstrapPeerAddresses, ipAddr) 330 } 331 332 dnsAddr, err := addr.ValueForProtocol(dnsProtocol.Code) 333 if err == nil { 334 bootstrapPeerAddresses = append(bootstrapPeerAddresses, dnsAddr) 335 } 336 } 337 338 return GetMostRecentHistorySegmentFromPeersAddresses(ctx, bootstrapPeerAddresses, d.GetSwarmKeySeed(), grpcAPIPorts) 339 } 340 341 func (d *Service) GetDatanodeBlockSpan(ctx context.Context) (sqlstore.DatanodeBlockSpan, error) { 342 return sqlstore.GetDatanodeBlockSpan(ctx, d.connPool) 343 } 344 345 func (d *Service) PublishSegments(ctx context.Context) error { 346 d.publishLock.Lock() 347 defer d.publishLock.Unlock() 348 349 segments, err := d.snapshotService.GetUnpublishedSnapshots() 350 if err != nil { 351 return fmt.Errorf("failed to list snapshots:%w", err) 352 } 353 354 sort.Slice(segments, func(i, j int) bool { 355 return segments[i].HeightTo < segments[j].HeightTo 356 }) 357 358 for _, segment := range segments { 359 err = d.store.AddSnapshotData(ctx, segment) 360 if err != nil { 361 return fmt.Errorf("failed to publish snapshot %s:%w", segment, err) 362 } 363 } 364 365 return nil 366 } 367 368 func (d *Service) Stop() { 369 d.log.Info("stopping network history service") 370 d.store.Stop() 371 d.connPool.Close() 372 } 373 374 func KillAllConnectionsToDatabase(ctx context.Context, connConfig sqlstore.ConnectionConfig) error { 375 conn, err := pgxpool.Connect(ctx, connConfig.GetConnectionString()) 376 if err != nil { 377 return fmt.Errorf("unable to connect to database: %w", err) 378 } 379 defer conn.Close() 380 381 killAllConnectionsQuery := fmt.Sprintf( 382 `SELECT 383 pg_terminate_backend(pg_stat_activity.pid) 384 FROM 385 pg_stat_activity 386 WHERE 387 pg_stat_activity.datname = '%s' 388 AND pid <> pg_backend_pid();`, connConfig.Database) 389 390 _, err = conn.Exec(ctx, killAllConnectionsQuery) 391 if err != nil { 392 return fmt.Errorf("failed to kill all database connection: %w", err) 393 } 394 395 return nil 396 }