code.vegaprotocol.io/vega@v0.79.0/datanode/networkhistory/service.go (about)

     1  // Copyright (C) 2023 Gobalsky Labs Limited
     2  //
     3  // This program is free software: you can redistribute it and/or modify
     4  // it under the terms of the GNU Affero General Public License as
     5  // published by the Free Software Foundation, either version 3 of the
     6  // License, or (at your option) any later version.
     7  //
     8  // This program is distributed in the hope that it will be useful,
     9  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    10  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    11  // GNU Affero General Public License for more details.
    12  //
    13  // You should have received a copy of the GNU Affero General Public License
    14  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    15  
    16  package networkhistory
    17  
    18  import (
    19  	"context"
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"sort"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	"code.vegaprotocol.io/vega/datanode/networkhistory/segment"
    29  	"code.vegaprotocol.io/vega/datanode/networkhistory/snapshot"
    30  	"code.vegaprotocol.io/vega/datanode/networkhistory/store"
    31  	"code.vegaprotocol.io/vega/datanode/sqlstore"
    32  	"code.vegaprotocol.io/vega/logging"
    33  	v2 "code.vegaprotocol.io/vega/protos/data-node/api/v2"
    34  
    35  	"github.com/jackc/pgx/v4/pgxpool"
    36  	"github.com/multiformats/go-multiaddr"
    37  )
    38  
    39  type Service struct {
    40  	cfg Config
    41  
    42  	log      *logging.Logger
    43  	connPool *pgxpool.Pool
    44  
    45  	snapshotService *snapshot.Service
    46  	store           *store.Store
    47  
    48  	chainID string
    49  
    50  	snapshotsCopyToPath string
    51  
    52  	datanodeGrpcAPIPort int
    53  
    54  	publishLock sync.Mutex
    55  }
    56  
    57  func New(ctx context.Context, log *logging.Logger, chainID string, cfg Config, connPool *pgxpool.Pool,
    58  	snapshotService *snapshot.Service,
    59  	networkHistoryStore *store.Store, datanodeGrpcAPIPort int,
    60  	snapshotsCopyToPath string,
    61  ) (*Service, error) {
    62  	s := &Service{
    63  		cfg:                 cfg,
    64  		log:                 log,
    65  		connPool:            connPool,
    66  		snapshotService:     snapshotService,
    67  		store:               networkHistoryStore,
    68  		chainID:             chainID,
    69  		snapshotsCopyToPath: snapshotsCopyToPath,
    70  		datanodeGrpcAPIPort: datanodeGrpcAPIPort,
    71  	}
    72  
    73  	if cfg.Publish {
    74  		var err error
    75  		go func() {
    76  			ticker := time.NewTicker(5 * time.Second)
    77  			for {
    78  				select {
    79  				case <-ctx.Done():
    80  					return
    81  				case <-ticker.C:
    82  					err = s.PublishSegments(ctx)
    83  					if err != nil {
    84  						s.log.Errorf("failed to add all snapshot data to store:%s", err)
    85  					}
    86  				}
    87  			}
    88  		}()
    89  	}
    90  
    91  	return s, nil
    92  }
    93  
    94  func (d *Service) RollbackToHeight(ctx context.Context, log snapshot.LoadLog, height int64) error {
    95  	datanodeBlockSpan, err := sqlstore.GetDatanodeBlockSpan(ctx, d.connPool)
    96  	if err != nil {
    97  		return fmt.Errorf("failed to get data node block span: %w", err)
    98  	}
    99  
   100  	if height < datanodeBlockSpan.FromHeight || height >= datanodeBlockSpan.ToHeight {
   101  		return fmt.Errorf("rollback to height, %d, is not within the datanodes current block span, %d to %d",
   102  			height, datanodeBlockSpan.FromHeight, datanodeBlockSpan.ToHeight)
   103  	}
   104  
   105  	rollbackToSegment, err := d.store.GetSegmentForHeight(height)
   106  	if err != nil {
   107  		return fmt.Errorf("failed to get history segment for height %d: %w", height, err)
   108  	}
   109  
   110  	err = d.snapshotService.RollbackToSegment(ctx, log, rollbackToSegment)
   111  
   112  	if err != nil {
   113  		return fmt.Errorf("failed to rollback to segment: %w", err)
   114  	}
   115  
   116  	entries, err := d.store.ListAllIndexEntriesMostRecentFirst()
   117  	if err != nil {
   118  		return fmt.Errorf("failed to list all entries: %w", err)
   119  	}
   120  
   121  	var segmentsToRemove []segment.Full
   122  	for _, entry := range entries {
   123  		if entry.HeightTo > rollbackToSegment.HeightTo {
   124  			segmentsToRemove = append(segmentsToRemove, entry)
   125  		} else {
   126  			break
   127  		}
   128  	}
   129  
   130  	if err = d.store.RemoveSegments(ctx, segmentsToRemove); err != nil {
   131  		return fmt.Errorf("failed to remove segments: %w", err)
   132  	}
   133  
   134  	log.Infof("finished rolling back to height %d", height)
   135  
   136  	return nil
   137  }
   138  
   139  func (d *Service) GetHistorySegmentReader(ctx context.Context, historySegmentID string) (io.ReadSeekCloser, int64, error) {
   140  	return d.store.GetHistorySegmentReader(ctx, historySegmentID)
   141  }
   142  
   143  func (d *Service) CopyHistorySegmentToFile(ctx context.Context, historySegmentID string, outFile string) error {
   144  	return d.store.CopyHistorySegmentToFile(ctx, historySegmentID, outFile)
   145  }
   146  
   147  func (d *Service) GetHighestBlockHeightHistorySegment() (segment.Full, error) {
   148  	return d.store.GetHighestBlockHeightEntry()
   149  }
   150  
   151  func (d *Service) ListAllHistorySegments() (segment.Segments[segment.Full], error) {
   152  	return d.store.ListAllIndexEntriesOldestFirst()
   153  }
   154  
   155  func (d *Service) FetchHistorySegment(parentCtx context.Context, historySegmentID string) (segment.Full, error) {
   156  	// An IPFS fetch will hang on a reasonably frequent basis.  Issuing a re-fetch resolves this
   157  	// most of the time. In the case where the fetch hangs, some of the blocks for the context
   158  	// will usually have been retrieved, such that subsequent fetch has fewer blocks to fetch.
   159  	// From experimentation, the very simple retry logic below seems to give a good trade off between average time
   160  	// taken to fetch a segment in the case where it hangs and ensuring that the segment is eventually fetched.
   161  	var err error
   162  	for retry := 1; retry <= d.cfg.FetchRetryMax; retry++ {
   163  		contextTimeout := d.cfg.RetryTimeout.Duration * time.Duration(retry)
   164  		d.log.Infof("fetching history segment %s (attempt %d, timeout %s)", historySegmentID, retry, contextTimeout)
   165  		ctx, cancelFn := context.WithTimeout(parentCtx, contextTimeout)
   166  		segment, err := d.store.FetchHistorySegment(ctx, historySegmentID)
   167  		cancelFn()
   168  		if err == nil {
   169  			return segment, nil
   170  		}
   171  		d.log.Warningf("failed to fetch segment: %s", err)
   172  	}
   173  
   174  	return segment.Full{}, fmt.Errorf("failed to fetch history segment %s after %d attempts: %w", historySegmentID, d.cfg.FetchRetryMax, err)
   175  }
   176  
   177  func (d *Service) CreateAndPublishSegment(ctx context.Context, chainID string, toHeight int64) error {
   178  	_, err := d.snapshotService.CreateSnapshot(ctx, chainID, toHeight)
   179  	if err != nil {
   180  		if !errors.Is(err, snapshot.ErrSnapshotExists) {
   181  			return fmt.Errorf("failed to create snapshot: %w", err)
   182  		}
   183  	}
   184  
   185  	if err = d.PublishSegments(ctx); err != nil {
   186  		return fmt.Errorf("failed to publish snapshots: %w", err)
   187  	}
   188  
   189  	return nil
   190  }
   191  
   192  func (d *Service) GetBootstrapPeers() []string {
   193  	return d.cfg.Store.BootstrapPeers
   194  }
   195  
   196  func (d *Service) GetSwarmKey() string {
   197  	return d.store.GetSwarmKey()
   198  }
   199  
   200  func (d *Service) GetIpfsAddress() (string, error) {
   201  	node, err := d.store.GetLocalNode()
   202  	if err != nil {
   203  		return "", fmt.Errorf("failed to load node: %w", err)
   204  	}
   205  
   206  	ipfsAddress, err := node.IpfsAddress()
   207  	if err != nil {
   208  		return "", fmt.Errorf("failed to get ipfs address: %w", err)
   209  	}
   210  
   211  	return ipfsAddress.String(), nil
   212  }
   213  
   214  func (d *Service) GetConnectedPeerAddresses() ([]string, error) {
   215  	connectedPeers := d.store.GetConnectedPeers()
   216  
   217  	addr := make([]string, 0, len(connectedPeers))
   218  	for _, peer := range connectedPeers {
   219  		ipfsAddress, err := peer.Remote.IpfsAddress()
   220  		if err != nil {
   221  			return nil, fmt.Errorf("failed to get ipfs address of remote peer: %w", err)
   222  		}
   223  		addr = append(addr, ipfsAddress.String())
   224  	}
   225  
   226  	return addr, nil
   227  }
   228  
   229  func (d *Service) GetActivePeerIPAddresses() []string {
   230  	ip4Protocol := multiaddr.ProtocolWithName("ip4")
   231  	ip6Protocol := multiaddr.ProtocolWithName("ip6")
   232  	var activePeerIPAddresses []string
   233  
   234  	activePeerIPAddresses = nil
   235  	connectedPeers := d.store.GetConnectedPeers()
   236  
   237  	for _, addr := range connectedPeers {
   238  		ipAddr, err := addr.Remote.Addr.ValueForProtocol(ip4Protocol.Code)
   239  		if err == nil {
   240  			activePeerIPAddresses = append(activePeerIPAddresses, ipAddr)
   241  		}
   242  
   243  		ipAddr, err = addr.Remote.Addr.ValueForProtocol(ip6Protocol.Code)
   244  		if err == nil {
   245  			activePeerIPAddresses = append(activePeerIPAddresses, ipAddr)
   246  		}
   247  	}
   248  
   249  	return activePeerIPAddresses
   250  }
   251  
   252  func (d *Service) GetSwarmKeySeed() string {
   253  	return d.store.GetSwarmKeySeed()
   254  }
   255  
   256  func (d *Service) LoadNetworkHistoryIntoDatanode(ctx context.Context, chunk segment.ContiguousHistory[segment.Full],
   257  	connConfig sqlstore.ConnectionConfig, withIndexesAndOrderTriggers, verbose bool,
   258  ) (snapshot.LoadResult, error) {
   259  	return d.LoadNetworkHistoryIntoDatanodeWithLog(ctx, d.log, chunk, connConfig, withIndexesAndOrderTriggers, verbose)
   260  }
   261  
   262  func (d *Service) LoadNetworkHistoryIntoDatanodeWithLog(ctx context.Context, log snapshot.LoadLog, chunk segment.ContiguousHistory[segment.Full],
   263  	connConfig sqlstore.ConnectionConfig, withIndexesAndOrderTriggers, verbose bool,
   264  ) (snapshot.LoadResult, error) {
   265  	maxRetries := 3
   266  	// the deadlock error that should trigger a retry
   267  	status := "deadlock detected (SQLSTATE 40P01)"
   268  	datanodeBlockSpan, err := sqlstore.GetDatanodeBlockSpan(ctx, d.connPool)
   269  	if err != nil {
   270  		return snapshot.LoadResult{}, fmt.Errorf("failed to get data node block span: %w", err)
   271  	}
   272  
   273  	log.Info("loading network history into the datanode", logging.Int64("fromHeight", chunk.HeightFrom),
   274  		logging.Int64("toHeight", chunk.HeightFrom), logging.Int64("currentDatanodeFromHeight", datanodeBlockSpan.FromHeight),
   275  		logging.Int64("currentDatanodeToHeight", datanodeBlockSpan.ToHeight), logging.Bool("withIndexesAndOrderTriggers", withIndexesAndOrderTriggers))
   276  
   277  	start := time.Now()
   278  
   279  	var rErr error // return error
   280  	chunks := chunk.Slice(datanodeBlockSpan.ToHeight+1, chunk.HeightTo)
   281  	for retries := 0; retries < maxRetries; retries++ {
   282  		loadResult, err := d.snapshotService.LoadSnapshotData(ctx, log, chunks, connConfig, withIndexesAndOrderTriggers, verbose)
   283  		if err == nil {
   284  			log.Info("loaded all available data into datanode",
   285  				logging.String("result", fmt.Sprintf("%+v", loadResult)),
   286  				logging.Duration("time taken", time.Since(start)),
   287  				logging.Int("retry-count", retries),
   288  			)
   289  			return loadResult, nil
   290  		}
   291  		// keep track of the last error
   292  		rErr = err
   293  		if !strings.Contains(err.Error(), status) {
   294  			// some error other than 40P01 encountered
   295  			break
   296  		}
   297  	}
   298  	// retries still ended up failing
   299  	return snapshot.LoadResult{}, fmt.Errorf("failed to load snapshot data:%w", rErr)
   300  }
   301  
   302  func (d *Service) GetMostRecentHistorySegmentFromBootstrapPeers(ctx context.Context,
   303  	grpcAPIPorts []int,
   304  ) (*PeerResponse, map[string]*v2.GetMostRecentNetworkHistorySegmentResponse, error) {
   305  	bootstrapPeers := d.GetBootstrapPeers()
   306  	if len(bootstrapPeers) == 0 {
   307  		return nil, nil, errors.New("no bootstrap peers found")
   308  	}
   309  
   310  	ip4Protocol := multiaddr.ProtocolWithName("ip4")
   311  	ip6Protocol := multiaddr.ProtocolWithName("ip6")
   312  	dnsProtocol := multiaddr.ProtocolWithName("dns")
   313  
   314  	bootstrapPeerAddresses := make([]string, 0, len(bootstrapPeers))
   315  
   316  	for _, bootstrapPeer := range bootstrapPeers {
   317  		addr, err := multiaddr.NewMultiaddr(bootstrapPeer)
   318  		if err != nil {
   319  			return nil, nil, fmt.Errorf("failed to parse bootstrap peer address %s: %w", bootstrapPeer, err)
   320  		}
   321  
   322  		ipAddr, err := addr.ValueForProtocol(ip4Protocol.Code)
   323  		if err == nil {
   324  			bootstrapPeerAddresses = append(bootstrapPeerAddresses, ipAddr)
   325  		}
   326  
   327  		ipAddr, err = addr.ValueForProtocol(ip6Protocol.Code)
   328  		if err == nil {
   329  			bootstrapPeerAddresses = append(bootstrapPeerAddresses, ipAddr)
   330  		}
   331  
   332  		dnsAddr, err := addr.ValueForProtocol(dnsProtocol.Code)
   333  		if err == nil {
   334  			bootstrapPeerAddresses = append(bootstrapPeerAddresses, dnsAddr)
   335  		}
   336  	}
   337  
   338  	return GetMostRecentHistorySegmentFromPeersAddresses(ctx, bootstrapPeerAddresses, d.GetSwarmKeySeed(), grpcAPIPorts)
   339  }
   340  
   341  func (d *Service) GetDatanodeBlockSpan(ctx context.Context) (sqlstore.DatanodeBlockSpan, error) {
   342  	return sqlstore.GetDatanodeBlockSpan(ctx, d.connPool)
   343  }
   344  
   345  func (d *Service) PublishSegments(ctx context.Context) error {
   346  	d.publishLock.Lock()
   347  	defer d.publishLock.Unlock()
   348  
   349  	segments, err := d.snapshotService.GetUnpublishedSnapshots()
   350  	if err != nil {
   351  		return fmt.Errorf("failed to list snapshots:%w", err)
   352  	}
   353  
   354  	sort.Slice(segments, func(i, j int) bool {
   355  		return segments[i].HeightTo < segments[j].HeightTo
   356  	})
   357  
   358  	for _, segment := range segments {
   359  		err = d.store.AddSnapshotData(ctx, segment)
   360  		if err != nil {
   361  			return fmt.Errorf("failed to publish snapshot %s:%w", segment, err)
   362  		}
   363  	}
   364  
   365  	return nil
   366  }
   367  
   368  func (d *Service) Stop() {
   369  	d.log.Info("stopping network history service")
   370  	d.store.Stop()
   371  	d.connPool.Close()
   372  }
   373  
   374  func KillAllConnectionsToDatabase(ctx context.Context, connConfig sqlstore.ConnectionConfig) error {
   375  	conn, err := pgxpool.Connect(ctx, connConfig.GetConnectionString())
   376  	if err != nil {
   377  		return fmt.Errorf("unable to connect to database: %w", err)
   378  	}
   379  	defer conn.Close()
   380  
   381  	killAllConnectionsQuery := fmt.Sprintf(
   382  		`SELECT
   383  	pg_terminate_backend(pg_stat_activity.pid)
   384  		FROM
   385  	pg_stat_activity
   386  		WHERE
   387  	pg_stat_activity.datname = '%s'
   388  	AND pid <> pg_backend_pid();`, connConfig.Database)
   389  
   390  	_, err = conn.Exec(ctx, killAllConnectionsQuery)
   391  	if err != nil {
   392  		return fmt.Errorf("failed to kill all database connection: %w", err)
   393  	}
   394  
   395  	return nil
   396  }