
     1  // Copyright (c) 2022 IoTeX Foundation
     2  // This source code is provided 'as is' and no warranties are given as to title or non-infringement, merchantability
     3  // or fitness for purpose and, to the extent permitted by law, all liability for your use of the code is disclaimed.
     4  // This source code is governed by Apache License 2.0 that can be found in the LICENSE file.
     6  package p2p
     8  import (
     9  	"context"
    10  	"encoding/hex"
    11  	"fmt"
    12  	"strconv"
    13  	"strings"
    14  	"time"
    16  	""
    17  	""
    18  	""
    19  	""
    20  	""
    21  	""
    22  	""
    24  	""
    25  	""
    26  	goproto ""
    27  	""
    29  	""
    30  	""
    31  	""
    32  	""
    33  	""
    34  )
    36  const (
        	// Values written to the "status" label of the p2p Prometheus metrics in this file.
    37  	_successStr = "success"
    38  	_failureStr = "failure"
    39  )
    41  var (
        	// _p2pMsgCounter counts p2p messages, partitioned by the protocol, message,
        	// direction, peer and status labels.
    42  	_p2pMsgCounter = prometheus.NewCounterVec(
    43  		prometheus.CounterOpts{
    44  			Name: "iotex_p2p_message_counter",
    45  			Help: "P2P message stats",
    46  		},
    47  		[]string{"protocol", "message", "direction", "peer", "status"},
    48  	)
        	// _p2pMsgLatency records message latency, partitioned by the protocol, message and
        	// status labels. The inbound handlers in this file observe values in milliseconds.
        	// NOTE(review): LinearBuckets(0, 10, 200) should yield 200 buckets of width 10
        	// starting at 0 — confirm against the Prometheus client documentation.
    49  	_p2pMsgLatency = prometheus.NewHistogramVec(
    50  		prometheus.HistogramOpts{
    51  			Name:    "iotex_p2p_message_latency",
    52  			Help:    "message latency",
    53  			Buckets: prometheus.LinearBuckets(0, 10, 200),
    54  		},
    55  		[]string{"protocol", "message", "status"},
    56  	)
    57  	// ErrAgentNotStarted is the error returned when the p2p agent has not been started.
    58  	ErrAgentNotStarted = errors.New("p2p agent has not been started")
    59  )
    61  func init() {
        	// Register both metric vectors when the package is loaded.
        	// NOTE(review): MustRegister is expected to panic if registration fails — confirm.
    62  	prometheus.MustRegister(_p2pMsgCounter)
    63  	prometheus.MustRegister(_p2pMsgLatency)
    64  }
    66  const (
    67  	// TODO: the topic could be fine tuned
        	// Topic name prefixes; the agent appends its topicSuffix to them when registering
        	// handlers and when sending.
    68  	_broadcastTopic    = "broadcast"
    69  	_unicastTopic      = "unicast"
        	// Attempt count and initial delay handed to exponentialRetry when dialing a bootstrap node.
    70  	_numDialRetries    = 8
    71  	_dialRetryInterval = 2 * time.Second
    72  )
    74  type (
    75  	// HandleBroadcastInbound is the callback invoked for an accepted inbound broadcast message.
        	// The agent passes the handler context, the message's chain ID, the sender's peer ID
        	// string and the typed message.
    76  	HandleBroadcastInbound func(context.Context, uint32, string, proto.Message)
    78  	// HandleUnicastInboundAsync is the callback invoked for an accepted inbound unicast message.
        	// The agent passes the handler context, the message's chain ID, the sender's address
        	// info and the typed message.
    79  	HandleUnicastInboundAsync func(context.Context, uint32, peer.AddrInfo, proto.Message)
    81  	// Config is the config of p2p
    82  	Config struct {
        		// Host and Port are handed to the host as its host name and port.
    83  		Host           string   `yaml:"host"`
    84  		Port           int      `yaml:"port"`
        		// ExternalHost and ExternalPort are only applied when ExternalHost is non-empty.
    85  		ExternalHost   string   `yaml:"externalHost"`
    86  		ExternalPort   int      `yaml:"externalPort"`
        		// BootstrapNodes are multiaddr strings of the bootstrap nodes.
    87  		BootstrapNodes []string `yaml:"bootstrapNodes"`
    88  		MasterKey      string   `yaml:"masterKey"` // master key will be PrivateKey if not set.
    89  		// RelayType is the type of P2P network relay. By default, the value is empty, meaning disabled. Two relay types
    90  		// are supported: active, nat.
    91  		RelayType         string              `yaml:"relayType"`
        		// ReconnectInterval is the period of the reconnect task; NewAgent also passes twice
        		// this value to NewQoS.
    92  		ReconnectInterval time.Duration       `yaml:"reconnectInterval"`
        		// RateLimit is only applied when EnableRateLimit is true.
    93  		RateLimit         p2p.RateLimitConfig `yaml:"rateLimit"`
    94  		EnableRateLimit   bool                `yaml:"enableRateLimit"`
    95  		PrivateNetworkPSK string              `yaml:"privateNetworkPSK"`
        		// MaxPeers is converted to uint32 before being handed to the host.
    96  		MaxPeers          int                 `yaml:"maxPeers"`
    97  		MaxMessageSize    int                 `yaml:"maxMessageSize"`
    98  	}
   100  	// Agent is the agent to help the blockchain node connect into the P2P networks and send/receive messages
   101  	Agent interface {
   102  		lifecycle.StartStopper
   103  		nodestats.StatsReporter
   104  		// BroadcastOutbound sends a broadcast message to the whole network
   105  		BroadcastOutbound(ctx context.Context, msg proto.Message) (err error)
   106  		// UnicastOutbound sends a unicast message to the given address
   107  		UnicastOutbound(_ context.Context, peer peer.AddrInfo, msg proto.Message) (err error)
   108  		// Info returns agents' peer info.
   109  		Info() (peer.AddrInfo, error)
   110  		// Self returns the self network address
   111  		Self() ([]multiaddr.Multiaddr, error)
   112  		// ConnectedPeers returns the connected peers' info
   113  		ConnectedPeers() ([]peer.AddrInfo, error)
   114  		// BlockPeer blocks the peer in p2p layer
   115  		BlockPeer(string)
   116  	}
        	// dummyAgent is an Agent whose methods do nothing and return zero values.
   118  	dummyAgent struct{}
        	// agent is the Agent implementation backed by a p2p host.
   120  	agent struct {
   121  		cfg                        Config
        		// chainID is stamped on outbound messages and compared with the ChainId of inbound ones.
   122  		chainID                    uint32
        		// topicSuffix is the hex encoding of genesisHash[22:], appended to the topic names.
   123  		topicSuffix                string
   124  		broadcastInboundHandler    HandleBroadcastInbound
   125  		unicastInboundAsyncHandler HandleUnicastInboundAsync
   126  		host                       *p2p.Host
        		// bootNodeAddr holds the configured bootstrap addresses that do not contain this
        		// host's identity; it is filled in Start.
   127  		bootNodeAddr               []multiaddr.Multiaddr
        		// reconnectTimeout is the interval given to the recurring reconnect task.
   128  		reconnectTimeout           time.Duration
   129  		reconnectTask              *routine.RecurringTask
        		// qosMetrics is updated on every send and receive and consulted by reconnect.
   130  		qosMetrics                 *Qos
   131  	}
   132  )
   134  // DefaultConfig is the default config of p2p: port 4689 for both the local and the external
        // port, no bootstrap nodes, rate limiting enabled with the library's default limits,
        // a 150-second reconnect interval and at most 30 peers. RelayType is left at its zero value.
   135  var DefaultConfig = Config{
   136  	Host:              "",
   137  	Port:              4689,
   138  	ExternalHost:      "",
   139  	ExternalPort:      4689,
   140  	BootstrapNodes:    []string{},
   141  	MasterKey:         "",
   142  	RateLimit:         p2p.DefaultRatelimitConfig,
   143  	ReconnectInterval: 150 * time.Second,
   144  	EnableRateLimit:   true,
   145  	PrivateNetworkPSK: "",
   146  	MaxPeers:          30,
   147  	MaxMessageSize:    p2p.DefaultConfig.MaxMessageSize,
   148  }
   150  // NewDummyAgent creates a dummy p2p agent whose methods are all no-ops.
   151  func NewDummyAgent() Agent {
   152  	return &dummyAgent{}
   153  }
   155  func (*dummyAgent) Start(context.Context) error {
        	// No-op: the dummy agent has nothing to start and always returns nil.
   156  	return nil
   157  }
   159  func (*dummyAgent) Stop(context.Context) error {
        	// No-op: the dummy agent has nothing to stop and always returns nil.
   160  	return nil
   161  }
   163  func (*dummyAgent) BroadcastOutbound(ctx context.Context, msg proto.Message) error {
        	// No-op: msg is ignored and nil is returned.
   164  	return nil
   165  }
   167  func (*dummyAgent) UnicastOutbound(_ context.Context, peer peer.AddrInfo, msg proto.Message) error {
        	// No-op: peer and msg are ignored and nil is returned.
   168  	return nil
   169  }
   171  func (*dummyAgent) Info() (peer.AddrInfo, error) {
        	// Always returns the zero-value AddrInfo and a nil error.
   172  	return peer.AddrInfo{}, nil
   173  }
   175  func (*dummyAgent) Self() ([]multiaddr.Multiaddr, error) {
        	// Always returns a nil address list and a nil error.
   176  	return nil, nil
   177  }
   179  func (*dummyAgent) ConnectedPeers() ([]peer.AddrInfo, error) {
        	// Always returns a nil peer list and a nil error.
   180  	return nil, nil
   181  }
   183  func (*dummyAgent) BlockPeer(string) {
        	// No-op: the peer ID is ignored.
        	// NOTE(review): the bare return below is redundant at the end of a function with no results.
   184  	return
   185  }
   187  func (*dummyAgent) BuildReport() string {
        	// The dummy agent has nothing to report and always returns the empty string.
   188  	return ""
   189  }
   191  // NewAgent instantiates a local P2P agent instance from cfg, the chain ID, the genesis hash
        // and the two inbound message handlers. It logs the topic suffix derived from the genesis
        // hash; the returned agent has no host until Start is called.
   192  func NewAgent(cfg Config, chainID uint32, genesisHash hash.Hash256, broadcastHandler HandleBroadcastInbound, unicastHandler HandleUnicastInboundAsync) Agent {
   193  	log.L().Info("p2p agent", log.Hex("topicSuffix", genesisHash[22:]))
   194  	return &agent{
   195  		cfg:     cfg,
   196  		chainID: chainID,
   197  		// Make sure the honest node only care the messages related the chain from the same genesis
   198  		topicSuffix:                hex.EncodeToString(genesisHash[22:]), // last 10 bytes of genesis hash
   199  		broadcastInboundHandler:    broadcastHandler,
   200  		unicastInboundAsyncHandler: unicastHandler,
   201  		reconnectTimeout:           cfg.ReconnectInterval,
        		// The QoS tracker is created with the current time and twice the reconnect interval.
   202  		qosMetrics:                 NewQoS(time.Now(), 2*cfg.ReconnectInterval),
   203  	}
   204  }
   206  func (p *agent) Start(ctx context.Context) error {
        	// Start builds the p2p host from p.cfg, registers the inbound broadcast and unicast
        	// handlers, filters the bootstrap list, joins the overlay, connects to the bootstrap
        	// nodes and finally starts the recurring reconnect task. Any failing step aborts Start
        	// and its error is returned.
        	//
        	// ready gates both inbound handlers: each blocks on it until it is closed near the end
        	// of Start.
        	// NOTE(review): on every early error return ready is never closed, so a handler that has
        	// already been invoked would stay blocked — confirm whether that can happen.
   207  	ready := make(chan interface{})
   208  	p2p.SetLogger(log.L())
   209  	opts := []p2p.Option{
   210  		p2p.HostName(p.cfg.Host),
   211  		p2p.Port(p.cfg.Port),
   212  		p2p.Gossip(),
   213  		p2p.SecureIO(),
   214  		p2p.MasterKey(p.cfg.MasterKey),
   215  		p2p.PrivateNetworkPSK(p.cfg.PrivateNetworkPSK),
   216  		p2p.DHTProtocolID(p.chainID),
   217  		p2p.DHTGroupID(p.chainID),
   218  		p2p.WithMaxPeer(uint32(p.cfg.MaxPeers)),
   219  		p2p.WithMaxMessageSize(p.cfg.MaxMessageSize),
   220  	}
        	// Optional settings are appended only when configured.
   221  	if p.cfg.EnableRateLimit {
   222  		opts = append(opts, p2p.WithRateLimit(p.cfg.RateLimit))
   223  	}
   224  	if p.cfg.ExternalHost != "" {
   225  		opts = append(opts, p2p.ExternalHostName(p.cfg.ExternalHost))
   226  		opts = append(opts, p2p.ExternalPort(p.cfg.ExternalPort))
   227  	}
   228  	if p.cfg.RelayType != "" {
   229  		opts = append(opts, p2p.WithRelay(p.cfg.RelayType))
   230  	}
   231  	host, err := p2p.NewHost(ctx, opts...)
   232  	if err != nil {
   233  		return errors.Wrap(err, "error when instantiating Agent host")
   234  	}
        	// Inbound broadcast handler. The named result err drives the deferred metrics update.
   236  	if err := host.AddBroadcastPubSub(ctx, _broadcastTopic+p.topicSuffix, func(ctx context.Context, data []byte) (err error) {
   237  		// Blocking handling the broadcast message until the agent is started
   238  		<-ready
   239  		var (
   240  			peerID    string
   241  			broadcast iotexrpc.BroadcastMsg
   242  			latency   int64
   243  		)
   244  		skip := false
   245  		defer func() {
   246  			// Skip accounting if the broadcast message is not handled
   247  			if skip {
   248  				return
   249  			}
   250  			status := _successStr
   251  			if err != nil {
   252  				status = _failureStr
   253  			}
        			// On early failures peerID, MsgType and latency still hold their zero values.
   254  			_p2pMsgCounter.WithLabelValues("broadcast", strconv.Itoa(int(broadcast.MsgType)), "in", peerID, status).Inc()
   255  			_p2pMsgLatency.WithLabelValues("broadcast", strconv.Itoa(int(broadcast.MsgType)), status).Observe(float64(latency))
   256  		}()
   257  		if err = proto.Unmarshal(data, &broadcast); err != nil {
   258  			err = errors.Wrap(err, "error when marshaling broadcast message")
   259  			return
   260  		}
   261  		// Skip the broadcast message if it's from the node itself
   262  		rawmsg, ok := p2p.GetBroadcastMsg(ctx)
   263  		if !ok {
   264  			err = errors.New("error when asserting broadcast msg context")
   265  			return
   266  		}
   267  		peerID = rawmsg.GetFrom().Pretty()
        		// NOTE(review): the left-hand operand of this comparison is missing in this copy of
        		// the file; going by the comment above it should identify the local node.
   268  		if == peerID {
   269  			skip = true
   270  			return
   271  		}
   272  		if broadcast.ChainId != p.chainID {
   273  			err = errors.Errorf("chain ID mismatch, received %d, expecting %d", broadcast.ChainId, p.chainID)
   274  			return
   275  		}
        		// Latency is the time elapsed since the message's timestamp, in whole milliseconds.
   277  		t := broadcast.GetTimestamp().AsTime()
   278  		latency = time.Since(t).Nanoseconds() / time.Millisecond.Nanoseconds()
        		// err is already declared in this scope as the named result, so := only introduces msg.
   280  		msg, err := goproto.TypifyRPCMsg(broadcast.MsgType, broadcast.MsgBody)
   281  		if err != nil {
   282  			err = errors.Wrap(err, "error when typifying broadcast message")
   283  			return
   284  		}
   285  		p.broadcastInboundHandler(ctx, broadcast.ChainId, peerID, msg)
   286  		p.qosMetrics.updateRecvBroadcast(time.Now())
   287  		return
   288  	}); err != nil {
   289  		return errors.Wrap(err, "error when adding broadcast pubsub")
   290  	}
        	// Inbound unicast handler. Unlike the broadcast handler it has no skip path, so the
        	// metrics are updated for every message.
   292  	if err := host.AddUnicastPubSub(_unicastTopic+p.topicSuffix, func(ctx context.Context, peerInfo peer.AddrInfo, data []byte) (err error) {
   293  		// Blocking handling the unicast message until the agent is started
   294  		<-ready
   295  		var (
   296  			unicast iotexrpc.UnicastMsg
   297  			peerID  = peerInfo.ID.Pretty()
   298  			latency int64
   299  		)
   300  		defer func() {
   301  			status := _successStr
   302  			if err != nil {
   303  				status = _failureStr
   304  			}
   305  			_p2pMsgCounter.WithLabelValues("unicast", strconv.Itoa(int(unicast.MsgType)), "in", peerID, status).Inc()
   306  			_p2pMsgLatency.WithLabelValues("unicast", strconv.Itoa(int(unicast.MsgType)), status).Observe(float64(latency))
   307  		}()
   308  		if err = proto.Unmarshal(data, &unicast); err != nil {
   309  			err = errors.Wrap(err, "error when marshaling unicast message")
   310  			return
   311  		}
        		// The body is typified before the chain ID is checked.
   312  		msg, err := goproto.TypifyRPCMsg(unicast.MsgType, unicast.MsgBody)
   313  		if err != nil {
   314  			err = errors.Wrap(err, "error when typifying unicast message")
   315  			return
   316  		}
   317  		if unicast.ChainId != p.chainID {
   318  			err = errors.Errorf("chain ID mismatch, received %d, expecting %d", unicast.ChainId, p.chainID)
   319  			return
   320  		}
        		// Latency is the time elapsed since the message's timestamp, in whole milliseconds.
   322  		t := unicast.GetTimestamp().AsTime()
   323  		latency = time.Since(t).Nanoseconds() / time.Millisecond.Nanoseconds()
   325  		p.unicastInboundAsyncHandler(ctx, unicast.ChainId, peerInfo, msg)
   326  		p.qosMetrics.updateRecvUnicast(peerID, time.Now())
   327  		return
   328  	}); err != nil {
   329  		return errors.Wrap(err, "error when adding unicast pubsub")
   330  	}
   332  	// create boot nodes list except itself
        	// An address counts as "itself" when its string form contains this host's identity.
        	// NOTE(review): multiaddr.StringCast presumably panics on a malformed address — confirm.
        	// NOTE(review): entries are appended to p.bootNodeAddr, so calling Start more than once
        	// would accumulate duplicates.
   333  	hostName := host.HostIdentity()
   334  	for _, bootstrapNode := range p.cfg.BootstrapNodes {
   335  		bootAddr := multiaddr.StringCast(bootstrapNode)
   336  		if !strings.Contains(bootAddr.String(), hostName) {
   337  			p.bootNodeAddr = append(p.bootNodeAddr, bootAddr)
   338  		}
   339  	}
   340  	if err := host.AddBootstrap(p.bootNodeAddr); err != nil {
   341  		return err
   342  	}
   343  	host.JoinOverlay()
        	// NOTE(review): the assignment target is missing in this copy of the file; presumably
        	// the new host is stored on the agent here — confirm.
   344 = host
   346  	// connect to bootstrap nodes
   347  	if err := p.connectBootNode(ctx); err != nil {
   348  		log.L().Error("fail to connect bootnode", zap.Error(err))
   349  		return err
   350  	}
        	// NOTE(review): the call expressions of the next two statements are missing in this copy
        	// of the file.
   351  	if err :=; err != nil {
   352  		return err
   353  	}
   354  	if err :=; err != nil {
   355  		return err
   356  	}
        	// Release the inbound handlers that are blocked on ready.
   358  	close(ready)
   360  	// Run p.reconnect as a recurring task with interval p.reconnectTimeout to check network
        	// connectivity and reconnect in case of disconnection.
   361  	p.reconnectTask = routine.NewRecurringTask(p.reconnect, p.reconnectTimeout)
   362  	return p.reconnectTask.Start(ctx)
   363  }
   365  func (p *agent) Stop(ctx context.Context) error {
        	// Stop returns ErrAgentNotStarted when the nil check below matches; otherwise it stops
        	// the reconnect task first and then performs the final call, wrapping its error as
        	// "error when closing Agent host".
        	// NOTE(review): the operand of the nil check and the call expression of the last
        	// statement are missing in this copy of the file.
   366  	if == nil {
   367  		return ErrAgentNotStarted
   368  	}
        	// ctx.Err() is attached to the log entry.
   369  	log.L().Info("p2p is shutting down.", zap.Error(ctx.Err()))
   370  	if err := p.reconnectTask.Stop(ctx); err != nil {
   371  		return err
   372  	}
   373  	if err :=; err != nil {
   374  		return errors.Wrap(err, "error when closing Agent host")
   375  	}
   376  	return nil
   377  }
   379  func (p *agent) BroadcastOutbound(ctx context.Context, msg proto.Message) (err error) {
        	// BroadcastOutbound wraps msg in an iotexrpc.BroadcastMsg stamped with the chain ID, this
        	// host's identity and the current time, then publishes it on the broadcast topic. It
        	// returns ErrAgentNotStarted when host is nil. The outbound counter and the QoS
        	// statistics are updated for both success and failure.
        	//
        	// Only the span is kept; the context returned by tracer.NewSpan is discarded.
   380  	_, span := tracer.NewSpan(ctx, "Agent.BroadcastOutbound")
   381  	defer span.End()
        	// NOTE(review): the right-hand side of this assignment is missing in this copy of the file.
   383  	host :=
   384  	if host == nil {
   385  		return ErrAgentNotStarted
   386  	}
   387  	var msgType iotexrpc.MessageType
   388  	var msgBody []byte
        	// The deferred closure reads the named result err, so every return path below is counted
        	// with the matching status. The "peer" label carries this host's own identity.
   389  	defer func() {
   390  		status := _successStr
   391  		if err != nil {
   392  			status = _failureStr
   393  		}
   394  		_p2pMsgCounter.WithLabelValues(
   395  			"broadcast",
   396  			strconv.Itoa(int(msgType)),
   397  			"out",
   398  			host.HostIdentity(),
   399  			status,
   400  		).Inc()
   401  	}()
   402  	msgType, msgBody, err = convertAppMsg(msg)
   403  	if err != nil {
   404  		return
   405  	}
   406  	broadcast := iotexrpc.BroadcastMsg{
   407  		ChainId:   p.chainID,
   408  		PeerId:    host.HostIdentity(),
   409  		MsgType:   msgType,
   410  		MsgBody:   msgBody,
   411  		Timestamp: timestamppb.Now(),
   412  	}
        	// err is the named result declared in this scope, so := only introduces data.
   413  	data, err := proto.Marshal(&broadcast)
   414  	if err != nil {
   415  		err = errors.Wrap(err, "error when marshaling broadcast message")
   416  		return
   417  	}
        	// t is taken before sending and handed to the QoS tracker together with the outcome.
   418  	t := time.Now()
   419  	if err = host.Broadcast(ctx, _broadcastTopic+p.topicSuffix, data); err != nil {
   420  		err = errors.Wrap(err, "error when sending broadcast message")
   421  		p.qosMetrics.updateSendBroadcast(t, false)
   422  		return
   423  	}
   424  	p.qosMetrics.updateSendBroadcast(t, true)
   425  	return
   426  }
   428  func (p *agent) UnicastOutbound(ctx context.Context, peer peer.AddrInfo, msg proto.Message) (err error) {
        	// UnicastOutbound wraps msg in an iotexrpc.UnicastMsg stamped with the chain ID, this
        	// host's identity and the current time, then sends it to peer on the unicast topic. It
        	// returns ErrAgentNotStarted when host is nil. The outbound counter and the per-peer QoS
        	// statistics are updated for both success and failure.
        	//
        	// Inside this function the identifier peer is the parameter, not the imported package.
        	// NOTE(review): the right-hand side of the next assignment is missing in this copy of the file.
   429  	host :=
   430  	if host == nil {
   431  		return ErrAgentNotStarted
   432  	}
   433  	var (
   434  		peerName = peer.ID.Pretty()
   435  		msgType  iotexrpc.MessageType
   436  		msgBody  []byte
   437  	)
        	// The deferred closure reads the named result err, so every return path below is counted
        	// with the matching status.
   438  	defer func() {
   439  		status := _successStr
   440  		if err != nil {
   441  			status = _failureStr
   442  		}
   443  		_p2pMsgCounter.WithLabelValues("unicast", strconv.Itoa(int(msgType)), "out", peer.ID.Pretty(), status).Inc()
   444  	}()
   446  	msgType, msgBody, err = convertAppMsg(msg)
   447  	if err != nil {
   448  		return
   449  	}
   450  	unicast := iotexrpc.UnicastMsg{
   451  		ChainId:   p.chainID,
   452  		PeerId:    host.HostIdentity(),
   453  		MsgType:   msgType,
   454  		MsgBody:   msgBody,
   455  		Timestamp: timestamppb.Now(),
   456  	}
        	// err is the named result declared in this scope, so := only introduces data.
   457  	data, err := proto.Marshal(&unicast)
   458  	if err != nil {
   459  		err = errors.Wrap(err, "error when marshaling unicast message")
   460  		return
   461  	}
        	// t is taken before sending and handed to the QoS tracker together with the outcome.
   463  	t := time.Now()
   464  	if err = host.Unicast(ctx, peer, _unicastTopic+p.topicSuffix, data); err != nil {
   465  		err = errors.Wrap(err, "error when sending unicast message")
   466  		p.qosMetrics.updateSendUnicast(peerName, t, false)
   467  		return
   468  	}
   469  	p.qosMetrics.updateSendUnicast(peerName, t, true)
   470  	return
   471  }
   473  func (p *agent) Info() (peer.AddrInfo, error) {
        	// Info returns the zero-value AddrInfo and ErrAgentNotStarted when the nil check matches.
        	// NOTE(review): the operand of the nil check and the returned expression are missing in
        	// this copy of the file.
   474  	if == nil {
   475  		return peer.AddrInfo{}, ErrAgentNotStarted
   476  	}
   477  	return, nil
   478  }
   480  func (p *agent) Self() ([]multiaddr.Multiaddr, error) {
        	// Self returns nil and ErrAgentNotStarted when the nil check matches.
        	// NOTE(review): the operand of the nil check and the returned expression are missing in
        	// this copy of the file.
   481  	if == nil {
   482  		return nil, ErrAgentNotStarted
   483  	}
   484  	return, nil
   485  }
   487  func (p *agent) ConnectedPeers() ([]peer.AddrInfo, error) {
        	// ConnectedPeers returns nil and ErrAgentNotStarted when the nil check matches.
        	// NOTE(review): the operand of the nil check and the returned expression are missing in
        	// this copy of the file.
   488  	if == nil {
   489  		return nil, ErrAgentNotStarted
   490  	}
   491  	return, nil
   492  }
   494  func (p *agent) BlockPeer(pidStr string) {
        	// BlockPeer decodes pidStr into a peer ID; a string that fails to decode is ignored
        	// silently.
        	// NOTE(review): pid is never used after decoding in this copy of the file; the call that
        	// consumes it appears to be missing.
   495  	pid, err := peer.Decode(pidStr)
   496  	if err != nil {
   497  		return
   498  	}
   500  }
   502  // BuildReport builds a report of p2p agent: the number of connected peers, or the empty
        // string when ConnectedPeers returns an error.
   503  func (p *agent) BuildReport() string {
   504  	neighbors, err := p.ConnectedPeers()
   505  	if err == nil {
   506  		return fmt.Sprintf("P2P ConnectedPeers: %d", len(neighbors))
   507  	}
   508  	return ""
   509  }
   511  func (p *agent) connectBootNode(ctx context.Context) error {
        	// connectBootNode dials every address in p.bootNodeAddr concurrently, each with
        	// exponential back-off. It returns nil once more than half of them have connected, and
        	// an error once all of them have failed. With no configured bootstrap nodes it returns
        	// nil immediately.
        	//
        	// NOTE(review): the guard checks p.cfg.BootstrapNodes but the loops below work on
        	// p.bootNodeAddr. If bootNodeAddr is empty while BootstrapNodes is not, no goroutine is
        	// started and the wait loop appears to block forever — confirm.
   512  	if len(p.cfg.BootstrapNodes) == 0 {
   513  		return nil
   514  	}
   515  	var errNum, connNum, desiredConnNum int
        	// Both channels are buffered to len(p.cfg.BootstrapNodes) so the dialing goroutines can
        	// still send after this function has returned.
   516  	conn := make(chan struct{}, len(p.cfg.BootstrapNodes))
   517  	connErrChan := make(chan error, len(p.cfg.BootstrapNodes))
   519  	// try to connect to all bootstrap node beside itself.
   520  	for i := range p.bootNodeAddr {
        		// Copy the element so each goroutine captures its own address.
   521  		bootAddr := p.bootNodeAddr[i]
   522  		go func() {
        			// NOTE(review): the callee of the dial call on the next-but-one line is missing in
        			// this copy of the file.
   523  			if err := exponentialRetry(
   524  				func() error { return, bootAddr) },
   525  				_dialRetryInterval,
   526  				_numDialRetries,
   527  			); err != nil {
   528  				err := errors.Wrap(err, fmt.Sprintf("error when connecting bootstrap node %s", bootAddr.String()))
   529  				connErrChan <- err
   530  				return
   531  			}
   532  			conn <- struct{}{}
   533  			log.L().Info("Connected bootstrap node.", zap.String("address", bootAddr.String()))
   534  		}()
   535  	}
   537  	// wait until half+1 bootnodes get connected
        	// NOTE(review): the loop only ends on quorum or when every dial has failed. A mix of
        	// successes below quorum and failures below the total (e.g. 1 of 2 connected) leaves it
        	// blocked with no sender left; ctx is not consulted here either — confirm.
   538  	desiredConnNum = len(p.bootNodeAddr)/2 + 1
   539  	for {
   540  		select {
   541  		case err := <-connErrChan:
   542  			log.L().Info("Connection failed.", zap.Error(err))
   543  			errNum++
   544  			if errNum == len(p.bootNodeAddr) {
   545  				return errors.New("failed to connect to any bootstrap node")
   546  			}
   547  		case <-conn:
   548  			connNum++
   549  		}
   550  		// can add more condition later
        		// This break is outside the select, so it leaves the for loop.
   551  		if connNum >= desiredConnNum {
   552  			break
   553  		}
   554  	}
   555  	return nil
   556  }
   558  func (p *agent) reconnect() {
        	// reconnect is the body of the recurring reconnect task. When the first condition below
        	// holds or the QoS tracker reports a lost connection, it reconnects to the bootstrap
        	// nodes and runs the step logged as "advertise"; in every case it finishes with the step
        	// logged as "find peer". Errors are logged, never returned.
        	// NOTE(review): the operand of the nil check, the argument of len and the call
        	// expressions of the last two if statements are missing in this copy of the file.
   559  	if == nil {
   560  		return
   561  	}
   562  	if len( == 0 || p.qosMetrics.lostConnection() {
   563  		log.L().Info("network lost, try re-connecting.")
        		// The recurring task supplies no context, so a background context is used.
   565  		if err := p.connectBootNode(context.Background()); err != nil {
   566  			log.L().Error("fail to connect bootnode", zap.Error(err))
   567  			return
   568  		}
   569  		if err :=; err != nil {
   570  			log.L().Error("fail to advertise", zap.Error(err))
   571  			return
   572  		}
   573  	}
   574  	if err :=; err != nil {
   575  		log.L().Error("fail to find peer", zap.Error(err))
   576  	}
   577  }
   579  func convertAppMsg(msg proto.Message) (iotexrpc.MessageType, []byte, error) {
        	// convertAppMsg returns the RPC message type of msg together with its marshaled bytes.
        	// On failure of either step it returns type 0, a nil body and the wrapped error.
   580  	msgType, err := goproto.GetTypeFromRPCMsg(msg)
   581  	if err != nil {
   582  		return 0, nil, errors.Wrap(err, "error when converting application message to proto")
   583  	}
   584  	msgBody, err := proto.Marshal(msg)
   585  	if err != nil {
   586  		return 0, nil, errors.Wrap(err, "error when marshaling application message")
   587  	}
   588  	return msgType, msgBody, nil
   589  }
   591  func exponentialRetry(f func() error, retryInterval time.Duration, numRetries int) (err error) {
        	// exponentialRetry calls f up to numRetries times and returns nil on the first success.
        	// After each failure it logs the error, sleeps for retryInterval and doubles the
        	// interval. If every attempt fails, the error of the last attempt is returned.
        	// NOTE(review): the sleep also runs after the final failed attempt, delaying the error
        	// by the longest interval; and when numRetries <= 0, f is never called and nil is returned.
   592  	for i := 0; i < numRetries; i++ {
   593  		if err = f(); err == nil {
   594  			return
   595  		}
   596  		log.L().Error("Error happens, will retry.", zap.Error(err))
   597  		time.Sleep(retryInterval)
   598  		retryInterval *= 2
   599  	}
   600  	return
   601  }