github.com/grafana/pyroscope@v1.18.0/pkg/metastore/raftnode/node_bootstrap.go (about) 1 package raftnode 2 3 import ( 4 "context" 5 "errors" 6 "fmt" 7 "slices" 8 "strings" 9 "time" 10 11 "github.com/go-kit/log" 12 "github.com/go-kit/log/level" 13 "github.com/grafana/dskit/backoff" 14 "github.com/grafana/dskit/dns" 15 "github.com/hashicorp/raft" 16 17 "github.com/grafana/pyroscope/pkg/metastore/discovery" 18 "github.com/grafana/pyroscope/pkg/metastore/raftnode/raftnodepb" 19 ) 20 21 func (n *Node) bootstrap() error { 22 peers, err := n.bootstrapPeersWithRetries() 23 if err != nil { 24 return fmt.Errorf("failed to resolve peers: %w", err) 25 } 26 logger := log.With(n.logger, 27 "server_id", n.config.ServerID, 28 "advertise_address", n.config.AdvertiseAddress, 29 "peers", fmt.Sprint(peers)) 30 lastPeer := peers[len(peers)-1] 31 if raft.ServerAddress(n.config.AdvertiseAddress) != lastPeer.Address { 32 level.Info(logger).Log("msg", "not the bootstrap node, skipping") 33 return nil 34 } 35 level.Info(logger).Log("msg", "bootstrapping raft") 36 bootstrap := n.raft.BootstrapCluster(raft.Configuration{Servers: peers}) 37 if bootstrapErr := bootstrap.Error(); bootstrapErr != nil { 38 if !errors.Is(bootstrapErr, raft.ErrCantBootstrap) { 39 return fmt.Errorf("failed to bootstrap raft: %w", bootstrapErr) 40 } 41 } 42 return nil 43 } 44 45 func (n *Node) bootstrapPeersWithRetries() (peers []raft.Server, err error) { 46 prov := dns.NewProvider(n.logger, n.reg, dns.MiekgdnsResolverType) 47 attempt := func() bool { 48 peers, err = n.bootstrapPeers(prov) 49 level.Debug(n.logger).Log("msg", "resolving bootstrap peers", "peers", fmt.Sprint(peers), "err", err) 50 if err != nil { 51 _ = level.Error(n.logger).Log("msg", "failed to resolve bootstrap peers", "err", err) 52 return false 53 } 54 return true 55 } 56 backoffConfig := backoff.Config{ 57 MinBackoff: 1 * time.Second, 58 MaxBackoff: 10 * time.Second, 59 MaxRetries: 20, 60 } 61 backOff := backoff.New(context.Background(), backoffConfig) 62 for backOff.Ongoing() { 63 if !attempt() { 64 backOff.Wait() 65 } else { 66 return peers, nil 67 } 68 } 69 return nil, fmt.Errorf("failed to resolve bootstrap peers after %d retries %w", backOff.NumRetries(), err) 70 } 71 72 const autoJoinTimeout = 10 * time.Second 73 74 func (n *Node) tryAutoJoin() error { 75 // we can only auto-join if there is a real raft cluster running 76 ctx, cancel := context.WithTimeout(context.Background(), autoJoinTimeout) 77 defer cancel() 78 79 readIndexResp, err := n.raftNodeClient.ReadIndex(ctx, &raftnodepb.ReadIndexRequest{}) 80 if err != nil { 81 return fmt.Errorf("failed to get current term for auto-join: %w", err) 82 } 83 84 logger := log.With(n.logger, 85 "server_id", n.config.ServerID, 86 "advertise_address", n.config.AdvertiseAddress) 87 88 // try to join the cluster via the leader 89 level.Info(logger).Log("msg", "attempting to join existing cluster", "current_term", readIndexResp.Term) 90 _, err = n.raftNodeClient.AddNode(ctx, &raftnodepb.AddNodeRequest{ 91 ServerId: n.config.AdvertiseAddress, 92 CurrentTerm: readIndexResp.Term, 93 }) 94 95 if err != nil { 96 return fmt.Errorf("failed to auto-join cluster: %w", err) 97 } 98 99 return nil 100 } 101 102 func (n *Node) bootstrapPeers(prov *dns.Provider) ([]raft.Server, error) { 103 // The peer list always includes the local node. 104 peers := make([]raft.Server, 0, len(n.config.BootstrapPeers)+1) 105 peers = append(peers, raft.Server{ 106 Suffrage: raft.Voter, 107 ID: raft.ServerID(n.config.ServerID), 108 Address: raft.ServerAddress(n.config.AdvertiseAddress), 109 }) 110 // Note that raft requires stable node IDs, therefore we're using 111 // the node FQDN:port for both purposes: as the identifier and as the 112 // address. This requires a DNS SRV record lookup without further 113 // resolution of A records (dnssrvnoa+). 114 // 115 // Alternatively, peers may be specified explicitly in the 116 // "{addr}</{node_id}>" format, where the node is the optional node 117 // identifier. 118 var resolve []string 119 for _, peer := range n.config.BootstrapPeers { 120 if strings.Contains(peer, "+") { 121 resolve = append(resolve, peer) 122 } else { 123 peers = append(peers, discovery.ParsePeer(peer)) 124 } 125 } 126 if len(resolve) > 0 { 127 ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) 128 defer cancel() 129 if err := prov.Resolve(ctx, resolve); err != nil { 130 return nil, fmt.Errorf("failed to resolve bootstrap peers: %w", err) 131 } 132 resolvedPeers := prov.Addresses() 133 if len(resolvedPeers) == 0 { 134 // The local node is the only one in the cluster, but peers 135 // are expected to be present. Stop here to avoid bootstrapping 136 // a single-node cluster. 137 return nil, fmt.Errorf("bootstrap peers can't be resolved") 138 } 139 for _, peer := range resolvedPeers { 140 peers = append(peers, raft.Server{ 141 Suffrage: raft.Voter, 142 ID: raft.ServerID(peer), 143 Address: raft.ServerAddress(peer), 144 }) 145 } 146 } 147 // Finally, we sort and deduplicate the peers: the first one 148 // is to boostrap the cluster. If there are nodes with distinct 149 // IDs but the same address, bootstrapping will fail. 150 slices.SortFunc(peers, func(a, b raft.Server) int { 151 return strings.Compare(string(a.ID), string(b.ID)) 152 }) 153 peers = slices.CompactFunc(peers, func(a, b raft.Server) bool { 154 return a.ID == b.ID 155 }) 156 if len(peers) != n.config.BootstrapExpectPeers { 157 return nil, fmt.Errorf("expected number of bootstrap peers not reached: got %d, expected %d\n%+v", 158 len(peers), n.config.BootstrapExpectPeers, peers) 159 } 160 return peers, nil 161 }