github.com/bartle-stripe/trillian@v1.2.1/monitoring/prometheus/etcdiscover/main.go (about)

     1  // Copyright 2017 Google Inc. All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // The etcdiscover binary monitors etcd to track the set of instances that
    16  // support a gRPC service, and updates a file so that Prometheus can track
    17  // those instances.
    18  package main
    19  
    20  import (
    21  	"context"
    22  	"encoding/json"
    23  	"flag"
    24  	"fmt"
    25  	"io/ioutil"
    26  	"os"
    27  	"path"
    28  	"path/filepath"
    29  	"strings"
    30  	"sync"
    31  	"time"
    32  
    33  	"github.com/coreos/etcd/clientv3"
    34  	etcdnaming "github.com/coreos/etcd/clientv3/naming"
    35  	"github.com/golang/glog"
    36  	"github.com/google/trillian/util"
    37  	"google.golang.org/grpc/naming"
    38  )
    39  
// Command-line flags. --etcd_servers and --etcd_services are mandatory: main
// exits if either is left empty. --target is optional; when it is empty the
// current state is printed to standard output instead of being written to a
// file.
var (
	etcdServers  = flag.String("etcd_servers", "", "Comma-separated list of etcd servers")
	etcdServices = flag.String("etcd_services", "", "Comma-separated list of service names to monitor for endpoints")
	targetFile   = flag.String("target", "", "File to update with service endpoint locations")
)
    45  
    46  type serviceInstanceInfo struct {
    47  	servers  []string
    48  	services []string
    49  	target   string
    50  
    51  	mu        sync.RWMutex
    52  	watcher   map[string]naming.Watcher // nolint: megacheck
    53  	instances map[string]map[string]bool
    54  }
    55  
    56  func newServiceInstanceInfo(etcdServers, etcdServices, target string) *serviceInstanceInfo {
    57  	s := serviceInstanceInfo{
    58  		servers:   strings.Split(etcdServers, ","),
    59  		services:  strings.Split(etcdServices, ","),
    60  		watcher:   make(map[string]naming.Watcher), // nolint: megacheck
    61  		target:    target,
    62  		instances: make(map[string]map[string]bool),
    63  	}
    64  	for _, service := range s.services {
    65  		s.instances[service] = make(map[string]bool)
    66  	}
    67  	return &s
    68  }
    69  
    70  // Watch starts a collection of goroutines (one per service) that monitor etcd for
    71  // changes in the endpoints serving the services. Blocks until Close() called.
    72  func (s *serviceInstanceInfo) Watch() {
    73  	var wg sync.WaitGroup
    74  	for _, service := range s.services {
    75  		wg.Add(1)
    76  		go func(service string) {
    77  			defer wg.Done()
    78  			s.watchService(service)
    79  		}(service)
    80  	}
    81  	wg.Wait()
    82  }
    83  
    84  // Close terminates monitoring.
    85  func (s *serviceInstanceInfo) Close() {
    86  	s.mu.Lock()
    87  	defer s.mu.Unlock()
    88  	for _, service := range s.services {
    89  		glog.Infof("close watcher for %s", service)
    90  		if s.watcher[service] != nil {
    91  			s.watcher[service].Close()
    92  		}
    93  	}
    94  }
    95  
// prometheusJobInfo is a single element of the JSON array produced by Export:
// a list of target addresses plus the labels attached to them. Export emits
// one element per service, carrying the service name in the "job" label.
// Either field is left out of the JSON when it is empty.
type prometheusJobInfo struct {
	Targets []string          `json:"targets,omitempty"`
	Labels  map[string]string `json:"labels,omitempty"`
}
   100  
   101  // Export produces a JSON format description of the services and their endpoints
   102  // in a format suitable for use as Prometheus targets.
   103  func (s *serviceInstanceInfo) Export() ([]byte, error) {
   104  	s.mu.RLock()
   105  	defer s.mu.RUnlock()
   106  	jobs := make([]*prometheusJobInfo, len(s.services))
   107  	for i, service := range s.services {
   108  		info := prometheusJobInfo{
   109  			Labels: map[string]string{"job": service},
   110  		}
   111  		for endpoint, present := range s.instances[service] {
   112  			if present {
   113  				info.Targets = append(info.Targets, endpoint)
   114  			}
   115  		}
   116  		jobs[i] = &info
   117  	}
   118  	return json.MarshalIndent(jobs, "", "\t")
   119  }
   120  
   121  // Update updates the target file with the current state.
   122  func (s *serviceInstanceInfo) Update() {
   123  	jsonData, err := s.Export()
   124  	if err != nil {
   125  		glog.Errorf("failed to export JSON data: %v", err)
   126  		return
   127  	}
   128  	if s.target == "" {
   129  		fmt.Printf("State:\n%s\n", jsonData)
   130  		return
   131  	}
   132  	glog.V(1).Infof("Writing current state:\n%s", string(jsonData))
   133  
   134  	// Write to a temporary file.
   135  	tempFile, err := ioutil.TempFile(filepath.Dir(s.target), "pending-"+path.Base(s.target))
   136  	if err != nil {
   137  		glog.Errorf("failed to create tempfile: %v", err)
   138  		return
   139  	}
   140  	if _, err := tempFile.Write(jsonData); err != nil {
   141  		glog.Errorf("failed to write JSON data to tempfile %q: %v", tempFile.Name(), err)
   142  	}
   143  	tempFile.Close()
   144  
   145  	// Rename the temporary file to the target so it is updated more atomically.
   146  	if err := os.Rename(tempFile.Name(), s.target); err != nil {
   147  		glog.Errorf("failed to rename tempfile %q to %q: %v", tempFile.Name(), s.target, err)
   148  	}
   149  }
   150  
   151  func (s *serviceInstanceInfo) watchService(service string) {
   152  	cfg := clientv3.Config{Endpoints: s.servers, DialTimeout: 5 * time.Second}
   153  	client, err := clientv3.New(cfg)
   154  	if err != nil {
   155  		glog.Exitf("Failed to connect to etcd at %v: %v", s.servers, err)
   156  	}
   157  	res := &etcdnaming.GRPCResolver{Client: client}
   158  	watcher, err := res.Resolve(service)
   159  	if err != nil {
   160  		glog.Exitf("Failed to watch %s for updates: %v", service, err)
   161  	}
   162  
   163  	// Save the watcher so external code can Close() it.
   164  	s.mu.Lock()
   165  	s.watcher[service] = watcher
   166  	s.mu.Unlock()
   167  
   168  	for {
   169  		updates, err := watcher.Next()
   170  		if err != nil {
   171  			glog.Errorf("Failed on Next(): %v", err)
   172  			return
   173  		}
   174  		for _, update := range updates {
   175  			switch update.Op {
   176  			case naming.Add:
   177  				glog.V(1).Infof("Add(%s, +%s)", service, update.Addr)
   178  				s.mu.Lock()
   179  				s.instances[service][update.Addr] = true
   180  				s.mu.Unlock()
   181  			case naming.Delete:
   182  				glog.V(1).Infof("Delete(%s, -%s)", service, update.Addr)
   183  				s.mu.Lock()
   184  				s.instances[service][update.Addr] = false
   185  				s.mu.Unlock()
   186  			}
   187  		}
   188  		s.Update()
   189  	}
   190  }
   191  
   192  func main() {
   193  	flag.Parse()
   194  	defer glog.Flush()
   195  
   196  	if *etcdServers == "" {
   197  		glog.Exitf("No etcd servers configured with --etcd_servers")
   198  	}
   199  	if *etcdServices == "" {
   200  		glog.Exitf("No etcd services configured with --etcd_services")
   201  	}
   202  
   203  	state := newServiceInstanceInfo(*etcdServers, *etcdServices, *targetFile)
   204  	ctx, cancel := context.WithCancel(context.Background())
   205  	defer cancel()
   206  	go util.AwaitSignal(ctx, func() {
   207  		state.Close()
   208  	})
   209  	state.Watch()
   210  }