github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/cloud/gc.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package cloud
    12  
    13  import (
    14  	"encoding/base64"
    15  	"fmt"
    16  	"hash/fnv"
    17  	"io/ioutil"
    18  	"log"
    19  	"os"
    20  	"path/filepath"
    21  	"sort"
    22  	"strings"
    23  	"text/tabwriter"
    24  	"time"
    25  
    26  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config"
    27  	"github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm"
    28  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    29  	"github.com/cockroachdb/errors"
    30  	"github.com/nlopes/slack"
    31  )
    32  
// errNoSlackClient is the sentinel error returned by findUserChannel when it
// is handed a nil Slack client. GCClusters checks for it with errors.Is so
// that a missing client is not logged as a delivery failure.
var errNoSlackClient = fmt.Errorf("no Slack client")
    34  
    35  type status struct {
    36  	good    []*Cluster
    37  	warn    []*Cluster
    38  	destroy []*Cluster
    39  }
    40  
    41  func (s *status) add(c *Cluster, now time.Time) {
    42  	exp := c.ExpiresAt()
    43  	if exp.After(now) {
    44  		if exp.Before(now.Add(2 * time.Hour)) {
    45  			s.warn = append(s.warn, c)
    46  		} else {
    47  			s.good = append(s.good, c)
    48  		}
    49  	} else {
    50  		s.destroy = append(s.destroy, c)
    51  	}
    52  }
    53  
    54  // messageHash computes a base64-encoded hash value to show whether
    55  // or not two status values would result in a duplicate
    56  // notification to a user.
    57  func (s *status) notificationHash() string {
    58  	// Use stdlib hash function, since we don't need any crypto guarantees
    59  	hash := fnv.New32a()
    60  
    61  	for i, list := range [][]*Cluster{s.good, s.warn, s.destroy} {
    62  		_, _ = hash.Write([]byte{byte(i)})
    63  
    64  		var data []string
    65  		for _, c := range list {
    66  			// Deduplicate by cluster name and expiration time
    67  			data = append(data, fmt.Sprintf("%s %s", c.Name, c.ExpiresAt()))
    68  		}
    69  		// Ensure results are stable
    70  		sort.Strings(data)
    71  
    72  		for _, d := range data {
    73  			_, _ = hash.Write([]byte(d))
    74  		}
    75  	}
    76  
    77  	bytes := hash.Sum(nil)
    78  	return base64.StdEncoding.EncodeToString(bytes)
    79  }
    80  
    81  func makeSlackClient() *slack.Client {
    82  	if config.SlackToken == "" {
    83  		return nil
    84  	}
    85  	client := slack.New(config.SlackToken)
    86  	// client.SetDebug(true)
    87  	return client
    88  }
    89  
    90  func findChannel(client *slack.Client, name string) (string, error) {
    91  	if client != nil {
    92  		channels, err := client.GetChannels(true)
    93  		if err != nil {
    94  			return "", err
    95  		}
    96  		for _, channel := range channels {
    97  			if channel.Name == name {
    98  				return channel.ID, nil
    99  			}
   100  		}
   101  	}
   102  	return "", fmt.Errorf("not found")
   103  }
   104  
   105  func findUserChannel(client *slack.Client, email string) (string, error) {
   106  	if client == nil {
   107  		return "", errNoSlackClient
   108  	}
   109  	u, err := client.GetUserByEmail(email)
   110  	if err != nil {
   111  		return "", err
   112  	}
   113  	_, _, channelID, err := client.OpenIMChannel(u.ID)
   114  	return channelID, err
   115  }
   116  
   117  func postStatus(client *slack.Client, channel string, dryrun bool, s *status, badVMs vm.List) {
   118  	if dryrun {
   119  		tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', 0)
   120  		for _, c := range s.good {
   121  			fmt.Fprintf(tw, "good:\t%s\t%s\t(%s)\n", c.Name,
   122  				c.GCAt().Format(time.Stamp),
   123  				c.LifetimeRemaining().Round(time.Second))
   124  		}
   125  		for _, c := range s.warn {
   126  			fmt.Fprintf(tw, "warn:\t%s\t%s\t(%s)\n", c.Name,
   127  				c.GCAt().Format(time.Stamp),
   128  				c.LifetimeRemaining().Round(time.Second))
   129  		}
   130  		for _, c := range s.destroy {
   131  			fmt.Fprintf(tw, "destroy:\t%s\t%s\t(%s)\n", c.Name,
   132  				c.GCAt().Format(time.Stamp),
   133  				c.LifetimeRemaining().Round(time.Second))
   134  		}
   135  		_ = tw.Flush()
   136  	}
   137  
   138  	if client == nil || channel == "" {
   139  		return
   140  	}
   141  
   142  	// Debounce messages, unless we have badVMs since that indicates
   143  	// a problem that needs manual intervention
   144  	if len(badVMs) == 0 {
   145  		send, err := shouldSend(channel, s)
   146  		if err != nil {
   147  			log.Printf("unable to deduplicate notification: %s", err)
   148  		}
   149  		if !send {
   150  			return
   151  		}
   152  	}
   153  
   154  	makeStatusFields := func(clusters []*Cluster) []slack.AttachmentField {
   155  		var names []string
   156  		var expirations []string
   157  		for _, c := range clusters {
   158  			names = append(names, c.Name)
   159  			expirations = append(expirations,
   160  				fmt.Sprintf("<!date^%[1]d^{date_short_pretty} {time}|%[2]s>",
   161  					c.GCAt().Unix(),
   162  					c.LifetimeRemaining().Round(time.Second)))
   163  		}
   164  		return []slack.AttachmentField{
   165  			{
   166  				Title: "name",
   167  				Value: strings.Join(names, "\n"),
   168  				Short: true,
   169  			},
   170  			{
   171  				Title: "expiration",
   172  				Value: strings.Join(expirations, "\n"),
   173  				Short: true,
   174  			},
   175  		}
   176  	}
   177  
   178  	params := slack.PostMessageParameters{
   179  		Username: "roachprod",
   180  	}
   181  	fallback := fmt.Sprintf("clusters: %d live, %d expired, %d destroyed",
   182  		len(s.good), len(s.warn), len(s.destroy))
   183  	if len(s.good) > 0 {
   184  		params.Attachments = append(params.Attachments,
   185  			slack.Attachment{
   186  				Color:    "good",
   187  				Title:    "Live Clusters",
   188  				Fallback: fallback,
   189  				Fields:   makeStatusFields(s.good),
   190  			})
   191  	}
   192  	if len(s.warn) > 0 {
   193  		params.Attachments = append(params.Attachments,
   194  			slack.Attachment{
   195  				Color:    "warning",
   196  				Title:    "Expiring Clusters",
   197  				Fallback: fallback,
   198  				Fields:   makeStatusFields(s.warn),
   199  			})
   200  	}
   201  	if len(s.destroy) > 0 {
   202  		params.Attachments = append(params.Attachments,
   203  			slack.Attachment{
   204  				Color:    "danger",
   205  				Title:    "Destroyed Clusters",
   206  				Fallback: fallback,
   207  				Fields:   makeStatusFields(s.destroy),
   208  			})
   209  	}
   210  	if len(badVMs) > 0 {
   211  		var names []string
   212  		for _, vm := range badVMs {
   213  			names = append(names, vm.Name)
   214  		}
   215  		sort.Strings(names)
   216  		params.Attachments = append(params.Attachments,
   217  			slack.Attachment{
   218  				Color: "danger",
   219  				Title: "Bad VMs",
   220  				Text:  strings.Join(names, "\n"),
   221  			})
   222  	}
   223  
   224  	_, _, err := client.PostMessage(channel, "", params)
   225  	if err != nil {
   226  		log.Println(err)
   227  	}
   228  }
   229  
   230  func postError(client *slack.Client, channel string, err error) {
   231  	log.Println(err)
   232  	if client == nil || channel == "" {
   233  		return
   234  	}
   235  
   236  	params := slack.PostMessageParameters{
   237  		Username:   "roachprod",
   238  		Markdown:   true,
   239  		EscapeText: false,
   240  	}
   241  	_, _, err = client.PostMessage(channel, fmt.Sprintf("`%s`", err), params)
   242  	if err != nil {
   243  		log.Println(err)
   244  	}
   245  }
   246  
   247  // shouldSend determines whether or not the given status was previously
   248  // sent to the channel.  The error returned by this function is
   249  // advisory; the boolean value is always a reasonable behavior.
   250  func shouldSend(channel string, status *status) (bool, error) {
   251  	hashDir := os.ExpandEnv(filepath.Join("${HOME}", ".roachprod", "slack"))
   252  	if err := os.MkdirAll(hashDir, 0755); err != nil {
   253  		return true, err
   254  	}
   255  	hashPath := os.ExpandEnv(filepath.Join(hashDir, "notification-"+channel))
   256  	fileBytes, err := ioutil.ReadFile(hashPath)
   257  	if err != nil && !os.IsNotExist(err) {
   258  		return true, err
   259  	}
   260  	oldHash := string(fileBytes)
   261  	newHash := status.notificationHash()
   262  
   263  	if newHash == oldHash {
   264  		return false, nil
   265  	}
   266  
   267  	return true, ioutil.WriteFile(hashPath, []byte(newHash), 0644)
   268  }
   269  
   270  // GCClusters checks all cluster to see if they should be deleted. It only
   271  // fails on failure to perform cloud actions. All others actions (load/save
   272  // file, email) do not abort.
   273  func GCClusters(cloud *Cloud, dryrun bool) error {
   274  	now := timeutil.Now()
   275  
   276  	var names []string
   277  	for name := range cloud.Clusters {
   278  		if name != config.Local {
   279  			names = append(names, name)
   280  		}
   281  	}
   282  	sort.Strings(names)
   283  
   284  	var s status
   285  	users := make(map[string]*status)
   286  	for _, name := range names {
   287  		c := cloud.Clusters[name]
   288  		u := users[c.User]
   289  		if u == nil {
   290  			u = &status{}
   291  			users[c.User] = u
   292  		}
   293  		s.add(c, now)
   294  		u.add(c, now)
   295  	}
   296  
   297  	// Compile list of "bad vms" and destroy them.
   298  	var badVMs vm.List
   299  	for _, vm := range cloud.BadInstances {
   300  		// We only delete "bad vms" if they were created more than 1h ago.
   301  		if now.Sub(vm.CreatedAt) >= time.Hour {
   302  			badVMs = append(badVMs, vm)
   303  		}
   304  	}
   305  
   306  	// Send out notification to #roachprod-status.
   307  	client := makeSlackClient()
   308  	channel, _ := findChannel(client, "roachprod-status")
   309  	postStatus(client, channel, dryrun, &s, badVMs)
   310  
   311  	// Send out user notifications if any of the user's clusters are expired or
   312  	// will be destroyed.
   313  	for user, status := range users {
   314  		if len(status.warn) > 0 || len(status.destroy) > 0 {
   315  			userChannel, err := findUserChannel(client, user+config.EmailDomain)
   316  			if err == nil {
   317  				postStatus(client, userChannel, dryrun, status, nil)
   318  			} else if !errors.Is(err, errNoSlackClient) {
   319  				log.Printf("could not deliver Slack DM to %s: %v", user+config.EmailDomain, err)
   320  			}
   321  		}
   322  	}
   323  
   324  	if !dryrun {
   325  		if len(badVMs) > 0 {
   326  			// Destroy bad VMs.
   327  			err := vm.FanOut(badVMs, func(p vm.Provider, vms vm.List) error {
   328  				return p.Delete(vms)
   329  			})
   330  			if err != nil {
   331  				postError(client, channel, err)
   332  			}
   333  		}
   334  
   335  		// Destroy expired clusters.
   336  		for _, c := range s.destroy {
   337  			if err := DestroyCluster(c); err != nil {
   338  				postError(client, channel, err)
   339  			}
   340  		}
   341  	}
   342  	return nil
   343  }