github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/cmd/roachprod/cloud/gc.go (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package cloud 12 13 import ( 14 "encoding/base64" 15 "fmt" 16 "hash/fnv" 17 "io/ioutil" 18 "log" 19 "os" 20 "path/filepath" 21 "sort" 22 "strings" 23 "text/tabwriter" 24 "time" 25 26 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/config" 27 "github.com/cockroachdb/cockroach/pkg/cmd/roachprod/vm" 28 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 29 "github.com/cockroachdb/errors" 30 "github.com/nlopes/slack" 31 ) 32 33 var errNoSlackClient = fmt.Errorf("no Slack client") 34 35 type status struct { 36 good []*Cluster 37 warn []*Cluster 38 destroy []*Cluster 39 } 40 41 func (s *status) add(c *Cluster, now time.Time) { 42 exp := c.ExpiresAt() 43 if exp.After(now) { 44 if exp.Before(now.Add(2 * time.Hour)) { 45 s.warn = append(s.warn, c) 46 } else { 47 s.good = append(s.good, c) 48 } 49 } else { 50 s.destroy = append(s.destroy, c) 51 } 52 } 53 54 // messageHash computes a base64-encoded hash value to show whether 55 // or not two status values would result in a duplicate 56 // notification to a user. 57 func (s *status) notificationHash() string { 58 // Use stdlib hash function, since we don't need any crypto guarantees 59 hash := fnv.New32a() 60 61 for i, list := range [][]*Cluster{s.good, s.warn, s.destroy} { 62 _, _ = hash.Write([]byte{byte(i)}) 63 64 var data []string 65 for _, c := range list { 66 // Deduplicate by cluster name and expiration time 67 data = append(data, fmt.Sprintf("%s %s", c.Name, c.ExpiresAt())) 68 } 69 // Ensure results are stable 70 sort.Strings(data) 71 72 for _, d := range data { 73 _, _ = hash.Write([]byte(d)) 74 } 75 } 76 77 bytes := hash.Sum(nil) 78 return base64.StdEncoding.EncodeToString(bytes) 79 } 80 81 func makeSlackClient() *slack.Client { 82 if config.SlackToken == "" { 83 return nil 84 } 85 client := slack.New(config.SlackToken) 86 // client.SetDebug(true) 87 return client 88 } 89 90 func findChannel(client *slack.Client, name string) (string, error) { 91 if client != nil { 92 channels, err := client.GetChannels(true) 93 if err != nil { 94 return "", err 95 } 96 for _, channel := range channels { 97 if channel.Name == name { 98 return channel.ID, nil 99 } 100 } 101 } 102 return "", fmt.Errorf("not found") 103 } 104 105 func findUserChannel(client *slack.Client, email string) (string, error) { 106 if client == nil { 107 return "", errNoSlackClient 108 } 109 u, err := client.GetUserByEmail(email) 110 if err != nil { 111 return "", err 112 } 113 _, _, channelID, err := client.OpenIMChannel(u.ID) 114 return channelID, err 115 } 116 117 func postStatus(client *slack.Client, channel string, dryrun bool, s *status, badVMs vm.List) { 118 if dryrun { 119 tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', 0) 120 for _, c := range s.good { 121 fmt.Fprintf(tw, "good:\t%s\t%s\t(%s)\n", c.Name, 122 c.GCAt().Format(time.Stamp), 123 c.LifetimeRemaining().Round(time.Second)) 124 } 125 for _, c := range s.warn { 126 fmt.Fprintf(tw, "warn:\t%s\t%s\t(%s)\n", c.Name, 127 c.GCAt().Format(time.Stamp), 128 c.LifetimeRemaining().Round(time.Second)) 129 } 130 for _, c := range s.destroy { 131 fmt.Fprintf(tw, "destroy:\t%s\t%s\t(%s)\n", c.Name, 132 c.GCAt().Format(time.Stamp), 133 c.LifetimeRemaining().Round(time.Second)) 134 } 135 _ = tw.Flush() 136 } 137 138 if client == nil || channel == "" { 139 return 140 } 141 142 // Debounce messages, unless we have badVMs since that indicates 143 // a problem that needs manual intervention 144 if len(badVMs) == 0 { 145 send, err := shouldSend(channel, s) 146 if err != nil { 147 log.Printf("unable to deduplicate notification: %s", err) 148 } 149 if !send { 150 return 151 } 152 } 153 154 makeStatusFields := func(clusters []*Cluster) []slack.AttachmentField { 155 var names []string 156 var expirations []string 157 for _, c := range clusters { 158 names = append(names, c.Name) 159 expirations = append(expirations, 160 fmt.Sprintf("<!date^%[1]d^{date_short_pretty} {time}|%[2]s>", 161 c.GCAt().Unix(), 162 c.LifetimeRemaining().Round(time.Second))) 163 } 164 return []slack.AttachmentField{ 165 { 166 Title: "name", 167 Value: strings.Join(names, "\n"), 168 Short: true, 169 }, 170 { 171 Title: "expiration", 172 Value: strings.Join(expirations, "\n"), 173 Short: true, 174 }, 175 } 176 } 177 178 params := slack.PostMessageParameters{ 179 Username: "roachprod", 180 } 181 fallback := fmt.Sprintf("clusters: %d live, %d expired, %d destroyed", 182 len(s.good), len(s.warn), len(s.destroy)) 183 if len(s.good) > 0 { 184 params.Attachments = append(params.Attachments, 185 slack.Attachment{ 186 Color: "good", 187 Title: "Live Clusters", 188 Fallback: fallback, 189 Fields: makeStatusFields(s.good), 190 }) 191 } 192 if len(s.warn) > 0 { 193 params.Attachments = append(params.Attachments, 194 slack.Attachment{ 195 Color: "warning", 196 Title: "Expiring Clusters", 197 Fallback: fallback, 198 Fields: makeStatusFields(s.warn), 199 }) 200 } 201 if len(s.destroy) > 0 { 202 params.Attachments = append(params.Attachments, 203 slack.Attachment{ 204 Color: "danger", 205 Title: "Destroyed Clusters", 206 Fallback: fallback, 207 Fields: makeStatusFields(s.destroy), 208 }) 209 } 210 if len(badVMs) > 0 { 211 var names []string 212 for _, vm := range badVMs { 213 names = append(names, vm.Name) 214 } 215 sort.Strings(names) 216 params.Attachments = append(params.Attachments, 217 slack.Attachment{ 218 Color: "danger", 219 Title: "Bad VMs", 220 Text: strings.Join(names, "\n"), 221 }) 222 } 223 224 _, _, err := client.PostMessage(channel, "", params) 225 if err != nil { 226 log.Println(err) 227 } 228 } 229 230 func postError(client *slack.Client, channel string, err error) { 231 log.Println(err) 232 if client == nil || channel == "" { 233 return 234 } 235 236 params := slack.PostMessageParameters{ 237 Username: "roachprod", 238 Markdown: true, 239 EscapeText: false, 240 } 241 _, _, err = client.PostMessage(channel, fmt.Sprintf("`%s`", err), params) 242 if err != nil { 243 log.Println(err) 244 } 245 } 246 247 // shouldSend determines whether or not the given status was previously 248 // sent to the channel. The error returned by this function is 249 // advisory; the boolean value is always a reasonable behavior. 250 func shouldSend(channel string, status *status) (bool, error) { 251 hashDir := os.ExpandEnv(filepath.Join("${HOME}", ".roachprod", "slack")) 252 if err := os.MkdirAll(hashDir, 0755); err != nil { 253 return true, err 254 } 255 hashPath := os.ExpandEnv(filepath.Join(hashDir, "notification-"+channel)) 256 fileBytes, err := ioutil.ReadFile(hashPath) 257 if err != nil && !os.IsNotExist(err) { 258 return true, err 259 } 260 oldHash := string(fileBytes) 261 newHash := status.notificationHash() 262 263 if newHash == oldHash { 264 return false, nil 265 } 266 267 return true, ioutil.WriteFile(hashPath, []byte(newHash), 0644) 268 } 269 270 // GCClusters checks all cluster to see if they should be deleted. It only 271 // fails on failure to perform cloud actions. All others actions (load/save 272 // file, email) do not abort. 273 func GCClusters(cloud *Cloud, dryrun bool) error { 274 now := timeutil.Now() 275 276 var names []string 277 for name := range cloud.Clusters { 278 if name != config.Local { 279 names = append(names, name) 280 } 281 } 282 sort.Strings(names) 283 284 var s status 285 users := make(map[string]*status) 286 for _, name := range names { 287 c := cloud.Clusters[name] 288 u := users[c.User] 289 if u == nil { 290 u = &status{} 291 users[c.User] = u 292 } 293 s.add(c, now) 294 u.add(c, now) 295 } 296 297 // Compile list of "bad vms" and destroy them. 298 var badVMs vm.List 299 for _, vm := range cloud.BadInstances { 300 // We only delete "bad vms" if they were created more than 1h ago. 301 if now.Sub(vm.CreatedAt) >= time.Hour { 302 badVMs = append(badVMs, vm) 303 } 304 } 305 306 // Send out notification to #roachprod-status. 307 client := makeSlackClient() 308 channel, _ := findChannel(client, "roachprod-status") 309 postStatus(client, channel, dryrun, &s, badVMs) 310 311 // Send out user notifications if any of the user's clusters are expired or 312 // will be destroyed. 313 for user, status := range users { 314 if len(status.warn) > 0 || len(status.destroy) > 0 { 315 userChannel, err := findUserChannel(client, user+config.EmailDomain) 316 if err == nil { 317 postStatus(client, userChannel, dryrun, status, nil) 318 } else if !errors.Is(err, errNoSlackClient) { 319 log.Printf("could not deliver Slack DM to %s: %v", user+config.EmailDomain, err) 320 } 321 } 322 } 323 324 if !dryrun { 325 if len(badVMs) > 0 { 326 // Destroy bad VMs. 327 err := vm.FanOut(badVMs, func(p vm.Provider, vms vm.List) error { 328 return p.Delete(vms) 329 }) 330 if err != nil { 331 postError(client, channel, err) 332 } 333 } 334 335 // Destroy expired clusters. 336 for _, c := range s.destroy { 337 if err := DestroyCluster(c); err != nil { 338 postError(client, channel, err) 339 } 340 } 341 } 342 return nil 343 }