github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/updates.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"bytes"
	"context"
	"encoding/json"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"reflect"
	"runtime"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/build"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/server/diagnosticspb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/cloudinfo"
	"github.com/cockroachdb/cockroach/pkg/util/httputil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
	"github.com/mitchellh/reflectwalk"
	"github.com/shirou/gopsutil/cpu"
	"github.com/shirou/gopsutil/host"
	"github.com/shirou/gopsutil/load"
	"github.com/shirou/gopsutil/mem"
)

const (
	updateCheckFrequency = time.Hour * 24
	// TODO(dt): switch to settings.
	updateCheckPostStartup    = time.Minute * 5
	updateCheckRetryFrequency = time.Hour
	updateMaxVersionsToReport = 3

	updateCheckJitterSeconds = 120
)

var diagnosticReportFrequency = settings.RegisterPublicNonNegativeDurationSetting(
	"diagnostics.reporting.interval",
	"interval at which diagnostics data should be reported",
	time.Hour,
)

// addJitter randomly shifts `d` to be up to `jitterSec` seconds shorter or longer.
func addJitter(d time.Duration, jitterSec int) time.Duration {
	j := time.Duration(rand.Intn(jitterSec*2)-jitterSec) * time.Second
	return d + j
}

type versionInfo struct {
	Version string `json:"version"`
	Details string `json:"details"`
}

// PeriodicallyCheckForUpdates starts a background worker that periodically
// phones home to check for updates and report usage.
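//
// Both the update check and the diagnostics report share a single loop: each
// iteration computes the earlier of the two next-run times, sleeps until then
// with up to updateCheckJitterSeconds (two minutes) of random jitter in either
// direction, and exits once the stopper begins quiescing.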
func (s *Server) PeriodicallyCheckForUpdates(ctx context.Context) {
	s.stopper.RunWorker(ctx, func(ctx context.Context) {
		defer log.RecoverAndReportNonfatalPanic(ctx, &s.st.SV)
		nextUpdateCheck := s.startTime
		nextDiagnosticReport := s.startTime

		var timer timeutil.Timer
		defer timer.Stop()
		for {
			now := timeutil.Now()
			runningTime := now.Sub(s.startTime)

			nextUpdateCheck = s.maybeCheckForUpdates(ctx, now, nextUpdateCheck, runningTime)
			nextDiagnosticReport = s.maybeReportDiagnostics(ctx, now, nextDiagnosticReport)

			sooner := nextUpdateCheck
			if nextDiagnosticReport.Before(sooner) {
				sooner = nextDiagnosticReport
			}

			timer.Reset(addJitter(sooner.Sub(timeutil.Now()), updateCheckJitterSeconds))
			select {
			case <-s.stopper.ShouldQuiesce():
				return
			case <-timer.C:
				timer.Read = true
			}
		}
	})
}

// maybeCheckForUpdates determines if it is time to check for updates and does
// so if it is, before returning the time at which the next check should be done.
func (s *Server) maybeCheckForUpdates(
	ctx context.Context, now, scheduled time.Time, runningTime time.Duration,
) time.Time {
	if scheduled.After(now) {
		return scheduled
	}

	// If diagnostics reporting is disabled, assume the user doesn't want us
	// phoning home for new-version checks either.
	if !log.DiagnosticsReportingEnabled.Get(&s.st.SV) {
		return now.Add(updateCheckFrequency)
	}

	// checkForUpdates handles its own errors, but it returns a bool indicating if
	// it succeeded, so we can schedule a re-attempt if it did not.
	if succeeded := s.checkForUpdates(ctx); !succeeded {
		return now.Add(updateCheckRetryFrequency)
	}

	// If we've just started up, we want to check again shortly after.
	// During startup is when a message is most likely to be actually seen by a
	// human operator so we check as early as possible, but this makes it hard to
	// differentiate real deployments vs short-lived instances for tests.
	if runningTime < updateCheckPostStartup {
		return now.Add(time.Hour - runningTime)
	}

	return now.Add(updateCheckFrequency)
}

func fillHardwareInfo(ctx context.Context, n *diagnosticspb.NodeInfo) {
	// Fill in hardware info (OS/CPU/Mem/etc).
	if platform, family, version, err := host.PlatformInformation(); err == nil {
		n.Os.Family = family
		n.Os.Platform = platform
		n.Os.Version = version
	}

	if virt, role, err := host.Virtualization(); err == nil && role == "guest" {
		n.Hardware.Virtualization = virt
	}

	if m, err := mem.VirtualMemory(); err == nil {
		n.Hardware.Mem.Available = m.Available
		n.Hardware.Mem.Total = m.Total
	}

	n.Hardware.Cpu.Numcpu = int32(runtime.NumCPU())
	if cpus, err := cpu.InfoWithContext(ctx); err == nil && len(cpus) > 0 {
		n.Hardware.Cpu.Sockets = int32(len(cpus))
		c := cpus[0]
		n.Hardware.Cpu.Cores = c.Cores
		n.Hardware.Cpu.Model = c.ModelName
		n.Hardware.Cpu.Mhz = float32(c.Mhz)
		n.Hardware.Cpu.Features = c.Flags
	}

	if l, err := load.AvgWithContext(ctx); err == nil {
		n.Hardware.Loadavg15 = float32(l.Load15)
	}

	n.Hardware.Provider, n.Hardware.InstanceClass = cloudinfo.GetInstanceClass(ctx)
	n.Topology.Provider, n.Topology.Region = cloudinfo.GetInstanceRegion(ctx)
}

// CheckForUpdates is part of the TestServerInterface.
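// It performs a single, synchronous check and discards the success result;
// the retry scheduling done by maybeCheckForUpdates is not involved here.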
func (s *Server) CheckForUpdates(ctx context.Context) {
	s.checkForUpdates(ctx)
}

// checkForUpdates calls home to check for new versions for the current platform
// and logs messages if it finds them, as well as if it encounters any errors.
// The returned boolean indicates if the check succeeded (and thus does not need
// to be re-attempted by the scheduler after a retry-interval).
func (s *Server) checkForUpdates(ctx context.Context) bool {
	ctx, span := s.AnnotateCtxWithSpan(ctx, "checkForUpdates")
	defer span.Finish()

	nodeInfo := s.collectNodeInfo(ctx)

	clusterInfo := diagnosticspb.ClusterInfo{
		ClusterID:  s.ClusterID(),
		IsInsecure: s.cfg.Insecure,
		IsInternal: sql.ClusterIsInternal(&s.st.SV),
	}
	var knobs *diagnosticspb.TestingKnobs
	if s.cfg.TestingKnobs.Server != nil {
		knobs = &s.cfg.TestingKnobs.Server.(*TestingKnobs).DiagnosticsTestingKnobs
	}
	updatesURL := diagnosticspb.BuildUpdatesURL(&clusterInfo, &nodeInfo, knobs)
	if updatesURL == nil {
		return true // don't bother with asking for retry -- we'll never succeed.
	}

	res, err := httputil.Get(ctx, updatesURL.String())
	if err != nil {
		// This is probably going to be relatively common in production
		// environments where network access is usually curtailed.
		return false
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		b, err := ioutil.ReadAll(res.Body)
		log.Warningf(ctx, "failed to check for updates: status: %s, body: %s, error: %v",
			res.Status, b, err)
		return false
	}

	decoder := json.NewDecoder(res.Body)
	r := struct {
		Details []versionInfo `json:"details"`
	}{}

	err = decoder.Decode(&r)
	if err != nil && err != io.EOF {
		log.Warningf(ctx, "Error decoding updates info: %v", err)
		return false
	}

	// Ideally the updates server only returns the most relevant updates for us,
	// but if it replied with an excessive number of updates, limit log spam by
	// only printing the last few.
	if len(r.Details) > updateMaxVersionsToReport {
		r.Details = r.Details[len(r.Details)-updateMaxVersionsToReport:]
	}
	for _, v := range r.Details {
		log.Infof(ctx, "A new version is available: %s, details: %s", v.Version, v.Details)
	}
	return true
}

func (s *Server) maybeReportDiagnostics(ctx context.Context, now, scheduled time.Time) time.Time {
	if scheduled.After(now) {
		return scheduled
	}

	// TODO(dt): we should allow tuning the reset and report intervals separately.
	// Consider something like rand.Float() > resetFreq/reportFreq here to sample
	// stat reset periods for reporting.
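	//
	// Note that the next report is scheduled relative to the previously
	// scheduled time rather than to now, so a slow or skipped report does not
	// drift the reporting cadence; the report itself only runs when diagnostics
	// reporting is enabled.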
	if log.DiagnosticsReportingEnabled.Get(&s.st.SV) {
		s.ReportDiagnostics(ctx)
	}

	return scheduled.Add(diagnosticReportFrequency.Get(&s.st.SV))
}

func (s *Server) collectNodeInfo(ctx context.Context) diagnosticspb.NodeInfo {
	n := diagnosticspb.NodeInfo{
		NodeID: s.node.Descriptor.NodeID,
		Build:  build.GetInfo(),
		Uptime: int64(timeutil.Now().Sub(s.startTime).Seconds()),
	}

	licenseType, err := base.LicenseType(s.st)
	if err == nil {
		n.LicenseType = licenseType
	} else {
		log.Errorf(ctx, "error retrieving license type: %s", err)
	}

	fillHardwareInfo(ctx, &n)
	return n
}

func (s *Server) getReportingInfo(
	ctx context.Context, reset telemetry.ResetCounters,
) *diagnosticspb.DiagnosticReport {
	info := diagnosticspb.DiagnosticReport{}
	n := s.node.recorder.GenerateNodeStatus(ctx)
	info.Node = s.collectNodeInfo(ctx)

	secret := sql.ClusterSecret.Get(&s.cfg.Settings.SV)
	// Add in the localities.
	for _, tier := range s.node.Descriptor.Locality.Tiers {
		info.Node.Locality.Tiers = append(info.Node.Locality.Tiers, roachpb.Tier{
			Key:   sql.HashForReporting(secret, tier.Key),
			Value: sql.HashForReporting(secret, tier.Value),
		})
	}

	info.Stores = make([]diagnosticspb.StoreInfo, len(n.StoreStatuses))
	for i, r := range n.StoreStatuses {
		info.Stores[i].NodeID = r.Desc.Node.NodeID
		info.Stores[i].StoreID = r.Desc.StoreID
		info.Stores[i].KeyCount = int64(r.Metrics["keycount"])
		info.Stores[i].Capacity = int64(r.Metrics["capacity"])
		info.Stores[i].Available = int64(r.Metrics["capacity.available"])
		info.Stores[i].Used = int64(r.Metrics["capacity.used"])
		info.Node.KeyCount += info.Stores[i].KeyCount
		info.Stores[i].RangeCount = int64(r.Metrics["replicas"])
		info.Node.RangeCount += info.Stores[i].RangeCount
		bytes := int64(r.Metrics["sysbytes"] + r.Metrics["intentbytes"] + r.Metrics["valbytes"] + r.Metrics["keybytes"])
		info.Stores[i].Bytes = bytes
		info.Node.Bytes += bytes
		info.Stores[i].EncryptionAlgorithm = int64(r.Metrics["rocksdb.encryption.algorithm"])
	}

	schema, err := s.collectSchemaInfo(ctx)
	if err != nil {
		log.Warningf(ctx, "error collecting schema info for diagnostic report: %+v", err)
		schema = nil
	}
	info.Schema = schema

	info.FeatureUsage = telemetry.GetFeatureCounts(telemetry.Quantized, reset)

	// Read the system.settings table to determine the settings for which we have
	// explicitly set values -- the in-memory SV has the set and default values
	// flattened for quick reads, but we'd rather only report the non-defaults.
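	//
	// Only the names of explicitly-set settings are read here; the reported
	// values come from settings.RedactedValue below rather than from the raw
	// table contents.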
	if datums, err := s.sqlServer.internalExecutor.QueryEx(
		ctx, "read-setting", nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		"SELECT name FROM system.settings",
	); err != nil {
		log.Warningf(ctx, "failed to read settings: %s", err)
	} else {
		info.AlteredSettings = make(map[string]string, len(datums))
		for _, row := range datums {
			name := string(tree.MustBeDString(row[0]))
			info.AlteredSettings[name] = settings.RedactedValue(name, &s.st.SV)
		}
	}

	if datums, err := s.sqlServer.internalExecutor.QueryEx(
		ctx,
		"read-zone-configs",
		nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		"SELECT id, config FROM system.zones",
	); err != nil {
		log.Warningf(ctx, "%v", err)
	} else {
		info.ZoneConfigs = make(map[int64]zonepb.ZoneConfig)
		for _, row := range datums {
			id := int64(tree.MustBeDInt(row[0]))
			var zone zonepb.ZoneConfig
			if bytes, ok := row[1].(*tree.DBytes); !ok {
				continue
			} else {
				if err := protoutil.Unmarshal([]byte(*bytes), &zone); err != nil {
					log.Warningf(ctx, "unable to parse zone config %d: %v", id, err)
					continue
				}
			}
			var anonymizedZone zonepb.ZoneConfig
			anonymizeZoneConfig(&anonymizedZone, zone, secret)
			info.ZoneConfigs[id] = anonymizedZone
		}
	}

	info.SqlStats = s.sqlServer.pgServer.SQLServer.GetScrubbedReportingStats()
	return &info
}

func anonymizeZoneConfig(dst *zonepb.ZoneConfig, src zonepb.ZoneConfig, secret string) {
	if src.RangeMinBytes != nil {
		dst.RangeMinBytes = proto.Int64(*src.RangeMinBytes)
	}
	if src.RangeMaxBytes != nil {
		dst.RangeMaxBytes = proto.Int64(*src.RangeMaxBytes)
	}
	if src.GC != nil {
		dst.GC = &zonepb.GCPolicy{TTLSeconds: src.GC.TTLSeconds}
	}
	if src.NumReplicas != nil {
		dst.NumReplicas = proto.Int32(*src.NumReplicas)
	}
	dst.Constraints = make([]zonepb.ConstraintsConjunction, len(src.Constraints))
	for i := range src.Constraints {
		dst.Constraints[i].NumReplicas = src.Constraints[i].NumReplicas
		dst.Constraints[i].Constraints = make([]zonepb.Constraint, len(src.Constraints[i].Constraints))
		for j := range src.Constraints[i].Constraints {
			dst.Constraints[i].Constraints[j].Type = src.Constraints[i].Constraints[j].Type
			if key := src.Constraints[i].Constraints[j].Key; key != "" {
				dst.Constraints[i].Constraints[j].Key = sql.HashForReporting(secret, key)
			}
			if val := src.Constraints[i].Constraints[j].Value; val != "" {
				dst.Constraints[i].Constraints[j].Value = sql.HashForReporting(secret, val)
			}
		}
	}
	dst.LeasePreferences = make([]zonepb.LeasePreference, len(src.LeasePreferences))
	for i := range src.LeasePreferences {
		dst.LeasePreferences[i].Constraints = make([]zonepb.Constraint, len(src.LeasePreferences[i].Constraints))
		for j := range src.LeasePreferences[i].Constraints {
			dst.LeasePreferences[i].Constraints[j].Type = src.LeasePreferences[i].Constraints[j].Type
			if key := src.LeasePreferences[i].Constraints[j].Key; key != "" {
				dst.LeasePreferences[i].Constraints[j].Key = sql.HashForReporting(secret, key)
			}
			if val := src.LeasePreferences[i].Constraints[j].Value; val != "" {
				dst.LeasePreferences[i].Constraints[j].Value = sql.HashForReporting(secret, val)
			}
		}
	}
	dst.Subzones = make([]zonepb.Subzone, len(src.Subzones))
	for i := range src.Subzones {
		dst.Subzones[i].IndexID = src.Subzones[i].IndexID
		dst.Subzones[i].PartitionName = sql.HashForReporting(secret, src.Subzones[i].PartitionName)
		anonymizeZoneConfig(&dst.Subzones[i].Config, src.Subzones[i].Config, secret)
	}
}

// ReportDiagnostics is part of the TestServerInterface.
func (s *Server) ReportDiagnostics(ctx context.Context) {
	ctx, span := s.AnnotateCtxWithSpan(ctx, "usageReport")
	defer span.Finish()

	report := s.getReportingInfo(ctx, telemetry.ResetCounts)

	clusterInfo := diagnosticspb.ClusterInfo{
		ClusterID:  s.ClusterID(),
		IsInsecure: s.cfg.Insecure,
		IsInternal: sql.ClusterIsInternal(&s.st.SV),
	}
	var knobs *diagnosticspb.TestingKnobs
	if s.cfg.TestingKnobs.Server != nil {
		knobs = &s.cfg.TestingKnobs.Server.(*TestingKnobs).DiagnosticsTestingKnobs
	}
	reportingURL := diagnosticspb.BuildReportingURL(&clusterInfo, &report.Node, knobs)
	if reportingURL == nil {
		return
	}

	b, err := protoutil.Marshal(report)
	if err != nil {
		log.Warningf(ctx, "%v", err)
		return
	}

	res, err := httputil.Post(
		ctx, reportingURL.String(), "application/x-protobuf", bytes.NewReader(b),
	)
	if err != nil {
		if log.V(2) {
			// This is probably going to be relatively common in production
			// environments where network access is usually curtailed.
			log.Warningf(ctx, "failed to report node usage metrics: %v", err)
		}
		return
	}
	defer res.Body.Close()
	b, err = ioutil.ReadAll(res.Body)
	if err != nil || res.StatusCode != http.StatusOK {
		log.Warningf(ctx, "failed to report node usage metrics: status: %s, body: %s, "+
			"error: %v", res.Status, b, err)
		return
	}
	s.sqlServer.pgServer.SQLServer.ResetReportedStats(ctx)
}

func (s *Server) collectSchemaInfo(ctx context.Context) ([]sqlbase.TableDescriptor, error) {
	startKey := keys.TODOSQLCodec.TablePrefix(keys.DescriptorTableID)
	endKey := startKey.PrefixEnd()
	kvs, err := s.db.Scan(ctx, startKey, endKey, 0)
	if err != nil {
		return nil, err
	}
	tables := make([]sqlbase.TableDescriptor, 0, len(kvs))
	redactor := stringRedactor{}
	for _, kv := range kvs {
		var desc sqlbase.Descriptor
		if err := kv.ValueProto(&desc); err != nil {
			return nil, errors.Wrapf(err, "%s: unable to unmarshal SQL descriptor", kv.Key)
		}
		if t := desc.Table(kv.Value.Timestamp); t != nil && t.ID > keys.MaxReservedDescID {
			if err := reflectwalk.Walk(t, redactor); err != nil {
				panic(err) // stringRedactor never returns a non-nil err
			}
			tables = append(tables, *t)
		}
	}
	return tables, nil
}

type stringRedactor struct{}

func (stringRedactor) Primitive(v reflect.Value) error {
	if v.Kind() == reflect.String && v.String() != "" {
		v.Set(reflect.ValueOf("_"))
	}
	return nil
}