github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/server/updates.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package server

import (
	"bytes"
	"context"
	"encoding/json"
	"io"
	"io/ioutil"
	"math/rand"
	"net/http"
	"reflect"
	"runtime"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/build"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/server/diagnosticspb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/cloudinfo"
	"github.com/cockroachdb/cockroach/pkg/util/httputil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/gogo/protobuf/proto"
	"github.com/mitchellh/reflectwalk"
	"github.com/shirou/gopsutil/cpu"
	"github.com/shirou/gopsutil/host"
	"github.com/shirou/gopsutil/load"
	"github.com/shirou/gopsutil/mem"
)

const (
	updateCheckFrequency = time.Hour * 24
	// TODO(dt): switch to settings.
	updateCheckPostStartup    = time.Minute * 5
	updateCheckRetryFrequency = time.Hour
	updateMaxVersionsToReport = 3

	updateCheckJitterSeconds = 120
)

var diagnosticReportFrequency = settings.RegisterPublicNonNegativeDurationSetting(
	"diagnostics.reporting.interval",
	"interval at which diagnostics data should be reported",
	time.Hour,
)
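
// Since the reporting interval is registered as a public cluster setting, it
// can be tuned at runtime from a SQL session, for example:
//
//	SET CLUSTER SETTING diagnostics.reporting.interval = '2h';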

// addJitter randomly shifts `d` to be up to `jitterSec` seconds shorter or
// longer.
func addJitter(d time.Duration, jitterSec int) time.Duration {
	j := time.Duration(rand.Intn(jitterSec*2)-jitterSec) * time.Second
	return d + j
}
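
// For example, addJitter(24*time.Hour, 120) returns a duration in the range
// [24h-120s, 24h+119s]: rand.Intn(240) yields 0..239, so the shift spans
// -120s..+119s.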

type versionInfo struct {
	Version string `json:"version"`
	Details string `json:"details"`
}
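
// The updates server is expected to reply with JSON matching the struct tags
// above; a hypothetical response:
//
//	{"details": [{"version": "v20.2.0", "details": "https://..."}]}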

// PeriodicallyCheckForUpdates starts a background worker that periodically
// phones home to check for updates and report usage.
func (s *Server) PeriodicallyCheckForUpdates(ctx context.Context) {
	s.stopper.RunWorker(ctx, func(ctx context.Context) {
		defer log.RecoverAndReportNonfatalPanic(ctx, &s.st.SV)
		nextUpdateCheck := s.startTime
		nextDiagnosticReport := s.startTime

		var timer timeutil.Timer
		defer timer.Stop()
		for {
			now := timeutil.Now()
			runningTime := now.Sub(s.startTime)

			nextUpdateCheck = s.maybeCheckForUpdates(ctx, now, nextUpdateCheck, runningTime)
			nextDiagnosticReport = s.maybeReportDiagnostics(ctx, now, nextDiagnosticReport)

			sooner := nextUpdateCheck
			if nextDiagnosticReport.Before(sooner) {
				sooner = nextDiagnosticReport
			}

			timer.Reset(addJitter(sooner.Sub(timeutil.Now()), updateCheckJitterSeconds))
			select {
			case <-s.stopper.ShouldQuiesce():
				return
			case <-timer.C:
				timer.Read = true
			}
		}
	})
}

// maybeCheckForUpdates determines if it is time to check for updates and does
// so if it is, before returning the time at which the next check should be
// done.
func (s *Server) maybeCheckForUpdates(
	ctx context.Context, now, scheduled time.Time, runningTime time.Duration,
) time.Time {
	if scheduled.After(now) {
		return scheduled
	}

	// If diagnostics reporting is disabled, we should assume that the user
	// doesn't want us phoning home for new-version checks either.
	if !log.DiagnosticsReportingEnabled.Get(&s.st.SV) {
		return now.Add(updateCheckFrequency)
	}

	// checkForUpdates handles its own errors, but it returns a bool indicating
	// whether it succeeded, so we can schedule a re-attempt if it did not.
	if succeeded := s.checkForUpdates(ctx); !succeeded {
		return now.Add(updateCheckRetryFrequency)
	}

	// If we've just started up, we want to check again shortly after: startup
	// is when a message is most likely to actually be seen by a human operator,
	// so we check as early as possible, but that early check alone makes it
	// hard to differentiate real deployments from short-lived test instances,
	// hence the follow-up check at the one-hour mark.
	if runningTime < updateCheckPostStartup {
		return now.Add(time.Hour - runningTime)
	}

	return now.Add(updateCheckFrequency)
}

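// To illustrate the schedule implemented by maybeCheckForUpdates above: a node
// with reporting disabled, or one whose check succeeded in steady state,
// checks again in updateCheckFrequency (24h); a failed check retries in
// updateCheckRetryFrequency (1h); and a node that checked two minutes after
// starting re-checks at the one-hour mark (now + 1h - 2m).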
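
// fillHardwareInfo populates n with OS, virtualization, memory, CPU, load,
// and cloud-provider details. Each probe below is best-effort: if a gopsutil
// call fails, the corresponding fields are simply left at their zero values.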
func fillHardwareInfo(ctx context.Context, n *diagnosticspb.NodeInfo) {
	// Fill in hardware info (OS/CPU/Mem/etc).
	if platform, family, version, err := host.PlatformInformation(); err == nil {
		n.Os.Family = family
		n.Os.Platform = platform
		n.Os.Version = version
	}

	if virt, role, err := host.Virtualization(); err == nil && role == "guest" {
		n.Hardware.Virtualization = virt
	}

	if m, err := mem.VirtualMemory(); err == nil {
		n.Hardware.Mem.Available = m.Available
		n.Hardware.Mem.Total = m.Total
	}

	n.Hardware.Cpu.Numcpu = int32(runtime.NumCPU())
	if cpus, err := cpu.InfoWithContext(ctx); err == nil && len(cpus) > 0 {
		n.Hardware.Cpu.Sockets = int32(len(cpus))
		c := cpus[0]
		n.Hardware.Cpu.Cores = c.Cores
		n.Hardware.Cpu.Model = c.ModelName
		n.Hardware.Cpu.Mhz = float32(c.Mhz)
		n.Hardware.Cpu.Features = c.Flags
	}

	if l, err := load.AvgWithContext(ctx); err == nil {
		n.Hardware.Loadavg15 = float32(l.Load15)
	}

	n.Hardware.Provider, n.Hardware.InstanceClass = cloudinfo.GetInstanceClass(ctx)
	n.Topology.Provider, n.Topology.Region = cloudinfo.GetInstanceRegion(ctx)
}

// CheckForUpdates is part of the TestServerInterface.
func (s *Server) CheckForUpdates(ctx context.Context) {
	s.checkForUpdates(ctx)
}

// checkForUpdates calls home to check for new versions of the current platform
// and logs messages if it finds them, as well as if it encounters any errors.
// The returned boolean indicates whether the check succeeded (and thus does
// not need to be re-attempted by the scheduler after a retry interval).
func (s *Server) checkForUpdates(ctx context.Context) bool {
	ctx, span := s.AnnotateCtxWithSpan(ctx, "checkForUpdates")
	defer span.Finish()

	nodeInfo := s.collectNodeInfo(ctx)

	clusterInfo := diagnosticspb.ClusterInfo{
		ClusterID:  s.ClusterID(),
		IsInsecure: s.cfg.Insecure,
		IsInternal: sql.ClusterIsInternal(&s.st.SV),
	}
	var knobs *diagnosticspb.TestingKnobs
	if s.cfg.TestingKnobs.Server != nil {
		knobs = &s.cfg.TestingKnobs.Server.(*TestingKnobs).DiagnosticsTestingKnobs
	}
	updatesURL := diagnosticspb.BuildUpdatesURL(&clusterInfo, &nodeInfo, knobs)
	if updatesURL == nil {
		return true // don't bother asking for a retry -- we'll never succeed.
	}

	res, err := httputil.Get(ctx, updatesURL.String())
	if err != nil {
		// This is probably going to be relatively common in production
		// environments where network access is usually curtailed.
		return false
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		b, err := ioutil.ReadAll(res.Body)
		log.Warningf(ctx, "failed to check for updates: status: %s, body: %s, error: %v",
			res.Status, b, err)
		return false
	}

	decoder := json.NewDecoder(res.Body)
	r := struct {
		Details []versionInfo `json:"details"`
	}{}

	err = decoder.Decode(&r)
	if err != nil && err != io.EOF {
		log.Warningf(ctx, "error decoding updates info: %v", err)
		return false
	}

	// Ideally the updates server only returns the most relevant updates for us,
	// but if it replied with an excessive number of updates, limit log spam by
	// only printing the last few.
	if len(r.Details) > updateMaxVersionsToReport {
		r.Details = r.Details[len(r.Details)-updateMaxVersionsToReport:]
	}
	for _, v := range r.Details {
		log.Infof(ctx, "A new version is available: %s, details: %s", v.Version, v.Details)
	}
	return true
}

func (s *Server) maybeReportDiagnostics(ctx context.Context, now, scheduled time.Time) time.Time {
	if scheduled.After(now) {
		return scheduled
	}

	// TODO(dt): we should allow tuning the reset and report intervals separately.
	// Consider something like rand.Float() > resetFreq/reportFreq here to sample
	// stat reset periods for reporting.
	if log.DiagnosticsReportingEnabled.Get(&s.st.SV) {
		s.ReportDiagnostics(ctx)
	}
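
	// Schedule the next report relative to the previously scheduled time rather
	// than to now, so that reporting keeps a fixed cadence even if this run
	// fired late.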
	return scheduled.Add(diagnosticReportFrequency.Get(&s.st.SV))
}

func (s *Server) collectNodeInfo(ctx context.Context) diagnosticspb.NodeInfo {
	n := diagnosticspb.NodeInfo{
		NodeID: s.node.Descriptor.NodeID,
		Build:  build.GetInfo(),
		Uptime: int64(timeutil.Now().Sub(s.startTime).Seconds()),
	}

	licenseType, err := base.LicenseType(s.st)
	if err == nil {
		n.LicenseType = licenseType
	} else {
		log.Errorf(ctx, "error retrieving license type: %s", err)
	}

	fillHardwareInfo(ctx, &n)
	return n
}

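// getReportingInfo assembles the diagnostics payload for this node: node and
// store metrics, hashed locality tiers, a redacted schema, feature usage
// counters, non-default cluster settings, and anonymized zone configs.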
func (s *Server) getReportingInfo(
	ctx context.Context, reset telemetry.ResetCounters,
) *diagnosticspb.DiagnosticReport {
	info := diagnosticspb.DiagnosticReport{}
	n := s.node.recorder.GenerateNodeStatus(ctx)
	info.Node = s.collectNodeInfo(ctx)

	secret := sql.ClusterSecret.Get(&s.cfg.Settings.SV)
	// Add in the localities, hashing the user-defined tier keys and values.
	for _, tier := range s.node.Descriptor.Locality.Tiers {
		info.Node.Locality.Tiers = append(info.Node.Locality.Tiers, roachpb.Tier{
			Key:   sql.HashForReporting(secret, tier.Key),
			Value: sql.HashForReporting(secret, tier.Value),
		})
	}

	info.Stores = make([]diagnosticspb.StoreInfo, len(n.StoreStatuses))
	for i, r := range n.StoreStatuses {
		info.Stores[i].NodeID = r.Desc.Node.NodeID
		info.Stores[i].StoreID = r.Desc.StoreID
		info.Stores[i].KeyCount = int64(r.Metrics["keycount"])
		info.Stores[i].Capacity = int64(r.Metrics["capacity"])
		info.Stores[i].Available = int64(r.Metrics["capacity.available"])
		info.Stores[i].Used = int64(r.Metrics["capacity.used"])
		info.Node.KeyCount += info.Stores[i].KeyCount
		info.Stores[i].RangeCount = int64(r.Metrics["replicas"])
		info.Node.RangeCount += info.Stores[i].RangeCount
		bytes := int64(r.Metrics["sysbytes"] + r.Metrics["intentbytes"] + r.Metrics["valbytes"] + r.Metrics["keybytes"])
		info.Stores[i].Bytes = bytes
		info.Node.Bytes += bytes
		info.Stores[i].EncryptionAlgorithm = int64(r.Metrics["rocksdb.encryption.algorithm"])
	}

	schema, err := s.collectSchemaInfo(ctx)
	if err != nil {
		log.Warningf(ctx, "error collecting schema info for diagnostic report: %+v", err)
		schema = nil
	}
	info.Schema = schema

	info.FeatureUsage = telemetry.GetFeatureCounts(telemetry.Quantized, reset)

	// Read the system.settings table to determine the settings for which we have
	// explicitly set values -- the in-memory SV has the set and default values
	// flattened for quick reads, but we'd rather only report the non-defaults.
	if datums, err := s.sqlServer.internalExecutor.QueryEx(
		ctx, "read-setting", nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		"SELECT name FROM system.settings",
	); err != nil {
		log.Warningf(ctx, "failed to read settings: %s", err)
	} else {
		info.AlteredSettings = make(map[string]string, len(datums))
		for _, row := range datums {
			name := string(tree.MustBeDString(row[0]))
			info.AlteredSettings[name] = settings.RedactedValue(name, &s.st.SV)
		}
	}

	if datums, err := s.sqlServer.internalExecutor.QueryEx(
		ctx,
		"read-zone-configs",
		nil, /* txn */
		sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
		"SELECT id, config FROM system.zones",
	); err != nil {
		log.Warningf(ctx, "failed to read zone configs: %v", err)
	} else {
		info.ZoneConfigs = make(map[int64]zonepb.ZoneConfig)
		for _, row := range datums {
			id := int64(tree.MustBeDInt(row[0]))
			var zone zonepb.ZoneConfig
			bytes, ok := row[1].(*tree.DBytes)
			if !ok {
				continue
			}
			if err := protoutil.Unmarshal([]byte(*bytes), &zone); err != nil {
				log.Warningf(ctx, "unable to parse zone config %d: %v", id, err)
				continue
			}
			var anonymizedZone zonepb.ZoneConfig
			anonymizeZoneConfig(&anonymizedZone, zone, secret)
			info.ZoneConfigs[id] = anonymizedZone
		}
	}

	info.SqlStats = s.sqlServer.pgServer.SQLServer.GetScrubbedReportingStats()
	return &info
}

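// anonymizeZoneConfig copies the structural fields of src into dst, hashing
// anything user-defined (constraint keys and values, lease preference
// constraints, and subzone partition names) with the cluster secret so the
// report carries no user-chosen identifiers.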
func anonymizeZoneConfig(dst *zonepb.ZoneConfig, src zonepb.ZoneConfig, secret string) {
	if src.RangeMinBytes != nil {
		dst.RangeMinBytes = proto.Int64(*src.RangeMinBytes)
	}
	if src.RangeMaxBytes != nil {
		dst.RangeMaxBytes = proto.Int64(*src.RangeMaxBytes)
	}
	if src.GC != nil {
		dst.GC = &zonepb.GCPolicy{TTLSeconds: src.GC.TTLSeconds}
	}
	if src.NumReplicas != nil {
		dst.NumReplicas = proto.Int32(*src.NumReplicas)
	}
	dst.Constraints = make([]zonepb.ConstraintsConjunction, len(src.Constraints))
	for i := range src.Constraints {
		dst.Constraints[i].NumReplicas = src.Constraints[i].NumReplicas
		dst.Constraints[i].Constraints = make([]zonepb.Constraint, len(src.Constraints[i].Constraints))
		for j := range src.Constraints[i].Constraints {
			dst.Constraints[i].Constraints[j].Type = src.Constraints[i].Constraints[j].Type
			if key := src.Constraints[i].Constraints[j].Key; key != "" {
				dst.Constraints[i].Constraints[j].Key = sql.HashForReporting(secret, key)
			}
			if val := src.Constraints[i].Constraints[j].Value; val != "" {
				dst.Constraints[i].Constraints[j].Value = sql.HashForReporting(secret, val)
			}
		}
	}
	dst.LeasePreferences = make([]zonepb.LeasePreference, len(src.LeasePreferences))
	for i := range src.LeasePreferences {
		dst.LeasePreferences[i].Constraints = make([]zonepb.Constraint, len(src.LeasePreferences[i].Constraints))
		for j := range src.LeasePreferences[i].Constraints {
			dst.LeasePreferences[i].Constraints[j].Type = src.LeasePreferences[i].Constraints[j].Type
			if key := src.LeasePreferences[i].Constraints[j].Key; key != "" {
				dst.LeasePreferences[i].Constraints[j].Key = sql.HashForReporting(secret, key)
			}
			if val := src.LeasePreferences[i].Constraints[j].Value; val != "" {
				dst.LeasePreferences[i].Constraints[j].Value = sql.HashForReporting(secret, val)
			}
		}
	}
	dst.Subzones = make([]zonepb.Subzone, len(src.Subzones))
	for i := range src.Subzones {
		dst.Subzones[i].IndexID = src.Subzones[i].IndexID
		dst.Subzones[i].PartitionName = sql.HashForReporting(secret, src.Subzones[i].PartitionName)
		anonymizeZoneConfig(&dst.Subzones[i].Config, src.Subzones[i].Config, secret)
	}
}

// ReportDiagnostics is part of the TestServerInterface.
func (s *Server) ReportDiagnostics(ctx context.Context) {
	ctx, span := s.AnnotateCtxWithSpan(ctx, "usageReport")
	defer span.Finish()

	report := s.getReportingInfo(ctx, telemetry.ResetCounts)

	clusterInfo := diagnosticspb.ClusterInfo{
		ClusterID:  s.ClusterID(),
		IsInsecure: s.cfg.Insecure,
		IsInternal: sql.ClusterIsInternal(&s.st.SV),
	}
	var knobs *diagnosticspb.TestingKnobs
	if s.cfg.TestingKnobs.Server != nil {
		knobs = &s.cfg.TestingKnobs.Server.(*TestingKnobs).DiagnosticsTestingKnobs
	}
	reportingURL := diagnosticspb.BuildReportingURL(&clusterInfo, &report.Node, knobs)
	if reportingURL == nil {
		return
	}

	b, err := protoutil.Marshal(report)
	if err != nil {
		log.Warningf(ctx, "failed to marshal diagnostics report: %v", err)
		return
	}

	res, err := httputil.Post(
		ctx, reportingURL.String(), "application/x-protobuf", bytes.NewReader(b),
	)
	if err != nil {
		if log.V(2) {
			// This is probably going to be relatively common in production
			// environments where network access is usually curtailed.
			log.Warningf(ctx, "failed to report node usage metrics: %v", err)
		}
		return
	}
	defer res.Body.Close()
	b, err = ioutil.ReadAll(res.Body)
	if err != nil || res.StatusCode != http.StatusOK {
		log.Warningf(ctx, "failed to report node usage metrics: status: %s, body: %s, "+
			"error: %v", res.Status, b, err)
		return
	}
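	// Only reset the reported stats once the server has accepted the report,
	// so that a failed upload does not discard the stats accumulated since the
	// last successful report.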
	s.sqlServer.pgServer.SQLServer.ResetReportedStats(ctx)
}

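// collectSchemaInfo scans the descriptor table and returns the user table
// descriptors, with every string field redacted so the reported schema
// reveals structure but no user-provided names or values.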
func (s *Server) collectSchemaInfo(ctx context.Context) ([]sqlbase.TableDescriptor, error) {
	startKey := keys.TODOSQLCodec.TablePrefix(keys.DescriptorTableID)
	endKey := startKey.PrefixEnd()
	kvs, err := s.db.Scan(ctx, startKey, endKey, 0)
	if err != nil {
		return nil, err
	}
	tables := make([]sqlbase.TableDescriptor, 0, len(kvs))
	redactor := stringRedactor{}
	for _, kv := range kvs {
		var desc sqlbase.Descriptor
		if err := kv.ValueProto(&desc); err != nil {
			return nil, errors.Wrapf(err, "%s: unable to unmarshal SQL descriptor", kv.Key)
		}
		if t := desc.Table(kv.Value.Timestamp); t != nil && t.ID > keys.MaxReservedDescID {
			if err := reflectwalk.Walk(t, redactor); err != nil {
				panic(err) // stringRedactor never returns a non-nil err
			}
			tables = append(tables, *t)
		}
	}
	return tables, nil
}

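// stringRedactor is a reflectwalk visitor that replaces every non-empty
// string it encounters with "_".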
type stringRedactor struct{}

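// Primitive implements the reflectwalk.PrimitiveWalker interface.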
func (stringRedactor) Primitive(v reflect.Value) error {
	if v.Kind() == reflect.String && v.String() != "" {
		v.Set(reflect.ValueOf("_"))
	}
	return nil
}