github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/reports/constraint_stats_report.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package reports
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"strings"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/config"
    21  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    22  	"github.com/cockroachdb/cockroach/pkg/kv"
    23  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/errors"
    28  )
    29  
// replicationConstraintsReportID is the id of the row in the
// system.reports_meta table corresponding to the constraints conformance
// report (i.e. the report stored in the system.replication_constraint_stats
// table).
const replicationConstraintsReportID reportID = 1
    34  
// ConstraintReport contains information about the constraint conformance for
// the cluster's data. It maps each ConstraintStatusKey (zone, violation type
// and constraint) to the ConstraintStatus recorded for it.
type ConstraintReport map[ConstraintStatusKey]ConstraintStatus
    38  
// replicationConstraintStatsReportSaver deals with saving a ConstraintReport
// to the database. The idea is for it to be used to save new versions of the
// report over and over. It maintains the previously-saved version of the
// report in order to speed up the saving of the next one.
//
// Fields:
// - previousVersion is the report that was last saved by, or loaded into,
//   this saver; Save compares new reports against it.
// - lastGenerated is the reportTS of the last successful Save; it is the zero
//   time until a Save succeeds.
// - lastUpdatedRowCount counts the rows upserted or deleted by the most
//   recent Save.
type replicationConstraintStatsReportSaver struct {
	previousVersion     ConstraintReport
	lastGenerated       time.Time
	lastUpdatedRowCount int
}
    48  
    49  // makeReplicationConstraintStatusReportSaver creates a new report saver.
    50  func makeReplicationConstraintStatusReportSaver() replicationConstraintStatsReportSaver {
    51  	return replicationConstraintStatsReportSaver{}
    52  }
    53  
// LastUpdatedRowCount is the count of the rows that were touched (upserted or
// deleted) during the last save.
func (r *replicationConstraintStatsReportSaver) LastUpdatedRowCount() int {
	return r.lastUpdatedRowCount
}
    58  
// ConstraintStatus is the leaf (i.e. the map value) in the ConstraintReport.
type ConstraintStatus struct {
	// FailRangeCount is the number of ranges found violating the constraint;
	// AddViolation increments it by one per call.
	FailRangeCount int
}
    63  
    64  // ConstraintType indicates what type of constraint is an entry in the
    65  // constraint conformance report talking about.
    66  type ConstraintType string
    67  
    68  const (
    69  	// Constraint means that the entry refers to a constraint (i.e. a member of
    70  	// the constraints field in a zone config).
    71  	Constraint ConstraintType = "constraint"
    72  	// TODO(andrei): add leaseholder preference
    73  )
    74  
    75  // Less compares two ConstraintTypes.
    76  func (t ConstraintType) Less(other ConstraintType) bool {
    77  	return -1 == strings.Compare(string(t), string(other))
    78  }
    79  
    80  // ConstraintRepr is a string representation of a constraint.
    81  type ConstraintRepr string
    82  
    83  // Less compares two ConstraintReprs.
    84  func (c ConstraintRepr) Less(other ConstraintRepr) bool {
    85  	return -1 == strings.Compare(string(c), string(other))
    86  }
    87  
// ConstraintStatusKey represents the key in the ConstraintReport. It combines
// the zone the entry belongs to (the embedded ZoneKey), the type of the
// constraint (ViolationType) and the constraint's string representation
// (Constraint).
type ConstraintStatusKey struct {
	ZoneKey
	ViolationType ConstraintType
	Constraint    ConstraintRepr
}
    94  
    95  func (k ConstraintStatusKey) String() string {
    96  	return fmt.Sprintf("zone:%s type:%s constraint:%s", k.ZoneKey, k.ViolationType, k.Constraint)
    97  }
    98  
    99  // Less compares two ConstraintStatusKeys.
   100  func (k ConstraintStatusKey) Less(other ConstraintStatusKey) bool {
   101  	if k.ZoneKey.Less(other.ZoneKey) {
   102  		return true
   103  	}
   104  	if other.ZoneKey.Less(k.ZoneKey) {
   105  		return false
   106  	}
   107  	if k.ViolationType.Less(other.ViolationType) {
   108  		return true
   109  	}
   110  	if other.ViolationType.Less(k.ViolationType) {
   111  		return true
   112  	}
   113  	return k.Constraint.Less(other.Constraint)
   114  }
   115  
   116  // AddViolation add a constraint that is being violated for a given range. Each call
   117  // will increase the number of ranges that failed.
   118  func (r ConstraintReport) AddViolation(z ZoneKey, t ConstraintType, c ConstraintRepr) {
   119  	k := ConstraintStatusKey{
   120  		ZoneKey:       z,
   121  		ViolationType: t,
   122  		Constraint:    c,
   123  	}
   124  	if _, ok := r[k]; !ok {
   125  		r[k] = ConstraintStatus{}
   126  	}
   127  	cRep := r[k]
   128  	cRep.FailRangeCount++
   129  	r[k] = cRep
   130  }
   131  
   132  // ensureEntry us used to add an entry to the report even if there is no violation.
   133  func (r ConstraintReport) ensureEntry(z ZoneKey, t ConstraintType, c ConstraintRepr) {
   134  	k := ConstraintStatusKey{
   135  		ZoneKey:       z,
   136  		ViolationType: t,
   137  		Constraint:    c,
   138  	}
   139  	if _, ok := r[k]; !ok {
   140  		r[k] = ConstraintStatus{}
   141  	}
   142  }
   143  
   144  func (r ConstraintReport) ensureEntries(key ZoneKey, zone *zonepb.ZoneConfig) {
   145  	for _, conjunction := range zone.Constraints {
   146  		r.ensureEntry(key, Constraint, ConstraintRepr(conjunction.String()))
   147  	}
   148  	for i, sz := range zone.Subzones {
   149  		szKey := ZoneKey{ZoneID: key.ZoneID, SubzoneID: base.SubzoneIDFromIndex(i)}
   150  		r.ensureEntries(szKey, &sz.Config)
   151  	}
   152  }
   153  
   154  func (r *replicationConstraintStatsReportSaver) loadPreviousVersion(
   155  	ctx context.Context, ex sqlutil.InternalExecutor, txn *kv.Txn,
   156  ) error {
   157  	// The data for the previous save needs to be loaded if:
   158  	// - this is the first time that we call this method and lastUpdatedAt has never been set
   159  	// - in case that the lastUpdatedAt is set but is different than the timestamp in reports_meta
   160  	//   this indicates that some other worker wrote after we did the write.
   161  	if !r.lastGenerated.IsZero() {
   162  		generated, err := getReportGenerationTime(ctx, replicationConstraintsReportID, ex, txn)
   163  		if err != nil {
   164  			return err
   165  		}
   166  		// If the report is missing, this is the first time we are running and the
   167  		// reload is needed. In that case, generated will be the zero value.
   168  		if generated == r.lastGenerated {
   169  			// We have the latest report; reload not needed.
   170  			return nil
   171  		}
   172  	}
   173  	const prevViolations = "select zone_id, subzone_id, type, config, " +
   174  		"violating_ranges from system.replication_constraint_stats"
   175  	rows, err := ex.Query(
   176  		ctx, "get-previous-replication-constraint-stats", txn, prevViolations,
   177  	)
   178  	if err != nil {
   179  		return err
   180  	}
   181  
   182  	r.previousVersion = make(ConstraintReport, len(rows))
   183  	for _, row := range rows {
   184  		key := ConstraintStatusKey{}
   185  		key.ZoneID = (uint32)(*row[0].(*tree.DInt))
   186  		key.SubzoneID = base.SubzoneID((*row[1].(*tree.DInt)))
   187  		key.ViolationType = (ConstraintType)(*row[2].(*tree.DString))
   188  		key.Constraint = (ConstraintRepr)(*row[3].(*tree.DString))
   189  		r.previousVersion[key] = ConstraintStatus{(int)(*row[4].(*tree.DInt))}
   190  	}
   191  
   192  	return nil
   193  }
   194  
   195  func (r *replicationConstraintStatsReportSaver) updateTimestamp(
   196  	ctx context.Context, ex sqlutil.InternalExecutor, txn *kv.Txn, reportTS time.Time,
   197  ) error {
   198  	if !r.lastGenerated.IsZero() && reportTS == r.lastGenerated {
   199  		return errors.Errorf(
   200  			"The new time %s is the same as the time of the last update %s",
   201  			reportTS.String(),
   202  			r.lastGenerated.String(),
   203  		)
   204  	}
   205  
   206  	_, err := ex.Exec(
   207  		ctx,
   208  		"timestamp-upsert-replication-constraint-stats",
   209  		txn,
   210  		"upsert into system.reports_meta(id, generated) values($1, $2)",
   211  		replicationConstraintsReportID,
   212  		reportTS,
   213  	)
   214  	return err
   215  }
   216  
   217  // Save the report in the database.
   218  //
   219  // report should not be used by the caller any more after this call; the callee
   220  // takes ownership.
   221  // reportTS is the time that will be set in the updated_at column for every row.
   222  func (r *replicationConstraintStatsReportSaver) Save(
   223  	ctx context.Context,
   224  	report ConstraintReport,
   225  	reportTS time.Time,
   226  	db *kv.DB,
   227  	ex sqlutil.InternalExecutor,
   228  ) error {
   229  	r.lastUpdatedRowCount = 0
   230  	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   231  		err := r.loadPreviousVersion(ctx, ex, txn)
   232  		if err != nil {
   233  			return err
   234  		}
   235  
   236  		err = r.updateTimestamp(ctx, ex, txn, reportTS)
   237  		if err != nil {
   238  			return err
   239  		}
   240  
   241  		for k, zoneCons := range report {
   242  			if err := r.upsertConstraintStatus(
   243  				ctx, reportTS, txn, k, zoneCons.FailRangeCount, db, ex,
   244  			); err != nil {
   245  				return err
   246  			}
   247  		}
   248  
   249  		for key := range r.previousVersion {
   250  			if _, ok := report[key]; !ok {
   251  				_, err := ex.Exec(
   252  					ctx,
   253  					"delete-old-replication-constraint-stats",
   254  					txn,
   255  					"delete from system.replication_constraint_stats "+
   256  						"where zone_id = $1 and subzone_id = $2 and type = $3 and config = $4",
   257  					key.ZoneID,
   258  					key.SubzoneID,
   259  					key.ViolationType,
   260  					key.Constraint,
   261  				)
   262  
   263  				if err != nil {
   264  					return err
   265  				}
   266  				r.lastUpdatedRowCount++
   267  			}
   268  		}
   269  
   270  		return nil
   271  	}); err != nil {
   272  		return err
   273  	}
   274  
   275  	r.lastGenerated = reportTS
   276  	r.previousVersion = report
   277  
   278  	return nil
   279  }
   280  
   281  // upsertConstraintStatus upserts a row into system.replication_constraint_stats.
   282  //
   283  // existing is used to decide is this is a new violation.
   284  func (r *replicationConstraintStatsReportSaver) upsertConstraintStatus(
   285  	ctx context.Context,
   286  	reportTS time.Time,
   287  	txn *kv.Txn,
   288  	key ConstraintStatusKey,
   289  	violationCount int,
   290  	db *kv.DB,
   291  	ex sqlutil.InternalExecutor,
   292  ) error {
   293  	var err error
   294  	previousStatus, hasOldVersion := r.previousVersion[key]
   295  	if hasOldVersion && previousStatus.FailRangeCount == violationCount {
   296  		// No change in the status so no update.
   297  		return nil
   298  	} else if violationCount != 0 {
   299  		if previousStatus.FailRangeCount != 0 {
   300  			// Updating an old violation. No need to update the start timestamp.
   301  			_, err = ex.Exec(
   302  				ctx, "upsert-replication-constraint-stat", txn,
   303  				"upsert into system.replication_constraint_stats(report_id, zone_id, subzone_id, type, "+
   304  					"config, violating_ranges) values($1, $2, $3, $4, $5, $6)",
   305  				replicationConstraintsReportID,
   306  				key.ZoneID, key.SubzoneID, key.ViolationType, key.Constraint, violationCount,
   307  			)
   308  		} else if previousStatus.FailRangeCount == 0 {
   309  			// New violation detected. Need to update the start timestamp.
   310  			_, err = ex.Exec(
   311  				ctx, "upsert-replication-constraint-stat", txn,
   312  				"upsert into system.replication_constraint_stats(report_id, zone_id, subzone_id, type, "+
   313  					"config, violating_ranges, violation_start) values($1, $2, $3, $4, $5, $6, $7)",
   314  				replicationConstraintsReportID,
   315  				key.ZoneID, key.SubzoneID, key.ViolationType, key.Constraint, violationCount, reportTS,
   316  			)
   317  		}
   318  	} else {
   319  		// Need to set the violation start to null as there was an violation that doesn't exist anymore.
   320  		_, err = ex.Exec(
   321  			ctx, "upsert-replication-constraint-stat", txn,
   322  			"upsert into system.replication_constraint_stats(report_id, zone_id, subzone_id, type, config, "+
   323  				"violating_ranges, violation_start) values($1, $2, $3, $4, $5, $6, null)",
   324  			replicationConstraintsReportID,
   325  			key.ZoneID, key.SubzoneID, key.ViolationType, key.Constraint, violationCount,
   326  		)
   327  	}
   328  
   329  	if err != nil {
   330  		return err
   331  	}
   332  
   333  	r.lastUpdatedRowCount++
   334  	return nil
   335  }
   336  
// constraintConformanceVisitor is a visitor that, when passed to visitRanges(),
// computes the constraint conformance report (i.e. the
// system.replication_constraint_stats table).
//
// cfg is the system config that zone configs are read from, and storeResolver
// maps a range descriptor to the descriptors of the stores holding its
// replicas.
type constraintConformanceVisitor struct {
	cfg           *config.SystemConfig
	storeResolver StoreResolver

	// report is the output of the visitor. visit*() methods populate it.
	// After visiting all the ranges, it can be retrieved with Report().
	// visitErr records whether the most recent visitNewZone() call returned an
	// error; it is the value returned by failed().
	report   ConstraintReport
	visitErr bool

	// prevZoneKey and prevConstraints maintain state from one range to the next.
	// This state can be reused when a range is covered by the same zone config as
	// the previous one. Reusing it speeds up the report generation.
	prevZoneKey     ZoneKey
	prevConstraints []zonepb.ConstraintsConjunction
}

// Compile-time check that the visitor implements rangeVisitor.
var _ rangeVisitor = &constraintConformanceVisitor{}
   357  
   358  func makeConstraintConformanceVisitor(
   359  	ctx context.Context, cfg *config.SystemConfig, storeResolver StoreResolver,
   360  ) constraintConformanceVisitor {
   361  	v := constraintConformanceVisitor{
   362  		cfg:           cfg,
   363  		storeResolver: storeResolver,
   364  	}
   365  	v.reset(ctx)
   366  	return v
   367  }
   368  
// failed is part of the rangeVisitor interface. It reports whether the most
// recent visitNewZone() call returned an error.
func (v *constraintConformanceVisitor) failed() bool {
	return v.visitErr
}
   373  
// Report returns the ConstraintReport that was populated by previous visit*()
// calls. The visitor's own map is returned, not a copy.
func (v *constraintConformanceVisitor) Report() ConstraintReport {
	return v.report
}
   379  
   380  // reset is part of the rangeVisitor interface.
   381  func (v *constraintConformanceVisitor) reset(ctx context.Context) {
   382  	*v = constraintConformanceVisitor{
   383  		cfg:           v.cfg,
   384  		storeResolver: v.storeResolver,
   385  		report:        make(ConstraintReport, len(v.report)),
   386  	}
   387  
   388  	// Iterate through all the zone configs to create report entries for all the
   389  	// zones that have constraints. Otherwise, just iterating through the ranges
   390  	// wouldn't create entries for constraints that aren't violated, and
   391  	// definitely not for zones that don't apply to any ranges.
   392  	maxObjectID, err := v.cfg.GetLargestObjectID(0 /* maxID - return the largest ID in the config */)
   393  	if err != nil {
   394  		log.Fatalf(ctx, "unexpected failure to compute max object id: %s", err)
   395  	}
   396  	for i := uint32(1); i <= maxObjectID; i++ {
   397  		zone, err := getZoneByID(i, v.cfg)
   398  		if err != nil {
   399  			log.Fatalf(ctx, "unexpected failure to compute max object id: %s", err)
   400  		}
   401  		if zone == nil {
   402  			continue
   403  		}
   404  		v.report.ensureEntries(MakeZoneKey(i, NoSubzone), zone)
   405  	}
   406  }
   407  
   408  // visitNewZone is part of the rangeVisitor interface.
   409  func (v *constraintConformanceVisitor) visitNewZone(
   410  	ctx context.Context, r *roachpb.RangeDescriptor,
   411  ) (retErr error) {
   412  
   413  	defer func() {
   414  		v.visitErr = retErr != nil
   415  	}()
   416  
   417  	// Find the applicable constraints, which may be inherited.
   418  	var constraints []zonepb.ConstraintsConjunction
   419  	var zKey ZoneKey
   420  	_, err := visitZones(ctx, r, v.cfg, ignoreSubzonePlaceholders,
   421  		func(_ context.Context, zone *zonepb.ZoneConfig, key ZoneKey) bool {
   422  			if zone.Constraints == nil {
   423  				return false
   424  			}
   425  			constraints = zone.Constraints
   426  			zKey = key
   427  			return true
   428  		})
   429  	if err != nil {
   430  		return errors.Errorf("unexpected error visiting zones: %s", err)
   431  	}
   432  	v.prevZoneKey = zKey
   433  	v.prevConstraints = constraints
   434  	v.countRange(ctx, r, zKey, constraints)
   435  	return nil
   436  }
   437  
// visitSameZone is part of the rangeVisitor interface. It counts r's
// violations using the zone key and constraints remembered by the last
// visitNewZone() call, without looking the zone up again.
func (v *constraintConformanceVisitor) visitSameZone(
	ctx context.Context, r *roachpb.RangeDescriptor,
) {
	v.countRange(ctx, r, v.prevZoneKey, v.prevConstraints)
}
   444  
   445  func (v *constraintConformanceVisitor) countRange(
   446  	ctx context.Context,
   447  	r *roachpb.RangeDescriptor,
   448  	key ZoneKey,
   449  	constraints []zonepb.ConstraintsConjunction,
   450  ) {
   451  	storeDescs := v.storeResolver(r)
   452  	violated := getViolations(ctx, storeDescs, constraints)
   453  	for _, c := range violated {
   454  		v.report.AddViolation(key, Constraint, c)
   455  	}
   456  }
   457  
   458  // getViolations returns the list of constraints violated by a range. The range
   459  // is represented by the descriptors of the replicas' stores.
   460  func getViolations(
   461  	ctx context.Context,
   462  	storeDescs []roachpb.StoreDescriptor,
   463  	constraintConjunctions []zonepb.ConstraintsConjunction,
   464  ) []ConstraintRepr {
   465  	var res []ConstraintRepr
   466  	// Evaluate all zone constraints for the stores (i.e. replicas) of the given range.
   467  	for _, conjunction := range constraintConjunctions {
   468  		replicasRequiredToMatch := int(conjunction.NumReplicas)
   469  		if replicasRequiredToMatch == 0 {
   470  			replicasRequiredToMatch = len(storeDescs)
   471  		}
   472  		for _, c := range conjunction.Constraints {
   473  			if !constraintSatisfied(c, replicasRequiredToMatch, storeDescs) {
   474  				res = append(res, ConstraintRepr(conjunction.String()))
   475  				break
   476  			}
   477  		}
   478  	}
   479  	return res
   480  }
   481  
   482  // constraintSatisfied checks that a range (represented by its replicas' stores)
   483  // satisfies a constraint.
   484  func constraintSatisfied(
   485  	c zonepb.Constraint, replicasRequiredToMatch int, storeDescs []roachpb.StoreDescriptor,
   486  ) bool {
   487  	passCount := 0
   488  	for _, storeDesc := range storeDescs {
   489  		// Consider stores for which we have no information to pass everything.
   490  		if storeDesc.StoreID == 0 {
   491  			passCount++
   492  			continue
   493  		}
   494  		if zonepb.StoreSatisfiesConstraint(storeDesc, c) {
   495  			passCount++
   496  		}
   497  	}
   498  	return replicasRequiredToMatch <= passCount
   499  }