// Source: github.com/whtcorpsinc/milevadb-prod@v0.0.0-20211104133533-f57f4be3b597/interlock/inspection_result.go

// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package interlock

import (
	"context"
	"fmt"
	"math"
	"sort"
	"strconv"
	"strings"

	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/failpoint"
	causetembedded "github.com/whtcorpsinc/milevadb/causet/embedded"
	"github.com/whtcorpsinc/milevadb/schemareplicant"
	"github.com/whtcorpsinc/milevadb/soliton"
	"github.com/whtcorpsinc/milevadb/soliton/chunk"
	"github.com/whtcorpsinc/milevadb/soliton/set"
	"github.com/whtcorpsinc/milevadb/soliton/sqlexec"
	"github.com/whtcorpsinc/milevadb/stochastikctx"
	"github.com/whtcorpsinc/milevadb/stochastikctx/variable"
	"github.com/whtcorpsinc/milevadb/types"
)

type (
	// inspectionResult represents an abnormal diagnosis result.
	inspectionResult struct {
		tp            string
		instance      string
		statusAddress string
		// item is the diagnostics item, e.g. `dbs.lease`, `raftstore.cpuusage`.
		item string
		// actual is the diagnosed value based on the current cluster status.
		actual   string
		expected string
		severity string
		detail   string
		// degree is only used for sorting.
		degree float64
	}

	inspectionName string

	inspectionFilter struct {
		set       set.StringSet
		timeRange causetembedded.QueryTimeRange
	}

	inspectionMemrule interface {
		name() string
		inspect(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult
	}
)

func (n inspectionName) name() string {
	return string(n)
}

func (f inspectionFilter) enable(name string) bool {
	return len(f.set) == 0 || f.set.Exist(name)
}

func (f inspectionFilter) exist(name string) bool {
	return len(f.set) > 0 && f.set.Exist(name)
}
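
// Illustrative sketch (not part of the original file; assumes the
// set.NewStringSet constructor from soliton/set): an empty filter enables
// every item, while a non-empty one only enables its members.
//
//	f := inspectionFilter{set: set.NewStringSet("config")}
//	f.enable("config")  // true
//	f.enable("version") // false
//	f.exist("version")  // false; exist also returns false when the set is empty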

type (
	// configInspection checks whether the same configuration item has
	// different values across instances in the cluster.
	configInspection struct{ inspectionName }

	// versionInspection checks whether the same component runs different
	// versions in the cluster.
	versionInspection struct{ inspectionName }

	// nodeLoadInspection checks whether the node load of memory/disk/CPU
	// has reached a high threshold.
	nodeLoadInspection struct{ inspectionName }

	// criticalErrorInspection checks whether critical errors have occurred
	// in the past.
	criticalErrorInspection struct{ inspectionName }

	// thresholdCheckInspection checks threshold values, such as CPU usage and leader count changes.
	thresholdCheckInspection struct{ inspectionName }
)

var inspectionMemrules = []inspectionMemrule{
	&configInspection{inspectionName: "config"},
	&versionInspection{inspectionName: "version"},
	&nodeLoadInspection{inspectionName: "node-load"},
	&criticalErrorInspection{inspectionName: "critical-error"},
	&thresholdCheckInspection{inspectionName: "threshold-check"},
}
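
// Hedged usage note (assumed from the upstream TiDB diagnostics interface,
// not stated in this file): these rule names are what users filter on when
// querying the inspection results, e.g.
//
//	select rule, item, type, instance, value, reference, severity, details
//	from information_schema.inspection_result
//	where rule in ('config', 'version');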

type inspectionResultRetriever struct {
	dummyCloser
	retrieved               bool
	extractor               *causetembedded.InspectionResultBlockExtractor
	timeRange               causetembedded.QueryTimeRange
	instanceToStatusAddress map[string]string
	statusToInstanceAddress map[string]string
}

func (e *inspectionResultRetriever) retrieve(ctx context.Context, sctx stochastikctx.Context) ([][]types.Causet, error) {
	if e.retrieved || e.extractor.SkipInspection {
		return nil, nil
	}
	e.retrieved = true

	// Some data in cluster-level memory blocks will be retrieved multiple times by
	// different inspection rules, and retrieving some of it is expensive. We use the
	// `BlockSnapshot` to cache that data and obtain it lazily, which provides a
	// consistent view of the inspection blocks for each inspection rule.
	// All cached snapshots should be released at the end of retrieving.
	sctx.GetStochastikVars().InspectionBlockCache = map[string]variable.BlockSnapshot{}
	defer func() { sctx.GetStochastikVars().InspectionBlockCache = nil }()

	failpoint.InjectContext(ctx, "mockMergeMockInspectionBlocks", func() {
		// Merge mock snapshots injected from the failpoint for test purposes.
		mockBlocks, ok := ctx.Value("__mockInspectionBlocks").(map[string]variable.BlockSnapshot)
		if ok {
			for name, snap := range mockBlocks {
				sctx.GetStochastikVars().InspectionBlockCache[strings.ToLower(name)] = snap
			}
		}
	})

	if e.instanceToStatusAddress == nil {
		// Get cluster info.
		e.instanceToStatusAddress = make(map[string]string)
		e.statusToInstanceAddress = make(map[string]string)
		allegrosql := "select instance,status_address from information_schema.cluster_info;"
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQL(allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("get cluster info failed: %v", err))
		}
		for _, event := range rows {
			if event.Len() < 2 {
				continue
			}
			e.instanceToStatusAddress[event.GetString(0)] = event.GetString(1)
			e.statusToInstanceAddress[event.GetString(1)] = event.GetString(0)
		}
	}

	rules := inspectionFilter{set: e.extractor.Memrules}
	items := inspectionFilter{set: e.extractor.Items, timeRange: e.timeRange}
	var finalEvents [][]types.Causet
	for _, r := range inspectionMemrules {
		name := r.name()
		if !rules.enable(name) {
			continue
		}
		results := r.inspect(ctx, sctx, items)
		if len(results) == 0 {
			continue
		}
		// Sort the results to make the output stable.
		sort.Slice(results, func(i, j int) bool {
			if results[i].degree != results[j].degree {
				return results[i].degree > results[j].degree
			}
			if lhs, rhs := results[i].item, results[j].item; lhs != rhs {
				return lhs < rhs
			}
			if results[i].actual != results[j].actual {
				return results[i].actual < results[j].actual
			}
			if lhs, rhs := results[i].tp, results[j].tp; lhs != rhs {
				return lhs < rhs
			}
			return results[i].instance < results[j].instance
		})
		for _, result := range results {
			if len(result.instance) == 0 {
				result.instance = e.statusToInstanceAddress[result.statusAddress]
			}
			if len(result.statusAddress) == 0 {
				result.statusAddress = e.instanceToStatusAddress[result.instance]
			}
			finalEvents = append(finalEvents, types.MakeCausets(
				name,
				result.item,
				result.tp,
				result.instance,
				result.statusAddress,
				result.actual,
				result.expected,
				result.severity,
				result.detail,
			))
		}
	}
	return finalEvents, nil
}
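
// Hedged example (assuming this fork keeps TiDB's time_range hint for
// metrics-backed blocks; the timestamps are placeholders): the extractor
// above picks the rules/items predicates and the time range out of a query
// such as
//
//	select /*+ time_range('2020-03-07 12:00:00', '2020-03-07 13:00:00') */ *
//	from information_schema.inspection_result
//	where rule = 'threshold-check' and item = 'grpc-cpu';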

func (c configInspection) inspect(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var results []inspectionResult
	results = append(results, c.inspectDiffConfig(ctx, sctx, filter)...)
	results = append(results, c.inspectCheckConfig(ctx, sctx, filter)...)
	return results
}

func (configInspection) inspectDiffConfig(_ context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	// Check configuration consistency across instances.
	ignoreConfigKey := []string{
		// MilevaDB
		"port",
		"status.status-port",
		"host",
		"path",
		"advertise-address",
		"log.file.filename",
		"log.slow-query-file",
		"tmp-storage-path",

		// FIDel
		"advertise-client-urls",
		"advertise-peer-urls",
		"client-urls",
		"data-dir",
		"log-file",
		"log.file.filename",
		"metric.job",
		"name",
		"peer-urls",

		// EinsteinDB
		"server.addr",
		"server.advertise-addr",
		"server.status-addr",
		"log-file",
		"raftstore.raftdb-path",
		"storage.data-dir",
		"storage.causet-cache.capacity",
	}
	allegrosql := fmt.Sprintf("select type, `key`, count(distinct value) as c from information_schema.cluster_config where `key` not in ('%s') group by type, `key` having c > 1",
		strings.Join(ignoreConfigKey, "','"))
	rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQL(allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check configuration consistency failed: %v", err))
	}

	generateDetail := func(tp, item string) string {
		query := fmt.Sprintf("select value, instance from information_schema.cluster_config where type='%s' and `key`='%s';", tp, item)
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQL(query)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check configuration consistency failed: %v", err))
			return fmt.Sprintf("the cluster has different config values of %[2]s, execute the allegrosql to see more detail: select * from information_schema.cluster_config where type='%[1]s' and `key`='%[2]s'",
				tp, item)
		}
		m := make(map[string][]string)
		for _, event := range rows {
			value := event.GetString(0)
			instance := event.GetString(1)
			m[value] = append(m[value], instance)
		}
		groups := make([]string, 0, len(m))
		for k, v := range m {
			sort.Strings(v)
			groups = append(groups, fmt.Sprintf("%s config value is %s", strings.Join(v, ","), k))
		}
		sort.Strings(groups)
		return strings.Join(groups, "\n")
	}
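
	// Illustrative output of generateDetail above (addresses and values
	// assumed): if two instances share one value and a third differs,
	// the detail reads
	//
	//	192.168.1.1:4000,192.168.1.2:4000 config value is 100
	//	192.168.1.3:4000 config value is 200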

	var results []inspectionResult
	for _, event := range rows {
		if filter.enable(event.GetString(1)) {
			detail := generateDetail(event.GetString(0), event.GetString(1))
			results = append(results, inspectionResult{
				tp:       event.GetString(0),
				instance: "",
				item:     event.GetString(1), // key
				actual:   "inconsistent",
				expected: "consistent",
				severity: "warning",
				detail:   detail,
			})
		}
	}
	return results
}

func (c configInspection) inspectCheckConfig(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	// Check that configuration values are reasonable.
	cases := []struct {
		tp     string
		key    string
		value  string
		detail string
	}{
		{
			tp:     "milevadb",
			key:    "log.slow-threshold",
			value:  "0",
			detail: "slow-threshold = 0 will record every query to the slow log, which may affect performance",
		},
		{
			tp:     "einsteindb",
			key:    "raftstore.sync-log",
			value:  "false",
			detail: "sync-log should be true to avoid region recovery when the machine breaks down",
		},
	}

	var results []inspectionResult
	for _, cas := range cases {
		if !filter.enable(cas.key) {
			continue
		}
		allegrosql := fmt.Sprintf("select instance from information_schema.cluster_config where type = '%s' and `key` = '%s' and value = '%s'",
			cas.tp, cas.key, cas.value)
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQL(allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check configuration reasonableness failed: %v", err))
		}

		for _, event := range rows {
			results = append(results, inspectionResult{
				tp:       cas.tp,
				instance: event.GetString(0),
				item:     cas.key,
				actual:   cas.value,
				expected: "not " + cas.value,
				severity: "warning",
				detail:   cas.detail,
			})
		}
	}
	results = append(results, c.checkEinsteinDBBlockCacheSizeConfig(ctx, sctx, filter)...)
	return results
}

func (c configInspection) checkEinsteinDBBlockCacheSizeConfig(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	item := "storage.causet-cache.capacity"
	if !filter.enable(item) {
		return nil
	}
	allegrosql := "select instance,value from information_schema.cluster_config where type='einsteindb' and `key` = 'storage.causet-cache.capacity'"
	rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check configuration reasonableness failed: %v", err))
	}
	extractIP := func(addr string) string {
		if idx := strings.Index(addr, ":"); idx > -1 {
			return addr[0:idx]
		}
		return addr
	}
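	// e.g. extractIP("192.168.1.1:20160") == "192.168.1.1"; an address with
	// no port separator is returned unchanged. Grouping by IP below sums the
	// capacities of all EinsteinDB instances deployed on the same host.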

	ipToBlockSize := make(map[string]uint64)
	ipToCount := make(map[string]int)
	for _, event := range rows {
		ip := extractIP(event.GetString(0))
		size, err := c.convertReadableSizeToByteSize(event.GetString(1))
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check EinsteinDB causet-cache configuration failed: %v", err))
			return nil
		}
		ipToBlockSize[ip] += size
		ipToCount[ip]++
	}

	allegrosql = "select instance, value from metrics_schema.node_total_memory where time=now()"
	rows, _, err = sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check configuration reasonableness failed: %v", err))
	}
	ipToMemorySize := make(map[string]float64)
	for _, event := range rows {
		ip := extractIP(event.GetString(0))
		size := event.GetFloat64(1)
		ipToMemorySize[ip] += size
	}

	var results []inspectionResult
	for ip, blockSize := range ipToBlockSize {
		if memorySize, ok := ipToMemorySize[ip]; ok {
			if float64(blockSize) > memorySize*0.45 {
				detail := fmt.Sprintf("There are %v EinsteinDB servers on node %v, and their total 'storage.causet-cache.capacity' is more than (0.45 * total node memory)",
					ipToCount[ip], ip)
				results = append(results, inspectionResult{
					tp:       "einsteindb",
					instance: ip,
					item:     item,
					actual:   fmt.Sprintf("%v", blockSize),
					expected: fmt.Sprintf("< %.0f", memorySize*0.45),
					severity: "warning",
					detail:   detail,
				})
			}
		}
	}
	return results
}

func (configInspection) convertReadableSizeToByteSize(sizeStr string) (uint64, error) {
	const KB = uint64(1024)
	const MB = KB * 1024
	const GB = MB * 1024
	const TB = GB * 1024
	const PB = TB * 1024

	rate := uint64(1)
	if strings.HasSuffix(sizeStr, "KiB") {
		rate = KB
	} else if strings.HasSuffix(sizeStr, "MiB") {
		rate = MB
	} else if strings.HasSuffix(sizeStr, "GiB") {
		rate = GB
	} else if strings.HasSuffix(sizeStr, "TiB") {
		rate = TB
	} else if strings.HasSuffix(sizeStr, "PiB") {
		rate = PB
	}
	if rate != 1 && len(sizeStr) > 3 {
		sizeStr = sizeStr[:len(sizeStr)-3]
	}
	size, err := strconv.Atoi(sizeStr)
	if err != nil {
		return 0, errors.Trace(err)
	}
	return uint64(size) * rate, nil
}
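
// Illustrative conversions for convertReadableSizeToByteSize above (derived
// from the suffix table; not part of the original file):
//
//	size, _ := (configInspection{}).convertReadableSizeToByteSize("512MiB") // 536870912
//	size, _ = (configInspection{}).convertReadableSizeToByteSize("2GiB")    // 2147483648
//	size, _ = (configInspection{}).convertReadableSizeToByteSize("1024")    // 1024, no suffix means bytes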

func (versionInspection) inspect(_ context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	// Check version consistency across instances of the same component.
	allegrosql := "select type, count(distinct git_hash) as c from information_schema.cluster_info group by type having c > 1;"
	rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQL(allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("check version consistency failed: %v", err))
	}

	const name = "git_hash"
	var results []inspectionResult
	for _, event := range rows {
		if filter.enable(name) {
			results = append(results, inspectionResult{
				tp:       event.GetString(0),
				instance: "",
				item:     name,
				actual:   "inconsistent",
				expected: "consistent",
				severity: "critical",
				detail:   fmt.Sprintf("the cluster has %[1]v different %[2]s versions, execute the allegrosql to see more detail: select * from information_schema.cluster_info where type='%[2]s'", event.GetUint64(1), event.GetString(0)),
			})
		}
	}
	return results
}

func (c nodeLoadInspection) inspect(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var rules = []ruleChecker{
		inspectCPULoad{item: "load1", tbl: "node_load1"},
		inspectCPULoad{item: "load5", tbl: "node_load5"},
		inspectCPULoad{item: "load15", tbl: "node_load15"},
		inspectVirtualMemUsage{},
		inspectSwapMemoryUsed{},
		inspectDiskUsage{},
	}
	return checkMemrules(ctx, sctx, filter, rules)
}

type inspectVirtualMemUsage struct{}

func (inspectVirtualMemUsage) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	allegrosql := fmt.Sprintf("select instance, max(value) as max_usage from metrics_schema.node_memory_usage %s group by instance having max_usage >= 70", timeRange.Condition())
	return allegrosql
}

func (i inspectVirtualMemUsage) genResult(allegrosql string, event chunk.Event) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: event.GetString(0),
		item:     i.getItem(),
		actual:   fmt.Sprintf("%.1f%%", event.GetFloat64(1)),
		expected: "< 70%",
		severity: "warning",
		detail:   "the memory-usage is too high",
	}
}

func (inspectVirtualMemUsage) getItem() string {
	return "virtual-memory-usage"
}

type inspectSwapMemoryUsed struct{}

func (inspectSwapMemoryUsed) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	allegrosql := fmt.Sprintf("select instance, max(value) as max_used from metrics_schema.node_memory_swap_used %s group by instance having max_used > 0", timeRange.Condition())
	return allegrosql
}

func (i inspectSwapMemoryUsed) genResult(allegrosql string, event chunk.Event) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: event.GetString(0),
		item:     i.getItem(),
		actual:   fmt.Sprintf("%.1f", event.GetFloat64(1)),
		expected: "0",
		severity: "warning",
	}
}

func (inspectSwapMemoryUsed) getItem() string {
	return "swap-memory-used"
}

type inspectDiskUsage struct{}

func (inspectDiskUsage) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	allegrosql := fmt.Sprintf("select instance, device, max(value) as max_usage from metrics_schema.node_disk_usage %v and device like '/%%' group by instance, device having max_usage >= 70", timeRange.Condition())
	return allegrosql
}

func (i inspectDiskUsage) genResult(allegrosql string, event chunk.Event) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: event.GetString(0),
		item:     i.getItem(),
		actual:   fmt.Sprintf("%.1f%%", event.GetFloat64(2)),
		expected: "< 70%",
		severity: "warning",
		detail:   "the disk-usage of " + event.GetString(1) + " is too high",
	}
}

func (inspectDiskUsage) getItem() string {
	return "disk-usage"
}

type inspectCPULoad struct {
	item string
	tbl  string
}

func (i inspectCPULoad) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	allegrosql := fmt.Sprintf(`select t1.instance, t1.max_load , 0.7*t2.cpu_count from
			(select instance,max(value) as max_load  from metrics_schema.%[1]s %[2]s group by instance) as t1 join
			(select instance,max(value) as cpu_count from metrics_schema.node_virtual_cpus %[2]s group by instance) as t2
			on t1.instance=t2.instance where t1.max_load>(0.7*t2.cpu_count);`, i.tbl, timeRange.Condition())
	return allegrosql
}

func (i inspectCPULoad) genResult(allegrosql string, event chunk.Event) inspectionResult {
	return inspectionResult{
		tp:       "node",
		instance: event.GetString(0),
		item:     "cpu-" + i.item,
		actual:   fmt.Sprintf("%.1f", event.GetFloat64(1)),
		expected: fmt.Sprintf("< %.1f", event.GetFloat64(2)),
		severity: "warning",
		detail:   i.getItem() + " should be less than (cpu_logical_embeddeds * 0.7)",
	}
}

func (i inspectCPULoad) getItem() string {
	return "cpu-" + i.item
}
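
// Worked example for inspectCPULoad (numbers assumed): a node that reports 8
// virtual CPUs in node_virtual_cpus is flagged once its max load1 within the
// time range exceeds 0.7 * 8 = 5.6; a max load1 of, say, 6.3 is reported with
// expected "< 5.6" and severity "warning".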

func (c criticalErrorInspection) inspect(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	results := c.inspectError(ctx, sctx, filter)
	results = append(results, c.inspectForServerDown(ctx, sctx, filter)...)
	return results
}

func (criticalErrorInspection) inspectError(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var rules = []struct {
		tp   string
		item string
		tbl  string
	}{
		{tp: "einsteindb", item: "critical-error", tbl: "einsteindb_critical_error_total_count"},
		{tp: "milevadb", item: "panic-count", tbl: "milevadb_panic_count_total_count"},
		{tp: "milevadb", item: "binlog-error", tbl: "milevadb_binlog_error_total_count"},
		{tp: "einsteindb", item: "scheduler-is-busy", tbl: "einsteindb_scheduler_is_busy_total_count"},
		{tp: "einsteindb", item: "interlock-is-busy", tbl: "einsteindb_coprocessor_is_busy_total_count"},
		{tp: "einsteindb", item: "channel-is-full", tbl: "einsteindb_channel_full_total_count"},
		{tp: "einsteindb", item: "einsteindb_engine_write_stall", tbl: "einsteindb_engine_write_stall"},
	}

	condition := filter.timeRange.Condition()
	var results []inspectionResult
	for _, rule := range rules {
		if filter.enable(rule.item) {
			def, found := schemareplicant.MetricBlockMap[rule.tbl]
			if !found {
				sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("metrics causet: %s not found", rule.tbl))
				continue
			}
			allegrosql := fmt.Sprintf("select `%[1]s`,sum(value) as total from `%[2]s`.`%[3]s` %[4]s group by `%[1]s` having total>=1.0",
				strings.Join(def.Labels, "`,`"), soliton.MetricSchemaName.L, rule.tbl, condition)
			rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
			if err != nil {
				sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
				continue
			}
			for _, event := range rows {
				var actual, detail string
				var degree float64
				if rest := def.Labels[1:]; len(rest) > 0 {
					values := make([]string, 0, len(rest))
					// `i+1` and `1+len(rest)` skip the first column `instance`.
					for i := range rest {
						values = append(values, event.GetString(i+1))
					}
					// TODO: find a better way to construct the `actual` field
					actual = fmt.Sprintf("%.2f(%s)", event.GetFloat64(1+len(rest)), strings.Join(values, ", "))
					degree = event.GetFloat64(1 + len(rest))
				} else {
					actual = fmt.Sprintf("%.2f", event.GetFloat64(1))
					degree = event.GetFloat64(1)
				}
				detail = fmt.Sprintf("the total count of '%s' errors is too high", rule.item)
				result := inspectionResult{
					tp: rule.tp,
					// NOTE: the first label of every block inspected here must be `instance`.
					statusAddress: event.GetString(0),
					item:          rule.item,
					actual:        actual,
					expected:      "0",
					severity:      "critical",
					detail:        detail,
					degree:        degree,
				}
				results = append(results, result)
			}
		}
	}
	return results
}

func (criticalErrorInspection) inspectForServerDown(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	item := "server-down"
	if !filter.enable(item) {
		return nil
	}
	condition := filter.timeRange.Condition()
	allegrosql := fmt.Sprintf(`select t1.job,t1.instance, t2.min_time from
		(select instance,job from metrics_schema.up %[1]s group by instance,job having max(value)-min(value)>0) as t1 join
		(select instance,min(time) as min_time from metrics_schema.up %[1]s and value=0 group by instance,job) as t2 on t1.instance=t2.instance order by job`, condition)
	rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
	}
	var results []inspectionResult
	for _, event := range rows {
		if event.Len() < 3 {
			continue
		}
		detail := fmt.Sprintf("%s %s disconnected from Prometheus around time '%s'", event.GetString(0), event.GetString(1), event.GetTime(2))
		result := inspectionResult{
			tp:            event.GetString(0),
			statusAddress: event.GetString(1),
			item:          item,
			actual:        "",
			expected:      "",
			severity:      "critical",
			detail:        detail,
			degree:        10000 + float64(len(results)),
		}
		results = append(results, result)
	}
	// Check from the log.
	allegrosql = fmt.Sprintf("select type,instance,time from information_schema.cluster_log %s and level = 'info' and message like '%%Welcome to'", condition)
	rows, _, err = sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
	}
	for _, event := range rows {
		if event.Len() < 3 {
			continue
		}
		detail := fmt.Sprintf("%s %s restarted at time '%s'", event.GetString(0), event.GetString(1), event.GetString(2))
		result := inspectionResult{
			tp:       event.GetString(0),
			instance: event.GetString(1),
			item:     item,
			actual:   "",
			expected: "",
			severity: "critical",
			detail:   detail,
			degree:   10000 + float64(len(results)),
		}
		results = append(results, result)
	}
	return results
}

func (c thresholdCheckInspection) inspect(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	inspects := []func(context.Context, stochastikctx.Context, inspectionFilter) []inspectionResult{
		c.inspectThreshold1,
		c.inspectThreshold2,
		c.inspectThreshold3,
		c.inspectForLeaderDrop,
	}
	var results []inspectionResult
	for _, inspect := range inspects {
		re := inspect(ctx, sctx, filter)
		results = append(results, re...)
	}
	return results
}

func (thresholdCheckInspection) inspectThreshold1(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var rules = []struct {
		item      string
		component string
		configKey string
		threshold float64
	}{
		{
			item:      "interlock-normal-cpu",
			component: "cop_normal%",
			configKey: "readpool.interlock.normal-concurrency",
			threshold: 0.9,
		},
		{
			item:      "interlock-high-cpu",
			component: "cop_high%",
			configKey: "readpool.interlock.high-concurrency",
			threshold: 0.9,
		},
		{
			item:      "interlock-low-cpu",
			component: "cop_low%",
			configKey: "readpool.interlock.low-concurrency",
			threshold: 0.9,
		},
		{
			item:      "grpc-cpu",
			component: "grpc%",
			configKey: "server.grpc-concurrency",
			threshold: 0.9,
		},
		{
			item:      "raftstore-cpu",
			component: "raftstore_%",
			configKey: "raftstore.causetstore-pool-size",
			threshold: 0.8,
		},
		{
			item:      "apply-cpu",
			component: "apply_%",
			configKey: "raftstore.apply-pool-size",
			threshold: 0.8,
		},
		{
			item:      "storage-readpool-normal-cpu",
			component: "store_read_norm%",
			configKey: "readpool.storage.normal-concurrency",
			threshold: 0.9,
		},
		{
			item:      "storage-readpool-high-cpu",
			component: "store_read_high%",
			configKey: "readpool.storage.high-concurrency",
			threshold: 0.9,
		},
		{
			item:      "storage-readpool-low-cpu",
			component: "store_read_low%",
			configKey: "readpool.storage.low-concurrency",
			threshold: 0.9,
		},
		{
			item:      "scheduler-worker-cpu",
			component: "sched_%",
			configKey: "storage.scheduler-worker-pool-size",
			threshold: 0.85,
		},
		{
			item:      "split-check-cpu",
			component: "split_check",
			threshold: 0.9,
		},
	}

	condition := filter.timeRange.Condition()
	var results []inspectionResult
	for _, rule := range rules {
		if !filter.enable(rule.item) {
			continue
		}

		var allegrosql string
		if len(rule.configKey) > 0 {
			allegrosql = fmt.Sprintf("select t1.status_address, t1.cpu, (t2.value * %[2]f) as threshold, t2.value from "+
				"(select status_address, max(sum_value) as cpu from (select instance as status_address, sum(value) as sum_value from metrics_schema.einsteindb_thread_cpu %[4]s and name like '%[1]s' group by instance, time) as tmp group by tmp.status_address) as t1 join "+
				"(select instance, value from information_schema.cluster_config where type='einsteindb' and `key` = '%[3]s') as t2 join "+
				"(select instance,status_address from information_schema.cluster_info where type='einsteindb') as t3 "+
				"on t1.status_address=t3.status_address and t2.instance=t3.instance where t1.cpu > (t2.value * %[2]f)", rule.component, rule.threshold, rule.configKey, condition)
		} else {
			allegrosql = fmt.Sprintf("select t1.instance, t1.cpu, %[2]f from "+
				"(select instance, max(value) as cpu from metrics_schema.einsteindb_thread_cpu %[3]s and name like '%[1]s' group by instance) as t1 "+
				"where t1.cpu > %[2]f;", rule.component, rule.threshold, condition)
		}
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
			continue
		}
		for _, event := range rows {
			actual := fmt.Sprintf("%.2f", event.GetFloat64(1))
			degree := math.Abs(event.GetFloat64(1)-event.GetFloat64(2)) / math.Max(event.GetFloat64(1), event.GetFloat64(2))
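			// degree is the relative deviation |actual-threshold|/max(actual, threshold),
			// e.g. cpu 7.2 against threshold 3.6 gives |7.2-3.6|/7.2 = 0.5;
			// it lies in [0, 1) and is only used to sort the final results.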
			expected := ""
			if len(rule.configKey) > 0 {
				expected = fmt.Sprintf("< %.2f, config: %v=%v", event.GetFloat64(2), rule.configKey, event.GetString(3))
			} else {
				expected = fmt.Sprintf("< %.2f", event.GetFloat64(2))
			}
			detail := fmt.Sprintf("the '%s' max cpu-usage of %s einsteindb is too high", rule.item, event.GetString(0))
			result := inspectionResult{
				tp:            "einsteindb",
				statusAddress: event.GetString(0),
				item:          rule.item,
				actual:        actual,
				expected:      expected,
				severity:      "warning",
				detail:        detail,
				degree:        degree,
			}
			results = append(results, result)
		}
	}
	return results
}

func (thresholdCheckInspection) inspectThreshold2(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var rules = []struct {
		tp        string
		item      string
		tbl       string
		condition string
		threshold float64
		factor    float64
		isMin     bool
		detail    string
	}{
		{
			tp:        "milevadb",
			item:      "tso-duration",
			tbl:       "FIDel_tso_wait_duration",
			condition: "quantile=0.999",
			threshold: 0.05,
		},
		{
			tp:        "milevadb",
			item:      "get-token-duration",
			tbl:       "milevadb_get_token_duration",
			condition: "quantile=0.999",
			threshold: 0.001,
			factor:    10e5, // the metric unit is microseconds
		},
		{
			tp:        "milevadb",
			item:      "load-schemaReplicant-duration",
			tbl:       "milevadb_load_schema_duration",
			condition: "quantile=0.99",
			threshold: 1,
		},
		{
			tp:        "einsteindb",
			item:      "scheduler-cmd-duration",
			tbl:       "einsteindb_scheduler_command_duration",
			condition: "quantile=0.99",
			threshold: 0.1,
		},
		{
			tp:        "einsteindb",
			item:      "handle-snapshot-duration",
			tbl:       "einsteindb_handle_snapshot_duration",
			threshold: 30,
		},
		{
			tp:        "einsteindb",
			item:      "storage-write-duration",
			tbl:       "einsteindb_storage_async_request_duration",
			condition: "type='write'",
			threshold: 0.1,
		},
		{
			tp:        "einsteindb",
			item:      "storage-snapshot-duration",
			tbl:       "einsteindb_storage_async_request_duration",
			condition: "type='snapshot'",
			threshold: 0.05,
		},
		{
			tp:        "einsteindb",
			item:      "lmdb-write-duration",
			tbl:       "einsteindb_engine_write_duration",
			condition: "type='write_max'",
			threshold: 0.1,
			factor:    10e5, // the metric unit is microseconds
		},
		{
			tp:        "einsteindb",
			item:      "lmdb-get-duration",
			tbl:       "einsteindb_engine_max_get_duration",
			condition: "type='get_max'",
			threshold: 0.05,
			factor:    10e5, // the metric unit is microseconds
		},
		{
			tp:        "einsteindb",
			item:      "lmdb-seek-duration",
			tbl:       "einsteindb_engine_max_seek_duration",
			condition: "type='seek_max'",
			threshold: 0.05,
			factor:    10e5, // the metric unit is microseconds
		},
		{
			tp:        "einsteindb",
			item:      "scheduler-pending-cmd-count",
			tbl:       "einsteindb_scheduler_pending_commands",
			threshold: 1000,
			detail:    "%s einsteindb scheduler has too many pending commands",
		},
		{
			tp:        "einsteindb",
			item:      "index-causet-cache-hit",
			tbl:       "einsteindb_block_index_cache_hit",
			condition: "value > 0",
			threshold: 0.95,
			isMin:     true,
		},
		{
			tp:        "einsteindb",
			item:      "filter-causet-cache-hit",
			tbl:       "einsteindb_block_filter_cache_hit",
			condition: "value > 0",
			threshold: 0.95,
			isMin:     true,
		},
		{
			tp:        "einsteindb",
			item:      "data-causet-cache-hit",
			tbl:       "einsteindb_block_data_cache_hit",
			condition: "value > 0",
			threshold: 0.80,
			isMin:     true,
		},
	}
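
	// Unit note derived from the factors above: metrics recorded in
	// microseconds are divided by 10e5 (= 1e6) before the comparison, so an
	// einsteindb_engine_write_duration sample of 250000us becomes 0.25s and
	// trips the 0.1s lmdb-write-duration threshold.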

	condition := filter.timeRange.Condition()
	var results []inspectionResult
	for _, rule := range rules {
		if !filter.enable(rule.item) {
			continue
		}
		var allegrosql string
		cond := condition
		if len(rule.condition) > 0 {
			cond = fmt.Sprintf("%s and %s", cond, rule.condition)
		}
		if rule.factor == 0 {
			rule.factor = 1
		}
		if rule.isMin {
			allegrosql = fmt.Sprintf("select instance, min(value)/%.0f as min_value from metrics_schema.%s %s group by instance having min_value < %f;", rule.factor, rule.tbl, cond, rule.threshold)
		} else {
			allegrosql = fmt.Sprintf("select instance, max(value)/%.0f as max_value from metrics_schema.%s %s group by instance having max_value > %f;", rule.factor, rule.tbl, cond, rule.threshold)
		}
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
			continue
		}
		for _, event := range rows {
			actual := fmt.Sprintf("%.3f", event.GetFloat64(1))
			degree := math.Abs(event.GetFloat64(1)-rule.threshold) / math.Max(event.GetFloat64(1), rule.threshold)
			expected := ""
			if rule.isMin {
				expected = fmt.Sprintf("> %.3f", rule.threshold)
			} else {
				expected = fmt.Sprintf("< %.3f", rule.threshold)
			}
			detail := rule.detail
			if len(detail) == 0 {
				if strings.HasSuffix(rule.item, "duration") {
					detail = fmt.Sprintf("the max duration of %s %s %s is too long", event.GetString(0), rule.tp, rule.item)
				} else if strings.HasSuffix(rule.item, "hit") {
					detail = fmt.Sprintf("the min %s rate of %s %s is too low", rule.item, event.GetString(0), rule.tp)
				}
			} else {
				detail = fmt.Sprintf(detail, event.GetString(0))
			}
			result := inspectionResult{
				tp:            rule.tp,
				statusAddress: event.GetString(0),
				item:          rule.item,
				actual:        actual,
				expected:      expected,
				severity:      "warning",
				detail:        detail,
				degree:        degree,
			}
			results = append(results, result)
		}
	}
	return results
}

type ruleChecker interface {
	genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string
	genResult(allegrosql string, event chunk.Event) inspectionResult
	getItem() string
}

type compareStoreStatus struct {
	item      string
	tp        string
	threshold float64
}

func (c compareStoreStatus) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	condition := fmt.Sprintf(`where t1.time>='%[1]s' and t1.time<='%[2]s' and
		 t2.time>='%[1]s' and t2.time<='%[2]s'`, timeRange.From.Format(causetembedded.MetricBlockTimeFormat),
		timeRange.To.Format(causetembedded.MetricBlockTimeFormat))
	return fmt.Sprintf(`
		SELECT t1.address,
        	max(t1.value),
        	t2.address,
        	min(t2.value),
         	max((t1.value-t2.value)/t1.value) AS ratio
		FROM metrics_schema.FIDel_scheduler_store_status t1
		JOIN metrics_schema.FIDel_scheduler_store_status t2 %s
        	AND t1.type='%s'
        	AND t1.time = t2.time
        	AND t1.type=t2.type
        	AND t1.address != t2.address
        	AND (t1.value-t2.value)/t1.value>%v
        	AND t1.value > 0
		GROUP BY  t1.address,t2.address
		ORDER BY  ratio desc`, condition, c.tp, c.threshold)
}
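
// Worked example (values assumed): if one store's max leader_sembedded is 120
// while another store's min is 100 at the same instant, the reported ratio is
// (120-100)/120 ~= 0.167, which exceeds the 0.05 threshold; genResult below
// then attributes the imbalance to the store with the lower value.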

func (c compareStoreStatus) genResult(_ string, event chunk.Event) inspectionResult {
	addr1 := event.GetString(0)
	value1 := event.GetFloat64(1)
	addr2 := event.GetString(2)
	value2 := event.GetFloat64(3)
	ratio := event.GetFloat64(4)
	detail := fmt.Sprintf("%v max %s is %.2f, much more than the %v min %s %.2f", addr1, c.tp, value1, addr2, c.tp, value2)
	return inspectionResult{
		tp:       "einsteindb",
		instance: addr2,
		item:     c.item,
		actual:   fmt.Sprintf("%.2f%%", ratio*100),
		expected: fmt.Sprintf("< %.2f%%", c.threshold*100),
		severity: "warning",
		detail:   detail,
		degree:   ratio,
	}
}

func (c compareStoreStatus) getItem() string {
	return c.item
}

type checkRegionHealth struct{}

func (c checkRegionHealth) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	condition := timeRange.Condition()
	return fmt.Sprintf(`select instance, sum(value) as sum_value from metrics_schema.FIDel_region_health %s and
		type in ('extra-peer-region-count','learner-peer-region-count','pending-peer-region-count') having sum_value>100`, condition)
}

func (c checkRegionHealth) genResult(_ string, event chunk.Event) inspectionResult {
	detail := fmt.Sprintf("the total count of extra-peer, learner-peer and pending-peer regions is %v, which means the scheduling is too frequent or too slow", event.GetFloat64(1))
	actual := fmt.Sprintf("%.2f", event.GetFloat64(1))
	degree := math.Abs(event.GetFloat64(1)-100) / math.Max(event.GetFloat64(1), 100)
	return inspectionResult{
		tp:       "fidel",
		instance: event.GetString(0),
		item:     c.getItem(),
		actual:   actual,
		expected: "< 100",
		severity: "warning",
		detail:   detail,
		degree:   degree,
	}
}

func (c checkRegionHealth) getItem() string {
	return "region-health"
}

type checkStoreRegionTooMuch struct{}

func (c checkStoreRegionTooMuch) genALLEGROSQL(timeRange causetembedded.QueryTimeRange) string {
	condition := timeRange.Condition()
	return fmt.Sprintf(`select address, max(value) from metrics_schema.FIDel_scheduler_store_status %s and type='region_count' and value > 20000 group by address`, condition)
}

func (c checkStoreRegionTooMuch) genResult(allegrosql string, event chunk.Event) inspectionResult {
	actual := fmt.Sprintf("%.2f", event.GetFloat64(1))
	degree := math.Abs(event.GetFloat64(1)-20000) / math.Max(event.GetFloat64(1), 20000)
	return inspectionResult{
		tp:       "einsteindb",
		instance: event.GetString(0),
		item:     c.getItem(),
		actual:   actual,
		expected: "<= 20000",
		severity: "warning",
		detail:   fmt.Sprintf("%s einsteindb has too many regions", event.GetString(0)),
		degree:   degree,
	}
}

func (c checkStoreRegionTooMuch) getItem() string {
	return "region-count"
}

func (thresholdCheckInspection) inspectThreshold3(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	var rules = []ruleChecker{
		compareStoreStatus{
			item:      "leader-sembedded-balance",
			tp:        "leader_sembedded",
			threshold: 0.05,
		},
		compareStoreStatus{
			item:      "region-sembedded-balance",
			tp:        "region_sembedded",
			threshold: 0.05,
		},
		compareStoreStatus{
			item:      "causetstore-available-balance",
			tp:        "store_available",
			threshold: 0.2,
		},
		checkRegionHealth{},
		checkStoreRegionTooMuch{},
	}
	return checkMemrules(ctx, sctx, filter, rules)
}

func checkMemrules(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter, rules []ruleChecker) []inspectionResult {
	var results []inspectionResult
	for _, rule := range rules {
		if !filter.enable(rule.getItem()) {
			continue
		}
		allegrosql := rule.genALLEGROSQL(filter.timeRange)
		rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
			continue
		}
		for _, event := range rows {
			results = append(results, rule.genResult(allegrosql, event))
		}
	}
	return results
}

func (c thresholdCheckInspection) inspectForLeaderDrop(ctx context.Context, sctx stochastikctx.Context, filter inspectionFilter) []inspectionResult {
	condition := filter.timeRange.Condition()
	threshold := 50.0
	allegrosql := fmt.Sprintf(`select address,min(value) as mi,max(value) as mx from metrics_schema.FIDel_scheduler_store_status %s and type='leader_count' group by address having mx-mi>%v`, condition, threshold)
	rows, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
	if err != nil {
		sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
		return nil
	}
	var results []inspectionResult
	for _, event := range rows {
		address := event.GetString(0)
		allegrosql := fmt.Sprintf(`select time, value from metrics_schema.FIDel_scheduler_store_status %s and type='leader_count' and address = '%s' order by time`, condition, address)
		subEvents, _, err := sctx.(sqlexec.RestrictedALLEGROSQLInterlockingDirectorate).InterDircRestrictedALLEGROSQLWithContext(ctx, allegrosql)
		if err != nil {
			sctx.GetStochastikVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", allegrosql, err))
			continue
		}
		lastValue := float64(0)
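		// Illustrative series (values assumed): samples 500 -> 470 -> 440
		// never drop more than the threshold of 50 between consecutive
		// points, so nothing is reported; 500 -> 430 drops 70 and yields a
		// warning, and a drop all the way to 0 escalates to critical.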
		for i, subEvent := range subEvents {
			v := subEvent.GetFloat64(1)
			if i == 0 {
				lastValue = v
				continue
			}
			if lastValue-v > threshold {
				level := "warning"
				if v == 0 {
					level = "critical"
				}
				results = append(results, inspectionResult{
					tp:       "einsteindb",
					instance: address,
					item:     "leader-drop",
					actual:   fmt.Sprintf("%.0f", lastValue-v),
					expected: fmt.Sprintf("<= %.0f", threshold),
					severity: level,
					detail:   fmt.Sprintf("%s einsteindb dropped too many leaders around time %s, leader count dropped from %.0f to %.0f", address, subEvent.GetTime(0), lastValue, v),
					degree:   lastValue - v,
				})
				break
			}
			lastValue = v
		}
	}
	return results
}