github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/api.go (about)

     1  package ruler
     2  
     3  import (
     4  	"encoding/json"
     5  	"io/ioutil"
     6  	"net/http"
     7  	"net/url"
     8  	"sort"
     9  	"strconv"
    10  	"strings"
    11  	"time"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/gorilla/mux"
    16  	"github.com/pkg/errors"
    17  	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
    18  	"github.com/prometheus/prometheus/pkg/labels"
    19  	"github.com/prometheus/prometheus/pkg/rulefmt"
    20  	"github.com/weaveworks/common/user"
    21  	"gopkg.in/yaml.v3"
    22  
    23  	"github.com/cortexproject/cortex/pkg/cortexpb"
    24  	"github.com/cortexproject/cortex/pkg/ruler/rulespb"
    25  	"github.com/cortexproject/cortex/pkg/ruler/rulestore"
    26  	"github.com/cortexproject/cortex/pkg/tenant"
    27  	util_log "github.com/cortexproject/cortex/pkg/util/log"
    28  )
    29  
// In order to reimplement the Prometheus rules API, a large amount of code was copied over.
// This is required because the Prometheus API implementation does not allow us to return errors
// on rule lookups, which might fail in Cortex's case.
    33  
// response is the JSON envelope written by the handlers in this file.
// Status is "success" or "error"; on errors ErrorType and Error describe the
// failure and Data is nil, on success Data carries the payload.
type response struct {
	Status    string       `json:"status"`
	Data      interface{}  `json:"data"`
	ErrorType v1.ErrorType `json:"errorType"`
	Error     string       `json:"error"`
}
    40  
// AlertDiscovery has info for all active alerts. It is the payload placed in
// the "data" field of the alerts response.
type AlertDiscovery struct {
	Alerts []*Alert `json:"alerts"`
}
    45  
// Alert has info for an alert. Value is carried as a string; the handlers in
// this file fill it by formatting the alert's float value with
// strconv.FormatFloat.
type Alert struct {
	Labels      labels.Labels `json:"labels"`
	Annotations labels.Labels `json:"annotations"`
	State       string        `json:"state"`
	ActiveAt    *time.Time    `json:"activeAt"`
	Value       string        `json:"value"`
}
    54  
// RuleDiscovery has info for all rules. It is the payload placed in the
// "data" field of the rules response.
type RuleDiscovery struct {
	RuleGroups []*RuleGroup `json:"groups"`
}
    59  
// RuleGroup has info for rules which are part of a group. The handlers in this
// file populate File from the group's namespace, and Interval and
// EvaluationTime from durations converted to seconds.
type RuleGroup struct {
	Name string `json:"name"`
	File string `json:"file"`
	// In order to preserve rule ordering, while exposing type (alerting or recording)
	// specific properties, both alerting and recording rules are exposed in the
	// same array.
	Rules          []rule    `json:"rules"`
	Interval       float64   `json:"interval"`
	LastEvaluation time.Time `json:"lastEvaluation"`
	EvaluationTime float64   `json:"evaluationTime"`
}
    72  
// rule is the element type of RuleGroup.Rules; in this file it holds either an
// alertingRule or a recordingRule value.
type rule interface{}
    74  
// alertingRule is the JSON representation of an alerting rule inside a
// RuleGroup. Duration and EvaluationTime are filled from durations converted
// to seconds.
type alertingRule struct {
	// State can be "pending", "firing", "inactive".
	State          string        `json:"state"`
	Name           string        `json:"name"`
	Query          string        `json:"query"`
	Duration       float64       `json:"duration"`
	Labels         labels.Labels `json:"labels"`
	Annotations    labels.Labels `json:"annotations"`
	Alerts         []*Alert      `json:"alerts"`
	Health         string        `json:"health"`
	LastError      string        `json:"lastError"`
	Type           v1.RuleType   `json:"type"`
	LastEvaluation time.Time     `json:"lastEvaluation"`
	EvaluationTime float64       `json:"evaluationTime"`
}
    90  
// recordingRule is the JSON representation of a recording rule inside a
// RuleGroup. EvaluationTime is filled from a duration converted to seconds.
type recordingRule struct {
	Name           string        `json:"name"`
	Query          string        `json:"query"`
	Labels         labels.Labels `json:"labels"`
	Health         string        `json:"health"`
	LastError      string        `json:"lastError"`
	Type           v1.RuleType   `json:"type"`
	LastEvaluation time.Time     `json:"lastEvaluation"`
	EvaluationTime float64       `json:"evaluationTime"`
}
   101  
   102  func respondError(logger log.Logger, w http.ResponseWriter, msg string) {
   103  	b, err := json.Marshal(&response{
   104  		Status:    "error",
   105  		ErrorType: v1.ErrServer,
   106  		Error:     msg,
   107  		Data:      nil,
   108  	})
   109  
   110  	if err != nil {
   111  		level.Error(logger).Log("msg", "error marshaling json response", "err", err)
   112  		http.Error(w, err.Error(), http.StatusInternalServerError)
   113  		return
   114  	}
   115  
   116  	w.WriteHeader(http.StatusInternalServerError)
   117  	if n, err := w.Write(b); err != nil {
   118  		level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
   119  	}
   120  }
   121  
// API is used to handle HTTP requests for the ruler service
type API struct {
	// ruler is queried for rule state and used for validation and limit checks.
	ruler *Ruler
	// store is the rule store read and written by the rule-group handlers.
	store rulestore.RuleStore

	// logger is the base logger; handlers derive a per-request logger from it.
	logger log.Logger
}
   129  
   130  // NewAPI returns a new API struct with the provided ruler and rule store
   131  func NewAPI(r *Ruler, s rulestore.RuleStore, logger log.Logger) *API {
   132  	return &API{
   133  		ruler:  r,
   134  		store:  s,
   135  		logger: logger,
   136  	}
   137  }
   138  
   139  func (a *API) PrometheusRules(w http.ResponseWriter, req *http.Request) {
   140  	logger := util_log.WithContext(req.Context(), a.logger)
   141  	userID, err := tenant.TenantID(req.Context())
   142  	if err != nil || userID == "" {
   143  		level.Error(logger).Log("msg", "error extracting org id from context", "err", err)
   144  		respondError(logger, w, "no valid org id found")
   145  		return
   146  	}
   147  
   148  	w.Header().Set("Content-Type", "application/json")
   149  	rgs, err := a.ruler.GetRules(req.Context())
   150  
   151  	if err != nil {
   152  		respondError(logger, w, err.Error())
   153  		return
   154  	}
   155  
   156  	groups := make([]*RuleGroup, 0, len(rgs))
   157  
   158  	for _, g := range rgs {
   159  		grp := RuleGroup{
   160  			Name:           g.Group.Name,
   161  			File:           g.Group.Namespace,
   162  			Rules:          make([]rule, len(g.ActiveRules)),
   163  			Interval:       g.Group.Interval.Seconds(),
   164  			LastEvaluation: g.GetEvaluationTimestamp(),
   165  			EvaluationTime: g.GetEvaluationDuration().Seconds(),
   166  		}
   167  
   168  		for i, rl := range g.ActiveRules {
   169  			if g.ActiveRules[i].Rule.Alert != "" {
   170  				alerts := make([]*Alert, 0, len(rl.Alerts))
   171  				for _, a := range rl.Alerts {
   172  					alerts = append(alerts, &Alert{
   173  						Labels:      cortexpb.FromLabelAdaptersToLabels(a.Labels),
   174  						Annotations: cortexpb.FromLabelAdaptersToLabels(a.Annotations),
   175  						State:       a.GetState(),
   176  						ActiveAt:    &a.ActiveAt,
   177  						Value:       strconv.FormatFloat(a.Value, 'e', -1, 64),
   178  					})
   179  				}
   180  				grp.Rules[i] = alertingRule{
   181  					State:          rl.GetState(),
   182  					Name:           rl.Rule.GetAlert(),
   183  					Query:          rl.Rule.GetExpr(),
   184  					Duration:       rl.Rule.For.Seconds(),
   185  					Labels:         cortexpb.FromLabelAdaptersToLabels(rl.Rule.Labels),
   186  					Annotations:    cortexpb.FromLabelAdaptersToLabels(rl.Rule.Annotations),
   187  					Alerts:         alerts,
   188  					Health:         rl.GetHealth(),
   189  					LastError:      rl.GetLastError(),
   190  					LastEvaluation: rl.GetEvaluationTimestamp(),
   191  					EvaluationTime: rl.GetEvaluationDuration().Seconds(),
   192  					Type:           v1.RuleTypeAlerting,
   193  				}
   194  			} else {
   195  				grp.Rules[i] = recordingRule{
   196  					Name:           rl.Rule.GetRecord(),
   197  					Query:          rl.Rule.GetExpr(),
   198  					Labels:         cortexpb.FromLabelAdaptersToLabels(rl.Rule.Labels),
   199  					Health:         rl.GetHealth(),
   200  					LastError:      rl.GetLastError(),
   201  					LastEvaluation: rl.GetEvaluationTimestamp(),
   202  					EvaluationTime: rl.GetEvaluationDuration().Seconds(),
   203  					Type:           v1.RuleTypeRecording,
   204  				}
   205  			}
   206  		}
   207  		groups = append(groups, &grp)
   208  	}
   209  
   210  	// keep data.groups are in order
   211  	sort.Slice(groups, func(i, j int) bool {
   212  		return groups[i].File < groups[j].File
   213  	})
   214  
   215  	b, err := json.Marshal(&response{
   216  		Status: "success",
   217  		Data:   &RuleDiscovery{RuleGroups: groups},
   218  	})
   219  	if err != nil {
   220  		level.Error(logger).Log("msg", "error marshaling json response", "err", err)
   221  		respondError(logger, w, "unable to marshal the requested data")
   222  		return
   223  	}
   224  	w.Header().Set("Content-Type", "application/json")
   225  	w.WriteHeader(http.StatusOK)
   226  	if n, err := w.Write(b); err != nil {
   227  		level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
   228  	}
   229  }
   230  
   231  func (a *API) PrometheusAlerts(w http.ResponseWriter, req *http.Request) {
   232  	logger := util_log.WithContext(req.Context(), a.logger)
   233  	userID, err := tenant.TenantID(req.Context())
   234  	if err != nil || userID == "" {
   235  		level.Error(logger).Log("msg", "error extracting org id from context", "err", err)
   236  		respondError(logger, w, "no valid org id found")
   237  		return
   238  	}
   239  
   240  	w.Header().Set("Content-Type", "application/json")
   241  	rgs, err := a.ruler.GetRules(req.Context())
   242  
   243  	if err != nil {
   244  		respondError(logger, w, err.Error())
   245  		return
   246  	}
   247  
   248  	alerts := []*Alert{}
   249  
   250  	for _, g := range rgs {
   251  		for _, rl := range g.ActiveRules {
   252  			if rl.Rule.Alert != "" {
   253  				for _, a := range rl.Alerts {
   254  					alerts = append(alerts, &Alert{
   255  						Labels:      cortexpb.FromLabelAdaptersToLabels(a.Labels),
   256  						Annotations: cortexpb.FromLabelAdaptersToLabels(a.Annotations),
   257  						State:       a.GetState(),
   258  						ActiveAt:    &a.ActiveAt,
   259  						Value:       strconv.FormatFloat(a.Value, 'e', -1, 64),
   260  					})
   261  				}
   262  			}
   263  		}
   264  	}
   265  
   266  	b, err := json.Marshal(&response{
   267  		Status: "success",
   268  		Data:   &AlertDiscovery{Alerts: alerts},
   269  	})
   270  	if err != nil {
   271  		level.Error(logger).Log("msg", "error marshaling json response", "err", err)
   272  		respondError(logger, w, "unable to marshal the requested data")
   273  		return
   274  	}
   275  	w.Header().Set("Content-Type", "application/json")
   276  	w.WriteHeader(http.StatusOK)
   277  	if n, err := w.Write(b); err != nil {
   278  		level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
   279  	}
   280  }
   281  
   282  var (
   283  	// ErrNoNamespace signals that no namespace was specified in the request
   284  	ErrNoNamespace = errors.New("a namespace must be provided in the request")
   285  	// ErrNoGroupName signals a group name url parameter was not found
   286  	ErrNoGroupName = errors.New("a matching group name must be provided in the request")
   287  	// ErrNoRuleGroups signals the rule group requested does not exist
   288  	ErrNoRuleGroups = errors.New("no rule groups found")
   289  	// ErrBadRuleGroup is returned when the provided rule group can not be unmarshalled
   290  	ErrBadRuleGroup = errors.New("unable to decoded rule group")
   291  )
   292  
   293  func marshalAndSend(output interface{}, w http.ResponseWriter, logger log.Logger) {
   294  	d, err := yaml.Marshal(&output)
   295  	if err != nil {
   296  		level.Error(logger).Log("msg", "error marshalling yaml rule groups", "err", err)
   297  		http.Error(w, err.Error(), http.StatusInternalServerError)
   298  		return
   299  	}
   300  
   301  	w.Header().Set("Content-Type", "application/yaml")
   302  	if _, err := w.Write(d); err != nil {
   303  		level.Error(logger).Log("msg", "error writing yaml response", "err", err)
   304  		return
   305  	}
   306  }
   307  
   308  func respondAccepted(w http.ResponseWriter, logger log.Logger) {
   309  	b, err := json.Marshal(&response{
   310  		Status: "success",
   311  	})
   312  	if err != nil {
   313  		level.Error(logger).Log("msg", "error marshaling json response", "err", err)
   314  		respondError(logger, w, "unable to marshal the requested data")
   315  		return
   316  	}
   317  	w.Header().Set("Content-Type", "application/json")
   318  
   319  	// Return a status accepted because the rule has been stored and queued for polling, but is not currently active
   320  	w.WriteHeader(http.StatusAccepted)
   321  	if n, err := w.Write(b); err != nil {
   322  		level.Error(logger).Log("msg", "error writing response", "bytesWritten", n, "err", err)
   323  	}
   324  }
   325  
   326  // parseNamespace parses the namespace from the provided set of params, in this
   327  // api these params are derived from the url path
   328  func parseNamespace(params map[string]string) (string, error) {
   329  	namespace, exists := params["namespace"]
   330  	if !exists {
   331  		return "", ErrNoNamespace
   332  	}
   333  
   334  	namespace, err := url.PathUnescape(namespace)
   335  	if err != nil {
   336  		return "", err
   337  	}
   338  
   339  	return namespace, nil
   340  }
   341  
   342  // parseGroupName parses the group name from the provided set of params, in this
   343  // api these params are derived from the url path
   344  func parseGroupName(params map[string]string) (string, error) {
   345  	groupName, exists := params["groupName"]
   346  	if !exists {
   347  		return "", ErrNoGroupName
   348  	}
   349  
   350  	groupName, err := url.PathUnescape(groupName)
   351  	if err != nil {
   352  		return "", err
   353  	}
   354  
   355  	return groupName, nil
   356  }
   357  
   358  // parseRequest parses the incoming request to parse out the userID, rules namespace, and rule group name
   359  // and returns them in that order. It also allows users to require a namespace or group name and return
   360  // an error if it they can not be parsed.
   361  func parseRequest(req *http.Request, requireNamespace, requireGroup bool) (string, string, string, error) {
   362  	userID, err := tenant.TenantID(req.Context())
   363  	if err != nil {
   364  		return "", "", "", user.ErrNoOrgID
   365  	}
   366  
   367  	vars := mux.Vars(req)
   368  
   369  	namespace, err := parseNamespace(vars)
   370  	if err != nil {
   371  		if err != ErrNoNamespace || requireNamespace {
   372  			return "", "", "", err
   373  		}
   374  	}
   375  
   376  	group, err := parseGroupName(vars)
   377  	if err != nil {
   378  		if err != ErrNoGroupName || requireGroup {
   379  			return "", "", "", err
   380  		}
   381  	}
   382  
   383  	return userID, namespace, group, nil
   384  }
   385  
   386  func (a *API) ListRules(w http.ResponseWriter, req *http.Request) {
   387  	logger := util_log.WithContext(req.Context(), a.logger)
   388  
   389  	userID, namespace, _, err := parseRequest(req, false, false)
   390  	if err != nil {
   391  		respondError(logger, w, err.Error())
   392  		return
   393  	}
   394  
   395  	level.Debug(logger).Log("msg", "retrieving rule groups with namespace", "userID", userID, "namespace", namespace)
   396  	rgs, err := a.store.ListRuleGroupsForUserAndNamespace(req.Context(), userID, namespace)
   397  	if err != nil {
   398  		http.Error(w, err.Error(), http.StatusBadRequest)
   399  		return
   400  	}
   401  
   402  	if len(rgs) == 0 {
   403  		level.Info(logger).Log("msg", "no rule groups found", "userID", userID)
   404  		http.Error(w, ErrNoRuleGroups.Error(), http.StatusNotFound)
   405  		return
   406  	}
   407  
   408  	err = a.store.LoadRuleGroups(req.Context(), map[string]rulespb.RuleGroupList{userID: rgs})
   409  	if err != nil {
   410  		http.Error(w, err.Error(), http.StatusBadRequest)
   411  		return
   412  	}
   413  
   414  	level.Debug(logger).Log("msg", "retrieved rule groups from rule store", "userID", userID, "num_namespaces", len(rgs))
   415  
   416  	formatted := rgs.Formatted()
   417  	marshalAndSend(formatted, w, logger)
   418  }
   419  
   420  func (a *API) GetRuleGroup(w http.ResponseWriter, req *http.Request) {
   421  	logger := util_log.WithContext(req.Context(), a.logger)
   422  	userID, namespace, groupName, err := parseRequest(req, true, true)
   423  	if err != nil {
   424  		respondError(logger, w, err.Error())
   425  		return
   426  	}
   427  
   428  	rg, err := a.store.GetRuleGroup(req.Context(), userID, namespace, groupName)
   429  	if err != nil {
   430  		if errors.Is(err, rulestore.ErrGroupNotFound) {
   431  			http.Error(w, err.Error(), http.StatusNotFound)
   432  			return
   433  		}
   434  		http.Error(w, err.Error(), http.StatusBadRequest)
   435  		return
   436  	}
   437  
   438  	formatted := rulespb.FromProto(rg)
   439  	marshalAndSend(formatted, w, logger)
   440  }
   441  
   442  func (a *API) CreateRuleGroup(w http.ResponseWriter, req *http.Request) {
   443  	logger := util_log.WithContext(req.Context(), a.logger)
   444  	userID, namespace, _, err := parseRequest(req, true, false)
   445  	if err != nil {
   446  		respondError(logger, w, err.Error())
   447  		return
   448  	}
   449  
   450  	payload, err := ioutil.ReadAll(req.Body)
   451  	if err != nil {
   452  		level.Error(logger).Log("msg", "unable to read rule group payload", "err", err.Error())
   453  		http.Error(w, err.Error(), http.StatusBadRequest)
   454  		return
   455  	}
   456  
   457  	level.Debug(logger).Log("msg", "attempting to unmarshal rulegroup", "userID", userID, "group", string(payload))
   458  
   459  	rg := rulefmt.RuleGroup{}
   460  	err = yaml.Unmarshal(payload, &rg)
   461  	if err != nil {
   462  		level.Error(logger).Log("msg", "unable to unmarshal rule group payload", "err", err.Error())
   463  		http.Error(w, ErrBadRuleGroup.Error(), http.StatusBadRequest)
   464  		return
   465  	}
   466  
   467  	errs := a.ruler.manager.ValidateRuleGroup(rg)
   468  	if len(errs) > 0 {
   469  		e := []string{}
   470  		for _, err := range errs {
   471  			level.Error(logger).Log("msg", "unable to validate rule group payload", "err", err.Error())
   472  			e = append(e, err.Error())
   473  		}
   474  
   475  		http.Error(w, strings.Join(e, ", "), http.StatusBadRequest)
   476  		return
   477  	}
   478  
   479  	if err := a.ruler.AssertMaxRulesPerRuleGroup(userID, len(rg.Rules)); err != nil {
   480  		level.Error(logger).Log("msg", "limit validation failure", "err", err.Error(), "user", userID)
   481  		http.Error(w, err.Error(), http.StatusBadRequest)
   482  		return
   483  	}
   484  
   485  	rgs, err := a.store.ListRuleGroupsForUserAndNamespace(req.Context(), userID, "")
   486  	if err != nil {
   487  		level.Error(logger).Log("msg", "unable to fetch current rule groups for validation", "err", err.Error(), "user", userID)
   488  		http.Error(w, err.Error(), http.StatusInternalServerError)
   489  		return
   490  	}
   491  
   492  	if err := a.ruler.AssertMaxRuleGroups(userID, len(rgs)+1); err != nil {
   493  		level.Error(logger).Log("msg", "limit validation failure", "err", err.Error(), "user", userID)
   494  		http.Error(w, err.Error(), http.StatusBadRequest)
   495  		return
   496  	}
   497  
   498  	rgProto := rulespb.ToProto(userID, namespace, rg)
   499  
   500  	level.Debug(logger).Log("msg", "attempting to store rulegroup", "userID", userID, "group", rgProto.String())
   501  	err = a.store.SetRuleGroup(req.Context(), userID, namespace, rgProto)
   502  	if err != nil {
   503  		level.Error(logger).Log("msg", "unable to store rule group", "err", err.Error())
   504  		http.Error(w, err.Error(), http.StatusInternalServerError)
   505  		return
   506  	}
   507  
   508  	respondAccepted(w, logger)
   509  }
   510  
   511  func (a *API) DeleteNamespace(w http.ResponseWriter, req *http.Request) {
   512  	logger := util_log.WithContext(req.Context(), a.logger)
   513  
   514  	userID, namespace, _, err := parseRequest(req, true, false)
   515  	if err != nil {
   516  		respondError(logger, w, err.Error())
   517  		return
   518  	}
   519  
   520  	err = a.store.DeleteNamespace(req.Context(), userID, namespace)
   521  	if err != nil {
   522  		if err == rulestore.ErrGroupNamespaceNotFound {
   523  			http.Error(w, err.Error(), http.StatusNotFound)
   524  			return
   525  		}
   526  		respondError(logger, w, err.Error())
   527  		return
   528  	}
   529  
   530  	respondAccepted(w, logger)
   531  }
   532  
   533  func (a *API) DeleteRuleGroup(w http.ResponseWriter, req *http.Request) {
   534  	logger := util_log.WithContext(req.Context(), a.logger)
   535  
   536  	userID, namespace, groupName, err := parseRequest(req, true, true)
   537  	if err != nil {
   538  		respondError(logger, w, err.Error())
   539  		return
   540  	}
   541  
   542  	err = a.store.DeleteRuleGroup(req.Context(), userID, namespace, groupName)
   543  	if err != nil {
   544  		if err == rulestore.ErrGroupNotFound {
   545  			http.Error(w, err.Error(), http.StatusNotFound)
   546  			return
   547  		}
   548  		respondError(logger, w, err.Error())
   549  		return
   550  	}
   551  
   552  	respondAccepted(w, logger)
   553  }