github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/distributor.go (about)

     1  package alertmanager
     2  
     3  import (
     4  	"context"
     5  	"hash/fnv"
     6  	"io/ioutil"
     7  	"math/rand"
     8  	"net/http"
     9  	"path"
    10  	"strings"
    11  	"sync"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/grafana/dskit/ring"
    16  	"github.com/grafana/dskit/ring/client"
    17  	"github.com/grafana/dskit/services"
    18  	"github.com/opentracing/opentracing-go"
    19  	"github.com/pkg/errors"
    20  	"github.com/prometheus/client_golang/prometheus"
    21  	"github.com/weaveworks/common/httpgrpc"
    22  	"github.com/weaveworks/common/user"
    23  
    24  	"github.com/cortexproject/cortex/pkg/alertmanager/merger"
    25  	"github.com/cortexproject/cortex/pkg/tenant"
    26  	"github.com/cortexproject/cortex/pkg/util"
    27  	util_log "github.com/cortexproject/cortex/pkg/util/log"
    28  )
    29  
    30  // Distributor forwards requests to individual alertmanagers.
    31  type Distributor struct {
    32  	services.Service
    33  
    34  	cfg              ClientConfig
    35  	maxRecvMsgSize   int64
    36  	requestsInFlight sync.WaitGroup
    37  
    38  	alertmanagerRing        ring.ReadRing
    39  	alertmanagerClientsPool ClientsPool
    40  
    41  	logger log.Logger
    42  }
    43  
    44  // NewDistributor constructs a new Distributor
    45  func NewDistributor(cfg ClientConfig, maxRecvMsgSize int64, alertmanagersRing *ring.Ring, alertmanagerClientsPool ClientsPool, logger log.Logger, reg prometheus.Registerer) (d *Distributor, err error) {
    46  	if alertmanagerClientsPool == nil {
    47  		alertmanagerClientsPool = newAlertmanagerClientsPool(client.NewRingServiceDiscovery(alertmanagersRing), cfg, logger, reg)
    48  	}
    49  
    50  	d = &Distributor{
    51  		cfg:                     cfg,
    52  		logger:                  logger,
    53  		maxRecvMsgSize:          maxRecvMsgSize,
    54  		alertmanagerRing:        alertmanagersRing,
    55  		alertmanagerClientsPool: alertmanagerClientsPool,
    56  	}
    57  
    58  	d.Service = services.NewBasicService(nil, d.running, nil)
    59  	return d, nil
    60  }
    61  
    62  func (d *Distributor) running(ctx context.Context) error {
    63  	<-ctx.Done()
    64  	d.requestsInFlight.Wait()
    65  	return nil
    66  }
    67  
    68  // IsPathSupported returns true if the given route is currently supported by the Distributor.
    69  func (d *Distributor) IsPathSupported(p string) bool {
    70  	// API can be found at https://petstore.swagger.io/?url=https://raw.githubusercontent.com/prometheus/alertmanager/master/api/v2/openapi.yaml.
    71  	isQuorumReadPath, _ := d.isQuorumReadPath(p)
    72  	return d.isQuorumWritePath(p) || d.isUnaryWritePath(p) || d.isUnaryDeletePath(p) || d.isUnaryReadPath(p) || isQuorumReadPath
    73  }
    74  
    75  func (d *Distributor) isQuorumWritePath(p string) bool {
    76  	return strings.HasSuffix(p, "/alerts")
    77  }
    78  
    79  func (d *Distributor) isUnaryWritePath(p string) bool {
    80  	return strings.HasSuffix(p, "/silences")
    81  }
    82  
    83  func (d *Distributor) isUnaryDeletePath(p string) bool {
    84  	return strings.HasSuffix(path.Dir(p), "/silence")
    85  }
    86  
    87  func (d *Distributor) isQuorumReadPath(p string) (bool, merger.Merger) {
    88  	if strings.HasSuffix(p, "/v1/alerts") {
    89  		return true, merger.V1Alerts{}
    90  	}
    91  	if strings.HasSuffix(p, "/v2/alerts") {
    92  		return true, merger.V2Alerts{}
    93  	}
    94  	if strings.HasSuffix(p, "/v2/alerts/groups") {
    95  		return true, merger.V2AlertGroups{}
    96  	}
    97  	if strings.HasSuffix(p, "/v1/silences") {
    98  		return true, merger.V1Silences{}
    99  	}
   100  	if strings.HasSuffix(path.Dir(p), "/v1/silence") {
   101  		return true, merger.V1SilenceID{}
   102  	}
   103  	if strings.HasSuffix(p, "/v2/silences") {
   104  		return true, merger.V2Silences{}
   105  	}
   106  	if strings.HasSuffix(path.Dir(p), "/v2/silence") {
   107  		return true, merger.V2SilenceID{}
   108  	}
   109  	return false, nil
   110  }
   111  
   112  func (d *Distributor) isUnaryReadPath(p string) bool {
   113  	return strings.HasSuffix(p, "/status") ||
   114  		strings.HasSuffix(p, "/receivers")
   115  }
   116  
   117  // DistributeRequest shards the writes and returns as soon as the quorum is satisfied.
   118  // In case of reads, it proxies the request to one of the alertmanagers.
   119  // DistributeRequest assumes that the caller has verified IsPathSupported returns
   120  // true for the route.
   121  func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) {
   122  	d.requestsInFlight.Add(1)
   123  	defer d.requestsInFlight.Done()
   124  
   125  	userID, err := tenant.TenantID(r.Context())
   126  	if err != nil {
   127  		http.Error(w, err.Error(), http.StatusUnauthorized)
   128  		return
   129  	}
   130  
   131  	logger := util_log.WithContext(r.Context(), d.logger)
   132  
   133  	if r.Method == http.MethodPost {
   134  		if d.isQuorumWritePath(r.URL.Path) {
   135  			d.doQuorum(userID, w, r, logger, merger.Noop{})
   136  			return
   137  		}
   138  		if d.isUnaryWritePath(r.URL.Path) {
   139  			d.doUnary(userID, w, r, logger)
   140  			return
   141  		}
   142  	}
   143  	if r.Method == http.MethodDelete {
   144  		if d.isUnaryDeletePath(r.URL.Path) {
   145  			d.doUnary(userID, w, r, logger)
   146  			return
   147  		}
   148  	}
   149  	if r.Method == http.MethodGet || r.Method == http.MethodHead {
   150  		if ok, m := d.isQuorumReadPath(r.URL.Path); ok {
   151  			d.doQuorum(userID, w, r, logger, m)
   152  			return
   153  		}
   154  		if d.isUnaryReadPath(r.URL.Path) {
   155  			d.doUnary(userID, w, r, logger)
   156  			return
   157  		}
   158  	}
   159  
   160  	http.Error(w, "route not supported by distributor", http.StatusNotFound)
   161  }
   162  
   163  func (d *Distributor) doQuorum(userID string, w http.ResponseWriter, r *http.Request, logger log.Logger, m merger.Merger) {
   164  	var body []byte
   165  	var err error
   166  	if r.Body != nil {
   167  		body, err = ioutil.ReadAll(http.MaxBytesReader(w, r.Body, d.maxRecvMsgSize))
   168  		if err != nil {
   169  			if util.IsRequestBodyTooLarge(err) {
   170  				http.Error(w, "Request body too large", http.StatusRequestEntityTooLarge)
   171  				return
   172  			}
   173  			level.Error(logger).Log("msg", "failed to read the request body during write", "err", err)
   174  			w.WriteHeader(http.StatusInternalServerError)
   175  			return
   176  		}
   177  	}
   178  
   179  	var responses []*httpgrpc.HTTPResponse
   180  	var responsesMtx sync.Mutex
   181  	grpcHeaders := httpToHttpgrpcHeaders(r.Header)
   182  	err = ring.DoBatch(r.Context(), RingOp, d.alertmanagerRing, []uint32{shardByUser(userID)}, func(am ring.InstanceDesc, _ []int) error {
   183  		// Use a background context to make sure all alertmanagers get the request even if we return early.
   184  		localCtx := user.InjectOrgID(context.Background(), userID)
   185  		sp, localCtx := opentracing.StartSpanFromContext(localCtx, "Distributor.doQuorum")
   186  		defer sp.Finish()
   187  
   188  		resp, err := d.doRequest(localCtx, am, &httpgrpc.HTTPRequest{
   189  			Method:  r.Method,
   190  			Url:     r.RequestURI,
   191  			Body:    body,
   192  			Headers: grpcHeaders,
   193  		})
   194  		if err != nil {
   195  			return err
   196  		}
   197  
   198  		if resp.Code/100 != 2 {
   199  			return httpgrpc.ErrorFromHTTPResponse(resp)
   200  		}
   201  
   202  		responsesMtx.Lock()
   203  		responses = append(responses, resp)
   204  		responsesMtx.Unlock()
   205  
   206  		return nil
   207  	}, func() {})
   208  
   209  	if err != nil {
   210  		respondFromError(err, w, logger)
   211  		return
   212  	}
   213  
   214  	responsesMtx.Lock() // Another request might be ongoing after quorum.
   215  	resps := responses
   216  	responsesMtx.Unlock()
   217  
   218  	if len(resps) > 0 {
   219  		respondFromMultipleHTTPGRPCResponses(w, logger, resps, m)
   220  	} else {
   221  		// This should not happen.
   222  		level.Error(logger).Log("msg", "distributor did not receive any response from alertmanagers, but there were no errors")
   223  		w.WriteHeader(http.StatusInternalServerError)
   224  	}
   225  }
   226  
   227  func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Request, logger log.Logger) {
   228  	key := shardByUser(userID)
   229  	replicationSet, err := d.alertmanagerRing.Get(key, RingOp, nil, nil, nil)
   230  	if err != nil {
   231  		level.Error(logger).Log("msg", "failed to get replication set from the ring", "err", err)
   232  		w.WriteHeader(http.StatusInternalServerError)
   233  		return
   234  	}
   235  
   236  	body, err := ioutil.ReadAll(http.MaxBytesReader(w, r.Body, d.maxRecvMsgSize))
   237  	if err != nil {
   238  		if util.IsRequestBodyTooLarge(err) {
   239  			http.Error(w, "Request body too large", http.StatusRequestEntityTooLarge)
   240  			return
   241  		}
   242  		level.Error(logger).Log("msg", "failed to read the request body during read", "err", err)
   243  		w.WriteHeader(http.StatusInternalServerError)
   244  		return
   245  	}
   246  	req := &httpgrpc.HTTPRequest{
   247  		Method:  r.Method,
   248  		Url:     r.RequestURI,
   249  		Body:    body,
   250  		Headers: httpToHttpgrpcHeaders(r.Header),
   251  	}
   252  
   253  	sp, ctx := opentracing.StartSpanFromContext(r.Context(), "Distributor.doUnary")
   254  	defer sp.Finish()
   255  	// Until we have a mechanism to combine the results from multiple alertmanagers,
   256  	// we forward the request to only only of the alertmanagers.
   257  	amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))]
   258  	resp, err := d.doRequest(ctx, amDesc, req)
   259  	if err != nil {
   260  		respondFromError(err, w, logger)
   261  		return
   262  	}
   263  
   264  	respondFromHTTPGRPCResponse(w, resp)
   265  }
   266  
   267  func respondFromError(err error, w http.ResponseWriter, logger log.Logger) {
   268  	httpResp, ok := httpgrpc.HTTPResponseFromError(errors.Cause(err))
   269  	if !ok {
   270  		level.Error(logger).Log("msg", "failed to process the request to the alertmanager", "err", err)
   271  		http.Error(w, "Failed to process the request to the alertmanager", http.StatusInternalServerError)
   272  		return
   273  	}
   274  	respondFromHTTPGRPCResponse(w, httpResp)
   275  }
   276  
   277  func respondFromHTTPGRPCResponse(w http.ResponseWriter, httpResp *httpgrpc.HTTPResponse) {
   278  	for _, h := range httpResp.Headers {
   279  		for _, v := range h.Values {
   280  			w.Header().Add(h.Key, v)
   281  		}
   282  	}
   283  	w.WriteHeader(int(httpResp.Code))
   284  	w.Write(httpResp.Body) //nolint
   285  }
   286  
   287  func (d *Distributor) doRequest(ctx context.Context, am ring.InstanceDesc, req *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) {
   288  	ctx, cancel := context.WithTimeout(ctx, d.cfg.RemoteTimeout)
   289  	defer cancel()
   290  	amClient, err := d.alertmanagerClientsPool.GetClientFor(am.Addr)
   291  	if err != nil {
   292  		return nil, errors.Wrapf(err, "failed to get alertmanager client from pool (alertmanager address: %s)", am.Addr)
   293  	}
   294  
   295  	return amClient.HandleRequest(ctx, req)
   296  }
   297  
   298  func shardByUser(userID string) uint32 {
   299  	ringHasher := fnv.New32a()
   300  	// Hasher never returns err.
   301  	_, _ = ringHasher.Write([]byte(userID))
   302  	return ringHasher.Sum32()
   303  }
   304  
   305  func httpToHttpgrpcHeaders(hs http.Header) []*httpgrpc.Header {
   306  	result := make([]*httpgrpc.Header, 0, len(hs))
   307  	for k, vs := range hs {
   308  		result = append(result, &httpgrpc.Header{
   309  			Key:    k,
   310  			Values: vs,
   311  		})
   312  	}
   313  	return result
   314  }
   315  
   316  func respondFromMultipleHTTPGRPCResponses(w http.ResponseWriter, logger log.Logger, responses []*httpgrpc.HTTPResponse, merger merger.Merger) {
   317  	bodies := make([][]byte, len(responses))
   318  	for i, r := range responses {
   319  		bodies[i] = r.Body
   320  	}
   321  
   322  	body, err := merger.MergeResponses(bodies)
   323  	if err != nil {
   324  		level.Error(logger).Log("msg", "failed to merge responses for request", "err", err)
   325  		w.WriteHeader(http.StatusInternalServerError)
   326  		return
   327  	}
   328  
   329  	// It is assumed by using this function, the caller knows that the responses it receives
   330  	// have already been checked for success or failure, and that the headers will always
   331  	// match due to the nature of the request. If this is not the case, a different merge
   332  	// function should be implemented to cope with the differing responses.
   333  	response := &httpgrpc.HTTPResponse{
   334  		Code:    responses[0].Code,
   335  		Headers: responses[0].Headers,
   336  		Body:    body,
   337  	}
   338  
   339  	respondFromHTTPGRPCResponse(w, response)
   340  }