github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/alertmanager/distributor.go (about) 1 package alertmanager 2 3 import ( 4 "context" 5 "hash/fnv" 6 "io/ioutil" 7 "math/rand" 8 "net/http" 9 "path" 10 "strings" 11 "sync" 12 13 "github.com/go-kit/log" 14 "github.com/go-kit/log/level" 15 "github.com/grafana/dskit/ring" 16 "github.com/grafana/dskit/ring/client" 17 "github.com/grafana/dskit/services" 18 "github.com/opentracing/opentracing-go" 19 "github.com/pkg/errors" 20 "github.com/prometheus/client_golang/prometheus" 21 "github.com/weaveworks/common/httpgrpc" 22 "github.com/weaveworks/common/user" 23 24 "github.com/cortexproject/cortex/pkg/alertmanager/merger" 25 "github.com/cortexproject/cortex/pkg/tenant" 26 "github.com/cortexproject/cortex/pkg/util" 27 util_log "github.com/cortexproject/cortex/pkg/util/log" 28 ) 29 30 // Distributor forwards requests to individual alertmanagers. 31 type Distributor struct { 32 services.Service 33 34 cfg ClientConfig 35 maxRecvMsgSize int64 36 requestsInFlight sync.WaitGroup 37 38 alertmanagerRing ring.ReadRing 39 alertmanagerClientsPool ClientsPool 40 41 logger log.Logger 42 } 43 44 // NewDistributor constructs a new Distributor 45 func NewDistributor(cfg ClientConfig, maxRecvMsgSize int64, alertmanagersRing *ring.Ring, alertmanagerClientsPool ClientsPool, logger log.Logger, reg prometheus.Registerer) (d *Distributor, err error) { 46 if alertmanagerClientsPool == nil { 47 alertmanagerClientsPool = newAlertmanagerClientsPool(client.NewRingServiceDiscovery(alertmanagersRing), cfg, logger, reg) 48 } 49 50 d = &Distributor{ 51 cfg: cfg, 52 logger: logger, 53 maxRecvMsgSize: maxRecvMsgSize, 54 alertmanagerRing: alertmanagersRing, 55 alertmanagerClientsPool: alertmanagerClientsPool, 56 } 57 58 d.Service = services.NewBasicService(nil, d.running, nil) 59 return d, nil 60 } 61 62 func (d *Distributor) running(ctx context.Context) error { 63 <-ctx.Done() 64 d.requestsInFlight.Wait() 65 return nil 66 } 67 68 // IsPathSupported returns true if the given route is currently supported by the Distributor. 69 func (d *Distributor) IsPathSupported(p string) bool { 70 // API can be found at https://petstore.swagger.io/?url=https://raw.githubusercontent.com/prometheus/alertmanager/master/api/v2/openapi.yaml. 71 isQuorumReadPath, _ := d.isQuorumReadPath(p) 72 return d.isQuorumWritePath(p) || d.isUnaryWritePath(p) || d.isUnaryDeletePath(p) || d.isUnaryReadPath(p) || isQuorumReadPath 73 } 74 75 func (d *Distributor) isQuorumWritePath(p string) bool { 76 return strings.HasSuffix(p, "/alerts") 77 } 78 79 func (d *Distributor) isUnaryWritePath(p string) bool { 80 return strings.HasSuffix(p, "/silences") 81 } 82 83 func (d *Distributor) isUnaryDeletePath(p string) bool { 84 return strings.HasSuffix(path.Dir(p), "/silence") 85 } 86 87 func (d *Distributor) isQuorumReadPath(p string) (bool, merger.Merger) { 88 if strings.HasSuffix(p, "/v1/alerts") { 89 return true, merger.V1Alerts{} 90 } 91 if strings.HasSuffix(p, "/v2/alerts") { 92 return true, merger.V2Alerts{} 93 } 94 if strings.HasSuffix(p, "/v2/alerts/groups") { 95 return true, merger.V2AlertGroups{} 96 } 97 if strings.HasSuffix(p, "/v1/silences") { 98 return true, merger.V1Silences{} 99 } 100 if strings.HasSuffix(path.Dir(p), "/v1/silence") { 101 return true, merger.V1SilenceID{} 102 } 103 if strings.HasSuffix(p, "/v2/silences") { 104 return true, merger.V2Silences{} 105 } 106 if strings.HasSuffix(path.Dir(p), "/v2/silence") { 107 return true, merger.V2SilenceID{} 108 } 109 return false, nil 110 } 111 112 func (d *Distributor) isUnaryReadPath(p string) bool { 113 return strings.HasSuffix(p, "/status") || 114 strings.HasSuffix(p, "/receivers") 115 } 116 117 // DistributeRequest shards the writes and returns as soon as the quorum is satisfied. 118 // In case of reads, it proxies the request to one of the alertmanagers. 119 // DistributeRequest assumes that the caller has verified IsPathSupported returns 120 // true for the route. 121 func (d *Distributor) DistributeRequest(w http.ResponseWriter, r *http.Request) { 122 d.requestsInFlight.Add(1) 123 defer d.requestsInFlight.Done() 124 125 userID, err := tenant.TenantID(r.Context()) 126 if err != nil { 127 http.Error(w, err.Error(), http.StatusUnauthorized) 128 return 129 } 130 131 logger := util_log.WithContext(r.Context(), d.logger) 132 133 if r.Method == http.MethodPost { 134 if d.isQuorumWritePath(r.URL.Path) { 135 d.doQuorum(userID, w, r, logger, merger.Noop{}) 136 return 137 } 138 if d.isUnaryWritePath(r.URL.Path) { 139 d.doUnary(userID, w, r, logger) 140 return 141 } 142 } 143 if r.Method == http.MethodDelete { 144 if d.isUnaryDeletePath(r.URL.Path) { 145 d.doUnary(userID, w, r, logger) 146 return 147 } 148 } 149 if r.Method == http.MethodGet || r.Method == http.MethodHead { 150 if ok, m := d.isQuorumReadPath(r.URL.Path); ok { 151 d.doQuorum(userID, w, r, logger, m) 152 return 153 } 154 if d.isUnaryReadPath(r.URL.Path) { 155 d.doUnary(userID, w, r, logger) 156 return 157 } 158 } 159 160 http.Error(w, "route not supported by distributor", http.StatusNotFound) 161 } 162 163 func (d *Distributor) doQuorum(userID string, w http.ResponseWriter, r *http.Request, logger log.Logger, m merger.Merger) { 164 var body []byte 165 var err error 166 if r.Body != nil { 167 body, err = ioutil.ReadAll(http.MaxBytesReader(w, r.Body, d.maxRecvMsgSize)) 168 if err != nil { 169 if util.IsRequestBodyTooLarge(err) { 170 http.Error(w, "Request body too large", http.StatusRequestEntityTooLarge) 171 return 172 } 173 level.Error(logger).Log("msg", "failed to read the request body during write", "err", err) 174 w.WriteHeader(http.StatusInternalServerError) 175 return 176 } 177 } 178 179 var responses []*httpgrpc.HTTPResponse 180 var responsesMtx sync.Mutex 181 grpcHeaders := httpToHttpgrpcHeaders(r.Header) 182 err = ring.DoBatch(r.Context(), RingOp, d.alertmanagerRing, []uint32{shardByUser(userID)}, func(am ring.InstanceDesc, _ []int) error { 183 // Use a background context to make sure all alertmanagers get the request even if we return early. 184 localCtx := user.InjectOrgID(context.Background(), userID) 185 sp, localCtx := opentracing.StartSpanFromContext(localCtx, "Distributor.doQuorum") 186 defer sp.Finish() 187 188 resp, err := d.doRequest(localCtx, am, &httpgrpc.HTTPRequest{ 189 Method: r.Method, 190 Url: r.RequestURI, 191 Body: body, 192 Headers: grpcHeaders, 193 }) 194 if err != nil { 195 return err 196 } 197 198 if resp.Code/100 != 2 { 199 return httpgrpc.ErrorFromHTTPResponse(resp) 200 } 201 202 responsesMtx.Lock() 203 responses = append(responses, resp) 204 responsesMtx.Unlock() 205 206 return nil 207 }, func() {}) 208 209 if err != nil { 210 respondFromError(err, w, logger) 211 return 212 } 213 214 responsesMtx.Lock() // Another request might be ongoing after quorum. 215 resps := responses 216 responsesMtx.Unlock() 217 218 if len(resps) > 0 { 219 respondFromMultipleHTTPGRPCResponses(w, logger, resps, m) 220 } else { 221 // This should not happen. 222 level.Error(logger).Log("msg", "distributor did not receive any response from alertmanagers, but there were no errors") 223 w.WriteHeader(http.StatusInternalServerError) 224 } 225 } 226 227 func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Request, logger log.Logger) { 228 key := shardByUser(userID) 229 replicationSet, err := d.alertmanagerRing.Get(key, RingOp, nil, nil, nil) 230 if err != nil { 231 level.Error(logger).Log("msg", "failed to get replication set from the ring", "err", err) 232 w.WriteHeader(http.StatusInternalServerError) 233 return 234 } 235 236 body, err := ioutil.ReadAll(http.MaxBytesReader(w, r.Body, d.maxRecvMsgSize)) 237 if err != nil { 238 if util.IsRequestBodyTooLarge(err) { 239 http.Error(w, "Request body too large", http.StatusRequestEntityTooLarge) 240 return 241 } 242 level.Error(logger).Log("msg", "failed to read the request body during read", "err", err) 243 w.WriteHeader(http.StatusInternalServerError) 244 return 245 } 246 req := &httpgrpc.HTTPRequest{ 247 Method: r.Method, 248 Url: r.RequestURI, 249 Body: body, 250 Headers: httpToHttpgrpcHeaders(r.Header), 251 } 252 253 sp, ctx := opentracing.StartSpanFromContext(r.Context(), "Distributor.doUnary") 254 defer sp.Finish() 255 // Until we have a mechanism to combine the results from multiple alertmanagers, 256 // we forward the request to only only of the alertmanagers. 257 amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))] 258 resp, err := d.doRequest(ctx, amDesc, req) 259 if err != nil { 260 respondFromError(err, w, logger) 261 return 262 } 263 264 respondFromHTTPGRPCResponse(w, resp) 265 } 266 267 func respondFromError(err error, w http.ResponseWriter, logger log.Logger) { 268 httpResp, ok := httpgrpc.HTTPResponseFromError(errors.Cause(err)) 269 if !ok { 270 level.Error(logger).Log("msg", "failed to process the request to the alertmanager", "err", err) 271 http.Error(w, "Failed to process the request to the alertmanager", http.StatusInternalServerError) 272 return 273 } 274 respondFromHTTPGRPCResponse(w, httpResp) 275 } 276 277 func respondFromHTTPGRPCResponse(w http.ResponseWriter, httpResp *httpgrpc.HTTPResponse) { 278 for _, h := range httpResp.Headers { 279 for _, v := range h.Values { 280 w.Header().Add(h.Key, v) 281 } 282 } 283 w.WriteHeader(int(httpResp.Code)) 284 w.Write(httpResp.Body) //nolint 285 } 286 287 func (d *Distributor) doRequest(ctx context.Context, am ring.InstanceDesc, req *httpgrpc.HTTPRequest) (*httpgrpc.HTTPResponse, error) { 288 ctx, cancel := context.WithTimeout(ctx, d.cfg.RemoteTimeout) 289 defer cancel() 290 amClient, err := d.alertmanagerClientsPool.GetClientFor(am.Addr) 291 if err != nil { 292 return nil, errors.Wrapf(err, "failed to get alertmanager client from pool (alertmanager address: %s)", am.Addr) 293 } 294 295 return amClient.HandleRequest(ctx, req) 296 } 297 298 func shardByUser(userID string) uint32 { 299 ringHasher := fnv.New32a() 300 // Hasher never returns err. 301 _, _ = ringHasher.Write([]byte(userID)) 302 return ringHasher.Sum32() 303 } 304 305 func httpToHttpgrpcHeaders(hs http.Header) []*httpgrpc.Header { 306 result := make([]*httpgrpc.Header, 0, len(hs)) 307 for k, vs := range hs { 308 result = append(result, &httpgrpc.Header{ 309 Key: k, 310 Values: vs, 311 }) 312 } 313 return result 314 } 315 316 func respondFromMultipleHTTPGRPCResponses(w http.ResponseWriter, logger log.Logger, responses []*httpgrpc.HTTPResponse, merger merger.Merger) { 317 bodies := make([][]byte, len(responses)) 318 for i, r := range responses { 319 bodies[i] = r.Body 320 } 321 322 body, err := merger.MergeResponses(bodies) 323 if err != nil { 324 level.Error(logger).Log("msg", "failed to merge responses for request", "err", err) 325 w.WriteHeader(http.StatusInternalServerError) 326 return 327 } 328 329 // It is assumed by using this function, the caller knows that the responses it receives 330 // have already been checked for success or failure, and that the headers will always 331 // match due to the nature of the request. If this is not the case, a different merge 332 // function should be implemented to cope with the differing responses. 333 response := &httpgrpc.HTTPResponse{ 334 Code: responses[0].Code, 335 Headers: responses[0].Headers, 336 Body: body, 337 } 338 339 respondFromHTTPGRPCResponse(w, response) 340 }