github.com/influxdata/influxdb/v2@v2.7.6/replications/remotewrite/writer.go (about) 1 package remotewrite 2 3 import ( 4 "context" 5 "fmt" 6 "math" 7 "net" 8 "net/http" 9 "net/url" 10 "runtime" 11 "strconv" 12 "sync" 13 "time" 14 15 "github.com/influxdata/influx-cli/v2/api" 16 "github.com/influxdata/influxdb/v2" 17 ihttp "github.com/influxdata/influxdb/v2/http" 18 "github.com/influxdata/influxdb/v2/kit/platform" 19 ierrors "github.com/influxdata/influxdb/v2/kit/platform/errors" 20 "github.com/influxdata/influxdb/v2/replications/metrics" 21 "go.uber.org/zap" 22 ) 23 24 const ( 25 retryAfterHeaderKey = "Retry-After" 26 maximumBackoffTime = 15 * time.Minute 27 maximumAttempts = 10 // After this many attempts, wait maximumBackoffTime 28 DefaultTimeout = 2 * time.Minute 29 ) 30 31 var ( 32 userAgent = fmt.Sprintf( 33 "influxdb-oss-replication/%s (%s) Sha/%s Date/%s", 34 influxdb.GetBuildInfo().Version, 35 runtime.GOOS, 36 influxdb.GetBuildInfo().Commit, 37 influxdb.GetBuildInfo().Date) 38 ) 39 40 func invalidRemoteUrl(remoteUrl string, err error) *ierrors.Error { 41 return &ierrors.Error{ 42 Code: ierrors.EInvalid, 43 Msg: fmt.Sprintf("host URL %q is invalid", remoteUrl), 44 Err: err, 45 } 46 } 47 48 func invalidResponseCode(code int, err error) *ierrors.Error { 49 return &ierrors.Error{ 50 Code: ierrors.EInvalid, 51 Msg: fmt.Sprintf("invalid response code %d, must be %d", code, http.StatusNoContent), 52 Err: err, 53 } 54 } 55 56 type HttpConfigStore interface { 57 GetFullHTTPConfig(context.Context, platform.ID) (*influxdb.ReplicationHTTPConfig, error) 58 UpdateResponseInfo(context.Context, platform.ID, int, string) error 59 } 60 61 type waitFunc func(time.Duration) <-chan time.Time 62 63 type writer struct { 64 replicationID platform.ID 65 configStore HttpConfigStore 66 metrics *metrics.ReplicationsMetrics 67 logger *zap.Logger 68 maximumBackoffTime time.Duration 69 maximumAttemptsForBackoffTime int 70 clientTimeout time.Duration 71 done chan struct{} 72 waitFunc waitFunc // used for testing 73 } 74 75 func NewWriter(replicationID platform.ID, store HttpConfigStore, metrics *metrics.ReplicationsMetrics, logger *zap.Logger, done chan struct{}) *writer { 76 return &writer{ 77 replicationID: replicationID, 78 configStore: store, 79 metrics: metrics, 80 logger: logger, 81 maximumBackoffTime: maximumBackoffTime, 82 maximumAttemptsForBackoffTime: maximumAttempts, 83 clientTimeout: DefaultTimeout, 84 done: done, 85 waitFunc: func(t time.Duration) <-chan time.Time { 86 return time.After(t) 87 }, 88 } 89 } 90 91 func (w *writer) Write(data []byte, attempts int) (backoff time.Duration, err error) { 92 cancelOnce := &sync.Once{} 93 // Cancel any outstanding HTTP requests if the replicationQueue is closed. 94 ctx, cancel := context.WithCancel(context.Background()) 95 96 defer func() { 97 cancelOnce.Do(cancel) 98 }() 99 100 go func() { 101 select { 102 case <-w.done: 103 cancelOnce.Do(cancel) 104 case <-ctx.Done(): 105 // context is cancelled already 106 } 107 }() 108 109 // Get the most recent config on every attempt, in case the user has updated the config to correct errors. 110 conf, err := w.configStore.GetFullHTTPConfig(ctx, w.replicationID) 111 if err != nil { 112 return w.backoff(attempts), err 113 } 114 115 res, postWriteErr := PostWrite(ctx, conf, data, w.clientTimeout) 116 res, msg, ok := normalizeResponse(res, postWriteErr) 117 if !ok { 118 // Update Response info: 119 if err := w.configStore.UpdateResponseInfo(ctx, w.replicationID, res.StatusCode, msg); err != nil { 120 w.logger.Debug("failed to update config store with latest remote write response info", zap.Error(err)) 121 return w.backoff(attempts), err 122 } 123 // bail out 124 return w.backoff(attempts), postWriteErr 125 } 126 127 // Update metrics and most recent error diagnostic information. 128 if err := w.configStore.UpdateResponseInfo(ctx, w.replicationID, res.StatusCode, msg); err != nil { 129 // TODO: We shouldn't fail/retry a successful remote write for not successfully writing to the config store 130 // we should only log instead of returning, like: 131 w.logger.Debug("failed to update config store with latest remote write response info", zap.Error(err)) 132 // Unfortunately this will mess up a lot of tests that are using UpdateResponseInfo failures as a proxy for 133 // write failures. 134 return w.backoff(attempts), err 135 } 136 137 if postWriteErr == nil { 138 // Successful write 139 w.metrics.RemoteWriteSent(w.replicationID, len(data)) 140 w.logger.Debug("remote write successful", zap.Int("attempt", attempts), zap.Int("bytes", len(data))) 141 return 0, nil 142 } 143 144 w.metrics.RemoteWriteError(w.replicationID, res.StatusCode) 145 w.logger.Debug("remote write error", zap.Int("attempt", attempts), zap.String("error message", "msg"), zap.Int("status code", res.StatusCode)) 146 147 var waitTime time.Duration 148 hasSetWaitTime := false 149 150 switch res.StatusCode { 151 case http.StatusBadRequest: 152 if conf.DropNonRetryableData { 153 var errBody []byte 154 res.Body.Read(errBody) 155 w.logger.Warn("dropped data", zap.Int("bytes", len(data)), zap.String("reason", string(errBody))) 156 w.metrics.RemoteWriteDropped(w.replicationID, len(data)) 157 return 0, nil 158 } 159 case http.StatusTooManyRequests: 160 headerTime := w.waitTimeFromHeader(res) 161 if headerTime != 0 { 162 waitTime = headerTime 163 hasSetWaitTime = true 164 } 165 } 166 167 if !hasSetWaitTime { 168 waitTime = w.backoff(attempts) 169 } 170 171 return waitTime, postWriteErr 172 } 173 174 // normalizeResponse returns a guaranteed non-nil value for *http.Response, and an extracted error message string for use 175 // in logging. The returned bool indicates if the response is a time-out - false means that the write request should be 176 // aborted due to a malformed request. 177 func normalizeResponse(r *http.Response, err error) (*http.Response, string, bool) { 178 var errMsg string 179 if err != nil { 180 errMsg = err.Error() 181 } 182 183 if r == nil { 184 if errorIsTimeout(err) { 185 return &http.Response{}, errMsg, true 186 } 187 188 return &http.Response{}, errMsg, false 189 } 190 191 return r, errMsg, true 192 } 193 194 func errorIsTimeout(err error) bool { 195 if err, ok := err.(net.Error); ok && err.Timeout() { 196 return true 197 } 198 199 return false 200 } 201 202 func PostWrite(ctx context.Context, config *influxdb.ReplicationHTTPConfig, data []byte, timeout time.Duration) (*http.Response, error) { 203 u, err := url.Parse(config.RemoteURL) 204 if err != nil { 205 return nil, invalidRemoteUrl(config.RemoteURL, err) 206 } 207 208 params := api.ConfigParams{ 209 Host: u, 210 UserAgent: userAgent, 211 Token: &config.RemoteToken, 212 AllowInsecureTLS: config.AllowInsecureTLS, 213 } 214 conf := api.NewAPIConfig(params) 215 conf.HTTPClient.Timeout = timeout 216 client := api.NewAPIClient(conf).WriteApi 217 218 var bucket string 219 if config.RemoteBucketID == nil || config.RemoteBucketName != "" { 220 bucket = config.RemoteBucketName 221 } else { 222 bucket = config.RemoteBucketID.String() 223 } 224 225 var org string 226 if config.RemoteOrgID != nil { 227 org = config.RemoteOrgID.String() 228 } else { 229 // We need to provide something here for the write api to be happy 230 org = platform.InvalidID().String() 231 } 232 233 req := client.PostWrite(ctx). 234 Bucket(bucket). 235 Body(data). 236 Org(org) 237 238 // Don't set the encoding header for empty bodies, like those used for validation. 239 if len(data) > 0 { 240 req = req.ContentEncoding("gzip") 241 } 242 243 res, err := req.ExecuteWithHttpInfo() 244 if res == nil { 245 return nil, err 246 } 247 248 // Only a response of 204 is valid for a successful write 249 if res.StatusCode != http.StatusNoContent { 250 if err == nil { 251 err = ihttp.CheckError(res) 252 } 253 err = invalidResponseCode(res.StatusCode, err) 254 } 255 256 // Must return the response so that the status code and headers can be inspected by the caller, even if the response 257 // was not 204. 258 return res, err 259 } 260 261 func (w *writer) backoff(numAttempts int) time.Duration { 262 if numAttempts > w.maximumAttemptsForBackoffTime { 263 return w.maximumBackoffTime 264 } 265 266 s := 0.5 * math.Pow(2, float64(numAttempts-1)) 267 return time.Duration(s * float64(time.Second)) 268 } 269 270 func (w *writer) waitTimeFromHeader(r *http.Response) time.Duration { 271 str := r.Header.Get(retryAfterHeaderKey) 272 if str == "" { 273 return 0 274 } 275 276 // Use a minimal backoff time if the header is set to 0 for some reason, maybe due to rounding. 277 if str == "0" { 278 return w.backoff(1) 279 } 280 281 rtr, err := strconv.Atoi(str) 282 if err != nil { 283 return 0 284 } 285 286 return time.Duration(rtr * int(time.Second)) 287 }