github.com/influxdata/influxdb/v2@v2.7.6/replications/remotewrite/writer.go (about)

     1  package remotewrite
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"math"
     7  	"net"
     8  	"net/http"
     9  	"net/url"
    10  	"runtime"
    11  	"strconv"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/influxdata/influx-cli/v2/api"
    16  	"github.com/influxdata/influxdb/v2"
    17  	ihttp "github.com/influxdata/influxdb/v2/http"
    18  	"github.com/influxdata/influxdb/v2/kit/platform"
    19  	ierrors "github.com/influxdata/influxdb/v2/kit/platform/errors"
    20  	"github.com/influxdata/influxdb/v2/replications/metrics"
    21  	"go.uber.org/zap"
    22  )
    23  
    24  const (
    25  	retryAfterHeaderKey = "Retry-After"
    26  	maximumBackoffTime  = 15 * time.Minute
    27  	maximumAttempts     = 10 // After this many attempts, wait maximumBackoffTime
    28  	DefaultTimeout      = 2 * time.Minute
    29  )
    30  
    31  var (
    32  	userAgent = fmt.Sprintf(
    33  		"influxdb-oss-replication/%s (%s) Sha/%s Date/%s",
    34  		influxdb.GetBuildInfo().Version,
    35  		runtime.GOOS,
    36  		influxdb.GetBuildInfo().Commit,
    37  		influxdb.GetBuildInfo().Date)
    38  )
    39  
    40  func invalidRemoteUrl(remoteUrl string, err error) *ierrors.Error {
    41  	return &ierrors.Error{
    42  		Code: ierrors.EInvalid,
    43  		Msg:  fmt.Sprintf("host URL %q is invalid", remoteUrl),
    44  		Err:  err,
    45  	}
    46  }
    47  
    48  func invalidResponseCode(code int, err error) *ierrors.Error {
    49  	return &ierrors.Error{
    50  		Code: ierrors.EInvalid,
    51  		Msg:  fmt.Sprintf("invalid response code %d, must be %d", code, http.StatusNoContent),
    52  		Err:  err,
    53  	}
    54  }
    55  
    56  type HttpConfigStore interface {
    57  	GetFullHTTPConfig(context.Context, platform.ID) (*influxdb.ReplicationHTTPConfig, error)
    58  	UpdateResponseInfo(context.Context, platform.ID, int, string) error
    59  }
    60  
    61  type waitFunc func(time.Duration) <-chan time.Time
    62  
    63  type writer struct {
    64  	replicationID                 platform.ID
    65  	configStore                   HttpConfigStore
    66  	metrics                       *metrics.ReplicationsMetrics
    67  	logger                        *zap.Logger
    68  	maximumBackoffTime            time.Duration
    69  	maximumAttemptsForBackoffTime int
    70  	clientTimeout                 time.Duration
    71  	done                          chan struct{}
    72  	waitFunc                      waitFunc // used for testing
    73  }
    74  
    75  func NewWriter(replicationID platform.ID, store HttpConfigStore, metrics *metrics.ReplicationsMetrics, logger *zap.Logger, done chan struct{}) *writer {
    76  	return &writer{
    77  		replicationID:                 replicationID,
    78  		configStore:                   store,
    79  		metrics:                       metrics,
    80  		logger:                        logger,
    81  		maximumBackoffTime:            maximumBackoffTime,
    82  		maximumAttemptsForBackoffTime: maximumAttempts,
    83  		clientTimeout:                 DefaultTimeout,
    84  		done:                          done,
    85  		waitFunc: func(t time.Duration) <-chan time.Time {
    86  			return time.After(t)
    87  		},
    88  	}
    89  }
    90  
    91  func (w *writer) Write(data []byte, attempts int) (backoff time.Duration, err error) {
    92  	cancelOnce := &sync.Once{}
    93  	// Cancel any outstanding HTTP requests if the replicationQueue is closed.
    94  	ctx, cancel := context.WithCancel(context.Background())
    95  
    96  	defer func() {
    97  		cancelOnce.Do(cancel)
    98  	}()
    99  
   100  	go func() {
   101  		select {
   102  		case <-w.done:
   103  			cancelOnce.Do(cancel)
   104  		case <-ctx.Done():
   105  			// context is cancelled already
   106  		}
   107  	}()
   108  
   109  	// Get the most recent config on every attempt, in case the user has updated the config to correct errors.
   110  	conf, err := w.configStore.GetFullHTTPConfig(ctx, w.replicationID)
   111  	if err != nil {
   112  		return w.backoff(attempts), err
   113  	}
   114  
   115  	res, postWriteErr := PostWrite(ctx, conf, data, w.clientTimeout)
   116  	res, msg, ok := normalizeResponse(res, postWriteErr)
   117  	if !ok {
   118  		// Update Response info:
   119  		if err := w.configStore.UpdateResponseInfo(ctx, w.replicationID, res.StatusCode, msg); err != nil {
   120  			w.logger.Debug("failed to update config store with latest remote write response info", zap.Error(err))
   121  			return w.backoff(attempts), err
   122  		}
   123  		// bail out
   124  		return w.backoff(attempts), postWriteErr
   125  	}
   126  
   127  	// Update metrics and most recent error diagnostic information.
   128  	if err := w.configStore.UpdateResponseInfo(ctx, w.replicationID, res.StatusCode, msg); err != nil {
   129  		// TODO: We shouldn't fail/retry a successful remote write for not successfully writing to the config store
   130  		// we should only log instead of returning, like:
   131  		w.logger.Debug("failed to update config store with latest remote write response info", zap.Error(err))
   132  		// Unfortunately this will mess up a lot of tests that are using UpdateResponseInfo failures as a proxy for
   133  		// write failures.
   134  		return w.backoff(attempts), err
   135  	}
   136  
   137  	if postWriteErr == nil {
   138  		// Successful write
   139  		w.metrics.RemoteWriteSent(w.replicationID, len(data))
   140  		w.logger.Debug("remote write successful", zap.Int("attempt", attempts), zap.Int("bytes", len(data)))
   141  		return 0, nil
   142  	}
   143  
   144  	w.metrics.RemoteWriteError(w.replicationID, res.StatusCode)
   145  	w.logger.Debug("remote write error", zap.Int("attempt", attempts), zap.String("error message", "msg"), zap.Int("status code", res.StatusCode))
   146  
   147  	var waitTime time.Duration
   148  	hasSetWaitTime := false
   149  
   150  	switch res.StatusCode {
   151  	case http.StatusBadRequest:
   152  		if conf.DropNonRetryableData {
   153  			var errBody []byte
   154  			res.Body.Read(errBody)
   155  			w.logger.Warn("dropped data", zap.Int("bytes", len(data)), zap.String("reason", string(errBody)))
   156  			w.metrics.RemoteWriteDropped(w.replicationID, len(data))
   157  			return 0, nil
   158  		}
   159  	case http.StatusTooManyRequests:
   160  		headerTime := w.waitTimeFromHeader(res)
   161  		if headerTime != 0 {
   162  			waitTime = headerTime
   163  			hasSetWaitTime = true
   164  		}
   165  	}
   166  
   167  	if !hasSetWaitTime {
   168  		waitTime = w.backoff(attempts)
   169  	}
   170  
   171  	return waitTime, postWriteErr
   172  }
   173  
   174  // normalizeResponse returns a guaranteed non-nil value for *http.Response, and an extracted error message string for use
   175  // in logging. The returned bool indicates if the response is a time-out - false means that the write request should be
   176  // aborted due to a malformed request.
   177  func normalizeResponse(r *http.Response, err error) (*http.Response, string, bool) {
   178  	var errMsg string
   179  	if err != nil {
   180  		errMsg = err.Error()
   181  	}
   182  
   183  	if r == nil {
   184  		if errorIsTimeout(err) {
   185  			return &http.Response{}, errMsg, true
   186  		}
   187  
   188  		return &http.Response{}, errMsg, false
   189  	}
   190  
   191  	return r, errMsg, true
   192  }
   193  
   194  func errorIsTimeout(err error) bool {
   195  	if err, ok := err.(net.Error); ok && err.Timeout() {
   196  		return true
   197  	}
   198  
   199  	return false
   200  }
   201  
   202  func PostWrite(ctx context.Context, config *influxdb.ReplicationHTTPConfig, data []byte, timeout time.Duration) (*http.Response, error) {
   203  	u, err := url.Parse(config.RemoteURL)
   204  	if err != nil {
   205  		return nil, invalidRemoteUrl(config.RemoteURL, err)
   206  	}
   207  
   208  	params := api.ConfigParams{
   209  		Host:             u,
   210  		UserAgent:        userAgent,
   211  		Token:            &config.RemoteToken,
   212  		AllowInsecureTLS: config.AllowInsecureTLS,
   213  	}
   214  	conf := api.NewAPIConfig(params)
   215  	conf.HTTPClient.Timeout = timeout
   216  	client := api.NewAPIClient(conf).WriteApi
   217  
   218  	var bucket string
   219  	if config.RemoteBucketID == nil || config.RemoteBucketName != "" {
   220  		bucket = config.RemoteBucketName
   221  	} else {
   222  		bucket = config.RemoteBucketID.String()
   223  	}
   224  
   225  	var org string
   226  	if config.RemoteOrgID != nil {
   227  		org = config.RemoteOrgID.String()
   228  	} else {
   229  		// We need to provide something here for the write api to be happy
   230  		org = platform.InvalidID().String()
   231  	}
   232  
   233  	req := client.PostWrite(ctx).
   234  		Bucket(bucket).
   235  		Body(data).
   236  		Org(org)
   237  
   238  	// Don't set the encoding header for empty bodies, like those used for validation.
   239  	if len(data) > 0 {
   240  		req = req.ContentEncoding("gzip")
   241  	}
   242  
   243  	res, err := req.ExecuteWithHttpInfo()
   244  	if res == nil {
   245  		return nil, err
   246  	}
   247  
   248  	// Only a response of 204 is valid for a successful write
   249  	if res.StatusCode != http.StatusNoContent {
   250  		if err == nil {
   251  			err = ihttp.CheckError(res)
   252  		}
   253  		err = invalidResponseCode(res.StatusCode, err)
   254  	}
   255  
   256  	// Must return the response so that the status code and headers can be inspected by the caller, even if the response
   257  	// was not 204.
   258  	return res, err
   259  }
   260  
   261  func (w *writer) backoff(numAttempts int) time.Duration {
   262  	if numAttempts > w.maximumAttemptsForBackoffTime {
   263  		return w.maximumBackoffTime
   264  	}
   265  
   266  	s := 0.5 * math.Pow(2, float64(numAttempts-1))
   267  	return time.Duration(s * float64(time.Second))
   268  }
   269  
   270  func (w *writer) waitTimeFromHeader(r *http.Response) time.Duration {
   271  	str := r.Header.Get(retryAfterHeaderKey)
   272  	if str == "" {
   273  		return 0
   274  	}
   275  
   276  	// Use a minimal backoff time if the header is set to 0 for some reason, maybe due to rounding.
   277  	if str == "0" {
   278  		return w.backoff(1)
   279  	}
   280  
   281  	rtr, err := strconv.Atoi(str)
   282  	if err != nil {
   283  		return 0
   284  	}
   285  
   286  	return time.Duration(rtr * int(time.Second))
   287  }