github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/pkg/sink/cloudstorage/config.go (about)

     1  // Copyright 2022 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package cloudstorage
    15  
    16  import (
    17  	"context"
    18  	"fmt"
    19  	"net/http"
    20  	"net/url"
    21  	"strings"
    22  	"time"
    23  
    24  	"github.com/gin-gonic/gin/binding"
    25  	"github.com/imdario/mergo"
    26  	"github.com/pingcap/log"
    27  	"github.com/pingcap/tiflow/pkg/config"
    28  	cerror "github.com/pingcap/tiflow/pkg/errors"
    29  	psink "github.com/pingcap/tiflow/pkg/sink"
    30  	"github.com/pingcap/tiflow/pkg/util"
    31  	"go.uber.org/zap"
    32  )
    33  
    34  const (
    35  	// defaultWorkerCount is the default value of worker-count.
    36  	defaultWorkerCount = 16
    37  	// the upper limit of worker-count.
    38  	maxWorkerCount = 512
    39  	// defaultFlushInterval is the default value of flush-interval.
    40  	defaultFlushInterval = 5 * time.Second
    41  	// the lower limit of flush-interval.
    42  	minFlushInterval = 2 * time.Second
    43  	// the upper limit of flush-interval.
    44  	maxFlushInterval = 10 * time.Minute
    45  	// defaultFlushConcurrency is the default value of flush-concurrency.
    46  	defaultFlushConcurrency = 1
    47  	// the lower limit of flush-concurrency.
    48  	minFlushConcurrency = 1
    49  	// the upper limit of flush-concurrency.
    50  	maxFlushConcurrency = 512
    51  	// defaultFileSize is the default value of file-size.
    52  	defaultFileSize = 64 * 1024 * 1024
    53  	// the lower limit of file size
    54  	minFileSize = 1024 * 1024
    55  	// the upper limit of file size
    56  	maxFileSize = 512 * 1024 * 1024
    57  
    58  	// disable file cleanup by default
    59  	defaultFileExpirationDays = 0
    60  	// Second | Minute | Hour | Dom | Month | DowOptional
    61  	// `0 0 2 * * ?` means 2:00:00 AM every day
    62  	defaultFileCleanupCronSpec = "0 0 2 * * *"
    63  )
    64  
    65  type urlConfig struct {
    66  	WorkerCount   *int    `form:"worker-count"`
    67  	FlushInterval *string `form:"flush-interval"`
    68  	FileSize      *int    `form:"file-size"`
    69  }
    70  
    71  // Config is the configuration for cloud storage sink.
    72  type Config struct {
    73  	WorkerCount              int
    74  	FlushInterval            time.Duration
    75  	FileSize                 int
    76  	FileIndexWidth           int
    77  	DateSeparator            string
    78  	FileExpirationDays       int
    79  	FileCleanupCronSpec      string
    80  	EnablePartitionSeparator bool
    81  	OutputColumnID           bool
    82  	FlushConcurrency         int
    83  }
    84  
    85  // NewConfig returns the default cloud storage sink config.
    86  func NewConfig() *Config {
    87  	return &Config{
    88  		WorkerCount:         defaultWorkerCount,
    89  		FlushInterval:       defaultFlushInterval,
    90  		FileSize:            defaultFileSize,
    91  		FileExpirationDays:  defaultFileExpirationDays,
    92  		FileCleanupCronSpec: defaultFileCleanupCronSpec,
    93  	}
    94  }
    95  
    96  // Apply applies the sink URI parameters to the config.
    97  func (c *Config) Apply(
    98  	ctx context.Context,
    99  	sinkURI *url.URL,
   100  	replicaConfig *config.ReplicaConfig,
   101  ) (err error) {
   102  	if sinkURI == nil {
   103  		return cerror.ErrStorageSinkInvalidConfig.GenWithStack(
   104  			"failed to open cloud storage sink, empty SinkURI")
   105  	}
   106  
   107  	scheme := strings.ToLower(sinkURI.Scheme)
   108  	if !psink.IsStorageScheme(scheme) {
   109  		return cerror.ErrStorageSinkInvalidConfig.GenWithStack(
   110  			"can't create cloud storage sink with unsupported scheme: %s", scheme)
   111  	}
   112  	req := &http.Request{URL: sinkURI}
   113  	urlParameter := &urlConfig{}
   114  	if err := binding.Query.Bind(req, urlParameter); err != nil {
   115  		return cerror.WrapError(cerror.ErrStorageSinkInvalidConfig, err)
   116  	}
   117  	if urlParameter, err = mergeConfig(replicaConfig, urlParameter); err != nil {
   118  		return err
   119  	}
   120  	if err = getWorkerCount(urlParameter, &c.WorkerCount); err != nil {
   121  		return err
   122  	}
   123  	err = getFlushInterval(urlParameter, &c.FlushInterval)
   124  	if err != nil {
   125  		return err
   126  	}
   127  	err = getFileSize(urlParameter, &c.FileSize)
   128  	if err != nil {
   129  		return err
   130  	}
   131  
   132  	c.DateSeparator = util.GetOrZero(replicaConfig.Sink.DateSeparator)
   133  	c.EnablePartitionSeparator = util.GetOrZero(replicaConfig.Sink.EnablePartitionSeparator)
   134  	c.FileIndexWidth = util.GetOrZero(replicaConfig.Sink.FileIndexWidth)
   135  	if replicaConfig.Sink.CloudStorageConfig != nil {
   136  		c.OutputColumnID = util.GetOrZero(replicaConfig.Sink.CloudStorageConfig.OutputColumnID)
   137  		if replicaConfig.Sink.CloudStorageConfig.FileExpirationDays != nil {
   138  			c.FileExpirationDays = *replicaConfig.Sink.CloudStorageConfig.FileExpirationDays
   139  		}
   140  		if replicaConfig.Sink.CloudStorageConfig.FileCleanupCronSpec != nil {
   141  			c.FileCleanupCronSpec = *replicaConfig.Sink.CloudStorageConfig.FileCleanupCronSpec
   142  		}
   143  		c.FlushConcurrency = util.GetOrZero(replicaConfig.Sink.CloudStorageConfig.FlushConcurrency)
   144  	}
   145  
   146  	if c.FileIndexWidth < config.MinFileIndexWidth || c.FileIndexWidth > config.MaxFileIndexWidth {
   147  		c.FileIndexWidth = config.DefaultFileIndexWidth
   148  	}
   149  	if c.FlushConcurrency < minFlushConcurrency || c.FlushConcurrency > maxFlushConcurrency {
   150  		c.FlushConcurrency = defaultFlushConcurrency
   151  	}
   152  
   153  	return nil
   154  }
   155  
   156  func mergeConfig(
   157  	replicaConfig *config.ReplicaConfig,
   158  	urlParameters *urlConfig,
   159  ) (*urlConfig, error) {
   160  	dest := &urlConfig{}
   161  	if replicaConfig.Sink != nil && replicaConfig.Sink.CloudStorageConfig != nil {
   162  		dest.WorkerCount = replicaConfig.Sink.CloudStorageConfig.WorkerCount
   163  		dest.FlushInterval = replicaConfig.Sink.CloudStorageConfig.FlushInterval
   164  		dest.FileSize = replicaConfig.Sink.CloudStorageConfig.FileSize
   165  	}
   166  	if err := mergo.Merge(dest, urlParameters, mergo.WithOverride); err != nil {
   167  		return nil, cerror.WrapError(cerror.ErrStorageSinkInvalidConfig, err)
   168  	}
   169  	return dest, nil
   170  }
   171  
   172  func getWorkerCount(values *urlConfig, workerCount *int) error {
   173  	if values.WorkerCount == nil {
   174  		return nil
   175  	}
   176  
   177  	c := *values.WorkerCount
   178  	if c <= 0 {
   179  		return cerror.WrapError(cerror.ErrStorageSinkInvalidConfig,
   180  			fmt.Errorf("invalid worker-count %d, it must be greater than 0", c))
   181  	}
   182  	if c > maxWorkerCount {
   183  		log.Warn("worker-count is too large",
   184  			zap.Int("original", c), zap.Int("override", maxWorkerCount))
   185  		c = maxWorkerCount
   186  	}
   187  
   188  	*workerCount = c
   189  	return nil
   190  }
   191  
   192  func getFlushInterval(values *urlConfig, flushInterval *time.Duration) error {
   193  	if values.FlushInterval == nil || len(*values.FlushInterval) == 0 {
   194  		return nil
   195  	}
   196  
   197  	d, err := time.ParseDuration(*values.FlushInterval)
   198  	if err != nil {
   199  		return cerror.WrapError(cerror.ErrStorageSinkInvalidConfig, err)
   200  	}
   201  
   202  	if d > maxFlushInterval {
   203  		log.Warn("flush-interval is too large", zap.Duration("original", d),
   204  			zap.Duration("override", maxFlushInterval))
   205  		d = maxFlushInterval
   206  	}
   207  	if d < minFlushInterval {
   208  		log.Warn("flush-interval is too small", zap.Duration("original", d),
   209  			zap.Duration("override", minFlushInterval))
   210  		d = minFlushInterval
   211  	}
   212  
   213  	*flushInterval = d
   214  	return nil
   215  }
   216  
   217  func getFileSize(values *urlConfig, fileSize *int) error {
   218  	if values.FileSize == nil {
   219  		return nil
   220  	}
   221  
   222  	sz := *values.FileSize
   223  	if sz > maxFileSize {
   224  		log.Warn("file-size is too large",
   225  			zap.Int("original", sz), zap.Int("override", maxFileSize))
   226  		sz = maxFileSize
   227  	}
   228  	if sz < minFileSize {
   229  		log.Warn("file-size is too small",
   230  			zap.Int("original", sz), zap.Int("override", minFileSize))
   231  		sz = minFileSize
   232  	}
   233  	*fileSize = sz
   234  	return nil
   235  }