github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/router.go

package mydump

import (
	"regexp"
	"strconv"
	"strings"

	"github.com/tikv/pd/pkg/slice"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb-tools/pkg/filter"

	"github.com/pingcap/tidb-lightning/lightning/config"
)

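// SourceType is the kind of a routed source file: database/table/view schema DDL,
// SQL/CSV/Parquet data, or an ignored file.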
type SourceType int

const (
	SourceTypeIgnore SourceType = iota
	SourceTypeSchemaSchema
	SourceTypeTableSchema
	SourceTypeSQL
	SourceTypeCSV
	SourceTypeParquet
	SourceTypeViewSchema
)

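// String names of the source types as they appear in [[mydumper.files]] route rules.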
const (
	SchemaSchema = "schema-schema"
	TableSchema  = "table-schema"
	ViewSchema   = "view-schema"
	TypeSQL      = "sql"
	TypeCSV      = "csv"
	TypeParquet  = "parquet"
	TypeIgnore   = "ignore"
)

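// Compression is the compression codec of a source file.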
type Compression int

const (
	CompressionNone Compression = iota
	CompressionGZ
	CompressionLZ4
	CompressionZStd
	CompressionXZ
)

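// parseSourceType converts a route-rule type string (e.g. "sql", "csv") into a SourceType.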
func parseSourceType(t string) (SourceType, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case SchemaSchema:
		return SourceTypeSchemaSchema, nil
	case TableSchema:
		return SourceTypeTableSchema, nil
	case TypeSQL:
		return SourceTypeSQL, nil
	case TypeCSV:
		return SourceTypeCSV, nil
	case TypeParquet:
		return SourceTypeParquet, nil
	case TypeIgnore:
		return SourceTypeIgnore, nil
	case ViewSchema:
		return SourceTypeViewSchema, nil
	default:
		return SourceTypeIgnore, errors.Errorf("unknown source type '%s'", t)
	}
}

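// String implements fmt.Stringer, returning the route-rule name of the source type.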
func (s SourceType) String() string {
	switch s {
	case SourceTypeSchemaSchema:
		return SchemaSchema
	case SourceTypeTableSchema:
		return TableSchema
	case SourceTypeCSV:
		return TypeCSV
	case SourceTypeSQL:
		return TypeSQL
	case SourceTypeParquet:
		return TypeParquet
	case SourceTypeViewSchema:
		return ViewSchema
	default:
		return TypeIgnore
	}
}

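// parseCompressionType converts a compression string (e.g. "gz", "zstd") into a
// Compression; an empty string means no compression.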
func parseCompressionType(t string) (Compression, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case "gz":
		return CompressionGZ, nil
	case "lz4":
		return CompressionLZ4, nil
	case "zstd":
		return CompressionZStd, nil
	case "xz":
		return CompressionXZ, nil
	case "":
		return CompressionNone, nil
	default:
		return CompressionNone, errors.Errorf("invalid compression type '%s'", t)
	}
}

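// expandVariablePattern matches the variable references allowed in route-rule
// templates: `$$`, `$name` and `${name}`.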
var (
	expandVariablePattern = regexp.MustCompile(`\$(?:\$|[\pL\p{Nd}_]+|\{[\pL\p{Nd}_]+\})`)
)

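// defaultFileRouteRules is the built-in set of file route rules.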
var (
	defaultFileRouteRules = []*config.FileRouteRule{
		// ignore *-schema-trigger.sql, *-schema-post.sql files
		{Pattern: `(?i).*(-schema-trigger|-schema-post)\.sql$`, Type: "ignore"},
		// db schema create file pattern, matches files like '{schema}-schema-create.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)-schema-create\.sql$`, Schema: "$1", Table: "", Type: SchemaSchema},
		// table schema create file pattern, matches files like '{schema}.{table}-schema.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema\.sql$`, Schema: "$1", Table: "$2", Type: TableSchema},
		// view schema create file pattern, matches files like '{schema}.{table}-schema-view.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema-view\.sql$`, Schema: "$1", Table: "$2", Type: ViewSchema},
		// source file pattern, matches files like '{schema}.{table}.0001.{sql|csv|parquet}'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)(?:\.([0-9]+))?\.(sql|csv|parquet)$`, Schema: "$1", Table: "$2", Type: "$4", Key: "$3"},
	}
)

// FileRouter routes a source file path to its target schema/table.
type FileRouter interface {
	// Route applies the route rules to path. It returns nil if the path doesn't match any rule,
	// and an error if the path matches a rule but a captured field value is invalid.
	Route(path string) (*RouteResult, error)
}

// chainRouters aggregates multiple `FileRouter`s into a single router; the first
// router that matches a path wins.
type chainRouters []FileRouter

func (c chainRouters) Route(path string) (*RouteResult, error) {
	for _, r := range c {
		res, err := r.Route(path)
		if err != nil {
			return nil, err
		}
		if res != nil {
			return res, nil
		}
	}
	return nil, nil
}

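// NewFileRouter builds a FileRouter that tries each configured rule in order and
// returns the first match. A minimal usage sketch (the file name is chosen for
// illustration, assuming the default rules above):
//
//	router, err := NewFileRouter(defaultFileRouteRules)
//	if err != nil {
//		return err
//	}
//	res, err := router.Route("mydb.mytable.0001.sql")
//	// res.Schema == "mydb", res.Name == "mytable",
//	// res.Type == SourceTypeSQL, res.Key == "0001"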
func NewFileRouter(cfg []*config.FileRouteRule) (FileRouter, error) {
	res := make([]FileRouter, 0, len(cfg))
	p := regexRouterParser{}
	for _, c := range cfg {
		rule, err := p.Parse(c)
		if err != nil {
			return nil, err
		}
		res = append(res, rule)
	}
	return chainRouters(res), nil
}

// RegexRouter is a FileRouter implementation that applies a regex pattern to the file path.
// If the pattern matches, each extractor expands its template against the captured groups
// and sets the corresponding field in the RouteResult.
type RegexRouter struct {
	pattern    *regexp.Regexp
	extractors []patExpander
}

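// Route implements FileRouter. It returns a nil result if path doesn't match the pattern.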
func (r *RegexRouter) Route(path string) (*RouteResult, error) {
	indexes := r.pattern.FindStringSubmatchIndex(path)
	if len(indexes) == 0 {
		return nil, nil
	}
	result := &RouteResult{}
	for _, e := range r.extractors {
		err := e.Expand(r.pattern, path, indexes, result)
		if err != nil {
			return nil, err
		}
	}
	return result, nil
}

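// regexRouterParser compiles a config.FileRouteRule into a RegexRouter.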
type regexRouterParser struct{}

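// Parse validates the rule, compiles its pattern, and registers an extractor for
// each templated field (type, schema, table, key, compression).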
func (p regexRouterParser) Parse(r *config.FileRouteRule) (*RegexRouter, error) {
	rule := &RegexRouter{}
	if r.Path == "" && r.Pattern == "" {
		return nil, errors.New("`path` and `pattern` must not be both empty in [[mydumper.files]]")
	}
	if r.Path != "" && r.Pattern != "" {
		return nil, errors.New("can't set both `path` and `pattern` field in [[mydumper.files]]")
	}
	if r.Path != "" {
		// convert the constant path string into a regexp pattern
		r.Pattern = regexp.QuoteMeta(r.Path)
		// escape every '$' as '$$' in the match templates
		quoteTmplFn := func(t string) string { return strings.ReplaceAll(t, "$", "$$") }
		r.Table = quoteTmplFn(r.Table)
		r.Schema = quoteTmplFn(r.Schema)
		r.Type = quoteTmplFn(r.Type)
		r.Compression = quoteTmplFn(r.Compression)
		r.Key = quoteTmplFn(r.Key)
	}
	pattern, err := regexp.Compile(r.Pattern)
	if err != nil {
		return nil, errors.Trace(err)
	}
	rule.pattern = pattern

	err = p.parseFieldExtractor(rule, "type", r.Type, func(result *RouteResult, value string) error {
		ty, err := parseSourceType(value)
		if err != nil {
			return err
		}
		result.Type = ty
		return nil
	})
	if err != nil {
		return nil, err
	}
	// an ignore rule doesn't need the other fields
	if r.Type == TypeIgnore {
		return rule, nil
	}

	err = p.parseFieldExtractor(rule, "schema", r.Schema, func(result *RouteResult, value string) error {
		result.Schema = value
		return nil
	})
	if err != nil {
		return nil, err
	}

	// special case: a db schema rule has no table name to parse
	if r.Type != SchemaSchema {
		err = p.parseFieldExtractor(rule, "table", r.Table, func(result *RouteResult, value string) error {
			result.Name = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	if len(r.Key) > 0 {
		err = p.parseFieldExtractor(rule, "key", r.Key, func(result *RouteResult, value string) error {
			result.Key = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	if len(r.Compression) > 0 {
		err = p.parseFieldExtractor(rule, "compression", r.Compression, func(result *RouteResult, value string) error {
			// TODO: support restoring compressed source files
			compression, err := parseCompressionType(value)
			if err != nil {
				return err
			}
			if compression != CompressionNone {
				return errors.New("restoring compressed source files is not supported yet")
			}
			result.Compression = compression
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	return rule, nil
}

// parseFieldExtractor parses the match pattern for a single field and appends the
// resulting extractor to rule.extractors.
func (p regexRouterParser) parseFieldExtractor(
	rule *RegexRouter,
	field,
	fieldPattern string,
	applyFn func(result *RouteResult, value string) error,
) error {
	// an empty field pattern is not allowed
	if len(fieldPattern) == 0 {
		return errors.Errorf("field '%s' match pattern can't be empty", field)
	}

	// validate the capture references used in the template
	if err := p.checkSubPatterns(rule.pattern, fieldPattern); err != nil {
		return errors.Trace(err)
	}
	rule.extractors = append(rule.extractors, patExpander{
		template: fieldPattern,
		applyFn:  applyFn,
	})
	return nil
}

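// checkSubPatterns verifies that every `$n` / `${name}` reference in template t
// resolves to a capture group of pat.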
func (p regexRouterParser) checkSubPatterns(pat *regexp.Regexp, t string) error {
	subPats := expandVariablePattern.FindAllString(t, -1)
	for _, subVar := range subPats {
		var tmplName string
		switch {
		case subVar == "$$":
			continue
		case strings.HasPrefix(subVar, "${"):
			tmplName = subVar[2 : len(subVar)-1]
		default:
			tmplName = subVar[1:]
		}
		if number, err := strconv.Atoi(tmplName); err == nil {
			if number > pat.NumSubexp() {
				return errors.Errorf("sub pattern capture '%s' out of range", subVar)
			}
		} else if !slice.AnyOf(pat.SubexpNames(), func(i int) bool {
			// FIXME: we should use re.SubexpIndex here, but not supported in go1.13 yet
			return pat.SubexpNames()[i] == tmplName
		}) {
			return errors.Errorf("invalid named capture '%s'", subVar)
		}
	}

	return nil
}

// patExpander extracts a field value by expanding its template against the regexp
// match and applies it to the RouteResult via applyFn.
type patExpander struct {
	template string
	applyFn  func(result *RouteResult, value string) error
}

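// Expand expands the template using the submatch indexes of the already-matched
// pattern and applies the expanded value to result.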
func (p *patExpander) Expand(pattern *regexp.Regexp, path string, matchIndex []int, result *RouteResult) error {
	value := pattern.ExpandString([]byte{}, p.template, path, matchIndex)
	return p.applyFn(result, string(value))
}

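// RouteResult is the routing destination of a source file: the target schema/table
// (via filter.Table), the file key, its compression, and its source type.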
type RouteResult struct {
	filter.Table
	Key         string
	Compression Compression
	Type        SourceType
}