
     1  package mydump
     3  import (
     4  	"regexp"
     5  	"strconv"
     6  	"strings"
     8  	""
    10  	""
    11  	""
    13  	""
    14  )
// SourceType is the kind of a routed source file: a database, table or view
// schema file, an SQL/CSV/Parquet data file, or a file that is to be ignored.
type SourceType int
// Enumerated SourceType values. SourceTypeIgnore is the zero value; it is also
// the value parseSourceType returns together with an error for unknown input.
const (
	SourceTypeIgnore SourceType = iota
	SourceTypeSchemaSchema
	SourceTypeTableSchema
	SourceTypeSQL
	SourceTypeCSV
	SourceTypeParquet
	SourceTypeViewSchema
)
// Textual names of the source types. These are the strings accepted by
// parseSourceType (after trimming and lower-casing) and produced by
// SourceType.String.
const (
	SchemaSchema = "schema-schema"
	TableSchema  = "table-schema"
	ViewSchema   = "view-schema"
	TypeSQL      = "sql"
	TypeCSV      = "csv"
	TypeParquet  = "parquet"
	TypeIgnore   = "ignore"
)
// Compression is the compression format of a source file.
type Compression int
// Enumerated Compression values. CompressionNone is the zero value;
// parseCompressionType maps "", "gz", "lz4", "zstd" and "xz" onto them.
const (
	CompressionNone Compression = iota
	CompressionGZ
	CompressionLZ4
	CompressionZStd
	CompressionXZ
)
    48  func parseSourceType(t string) (SourceType, error) {
    49  	switch strings.ToLower(strings.TrimSpace(t)) {
    50  	case SchemaSchema:
    51  		return SourceTypeSchemaSchema, nil
    52  	case TableSchema:
    53  		return SourceTypeTableSchema, nil
    54  	case TypeSQL:
    55  		return SourceTypeSQL, nil
    56  	case TypeCSV:
    57  		return SourceTypeCSV, nil
    58  	case TypeParquet:
    59  		return SourceTypeParquet, nil
    60  	case TypeIgnore:
    61  		return SourceTypeIgnore, nil
    62  	case ViewSchema:
    63  		return SourceTypeViewSchema, nil
    64  	default:
    65  		return SourceTypeIgnore, errors.Errorf("unknown source type '%s'", t)
    66  	}
    67  }
    69  func (s SourceType) String() string {
    70  	switch s {
    71  	case SourceTypeSchemaSchema:
    72  		return SchemaSchema
    73  	case SourceTypeTableSchema:
    74  		return TableSchema
    75  	case SourceTypeCSV:
    76  		return TypeCSV
    77  	case SourceTypeSQL:
    78  		return TypeSQL
    79  	case SourceTypeParquet:
    80  		return TypeParquet
    81  	case SourceTypeViewSchema:
    82  		return ViewSchema
    83  	default:
    84  		return TypeIgnore
    85  	}
    86  }
    88  func parseCompressionType(t string) (Compression, error) {
    89  	switch strings.ToLower(strings.TrimSpace(t)) {
    90  	case "gz":
    91  		return CompressionGZ, nil
    92  	case "lz4":
    93  		return CompressionLZ4, nil
    94  	case "zstd":
    95  		return CompressionZStd, nil
    96  	case "xz":
    97  		return CompressionXZ, nil
    98  	case "":
    99  		return CompressionNone, nil
   100  	default:
   101  		return CompressionNone, errors.Errorf("invalid compression type '%s'", t)
   102  	}
   103  }
// expandVariablePattern matches the variable references that may appear in a
// field template: the escape `$$`, `$name`, and `${name}`, where name is one or
// more letters, decimal digits or underscores. checkSubPatterns uses it to
// enumerate the references of a template.
var expandVariablePattern = regexp.MustCompile(`\$(?:\$|[\pL\p{Nd}_]+|\{[\pL\p{Nd}_]+\})`)
// defaultFileRouteRules are the built-in route rules. All patterns are
// case-insensitive and, except for the ignore rule, allow any number of leading
// directory components.
// NOTE(review): the order looks significant — the final data-file pattern also
// matches table/view schema file names, so these rules presumably rely on a
// first-match router such as chainRouters; confirm against the caller.
var defaultFileRouteRules = []*config.FileRouteRule{
	// ignore *-schema-trigger.sql, *-schema-post.sql files
	{Pattern: `(?i).*(-schema-trigger|-schema-post)\.sql$`, Type: "ignore"},
	// db schema create file pattern, matches files like '{schema}-schema-create.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)-schema-create\.sql$`, Schema: "$1", Table: "", Type: SchemaSchema},
	// table schema create file pattern, matches files like '{schema}.{table}-schema.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema\.sql$`, Schema: "$1", Table: "$2", Type: TableSchema},
	// view schema create file pattern, matches files like '{schema}.{table}-schema-view.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema-view\.sql$`, Schema: "$1", Table: "$2", Type: ViewSchema},
	// source file pattern, matches files like '{schema}.{table}.0001.{sql|csv|parquet}';
	// the numeric part is optional and, when present, becomes the key
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)(?:\.([0-9]+))?\.(sql|csv|parquet)$`, Schema: "$1", Table: "$2", Type: "$4", Key: "$3"},
}
// FileRouter routes a file path to its target schema/table.
type FileRouter interface {
	// Route applies the rule to path. It returns a nil result if path doesn't match
	// the route rule, and an error if path matches the route rule but the value
	// captured for a field is invalid.
	Route(path string) (*RouteResult, error)
}
// chainRouters aggregates multiple `FileRouter`s into a single router.
type chainRouters []FileRouter
   130  func (c chainRouters) Route(path string) (*RouteResult, error) {
   131  	for _, r := range c {
   132  		res, err := r.Route(path)
   133  		if err != nil {
   134  			return nil, err
   135  		}
   136  		if res != nil {
   137  			return res, nil
   138  		}
   139  	}
   140  	return nil, nil
   141  }
   143  func NewFileRouter(cfg []*config.FileRouteRule) (FileRouter, error) {
   144  	res := make([]FileRouter, 0, len(cfg))
   145  	p := regexRouterParser{}
   146  	for _, c := range cfg {
   147  		rule, err := p.Parse(c)
   148  		if err != nil {
   149  			return nil, err
   150  		}
   151  		res = append(res, rule)
   152  	}
   153  	return chainRouters(res), nil
   154  }
// `RegexRouter` is a `FileRouter` implementation that applies a specific regex pattern to
// the file path. If the pattern matches, each extractor expands its template with the
// captured sub-matches and sets the resulting value on its target field in `RouteResult`.
type RegexRouter struct {
	pattern    *regexp.Regexp
	extractors []patExpander
}
   164  func (r *RegexRouter) Route(path string) (*RouteResult, error) {
   165  	indexes := r.pattern.FindStringSubmatchIndex(path)
   166  	if len(indexes) == 0 {
   167  		return nil, nil
   168  	}
   169  	result := &RouteResult{}
   170  	for _, e := range r.extractors {
   171  		err := e.Expand(r.pattern, path, indexes, result)
   172  		if err != nil {
   173  			return nil, err
   174  		}
   175  	}
   176  	return result, nil
   177  }
// regexRouterParser builds `RegexRouter`s from file route rules. It has no fields.
type regexRouterParser struct{}
   181  func (p regexRouterParser) Parse(r *config.FileRouteRule) (*RegexRouter, error) {
   182  	rule := &RegexRouter{}
   183  	if r.Path == "" && r.Pattern == "" {
   184  		return nil, errors.New("`path` and `pattern` must not be both empty in [[mydumper.files]]")
   185  	}
   186  	if r.Path != "" && r.Pattern != "" {
   187  		return nil, errors.New("can't set both `path` and `pattern` field in [[mydumper.files]]")
   188  	}
   189  	if r.Path != "" {
   190  		// convert constant string as a regexp pattern
   191  		r.Pattern = regexp.QuoteMeta(r.Path)
   192  		// escape all '$' by '$$' in match templates
   193  		quoteTmplFn := func(t string) string { return strings.ReplaceAll(t, "$", "$$") }
   194  		r.Table = quoteTmplFn(r.Table)
   195  		r.Schema = quoteTmplFn(r.Schema)
   196  		r.Type = quoteTmplFn(r.Type)
   197  		r.Compression = quoteTmplFn(r.Compression)
   198  		r.Key = quoteTmplFn(r.Key)
   199  	}
   200  	pattern, err := regexp.Compile(r.Pattern)
   201  	if err != nil {
   202  		return nil, errors.Trace(err)
   203  	}
   204  	rule.pattern = pattern
   206  	err = p.parseFieldExtractor(rule, "type", r.Type, func(result *RouteResult, value string) error {
   207  		ty, err := parseSourceType(value)
   208  		if err != nil {
   209  			return err
   210  		}
   211  		result.Type = ty
   212  		return nil
   213  	})
   214  	if err != nil {
   215  		return nil, err
   216  	}
   217  	// ignore pattern needn't parse other fields
   218  	if r.Type == TypeIgnore {
   219  		return rule, nil
   220  	}
   222  	err = p.parseFieldExtractor(rule, "schema", r.Schema, func(result *RouteResult, value string) error {
   223  		result.Schema = value
   224  		return nil
   225  	})
   226  	if err != nil {
   227  		return nil, err
   228  	}
   230  	// special case: when the pattern is for db schema, should not parse table name
   231  	if r.Type != SchemaSchema {
   232  		err = p.parseFieldExtractor(rule, "table", r.Table, func(result *RouteResult, value string) error {
   233  			result.Name = value
   234  			return nil
   235  		})
   236  		if err != nil {
   237  			return nil, err
   238  		}
   239  	}
   241  	if len(r.Key) > 0 {
   242  		err = p.parseFieldExtractor(rule, "key", r.Key, func(result *RouteResult, value string) error {
   243  			result.Key = value
   244  			return nil
   245  		})
   246  		if err != nil {
   247  			return nil, err
   248  		}
   249  	}
   251  	if len(r.Compression) > 0 {
   252  		err = p.parseFieldExtractor(rule, "compression", r.Compression, func(result *RouteResult, value string) error {
   253  			// TODO: should support restore compressed source files
   254  			compression, err := parseCompressionType(value)
   255  			if err != nil {
   256  				return err
   257  			}
   258  			if compression != CompressionNone {
   259  				return errors.New("Currently we don't support restore compressed source file yet")
   260  			}
   261  			result.Compression = compression
   262  			return nil
   263  		})
   264  		if err != nil {
   265  			return nil, err
   266  		}
   267  	}
   269  	return rule, nil
   270  }
   272  // parse each field extractor in `p.r` and set them to p.rule
   273  func (p regexRouterParser) parseFieldExtractor(
   274  	rule *RegexRouter,
   275  	field,
   276  	fieldPattern string,
   277  	applyFn func(result *RouteResult, value string) error,
   278  ) error {
   279  	// pattern is empty, return default rule
   280  	if len(fieldPattern) == 0 {
   281  		return errors.Errorf("field '%s' match pattern can't be empty", field)
   282  	}
   284  	// check and parse regexp template
   285  	if err := p.checkSubPatterns(rule.pattern, fieldPattern); err != nil {
   286  		return errors.Trace(err)
   287  	}
   288  	rule.extractors = append(rule.extractors, patExpander{
   289  		template: fieldPattern,
   290  		applyFn:  applyFn,
   291  	})
   292  	return nil
   293  }
   295  func (p regexRouterParser) checkSubPatterns(pat *regexp.Regexp, t string) error {
   296  	subPats := expandVariablePattern.FindAllString(t, -1)
   297  	for _, subVar := range subPats {
   298  		var tmplName string
   299  		switch {
   300  		case subVar == "$$":
   301  			continue
   302  		case strings.HasPrefix(subVar, "${"):
   303  			tmplName = subVar[2 : len(subVar)-1]
   304  		default:
   305  			tmplName = subVar[1:]
   306  		}
   307  		if number, err := strconv.Atoi(tmplName); err == nil {
   308  			if number > pat.NumSubexp() {
   309  				return errors.Errorf("sub pattern capture '%s' out of range", subVar)
   310  			}
   311  		} else if !slice.AnyOf(pat.SubexpNames(), func(i int) bool {
   312  			// FIXME: we should use re.SubexpIndex here, but not supported in go1.13 yet
   313  			return pat.SubexpNames()[i] == tmplName
   314  		}) {
   315  			return errors.Errorf("invalid named capture '%s'", subVar)
   316  		}
   317  	}
   319  	return nil
   320  }
// patExpander extracts a string by expanding template with the sub-matches of
// a regexp pattern and hands the expanded value to applyFn.
type patExpander struct {
	// template is the expansion template, e.g. "$1" or "${name}".
	template string
	// applyFn receives the expanded value and stores it in the result; its
	// error is returned unchanged by Expand.
	applyFn func(result *RouteResult, value string) error
}
   328  func (p *patExpander) Expand(pattern *regexp.Regexp, path string, matchIndex []int, result *RouteResult) error {
   329  	value := pattern.ExpandString([]byte{}, p.template, path, matchIndex)
   330  	return p.applyFn(result, string(value))
   331  }
// RouteResult is the outcome of routing a file path.
type RouteResult struct {
	// Table is embedded; the schema and table extractors built by
	// regexRouterParser.Parse set its Schema and Name fields.
	filter.Table
	// Key is the value expanded from the rule's key template, if the rule has one.
	Key string
	// Compression is the compression parsed from the rule's compression template, if any.
	Compression Compression
	// Type is the source type parsed from the rule's type template.
	Type SourceType
}