github.com/pingcap/tidb-lightning@v5.0.0-rc.0.20210428090220-84b649866577+incompatible/lightning/mydump/router.go

package mydump

import (
	"regexp"
	"strconv"
	"strings"

	"github.com/tikv/pd/pkg/slice"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb-tools/pkg/filter"

	"github.com/pingcap/tidb-lightning/lightning/config"
)

type SourceType int

const (
	SourceTypeIgnore SourceType = iota
	SourceTypeSchemaSchema
	SourceTypeTableSchema
	SourceTypeSQL
	SourceTypeCSV
	SourceTypeParquet
	SourceTypeViewSchema
)

const (
	SchemaSchema = "schema-schema"
	TableSchema  = "table-schema"
	ViewSchema   = "view-schema"
	TypeSQL      = "sql"
	TypeCSV      = "csv"
	TypeParquet  = "parquet"
	TypeIgnore   = "ignore"
)

type Compression int

const (
	CompressionNone Compression = iota
	CompressionGZ
	CompressionLZ4
	CompressionZStd
	CompressionXZ
)

func parseSourceType(t string) (SourceType, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case SchemaSchema:
		return SourceTypeSchemaSchema, nil
	case TableSchema:
		return SourceTypeTableSchema, nil
	case TypeSQL:
		return SourceTypeSQL, nil
	case TypeCSV:
		return SourceTypeCSV, nil
	case TypeParquet:
		return SourceTypeParquet, nil
	case TypeIgnore:
		return SourceTypeIgnore, nil
	case ViewSchema:
		return SourceTypeViewSchema, nil
	default:
		return SourceTypeIgnore, errors.Errorf("unknown source type '%s'", t)
	}
}

func (s SourceType) String() string {
	switch s {
	case SourceTypeSchemaSchema:
		return SchemaSchema
	case SourceTypeTableSchema:
		return TableSchema
	case SourceTypeCSV:
		return TypeCSV
	case SourceTypeSQL:
		return TypeSQL
	case SourceTypeParquet:
		return TypeParquet
	case SourceTypeViewSchema:
		return ViewSchema
	default:
		return TypeIgnore
	}
}

func parseCompressionType(t string) (Compression, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case "gz":
		return CompressionGZ, nil
	case "lz4":
		return CompressionLZ4, nil
	case "zstd":
		return CompressionZStd, nil
	case "xz":
		return CompressionXZ, nil
	case "":
		return CompressionNone, nil
	default:
		return CompressionNone, errors.Errorf("invalid compression type '%s'", t)
	}
}

var (
	expandVariablePattern = regexp.MustCompile(`\$(?:\$|[\pL\p{Nd}_]+|\{[\pL\p{Nd}_]+\})`)
)

var (
	defaultFileRouteRules = []*config.FileRouteRule{
		// ignore *-schema-trigger.sql, *-schema-post.sql files
		{Pattern: `(?i).*(-schema-trigger|-schema-post)\.sql$`, Type: "ignore"},
		// db schema create file pattern, matches files like '{schema}-schema-create.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)-schema-create\.sql$`, Schema: "$1", Table: "", Type: SchemaSchema},
		// table schema create file pattern, matches files like '{schema}.{table}-schema.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema\.sql$`, Schema: "$1", Table: "$2", Type: TableSchema},
		// view schema create file pattern, matches files like '{schema}.{table}-schema-view.sql'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema-view\.sql$`, Schema: "$1", Table: "$2", Type: ViewSchema},
		// source file pattern, matches files like '{schema}.{table}.0001.{sql|csv}'
		{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)(?:\.([0-9]+))?\.(sql|csv|parquet)$`, Schema: "$1", Table: "$2", Type: "$4", Key: "$3"},
	}
)

// FileRouter routes a file path to its target schema/table.
type FileRouter interface {
	// Route applies the rule to path. It returns nil if path doesn't match any route rule;
	// it returns an error if path matches a rule but a captured value for some field is invalid.
	Route(path string) (*RouteResult, error)
}

// chainRouters aggregates multiple `FileRouter`s into a single router.
type chainRouters []FileRouter

func (c chainRouters) Route(path string) (*RouteResult, error) {
	for _, r := range c {
		res, err := r.Route(path)
		if err != nil {
			return nil, err
		}
		if res != nil {
			return res, nil
		}
	}
	return nil, nil
}

// NewFileRouter builds a FileRouter that tries each configured rule in order.
func NewFileRouter(cfg []*config.FileRouteRule) (FileRouter, error) {
	res := make([]FileRouter, 0, len(cfg))
	p := regexRouterParser{}
	for _, c := range cfg {
		rule, err := p.Parse(c)
		if err != nil {
			return nil, err
		}
		res = append(res, rule)
	}
	return chainRouters(res), nil
}

// RegexRouter is a `FileRouter` implementation that applies a specific regex pattern to a file path.
// If the pattern matches, each extractor expands the captured groups and
// sets the value of its target field in `RouteResult`.
type RegexRouter struct {
	pattern    *regexp.Regexp
	extractors []patExpander
}

func (r *RegexRouter) Route(path string) (*RouteResult, error) {
	indexes := r.pattern.FindStringSubmatchIndex(path)
	if len(indexes) == 0 {
		return nil, nil
	}
	result := &RouteResult{}
	for _, e := range r.extractors {
		err := e.Expand(r.pattern, path, indexes, result)
		if err != nil {
			return nil, err
		}
	}
	return result, nil
}

type regexRouterParser struct{}

func (p regexRouterParser) Parse(r *config.FileRouteRule) (*RegexRouter, error) {
	rule := &RegexRouter{}
	if r.Path == "" && r.Pattern == "" {
		return nil, errors.New("`path` and `pattern` must not be both empty in [[mydumper.files]]")
	}
	if r.Path != "" && r.Pattern != "" {
		return nil, errors.New("can't set both `path` and `pattern` field in [[mydumper.files]]")
	}
	if r.Path != "" {
		// convert the constant string into a regexp pattern
		r.Pattern = regexp.QuoteMeta(r.Path)
		// escape all '$' by '$$' in match templates
		quoteTmplFn := func(t string) string { return strings.ReplaceAll(t, "$", "$$") }
		r.Table = quoteTmplFn(r.Table)
		r.Schema = quoteTmplFn(r.Schema)
		r.Type = quoteTmplFn(r.Type)
		r.Compression = quoteTmplFn(r.Compression)
		r.Key = quoteTmplFn(r.Key)
	}
	pattern, err := regexp.Compile(r.Pattern)
	if err != nil {
		return nil, errors.Trace(err)
	}
	rule.pattern = pattern

	err = p.parseFieldExtractor(rule, "type", r.Type, func(result *RouteResult, value string) error {
		ty, err := parseSourceType(value)
		if err != nil {
			return err
		}
		result.Type = ty
		return nil
	})
	if err != nil {
		return nil, err
	}
	// an ignore rule doesn't need the other fields parsed
	if r.Type == TypeIgnore {
		return rule, nil
	}

	err = p.parseFieldExtractor(rule, "schema", r.Schema, func(result *RouteResult, value string) error {
		result.Schema = value
		return nil
	})
	if err != nil {
		return nil, err
	}

	// special case: when the pattern is for a db schema, the table name should not be parsed
	if r.Type != SchemaSchema {
		err = p.parseFieldExtractor(rule, "table", r.Table, func(result *RouteResult, value string) error {
			result.Name = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	if len(r.Key) > 0 {
		err = p.parseFieldExtractor(rule, "key", r.Key, func(result *RouteResult, value string) error {
			result.Key = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	if len(r.Compression) > 0 {
		err = p.parseFieldExtractor(rule, "compression", r.Compression, func(result *RouteResult, value string) error {
			// TODO: support restoring compressed source files
			compression, err := parseCompressionType(value)
			if err != nil {
				return err
			}
			if compression != CompressionNone {
				return errors.New("Currently we don't support restore compressed source file yet")
			}
			result.Compression = compression
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	return rule, nil
}

// parseFieldExtractor validates a single field's match template and appends the
// corresponding extractor to rule.extractors.
func (p regexRouterParser) parseFieldExtractor(
	rule *RegexRouter,
	field,
	fieldPattern string,
	applyFn func(result *RouteResult, value string) error,
) error {
	// an empty match template is invalid
	if len(fieldPattern) == 0 {
		return errors.Errorf("field '%s' match pattern can't be empty", field)
	}

	// check the capture references used in the regexp template
	if err := p.checkSubPatterns(rule.pattern, fieldPattern); err != nil {
		return errors.Trace(err)
	}
	rule.extractors = append(rule.extractors, patExpander{
		template: fieldPattern,
		applyFn:  applyFn,
	})
	return nil
}

func (p regexRouterParser) checkSubPatterns(pat *regexp.Regexp, t string) error {
	subPats := expandVariablePattern.FindAllString(t, -1)
	for _, subVar := range subPats {
		var tmplName string
		switch {
		case subVar == "$$":
			continue
		case strings.HasPrefix(subVar, "${"):
			tmplName = subVar[2 : len(subVar)-1]
		default:
			tmplName = subVar[1:]
		}
		if number, err := strconv.Atoi(tmplName); err == nil {
			if number > pat.NumSubexp() {
				return errors.Errorf("sub pattern capture '%s' out of range", subVar)
			}
		} else if !slice.AnyOf(pat.SubexpNames(), func(i int) bool {
			// FIXME: we should use re.SubexpIndex here, but it is not supported in go1.13 yet
			return pat.SubexpNames()[i] == tmplName
		}) {
			return errors.Errorf("invalid named capture '%s'", subVar)
		}
	}

	return nil
}

// patExpander extracts a string by expanding its template against the regexp match
type patExpander struct {
	template string
	applyFn  func(result *RouteResult, value string) error
}

func (p *patExpander) Expand(pattern *regexp.Regexp, path string, matchIndex []int, result *RouteResult) error {
	value := pattern.ExpandString([]byte{}, p.template, path, matchIndex)
	return p.applyFn(result, string(value))
}

// RouteResult is the target schema/table plus the source type and other metadata
// extracted from a matched file path.
type RouteResult struct {
	filter.Table
	Key         string
	Compression Compression
	Type        SourceType
}
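
// The function below is a usage sketch added for illustration only; it is not
// part of the upstream router.go. It assumes the defaultFileRouteRules defined
// above, and the dump file name "mydb.mytable.0001.sql" is hypothetical.
func exampleRouteUsage() (*RouteResult, error) {
	// Build a chained router from the built-in default rules.
	router, err := NewFileRouter(defaultFileRouteRules)
	if err != nil {
		return nil, err
	}
	// The generic data-file rule should capture schema "mydb", table "mytable",
	// key "0001" and source type SourceTypeSQL from this path; a non-matching
	// path would yield a nil result with a nil error.
	return router.Route("mydb.mytable.0001.sql")
}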