// github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/router.go

package mydump

import (
	"regexp"
	"strconv"
	"strings"

	"github.com/pingcap/tidb/util/slice"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb-tools/pkg/filter"

	"github.com/pingcap/br/pkg/lightning/config"
)

type SourceType int

const (
	SourceTypeIgnore SourceType = iota
	SourceTypeSchemaSchema
	SourceTypeTableSchema
	SourceTypeSQL
	SourceTypeCSV
	SourceTypeParquet
	SourceTypeViewSchema
)

const (
	SchemaSchema = "schema-schema"
	TableSchema  = "table-schema"
	ViewSchema   = "view-schema"
	TypeSQL      = "sql"
	TypeCSV      = "csv"
	TypeParquet  = "parquet"
	TypeIgnore   = "ignore"
)

type Compression int

const (
	CompressionNone Compression = iota
	CompressionGZ
	CompressionLZ4
	CompressionZStd
	CompressionXZ
)

func parseSourceType(t string) (SourceType, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case SchemaSchema:
		return SourceTypeSchemaSchema, nil
	case TableSchema:
		return SourceTypeTableSchema, nil
	case TypeSQL:
		return SourceTypeSQL, nil
	case TypeCSV:
		return SourceTypeCSV, nil
	case TypeParquet:
		return SourceTypeParquet, nil
	case TypeIgnore:
		return SourceTypeIgnore, nil
	case ViewSchema:
		return SourceTypeViewSchema, nil
	default:
		return SourceTypeIgnore, errors.Errorf("unknown source type '%s'", t)
	}
}

func (s SourceType) String() string {
	switch s {
	case SourceTypeSchemaSchema:
		return SchemaSchema
	case SourceTypeTableSchema:
		return TableSchema
	case SourceTypeCSV:
		return TypeCSV
	case SourceTypeSQL:
		return TypeSQL
	case SourceTypeParquet:
		return TypeParquet
	case SourceTypeViewSchema:
		return ViewSchema
	default:
		return TypeIgnore
	}
}

func parseCompressionType(t string) (Compression, error) {
	switch strings.ToLower(strings.TrimSpace(t)) {
	case "gz":
		return CompressionGZ, nil
	case "lz4":
		return CompressionLZ4, nil
	case "zstd":
		return CompressionZStd, nil
	case "xz":
		return CompressionXZ, nil
	case "":
		return CompressionNone, nil
	default:
		return CompressionNone, errors.Errorf("invalid compression type '%s'", t)
	}
}

var expandVariablePattern = regexp.MustCompile(`\$(?:\$|[\pL\p{Nd}_]+|\{[\pL\p{Nd}_]+\})`)

var defaultFileRouteRules = []*config.FileRouteRule{
	// ignore *-schema-trigger.sql and *-schema-post.sql files
	{Pattern: `(?i).*(-schema-trigger|-schema-post)\.sql$`, Type: "ignore"},
	// db schema create file pattern, matches files like '{schema}-schema-create.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)-schema-create\.sql$`, Schema: "$1", Table: "", Type: SchemaSchema},
	// table schema create file pattern, matches files like '{schema}.{table}-schema.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema\.sql$`, Schema: "$1", Table: "$2", Type: TableSchema},
	// view schema create file pattern, matches files like '{schema}.{table}-schema-view.sql'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)-schema-view\.sql$`, Schema: "$1", Table: "$2", Type: ViewSchema},
	// source file pattern, matches files like '{schema}.{table}.0001.{sql|csv|parquet}'
	{Pattern: `(?i)^(?:[^/]*/)*([^/.]+)\.(.*?)(?:\.([0-9]+))?\.(sql|csv|parquet)$`, Schema: "$1", Table: "$2", Type: "$4", Key: "$3"},
}

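// For illustration (not part of the original file): under the default rules above,
// "db1-schema-create.sql" routes to schema "db1" as a schema-creation statement,
// "db1.tbl1-schema.sql" routes to table "db1"."tbl1" as a table-creation statement,
// and "db1.tbl1.0001.csv" routes to table "db1"."tbl1" with key "0001" as CSV data.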

// FileRouter routes a file path to its target schema/table.
type FileRouter interface {
	// Route applies the rule to path. It returns nil if path doesn't match the route rule;
	// it returns an error if path matches the route rule but a captured value for some field is invalid.
	Route(path string) (*RouteResult, error)
}

// chainRouters aggregates multiple `FileRouter`s into a single router.
type chainRouters []FileRouter

func (c chainRouters) Route(path string) (*RouteResult, error) {
	for _, r := range c {
		res, err := r.Route(path)
		if err != nil {
			return nil, err
		}
		if res != nil {
			return res, nil
		}
	}
	return nil, nil
}

func NewFileRouter(cfg []*config.FileRouteRule) (FileRouter, error) {
	res := make([]FileRouter, 0, len(cfg))
	p := regexRouterParser{}
	for _, c := range cfg {
		rule, err := p.Parse(c)
		if err != nil {
			return nil, err
		}
		res = append(res, rule)
	}
	return chainRouters(res), nil
}

// RegexRouter is a `FileRouter` implementation that applies a specific regex pattern to the file path.
// If the regex pattern matches, each extractor captures the matched groups and
// sets the value of its target field in `RouteResult`.
type RegexRouter struct {
	pattern    *regexp.Regexp
	extractors []patExpander
}

func (r *RegexRouter) Route(path string) (*RouteResult, error) {
	indexes := r.pattern.FindStringSubmatchIndex(path)
	if len(indexes) == 0 {
		return nil, nil
	}
	result := &RouteResult{}
	for _, e := range r.extractors {
		err := e.Expand(r.pattern, path, indexes, result)
		if err != nil {
			return nil, err
		}
	}
	return result, nil
}

type regexRouterParser struct{}

func (p regexRouterParser) Parse(r *config.FileRouteRule) (*RegexRouter, error) {
	rule := &RegexRouter{}
	if r.Path == "" && r.Pattern == "" {
		return nil, errors.New("`path` and `pattern` must not be both empty in [[mydumper.files]]")
	}
	if r.Path != "" && r.Pattern != "" {
		return nil, errors.New("can't set both `path` and `pattern` field in [[mydumper.files]]")
	}
	if r.Path != "" {
		// convert the constant string into a regexp pattern
		r.Pattern = regexp.QuoteMeta(r.Path)
		// escape all '$' as '$$' in the match templates
		quoteTmplFn := func(t string) string { return strings.ReplaceAll(t, "$", "$$") }
		r.Table = quoteTmplFn(r.Table)
		r.Schema = quoteTmplFn(r.Schema)
		r.Type = quoteTmplFn(r.Type)
		r.Compression = quoteTmplFn(r.Compression)
		r.Key = quoteTmplFn(r.Key)
	}
	pattern, err := regexp.Compile(r.Pattern)
	if err != nil {
		return nil, errors.Trace(err)
	}
	rule.pattern = pattern

	err = p.parseFieldExtractor(rule, "type", r.Type, func(result *RouteResult, value string) error {
		ty, err := parseSourceType(value)
		if err != nil {
			return err
		}
		result.Type = ty
		return nil
	})
	if err != nil {
		return nil, err
	}
	// an ignore pattern doesn't need the other fields parsed
	if r.Type == TypeIgnore {
		return rule, nil
	}

	err = p.parseFieldExtractor(rule, "schema", r.Schema, func(result *RouteResult, value string) error {
		result.Schema = value
		return nil
	})
	if err != nil {
		return nil, err
	}

	// special case: when the pattern is for a db schema, the table name should not be parsed
	if r.Type != SchemaSchema {
		err = p.parseFieldExtractor(rule, "table", r.Table, func(result *RouteResult, value string) error {
			result.Name = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

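	// Note (added for clarity): `key` normally captures a data-file sequence number,
	// e.g. the "0001" in "db1.tbl1.0001.sql" under the default rules.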
	if len(r.Key) > 0 {
		err = p.parseFieldExtractor(rule, "key", r.Key, func(result *RouteResult, value string) error {
			result.Key = value
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	if len(r.Compression) > 0 {
		err = p.parseFieldExtractor(rule, "compression", r.Compression, func(result *RouteResult, value string) error {
			// TODO: we should support restoring compressed source files
			compression, err := parseCompressionType(value)
			if err != nil {
				return err
			}
			if compression != CompressionNone {
				return errors.New("currently we don't support restoring compressed source files yet")
			}
			result.Compression = compression
			return nil
		})
		if err != nil {
			return nil, err
		}
	}

	return rule, nil
}

// parseFieldExtractor parses the extractor template for a single field and appends it to rule.extractors.
func (p regexRouterParser) parseFieldExtractor(
	rule *RegexRouter,
	field,
	fieldPattern string,
	applyFn func(result *RouteResult, value string) error,
) error {
	// an empty field pattern is not allowed
	if len(fieldPattern) == 0 {
		return errors.Errorf("field '%s' match pattern can't be empty", field)
	}

	// check and parse the regexp template
	if err := p.checkSubPatterns(rule.pattern, fieldPattern); err != nil {
		return errors.Trace(err)
	}
	rule.extractors = append(rule.extractors, patExpander{
		template: fieldPattern,
		applyFn:  applyFn,
	})
	return nil
}

func (p regexRouterParser) checkSubPatterns(pat *regexp.Regexp, t string) error {
	subPats := expandVariablePattern.FindAllString(t, -1)
	for _, subVar := range subPats {
		var tmplName string
		switch {
		case subVar == "$$":
			continue
		case strings.HasPrefix(subVar, "${"):
			tmplName = subVar[2 : len(subVar)-1]
		default:
			tmplName = subVar[1:]
		}
		if number, err := strconv.Atoi(tmplName); err == nil {
			if number > pat.NumSubexp() {
				return errors.Errorf("sub pattern capture '%s' out of range", subVar)
			}
		} else if !slice.AnyOf(pat.SubexpNames(), func(i int) bool {
			// FIXME: we should use re.SubexpIndex here, but it's not supported in go1.13 yet
			return pat.SubexpNames()[i] == tmplName
		}) {
			return errors.Errorf("invalid named capture '%s'", subVar)
		}
	}

	return nil
}

// patExpander extracts a string by expanding the template with the regexp pattern.
type patExpander struct {
	template string
	applyFn  func(result *RouteResult, value string) error
}

func (p *patExpander) Expand(pattern *regexp.Regexp, path string, matchIndex []int, result *RouteResult) error {
	value := pattern.ExpandString([]byte{}, p.template, path, matchIndex)
	return p.applyFn(result, string(value))
}

type RouteResult struct {
	filter.Table
	Key         string
	Compression Compression
	Type        SourceType
}
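
// The function below is an illustrative sketch rather than part of the original
// file: it shows how a caller might combine NewFileRouter with the default rules
// to route a dump file path. The name routeExampleSketch is hypothetical.
func routeExampleSketch() (*RouteResult, error) {
	router, err := NewFileRouter(defaultFileRouteRules)
	if err != nil {
		return nil, err
	}
	// "db1.tbl1.0001.sql" matches the source-file rule: schema "db1", table "tbl1",
	// key "0001", type SQL.
	return router.Route("db1.tbl1.0001.sql")
}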