// Join assembles patterns into a single regular expression pattern that
// matches whatever any of the given patterns matches.
//
// Each pattern is parsed with syntax.PerlX|syntax.ClassNL. The parsed
// expressions are merged on common prefixes while they are collected and on
// common suffixes afterwards. If a pattern fails to parse, the parse error is
// returned unchanged together with an empty string.
func Join(patterns []string) (string, error) {
	var sub []*syntax.Regexp
	for _, pattern := range patterns {
		r, err := syntax.Parse(pattern, syntax.PerlX|syntax.ClassNL)
		if err != nil {
			return "", err
		}
		sub = add(sub, breakLiterals(r))
	}
	return mergeSuffix(alternate(sub...)).String(), nil
}

// breakLiterals rewrites every multi-rune literal below r into a
// concatenation of single-rune literals (each keeping the original flags), so
// that prefixes and suffixes can later be compared rune by rune. It recurses
// into r.Sub, replacing the sub-expressions in place, and flattens r when it
// is a concatenation.
func breakLiterals(r *syntax.Regexp) *syntax.Regexp {
	if r.Op == syntax.OpLiteral {
		if len(r.Rune) <= 1 {
			return r
		}
		sub := make([]*syntax.Regexp, len(r.Rune))
		for i := range r.Rune {
			sub[i] = &syntax.Regexp{
				Op: syntax.OpLiteral, Flags: r.Flags, Rune: r.Rune[i : i+1],
			}
		}
		return concat(sub...)
	}
	for i, rr := range r.Sub {
		r.Sub[i] = breakLiterals(rr)
	}
	if r.Op == syntax.OpConcat {
		r = flattenConcat(r)
	}
	return r
}

// add inserts r2 into the list of alternatives sub and returns the updated
// list. An alternation is added branch by branch. An expression equal to an
// existing alternative is dropped, and an expression that mergePrefix can
// combine with an existing alternative replaces that alternative in place;
// otherwise r2 is appended.
func add(sub []*syntax.Regexp, r2 *syntax.Regexp) []*syntax.Regexp {
	if r2.Op == syntax.OpAlternate {
		for _, rr := range r2.Sub {
			sub = add(sub, rr)
		}
		return sub
	}
	for i, r1 := range sub {
		if r1.Equal(r2) {
			return sub
		}
		if merged := mergePrefix(r1, r2); merged != nil {
			sub[i] = merged
			return sub
		}
	}
	return append(sub, r2)
}

// mergePrefix tries to combine the alternatives r1 and r2 into a single
// expression and returns nil when none of its rules applies. The rule
// implemented by each branch is given in the comment next to it.
func mergePrefix(r1, r2 *syntax.Regexp) *syntax.Regexp {
	// Order the operands by Op so each pair is handled only once.
	if r1.Op > r2.Op {
		r1, r2 = r2, r1
	}
	switch r1.Op {
	case syntax.OpEmptyMatch:
		switch r2.Op {
		case syntax.OpLiteral, syntax.OpCharClass,
			syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
			// (?:)|x+ => x*, etc.
			return quest(r2)
		}
	case syntax.OpLiteral:
		switch r2.Op {
		case syntax.OpCharClass:
			// a|[bc] => [a-c]
			// (?i:a)|[bc] => [Aa-c]
			return charClass(appendLiteral(r2.Rune, r1.Rune[0], r1.Flags))
		case syntax.OpQuest:
			if inner := r2.Sub[0]; inner.Op == syntax.OpCharClass {
				// a|[bc]? => [a-c]?
				// (?i:a)|[bc]? => [Aa-c]?
				return quest(charClass(appendLiteral(inner.Rune, r1.Rune[0], r1.Flags)))
			}
		}
	case syntax.OpCharClass:
		switch r2.Op {
		case syntax.OpCharClass:
			// [a-c]|[d-f] => [a-f]
			return charClass(append(r1.Rune, r2.Rune...))
		case syntax.OpQuest:
			switch inner := r2.Sub[0]; inner.Op {
			case syntax.OpLiteral:
				// [ab]|c? => [a-c]?
				// [ab]|(?i:c)? => [Ca-c]?
				return quest(charClass(appendLiteral(r1.Rune, inner.Rune[0], inner.Flags)))
			case syntax.OpCharClass:
				// [ab]|[cd]? => [a-d]?
				return quest(charClass(append(r1.Rune, inner.Rune...)))
			}
		}
	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
		if r1.Sub[0].Equal(r2) {
			// x*|x => x*
			// x+|x => x+
			// x?|x => x?
			return r1
		}
		if r1.Op < r2.Op && r2.Op <= syntax.OpQuest && r1.Sub[0].Equal(r2.Sub[0]) {
			// x*|x+ => x*
			// x*|x? => x*
			// x+|x? => x*
			return &syntax.Regexp{Op: syntax.OpStar, Sub: r1.Sub}
		}
	case syntax.OpConcat:
		return mergePrefixConcat(r1, r2)
	}
	switch r2.Op {
	case syntax.OpConcat:
		return mergePrefixConcat(r2, r1)
	case syntax.OpStar, syntax.OpPlus, syntax.OpQuest:
		if r1.Equal(r2.Sub[0]) {
			// x|x* => x*
			// x|x? => x?
			// x|x+ => x+
			return r2
		}
	}
	return nil
}

// mergePrefixConcat factors the common leading elements out of the
// concatenation r1 and the expression r2. r1 must be an OpConcat. It returns
// nil when r1 and r2 share no leading element.
func mergePrefixConcat(r1, r2 *syntax.Regexp) *syntax.Regexp {
	if r2.Op == syntax.OpConcat {
		// Count the leading elements the two concatenations have in common.
		var i int
		for ; i < len(r1.Sub) && i < len(r2.Sub); i++ {
			if !r1.Sub[i].Equal(r2.Sub[i]) {
				break
			}
		}
		if i > 0 {
			// x*y*z*w*|x*y*u*v* => x*y*(?:z*w*|u*v*)
			// The prefix is copied so that r1.Sub is not overwritten.
			sub := make([]*syntax.Regexp, 0, i+1)
			sub = append(sub, r1.Sub[:i]...)
			sub = append(sub, alternate(concat(r1.Sub[i:]...), concat(r2.Sub[i:]...)))
			return concat(sub...)
		}
	} else if r1.Sub[0].Equal(r2) {
		// x*y*z*|x* => x*(?:y*z*)?
		return concat(r2, quest(concat(r1.Sub[1:]...)))
	}
	return nil
}

// mergeSuffix walks r bottom-up and, within each alternation, factors out
// common trailing elements and folds the remaining literals and character
// classes into a single character class. Concatenations are flattened on the
// way up. Sub-expressions of r are replaced in place.
func mergeSuffix(r *syntax.Regexp) *syntax.Regexp {
	for i, rr := range r.Sub {
		r.Sub[i] = mergeSuffix(rr)
	}
	switch r.Op {
	case syntax.OpAlternate:
		// k is the index of the first literal/class alternative, rs collects
		// the rune ranges of all of them (r.Rune0 serves as scratch storage),
		// and merge records whether more than one was collected.
		sub, k, rs, merge := r.Sub, -1, r.Rune0[:0], false
		for i := 0; i < len(sub); i++ {
			r1 := sub[i]
			for j := i + 1; j < len(sub); j++ {
				if merged := mergeSuffixConcat(r1, sub[j]); merged != nil {
					// Drop sub[j] and revisit index j, which now holds the
					// next alternative.
					sub = append(sub[:j], sub[j+1:]...)
					j--
					r1 = merged
				}
			}
			if r1 != sub[i] {
				sub[i] = mergeSuffix(r1)
				continue
			}
			// merge literals and character classes here
			// to prefer ax?|bx?|cx? over [abc]|ax|bx|cx
			switch r1.Op {
			case syntax.OpLiteral:
				rs = appendLiteral(rs, r1.Rune[0], r1.Flags)
			case syntax.OpCharClass:
				rs = append(rs, r1.Rune...)
			default:
				continue
			}
			if k < 0 {
				k = i
			} else {
				// Remove the alternative; its runes now live in rs.
				sub = append(sub[:i], sub[i+1:]...)
				i--
				merge = true
			}
		}
		if merge {
			// (?:a|b|[c-e]) => [a-e]
			sub[k] = charClass(rs)
		}
		return alternate(sub...)
	case syntax.OpQuest:
		if alt := r.Sub[0]; alt.Op == syntax.OpAlternate {
			for i, rr := range alt.Sub {
				if rr.Op != syntax.OpLiteral {
					continue
				}
				for _, rs := range alt.Sub {
					if rs.Op == syntax.OpConcat &&
						rs.Sub[len(rs.Sub)-1].Op == syntax.OpQuest &&
						rr.Equal(rs.Sub[len(rs.Sub)-1].Sub[0]) {
						// (?:ab?|b)? => (?:ab?|b?) => a?b?
						alt.Sub[i] = quest(rr)
						return mergeSuffix(alt)
					}
				}
			}
		}
		return r
	case syntax.OpConcat:
		return flattenConcat(r)
	default:
		return r
	}
}

// mergeSuffixConcat factors the common trailing elements out of r1 and r2,
// at least one of which must be a concatenation. It returns nil when neither
// is a concatenation or when they share no trailing element.
func mergeSuffixConcat(r1, r2 *syntax.Regexp) *syntax.Regexp {
	if r1.Op != syntax.OpConcat {
		if r2.Op != syntax.OpConcat {
			return nil
		}
		// Make r1 the concatenation.
		r1, r2 = r2, r1
	}
	if r2.Op == syntax.OpConcat {
		// Count the trailing elements the two concatenations have in common.
		var i int
		for ; i < len(r1.Sub) && i < len(r2.Sub); i++ {
			if !r1.Sub[len(r1.Sub)-1-i].Equal(r2.Sub[len(r2.Sub)-1-i]) {
				break
			}
		}
		if i > 0 {
			// x*y*z*w*|u*v*z*w* => (?:x*y*|u*v*)z*w*
			head := alternate(
				concat(r1.Sub[:len(r1.Sub)-i]...),
				concat(r2.Sub[:len(r2.Sub)-i]...),
			)
			return concat(
				append([]*syntax.Regexp{head}, r1.Sub[len(r1.Sub)-i:]...)...,
			)
		}
	} else if r1.Sub[len(r1.Sub)-1].Equal(r2) {
		// x*y*z*|z* => (?:x*y*)?z*
		return concat(quest(concat(r1.Sub[:len(r1.Sub)-1]...)), r2)
	}
	return nil
}

// flattenConcat returns a concatenation of the sub-expressions of r in which
// every direct child that is itself a concatenation is replaced by its own
// children (one level deep).
func flattenConcat(r *syntax.Regexp) *syntax.Regexp {
	// Compute the final length up front to allocate exactly once.
	n := len(r.Sub)
	for _, rr := range r.Sub {
		if rr.Op == syntax.OpConcat {
			n += len(rr.Sub) - 1
		}
	}
	sub := make([]*syntax.Regexp, 0, n)
	for _, rr := range r.Sub {
		if rr.Op == syntax.OpConcat {
			sub = append(sub, rr.Sub...)
		} else {
			sub = append(sub, rr)
		}
	}
	return concat(sub...)
}

// concat returns the concatenation of sub: an empty match for no elements,
// the element itself for one, and an OpConcat node otherwise.
func concat(sub ...*syntax.Regexp) *syntax.Regexp {
	switch len(sub) {
	case 0:
		return &syntax.Regexp{Op: syntax.OpEmptyMatch}
	case 1:
		return sub[0]
	default:
		return &syntax.Regexp{Op: syntax.OpConcat, Sub: sub}
	}
}

// alternate returns the alternation of sub. A single element is returned as
// is. Two elements are simplified where possible (see the rule comments);
// every other case yields a plain OpAlternate node over sub.
func alternate(sub ...*syntax.Regexp) *syntax.Regexp {
	switch len(sub) {
	case 1:
		return sub[0]
	case 2:
		r1, r2 := sub[0], sub[1]
		if r := mergePrefix(r1, r2); r != nil {
			return r
		}
		if r2.Op == syntax.OpEmptyMatch {
			// x*y*|(?:) => (?:x*y*)?
			return quest(r1)
		}
		switch r1.Op {
		case syntax.OpEmptyMatch:
			// (?:)|x*y* => (?:x*y*)?
			return quest(r2)
		case syntax.OpAlternate:
			// (?:x*|y*)|z* => x*|y*|z*
			return alternate(add(r1.Sub, r2)...)
		case syntax.OpQuest:
			// x?|y* => (?:x|y*)?
			return quest(alternate(r1.Sub[0], r2))
		}
		fallthrough
	default:
		return &syntax.Regexp{Op: syntax.OpAlternate, Sub: sub}
	}
}

// quest returns an expression that matches r or the empty string, avoiding a
// redundant OpQuest wrapper when r (or one branch of an alternation r) can
// already match the empty string via ?, * or a + that is turned into *.
// An OpPlus branch of an alternation is rewritten in place.
func quest(r *syntax.Regexp) *syntax.Regexp {
	switch r.Op {
	case syntax.OpQuest, syntax.OpStar:
		// (?:x?)? => x?
		// (?:x*)? => x*
		return r
	case syntax.OpPlus:
		// (?:x+)? => x*
		return &syntax.Regexp{Op: syntax.OpStar, Sub: r.Sub}
	case syntax.OpAlternate:
		for i, rr := range r.Sub {
			switch rr.Op {
			case syntax.OpQuest, syntax.OpStar:
				// (?:x|y?|z)? => x|y?|z
				// (?:x|y*|z)? => x|y*|z
				return r
			case syntax.OpPlus:
				// (?:x|y+|z)? => x|y*|z
				r.Sub[i].Op = syntax.OpStar
				return r
			}
		}
		fallthrough
	default:
		return &syntax.Regexp{Op: syntax.OpQuest, Sub: []*syntax.Regexp{r}}
	}
}

// charClassSlice sorts a flat list of (lo, hi) rune pairs by lo, moving each
// pair as a unit.
type charClassSlice []rune

// Len returns the number of (lo, hi) pairs.
func (rs charClassSlice) Len() int {
	return len(rs) / 2
}

// Less reports whether pair i starts before pair j.
func (rs charClassSlice) Less(i, j int) bool {
	return rs[i*2] < rs[j*2]
}

// Swap exchanges pairs i and j.
func (rs charClassSlice) Swap(i, j int) {
	i, j = i*2, j*2
	rs[i], rs[i+1], rs[j], rs[j+1] = rs[j], rs[j+1], rs[i], rs[i+1]
}

// charClass builds a character class from rs, a flat list of (lo, hi) rune
// pairs. rs is sorted and compacted in place: overlapping ranges are merged,
// and abutting ranges are merged unless that would turn two single runes into
// a two-rune range. A class covering every rune becomes OpAnyChar.
func charClass(rs []rune) *syntax.Regexp {
	if len(rs) == 0 {
		// Nothing to compact. Slicing rs[:2] below would panic or expose
		// stale runes from the backing array.
		return &syntax.Regexp{Op: syntax.OpCharClass, Rune: rs}
	}
	sort.Sort(charClassSlice(rs))
	// i is the index of the last range kept so far; j scans the rest.
	var i int
	for j := 2; j < len(rs); j += 2 {
		switch {
		case rs[i+1] >= rs[j]:
			if rs[i+1] < rs[j+1] {
				// [a-dc-e] => [a-e]
				rs[i+1] = rs[j+1]
			}
		case rs[i+1]+1 == rs[j]:
			switch {
			case i > 0 && rs[i-1]+1 == rs[i]:
				// [abc-e] => [a-e]
				i -= 2
				fallthrough
			case rs[i] < rs[i+1] || rs[j] < rs[j+1]:
				// [a-de], [ab-e] => [a-e]
				rs[i+1] = rs[j+1]
				continue
			}
			// [ab] =/> [a-b]
			fallthrough
		default:
			if i += 2; i != j {
				rs[i], rs[i+1] = rs[j], rs[j+1]
			}
		}
	}
	rs = rs[:i+2]
	if len(rs) == 2 && rs[0] == 0 && rs[1] == unicode.MaxRune {
		// [^a]|a => (?s:.)
		return &syntax.Regexp{Op: syntax.OpAnyChar}
	}
	return &syntax.Regexp{Op: syntax.OpCharClass, Rune: rs}
}

// appendLiteral returns the result of appending the literal x to the class
// rs, a flat list of (lo, hi) rune pairs. When flags contains syntax.FoldCase
// every rune in the simple case-folding orbit of x is appended as well.
//
// It mirrors the unexported regexp/syntax.appendLiteral and is implemented
// here so that the package does not have to reach into the standard library
// with //go:linkname.
func appendLiteral(rs []rune, x rune, flags syntax.Flags) []rune {
	rs = appendRange(rs, x, x)
	if flags&syntax.FoldCase != 0 {
		for f := unicode.SimpleFold(x); f != x; f = unicode.SimpleFold(f) {
			rs = appendRange(rs, f, f)
		}
	}
	return rs
}

// appendRange returns the result of appending the range lo-hi to the class
// rs. If the range overlaps or abuts the last or the next-to-last range of
// rs, that range is widened in place instead of appending a new pair.
func appendRange(rs []rune, lo, hi rune) []rune {
	n := len(rs)
	// Checking two ranges lets case-folded pairs such as A and a each extend
	// their own range.
	for i := 2; i <= 4; i += 2 {
		if n < i {
			continue
		}
		rlo, rhi := rs[n-i], rs[n-i+1]
		if lo <= rhi+1 && rlo <= hi+1 {
			if lo < rlo {
				rs[n-i] = lo
			}
			if hi > rhi {
				rs[n-i+1] = hi
			}
			return rs
		}
	}
	return append(rs, lo, hi)
}