github.com/admpub/gopiper@v1.1.1/filter.go (about) 1 package gopiper 2 3 import ( 4 "errors" 5 "fmt" 6 "html" 7 "path" 8 "reflect" 9 "regexp" 10 "strconv" 11 "strings" 12 "time" 13 14 "github.com/admpub/regexp2" 15 "github.com/webx-top/com" 16 ) 17 18 func init() { 19 RegisterFilter("preadd", preadd, "添加前缀", `preadd(prefix)`, ``) 20 RegisterFilter("postadd", postadd, "添加后缀", `postadd(suffix)`, ``) 21 RegisterFilter("replace", replace, "替换", `replace(find,replace)`, ``) 22 RegisterFilter("split", split, "将字符串按指定分隔符分割成数组", `split(-)`, ``) 23 RegisterFilter("join", join, "合并数组为字符串", `join(-)`, ``) 24 RegisterFilter("trim", trim, "剪掉头尾指定字符", `trim(;)`, ``) 25 RegisterFilter("trimleft", trimleft, "从左边剪掉指定字符串", `trimleft(a-)`, ``) 26 RegisterFilter("trimright", trimright, "从右边剪掉指定字符串", `trimright(.html)`, ``) 27 RegisterFilter("trimspace", trimspace, "剪掉头尾空白", `trimspace`, ``) 28 RegisterFilter("substr", substr, "获取子字符串。字符串总是从左向右从0开始编号,参数1和参数2分别用来指定要截取的起止位置编号,截取子字符串时,总是包含起始编号的字符,不包含终止编号的字符", `substr(0,5)`, ``) 29 RegisterFilter("intval", intval, "转换为整数", `intval`, ``) 30 RegisterFilter("floatval", floatval, "转换为小数", `floatval`, ``) 31 RegisterFilter("hrefreplace", hrefreplace, "替换href属性。$2为捕获到的href属性值", `hrefreplace(data-url="$2")`, ``) 32 RegisterFilter("regexpreplace", regexpreplace, "正则替换(regexp2引擎)。参数1为正则表达式,参数2为替换成的新内容,参数3为起始位置编号(从0开始),参数4为替换次数(-1代表相对全部替换,-2代表绝对全部替换)", `regexpreplace(^A$,B,0,-1)`, ``) 33 RegisterFilter("wraphtml", wraphtml, "将采集到的数据用HTML标签包围起来", `wraphtml(a)`, ``) 34 RegisterFilter("tosbc", tosbc, "将全角的标点符号和英文字母转换为半角", `tosbc`, ``) 35 RegisterFilter("unescape", unescape, "解码HTML", `unescape`, ``) 36 RegisterFilter("escape", escape, "编码HTML", `escape`, ``) 37 RegisterFilter("sprintf", sprintf, "格式化", `sprintf(%s)`, ``) 38 RegisterFilter("sprintfmap", sprintfmap, "用map值格式化(前提是采集到的数据必须是map类型)。参数1为模板字符串,其它参数用于指定相应map元素值的键值", `sprintfmap(%v-%v,a,b)`, ``) 39 RegisterFilter("unixtime", unixtime, "UNIX时间戳(秒)。如果带参数则代表将获取到的数据按照参数指定的格式转为时间戳;不带参数则获取当前时间戳", `unixtime(DateTime)`, `unixtime、unixtime(Y-m-d H:i:s)、unixtime(DateTime) 或 unixtime(2006-01-02 15:04:05)`) 40 RegisterFilter("unixmill", unixmill, "获取当前UNIX时间戳(毫秒)", `unixmill`, ``) 41 RegisterFilter("paging", paging, "分页。参数1为起始页码,参数2为终止页码,参数3为步进值(可选)。需要在网址中添加页码占位符“{0}”,一般与sprintf组合起来使用。经过paging处理后的网址会变成网址数组", `paging(1,10,1)`, `sprintf(%s?page={0})|paging(1,10)`) 42 RegisterFilter("quote", quote, "用双引号包起来", `quote`, ``) 43 RegisterFilter("unquote", unquote, "取消双引号包围", `unquote`, ``) 44 RegisterFilter("saveto", saveto, "下载并保存文件到指定位置", `saveto(savePath)`, ``) 45 RegisterFilter("fetch", fetch, "抓取网址内容。参数pageType仅支持html、json、text这三个值", `fetch(pageType,selector)`, ``) 46 RegisterFilter("basename", basename, "获取文件名", `basename`, ``) 47 RegisterFilter("extension", extension, "获取扩展名", `extension`, ``) 48 } 49 50 type FilterFunction func(pipe *PipeItem, src interface{}, params string) (interface{}, error) 51 52 func NewFilter(name string, fn FilterFunction, description, usage, example string) *Filter { 53 return &Filter{ 54 Name: name, 55 function: fn, 56 Description: description, 57 Usage: usage, 58 Example: example, 59 } 60 } 61 62 type Filter struct { 63 Name string 64 function FilterFunction 65 Description string `json:",omitempty"` 66 Usage string `json:",omitempty"` 67 Example string `json:",omitempty"` 68 } 69 70 var filters = make(map[string]*Filter) 71 72 func RegisterFilter(name string, fn FilterFunction, description, usage, example string) { 73 _, existing := filters[name] 74 if existing { 75 panic(fmt.Sprintf("Filter with name '%s' is already registered.", name)) 76 } 77 filters[name] = NewFilter(name, fn, description, usage, example) 78 } 79 80 func ReplaceFilter(name string, fn FilterFunction, description, usage, example string) { 81 _, existing := filters[name] 82 if !existing { 83 panic(fmt.Sprintf("Filter with name '%s' does not exist (therefore cannot be overridden).", name)) 84 } 85 filters[name] = NewFilter(name, fn, description, usage, example) 86 } 87 88 func AllFilter() map[string]*Filter { 89 return filters 90 } 91 92 var ( 93 filterExp = regexp.MustCompile(`([a-zA-Z0-9\-_]+)(?:\(([\w\W]*?)\))?(\||$)`) 94 hrefFilterExp = regexp.MustCompile(`href(?:\s*)=(?:\s*)(['"])?([^'" ]*)(['"])?`) 95 hrefFilterExp2 = regexp2.MustCompile(`href(?:\s*)=(?:\s*)(['"]?)([^'" ]*)\1`, regexp2.IgnoreCase) 96 ) 97 98 func applyFilter(pipe *PipeItem, name string, src interface{}, params string) (interface{}, error) { 99 filter, existing := filters[name] 100 if !existing { 101 return nil, fmt.Errorf("Filter with name '%s' not found", name) 102 } 103 return filter.function(pipe, src, params) 104 } 105 106 func callFilter(pipe *PipeItem, src interface{}, value string) (interface{}, error) { 107 108 if src == nil || len(value) == 0 { 109 return src, nil 110 } 111 112 vt := filterExp.FindAllStringSubmatch(value, -1) 113 114 for _, v := range vt { 115 if len(v) < 3 { 116 continue 117 } 118 name := v[1] 119 params := v[2] 120 next, err := applyFilter(pipe, name, src, params) 121 if err != nil { 122 if err == ErrInvalidContent { 123 return next, err 124 } 125 continue 126 } 127 src = next 128 } 129 130 return src, nil 131 } 132 133 // fetch(pageType,selector) 134 func fetch(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 135 if pipe.fetcher == nil { 136 return src, ErrFetcherNotRegistered 137 } 138 var ( 139 pageType = pipe.pageType 140 selector string 141 ) 142 paramList := SplitParams(params, `,`) 143 switch len(paramList) { 144 case 2: 145 selector = paramList[1] 146 fallthrough 147 case 1: 148 pageType = paramList[0] 149 } 150 return _filterValue(src, func(v string) (interface{}, error) { 151 body, err := pipe.fetcher(v) 152 if err != nil { 153 return nil, err 154 } 155 if len(selector) == 0 { 156 return string(body), nil 157 } 158 pipe2 := &PipeItem{ 159 Name: ``, 160 Selector: selector, 161 Type: PT_STRING, 162 Filter: ``, 163 } 164 return pipe2.PipeBytes(body, pageType) 165 }) 166 } 167 168 // saveto(savePath) 169 func saveto(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 170 if pipe.storer == nil { 171 return src, ErrStorerNotRegistered 172 } 173 var ( 174 fetched bool 175 savePath string 176 ) 177 paramList := SplitParams(params, `,`) 178 switch len(paramList) { 179 case 2: 180 fetched, _ = strconv.ParseBool(strings.TrimSpace(paramList[1])) 181 fallthrough 182 case 1: 183 savePath = strings.TrimSpace(paramList[0]) 184 } 185 return _filterValue(src, func(v string) (interface{}, error) { 186 return pipe.storer(v, savePath, fetched) 187 }) 188 } 189 190 // preadd(prefix) => {prefix}{src} 191 func preadd(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 192 return _filterValue(src, func(v string) (interface{}, error) { 193 return params + v, nil 194 }, func(_ interface{}) (interface{}, error) { 195 return params, nil 196 }) 197 } 198 199 // postadd(suffix) => {src}{suffix} 200 func postadd(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 201 return _filterValue(src, func(v string) (interface{}, error) { 202 return v + params, nil 203 }, func(_ interface{}) (interface{}, error) { 204 return params, nil 205 }) 206 } 207 208 func _substr(src string, params string) string { 209 vt := strings.Split(params, ",") 210 switch len(vt) { 211 case 1: 212 start, _ := strconv.Atoi(vt[0]) 213 return src[start:] 214 case 2: 215 start, _ := strconv.Atoi(vt[0]) 216 end, _ := strconv.Atoi(vt[1]) 217 return src[start:end] 218 } 219 return src 220 } 221 222 // substr(0,5) => src[0:5] 223 // substr(5) => src[5:] 224 func substr(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 225 return _filterValue(src, func(v string) (interface{}, error) { 226 return _substr(v, params), nil 227 }) 228 } 229 230 func _replace(src string, params string) string { 231 vt := SplitParams(params) 232 switch len(vt) { 233 case 1: 234 return strings.Replace(src, vt[0], "", -1) 235 case 2: 236 return strings.Replace(src, vt[0], vt[1], -1) 237 case 3: 238 n, _ := strconv.Atoi(vt[2]) 239 return strings.Replace(src, vt[0], vt[1], n) 240 } 241 return src 242 } 243 244 // replace(find,replace) => src=findaaa => replaceaaa 245 // replace(find) => src=findaaa => aaa 246 func replace(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 247 return _filterValue(src, func(v string) (interface{}, error) { 248 return _replace(v, params), nil 249 }) 250 } 251 252 // trim(;) => src=;a; => a 253 func trim(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 254 if len(params) == 0 { 255 return src, ErrTrimNilParams 256 } 257 return _filterValue(src, func(v string) (interface{}, error) { 258 return strings.Trim(v, params), nil 259 }) 260 } 261 func trimleft(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 262 if len(params) == 0 { 263 return src, ErrTrimNilParams 264 } 265 return _filterValue(src, func(v string) (interface{}, error) { 266 return strings.TrimLeft(v, params), nil 267 }) 268 } 269 func trimright(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 270 if len(params) == 0 { 271 return src, ErrTrimNilParams 272 } 273 return _filterValue(src, func(v string) (interface{}, error) { 274 return strings.TrimRight(v, params), nil 275 }) 276 } 277 278 // trimspace => src=" \naaa\n " => "aaa" 279 func trimspace(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 280 return _filterValue(src, func(v string) (interface{}, error) { 281 return strings.TrimSpace(v), nil 282 }) 283 } 284 285 // split(:) => src="a:b" => [a,b] 286 func split(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 287 if len(params) == 0 { 288 return src, ErrSplitNilParams 289 } 290 return _filterValue(src, func(v string) (interface{}, error) { 291 str := strings.TrimSpace(v) 292 if len(str) == 0 { 293 return []string{}, nil 294 } 295 return strings.Split(str, params), nil 296 }) 297 } 298 299 // join(:) => src=["a","b"] => a:b 300 func join(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 301 if len(params) == 0 { 302 return src, ErrJoinNilParams 303 } 304 switch vt := src.(type) { 305 case []string: 306 rs := make([]string, 0, len(vt)) 307 for _, v := range vt { 308 if len(v) > 0 { 309 rs = append(rs, v) 310 } 311 } 312 return strings.Join(rs, params), nil 313 default: 314 return vt, nil 315 } 316 } 317 318 // intval => src="123" => 123 319 func intval(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 320 return _filterValue(src, func(v string) (interface{}, error) { 321 return strconv.Atoi(v) 322 }) 323 } 324 325 // basename => src="a/b/c.html" => c.html 326 func basename(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 327 return _filterValue(src, func(v string) (interface{}, error) { 328 return path.Base(v), nil 329 }) 330 } 331 332 // extension => src="a/b/c.html" => .html 333 func extension(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 334 return _filterValue(src, func(v string) (interface{}, error) { 335 return path.Ext(v), nil 336 }) 337 } 338 339 // floatval => src="12.3" => 12.3 340 func floatval(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 341 return _filterValue(src, func(v string) (interface{}, error) { 342 return strconv.ParseFloat(v, 64) 343 }) 344 } 345 346 // hrefreplace(data-url="$2") => src=`href="http://www.admpub.com"` => data-url="http://www.admpub.com" 347 func hrefreplace(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 348 return _filterValue(src, func(v string) (interface{}, error) { 349 return hrefFilterExp2.Replace(v, params, 0, -1) 350 //return hrefFilterExp.ReplaceAllString(v, params), nil 351 }) 352 } 353 354 // regexpreplace(^1) => src="1233" => "233" 355 // regexpreplace(^1,2) => src="1233" => "2233" 356 func regexpreplace(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 357 vt := SplitParams(params) 358 var ( 359 expr string 360 repl string 361 startAt int 362 count = -1 363 ) 364 switch len(vt) { 365 case 4: 366 count, _ = strconv.Atoi(vt[3]) 367 fallthrough 368 case 3: 369 startAt, _ = strconv.Atoi(vt[2]) 370 fallthrough 371 case 2: 372 repl = vt[1] 373 fallthrough 374 case 1: 375 expr = vt[0] 376 } 377 re, err := regexp2.Compile(expr, 0) 378 if err != nil { 379 return src, err 380 } 381 return _filterValue(src, func(v string) (interface{}, error) { 382 if count < -1 { 383 find, err := re.MatchString(v) 384 for find { 385 v, err = re.Replace(v, repl, startAt, -1) 386 if err != nil { 387 return v, err 388 } 389 if len(v) == 0 { 390 break 391 } 392 find, err = re.MatchString(v) 393 } 394 return v, err 395 } 396 return re.Replace(v, repl, startAt, count) 397 }) 398 } 399 400 // 将全角的标点符号和英文字母转换为半角 401 func _tosbc(src string) string { 402 var res string 403 for _, t := range src { 404 if t == 12288 { 405 t = 32 406 } else if t > 65280 && t < 65375 { 407 t = t - 65248 408 } 409 res += string(t) 410 } 411 return res 412 } 413 414 // tosbc => src="1~2" => "1~2" 415 func tosbc(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 416 return _filterValue(src, func(v string) (interface{}, error) { 417 return _tosbc(v), nil 418 }) 419 } 420 421 // unescape => src="<" => "<" 422 func unescape(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 423 return _filterValue(src, func(v string) (interface{}, error) { 424 return html.UnescapeString(v), nil 425 }) 426 } 427 428 // escape => src="<" => "<" 429 func escape(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 430 return _filterValue(src, func(v string) (interface{}, error) { 431 return html.EscapeString(v), nil 432 }) 433 } 434 435 // wraphtml(a) => <a>{src}</a> 436 func wraphtml(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 437 if len(params) == 0 { 438 return src, errors.New("filter wraphtml nil params") 439 } 440 441 return _filterValue(src, func(v string) (interface{}, error) { 442 return fmt.Sprintf("<%s>%s</%s>", params, v, params), nil 443 }) 444 } 445 446 // sprintf_multi_param(%veee%v) src=[1,2] => 1eee2 447 func sprintf_multi_param(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 448 if len(params) == 0 { 449 return src, errors.New("filter split nil params ") 450 } 451 452 srcValue := reflect.ValueOf(src) 453 srcType := srcValue.Type() 454 if srcType.Kind() == reflect.Array || srcType.Kind() == reflect.Slice { 455 count := strings.Count(params, "%") 456 size := srcValue.Len() 457 ret := make([]interface{}, 0, size) 458 for i := 0; i < size; i++ { 459 ret = append(ret, srcValue.Index(i).Interface()) 460 } 461 if len(ret) > count { 462 return fmt.Sprintf(params, ret[:count]...), nil 463 } 464 return fmt.Sprintf(params, ret...), nil 465 } 466 467 return fmt.Sprintf(params, src), nil 468 } 469 470 // sprintf(%s) src=a => a 471 func sprintf(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 472 if len(params) == 0 { 473 return src, errors.New("filter split nil params") 474 } 475 return _filterValue(src, func(v string) (interface{}, error) { 476 return fmt.Sprintf(params, v), nil 477 }) 478 } 479 480 // sprintfmap(%v-%v,a,b) src={"a":1,"b":2} => "1-2" 481 func sprintfmap(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 482 if len(params) == 0 { 483 return src, errors.New("filter split nil params") 484 } 485 msrc, ok := src.(map[string]interface{}) 486 if ok == false { 487 return src, errors.New("value is not map[string]interface{}") 488 } 489 vt := SplitParams(params) 490 if len(vt) <= 1 { 491 return src, errors.New("params length must > 1") 492 } 493 pArray := []interface{}{} 494 for _, x := range vt[1:] { 495 if vm, ok := msrc[x]; ok { 496 pArray = append(pArray, vm) 497 } else { 498 pArray = append(pArray, nil) 499 } 500 } 501 return fmt.Sprintf(vt[0], pArray...), nil 502 } 503 504 var timeFormatNames = map[string]string{ 505 `Layout`: time.Layout, 506 `ANSIC`: time.ANSIC, 507 `UnixDate`: time.UnixDate, 508 `RubyDate`: time.RubyDate, 509 `RFC822`: time.RFC822, 510 `RFC822Z`: time.RFC822Z, 511 `RFC850`: time.RFC850, 512 `RFC1123`: time.RFC1123, 513 `RFC1123Z`: time.RFC1123Z, 514 `RFC3339`: time.RFC3339, 515 `RFC3339Nano`: time.RFC3339Nano, 516 `Kitchen`: time.Kitchen, 517 // Handy time stamps. 518 `Stamp`: time.Stamp, 519 `StampMilli`: time.StampMilli, 520 `StampMicro`: time.StampMicro, 521 `StampNano`: time.StampNano, 522 `DateTime`: time.DateTime, 523 `DateOnly`: time.DateOnly, 524 `TimeOnly`: time.TimeOnly, 525 } 526 527 // unixtime 时间戳(总秒数) 528 func unixtime(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 529 if len(params) == 0 { 530 return time.Now().Unix(), nil 531 } 532 layout := params 533 if len(layout) == 0 { 534 return time.Now().Unix(), nil 535 } 536 srcString, ok := src.(string) 537 if !ok { 538 return src, errors.New("value is not string") 539 } 540 if v, y := timeFormatNames[layout]; y { 541 layout = v 542 } else { 543 layout = com.ConvDateFormat(layout) 544 } 545 t, err := time.Parse(layout, srcString) 546 if err != nil { 547 return t, err 548 } 549 return t.Unix(), nil 550 } 551 552 // unixmill 时间戳(总毫秒数) 553 func unixmill(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 554 return time.Now().UnixNano() / int64(time.Millisecond), nil 555 } 556 557 // paging(startAt,endAt,step) 558 // paging(1,10) / paging(1,10,2) 559 func paging(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 560 if len(params) == 0 { 561 return src, errors.New("filter paging nil params") 562 } 563 vt := strings.Split(params, ",") 564 if len(vt) < 2 { 565 return src, errors.New("params length must > 1") 566 } 567 start, err := strconv.Atoi(vt[0]) 568 if err != nil { 569 return src, errors.New("params type error:need int." + err.Error()) 570 } 571 end, err := strconv.Atoi(vt[1]) 572 if err != nil { 573 return src, errors.New("params type error:need int." + err.Error()) 574 } 575 576 offset := -1 577 if len(vt) == 3 { 578 offset, err = strconv.Atoi(vt[2]) 579 if err != nil { 580 return src, errors.New("params type error:need int." + err.Error()) 581 } 582 if offset < 1 { 583 return src, errors.New("offset must > 0") 584 } 585 } 586 587 var result []string 588 switch vt := src.(type) { 589 case []interface{}: 590 for i := start; i <= end; i++ { 591 for _, v := range vt { 592 if offset > 0 { 593 result = append(result, sprintf_replace(com.String(v), []string{strconv.Itoa(i * offset), strconv.Itoa((i + 1) * offset)})) 594 } else { 595 result = append(result, sprintf_replace(com.String(v), []string{strconv.Itoa(i)})) 596 } 597 } 598 } 599 return result, nil 600 601 case []string: 602 for i := start; i <= end; i++ { 603 for _, v := range vt { 604 if offset > 0 { 605 result = append(result, sprintf_replace(v, []string{strconv.Itoa(i * offset), strconv.Itoa((i + 1) * offset)})) 606 } else { 607 result = append(result, sprintf_replace(v, []string{strconv.Itoa(i)})) 608 } 609 } 610 611 } 612 return result, nil 613 614 case string: 615 for i := start; i <= end; i++ { 616 if offset > 0 { 617 result = append(result, sprintf_replace(vt, []string{strconv.Itoa(i * offset), strconv.Itoa((i + 1) * offset)})) 618 } else { 619 result = append(result, sprintf_replace(vt, []string{strconv.Itoa(i)})) 620 } 621 } 622 return result, nil 623 624 default: 625 return vt, errors.New("do nothing,src type not support! need slice,array or string") 626 } 627 } 628 629 func sprintf_replace(src string, param []string) string { 630 for i := range param { 631 src = strings.Replace(src, "{"+strconv.Itoa(i)+"}", param[i], -1) 632 } 633 return src 634 } 635 636 // quote => src=`a` => `"a"` 637 func quote(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 638 return _filterValue(src, func(v string) (interface{}, error) { 639 return strconv.Quote(v), nil 640 }) 641 } 642 643 // unquote => src=`"a"` => `a` 644 func unquote(pipe *PipeItem, src interface{}, params string) (interface{}, error) { 645 return _filterValue(src, func(v string) (interface{}, error) { 646 return strconv.Unquote(`"` + v + `"`) 647 }) 648 }