github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/html2md/html2md.go (about)

     1  // a go port of html2md javascript version
     2  
     3  package html2md
     4  
     5  import (
     6  	"fmt"
     7  	"regexp"
     8  	"strconv"
     9  	"strings"
    10  )
    11  
    12  func P() *Rule {
    13  	return &Rule{
    14  		Patterns: []string{"p"},
    15  		Replacement: func(innerHTML string, attrs []string) string {
    16  			if len(attrs) > 1 {
    17  				return "\n\n" + attrs[1] + "\n"
    18  			}
    19  			return ""
    20  		},
    21  	}
    22  }
    23  
    24  func Br() *Rule {
    25  	return &Rule{
    26  		Patterns: []string{"br"},
    27  		Tp:       Void,
    28  		Replacement: func(innerHTML string, attrs []string) string {
    29  			return "  \n"
    30  		},
    31  	}
    32  }
    33  
    34  func H() *Rule {
    35  	return &Rule{
    36  		Patterns: []string{"h([1-6])"},
    37  		Replacement: func(innerHTML string, attrs []string) string {
    38  			if len(attrs) < 4 || attrs[0] != attrs[len(attrs)-1] {
    39  				return ""
    40  			}
    41  
    42  			hLevel, err := strconv.Atoi(attrs[0])
    43  			if err != nil {
    44  				fmt.Println(err)
    45  				return ""
    46  			}
    47  
    48  			return "\n\n" + strings.Repeat("#", hLevel) +
    49  				" " + attrs[2] + "\n"
    50  		},
    51  	}
    52  }
    53  
    54  func Hr() *Rule {
    55  	return &Rule{
    56  		Patterns: []string{"hr"},
    57  		Tp:       Void,
    58  		Replacement: func(innerHTML string, attrs []string) string {
    59  			return "\n\n* * *\n"
    60  		},
    61  	}
    62  }
    63  
    64  func B() *Rule {
    65  	return &Rule{
    66  		Patterns: []string{"b", "strong"},
    67  		Replacement: func(innerHTML string, attrs []string) string {
    68  			if len(attrs) > 1 {
    69  				return "**" + attrs[1] + "**"
    70  			}
    71  			return ""
    72  		},
    73  	}
    74  }
    75  
    76  func I() *Rule {
    77  	return &Rule{
    78  		Patterns: []string{"i", "em"},
    79  		Replacement: func(innerHTML string, attrs []string) string {
    80  			if len(attrs) > 1 {
    81  				return "_" + attrs[1] + "_"
    82  			}
    83  			return ""
    84  		},
    85  	}
    86  }
    87  
    88  func Code() *Rule {
    89  	return &Rule{
    90  		Patterns: []string{"code"},
    91  		Replacement: func(innerHTML string, attrs []string) string {
    92  			if len(attrs) > 1 {
    93  				return "`" + attrs[1] + "`"
    94  			}
    95  			return ""
    96  		},
    97  	}
    98  }
    99  
   100  func A() *Rule {
   101  	return &Rule{
   102  		Patterns: []string{"a"},
   103  		Replacement: func(innerHTML string, attrs []string) string {
   104  			var href string
   105  			hrefR := AttrRegExp("href")
   106  			matches := hrefR.FindStringSubmatch(attrs[0])
   107  			if len(matches) > 1 {
   108  				href = matches[1]
   109  			}
   110  
   111  			/*targetR := AttrRegExp("target")
   112  			matches = targetR.FindStringSubmatch(attrs[0])
   113  			if len(matches) > 1 {
   114  				target = matches[1]
   115  			}*/
   116  
   117  			//if len(target) > 0 {
   118  			//	return "[" + alt + "]" + "(" + src + " \"" + title + "\")"
   119  			//}
   120  			return "[" + attrs[1] + "]" + "(" + href + ")"
   121  		},
   122  	}
   123  }
   124  
   125  func SameRule(tag string, tp int) *Rule {
   126  	return &Rule{Patterns: []string{tag},
   127  		Tp: tp,
   128  		Replacement: func(innerHTML string, attrs []string) string {
   129  			return innerHTML
   130  		},
   131  	}
   132  }
   133  
   134  func Img() *Rule {
   135  	return &Rule{
   136  		Patterns: []string{"img"},
   137  		Tp:       Void,
   138  		Replacement: func(innerHTML string, attrs []string) string {
   139  			var src, alt, title string
   140  			srcR := AttrRegExp("src")
   141  			matches := srcR.FindStringSubmatch(attrs[0])
   142  			if len(matches) > 1 {
   143  				src = matches[1]
   144  			}
   145  
   146  			altR := AttrRegExp("alt")
   147  			matches = altR.FindStringSubmatch(attrs[0])
   148  			if len(matches) > 1 {
   149  				alt = matches[1]
   150  			}
   151  
   152  			titleR := AttrRegExp("title")
   153  			matches = titleR.FindStringSubmatch(attrs[0])
   154  			if len(matches) > 1 {
   155  				title = matches[1]
   156  			}
   157  
   158  			if len(title) > 0 {
   159  				if len(alt) == 0 {
   160  					alt = title
   161  				}
   162  				return "![" + alt + "]" + "(" + src + " \"" + title + "\")"
   163  			}
   164  			if len(alt) == 0 {
   165  				alt = "image"
   166  			}
   167  			return "![" + alt + "]" + "(" + src + ")"
   168  		},
   169  	}
   170  }
   171  
   172  func replaceEls(html, tag string, tp int, replacement ReplaceFunc) string {
   173  	var pattern string
   174  	if tp == Void {
   175  		pattern = "<" + tag + "\\b([^>]*)\\/?>"
   176  	} else {
   177  		pattern = "<" + tag + "\\b([^>]*)>([\\s\\S]*?)<\\/" + tag + ">"
   178  	}
   179  
   180  	re := regexp.MustCompile(pattern)
   181  	return re.ReplaceAllStringFunc(html, func(subHtml string) string {
   182  		matches := re.FindStringSubmatch(subHtml)
   183  		//fmt.Println("xx", subHtml, matches)
   184  		return replacement(subHtml, matches[1:])
   185  	})
   186  }
   187  
   188  func replaceLists(tag, html string) string {
   189  	re := regexp.MustCompile(`<(` + tag + `)\b[^>]*>([\s\S]*?)</` + tag + `>`)
   190  	html = re.ReplaceAllStringFunc(html, func(innerHTML string) string {
   191  		var lis = strings.Split(innerHTML, "</li>")
   192  		var newLis = make([]string, 0)
   193  		var prefix string = "*   "
   194  
   195  		for i, li := range lis[:len(lis)-1] {
   196  			if tag == "ol" {
   197  				prefix = fmt.Sprintf("%d.  ", i+1)
   198  			}
   199  
   200  			re := regexp.MustCompile(`([\s\S]*)<li[^>]*>([\s\S]*)`)
   201  			newLis = append(newLis, re.ReplaceAllStringFunc(li, func(innerHTML string) string {
   202  				matches := re.FindStringSubmatch(innerHTML)
   203  				innerHTML = regexp.MustCompile(`/^\s+/`).ReplaceAllString(matches[2], "")
   204  				innerHTML = regexp.MustCompile(`/\n\n/g`).ReplaceAllString(innerHTML, "\n\n    ")
   205  				// indent nested lists
   206  				innerHTML = regexp.MustCompile(`/\n([ ]*)+(\*|\d+\.) /g`).ReplaceAllString(innerHTML, "\n$1    $2 ")
   207  				return prefix + innerHTML
   208  			}))
   209  		}
   210  
   211  		return strings.Join(newLis, "\n")
   212  	})
   213  
   214  	return "\n\n" + regexp.MustCompile(`[ \t]+\n|\s+$`).ReplaceAllString(html, "")
   215  }
   216  
   217  func replaceBlockquotes(html string) string {
   218  	re := regexp.MustCompile(`<blockquote\b[^>]*>([\s\S]*?)</blockquote>`)
   219  	return re.ReplaceAllStringFunc(html, func(inner string) string {
   220  		matches := re.FindStringSubmatch(inner)
   221  		inner = regexp.MustCompile(`^\s+|\s+$`).ReplaceAllString(matches[1], "")
   222  		inner = cleanUp(inner)
   223  		inner = regexp.MustCompile(`^/gm`).ReplaceAllString(inner, "> ")
   224  		inner = regexp.MustCompile(`^(>([ \t]{2,}>)+)`).ReplaceAllString(inner, "> >")
   225  		return inner
   226  	})
   227  }
   228  
   229  func blockQuote(content string) string {
   230  	// Blockquotes
   231  	//var deepest = `<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)</blockquote>`
   232  	var deepest = `<blockquote\b[^>]*>((?:[\s\S])*?)</blockquote>`
   233  
   234  	re := regexp.MustCompile(deepest)
   235  	content = re.ReplaceAllStringFunc(content, func(str string) string {
   236  		return replaceBlockquotes(str)
   237  	})
   238  
   239  	return content
   240  }
   241  
   242  func Remove(ct, tag string) string {
   243  	re := regexp.MustCompile("\\<" + tag + "[\\S\\s]+?\\</" + tag + "\\>")
   244  	return re.ReplaceAllString(ct, "")
   245  }
   246  
   247  func cleanUp(ct string) string {
   248  	// trim leading/trailing whitespace
   249  	str := regexp.MustCompile("^[\t\r\n]+|[\t\r\n]+$").ReplaceAllString(ct, "")
   250  	str = regexp.MustCompile(`\n\s+\n`).ReplaceAllString(str, "\n\n")
   251  	// limit consecutive linebreaks to 2
   252  	str = regexp.MustCompile(`\n{3,}`).ReplaceAllString(str, "\n\n")
   253  
   254  	//去除STYLE
   255  	str = Remove(str, "style")
   256  
   257  	//去除SCRIPT
   258  	str = Remove(str, "script")
   259  
   260  	//去除所有尖括号内的HTML代码,并换成换行符
   261  	re := regexp.MustCompile("\\<[\\S\\s]+?\\>")
   262  	str = re.ReplaceAllString(str, "\n")
   263  
   264  	//去除连续的换行符
   265  	//re = regexp.MustCompile("\\s{2,}")
   266  	//str = re.ReplaceAllString(str, "\n")
   267  	return str
   268  }
   269  
   270  func pre(content string) string {
   271  	// Pre code blocks
   272  	re := regexp.MustCompile(`<pre\b[^>]*>([\s\S]*)</pre>`)
   273  	content = re.ReplaceAllStringFunc(content, func(innerHTML string) string {
   274  		matches := re.FindStringSubmatch(innerHTML)
   275  		// convert tabs to spaces (you know it makes sense)
   276  		r := regexp.MustCompile(`/^\t+`)
   277  		innerHTML = r.ReplaceAllString(matches[1], "  ")
   278  		r = regexp.MustCompile(`/\n`)
   279  		innerHTML = r.ReplaceAllString(innerHTML, "\n    ")
   280  		return "\n\n    " + innerHTML + "\n"
   281  	})
   282  	return content
   283  }
   284  
   285  func ul(content string) string {
   286  	return ulol("ul", content)
   287  }
   288  
   289  func ol(content string) string {
   290  	return ulol("ol", content)
   291  }
   292  
   293  func ulol(tag, content string) string {
   294  	// Lists
   295  
   296  	// Escape numbers that could trigger an ol
   297  	// If there are more than three spaces before the code, it would be in a pre tag
   298  	// Make sure we are escaping the period not matching any character
   299  
   300  	//content = string.replace(^(\s{0,3}\d+)\. /g, "$1\\. ");
   301  
   302  	// Converts lists that have no child lists (of same type) first, then works it"s way up
   303  	//var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi;
   304  	var noChildrenRegex = `<(` + tag + `)\b[^>]*>(?:[\s\S])*?</` + tag + `>`
   305  	re := regexp.MustCompile(noChildrenRegex)
   306  	return re.ReplaceAllStringFunc(content, func(str string) string {
   307  		return replaceLists(tag, str)
   308  	})
   309  }
   310  
// init registers the built-in element rules and whole-document converters.
//
// AddRule and AddConvert are defined elsewhere in the package; presumably
// they populate the rules and converts collections that Convert iterates
// over -- TODO confirm. The statement order is left exactly as written
// because it may determine the order in which rules and converters run.
func init() {
	// Element rules, each registered under a short key.
	AddRule("p", P())
	AddRule("i", I())
	AddRule("h", H())
	AddRule("hr", Hr())
	AddRule("img", Img())
	AddRule("b", B())
	AddRule("br", Br())
	AddRule("code", Code())
	AddRule("a", A())

	// Whole-document converters; cleanUp is registered last.
	AddConvert(pre)
	AddConvert(ul)
	AddConvert(ol)
	AddConvert(blockQuote)
	AddConvert(cleanUp)
}
   328  
   329  func Convert(content string) string {
   330  	for _, rule := range rules {
   331  		for _, pattern := range rule.Patterns {
   332  			content = replaceEls(content, pattern, rule.Tp, rule.Replacement)
   333  		}
   334  	}
   335  
   336  	for _, convert := range converts {
   337  		content = convert(content)
   338  	}
   339  
   340  	return content
   341  }