github.com/insionng/yougam@v0.0.0-20170714101924-2bc18d833463/libraries/html2md/html2md.go (about) 1 // a go port of html2md javascript version 2 3 package html2md 4 5 import ( 6 "fmt" 7 "regexp" 8 "strconv" 9 "strings" 10 ) 11 12 func P() *Rule { 13 return &Rule{ 14 Patterns: []string{"p"}, 15 Replacement: func(innerHTML string, attrs []string) string { 16 if len(attrs) > 1 { 17 return "\n\n" + attrs[1] + "\n" 18 } 19 return "" 20 }, 21 } 22 } 23 24 func Br() *Rule { 25 return &Rule{ 26 Patterns: []string{"br"}, 27 Tp: Void, 28 Replacement: func(innerHTML string, attrs []string) string { 29 return " \n" 30 }, 31 } 32 } 33 34 func H() *Rule { 35 return &Rule{ 36 Patterns: []string{"h([1-6])"}, 37 Replacement: func(innerHTML string, attrs []string) string { 38 if len(attrs) < 4 || attrs[0] != attrs[len(attrs)-1] { 39 return "" 40 } 41 42 hLevel, err := strconv.Atoi(attrs[0]) 43 if err != nil { 44 fmt.Println(err) 45 return "" 46 } 47 48 return "\n\n" + strings.Repeat("#", hLevel) + 49 " " + attrs[2] + "\n" 50 }, 51 } 52 } 53 54 func Hr() *Rule { 55 return &Rule{ 56 Patterns: []string{"hr"}, 57 Tp: Void, 58 Replacement: func(innerHTML string, attrs []string) string { 59 return "\n\n* * *\n" 60 }, 61 } 62 } 63 64 func B() *Rule { 65 return &Rule{ 66 Patterns: []string{"b", "strong"}, 67 Replacement: func(innerHTML string, attrs []string) string { 68 if len(attrs) > 1 { 69 return "**" + attrs[1] + "**" 70 } 71 return "" 72 }, 73 } 74 } 75 76 func I() *Rule { 77 return &Rule{ 78 Patterns: []string{"i", "em"}, 79 Replacement: func(innerHTML string, attrs []string) string { 80 if len(attrs) > 1 { 81 return "_" + attrs[1] + "_" 82 } 83 return "" 84 }, 85 } 86 } 87 88 func Code() *Rule { 89 return &Rule{ 90 Patterns: []string{"code"}, 91 Replacement: func(innerHTML string, attrs []string) string { 92 if len(attrs) > 1 { 93 return "`" + attrs[1] + "`" 94 } 95 return "" 96 }, 97 } 98 } 99 100 func A() *Rule { 101 return &Rule{ 102 Patterns: []string{"a"}, 103 Replacement: func(innerHTML string, attrs []string) string { 104 var href string 105 hrefR := AttrRegExp("href") 106 matches := hrefR.FindStringSubmatch(attrs[0]) 107 if len(matches) > 1 { 108 href = matches[1] 109 } 110 111 /*targetR := AttrRegExp("target") 112 matches = targetR.FindStringSubmatch(attrs[0]) 113 if len(matches) > 1 { 114 target = matches[1] 115 }*/ 116 117 //if len(target) > 0 { 118 // return "[" + alt + "]" + "(" + src + " \"" + title + "\")" 119 //} 120 return "[" + attrs[1] + "]" + "(" + href + ")" 121 }, 122 } 123 } 124 125 func SameRule(tag string, tp int) *Rule { 126 return &Rule{Patterns: []string{tag}, 127 Tp: tp, 128 Replacement: func(innerHTML string, attrs []string) string { 129 return innerHTML 130 }, 131 } 132 } 133 134 func Img() *Rule { 135 return &Rule{ 136 Patterns: []string{"img"}, 137 Tp: Void, 138 Replacement: func(innerHTML string, attrs []string) string { 139 var src, alt, title string 140 srcR := AttrRegExp("src") 141 matches := srcR.FindStringSubmatch(attrs[0]) 142 if len(matches) > 1 { 143 src = matches[1] 144 } 145 146 altR := AttrRegExp("alt") 147 matches = altR.FindStringSubmatch(attrs[0]) 148 if len(matches) > 1 { 149 alt = matches[1] 150 } 151 152 titleR := AttrRegExp("title") 153 matches = titleR.FindStringSubmatch(attrs[0]) 154 if len(matches) > 1 { 155 title = matches[1] 156 } 157 158 if len(title) > 0 { 159 if len(alt) == 0 { 160 alt = title 161 } 162 return "![" + alt + "]" + "(" + src + " \"" + title + "\")" 163 } 164 if len(alt) == 0 { 165 alt = "image" 166 } 167 return "![" + alt + "]" + "(" + src + ")" 168 }, 169 } 170 } 171 172 func replaceEls(html, tag string, tp int, replacement ReplaceFunc) string { 173 var pattern string 174 if tp == Void { 175 pattern = "<" + tag + "\\b([^>]*)\\/?>" 176 } else { 177 pattern = "<" + tag + "\\b([^>]*)>([\\s\\S]*?)<\\/" + tag + ">" 178 } 179 180 re := regexp.MustCompile(pattern) 181 return re.ReplaceAllStringFunc(html, func(subHtml string) string { 182 matches := re.FindStringSubmatch(subHtml) 183 //fmt.Println("xx", subHtml, matches) 184 return replacement(subHtml, matches[1:]) 185 }) 186 } 187 188 func replaceLists(tag, html string) string { 189 re := regexp.MustCompile(`<(` + tag + `)\b[^>]*>([\s\S]*?)</` + tag + `>`) 190 html = re.ReplaceAllStringFunc(html, func(innerHTML string) string { 191 var lis = strings.Split(innerHTML, "</li>") 192 var newLis = make([]string, 0) 193 var prefix string = "* " 194 195 for i, li := range lis[:len(lis)-1] { 196 if tag == "ol" { 197 prefix = fmt.Sprintf("%d. ", i+1) 198 } 199 200 re := regexp.MustCompile(`([\s\S]*)<li[^>]*>([\s\S]*)`) 201 newLis = append(newLis, re.ReplaceAllStringFunc(li, func(innerHTML string) string { 202 matches := re.FindStringSubmatch(innerHTML) 203 innerHTML = regexp.MustCompile(`/^\s+/`).ReplaceAllString(matches[2], "") 204 innerHTML = regexp.MustCompile(`/\n\n/g`).ReplaceAllString(innerHTML, "\n\n ") 205 // indent nested lists 206 innerHTML = regexp.MustCompile(`/\n([ ]*)+(\*|\d+\.) /g`).ReplaceAllString(innerHTML, "\n$1 $2 ") 207 return prefix + innerHTML 208 })) 209 } 210 211 return strings.Join(newLis, "\n") 212 }) 213 214 return "\n\n" + regexp.MustCompile(`[ \t]+\n|\s+$`).ReplaceAllString(html, "") 215 } 216 217 func replaceBlockquotes(html string) string { 218 re := regexp.MustCompile(`<blockquote\b[^>]*>([\s\S]*?)</blockquote>`) 219 return re.ReplaceAllStringFunc(html, func(inner string) string { 220 matches := re.FindStringSubmatch(inner) 221 inner = regexp.MustCompile(`^\s+|\s+$`).ReplaceAllString(matches[1], "") 222 inner = cleanUp(inner) 223 inner = regexp.MustCompile(`^/gm`).ReplaceAllString(inner, "> ") 224 inner = regexp.MustCompile(`^(>([ \t]{2,}>)+)`).ReplaceAllString(inner, "> >") 225 return inner 226 }) 227 } 228 229 func blockQuote(content string) string { 230 // Blockquotes 231 //var deepest = `<blockquote\b[^>]*>((?:(?!<blockquote)[\s\S])*?)</blockquote>` 232 var deepest = `<blockquote\b[^>]*>((?:[\s\S])*?)</blockquote>` 233 234 re := regexp.MustCompile(deepest) 235 content = re.ReplaceAllStringFunc(content, func(str string) string { 236 return replaceBlockquotes(str) 237 }) 238 239 return content 240 } 241 242 func Remove(ct, tag string) string { 243 re := regexp.MustCompile("\\<" + tag + "[\\S\\s]+?\\</" + tag + "\\>") 244 return re.ReplaceAllString(ct, "") 245 } 246 247 func cleanUp(ct string) string { 248 // trim leading/trailing whitespace 249 str := regexp.MustCompile("^[\t\r\n]+|[\t\r\n]+$").ReplaceAllString(ct, "") 250 str = regexp.MustCompile(`\n\s+\n`).ReplaceAllString(str, "\n\n") 251 // limit consecutive linebreaks to 2 252 str = regexp.MustCompile(`\n{3,}`).ReplaceAllString(str, "\n\n") 253 254 //去除STYLE 255 str = Remove(str, "style") 256 257 //去除SCRIPT 258 str = Remove(str, "script") 259 260 //去除所有尖括号内的HTML代码,并换成换行符 261 re := regexp.MustCompile("\\<[\\S\\s]+?\\>") 262 str = re.ReplaceAllString(str, "\n") 263 264 //去除连续的换行符 265 //re = regexp.MustCompile("\\s{2,}") 266 //str = re.ReplaceAllString(str, "\n") 267 return str 268 } 269 270 func pre(content string) string { 271 // Pre code blocks 272 re := regexp.MustCompile(`<pre\b[^>]*>([\s\S]*)</pre>`) 273 content = re.ReplaceAllStringFunc(content, func(innerHTML string) string { 274 matches := re.FindStringSubmatch(innerHTML) 275 // convert tabs to spaces (you know it makes sense) 276 r := regexp.MustCompile(`/^\t+`) 277 innerHTML = r.ReplaceAllString(matches[1], " ") 278 r = regexp.MustCompile(`/\n`) 279 innerHTML = r.ReplaceAllString(innerHTML, "\n ") 280 return "\n\n " + innerHTML + "\n" 281 }) 282 return content 283 } 284 285 func ul(content string) string { 286 return ulol("ul", content) 287 } 288 289 func ol(content string) string { 290 return ulol("ol", content) 291 } 292 293 func ulol(tag, content string) string { 294 // Lists 295 296 // Escape numbers that could trigger an ol 297 // If there are more than three spaces before the code, it would be in a pre tag 298 // Make sure we are escaping the period not matching any character 299 300 //content = string.replace(^(\s{0,3}\d+)\. /g, "$1\\. "); 301 302 // Converts lists that have no child lists (of same type) first, then works it"s way up 303 //var noChildrenRegex = /<(ul|ol)\b[^>]*>(?:(?!<ul|<ol)[\s\S])*?<\/\1>/gi; 304 var noChildrenRegex = `<(` + tag + `)\b[^>]*>(?:[\s\S])*?</` + tag + `>` 305 re := regexp.MustCompile(noChildrenRegex) 306 return re.ReplaceAllStringFunc(content, func(str string) string { 307 return replaceLists(tag, str) 308 }) 309 } 310 311 func init() { 312 AddRule("p", P()) 313 AddRule("i", I()) 314 AddRule("h", H()) 315 AddRule("hr", Hr()) 316 AddRule("img", Img()) 317 AddRule("b", B()) 318 AddRule("br", Br()) 319 AddRule("code", Code()) 320 AddRule("a", A()) 321 322 AddConvert(pre) 323 AddConvert(ul) 324 AddConvert(ol) 325 AddConvert(blockQuote) 326 AddConvert(cleanUp) 327 } 328 329 func Convert(content string) string { 330 for _, rule := range rules { 331 for _, pattern := range rule.Patterns { 332 content = replaceEls(content, pattern, rule.Tp, rule.Replacement) 333 } 334 } 335 336 for _, convert := range converts { 337 content = convert(content) 338 } 339 340 return content 341 }