github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/markup/mdstripper/mdstripper.go (about) 1 // Copyright 2023 The GitBundle Inc. All rights reserved. 2 // Copyright 2017 The Gitea Authors. All rights reserved. 3 // Use of this source code is governed by a MIT-style 4 // license that can be found in the LICENSE file. 5 6 package mdstripper 7 8 import ( 9 "bytes" 10 "io" 11 "net/url" 12 "strings" 13 "sync" 14 15 "github.com/gitbundle/modules/log" 16 "github.com/gitbundle/modules/markup/common" 17 "github.com/gitbundle/modules/setting" 18 19 "github.com/yuin/goldmark" 20 "github.com/yuin/goldmark/ast" 21 "github.com/yuin/goldmark/extension" 22 "github.com/yuin/goldmark/parser" 23 "github.com/yuin/goldmark/renderer" 24 "github.com/yuin/goldmark/renderer/html" 25 "github.com/yuin/goldmark/text" 26 ) 27 28 var ( 29 giteaHostInit sync.Once 30 giteaHost *url.URL 31 ) 32 33 type stripRenderer struct { 34 localhost *url.URL 35 links []string 36 empty bool 37 } 38 39 func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error { 40 return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) { 41 if !entering { 42 return ast.WalkContinue, nil 43 } 44 switch v := n.(type) { 45 case *ast.Text: 46 if !v.IsRaw() { 47 _, prevSibIsText := n.PreviousSibling().(*ast.Text) 48 coalesce := prevSibIsText 49 r.processString( 50 w, 51 v.Text(source), 52 coalesce) 53 if v.SoftLineBreak() { 54 r.doubleSpace(w) 55 } 56 } 57 return ast.WalkContinue, nil 58 case *ast.Link: 59 r.processLink(w, v.Destination) 60 return ast.WalkSkipChildren, nil 61 case *ast.AutoLink: 62 // This could be a reference to an issue or pull - if so convert it 63 r.processAutoLink(w, v.URL(source)) 64 return ast.WalkSkipChildren, nil 65 } 66 return ast.WalkContinue, nil 67 }) 68 } 69 70 func (r *stripRenderer) doubleSpace(w io.Writer) { 71 if !r.empty { 72 _, _ = w.Write([]byte{'\n'}) 73 } 74 } 75 76 func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) { 77 // Always break-up words 78 if !coalesce { 79 r.doubleSpace(w) 80 } 81 _, _ = w.Write(text) 82 r.empty = false 83 } 84 85 // ProcessAutoLinks to detect and handle links to issues and pulls 86 func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) { 87 linkStr := string(link) 88 u, err := url.Parse(linkStr) 89 if err != nil { 90 // Process out of band 91 r.links = append(r.links, linkStr) 92 return 93 } 94 95 // Note: we're not attempting to match the URL scheme (http/https) 96 host := strings.ToLower(u.Host) 97 if host != "" && host != strings.ToLower(r.localhost.Host) { 98 // Process out of band 99 r.links = append(r.links, linkStr) 100 return 101 } 102 103 // We want: /user/repo/issues/3 104 parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/") 105 if len(parts) != 5 || parts[0] != "" { 106 // Process out of band 107 r.links = append(r.links, linkStr) 108 return 109 } 110 111 var sep string 112 if parts[3] == "issues" { 113 sep = "#" 114 } else if parts[3] == "pulls" { 115 sep = "!" 116 } else { 117 // Process out of band 118 r.links = append(r.links, linkStr) 119 return 120 } 121 122 _, _ = w.Write([]byte(parts[1])) 123 _, _ = w.Write([]byte("/")) 124 _, _ = w.Write([]byte(parts[2])) 125 _, _ = w.Write([]byte(sep)) 126 _, _ = w.Write([]byte(parts[4])) 127 } 128 129 func (r *stripRenderer) processLink(w io.Writer, link []byte) { 130 // Links are processed out of band 131 r.links = append(r.links, string(link)) 132 } 133 134 // GetLinks returns the list of link data collected while parsing 135 func (r *stripRenderer) GetLinks() []string { 136 return r.links 137 } 138 139 // AddOptions adds given option to this renderer. 140 func (r *stripRenderer) AddOptions(...renderer.Option) { 141 // no-op 142 } 143 144 // StripMarkdown parses markdown content by removing all markup and code blocks 145 // 146 // in order to extract links and other references 147 func StripMarkdown(rawBytes []byte) (string, []string) { 148 buf, links := StripMarkdownBytes(rawBytes) 149 return string(buf), links 150 } 151 152 var ( 153 stripParser parser.Parser 154 once = sync.Once{} 155 ) 156 157 // StripMarkdownBytes parses markdown content by removing all markup and code blocks 158 // 159 // in order to extract links and other references 160 func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) { 161 once.Do(func() { 162 gdMarkdown := goldmark.New( 163 goldmark.WithExtensions(extension.Table, 164 extension.Strikethrough, 165 extension.TaskList, 166 extension.DefinitionList, 167 common.FootnoteExtension, 168 common.Linkify, 169 ), 170 goldmark.WithParserOptions( 171 parser.WithAttribute(), 172 parser.WithAutoHeadingID(), 173 ), 174 goldmark.WithRendererOptions( 175 html.WithUnsafe(), 176 ), 177 ) 178 stripParser = gdMarkdown.Parser() 179 }) 180 stripper := &stripRenderer{ 181 localhost: getGiteaHost(), 182 links: make([]string, 0, 10), 183 empty: true, 184 } 185 reader := text.NewReader(rawBytes) 186 doc := stripParser.Parse(reader) 187 var buf bytes.Buffer 188 if err := stripper.Render(&buf, rawBytes, doc); err != nil { 189 log.Error("Unable to strip: %v", err) 190 } 191 return buf.Bytes(), stripper.GetLinks() 192 } 193 194 // getGiteaHostName returns a normalized string with the local host name, with no scheme or port information 195 func getGiteaHost() *url.URL { 196 giteaHostInit.Do(func() { 197 var err error 198 if giteaHost, err = url.Parse(setting.AppURL); err != nil { 199 giteaHost = &url.URL{} 200 } 201 }) 202 return giteaHost 203 }