code.gitea.io/gitea@v1.19.3/modules/markup/mdstripper/mdstripper.go (about)

     1  // Copyright 2019 The Gitea Authors. All rights reserved.
     2  // SPDX-License-Identifier: MIT
     3  
     4  package mdstripper
     5  
     6  import (
     7  	"bytes"
     8  	"io"
     9  	"net/url"
    10  	"strings"
    11  	"sync"
    12  
    13  	"code.gitea.io/gitea/modules/log"
    14  	"code.gitea.io/gitea/modules/markup/common"
    15  	"code.gitea.io/gitea/modules/setting"
    16  
    17  	"github.com/yuin/goldmark"
    18  	"github.com/yuin/goldmark/ast"
    19  	"github.com/yuin/goldmark/extension"
    20  	"github.com/yuin/goldmark/parser"
    21  	"github.com/yuin/goldmark/renderer"
    22  	"github.com/yuin/goldmark/renderer/html"
    23  	"github.com/yuin/goldmark/text"
    24  )
    25  
    26  var (
    27  	giteaHostInit sync.Once
    28  	giteaHost     *url.URL
    29  )
    30  
    31  type stripRenderer struct {
    32  	localhost *url.URL
    33  	links     []string
    34  	empty     bool
    35  }
    36  
    37  func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error {
    38  	return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
    39  		if !entering {
    40  			return ast.WalkContinue, nil
    41  		}
    42  		switch v := n.(type) {
    43  		case *ast.Text:
    44  			if !v.IsRaw() {
    45  				_, prevSibIsText := n.PreviousSibling().(*ast.Text)
    46  				coalesce := prevSibIsText
    47  				r.processString(
    48  					w,
    49  					v.Text(source),
    50  					coalesce)
    51  				if v.SoftLineBreak() {
    52  					r.doubleSpace(w)
    53  				}
    54  			}
    55  			return ast.WalkContinue, nil
    56  		case *ast.Link:
    57  			r.processLink(w, v.Destination)
    58  			return ast.WalkSkipChildren, nil
    59  		case *ast.AutoLink:
    60  			// This could be a reference to an issue or pull - if so convert it
    61  			r.processAutoLink(w, v.URL(source))
    62  			return ast.WalkSkipChildren, nil
    63  		}
    64  		return ast.WalkContinue, nil
    65  	})
    66  }
    67  
    68  func (r *stripRenderer) doubleSpace(w io.Writer) {
    69  	if !r.empty {
    70  		_, _ = w.Write([]byte{'\n'})
    71  	}
    72  }
    73  
    74  func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) {
    75  	// Always break-up words
    76  	if !coalesce {
    77  		r.doubleSpace(w)
    78  	}
    79  	_, _ = w.Write(text)
    80  	r.empty = false
    81  }
    82  
    83  // ProcessAutoLinks to detect and handle links to issues and pulls
    84  func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) {
    85  	linkStr := string(link)
    86  	u, err := url.Parse(linkStr)
    87  	if err != nil {
    88  		// Process out of band
    89  		r.links = append(r.links, linkStr)
    90  		return
    91  	}
    92  
    93  	// Note: we're not attempting to match the URL scheme (http/https)
    94  	host := strings.ToLower(u.Host)
    95  	if host != "" && host != strings.ToLower(r.localhost.Host) {
    96  		// Process out of band
    97  		r.links = append(r.links, linkStr)
    98  		return
    99  	}
   100  
   101  	// We want: /user/repo/issues/3
   102  	parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/")
   103  	if len(parts) != 5 || parts[0] != "" {
   104  		// Process out of band
   105  		r.links = append(r.links, linkStr)
   106  		return
   107  	}
   108  
   109  	var sep string
   110  	if parts[3] == "issues" {
   111  		sep = "#"
   112  	} else if parts[3] == "pulls" {
   113  		sep = "!"
   114  	} else {
   115  		// Process out of band
   116  		r.links = append(r.links, linkStr)
   117  		return
   118  	}
   119  
   120  	_, _ = w.Write([]byte(parts[1]))
   121  	_, _ = w.Write([]byte("/"))
   122  	_, _ = w.Write([]byte(parts[2]))
   123  	_, _ = w.Write([]byte(sep))
   124  	_, _ = w.Write([]byte(parts[4]))
   125  }
   126  
   127  func (r *stripRenderer) processLink(w io.Writer, link []byte) {
   128  	// Links are processed out of band
   129  	r.links = append(r.links, string(link))
   130  }
   131  
   132  // GetLinks returns the list of link data collected while parsing
   133  func (r *stripRenderer) GetLinks() []string {
   134  	return r.links
   135  }
   136  
   137  // AddOptions adds given option to this renderer.
   138  func (r *stripRenderer) AddOptions(...renderer.Option) {
   139  	// no-op
   140  }
   141  
   142  // StripMarkdown parses markdown content by removing all markup and code blocks
   143  // in order to extract links and other references
   144  func StripMarkdown(rawBytes []byte) (string, []string) {
   145  	buf, links := StripMarkdownBytes(rawBytes)
   146  	return string(buf), links
   147  }
   148  
   149  var (
   150  	stripParser parser.Parser
   151  	once        = sync.Once{}
   152  )
   153  
   154  // StripMarkdownBytes parses markdown content by removing all markup and code blocks
   155  // in order to extract links and other references
   156  func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) {
   157  	once.Do(func() {
   158  		gdMarkdown := goldmark.New(
   159  			goldmark.WithExtensions(extension.Table,
   160  				extension.Strikethrough,
   161  				extension.TaskList,
   162  				extension.DefinitionList,
   163  				common.FootnoteExtension,
   164  				common.Linkify,
   165  			),
   166  			goldmark.WithParserOptions(
   167  				parser.WithAttribute(),
   168  				parser.WithAutoHeadingID(),
   169  			),
   170  			goldmark.WithRendererOptions(
   171  				html.WithUnsafe(),
   172  			),
   173  		)
   174  		stripParser = gdMarkdown.Parser()
   175  	})
   176  	stripper := &stripRenderer{
   177  		localhost: getGiteaHost(),
   178  		links:     make([]string, 0, 10),
   179  		empty:     true,
   180  	}
   181  	reader := text.NewReader(rawBytes)
   182  	doc := stripParser.Parse(reader)
   183  	var buf bytes.Buffer
   184  	if err := stripper.Render(&buf, rawBytes, doc); err != nil {
   185  		log.Error("Unable to strip: %v", err)
   186  	}
   187  	return buf.Bytes(), stripper.GetLinks()
   188  }
   189  
   190  // getGiteaHostName returns a normalized string with the local host name, with no scheme or port information
   191  func getGiteaHost() *url.URL {
   192  	giteaHostInit.Do(func() {
   193  		var err error
   194  		if giteaHost, err = url.Parse(setting.AppURL); err != nil {
   195  			giteaHost = &url.URL{}
   196  		}
   197  	})
   198  	return giteaHost
   199  }