github.com/gitbundle/modules@v0.0.0-20231025071548-85b91c5c3b01/markup/mdstripper/mdstripper.go (about)

     1  // Copyright 2023 The GitBundle Inc. All rights reserved.
     2  // Copyright 2017 The Gitea Authors. All rights reserved.
     3  // Use of this source code is governed by a MIT-style
     4  // license that can be found in the LICENSE file.
     5  
     6  package mdstripper
     7  
     8  import (
     9  	"bytes"
    10  	"io"
    11  	"net/url"
    12  	"strings"
    13  	"sync"
    14  
    15  	"github.com/gitbundle/modules/log"
    16  	"github.com/gitbundle/modules/markup/common"
    17  	"github.com/gitbundle/modules/setting"
    18  
    19  	"github.com/yuin/goldmark"
    20  	"github.com/yuin/goldmark/ast"
    21  	"github.com/yuin/goldmark/extension"
    22  	"github.com/yuin/goldmark/parser"
    23  	"github.com/yuin/goldmark/renderer"
    24  	"github.com/yuin/goldmark/renderer/html"
    25  	"github.com/yuin/goldmark/text"
    26  )
    27  
    28  var (
    29  	giteaHostInit sync.Once
    30  	giteaHost     *url.URL
    31  )
    32  
    33  type stripRenderer struct {
    34  	localhost *url.URL
    35  	links     []string
    36  	empty     bool
    37  }
    38  
    39  func (r *stripRenderer) Render(w io.Writer, source []byte, doc ast.Node) error {
    40  	return ast.Walk(doc, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
    41  		if !entering {
    42  			return ast.WalkContinue, nil
    43  		}
    44  		switch v := n.(type) {
    45  		case *ast.Text:
    46  			if !v.IsRaw() {
    47  				_, prevSibIsText := n.PreviousSibling().(*ast.Text)
    48  				coalesce := prevSibIsText
    49  				r.processString(
    50  					w,
    51  					v.Text(source),
    52  					coalesce)
    53  				if v.SoftLineBreak() {
    54  					r.doubleSpace(w)
    55  				}
    56  			}
    57  			return ast.WalkContinue, nil
    58  		case *ast.Link:
    59  			r.processLink(w, v.Destination)
    60  			return ast.WalkSkipChildren, nil
    61  		case *ast.AutoLink:
    62  			// This could be a reference to an issue or pull - if so convert it
    63  			r.processAutoLink(w, v.URL(source))
    64  			return ast.WalkSkipChildren, nil
    65  		}
    66  		return ast.WalkContinue, nil
    67  	})
    68  }
    69  
    70  func (r *stripRenderer) doubleSpace(w io.Writer) {
    71  	if !r.empty {
    72  		_, _ = w.Write([]byte{'\n'})
    73  	}
    74  }
    75  
    76  func (r *stripRenderer) processString(w io.Writer, text []byte, coalesce bool) {
    77  	// Always break-up words
    78  	if !coalesce {
    79  		r.doubleSpace(w)
    80  	}
    81  	_, _ = w.Write(text)
    82  	r.empty = false
    83  }
    84  
    85  // ProcessAutoLinks to detect and handle links to issues and pulls
    86  func (r *stripRenderer) processAutoLink(w io.Writer, link []byte) {
    87  	linkStr := string(link)
    88  	u, err := url.Parse(linkStr)
    89  	if err != nil {
    90  		// Process out of band
    91  		r.links = append(r.links, linkStr)
    92  		return
    93  	}
    94  
    95  	// Note: we're not attempting to match the URL scheme (http/https)
    96  	host := strings.ToLower(u.Host)
    97  	if host != "" && host != strings.ToLower(r.localhost.Host) {
    98  		// Process out of band
    99  		r.links = append(r.links, linkStr)
   100  		return
   101  	}
   102  
   103  	// We want: /user/repo/issues/3
   104  	parts := strings.Split(strings.TrimPrefix(u.EscapedPath(), r.localhost.EscapedPath()), "/")
   105  	if len(parts) != 5 || parts[0] != "" {
   106  		// Process out of band
   107  		r.links = append(r.links, linkStr)
   108  		return
   109  	}
   110  
   111  	var sep string
   112  	if parts[3] == "issues" {
   113  		sep = "#"
   114  	} else if parts[3] == "pulls" {
   115  		sep = "!"
   116  	} else {
   117  		// Process out of band
   118  		r.links = append(r.links, linkStr)
   119  		return
   120  	}
   121  
   122  	_, _ = w.Write([]byte(parts[1]))
   123  	_, _ = w.Write([]byte("/"))
   124  	_, _ = w.Write([]byte(parts[2]))
   125  	_, _ = w.Write([]byte(sep))
   126  	_, _ = w.Write([]byte(parts[4]))
   127  }
   128  
   129  func (r *stripRenderer) processLink(w io.Writer, link []byte) {
   130  	// Links are processed out of band
   131  	r.links = append(r.links, string(link))
   132  }
   133  
   134  // GetLinks returns the list of link data collected while parsing
   135  func (r *stripRenderer) GetLinks() []string {
   136  	return r.links
   137  }
   138  
   139  // AddOptions adds given option to this renderer.
   140  func (r *stripRenderer) AddOptions(...renderer.Option) {
   141  	// no-op
   142  }
   143  
   144  // StripMarkdown parses markdown content by removing all markup and code blocks
   145  //
   146  //	in order to extract links and other references
   147  func StripMarkdown(rawBytes []byte) (string, []string) {
   148  	buf, links := StripMarkdownBytes(rawBytes)
   149  	return string(buf), links
   150  }
   151  
   152  var (
   153  	stripParser parser.Parser
   154  	once        = sync.Once{}
   155  )
   156  
   157  // StripMarkdownBytes parses markdown content by removing all markup and code blocks
   158  //
   159  //	in order to extract links and other references
   160  func StripMarkdownBytes(rawBytes []byte) ([]byte, []string) {
   161  	once.Do(func() {
   162  		gdMarkdown := goldmark.New(
   163  			goldmark.WithExtensions(extension.Table,
   164  				extension.Strikethrough,
   165  				extension.TaskList,
   166  				extension.DefinitionList,
   167  				common.FootnoteExtension,
   168  				common.Linkify,
   169  			),
   170  			goldmark.WithParserOptions(
   171  				parser.WithAttribute(),
   172  				parser.WithAutoHeadingID(),
   173  			),
   174  			goldmark.WithRendererOptions(
   175  				html.WithUnsafe(),
   176  			),
   177  		)
   178  		stripParser = gdMarkdown.Parser()
   179  	})
   180  	stripper := &stripRenderer{
   181  		localhost: getGiteaHost(),
   182  		links:     make([]string, 0, 10),
   183  		empty:     true,
   184  	}
   185  	reader := text.NewReader(rawBytes)
   186  	doc := stripParser.Parse(reader)
   187  	var buf bytes.Buffer
   188  	if err := stripper.Render(&buf, rawBytes, doc); err != nil {
   189  		log.Error("Unable to strip: %v", err)
   190  	}
   191  	return buf.Bytes(), stripper.GetLinks()
   192  }
   193  
   194  // getGiteaHostName returns a normalized string with the local host name, with no scheme or port information
   195  func getGiteaHost() *url.URL {
   196  	giteaHostInit.Do(func() {
   197  		var err error
   198  		if giteaHost, err = url.Parse(setting.AppURL); err != nil {
   199  			giteaHost = &url.URL{}
   200  		}
   201  	})
   202  	return giteaHost
   203  }