github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/shared/markdown/autolink.go (about)

     1  // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
     2  // See LICENSE.txt for license information.
     3  
     4  package markdown
     5  
     6  import (
     7  	"regexp"
     8  	"strings"
     9  	"unicode"
    10  	"unicode/utf8"
    11  )
    12  
    13  // Based off of extensions/autolink.c from https://github.com/github/cmark
    14  
var (
	// DefaultURLSchemes lists the URL schemes that isSchemeAllowed accepts. The comparison there is
	// case-insensitive, so "HTTP" matches "http".
	DefaultURLSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
	// wwwAutoLinkRegex matches text that begins with "www", followed by zero to three digits and a
	// literal period (for example "www." or "www2.").
	wwwAutoLinkRegex  = regexp.MustCompile(`^www\d{0,3}\.`)
)
    19  
    20  // Given a string with a w at the given position, tries to parse and return a range containing a www link.
    21  // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
    22  // www_match from the reference code.
    23  func parseWWWAutolink(data string, position int) (Range, bool) {
    24  	// Check that this isn't part of another word
    25  	if position > 1 {
    26  		prevChar := data[position-1]
    27  
    28  		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
    29  			return Range{}, false
    30  		}
    31  	}
    32  
    33  	// Check that this starts with www
    34  	if len(data)-position < 4 || !wwwAutoLinkRegex.MatchString(data[position:]) {
    35  		return Range{}, false
    36  	}
    37  
    38  	end := checkDomain(data[position:], false)
    39  	if end == 0 {
    40  		return Range{}, false
    41  	}
    42  
    43  	end += position
    44  
    45  	// Grab all text until the end of the string or the next whitespace character
    46  	for end < len(data) && !isWhitespaceByte(data[end]) {
    47  		end += 1
    48  	}
    49  
    50  	// Trim trailing punctuation
    51  	end = trimTrailingCharactersFromLink(data, position, end)
    52  	if position == end {
    53  		return Range{}, false
    54  	}
    55  
    56  	return Range{position, end}, true
    57  }
    58  
    59  func isAllowedBeforeWWWLink(c byte) bool {
    60  	switch c {
    61  	case '*', '_', '~', ')':
    62  		return true
    63  	}
    64  	return false
    65  }
    66  
    67  // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme
    68  // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
    69  // url_match from the reference code.
    70  func parseURLAutolink(data string, position int) (Range, bool) {
    71  	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
    72  	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
    73  		return Range{}, false
    74  	}
    75  
    76  	start := position - 1
    77  	for start > 0 && isAlphanumericByte(data[start-1]) {
    78  		start -= 1
    79  	}
    80  
    81  	if start < 0 || position >= len(data) {
    82  		return Range{}, false
    83  	}
    84  
    85  	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
    86  	scheme := data[start:position]
    87  	if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
    88  		return Range{}, false
    89  	}
    90  
    91  	end := checkDomain(data[position+3:], true)
    92  	if end == 0 {
    93  		return Range{}, false
    94  	}
    95  
    96  	end += position
    97  
    98  	// Grab all text until the end of the string or the next whitespace character
    99  	for end < len(data) && !isWhitespaceByte(data[end]) {
   100  		end += 1
   101  	}
   102  
   103  	// Trim trailing punctuation
   104  	end = trimTrailingCharactersFromLink(data, start, end)
   105  	if start == end {
   106  		return Range{}, false
   107  	}
   108  
   109  	return Range{start, end}, true
   110  }
   111  
   112  func isSchemeAllowed(scheme string) bool {
   113  	// Note that this doesn't support the custom URL schemes implemented by the client
   114  	for _, allowed := range DefaultURLSchemes {
   115  		if strings.EqualFold(allowed, scheme) {
   116  			return true
   117  		}
   118  	}
   119  
   120  	return false
   121  }
   122  
   123  // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
   124  // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
   125  // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
   126  func checkDomain(data string, allowShort bool) int {
   127  	foundUnderscore := false
   128  	foundPeriod := false
   129  
   130  	i := 1
   131  	for ; i < len(data)-1; i++ {
   132  		if data[i] == '_' {
   133  			foundUnderscore = true
   134  			break
   135  		} else if data[i] == '.' {
   136  			foundPeriod = true
   137  		} else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
   138  			break
   139  		}
   140  	}
   141  
   142  	if foundUnderscore {
   143  		return 0
   144  	}
   145  
   146  	if allowShort {
   147  		// If allowShort is set, accept any string of valid domain characters
   148  		return i
   149  	}
   150  
   151  	// If allowShort isn't set, a valid domain just requires at least a single period. Note that this
   152  	// logic isn't entirely necessary because we already know the string starts with "www." when
   153  	// this is called from parseWWWAutolink
   154  	if foundPeriod {
   155  		return i
   156  	}
   157  	return 0
   158  }
   159  
   160  // Returns true if the provided link starts with a valid character for a domain name. Equivalent to
   161  // is_valid_hostchar from the reference code.
   162  func isValidHostCharacter(link string) bool {
   163  	c, _ := utf8.DecodeRuneInString(link)
   164  	if c == utf8.RuneError {
   165  		return false
   166  	}
   167  
   168  	return !unicode.IsSpace(c) && !unicode.IsPunct(c)
   169  }
   170  
   171  // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
   172  // Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
   173  func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
   174  	runes := []rune(markdown[start:end])
   175  	linkEnd := len(runes)
   176  
   177  	// Cut off the link before an open angle bracket if it contains one
   178  	for i, c := range runes {
   179  		if c == '<' {
   180  			linkEnd = i
   181  			break
   182  		}
   183  	}
   184  
   185  	for linkEnd > 0 {
   186  		c := runes[linkEnd-1]
   187  
   188  		if !canEndAutolink(c) {
   189  			// Trim trailing quotes, periods, etc
   190  			linkEnd = linkEnd - 1
   191  		} else if c == ';' {
   192  			// Trim a trailing HTML entity
   193  			newEnd := linkEnd - 2
   194  
   195  			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
   196  				newEnd -= 1
   197  			}
   198  
   199  			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
   200  				linkEnd = newEnd
   201  			} else {
   202  				// This isn't actually an HTML entity, so just trim the semicolon
   203  				linkEnd = linkEnd - 1
   204  			}
   205  		} else if c == ')' {
   206  			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
   207  			// If there are more closing brackets than opening ones, remove the extra bracket
   208  
   209  			numClosing := 0
   210  			numOpening := 0
   211  
   212  			// Examples (input text => output linked portion):
   213  			//
   214  			//  http://www.pokemon.com/Pikachu_(Electric)
   215  			//    => http://www.pokemon.com/Pikachu_(Electric)
   216  			//
   217  			//  http://www.pokemon.com/Pikachu_((Electric)
   218  			//    => http://www.pokemon.com/Pikachu_((Electric)
   219  			//
   220  			//  http://www.pokemon.com/Pikachu_(Electric))
   221  			//    => http://www.pokemon.com/Pikachu_(Electric)
   222  			//
   223  			//  http://www.pokemon.com/Pikachu_((Electric))
   224  			//    => http://www.pokemon.com/Pikachu_((Electric))
   225  
   226  			for i := 0; i < linkEnd; i++ {
   227  				if runes[i] == '(' {
   228  					numOpening += 1
   229  				} else if runes[i] == ')' {
   230  					numClosing += 1
   231  				}
   232  			}
   233  
   234  			if numClosing <= numOpening {
   235  				// There's fewer or equal closing brackets, so we've found the end of the link
   236  				break
   237  			}
   238  
   239  			linkEnd -= 1
   240  		} else {
   241  			// There's no special characters at the end of the link, so we're at the end
   242  			break
   243  		}
   244  	}
   245  
   246  	return start + len(string(runes[:linkEnd]))
   247  }
   248  
   249  func canEndAutolink(c rune) bool {
   250  	switch c {
   251  	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
   252  		return false
   253  	}
   254  	return true
   255  }