github.com/levb/mattermost-server@v5.3.1+incompatible/utils/markdown/autolink.go (about)

     1  // Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
     2  // See License.txt for license information.
     3  
     4  package markdown
     5  
     6  import (
     7  	"regexp"
     8  	"strings"
     9  	"unicode"
    10  	"unicode/utf8"
    11  )
    12  
    13  // Based off of extensions/autolink.c from https://github.com/github/cmark
    14  
    15  var (
    16  	DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
    17  )
    18  
    19  // Given a string with a w at the given position, tries to parse and return a range containing a www link.
    20  // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
    21  // www_match from the reference code.
    22  func parseWWWAutolink(data string, position int) (Range, bool) {
    23  	// Check that this isn't part of another word
    24  	if position > 1 {
    25  		prevChar := data[position-1]
    26  
    27  		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
    28  			return Range{}, false
    29  		}
    30  	}
    31  
    32  	// Check that this starts with www
    33  	if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
    34  		return Range{}, false
    35  	}
    36  
    37  	end := checkDomain(data[position:], false)
    38  	if end == 0 {
    39  		return Range{}, false
    40  	}
    41  
    42  	end += position
    43  
    44  	// Grab all text until the end of the string or the next whitespace character
    45  	for end < len(data) && !isWhitespaceByte(data[end]) {
    46  		end += 1
    47  	}
    48  
    49  	// Trim trailing punctuation
    50  	end = trimTrailingCharactersFromLink(data, position, end)
    51  	if position == end {
    52  		return Range{}, false
    53  	}
    54  
    55  	return Range{position, end}, true
    56  }
    57  
    58  func isAllowedBeforeWWWLink(c byte) bool {
    59  	switch c {
    60  	case '*', '_', '~', ')':
    61  		return true
    62  	default:
    63  		return false
    64  	}
    65  }
    66  
    67  // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme
    68  // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
    69  // url_match from the reference code.
    70  func parseURLAutolink(data string, position int) (Range, bool) {
    71  	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
    72  	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
    73  		return Range{}, false
    74  	}
    75  
    76  	start := position - 1
    77  	for start > 0 && isAlphanumericByte(data[start-1]) {
    78  		start -= 1
    79  	}
    80  
    81  	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
    82  	scheme := data[start:position]
    83  	if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
    84  		return Range{}, false
    85  	}
    86  
    87  	end := checkDomain(data[position+3:], true)
    88  	if end == 0 {
    89  		return Range{}, false
    90  	}
    91  
    92  	end += position
    93  
    94  	// Grab all text until the end of the string or the next whitespace character
    95  	for end < len(data) && !isWhitespaceByte(data[end]) {
    96  		end += 1
    97  	}
    98  
    99  	// Trim trailing punctuation
   100  	end = trimTrailingCharactersFromLink(data, start, end)
   101  	if start == end {
   102  		return Range{}, false
   103  	}
   104  
   105  	return Range{start, end}, true
   106  }
   107  
   108  func isSchemeAllowed(scheme string) bool {
   109  	// Note that this doesn't support the custom URL schemes implemented by the client
   110  	for _, allowed := range DefaultUrlSchemes {
   111  		if strings.EqualFold(allowed, scheme) {
   112  			return true
   113  		}
   114  	}
   115  
   116  	return false
   117  }
   118  
   119  // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
   120  // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
   121  // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
   122  func checkDomain(data string, allowShort bool) int {
   123  	foundUnderscore := false
   124  	foundPeriod := false
   125  
   126  	i := 1
   127  	for ; i < len(data)-1; i++ {
   128  		if data[i] == '_' {
   129  			foundUnderscore = true
   130  			break
   131  		} else if data[i] == '.' {
   132  			foundPeriod = true
   133  		} else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
   134  			break
   135  		}
   136  	}
   137  
   138  	if foundUnderscore {
   139  		return 0
   140  	}
   141  
   142  	if allowShort {
   143  		// If allowShort is set, accept any string of valid domain characters
   144  		return i
   145  	}
   146  
   147  	// If allowShort isn't set, a valid domain just requires at least a single period. Note that this
   148  	// logic isn't entirely necessary because we already know the string starts with "www." when
   149  	// this is called from parseWWWAutolink
   150  	if foundPeriod {
   151  		return i
   152  	} else {
   153  		return 0
   154  	}
   155  }
   156  
   157  // Returns true if the provided link starts with a valid character for a domain name. Equivalent to
   158  // is_valid_hostchar from the reference code.
   159  func isValidHostCharacter(link string) bool {
   160  	c, _ := utf8.DecodeRuneInString(link)
   161  	if c == utf8.RuneError {
   162  		return false
   163  	}
   164  
   165  	return !unicode.IsSpace(c) && !unicode.IsPunct(c)
   166  }
   167  
   168  // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
   169  // Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
   170  func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
   171  	runes := []rune(markdown[start:end])
   172  	linkEnd := len(runes)
   173  
   174  	// Cut off the link before an open angle bracket if it contains one
   175  	for i, c := range runes {
   176  		if c == '<' {
   177  			linkEnd = i
   178  			break
   179  		}
   180  	}
   181  
   182  	for linkEnd > 0 {
   183  		c := runes[linkEnd-1]
   184  
   185  		if !canEndAutolink(c) {
   186  			// Trim trailing quotes, periods, etc
   187  			linkEnd = linkEnd - 1
   188  		} else if c == ';' {
   189  			// Trim a trailing HTML entity
   190  			newEnd := linkEnd - 2
   191  
   192  			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
   193  				newEnd -= 1
   194  			}
   195  
   196  			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
   197  				linkEnd = newEnd
   198  			} else {
   199  				// This isn't actually an HTML entity, so just trim the semicolon
   200  				linkEnd = linkEnd - 1
   201  			}
   202  		} else if c == ')' {
   203  			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
   204  			// If there are more closing brackets than opening ones, remove the extra bracket
   205  
   206  			numClosing := 0
   207  			numOpening := 0
   208  
   209  			// Examples (input text => output linked portion):
   210  			//
   211  			//  http://www.pokemon.com/Pikachu_(Electric)
   212  			//    => http://www.pokemon.com/Pikachu_(Electric)
   213  			//
   214  			//  http://www.pokemon.com/Pikachu_((Electric)
   215  			//    => http://www.pokemon.com/Pikachu_((Electric)
   216  			//
   217  			//  http://www.pokemon.com/Pikachu_(Electric))
   218  			//    => http://www.pokemon.com/Pikachu_(Electric)
   219  			//
   220  			//  http://www.pokemon.com/Pikachu_((Electric))
   221  			//    => http://www.pokemon.com/Pikachu_((Electric))
   222  
   223  			for i := 0; i < linkEnd; i++ {
   224  				if runes[i] == '(' {
   225  					numOpening += 1
   226  				} else if runes[i] == ')' {
   227  					numClosing += 1
   228  				}
   229  			}
   230  
   231  			if numClosing <= numOpening {
   232  				// There's fewer or equal closing brackets, so we've found the end of the link
   233  				break
   234  			}
   235  
   236  			linkEnd -= 1
   237  		} else {
   238  			// There's no special characters at the end of the link, so we're at the end
   239  			break
   240  		}
   241  	}
   242  
   243  	return start + len(string(runes[:linkEnd]))
   244  }
   245  
   246  func canEndAutolink(c rune) bool {
   247  	switch c {
   248  	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
   249  		return false
   250  	default:
   251  		return true
   252  	}
   253  }