github.com/vnforks/kid@v5.11.1+incompatible/utils/markdown/autolink.go (about)

     1  // Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
     2  // See License.txt for license information.
     3  
     4  package markdown
     5  
     6  import (
     7  	"regexp"
     8  	"strings"
     9  	"unicode"
    10  	"unicode/utf8"
    11  )
    12  
    13  // Based off of extensions/autolink.c from https://github.com/github/cmark
    14  
    15  var (
    16  	DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
    17  )
    18  
    19  // Given a string with a w at the given position, tries to parse and return a range containing a www link.
    20  // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
    21  // www_match from the reference code.
    22  func parseWWWAutolink(data string, position int) (Range, bool) {
    23  	// Check that this isn't part of another word
    24  	if position > 1 {
    25  		prevChar := data[position-1]
    26  
    27  		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
    28  			return Range{}, false
    29  		}
    30  	}
    31  
    32  	// Check that this starts with www
    33  	if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
    34  		return Range{}, false
    35  	}
    36  
    37  	end := checkDomain(data[position:], false)
    38  	if end == 0 {
    39  		return Range{}, false
    40  	}
    41  
    42  	end += position
    43  
    44  	// Grab all text until the end of the string or the next whitespace character
    45  	for end < len(data) && !isWhitespaceByte(data[end]) {
    46  		end += 1
    47  	}
    48  
    49  	// Trim trailing punctuation
    50  	end = trimTrailingCharactersFromLink(data, position, end)
    51  	if position == end {
    52  		return Range{}, false
    53  	}
    54  
    55  	return Range{position, end}, true
    56  }
    57  
    58  func isAllowedBeforeWWWLink(c byte) bool {
    59  	switch c {
    60  	case '*', '_', '~', ')':
    61  		return true
    62  	default:
    63  		return false
    64  	}
    65  }
    66  
    67  // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme
    68  // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
    69  // url_match from the reference code.
    70  func parseURLAutolink(data string, position int) (Range, bool) {
    71  	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
    72  	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
    73  		return Range{}, false
    74  	}
    75  
    76  	start := position - 1
    77  	for start > 0 && isAlphanumericByte(data[start-1]) {
    78  		start -= 1
    79  	}
    80  
    81  	if start < 0 || position >= len(data) {
    82  		return Range{}, false
    83  	}
    84  
    85  	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
    86  	scheme := data[start:position]
    87  	if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
    88  		return Range{}, false
    89  	}
    90  
    91  	end := checkDomain(data[position+3:], true)
    92  	if end == 0 {
    93  		return Range{}, false
    94  	}
    95  
    96  	end += position
    97  
    98  	// Grab all text until the end of the string or the next whitespace character
    99  	for end < len(data) && !isWhitespaceByte(data[end]) {
   100  		end += 1
   101  	}
   102  
   103  	// Trim trailing punctuation
   104  	end = trimTrailingCharactersFromLink(data, start, end)
   105  	if start == end {
   106  		return Range{}, false
   107  	}
   108  
   109  	return Range{start, end}, true
   110  }
   111  
   112  func isSchemeAllowed(scheme string) bool {
   113  	// Note that this doesn't support the custom URL schemes implemented by the client
   114  	for _, allowed := range DefaultUrlSchemes {
   115  		if strings.EqualFold(allowed, scheme) {
   116  			return true
   117  		}
   118  	}
   119  
   120  	return false
   121  }
   122  
   123  // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
   124  // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
   125  // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
   126  func checkDomain(data string, allowShort bool) int {
   127  	foundUnderscore := false
   128  	foundPeriod := false
   129  
   130  	i := 1
   131  	for ; i < len(data)-1; i++ {
   132  		if data[i] == '_' {
   133  			foundUnderscore = true
   134  			break
   135  		} else if data[i] == '.' {
   136  			foundPeriod = true
   137  		} else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
   138  			break
   139  		}
   140  	}
   141  
   142  	if foundUnderscore {
   143  		return 0
   144  	}
   145  
   146  	if allowShort {
   147  		// If allowShort is set, accept any string of valid domain characters
   148  		return i
   149  	}
   150  
   151  	// If allowShort isn't set, a valid domain just requires at least a single period. Note that this
   152  	// logic isn't entirely necessary because we already know the string starts with "www." when
   153  	// this is called from parseWWWAutolink
   154  	if foundPeriod {
   155  		return i
   156  	} else {
   157  		return 0
   158  	}
   159  }
   160  
   161  // Returns true if the provided link starts with a valid character for a domain name. Equivalent to
   162  // is_valid_hostchar from the reference code.
   163  func isValidHostCharacter(link string) bool {
   164  	c, _ := utf8.DecodeRuneInString(link)
   165  	if c == utf8.RuneError {
   166  		return false
   167  	}
   168  
   169  	return !unicode.IsSpace(c) && !unicode.IsPunct(c)
   170  }
   171  
   172  // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
   173  // Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
   174  func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
   175  	runes := []rune(markdown[start:end])
   176  	linkEnd := len(runes)
   177  
   178  	// Cut off the link before an open angle bracket if it contains one
   179  	for i, c := range runes {
   180  		if c == '<' {
   181  			linkEnd = i
   182  			break
   183  		}
   184  	}
   185  
   186  	for linkEnd > 0 {
   187  		c := runes[linkEnd-1]
   188  
   189  		if !canEndAutolink(c) {
   190  			// Trim trailing quotes, periods, etc
   191  			linkEnd = linkEnd - 1
   192  		} else if c == ';' {
   193  			// Trim a trailing HTML entity
   194  			newEnd := linkEnd - 2
   195  
   196  			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
   197  				newEnd -= 1
   198  			}
   199  
   200  			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
   201  				linkEnd = newEnd
   202  			} else {
   203  				// This isn't actually an HTML entity, so just trim the semicolon
   204  				linkEnd = linkEnd - 1
   205  			}
   206  		} else if c == ')' {
   207  			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
   208  			// If there are more closing brackets than opening ones, remove the extra bracket
   209  
   210  			numClosing := 0
   211  			numOpening := 0
   212  
   213  			// Examples (input text => output linked portion):
   214  			//
   215  			//  http://www.pokemon.com/Pikachu_(Electric)
   216  			//    => http://www.pokemon.com/Pikachu_(Electric)
   217  			//
   218  			//  http://www.pokemon.com/Pikachu_((Electric)
   219  			//    => http://www.pokemon.com/Pikachu_((Electric)
   220  			//
   221  			//  http://www.pokemon.com/Pikachu_(Electric))
   222  			//    => http://www.pokemon.com/Pikachu_(Electric)
   223  			//
   224  			//  http://www.pokemon.com/Pikachu_((Electric))
   225  			//    => http://www.pokemon.com/Pikachu_((Electric))
   226  
   227  			for i := 0; i < linkEnd; i++ {
   228  				if runes[i] == '(' {
   229  					numOpening += 1
   230  				} else if runes[i] == ')' {
   231  					numClosing += 1
   232  				}
   233  			}
   234  
   235  			if numClosing <= numOpening {
   236  				// There's fewer or equal closing brackets, so we've found the end of the link
   237  				break
   238  			}
   239  
   240  			linkEnd -= 1
   241  		} else {
   242  			// There's no special characters at the end of the link, so we're at the end
   243  			break
   244  		}
   245  	}
   246  
   247  	return start + len(string(runes[:linkEnd]))
   248  }
   249  
   250  func canEndAutolink(c rune) bool {
   251  	switch c {
   252  	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
   253  		return false
   254  	default:
   255  		return true
   256  	}
   257  }