github.com/levb/mattermost-server@v5.3.1+incompatible/utils/markdown/autolink.go (about) 1 // Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. 2 // See License.txt for license information. 3 4 package markdown 5 6 import ( 7 "regexp" 8 "strings" 9 "unicode" 10 "unicode/utf8" 11 ) 12 13 // Based off of extensions/autolink.c from https://github.com/github/cmark 14 15 var ( 16 DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"} 17 ) 18 19 // Given a string with a w at the given position, tries to parse and return a range containing a www link. 20 // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to 21 // www_match from the reference code. 22 func parseWWWAutolink(data string, position int) (Range, bool) { 23 // Check that this isn't part of another word 24 if position > 1 { 25 prevChar := data[position-1] 26 27 if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { 28 return Range{}, false 29 } 30 } 31 32 // Check that this starts with www 33 if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { 34 return Range{}, false 35 } 36 37 end := checkDomain(data[position:], false) 38 if end == 0 { 39 return Range{}, false 40 } 41 42 end += position 43 44 // Grab all text until the end of the string or the next whitespace character 45 for end < len(data) && !isWhitespaceByte(data[end]) { 46 end += 1 47 } 48 49 // Trim trailing punctuation 50 end = trimTrailingCharactersFromLink(data, position, end) 51 if position == end { 52 return Range{}, false 53 } 54 55 return Range{position, end}, true 56 } 57 58 func isAllowedBeforeWWWLink(c byte) bool { 59 switch c { 60 case '*', '_', '~', ')': 61 return true 62 default: 63 return false 64 } 65 } 66 67 // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme 68 // if one exists. 
If the text around the given position isn't a link, returns an empty string. Equivalent to 69 // url_match from the reference code. 70 func parseURLAutolink(data string, position int) (Range, bool) { 71 // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. 72 if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { 73 return Range{}, false 74 } 75 76 start := position - 1 77 for start > 0 && isAlphanumericByte(data[start-1]) { 78 start -= 1 79 } 80 81 // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. 82 scheme := data[start:position] 83 if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { 84 return Range{}, false 85 } 86 87 end := checkDomain(data[position+3:], true) 88 if end == 0 { 89 return Range{}, false 90 } 91 92 end += position 93 94 // Grab all text until the end of the string or the next whitespace character 95 for end < len(data) && !isWhitespaceByte(data[end]) { 96 end += 1 97 } 98 99 // Trim trailing punctuation 100 end = trimTrailingCharactersFromLink(data, start, end) 101 if start == end { 102 return Range{}, false 103 } 104 105 return Range{start, end}, true 106 } 107 108 func isSchemeAllowed(scheme string) bool { 109 // Note that this doesn't support the custom URL schemes implemented by the client 110 for _, allowed := range DefaultUrlSchemes { 111 if strings.EqualFold(allowed, scheme) { 112 return true 113 } 114 } 115 116 return false 117 } 118 119 // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. 120 // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain 121 // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. 
func checkDomain(data string, allowShort bool) int {
	sawPeriod := false

	// The scan starts at the second byte and never examines the final byte of data.
	idx := 1
	for ; idx < len(data)-1; idx++ {
		b := data[idx]

		if b == '_' {
			// Any underscore seen during the scan disqualifies the whole domain
			return 0
		}

		if b == '.' {
			sawPeriod = true
			continue
		}

		// Hyphens are accepted even though they count as punctuation; anything else that isn't a valid
		// host character ends the domain.
		if b != '-' && !isValidHostCharacter(data[idx:]) {
			break
		}
	}

	// With allowShort set, any run of valid domain characters is accepted. Otherwise at least one period
	// is required. Note that the period check isn't entirely necessary because the string is already known
	// to start with "www." when this is called from parseWWWAutolink.
	if allowShort || sawPeriod {
		return idx
	}

	return 0
}

// Returns true if the provided link starts with a valid character for a domain name: one that decodes
// successfully and is neither whitespace nor punctuation. Equivalent to is_valid_hostchar from the
// reference code.
func isValidHostCharacter(link string) bool {
	first, _ := utf8.DecodeRuneInString(link)

	switch {
	case first == utf8.RuneError, unicode.IsSpace(first), unicode.IsPunct(first):
		return false
	default:
		return true
	}
}

// Removes any trailing characters such as punctuation or unmatched closing brackets that shouldn't be part
// of the link. Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
	// Every character this function looks for is ASCII, and ASCII bytes never occur inside a multi-byte
	// UTF-8 sequence, so the link is examined byte by byte. This keeps the returned offset a valid byte
	// position within markdown[start:end] even when the input contains invalid UTF-8.
	link := markdown[start:end]
	linkEnd := len(link)

	// Cut off the link before an open angle bracket if it contains one, counting the brackets in the
	// remaining text along the way.
	numOpening := 0
	numClosing := 0
	for i := 0; i < len(link); i++ {
		c := link[i]

		if c == '<' {
			linkEnd = i
			break
		} else if c == '(' {
			numOpening += 1
		} else if c == ')' {
			numClosing += 1
		}
	}

trim:
	for linkEnd > 0 {
		c := link[linkEnd-1]

		switch {
		case !canEndAutolink(rune(c)):
			// Trim trailing quotes, periods, etc
			linkEnd -= 1
		case c == ';':
			// Trim a trailing HTML entity: a run of ASCII letters preceded by an ampersand
			newEnd := linkEnd - 2

			for newEnd > 0 && ((link[newEnd] >= 'a' && link[newEnd] <= 'z') || (link[newEnd] >= 'A' && link[newEnd] <= 'Z')) {
				newEnd -= 1
			}

			if newEnd < linkEnd-2 && link[newEnd] == '&' {
				linkEnd = newEnd
			} else {
				// This isn't actually an HTML entity, so just trim the semicolon
				linkEnd -= 1
			}
		case c == ')':
			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of
			// brackets. If there are more closing brackets than opening ones, remove the extra bracket.
			//
			// Examples (input text => output linked portion):
			//
			//  http://www.pokemon.com/Pikachu_(Electric)
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric)
			//    => http://www.pokemon.com/Pikachu_((Electric)
			//
			//  http://www.pokemon.com/Pikachu_(Electric))
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric))
			//    => http://www.pokemon.com/Pikachu_((Electric))
			if numClosing <= numOpening {
				// There's fewer or equal closing brackets, so we've found the end of the link
				break trim
			}

			// The other trimming cases never remove a bracket, so adjusting the running count here keeps
			// it equal to the number of closing brackets left in the link without rescanning it.
			numClosing -= 1
			linkEnd -= 1
		default:
			// There's no special characters at the end of the link, so we're at the end
			break trim
		}
	}

	return start + linkEnd
}

// canEndAutolink reports whether c may be the final character of an autolink. The listed punctuation
// characters are the ones that get trimmed from the end of a link.
func canEndAutolink(c rune) bool {
	switch c {
	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
		return false
	default:
		return true
	}
}