github.com/fretkak/mattermost-mattermost-server@v5.11.1+incompatible/utils/markdown/autolink.go (about) 1 // Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. 2 // See License.txt for license information. 3 4 package markdown 5 6 import ( 7 "regexp" 8 "strings" 9 "unicode" 10 "unicode/utf8" 11 ) 12 13 // Based off of extensions/autolink.c from https://github.com/github/cmark 14 15 var ( 16 DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"} 17 ) 18 19 // Given a string with a w at the given position, tries to parse and return a range containing a www link. 20 // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to 21 // www_match from the reference code. 22 func parseWWWAutolink(data string, position int) (Range, bool) { 23 // Check that this isn't part of another word 24 if position > 1 { 25 prevChar := data[position-1] 26 27 if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { 28 return Range{}, false 29 } 30 } 31 32 // Check that this starts with www 33 if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { 34 return Range{}, false 35 } 36 37 end := checkDomain(data[position:], false) 38 if end == 0 { 39 return Range{}, false 40 } 41 42 end += position 43 44 // Grab all text until the end of the string or the next whitespace character 45 for end < len(data) && !isWhitespaceByte(data[end]) { 46 end += 1 47 } 48 49 // Trim trailing punctuation 50 end = trimTrailingCharactersFromLink(data, position, end) 51 if position == end { 52 return Range{}, false 53 } 54 55 return Range{position, end}, true 56 } 57 58 func isAllowedBeforeWWWLink(c byte) bool { 59 switch c { 60 case '*', '_', '~', ')': 61 return true 62 default: 63 return false 64 } 65 } 66 67 // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme 68 // if one exists. 
If the text around the given position isn't a link, returns an empty string. Equivalent to 69 // url_match from the reference code. 70 func parseURLAutolink(data string, position int) (Range, bool) { 71 // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. 72 if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { 73 return Range{}, false 74 } 75 76 start := position - 1 77 for start > 0 && isAlphanumericByte(data[start-1]) { 78 start -= 1 79 } 80 81 if start < 0 || position >= len(data) { 82 return Range{}, false 83 } 84 85 // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. 86 scheme := data[start:position] 87 if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { 88 return Range{}, false 89 } 90 91 end := checkDomain(data[position+3:], true) 92 if end == 0 { 93 return Range{}, false 94 } 95 96 end += position 97 98 // Grab all text until the end of the string or the next whitespace character 99 for end < len(data) && !isWhitespaceByte(data[end]) { 100 end += 1 101 } 102 103 // Trim trailing punctuation 104 end = trimTrailingCharactersFromLink(data, start, end) 105 if start == end { 106 return Range{}, false 107 } 108 109 return Range{start, end}, true 110 } 111 112 func isSchemeAllowed(scheme string) bool { 113 // Note that this doesn't support the custom URL schemes implemented by the client 114 for _, allowed := range DefaultUrlSchemes { 115 if strings.EqualFold(allowed, scheme) { 116 return true 117 } 118 } 119 120 return false 121 } 122 123 // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. 124 // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain 125 // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. 
func checkDomain(data string, allowShort bool) int {
	// An empty string can't start with a domain name. Without this guard the scan below would report a
	// length of 1, which is more bytes than data holds.
	if len(data) == 0 {
		return 0
	}

	foundPeriod := false

	// The scan starts at index 1 (the callers in this file check the first character themselves) and
	// stops before the final byte, so the last byte of data is never inspected.
	i := 1
	for ; i < len(data)-1; i++ {
		c := data[i]

		if c == '_' {
			// Any underscore found by the scan means this is not treated as a domain.
			return 0
		}

		if c == '.' {
			foundPeriod = true
			continue
		}

		// Hyphens are allowed even though isValidHostCharacter rejects punctuation.
		if c != '-' && !isValidHostCharacter(data[i:]) {
			break
		}
	}

	// If allowShort is set, accept any string of valid domain characters. Otherwise a valid domain
	// requires at least a single period. Note that the period check isn't entirely necessary because
	// we already know the string starts with "www." when this is called from parseWWWAutolink.
	if allowShort || foundPeriod {
		return i
	}

	return 0
}

// Returns true if the provided link starts with a valid character for a domain name. Equivalent to
// is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
	// Decoding fails with utf8.RuneError for an empty string or invalid UTF-8; neither is a valid
	// host character.
	r, _ := utf8.DecodeRuneInString(link)

	return r != utf8.RuneError && !unicode.IsSpace(r) && !unicode.IsPunct(r)
}

// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
// Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
	// Work on the bytes of markdown[start:end] and keep linkEnd as a byte offset into it. Every
	// character examined below is ASCII, and ASCII bytes never occur inside a multi-byte UTF-8
	// sequence, so a byte scan picks out exactly the same characters as a rune scan would.
	//
	// Converting to []rune and back is not safe here: each invalid byte becomes U+FFFD, which is three
	// bytes when re-encoded, so the computed end could land past the real end of the link.
	link := markdown[start:end]
	linkEnd := len(link)

	// Cut off the link before an open angle bracket if it contains one
	if i := strings.IndexByte(link, '<'); i >= 0 {
		linkEnd = i
	}

trim:
	for linkEnd > 0 {
		c := link[linkEnd-1]

		switch {
		case !canEndAutolink(rune(c)):
			// Trim trailing quotes, periods, etc. Bytes >= 0x80 never match canEndAutolink's
			// exclusion list, so non-ASCII text is left alone.
			linkEnd--

		case c == ';':
			// Trim a trailing HTML entity: scan back over the ASCII letters in front of the
			// semicolon and see whether they are introduced by an ampersand.
			entityStart := linkEnd - 2
			for entityStart > 0 &&
				((link[entityStart] >= 'a' && link[entityStart] <= 'z') ||
					(link[entityStart] >= 'A' && link[entityStart] <= 'Z')) {
				entityStart--
			}

			if entityStart < linkEnd-2 && link[entityStart] == '&' {
				linkEnd = entityStart
			} else {
				// This isn't actually an HTML entity, so just trim the semicolon
				linkEnd--
			}

		case c == ')':
			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair
			// of brackets. If there are more closing brackets than opening ones, remove the extra
			// bracket.
			//
			// Examples (input text => output linked portion):
			//
			//   http://www.pokemon.com/Pikachu_(Electric)
			//     => http://www.pokemon.com/Pikachu_(Electric)
			//
			//   http://www.pokemon.com/Pikachu_((Electric)
			//     => http://www.pokemon.com/Pikachu_((Electric)
			//
			//   http://www.pokemon.com/Pikachu_(Electric))
			//     => http://www.pokemon.com/Pikachu_(Electric)
			//
			//   http://www.pokemon.com/Pikachu_((Electric))
			//     => http://www.pokemon.com/Pikachu_((Electric))
			candidate := link[:linkEnd]
			if strings.Count(candidate, ")") <= strings.Count(candidate, "(") {
				// There's fewer or equal closing brackets, so we've found the end of the link
				break trim
			}

			linkEnd--

		default:
			// There's no special characters at the end of the link, so we're at the end
			break trim
		}
	}

	return start + linkEnd
}

// canEndAutolink reports whether c may be the last character of an autolink. Trailing punctuation and
// emphasis markers in the list below are trimmed from the end of a link instead.
func canEndAutolink(c rune) bool {
	switch c {
	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
		return false
	}

	return true
}