github.com/masterhung0112/hk_server/v5@v5.0.0-20220302090640-ec71aef15e1c/shared/markdown/autolink.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package markdown 5 6 import ( 7 "regexp" 8 "strings" 9 "unicode" 10 "unicode/utf8" 11 ) 12 13 // Based off of extensions/autolink.c from https://github.com/github/cmark 14 15 var ( 16 DefaultURLSchemes = []string{"http", "https", "ftp", "mailto", "tel"} 17 wwwAutoLinkRegex = regexp.MustCompile(`^www\d{0,3}\.`) 18 ) 19 20 // Given a string with a w at the given position, tries to parse and return a range containing a www link. 21 // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to 22 // www_match from the reference code. 23 func parseWWWAutolink(data string, position int) (Range, bool) { 24 // Check that this isn't part of another word 25 if position > 1 { 26 prevChar := data[position-1] 27 28 if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { 29 return Range{}, false 30 } 31 } 32 33 // Check that this starts with www 34 if len(data)-position < 4 || !wwwAutoLinkRegex.MatchString(data[position:]) { 35 return Range{}, false 36 } 37 38 end := checkDomain(data[position:], false) 39 if end == 0 { 40 return Range{}, false 41 } 42 43 end += position 44 45 // Grab all text until the end of the string or the next whitespace character 46 for end < len(data) && !isWhitespaceByte(data[end]) { 47 end += 1 48 } 49 50 // Trim trailing punctuation 51 end = trimTrailingCharactersFromLink(data, position, end) 52 if position == end { 53 return Range{}, false 54 } 55 56 return Range{position, end}, true 57 } 58 59 func isAllowedBeforeWWWLink(c byte) bool { 60 switch c { 61 case '*', '_', '~', ')': 62 return true 63 } 64 return false 65 } 66 67 // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme 68 // if one exists. 
If the text around the given position isn't a link, returns an empty string. Equivalent to 69 // url_match from the reference code. 70 func parseURLAutolink(data string, position int) (Range, bool) { 71 // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. 72 if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { 73 return Range{}, false 74 } 75 76 start := position - 1 77 for start > 0 && isAlphanumericByte(data[start-1]) { 78 start -= 1 79 } 80 81 if start < 0 || position >= len(data) { 82 return Range{}, false 83 } 84 85 // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. 86 scheme := data[start:position] 87 if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { 88 return Range{}, false 89 } 90 91 end := checkDomain(data[position+3:], true) 92 if end == 0 { 93 return Range{}, false 94 } 95 96 end += position 97 98 // Grab all text until the end of the string or the next whitespace character 99 for end < len(data) && !isWhitespaceByte(data[end]) { 100 end += 1 101 } 102 103 // Trim trailing punctuation 104 end = trimTrailingCharactersFromLink(data, start, end) 105 if start == end { 106 return Range{}, false 107 } 108 109 return Range{start, end}, true 110 } 111 112 func isSchemeAllowed(scheme string) bool { 113 // Note that this doesn't support the custom URL schemes implemented by the client 114 for _, allowed := range DefaultURLSchemes { 115 if strings.EqualFold(allowed, scheme) { 116 return true 117 } 118 } 119 120 return false 121 } 122 123 // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. 124 // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain 125 // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. 
func checkDomain(data string, allowShort bool) int {
	// An empty string has no domain. Without this guard the initial index of 1 would be returned as the
	// length, which is larger than the input itself.
	if data == "" {
		return 0
	}

	foundPeriod := false

	// The first byte is skipped here, and the last byte is never examined because the loop stops one
	// short of the end of the string. The scan is byte-wise, so it also stops at the second byte of a
	// multi-byte character because that byte doesn't decode as a rune on its own.
	i := 1
scan:
	for ; i < len(data)-1; i++ {
		switch c := data[i]; {
		case c == '_':
			// An underscore anywhere in the scanned portion rejects the whole domain
			return 0
		case c == '.':
			foundPeriod = true
		case c != '-' && !isValidHostCharacter(data[i:]):
			// Hyphens are punctuation but are still allowed within a domain name
			break scan
		}
	}

	// If allowShort is set, accept any string of valid domain characters. Otherwise a valid domain
	// requires at least a single period. Note that the period check isn't entirely necessary because we
	// already know the string starts with "www." when this is called from parseWWWAutolink
	if allowShort || foundPeriod {
		return i
	}
	return 0
}

// Returns true if the provided link starts with a valid character for a domain name, which is any rune
// that is neither whitespace nor punctuation. An empty string, invalid UTF-8 and U+FFFD are all rejected.
// Equivalent to is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
	c, _ := utf8.DecodeRuneInString(link)
	if c == utf8.RuneError {
		return false
	}

	return !unicode.IsSpace(c) && !unicode.IsPunct(c)
}

// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
// Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
	// Work on bytes rather than runes. Every character inspected below is ASCII, and ASCII bytes never
	// occur inside a multi-byte UTF-8 sequence, so the result is the same for valid text. Converting to
	// runes and back would turn each invalid byte into a three byte replacement character and could
	// push the returned offset past end.
	link := markdown[start:end]
	linkEnd := len(link)

	// Cut off the link before an open angle bracket if it contains one
	if i := strings.IndexByte(link, '<'); i >= 0 {
		linkEnd = i
	}

	isLetter := func(b byte) bool {
		return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')
	}

	// Bracket totals for link[:linkEnd]. They are counted the first time a trailing bracket is seen and
	// then kept up to date, which is safe because nothing else trimmed below contains a bracket.
	bracketsCounted := false
	numOpening := 0
	numClosing := 0

	for linkEnd > 0 {
		c := link[linkEnd-1]

		switch {
		case !canEndAutolink(rune(c)):
			// Trim trailing quotes, periods, etc
			linkEnd--

		case c == ';':
			// Trim a trailing HTML entity: walk back over the letters before the semicolon and
			// check whether they are preceded by an ampersand
			newEnd := linkEnd - 2
			for newEnd > 0 && isLetter(link[newEnd]) {
				newEnd--
			}

			if newEnd < linkEnd-2 && link[newEnd] == '&' {
				linkEnd = newEnd
			} else {
				// This isn't actually an HTML entity, so just trim the semicolon
				linkEnd--
			}

		case c == ')':
			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair
			// of brackets. If there are more closing brackets than opening ones, remove the extra bracket
			//
			// Examples (input text => output linked portion):
			//
			//   http://www.pokemon.com/Pikachu_(Electric)
			//     => http://www.pokemon.com/Pikachu_(Electric)
			//
			//   http://www.pokemon.com/Pikachu_((Electric)
			//     => http://www.pokemon.com/Pikachu_((Electric)
			//
			//   http://www.pokemon.com/Pikachu_(Electric))
			//     => http://www.pokemon.com/Pikachu_(Electric)
			//
			//   http://www.pokemon.com/Pikachu_((Electric))
			//     => http://www.pokemon.com/Pikachu_((Electric))
			if !bracketsCounted {
				numOpening = strings.Count(link[:linkEnd], "(")
				numClosing = strings.Count(link[:linkEnd], ")")
				bracketsCounted = true
			}

			if numClosing <= numOpening {
				// There's fewer or equal closing brackets, so we've found the end of the link
				return start + linkEnd
			}

			numClosing--
			linkEnd--

		default:
			// There's no special characters at the end of the link, so we're at the end
			return start + linkEnd
		}
	}

	return start + linkEnd
}

// Returns false for the punctuation characters that are never allowed to be the last character of an
// autolink, and true for everything else.
func canEndAutolink(c rune) bool {
	switch c {
	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
		return false
	}
	return true
}