kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/kythe_uri.cc (about) 1 /* 2 * Copyright 2015 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "kythe/cxx/common/kythe_uri.h" 18 19 #include <cstddef> 20 #include <string> 21 #include <utility> 22 23 #include "absl/strings/match.h" 24 #include "absl/strings/str_split.h" 25 #include "absl/strings/string_view.h" 26 #include "kythe/cxx/common/path_utils.h" 27 #include "kythe/proto/storage.pb.h" 28 29 namespace kythe { 30 namespace { 31 32 constexpr char kHexDigits[] = "0123456789ABCDEF"; 33 /// The URI scheme label for Kythe. 34 constexpr char kUriScheme[] = "kythe"; 35 constexpr char kUriPrefix[] = "kythe:"; 36 37 /// \brief Returns whether a byte should be escaped. 38 /// \param mode The escaping mode to use. 39 /// \param c The byte to examine. 40 bool should_escape(UriEscapeMode mode, char c) { 41 return !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || 42 (c >= '0' && c <= '9') || c == '-' || c == '.' || c == '_' || 43 c == '~' || (mode == UriEscapeMode::kEscapePaths && c == '/')); 44 } 45 46 /// \brief Returns the value of a hex digit. 47 /// \param digit The hex digit. 48 /// \return The value of the digit, or -1 if it was not a hex digit. 49 int value_for_hex_digit(char c) { 50 if (c >= '0' && c <= '9') { 51 return c - '0'; 52 } else if (c >= 'a' && c <= 'f') { 53 return 10 + (c - 'a'); 54 } else if (c >= 'A' && c <= 'F') { 55 return 10 + (c - 'A'); 56 } 57 return -1; 58 } 59 60 std::pair<absl::string_view, absl::string_view> Split(absl::string_view input, 61 char ch) { 62 return absl::StrSplit(input, absl::MaxSplits(ch, 1)); 63 } 64 65 } // namespace 66 67 std::string UriEscape(UriEscapeMode mode, absl::string_view uri) { 68 size_t num_escapes = 0; 69 for (char c : uri) { 70 if (should_escape(mode, c)) { 71 ++num_escapes; 72 } 73 } 74 std::string result; 75 result.reserve(num_escapes * 2 + uri.size()); 76 for (char c : uri) { 77 if (should_escape(mode, c)) { 78 result.push_back('%'); 79 result.push_back(kHexDigits[(c >> 4) & 0xF]); 80 result.push_back(kHexDigits[c & 0xF]); 81 } else { 82 result.push_back(c); 83 } 84 } 85 return result; 86 } 87 88 /// \brief URI-unescapes a string. 89 /// \param string The string to unescape. 90 /// \return A pair of (success, error-or-unescaped-string). 91 std::pair<bool, std::string> UriUnescape(absl::string_view string) { 92 size_t num_escapes = 0; 93 for (size_t i = 0, s = string.size(); i < s; ++i) { 94 if (string[i] == '%') { 95 ++num_escapes; 96 if (i + 3 > string.size()) { 97 return std::make_pair(false, "bad escape"); 98 } 99 i += 2; 100 } 101 } 102 std::string result; 103 result.reserve(string.size() - num_escapes * 2); 104 for (size_t i = 0, s = string.size(); i < s; ++i) { 105 char c = string[i]; 106 if (c != '%') { 107 result.push_back(c); 108 continue; 109 } 110 int high = value_for_hex_digit(string[++i]); 111 int low = value_for_hex_digit(string[++i]); 112 if (high < 0 || low < 0) { 113 return std::make_pair(false, "bad hex digit"); 114 } 115 result.push_back((high << 4) | low); 116 } 117 return std::make_pair(true, result); 118 } 119 120 std::string URI::ToString() const { 121 std::string result = kUriPrefix; 122 if (vname_.signature().empty() && vname_.path().empty() && 123 vname_.root().empty() && vname_.corpus().empty() && 124 vname_.language().empty()) { 125 return result; 126 } 127 absl::string_view signature = vname_.signature(); 128 absl::string_view path = vname_.path(); 129 absl::string_view corpus = vname_.corpus(); 130 absl::string_view language = vname_.language(); 131 absl::string_view root = vname_.root(); 132 if (!corpus.empty()) { 133 result.append("//"); 134 result.append(UriEscape(UriEscapeMode::kEscapePaths, corpus)); 135 } 136 if (!language.empty()) { 137 result.append("?lang="); 138 result.append(UriEscape(UriEscapeMode::kEscapeAll, language)); 139 } 140 if (!path.empty()) { 141 result.append("?path="); 142 result.append(UriEscape(UriEscapeMode::kEscapePaths, CleanPath(path))); 143 } 144 if (!root.empty()) { 145 result.append("?root="); 146 result.append(UriEscape(UriEscapeMode::kEscapePaths, root)); 147 } 148 if (!signature.empty()) { 149 result.push_back('#'); 150 result.append(UriEscape(UriEscapeMode::kEscapeAll, signature)); 151 } 152 return result; 153 } 154 155 /// \brief Separate out the scheme component of `uri` if one exists. 156 /// \return (scheme, tail) if there was a scheme; ("", uri) otherwise. 157 static std::pair<absl::string_view, absl::string_view> SplitScheme( 158 absl::string_view uri) { 159 for (size_t i = 0, s = uri.size(); i != s; ++i) { 160 char c = uri[i]; 161 if (c == ':') { 162 return std::make_pair(uri.substr(0, i), uri.substr(i)); 163 } else if ((i == 0 && 164 ((c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.')) || 165 !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { 166 break; 167 } 168 } 169 return std::make_pair(absl::string_view(), uri); 170 } 171 172 URI::URI(const kythe::proto::VName& from_vname) : vname_(from_vname) {} 173 174 bool URI::ParseString(absl::string_view uri) { 175 auto head_fragment = Split(uri, '#'); 176 auto head = head_fragment.first, fragment = head_fragment.second; 177 auto scheme_head = SplitScheme(head); 178 auto scheme = scheme_head.first; 179 head = scheme_head.second; 180 if (scheme.empty()) { 181 if (absl::StartsWith(head, ":")) { 182 return false; 183 } 184 } else if (scheme != kUriScheme) { 185 return false; 186 } else if (!head.empty()) { 187 head.remove_prefix(1); 188 } 189 auto head_attrs = Split(head, '?'); 190 head = head_attrs.first; 191 auto attrs = head_attrs.second; 192 std::string corpus; 193 if (!head.empty()) { 194 if (!absl::StartsWith(head, "//")) { 195 return false; 196 } 197 auto maybe_corpus = UriUnescape(head.substr(2)); 198 if (!maybe_corpus.first) { 199 return false; 200 } 201 corpus = maybe_corpus.second; 202 } 203 auto maybe_sig = UriUnescape(fragment); 204 if (!maybe_sig.first) { 205 return false; 206 } 207 auto signature = maybe_sig.second; 208 while (!attrs.empty()) { 209 auto attr_rest = Split(attrs, '?'); 210 auto attr = attr_rest.first; 211 attrs = attr_rest.second; 212 auto name_value = Split(attr, '='); 213 auto maybe_value = UriUnescape(name_value.second); 214 if (!maybe_value.first || maybe_value.second.empty()) { 215 return false; 216 } 217 if (name_value.first == "lang") { 218 vname_.set_language(maybe_value.second); 219 } else if (name_value.first == "root") { 220 vname_.set_root(maybe_value.second); 221 } else if (name_value.first == "path") { 222 vname_.set_path(CleanPath(maybe_value.second)); 223 } else { 224 return false; 225 } 226 } 227 vname_.set_signature(signature); 228 vname_.set_corpus(corpus); 229 return true; 230 } 231 232 } // namespace kythe