kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index.cc (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "kythe/cxx/common/utf8_line_index.h" 18 19 #include <cstdint> 20 #include <ostream> 21 22 #include "absl/algorithm/container.h" 23 #include "absl/log/check.h" 24 #include "absl/log/log.h" 25 #include "absl/strings/string_view.h" 26 27 namespace kythe { 28 29 std::ostream& operator<<(std::ostream& dest, 30 const CharacterPosition& position) { 31 dest << "<line_number=" << position.line_number 32 << " column_number=" << position.column_number 33 << " character_number=" << position.character_number << ">"; 34 return dest; 35 } 36 37 bool IsUTF8ContinuationByte(int byte) { return ((byte & 0xC0) == 0x80); } 38 39 bool IsUTF8EndOfLineByte(int byte_offset, absl::string_view content) { 40 // If/when we were using a string, checking for the past-the-end byte 41 // was safe. Now that we use string_view we have to avoid that. 42 return (content[byte_offset] == '\n' || 43 (content[byte_offset] == '\r' && (byte_offset + 1 == content.size() || 44 content[byte_offset + 1] != '\n'))); 45 } 46 47 UTF8LineIndex::UTF8LineIndex(absl::string_view content) : content_(content) { 48 IndexContent(); 49 } 50 51 void UTF8LineIndex::IndexContent() { 52 CHECK_LT(content_.size(), 1LL << 32); 53 54 // Line 0 starts at offset 0. All other line start offsets are determined 55 // by scanning for line ends (any of {CR, CR+LF, LF}.) 56 line_begin_byte_offsets_ = {0}; 57 line_begin_character_offsets_ = {0}; 58 59 int character_number = 0; 60 for (int byte_offset = 0; byte_offset < content_.size(); ++byte_offset) { 61 if (IsUTF8ContinuationByte(content_[byte_offset])) continue; 62 63 if (IsUTF8EndOfLineByte(byte_offset, content_)) { 64 line_begin_byte_offsets_.push_back(byte_offset + 1); 65 line_begin_character_offsets_.push_back(character_number + 1); 66 } 67 ++character_number; 68 } 69 } 70 71 CharacterPosition UTF8LineIndex::ComputePositionForByteOffset( 72 int byte_offset) const { 73 CharacterPosition position; 74 75 // Error cases: before the start of the file, or after the past-the-end 76 // position. In either of those cases we return an invalid position. 77 if (byte_offset < 0 || byte_offset > static_cast<int64_t>(content_.size())) 78 return position; 79 80 // We special-case asking for a position at the start of the file in part 81 // because it allows us to ignore the case of an empty file later. 82 if (byte_offset == 0) { 83 position.line_number = 1; 84 position.column_number = 0; 85 position.character_number = 0; 86 return position; 87 } 88 89 if (byte_offset < static_cast<int64_t>(content_.size())) { 90 position.line_number = LineNumber(byte_offset); 91 auto line_begin_byte_offset = 92 line_begin_byte_offsets_[position.line_number - 1]; 93 // Count the characters in the line up to (and including) byte_offset. 94 position.column_number = -1; 95 for (int offset = line_begin_byte_offset; offset <= byte_offset; ++offset) { 96 if (!IsUTF8ContinuationByte(content_[offset])) ++position.column_number; 97 } 98 auto line_begin_character_offset = 99 line_begin_character_offsets_[position.line_number - 1]; 100 position.character_number = 101 line_begin_character_offset + position.column_number; 102 } else if (byte_offset == content_.size()) { 103 // TODO(unknown): see if we can unify this with the previous case 104 // in a less ugly manner. 105 position = ComputePositionForByteOffset(byte_offset - 1); 106 // For the past-the-end position, we want it to be on the same line as 107 // any trailing characters, or on a new line if there are none. (We've 108 // taken care of the empty file case elsewhere.) 109 if (has_trailing_characters()) { 110 ++position.column_number; 111 ++position.character_number; 112 } else { 113 ++position.line_number; 114 position.column_number = 0; 115 position.character_number = 0; 116 } 117 } 118 return position; 119 } 120 121 int UTF8LineIndex::LineNumber(int byte_offset) const { 122 if (content_.empty()) return 1; 123 const auto next_line = 124 absl::c_upper_bound(line_begin_byte_offsets_, byte_offset); 125 return next_line - line_begin_byte_offsets_.begin(); 126 } 127 128 int UTF8LineIndex::line_size(int line_number) const { 129 if (line_number == line_count()) { 130 return content_.size() - ComputeByteOffset(line_number, 0); 131 } else { 132 return (ComputeByteOffset(line_number + 1, 0) - 133 ComputeByteOffset(line_number, 0)); 134 } 135 } 136 137 int UTF8LineIndex::ComputeByteOffset(int line, int column) const { 138 if (line < 1 || line > line_begin_byte_offsets_.size()) return -1; 139 int byte_offset = line_begin_byte_offsets_[line - 1]; 140 for (int i = 0; i < column; ++i) { 141 // Skip over one character for each column, however many bytes that is. 142 // In other words: skip the first byte and then skip any continuation bytes. 143 ++byte_offset; 144 while ((byte_offset < static_cast<int64_t>(content_.size())) && 145 IsUTF8ContinuationByte(content_[byte_offset])) { 146 ++byte_offset; 147 } 148 } 149 return byte_offset; 150 } 151 152 absl::string_view UTF8LineIndex::GetLine(int line_number) const { 153 const int start_byte_offset = ComputeByteOffset(line_number, 0); 154 if (line_number == line_begin_byte_offsets_.size()) { 155 // The last line is a special case, as it might be unterminated. 156 return absl::ClippedSubstr(content_, start_byte_offset); 157 } 158 const int end_byte_offset = ComputeByteOffset(line_number + 1, 0); 159 if (start_byte_offset == -1 || end_byte_offset == -1) { 160 // Error case. 161 return absl::string_view(); 162 } 163 return absl::ClippedSubstr(content_, start_byte_offset, 164 end_byte_offset - start_byte_offset); 165 } 166 167 absl::string_view UTF8LineIndex::GetSubstrFromLine( 168 int line_number, int start_position_in_code_points, 169 int length_in_code_points) const { 170 CHECK_GE(start_position_in_code_points, 0); 171 CHECK_GE(length_in_code_points, 0); 172 173 absl::string_view line = GetLine(line_number); 174 175 int start_byte_offset = -1; // a negative number means not being set yet 176 int code_point_number = 0; 177 int byte_offset = 0; // will point to the end of substr 178 for (; byte_offset < line.size(); ++byte_offset) { 179 if (code_point_number == start_position_in_code_points && 180 // Set the start offset only once in case the start code point is 181 // longer than 1 byte 182 start_byte_offset == -1) { 183 start_byte_offset = byte_offset; 184 } 185 if (code_point_number == 186 start_position_in_code_points + length_in_code_points) { 187 break; 188 } 189 if (IsUTF8ContinuationByte(line[byte_offset])) continue; 190 ++code_point_number; 191 } 192 193 if (start_byte_offset >= 0) { 194 CHECK_GE(byte_offset, start_byte_offset); 195 return absl::ClippedSubstr(line, start_byte_offset, 196 byte_offset - start_byte_offset); 197 } else { 198 LOG(ERROR) << "Substring index " << start_position_in_code_points 199 << " not found in " << line; 200 return absl::string_view(); 201 } 202 } 203 204 } // namespace kythe