kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index.h (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_ 18 #define KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_ 19 20 #include <iosfwd> 21 #include <string> 22 #include <vector> 23 24 #include "absl/strings/string_view.h" 25 26 namespace kythe { 27 28 // Returns whether a UTF-8 byte is a continuation byte, i.e., a byte other 29 // than the first byte of the encoding of a character. 30 bool IsUTF8ContinuationByte(int byte); 31 32 // Returns whether a UTF-8 byte from the |content| is the end of a line, 33 // i.e., a '\n', or a '\r' not immediately followed by a '\n'. 34 bool IsUTF8EndOfLineByte(int byte_offset, absl::string_view content); 35 36 // Describes the position of a character in a file, in an encoding-independent 37 // way. By encoding-independent, we mean that if the file were re-encoded in 38 // a different way (e.g., UCS-32 instead of UTF-8), the CharacterPosition 39 // would be unchanged. 40 struct CharacterPosition { 41 // n-th character in file, 0-based, or -1 if this position is invalid. 42 int character_number; 43 44 // 1-based line number, or or -1 if this position is invalid. 45 int line_number; 46 47 // 0-based column number, or or -1 if this position is invalid. 48 int column_number; 49 50 // Checks whether this position is valid. Invalid positions should 51 // only arise either if this character position has never been set, 52 // or if it was computed based on an invalid position. 53 bool is_valid() const { return character_number != -1; } 54 55 // By default, creates an invalid CharacterPosition. 56 // Postcondition: !this->is_valid() 57 CharacterPosition() 58 : character_number(-1), line_number(-1), column_number(-1) {} 59 }; 60 61 // Writes a debug representation of a CharacterPosition to an ostream. 62 std::ostream& operator<<(std::ostream& dest, const CharacterPosition& position); 63 64 // For a given text file, maps between byte offsets and CharacterPositions 65 // (character number, line number, column number). 66 // 67 // A line is terminated by LF or CF[LF], i.e., any of {LF, CR, CRLF}. The 68 // next line starts with the next byte after the line terminator. In other 69 // words, the line terminator for a given line counts as part of that line, 70 // not as part of the following line. 71 class UTF8LineIndex { 72 public: 73 // Creates a UTF8LineIndex for a file. The index retains a reference to 74 // the file content, which must therefore remain valid (and unchanged) so 75 // long as this index is in use. 76 // 77 // The content must be less than 2GB long. 78 // 79 // Complexity: O(content.size()) 80 explicit UTF8LineIndex(absl::string_view content); 81 82 // Given a (0-based) byte offset into the file, returns character-based 83 // information on the position of that offset. 84 // 85 // If the offset is greater than the size of the content then this returns 86 // an invalid CharacterPosition(). 87 // 88 // Complexity: O(log(#lines) + byte-offset-within-line) 89 CharacterPosition ComputePositionForByteOffset(int byte_offset) const; 90 91 // Computes just a (1-based) line number for a given (0-based) byte offset. 92 // 93 // This is equivalent to ComputePositionForByteOffset(offset).line_number, 94 // but more efficient as it doesn't have to compute the column number or 95 // character number. 96 // 97 // Complexity: O(log(#lines)) 98 int LineNumber(int offset) const; 99 100 // Given a 1-based line and 0-based column, returns the 0-based byte offset 101 // into the file. 102 // 103 // Complexity: O(log(#lines) + column) 104 int ComputeByteOffset(int line, int column) const; 105 106 // Returns the number of bytes in a given line. This includes the bytes 107 // of the end-of-line marker, if present. 108 int line_size(int line_number) const; 109 110 // Returns a view of the n-th line of the file. 111 // 112 // The first line is line 1. This returned string_view is a view into the 113 // buffer indexed by this UTF8LineIndex. 114 absl::string_view GetLine(int line_number) const; 115 116 // Returns a substring from the line at a given line number, starting from 117 // a given |start_position_in_code_points| and with a length of 118 // |length_in_code_points|. Returns an empty string piece if the start 119 // position does not exist in the input. If there is not as many code points 120 // in the line from the start position as the desired length, returns 121 // the rest of the line including the end-of-line marker. 122 // 123 // TODO: Optimize this function for the case of ASCII. 124 absl::string_view GetSubstrFromLine(int line_number, 125 int start_position_in_code_points, 126 int length_in_code_points) const; 127 128 // Returns the number of lines in the indexed file, including the last line 129 // even if it was not terminated. 130 int line_count() const { 131 // We have three cases: 132 // Case 1: empty file. Considered to have 1 (completely empty) line. 133 if (content_.empty()) return 1; 134 // Case 2: a file ending in a line-end. The next character added would 135 // be on line n+1, but we're not there yet. 136 if (!has_trailing_characters()) { 137 return static_cast<int>(line_begin_byte_offsets_.size()) - 1; 138 } 139 // Case 3: There's an unterminated line at the end of the file. The 140 // next character added would still be on line n. 141 return static_cast<int>(line_begin_byte_offsets_.size()); 142 } 143 144 // Returns (a reference to) the content of the indexed file. 145 absl::string_view str() const { return content_; } 146 147 private: 148 // Populates the index vectors based on content_. 149 void IndexContent(); 150 151 // Returns whether this file has trailing characters, i.e., characters that 152 // are not followed by a newline. Empty files or file that end in a newline 153 // do not have trailing characters, but all other files do. 154 bool has_trailing_characters() const { 155 return static_cast<size_t>(line_begin_byte_offsets_.back()) != 156 content_.size(); 157 } 158 159 // The content covered by this UTF8LineIndex. 160 absl::string_view content_; 161 162 // line_ends_byte_offsets_[n] stores the byte offset of the start of line n-1. 163 std::vector<int> line_begin_byte_offsets_; 164 165 // Character offsets corresponding to line_end_byte_offsets_. 166 std::vector<int> line_begin_character_offsets_; 167 }; 168 169 } // namespace kythe 170 171 #endif // KYTHE_CXX_COMMON_UTF8_LINE_INDEX_H_