kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index_test.cc (about) 1 /* 2 * Copyright 2018 The Kythe Authors. All rights reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "kythe/cxx/common/utf8_line_index.h" 18 19 #include <algorithm> 20 #include <cstring> 21 #include <string> 22 23 #include "absl/log/check.h" 24 #include "absl/strings/str_cat.h" 25 #include "absl/strings/string_view.h" 26 #include "gtest/gtest.h" 27 28 namespace { 29 30 using ::kythe::CharacterPosition; 31 using ::kythe::UTF8LineIndex; 32 33 // Returns whether a UTF-8 byte is a continuation byte, i.e., a byte other 34 // than the first byte of the encoding of a character. 35 static bool IsUTF8ContinuationByte(int byte) { return ((byte & 0xC0) == 0x80); } 36 37 // Checks that round-tripping is a no-op for the first byte of each character 38 // covered by a UTF8LineIndex, and also for breaking the file up into lines 39 // and then concatenating those lines. 40 void CheckRoundTrips(const UTF8LineIndex& index) { 41 absl::string_view content(index.str()); 42 for (int byte_offset = 0; byte_offset < content.size(); ++byte_offset) { 43 if (IsUTF8ContinuationByte(content[byte_offset])) continue; 44 CharacterPosition position(index.ComputePositionForByteOffset(byte_offset)); 45 EXPECT_TRUE(position.is_valid()) 46 << " at byte offset: " << byte_offset << " in " << content; 47 EXPECT_EQ(byte_offset, index.ComputeByteOffset(position.line_number, 48 position.column_number)); 49 } 50 51 std::string joined_lines; 52 for (int line_number = 1; line_number <= index.line_count(); ++line_number) { 53 absl::StrAppend(&joined_lines, index.GetLine(line_number)); 54 } 55 EXPECT_EQ(index.str(), joined_lines); 56 } 57 58 TEST(UTF8LineIndexTest, WorksForAnEmptyFile) { 59 // All we expect for an empty input is that this doesn't crash, reports 60 // that an empty file contains one line, gives a sane response for the 61 // past-the-end position, and handles bad requests robustly. 62 const std::string empty_file_content(""); 63 UTF8LineIndex index(empty_file_content); 64 65 EXPECT_EQ(1, index.line_count()); 66 67 EXPECT_FALSE(index.ComputePositionForByteOffset(1).is_valid()); 68 EXPECT_FALSE(index.ComputePositionForByteOffset(127).is_valid()); 69 70 CheckRoundTrips(index); 71 72 CharacterPosition past_the_end = index.ComputePositionForByteOffset(0); 73 EXPECT_TRUE(past_the_end.is_valid()); 74 EXPECT_EQ(1, past_the_end.line_number); 75 EXPECT_EQ(0, past_the_end.column_number); 76 EXPECT_EQ(0, past_the_end.character_number); 77 } 78 79 TEST(UTF8LineIndexTest, WorksForSingleLineAsciiFile) { 80 const std::string single_line_content("Hello World!\n"); 81 UTF8LineIndex index(single_line_content); 82 // The whole "file" is on line 1. 83 for (int i = 0; i < single_line_content.size(); ++i) { 84 EXPECT_EQ(1, index.LineNumber(i)) << "i = " << i; 85 EXPECT_EQ(i, index.ComputePositionForByteOffset(i).column_number); 86 } 87 EXPECT_EQ(1, index.line_count()); 88 CheckRoundTrips(index); 89 90 auto past_the_end = index.ComputePositionForByteOffset(index.str().size()); 91 EXPECT_TRUE(past_the_end.is_valid()); 92 EXPECT_EQ(2, past_the_end.line_number); 93 EXPECT_EQ(0, past_the_end.column_number); 94 } 95 96 TEST(UTF8LineIndexTest, WorksForFileWithEmptyFirstLine) { 97 const std::string content_with_empty_first_line("\nSecond line\n"); 98 UTF8LineIndex index(content_with_empty_first_line); 99 // The initial newline counts as part of line 1. 100 EXPECT_EQ(1, index.LineNumber(0)); 101 // The rest of the file is on line 2. 102 for (int i = 1; i < content_with_empty_first_line.size(); ++i) { 103 EXPECT_EQ(2, index.LineNumber(i)); 104 } 105 CheckRoundTrips(index); 106 } 107 108 // Tests lines terminated with just CR (0x0D). 109 TEST(UTF8LineIndexTest, WorksForMacStyleLineEnds) { 110 const std::string mac_style_file_content("Mac\rStyle\rLines\r"); 111 UTF8LineIndex index(mac_style_file_content); 112 CheckRoundTrips(index); 113 } 114 115 TEST(UTF8LineIndexTest, WorksForPlainASCIIFile) { 116 const std::string ascii_content("Now is the {\nWinter of}\nyour disc\n"); 117 UTF8LineIndex index(ascii_content); 118 CheckRoundTrips(index); 119 120 // Some tests for the first 'o' character. 121 EXPECT_EQ(1, index.LineNumber(1)); 122 EXPECT_EQ(1, index.ComputePositionForByteOffset(1).line_number); 123 EXPECT_EQ(1, index.ComputePositionForByteOffset(1).column_number); 124 125 // There's only one 'c' in ascii_content. We depend on this below, so 126 // check it here. This is a CHECK rather than EXPECT or ASSERT because 127 // it's not an assertion about the code under test, it's an assertion 128 // about this test code. 129 CHECK_EQ(1, std::count(ascii_content.begin(), ascii_content.end(), 'c')); 130 // 'c' is the 9th character (hence, column 8) of the 3rd line ("your disc"). 131 auto position_of_c = 132 index.ComputePositionForByteOffset(ascii_content.find('c')); 133 EXPECT_EQ(3, position_of_c.line_number); 134 EXPECT_EQ(8, position_of_c.column_number); 135 EXPECT_EQ(ascii_content.find('c'), 136 index.ComputeByteOffset(position_of_c.line_number, 137 std::string("your disc").find('c'))); 138 } 139 140 TEST(UTF8LineIndexTest, WorksForFileWithMissingTerminalLineEnd) { 141 const std::string ascii_content("Now is the {\nWinter of}\nyour disc"); 142 UTF8LineIndex index(ascii_content); 143 CheckRoundTrips(index); 144 EXPECT_EQ(3, index.line_count()); 145 EXPECT_EQ(1, index.LineNumber(1)); 146 EXPECT_EQ(1, index.ComputePositionForByteOffset(1).line_number); 147 EXPECT_EQ(3, index.LineNumber(ascii_content.size() - 2)); 148 EXPECT_EQ(3, index.line_count()); 149 150 auto past_the_end = index.ComputePositionForByteOffset(index.str().size()); 151 EXPECT_TRUE(past_the_end.is_valid()); 152 EXPECT_EQ(3, past_the_end.line_number); 153 EXPECT_EQ(9, past_the_end.column_number); 154 155 EXPECT_FALSE(index.ComputePositionForByteOffset(12345678).is_valid()); 156 157 EXPECT_EQ(13, index.line_size(1)); 158 EXPECT_EQ(11, index.line_size(2)); 159 EXPECT_EQ(9, index.line_size(3)); 160 } 161 162 // TODO(jdennett): Split WorksWithDoubleByteCharacters out into smaller tests. 163 TEST(UTF8LineIndexTest, WorksWithDoubleByteCharacters) { 164 const std::string text_with_double_byte_characters = 165 "$1 = £0.6354\n" 166 "£1 = $1.5739\n"; 167 UTF8LineIndex index(text_with_double_byte_characters); 168 CheckRoundTrips(index); 169 170 // The "=" on the first line is at byte offset 3 and (0-based) column 3. 171 EXPECT_EQ(3, index.ComputePositionForByteOffset(3).column_number); 172 173 auto first_pound_position = 174 index.ComputePositionForByteOffset(5); // the first byte of £ 175 EXPECT_EQ(1, first_pound_position.line_number); 176 EXPECT_EQ(5, first_pound_position.column_number); 177 178 auto after_pound_position = 179 index.ComputePositionForByteOffset(7); // the first byte *after* £ 180 EXPECT_EQ(1, after_pound_position.line_number); 181 EXPECT_EQ(6, after_pound_position.column_number); // 8th byte, 7th character 182 183 // Each of the two lines is 13 characters wrong including the LF control 184 // character, but one pound character in each line is represented as two 185 // bytes in UTF-8, so each lines is 14 bytes in total. 186 EXPECT_EQ(14, index.line_size(1)); 187 EXPECT_EQ(14, index.line_size(2)); 188 189 // Line 1 starts at character index 0. 190 auto start_of_line_1 = 191 index.ComputePositionForByteOffset(index.ComputeByteOffset(1, 0)); 192 EXPECT_EQ(1, start_of_line_1.line_number); 193 EXPECT_EQ(0, start_of_line_1.column_number); 194 EXPECT_EQ(0, start_of_line_1.character_number); 195 196 // Line 2 starts at character index 13. 197 auto start_of_line_2 = 198 index.ComputePositionForByteOffset(index.ComputeByteOffset(2, 0)); 199 EXPECT_EQ(2, start_of_line_2.line_number); 200 EXPECT_EQ(0, start_of_line_2.column_number); 201 EXPECT_EQ(13, start_of_line_2.character_number); 202 } 203 204 TEST(UTF8LineIndexTest, CRLFIsASingleLineEnd) { 205 const std::string four_empty_lines( 206 "\n" 207 "\r" 208 "\r\n" 209 "\r\n"); 210 UTF8LineIndex index(four_empty_lines); 211 CheckRoundTrips(index); 212 213 EXPECT_EQ(4, index.line_count()); 214 EXPECT_EQ(1, index.line_size(1)); 215 EXPECT_EQ(1, index.line_size(2)); 216 EXPECT_EQ(2, index.line_size(3)); 217 EXPECT_EQ(2, index.line_size(4)); 218 } 219 220 TEST(UTF8LineIndexTest, KeepFrombergerHappy) { 221 const std::string michael("abc\r\ndef"); 222 UTF8LineIndex index(michael); 223 CheckRoundTrips(index); 224 225 // The first line consists of five bytes: "abc\r\n". 226 EXPECT_EQ(5, index.ComputeByteOffset(2, 0)); 227 EXPECT_EQ(5, index.line_size(1)); 228 EXPECT_EQ(3, index.line_size(2)); 229 } 230 231 TEST(UTF8LineIndexTest, GetLineFromEmptyFile) { 232 const std::string empty_file; 233 UTF8LineIndex empty_index(empty_file); 234 EXPECT_EQ("", empty_index.GetLine(-1)); 235 EXPECT_EQ("", empty_index.GetLine(0)); 236 EXPECT_EQ("", empty_index.GetLine(1)); 237 EXPECT_EQ("", empty_index.GetLine(2)); 238 EXPECT_EQ("", empty_index.GetLine(999)); 239 } 240 241 TEST(UTF8LineIndexTest, GetLineFromUnterminatedFile) { 242 const std::string unterminated_file( 243 "Hello world.\n" 244 "Goodbye, unterminated world."); 245 UTF8LineIndex unterminated_index(unterminated_file); 246 CheckRoundTrips(unterminated_index); 247 248 EXPECT_EQ("", unterminated_index.GetLine(0)); 249 EXPECT_EQ("Hello world.\n", unterminated_index.GetLine(1)); 250 EXPECT_EQ("Goodbye, unterminated world.", unterminated_index.GetLine(2)); 251 EXPECT_EQ("", unterminated_index.GetLine(3)); 252 } 253 254 TEST(UTF8LineIndexTest, GetLineFromSmallFile) { 255 const std::string small_file("\nline two\nline three\r\n"); 256 UTF8LineIndex small_index(small_file); 257 CheckRoundTrips(small_index); 258 259 EXPECT_EQ("", small_index.GetLine(-1)); 260 EXPECT_EQ("", small_index.GetLine(0)); 261 EXPECT_EQ("\n", small_index.GetLine(1)); 262 EXPECT_EQ("line two\n", small_index.GetLine(2)); 263 EXPECT_EQ("line three\r\n", small_index.GetLine(3)); 264 EXPECT_EQ("", small_index.GetLine(4)); 265 EXPECT_EQ("", small_index.GetLine(5)); 266 EXPECT_EQ("", small_index.GetLine(999)); 267 } 268 269 TEST(UTF8LineIndexTest, ComputeByteOffsetAtEndOfUnterminatedFile) { 270 // This is a regression test; migrating from ::string storage to using a 271 // string_view means that peeking past the end of the buffer isn't allowed 272 // anymore, and this aborted in ComputeByteOffset previously. 273 const std::string unterminated_file( 274 "Hello world.\n" 275 "Goodbye, unterminated world."); 276 UTF8LineIndex index(unterminated_file); 277 EXPECT_EQ(unterminated_file.length(), 278 index.ComputeByteOffset(2, strlen("Goodbye, unterminated world."))); 279 } 280 281 } // anonymous namespace