kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/common/utf8_line_index_test.cc (about)

     1  /*
     2   * Copyright 2018 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  #include "kythe/cxx/common/utf8_line_index.h"
    18  
    19  #include <algorithm>
    20  #include <cstring>
    21  #include <string>
    22  
    23  #include "absl/log/check.h"
    24  #include "absl/strings/str_cat.h"
    25  #include "absl/strings/string_view.h"
    26  #include "gtest/gtest.h"
    27  
    28  namespace {
    29  
    30  using ::kythe::CharacterPosition;
    31  using ::kythe::UTF8LineIndex;
    32  
    33  // Returns whether a UTF-8 byte is a continuation byte, i.e., a byte other
    34  // than the first byte of the encoding of a character.
    35  static bool IsUTF8ContinuationByte(int byte) { return ((byte & 0xC0) == 0x80); }
    36  
    37  // Checks that round-tripping is a no-op for the first byte of each character
    38  // covered by a UTF8LineIndex, and also for breaking the file up into lines
    39  // and then concatenating those lines.
    40  void CheckRoundTrips(const UTF8LineIndex& index) {
    41    absl::string_view content(index.str());
    42    for (int byte_offset = 0; byte_offset < content.size(); ++byte_offset) {
    43      if (IsUTF8ContinuationByte(content[byte_offset])) continue;
    44      CharacterPosition position(index.ComputePositionForByteOffset(byte_offset));
    45      EXPECT_TRUE(position.is_valid())
    46          << " at byte offset: " << byte_offset << " in " << content;
    47      EXPECT_EQ(byte_offset, index.ComputeByteOffset(position.line_number,
    48                                                     position.column_number));
    49    }
    50  
    51    std::string joined_lines;
    52    for (int line_number = 1; line_number <= index.line_count(); ++line_number) {
    53      absl::StrAppend(&joined_lines, index.GetLine(line_number));
    54    }
    55    EXPECT_EQ(index.str(), joined_lines);
    56  }
    57  
    58  TEST(UTF8LineIndexTest, WorksForAnEmptyFile) {
    59    // All we expect for an empty input is that this doesn't crash, reports
    60    // that an empty file contains one line, gives a sane response for the
    61    // past-the-end position, and handles bad requests robustly.
    62    const std::string empty_file_content("");
    63    UTF8LineIndex index(empty_file_content);
    64  
    65    EXPECT_EQ(1, index.line_count());
    66  
    67    EXPECT_FALSE(index.ComputePositionForByteOffset(1).is_valid());
    68    EXPECT_FALSE(index.ComputePositionForByteOffset(127).is_valid());
    69  
    70    CheckRoundTrips(index);
    71  
    72    CharacterPosition past_the_end = index.ComputePositionForByteOffset(0);
    73    EXPECT_TRUE(past_the_end.is_valid());
    74    EXPECT_EQ(1, past_the_end.line_number);
    75    EXPECT_EQ(0, past_the_end.column_number);
    76    EXPECT_EQ(0, past_the_end.character_number);
    77  }
    78  
    79  TEST(UTF8LineIndexTest, WorksForSingleLineAsciiFile) {
    80    const std::string single_line_content("Hello World!\n");
    81    UTF8LineIndex index(single_line_content);
    82    // The whole "file" is on line 1.
    83    for (int i = 0; i < single_line_content.size(); ++i) {
    84      EXPECT_EQ(1, index.LineNumber(i)) << "i = " << i;
    85      EXPECT_EQ(i, index.ComputePositionForByteOffset(i).column_number);
    86    }
    87    EXPECT_EQ(1, index.line_count());
    88    CheckRoundTrips(index);
    89  
    90    auto past_the_end = index.ComputePositionForByteOffset(index.str().size());
    91    EXPECT_TRUE(past_the_end.is_valid());
    92    EXPECT_EQ(2, past_the_end.line_number);
    93    EXPECT_EQ(0, past_the_end.column_number);
    94  }
    95  
    96  TEST(UTF8LineIndexTest, WorksForFileWithEmptyFirstLine) {
    97    const std::string content_with_empty_first_line("\nSecond line\n");
    98    UTF8LineIndex index(content_with_empty_first_line);
    99    // The initial newline counts as part of line 1.
   100    EXPECT_EQ(1, index.LineNumber(0));
   101    // The rest of the file is on line 2.
   102    for (int i = 1; i < content_with_empty_first_line.size(); ++i) {
   103      EXPECT_EQ(2, index.LineNumber(i));
   104    }
   105    CheckRoundTrips(index);
   106  }
   107  
   108  // Tests lines terminated with just CR (0x0D).
   109  TEST(UTF8LineIndexTest, WorksForMacStyleLineEnds) {
   110    const std::string mac_style_file_content("Mac\rStyle\rLines\r");
   111    UTF8LineIndex index(mac_style_file_content);
   112    CheckRoundTrips(index);
   113  }
   114  
   115  TEST(UTF8LineIndexTest, WorksForPlainASCIIFile) {
   116    const std::string ascii_content("Now is the {\nWinter of}\nyour disc\n");
   117    UTF8LineIndex index(ascii_content);
   118    CheckRoundTrips(index);
   119  
   120    // Some tests for the first 'o' character.
   121    EXPECT_EQ(1, index.LineNumber(1));
   122    EXPECT_EQ(1, index.ComputePositionForByteOffset(1).line_number);
   123    EXPECT_EQ(1, index.ComputePositionForByteOffset(1).column_number);
   124  
   125    // There's only one 'c' in ascii_content.  We depend on this below, so
   126    // check it here.  This is a CHECK rather than EXPECT or ASSERT because
   127    // it's not an assertion about the code under test, it's an assertion
   128    // about this test code.
   129    CHECK_EQ(1, std::count(ascii_content.begin(), ascii_content.end(), 'c'));
   130    // 'c' is the 9th character (hence, column 8) of the 3rd line ("your disc").
   131    auto position_of_c =
   132        index.ComputePositionForByteOffset(ascii_content.find('c'));
   133    EXPECT_EQ(3, position_of_c.line_number);
   134    EXPECT_EQ(8, position_of_c.column_number);
   135    EXPECT_EQ(ascii_content.find('c'),
   136              index.ComputeByteOffset(position_of_c.line_number,
   137                                      std::string("your disc").find('c')));
   138  }
   139  
   140  TEST(UTF8LineIndexTest, WorksForFileWithMissingTerminalLineEnd) {
   141    const std::string ascii_content("Now is the {\nWinter of}\nyour disc");
   142    UTF8LineIndex index(ascii_content);
   143    CheckRoundTrips(index);
   144    EXPECT_EQ(3, index.line_count());
   145    EXPECT_EQ(1, index.LineNumber(1));
   146    EXPECT_EQ(1, index.ComputePositionForByteOffset(1).line_number);
   147    EXPECT_EQ(3, index.LineNumber(ascii_content.size() - 2));
   148    EXPECT_EQ(3, index.line_count());
   149  
   150    auto past_the_end = index.ComputePositionForByteOffset(index.str().size());
   151    EXPECT_TRUE(past_the_end.is_valid());
   152    EXPECT_EQ(3, past_the_end.line_number);
   153    EXPECT_EQ(9, past_the_end.column_number);
   154  
   155    EXPECT_FALSE(index.ComputePositionForByteOffset(12345678).is_valid());
   156  
   157    EXPECT_EQ(13, index.line_size(1));
   158    EXPECT_EQ(11, index.line_size(2));
   159    EXPECT_EQ(9, index.line_size(3));
   160  }
   161  
   162  // TODO(jdennett): Split WorksWithDoubleByteCharacters out into smaller tests.
   163  TEST(UTF8LineIndexTest, WorksWithDoubleByteCharacters) {
   164    const std::string text_with_double_byte_characters =
   165        "$1 = £0.6354\n"
   166        "£1 = $1.5739\n";
   167    UTF8LineIndex index(text_with_double_byte_characters);
   168    CheckRoundTrips(index);
   169  
   170    // The "=" on the first line is at byte offset 3 and (0-based) column 3.
   171    EXPECT_EQ(3, index.ComputePositionForByteOffset(3).column_number);
   172  
   173    auto first_pound_position =
   174        index.ComputePositionForByteOffset(5);  // the first byte of £
   175    EXPECT_EQ(1, first_pound_position.line_number);
   176    EXPECT_EQ(5, first_pound_position.column_number);
   177  
   178    auto after_pound_position =
   179        index.ComputePositionForByteOffset(7);  // the first byte *after* £
   180    EXPECT_EQ(1, after_pound_position.line_number);
   181    EXPECT_EQ(6, after_pound_position.column_number);  // 8th byte, 7th character
   182  
   183    // Each of the two lines is 13 characters wrong including the LF control
   184    // character, but one pound character in each line is represented as two
   185    // bytes in UTF-8, so each lines is 14 bytes in total.
   186    EXPECT_EQ(14, index.line_size(1));
   187    EXPECT_EQ(14, index.line_size(2));
   188  
   189    // Line 1 starts at character index 0.
   190    auto start_of_line_1 =
   191        index.ComputePositionForByteOffset(index.ComputeByteOffset(1, 0));
   192    EXPECT_EQ(1, start_of_line_1.line_number);
   193    EXPECT_EQ(0, start_of_line_1.column_number);
   194    EXPECT_EQ(0, start_of_line_1.character_number);
   195  
   196    // Line 2 starts at character index 13.
   197    auto start_of_line_2 =
   198        index.ComputePositionForByteOffset(index.ComputeByteOffset(2, 0));
   199    EXPECT_EQ(2, start_of_line_2.line_number);
   200    EXPECT_EQ(0, start_of_line_2.column_number);
   201    EXPECT_EQ(13, start_of_line_2.character_number);
   202  }
   203  
   204  TEST(UTF8LineIndexTest, CRLFIsASingleLineEnd) {
   205    const std::string four_empty_lines(
   206        "\n"
   207        "\r"
   208        "\r\n"
   209        "\r\n");
   210    UTF8LineIndex index(four_empty_lines);
   211    CheckRoundTrips(index);
   212  
   213    EXPECT_EQ(4, index.line_count());
   214    EXPECT_EQ(1, index.line_size(1));
   215    EXPECT_EQ(1, index.line_size(2));
   216    EXPECT_EQ(2, index.line_size(3));
   217    EXPECT_EQ(2, index.line_size(4));
   218  }
   219  
   220  TEST(UTF8LineIndexTest, KeepFrombergerHappy) {
   221    const std::string michael("abc\r\ndef");
   222    UTF8LineIndex index(michael);
   223    CheckRoundTrips(index);
   224  
   225    // The first line consists of five bytes: "abc\r\n".
   226    EXPECT_EQ(5, index.ComputeByteOffset(2, 0));
   227    EXPECT_EQ(5, index.line_size(1));
   228    EXPECT_EQ(3, index.line_size(2));
   229  }
   230  
   231  TEST(UTF8LineIndexTest, GetLineFromEmptyFile) {
   232    const std::string empty_file;
   233    UTF8LineIndex empty_index(empty_file);
   234    EXPECT_EQ("", empty_index.GetLine(-1));
   235    EXPECT_EQ("", empty_index.GetLine(0));
   236    EXPECT_EQ("", empty_index.GetLine(1));
   237    EXPECT_EQ("", empty_index.GetLine(2));
   238    EXPECT_EQ("", empty_index.GetLine(999));
   239  }
   240  
   241  TEST(UTF8LineIndexTest, GetLineFromUnterminatedFile) {
   242    const std::string unterminated_file(
   243        "Hello world.\n"
   244        "Goodbye, unterminated world.");
   245    UTF8LineIndex unterminated_index(unterminated_file);
   246    CheckRoundTrips(unterminated_index);
   247  
   248    EXPECT_EQ("", unterminated_index.GetLine(0));
   249    EXPECT_EQ("Hello world.\n", unterminated_index.GetLine(1));
   250    EXPECT_EQ("Goodbye, unterminated world.", unterminated_index.GetLine(2));
   251    EXPECT_EQ("", unterminated_index.GetLine(3));
   252  }
   253  
   254  TEST(UTF8LineIndexTest, GetLineFromSmallFile) {
   255    const std::string small_file("\nline two\nline three\r\n");
   256    UTF8LineIndex small_index(small_file);
   257    CheckRoundTrips(small_index);
   258  
   259    EXPECT_EQ("", small_index.GetLine(-1));
   260    EXPECT_EQ("", small_index.GetLine(0));
   261    EXPECT_EQ("\n", small_index.GetLine(1));
   262    EXPECT_EQ("line two\n", small_index.GetLine(2));
   263    EXPECT_EQ("line three\r\n", small_index.GetLine(3));
   264    EXPECT_EQ("", small_index.GetLine(4));
   265    EXPECT_EQ("", small_index.GetLine(5));
   266    EXPECT_EQ("", small_index.GetLine(999));
   267  }
   268  
   269  TEST(UTF8LineIndexTest, ComputeByteOffsetAtEndOfUnterminatedFile) {
   270    // This is a regression test; migrating from ::string storage to using a
   271    // string_view means that peeking past the end of the buffer isn't allowed
   272    // anymore, and this aborted in ComputeByteOffset previously.
   273    const std::string unterminated_file(
   274        "Hello world.\n"
   275        "Goodbye, unterminated world.");
   276    UTF8LineIndex index(unterminated_file);
   277    EXPECT_EQ(unterminated_file.length(),
   278              index.ComputeByteOffset(2, strlen("Goodbye, unterminated world.")));
   279  }
   280  
   281  }  // anonymous namespace