kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/cxx/extractor/bazel_artifact_selector.cc (about)

     1  /*
     2   * Copyright 2020 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  #include "kythe/cxx/extractor/bazel_artifact_selector.h"
    17  
    18  #include <cstddef>
    19  #include <cstdint>
    20  #include <functional>
    21  #include <optional>
    22  #include <string>
    23  #include <tuple>
    24  #include <type_traits>
    25  #include <utility>
    26  #include <vector>
    27  
    28  #include "absl/base/attributes.h"
    29  #include "absl/container/flat_hash_map.h"
    30  #include "absl/container/flat_hash_set.h"
    31  #include "absl/log/check.h"
    32  #include "absl/log/die_if_null.h"
    33  #include "absl/log/log.h"
    34  #include "absl/status/status.h"
    35  #include "absl/strings/ascii.h"
    36  #include "absl/strings/escaping.h"
    37  #include "absl/strings/numbers.h"
    38  #include "absl/strings/str_cat.h"
    39  #include "absl/strings/str_join.h"
    40  #include "absl/strings/string_view.h"
    41  #include "absl/types/span.h"
    42  #include "google/protobuf/any.pb.h"
    43  #include "kythe/cxx/extractor/bazel_artifact.h"
    44  #include "kythe/proto/bazel_artifact_selector.pb.h"
    45  #include "kythe/proto/bazel_artifact_selector_v2.pb.h"
    46  #include "re2/re2.h"
    47  #include "third_party/bazel/src/main/java/com/google/devtools/build/lib/buildeventstream/proto/build_event_stream.pb.h"
    48  
    49  namespace kythe {
    50  namespace {
    51  
    52  std::optional<std::string> ToUri(const build_event_stream::File& file) {
    53    switch (file.file_case()) {
    54      case build_event_stream::File::kUri:
    55        return file.uri();
    56      case build_event_stream::File::kContents:
    57        // We expect inline data to be rare and small, so always base64 encode it.
    58        return absl::StrCat(
    59            "data:base64,",
    60            // data URIs use regular base64, not "web safe" base64.
    61            absl::Base64Escape(file.contents()));
    62      case build_event_stream::File::kSymlinkTargetPath:
    63        return std::nullopt;
    64      default:
    65        break;
    66    }
    67    LOG(ERROR) << "Unexpected build_event_stream::File case!" << file.file_case();
    68    return std::nullopt;
    69  }
    70  
    71  std::string ToLocalPath(const build_event_stream::File& file) {
    72    std::vector<std::string> parts(file.path_prefix().begin(),
    73                                   file.path_prefix().end());
    74    parts.push_back(file.name());
    75    return absl::StrJoin(parts, "/");
    76  }
    77  
    78  std::optional<BazelArtifactFile> ToBazelArtifactFile(
    79      const build_event_stream::File& file, const RegexSet& allowlist) {
    80    if (!allowlist.Match(file.name())) {
    81      return std::nullopt;
    82    }
    83    std::optional<std::string> uri = ToUri(file);
    84    if (!uri.has_value()) return std::nullopt;
    85    return BazelArtifactFile{
    86        .local_path = ToLocalPath(file),
    87        .uri = *std::move(uri),
    88    };
    89  }
    90  
    91  template <typename T>
    92  T& GetOrConstruct(std::optional<T>& value) {
    93    return value.has_value() ? *value : value.emplace();
    94  }
    95  
    96  template <typename T>
    97  const T& AsConstRef(const T& value) {
    98    return value;
    99  }
   100  
   101  template <typename T>
   102  const T& AsConstRef(const T* value) {
   103    return *value;
   104  }
   105  
   106  template <typename T, typename U>
   107  absl::Status DeserializeInternal(T& selector, const U& container) {
   108    absl::Status error;
   109    for (const auto& any : container) {
   110      switch (auto status = selector.DeserializeFrom(AsConstRef(any));
   111              status.code()) {
   112        case absl::StatusCode::kOk:
   113        case absl::StatusCode::kUnimplemented:
   114          return absl::OkStatus();
   115        case absl::StatusCode::kInvalidArgument:
   116          return status;
   117        case absl::StatusCode::kFailedPrecondition:
   118          error = status;
   119          continue;
   120        default:
   121          error = status;
   122          LOG(WARNING) << "Unrecognized status code: " << status;
   123      }
   124    }
   125    return error.ok() ? absl::NotFoundError("No state found")
   126                      : absl::NotFoundError(
   127                            absl::StrCat("No state found: ", error.ToString()));
   128  }
   129  bool StrictAtoI(absl::string_view value, int64_t* out) {
   130    if (value == "0") {
   131      *out = 0;
   132      return true;
   133    }
   134    if (value.empty() || value.front() == '0') {
   135      // We need to ignore leading zeros as they don't contribute to the integral
   136      // value.
   137      return false;
   138    }
   139    for (char ch : value) {
   140      if (!absl::ascii_isdigit(ch)) {
   141        return false;
   142      }
   143    }
   144    return absl::SimpleAtoi(value, out);
   145  }
   146  }  // namespace
   147  
   148  absl::Status BazelArtifactSelector::Deserialize(
   149      absl::Span<const google::protobuf::Any> state) {
   150    return DeserializeInternal(*this, state);
   151  }
   152  
   153  absl::Status BazelArtifactSelector::Deserialize(
   154      absl::Span<const google::protobuf::Any* const> state) {
   155    return DeserializeInternal(*this, state);
   156  }
   157  
   158  std::optional<BazelArtifact> AspectArtifactSelector::Select(
   159      const build_event_stream::BuildEvent& event) {
   160    std::optional<BazelArtifact> result = std::nullopt;
   161    if (event.id().has_named_set()) {
   162      result =
   163          SelectFileSet(event.id().named_set().id(), event.named_set_of_files());
   164    } else if (event.id().has_target_completed()) {
   165      result =
   166          SelectTargetCompleted(event.id().target_completed(), event.completed());
   167    }
   168    if (event.last_message()) {
   169      state_ = {};
   170    }
   171    return result;
   172  }
   173  
   174  class AspectArtifactSelectorSerializationHelper {
   175   public:
   176    using FileId = AspectArtifactSelector::FileId;
   177    using ProtoFile = ::kythe::proto::BazelAspectArtifactSelectorStateV2::File;
   178    using FileSet = AspectArtifactSelector::FileSet;
   179    using ProtoFileSet =
   180        ::kythe::proto::BazelAspectArtifactSelectorStateV2::FileSet;
   181    using FileSetId = AspectArtifactSelector::FileSetId;
   182    using State = AspectArtifactSelector::State;
   183  
   184    static bool SerializeInto(
   185        const State& state,
   186        kythe::proto::BazelAspectArtifactSelectorStateV2& result) {
   187      return Serializer(&state, result).Serialize();
   188    }
   189  
   190    static absl::Status DeserializeFrom(
   191        const kythe::proto::BazelAspectArtifactSelectorStateV2& state,
   192        State& result) {
   193      return Deserializer(&state, result).Deserialize();
   194    }
   195  
   196   private:
   197    class Serializer {
   198     public:
   199      explicit Serializer(const State* state ABSL_ATTRIBUTE_LIFETIME_BOUND,
   200                          kythe::proto::BazelAspectArtifactSelectorStateV2& result
   201                              ABSL_ATTRIBUTE_LIFETIME_BOUND)
   202          : state_(*ABSL_DIE_IF_NULL(state)), result_(result) {}
   203  
   204      bool Serialize() {
   205        for (const auto& [id, file_set] : state_.file_sets.file_sets()) {
   206          SerializeFileSet(id, file_set);
   207        }
   208        for (FileSetId id : state_.file_sets.disposed()) {
   209          SerializeDisposed(id);
   210        }
   211        for (const auto& [id, target] : state_.pending) {
   212          SerializePending(id, target);
   213        }
   214        return true;
   215      }
   216  
   217     private:
   218      static int64_t ToSerializationId(FileSetId id, size_t other) {
   219        if (const auto [unpacked] = id; unpacked >= 0) {
   220          return unpacked;
   221        }
   222        // 0 is reserved for the integral ids, so start at -1.
   223        return -1 - static_cast<int64_t>(other);
   224      }
   225  
   226      int64_t SerializeFileSetId(FileSetId id) {
   227        auto [iter, inserted] = set_id_map_.try_emplace(
   228            id, ToSerializationId(id, result_.file_set_ids().size()));
   229        if (inserted && iter->second < 0) {
   230          result_.add_file_set_ids(state_.file_sets.ToString(id));
   231        }
   232        return iter->second;
   233      }
   234  
   235      void SerializeFileSet(FileSetId id, const FileSet& file_set) {
   236        auto& entry = (*result_.mutable_file_sets())[SerializeFileSetId(id)];
   237        for (FileId file_id : file_set.files) {
   238          if (std::optional<uint64_t> index = SerializeFile(file_id)) {
   239            entry.add_files(*index);
   240          }
   241        }
   242        for (FileSetId child_id : file_set.file_sets) {
   243          entry.add_file_sets(SerializeFileSetId(child_id));
   244        }
   245      }
   246  
   247      std::optional<uint64_t> SerializeFile(FileId id) {
   248        const BazelArtifactFile* file = state_.files.Find(id);
   249        if (file == nullptr) {
   250          LOG(INFO) << "Omitting extracted FileId from serialization: "
   251                    << std::get<0>(id);
   252          // FileSets may still reference files which have already been selected.
   253          // If so, don't keep them when serializing.
   254          return std::nullopt;
   255        }
   256        auto [iter, inserted] =
   257            file_id_map_.try_emplace(id, result_.files().size());
   258        if (!inserted) {
   259          return iter->second;
   260        }
   261  
   262        auto* entry = result_.add_files();
   263        entry->set_local_path(file->local_path);
   264        entry->set_uri(file->uri);
   265        return iter->second;
   266      }
   267  
   268      void SerializeDisposed(FileSetId id) {
   269        result_.add_disposed(SerializeFileSetId(id));
   270      }
   271  
   272      void SerializePending(FileSetId id, absl::string_view target) {
   273        (*result_.mutable_pending())[SerializeFileSetId(id)] = target;
   274      }
   275  
   276      const State& state_;
   277      kythe::proto::BazelAspectArtifactSelectorStateV2& result_;
   278  
   279      absl::flat_hash_map<FileId, uint64_t> file_id_map_;
   280      absl::flat_hash_map<FileSetId, int64_t> set_id_map_;
   281    };
   282  
   283    class Deserializer {
   284     public:
   285      explicit Deserializer(
   286          const kythe::proto::BazelAspectArtifactSelectorStateV2* state
   287              ABSL_ATTRIBUTE_LIFETIME_BOUND,
   288          State& result ABSL_ATTRIBUTE_LIFETIME_BOUND)
   289          : state_(*ABSL_DIE_IF_NULL(state)), result_(result) {}
   290  
   291      absl::Status Deserialize() {
   292        // First, deserialize all of the disposed sets to help check consistency
   293        // during the rest of deserialization.
   294        for (int64_t id : state_.disposed()) {
   295          absl::StatusOr<FileSetId> real_id = DeserializeFileSetId(id);
   296          if (!real_id.ok()) return real_id.status();
   297          result_.file_sets.Dispose(*real_id);
   298        }
   299        {
   300          // Then check the file_set_ids list for uniqueness:
   301          absl::flat_hash_set<std::string> non_integer_ids(
   302              state_.file_set_ids().begin(), state_.file_set_ids().end());
   303          if (non_integer_ids.size() != state_.file_set_ids().size()) {
   304            return absl::InvalidArgumentError("Inconsistent file_set_ids map");
   305          }
   306        }
   307  
   308        for (const auto& [id, file_set] : state_.file_sets()) {
   309          // Ensure pending and live file sets are distinct.
   310          if (state_.pending().contains(id)) {
   311            return absl::InvalidArgumentError(
   312                absl::StrCat("FileSet ", id, " is both pending and live"));
   313          }
   314          absl::Status status = DeserializeFileSet(id, file_set);
   315          if (!status.ok()) return status;
   316        }
   317        for (const auto& [id, target] : state_.pending()) {
   318          absl::Status status = DeserializePending(id, target);
   319          if (!status.ok()) return status;
   320        }
   321        return absl::OkStatus();
   322      }
   323  
   324     private:
   325      static constexpr FileSetId kDummy{0};
   326  
   327      static absl::StatusOr<std::string> ToDeserializationId(
   328          const kythe::proto::BazelAspectArtifactSelectorStateV2& state,
   329          int64_t id) {
   330        if (id < 0) {
   331          // Normalize the -1 based index.
   332          size_t index = -(id + 1);
   333          if (index > state.file_set_ids().size()) {
   334            return absl::InvalidArgumentError(absl::StrCat(
   335                "Non-integral FileSetId index out of range: ", index));
   336          }
   337          return state.file_set_ids(index);
   338        }
   339        return absl::StrCat(id);
   340      }
   341  
   342      absl::StatusOr<FileSetId> DeserializeFileSetId(int64_t id) {
   343        auto [iter, inserted] = set_id_map_.try_emplace(id, kDummy);
   344        if (inserted) {
   345          absl::StatusOr<std::string> string_id = ToDeserializationId(state_, id);
   346          if (!string_id.ok()) return string_id.status();
   347  
   348          std::optional<FileSetId> file_set_id =
   349              result_.file_sets.InternUnlessDisposed(*string_id);
   350          if (!file_set_id.has_value()) {
   351            return absl::InvalidArgumentError(
   352                "Encountered disposed FileSetId during deserialization");
   353          }
   354          iter->second = *file_set_id;
   355        }
   356        return iter->second;
   357      }
   358  
   359      absl::Status DeserializeFileSet(int64_t id, const ProtoFileSet& file_set) {
   360        absl::StatusOr<FileSetId> file_set_id = DeserializeFileSetId(id);
   361        if (!file_set_id.ok()) return file_set_id.status();
   362  
   363        FileSet result_set;
   364        for (uint64_t file_id : file_set.files()) {
   365          absl::StatusOr<FileId> real_id = DeserializeFile(file_id);
   366          if (!real_id.ok()) return real_id.status();
   367  
   368          result_set.files.push_back(*real_id);
   369        }
   370        for (int64_t child_id : file_set.file_sets()) {
   371          if (!(state_.file_sets().contains(child_id) ||
   372                state_.pending().contains(child_id))) {
   373            // Ensure internal consistency.
   374            return absl::InvalidArgumentError(absl::StrCat(
   375                "Child FileSetId is neither live nor pending: ", id));
   376          }
   377  
   378          absl::StatusOr<FileSetId> real_id = DeserializeFileSetId(child_id);
   379          if (!real_id.ok()) return real_id.status();
   380  
   381          result_set.file_sets.push_back(*real_id);
   382        }
   383        if (!result_.file_sets.InsertUnlessDisposed(*file_set_id,
   384                                                    std::move(result_set))) {
   385          return absl::InvalidArgumentError(
   386              absl::StrCat("FileSetId both disposed and live: ", id));
   387        }
   388        return absl::OkStatus();
   389      }
   390  
   391      absl::StatusOr<FileId> DeserializeFile(uint64_t id) {
   392        if (id > state_.files_size()) {
   393          return absl::InvalidArgumentError(
   394              absl::StrCat("File index out of range: ", id));
   395        }
   396        return result_.files.Insert(BazelArtifactFile{
   397            .local_path = state_.files(id).local_path(),
   398            .uri = state_.files(id).uri(),
   399        });
   400      }
   401  
   402      absl::Status DeserializePending(int64_t id, absl::string_view target) {
   403        absl::StatusOr<FileSetId> real_id = DeserializeFileSetId(id);
   404        if (!real_id.ok()) return real_id.status();
   405  
   406        result_.pending.try_emplace(*real_id, target);
   407        return absl::OkStatus();
   408      }
   409  
   410      const kythe::proto::BazelAspectArtifactSelectorStateV2& state_;
   411      State& result_;
   412  
   413      absl::flat_hash_map<int64_t, FileSetId> set_id_map_;
   414    };
   415  };
   416  
   417  bool AspectArtifactSelector::SerializeInto(google::protobuf::Any& state) const {
   418    switch (options_.serialization_format) {
   419      case AspectArtifactSelectorSerializationFormat::kV2: {
   420        kythe::proto::BazelAspectArtifactSelectorStateV2 raw;
   421        if (!AspectArtifactSelectorSerializationHelper::SerializeInto(state_,
   422                                                                      raw)) {
   423          return false;
   424        }
   425        state.PackFrom(raw);
   426        return true;
   427      }
   428      case AspectArtifactSelectorSerializationFormat::kV1: {
   429        kythe::proto::BazelAspectArtifactSelectorState raw;
   430        for (FileSetId id : state_.file_sets.disposed()) {
   431          raw.add_disposed(state_.file_sets.ToString(id));
   432        }
   433        for (const auto& [id, target] : state_.pending) {
   434          (*raw.mutable_pending())[state_.file_sets.ToString(id)] = target;
   435        }
   436        for (const auto& [id, file_set] : state_.file_sets.file_sets()) {
   437          auto& entry = (*raw.mutable_filesets())[state_.file_sets.ToString(id)];
   438          for (FileSetId child_id : file_set.file_sets) {
   439            entry.add_file_sets()->set_id(state_.file_sets.ToString(child_id));
   440          }
   441          for (FileId file_id : file_set.files) {
   442            const BazelArtifactFile* file = state_.files.Find(file_id);
   443            if (file == nullptr) continue;
   444  
   445            auto* file_entry = entry.add_files();
   446            file_entry->set_name(file->local_path);
   447            file_entry->set_uri(file->uri);
   448          }
   449        }
   450        state.PackFrom(raw);
   451        return true;
   452      }
   453    }
   454    return false;
   455  }
   456  
   457  absl::Status AspectArtifactSelector::DeserializeFrom(
   458      const google::protobuf::Any& state) {
   459    if (auto raw = kythe::proto::BazelAspectArtifactSelectorStateV2();
   460        state.UnpackTo(&raw)) {
   461      state_ = {};
   462      return AspectArtifactSelectorSerializationHelper::DeserializeFrom(raw,
   463                                                                        state_);
   464    } else if (state.Is<kythe::proto::BazelAspectArtifactSelectorStateV2>()) {
   465      return absl::InvalidArgumentError(
   466          "Malformed kythe.proto.BazelAspectArtifactSelectorStateV2");
   467    }
   468    if (auto raw = kythe::proto::BazelAspectArtifactSelectorState();
   469        state.UnpackTo(&raw)) {
   470      state_ = {};
   471      for (const auto& id : raw.disposed()) {
   472        if (std::optional<FileSetId> file_set_id =
   473                state_.file_sets.InternUnlessDisposed(id)) {
   474          state_.file_sets.Dispose(*file_set_id);
   475        }
   476      }
   477      for (const auto& [id, target] : raw.pending()) {
   478        if (std::optional<FileSetId> file_set_id =
   479                state_.file_sets.InternUnlessDisposed(id)) {
   480          state_.pending.try_emplace(*file_set_id, target);
   481        }
   482      }
   483      for (const auto& [id, file_set] : raw.filesets()) {
   484        if (std::optional<FileSetId> file_set_id =
   485                state_.file_sets.InternUnlessDisposed(id)) {
   486          InsertFileSet(*file_set_id, file_set);
   487        }
   488      }
   489      return absl::OkStatus();
   490    } else if (state.Is<kythe::proto::BazelAspectArtifactSelectorState>()) {
   491      return absl::InvalidArgumentError(
   492          "Malformed kythe.proto.BazelAspectArtifactSelectorState");
   493    }
   494    return absl::FailedPreconditionError(
   495        "State not of type kythe.proto.BazelAspectArtifactSelectorState");
   496  }
   497  
   498  AspectArtifactSelector::FileTable::FileTable(const FileTable& other)
   499      : next_id_(other.next_id_),
   500        file_map_(other.file_map_),
   501        id_map_(file_map_.size()) {
   502    for (const auto& [file, entry] : file_map_) {
   503      id_map_.insert_or_assign(entry.id, &file);
   504    }
   505  }
   506  
   507  AspectArtifactSelector::FileTable& AspectArtifactSelector::FileTable::operator=(
   508      const FileTable& other) {
   509    next_id_ = other.next_id_;
   510    file_map_ = other.file_map_;
   511    id_map_.clear();
   512    for (const auto& [file, entry] : file_map_) {
   513      id_map_.insert_or_assign(entry.id, &file);
   514    }
   515    return *this;
   516  }
   517  
   518  AspectArtifactSelector::FileId AspectArtifactSelector::FileTable::Insert(
   519      BazelArtifactFile file) {
   520    auto [iter, inserted] = file_map_.emplace(
   521        std::move(file), Entry{.id = FileId(next_id_), .count = 1});
   522    if (inserted) {
   523      next_id_++;
   524      id_map_[iter->second.id] = &iter->first;
   525    } else {
   526      iter->second.count++;
   527    }
   528    return iter->second.id;
   529  }
   530  
   531  BazelArtifactFile AspectArtifactSelector::FileTable::ExtractIterators(
   532      IdMap::iterator id_iter, FileMap::iterator file_iter) {
   533    CHECK(id_iter != id_map_.end());
   534    CHECK(file_iter != file_map_.end());
   535    if (--file_iter->second.count == 0) {
   536      // Only remove the file once it's been extracted for each FileSet which
   537      // references it.
   538      id_map_.erase(id_iter);
   539      return std::move(file_map_.extract(file_iter).key());
   540    }
   541    return file_iter->first;
   542  }
   543  
   544  std::optional<BazelArtifactFile> AspectArtifactSelector::FileTable::Extract(
   545      FileId id) {
   546    auto id_iter = id_map_.find(id);
   547    if (id_iter == id_map_.end()) {
   548      return std::nullopt;
   549    }
   550    // file_map_ owns the memory underlying the pointer we dereferenced here.
   551    // If it's missing from the map, we're well into UB trouble.
   552    return ExtractIterators(id_iter, file_map_.find(*id_iter->second));
   553  }
   554  
   555  BazelArtifactFile AspectArtifactSelector::FileTable::ExtractFile(
   556      BazelArtifactFile file) {
   557    auto file_iter = file_map_.find(file);
   558    if (file_iter == file_map_.end()) {
   559      return file;
   560    }
   561    // If the file id is missing from id_map_, something has gone horribly wrong
   562    // with our invariants.
   563    return ExtractIterators(id_map_.find(file_iter->second.id), file_iter);
   564  }
   565  
   566  const BazelArtifactFile* AspectArtifactSelector::FileTable::Find(
   567      FileId id) const {
   568    auto iter = id_map_.find(id);
   569    if (iter == id_map_.end()) {
   570      return nullptr;
   571    }
   572    return iter->second;
   573  }
   574  
   575  std::optional<AspectArtifactSelector::FileSetId>
   576  AspectArtifactSelector::FileSetTable::InternUnlessDisposed(
   577      absl::string_view id) {
   578    auto [result, inserted] = InternOrCreate(id);
   579    if (!inserted && disposed_.contains(result)) {
   580      return std::nullopt;
   581    }
   582    return result;
   583  }
   584  
   585  std::pair<AspectArtifactSelector::FileSetId, bool>
   586  AspectArtifactSelector::FileSetTable::InternOrCreate(absl::string_view id) {
   587    int64_t token;
   588    if (StrictAtoI(id, &token)) {
   589      return {{token}, false};
   590    }
   591    auto [iter, inserted] = id_map_.try_emplace(id, std::make_tuple(next_id_));
   592    if (inserted) {
   593      next_id_--;  // Non-integral ids are mapped to negative values.
   594      inverse_id_map_.try_emplace(iter->second, iter->first);
   595    }
   596    return {{iter->second}, inserted};
   597  }
   598  
   599  bool AspectArtifactSelector::FileSetTable::InsertUnlessDisposed(
   600      FileSetId id, FileSet file_set) {
   601    if (disposed_.contains(id)) {
   602      return false;
   603    }
   604    file_sets_.insert_or_assign(id, std::move(file_set));
   605    return true;  // A false return indicates the set has already been disposed.
   606  }
   607  
   608  std::optional<AspectArtifactSelector::FileSet>
   609  AspectArtifactSelector::FileSetTable::ExtractAndDispose(FileSetId id) {
   610    if (auto node = file_sets_.extract(id); !node.empty()) {
   611      disposed_.insert(id);
   612      return std::move(node.mapped());
   613    }
   614    return std::nullopt;
   615  }
   616  
   617  void AspectArtifactSelector::FileSetTable::Dispose(FileSetId id) {
   618    disposed_.insert(id);
   619    file_sets_.erase(id);
   620  }
   621  
   622  bool AspectArtifactSelector::FileSetTable::Disposed(FileSetId id) {
   623    return disposed_.contains(id);
   624  }
   625  
   626  std::string AspectArtifactSelector::FileSetTable::ToString(FileSetId id) const {
   627    if (const auto [unpacked] = id; unpacked >= 0) {
   628      return absl::StrCat(unpacked);
   629    }
   630    return inverse_id_map_.at(id);
   631  }
   632  
   633  std::optional<BazelArtifact> AspectArtifactSelector::SelectFileSet(
   634      absl::string_view id, const build_event_stream::NamedSetOfFiles& fileset) {
   635    std::optional<FileSetId> file_set_id = InternUnlessDisposed(id);
   636    if (!file_set_id.has_value()) {
   637      // Already disposed, skip.
   638      return std::nullopt;
   639    }
   640    // This was a pending file set, select it directly.
   641    if (auto node = state_.pending.extract(*file_set_id); !node.empty()) {
   642      state_.file_sets.Dispose(*file_set_id);
   643      BazelArtifact result = {.label = node.mapped()};
   644      for (const auto& file : fileset.files()) {
   645        if (std::optional<BazelArtifactFile> artifact_file =
   646                ToBazelArtifactFile(file, options_.file_name_allowlist)) {
   647          result.files.push_back(
   648              state_.files.ExtractFile(*std::move(artifact_file)));
   649        }
   650      }
   651      for (const auto& child : fileset.file_sets()) {
   652        if (std::optional<FileSetId> child_id =
   653                InternUnlessDisposed(child.id())) {
   654          ExtractFilesInto(*child_id, result.label, &result.files);
   655        }
   656      }
   657      return result;
   658    }
   659    InsertFileSet(*file_set_id, fileset);
   660    return std::nullopt;
   661  }
   662  
   663  std::optional<BazelArtifact> AspectArtifactSelector::SelectTargetCompleted(
   664      const build_event_stream::BuildEventId::TargetCompletedId& id,
   665      const build_event_stream::TargetComplete& payload) {
   666    BazelArtifact result = {
   667        .label = id.label(),
   668    };
   669    const auto& [selected, unselected] = PartitionFileSets(id, payload);
   670    for (FileSetId file_set_id : selected) {
   671      ExtractFilesInto(file_set_id, result.label, &result.files);
   672    }
   673    if (options_.dispose_unselected_output_groups) {
   674      for (FileSetId file_set_id : unselected) {
   675        ExtractFilesInto(file_set_id, result.label, nullptr);
   676      }
   677    }
   678    if (!result.files.empty()) {
   679      return result;
   680    }
   681    return std::nullopt;
   682  }
   683  
   684  AspectArtifactSelector::PartitionFileSetsResult
   685  AspectArtifactSelector::PartitionFileSets(
   686      const build_event_stream::BuildEventId::TargetCompletedId& id,
   687      const build_event_stream::TargetComplete& payload) {
   688    PartitionFileSetsResult result;
   689    bool id_match = options_.target_aspect_allowlist.Match(id.aspect());
   690    for (const auto& output_group : payload.output_group()) {
   691      auto& output =
   692          (id_match && options_.output_group_allowlist.Match(output_group.name()))
   693              ? result.selected
   694              : result.unselected;
   695      for (const auto& fileset : output_group.file_sets()) {
   696        if (std::optional<FileSetId> file_set_id =
   697                InternUnlessDisposed(fileset.id())) {
   698          output.push_back(*file_set_id);
   699        }
   700      }
   701    }
   702    return result;
   703  }
   704  
   705  void AspectArtifactSelector::ExtractFilesInto(
   706      FileSetId id, absl::string_view target,
   707      std::vector<BazelArtifactFile>* files) {
   708    if (state_.file_sets.Disposed(id)) {
   709      return;
   710    }
   711  
   712    std::optional<FileSet> file_set = state_.file_sets.ExtractAndDispose(id);
   713    if (!file_set.has_value()) {
   714      // Files where requested, but we haven't disposed that filesets id yet.
   715      // Record this for future processing.
   716      LOG(INFO) << "NamedSetOfFiles " << state_.file_sets.ToString(id)
   717                << " requested by " << target << " but not yet disposed.";
   718      if (files != nullptr) {
   719        // Only retain pending file sets if they would've been saved.
   720        state_.pending.emplace(id, target);
   721      } else if (state_.pending.find(id) == state_.pending.end()) {
   722        // But still prefer to retain pending file sets.
   723        state_.file_sets.Dispose(id);
   724      }
   725      return;
   726    }
   727  
   728    for (FileId file_id : file_set->files) {
   729      if (std::optional<BazelArtifactFile> file = state_.files.Extract(file_id);
   730          file.has_value() && files != nullptr) {
   731        files->push_back(*std::move(file));
   732      }
   733    }
   734    for (FileSetId child_id : file_set->file_sets) {
   735      ExtractFilesInto(child_id, target, files);
   736    }
   737  }
   738  
   739  void AspectArtifactSelector::InsertFileSet(
   740      FileSetId id, const build_event_stream::NamedSetOfFiles& fileset) {
   741    std::optional<FileSet> file_set;
   742    for (const auto& file : fileset.files()) {
   743      if (std::optional<BazelArtifactFile> artifact_file =
   744              ToBazelArtifactFile(file, options_.file_name_allowlist)) {
   745        FileId file_id = state_.files.Insert(*std::move(artifact_file));
   746        GetOrConstruct(file_set).files.push_back(file_id);
   747      }
   748    }
   749    for (const auto& child : fileset.file_sets()) {
   750      if (std::optional<FileSetId> child_id = InternUnlessDisposed(child.id())) {
   751        GetOrConstruct(file_set).file_sets.push_back(*child_id);
   752      }
   753    }
   754    if (file_set.has_value()) {
   755      state_.file_sets.InsertUnlessDisposed(id, *std::move(file_set));
   756    } else {
   757      // Nothing to do with this fileset, mark it disposed.
   758      state_.file_sets.Dispose(id);
   759    }
   760  }
   761  
   762  ExtraActionSelector::ExtraActionSelector(
   763      absl::flat_hash_set<std::string> action_types)
   764      : action_matches_([action_types = std::move(action_types)](
   765                            absl::string_view action_type) {
   766          return action_types.empty() || action_types.contains(action_type);
   767        }) {}
   768  
   769  ExtraActionSelector::ExtraActionSelector(const RE2* action_pattern)
   770      : action_matches_([action_pattern](absl::string_view action_type) {
   771          if (action_pattern == nullptr || action_pattern->pattern().empty()) {
   772            return false;
   773          }
   774          return RE2::FullMatch(action_type, *action_pattern);
   775        }) {
   776    CHECK(action_pattern == nullptr || action_pattern->ok())
   777        << "ExtraActionSelector requires a valid pattern: "
   778        << action_pattern->error();
   779  }
   780  
   781  std::optional<BazelArtifact> ExtraActionSelector::Select(
   782      const build_event_stream::BuildEvent& event) {
   783    if (event.id().has_action_completed() && event.action().success() &&
   784        action_matches_(event.action().type())) {
   785      if (std::optional<std::string> uri =
   786              ToUri(event.action().primary_output())) {
   787        return BazelArtifact{
   788            .label = event.id().action_completed().label(),
   789            .files = {{
   790                .local_path = event.id().action_completed().primary_output(),
   791                .uri = *std::move(uri),
   792            }},
   793        };
   794      }
   795    }
   796    return std::nullopt;
   797  }
   798  
   799  }  // namespace kythe