kythe.io@v0.0.68-0.20240422202219-7225dbc01741/kythe/proto/entryset.proto (about)

     1  /*
     2   * Copyright 2017 The Kythe Authors. All rights reserved.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *   http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  syntax = "proto3";
    18  
    19  package kythe.storage;
    20  
    21  option go_package = "kythe.io/kythe/proto/entryset_go_proto";
    22  
    23  // An EntrySet represents a compact collection of entries.
    24  // Compaction is achieved by encoding node names and string-valued data as
    25  // table offsets, and by sorting and prefix-encoding all the strings.
    26  //
    27  // Even without more sophisticated compression this provides a fairly large
    28  // savings over fully-separate entries, even when encoded with protobuf
    29  // wire-format overhead.
    30  //
    31  // The format is somewhat expensive to construct, but not asymptotically bad,
    32  // and decoding is both simpler and less memory-intensive than encoding.
    33  message EntrySet {
    34    // The order of these fields reflects the standard ordering for vnames.
    35    // Each of the fields is an index into the symbol table.
    36    message Node {
    37      int32 corpus = 1;     // symbol
    38      int32 language = 2;   // symbol
    39      int32 path = 3;       // symbol
    40      int32 root = 4;       // symbol
    41      int32 signature = 5;  // symbol
    42    }
    43  
    44    // TODO(fromberger): The standard ordering is actually defined in terms of
    45    // tickets rather than vnames, from back when entries carried tickets.  This
    46    // definition uses the same field ordering, so as long as the ticket is in
    47    // canonical form I think it should be equivalent.
    48  
    49    // One entry for each unique node named in the entry set.  The index of a
    50    // node in this field is its id.
    51    //
    52    // If the nodes are stored in canonical vname order, the EntrySet is also
    53    // said to be in canonical order. However, an EntrySet is valid whether or
    54    // not this applies.
    55    repeated Node nodes = 1;
    56  
    57    message Fact {  // a single name/value pair, both given as symbols
    58      int32 name = 1;   // symbol
    59      int32 value = 2;  // symbol
    60    }
    61    message FactGroup {  // all the facts belonging to a single node
    62      repeated Fact facts = 1;
    63    }
    64  
    65    // One entry for each node in the entry set. The index of a group in this
    66    // field matches the id of its corresponding node.
    67    repeated FactGroup fact_groups = 2;
    68  
    69    message Edge {  // one edge; its source node is implied by the group index
    70      int32 kind = 1;    // symbol
    71      int32 target = 2;  // node id (index into nodes)
    72    }
    73    message EdgeGroup {  // all the edges sharing a single source node
    74      repeated Edge edges = 1;
    75    }
    76  
    77    // One entry for each node in the entry set. The index of a group in this
    78    // field matches the id of its corresponding source node.
    79    repeated EdgeGroup edge_groups = 3;
    80  
    81    message String {  // one prefix-coded entry of the symbol table
    82      int32 prefix = 1;  // length of common prefix with predecessor (expanded)
    83      bytes suffix = 2;  // the unshared suffix
    84    }
    85  
    86    // A prefix-coded table of all the symbols referenced by the messages above.
    87    // The entries in this field are lexicographically ordered.  The string table
    88    // always implicitly contains the empty string as its first entry, but it is
    89    // not represented explicitly in the message.
    90    repeated String symbols = 4;
    91  }
    92  
    93  // TODO(fromberger): Some additional tricks to consider.
    94  //
    95  // Static dictionaries for common schema types (fact names, edge kinds). This
    96  // makes ordering harder, though, and doesn't seem to save much; amortized
    97  // over many entry sets it might help more.
    98  //
    99  // Separate blobs for large values, e.g., files. This also makes ordering a
   100  // little harder, and large blobs are rarely repeated.