/*
 * Copyright 2017 The Kythe Authors. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

syntax = "proto3";

package kythe.storage;

option go_package = "kythe.io/kythe/proto/entryset_go_proto";

// An EntrySet represents a compact collection of entries.
// Compaction is achieved by encoding node names and string-valued data as
// table offsets, and by sorting and prefix-encoding all the strings.
//
// Even without more sophisticated compression this provides a fairly large
// savings over fully-separate entries, even when encoded with protobuf
// wire-format overhead.
//
// The format is somewhat expensive to construct, but not asymptotically bad,
// and decoding is both simpler and less memory-intensive than encoding.
message EntrySet {
  // A node name (vname) whose components are stored as symbol references.
  //
  // The order of these fields reflects the standard ordering for vnames.
  // Each of the fields is an index into the symbol table.
  message Node {
    // Symbol table index of the corpus.
    int32 corpus = 1;

    // Symbol table index of the language.
    int32 language = 2;

    // Symbol table index of the path.
    int32 path = 3;

    // Symbol table index of the root.
    int32 root = 4;

    // Symbol table index of the signature.
    int32 signature = 5;
  }

  // TODO(fromberger): The standard ordering is actually defined in terms of
  // tickets rather than vnames, from back when entries carried tickets. This
  // definition uses the same field ordering so as long as the ticket is in
  // canonical form I think it should be equivalent.

  // One entry for each unique node named in the entry set. The index of a
  // node in this field is its id.
  //
  // If the nodes are stored in canonical vname order, the EntrySet is also
  // said to be in canonical order. However, an EntrySet is valid whether or
  // not this applies.
  repeated Node nodes = 1;

  // A single fact, with both its name and its value stored as symbol
  // references.
  message Fact {
    // Symbol table index of the fact name.
    int32 name = 1;

    // Symbol table index of the fact value.
    int32 value = 2;
  }

  // The facts belonging to a single node.
  message FactGroup {
    repeated Fact facts = 1;
  }

  // One entry for each node in the entry set. The index of a group in this
  // field matches the id of its corresponding node.
  repeated FactGroup fact_groups = 2;

  // A single outbound edge. The source node is not stored here; it is implied
  // by the position of the enclosing EdgeGroup in edge_groups.
  message Edge {
    // Symbol table index of the edge kind.
    int32 kind = 1;

    // Id of the target node, i.e., its index in the nodes field.
    int32 target = 2;
  }

  // The edges that share a single source node.
  message EdgeGroup {
    repeated Edge edges = 1;
  }

  // One entry for each node in the entry set. The index of a group in this
  // field matches the id of its corresponding source node.
  repeated EdgeGroup edge_groups = 3;

  // One prefix-coded entry of the symbol table.
  message String {
    // Length of the common prefix with the predecessor (expanded).
    int32 prefix = 1;

    // The unshared suffix.
    bytes suffix = 2;
  }

  // A prefix-coded table of all the symbols referenced by the messages above.
  // The entries in this field are lexicographically ordered. The string table
  // always implicitly contains the empty string as its first entry, but it is
  // not represented explicitly in the message.
  //
  // NOTE(review): because of the implicit first entry, symbol index 0
  // presumably denotes the empty string and symbols[i] corresponds to symbol
  // index i+1 -- confirm against the encoder/decoder.
  repeated String symbols = 4;
}

// TODO(fromberger): Some additional tricks to consider.
//
// Static dictionaries for common schema types (fact names, edge kinds). This
// makes ordering harder, though, and doesn't seem to save much, though
// amortized over many edge sets it might help more.
//
// Separate blobs for large values, e.g., files. This also makes ordering a
// little harder, and large blobs are rarely repeated.