github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/arrowserde/schema.fbs (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  //   http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing,
    12  // software distributed under the License is distributed on an
    13  // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    14  // KIND, either express or implied.  See the License for the
    15  // specific language governing permissions and limitations
    16  // under the License.
    17  
    18  /// Logical types, vector layouts, and schemas
    19  
    20  namespace org.apache.arrow.flatbuf;
    21  
    22  enum MetadataVersion:short {  // short-backed; no explicit values, so members are numbered in declaration order
    23    /// Version 0.1.0
    24    V1,
    25  
    26    /// Version 0.2.0
    27    V2,
    28  
    29    /// Versions 0.3.0 through 0.7.1
    30    V3,
    31  
    32    /// Versions >= 0.8.0
    33    V4
    34  }
    35  
    36  /// These are stored in the flatbuffer in the Type union below
    37  
    38  table Null {  // declares no fields: this type carries no parameters
    39  }
    40  
    41  /// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
    42  /// (according to the physical memory layout). We use Struct_ here because
    43  /// Struct is a reserved word in FlatBuffers.
    44  table Struct_ {  // declares no fields of its own
    45  }
    46  
    47  table List {  // declares no fields of its own
    48  }
    49  
    50  table FixedSizeList {  // list type parameterized by a fixed item count
    51    /// Number of list items per value
    52    listSize: int;
    53  }
    54  
    55  /// A Map is a logical nested type that is represented as
    56  ///
    57  /// List<entry: Struct<key: K, value: V>>
    58  ///
    59  /// In this layout, the keys and values are each respectively contiguous. We do
    60  /// not constrain the key and value types, so the application is responsible
    61  /// for ensuring that the keys are hashable and unique. Whether the keys are sorted
    62  /// may be set in the metadata for this field.
    63  ///
    64  /// In a Field with Map type, the Field has a child Struct field, which then
    65  /// has two children: the first is the key type and the second is the value
    66  /// type. The names of the child fields may be respectively "entry", "key",
    67  /// and "value", but this is not enforced.
    68  ///
    69  /// Map
    70  ///   - child[0] entry: Struct
    71  ///     - child[0] key: K
    72  ///     - child[1] value: V
    73  ///
    74  /// Neither the "entry" field nor the "key" field may be nullable.
    75  ///
    76  /// The metadata is structured so that Arrow systems without special handling
    77  /// for Map can make Map an alias for List. The "layout" attribute for the Map
    78  /// field must have the same contents as a List.
    79  table Map {
    80    /// Set to true if the keys within each value are sorted
    81    keysSorted: bool;
    82  }
    83  
    84  enum UnionMode:short { Sparse, Dense }  // short-backed; used by Union.mode
    85  
    86  /// A union is a complex type with children in Field.
    87  /// By default, ids in the type vector refer to the offsets in the children.
    88  /// Optionally, typeIds provides an indirection between the child offset and the type id:
    89  /// for each child, typeIds[offset] is the id used in the type vector.
    90  table Union {
    91    mode: UnionMode;
    92    typeIds: [ int ]; // optional; describes the type id of each child
    93  }
    94  
    95  table Int {  // integer type described by a bit width and a signedness flag
    96    bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
    97    is_signed: bool; // true for signed integers
    98  }
    99  
   100  enum Precision:short {HALF, SINGLE, DOUBLE}  // short-backed; used by FloatingPoint.precision
   101  
   102  table FloatingPoint {  // floating-point type parameterized by its precision
   103    precision: Precision;
   104  }
   105  
   106  /// Unicode text with UTF-8 encoding
   107  table Utf8 {  // declares no fields of its own
   108  }
   109  
   110  table Binary {  // declares no fields of its own
   111  }
   112  
   113  table FixedSizeBinary {  // binary type parameterized by a fixed byte width
   114    /// Number of bytes per value
   115    byteWidth: int;
   116  }
   117  
   118  table Bool {  // declares no fields of its own
   119  }
   120  
   121  table Decimal {  // decimal type parameterized by precision and scale
   122    /// Total number of decimal digits
   123    precision: int;
   124    /// Number of digits after the decimal point "."
   125    scale: int;
   126  }
   127  
   128  enum DateUnit: short {  // short-backed; used by Date.unit
   129    DAY,
   130    MILLISECOND
   131  }
   132  
   133  /// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
   134  /// epoch (1970-01-01), stored in either of two units:
   135  ///
   136  /// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
   137  ///   leap seconds), where the values are evenly divisible by 86400000
   138  /// * Days (32 bits) since the UNIX epoch
   139  table Date {
   140    unit: DateUnit = MILLISECOND;  // declared default is MILLISECOND
   141  }
   142  
   143  enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }  // short-backed; used by Time.unit and Timestamp.unit
   144  
   145  /// Time type. The physical storage type depends on the unit:
   146  /// - SECOND and MILLISECOND: 32 bits
   147  /// - MICROSECOND and NANOSECOND: 64 bits
   148  table Time {
   149    unit: TimeUnit = MILLISECOND;  // declared default is MILLISECOND
   150    bitWidth: int = 32;  // declared default 32 matches the width listed above for the default unit
   151  }
   152  
   153  /// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding
   154  /// leap seconds, as a 64-bit integer. Note that UNIX time does not include
   155  /// leap seconds.
   156  ///
   157  /// The Timestamp metadata supports both "time zone naive" and "time zone
   158  /// aware" timestamps. Read about the timezone attribute for more detail.
   159  table Timestamp {
   160    unit: TimeUnit;  // no default is declared for this field, unlike Time.unit
   161  
   162    /// The time zone is a string indicating the name of a time zone, one of:
   163    ///
   164    /// * As used in the Olson time zone database (the "tz database" or
   165    ///   "tzdata"), such as "America/New_York"
   166    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
   167    ///
   168    /// Whether a timezone string is present indicates different semantics about
   169    /// the data:
   170    ///
   171    /// * If the time zone is null or equal to an empty string, the data is "time
   172    ///   zone naive" and shall be displayed *as is* to the user, not localized
   173    ///   to the locale of the user. This data can be thought of as UTC but
   174    ///   without having "UTC" as the time zone, it is not considered to be
   175    ///   localized to any time zone
   176    ///
   177    /// * If the time zone is set to a valid value, values can be displayed as
   178    ///   "localized" to that time zone, even though the underlying 64-bit
   179    ///   integers are identical to the same data stored in UTC. Converting
   180    ///   between time zones is a metadata-only operation and does not change the
   181    ///   underlying values
   182    timezone: string;
   183  }
   184  
   185  enum IntervalUnit: short { YEAR_MONTH, DAY_TIME}  // short-backed; used by Interval.unit
   186  table Interval {  // interval type parameterized by its unit (YEAR_MONTH or DAY_TIME)
   187    unit: IntervalUnit;
   188  }
   189  
   190  /// ----------------------------------------------------------------------
   191  /// Top-level Type value, enabling extensible type-specific metadata. We can
   192  /// add new logical types to Type without breaking backwards compatibility
   193  
   194  union Type {  // members get implicit positions from declaration order, so do not reorder them
   195    Null,
   196    Int,
   197    FloatingPoint,
   198    Binary,
   199    Utf8,
   200    Bool,
   201    Decimal,
   202    Date,
   203    Time,
   204    Timestamp,
   205    Interval,
   206    List,
   207    Struct_,
   208    Union,
   209    FixedSizeBinary,
   210    FixedSizeList,
   211    Map
   212  }
   213  
   214  /// ----------------------------------------------------------------------
   215  /// user defined key value pairs to add custom metadata to arrow
   216  /// key namespacing is the responsibility of the user
   217  
   218  table KeyValue {  // one custom-metadata entry; key and value are both strings
   219    key: string;
   220    value: string;
   221  }
   222  
   223  /// ----------------------------------------------------------------------
   224  /// Dictionary encoding metadata
   225  
   226  table DictionaryEncoding {
   227    /// The known dictionary id in the application where this data is used. In
   228    /// the file or streaming formats, the dictionary ids are found in the
   229    /// DictionaryBatch messages
   230    id: long;
   231  
   232    /// The dictionary indices are constrained to be non-negative integers. If
   233    /// this field is null, the indices must be signed int32
   234    indexType: Int;
   235  
   236    /// By default, dictionaries are not ordered, or the order does not have
   237    /// semantic meaning. In some statistical applications, dictionary-encoding
   238    /// is used to represent ordered categorical data, and we provide a way to
   239    /// preserve that metadata here
   240    isOrdered: bool;
   241  }
   242  
   243  /// ----------------------------------------------------------------------
   244  /// A field represents a named column in a record / row batch or child of a
   245  /// nested type.
   246  ///
   247  /// - children is only for nested Arrow arrays
   248  /// - For primitive types, children will have length 0
   249  /// - nullable should default to true in general
   250  
   251  table Field {
   252    // Name is not required, e.g. in a List
   253    name: string;
   254    nullable: bool;  // no default is declared here, so FlatBuffers' implicit bool default (false) applies
   255    // This is the type of the decoded value if the field is dictionary encoded
   256    type: Type;
   257  
   258    // Present only if the field is dictionary encoded
   259    dictionary: DictionaryEncoding;
   260  
   261    // Children apply only to nested data types like Struct, List and Union
   262    children: [Field];
   263  
   264    // User-defined metadata
   265    custom_metadata: [ KeyValue ];
   266  }
   267  
   268  /// ----------------------------------------------------------------------
   269  /// Endianness of the platform producing the data
   270  
   271  enum Endianness:short { Little, Big }  // short-backed; used by Schema.endianness
   272  
   273  /// ----------------------------------------------------------------------
   274  /// A Buffer represents a single contiguous memory segment
   275  struct Buffer {
   276    /// The relative offset into the shared memory page where the bytes for this
   277    /// buffer start
   278    offset: long;
   279  
   280    /// The absolute length (in bytes) of the memory buffer. The memory is found
   281    /// from offset (inclusive) to offset + length (exclusive).
   282    length: long;
   283  }
   284  
   285  /// ----------------------------------------------------------------------
   286  /// A Schema describes the columns in a row batch
   287  
   288  table Schema {
   289  
   290    /// Endianness of the buffer.
   291    /// It is Little Endian by default.
   292    /// If endianness doesn't match the underlying system then the vectors need to be converted.
   293    endianness: Endianness=Little;
   294  
   295    fields: [Field];  // the Field entries this schema describes
   296    // User-defined metadata
   297    custom_metadata: [ KeyValue ];
   298  }
   299  
   300  root_type Schema;  // declares Schema as the root table for buffers built from this file