// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Extracted from:
// github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/col/colserde/arrowserde/schema.fbs

/// Logical types, vector layouts, and schemas

namespace org.apache.arrow.flatbuf;

/// Version of the metadata format. Each value is annotated with the release
/// range it corresponds to.
enum MetadataVersion:short {
  /// 0.1.0
  V1,

  /// 0.2.0
  V2,

  /// 0.3.0 -> 0.7.1
  V3,

  /// >= 0.8.0
  V4
}

/// The tables below are stored in the flatbuffer through the Type union
/// declared further down in this file.

/// Null type. The table is empty: it carries no type-specific parameters.
table Null {
}

/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
/// (according to the physical memory layout). The name Struct_ is used here
/// because Struct is a reserved word in Flatbuffers.
table Struct_ {
}

/// List type. The table is empty: it carries no type-specific parameters.
table List {
}

/// List type in which every value holds the same number of items.
table FixedSizeList {
  /// Number of list items per value
  listSize: int;
}

/// A Map is a logical nested type that is represented as
///
/// List<entry: Struct<key: K, value: V>>
///
/// In this layout, the keys and values are each respectively contiguous. We do
/// not constrain the key and value types, so the application is responsible
/// for ensuring that the keys are hashable and unique. Whether the keys are
/// sorted may be set in the metadata for this field (see keysSorted).
///
/// In a Field with Map type, the Field has a child Struct field, which in turn
/// has two children: the first is the key type and the second is the value
/// type. The names of the child fields may be respectively "entry", "key", and
/// "value", but this is not enforced.
///
/// Map
///   - child[0] entry: Struct
///     - child[0] key: K
///     - child[1] value: V
///
/// Neither the "entry" field nor the "key" field may be nullable.
///
/// The metadata is structured so that Arrow systems without special handling
/// for Map can make Map an alias for List. The "layout" attribute for the Map
/// field must have the same contents as a List.
table Map {
  /// Set to true if the keys within each value are sorted
  keysSorted: bool;
}

/// Mode selector for the Union type (see Union.mode).
enum UnionMode:short { Sparse, Dense }

/// A union is a complex type with children in Field.
/// By default, ids in the type vector refer to the offsets in the children.
/// Optionally, typeIds provides an indirection between the child offset and
/// the type id: for each child, typeIds[offset] is the id used in the type
/// vector.
table Union {
  mode: UnionMode;
  typeIds: [ int ]; // optional, describes typeid of each child.
}

/// Integer type, described by its bit width and signedness.
table Int {
  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
  is_signed: bool;
}

/// Precision selector for the FloatingPoint type.
enum Precision:short {HALF, SINGLE, DOUBLE}

/// Floating point type of the given precision.
table FloatingPoint {
  precision: Precision;
}

/// Unicode with UTF-8 encoding
table Utf8 {
}

/// Binary type. The table is empty: it carries no type-specific parameters
/// (contrast with FixedSizeBinary, which declares a byteWidth).
table Binary {
}

/// Binary type in which every value has the same number of bytes.
table FixedSizeBinary {
  /// Number of bytes per value
  byteWidth: int;
}

/// Boolean type. The table is empty: it carries no type-specific parameters.
table Bool {
}

/// Decimal type, described by its precision and scale.
table Decimal {
  /// Total number of decimal digits
  precision: int;
  /// Number of digits after the decimal point "."
  scale: int;
}

/// Unit in which a Date value is expressed.
enum DateUnit: short {
  DAY,
  MILLISECOND
}

/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
/// epoch (1970-01-01), stored in either of two units:
///
/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
///   leap seconds), where the values are evenly divisible by 86400000
/// * Days (32 bits) since the UNIX epoch
///
/// The unit defaults to MILLISECOND.
table Date {
  unit: DateUnit = MILLISECOND;
}

/// Unit in which Time and Timestamp values are expressed.
enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }

/// Time type. The physical storage type depends on the unit
/// - SECOND and MILLISECOND: 32 bits
/// - MICROSECOND and NANOSECOND: 64 bits
///
/// The declared defaults (MILLISECOND, 32 bits) are consistent with that rule.
table Time {
  unit: TimeUnit = MILLISECOND;
  bitWidth: int = 32;
}

/// Time elapsed from the Unix epoch, 00:00:00.000 on 1 January 1970, excluding
/// leap seconds, as a 64-bit integer. Note that UNIX time does not include
/// leap seconds.
///
/// The Timestamp metadata supports both "time zone naive" and "time zone
/// aware" timestamps. Read about the timezone attribute for more detail.
table Timestamp {
  unit: TimeUnit;

  /// The time zone is a string indicating the name of a time zone, one of:
  ///
  /// * As used in the Olson time zone database (the "tz database" or
  ///   "tzdata"), such as "America/New_York"
  /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
  ///
  /// Whether a timezone string is present indicates different semantics about
  /// the data:
  ///
  /// * If the time zone is null or equal to an empty string, the data is "time
  ///   zone naive" and shall be displayed *as is* to the user, not localized
  ///   to the locale of the user. This data can be thought of as UTC but,
  ///   without having "UTC" as the time zone, it is not considered to be
  ///   localized to any time zone
  ///
  /// * If the time zone is set to a valid value, values can be displayed as
  ///   "localized" to that time zone, even though the underlying 64-bit
  ///   integers are identical to the same data stored in UTC. Converting
  ///   between time zones is a metadata-only operation and does not change the
  ///   underlying values
  timezone: string;
}

/// Unit in which an Interval value is expressed.
enum IntervalUnit: short { YEAR_MONTH, DAY_TIME}
/// Interval type expressed in the given IntervalUnit.
table Interval {
  unit: IntervalUnit;
}

/// ----------------------------------------------------------------------
/// Top-level Type value, enabling extensible type-specific metadata. We can
/// add new logical types to Type without breaking backwards compatibility

// NOTE(review): presumably new members must only be appended at the end of
// this union so that the type codes of the existing members stay stable --
// confirm against the FlatBuffers schema evolution rules before reordering.
union Type {
  Null,
  Int,
  FloatingPoint,
  Binary,
  Utf8,
  Bool,
  Decimal,
  Date,
  Time,
  Timestamp,
  Interval,
  List,
  Struct_,
  Union,
  FixedSizeBinary,
  FixedSizeList,
  Map
}

/// ----------------------------------------------------------------------
/// User-defined key/value pairs used to add custom metadata to Arrow.
/// Key namespacing is the responsibility of the user.

table KeyValue {
  key: string;
  value: string;
}

/// ----------------------------------------------------------------------
/// Dictionary encoding metadata

table DictionaryEncoding {
  /// The known dictionary id in the application where this data is used. In
  /// the file or streaming formats, the dictionary ids are found in the
  /// DictionaryBatch messages
  id: long;

  // NOTE(review): "positive" below presumably means non-negative (an index of
  // 0 being valid) -- confirm against the Arrow columnar format specification.
  /// The dictionary indices are constrained to be positive integers. If this
  /// field is null, the indices must be signed int32
  indexType: Int;

  /// By default, dictionaries are not ordered, or the order does not have
  /// semantic meaning. In some statistical applications, dictionary-encoding
  /// is used to represent ordered categorical data, and we provide a way to
  /// preserve that metadata here
  isOrdered: bool;
}

/// ----------------------------------------------------------------------
/// A field represents a named column in a record / row batch or child of a
/// nested type.
///
/// - children is only for nested Arrow arrays
/// - For primitive types, children will have length 0
/// - nullable should default to true in general

table Field {
  // Name is not required, e.g. for the child of a List
  name: string;
  // Note: this schema declares no explicit default value for nullable.
  nullable: bool;
  // This is the type of the decoded value if the field is dictionary encoded
  type: Type;

  // Present only if the field is dictionary encoded
  dictionary: DictionaryEncoding;

  // children apply only to Nested data types like Struct, List and Union
  children: [Field];

  // User-defined metadata
  custom_metadata: [ KeyValue ];
}

/// ----------------------------------------------------------------------
/// Endianness of the platform producing the data

enum Endianness:short { Little, Big }

/// ----------------------------------------------------------------------
/// A Buffer represents a single contiguous memory segment
struct Buffer {
  /// The relative offset into the shared memory page where the bytes for this
  /// buffer start
  offset: long;

  /// The absolute length (in bytes) of the memory buffer. The memory is found
  /// from offset (inclusive) to offset + length (non-inclusive).
  length: long;
}

/// ----------------------------------------------------------------------
/// A Schema describes the columns in a row batch

table Schema {

  /// Endianness of the buffer.
  /// It is Little Endian by default.
  /// If the endianness doesn't match the underlying system, then the vectors
  /// need to be converted.
  endianness: Endianness=Little;

  fields: [Field];
  // User-defined metadata
  custom_metadata: [ KeyValue ];
}

// Schema is declared as the root type of this file.
root_type Schema;