var (
	// versionRx matches the version portion of a created_by string.
	//
	// Format: major . minor . patch unknown - prerelease.x + build info
	// Eg: 1.5.0ab-cdh5.5.0+cd
	//
	// Capture groups: 1 major, 2 minor, 3 patch, 4 unknown (whatever directly
	// follows the patch number), 5 pre-release (after '-') and 6 build info
	// (after '+').
	versionRx = regexp.MustCompile(`^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$`)

	// applicationRx matches a complete created_by string.
	//
	// Format: application_name version VERSION_FORMAT (build build_name)
	// Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
	//
	// Capture groups: 1 application name, 2 everything from the "version"
	// keyword onwards, 3 the version text and 4 the build name.
	//
	// The "version" keyword only counts when it is preceded by whitespace.
	// Without that requirement the lazy application-name group stops at the
	// first "version" substring, so a name such as "subversion" would be cut
	// down to "sub" and the remainder would fail to parse as a version.
	// A string without the keyword matches through the trailing `\s*` branch.
	applicationRx = regexp.MustCompile(`^(.*?)(?:\s+(version\s*(?:([^(]*?)\s*(?:\(\s*build\s*([^)]*?)\s*\))?)?)|\s*)$`)

	// Parquet816FixedVersion is the version used for fixing PARQUET-816
	// that changed the padding calculations for dictionary headers on row groups.
	Parquet816FixedVersion = NewAppVersionExplicit("parquet-mr", 1, 2, 9)

	// parquet251FixedVersion is the parquet-mr version that the PARQUET-251
	// check in HasCorrectStatistics compares against.
	parquet251FixedVersion = NewAppVersionExplicit("parquet-mr", 1, 8, 0)

	// parquetCPPFixedStatsVersion and parquetMRFixedStatsVersion are the
	// parquet-cpp and parquet-mr versions from which stats are computed
	// correctly for all types except decimal.
	parquetCPPFixedStatsVersion = NewAppVersionExplicit("parquet-cpp", 1, 3, 0)
	parquetMRFixedStatsVersion  = NewAppVersionExplicit("parquet-mr", 1, 10, 0)

	// parquet1655FixedVersion is the version used for fixing PARQUET-1655
	// which fixed min/max stats comparisons for Decimal types
	parquet1655FixedVersion = NewAppVersionExplicit("parquet-cpp-arrow", 4, 0, 0)
)

// AppVersion represents a specific application version either read from
// or written to a parquet file.
type AppVersion struct {
	// App is the application name, e.g. "parquet-mr".
	App string
	// Build is the build name taken from a "(build ...)" clause, if any.
	Build string
	// Version holds the parsed version. NewAppVersion leaves every field at
	// its zero value when no major.minor.patch version could be parsed.
	Version struct {
		Major int
		Minor int
		Patch int
		// Unknown is any text that directly follows the patch number.
		Unknown string
		// PreRelease is the text that follows '-'.
		PreRelease string
		// BuildInfo is the text that follows '+'.
		BuildInfo string
	}
}

// NewAppVersionExplicit is a convenience function to construct a specific
// application version from the given app string and version.
//
// The app string is stored exactly as given; Build and the textual version
// fields are left empty.
func NewAppVersionExplicit(app string, major, minor, patch int) *AppVersion {
	v := &AppVersion{App: app}
	v.Version.Major, v.Version.Minor, v.Version.Patch = major, minor, patch
	return v
}

// NewAppVersion parses a "created by" string such as
// "parquet-go version 1.0.0".
//
// It also supports handling pre-releases and build info such as
//
//	parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
//
// The string is lower-cased before it is parsed, so App and Build are always
// lower case. The "version" keyword is required for a version to be
// recognised: without it everything up to any trailing whitespace becomes App
// and the version stays 0.0.0. The version also stays 0.0.0 when the text
// after the keyword does not start with major.minor.patch. If the string does
// not match the expected layout at all, App is set to "unknown".
func NewAppVersion(createdby string) *AppVersion {
	v := &AppVersion{}

	// FindStringSubmatch yields nil when nothing matches and otherwise the
	// whole match followed by the four capture groups, so m[4] is only valid
	// when at least five entries came back.
	m := applicationRx.FindStringSubmatch(strings.ToLower(createdby))
	if len(m) < 5 {
		v.App = "unknown"
		return v
	}
	v.App = m[1]
	v.Build = m[4]

	ver := versionRx.FindStringSubmatch(m[3])
	if len(ver) < 7 {
		// no major.minor.patch found: leave the version zeroed
		return v
	}

	// versionRx only captures runs of decimal digits for these groups, so Atoi
	// can fail solely with a range error, in which case it returns the largest
	// int. That saturated value is simply stored, hence the ignored errors.
	v.Version.Major, _ = strconv.Atoi(ver[1])
	v.Version.Minor, _ = strconv.Atoi(ver[2])
	v.Version.Patch, _ = strconv.Atoi(ver[3])
	v.Version.Unknown = ver[4]
	v.Version.PreRelease = ver[5]
	v.Version.BuildInfo = ver[6]
	return v
}

// LessThan compares the app versions and returns true if this version
// is "less than" the passed version.
//
// If the apps don't match, this always returns false. Otherwise it compares
// the major versions first, then the minor versions, and finally the patch
// versions.
//
// Pre-release and build info are not considered.
func (v AppVersion) LessThan(other *AppVersion) bool {
	if v.App != other.App {
		return false
	}
	// the first component that differs decides the ordering
	if v.Version.Major != other.Version.Major {
		return v.Version.Major < other.Version.Major
	}
	if v.Version.Minor != other.Version.Minor {
		return v.Version.Minor < other.Version.Minor
	}
	return v.Version.Patch < other.Version.Patch
}

// Equal only compares the Application and major/minor/patch versions.
//
// Pre-release and build info are not considered.
131 func (v AppVersion) Equal(other *AppVersion) bool { 132 return v.App == other.App && 133 v.Version.Major == other.Version.Major && 134 v.Version.Minor == other.Version.Minor && 135 v.Version.Patch == other.Version.Patch 136 } 137 138 // HasCorrectStatistics checks whether or not the statistics are valid to be used 139 // based on the primitive type and the version since previous versions had issues with 140 // properly computing stats. 141 // 142 // Reference: parquet-cpp/src/parquet/metadata.cc 143 // 144 // PARQUET-686 has more discussion on statistics 145 func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schema.LogicalType, stats EncodedStatistics, sort schema.SortOrder) bool { 146 // parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed correctly for all types except decimal 147 if (v.App == "parquet-cpp" && v.LessThan(parquetCPPFixedStatsVersion)) || 148 (v.App == "parquet-mr" && v.LessThan(parquetMRFixedStatsVersion)) { 149 // only SIGNED are valid unless max and min are the same (in which case the sort order doesn't matter) 150 var maxEqualsMin bool 151 if stats.HasMin && stats.HasMax { 152 maxEqualsMin = string(stats.Min) == string(stats.Max) 153 } 154 if sort != schema.SortSIGNED && !maxEqualsMin { 155 return false 156 } 157 158 if coltype != parquet.Types.FixedLenByteArray && coltype != parquet.Types.ByteArray { 159 return true 160 } 161 } 162 163 // parquet-cpp-arrow version 4.0.0 fixed Decimal comparisons for creating min/max stats 164 // parquet-cpp also becomes parquet-cpp-arrow as of version 4.0.0 165 if v.App == "parquet-cpp" || (v.App == "parquet-cpp-arrow" && v.LessThan(parquet1655FixedVersion)) { 166 if _, ok := logicalType.(*schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray { 167 return false 168 } 169 } 170 171 // created_by is not populated, which could have been caused by 172 // parquet-mr during the same time as PARQUET-251, see PARQUET-297 173 if v.App == "unknown" 
{ 174 return true 175 } 176 177 // unknown sort order has incorrect stats 178 if sort == schema.SortUNKNOWN { 179 return false 180 } 181 182 // PARQUET-251 183 return !v.LessThan(parquet251FixedVersion) 184 }