github.com/apache/arrow/go/v14@v14.0.2/parquet/metadata/app_version.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package metadata
    18  
    19  import (
    20  	"regexp"
    21  	"strconv"
    22  	"strings"
    23  
    24  	"github.com/apache/arrow/go/v14/parquet"
    25  	"github.com/apache/arrow/go/v14/parquet/schema"
    26  )
    27  
var (
	// Regular expression for the version format
	// major . minor . patch unknown - prerelease.x + build info
	// Eg: 1.5.0ab-cdh5.5.0+cd
	//
	// Capture groups: 1 = major, 2 = minor, 3 = patch, 4 = any text that
	// directly follows the patch number (up to the first '-' or '+'),
	// 5 = pre-release (after '-', up to '+'), 6 = build info (after '+').
	versionRx = regexp.MustCompile(`^(\d+)\.(\d+)\.(\d+)([^-+]*)?(?:-([^+]*))?(?:\+(.*))?$`)
	// Regular expression for the application format
	// application_name version VERSION_FORMAT (build build_name)
	// Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
	//
	// Capture groups: 1 = application name, 2 = the whole optional
	// "version ..." clause, 3 = the version string (fed to versionRx by
	// NewAppVersion), 4 = the build name inside "(build ...)".
	applicationRx = regexp.MustCompile(`^(.*?)\s*(?:(version\s*(?:([^(]*?)\s*(?:\(\s*build\s*([^)]*?)\s*\))?)?)?)$`)

	// Parquet816FixedVersion is the version used for fixing PARQUET-816
	// that changed the padding calculations for dictionary headers on row groups.
	Parquet816FixedVersion      = NewAppVersionExplicit("parquet-mr", 1, 2, 9)
	parquet251FixedVersion      = NewAppVersionExplicit("parquet-mr", 1, 8, 0)  // PARQUET-251; last check in HasCorrectStatistics
	parquetCPPFixedStatsVersion = NewAppVersionExplicit("parquet-cpp", 1, 3, 0) // parquet-cpp stats trusted from here on (decimal excepted)
	parquetMRFixedStatsVersion  = NewAppVersionExplicit("parquet-mr", 1, 10, 0) // parquet-mr stats trusted from here on (decimal excepted)
	// parquet1655FixedVersion is the version used for fixing PARQUET-1655
	// which fixed min/max stats comparisons for Decimal types
	parquet1655FixedVersion = NewAppVersionExplicit("parquet-cpp-arrow", 4, 0, 0)
)
    48  
// AppVersion represents a specific application version either read from
// or written to a parquet file.
//
// Values are built either by NewAppVersion, which parses a "created by"
// string, or by NewAppVersionExplicit, which sets only the application name
// and the three numeric version components.
type AppVersion struct {
	App     string // application name, e.g. "parquet-mr"; NewAppVersion uses "unknown" when the string does not match
	Build   string // build name taken from a "(build ...)" suffix, if present
	Version struct {
		Major      int    // first numeric component
		Minor      int    // second numeric component
		Patch      int    // third numeric component
		Unknown    string // text directly after the patch number, before any '-' or '+'
		PreRelease string // text following '-', up to an optional '+'
		BuildInfo  string // text following '+'
	}
}
    63  
    64  // NewAppVersionExplicit is a convenience function to construct a specific
    65  // application version from the given app string and version
    66  func NewAppVersionExplicit(app string, major, minor, patch int) *AppVersion {
    67  	v := &AppVersion{App: app}
    68  	v.Version.Major = major
    69  	v.Version.Minor = minor
    70  	v.Version.Patch = patch
    71  	return v
    72  }
    73  
    74  // NewAppVersion parses a "created by" string such as "parquet-go 1.0.0".
    75  //
    76  // It also supports handling pre-releases and build info such as
    77  // 	parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
    78  func NewAppVersion(createdby string) *AppVersion {
    79  	v := &AppVersion{}
    80  
    81  	var ver []string
    82  
    83  	m := applicationRx.FindStringSubmatch(strings.ToLower(createdby))
    84  	if len(m) >= 4 {
    85  		v.App = m[1]
    86  		v.Build = m[4]
    87  		ver = versionRx.FindStringSubmatch(m[3])
    88  	} else {
    89  		v.App = "unknown"
    90  	}
    91  
    92  	if len(ver) >= 7 {
    93  		v.Version.Major, _ = strconv.Atoi(ver[1])
    94  		v.Version.Minor, _ = strconv.Atoi(ver[2])
    95  		v.Version.Patch, _ = strconv.Atoi(ver[3])
    96  		v.Version.Unknown = ver[4]
    97  		v.Version.PreRelease = ver[5]
    98  		v.Version.BuildInfo = ver[6]
    99  	}
   100  	return v
   101  }
   102  
   103  // LessThan compares the app versions and returns true if this version
   104  // is "less than" the passed version.
   105  //
   106  // If the apps don't match, this always returns false. Otherwise it compares
   107  // the major versions first, then the minor versions, and finally the patch
   108  // versions.
   109  //
   110  // Pre-release and build info are not considered.
   111  func (v AppVersion) LessThan(other *AppVersion) bool {
   112  	switch {
   113  	case v.App != other.App:
   114  		return false
   115  	case v.Version.Major < other.Version.Major:
   116  		return true
   117  	case v.Version.Major > other.Version.Major:
   118  		return false
   119  	case v.Version.Minor < other.Version.Minor:
   120  		return true
   121  	case v.Version.Minor > other.Version.Minor:
   122  		return false
   123  	}
   124  
   125  	return v.Version.Patch < other.Version.Patch
   126  }
   127  
   128  // Equal only compares the Application and major/minor/patch versions.
   129  //
   130  // Pre-release and build info are not considered.
   131  func (v AppVersion) Equal(other *AppVersion) bool {
   132  	return v.App == other.App &&
   133  		v.Version.Major == other.Version.Major &&
   134  		v.Version.Minor == other.Version.Minor &&
   135  		v.Version.Patch == other.Version.Patch
   136  }
   137  
   138  // HasCorrectStatistics checks whether or not the statistics are valid to be used
   139  // based on the primitive type and the version since previous versions had issues with
   140  // properly computing stats.
   141  //
   142  // Reference: parquet-cpp/src/parquet/metadata.cc
   143  //
   144  // PARQUET-686 has more discussion on statistics
   145  func (v AppVersion) HasCorrectStatistics(coltype parquet.Type, logicalType schema.LogicalType, stats EncodedStatistics, sort schema.SortOrder) bool {
   146  	// parquet-cpp version 1.3.0 and parquet-mr 1.10.0 onwards stats are computed correctly for all types except decimal
   147  	if (v.App == "parquet-cpp" && v.LessThan(parquetCPPFixedStatsVersion)) ||
   148  		(v.App == "parquet-mr" && v.LessThan(parquetMRFixedStatsVersion)) {
   149  		// only SIGNED are valid unless max and min are the same (in which case the sort order doesn't matter)
   150  		var maxEqualsMin bool
   151  		if stats.HasMin && stats.HasMax {
   152  			maxEqualsMin = string(stats.Min) == string(stats.Max)
   153  		}
   154  		if sort != schema.SortSIGNED && !maxEqualsMin {
   155  			return false
   156  		}
   157  
   158  		if coltype != parquet.Types.FixedLenByteArray && coltype != parquet.Types.ByteArray {
   159  			return true
   160  		}
   161  	}
   162  
   163  	// parquet-cpp-arrow version 4.0.0 fixed Decimal comparisons for creating min/max stats
   164  	// parquet-cpp also becomes parquet-cpp-arrow as of version 4.0.0
   165  	if v.App == "parquet-cpp" || (v.App == "parquet-cpp-arrow" && v.LessThan(parquet1655FixedVersion)) {
   166  		if _, ok := logicalType.(*schema.DecimalLogicalType); ok && coltype == parquet.Types.FixedLenByteArray {
   167  			return false
   168  		}
   169  	}
   170  
   171  	// created_by is not populated, which could have been caused by
   172  	// parquet-mr during the same time as PARQUET-251, see PARQUET-297
   173  	if v.App == "unknown" {
   174  		return true
   175  	}
   176  
   177  	// unknown sort order has incorrect stats
   178  	if sort == schema.SortUNKNOWN {
   179  		return false
   180  	}
   181  
   182  	// PARQUET-251
   183  	return !v.LessThan(parquet251FixedVersion)
   184  }