github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/parsespec_internal_test.go (about)

     1  // Package dsort provides distributed massively parallel resharding for very large datasets.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dsort
     6  
     7  import (
     8  	"errors"
     9  	"math"
    10  	"strings"
    11  
    12  	"github.com/NVIDIA/aistore/api/apc"
    13  	"github.com/NVIDIA/aistore/cmn"
    14  	"github.com/NVIDIA/aistore/cmn/archive"
    15  	"github.com/NVIDIA/aistore/cmn/cos"
    16  	"github.com/NVIDIA/aistore/fs"
    17  	. "github.com/onsi/ginkgo/v2"
    18  	. "github.com/onsi/gomega"
    19  )
    20  
    21  var _ = Describe("RequestSpec", func() {
    22  	BeforeEach(func() {
    23  		fs.TestNew(nil)
    24  
    25  		config := cmn.GCO.BeginUpdate()
    26  		config.Dsort.DefaultMaxMemUsage = "90%"
    27  		cmn.GCO.CommitUpdate(config)
    28  	})
    29  
    30  	Context("requests specs which should pass", func() {
    31  		It("should parse minimal spec", func() {
    32  			rs := RequestSpec{
    33  				InputBck:        cmn.Bck{Name: "test"},
    34  				InputExtension:  archive.ExtTar,
    35  				InputFormat:     newInputFormat("prefix-{0010..0111..2}-suffix"),
    36  				OutputFormat:    "prefix-{10..111}-suffix",
    37  				OutputShardSize: "10KB",
    38  				MaxMemUsage:     "80%",
    39  				Algorithm:       Algorithm{Kind: None},
    40  			}
    41  			pars, err := rs.parse()
    42  			Expect(err).ShouldNot(HaveOccurred())
    43  
    44  			Expect(pars.InputBck.Name).To(Equal("test"))
    45  			Expect(pars.InputBck.Provider).To(Equal(apc.AIS))
    46  			Expect(pars.OutputBck.Name).To(Equal("test"))
    47  			Expect(pars.OutputBck.Provider).To(Equal(apc.AIS))
    48  			Expect(pars.InputExtension).To(Equal(archive.ExtTar))
    49  
    50  			Expect(pars.Pit.Template).To(Equal(cos.ParsedTemplate{
    51  				Prefix: "prefix-",
    52  				Ranges: []cos.TemplateRange{{
    53  					Start:      10,
    54  					End:        111,
    55  					Step:       2,
    56  					DigitCount: 4,
    57  					Gap:        "-suffix",
    58  				}},
    59  			}))
    60  
    61  			Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{
    62  				Prefix: "prefix-",
    63  				Ranges: []cos.TemplateRange{{
    64  					Start:      10,
    65  					End:        111,
    66  					Step:       1,
    67  					DigitCount: 2,
    68  					Gap:        "-suffix",
    69  				}},
    70  			}))
    71  
    72  			Expect(pars.OutputShardSize).To(BeEquivalentTo(10 * cos.KiB))
    73  
    74  			Expect(pars.MaxMemUsage.Type).To(Equal(cos.QuantityPercent))
    75  			Expect(pars.MaxMemUsage.Value).To(BeEquivalentTo(80))
    76  		})
    77  
    78  		It("should set buckets correctly", func() {
    79  			rs := RequestSpec{
    80  				InputBck:        cmn.Bck{Provider: apc.AWS, Name: "test"},
    81  				OutputBck:       cmn.Bck{Provider: apc.AWS, Name: "testing"},
    82  				InputExtension:  archive.ExtTar,
    83  				InputFormat:     newInputFormat("prefix-{0010..0111..2}-suffix"),
    84  				OutputFormat:    "prefix-{10..111}-suffix",
    85  				OutputShardSize: "10KB",
    86  				MaxMemUsage:     "80%",
    87  				Algorithm:       Algorithm{Kind: None},
    88  			}
    89  			pars, err := rs.parse()
    90  			Expect(err).ShouldNot(HaveOccurred())
    91  
    92  			Expect(pars.InputBck.Name).To(Equal("test"))
    93  			Expect(pars.InputBck.Provider).To(Equal(apc.AWS))
    94  			Expect(pars.OutputBck.Name).To(Equal("testing"))
    95  			Expect(pars.OutputBck.Provider).To(Equal(apc.AWS))
    96  		})
    97  
    98  		It("should parse spec with mem usage as bytes", func() {
    99  			rs := RequestSpec{
   100  				InputBck: cmn.Bck{Name: "test"},
   101  
   102  				InputExtension:  archive.ExtTar,
   103  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   104  				OutputFormat:    "prefix-{0010..0111}-suffix",
   105  				OutputShardSize: "10KB",
   106  				MaxMemUsage:     "80 GB",
   107  				Algorithm:       Algorithm{Kind: None},
   108  			}
   109  			pars, err := rs.parse()
   110  			Expect(err).ShouldNot(HaveOccurred())
   111  
   112  			Expect(pars.MaxMemUsage.Type).To(Equal(cos.QuantityBytes))
   113  			Expect(pars.MaxMemUsage.Value).To(BeEquivalentTo(80 * 1024 * 1024 * 1024))
   114  		})
   115  
   116  		It("should parse spec with .tgz extension", func() {
   117  			rs := RequestSpec{
   118  				InputBck:        cmn.Bck{Name: "test"},
   119  				InputExtension:  archive.ExtTgz,
   120  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   121  				OutputFormat:    "prefix-{0010..0111}-suffix",
   122  				OutputShardSize: "10KB",
   123  				Algorithm:       Algorithm{Kind: None},
   124  			}
   125  			pars, err := rs.parse()
   126  			Expect(err).ShouldNot(HaveOccurred())
   127  
   128  			Expect(pars.InputExtension).To(Equal(archive.ExtTgz))
   129  		})
   130  
   131  		It("should parse spec with .tar.gz extension", func() {
   132  			rs := RequestSpec{
   133  				InputBck:        cmn.Bck{Name: "test"},
   134  				InputExtension:  archive.ExtTarGz,
   135  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   136  				OutputFormat:    "prefix-{0010..0111}-suffix",
   137  				OutputShardSize: "10KB",
   138  				Algorithm:       Algorithm{Kind: None},
   139  			}
   140  			pars, err := rs.parse()
   141  			Expect(err).ShouldNot(HaveOccurred())
   142  
   143  			Expect(pars.InputExtension).To(Equal(archive.ExtTarGz))
   144  		})
   145  
   146  		It("should parse spec with .tar.gz extension", func() {
   147  			rs := RequestSpec{
   148  				InputBck:        cmn.Bck{Name: "test"},
   149  				InputExtension:  archive.ExtZip,
   150  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   151  				OutputFormat:    "prefix-{0010..0111}-suffix",
   152  				OutputShardSize: "10KB",
   153  				Algorithm:       Algorithm{Kind: None},
   154  			}
   155  			pars, err := rs.parse()
   156  			Expect(err).ShouldNot(HaveOccurred())
   157  
   158  			Expect(pars.InputExtension).To(Equal(archive.ExtZip))
   159  		})
   160  
   161  		It("should parse spec with %06d syntax", func() {
   162  			rs := RequestSpec{
   163  				InputBck:        cmn.Bck{Name: "test"},
   164  				InputExtension:  archive.ExtTgz,
   165  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   166  				OutputFormat:    "prefix-%06d-suffix",
   167  				OutputShardSize: "10KB",
   168  				Algorithm:       Algorithm{Kind: None},
   169  			}
   170  			pars, err := rs.parse()
   171  			Expect(err).ShouldNot(HaveOccurred())
   172  
   173  			Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{
   174  				Prefix: "prefix-",
   175  				Ranges: []cos.TemplateRange{{
   176  					Start:      0,
   177  					End:        math.MaxInt64 - 1,
   178  					Step:       1,
   179  					DigitCount: 6,
   180  					Gap:        "-suffix",
   181  				}},
   182  			}))
   183  		})
   184  
   185  		It("should parse spec with @ syntax", func() {
   186  			rs := RequestSpec{
   187  				InputBck:        cmn.Bck{Name: "test"},
   188  				InputExtension:  archive.ExtTgz,
   189  				InputFormat:     newInputFormat("prefix@0111-suffix"),
   190  				OutputFormat:    "prefix-@000111-suffix",
   191  				OutputShardSize: "10KB",
   192  				Algorithm:       Algorithm{Kind: None},
   193  			}
   194  			pars, err := rs.parse()
   195  			Expect(err).ShouldNot(HaveOccurred())
   196  
   197  			Expect(pars.Pit.Template).To(Equal(cos.ParsedTemplate{
   198  				Prefix: "prefix",
   199  				Ranges: []cos.TemplateRange{{
   200  					Start:      0,
   201  					End:        111,
   202  					Step:       1,
   203  					DigitCount: 4,
   204  					Gap:        "-suffix",
   205  				}},
   206  			}))
   207  
   208  			Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{
   209  				Prefix: "prefix-",
   210  				Ranges: []cos.TemplateRange{{
   211  					Start:      0,
   212  					End:        111,
   213  					Step:       1,
   214  					DigitCount: 6,
   215  					Gap:        "-suffix",
   216  				}},
   217  			}))
   218  		})
   219  
   220  		It("should parse spec and set default conc limits", func() {
   221  			rs := RequestSpec{
   222  				InputBck:            cmn.Bck{Name: "test"},
   223  				InputExtension:      archive.ExtTar,
   224  				InputFormat:         newInputFormat("prefix-{0010..0111}-suffix"),
   225  				OutputFormat:        "prefix-{0010..0111}-suffix",
   226  				OutputShardSize:     "10KB",
   227  				CreateConcMaxLimit:  0,
   228  				ExtractConcMaxLimit: 0,
   229  				Algorithm:           Algorithm{Kind: None},
   230  			}
   231  			pars, err := rs.parse()
   232  			Expect(err).ShouldNot(HaveOccurred())
   233  
   234  			Expect(pars.CreateConcMaxLimit).To(BeEquivalentTo(0))
   235  			Expect(pars.ExtractConcMaxLimit).To(BeEquivalentTo(0))
   236  		})
   237  
   238  		It("should parse spec and set the global config values or override them", func() {
   239  			cfg := cmn.GCO.BeginUpdate()
   240  			cfg.Dsort.DsorterMemThreshold = "80%"
   241  			cfg.Dsort.MissingShards = cmn.IgnoreReaction
   242  			cmn.GCO.CommitUpdate(cfg)
   243  
   244  			rs := RequestSpec{
   245  				InputBck:            cmn.Bck{Name: "test"},
   246  				InputExtension:      archive.ExtTar,
   247  				InputFormat:         newInputFormat("prefix-{0010..0111}-suffix"),
   248  				OutputFormat:        "prefix-{0010..0111}-suffix",
   249  				OutputShardSize:     "10KB",
   250  				CreateConcMaxLimit:  0,
   251  				ExtractConcMaxLimit: 0,
   252  				Algorithm:           Algorithm{Kind: None},
   253  
   254  				Config: cmn.DsortConf{
   255  					DuplicatedRecords:   cmn.AbortReaction,
   256  					MissingShards:       "", // should be set to default
   257  					EKMMalformedLine:    cmn.IgnoreReaction,
   258  					EKMMissingKey:       cmn.WarnReaction,
   259  					DsorterMemThreshold: "",
   260  				},
   261  			}
   262  			pars, err := rs.parse()
   263  			Expect(err).ShouldNot(HaveOccurred())
   264  
   265  			Expect(pars.DuplicatedRecords).To(Equal(cmn.AbortReaction))
   266  			Expect(pars.MissingShards).To(Equal(cmn.IgnoreReaction))
   267  			Expect(pars.EKMMalformedLine).To(Equal(cmn.IgnoreReaction))
   268  			Expect(pars.EKMMissingKey).To(Equal(cmn.WarnReaction))
   269  			Expect(pars.DsorterMemThreshold).To(Equal("80%"))
   270  		})
   271  
   272  		It("should pass when output shard is zero and bash or @ template is used for output format", func() {
   273  			rs := RequestSpec{
   274  				InputBck:       cmn.Bck{Name: "test"},
   275  				InputExtension: archive.ExtTar,
   276  				InputFormat:    newInputFormat("prefix-{0010..0111..2}-suffix"),
   277  				OutputFormat:   "prefix-{10..111}-suffix",
   278  				MaxMemUsage:    "80%",
   279  			}
   280  			_, err := rs.parse()
   281  			Expect(err).ShouldNot(HaveOccurred())
   282  
   283  			rs = RequestSpec{
   284  				InputBck:       cmn.Bck{Name: "test"},
   285  				InputExtension: archive.ExtTar,
   286  				InputFormat:    newInputFormat("prefix-{0010..0111..2}-suffix"),
   287  				OutputFormat:   "prefix-@111-suffix",
   288  				MaxMemUsage:    "80%",
   289  			}
   290  			_, err = rs.parse()
   291  			Expect(err).ShouldNot(HaveOccurred())
   292  		})
   293  	})
   294  
   295  	Context("request specs which shall NOT pass", func() {
   296  		It("should fail due to missing bucket property", func() {
   297  			rs := RequestSpec{
   298  				InputExtension:  ".txt",
   299  				OutputShardSize: "10KB",
   300  				Algorithm:       Algorithm{Kind: None},
   301  			}
   302  			_, err := rs.parse()
   303  			Expect(err).Should(HaveOccurred())
   304  			Expect(errors.Is(err, errMissingSrcBucket)).To(BeTrue())
   305  		})
   306  
   307  		It("should fail due to invalid bucket provider", func() {
   308  			rs := RequestSpec{
   309  				InputBck:       cmn.Bck{Provider: "invalid", Name: "test"},
   310  				InputExtension: ".txt",
   311  				Algorithm:      Algorithm{Kind: None},
   312  			}
   313  			_, err := rs.parse()
   314  			Expect(err).Should(HaveOccurred())
   315  			Expect(err).Should(MatchError(&cmn.ErrInvalidBackendProvider{}))
   316  		})
   317  
   318  		It("should fail due to invalid output bucket provider", func() {
   319  			rs := RequestSpec{
   320  				InputBck:       cmn.Bck{Provider: apc.AIS, Name: "test"},
   321  				OutputBck:      cmn.Bck{Provider: "invalid", Name: "test"},
   322  				InputExtension: ".txt",
   323  				Algorithm:      Algorithm{Kind: None},
   324  			}
   325  			_, err := rs.parse()
   326  			Expect(err).Should(HaveOccurred())
   327  			Expect(err).Should(MatchError(&cmn.ErrInvalidBackendProvider{}))
   328  		})
   329  
   330  		It("should fail due to start after end in input format", func() {
   331  			rs := RequestSpec{
   332  				InputBck:        cmn.Bck{Name: "test"},
   333  				InputExtension:  archive.ExtTar,
   334  				OutputShardSize: "10KB",
   335  				InputFormat:     newInputFormat("prefix-{0112..0111}-suffix"),
   336  				OutputFormat:    "prefix-{0010..0111}-suffix",
   337  				Algorithm:       Algorithm{Kind: None},
   338  			}
   339  			_, err := rs.parse()
   340  			Expect(err).Should(HaveOccurred())
   341  			contains := strings.Contains(err.Error(), "start")
   342  			Expect(contains).To(BeTrue())
   343  		})
   344  
   345  		It("should fail due to start after end in output format", func() {
   346  			rs := RequestSpec{
   347  				InputBck:        cmn.Bck{Name: "test"},
   348  				InputExtension:  archive.ExtTar,
   349  				OutputShardSize: "10KB",
   350  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   351  				OutputFormat:    "prefix-{0112..0111}-suffix",
   352  				Algorithm:       Algorithm{Kind: None},
   353  			}
   354  			_, err := rs.parse()
   355  			Expect(err).Should(HaveOccurred())
   356  			contains := strings.Contains(err.Error(), "start")
   357  			Expect(contains).To(BeTrue())
   358  		})
   359  
   360  		It("should fail due invalid parentheses", func() {
   361  			rs := RequestSpec{
   362  				InputBck:        cmn.Bck{Name: "test"},
   363  				InputExtension:  archive.ExtTar,
   364  				OutputShardSize: "10KB",
   365  				InputFormat:     newInputFormat("prefix-}{0001..0111}-suffix"),
   366  				OutputFormat:    "prefix-}{0010..0111}-suffix",
   367  				Algorithm:       Algorithm{Kind: None},
   368  			}
   369  			_, err := rs.parse()
   370  			Expect(err).Should(HaveOccurred())
   371  			contains := strings.Contains(err.Error(), "invalid")
   372  			Expect(contains).To(BeTrue())
   373  		})
   374  
   375  		It("should fail due to invalid extension", func() {
   376  			rs := RequestSpec{
   377  				InputBck:        cmn.Bck{Name: "test"},
   378  				InputExtension:  ".jpg",
   379  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   380  				OutputFormat:    "prefix-{0010..0111}-suffix",
   381  				OutputShardSize: "10KB",
   382  				Algorithm:       Algorithm{Kind: None},
   383  			}
   384  			_, err := rs.parse()
   385  			Expect(err).Should(HaveOccurred())
   386  			err = errors.Unwrap(err)
   387  			check := archive.IsErrUnknownMime(err)
   388  			Expect(check).To(BeTrue())
   389  		})
   390  
   391  		It("should fail due to invalid mem usage specification", func() {
   392  			rs := RequestSpec{
   393  				InputBck:        cmn.Bck{Name: "test"},
   394  				InputExtension:  archive.ExtTar,
   395  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   396  				OutputFormat:    "prefix-{0010..0111}-suffix",
   397  				OutputShardSize: "10KB",
   398  				MaxMemUsage:     "80",
   399  				Algorithm:       Algorithm{Kind: None},
   400  			}
   401  			_, err := rs.parse()
   402  			Expect(err).Should(HaveOccurred())
   403  			Expect(err).To(Equal(cos.ErrQuantityUsage))
   404  		})
   405  
   406  		It("should fail due to invalid mem usage percent specified", func() {
   407  			rs := RequestSpec{
   408  				InputBck:        cmn.Bck{Name: "test"},
   409  				InputExtension:  archive.ExtTar,
   410  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   411  				OutputFormat:    "prefix-{0010..0111}-suffix",
   412  				OutputShardSize: "10KB",
   413  				MaxMemUsage:     "120%",
   414  				Algorithm:       Algorithm{Kind: None},
   415  			}
   416  			_, err := rs.parse()
   417  			Expect(err).Should(HaveOccurred())
   418  			Expect(err).To(Equal(cos.ErrQuantityPercent))
   419  		})
   420  
   421  		It("should fail due to invalid mem usage bytes specified", func() {
   422  			rs := RequestSpec{
   423  				InputBck:        cmn.Bck{Name: "test"},
   424  				InputExtension:  archive.ExtTar,
   425  				InputFormat:     newInputFormat("prefix-{0010..0111}-suffix"),
   426  				OutputFormat:    "prefix-{0010..0111}-suffix",
   427  				OutputShardSize: "10KB",
   428  				MaxMemUsage:     "-1 GB",
   429  				Algorithm:       Algorithm{Kind: None},
   430  			}
   431  			_, err := rs.parse()
   432  			Expect(err).Should(HaveOccurred())
   433  			Expect(err).To(Equal(cos.ErrQuantityUsage))
   434  		})
   435  
   436  		It("should fail due to invalid extract concurrency specified", func() {
   437  			rs := RequestSpec{
   438  				InputBck:            cmn.Bck{Name: "test"},
   439  				InputExtension:      archive.ExtTar,
   440  				InputFormat:         newInputFormat("prefix-{0010..0111}-suffix"),
   441  				OutputFormat:        "prefix-{0010..0111}-suffix",
   442  				OutputShardSize:     "10KB",
   443  				ExtractConcMaxLimit: -1,
   444  				Algorithm:           Algorithm{Kind: None},
   445  			}
   446  			_, err := rs.parse()
   447  			Expect(err).Should(HaveOccurred())
   448  			Expect(errors.Is(err, errNegConcLimit)).To(BeTrue())
   449  		})
   450  
   451  		It("should fail due to invalid create concurrency specified", func() {
   452  			rs := RequestSpec{
   453  				InputBck:           cmn.Bck{Name: "test"},
   454  				InputExtension:     archive.ExtTar,
   455  				InputFormat:        newInputFormat("prefix-{0010..0111}-suffix"),
   456  				OutputFormat:       "prefix-{0010..0111}-suffix",
   457  				OutputShardSize:    "10KB",
   458  				CreateConcMaxLimit: -1,
   459  				Algorithm:          Algorithm{Kind: None},
   460  			}
   461  			_, err := rs.parse()
   462  			Expect(err).Should(HaveOccurred())
   463  			Expect(errors.Is(err, errNegConcLimit)).To(BeTrue())
   464  		})
   465  
   466  		It("should fail due to invalid dsort config value", func() {
   467  			rs := RequestSpec{
   468  				InputBck:        cmn.Bck{Name: "test"},
   469  				InputExtension:  archive.ExtTar,
   470  				InputFormat:     newInputFormat("prefix-{0010..0111..2}-suffix"),
   471  				OutputFormat:    "prefix-{10..111}-suffix",
   472  				OutputShardSize: "10KB",
   473  				MaxMemUsage:     "80%",
   474  				Algorithm:       Algorithm{Kind: None},
   475  				Config:          cmn.DsortConf{DuplicatedRecords: "something"},
   476  			}
   477  			_, err := rs.parse()
   478  			Expect(err).Should(HaveOccurred())
   479  		})
   480  
   481  		It("should fail when output shard size is empty and output format is %06d", func() {
   482  			rs := RequestSpec{
   483  				InputBck:       cmn.Bck{Name: "test"},
   484  				InputExtension: archive.ExtTar,
   485  				InputFormat:    newInputFormat("prefix-{0010..0111..2}-suffix"),
   486  				OutputFormat:   "prefix-%06d-suffix",
   487  				MaxMemUsage:    "80%",
   488  			}
   489  			_, err := rs.parse()
   490  			Expect(err).Should(HaveOccurred())
   491  		})
   492  	})
   493  })
   494  
   495  func newInputFormat(template string) apc.ListRange { return apc.ListRange{Template: template} }