github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/parsespec_internal_test.go (about) 1 // Package dsort provides distributed massively parallel resharding for very large datasets. 2 /* 3 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dsort 6 7 import ( 8 "errors" 9 "math" 10 "strings" 11 12 "github.com/NVIDIA/aistore/api/apc" 13 "github.com/NVIDIA/aistore/cmn" 14 "github.com/NVIDIA/aistore/cmn/archive" 15 "github.com/NVIDIA/aistore/cmn/cos" 16 "github.com/NVIDIA/aistore/fs" 17 . "github.com/onsi/ginkgo/v2" 18 . "github.com/onsi/gomega" 19 ) 20 21 var _ = Describe("RequestSpec", func() { 22 BeforeEach(func() { 23 fs.TestNew(nil) 24 25 config := cmn.GCO.BeginUpdate() 26 config.Dsort.DefaultMaxMemUsage = "90%" 27 cmn.GCO.CommitUpdate(config) 28 }) 29 30 Context("requests specs which should pass", func() { 31 It("should parse minimal spec", func() { 32 rs := RequestSpec{ 33 InputBck: cmn.Bck{Name: "test"}, 34 InputExtension: archive.ExtTar, 35 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 36 OutputFormat: "prefix-{10..111}-suffix", 37 OutputShardSize: "10KB", 38 MaxMemUsage: "80%", 39 Algorithm: Algorithm{Kind: None}, 40 } 41 pars, err := rs.parse() 42 Expect(err).ShouldNot(HaveOccurred()) 43 44 Expect(pars.InputBck.Name).To(Equal("test")) 45 Expect(pars.InputBck.Provider).To(Equal(apc.AIS)) 46 Expect(pars.OutputBck.Name).To(Equal("test")) 47 Expect(pars.OutputBck.Provider).To(Equal(apc.AIS)) 48 Expect(pars.InputExtension).To(Equal(archive.ExtTar)) 49 50 Expect(pars.Pit.Template).To(Equal(cos.ParsedTemplate{ 51 Prefix: "prefix-", 52 Ranges: []cos.TemplateRange{{ 53 Start: 10, 54 End: 111, 55 Step: 2, 56 DigitCount: 4, 57 Gap: "-suffix", 58 }}, 59 })) 60 61 Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{ 62 Prefix: "prefix-", 63 Ranges: []cos.TemplateRange{{ 64 Start: 10, 65 End: 111, 66 Step: 1, 67 DigitCount: 2, 68 Gap: "-suffix", 69 }}, 70 })) 71 72 Expect(pars.OutputShardSize).To(BeEquivalentTo(10 * cos.KiB)) 73 74 Expect(pars.MaxMemUsage.Type).To(Equal(cos.QuantityPercent)) 75 Expect(pars.MaxMemUsage.Value).To(BeEquivalentTo(80)) 76 }) 77 78 It("should set buckets correctly", func() { 79 rs := RequestSpec{ 80 InputBck: cmn.Bck{Provider: apc.AWS, Name: "test"}, 81 OutputBck: cmn.Bck{Provider: apc.AWS, Name: "testing"}, 82 InputExtension: archive.ExtTar, 83 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 84 OutputFormat: "prefix-{10..111}-suffix", 85 OutputShardSize: "10KB", 86 MaxMemUsage: "80%", 87 Algorithm: Algorithm{Kind: None}, 88 } 89 pars, err := rs.parse() 90 Expect(err).ShouldNot(HaveOccurred()) 91 92 Expect(pars.InputBck.Name).To(Equal("test")) 93 Expect(pars.InputBck.Provider).To(Equal(apc.AWS)) 94 Expect(pars.OutputBck.Name).To(Equal("testing")) 95 Expect(pars.OutputBck.Provider).To(Equal(apc.AWS)) 96 }) 97 98 It("should parse spec with mem usage as bytes", func() { 99 rs := RequestSpec{ 100 InputBck: cmn.Bck{Name: "test"}, 101 102 InputExtension: archive.ExtTar, 103 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 104 OutputFormat: "prefix-{0010..0111}-suffix", 105 OutputShardSize: "10KB", 106 MaxMemUsage: "80 GB", 107 Algorithm: Algorithm{Kind: None}, 108 } 109 pars, err := rs.parse() 110 Expect(err).ShouldNot(HaveOccurred()) 111 112 Expect(pars.MaxMemUsage.Type).To(Equal(cos.QuantityBytes)) 113 Expect(pars.MaxMemUsage.Value).To(BeEquivalentTo(80 * 1024 * 1024 * 1024)) 114 }) 115 116 It("should parse spec with .tgz extension", func() { 117 rs := RequestSpec{ 118 InputBck: cmn.Bck{Name: "test"}, 119 InputExtension: archive.ExtTgz, 120 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 121 OutputFormat: "prefix-{0010..0111}-suffix", 122 OutputShardSize: "10KB", 123 Algorithm: Algorithm{Kind: None}, 124 } 125 pars, err := rs.parse() 126 Expect(err).ShouldNot(HaveOccurred()) 127 128 Expect(pars.InputExtension).To(Equal(archive.ExtTgz)) 129 }) 130 131 It("should parse spec with .tar.gz extension", func() { 132 rs := RequestSpec{ 133 InputBck: cmn.Bck{Name: "test"}, 134 InputExtension: archive.ExtTarGz, 135 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 136 OutputFormat: "prefix-{0010..0111}-suffix", 137 OutputShardSize: "10KB", 138 Algorithm: Algorithm{Kind: None}, 139 } 140 pars, err := rs.parse() 141 Expect(err).ShouldNot(HaveOccurred()) 142 143 Expect(pars.InputExtension).To(Equal(archive.ExtTarGz)) 144 }) 145 146 It("should parse spec with .tar.gz extension", func() { 147 rs := RequestSpec{ 148 InputBck: cmn.Bck{Name: "test"}, 149 InputExtension: archive.ExtZip, 150 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 151 OutputFormat: "prefix-{0010..0111}-suffix", 152 OutputShardSize: "10KB", 153 Algorithm: Algorithm{Kind: None}, 154 } 155 pars, err := rs.parse() 156 Expect(err).ShouldNot(HaveOccurred()) 157 158 Expect(pars.InputExtension).To(Equal(archive.ExtZip)) 159 }) 160 161 It("should parse spec with %06d syntax", func() { 162 rs := RequestSpec{ 163 InputBck: cmn.Bck{Name: "test"}, 164 InputExtension: archive.ExtTgz, 165 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 166 OutputFormat: "prefix-%06d-suffix", 167 OutputShardSize: "10KB", 168 Algorithm: Algorithm{Kind: None}, 169 } 170 pars, err := rs.parse() 171 Expect(err).ShouldNot(HaveOccurred()) 172 173 Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{ 174 Prefix: "prefix-", 175 Ranges: []cos.TemplateRange{{ 176 Start: 0, 177 End: math.MaxInt64 - 1, 178 Step: 1, 179 DigitCount: 6, 180 Gap: "-suffix", 181 }}, 182 })) 183 }) 184 185 It("should parse spec with @ syntax", func() { 186 rs := RequestSpec{ 187 InputBck: cmn.Bck{Name: "test"}, 188 InputExtension: archive.ExtTgz, 189 InputFormat: newInputFormat("prefix@0111-suffix"), 190 OutputFormat: "prefix-@000111-suffix", 191 OutputShardSize: "10KB", 192 Algorithm: Algorithm{Kind: None}, 193 } 194 pars, err := rs.parse() 195 Expect(err).ShouldNot(HaveOccurred()) 196 197 Expect(pars.Pit.Template).To(Equal(cos.ParsedTemplate{ 198 Prefix: "prefix", 199 Ranges: []cos.TemplateRange{{ 200 Start: 0, 201 End: 111, 202 Step: 1, 203 DigitCount: 4, 204 Gap: "-suffix", 205 }}, 206 })) 207 208 Expect(pars.Pot.Template).To(Equal(cos.ParsedTemplate{ 209 Prefix: "prefix-", 210 Ranges: []cos.TemplateRange{{ 211 Start: 0, 212 End: 111, 213 Step: 1, 214 DigitCount: 6, 215 Gap: "-suffix", 216 }}, 217 })) 218 }) 219 220 It("should parse spec and set default conc limits", func() { 221 rs := RequestSpec{ 222 InputBck: cmn.Bck{Name: "test"}, 223 InputExtension: archive.ExtTar, 224 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 225 OutputFormat: "prefix-{0010..0111}-suffix", 226 OutputShardSize: "10KB", 227 CreateConcMaxLimit: 0, 228 ExtractConcMaxLimit: 0, 229 Algorithm: Algorithm{Kind: None}, 230 } 231 pars, err := rs.parse() 232 Expect(err).ShouldNot(HaveOccurred()) 233 234 Expect(pars.CreateConcMaxLimit).To(BeEquivalentTo(0)) 235 Expect(pars.ExtractConcMaxLimit).To(BeEquivalentTo(0)) 236 }) 237 238 It("should parse spec and set the global config values or override them", func() { 239 cfg := cmn.GCO.BeginUpdate() 240 cfg.Dsort.DsorterMemThreshold = "80%" 241 cfg.Dsort.MissingShards = cmn.IgnoreReaction 242 cmn.GCO.CommitUpdate(cfg) 243 244 rs := RequestSpec{ 245 InputBck: cmn.Bck{Name: "test"}, 246 InputExtension: archive.ExtTar, 247 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 248 OutputFormat: "prefix-{0010..0111}-suffix", 249 OutputShardSize: "10KB", 250 CreateConcMaxLimit: 0, 251 ExtractConcMaxLimit: 0, 252 Algorithm: Algorithm{Kind: None}, 253 254 Config: cmn.DsortConf{ 255 DuplicatedRecords: cmn.AbortReaction, 256 MissingShards: "", // should be set to default 257 EKMMalformedLine: cmn.IgnoreReaction, 258 EKMMissingKey: cmn.WarnReaction, 259 DsorterMemThreshold: "", 260 }, 261 } 262 pars, err := rs.parse() 263 Expect(err).ShouldNot(HaveOccurred()) 264 265 Expect(pars.DuplicatedRecords).To(Equal(cmn.AbortReaction)) 266 Expect(pars.MissingShards).To(Equal(cmn.IgnoreReaction)) 267 Expect(pars.EKMMalformedLine).To(Equal(cmn.IgnoreReaction)) 268 Expect(pars.EKMMissingKey).To(Equal(cmn.WarnReaction)) 269 Expect(pars.DsorterMemThreshold).To(Equal("80%")) 270 }) 271 272 It("should pass when output shard is zero and bash or @ template is used for output format", func() { 273 rs := RequestSpec{ 274 InputBck: cmn.Bck{Name: "test"}, 275 InputExtension: archive.ExtTar, 276 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 277 OutputFormat: "prefix-{10..111}-suffix", 278 MaxMemUsage: "80%", 279 } 280 _, err := rs.parse() 281 Expect(err).ShouldNot(HaveOccurred()) 282 283 rs = RequestSpec{ 284 InputBck: cmn.Bck{Name: "test"}, 285 InputExtension: archive.ExtTar, 286 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 287 OutputFormat: "prefix-@111-suffix", 288 MaxMemUsage: "80%", 289 } 290 _, err = rs.parse() 291 Expect(err).ShouldNot(HaveOccurred()) 292 }) 293 }) 294 295 Context("request specs which shall NOT pass", func() { 296 It("should fail due to missing bucket property", func() { 297 rs := RequestSpec{ 298 InputExtension: ".txt", 299 OutputShardSize: "10KB", 300 Algorithm: Algorithm{Kind: None}, 301 } 302 _, err := rs.parse() 303 Expect(err).Should(HaveOccurred()) 304 Expect(errors.Is(err, errMissingSrcBucket)).To(BeTrue()) 305 }) 306 307 It("should fail due to invalid bucket provider", func() { 308 rs := RequestSpec{ 309 InputBck: cmn.Bck{Provider: "invalid", Name: "test"}, 310 InputExtension: ".txt", 311 Algorithm: Algorithm{Kind: None}, 312 } 313 _, err := rs.parse() 314 Expect(err).Should(HaveOccurred()) 315 Expect(err).Should(MatchError(&cmn.ErrInvalidBackendProvider{})) 316 }) 317 318 It("should fail due to invalid output bucket provider", func() { 319 rs := RequestSpec{ 320 InputBck: cmn.Bck{Provider: apc.AIS, Name: "test"}, 321 OutputBck: cmn.Bck{Provider: "invalid", Name: "test"}, 322 InputExtension: ".txt", 323 Algorithm: Algorithm{Kind: None}, 324 } 325 _, err := rs.parse() 326 Expect(err).Should(HaveOccurred()) 327 Expect(err).Should(MatchError(&cmn.ErrInvalidBackendProvider{})) 328 }) 329 330 It("should fail due to start after end in input format", func() { 331 rs := RequestSpec{ 332 InputBck: cmn.Bck{Name: "test"}, 333 InputExtension: archive.ExtTar, 334 OutputShardSize: "10KB", 335 InputFormat: newInputFormat("prefix-{0112..0111}-suffix"), 336 OutputFormat: "prefix-{0010..0111}-suffix", 337 Algorithm: Algorithm{Kind: None}, 338 } 339 _, err := rs.parse() 340 Expect(err).Should(HaveOccurred()) 341 contains := strings.Contains(err.Error(), "start") 342 Expect(contains).To(BeTrue()) 343 }) 344 345 It("should fail due to start after end in output format", func() { 346 rs := RequestSpec{ 347 InputBck: cmn.Bck{Name: "test"}, 348 InputExtension: archive.ExtTar, 349 OutputShardSize: "10KB", 350 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 351 OutputFormat: "prefix-{0112..0111}-suffix", 352 Algorithm: Algorithm{Kind: None}, 353 } 354 _, err := rs.parse() 355 Expect(err).Should(HaveOccurred()) 356 contains := strings.Contains(err.Error(), "start") 357 Expect(contains).To(BeTrue()) 358 }) 359 360 It("should fail due invalid parentheses", func() { 361 rs := RequestSpec{ 362 InputBck: cmn.Bck{Name: "test"}, 363 InputExtension: archive.ExtTar, 364 OutputShardSize: "10KB", 365 InputFormat: newInputFormat("prefix-}{0001..0111}-suffix"), 366 OutputFormat: "prefix-}{0010..0111}-suffix", 367 Algorithm: Algorithm{Kind: None}, 368 } 369 _, err := rs.parse() 370 Expect(err).Should(HaveOccurred()) 371 contains := strings.Contains(err.Error(), "invalid") 372 Expect(contains).To(BeTrue()) 373 }) 374 375 It("should fail due to invalid extension", func() { 376 rs := RequestSpec{ 377 InputBck: cmn.Bck{Name: "test"}, 378 InputExtension: ".jpg", 379 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 380 OutputFormat: "prefix-{0010..0111}-suffix", 381 OutputShardSize: "10KB", 382 Algorithm: Algorithm{Kind: None}, 383 } 384 _, err := rs.parse() 385 Expect(err).Should(HaveOccurred()) 386 err = errors.Unwrap(err) 387 check := archive.IsErrUnknownMime(err) 388 Expect(check).To(BeTrue()) 389 }) 390 391 It("should fail due to invalid mem usage specification", func() { 392 rs := RequestSpec{ 393 InputBck: cmn.Bck{Name: "test"}, 394 InputExtension: archive.ExtTar, 395 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 396 OutputFormat: "prefix-{0010..0111}-suffix", 397 OutputShardSize: "10KB", 398 MaxMemUsage: "80", 399 Algorithm: Algorithm{Kind: None}, 400 } 401 _, err := rs.parse() 402 Expect(err).Should(HaveOccurred()) 403 Expect(err).To(Equal(cos.ErrQuantityUsage)) 404 }) 405 406 It("should fail due to invalid mem usage percent specified", func() { 407 rs := RequestSpec{ 408 InputBck: cmn.Bck{Name: "test"}, 409 InputExtension: archive.ExtTar, 410 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 411 OutputFormat: "prefix-{0010..0111}-suffix", 412 OutputShardSize: "10KB", 413 MaxMemUsage: "120%", 414 Algorithm: Algorithm{Kind: None}, 415 } 416 _, err := rs.parse() 417 Expect(err).Should(HaveOccurred()) 418 Expect(err).To(Equal(cos.ErrQuantityPercent)) 419 }) 420 421 It("should fail due to invalid mem usage bytes specified", func() { 422 rs := RequestSpec{ 423 InputBck: cmn.Bck{Name: "test"}, 424 InputExtension: archive.ExtTar, 425 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 426 OutputFormat: "prefix-{0010..0111}-suffix", 427 OutputShardSize: "10KB", 428 MaxMemUsage: "-1 GB", 429 Algorithm: Algorithm{Kind: None}, 430 } 431 _, err := rs.parse() 432 Expect(err).Should(HaveOccurred()) 433 Expect(err).To(Equal(cos.ErrQuantityUsage)) 434 }) 435 436 It("should fail due to invalid extract concurrency specified", func() { 437 rs := RequestSpec{ 438 InputBck: cmn.Bck{Name: "test"}, 439 InputExtension: archive.ExtTar, 440 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 441 OutputFormat: "prefix-{0010..0111}-suffix", 442 OutputShardSize: "10KB", 443 ExtractConcMaxLimit: -1, 444 Algorithm: Algorithm{Kind: None}, 445 } 446 _, err := rs.parse() 447 Expect(err).Should(HaveOccurred()) 448 Expect(errors.Is(err, errNegConcLimit)).To(BeTrue()) 449 }) 450 451 It("should fail due to invalid create concurrency specified", func() { 452 rs := RequestSpec{ 453 InputBck: cmn.Bck{Name: "test"}, 454 InputExtension: archive.ExtTar, 455 InputFormat: newInputFormat("prefix-{0010..0111}-suffix"), 456 OutputFormat: "prefix-{0010..0111}-suffix", 457 OutputShardSize: "10KB", 458 CreateConcMaxLimit: -1, 459 Algorithm: Algorithm{Kind: None}, 460 } 461 _, err := rs.parse() 462 Expect(err).Should(HaveOccurred()) 463 Expect(errors.Is(err, errNegConcLimit)).To(BeTrue()) 464 }) 465 466 It("should fail due to invalid dsort config value", func() { 467 rs := RequestSpec{ 468 InputBck: cmn.Bck{Name: "test"}, 469 InputExtension: archive.ExtTar, 470 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 471 OutputFormat: "prefix-{10..111}-suffix", 472 OutputShardSize: "10KB", 473 MaxMemUsage: "80%", 474 Algorithm: Algorithm{Kind: None}, 475 Config: cmn.DsortConf{DuplicatedRecords: "something"}, 476 } 477 _, err := rs.parse() 478 Expect(err).Should(HaveOccurred()) 479 }) 480 481 It("should fail when output shard size is empty and output format is %06d", func() { 482 rs := RequestSpec{ 483 InputBck: cmn.Bck{Name: "test"}, 484 InputExtension: archive.ExtTar, 485 InputFormat: newInputFormat("prefix-{0010..0111..2}-suffix"), 486 OutputFormat: "prefix-%06d-suffix", 487 MaxMemUsage: "80%", 488 } 489 _, err := rs.parse() 490 Expect(err).Should(HaveOccurred()) 491 }) 492 }) 493 }) 494 495 func newInputFormat(template string) apc.ListRange { return apc.ListRange{Template: template} }