github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/dsort_test.go (about)

     1  //nolint:dupl // copy-paste benign and can wait
     2  // Package integration_test.
     3  /*
     4   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     5   */
     6  package integration_test
     7  
     8  import (
     9  	"archive/tar"
    10  	"bytes"
    11  	"fmt"
    12  	"net/http"
    13  	"os"
    14  	"path/filepath"
    15  	rdebug "runtime/debug"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/NVIDIA/aistore/api"
    23  	"github.com/NVIDIA/aistore/api/apc"
    24  	"github.com/NVIDIA/aistore/cmn"
    25  	"github.com/NVIDIA/aistore/cmn/archive"
    26  	"github.com/NVIDIA/aistore/cmn/cos"
    27  	"github.com/NVIDIA/aistore/cmn/debug"
    28  	"github.com/NVIDIA/aistore/core/meta"
    29  	"github.com/NVIDIA/aistore/ext/dsort"
    30  	"github.com/NVIDIA/aistore/ext/dsort/shard"
    31  	"github.com/NVIDIA/aistore/sys"
    32  	"github.com/NVIDIA/aistore/tools"
    33  	"github.com/NVIDIA/aistore/tools/docker"
    34  	"github.com/NVIDIA/aistore/tools/readers"
    35  	"github.com/NVIDIA/aistore/tools/tarch"
    36  	"github.com/NVIDIA/aistore/tools/tassert"
    37  	"github.com/NVIDIA/aistore/tools/tlog"
    38  	"github.com/NVIDIA/aistore/tools/trand"
    39  	"github.com/NVIDIA/aistore/xact"
    40  	jsoniter "github.com/json-iterator/go"
    41  )
    42  
const (
	// Prefix shared by every dsort job description generated by this test file;
	// used to identify test-created jobs.
	dsortDescAllPrefix = apc.ActDsort + "-test-integration"

	// Scopes for applying reaction settings: via cluster config vs. per-request spec.
	scopeConfig = "config"
	scopeSpec   = "spec"
)

const (
	startingDS = "starting dsort" // common log message
)

var (
	// Per-process description prefix so concurrently running test processes don't collide.
	dsortDescCurPrefix = fmt.Sprintf("%s-%d-", dsortDescAllPrefix, os.Getpid())

	// Dimensions over which the table-driven tests (runDsortTest) iterate.
	dsorterTypes       = []string{dsort.GeneralType, dsort.MemType}
	dsortPhases        = []string{dsort.ExtractionPhase, dsort.SortingPhase, dsort.CreationPhase}
	dsortAlgorithms    = []string{dsort.Alphanumeric, dsort.Shuffle}
	dsortSettingScopes = []string{scopeConfig, scopeSpec}
)
    62  
type (
	// dsortTestSpec describes the dimensions over which runDsortTest iterates;
	// each non-empty slice produces one level of nested subtests.
	dsortTestSpec struct {
		p          bool // determines if the tests should be ran in parallel mode
		types      []string
		tarFormats []tar.Format
		phases     []string
		reactions  []string
		scopes     []string
		algs       []string
	}

	// dsortFramework bundles the configuration of a single dsort job together
	// with helpers to create input shards, start the job, and validate results.
	dsortFramework struct {
		m *ioContext // shared integration-test context (cluster, bucket, *testing.T)

		dsorterType string // dsort.GeneralType, dsort.MemType, or "" (auto-select)

		outputBck    cmn.Bck // output bucket; zero value means "same as input"
		inputPrefix  string  // object-name prefix of input shards (derived in init)
		outputPrefix string  // object-name prefix of output shards (derived in init)

		inputTempl            apc.ListRange // input shard template or explicit object list
		outputTempl           string        // bash-style template for output shard names
		orderFileURL          string        // optional order-file URL
		shardCnt              int           // number of input shards
		shardCntToSkip        int           // leading shards to NOT create (missing-shards tests)
		filesPerShard         int
		fileSz                int // in a shard
		shardSize             int // filesPerShard * fileSz (computed in init)
		outputShardCnt        int // expected number of output shards (computed in init)
		recordDuplicationsCnt int // number of shards seeded with duplicated records
		recordExts            []string

		inputShards []string // object names of the created input shards

		tarFormat       tar.Format
		inputExt        string
		outputExt       string
		alg             *dsort.Algorithm
		missingKeys     bool
		outputShardSize string // IEC size string; "-1" means "derive count from outputTempl"
		maxMemUsage     string
		dryRun          bool

		missingShards     string // reaction to missing input shards
		duplicatedRecords string // reaction to duplicated records

		baseParams  api.BaseParams
		managerUUID string // dsort job UUID, set by start()
	}

	// shardRecords maps one shard to the record (file) names it contains.
	shardRecords struct {
		name        string
		recordNames []string
	}
)
   118  
   119  func generateDsortDesc() string {
   120  	return dsortDescCurPrefix + time.Now().Format(time.RFC3339Nano)
   121  }
   122  
// runDsortTest is the table-driven dsort test driver. For every dsorter type
// in dts.types it opens a subtest; within it, exactly one of the spec's
// dimension slices (tarFormats, phases, reactions [optionally x scopes], algs)
// - checked in that order - determines one more level of nested subtests. The
// callback `f` must match the signature corresponding to the populated
// dimension; the type assertions below panic otherwise (programmer error).
//nolint:gocritic // ignoring (dsortTestSpec) hugeParam
func runDsortTest(t *testing.T, dts dsortTestSpec, f any) {
	if dts.p {
		t.Parallel()
	}

	for _, dsorterType := range dts.types {
		dsorterType := dsorterType // pin
		t.Run(dsorterType, func(t *testing.T) {
			if dts.p {
				t.Parallel()
			}

			if len(dts.tarFormats) > 0 {
				// iterate tar formats
				g := f.(func(dsorterType string, tarFormat tar.Format, t *testing.T))
				for _, tf := range dts.tarFormats {
					tarFormat := tf // pin
					t.Run("format-"+tarFormat.String(), func(t *testing.T) {
						if dts.p {
							t.Parallel()
						}
						g(dsorterType, tarFormat, t)
					})
				}
			} else if len(dts.phases) > 0 {
				// iterate dsort phases
				g := f.(func(dsorterType, phase string, t *testing.T))
				for _, phase := range dts.phases {
					phase := phase // pin
					t.Run(phase, func(t *testing.T) {
						if dts.p {
							t.Parallel()
						}
						g(dsorterType, phase, t)
					})
				}
			} else if len(dts.reactions) > 0 {
				// iterate reactions, optionally crossed with setting scopes
				for _, reaction := range dts.reactions {
					reaction := reaction // pin
					t.Run(reaction, func(t *testing.T) {
						if dts.p {
							t.Parallel()
						}

						if len(dts.scopes) > 0 {
							for _, scope := range dts.scopes {
								scope := scope // pin
								t.Run(scope, func(t *testing.T) {
									if dts.p {
										t.Parallel()
									}

									g := f.(func(dsorterType, reaction, scope string, t *testing.T))
									g(dsorterType, reaction, scope, t)
								})
							}
						} else {
							g := f.(func(dsorterType, reaction string, t *testing.T))
							g(dsorterType, reaction, t)
						}
					})
				}
			} else if len(dts.algs) > 0 {
				// iterate sorting algorithms
				g := f.(func(dsorterType, alg string, t *testing.T))
				for _, alg := range dts.algs {
					alg := alg // pin
					t.Run(alg, func(t *testing.T) {
						if dts.p {
							t.Parallel()
						}
						g(dsorterType, alg, t)
					})
				}
			} else {
				// no extra dimension - invoke with the dsorter type only
				g := f.(func(dsorterType string, t *testing.T))
				g(dsorterType, t)
			}
		})
	}
}
   202  
   203  ////////////////////
   204  // dsortFramework //
   205  ////////////////////
   206  
   207  func (df *dsortFramework) job() string {
   208  	if df.managerUUID == "" {
   209  		return "dsort[-]"
   210  	}
   211  	return "dsort[" + df.managerUUID + "]"
   212  }
   213  
   214  func (df *dsortFramework) init() {
   215  	if df.inputTempl.Template == "" {
   216  		df.inputTempl = apc.ListRange{Template: fmt.Sprintf("input-{0..%d}", df.shardCnt-1)}
   217  	}
   218  	if df.outputTempl == "" {
   219  		df.outputTempl = "output-{00000..10000}"
   220  	}
   221  	if df.inputExt == "" {
   222  		df.inputExt = dsort.DefaultExt
   223  	}
   224  
   225  	// Assumption is that all prefixes end with dash: "-"
   226  	df.inputPrefix = df.inputTempl.Template[:strings.Index(df.inputTempl.Template, "-")+1]
   227  	df.outputPrefix = df.outputTempl[:strings.Index(df.outputTempl, "-")+1]
   228  
   229  	if df.fileSz == 0 {
   230  		df.fileSz = cos.KiB
   231  	}
   232  
   233  	df.shardSize = df.filesPerShard * df.fileSz
   234  	if df.outputShardSize == "-1" {
   235  		df.outputShardSize = ""
   236  		pt, err := cos.ParseBashTemplate(df.outputTempl)
   237  		cos.AssertNoErr(err)
   238  		df.outputShardCnt = int(pt.Count())
   239  	} else {
   240  		outputShardSize := int64(10 * df.filesPerShard * df.fileSz)
   241  		df.outputShardSize = cos.ToSizeIEC(outputShardSize, 0)
   242  		df.outputShardCnt = (df.shardCnt * df.shardSize) / int(outputShardSize)
   243  	}
   244  
   245  	if df.alg == nil {
   246  		df.alg = &dsort.Algorithm{}
   247  	}
   248  
   249  	df.baseParams = tools.BaseAPIParams(df.m.proxyURL)
   250  }
   251  
// gen assembles a dsort.RequestSpec from the framework's configuration.
// Each call produces a spec with a fresh, unique Description.
func (df *dsortFramework) gen() dsort.RequestSpec {
	return dsort.RequestSpec{
		Description:         generateDsortDesc(),
		InputBck:            df.m.bck,
		OutputBck:           df.outputBck,
		InputExtension:      df.inputExt,
		OutputExtension:     df.outputExt,
		InputFormat:         df.inputTempl,
		OutputFormat:        df.outputTempl,
		OutputShardSize:     df.outputShardSize,
		Algorithm:           *df.alg,
		OrderFileURL:        df.orderFileURL,
		ExtractConcMaxLimit: 10,
		CreateConcMaxLimit:  10,
		MaxMemUsage:         df.maxMemUsage,
		DsorterType:         df.dsorterType,
		DryRun:              df.dryRun,

		// per-request reaction overrides (missing shards, duplicated records)
		Config: cmn.DsortConf{
			MissingShards:     df.missingShards,
			DuplicatedRecords: df.duplicatedRecords,
		},
	}
}
   276  
   277  func (df *dsortFramework) start() {
   278  	var (
   279  		err  error
   280  		spec = df.gen()
   281  	)
   282  	df.managerUUID, err = api.StartDsort(df.baseParams, &spec)
   283  	tassert.CheckFatal(df.m.t, err)
   284  }
   285  
   286  func (df *dsortFramework) createInputShards() {
   287  	const tmpDir = "/tmp"
   288  	var (
   289  		wg    = cos.NewLimitedWaitGroup(sys.NumCPU(), 0)
   290  		errCh = make(chan error, df.shardCnt)
   291  
   292  		mu = &sync.Mutex{} // to collect inputShards (obj names)
   293  	)
   294  	debug.Assert(len(df.inputShards) == 0)
   295  
   296  	tlog.Logf("creating %d shards...\n", df.shardCnt)
   297  	for i := df.shardCntToSkip; i < df.shardCnt; i++ {
   298  		wg.Add(1)
   299  		go func(i int) {
   300  			defer wg.Done()
   301  			var (
   302  				err         error
   303  				duplication = i < df.recordDuplicationsCnt
   304  				path        = fmt.Sprintf("%s/%s/%s%d", tmpDir, df.m.bck.Name, df.inputPrefix, i)
   305  				tarName     string
   306  			)
   307  			if df.alg.Kind == dsort.Content {
   308  				tarName = path + archive.ExtTar
   309  			} else {
   310  				tarName = path + df.inputExt
   311  			}
   312  			if df.alg.Kind == dsort.Content {
   313  				err = tarch.CreateArchCustomFiles(tarName, df.tarFormat, df.inputExt, df.filesPerShard,
   314  					df.fileSz, df.alg.ContentKeyType, df.alg.Ext, df.missingKeys)
   315  			} else if df.inputExt == archive.ExtTar {
   316  				err = tarch.CreateArchRandomFiles(tarName, df.tarFormat, df.inputExt, df.filesPerShard,
   317  					df.fileSz, duplication, df.recordExts, nil)
   318  			} else {
   319  				err = tarch.CreateArchRandomFiles(tarName, df.tarFormat, df.inputExt, df.filesPerShard,
   320  					df.fileSz, duplication, nil, nil)
   321  			}
   322  			tassert.CheckFatal(df.m.t, err)
   323  
   324  			reader, err := readers.NewExistingFile(tarName, cos.ChecksumNone)
   325  			tassert.CheckFatal(df.m.t, err)
   326  
   327  			objName := filepath.Base(tarName)
   328  			tools.Put(df.m.proxyURL, df.m.bck, objName, reader, errCh)
   329  
   330  			mu.Lock()
   331  			df.inputShards = append(df.inputShards, objName)
   332  			mu.Unlock()
   333  
   334  			os.Remove(tarName)
   335  		}(i)
   336  	}
   337  	wg.Wait()
   338  	close(errCh)
   339  	for err := range errCh {
   340  		tassert.CheckFatal(df.m.t, err)
   341  	}
   342  	tlog.Logf("%s: done creating shards\n", df.job())
   343  }
   344  
// checkOutputShards GETs the output shards and verifies dsort invariants:
// records are globally ordered (or shuffled, for the shuffle algorithm),
// multi-file records are not split across shards, file sizes are preserved,
// and for compressed inputs the resulting shard count is reported.
// `zeros` is the zero-padding width of the numeric part of output shard names.
func (df *dsortFramework) checkOutputShards(zeros int) {
	var (
		lastValue  any    // last seen sort-key value (Content algorithm only)
		lastName   string // last seen record name
		inversions int    // out-of-order adjacent pairs (must be > 0 for shuffle)
		idx        int    // global record index across all shards
		baseParams = tools.BaseAPIParams(df.m.proxyURL)
		records    = make(map[string]int, 100) // canonical record name -> last seen idx

		realOutputShardCnt int
		skipped            int
	)
	tlog.Logf("%s: checking that files are sorted...\n", df.job())
outer:
	for i := range df.outputShardCnt {
		var (
			buffer    bytes.Buffer
			shardName = fmt.Sprintf("%s%0*d%s", df.outputPrefix, zeros, i, df.inputExt)
			getArgs   = api.GetArgs{Writer: &buffer}
			bucket    = df.m.bck
		)
		if df.outputBck.Name != "" {
			bucket = df.outputBck
		}

		_, err := api.GetObject(baseParams, bucket, shardName, &getArgs)
		if err != nil {
			// With compression the computed output shard count is an upper
			// bound - tolerate a few trailing 404s, then stop.
			herr, ok := err.(*cmn.ErrHTTP)
			if ok && herr.Status == http.StatusNotFound && shard.IsCompressed(df.inputExt) && i > 0 {
				// check for NotFound a few more, and break; see also 'skipped == 0' check below
				switch skipped {
				case 0:
					tlog.Logf("%s: computed output shard count (%d) vs compression: [%s] is the first not-found\n",
						df.job(), df.outputShardCnt, shardName)
					fallthrough
				case 1, 2, 3:
					skipped++
					continue
				default:
					break outer
				}
			}
			tassert.CheckFatal(df.m.t, err)
		}

		// once one shard was not found, no later shard may exist
		tassert.Fatalf(df.m.t, skipped == 0, "%s: got out of order shard %s (not-found >= %d)", df.job(), shardName, skipped)

		realOutputShardCnt++

		if df.alg.Kind == dsort.Content {
			// Content algorithm: sort key is stored in a companion file with
			// extension df.alg.Ext; verify keys are non-decreasing.
			files, err := tarch.GetFilesFromArchBuffer(cos.Ext(shardName), buffer, df.alg.Ext)
			tassert.CheckFatal(df.m.t, err)
			for _, file := range files {
				if file.Ext == df.alg.Ext {
					if strings.TrimSuffix(file.Name, filepath.Ext(file.Name)) !=
						strings.TrimSuffix(lastName, filepath.Ext(lastName)) {
						// custom files should go AFTER the regular files
						df.m.t.Fatalf("%s: names out of order (shard: %s, lastName: %s, curName: %s)",
							df.job(), shardName, lastName, file.Name)
					}

					switch df.alg.ContentKeyType {
					case shard.ContentKeyInt:
						intValue, err := strconv.ParseInt(string(file.Content), 10, 64)
						tassert.CheckFatal(df.m.t, err)
						if lastValue != nil && intValue < lastValue.(int64) {
							df.m.t.Fatalf("%s: int values are not in correct order (shard: %s, lastIntValue: %d, curIntValue: %d)", df.job(), shardName, lastValue.(int64), intValue)
						}
						lastValue = intValue
					case shard.ContentKeyFloat:
						floatValue, err := strconv.ParseFloat(string(file.Content), 64)
						tassert.CheckFatal(df.m.t, err)
						if lastValue != nil && floatValue < lastValue.(float64) {
							df.m.t.Fatalf("%s: string values are not in correct order (shard: %s, lastStringValue: %f, curStringValue: %f)", df.job(), shardName, lastValue.(float64), floatValue)
						}
						lastValue = floatValue
					case shard.ContentKeyString:
						stringValue := string(file.Content)
						if lastValue != nil && stringValue < lastValue.(string) {
							df.m.t.Fatalf("%s: string values are not in correct order (shard: %s, lastStringValue: %s, curStringValue: %s)", df.job(), shardName, lastValue.(string), stringValue)
						}
						lastValue = stringValue
					default:
						df.m.t.Fail()
					}
				} else {
					lastName = file.Name
				}
			}
		} else {
			// Alphanumeric (default) or Shuffle: compare record names.
			files, err := tarch.GetFileInfosFromArchBuffer(buffer, df.inputExt)
			tassert.CheckFatal(df.m.t, err)
			if len(files) == 0 {
				df.m.t.Fatalf("%s: number of files inside shard is 0", df.job())
			}

			for _, file := range files {
				if df.alg.Kind == "" || df.alg.Kind == dsort.Alphanumeric {
					if lastName > file.Name() && canonicalName(lastName) != canonicalName(file.Name()) {
						df.m.t.Fatalf("%s: names out of order (shard: %s, lastName: %s, curName: %s)",
							df.job(), shardName, lastName, file.Name())
					}
				} else if df.alg.Kind == dsort.Shuffle {
					if lastName > file.Name() {
						inversions++
					}
				}
				if file.Size() != int64(df.fileSz) {
					df.m.t.Fatalf("%s: file sizes has changed (expected: %d, got: %d)",
						df.job(), df.fileSz, file.Size())
				}
				lastName = file.Name()

				// For each record object see if they we weren't split (they should
				// be one after another).
				recordCanonicalName := canonicalName(file.Name())
				prevIdx, ok := records[recordCanonicalName]
				if ok && prevIdx != idx-1 {
					df.m.t.Errorf("%s: record object %q was splitted", df.job(), file.Name())
				}
				records[recordCanonicalName] = idx

				// Check if the record objects are in the correct order.
				if len(df.recordExts) > 0 {
					ext := cos.Ext(file.Name())
					expectedExt := df.recordExts[idx%len(df.recordExts)]
					if ext != expectedExt {
						df.m.t.Errorf("%s: record objects %q order has been disrupted: %s != %s",
							df.job(), file.Name(), ext, expectedExt,
						)
					}
				}
				idx++
			}
		}
	}

	if shard.IsCompressed(df.inputExt) {
		tlog.Logf("%s: computed output shard count (%d) vs resulting compressed (%d)\n",
			df.job(), df.outputShardCnt, realOutputShardCnt)
	}
	if df.alg.Kind == dsort.Shuffle {
		if inversions == 0 {
			df.m.t.Fatalf("%s: shuffle sorting did not create any inversions", df.job())
		}
	}
}
   492  
   493  func canonicalName(recordName string) string {
   494  	return strings.TrimSuffix(recordName, cos.Ext(recordName))
   495  }
   496  
   497  func (df *dsortFramework) checkReactionResult(reaction string, expectedProblemsCnt int) {
   498  	tlog.Logf("%s: checking metrics and \"reaction\"\n", df.job())
   499  	all, err := api.MetricsDsort(df.baseParams, df.managerUUID)
   500  	tassert.CheckFatal(df.m.t, err)
   501  	if len(all) != df.m.originalTargetCount {
   502  		df.m.t.Errorf("%s: number of metrics %d is not same as number of targets %d", df.job(),
   503  			len(all), df.m.originalTargetCount)
   504  	}
   505  
   506  	switch reaction {
   507  	case cmn.IgnoreReaction:
   508  		for target, jmetrics := range all {
   509  			metrics := jmetrics.Metrics
   510  			if len(metrics.Warnings) != 0 {
   511  				df.m.t.Errorf("%s: target %q has %s warnings: %s", df.job(), target, apc.ActDsort, metrics.Warnings)
   512  			}
   513  			if len(metrics.Errors) != 0 {
   514  				df.m.t.Errorf("%s: target %q has %s errors: %s", df.job(), target, apc.ActDsort, metrics.Errors)
   515  			}
   516  		}
   517  	case cmn.WarnReaction:
   518  		totalWarnings := 0
   519  		for target, jmetrics := range all {
   520  			metrics := jmetrics.Metrics
   521  			totalWarnings += len(metrics.Warnings)
   522  
   523  			if len(metrics.Errors) != 0 {
   524  				df.m.t.Errorf("%s: target %q has %s errors: %s", df.job(), target, apc.ActDsort, metrics.Errors)
   525  			}
   526  		}
   527  
   528  		if totalWarnings != expectedProblemsCnt {
   529  			df.m.t.Errorf("%s: number of total warnings %d is different than number of deleted shards: %d", df.job(), totalWarnings, expectedProblemsCnt)
   530  		}
   531  	case cmn.AbortReaction:
   532  		totalErrors := 0
   533  		for target, jmetrics := range all {
   534  			metrics := jmetrics.Metrics
   535  			if !metrics.Aborted.Load() {
   536  				df.m.t.Errorf("%s: %s was not aborted by target: %s", df.job(), apc.ActDsort, target)
   537  			}
   538  			totalErrors += len(metrics.Errors)
   539  		}
   540  
   541  		if totalErrors == 0 {
   542  			df.m.t.Errorf("%s: expected errors on abort, got nothing", df.job())
   543  		}
   544  	}
   545  }
   546  
   547  func (df *dsortFramework) getRecordNames(bck cmn.Bck) []shardRecords {
   548  	allShardRecords := make([]shardRecords, 0, 10)
   549  
   550  	list, err := api.ListObjects(df.baseParams, bck, nil, api.ListArgs{})
   551  	tassert.CheckFatal(df.m.t, err)
   552  
   553  	if len(list.Entries) == 0 {
   554  		df.m.t.Errorf("number of objects in bucket %q is 0", bck)
   555  	}
   556  	for _, obj := range list.Entries {
   557  		var (
   558  			buffer  bytes.Buffer
   559  			getArgs = api.GetArgs{Writer: &buffer}
   560  		)
   561  		_, err := api.GetObject(df.baseParams, bck, obj.Name, &getArgs)
   562  		tassert.CheckFatal(df.m.t, err)
   563  
   564  		files, err := tarch.GetFileInfosFromArchBuffer(buffer, archive.ExtTar)
   565  		tassert.CheckFatal(df.m.t, err)
   566  
   567  		shard := shardRecords{
   568  			name:        obj.Name,
   569  			recordNames: make([]string, len(files)),
   570  		}
   571  		for idx, file := range files {
   572  			shard.recordNames[idx] = file.Name()
   573  		}
   574  		allShardRecords = append(allShardRecords, shard)
   575  	}
   576  
   577  	return allShardRecords
   578  }
   579  
   580  func (df *dsortFramework) checkMetrics(expectAbort bool) map[string]*dsort.JobInfo {
   581  	tlog.Logf("%s: checking metrics\n", df.job())
   582  	all, err := api.MetricsDsort(df.baseParams, df.managerUUID)
   583  	tassert.CheckFatal(df.m.t, err)
   584  	if len(all) != df.m.originalTargetCount {
   585  		df.m.t.Errorf("%s: number of metrics %d is not same as number of targets %d",
   586  			df.job(), len(all), df.m.originalTargetCount)
   587  	}
   588  	for target, jmetrics := range all {
   589  		m := jmetrics.Metrics
   590  		if expectAbort && !m.Aborted.Load() {
   591  			df.m.t.Errorf("%s: %s was not aborted by target: %s", df.job(), apc.ActDsort, target)
   592  		} else if !expectAbort && m.Aborted.Load() {
   593  			df.m.t.Errorf("%s: %s was aborted by target: %s", df.job(), apc.ActDsort, target)
   594  		}
   595  	}
   596  	return all
   597  }
   598  
   599  // helper for dispatching i-th dsort job
   600  func dispatchDsortJob(m *ioContext, dsorterType string, i int) {
   601  	df := &dsortFramework{
   602  		m:             m,
   603  		dsorterType:   dsorterType,
   604  		inputTempl:    apc.ListRange{Template: fmt.Sprintf("input%d-{0..999}", i)},
   605  		outputTempl:   fmt.Sprintf("output%d-{00000..01000}", i),
   606  		shardCnt:      500,
   607  		filesPerShard: 50,
   608  		maxMemUsage:   "99%",
   609  	}
   610  
   611  	df.init()
   612  	df.createInputShards()
   613  
   614  	tlog.Logln(startingDS)
   615  	df.start()
   616  
   617  	_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
   618  	tassert.CheckFatal(m.t, err)
   619  	tlog.Logf("%s: finished\n", df.job())
   620  
   621  	df.checkMetrics(false /* expectAbort */)
   622  	df.checkOutputShards(5)
   623  }
   624  
   625  func waitForDsortPhase(t *testing.T, proxyURL, managerUUID, phaseName string, callback func()) {
   626  	tlog.Logf("waiting for %s phase...\n", phaseName)
   627  	baseParams := tools.BaseAPIParams(proxyURL)
   628  	for {
   629  		all, err := api.MetricsDsort(baseParams, managerUUID)
   630  		if err != nil { // in case of error call callback anyway
   631  			t.Error(err)
   632  			callback()
   633  			break
   634  		}
   635  
   636  		phase := true
   637  		for _, jmetrics := range all {
   638  			metrics := jmetrics.Metrics
   639  			switch phaseName {
   640  			case dsort.ExtractionPhase:
   641  				phase = phase && (metrics.Extraction.Running || metrics.Extraction.Finished)
   642  			case dsort.SortingPhase:
   643  				phase = phase && (metrics.Sorting.Running || metrics.Sorting.Finished)
   644  			case dsort.CreationPhase:
   645  				phase = phase && (metrics.Creation.Running || metrics.Creation.Finished)
   646  			default:
   647  				t.Fatal(phaseName)
   648  			}
   649  		}
   650  
   651  		if phase {
   652  			callback()
   653  			break
   654  		}
   655  		time.Sleep(100 * time.Millisecond)
   656  	}
   657  }
   658  
   659  //
   660  // tests
   661  //
   662  
   663  func TestDsort(t *testing.T) {
   664  	for _, ext := range []string{archive.ExtTar, archive.ExtTarLz4, archive.ExtZip} {
   665  		for _, lr := range []string{"list", "range"} {
   666  			t.Run(ext+"/"+lr, func(t *testing.T) {
   667  				testDsort(t, ext, lr)
   668  			})
   669  		}
   670  	}
   671  }
   672  
// testDsort runs a single dsort job for the given input archive extension
// and input-format mode (lr: "list" - explicit object names, "range" -
// template), then validates metrics and output shard order.
func testDsort(t *testing.T, ext, lr string) {
	runDsortTest(
		// Include empty ("") type - in this case type must be selected automatically.
		t, dsortTestSpec{p: true, types: append(dsorterTypes, "")},
		func(dsorterType string, t *testing.T) {
			var (
				m = &ioContext{
					t: t,
				}
				df = &dsortFramework{
					m:             m,
					inputExt:      ext,
					dsorterType:   dsorterType,
					shardCnt:      500,
					filesPerShard: 100,
					maxMemUsage:   "99%",
				}
			)
			if testing.Short() {
				df.shardCnt /= 10 // keep -short runs fast
			}

			// Initialize ioContext
			m.initAndSaveState(true /*cleanup*/)
			m.expectTargets(1)

			// Create ais bucket
			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)

			df.init()
			df.createInputShards()

			if lr == "list" {
				// iterate list
				df.inputTempl.ObjNames = df.inputShards
				df.inputTempl.Template = ""
				df.missingShards = cmn.AbortReaction // (when shards are explicitly enumerated...)
			}

			tlog.Logln(startingDS)
			df.start()

			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
			tassert.CheckFatal(t, err)
			tlog.Logf("%s: finished\n", df.job())

			df.checkMetrics(false /* expectAbort */)
			df.checkOutputShards(5)
		},
	)
}
   724  
// TestDsortNonExistingBuckets verifies that dsort fails fast when the input
// bucket does not exist, and that a missing output bucket is created on the
// fly (i.e., is not an error).
func TestDsortNonExistingBuckets(t *testing.T) {
	runDsortTest(
		t, dsortTestSpec{p: true, types: dsorterTypes},
		func(dsorterType string, t *testing.T) {
			var (
				m = &ioContext{
					t: t,
				}
				df = &dsortFramework{
					m:           m,
					dsorterType: dsorterType,
					outputBck: cmn.Bck{
						Name:     trand.String(15),
						Provider: apc.AIS,
					},
					shardCnt:      500,
					filesPerShard: 100,
					maxMemUsage:   "99%",
				}
			)

			// Initialize ioContext
			m.initAndSaveState(true /*cleanup*/)
			m.expectTargets(3)

			df.init()

			// Create ais:// output
			tools.CreateBucket(t, m.proxyURL, df.outputBck, nil, true /*cleanup*/)

			// Case 1: output exists, input does not => must fail.
			tlog.Logln(startingDS)
			spec := df.gen()
			tlog.Logf("dsort %s(-) => %s\n", m.bck, df.outputBck)
			if _, err := api.StartDsort(df.baseParams, &spec); err == nil {
				t.Error("expected dsort to fail when input bucket doesn't exist")
			}

			// Now destroy output bucket and create input bucket
			tools.DestroyBucket(t, m.proxyURL, df.outputBck)
			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)

			// Case 2: input exists, output does not => created on the fly.
			tlog.Logf("dsort %s => %s(-)\n", m.bck, df.outputBck)
			if _, err := api.StartDsort(df.baseParams, &spec); err != nil {
				t.Errorf("expected dsort to create output bucket on the fly, got: %v", err)
			}
		},
	)
}
   773  
   774  func TestDsortEmptyBucket(t *testing.T) {
   775  	runDsortTest(
   776  		t, dsortTestSpec{p: true, types: dsorterTypes, reactions: cmn.SupportedReactions},
   777  		func(dsorterType, reaction string, t *testing.T) {
   778  			var (
   779  				m = &ioContext{
   780  					t: t,
   781  				}
   782  				df = &dsortFramework{
   783  					m:             m,
   784  					dsorterType:   dsorterType,
   785  					shardCnt:      100,
   786  					filesPerShard: 10,
   787  					maxMemUsage:   "99%",
   788  					missingShards: reaction,
   789  				}
   790  			)
   791  
   792  			// Initialize ioContext
   793  			m.initAndSaveState(true /*cleanup*/)
   794  			m.expectTargets(3)
   795  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
   796  
   797  			df.init()
   798  
   799  			tlog.Logln(startingDS)
   800  			df.start()
   801  
   802  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
   803  			tassert.CheckFatal(t, err)
   804  			tlog.Logf("%s: finished\n", df.job())
   805  
   806  			df.checkMetrics(reaction == cmn.AbortReaction /*expectAbort*/)
   807  			df.checkReactionResult(reaction, df.shardCnt)
   808  		},
   809  	)
   810  }
   811  
// TestDsortOutputBucket runs dsort writing the result into a separate
// (randomly named) ais:// output bucket rather than the input one.
func TestDsortOutputBucket(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})

	runDsortTest(
		t, dsortTestSpec{p: true, types: dsorterTypes},
		func(dsorterType string, t *testing.T) {
			var (
				m = &ioContext{
					t: t,
				}
				df = &dsortFramework{
					m:           m,
					dsorterType: dsorterType,
					outputBck: cmn.Bck{
						Name:     trand.String(15),
						Provider: apc.AIS,
					},
					shardCnt:      500,
					filesPerShard: 100,
					maxMemUsage:   "99%",
				}
			)

			m.initAndSaveState(true /*cleanup*/)
			m.expectTargets(3)
			// Create ais buckets
			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)

			// Create local output bucket
			tools.CreateBucket(t, m.proxyURL, df.outputBck, nil, true /*cleanup*/)

			df.init()
			df.createInputShards()

			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
			df.start()

			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
			tassert.CheckFatal(t, err)
			tlog.Logf("%s: finished\n", df.job())

			df.checkMetrics(false /* expectAbort */)
			df.checkOutputShards(5)
		},
	)
}
   858  
   859  // TestDsortParallel runs multiple dSorts in parallel
   860  func TestDsortParallel(t *testing.T) {
   861  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
   862  
   863  	runDsortTest(
   864  		t, dsortTestSpec{p: false, types: dsorterTypes},
   865  		func(dsorterType string, t *testing.T) {
   866  			var (
   867  				m = &ioContext{
   868  					t: t,
   869  				}
   870  				dSortsCount = 5
   871  			)
   872  
   873  			m.initAndSaveState(true /*cleanup*/)
   874  			m.expectTargets(3)
   875  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
   876  
   877  			wg := &sync.WaitGroup{}
   878  			for i := range dSortsCount {
   879  				wg.Add(1)
   880  				go func(i int) {
   881  					defer wg.Done()
   882  					dispatchDsortJob(m, dsorterType, i)
   883  				}(i)
   884  			}
   885  			wg.Wait()
   886  		},
   887  	)
   888  }
   889  
   890  // TestDsortChain runs multiple dSorts one after another
   891  func TestDsortChain(t *testing.T) {
   892  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
   893  
   894  	runDsortTest(
   895  		t, dsortTestSpec{p: true, types: dsorterTypes},
   896  		func(dsorterType string, t *testing.T) {
   897  			var (
   898  				m = &ioContext{
   899  					t: t,
   900  				}
   901  				dSortsCount = 5
   902  			)
   903  
   904  			m.initAndSaveState(true /*cleanup*/)
   905  			m.expectTargets(3)
   906  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
   907  
   908  			for i := range dSortsCount {
   909  				dispatchDsortJob(m, dsorterType, i)
   910  			}
   911  		},
   912  	)
   913  }
   914  
   915  func TestDsortShuffle(t *testing.T) {
   916  	runDsortTest(
   917  		t, dsortTestSpec{p: true, types: dsorterTypes},
   918  		func(dsorterType string, t *testing.T) {
   919  			var (
   920  				m = &ioContext{
   921  					t: t,
   922  				}
   923  				df = &dsortFramework{
   924  					m:             m,
   925  					dsorterType:   dsorterType,
   926  					alg:           &dsort.Algorithm{Kind: dsort.Shuffle},
   927  					shardCnt:      500,
   928  					filesPerShard: 10,
   929  					maxMemUsage:   "99%",
   930  				}
   931  			)
   932  
   933  			m.initAndSaveState(true /*cleanup*/)
   934  			m.expectTargets(3)
   935  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
   936  
   937  			df.init()
   938  			df.createInputShards()
   939  
   940  			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
   941  			df.start()
   942  
   943  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
   944  			tassert.CheckFatal(t, err)
   945  			tlog.Logf("%s: finished\n", df.job())
   946  
   947  			df.checkMetrics(false /* expectAbort */)
   948  			df.checkOutputShards(5)
   949  		},
   950  	)
   951  }
   952  
   953  func TestDsortDisk(t *testing.T) {
   954  	runDsortTest(
   955  		t, dsortTestSpec{p: true, types: dsorterTypes},
   956  		func(dsorterType string, t *testing.T) {
   957  			var (
   958  				m = &ioContext{
   959  					t: t,
   960  				}
   961  				df = &dsortFramework{
   962  					m:             m,
   963  					dsorterType:   dsorterType,
   964  					outputTempl:   "output-%d",
   965  					shardCnt:      100,
   966  					filesPerShard: 10,
   967  					maxMemUsage:   "1KB",
   968  				}
   969  			)
   970  
   971  			m.initAndSaveState(true /*cleanup*/)
   972  			m.expectTargets(3)
   973  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
   974  
   975  			df.init()
   976  			df.createInputShards()
   977  			tlog.Logf("starting dsort with spilling to disk... (%d/%d)\n", df.shardCnt, df.filesPerShard)
   978  			df.start()
   979  
   980  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
   981  			tassert.CheckFatal(t, err)
   982  			tlog.Logf("%s: finished\n", df.job())
   983  
   984  			all := df.checkMetrics(false /* expectAbort */)
   985  			for target, jmetrics := range all {
   986  				metrics := jmetrics.Metrics
   987  				if metrics.Extraction.ExtractedToDiskCnt == 0 && metrics.Extraction.ExtractedCnt > 0 {
   988  					t.Errorf("target %s did not extract any files do disk", target)
   989  				}
   990  			}
   991  
   992  			df.checkOutputShards(0)
   993  		},
   994  	)
   995  }
   996  
   997  func TestDsortCompressionDisk(t *testing.T) {
   998  	for _, ext := range []string{archive.ExtTgz, archive.ExtTarLz4, archive.ExtZip} {
   999  		t.Run(ext, func(t *testing.T) {
  1000  			runDsortTest(
  1001  				t, dsortTestSpec{p: true, types: dsorterTypes},
  1002  				func(dsorterType string, t *testing.T) {
  1003  					var (
  1004  						m = &ioContext{
  1005  							t: t,
  1006  						}
  1007  						df = &dsortFramework{
  1008  							m:             m,
  1009  							dsorterType:   dsorterType,
  1010  							shardCnt:      200,
  1011  							filesPerShard: 50,
  1012  							inputExt:      ext,
  1013  							maxMemUsage:   "1KB",
  1014  						}
  1015  					)
  1016  
  1017  					m.initAndSaveState(true /*cleanup*/)
  1018  					m.expectTargets(3)
  1019  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1020  
  1021  					df.init()
  1022  					df.createInputShards()
  1023  
  1024  					tlog.Logf("starting dsort: %d/%d, %s\n",
  1025  						df.shardCnt, df.filesPerShard, df.inputExt)
  1026  					df.start()
  1027  
  1028  					_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1029  					tassert.CheckFatal(t, err)
  1030  					tlog.Logf("%s: finished\n", df.job())
  1031  
  1032  					df.checkMetrics(false /* expectAbort */)
  1033  					df.checkOutputShards(5)
  1034  				},
  1035  			)
  1036  		})
  1037  	}
  1038  }
  1039  
  1040  func TestDsortMemDisk(t *testing.T) {
  1041  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1042  
  1043  	var (
  1044  		m = &ioContext{
  1045  			t: t,
  1046  		}
  1047  		df = &dsortFramework{
  1048  			m:             m,
  1049  			dsorterType:   dsort.GeneralType,
  1050  			shardCnt:      500,
  1051  			fileSz:        cos.MiB,
  1052  			filesPerShard: 5,
  1053  		}
  1054  		mem sys.MemStat
  1055  	)
  1056  
  1057  	m.initAndSaveState(true /*cleanup*/)
  1058  	m.expectTargets(3)
  1059  	tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1060  
  1061  	df.init()
  1062  	df.createInputShards()
  1063  
  1064  	// Try to free all memory to get estimated actual used memory size
  1065  	rdebug.FreeOSMemory()
  1066  
  1067  	// Get current memory
  1068  	err := mem.Get()
  1069  	tassert.CheckFatal(t, err)
  1070  	df.maxMemUsage = cos.ToSizeIEC(int64(mem.ActualUsed+500*cos.MiB), 2)
  1071  
  1072  	tlog.Logf("starting dsort with memory and disk (max mem usage: %s)... (%d/%d)\n", df.maxMemUsage,
  1073  		df.shardCnt, df.filesPerShard)
  1074  	df.start()
  1075  
  1076  	_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1077  	tassert.CheckFatal(t, err)
  1078  	tlog.Logf("%s: finished\n", df.job())
  1079  
  1080  	all := df.checkMetrics(false /* expectAbort */)
  1081  	var (
  1082  		extractedToDisk int64
  1083  		extractedTotal  int64
  1084  	)
  1085  	for _, jmetrics := range all {
  1086  		metrics := jmetrics.Metrics
  1087  		extractedToDisk += metrics.Extraction.ExtractedToDiskCnt
  1088  		extractedTotal += metrics.Extraction.ExtractedCnt
  1089  	}
  1090  
  1091  	if extractedToDisk == 0 {
  1092  		t.Error("all extractions by all targets were done exclusively into memory")
  1093  	}
  1094  	if extractedToDisk == extractedTotal {
  1095  		t.Error("all extractions by all targets were done exclusively into disk")
  1096  	}
  1097  
  1098  	df.checkOutputShards(5)
  1099  }
  1100  
  1101  func TestDsortMinMemCompression(t *testing.T) {
  1102  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1103  	for _, ext := range []string{archive.ExtTarGz, archive.ExtTarLz4, archive.ExtZip} {
  1104  		for _, maxMem := range []string{"10%", "1%"} {
  1105  			t.Run(ext+"/mem="+maxMem, func(t *testing.T) {
  1106  				minMemCompression(t, ext, maxMem)
  1107  			})
  1108  		}
  1109  	}
  1110  }
  1111  
  1112  func minMemCompression(t *testing.T, ext, maxMem string) {
  1113  	var (
  1114  		m = &ioContext{
  1115  			t: t,
  1116  		}
  1117  		df = &dsortFramework{
  1118  			m:             m,
  1119  			dsorterType:   dsort.GeneralType,
  1120  			shardCnt:      500,
  1121  			fileSz:        cos.MiB,
  1122  			filesPerShard: 5,
  1123  			inputExt:      ext,
  1124  			maxMemUsage:   maxMem,
  1125  		}
  1126  		mem sys.MemStat
  1127  	)
  1128  
  1129  	m.initAndSaveState(true /*cleanup*/)
  1130  	m.expectTargets(3)
  1131  	tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1132  
  1133  	df.init()
  1134  	df.createInputShards()
  1135  
  1136  	// Try to free all memory to get estimated actual used memory size
  1137  	rdebug.FreeOSMemory()
  1138  
  1139  	// Get current memory
  1140  	err := mem.Get()
  1141  	tassert.CheckFatal(t, err)
  1142  	df.maxMemUsage = cos.ToSizeIEC(int64(mem.ActualUsed+300*cos.MiB), 2)
  1143  
  1144  	tlog.Logf("starting dsort with memory, disk, and compression (max mem usage: %s) ... %d/%d, %s\n",
  1145  		df.maxMemUsage, df.shardCnt, df.filesPerShard, df.inputExt)
  1146  	df.start()
  1147  
  1148  	_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1149  	tassert.CheckFatal(t, err)
  1150  	tlog.Logf("%s: finished\n", df.job())
  1151  
  1152  	all := df.checkMetrics(false /*expectAbort*/)
  1153  	var (
  1154  		extractedToDisk int64
  1155  		extractedTotal  int64
  1156  	)
  1157  	for _, jmetrics := range all {
  1158  		metrics := jmetrics.Metrics
  1159  		extractedToDisk += metrics.Extraction.ExtractedToDiskCnt
  1160  		extractedTotal += metrics.Extraction.ExtractedCnt
  1161  	}
  1162  
  1163  	if extractedToDisk == 0 {
  1164  		t.Error("all extractions by all targets were done exclusively into memory")
  1165  	}
  1166  	if extractedToDisk == extractedTotal {
  1167  		t.Error("all extractions by all targets were done exclusively into disk")
  1168  	}
  1169  
  1170  	df.checkOutputShards(5)
  1171  }
  1172  
  1173  func TestDsortZipLz4(t *testing.T) {
  1174  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1175  
  1176  	for _, ext := range []string{archive.ExtZip, archive.ExtTarLz4} {
  1177  		t.Run(ext, func(t *testing.T) {
  1178  			runDsortTest(
  1179  				t, dsortTestSpec{p: true, types: dsorterTypes},
  1180  				func(dsorterType string, t *testing.T) {
  1181  					var (
  1182  						err error
  1183  						m   = &ioContext{
  1184  							t: t,
  1185  						}
  1186  						df = &dsortFramework{
  1187  							m:             m,
  1188  							dsorterType:   dsorterType,
  1189  							shardCnt:      500,
  1190  							filesPerShard: 100,
  1191  							inputExt:      ext,
  1192  							maxMemUsage:   "99%",
  1193  						}
  1194  					)
  1195  
  1196  					m.initAndSaveState(true /*cleanup*/)
  1197  					m.expectTargets(3)
  1198  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1199  
  1200  					df.init()
  1201  					df.createInputShards()
  1202  
  1203  					tlog.Logf("starting dsort: %d/%d, %s\n", df.shardCnt, df.filesPerShard, df.inputExt)
  1204  					df.start()
  1205  
  1206  					_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1207  					tassert.CheckFatal(t, err)
  1208  					tlog.Logf("%s: finished\n", df.job())
  1209  
  1210  					df.checkMetrics(false /* expectAbort */)
  1211  					df.checkOutputShards(5)
  1212  				},
  1213  			)
  1214  		})
  1215  	}
  1216  }
  1217  
  1218  func TestDsortMaxMemCompression(t *testing.T) {
  1219  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1220  	for _, ext := range []string{archive.ExtTgz, archive.ExtTarLz4, archive.ExtZip} {
  1221  		t.Run(ext, func(t *testing.T) {
  1222  			runDsortTest(
  1223  				t, dsortTestSpec{p: true, types: dsorterTypes},
  1224  				func(dsorterType string, t *testing.T) {
  1225  					var (
  1226  						err error
  1227  						m   = &ioContext{
  1228  							t: t,
  1229  						}
  1230  						df = &dsortFramework{
  1231  							m:             m,
  1232  							dsorterType:   dsorterType,
  1233  							shardCnt:      500,
  1234  							filesPerShard: 50,
  1235  							inputExt:      ext,
  1236  							maxMemUsage:   "99%",
  1237  						}
  1238  					)
  1239  
  1240  					m.initAndSaveState(true /*cleanup*/)
  1241  					m.expectTargets(3)
  1242  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1243  
  1244  					df.init()
  1245  					df.createInputShards()
  1246  
  1247  					tlog.Logf("starting dsort: %d/%d, %s\n", df.shardCnt, df.filesPerShard, df.inputExt)
  1248  					df.start()
  1249  
  1250  					_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1251  					tassert.CheckFatal(t, err)
  1252  					tlog.Logf("%s: finished\n", df.job())
  1253  
  1254  					df.checkMetrics(false /* expectAbort */)
  1255  					df.checkOutputShards(5)
  1256  				},
  1257  			)
  1258  		})
  1259  	}
  1260  }
  1261  
  1262  func TestDsortContent(t *testing.T) {
  1263  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1264  
  1265  	runDsortTest(
  1266  		t, dsortTestSpec{p: true, types: dsorterTypes},
  1267  		func(dsorterType string, t *testing.T) {
  1268  			cases := []struct {
  1269  				extension      string
  1270  				contentKeyType string
  1271  				missingKeys    bool
  1272  			}{
  1273  				{".loss", shard.ContentKeyInt, false},
  1274  				{".cls", shard.ContentKeyFloat, false},
  1275  				{".smth", shard.ContentKeyString, false},
  1276  
  1277  				{".loss", shard.ContentKeyInt, true},
  1278  				{".cls", shard.ContentKeyFloat, true},
  1279  				{".smth", shard.ContentKeyString, true},
  1280  			}
  1281  
  1282  			for _, entry := range cases {
  1283  				entry := entry // pin
  1284  				test := fmt.Sprintf("%s-%v", entry.contentKeyType, entry.missingKeys)
  1285  				t.Run(test, func(t *testing.T) {
  1286  					t.Parallel()
  1287  
  1288  					var (
  1289  						m = &ioContext{
  1290  							t: t,
  1291  						}
  1292  						df = &dsortFramework{
  1293  							m:           m,
  1294  							dsorterType: dsorterType,
  1295  							alg: &dsort.Algorithm{
  1296  								Kind:           dsort.Content,
  1297  								Ext:            entry.extension,
  1298  								ContentKeyType: entry.contentKeyType,
  1299  							},
  1300  							missingKeys:   entry.missingKeys,
  1301  							shardCnt:      500,
  1302  							filesPerShard: 100,
  1303  							maxMemUsage:   "90%",
  1304  						}
  1305  					)
  1306  
  1307  					m.initAndSaveState(true /*cleanup*/)
  1308  					m.expectTargets(3)
  1309  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1310  
  1311  					df.init()
  1312  					df.createInputShards()
  1313  
  1314  					tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1315  					df.start()
  1316  
  1317  					aborted, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1318  					tassert.CheckFatal(t, err)
  1319  					if entry.missingKeys && !aborted {
  1320  						t.Errorf("%s was not aborted", apc.ActDsort)
  1321  					}
  1322  
  1323  					tlog.Logf("%s: checking metrics\n", df.job())
  1324  					all, err := api.MetricsDsort(df.baseParams, df.managerUUID)
  1325  					tassert.CheckFatal(t, err)
  1326  					if len(all) != m.originalTargetCount {
  1327  						t.Errorf("number of metrics %d is not same as the number of targets %d",
  1328  							len(all), m.originalTargetCount)
  1329  					}
  1330  
  1331  					for target, jmetrics := range all {
  1332  						metrics := jmetrics.Metrics
  1333  						if entry.missingKeys && !metrics.Aborted.Load() {
  1334  							t.Errorf("%s was not aborted by target: %s", target, apc.ActDsort)
  1335  						}
  1336  					}
  1337  
  1338  					if !entry.missingKeys {
  1339  						df.checkOutputShards(5)
  1340  					}
  1341  				})
  1342  			}
  1343  		},
  1344  	)
  1345  }
  1346  
  1347  func TestDsortAbort(t *testing.T) {
  1348  	runDsortTest(
  1349  		t, dsortTestSpec{p: true, types: dsorterTypes},
  1350  		func(dsorterType string, t *testing.T) {
  1351  			for _, asXaction := range []bool{false, true} {
  1352  				test := dsorterType + "/" + fmt.Sprintf("as-xaction=%t", asXaction)
  1353  				t.Run(test, func(t *testing.T) {
  1354  					var (
  1355  						err error
  1356  						m   = &ioContext{
  1357  							t: t,
  1358  						}
  1359  						df = &dsortFramework{
  1360  							m:             m,
  1361  							dsorterType:   dsorterType,
  1362  							shardCnt:      500,
  1363  							filesPerShard: 10,
  1364  						}
  1365  					)
  1366  
  1367  					m.initAndSaveState(false /*cleanup*/)
  1368  					m.expectTargets(3)
  1369  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1370  
  1371  					df.init()
  1372  					df.createInputShards()
  1373  
  1374  					tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1375  					df.start()
  1376  
  1377  					if asXaction {
  1378  						tlog.Logf("aborting dsort[%s] via api.AbortXaction\n", df.managerUUID)
  1379  						err = api.AbortXaction(df.baseParams, &xact.ArgsMsg{ID: df.managerUUID})
  1380  					} else {
  1381  						tlog.Logf("aborting dsort[%s] via api.AbortDsort\n", df.managerUUID)
  1382  						err = api.AbortDsort(df.baseParams, df.managerUUID)
  1383  					}
  1384  					tassert.CheckFatal(t, err)
  1385  
  1386  					_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1387  					tassert.CheckFatal(t, err)
  1388  
  1389  					df.checkMetrics(true /* expectAbort */)
  1390  				})
  1391  			}
  1392  		},
  1393  	)
  1394  }
  1395  
  1396  func TestDsortAbortDuringPhases(t *testing.T) {
  1397  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1398  
  1399  	runDsortTest(
  1400  		t, dsortTestSpec{p: true, types: dsorterTypes, phases: dsortPhases},
  1401  		func(dsorterType, phase string, t *testing.T) {
  1402  			for _, asXaction := range []bool{false, true} {
  1403  				test := dsorterType + "/" + fmt.Sprintf("as-xaction=%t", asXaction)
  1404  				t.Run(test, func(t *testing.T) {
  1405  					var (
  1406  						m = &ioContext{
  1407  							t: t,
  1408  						}
  1409  						df = &dsortFramework{
  1410  							m:             m,
  1411  							dsorterType:   dsorterType,
  1412  							shardCnt:      500,
  1413  							filesPerShard: 200,
  1414  						}
  1415  					)
  1416  
  1417  					if phase == dsort.SortingPhase && asXaction {
  1418  						t.Skipf("skipping %s", t.Name()) // TODO -- FIXME: remove
  1419  					}
  1420  
  1421  					m.initAndSaveState(true /*cleanup*/)
  1422  					m.expectTargets(3)
  1423  
  1424  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1425  
  1426  					df.init()
  1427  					df.createInputShards()
  1428  
  1429  					tlog.Logf("starting dsort (abort on: %s)...\n", phase)
  1430  					df.start()
  1431  
  1432  					waitForDsortPhase(t, m.proxyURL, df.managerUUID, phase, func() {
  1433  						var err error
  1434  						if asXaction {
  1435  							tlog.Logf("aborting dsort[%s] via api.AbortXaction\n", df.managerUUID)
  1436  							err = api.AbortXaction(df.baseParams, &xact.ArgsMsg{ID: df.managerUUID})
  1437  						} else {
  1438  							tlog.Logf("aborting dsort[%s] via api.AbortDsort\n", df.managerUUID)
  1439  							err = api.AbortDsort(df.baseParams, df.managerUUID)
  1440  						}
  1441  						tassert.CheckFatal(t, err)
  1442  					})
  1443  
  1444  					_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1445  					tassert.CheckFatal(t, err)
  1446  
  1447  					df.checkMetrics(true /* expectAbort */)
  1448  				})
  1449  			}
  1450  		},
  1451  	)
  1452  }
  1453  
  1454  func TestDsortKillTargetDuringPhases(t *testing.T) {
  1455  	t.Skip("test is flaky, run it only when necessary")
  1456  
  1457  	runDsortTest(
  1458  		t, dsortTestSpec{p: false, types: dsorterTypes, phases: dsortPhases},
  1459  		func(dsorterType, phase string, t *testing.T) {
  1460  			var (
  1461  				m = &ioContext{
  1462  					t: t,
  1463  				}
  1464  				df = &dsortFramework{
  1465  					m:             m,
  1466  					dsorterType:   dsorterType,
  1467  					outputTempl:   "output-{0..100000}",
  1468  					shardCnt:      1000,
  1469  					filesPerShard: 500,
  1470  				}
  1471  				target *meta.Snode
  1472  			)
  1473  
  1474  			m.initAndSaveState(true /*cleanup*/)
  1475  			m.expectTargets(3)
  1476  
  1477  			df.init()
  1478  
  1479  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1480  
  1481  			df.createInputShards()
  1482  
  1483  			tlog.Logf("starting dsort (abort on: %s)...\n", phase)
  1484  			df.start()
  1485  
  1486  			waitForDsortPhase(t, m.proxyURL, df.managerUUID, phase, func() {
  1487  				// It may require calling AbortXaction(rebalance) &
  1488  				// WaitForRebalAndResil() before unregistering
  1489  				target = m.startMaintenanceNoRebalance()
  1490  			})
  1491  
  1492  			aborted, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1493  			tassert.CheckError(t, err)
  1494  			if !aborted {
  1495  				t.Errorf("%s was not aborted", apc.ActDsort)
  1496  			}
  1497  
  1498  			tlog.Logf("%s: checking metrics\n", df.job())
  1499  			all, err := api.MetricsDsort(df.baseParams, df.managerUUID)
  1500  			tassert.CheckError(t, err)
  1501  			if len(all) == m.originalTargetCount {
  1502  				t.Errorf("number of metrics %d is same as number of original targets %d",
  1503  					len(all), m.originalTargetCount)
  1504  			}
  1505  
  1506  			for target, jmetrics := range all {
  1507  				metrics := jmetrics.Metrics
  1508  				if !metrics.Aborted.Load() {
  1509  					t.Errorf("%s was not aborted by target: %s", apc.ActDsort, target)
  1510  				}
  1511  			}
  1512  
  1513  			rebID := m.stopMaintenance(target)
  1514  			tools.WaitForRebalanceByID(t, df.baseParams, rebID)
  1515  		},
  1516  	)
  1517  }
  1518  
// TestDsortManipulateMountpathDuringPhases attaches or detaches a mountpath on
// every target while a dsort job is in a given phase, and expects the job to
// abort. Currently disabled (skipped unconditionally below).
func TestDsortManipulateMountpathDuringPhases(t *testing.T) {
	t.Skipf("skipping %s", t.Name())

	runDsortTest(
		t, dsortTestSpec{p: false, types: dsorterTypes, phases: dsortPhases},
		func(dsorterType, phase string, t *testing.T) {
			// adding == true: create and attach NEW mountpaths mid-run;
			// adding == false: detach EXISTING mountpaths mid-run.
			for _, adding := range []bool{false, true} {
				t.Run(strconv.FormatBool(adding), func(t *testing.T) {
					var (
						m = &ioContext{
							t: t,
						}
						df = &dsortFramework{
							m:             m,
							dsorterType:   dsorterType,
							outputTempl:   "output-{0..100000}",
							shardCnt:      500,
							filesPerShard: 200,
						}

						// Per-target mountpath to manipulate during the run.
						mountpaths = make(map[*meta.Snode]string)
					)

					m.initAndSaveState(true /*cleanup*/)
					m.expectTargets(3)

					// Initialize `df.baseParams`
					df.init()

					// Pick (or create) one mountpath per active target.
					targets := m.smap.Tmap.ActiveNodes()
					for idx, target := range targets {
						if adding {
							mpath := fmt.Sprintf("%s-%d", testMpath, idx)
							if docker.IsRunning() {
								err := docker.CreateMpathDir(0, mpath)
								tassert.CheckFatal(t, err)
							} else {
								err := cos.CreateDir(mpath)
								tassert.CheckFatal(t, err)
							}

							mountpaths[target] = mpath
						} else {
							targetMountpaths, err := api.GetMountpaths(df.baseParams, target)
							tassert.CheckFatal(t, err)
							mountpaths[target] = targetMountpaths.Available[0]
						}
					}

					// Undo the mid-run manipulation: detach what was added or
					// re-attach what was detached, then wait for resilvering.
					t.Cleanup(func() {
						// Wait for any resilver that might be still running.
						tools.WaitForResilvering(t, df.baseParams, nil)

						for target, mpath := range mountpaths {
							if adding {
								tlog.Logf("removing mountpath %q from %s...\n", mpath, target.ID())
								err := api.DetachMountpath(df.baseParams, target, mpath, true /*dont-resil*/)
								tassert.CheckError(t, err)
								err = os.RemoveAll(mpath)
								tassert.CheckError(t, err)
							} else {
								tlog.Logf("adding mountpath %q to %s...\n", mpath, target.ID())
								err := api.AttachMountpath(df.baseParams, target, mpath)
								tassert.CheckError(t, err)
							}
						}

						tools.WaitForResilvering(t, df.baseParams, nil)
					})

					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)

					df.createInputShards()

					tlog.Logf("starting dsort (abort on: %s)...\n", phase)
					df.start()

					// Once the requested phase begins, manipulate the chosen
					// mountpath on every target (attach or detach).
					waitForDsortPhase(t, m.proxyURL, df.managerUUID, phase, func() {
						for target, mpath := range mountpaths {
							if adding {
								tlog.Logf("adding new mountpath %q to %s...\n", mpath, target.ID())
								err := api.AttachMountpath(df.baseParams, target, mpath)
								tassert.CheckFatal(t, err)
							} else {
								tlog.Logf("removing mountpath %q from %s...\n", mpath, target.ID())
								err := api.DetachMountpath(df.baseParams, target,
									mpath, false /*dont-resil*/)
								tassert.CheckFatal(t, err)
							}
						}
					})

					_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
					tassert.CheckError(t, err)

					// Mountpath changes during a run must abort the job.
					df.checkMetrics(true /*expectAbort*/)
				})
			}
		},
	)
}
  1620  
  1621  func TestDsortAddTarget(t *testing.T) {
  1622  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1623  
  1624  	runDsortTest(
  1625  		t, dsortTestSpec{p: false, types: dsorterTypes},
  1626  		func(dsorterType string, t *testing.T) {
  1627  			var (
  1628  				m = &ioContext{
  1629  					t: t,
  1630  				}
  1631  				df = &dsortFramework{
  1632  					m:             m,
  1633  					dsorterType:   dsorterType,
  1634  					outputTempl:   "output-{0..100000}",
  1635  					shardCnt:      1000,
  1636  					filesPerShard: 200,
  1637  				}
  1638  			)
  1639  
  1640  			m.initAndSaveState(true /*cleanup*/)
  1641  			m.expectTargets(3)
  1642  
  1643  			df.init()
  1644  
  1645  			target := m.startMaintenanceNoRebalance()
  1646  
  1647  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1648  
  1649  			df.createInputShards()
  1650  
  1651  			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1652  			df.start()
  1653  
  1654  			defer tools.WaitForRebalAndResil(t, df.baseParams)
  1655  
  1656  			waitForDsortPhase(t, m.proxyURL, df.managerUUID, dsort.ExtractionPhase, func() {
  1657  				m.stopMaintenance(target)
  1658  			})
  1659  
  1660  			aborted, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1661  			tassert.CheckFatal(t, err)
  1662  			if !aborted {
  1663  				t.Errorf("%s was not aborted", apc.ActDsort)
  1664  			}
  1665  
  1666  			tlog.Logf("%s: checking metrics\n", df.job())
  1667  			allMetrics, err := api.MetricsDsort(df.baseParams, df.managerUUID)
  1668  			tassert.CheckFatal(t, err)
  1669  			if len(allMetrics) != m.originalTargetCount-1 {
  1670  				t.Errorf("number of metrics %d is different than number of targets when %s started %d",
  1671  					len(allMetrics), apc.ActDsort, m.originalTargetCount-1)
  1672  			}
  1673  		},
  1674  	)
  1675  }
  1676  
  1677  func TestDsortMetricsAfterFinish(t *testing.T) {
  1678  	runDsortTest(
  1679  		t, dsortTestSpec{p: true, types: dsorterTypes},
  1680  		func(dsorterType string, t *testing.T) {
  1681  			var (
  1682  				m = &ioContext{
  1683  					t: t,
  1684  				}
  1685  				df = &dsortFramework{
  1686  					m:             m,
  1687  					dsorterType:   dsorterType,
  1688  					outputTempl:   "output-{0..1000}",
  1689  					shardCnt:      50,
  1690  					filesPerShard: 10,
  1691  				}
  1692  			)
  1693  
  1694  			m.initAndSaveState(true /*cleanup*/)
  1695  			m.expectTargets(3)
  1696  
  1697  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1698  
  1699  			df.init()
  1700  			df.createInputShards()
  1701  
  1702  			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1703  			df.start()
  1704  
  1705  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1706  			tassert.CheckFatal(t, err)
  1707  			tlog.Logf("%s: finished\n", df.job())
  1708  
  1709  			df.checkMetrics(false /* expectAbort */)
  1710  			df.checkOutputShards(0)
  1711  
  1712  			tlog.Logln("checking if metrics are still accessible after some time..")
  1713  			time.Sleep(2 * time.Second)
  1714  
  1715  			// Check if metrics can be fetched after some time
  1716  			df.checkMetrics(false /* expectAbort */)
  1717  		},
  1718  	)
  1719  }
  1720  
  1721  func TestDsortSelfAbort(t *testing.T) {
  1722  	runDsortTest(
  1723  		t, dsortTestSpec{p: true, types: dsorterTypes},
  1724  		func(dsorterType string, t *testing.T) {
  1725  			var (
  1726  				m = &ioContext{
  1727  					t: t,
  1728  				}
  1729  				df = &dsortFramework{
  1730  					m:             m,
  1731  					dsorterType:   dsorterType,
  1732  					shardCnt:      500,
  1733  					filesPerShard: 100,
  1734  					missingShards: cmn.AbortReaction,
  1735  				}
  1736  			)
  1737  
  1738  			m.initAndSaveState(true /*cleanup*/)
  1739  			m.expectTargets(3)
  1740  
  1741  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1742  
  1743  			df.init()
  1744  
  1745  			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1746  			df.start()
  1747  
  1748  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1749  			tassert.CheckFatal(t, err)
  1750  			tlog.Logf("%s: finished\n", df.job())
  1751  
  1752  			// Wait a while for all targets to abort
  1753  			time.Sleep(2 * time.Second)
  1754  
  1755  			df.checkMetrics(true /* expectAbort */)
  1756  		},
  1757  	)
  1758  }
  1759  
  1760  func TestDsortOnOOM(t *testing.T) {
  1761  	t.Skip("test can take more than couple minutes, run it only when necessary")
  1762  
  1763  	runDsortTest(
  1764  		t, dsortTestSpec{p: false, types: dsorterTypes},
  1765  		func(dsorterType string, t *testing.T) {
  1766  			var (
  1767  				m = &ioContext{
  1768  					t: t,
  1769  				}
  1770  				df = &dsortFramework{
  1771  					m:             m,
  1772  					dsorterType:   dsorterType,
  1773  					filesPerShard: 200,
  1774  					fileSz:        10 * cos.MiB,
  1775  					maxMemUsage:   "80%",
  1776  				}
  1777  				mem sys.MemStat
  1778  			)
  1779  
  1780  			err := mem.Get()
  1781  			tassert.CheckFatal(t, err)
  1782  
  1783  			// Calculate number of shards to cause OOM and overestimate it to make sure
  1784  			// that if dsort doesn't prevent it, it will happen. Notice that maxMemUsage
  1785  			// is 80% so dsort should never go above this number in memory usage.
  1786  			df.shardCnt = int(float64(mem.ActualFree/uint64(df.fileSz)/uint64(df.filesPerShard)) * 1.4)
  1787  
  1788  			m.initAndSaveState(true /*cleanup*/)
  1789  			m.expectTargets(3)
  1790  
  1791  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1792  
  1793  			df.init()
  1794  			df.createInputShards()
  1795  
  1796  			tlog.Logf("starting dsort: %d/%d\n", df.shardCnt, df.filesPerShard)
  1797  			df.start()
  1798  
  1799  			_, err = tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1800  			tassert.CheckFatal(t, err)
  1801  			tlog.Logf("%s: finished\n", df.job())
  1802  
  1803  			df.checkMetrics(false /* expectAbort */)
  1804  			df.checkOutputShards(5)
  1805  		},
  1806  	)
  1807  }
  1808  
// TestDsortMissingShards skips creating a subset of the input shards and
// verifies each supported `missing_shards` reaction, configured either
// cluster-wide (scopeConfig) or per-request (scopeSpec).
func TestDsortMissingShards(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
	for _, ext := range []string{archive.ExtTar, archive.ExtTarLz4} {
		t.Run(ext, func(t *testing.T) {
			runDsortTest(
				t, dsortTestSpec{
					p:         false,
					types:     dsorterTypes,
					reactions: cmn.SupportedReactions,
					scopes:    dsortSettingScopes,
				},
				func(dsorterType, reaction, scope string, t *testing.T) {
					// Cluster-config changes are global, so those subtests
					// must not run in parallel with each other.
					if scope != scopeConfig {
						t.Parallel()
					}

					var (
						m = &ioContext{
							t: t,
						}
						// shardCntToSkip of the shardCnt input shards are
						// deliberately never created.
						df = &dsortFramework{
							m:              m,
							dsorterType:    dsorterType,
							outputTempl:    "output-{0..100000}",
							shardCnt:       500,
							shardCntToSkip: 50,
							filesPerShard:  200,
							inputExt:       ext,
						}
					)

					m.initAndSaveState(true /*cleanup*/)
					m.expectTargets(3)

					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)

					// Apply the reaction at the chosen scope: cluster config
					// (restored to "ignore" on exit) or the request spec.
					switch scope {
					case scopeConfig:
						defer tools.SetClusterConfig(t,
							cos.StrKVs{"distributed_sort.missing_shards": cmn.IgnoreReaction})
						tools.SetClusterConfig(t, cos.StrKVs{"distributed_sort.missing_shards": reaction})

						tlog.Logf("changed `missing_shards` config to: %s\n", reaction)
					case scopeSpec:
						df.missingShards = reaction
						tlog.Logf("set `missing_shards` in request spec to: %s\n", reaction)
					default:
						cos.AssertMsg(false, scope)
					}

					df.init()
					df.createInputShards()

					tlog.Logf("starting dsort: %d/%d, %s\n", df.shardCnt, df.filesPerShard, df.inputExt)
					df.start()

					_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
					tassert.CheckFatal(t, err)
					tlog.Logf("%s: finished\n", df.job())

					// Verify the outcome matches the configured reaction.
					df.checkReactionResult(reaction, df.shardCntToSkip)
				},
			)
		})
	}
}
  1875  
  1876  func TestDsortDuplications(t *testing.T) {
  1877  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1878  	for _, ext := range []string{archive.ExtTar, archive.ExtTarLz4, archive.ExtTarGz, archive.ExtZip} { // all supported formats
  1879  		t.Run(ext, func(t *testing.T) {
  1880  			runDsortTest(
  1881  				t, dsortTestSpec{
  1882  					p:         false,
  1883  					types:     dsorterTypes,
  1884  					reactions: cmn.SupportedReactions,
  1885  					scopes:    dsortSettingScopes,
  1886  				},
  1887  				func(dsorterType, reaction, scope string, t *testing.T) {
  1888  					if scope != scopeConfig {
  1889  						t.Parallel()
  1890  					}
  1891  					var (
  1892  						m = &ioContext{
  1893  							t: t,
  1894  						}
  1895  						df = &dsortFramework{
  1896  							m:                     m,
  1897  							dsorterType:           dsorterType,
  1898  							outputTempl:           "output-{0..100000}",
  1899  							shardCnt:              500,
  1900  							filesPerShard:         200,
  1901  							recordDuplicationsCnt: 50,
  1902  							inputExt:              ext,
  1903  						}
  1904  					)
  1905  					m.initAndSaveState(false /*cleanup*/)
  1906  					m.expectTargets(3)
  1907  					tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1908  
  1909  					switch scope {
  1910  					case scopeConfig:
  1911  						defer tools.SetClusterConfig(t,
  1912  							cos.StrKVs{"distributed_sort.duplicated_records": cmn.AbortReaction})
  1913  						tools.SetClusterConfig(t, cos.StrKVs{"distributed_sort.duplicated_records": reaction})
  1914  
  1915  						tlog.Logf("changed `duplicated_records` config to: %s\n", reaction)
  1916  					case scopeSpec:
  1917  						df.duplicatedRecords = reaction
  1918  						tlog.Logf("set `duplicated_records` in request spec to: %s\n", reaction)
  1919  					default:
  1920  						cos.AssertMsg(false, scope)
  1921  					}
  1922  
  1923  					df.init()
  1924  					df.createInputShards()
  1925  
  1926  					tlog.Logf("starting dsort: %d/%d, %s\n", df.shardCnt, df.filesPerShard, df.inputExt)
  1927  					df.start()
  1928  
  1929  					_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  1930  					tassert.CheckFatal(t, err)
  1931  					tlog.Logf("%s: finished\n", df.job())
  1932  
  1933  					df.checkReactionResult(reaction, df.recordDuplicationsCnt)
  1934  				},
  1935  			)
  1936  		})
  1937  	}
  1938  }
  1939  
  1940  func TestDsortOrderFile(t *testing.T) {
  1941  	runDsortTest(
  1942  		t, dsortTestSpec{p: true, types: dsorterTypes},
  1943  		func(dsorterType string, t *testing.T) {
  1944  			var (
  1945  				err error
  1946  				m   = &ioContext{
  1947  					t: t,
  1948  				}
  1949  				df = &dsortFramework{
  1950  					m:           m,
  1951  					dsorterType: dsorterType,
  1952  					outputBck: cmn.Bck{
  1953  						Name:     trand.String(15),
  1954  						Provider: apc.AIS,
  1955  					},
  1956  					shardCnt:      100,
  1957  					filesPerShard: 10,
  1958  				}
  1959  
  1960  				orderFileName = "orderFileName"
  1961  				ekm           = make(map[string]string, 10)
  1962  				shardFmts     = []string{
  1963  					"shard-%d-suf",
  1964  					"input-%d-pref",
  1965  					"smth-%d",
  1966  				}
  1967  				proxyURL   = tools.RandomProxyURL()
  1968  				baseParams = tools.BaseAPIParams(proxyURL)
  1969  			)
  1970  
  1971  			m.initAndSaveState(true /*cleanup*/)
  1972  			m.expectTargets(3)
  1973  
  1974  			// Set URL for order file (points to the object in cluster).
  1975  			df.orderFileURL = fmt.Sprintf(
  1976  				"%s/%s/%s/%s/%s?%s=%s",
  1977  				proxyURL, apc.Version, apc.Objects, m.bck.Name, orderFileName,
  1978  				apc.QparamProvider, apc.AIS,
  1979  			)
  1980  
  1981  			df.init()
  1982  
  1983  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  1984  
  1985  			// Create local output bucket
  1986  			tools.CreateBucket(t, m.proxyURL, df.outputBck, nil, true /*cleanup*/)
  1987  
  1988  			df.createInputShards()
  1989  
  1990  			// Generate content for the orderFile
  1991  			tlog.Logln("generating and putting order file into cluster...")
  1992  			var (
  1993  				buffer       bytes.Buffer
  1994  				shardRecords = df.getRecordNames(m.bck)
  1995  			)
  1996  			for _, shard := range shardRecords {
  1997  				for idx, recordName := range shard.recordNames {
  1998  					buffer.WriteString(fmt.Sprintf("%s\t%s\n", recordName, shardFmts[idx%len(shardFmts)]))
  1999  					ekm[recordName] = shardFmts[idx%len(shardFmts)]
  2000  				}
  2001  			}
  2002  			args := api.PutArgs{
  2003  				BaseParams: baseParams,
  2004  				Bck:        m.bck,
  2005  				ObjName:    orderFileName,
  2006  				Reader:     readers.NewBytes(buffer.Bytes()),
  2007  			}
  2008  			_, err = api.PutObject(&args)
  2009  			tassert.CheckFatal(t, err)
  2010  
  2011  			tlog.Logln(startingDS)
  2012  			spec := df.gen()
  2013  			managerUUID, err := api.StartDsort(baseParams, &spec)
  2014  			tassert.CheckFatal(t, err)
  2015  
  2016  			_, err = tools.WaitForDsortToFinish(m.proxyURL, managerUUID)
  2017  			tassert.CheckFatal(t, err)
  2018  			tlog.Logf("%s: finished\n", df.job())
  2019  
  2020  			allMetrics, err := api.MetricsDsort(baseParams, managerUUID)
  2021  			tassert.CheckFatal(t, err)
  2022  			if len(allMetrics) != m.originalTargetCount {
  2023  				t.Errorf("number of metrics %d is not same as number of targets %d", len(allMetrics), m.originalTargetCount)
  2024  			}
  2025  
  2026  			tlog.Logln("checking if all records are in specified shards...")
  2027  			shardRecords = df.getRecordNames(df.outputBck)
  2028  			for _, shard := range shardRecords {
  2029  				for _, recordName := range shard.recordNames {
  2030  					match := false
  2031  					// Some shard with specified format contains the record
  2032  					for i := range 30 {
  2033  						match = match || fmt.Sprintf(ekm[recordName], i) == shard.name
  2034  					}
  2035  					if !match {
  2036  						t.Errorf("record %q was not part of any shard with format %q but was in shard %q",
  2037  							recordName, ekm[recordName], shard.name)
  2038  					}
  2039  				}
  2040  			}
  2041  		},
  2042  	)
  2043  }
  2044  
  2045  func TestDsortOrderJSONFile(t *testing.T) {
  2046  	runDsortTest(
  2047  		t, dsortTestSpec{p: true, types: dsorterTypes},
  2048  		func(dsorterType string, t *testing.T) {
  2049  			var (
  2050  				err error
  2051  				m   = &ioContext{
  2052  					t: t,
  2053  				}
  2054  				df = &dsortFramework{
  2055  					m:           m,
  2056  					dsorterType: dsorterType,
  2057  					outputBck: cmn.Bck{
  2058  						Name:     trand.String(15),
  2059  						Provider: apc.AIS,
  2060  					},
  2061  					shardCnt:      100,
  2062  					filesPerShard: 10,
  2063  				}
  2064  
  2065  				orderFileName = "order_file_name.json"
  2066  				ekm           = make(map[string]string, 10)
  2067  				shardFmts     = []string{
  2068  					"shard-%d-suf",
  2069  					"input-%d-pref",
  2070  					"smth-%d",
  2071  				}
  2072  				proxyURL   = tools.RandomProxyURL()
  2073  				baseParams = tools.BaseAPIParams(proxyURL)
  2074  			)
  2075  
  2076  			m.initAndSaveState(true /*cleanup*/)
  2077  			m.expectTargets(3)
  2078  
  2079  			// Set URL for order file (points to the object in cluster).
  2080  			df.orderFileURL = fmt.Sprintf(
  2081  				"%s/%s/%s/%s/%s?%s=%s",
  2082  				proxyURL, apc.Version, apc.Objects, m.bck.Name, orderFileName,
  2083  				apc.QparamProvider, apc.AIS,
  2084  			)
  2085  
  2086  			df.init()
  2087  
  2088  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2089  
  2090  			// Create local output bucket
  2091  			tools.CreateBucket(t, m.proxyURL, df.outputBck, nil, true /*cleanup*/)
  2092  
  2093  			df.createInputShards()
  2094  
  2095  			// Generate content for the orderFile
  2096  			tlog.Logln("generating and putting order file into cluster...")
  2097  			var (
  2098  				content      = make(map[string][]string, 10)
  2099  				shardRecords = df.getRecordNames(m.bck)
  2100  			)
  2101  			for _, shard := range shardRecords {
  2102  				for idx, recordName := range shard.recordNames {
  2103  					shardFmt := shardFmts[idx%len(shardFmts)]
  2104  					content[shardFmt] = append(content[shardFmt], recordName)
  2105  					ekm[recordName] = shardFmts[idx%len(shardFmts)]
  2106  				}
  2107  			}
  2108  			jsonBytes, err := jsoniter.Marshal(content)
  2109  			tassert.CheckFatal(t, err)
  2110  			args := api.PutArgs{
  2111  				BaseParams: baseParams,
  2112  				Bck:        m.bck,
  2113  				ObjName:    orderFileName,
  2114  				Reader:     readers.NewBytes(jsonBytes),
  2115  			}
  2116  			_, err = api.PutObject(&args)
  2117  			tassert.CheckFatal(t, err)
  2118  
  2119  			tlog.Logln(startingDS)
  2120  			spec := df.gen()
  2121  			managerUUID, err := api.StartDsort(baseParams, &spec)
  2122  			tassert.CheckFatal(t, err)
  2123  
  2124  			_, err = tools.WaitForDsortToFinish(m.proxyURL, managerUUID)
  2125  			tassert.CheckFatal(t, err)
  2126  			tlog.Logf("%s: finished\n", df.job())
  2127  
  2128  			allMetrics, err := api.MetricsDsort(baseParams, managerUUID)
  2129  			tassert.CheckFatal(t, err)
  2130  			if len(allMetrics) != m.originalTargetCount {
  2131  				t.Errorf("number of metrics %d is not same as number of targets %d",
  2132  					len(allMetrics), m.originalTargetCount)
  2133  			}
  2134  
  2135  			tlog.Logln("checking if all records are in specified shards...")
  2136  			shardRecords = df.getRecordNames(df.outputBck)
  2137  			for _, shard := range shardRecords {
  2138  				for _, recordName := range shard.recordNames {
  2139  					match := false
  2140  					// Some shard with specified format contains the record
  2141  					for i := range 30 {
  2142  						match = match || fmt.Sprintf(ekm[recordName], i) == shard.name
  2143  					}
  2144  					if !match {
  2145  						t.Errorf("record %q was not part of any shard with format %q but was in shard %q",
  2146  							recordName, ekm[recordName], shard.name)
  2147  					}
  2148  				}
  2149  			}
  2150  		},
  2151  	)
  2152  }
  2153  
  2154  func TestDsortDryRun(t *testing.T) {
  2155  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  2156  
  2157  	runDsortTest(
  2158  		t, dsortTestSpec{p: true, types: dsorterTypes},
  2159  		func(dsorterType string, t *testing.T) {
  2160  			var (
  2161  				m = &ioContext{
  2162  					t: t,
  2163  				}
  2164  				df = &dsortFramework{
  2165  					m:             m,
  2166  					dsorterType:   dsorterType,
  2167  					shardCnt:      500,
  2168  					filesPerShard: 100,
  2169  					dryRun:        true,
  2170  				}
  2171  			)
  2172  
  2173  			m.initAndSaveState(true /*cleanup*/)
  2174  			m.expectTargets(3)
  2175  
  2176  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2177  
  2178  			df.init()
  2179  			df.createInputShards()
  2180  
  2181  			tlog.Logln(startingDS)
  2182  			df.start()
  2183  
  2184  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  2185  			tassert.CheckFatal(t, err)
  2186  			tlog.Logf("%s: finished\n", df.job())
  2187  
  2188  			df.checkMetrics(false /* expectAbort */)
  2189  		},
  2190  	)
  2191  }
  2192  
  2193  func TestDsortDryRunDisk(t *testing.T) {
  2194  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  2195  
  2196  	runDsortTest(
  2197  		t, dsortTestSpec{p: true, types: dsorterTypes},
  2198  		func(dsorterType string, t *testing.T) {
  2199  			var (
  2200  				m = &ioContext{
  2201  					t: t,
  2202  				}
  2203  				df = &dsortFramework{
  2204  					m:             m,
  2205  					dsorterType:   dsorterType,
  2206  					shardCnt:      500,
  2207  					filesPerShard: 100,
  2208  					dryRun:        true,
  2209  				}
  2210  			)
  2211  
  2212  			m.initAndSaveState(true /*cleanup*/)
  2213  			m.expectTargets(3)
  2214  
  2215  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2216  
  2217  			df.init()
  2218  			df.createInputShards()
  2219  
  2220  			tlog.Logln(startingDS)
  2221  			df.start()
  2222  
  2223  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  2224  			tassert.CheckFatal(t, err)
  2225  			tlog.Logf("%s: finished\n", df.job())
  2226  
  2227  			df.checkMetrics(false /* expectAbort */)
  2228  		},
  2229  	)
  2230  }
  2231  
  2232  func TestDsortLongerExt(t *testing.T) {
  2233  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  2234  
  2235  	runDsortTest(
  2236  		t, dsortTestSpec{p: true, types: dsorterTypes, algs: dsortAlgorithms},
  2237  		func(dsorterType, alg string, t *testing.T) {
  2238  			var (
  2239  				m = &ioContext{
  2240  					t: t,
  2241  				}
  2242  				df = &dsortFramework{
  2243  					m:             m,
  2244  					dsorterType:   dsorterType,
  2245  					outputTempl:   "output-%05d",
  2246  					shardCnt:      200,
  2247  					filesPerShard: 10,
  2248  					maxMemUsage:   "99%",
  2249  					alg:           &dsort.Algorithm{Kind: alg},
  2250  					recordExts:    []string{".txt", ".json.info", ".info", ".json"},
  2251  				}
  2252  			)
  2253  
  2254  			m.initAndSaveState(true /*cleanup*/)
  2255  			m.expectTargets(3)
  2256  
  2257  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2258  
  2259  			df.init()
  2260  			df.createInputShards()
  2261  
  2262  			tlog.Logln(startingDS)
  2263  			df.start()
  2264  
  2265  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  2266  			tassert.CheckFatal(t, err)
  2267  			tlog.Logf("%s: finished\n", df.job())
  2268  
  2269  			df.checkMetrics(false /*expectAbort*/)
  2270  			df.checkOutputShards(5)
  2271  		},
  2272  	)
  2273  }
  2274  
  2275  func TestDsortAutomaticallyCalculateOutputShards(t *testing.T) {
  2276  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  2277  
  2278  	runDsortTest(
  2279  		t, dsortTestSpec{p: true, types: dsorterTypes},
  2280  		func(dsorterType string, t *testing.T) {
  2281  			var (
  2282  				m = &ioContext{
  2283  					t: t,
  2284  				}
  2285  				df = &dsortFramework{
  2286  					m:               m,
  2287  					dsorterType:     dsorterType,
  2288  					shardCnt:        200,
  2289  					filesPerShard:   10,
  2290  					maxMemUsage:     "99%",
  2291  					outputShardSize: "-1",
  2292  					outputTempl:     "output-{0..10}",
  2293  				}
  2294  			)
  2295  
  2296  			m.initAndSaveState(true /*cleanup*/)
  2297  			m.expectTargets(3)
  2298  
  2299  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2300  
  2301  			df.init()
  2302  			df.createInputShards()
  2303  
  2304  			tlog.Logln(startingDS)
  2305  			df.start()
  2306  
  2307  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  2308  			tassert.CheckFatal(t, err)
  2309  			tlog.Logf("%s: finished\n", df.job())
  2310  
  2311  			df.checkMetrics(false /*expectAbort*/)
  2312  			df.checkOutputShards(0)
  2313  		},
  2314  	)
  2315  }
  2316  
  2317  func TestDsortWithTarFormats(t *testing.T) {
  2318  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  2319  
  2320  	runDsortTest(
  2321  		// Include empty ("") type - in this case type must be selected automatically.
  2322  		t, dsortTestSpec{p: true, types: append(dsorterTypes, ""),
  2323  			tarFormats: []tar.Format{tar.FormatUnknown, tar.FormatGNU, tar.FormatPAX}},
  2324  		func(dsorterType string, tarFormat tar.Format, t *testing.T) {
  2325  			var (
  2326  				m = &ioContext{
  2327  					t: t,
  2328  				}
  2329  				df = &dsortFramework{
  2330  					m:             m,
  2331  					dsorterType:   dsorterType,
  2332  					shardCnt:      500,
  2333  					filesPerShard: 100,
  2334  					maxMemUsage:   "1B",
  2335  					tarFormat:     tarFormat,
  2336  					recordExts:    []string{".txt"},
  2337  				}
  2338  			)
  2339  
  2340  			// Initialize ioContext
  2341  			m.initAndSaveState(true /*cleanup*/)
  2342  			m.expectTargets(1)
  2343  
  2344  			// Create ais bucket
  2345  			tools.CreateBucket(t, m.proxyURL, m.bck, nil, true /*cleanup*/)
  2346  
  2347  			df.init()
  2348  			df.createInputShards()
  2349  
  2350  			tlog.Logln(startingDS)
  2351  			df.start()
  2352  
  2353  			_, err := tools.WaitForDsortToFinish(m.proxyURL, df.managerUUID)
  2354  			tassert.CheckFatal(t, err)
  2355  			tlog.Logf("%s: finished\n", df.job())
  2356  
  2357  			df.checkMetrics(false /* expectAbort */)
  2358  			df.checkOutputShards(5)
  2359  		},
  2360  	)
  2361  }