github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/dsort.go

// Package dsort provides distributed massively parallel resharding for very large datasets.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package dsort

import (
	"bufio"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"net/http"
	"net/url"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/archive"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
	"github.com/NVIDIA/aistore/core"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/ext/dsort/shard"
	"github.com/NVIDIA/aistore/fs"
	"github.com/NVIDIA/aistore/stats"
	"github.com/NVIDIA/aistore/transport"
	"github.com/OneOfOne/xxhash"
	jsoniter "github.com/json-iterator/go"
	"github.com/pkg/errors"
	"github.com/tinylib/msgp/msgp"
	"golang.org/x/sync/errgroup"
)

const PrefixJobID = "srt-"

type (
	dsorter interface {
		shard.ContentLoader

		name() string
		init() error
		start() error
		postExtraction()
		postRecordDistribution()
		createShardsLocally() (err error)
		preShardCreation(shardName string, mi *fs.Mountpath) error
		postShardCreation(mi *fs.Mountpath)
		cleanup()
		finalCleanup() error
		preShardExtraction(expectedUncompressedSize uint64) (toDisk bool)
		postShardExtraction(expectedUncompressedSize uint64)
		onAbort()
	}
)

var js = jsoniter.ConfigFastest

func (m *Manager) finish() {
	if cmn.Rom.FastV(4, cos.SmoduleDsort) {
		nlog.Infof("%s: %s finished", core.T, m.ManagerUUID)
	}
	m.lock()
	m.setInProgressTo(false)
	m.unlock()

	// Trigger decrementing the reference counter. If it is already 0, this will
	// trigger cleanup because progress is set to false. Otherwise, cleanup will
	// be triggered by decrementRef in the load-content handlers.
	m.decrementRef(0)
}

func (m *Manager) start() (err error) {
	defer m.finish()

	if err := m.startDsorter(); err != nil {
		return err
	}

	// Phase 1.
	nlog.Infof("%s: %s started extraction stage", core.T, m.ManagerUUID)
	if err := m.extractLocalShards(); err != nil {
		return err
	}

	s := binary.BigEndian.Uint64(m.Pars.TargetOrderSalt)
	targetOrder := _torder(s, m.smap.Tmap)
	if cmn.Rom.FastV(4, cos.SmoduleDsort) {
		nlog.Infof("%s: %s final target in targetOrder => URL: %s, tid %s", core.T, m.ManagerUUID,
			targetOrder[len(targetOrder)-1].PubNet.URL, targetOrder[len(targetOrder)-1].ID())
	}

	// Phase 2.
	nlog.Infof("%s: %s started sort stage", core.T, m.ManagerUUID)
	curTargetIsFinal, err := m.participateInRecordDistribution(targetOrder)
	if err != nil {
		return err
	}

	// Phase 3 - run only by the final target.
	if curTargetIsFinal {
		// assuming uniform distribution, estimate the avg. output shard size
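		// (compressionRatio is maintained from sizes observed during extraction -
		// roughly compressed vs extracted bytes; shardSize below is therefore the
		// estimated amount of extracted data to pack per output shard so that the
		// resulting, possibly compressed, shard ends up close to Pars.OutputShardSize;
		// for a non-compressed input the ratio is 1 and the two sizes coincide)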
		ratio := m.compressionRatio()
		if cmn.Rom.FastV(4, cos.SmoduleDsort) {
			nlog.Infof("%s [dsort] %s phase3: ratio=%f", core.T, m.ManagerUUID, ratio)
		}
		debug.Assertf(shard.IsCompressed(m.Pars.InputExtension) || ratio == 1, "tar ratio=%f, ext=%q",
			ratio, m.Pars.InputExtension)

		shardSize := int64(float64(m.Pars.OutputShardSize) / ratio)
		nlog.Infof("%s: [dsort] %s started phase 3: ratio=%f, shard size (%d, %d)",
			core.T, m.ManagerUUID, ratio, shardSize, m.Pars.OutputShardSize)
		if err := m.phase3(shardSize); err != nil {
			nlog.Errorf("%s: [dsort] %s phase3 err: %v", core.T, m.ManagerUUID, err)
			return err
		}
	}

	// Wait for the signal to start shard creation. This happens when the manager
	// notices that the specification for the shards to be created locally has been received.
	select {
	case <-m.startShardCreation:
		break
	case <-m.listenAborted():
		return m.newErrAborted()
	}

	// After each target participates in the cluster-wide record distribution,
	// start listening for the signal to start creating shards locally.
	nlog.Infof("%s: %s started creation stage", core.T, m.ManagerUUID)
	if err := m.dsorter.createShardsLocally(); err != nil {
		return err
	}

	nlog.Infof("%s: %s finished successfully", core.T, m.ManagerUUID)
	return nil
}

// returns a slice of targets in a pseudorandom order
func _torder(salt uint64, tmap meta.NodeMap) []*meta.Snode {
	var (
		targets = make(map[uint64]*meta.Snode, len(tmap))
		keys    = make([]uint64, 0, len(tmap))
	)
	for i, d := range tmap {
		if d.InMaintOrDecomm() {
			continue
		}
		c := xxhash.Checksum64S(cos.UnsafeB(i), salt)
		targets[c] = d
		keys = append(keys, c)
	}
	sort.Slice(keys, func(i, j int) bool { return keys[i] < keys[j] })

	t := make(meta.Nodes, len(keys))
	for i, k := range keys {
		t[i] = targets[k]
	}
	return t
}

func (m *Manager) startDsorter() error {
	defer m.markStarted()
	if err := m.initStreams(); err != nil {
		return err
	}
	nlog.Infof("%s: %s starting with dsorter: %q", core.T, m.ManagerUUID, m.dsorter.name())
	return m.dsorter.start()
}

func (m *Manager) extractLocalShards() (err error) {
	m.extractionPhase.adjuster.start()
	m.Metrics.Extraction.begin()

	// compare with xact/xs/multiobj.go
	group, ctx := errgroup.WithContext(context.Background())
	switch {
	case m.Pars.Pit.isRange():
		err = m.iterRange(ctx, group)
	case m.Pars.Pit.isList():
		err = m.iterList(ctx, group)
	default:
		debug.Assert(m.Pars.Pit.isPrefix())
		debug.Assert(false, "not implemented yet") // TODO -- FIXME
	}

	m.dsorter.postExtraction()
	m.Metrics.Extraction.finish()
	m.extractionPhase.adjuster.stop()
	if err == nil {
		m.incrementRef(int64(m.recm.Records.TotalObjectCount()))
	}
	return
}

func (m *Manager) iterRange(ctx context.Context, group *errgroup.Group) error {
	var (
		metrics = m.Metrics.Extraction
		pt      = m.Pars.Pit.Template
	)
	metrics.mu.Lock()
	metrics.TotalCnt = pt.Count()
	metrics.mu.Unlock()
	pt.InitIter()
outer:
	for name, hasNext := pt.Next(); hasNext; name, hasNext = pt.Next() {
		select {
		case <-m.listenAborted():
			group.Wait()
			return m.newErrAborted()
		case <-ctx.Done():
			break outer // context canceled: we have an error
		default:
		}

		m.extractionPhase.adjuster.acquireGoroutineSema()
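		// each matching shard is extracted in its own goroutine; concurrency is
		// bounded by the adjuster's goroutine semaphore (acquired above, released
		// in extractShard.do)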
		es := &extractShard{m, metrics, name, true /*is-range*/}
		group.Go(es.do)
	}
	return group.Wait()
}

func (m *Manager) iterList(ctx context.Context, group *errgroup.Group) error {
	metrics := m.Metrics.Extraction
	metrics.mu.Lock()
	metrics.TotalCnt = int64(len(m.Pars.Pit.ObjNames))
	metrics.mu.Unlock()
outer:
	for _, name := range m.Pars.Pit.ObjNames {
		select {
		case <-m.listenAborted():
			group.Wait()
			return m.newErrAborted()
		case <-ctx.Done():
			break outer // context canceled: we have an error
		default:
		}

		m.extractionPhase.adjuster.acquireGoroutineSema()
		es := &extractShard{m, metrics, name, false /*is-range*/}
		group.Go(es.do)
	}
	return group.Wait()
}

func (m *Manager) createShard(s *shard.Shard, lom *core.LOM) (err error) {
	var (
		metrics   = m.Metrics.Creation
		shardName = s.Name
		errCh     = make(chan error, 2)
	)
	if err = lom.InitBck(&m.Pars.OutputBck); err != nil {
		return
	}
	lom.SetAtimeUnix(time.Now().UnixNano())

	if m.aborted() {
		return m.newErrAborted()
	}

	if err := m.dsorter.preShardCreation(s.Name, lom.Mountpath()); err != nil {
		return err
	}
	defer m.dsorter.postShardCreation(lom.Mountpath())

	cs := fs.Cap()
	if err = cs.Err(); err != nil {
		m.abort(err)
		return
	}

	beforeCreation := time.Now()

	var (
		wg   = &sync.WaitGroup{}
		r, w = io.Pipe()
	)
	wg.Add(1)
	go func() {
		var err error
		if !m.Pars.DryRun {
			params := core.AllocPutParams()
			{
				params.WorkTag = "dsort"
				params.Cksum = nil
				params.Atime = beforeCreation

				// NOTE: cannot have `PutObject` closing the original reader
				// on error as it'll cause writer (below) to panic
				params.Reader = io.NopCloser(r)

				// TODO: params.Xact - in part, to count PUTs and bytes in a generic fashion
				// (vs metrics.ShardCreationStats.updateThroughput - see below)

				// TODO: add params.Size = (size resulting from shardRW.Create below)
			}
			err = core.T.PutObject(lom, params)
			core.FreePutParams(params)
		} else {
			_, err = io.Copy(io.Discard, r)
		}
		errCh <- err
		wg.Done()
	}()

	// may reshard into a different format
	shardRW := m.shardRW
	//
	// TODO -- FIXME: compare with extractShard._do()
	//
	if !m.Pars.DryRun && m.Pars.OutputExtension != m.Pars.InputExtension {
		debug.Assert(m.Pars.OutputExtension != "")
		shardRW = shard.RWs[m.Pars.OutputExtension]
		debug.Assert(shardRW != nil, m.Pars.OutputExtension)
	}

	_, err = shardRW.Create(s, w, m.dsorter)
	w.CloseWithError(err)
	if err != nil {
		r.CloseWithError(err)
		return err
	}

	select {
	case err = <-errCh:
		if err != nil {
			r.CloseWithError(err)
			w.CloseWithError(err)
		}
	case <-m.listenAborted():
		err = m.newErrAborted()
		r.CloseWithError(err)
		w.CloseWithError(err)
	}

	wg.Wait()
	close(errCh)

	if err != nil {
		return err
	}

	si, err := m.smap.HrwHash2T(lom.Digest())
	if err != nil {
		return err
	}

	// If the newly created shard belongs on a different target
	// according to HRW, send it there. Since it doesn't really matter
	// if we have an extra copy of the object local to this target, we
	// optimize for performance by not removing the object now.
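	// (placement is decided by HRW on the shard's name digest - see HrwHash2T above -
	// so the target that created the shard is not necessarily the one that owns it)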
	if si.ID() != core.T.SID() && !m.Pars.DryRun {
		lom.Lock(false)
		defer lom.Unlock(false)

		// Need to make sure that the object is still there.
		if err := lom.Load(false /*cache it*/, true /*locked*/); err != nil {
			return err
		}

		if lom.SizeBytes() <= 0 {
			goto exit
		}

		file, err := cos.NewFileHandle(lom.FQN)
		if err != nil {
			return err
		}

		o := transport.AllocSend()
		o.Hdr = transport.ObjHdr{
			ObjName:  shardName,
			ObjAttrs: cmn.ObjAttrs{Size: lom.SizeBytes(), Cksum: lom.Checksum()},
		}
		o.Hdr.Bck.Copy(lom.Bucket())

		// Make send synchronous.
		streamWg := &sync.WaitGroup{}
		errCh := make(chan error, 1)
		o.Callback = func(_ *transport.ObjHdr, _ io.ReadCloser, _ any, err error) {
			errCh <- err
			streamWg.Done()
		}
		streamWg.Add(1)
		err = m.streams.shards.Send(o, file, si)
		if err != nil {
			return err
		}
		streamWg.Wait()
		if err := <-errCh; err != nil {
			return err
		}
	}

exit:
	metrics.mu.Lock()
	metrics.CreatedCnt++
	if si.ID() != core.T.SID() {
		metrics.MovedShardCnt++
	}
	metrics.mu.Unlock()

	return nil
}

// participateInRecordDistribution coordinates the distributed merging and
// sorting of each target's SortedRecords based on the order defined by
// targetOrder. It returns a bool, currentTargetIsFinal, which is true iff the
// current target is the final target in targetOrder which, by construction of
// the algorithm, should contain the final, complete, sorted slice of Record
// structs.
//
// The algorithm uses the following premise: for a target T at index i in
// targetOrder, if i is even, then T will send its FileMeta slice to the target
// at index i+1 in targetOrder. If i is odd, then it will do a blocking receive
// on the FileMeta slice from the target at index i-1 in targetOrder, and will
// remove all even-indexed targets in targetOrder after receiving. This pattern
// repeats until len(targetOrder) == 1, in which case the single target in the
// slice is the final target with the final, complete, sorted slice of Record
// structs.
func (m *Manager) participateInRecordDistribution(targetOrder meta.Nodes) (currentTargetIsFinal bool, err error) {
	var (
		i           int
		d           *meta.Snode
		dummyTarget *meta.Snode // dummy target is represented as nil value
	)

	// Metrics
	metrics := m.Metrics.Sorting
	metrics.begin()
	defer metrics.finish()

	expectedReceived := int32(1)
	for len(targetOrder) > 1 {
		if len(targetOrder)%2 == 1 {
			// For simplicity, we always work with an even-length slice of targets. If len(targetOrder) is odd,
			// we put a "dummy target" into the slice at index len(targetOrder)-2 which simulates sending its
			// metadata to the next target in targetOrder (which is actually itself).
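			//
			// Illustration (hypothetical 5-target order [T0 T1 T2 T3 T4]):
			//   round 1: [T0 T1 T2 T3 dummy T4] - T0=>T1, T2=>T3, dummy=>T4; [T1 T3 T4] remain
			//   round 2: [T1 T3 dummy T4]       - T1=>T3, dummy=>T4;         [T3 T4] remain
			//   round 3: [T3 T4]                - T3=>T4;                    [T4] remains
			// i.e., the last target in the original targetOrder ends up with the fully
			// merged (and subsequently sorted) records.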
			targetOrder = append(
				targetOrder[:len(targetOrder)-1],
				dummyTarget,
				targetOrder[len(targetOrder)-1],
			)
		}

		for i, d = range targetOrder {
			if d != dummyTarget && d.ID() == core.T.SID() {
				break
			}
		}

		if i%2 == 0 {
			m.dsorter.postRecordDistribution()

			var (
				beforeSend = time.Now()
				group      = &errgroup.Group{}
				r, w       = io.Pipe()
			)
			group.Go(func() error {
				var (
					buf, slab = g.mm.AllocSize(serializationBufSize)
					msgpw     = msgp.NewWriterBuf(w, buf)
				)
				defer slab.Free(buf)

				if err := m.recm.Records.EncodeMsg(msgpw); err != nil {
					w.CloseWithError(err)
					return errors.Errorf("failed to marshal msgp: %v", err)
				}
				err := msgpw.Flush()
				w.CloseWithError(err)
				if err != nil {
					return errors.Errorf("failed to flush msgp: %v", err)
				}
				return nil
			})
			group.Go(func() error {
				var (
					query  = url.Values{}
					sendTo = targetOrder[i+1]
				)
				query.Add(apc.QparamTotalCompressedSize, strconv.FormatInt(m.totalShardSize(), 10))
				query.Add(apc.QparamTotalUncompressedSize, strconv.FormatInt(m.totalExtractedSize(), 10))
				query.Add(apc.QparamTotalInputShardsExtracted, strconv.Itoa(m.recm.Records.Len()))
				reqArgs := &cmn.HreqArgs{
					Method: http.MethodPost,
					Base:   sendTo.URL(cmn.NetIntraData),
					Path:   apc.URLPathdSortRecords.Join(m.ManagerUUID),
					Query:  query,
					BodyR:  r,
				}
				err := m._do(reqArgs, sendTo, "send sorted records")
				r.CloseWithError(err)
				return err
			})
			if err := group.Wait(); err != nil {
				return false, err
			}

			m.recm.Records.Drain() // we do not need it anymore

			metrics.mu.Lock()
			metrics.SentStats.updateTime(time.Since(beforeSend))
			metrics.mu.Unlock()
			return
		}

		beforeRecv := time.Now()

		// i%2 == 1
		receiveFrom := targetOrder[i-1]
		if receiveFrom == dummyTarget { // dummy target
			m.incrementReceived()
		}

		for m.received.count.Load() < expectedReceived {
			select {
			case <-m.listenReceived():
			case <-m.listenAborted():
				err = m.newErrAborted()
				return
			}
		}
		expectedReceived++

		metrics.mu.Lock()
		metrics.RecvStats.updateTime(time.Since(beforeRecv))
		metrics.mu.Unlock()

		t := targetOrder[:0]
		for i, d = range targetOrder {
			if i%2 == 1 {
				t = append(t, d)
			}
		}
		targetOrder = t

		m.recm.MergeEnqueuedRecords()
	}

	err = sortRecords(m.recm.Records, m.Pars.Algorithm)
	m.dsorter.postRecordDistribution()
	return true, err
}

func (m *Manager) generateShardsWithTemplate(maxSize int64) ([]*shard.Shard, error) {
	var (
		start           int
		curShardSize    int64
		n               = m.recm.Records.Len()
		pt              = m.Pars.Pot.Template
		shardCount      = pt.Count()
		shards          = make([]*shard.Shard, 0)
		numLocalRecords = make(map[string]int, m.smap.CountActiveTs())
	)
	pt.InitIter()

	if maxSize <= 0 {
		// Heuristic: shard size when maxSize not specified.
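		// e.g. (illustrative): with ~10 GiB extracted in total and an output template
		// that expands to 100 shard names, maxSize defaults to roughly 102 MiB.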
		maxSize = int64(math.Ceil(float64(m.totalExtractedSize()) / float64(shardCount)))
	}

	for i, r := range m.recm.Records.All() {
		numLocalRecords[r.DaemonID]++
		curShardSize += r.TotalSize()
		if curShardSize < maxSize && i < n-1 {
			continue
		}

		name, hasNext := pt.Next()
		if !hasNext {
			// no more shard names are available
			return nil, errors.Errorf("number of shards to be created exceeds expected number of shards (%d)", shardCount)
		}
		shard := &shard.Shard{
			Name: name,
		}
		ext, err := archive.Mime("", name)
		if err == nil {
			debug.Assert(m.Pars.OutputExtension == ext)
		} else {
			shard.Name = name + m.Pars.OutputExtension
		}

		shard.Size = curShardSize
		shard.Records = m.recm.Records.Slice(start, i+1)
		shards = append(shards, shard)

		start = i + 1
		curShardSize = 0
		for k := range numLocalRecords {
			numLocalRecords[k] = 0
		}
	}

	return shards, nil
}

func (m *Manager) generateShardsWithOrderingFile(maxSize int64) ([]*shard.Shard, error) {
	var (
		shards         = make([]*shard.Shard, 0)
		externalKeyMap = make(map[string]string)
		shardsBuilder  = make(map[string][]*shard.Shard)
	)
	if maxSize <= 0 {
		return nil, fmt.Errorf(fmtErrInvalidMaxSize, maxSize)
	}
	parsedURL, err := url.Parse(m.Pars.OrderFileURL)
	if err != nil {
		return nil, fmt.Errorf(fmtErrOrderURL, m.Pars.OrderFileURL, err)
	}

	req, err := http.NewRequest(http.MethodGet, m.Pars.OrderFileURL, http.NoBody)
	if err != nil {
		return nil, err
	}
	// mark as intra-cluster call
	tsi := core.T.Snode()
	req.Header.Set(apc.HdrCallerID, tsi.ID())
	req.Header.Set(apc.HdrCallerName, tsi.String())

	resp, err := m.client.Do(req) //nolint:bodyclose // closed by cos.Close below
	if err != nil {
		return nil, err
	}
	defer cos.Close(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf(
			"unexpected status code (%d) when requesting order file from %q",
			resp.StatusCode, m.Pars.OrderFileURL,
		)
	}

	// TODO: handle very large files (> GB) - if the file is very big, we need to
	// save it to disk and operate on it directly rather than keeping everything
	// in memory.
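	// The order file is either JSON - a map of shard-name format to record keys,
	// e.g. {"shard-%d.tar": ["a.jpg", "b.jpg"]} (names here purely illustrative) -
	// or plain text with one `<recordKey><sep><shardNameFmt>` pair per line,
	// where <sep> is Pars.OrderFileSep.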

	switch filepath.Ext(parsedURL.Path) {
	case ".json":
		var ekm map[string][]string
		if err := jsoniter.NewDecoder(resp.Body).Decode(&ekm); err != nil {
			return nil, err
		}

		for shardNameFmt, recordKeys := range ekm {
			for _, recordKey := range recordKeys {
				externalKeyMap[recordKey] = shardNameFmt
			}
		}
	default:
		lineReader := bufio.NewReader(resp.Body)
		for idx := 0; ; idx++ {
			l, _, err := lineReader.ReadLine()
			if err == io.EOF {
				break
			}
			if err != nil {
				return nil, err
			}

			line := strings.TrimSpace(string(l))
			if line == "" {
				continue
			}

			parts := strings.Split(line, m.Pars.OrderFileSep)
			if len(parts) != 2 {
				msg := fmt.Sprintf("malformed line (%d) in external key map: %s", idx, line)
				if err := m.react(m.Pars.EKMMalformedLine, msg); err != nil {
					return nil, err
				}
			}

			recordKey, shardNameFmt := parts[0], parts[1]
			externalKeyMap[recordKey] = shardNameFmt
		}
	}

	for _, r := range m.recm.Records.All() {
		key := fmt.Sprintf("%v", r.Key)
		shardNameFmt, ok := externalKeyMap[key]
		if !ok {
			msg := fmt.Sprintf("record %q doesn't belong in external key map", key)
			if err := m.react(m.Pars.EKMMissingKey, msg); err != nil {
				return nil, err
			}
		}

		shards := shardsBuilder[shardNameFmt]
		recordSize := r.TotalSize() + m.shardRW.MetadataSize()*int64(len(r.Objects))
		shardCount := len(shards)
		if shardCount == 0 || shards[shardCount-1].Size > maxSize {
			shard := &shard.Shard{
				Name:    fmt.Sprintf(shardNameFmt, shardCount),
				Size:    recordSize,
				Records: shard.NewRecords(1),
			}
			shard.Records.Insert(r)
			shardsBuilder[shardNameFmt] = append(shardsBuilder[shardNameFmt], shard)
		} else {
			// Append records
			lastShard := shards[shardCount-1]
			lastShard.Size += recordSize
			lastShard.Records.Insert(r)
		}
	}

	for _, s := range shardsBuilder {
		shards = append(shards, s...)
	}

	return shards, nil
}

// Create output shards (each of approximately `maxSize` bytes) in the order defined
// by dsortManager.Records. Each output shard structure is "distributed" (via m._dist below)
// to one of the targets - to create the corresponding output shard.
// The logic to map output shard => target:
//  1. By HRW (not using compression)
//  2. By locality (using compression),
//     using two maps:
//     i) shardsToTarget - tracks the total number of shard creation requests sent to each target URL
//     ii) numLocalRecords - tracks the number of records in the current shardMeta each target has locally
//
// The target is determined first by locality (i.e., the target with the most local records)
// and second (if there is a tie) by least load
// (i.e., the target with the least number of pending shard creation requests).
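//
// Additionally, for the in-memory dsorter (MemType), a "send order" is built and shipped
// along with the shards: roughly, sendOrder[tid] maps every output shard name to the subset
// of its records that reside locally on target tid, so that target knows which record
// payloads it will be asked to stream during the creation phase.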
func (m *Manager) phase3(maxSize int64) error {
	var (
		shards         []*shard.Shard
		err            error
		shardsToTarget = make(map[*meta.Snode][]*shard.Shard, m.smap.CountActiveTs())
		sendOrder      = make(map[string]map[string]*shard.Shard, m.smap.CountActiveTs())
		errCh          = make(chan error, m.smap.CountActiveTs())
	)
	for _, d := range m.smap.Tmap {
		if m.smap.InMaintOrDecomm(d) {
			continue
		}
		shardsToTarget[d] = nil
		if m.dsorter.name() == MemType {
			sendOrder[d.ID()] = make(map[string]*shard.Shard, 100)
		}
	}
	if m.Pars.OrderFileURL != "" {
		shards, err = m.generateShardsWithOrderingFile(maxSize)
	} else {
		shards, err = m.generateShardsWithTemplate(maxSize)
	}
	if err != nil {
		return err
	}

	bck := meta.CloneBck(&m.Pars.OutputBck)
	if err := bck.Init(core.T.Bowner()); err != nil {
		return err
	}
	for _, s := range shards {
		si, err := m.smap.HrwName2T(bck.MakeUname(s.Name))
		if err != nil {
			return err
		}
		shardsToTarget[si] = append(shardsToTarget[si], s)

		if m.dsorter.name() == MemType {
			singleSendOrder := make(map[string]*shard.Shard)
			for _, record := range s.Records.All() {
				shrd, ok := singleSendOrder[record.DaemonID]
				if !ok {
					shrd = &shard.Shard{
						Name:    s.Name,
						Records: shard.NewRecords(100),
					}
					singleSendOrder[record.DaemonID] = shrd
				}
				shrd.Records.Insert(record)
			}

			for tid, shard := range singleSendOrder {
				sendOrder[tid][shard.Name] = shard
			}
		}
	}

	m.recm.Records.Drain()

	wg := cos.NewLimitedWaitGroup(cmn.MaxParallelism(), len(shardsToTarget))
	for si, s := range shardsToTarget {
		wg.Add(1)
		go m._dist(si, s, sendOrder[si.ID()], errCh, wg)
	}

	wg.Wait()
	close(errCh)

	for err := range errCh {
		nlog.Errorf("%s: [dsort] %s err while sending shards: %v", core.T, m.ManagerUUID, err)
		return err
	}
	nlog.Infof("%s: [dsort] %s finished sending shards", core.T, m.ManagerUUID)
	return nil
}

func (m *Manager) _dist(si *meta.Snode, s []*shard.Shard, order map[string]*shard.Shard, errCh chan error, wg cos.WG) {
	var (
		group = &errgroup.Group{}
		r, w  = io.Pipe()
	)
	group.Go(func() error {
		var (
			buf, slab = g.mm.AllocSize(serializationBufSize)
			msgpw     = msgp.NewWriterBuf(w, buf)
			md        = &CreationPhaseMetadata{Shards: s, SendOrder: order}
		)
		err := md.EncodeMsg(msgpw)
		if err == nil {
			err = msgpw.Flush()
		}
		w.CloseWithError(err)
		slab.Free(buf)
		return err
	})
	group.Go(func() error {
		query := m.Pars.InputBck.NewQuery()
		reqArgs := &cmn.HreqArgs{
			Method: http.MethodPost,
			Base:   si.URL(cmn.NetIntraData),
			Path:   apc.URLPathdSortShards.Join(m.ManagerUUID),
			Query:  query,
			BodyR:  r,
		}
		err := m._do(reqArgs, si, "distribute shards")
		r.CloseWithError(err)
		return err
	})

	if err := group.Wait(); err != nil {
		errCh <- err
	}
	wg.Done()
}

func (m *Manager) _do(reqArgs *cmn.HreqArgs, tsi *meta.Snode, act string) error {
	req, errV := reqArgs.Req()
	if errV != nil {
		return errV
	}
	resp, err := m.client.Do(req) //nolint:bodyclose // cos.Close below
	if err != nil {
		return err
	}
	if resp.StatusCode != http.StatusOK {
		var b []byte
		b, err = io.ReadAll(resp.Body)
		if err == nil {
			err = fmt.Errorf("%s: %s failed to %s: %s", core.T, m.ManagerUUID, act, strings.TrimSuffix(string(b), "\n"))
		} else {
			err = fmt.Errorf("%s: %s failed to %s: got %v(%d) from %s", core.T, m.ManagerUUID, act, err,
				resp.StatusCode, tsi.StringEx())
		}
	}
	cos.Close(resp.Body)
	return err
}

//////////////////
// extractShard //
//////////////////

type extractShard struct {
	m       *Manager
	metrics *LocalExtraction
	name    string
	isRange bool
}

func (es *extractShard) do() (err error) {
	m := es.m
	shardName := es.name
	if es.isRange && m.Pars.InputExtension != "" {
		ext, errV := archive.Mime("", es.name) // from filename
		if errV == nil {
			if !archive.EqExt(ext, m.Pars.InputExtension) {
				if cmn.Rom.FastV(4, cos.SmoduleDsort) {
					nlog.Infof("%s: %s skipping %s: %q vs %q", core.T, m.ManagerUUID,
						es.name, ext, m.Pars.InputExtension)
				}
				return
			}
		} else {
			shardName = es.name + m.Pars.InputExtension
		}
	}
	lom := core.AllocLOM(shardName)

	err = es._do(lom)

	core.FreeLOM(lom)
	phaseInfo := &m.extractionPhase
	phaseInfo.adjuster.releaseGoroutineSema()
	return
}

func (es *extractShard) _do(lom *core.LOM) error {
	var (
		m                        = es.m
		estimateTotalRecordsSize uint64
		warnOOM                  bool
	)
	if err := lom.InitBck(&m.Pars.InputBck); err != nil {
		return err
	}
	if _, local, err := lom.HrwTarget(m.smap); err != nil || !local {
		return err
	}
	if err := lom.Load(false /*cache it*/, false /*locked*/); err != nil {
		if cmn.IsErrObjNought(err) {
			msg := fmt.Sprintf("shard.do: %q does not exist", lom.Cname())
			return m.react(m.Pars.MissingShards, msg)
		}
		return err
	}

	shardRW := m.shardRW
	if shardRW == nil {
		debug.Assert(!m.Pars.DryRun)
		ext, err := archive.Mime("", lom.FQN)
		if err != nil {
			return nil // skip
		}
		shardRW = shard.RWs[ext]
		debug.Assert(shardRW != nil, ext)
	}

	phaseInfo := &m.extractionPhase
	phaseInfo.adjuster.acquireSema(lom.Mountpath())
	if m.aborted() {
		phaseInfo.adjuster.releaseSema(lom.Mountpath())
		return m.newErrAborted()
	}

	cs := fs.Cap()
	if err := cs.Err(); err != nil {
		phaseInfo.adjuster.releaseSema(lom.Mountpath())
		m.abort(err)
		return err
	}

	lom.Lock(false)
	fh, err := lom.OpenFile()
	if err != nil {
		phaseInfo.adjuster.releaseSema(lom.Mountpath())
		lom.Unlock(false)
		return errors.Errorf("unable to open %s: %v", lom.Cname(), err)
	}

	expectedExtractedSize := uint64(float64(lom.SizeBytes()) / m.compressionRatio())
	toDisk := m.dsorter.preShardExtraction(expectedExtractedSize)

	extractedSize, extractedCount, err := shardRW.Extract(lom, fh, m.recm, toDisk)
	cos.Close(fh)

	m.addSizes(lom.SizeBytes(), extractedSize) // update compression rate

	phaseInfo.adjuster.releaseSema(lom.Mountpath())
	lom.Unlock(false)

	m.dsorter.postShardExtraction(expectedExtractedSize) // schedule freeing reserved memory on next memory update
	if err != nil {
		return errors.Errorf("failed to extract shard %s: %v", lom.Cname(), err)
	}

	if toDisk {
		g.tstats.Add(stats.DsortExtractShardDskCnt, 1)
	} else {
		g.tstats.Add(stats.DsortExtractShardMemCnt, 1)
	}
	g.tstats.Add(stats.DsortExtractShardSize, extractedSize)

	//
	// update metrics, check OOM
	//

	metrics := es.metrics
	metrics.mu.Lock()
	metrics.ExtractedRecordCnt += int64(extractedCount)
	metrics.ExtractedCnt++
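	// e.g. (illustrative): 1000 input shards with ~512 records each at ~300 B of
	// record metadata works out to ~150 MB that the final target must hold - hence
	// the memory check below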
	if metrics.ExtractedCnt == 1 && extractedCount > 0 {
		// After extracting the _first_ shard, estimate how much memory will be
		// required to keep all records in memory. One node will eventually hold
		// the records from all shards, so the estimate must cover the entire
		// dataset and not just this node's share.
		recordSize := int(m.recm.Records.RecordMemorySize())
		estimateTotalRecordsSize = uint64(metrics.TotalCnt * int64(extractedCount*recordSize))
		if estimateTotalRecordsSize > m.freeMemory() {
			warnOOM = true
		}
	}
	metrics.ExtractedSize += extractedSize
	if toDisk {
		metrics.ExtractedToDiskCnt++
		metrics.ExtractedToDiskSize += extractedSize
	}
	metrics.mu.Unlock()

	if warnOOM {
		msg := fmt.Sprintf("(estimated) total size of records (%d) will possibly exceed available memory (%s) during sorting phase",
			estimateTotalRecordsSize, m.Pars.MaxMemUsage)
		return m.react(cmn.WarnReaction, msg)
	}
	return nil
}