github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/request_spec.go (about)

     1  // Package dsort provides distributed massively parallel resharding for very large datasets.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dsort
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  	"net/url"
    11  	"strconv"
    12  	"strings"
    13  
    14  	"github.com/NVIDIA/aistore/api/apc"
    15  	"github.com/NVIDIA/aistore/cmn"
    16  	"github.com/NVIDIA/aistore/cmn/archive"
    17  	"github.com/NVIDIA/aistore/cmn/cos"
    18  	"github.com/NVIDIA/aistore/ext/dsort/shard"
    19  )
    20  
// parsedInputTemplate is the parsed form of the request's input format
// (see parseInputFormat). The input is given either as an explicit list of
// object names or as a template; a template without ranges is a plain prefix.
type parsedInputTemplate struct {
	// Template is the parsed template; not populated when the input is a list.
	Template cos.ParsedTemplate `json:"template"`
	// ObjNames is the explicit list of object names (list input only).
	ObjNames []string           `json:"objnames"`
	// Prefix is set for an empty template (cos.EmptyMatchAll) or for
	// a template that has a prefix but no ranges.
	Prefix   string             `json:"prefix"`
}
    26  
// parsedOutputTemplate is the parsed form of the request's output format
// (see parseOutputFormat, which rejects templates that have no ranges).
type parsedOutputTemplate struct {
	// Used by 'bash' and 'at' template
	Template cos.ParsedTemplate
}
    31  
// ParsedReq is the exported result of RequestSpec.ParseCtx: the input and
// output buckets taken from the parsed spec, plus the (unexported) parsed spec.
//
// NOTE: ParseCtx constructs this struct with a positional literal, so the
// order of the fields must not change.
type ParsedReq struct {
	InputBck  cmn.Bck
	OutputBck cmn.Bck
	pars      *parsedReqSpec // complete parsed request spec
}
    37  
// parsedReqSpec is the validated and normalized form of RequestSpec as
// produced by (*RequestSpec).parse, with defaults from the global dsort
// config applied.
type parsedReqSpec struct {
	// source bucket; the provider defaults to ais when not specified
	InputBck            cmn.Bck               `json:"input_bck"`
	Description         string                `json:"description"`
	// destination bucket; same as InputBck when not specified
	OutputBck           cmn.Bck               `json:"output_bck"`
	// as returned by archive.Mime
	InputExtension      string                `json:"input_extension"`
	// as returned by archive.Mime; defaults to InputExtension
	OutputExtension     string                `json:"output_extension"`
	// in bytes (parsed with IEC units), never negative; JSON-encoded as a string
	OutputShardSize     int64                 `json:"output_shard_size,string"`
	// parsed input format
	Pit                 *parsedInputTemplate  `json:"pit"`
	// parsed output format; assigned by parse only when no order file URL is given
	Pot                 *parsedOutputTemplate `json:"pot"`
	// validated by parseAlgorithm
	Algorithm           *Algorithm            `json:"algorithm"`
	// order file URL and separator; assigned by parse only when the URL is non-empty
	// (the separator then defaults to "\t")
	OrderFileURL        string                `json:"order_file"`
	OrderFileSep        string                `json:"order_file_sep"`
	// parsed from the request or, when empty, from the global DefaultMaxMemUsage
	MaxMemUsage         cos.ParsedQuantity    `json:"max_mem_usage"`
	// NOTE(review): not assigned by parse; presumably set elsewhere - confirm
	TargetOrderSalt     []byte                `json:"target_order_salt"`
	// concurrency limits; negative values are rejected by parse
	ExtractConcMaxLimit int                   `json:"extract_concurrency_max_limit"`
	CreateConcMaxLimit  int                   `json:"create_concurrency_max_limit"`
	// request's value or, when zero, the global default
	SbundleMult         int                   `json:"bundle_multiplier"`

	// debug
	DsorterType string `json:"dsorter_type"`
	DryRun      bool   `json:"dry_run"`

	// per-request config; selected empty fields are filled in from the global config
	cmn.DsortConf
}
    62  
    63  /////////////////
    64  // RequestSpec //
    65  /////////////////
    66  
    67  func specErr(s string, err error) error { return fmt.Errorf("[dsort] parse-spec: %q %w", s, err) }
    68  
    69  func (rs *RequestSpec) ParseCtx() (*ParsedReq, error) {
    70  	pars, err := rs.parse()
    71  	return &ParsedReq{pars.InputBck, pars.OutputBck, pars}, err
    72  }
    73  
// parse validates the request spec and converts it into a parsedReqSpec,
// applying defaults from the global dsort config (cmn.GCO.Get().Dsort).
//
// Steps, in order: source bucket, destination bucket, input format and
// extension, output shard size, algorithm, order file vs. output format,
// output extension, memory and concurrency limits, per-request config.
//
// NOTE: parse has side effects on the receiver: rs.InputExtension,
// rs.OutputExtension, and rs.MaxMemUsage may be assigned derived or default
// values, and subsequent steps read them back.
//
// NOTE: on bucket validation errors the partially populated spec is returned
// along with the error; on all later errors the returned spec is nil.
// Callers must check the error before using the result.
func (rs *RequestSpec) parse() (*parsedReqSpec, error) {
	var (
		cfg  = cmn.GCO.Get().Dsort
		pars = &parsedReqSpec{}
	)

	// src bck
	if rs.InputBck.IsEmpty() {
		return pars, specErr("input_bck", errMissingSrcBucket)
	}
	pars.InputBck = rs.InputBck
	if rs.InputBck.Provider == "" {
		pars.InputBck.Provider = apc.AIS // NOTE: ais:// is the default
	} else {
		normp, err := cmn.NormalizeProvider(rs.InputBck.Provider)
		if err != nil {
			return pars, specErr("input_bck_provider", err)
		}
		pars.InputBck.Provider = normp
	}
	// validation runs on the request's bucket as given (rs.InputBck),
	// not on the copy with the normalized provider
	if err := rs.InputBck.Validate(); err != nil {
		return pars, specErr("input_bck", err)
	}

	pars.Description = rs.Description

	// dst bck
	pars.OutputBck = rs.OutputBck
	if pars.OutputBck.IsEmpty() {
		pars.OutputBck = pars.InputBck // NOTE: source can be the destination as well
	} else {
		normp, err := cmn.NormalizeProvider(rs.OutputBck.Provider)
		if err != nil {
			return pars, specErr("output_bck_provider", err)
		}
		pars.OutputBck.Provider = normp
		if err := rs.OutputBck.Validate(); err != nil {
			return pars, specErr("output_bck", err)
		}
	}

	// input format
	var err error
	pars.Pit, err = parseInputFormat(rs.InputFormat)
	if err != nil {
		return nil, specErr("input_format", err)
	}
	if rs.InputFormat.Template != "" {
		// template is not a filename but all we do here is
		// checking the template's suffix for specific supported extensions
		if ext, err := archive.Mime("", rs.InputFormat.Template); err == nil {
			// an explicitly specified extension must agree with the template's suffix
			// NOTE(review): the raw (not yet normalized) rs.InputExtension is compared here;
			// whether an equivalent but differently spelled value can trip this check
			// depends on archive.Mime - confirm
			if rs.InputExtension != "" && rs.InputExtension != ext {
				return nil, fmt.Errorf("input_extension: %q vs %q", rs.InputExtension, ext)
			}
			rs.InputExtension = ext // side effect: recorded in the request spec
		}
	}
	if rs.InputExtension != "" {
		pars.InputExtension, err = archive.Mime(rs.InputExtension, "")
		if err != nil {
			return nil, specErr("input_extension", err)
		}
	}

	// output format
	pars.OutputShardSize, err = cos.ParseSize(rs.OutputShardSize, cos.UnitsIEC)
	if err != nil {
		return nil, specErr("output_shard_size", err)
	}
	if pars.OutputShardSize < 0 {
		return nil, fmt.Errorf(fmtErrNegOutputSize, pars.OutputShardSize)
	}
	pars.Algorithm, err = parseAlgorithm(rs.Algorithm)
	if err != nil {
		return nil, specErr("algorithm", err)
	}

	// NOTE: despite its name, `isOrder` is true when rs.OrderFileURL is empty,
	// i.e., when NO order file is used (validateOrderFileURL reports `empty`)
	var isOrder bool
	if isOrder, err = validateOrderFileURL(rs.OrderFileURL); err != nil {
		return nil, fmt.Errorf(fmtErrOrderURL, rs.OrderFileURL, err)
	}
	if isOrder {
		// no order file: the output format (template) is required
		if pars.Pot, err = parseOutputFormat(rs.OutputFormat); err != nil {
			return nil, err
		}
		if pars.Pot.Template.Count() > math.MaxInt32 {
			// If the template's count is not defined (exceeds math.MaxInt32)
			// the output shard size must be set
			if pars.OutputShardSize == 0 {
				return nil, errMissingOutputSize
			}
		}
		if rs.OutputFormat != "" {
			// (ditto): same suffix-based extension check as for the input template
			if ext, err := archive.Mime("", rs.OutputFormat); err == nil {
				if rs.OutputExtension != "" && rs.OutputExtension != ext {
					return nil, fmt.Errorf("output_extension: %q vs %q", rs.OutputExtension, ext)
				}
				rs.OutputExtension = ext // side effect: recorded in the request spec
			}
		}
	} else {
		// For the order file the output shard size must be set.
		if pars.OutputShardSize == 0 {
			return nil, errMissingOutputSize
		}
		pars.OrderFileURL = rs.OrderFileURL
		pars.OrderFileSep = rs.OrderFileSep
		if pars.OrderFileSep == "" {
			pars.OrderFileSep = "\t" // default separator
		}
	}
	if rs.OutputExtension == "" {
		pars.OutputExtension = pars.InputExtension // default
	} else {
		pars.OutputExtension, err = archive.Mime(rs.OutputExtension, "")
		if err != nil {
			return nil, specErr("output_extension", err)
		}
	}

	// mem & conc
	if rs.MaxMemUsage == "" {
		rs.MaxMemUsage = cfg.DefaultMaxMemUsage // side effect: default recorded in the request spec
	}
	pars.MaxMemUsage, err = cos.ParseQuantity(rs.MaxMemUsage)
	if err != nil {
		return nil, err
	}
	if rs.ExtractConcMaxLimit < 0 {
		return nil, fmt.Errorf("%w ('extract', %d)", errNegConcLimit, rs.ExtractConcMaxLimit)
	}
	if rs.CreateConcMaxLimit < 0 {
		return nil, fmt.Errorf("%w ('create', %d)", errNegConcLimit, rs.CreateConcMaxLimit)
	}

	pars.ExtractConcMaxLimit = rs.ExtractConcMaxLimit
	pars.CreateConcMaxLimit = rs.CreateConcMaxLimit
	pars.DsorterType = rs.DsorterType
	pars.DryRun = rs.DryRun

	// `cfg` here contains inherited (aka global) part of the dsort config -
	// apply this request's rs.Config values to override or assign defaults

	if err := rs.Config.ValidateWithOpts(true); err != nil {
		return nil, err
	}
	pars.DsortConf = rs.Config

	// zero/empty per-request values fall back to the global config
	pars.SbundleMult = rs.Config.SbundleMult
	if pars.SbundleMult == 0 {
		pars.SbundleMult = cfg.SbundleMult
	}
	if pars.MissingShards == "" {
		pars.MissingShards = cfg.MissingShards
	}
	if pars.EKMMalformedLine == "" {
		pars.EKMMalformedLine = cfg.EKMMalformedLine
	}
	if pars.EKMMissingKey == "" {
		pars.EKMMissingKey = cfg.EKMMissingKey
	}
	if pars.DuplicatedRecords == "" {
		pars.DuplicatedRecords = cfg.DuplicatedRecords
	}
	if pars.DsorterMemThreshold == "" {
		pars.DsorterMemThreshold = cfg.DsorterMemThreshold
	}

	return pars, nil
}
   244  
   245  func parseAlgorithm(alg Algorithm) (*Algorithm, error) {
   246  	if !cos.StringInSlice(alg.Kind, algorithms) {
   247  		return nil, fmt.Errorf(fmtErrInvalidAlg, algorithms)
   248  	}
   249  	if alg.Seed != "" {
   250  		if value, err := strconv.ParseInt(alg.Seed, 10, 64); value < 0 || err != nil {
   251  			return nil, fmt.Errorf(fmtErrSeed, alg.Seed)
   252  		}
   253  	}
   254  	if alg.Kind == Content {
   255  		alg.Ext = strings.TrimSpace(alg.Ext)
   256  		if alg.Ext == "" || alg.Ext[0] != '.' {
   257  			return nil, fmt.Errorf("%w %q", errAlgExt, alg.Ext)
   258  		}
   259  		if err := shard.ValidateContentKeyTy(alg.ContentKeyType); err != nil {
   260  			return nil, err
   261  		}
   262  	} else {
   263  		alg.ContentKeyType = shard.ContentKeyString
   264  	}
   265  
   266  	return &alg, nil
   267  }
   268  
   269  func validateOrderFileURL(orderURL string) (empty bool, err error) {
   270  	if orderURL == "" {
   271  		return true, nil
   272  	}
   273  	_, err = url.ParseRequestURI(orderURL)
   274  	return
   275  }
   276  
   277  //////////////////////////
   278  // parsedOutputTemplate //
   279  //////////////////////////
   280  
   281  func parseOutputFormat(outputFormat string) (pot *parsedOutputTemplate, err error) {
   282  	pot = &parsedOutputTemplate{}
   283  	if pot.Template, err = cos.NewParsedTemplate(strings.TrimSpace(outputFormat)); err != nil {
   284  		return
   285  	}
   286  	if len(pot.Template.Ranges) == 0 {
   287  		return nil, fmt.Errorf("invalid output template %q: no ranges (prefix-only output is not supported)",
   288  			outputFormat)
   289  	}
   290  	return
   291  }
   292  
   293  /////////////////////////
   294  // parsedInputTemplate //
   295  /////////////////////////
   296  
   297  func parseInputFormat(inputFormat apc.ListRange) (pit *parsedInputTemplate, err error) {
   298  	pit = &parsedInputTemplate{}
   299  	if inputFormat.IsList() {
   300  		pit.ObjNames = inputFormat.ObjNames
   301  		return
   302  	}
   303  	pit.Template, err = cos.NewParsedTemplate(inputFormat.Template)
   304  
   305  	if err == cos.ErrEmptyTemplate {
   306  		// empty template => empty prefix (match any)
   307  		err = nil
   308  		pit.Prefix = cos.EmptyMatchAll
   309  	} else if err == nil && len(pit.Template.Ranges) == 0 {
   310  		// prefix only
   311  		pit.Prefix = pit.Template.Prefix
   312  	}
   313  	return
   314  }
   315  
   316  func (pit *parsedInputTemplate) isList() bool   { return len(pit.ObjNames) > 0 }
   317  func (pit *parsedInputTemplate) isRange() bool  { return len(pit.Template.Ranges) > 0 }
   318  func (pit *parsedInputTemplate) isPrefix() bool { return !pit.isList() && !pit.isRange() }