github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/api.go (about)

     1  // Package dsort provides distributed massively parallel resharding for very large datasets.
     2  /*
     3   * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package dsort
     6  
     7  import (
     8  	"github.com/NVIDIA/aistore/api/apc"
     9  	"github.com/NVIDIA/aistore/cmn"
    10  	"github.com/NVIDIA/aistore/cmn/archive"
    11  )
    12  
    13  const DefaultExt = archive.ExtTar // default shard extension/format/MIME; used when the spec's input_extension is empty
    14  
    15  const (
        	// Sorting algorithm kinds: the supported values of Algorithm.Kind
        	// (all of them are enumerated in `algorithms`).
    16  	algDefault   = ""             // empty kind selects the default (alphanumeric, increasing)
    17  	Alphanumeric = "alphanumeric" // string comparison (decreasing or increasing)
    18  	None         = "none"         // none (used for resharding)
    19  	MD5          = "md5"          // compare md5(name)
    20  	Shuffle      = "shuffle"      // random shuffle (use with the same seed to reproduce)
    21  	Content      = "content"      // extract a key (int, string, or float) from a given file, and compare
    22  )
    23  
    24  var algorithms = []string{algDefault, Alphanumeric, MD5, Shuffle, Content, None} // accepted values of Algorithm.Kind
    25  
        // Algorithm describes how records are ordered: the kind of sort plus the
        // options that apply to particular kinds. It is embedded in RequestSpec
        // as the `algorithm` field.
        //
        // NOTE(review): the fields below carry only `json` tags, whereas most
        // RequestSpec fields also carry `yaml` tags - confirm that YAML specs
        // do not depend on the missing tags.
    26  type Algorithm struct {
    27  	// one of the `algorithms` above;
        	// the empty string selects the default (alphanumeric, increasing)
    28  	Kind string `json:"kind"`
    29  
    30  	// sort direction; used with two sorting algorithms only: Alphanumeric and Content
    31  	Decreasing bool `json:"decreasing"`
    32  
    33  	// used when the sort is a random shuffle (Shuffle);
        	// use the same seed to reproduce the same order
    34  	Seed string `json:"seed"`
    35  
    36  	// usage: exclusively for Content sorting
    37  	// e.g.: ".cls" containing the sorting key for each record (sample) - see next
    38  	// NOTE: not to be confused with the shards' "input_extension"
    39  	Ext string `json:"extension"`
    40  
    41  	// ditto: Content only
    42  	// `shard.contentKeyTypes` enum values: {"int", "string", "float"}
    43  	ContentKeyType string `json:"content_key_type"`
    44  }
    45  
    46  // RequestSpec defines the user specification for requests to the endpoint /v1/sort.
        // Fields are grouped as required, desirable, and optional (each optional
        // field is annotated with its default), followed by debug-only settings.
    47  type RequestSpec struct {
    48  	// Required
    49  	InputBck        cmn.Bck       `json:"input_bck" yaml:"input_bck"`
    50  	InputFormat     apc.ListRange `json:"input_format" yaml:"input_format"`
    51  	OutputFormat    string        `json:"output_format" yaml:"output_format"`
    52  	OutputShardSize string        `json:"output_shard_size" yaml:"output_shard_size"`
    53  
    54  	// Desirable
        	// (per the DefaultExt comment, DefaultExt applies when this is empty)
    55  	InputExtension string `json:"input_extension" yaml:"input_extension"`
    56  
    57  	// Optional
    58  	// Default: InputExtension
    59  	OutputExtension string `json:"output_extension" yaml:"output_extension"`
    60  	// Default: ""
    61  	Description string `json:"description" yaml:"description"`
    62  	// Default: same as the input bucket (`input_bck`)
    63  	OutputBck cmn.Bck `json:"output_bck" yaml:"output_bck"`
    64  	// Default: alphanumeric, increasing
    65  	Algorithm Algorithm `json:"algorithm" yaml:"algorithm"`
    66  	// Default: ""
    67  	OrderFileURL string `json:"order_file" yaml:"order_file"`
    68  	// Default: "\t"
    69  	OrderFileSep string `json:"order_file_sep" yaml:"order_file_sep"`
    70  	// Default: "80%"
    71  	MaxMemUsage string `json:"max_mem_usage" yaml:"max_mem_usage"`
    72  	// Default: calcMaxLimit()
    73  	ExtractConcMaxLimit int `json:"extract_concurrency_max_limit" yaml:"extract_concurrency_max_limit"`
    74  	// Default: calcMaxLimit()
    75  	CreateConcMaxLimit int `json:"create_concurrency_max_limit" yaml:"create_concurrency_max_limit"`
    76  
    77  	// debug
        	// NOTE(review): DsorterType and DryRun have no `yaml` tags, unlike the
        	// fields above - confirm that this is intentional.
    78  	DsorterType string `json:"dsorter_type"`
    79  	DryRun      bool   `json:"dry_run"` // Default: false
    80  
        	// NOTE(review): no `json`/`yaml` tags, so serializers fall back to
        	// their default naming for this field - confirm that this is intended.
    81  	Config cmn.DsortConf
    82  }