github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/api.go (about) 1 // Package dsort provides distributed massively parallel resharding for very large datasets. 2 /* 3 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dsort 6 7 import ( 8 "github.com/NVIDIA/aistore/api/apc" 9 "github.com/NVIDIA/aistore/cmn" 10 "github.com/NVIDIA/aistore/cmn/archive" 11 ) 12 13 const DefaultExt = archive.ExtTar // default shard extension/format/MIME when spec's input_extension is empty 14 15 const ( 16 algDefault = "" // default (alphanumeric, increasing) 17 Alphanumeric = "alphanumeric" // string comparison (decreasing or increasing) 18 None = "none" // none (used for resharding) 19 MD5 = "md5" // compare md5(name) 20 Shuffle = "shuffle" // random shuffle (use with the same seed to reproduce) 21 Content = "content" // extract (int, string, float) from a given file, and compare 22 ) 23 24 var algorithms = []string{algDefault, Alphanumeric, MD5, Shuffle, Content, None} 25 26 type Algorithm struct { 27 // one of the `algorithms` above 28 Kind string `json:"kind"` 29 30 // used with two sorting alg-s: Alphanumeric and Content 31 Decreasing bool `json:"decreasing"` 32 33 // when sort is a random shuffle 34 Seed string `json:"seed"` 35 36 // usage: exclusively for Content sorting 37 // e.g.: ".cls" containing sorting key for each record (sample) - see next 38 // NOTE: not to confuse with shards "input_extension" 39 Ext string `json:"extension"` 40 41 // ditto: Content only 42 // `shard.contentKeyTypes` enum values: {"int", "string", "float" } 43 ContentKeyType string `json:"content_key_type"` 44 } 45 46 // RequestSpec defines the user specification for requests to the endpoint /v1/sort. 47 type RequestSpec struct { 48 // Required 49 InputBck cmn.Bck `json:"input_bck" yaml:"input_bck"` 50 InputFormat apc.ListRange `json:"input_format" yaml:"input_format"` 51 OutputFormat string `json:"output_format" yaml:"output_format"` 52 OutputShardSize string `json:"output_shard_size" yaml:"output_shard_size"` 53 54 // Desirable 55 InputExtension string `json:"input_extension" yaml:"input_extension"` 56 57 // Optional 58 // Default: InputExtension 59 OutputExtension string `json:"output_extension" yaml:"output_extension"` 60 // Default: "" 61 Description string `json:"description" yaml:"description"` 62 // Default: same as `bck` field 63 OutputBck cmn.Bck `json:"output_bck" yaml:"output_bck"` 64 // Default: alphanumeric, increasing 65 Algorithm Algorithm `json:"algorithm" yaml:"algorithm"` 66 // Default: "" 67 OrderFileURL string `json:"order_file" yaml:"order_file"` 68 // Default: "\t" 69 OrderFileSep string `json:"order_file_sep" yaml:"order_file_sep"` 70 // Default: "80%" 71 MaxMemUsage string `json:"max_mem_usage" yaml:"max_mem_usage"` 72 // Default: calcMaxLimit() 73 ExtractConcMaxLimit int `json:"extract_concurrency_max_limit" yaml:"extract_concurrency_max_limit"` 74 // Default: calcMaxLimit() 75 CreateConcMaxLimit int `json:"create_concurrency_max_limit" yaml:"create_concurrency_max_limit"` 76 77 // debug 78 DsorterType string `json:"dsorter_type"` 79 DryRun bool `json:"dry_run"` // Default: false 80 81 Config cmn.DsortConf 82 }