github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ext/dsort/request_spec.go (about) 1 // Package dsort provides distributed massively parallel resharding for very large datasets. 2 /* 3 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved. 4 */ 5 package dsort 6 7 import ( 8 "fmt" 9 "math" 10 "net/url" 11 "strconv" 12 "strings" 13 14 "github.com/NVIDIA/aistore/api/apc" 15 "github.com/NVIDIA/aistore/cmn" 16 "github.com/NVIDIA/aistore/cmn/archive" 17 "github.com/NVIDIA/aistore/cmn/cos" 18 "github.com/NVIDIA/aistore/ext/dsort/shard" 19 ) 20 21 type parsedInputTemplate struct { 22 Template cos.ParsedTemplate `json:"template"` 23 ObjNames []string `json:"objnames"` 24 Prefix string `json:"prefix"` 25 } 26 27 type parsedOutputTemplate struct { 28 // Used by 'bash' and 'at' template 29 Template cos.ParsedTemplate 30 } 31 32 type ParsedReq struct { 33 InputBck cmn.Bck 34 OutputBck cmn.Bck 35 pars *parsedReqSpec 36 } 37 38 type parsedReqSpec struct { 39 InputBck cmn.Bck `json:"input_bck"` 40 Description string `json:"description"` 41 OutputBck cmn.Bck `json:"output_bck"` 42 InputExtension string `json:"input_extension"` 43 OutputExtension string `json:"output_extension"` 44 OutputShardSize int64 `json:"output_shard_size,string"` 45 Pit *parsedInputTemplate `json:"pit"` 46 Pot *parsedOutputTemplate `json:"pot"` 47 Algorithm *Algorithm `json:"algorithm"` 48 OrderFileURL string `json:"order_file"` 49 OrderFileSep string `json:"order_file_sep"` 50 MaxMemUsage cos.ParsedQuantity `json:"max_mem_usage"` 51 TargetOrderSalt []byte `json:"target_order_salt"` 52 ExtractConcMaxLimit int `json:"extract_concurrency_max_limit"` 53 CreateConcMaxLimit int `json:"create_concurrency_max_limit"` 54 SbundleMult int `json:"bundle_multiplier"` 55 56 // debug 57 DsorterType string `json:"dsorter_type"` 58 DryRun bool `json:"dry_run"` 59 60 cmn.DsortConf 61 } 62 63 ///////////////// 64 // RequestSpec // 65 ///////////////// 66 67 func specErr(s string, err error) error { return fmt.Errorf("[dsort] parse-spec: %q %w", s, err) } 68 69 func (rs *RequestSpec) ParseCtx() (*ParsedReq, error) { 70 pars, err := rs.parse() 71 return &ParsedReq{pars.InputBck, pars.OutputBck, pars}, err 72 } 73 74 func (rs *RequestSpec) parse() (*parsedReqSpec, error) { 75 var ( 76 cfg = cmn.GCO.Get().Dsort 77 pars = &parsedReqSpec{} 78 ) 79 80 // src bck 81 if rs.InputBck.IsEmpty() { 82 return pars, specErr("input_bck", errMissingSrcBucket) 83 } 84 pars.InputBck = rs.InputBck 85 if rs.InputBck.Provider == "" { 86 pars.InputBck.Provider = apc.AIS // NOTE: ais:// is the default 87 } else { 88 normp, err := cmn.NormalizeProvider(rs.InputBck.Provider) 89 if err != nil { 90 return pars, specErr("input_bck_provider", err) 91 } 92 pars.InputBck.Provider = normp 93 } 94 if err := rs.InputBck.Validate(); err != nil { 95 return pars, specErr("input_bck", err) 96 } 97 98 pars.Description = rs.Description 99 100 // dst bck 101 pars.OutputBck = rs.OutputBck 102 if pars.OutputBck.IsEmpty() { 103 pars.OutputBck = pars.InputBck // NOTE: source can be the destination as well 104 } else { 105 normp, err := cmn.NormalizeProvider(rs.OutputBck.Provider) 106 if err != nil { 107 return pars, specErr("output_bck_provider", err) 108 } 109 pars.OutputBck.Provider = normp 110 if err := rs.OutputBck.Validate(); err != nil { 111 return pars, specErr("output_bck", err) 112 } 113 } 114 115 // input format 116 var err error 117 pars.Pit, err = parseInputFormat(rs.InputFormat) 118 if err != nil { 119 return nil, specErr("input_format", err) 120 } 121 if rs.InputFormat.Template != "" { 122 // template is not a filename but all we do here is 123 // checking the template's suffix for specific supported extensions 124 if ext, err := archive.Mime("", rs.InputFormat.Template); err == nil { 125 if rs.InputExtension != "" && rs.InputExtension != ext { 126 return nil, fmt.Errorf("input_extension: %q vs %q", rs.InputExtension, ext) 127 } 128 rs.InputExtension = ext 129 } 130 } 131 if rs.InputExtension != "" { 132 pars.InputExtension, err = archive.Mime(rs.InputExtension, "") 133 if err != nil { 134 return nil, specErr("input_extension", err) 135 } 136 } 137 138 // output format 139 pars.OutputShardSize, err = cos.ParseSize(rs.OutputShardSize, cos.UnitsIEC) 140 if err != nil { 141 return nil, specErr("output_shard_size", err) 142 } 143 if pars.OutputShardSize < 0 { 144 return nil, fmt.Errorf(fmtErrNegOutputSize, pars.OutputShardSize) 145 } 146 pars.Algorithm, err = parseAlgorithm(rs.Algorithm) 147 if err != nil { 148 return nil, specErr("algorithm", err) 149 } 150 151 var isOrder bool 152 if isOrder, err = validateOrderFileURL(rs.OrderFileURL); err != nil { 153 return nil, fmt.Errorf(fmtErrOrderURL, rs.OrderFileURL, err) 154 } 155 if isOrder { 156 if pars.Pot, err = parseOutputFormat(rs.OutputFormat); err != nil { 157 return nil, err 158 } 159 if pars.Pot.Template.Count() > math.MaxInt32 { 160 // If the count is not defined the output shard size must be 161 if pars.OutputShardSize == 0 { 162 return nil, errMissingOutputSize 163 } 164 } 165 if rs.OutputFormat != "" { 166 // (ditto) 167 if ext, err := archive.Mime("", rs.OutputFormat); err == nil { 168 if rs.OutputExtension != "" && rs.OutputExtension != ext { 169 return nil, fmt.Errorf("output_extension: %q vs %q", rs.OutputExtension, ext) 170 } 171 rs.OutputExtension = ext 172 } 173 } 174 } else { 175 // For the order file the output shard size must be set. 176 if pars.OutputShardSize == 0 { 177 return nil, errMissingOutputSize 178 } 179 pars.OrderFileURL = rs.OrderFileURL 180 pars.OrderFileSep = rs.OrderFileSep 181 if pars.OrderFileSep == "" { 182 pars.OrderFileSep = "\t" 183 } 184 } 185 if rs.OutputExtension == "" { 186 pars.OutputExtension = pars.InputExtension // default 187 } else { 188 pars.OutputExtension, err = archive.Mime(rs.OutputExtension, "") 189 if err != nil { 190 return nil, specErr("output_extension", err) 191 } 192 } 193 194 // mem & conc 195 if rs.MaxMemUsage == "" { 196 rs.MaxMemUsage = cfg.DefaultMaxMemUsage 197 } 198 pars.MaxMemUsage, err = cos.ParseQuantity(rs.MaxMemUsage) 199 if err != nil { 200 return nil, err 201 } 202 if rs.ExtractConcMaxLimit < 0 { 203 return nil, fmt.Errorf("%w ('extract', %d)", errNegConcLimit, rs.ExtractConcMaxLimit) 204 } 205 if rs.CreateConcMaxLimit < 0 { 206 return nil, fmt.Errorf("%w ('create', %d)", errNegConcLimit, rs.CreateConcMaxLimit) 207 } 208 209 pars.ExtractConcMaxLimit = rs.ExtractConcMaxLimit 210 pars.CreateConcMaxLimit = rs.CreateConcMaxLimit 211 pars.DsorterType = rs.DsorterType 212 pars.DryRun = rs.DryRun 213 214 // `cfg` here contains inherited (aka global) part of the dsort config - 215 // apply this request's rs.Config values to override or assign defaults 216 217 if err := rs.Config.ValidateWithOpts(true); err != nil { 218 return nil, err 219 } 220 pars.DsortConf = rs.Config 221 222 pars.SbundleMult = rs.Config.SbundleMult 223 if pars.SbundleMult == 0 { 224 pars.SbundleMult = cfg.SbundleMult 225 } 226 if pars.MissingShards == "" { 227 pars.MissingShards = cfg.MissingShards 228 } 229 if pars.EKMMalformedLine == "" { 230 pars.EKMMalformedLine = cfg.EKMMalformedLine 231 } 232 if pars.EKMMissingKey == "" { 233 pars.EKMMissingKey = cfg.EKMMissingKey 234 } 235 if pars.DuplicatedRecords == "" { 236 pars.DuplicatedRecords = cfg.DuplicatedRecords 237 } 238 if pars.DsorterMemThreshold == "" { 239 pars.DsorterMemThreshold = cfg.DsorterMemThreshold 240 } 241 242 return pars, nil 243 } 244 245 func parseAlgorithm(alg Algorithm) (*Algorithm, error) { 246 if !cos.StringInSlice(alg.Kind, algorithms) { 247 return nil, fmt.Errorf(fmtErrInvalidAlg, algorithms) 248 } 249 if alg.Seed != "" { 250 if value, err := strconv.ParseInt(alg.Seed, 10, 64); value < 0 || err != nil { 251 return nil, fmt.Errorf(fmtErrSeed, alg.Seed) 252 } 253 } 254 if alg.Kind == Content { 255 alg.Ext = strings.TrimSpace(alg.Ext) 256 if alg.Ext == "" || alg.Ext[0] != '.' { 257 return nil, fmt.Errorf("%w %q", errAlgExt, alg.Ext) 258 } 259 if err := shard.ValidateContentKeyTy(alg.ContentKeyType); err != nil { 260 return nil, err 261 } 262 } else { 263 alg.ContentKeyType = shard.ContentKeyString 264 } 265 266 return &alg, nil 267 } 268 269 func validateOrderFileURL(orderURL string) (empty bool, err error) { 270 if orderURL == "" { 271 return true, nil 272 } 273 _, err = url.ParseRequestURI(orderURL) 274 return 275 } 276 277 ////////////////////////// 278 // parsedOutputTemplate // 279 ////////////////////////// 280 281 func parseOutputFormat(outputFormat string) (pot *parsedOutputTemplate, err error) { 282 pot = &parsedOutputTemplate{} 283 if pot.Template, err = cos.NewParsedTemplate(strings.TrimSpace(outputFormat)); err != nil { 284 return 285 } 286 if len(pot.Template.Ranges) == 0 { 287 return nil, fmt.Errorf("invalid output template %q: no ranges (prefix-only output is not supported)", 288 outputFormat) 289 } 290 return 291 } 292 293 ///////////////////////// 294 // parsedInputTemplate // 295 ///////////////////////// 296 297 func parseInputFormat(inputFormat apc.ListRange) (pit *parsedInputTemplate, err error) { 298 pit = &parsedInputTemplate{} 299 if inputFormat.IsList() { 300 pit.ObjNames = inputFormat.ObjNames 301 return 302 } 303 pit.Template, err = cos.NewParsedTemplate(inputFormat.Template) 304 305 if err == cos.ErrEmptyTemplate { 306 // empty template => empty prefix (match any) 307 err = nil 308 pit.Prefix = cos.EmptyMatchAll 309 } else if err == nil && len(pit.Template.Ranges) == 0 { 310 // prefix only 311 pit.Prefix = pit.Template.Prefix 312 } 313 return 314 } 315 316 func (pit *parsedInputTemplate) isList() bool { return len(pit.ObjNames) > 0 } 317 func (pit *parsedInputTemplate) isRange() bool { return len(pit.Template.Ranges) > 0 } 318 func (pit *parsedInputTemplate) isPrefix() bool { return !pit.isList() && !pit.isRange() }