github.com/minio/mc@v0.0.0-20240503112107-b471de8d1882/cmd/sql-main.go (about) 1 // Copyright (c) 2015-2022 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package cmd 19 20 import ( 21 "bufio" 22 "compress/bzip2" 23 "compress/gzip" 24 "context" 25 "errors" 26 "fmt" 27 "io" 28 "os" 29 "path/filepath" 30 "regexp" 31 "strings" 32 "time" 33 34 "github.com/minio/cli" 35 "github.com/minio/mc/pkg/probe" 36 "github.com/minio/minio-go/v7" 37 "github.com/minio/pkg/v2/mimedb" 38 ) 39 40 var sqlFlags = []cli.Flag{ 41 cli.StringFlag{ 42 Name: "query, e", 43 Usage: "sql query expression", 44 Value: "select * from s3object", 45 }, 46 cli.BoolFlag{ 47 Name: "recursive, r", 48 Usage: "sql query recursively", 49 }, 50 cli.StringFlag{ 51 Name: "csv-input", 52 Usage: "csv input serialization option", 53 }, 54 cli.StringFlag{ 55 Name: "json-input", 56 Usage: "json input serialization option", 57 }, 58 cli.StringFlag{ 59 Name: "compression", 60 Usage: "input compression type", 61 }, 62 cli.StringFlag{ 63 Name: "csv-output", 64 Usage: "csv output serialization option", 65 }, 66 cli.StringFlag{ 67 Name: "csv-output-header", 68 Usage: "optional csv output header ", 69 }, 70 cli.StringFlag{ 71 Name: "json-output", 72 Usage: "json output serialization option", 73 }, 74 } 75 76 // Display contents of a file. 77 var sqlCmd = cli.Command{ 78 Name: "sql", 79 Usage: "run sql queries on objects", 80 Action: mainSQL, 81 OnUsageError: onUsageError, 82 Before: setGlobalsFromContext, 83 Flags: append(append(sqlFlags, encCFlag), globalFlags...), 84 CustomHelpTemplate: `NAME: 85 {{.HelpName}} - {{.Usage}} 86 87 USAGE: 88 {{.HelpName}} [FLAGS] TARGET [TARGET...] 89 {{if .VisibleFlags}} 90 FLAGS: 91 {{range .VisibleFlags}}{{.}} 92 {{end}}{{end}} 93 94 SERIALIZATION OPTIONS: 95 For query serialization options, refer to https://min.io/docs/minio/linux/reference/minio-mc/mc-sql.html#command-mc.sql 96 97 EXAMPLES: 98 1. Run a query on a set of objects recursively on AWS S3. 99 {{.Prompt}} {{.HelpName}} --recursive --query "select * from S3Object" s3/personalbucket/my-large-csvs/ 100 101 2. Run a query on an object on MinIO. 102 {{.Prompt}} {{.HelpName}} --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio.csv 103 104 3. Run a query on an encrypted object with customer provided keys. 105 {{.Prompt}} {{.HelpName}} --enc-c "myminio/iot-devices=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDA" \ 106 --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio-encrypted.csv 107 108 4. Run a query on an object on MinIO in gzip format using ; as field delimiter, 109 newline as record delimiter and file header to be used 110 {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \ 111 --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio.csv.gz 112 113 5. Run a query on an object on MinIO in gzip format using ; as field delimiter, 114 newline as record delimiter and file header to be used 115 {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \ 116 --json-output "rd=\n\n" --query "select * from S3Object" myminio/iot-devices/data.csv 117 118 6. Run same query as in 5., but specify csv output headers. If --csv-output-headers is 119 specified as "", first row of csv is interpreted as header 120 {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \ 121 --csv-output "rd=\n" --csv-output-header "device_id,uptime,lat,lon" \ 122 --query "select * from S3Object" myminio/iot-devices/data.csv 123 `, 124 } 125 126 // valid CSV and JSON keys for input/output serialization 127 var ( 128 validCSVCommonKeys = []string{"FieldDelimiter", "QuoteChar", "QuoteEscChar"} 129 validCSVInputKeys = []string{"Comments", "FileHeader", "QuotedRecordDelimiter", "RecordDelimiter"} 130 validCSVOutputKeys = []string{"QuoteFields"} 131 132 validJSONInputKeys = []string{"Type"} 133 validJSONCSVCommonOutputKeys = []string{"RecordDelimiter"} 134 135 // mapping of abbreviation to long form name of CSV and JSON input/output serialization keys 136 validCSVInputAbbrKeys = map[string]string{"cc": "Comments", "fh": "FileHeader", "qrd": "QuotedRecordDelimiter", "rd": "RecordDelimiter", "fd": "FieldDelimiter", "qc": "QuoteChar", "qec": "QuoteEscChar"} 137 validCSVOutputAbbrKeys = map[string]string{"qf": "QuoteFields", "rd": "RecordDelimiter", "fd": "FieldDelimiter", "qc": "QuoteChar", "qec": "QuoteEscChar"} 138 validJSONOutputAbbrKeys = map[string]string{"rd": "RecordDelimiter"} 139 ) 140 141 // parseKVArgs parses string of the form k=v delimited by "," 142 // into a map of k-v pairs 143 func parseKVArgs(is string) (map[string]string, *probe.Error) { 144 kvmap := make(map[string]string) 145 var key, value string 146 var s, e int // tracking start and end of value 147 var index int // current index in string 148 if is != "" { 149 for index < len(is) { 150 i := strings.Index(is[index:], "=") 151 if i == -1 { 152 return nil, probe.NewError(errors.New("Arguments should be of the form key=value,... ")) 153 } 154 key = is[index : index+i] 155 s = i + index + 1 156 e = strings.Index(is[s:], ",") 157 delimFound := false 158 for !delimFound { 159 if e == -1 || e+s >= len(is) { 160 delimFound = true 161 break 162 } 163 if string(is[s+e]) != "," { 164 delimFound = true 165 if string(is[s+e-1]) == "," { 166 e-- 167 } 168 } else { 169 e++ 170 } 171 } 172 vEnd := len(is) 173 if e != -1 { 174 vEnd = s + e 175 } 176 177 value = is[s:vEnd] 178 index = vEnd + 1 179 if _, ok := kvmap[strings.ToLower(key)]; ok { 180 return nil, probe.NewError(fmt.Errorf("More than one key=value found for %s", strings.TrimSpace(key))) 181 } 182 kvmap[strings.ToLower(key)] = strings.NewReplacer(`\n`, "\n", `\t`, "\t", `\r`, "\r").Replace(value) 183 } 184 } 185 return kvmap, nil 186 } 187 188 // returns a string with list of serialization options and abbreviation(s) if any 189 func fmtString(validAbbr map[string]string, validKeys []string) string { 190 var sb strings.Builder 191 i := 0 192 for k, v := range validAbbr { 193 sb.WriteString(fmt.Sprintf("%s(%s) ", v, k)) 194 i++ 195 if i != len(validAbbr) { 196 sb.WriteString(",") 197 } 198 } 199 if len(sb.String()) == 0 { 200 for _, k := range validKeys { 201 sb.WriteString(fmt.Sprintf("%s ", k)) 202 } 203 } 204 return sb.String() 205 } 206 207 // parses the input string and constructs a k-v map, replacing any abbreviated keys with actual keys 208 func parseSerializationOpts(inp string, validKeys []string, validAbbrKeys map[string]string) (map[string]string, *probe.Error) { 209 if validAbbrKeys == nil { 210 validAbbrKeys = make(map[string]string) 211 } 212 validKeyFn := func(key string, validKeys []string) bool { 213 for _, name := range validKeys { 214 if strings.EqualFold(name, key) { 215 return true 216 } 217 } 218 return false 219 } 220 kv, err := parseKVArgs(inp) 221 if err != nil { 222 return nil, err 223 } 224 ikv := make(map[string]string) 225 for k, v := range kv { 226 fldName, ok := validAbbrKeys[strings.ToLower(k)] 227 if ok { 228 ikv[strings.ToLower(fldName)] = v 229 } else { 230 ikv[strings.ToLower(k)] = v 231 } 232 } 233 for k := range ikv { 234 if !validKeyFn(k, validKeys) { 235 return nil, probe.NewError(errors.New("Options should be key-value pairs in the form key=value,... where valid key(s) are " + fmtString(validAbbrKeys, validKeys))) 236 } 237 } 238 return ikv, nil 239 } 240 241 // gets the input serialization opts from cli context and constructs a map of csv, json or parquet options 242 func getInputSerializationOpts(ctx *cli.Context) map[string]map[string]string { 243 icsv := ctx.String("csv-input") 244 ijson := ctx.String("json-input") 245 m := make(map[string]map[string]string) 246 247 csvType := ctx.IsSet("csv-input") 248 jsonType := ctx.IsSet("json-input") 249 if csvType && jsonType { 250 fatalIf(errInvalidArgument(), "Only one of --csv-input or --json-input can be specified as input serialization option") 251 } 252 253 if icsv != "" { 254 kv, err := parseSerializationOpts(icsv, append(validCSVCommonKeys, validCSVInputKeys...), validCSVInputAbbrKeys) 255 fatalIf(err, "Invalid serialization option(s) specified for --csv-input flag") 256 257 m["csv"] = kv 258 } 259 if ijson != "" { 260 kv, err := parseSerializationOpts(ijson, validJSONInputKeys, nil) 261 262 fatalIf(err, "Invalid serialization option(s) specified for --json-input flag") 263 m["json"] = kv 264 } 265 266 return m 267 } 268 269 // gets the output serialization opts from cli context and constructs a map of csv or json options 270 func getOutputSerializationOpts(ctx *cli.Context, csvHdrs []string) (opts map[string]map[string]string) { 271 m := make(map[string]map[string]string) 272 273 ocsv := ctx.String("csv-output") 274 ojson := ctx.String("json-output") 275 csvType := ctx.IsSet("csv-output") 276 jsonType := ctx.IsSet("json-output") 277 278 if csvType && jsonType { 279 fatalIf(errInvalidArgument(), "Only one of --csv-output, or --json-output can be specified as output serialization option") 280 } 281 282 if jsonType && len(csvHdrs) > 0 { 283 fatalIf(errInvalidArgument(), "--csv-output-header incompatible with --json-output option") 284 } 285 286 if csvType { 287 validKeys := append(validCSVCommonKeys, validJSONCSVCommonOutputKeys...) 288 kv, err := parseSerializationOpts(ocsv, append(validKeys, validCSVOutputKeys...), validCSVOutputAbbrKeys) 289 fatalIf(err, "Invalid value(s) specified for --csv-output flag") 290 m["csv"] = kv 291 } 292 293 if jsonType || globalJSON { 294 kv, err := parseSerializationOpts(ojson, validJSONCSVCommonOutputKeys, validJSONOutputAbbrKeys) 295 fatalIf(err, "Invalid value(s) specified for --json-output flag") 296 m["json"] = kv 297 } 298 return m 299 } 300 301 // getCSVHeader fetches the first line of csv query object 302 func getCSVHeader(sourceURL string, encKeyDB map[string][]prefixSSEPair) ([]string, *probe.Error) { 303 var r io.ReadCloser 304 switch sourceURL { 305 case "-": 306 r = os.Stdin 307 default: 308 var err *probe.Error 309 var content *ClientContent 310 if r, content, err = getSourceStreamMetadataFromURL(globalContext, sourceURL, "", time.Time{}, encKeyDB, false); err != nil { 311 return nil, err.Trace(sourceURL) 312 } 313 314 ctype := content.Metadata["Content-Type"] 315 if strings.Contains(ctype, "gzip") { 316 var e error 317 r, e = gzip.NewReader(r) 318 if e != nil { 319 return nil, probe.NewError(e) 320 } 321 defer r.Close() 322 } else if strings.Contains(ctype, "bzip") { 323 defer r.Close() 324 r = io.NopCloser(bzip2.NewReader(r)) 325 } else { 326 defer r.Close() 327 } 328 } 329 br := bufio.NewReader(r) 330 line, _, e := br.ReadLine() 331 if e != nil { 332 return nil, probe.NewError(e) 333 } 334 return strings.Split(string(line), ","), nil 335 } 336 337 // returns true if query is selectign all columns of the csv object 338 func isSelectAll(query string) bool { 339 match, _ := regexp.MatchString("^\\s*?select\\s+?\\*\\s+?.*?$", query) 340 return match 341 } 342 343 // if csv-output-header is set to a comma delimited string use it, othjerwise attempt to get the header from 344 // query object 345 func getCSVOutputHeaders(ctx *cli.Context, url string, encKeyDB map[string][]prefixSSEPair, query string) (hdrs []string) { 346 if !ctx.IsSet("csv-output-header") { 347 return 348 } 349 350 hdrStr := ctx.String("csv-output-header") 351 if hdrStr == "" && isSelectAll(query) { 352 // attempt to get the first line of csv as header 353 if hdrs, err := getCSVHeader(url, encKeyDB); err == nil { 354 return hdrs 355 } 356 } 357 hdrs = strings.Split(hdrStr, ",") 358 return 359 } 360 361 // get the Select options for sql select API 362 func getSQLOpts(ctx *cli.Context, csvHdrs []string) (s SelectObjectOpts) { 363 is := getInputSerializationOpts(ctx) 364 os := getOutputSerializationOpts(ctx, csvHdrs) 365 366 return SelectObjectOpts{ 367 InputSerOpts: is, 368 OutputSerOpts: os, 369 CompressionType: minio.SelectCompressionType(ctx.String("compression")), 370 } 371 } 372 373 func isCSVOrJSON(inOpts map[string]map[string]string) bool { 374 if _, ok := inOpts["csv"]; ok { 375 return true 376 } 377 if _, ok := inOpts["json"]; ok { 378 return true 379 } 380 return false 381 } 382 383 func sqlSelect(targetURL, expression string, encKeyDB map[string][]prefixSSEPair, selOpts SelectObjectOpts, csvHdrs []string, writeHdr bool) *probe.Error { 384 ctx, cancelSelect := context.WithCancel(globalContext) 385 defer cancelSelect() 386 387 alias, _, _, err := expandAlias(targetURL) 388 if err != nil { 389 return err.Trace(targetURL) 390 } 391 392 targetClnt, err := newClient(targetURL) 393 if err != nil { 394 return err.Trace(targetURL) 395 } 396 397 sseKey := getSSE(targetURL, encKeyDB[alias]) 398 outputer, err := targetClnt.Select(ctx, expression, sseKey, selOpts) 399 if err != nil { 400 return err.Trace(targetURL, expression) 401 } 402 defer outputer.Close() 403 404 // write csv header to stdout 405 if len(csvHdrs) > 0 && writeHdr { 406 fmt.Println(strings.Join(csvHdrs, ",")) 407 } 408 _, e := io.Copy(os.Stdout, outputer) 409 return probe.NewError(e) 410 } 411 412 func validateOpts(selOpts SelectObjectOpts, url string) { 413 _, targetURL, _ := mustExpandAlias(url) 414 if strings.HasSuffix(targetURL, ".parquet") && isCSVOrJSON(selOpts.InputSerOpts) { 415 fatalIf(errInvalidArgument(), "Input serialization flags --csv-input and --json-input cannot be used for object in .parquet format") 416 } 417 } 418 419 // validate args and optionally fetch the csv header of query object 420 func getAndValidateArgs(ctx *cli.Context, encKeyDB map[string][]prefixSSEPair, url string) (query string, csvHdrs []string, selOpts SelectObjectOpts) { 421 query = ctx.String("query") 422 csvHdrs = getCSVOutputHeaders(ctx, url, encKeyDB, query) 423 selOpts = getSQLOpts(ctx, csvHdrs) 424 validateOpts(selOpts, url) 425 return 426 } 427 428 // check sql input arguments. 429 func checkSQLSyntax(ctx *cli.Context) { 430 if len(ctx.Args()) == 0 { 431 showCommandHelpAndExit(ctx, 1) // last argument is exit code. 432 } 433 } 434 435 // mainSQL is the main entry point for sql command. 436 func mainSQL(cliCtx *cli.Context) error { 437 ctx, cancelSQL := context.WithCancel(globalContext) 438 defer cancelSQL() 439 440 var ( 441 csvHdrs []string 442 selOpts SelectObjectOpts 443 query string 444 ) 445 // Parse encryption keys per command. 446 encKeyDB, err := validateAndCreateEncryptionKeys(cliCtx) 447 fatalIf(err, "Unable to parse encryption keys.") 448 449 // validate sql input arguments. 450 checkSQLSyntax(cliCtx) 451 // extract URLs. 452 URLs := cliCtx.Args() 453 writeHdr := true 454 for _, url := range URLs { 455 if _, targetContent, err := url2Stat(ctx, url2StatOptions{urlStr: url, versionID: "", fileAttr: false, encKeyDB: encKeyDB, timeRef: time.Time{}, isZip: false, ignoreBucketExistsCheck: false}); err != nil { 456 errorIf(err.Trace(url), "Unable to run sql for "+url+".") 457 continue 458 } else if !targetContent.Type.IsDir() { 459 if writeHdr { 460 query, csvHdrs, selOpts = getAndValidateArgs(cliCtx, encKeyDB, url) 461 } 462 errorIf(sqlSelect(url, query, encKeyDB, selOpts, csvHdrs, writeHdr).Trace(url), "Unable to run sql") 463 writeHdr = false 464 continue 465 } 466 targetAlias, targetURL, _ := mustExpandAlias(url) 467 clnt, err := newClientFromAlias(targetAlias, targetURL) 468 if err != nil { 469 errorIf(err.Trace(url), "Unable to initialize target `"+url+"`.") 470 continue 471 } 472 473 for content := range clnt.List(ctx, ListOptions{Recursive: cliCtx.Bool("recursive"), WithMetadata: true, ShowDir: DirNone}) { 474 if content.Err != nil { 475 errorIf(content.Err.Trace(url), "Unable to list on target `"+url+"`.") 476 continue 477 } 478 if writeHdr { 479 query, csvHdrs, selOpts = getAndValidateArgs(cliCtx, encKeyDB, targetAlias+content.URL.Path) 480 } 481 contentType := mimedb.TypeByExtension(filepath.Ext(content.URL.Path)) 482 if len(content.UserMetadata) != 0 && content.UserMetadata["content-type"] != "" { 483 contentType = content.UserMetadata["content-type"] 484 } 485 for _, cTypeSuffix := range supportedContentTypes { 486 if strings.Contains(contentType, cTypeSuffix) { 487 errorIf(sqlSelect(targetAlias+content.URL.Path, query, 488 encKeyDB, selOpts, csvHdrs, writeHdr).Trace(content.URL.String()), "Unable to run sql") 489 } 490 writeHdr = false 491 } 492 } 493 } 494 495 // Done. 496 return nil 497 }