github.com/minio/mc@v0.0.0-20240503112107-b471de8d1882/cmd/sql-main.go (about)

     1  // Copyright (c) 2015-2022 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package cmd
    19  
    20  import (
    21  	"bufio"
    22  	"compress/bzip2"
    23  	"compress/gzip"
    24  	"context"
    25  	"errors"
    26  	"fmt"
    27  	"io"
    28  	"os"
    29  	"path/filepath"
    30  	"regexp"
    31  	"strings"
    32  	"time"
    33  
    34  	"github.com/minio/cli"
    35  	"github.com/minio/mc/pkg/probe"
    36  	"github.com/minio/minio-go/v7"
    37  	"github.com/minio/pkg/v2/mimedb"
    38  )
    39  
    40  var sqlFlags = []cli.Flag{
    41  	cli.StringFlag{
    42  		Name:  "query, e",
    43  		Usage: "sql query expression",
    44  		Value: "select * from s3object",
    45  	},
    46  	cli.BoolFlag{
    47  		Name:  "recursive, r",
    48  		Usage: "sql query recursively",
    49  	},
    50  	cli.StringFlag{
    51  		Name:  "csv-input",
    52  		Usage: "csv input serialization option",
    53  	},
    54  	cli.StringFlag{
    55  		Name:  "json-input",
    56  		Usage: "json input serialization option",
    57  	},
    58  	cli.StringFlag{
    59  		Name:  "compression",
    60  		Usage: "input compression type",
    61  	},
    62  	cli.StringFlag{
    63  		Name:  "csv-output",
    64  		Usage: "csv output serialization option",
    65  	},
    66  	cli.StringFlag{
    67  		Name:  "csv-output-header",
    68  		Usage: "optional csv output header ",
    69  	},
    70  	cli.StringFlag{
    71  		Name:  "json-output",
    72  		Usage: "json output serialization option",
    73  	},
    74  }
    75  
    76  // Display contents of a file.
    77  var sqlCmd = cli.Command{
    78  	Name:         "sql",
    79  	Usage:        "run sql queries on objects",
    80  	Action:       mainSQL,
    81  	OnUsageError: onUsageError,
    82  	Before:       setGlobalsFromContext,
    83  	Flags:        append(append(sqlFlags, encCFlag), globalFlags...),
    84  	CustomHelpTemplate: `NAME:
    85    {{.HelpName}} - {{.Usage}}
    86  
    87  USAGE:
    88    {{.HelpName}} [FLAGS] TARGET [TARGET...]
    89  {{if .VisibleFlags}}
    90  FLAGS:
    91    {{range .VisibleFlags}}{{.}}
    92    {{end}}{{end}}
    93  
    94  SERIALIZATION OPTIONS:
    95    For query serialization options, refer to https://min.io/docs/minio/linux/reference/minio-mc/mc-sql.html#command-mc.sql
    96  
    97  EXAMPLES:
    98    1. Run a query on a set of objects recursively on AWS S3.
    99       {{.Prompt}} {{.HelpName}} --recursive --query "select * from S3Object" s3/personalbucket/my-large-csvs/
   100  
   101    2. Run a query on an object on MinIO.
   102       {{.Prompt}} {{.HelpName}} --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio.csv
   103  
   104    3. Run a query on an encrypted object with customer provided keys.
   105       {{.Prompt}} {{.HelpName}} --enc-c "myminio/iot-devices=MDEyMzQ1Njc4OTAxMjM0NTY3ODkwMTIzNDU2Nzg5MDA" \
   106           --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio-encrypted.csv
   107  
   108    4. Run a query on an object on MinIO in gzip format using ; as field delimiter,
   109       newline as record delimiter and file header to be used
   110       {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \
   111           --query "select count(s.power) from S3Object s" myminio/iot-devices/power-ratio.csv.gz
   112  
   113    5. Run a query on an object on MinIO in gzip format using ; as field delimiter,
   114       newline as record delimiter and file header to be used
   115       {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \
   116           --json-output "rd=\n\n" --query "select * from S3Object" myminio/iot-devices/data.csv
   117  
   118    6. Run same query as in 5., but specify csv output headers. If --csv-output-headers is
   119       specified as "", first row of csv is interpreted as header
   120       {{.Prompt}} {{.HelpName}} --compression GZIP --csv-input "rd=\n,fh=USE,fd=;" \
   121           --csv-output "rd=\n" --csv-output-header "device_id,uptime,lat,lon" \
   122           --query "select * from S3Object" myminio/iot-devices/data.csv
   123  `,
   124  }
   125  
   126  // valid CSV and JSON keys for input/output serialization
   127  var (
   128  	validCSVCommonKeys = []string{"FieldDelimiter", "QuoteChar", "QuoteEscChar"}
   129  	validCSVInputKeys  = []string{"Comments", "FileHeader", "QuotedRecordDelimiter", "RecordDelimiter"}
   130  	validCSVOutputKeys = []string{"QuoteFields"}
   131  
   132  	validJSONInputKeys           = []string{"Type"}
   133  	validJSONCSVCommonOutputKeys = []string{"RecordDelimiter"}
   134  
   135  	// mapping of abbreviation to long form name of CSV and JSON input/output serialization keys
   136  	validCSVInputAbbrKeys   = map[string]string{"cc": "Comments", "fh": "FileHeader", "qrd": "QuotedRecordDelimiter", "rd": "RecordDelimiter", "fd": "FieldDelimiter", "qc": "QuoteChar", "qec": "QuoteEscChar"}
   137  	validCSVOutputAbbrKeys  = map[string]string{"qf": "QuoteFields", "rd": "RecordDelimiter", "fd": "FieldDelimiter", "qc": "QuoteChar", "qec": "QuoteEscChar"}
   138  	validJSONOutputAbbrKeys = map[string]string{"rd": "RecordDelimiter"}
   139  )
   140  
   141  // parseKVArgs parses string of the form k=v delimited by ","
   142  // into a map of k-v pairs
   143  func parseKVArgs(is string) (map[string]string, *probe.Error) {
   144  	kvmap := make(map[string]string)
   145  	var key, value string
   146  	var s, e int  // tracking start and end of value
   147  	var index int // current index in string
   148  	if is != "" {
   149  		for index < len(is) {
   150  			i := strings.Index(is[index:], "=")
   151  			if i == -1 {
   152  				return nil, probe.NewError(errors.New("Arguments should be of the form key=value,... "))
   153  			}
   154  			key = is[index : index+i]
   155  			s = i + index + 1
   156  			e = strings.Index(is[s:], ",")
   157  			delimFound := false
   158  			for !delimFound {
   159  				if e == -1 || e+s >= len(is) {
   160  					delimFound = true
   161  					break
   162  				}
   163  				if string(is[s+e]) != "," {
   164  					delimFound = true
   165  					if string(is[s+e-1]) == "," {
   166  						e--
   167  					}
   168  				} else {
   169  					e++
   170  				}
   171  			}
   172  			vEnd := len(is)
   173  			if e != -1 {
   174  				vEnd = s + e
   175  			}
   176  
   177  			value = is[s:vEnd]
   178  			index = vEnd + 1
   179  			if _, ok := kvmap[strings.ToLower(key)]; ok {
   180  				return nil, probe.NewError(fmt.Errorf("More than one key=value found for %s", strings.TrimSpace(key)))
   181  			}
   182  			kvmap[strings.ToLower(key)] = strings.NewReplacer(`\n`, "\n", `\t`, "\t", `\r`, "\r").Replace(value)
   183  		}
   184  	}
   185  	return kvmap, nil
   186  }
   187  
   188  // returns a string with list of serialization options and abbreviation(s) if any
   189  func fmtString(validAbbr map[string]string, validKeys []string) string {
   190  	var sb strings.Builder
   191  	i := 0
   192  	for k, v := range validAbbr {
   193  		sb.WriteString(fmt.Sprintf("%s(%s) ", v, k))
   194  		i++
   195  		if i != len(validAbbr) {
   196  			sb.WriteString(",")
   197  		}
   198  	}
   199  	if len(sb.String()) == 0 {
   200  		for _, k := range validKeys {
   201  			sb.WriteString(fmt.Sprintf("%s ", k))
   202  		}
   203  	}
   204  	return sb.String()
   205  }
   206  
   207  // parses the input string and constructs a k-v map, replacing any abbreviated keys with actual keys
   208  func parseSerializationOpts(inp string, validKeys []string, validAbbrKeys map[string]string) (map[string]string, *probe.Error) {
   209  	if validAbbrKeys == nil {
   210  		validAbbrKeys = make(map[string]string)
   211  	}
   212  	validKeyFn := func(key string, validKeys []string) bool {
   213  		for _, name := range validKeys {
   214  			if strings.EqualFold(name, key) {
   215  				return true
   216  			}
   217  		}
   218  		return false
   219  	}
   220  	kv, err := parseKVArgs(inp)
   221  	if err != nil {
   222  		return nil, err
   223  	}
   224  	ikv := make(map[string]string)
   225  	for k, v := range kv {
   226  		fldName, ok := validAbbrKeys[strings.ToLower(k)]
   227  		if ok {
   228  			ikv[strings.ToLower(fldName)] = v
   229  		} else {
   230  			ikv[strings.ToLower(k)] = v
   231  		}
   232  	}
   233  	for k := range ikv {
   234  		if !validKeyFn(k, validKeys) {
   235  			return nil, probe.NewError(errors.New("Options should be key-value pairs in the form key=value,... where valid key(s) are " + fmtString(validAbbrKeys, validKeys)))
   236  		}
   237  	}
   238  	return ikv, nil
   239  }
   240  
   241  // gets the input serialization opts from cli context and constructs a map of csv, json or parquet options
   242  func getInputSerializationOpts(ctx *cli.Context) map[string]map[string]string {
   243  	icsv := ctx.String("csv-input")
   244  	ijson := ctx.String("json-input")
   245  	m := make(map[string]map[string]string)
   246  
   247  	csvType := ctx.IsSet("csv-input")
   248  	jsonType := ctx.IsSet("json-input")
   249  	if csvType && jsonType {
   250  		fatalIf(errInvalidArgument(), "Only one of --csv-input or --json-input can be specified as input serialization option")
   251  	}
   252  
   253  	if icsv != "" {
   254  		kv, err := parseSerializationOpts(icsv, append(validCSVCommonKeys, validCSVInputKeys...), validCSVInputAbbrKeys)
   255  		fatalIf(err, "Invalid serialization option(s) specified for --csv-input flag")
   256  
   257  		m["csv"] = kv
   258  	}
   259  	if ijson != "" {
   260  		kv, err := parseSerializationOpts(ijson, validJSONInputKeys, nil)
   261  
   262  		fatalIf(err, "Invalid serialization option(s) specified for --json-input flag")
   263  		m["json"] = kv
   264  	}
   265  
   266  	return m
   267  }
   268  
   269  // gets the output serialization opts from cli context and constructs a map of csv or json options
   270  func getOutputSerializationOpts(ctx *cli.Context, csvHdrs []string) (opts map[string]map[string]string) {
   271  	m := make(map[string]map[string]string)
   272  
   273  	ocsv := ctx.String("csv-output")
   274  	ojson := ctx.String("json-output")
   275  	csvType := ctx.IsSet("csv-output")
   276  	jsonType := ctx.IsSet("json-output")
   277  
   278  	if csvType && jsonType {
   279  		fatalIf(errInvalidArgument(), "Only one of --csv-output, or --json-output can be specified as output serialization option")
   280  	}
   281  
   282  	if jsonType && len(csvHdrs) > 0 {
   283  		fatalIf(errInvalidArgument(), "--csv-output-header incompatible with --json-output option")
   284  	}
   285  
   286  	if csvType {
   287  		validKeys := append(validCSVCommonKeys, validJSONCSVCommonOutputKeys...)
   288  		kv, err := parseSerializationOpts(ocsv, append(validKeys, validCSVOutputKeys...), validCSVOutputAbbrKeys)
   289  		fatalIf(err, "Invalid value(s) specified for --csv-output flag")
   290  		m["csv"] = kv
   291  	}
   292  
   293  	if jsonType || globalJSON {
   294  		kv, err := parseSerializationOpts(ojson, validJSONCSVCommonOutputKeys, validJSONOutputAbbrKeys)
   295  		fatalIf(err, "Invalid value(s) specified for --json-output flag")
   296  		m["json"] = kv
   297  	}
   298  	return m
   299  }
   300  
   301  // getCSVHeader fetches the first line of csv query object
   302  func getCSVHeader(sourceURL string, encKeyDB map[string][]prefixSSEPair) ([]string, *probe.Error) {
   303  	var r io.ReadCloser
   304  	switch sourceURL {
   305  	case "-":
   306  		r = os.Stdin
   307  	default:
   308  		var err *probe.Error
   309  		var content *ClientContent
   310  		if r, content, err = getSourceStreamMetadataFromURL(globalContext, sourceURL, "", time.Time{}, encKeyDB, false); err != nil {
   311  			return nil, err.Trace(sourceURL)
   312  		}
   313  
   314  		ctype := content.Metadata["Content-Type"]
   315  		if strings.Contains(ctype, "gzip") {
   316  			var e error
   317  			r, e = gzip.NewReader(r)
   318  			if e != nil {
   319  				return nil, probe.NewError(e)
   320  			}
   321  			defer r.Close()
   322  		} else if strings.Contains(ctype, "bzip") {
   323  			defer r.Close()
   324  			r = io.NopCloser(bzip2.NewReader(r))
   325  		} else {
   326  			defer r.Close()
   327  		}
   328  	}
   329  	br := bufio.NewReader(r)
   330  	line, _, e := br.ReadLine()
   331  	if e != nil {
   332  		return nil, probe.NewError(e)
   333  	}
   334  	return strings.Split(string(line), ","), nil
   335  }
   336  
   337  // returns true if query is selectign all columns of the csv object
   338  func isSelectAll(query string) bool {
   339  	match, _ := regexp.MatchString("^\\s*?select\\s+?\\*\\s+?.*?$", query)
   340  	return match
   341  }
   342  
   343  // if csv-output-header is set to a comma delimited string use it, othjerwise attempt to get the header from
   344  // query object
   345  func getCSVOutputHeaders(ctx *cli.Context, url string, encKeyDB map[string][]prefixSSEPair, query string) (hdrs []string) {
   346  	if !ctx.IsSet("csv-output-header") {
   347  		return
   348  	}
   349  
   350  	hdrStr := ctx.String("csv-output-header")
   351  	if hdrStr == "" && isSelectAll(query) {
   352  		// attempt to get the first line of csv as header
   353  		if hdrs, err := getCSVHeader(url, encKeyDB); err == nil {
   354  			return hdrs
   355  		}
   356  	}
   357  	hdrs = strings.Split(hdrStr, ",")
   358  	return
   359  }
   360  
   361  // get the Select options for sql select API
   362  func getSQLOpts(ctx *cli.Context, csvHdrs []string) (s SelectObjectOpts) {
   363  	is := getInputSerializationOpts(ctx)
   364  	os := getOutputSerializationOpts(ctx, csvHdrs)
   365  
   366  	return SelectObjectOpts{
   367  		InputSerOpts:    is,
   368  		OutputSerOpts:   os,
   369  		CompressionType: minio.SelectCompressionType(ctx.String("compression")),
   370  	}
   371  }
   372  
   373  func isCSVOrJSON(inOpts map[string]map[string]string) bool {
   374  	if _, ok := inOpts["csv"]; ok {
   375  		return true
   376  	}
   377  	if _, ok := inOpts["json"]; ok {
   378  		return true
   379  	}
   380  	return false
   381  }
   382  
   383  func sqlSelect(targetURL, expression string, encKeyDB map[string][]prefixSSEPair, selOpts SelectObjectOpts, csvHdrs []string, writeHdr bool) *probe.Error {
   384  	ctx, cancelSelect := context.WithCancel(globalContext)
   385  	defer cancelSelect()
   386  
   387  	alias, _, _, err := expandAlias(targetURL)
   388  	if err != nil {
   389  		return err.Trace(targetURL)
   390  	}
   391  
   392  	targetClnt, err := newClient(targetURL)
   393  	if err != nil {
   394  		return err.Trace(targetURL)
   395  	}
   396  
   397  	sseKey := getSSE(targetURL, encKeyDB[alias])
   398  	outputer, err := targetClnt.Select(ctx, expression, sseKey, selOpts)
   399  	if err != nil {
   400  		return err.Trace(targetURL, expression)
   401  	}
   402  	defer outputer.Close()
   403  
   404  	// write csv header to stdout
   405  	if len(csvHdrs) > 0 && writeHdr {
   406  		fmt.Println(strings.Join(csvHdrs, ","))
   407  	}
   408  	_, e := io.Copy(os.Stdout, outputer)
   409  	return probe.NewError(e)
   410  }
   411  
   412  func validateOpts(selOpts SelectObjectOpts, url string) {
   413  	_, targetURL, _ := mustExpandAlias(url)
   414  	if strings.HasSuffix(targetURL, ".parquet") && isCSVOrJSON(selOpts.InputSerOpts) {
   415  		fatalIf(errInvalidArgument(), "Input serialization flags --csv-input and --json-input cannot be used for object in .parquet format")
   416  	}
   417  }
   418  
   419  // validate args and optionally fetch the csv header of query object
   420  func getAndValidateArgs(ctx *cli.Context, encKeyDB map[string][]prefixSSEPair, url string) (query string, csvHdrs []string, selOpts SelectObjectOpts) {
   421  	query = ctx.String("query")
   422  	csvHdrs = getCSVOutputHeaders(ctx, url, encKeyDB, query)
   423  	selOpts = getSQLOpts(ctx, csvHdrs)
   424  	validateOpts(selOpts, url)
   425  	return
   426  }
   427  
   428  // check sql input arguments.
   429  func checkSQLSyntax(ctx *cli.Context) {
   430  	if len(ctx.Args()) == 0 {
   431  		showCommandHelpAndExit(ctx, 1) // last argument is exit code.
   432  	}
   433  }
   434  
   435  // mainSQL is the main entry point for sql command.
   436  func mainSQL(cliCtx *cli.Context) error {
   437  	ctx, cancelSQL := context.WithCancel(globalContext)
   438  	defer cancelSQL()
   439  
   440  	var (
   441  		csvHdrs []string
   442  		selOpts SelectObjectOpts
   443  		query   string
   444  	)
   445  	// Parse encryption keys per command.
   446  	encKeyDB, err := validateAndCreateEncryptionKeys(cliCtx)
   447  	fatalIf(err, "Unable to parse encryption keys.")
   448  
   449  	// validate sql input arguments.
   450  	checkSQLSyntax(cliCtx)
   451  	// extract URLs.
   452  	URLs := cliCtx.Args()
   453  	writeHdr := true
   454  	for _, url := range URLs {
   455  		if _, targetContent, err := url2Stat(ctx, url2StatOptions{urlStr: url, versionID: "", fileAttr: false, encKeyDB: encKeyDB, timeRef: time.Time{}, isZip: false, ignoreBucketExistsCheck: false}); err != nil {
   456  			errorIf(err.Trace(url), "Unable to run sql for "+url+".")
   457  			continue
   458  		} else if !targetContent.Type.IsDir() {
   459  			if writeHdr {
   460  				query, csvHdrs, selOpts = getAndValidateArgs(cliCtx, encKeyDB, url)
   461  			}
   462  			errorIf(sqlSelect(url, query, encKeyDB, selOpts, csvHdrs, writeHdr).Trace(url), "Unable to run sql")
   463  			writeHdr = false
   464  			continue
   465  		}
   466  		targetAlias, targetURL, _ := mustExpandAlias(url)
   467  		clnt, err := newClientFromAlias(targetAlias, targetURL)
   468  		if err != nil {
   469  			errorIf(err.Trace(url), "Unable to initialize target `"+url+"`.")
   470  			continue
   471  		}
   472  
   473  		for content := range clnt.List(ctx, ListOptions{Recursive: cliCtx.Bool("recursive"), WithMetadata: true, ShowDir: DirNone}) {
   474  			if content.Err != nil {
   475  				errorIf(content.Err.Trace(url), "Unable to list on target `"+url+"`.")
   476  				continue
   477  			}
   478  			if writeHdr {
   479  				query, csvHdrs, selOpts = getAndValidateArgs(cliCtx, encKeyDB, targetAlias+content.URL.Path)
   480  			}
   481  			contentType := mimedb.TypeByExtension(filepath.Ext(content.URL.Path))
   482  			if len(content.UserMetadata) != 0 && content.UserMetadata["content-type"] != "" {
   483  				contentType = content.UserMetadata["content-type"]
   484  			}
   485  			for _, cTypeSuffix := range supportedContentTypes {
   486  				if strings.Contains(contentType, cTypeSuffix) {
   487  					errorIf(sqlSelect(targetAlias+content.URL.Path, query,
   488  						encKeyDB, selOpts, csvHdrs, writeHdr).Trace(content.URL.String()), "Unable to run sql")
   489  				}
   490  				writeHdr = false
   491  			}
   492  		}
   493  	}
   494  
   495  	// Done.
   496  	return nil
   497  }