storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/mint/run/core/s3select/sql_ops.py (about)

     1  #!/usr/bin/env python
     2  # -*- coding: utf-8 -*-
     3  # MinIO Python Library for Amazon S3 Compatible Cloud Storage,
     4  # (C) 2020 MinIO, Inc.
     5  #
     6  # Licensed under the Apache License, Version 2.0 (the "License");
     7  # you may not use this file except in compliance with the License.
     8  # You may obtain a copy of the License at
     9  #
    10  #     http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  
    18  import io
    19  from datetime import datetime
    20  
    21  from minio.select import (FILE_HEADER_INFO_NONE, JSON_TYPE_DOCUMENT,
    22                            QUOTE_FIELDS_ASNEEDED, CSVInputSerialization,
    23                            CSVOutputSerialization, JSONInputSerialization,
    24                            JSONOutputSerialization, SelectRequest)
    25  
    26  from utils import generate_bucket_name, generate_object_name
    27  
    28  
    29  def test_sql_expressions_custom_input_output(client, input_bytes, sql_input,
    30                                               sql_output, tests, log_output):
    31      bucket_name = generate_bucket_name()
    32      object_name = generate_object_name()
    33  
    34      log_output.args['total_tests'] = 0
    35      log_output.args['total_success'] = 0
    36  
    37      client.make_bucket(bucket_name)
    38      try:
    39          content = io.BytesIO(bytes(input_bytes, 'utf-8'))
    40          client.put_object(bucket_name, object_name, content, len(input_bytes))
    41  
    42          for idx, (test_name, select_expression, expected_output) in enumerate(tests):
    43              if select_expression == '':
    44                  continue
    45              try:
    46                  log_output.args['total_tests'] += 1
    47                  sreq = SelectRequest(
    48                      select_expression,
    49                      sql_input,
    50                      sql_output,
    51                      request_progress=False
    52                  )
    53  
    54                  data = client.select_object_content(
    55                      bucket_name, object_name, sreq)
    56  
    57                  # Get the records
    58                  records = io.BytesIO()
    59                  for d in data.stream(10*1024):
    60                      records.write(d)
    61                  got_output = records.getvalue()
    62  
    63                  if got_output != expected_output:
    64                      if type(expected_output) == datetime:
    65                          # Attempt to parse the date which will throw an exception for any issue
    66                          datetime.strptime(got_output.decode(
    67                              "utf-8").strip(), '%Y-%m-%dT%H:%M:%S.%f%z')
    68                      else:
    69                          raise ValueError('Test {}: data mismatch. Expected : {}. Received: {}.'.format(
    70                              idx+1, expected_output, got_output))
    71  
    72                  log_output.args['total_success'] += 1
    73              except Exception as err:
    74                  continue  # TODO, raise instead
    75                  # raise Exception(err)
    76      finally:
    77          client.remove_object(bucket_name, object_name)
    78          client.remove_bucket(bucket_name)
    79  
    80  
    81  def test_sql_expressions(client, input_json_bytes, tests, log_output):
    82      input_serialization = JSONInputSerialization(
    83          compression_type="NONE",
    84          json_type=JSON_TYPE_DOCUMENT,
    85      )
    86  
    87      output_serialization = CSVOutputSerialization(
    88          quote_fields=QUOTE_FIELDS_ASNEEDED)
    89  
    90      test_sql_expressions_custom_input_output(client, input_json_bytes,
    91                                               input_serialization, output_serialization, tests, log_output)
    92  
    93  
    94  def test_sql_operators(client, log_output):
    95  
    96      json_testfile = """{"id": 1, "name": "John", "age": 3}
    97  {"id": 2, "name": "Elliot", "age": 4}
    98  {"id": 3, "name": "Yves", "age": 5}
    99  {"id": 4, "name": null, "age": 0}
   100  """
   101  
   102      tests = [
   103          # Logical operators
   104          ("AND", "select * from S3Object s where s.id = 1 AND s.name = 'John'", b'1,John,3\n'),
   105          ("NOT", "select * from S3Object s where NOT s.id = 1",
   106              b'2,Elliot,4\n3,Yves,5\n4,,0\n'),
   107          ("OR", "select * from S3Object s where s.id = 1 OR s.id = 3",
   108              b'1,John,3\n3,Yves,5\n'),
   109          # Comparison Operators
   110          ("<", "select * from S3Object s where s.age < 4", b'1,John,3\n4,,0\n'),
   111          (">", "select * from S3Object s where s.age > 4", b'3,Yves,5\n'),
   112          ("<=", "select * from S3Object s where s.age <= 4",
   113              b'1,John,3\n2,Elliot,4\n4,,0\n'),
   114          (">=", "select * from S3Object s where s.age >= 4", b'2,Elliot,4\n3,Yves,5\n'),
   115          ("=", "select * from S3Object s where s.age = 4", b'2,Elliot,4\n'),
   116          ("<>", "select * from S3Object s where s.age <> 4",
   117              b'1,John,3\n3,Yves,5\n4,,0\n'),
   118          ("!=", "select * from S3Object s where s.age != 4",
   119              b'1,John,3\n3,Yves,5\n4,,0\n'),
   120          ("BETWEEN", "select * from S3Object s where s.age BETWEEN 4 AND 5",
   121              b'2,Elliot,4\n3,Yves,5\n'),
   122          ("IN", "select * from S3Object s where s.age IN (3,5)", b'1,John,3\n3,Yves,5\n'),
   123          # Pattern Matching Operators
   124          ("LIKE_", "select * from S3Object s where s.name LIKE '_ves'", b'3,Yves,5\n'),
   125          ("LIKE%", "select * from S3Object s where s.name LIKE 'Ell%t'", b'2,Elliot,4\n'),
   126          # Unitary Operators
   127          ("NULL", "select * from S3Object s where s.name IS NULL", b'4,,0\n'),
   128          ("NOT_NULL", "select * from S3Object s where s.age IS NOT NULL",
   129              b'1,John,3\n2,Elliot,4\n3,Yves,5\n4,,0\n'),
   130          # Math Operators
   131          ("+", "select * from S3Object s where s.age = 1+3 ", b'2,Elliot,4\n'),
   132          ("-", "select * from S3Object s where s.age = 5-1 ", b'2,Elliot,4\n'),
   133          ("*", "select * from S3Object s where s.age = 2*2 ", b'2,Elliot,4\n'),
   134          ("%", "select * from S3Object s where s.age = 10%6 ", b'2,Elliot,4\n'),
   135      ]
   136  
   137      try:
   138          test_sql_expressions(client, json_testfile, tests, log_output)
   139      except Exception as select_err:
   140          raise select_err
   141          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   142          # pass
   143  
   144      # Test passes
   145      print(log_output.json_report())
   146  
   147  
   148  def test_sql_operators_precedence(client, log_output):
   149  
   150      json_testfile = """{"id": 1, "name": "Eric"}"""
   151  
   152      tests = [
   153          ("-_1", "select -3*3 from S3Object", b'-9\n'),
   154          ("*", "select 10-3*2 from S3Object", b'4\n'),
   155          ("/", "select 13-10/5 from S3Object", b'11\n'),
   156          ("%", "select 13-10%5 from S3Object", b'13\n'),
   157          ("+", "select 1+1*3 from S3Object", b'4\n'),
   158          ("-_2", "select 1-1*3 from S3Object", b'-2\n'),
   159          ("=", "select * from S3Object as s where s.id = 13-12", b'1,Eric\n'),
   160          ("<>", "select * from S3Object as s where s.id <> 1-1", b'1,Eric\n'),
   161          ("NOT", "select * from S3Object where false OR NOT false", b'1,Eric\n'),
   162          ("AND", "select * from S3Object where true AND true OR false ", b'1,Eric\n'),
   163          ("OR", "select * from S3Object where false OR NOT false", b'1,Eric\n'),
   164          ("IN", "select * from S3Object as s where s.id <> -1 AND s.id IN (1,2,3)", b'1,Eric\n'),
   165          ("BETWEEN", "select * from S3Object as s where s.id <> -1 AND s.id BETWEEN -1 AND 3", b'1,Eric\n'),
   166          ("LIKE", "select * from S3Object as s where s.id <> -1 AND s.name LIKE 'E%'", b'1,Eric\n'),
   167      ]
   168  
   169      try:
   170          test_sql_expressions(client, json_testfile, tests, log_output)
   171      except Exception as select_err:
   172          raise select_err
   173          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   174          # pass
   175  
   176      # Test passes
   177      print(log_output.json_report())
   178  
   179  
   180  def test_sql_functions_agg_cond_conv(client, log_output):
   181  
   182      json_testfile = """{"id": 1, "name": "John", "age": 3}
   183  {"id": 2, "name": "Elliot", "age": 4}
   184  {"id": 3, "name": "Yves", "age": 5}
   185  {"id": 4, "name": "Christine", "age": null}
   186  {"id": 5, "name": "Eric", "age": 0}
   187  """
   188      tests = [
   189          # Aggregate functions
   190          ("COUNT", "select count(*) from S3Object s", b'5\n'),
   191          ("AVG", "select avg(s.age) from S3Object s", b'3\n'),
   192          ("MAX", "select max(s.age) from S3Object s", b'5\n'),
   193          ("MIN", "select min(s.age) from S3Object s", b'0\n'),
   194          ("SUM", "select sum(s.age) from S3Object s", b'12\n'),
   195          # Conditional functions
   196          ("COALESCE", "SELECT COALESCE(s.age, 99) FROM S3Object s", b'3\n4\n5\n99\n0\n'),
   197          ("NULLIF", "SELECT NULLIF(s.age, 0) FROM S3Object s", b'3\n4\n5\n\n\n'),
   198          # Conversion functions
   199          ("CAST", "SELECT CAST(s.age AS FLOAT) FROM S3Object s",
   200              b'3.0\n4.0\n5.0\n\n0.0\n'),
   201  
   202      ]
   203  
   204      try:
   205          test_sql_expressions(client, json_testfile, tests, log_output)
   206      except Exception as select_err:
   207          raise select_err
   208          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   209          # pass
   210  
   211      # Test passes
   212      print(log_output.json_report())
   213  
   214  
   215  def test_sql_functions_date(client, log_output):
   216  
   217      json_testfile = """
   218  {"id": 1, "name": "John", "datez": "2017-01-02T03:04:05.006+07:30"}
   219  """
   220  
   221      tests = [
   222          # DATE_ADD
   223          ("DATE_ADD_1", "select DATE_ADD(year, 5, TO_TIMESTAMP(s.datez)) from S3Object as s",
   224           b'2022-01-02T03:04:05.006+07:30\n'),
   225          ("DATE_ADD_2", "select DATE_ADD(month, 1, TO_TIMESTAMP(s.datez)) from S3Object as s",
   226              b'2017-02-02T03:04:05.006+07:30\n'),
   227          ("DATE_ADD_3", "select DATE_ADD(day, -1, TO_TIMESTAMP(s.datez)) from S3Object as s",
   228              b'2017-01-01T03:04:05.006+07:30\n'),
   229          ("DATE_ADD_4", "select DATE_ADD(hour, 1, TO_TIMESTAMP(s.datez)) from S3Object as s",
   230              b'2017-01-02T04:04:05.006+07:30\n'),
   231          ("DATE_ADD_5", "select DATE_ADD(minute, 5, TO_TIMESTAMP(s.datez)) from S3Object as s",
   232              b'2017-01-02T03:09:05.006+07:30\n'),
   233          ("DATE_ADD_6", "select DATE_ADD(second, 5, TO_TIMESTAMP(s.datez)) from S3Object as s",
   234              b'2017-01-02T03:04:10.006+07:30\n'),
   235          # DATE_DIFF
   236          ("DATE_DIFF_1", "select DATE_DIFF(year, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2011-01-01T')) from S3Object as s", b'-6\n'),
   237          ("DATE_DIFF_2", "select DATE_DIFF(month, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2011T')) from S3Object as s", b'-72\n'),
   238          ("DATE_DIFF_3", "select DATE_DIFF(day, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2010-01-02T')) from S3Object as s", b'-2556\n'),
   239          # EXTRACT
   240          ("EXTRACT_1", "select EXTRACT(year FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017\n'),
   241          ("EXTRACT_2", "select EXTRACT(month FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'1\n'),
   242          ("EXTRACT_3", "select EXTRACT(hour FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'3\n'),
   243          ("EXTRACT_4", "select EXTRACT(minute FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'4\n'),
   244          ("EXTRACT_5", "select EXTRACT(timezone_hour FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'7\n'),
   245          ("EXTRACT_6", "select EXTRACT(timezone_minute FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'30\n'),
   246          # TO_STRING
   247          ("TO_STRING_1", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMMM d, y') from S3Object as s",
   248              b'"January 2, 2017"\n'),
   249          ("TO_STRING_2", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMM d, yyyy') from S3Object as s", b'"Jan 2, 2017"\n'),
   250          ("TO_STRING_3", "select TO_STRING(TO_TIMESTAMP(s.datez), 'M-d-yy') from S3Object as s", b'1-2-17\n'),
   251          ("TO_STRING_4", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MM-d-y') from S3Object as s", b'01-2-2017\n'),
   252          ("TO_STRING_5", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMMM d, y h:m a') from S3Object as s",
   253              b'"January 2, 2017 3:4 AM"\n'),
   254          ("TO_STRING_6", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssX') from S3Object as s",
   255              b'2017-01-02T3:4:05+0730\n'),
   256          ("TO_STRING_7", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssX') from S3Object as s",
   257              b'2017-01-02T3:4:05+0730\n'),
   258          ("TO_STRING_8", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssXXXX') from S3Object as s",
   259              b'2017-01-02T3:4:05+0730\n'),
   260          ("TO_STRING_9", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssXXXXX') from S3Object as s",
   261              b'2017-01-02T3:4:05+07:30\n'),
   262          ("TO_TIMESTAMP", "select TO_TIMESTAMP(s.datez) from S3Object as s",
   263              b'2017-01-02T03:04:05.006+07:30\n'),
   264          ("UTCNOW", "select UTCNOW() from S3Object", datetime(1, 1, 1)),
   265  
   266      ]
   267  
   268      try:
   269          test_sql_expressions(client, json_testfile, tests, log_output)
   270      except Exception as select_err:
   271          raise select_err
   272          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   273          # pass
   274  
   275      # Test passes
   276      print(log_output.json_report())
   277  
   278  
   279  def test_sql_functions_string(client, log_output):
   280  
   281      json_testfile = """
   282  {"id": 1, "name": "John"}
   283  {"id": 2, "name": "       \tfoobar\t         "}
   284  {"id": 3, "name": "1112211foobar22211122"}
   285  """
   286  
   287      tests = [
   288          # CHAR_LENGTH
   289          ("CHAR_LENGTH", "select CHAR_LENGTH(s.name) from S3Object as s", b'4\n24\n21\n'),
   290          ("CHARACTER_LENGTH",
   291              "select CHARACTER_LENGTH(s.name) from S3Object as s", b'4\n24\n21\n'),
   292          # LOWER
   293          ("LOWER", "select LOWER(s.name) from S3Object as s where s.id= 1", b'john\n'),
   294          # SUBSTRING
   295          ("SUBSTRING_1", "select SUBSTRING(s.name FROM 2) from S3Object as s where s.id = 1", b'ohn\n'),
   296          ("SUBSTRING_2", "select SUBSTRING(s.name FROM 2 FOR 2) from S3Object as s where s.id = 1", b'oh\n'),
   297          ("SUBSTRING_3", "select SUBSTRING(s.name FROM -1 FOR 2) from S3Object as s where s.id = 1", b'\n'),
   298          # TRIM
   299          ("TRIM_1", "select TRIM(s.name) from S3Object as s where s.id = 2", b'\tfoobar\t\n'),
   300          ("TRIM_2", "select TRIM(LEADING FROM s.name) from S3Object as s where s.id = 2",
   301              b'\tfoobar\t         \n'),
   302          ("TRIM_3", "select TRIM(TRAILING FROM s.name) from S3Object as s where s.id = 2",
   303              b'       \tfoobar\t\n'),
   304          ("TRIM_4", "select TRIM(BOTH FROM s.name) from S3Object as s where s.id = 2", b'\tfoobar\t\n'),
   305          ("TRIM_5", "select TRIM(BOTH '12' FROM s.name) from S3Object as s where s.id = 3", b'foobar\n'),
   306          # UPPER
   307          ("UPPER", "select UPPER(s.name) from S3Object as s where s.id= 1", b'JOHN\n'),
   308      ]
   309  
   310      try:
   311          test_sql_expressions(client, json_testfile, tests, log_output)
   312      except Exception as select_err:
   313          raise select_err
   314          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   315          # pass
   316  
   317      # Test passes
   318      print(log_output.json_report())
   319  
   320  
   321  def test_sql_datatypes(client, log_output):
   322      json_testfile = """
   323  {"name": "John"}
   324  """
   325      tests = [
   326          ("bool", "select CAST('true' AS BOOL) from S3Object", b'true\n'),
   327          ("int", "select CAST('13' AS INT) from S3Object", b'13\n'),
   328          ("integer", "select CAST('13' AS INTEGER) from S3Object", b'13\n'),
   329          ("string", "select CAST(true AS STRING) from S3Object", b'true\n'),
   330          ("float", "select CAST('13.3' AS FLOAT) from S3Object", b'13.3\n'),
   331          ("decimal", "select CAST('14.3' AS FLOAT) from S3Object", b'14.3\n'),
   332          ("numeric", "select CAST('14.3' AS FLOAT) from S3Object", b'14.3\n'),
   333          ("timestamp", "select CAST('2007-04-05T14:30Z' AS TIMESTAMP) from S3Object",
   334              b'2007-04-05T14:30Z\n'),
   335      ]
   336  
   337      try:
   338          test_sql_expressions(client, json_testfile, tests, log_output)
   339      except Exception as select_err:
   340          raise select_err
   341          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   342          # pass
   343  
   344      # Test passes
   345      print(log_output.json_report())
   346  
   347  
   348  def test_sql_select(client, log_output):
   349  
   350      json_testfile = """{"id": 1, "created": "June 27", "modified": "July 6" }
   351  {"id": 2, "Created": "June 28", "Modified": "July 7", "Cast": "Random Date" }"""
   352      tests = [
   353          ("select_1", "select * from S3Object",
   354           b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'),
   355          ("select_2", "select * from S3Object s",
   356              b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'),
   357          ("select_3", "select * from S3Object as s",
   358              b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'),
   359          ("select_4", "select s.line from S3Object as s", b'\n\n'),
   360          ("select_5", 'select s."Created" from S3Object as s', b'\nJune 28\n'),
   361          ("select_5", 'select s."Cast" from S3Object as s', b'\nRandom Date\n'),
   362          ("where", 'select s.created from S3Object as s', b'June 27\nJune 28\n'),
   363          ("limit", 'select * from S3Object as s LIMIT 1', b'1,June 27,July 6\n'),
   364      ]
   365  
   366      try:
   367          test_sql_expressions(client, json_testfile, tests, log_output)
   368      except Exception as select_err:
   369          raise select_err
   370          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   371          # pass
   372  
   373      # Test passes
   374      print(log_output.json_report())
   375  
   376  
   377  def test_sql_select_json(client, log_output):
   378      json_testcontent = """{ "Rules": [ {"id": "1"}, {"expr": "y > x"}, {"id": "2", "expr": "z = DEBUG"} ]}
   379  { "created": "June 27", "modified": "July 6" }
   380  """
   381      tests = [
   382          ("select_1", "SELECT id FROM S3Object[*].Rules[*].id",
   383           b'{"id":"1"}\n{}\n{"id":"2"}\n{}\n'),
   384          ("select_2",
   385              "SELECT id FROM S3Object[*].Rules[*].id WHERE id IS NOT MISSING", b'{"id":"1"}\n{"id":"2"}\n'),
   386          ("select_3", "SELECT d.created, d.modified FROM S3Object[*] d",
   387              b'{}\n{"created":"June 27","modified":"July 6"}\n'),
   388          ("select_4", "SELECT _1.created, _1.modified FROM S3Object[*]",
   389              b'{}\n{"created":"June 27","modified":"July 6"}\n'),
   390          ("select_5",
   391              "Select s.rules[1].expr from S3Object s", b'{"expr":"y > x"}\n{}\n'),
   392      ]
   393  
   394      input_serialization = JSONInputSerialization(json_type=JSON_TYPE_DOCUMENT)
   395      output_serialization = JSONOutputSerialization()
   396      try:
   397          test_sql_expressions_custom_input_output(client, json_testcontent,
   398                                                   input_serialization, output_serialization, tests, log_output)
   399      except Exception as select_err:
   400          raise select_err
   401          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   402          # pass
   403  
   404      # Test passes
   405      print(log_output.json_report())
   406  
   407  
   408  def test_sql_select_csv_no_header(client, log_output):
   409      json_testcontent = """val1,val2,val3
   410  val4,val5,val6
   411  """
   412      tests = [
   413          ("select_1", "SELECT s._2 FROM S3Object as s", b'val2\nval5\n'),
   414      ]
   415  
   416      input_serialization = CSVInputSerialization(
   417          file_header_info=FILE_HEADER_INFO_NONE,
   418          allow_quoted_record_delimiter="FALSE",
   419      )
   420      output_serialization = CSVOutputSerialization()
   421      try:
   422          test_sql_expressions_custom_input_output(client, json_testcontent,
   423                                                   input_serialization, output_serialization, tests, log_output)
   424      except Exception as select_err:
   425          raise select_err
   426          # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err))
   427          # pass
   428  
   429      # Test passes
   430      print(log_output.json_report())