storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/mint/run/core/s3select/csv.py (about)

     1  #!/usr/bin/env python
     2  # -*- coding: utf-8 -*-
     3  # MinIO Python Library for Amazon S3 Compatible Cloud Storage,
     4  # (C) 2015-2020 MinIO, Inc.
     5  #
     6  # Licensed under the Apache License, Version 2.0 (the "License");
     7  # you may not use this file except in compliance with the License.
     8  # You may obtain a copy of the License at
     9  #
    10  #     http://www.apache.org/licenses/LICENSE-2.0
    11  #
    12  # Unless required by applicable law or agreed to in writing, software
    13  # distributed under the License is distributed on an "AS IS" BASIS,
    14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    15  # See the License for the specific language governing permissions and
    16  # limitations under the License.
    17  
    18  import io
    19  import os
    20  
    21  from minio import Minio
    22  from minio.select import (COMPRESSION_TYPE_NONE, FILE_HEADER_INFO_NONE,
    23                            JSON_TYPE_DOCUMENT, QUOTE_FIELDS_ALWAYS,
    24                            QUOTE_FIELDS_ASNEEDED, CSVInputSerialization,
    25                            CSVOutputSerialization, JSONInputSerialization,
    26                            JSONOutputSerialization, SelectRequest)
    27  
    28  from utils import *
    29  
    30  
    31  def test_sql_api(test_name, client, bucket_name, input_data, sql_opts, expected_output):
    32      """ Test if the passed SQL request has the output equal to the passed execpted one"""
    33      object_name = generate_object_name()
    34      got_output = b''
    35      try:
    36          bytes_content = io.BytesIO(input_data)
    37          client.put_object(bucket_name, object_name,
    38                            io.BytesIO(input_data), len(input_data))
    39          data = client.select_object_content(bucket_name, object_name, sql_opts)
    40          # Get the records
    41          records = io.BytesIO()
    42          for d in data.stream(10*1024):
    43              records.write(d)
    44              got_output = records.getvalue()
    45      except Exception as select_err:
    46          if not isinstance(expected_output, Exception):
    47              raise ValueError(
    48                  'Test {} unexpectedly failed with: {}'.format(test_name, select_err))
    49      else:
    50          if isinstance(expected_output, Exception):
    51              raise ValueError(
    52                  'Test {}: expected an exception, got {}'.format(test_name, got_output))
    53          if got_output != expected_output:
    54              raise ValueError('Test {}: data mismatch. Expected : {}, Received {}'.format(
    55                  test_name, expected_output, got_output))
    56      finally:
    57          client.remove_object(bucket_name, object_name)
    58  
    59  
    60  def test_csv_input_custom_quote_char(client, log_output):
    61      # Get a unique bucket_name and object_name
    62      log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
    63  
    64      tests = [
    65          # Invalid quote character, should fail
    66          ('""', '"', b'col1,col2,col3\n', Exception()),
    67          # UTF-8 quote character
    68          ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
    69           b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
    70          # Only one field is quoted
    71          ('"', '"', b'"col1",col2,col3\n',
    72           b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
    73          ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
    74          ('\'', '"', b'"col1",col2,col3\n',
    75           b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
    76          ('', '"', b'"col1",col2,col3\n',
    77           b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
    78          ('', '"', b'"col1",col2,col3\n',
    79           b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
    80          ('', '"', b'"col1","col2","col3"\n',
    81           b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
    82          ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
    83          ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
    84          ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
    85          ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
    86          ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
    87          ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
    88          ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
    89          ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
    90          ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
    91          ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
    92          ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
    93      ]
    94  
    95      client.make_bucket(bucket_name)
    96  
    97      try:
    98          for idx, (quote_char, escape_char, data, expected_output) in enumerate(tests):
    99              sql_opts = SelectRequest(
   100                  "select * from s3object",
   101                  CSVInputSerialization(
   102                      compression_type=COMPRESSION_TYPE_NONE,
   103                      file_header_info=FILE_HEADER_INFO_NONE,
   104                      record_delimiter="\n",
   105                      field_delimiter=",",
   106                      quote_character=quote_char,
   107                      quote_escape_character=escape_char,
   108                      comments="#",
   109                      allow_quoted_record_delimiter="FALSE",
   110                  ),
   111                  JSONOutputSerialization(
   112                      record_delimiter="\n",
   113                  ),
   114                  request_progress=False,
   115              )
   116  
   117              test_sql_api(f'test_{idx}', client, bucket_name,
   118                           data, sql_opts, expected_output)
   119      finally:
   120          client.remove_bucket(bucket_name)
   121  
   122      # Test passes
   123      print(log_output.json_report())
   124  
   125  
   126  def test_csv_output_custom_quote_char(client, log_output):
   127      # Get a unique bucket_name and object_name
   128      log_output.args['bucket_name'] = bucket_name = generate_bucket_name()
   129  
   130      tests = [
   131          # UTF-8 quote character
   132          ("''", "''", b'col1,col2,col3\n', Exception()),
   133          ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
   134          ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
   135          ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
   136          ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
   137          ('"', '"', b'""""\n', b'""""\n'),
   138          ('"', '"', b'\n', b''),
   139          ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
   140          ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
   141          ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
   142          ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"),
   143          ("'", "\\", b'col\'\n', b"'col\\''\n"),
   144          # Two consecutive escaped quotes
   145          ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
   146      ]
   147  
   148      client.make_bucket(bucket_name)
   149  
   150      try:
   151          for idx, (quote_char, escape_char, input_data, expected_output) in enumerate(tests):
   152              sql_opts = SelectRequest(
   153                  "select * from s3object",
   154                  CSVInputSerialization(
   155                      compression_type=COMPRESSION_TYPE_NONE,
   156                      file_header_info=FILE_HEADER_INFO_NONE,
   157                      record_delimiter="\n",
   158                      field_delimiter=",",
   159                      quote_character='"',
   160                      quote_escape_character='"',
   161                      comments="#",
   162                      allow_quoted_record_delimiter="FALSE",
   163                  ),
   164                  CSVOutputSerialization(
   165                      quote_fields=QUOTE_FIELDS_ALWAYS,
   166                      record_delimiter="\n",
   167                      field_delimiter=",",
   168                      quote_character=quote_char,
   169                      quote_escape_character=escape_char,
   170                  ),
   171                  request_progress=False,
   172              )
   173  
   174              test_sql_api(f'test_{idx}', client, bucket_name,
   175                           input_data, sql_opts, expected_output)
   176      finally:
   177          client.remove_bucket(bucket_name)
   178  
   179      # Test passes
   180      print(log_output.json_report())