#!/usr/bin/env python
# -*- coding: utf-8 -*-
# MinIO Python Library for Amazon S3 Compatible Cloud Storage,
# (C) 2015-2020 MinIO, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os

from minio import Minio
from minio.select import (COMPRESSION_TYPE_NONE, FILE_HEADER_INFO_NONE,
                          JSON_TYPE_DOCUMENT, QUOTE_FIELDS_ALWAYS,
                          QUOTE_FIELDS_ASNEEDED, CSVInputSerialization,
                          CSVOutputSerialization, JSONInputSerialization,
                          JSONOutputSerialization, SelectRequest)

from utils import *


def test_sql_api(test_name, client, bucket_name, input_data, sql_opts, expected_output):
    """Check that a select request over ``input_data`` yields ``expected_output``.

    Uploads ``input_data`` as a temporary object, runs ``sql_opts`` against it
    and compares the streamed result with ``expected_output``. The temporary
    object is removed afterwards whether the check passes or fails.

    Args:
        test_name: Label used in the failure messages.
        client: Object providing ``put_object``, ``select_object_content`` and
            ``remove_object`` (presumably a ``minio.Minio`` instance).
        bucket_name: Bucket in which the temporary object is created.
        input_data: Raw bytes uploaded as the object content.
        sql_opts: Request object passed to ``client.select_object_content``.
        expected_output: Expected result bytes, or an ``Exception`` instance
            when the request is expected to fail.

    Raises:
        ValueError: If the request fails unexpectedly, succeeds when a failure
            was expected, or returns data different from ``expected_output``.
    """
    object_name = generate_object_name()
    try:
        client.put_object(bucket_name, object_name,
                          io.BytesIO(input_data), len(input_data))
        data = client.select_object_content(bucket_name, object_name, sql_opts)
        # Collect the streamed records into a single bytes value.
        records = io.BytesIO()
        for chunk in data.stream(10 * 1024):
            records.write(chunk)
        got_output = records.getvalue()
    except Exception as select_err:
        # NOTE: any exception raised above (including by the upload) counts
        # as the expected failure when expected_output is an Exception.
        if not isinstance(expected_output, Exception):
            raise ValueError(
                f'Test {test_name} unexpectedly failed with: {select_err}'
            ) from select_err
    else:
        if isinstance(expected_output, Exception):
            raise ValueError(
                f'Test {test_name}: expected an exception, got {got_output}')
        if got_output != expected_output:
            raise ValueError(
                f'Test {test_name}: data mismatch. '
                f'Expected : {expected_output}, Received {got_output}')
    finally:
        client.remove_object(bucket_name, object_name)


def test_csv_input_custom_quote_char(client, log_output):
    """Run select requests with varying CSV *input* quote/escape characters.

    Creates a fresh bucket, runs every case through ``test_sql_api`` with CSV
    input serialization and JSON output serialization, removes the bucket, and
    prints ``log_output.json_report()`` once all cases have passed.

    Args:
        client: Client used for bucket and object operations.
        log_output: Report object; its ``args['bucket_name']`` is set here and
            its ``json_report()`` is printed on success.
    """
    # Get a unique bucket_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    # Each case: (quote_character, quote_escape_character, input, expected)
    tests = [
        # Invalid quote character, should fail
        ('""', '"', b'col1,col2,col3\n', Exception()),
        # UTF-8 quote character
        ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(),
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        # Only one field is quoted
        ('"', '"', b'"col1",col2,col3\n',
         b'{"_1":"col1","_2":"col2","_3":"col3"}\n'),
        ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'),
        ('\'', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        # Same case as the previous entry, repeated
        ('', '"', b'"col1",col2,col3\n',
         b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'),
        ('', '"', b'"col1","col2","col3"\n',
         b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'),
        ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'),
        ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'),
        ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'),
        ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'),
        ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'),
        ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'),
        ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'),
        ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'),
        ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'),
        ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'),
    ]

    client.make_bucket(bucket_name)

    try:
        for idx, (quote_char, escape_char, data, expected_output) in enumerate(tests):
            sql_opts = SelectRequest(
                "select * from s3object",
                CSVInputSerialization(
                    compression_type=COMPRESSION_TYPE_NONE,
                    file_header_info=FILE_HEADER_INFO_NONE,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character=quote_char,
                    quote_escape_character=escape_char,
                    comments="#",
                    allow_quoted_record_delimiter="FALSE",
                ),
                JSONOutputSerialization(
                    record_delimiter="\n",
                ),
                request_progress=False,
            )

            test_sql_api(f'test_{idx}', client, bucket_name,
                         data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())


def test_csv_output_custom_quote_char(client, log_output):
    """Run select requests with varying CSV *output* quote/escape characters.

    Creates a fresh bucket, runs every case through ``test_sql_api`` with a
    fixed CSV input serialization (``"`` as quote and escape character) and a
    per-case CSV output serialization that always quotes fields, removes the
    bucket, and prints ``log_output.json_report()`` once all cases have passed.

    Args:
        client: Client used for bucket and object operations.
        log_output: Report object; its ``args['bucket_name']`` is set here and
            its ``json_report()`` is printed on success.
    """
    # Get a unique bucket_name
    log_output.args['bucket_name'] = bucket_name = generate_bucket_name()

    # Each case: (quote_character, quote_escape_character, input, expected)
    tests = [
        # Invalid quote character, should fail
        ("''", "''", b'col1,col2,col3\n', Exception()),
        ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'),
        ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'),
        ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'),
        ('"', '"', b'""""\n', b'""""\n'),
        ('"', '"', b'\n', b''),
        ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"),
        ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"),
        ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"),
        ("'", "\\", b'col\'\n', b"'col\\''\n"),
        # Two consecutive escaped quotes
        ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"),
    ]

    client.make_bucket(bucket_name)

    try:
        for idx, (quote_char, escape_char, input_data, expected_output) in enumerate(tests):
            sql_opts = SelectRequest(
                "select * from s3object",
                CSVInputSerialization(
                    compression_type=COMPRESSION_TYPE_NONE,
                    file_header_info=FILE_HEADER_INFO_NONE,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character='"',
                    quote_escape_character='"',
                    comments="#",
                    allow_quoted_record_delimiter="FALSE",
                ),
                CSVOutputSerialization(
                    quote_fields=QUOTE_FIELDS_ALWAYS,
                    record_delimiter="\n",
                    field_delimiter=",",
                    quote_character=quote_char,
                    quote_escape_character=escape_char,
                ),
                request_progress=False,
            )

            test_sql_api(f'test_{idx}', client, bucket_name,
                         input_data, sql_opts, expected_output)
    finally:
        client.remove_bucket(bucket_name)

    # Test passes
    print(log_output.json_report())