github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/tools/nyc-taxi-benchmark/parquet_to_tsv.py (about)

     1  import pandas as pd
     2  import glob
     3  import os
     4  import sys
     5  
     6  
     7  def convert_parquet_to_tsv(input_directory, output_directory):
     8      if not os.path.exists(output_directory):
     9          os.makedirs(output_directory)
    10  
    11      for parquet_file in glob.glob(os.path.join(input_directory, '*.parquet')):
    12          df = pd.read_parquet(parquet_file)
    13          base_name = os.path.basename(parquet_file)
    14          tsv_file = os.path.join(output_directory, base_name.replace('.parquet', '.tsv'))
    15          df.to_csv(tsv_file, sep='\t', index=False)
    16          print(f"Converted {parquet_file} to {tsv_file}")
    17  
    18  
    19  if __name__ == "__main__":
    20      if len(sys.argv) != 3:
    21          print("Usage: python script.py <input_directory> <output_directory>")
    22          sys.exit(1)
    23  
    24      input_dir = sys.argv[1]
    25      output_dir = sys.argv[2]
    26      convert_parquet_to_tsv(input_dir, output_dir)
    27