github.com/siglens/siglens@v0.0.0-20240328180423-f7ce9ae441ed/tools/nyc-taxi-benchmark/ingester.py (about)

     1  import subprocess
     2  import sys
     3  
     4  
     5  def ingest(filename, batch_size=100):
     6      # Determine the total number of lines in the file
     7      total_lines = sum(1 for _ in open(filename, "r"))
     8  
     9      lines = []
    10      with open(filename, 'r') as f:
    11          for i, line in enumerate(f):
    12              lines.append(line)
    13  
    14              if len(lines) >= batch_size:
    15                  print(f"\rProcessing... {((i + 1) / total_lines) * 100:.2f}%", end='')
    16                  ingest_lines(lines)
    17                  lines = []
    18      if lines:
    19          ingest_lines(lines)
    20          print(f"\rProcessing... 100.00%")
    21  
    22  
    23  def ingest_lines(lines):
    24      index_data = '{"index": {"_index": "trips", "_type": "_doc"}}'
    25      data = ''
    26      for line in lines:
    27          data += index_data + '\n' + line
    28  
    29      # Prepare the curl command
    30      curl_command = [
    31          "curl",
    32          "-s",
    33          "-o", "/dev/null",
    34          "http://localhost:8081/elastic/_bulk",
    35          "-X", "POST",
    36          "-H", "Authorization: Bearer ",
    37          "-H", "Content-Type: application/json",
    38          "--data-binary", data
    39      ]
    40  
    41      # Execute the curl command
    42      process = subprocess.run(curl_command, capture_output=False, text=False)
    43      if process.stderr:
    44          print("Error:", process.stderr)
    45  
    46  
    47  if __name__ == "__main__":
    48      ingest(sys.argv[1])
    49