github.com/benhoyt/goawk@v1.8.1/benchmark_awks.py

github.com/benhoyt/goawk@v1.8.1/benchmark_awks.py (about)

     1  #!/usr/bin/env python
     2  # Benchmark GoAWK against other AWK versions
     3  
     4  from __future__ import print_function
     5  
     6  import glob
     7  import os.path
     8  import shutil
     9  import subprocess
    10  import sys
    11  import time
    12  
    13  AWKS = [
    14      './goawk',
    15      './orig', # GoAWK without perf improvements (commit 8ab5446)
    16      'awk',
    17      'gawk',
    18      'mawk',
    19  ]
    20  NORM_INDEX = AWKS.index('awk')
    21  # Only get the mean of these tests because these are the only ones
    22  # we show in the GoAWK article.
    23  TESTS_TO_MEAN = [
    24      'tt.01',
    25      'tt.02',
    26      'tt.02a',
    27      'tt.03',
    28      'tt.03a',
    29      'tt.04',
    30      'tt.05',
    31      'tt.06',
    32      'tt.07',
    33      'tt.big',
    34      'tt.x1',
    35      'tt.x2',
    36  ]
    37  MEAN_TESTS = []
    38  NUM_RUNS = 3
    39  MIN_TIME = 0.5
    40  PROGRAM_GLOB = 'testdata/tt.*'
    41  
    42  if len(sys.argv) > 1:
    43      PROGRAM_GLOB = 'testdata/' + sys.argv[1]
    44  
    45  
    46  def repeat_file(input_file, repeated_file, n):
    47      with open(input_file, 'rb') as fin, open(repeated_file, 'wb') as fout:
    48          for i in range(n):
    49              fin.seek(0)
    50              shutil.copyfileobj(fin, fout)
    51  
    52  
    53  print('Test      ', end='')
    54  for awk in AWKS:
    55      display_awk = awk[2:] if awk.startswith('./') else awk
    56      print('| {:>5} '.format(display_awk), end='')
    57  print()
    58  print('-'*9 + ' | -----'*len(AWKS))
    59  
    60  repeats_created = []
    61  products = [1] * len(AWKS)
    62  num_products = 0
    63  programs = sorted(glob.glob(PROGRAM_GLOB))
    64  for program in programs:
    65      # First do a test run with GoAWK to see roughly how long it takes
    66      cmdline = '{} -f {} testdata/foo.td >tt.out'.format(AWKS[0], program)
    67      start = time.time()
    68      status = subprocess.call(cmdline, shell=True)
    69      elapsed = time.time() - start
    70  
    71      # If test run took less than MIN_TIME seconds, scale/repeat input
    72      # file accordingly
    73      input_file = 'testdata/foo.td'
    74      if elapsed < MIN_TIME:
    75          multiplier = int(round(MIN_TIME / elapsed))
    76          repeated_file = '{}.{}'.format(input_file, multiplier)
    77          if not os.path.exists(repeated_file):
    78              repeat_file(input_file, repeated_file, multiplier)
    79              repeats_created.append(repeated_file)
    80          input_file = repeated_file
    81  
    82      # Record time taken to run this test, running each NUM_RUMS times
    83      # and taking the minimum elapsed time
    84      awk_times = []
    85      for awk in AWKS:
    86          cmdline = '{} -f {} {} >tt.out'.format(awk, program, input_file)
    87          times = []
    88          for i in range(NUM_RUNS):
    89              start = time.time()
    90              status = subprocess.call(cmdline, shell=True)
    91              elapsed = time.time() - start
    92              times.append(elapsed)
    93              if status != 0:
    94                  print('ERROR status {} from cmd: {}'.format(status, cmdline), file=sys.stderr)
    95          awk_times.append(min(times))
    96  
    97      # Normalize to One True AWK time = 1.0
    98      norm_time = awk_times[NORM_INDEX]
    99      speeds = [norm_time/t for t in awk_times]
   100      test_name = program.split('/')[1]
   101      if test_name in TESTS_TO_MEAN:
   102          num_products += 1
   103          for i in range(len(AWKS)):
   104              products[i] *= speeds[i]
   105  
   106      print('{:9}'.format(test_name), end='')
   107      for i, awk in enumerate(AWKS):
   108          print(' | {:5.2f}'.format(speeds[i]), end='')
   109      print()
   110  
   111  print('-'*9 + ' | -----'*len(AWKS))
   112  print('**Geo mean** ', end='')
   113  for i, awk in enumerate(AWKS):
   114      print(' | **{:.2f}**'.format(products[i] ** (1.0/num_products)), end='')
   115  print()
   116  
   117  # Delete temporary files created
   118  os.remove('tt.out')
   119  for repeated_file in repeats_created:
   120     os.remove(repeated_file)