#!/usr/bin/env python
# Benchmark GoAWK against other AWK versions
# (from github.com/benhoyt/goawk@v1.8.1/benchmark_awks.py)
#
# Usage: benchmark_awks.py [GLOB]
# GLOB is matched inside testdata/ and defaults to 'tt.*'.

from __future__ import print_function

import glob
import os.path
import shutil
import subprocess
import sys
import time

AWKS = [
    './goawk',
    './orig',  # GoAWK without perf improvements (commit 8ab5446)
    'awk',
    'gawk',
    'mawk',
]
# Every reported speed is relative to this interpreter (its speed is 1.0).
NORM_INDEX = AWKS.index('awk')
# Only get the mean of these tests because these are the only ones
# we show in the GoAWK article.
TESTS_TO_MEAN = [
    'tt.01',
    'tt.02',
    'tt.02a',
    'tt.03',
    'tt.03a',
    'tt.04',
    'tt.05',
    'tt.06',
    'tt.07',
    'tt.big',
    'tt.x1',
    'tt.x2',
]
MEAN_TESTS = []  # not read anywhere in this script; kept for compatibility
NUM_RUNS = 3     # timed runs per (awk, program) pair; the minimum is kept
MIN_TIME = 0.5   # target duration in seconds for a single run
PROGRAM_GLOB = 'testdata/tt.*'

if len(sys.argv) > 1:
    PROGRAM_GLOB = 'testdata/' + sys.argv[1]

# Floor applied to a non-positive warm-up measurement. time.time() is not
# monotonic, so a clock step (or a coarse timer) can yield <= 0, which would
# otherwise divide by zero or produce a negative repeat count.
_MIN_ELAPSED = 0.001


def repeat_file(input_file, repeated_file, n):
    """Write n back-to-back copies of input_file into repeated_file.

    Both files are handled in binary mode. repeated_file is created or
    truncated; with n == 0 it ends up empty.
    """
    with open(input_file, 'rb') as fin, open(repeated_file, 'wb') as fout:
        for _ in range(n):
            fin.seek(0)
            shutil.copyfileobj(fin, fout)


def _timed_call(cmdline):
    """Run cmdline through the shell; return (elapsed_seconds, exit_status)."""
    start = time.time()
    status = subprocess.call(cmdline, shell=True)
    return time.time() - start, status


def _scaled_input(input_file, elapsed, repeats_created):
    """Return the input path to benchmark with, given the warm-up time.

    If the warm-up took less than MIN_TIME seconds, the input is repeated
    round(MIN_TIME / elapsed) times into '<input_file>.<multiplier>'. A file
    is only generated (and recorded in repeats_created for later deletion)
    when it does not already exist, so pre-existing files are left alone.
    """
    if elapsed >= MIN_TIME:
        return input_file
    if elapsed <= 0:
        elapsed = _MIN_ELAPSED
    multiplier = int(round(MIN_TIME / elapsed))
    repeated_file = '{}.{}'.format(input_file, multiplier)
    if not os.path.exists(repeated_file):
        repeat_file(input_file, repeated_file, multiplier)
        repeats_created.append(repeated_file)
    return repeated_file


def _best_time(awk, program, input_file):
    """Run one AWK NUM_RUNS times on program/input_file; return the minimum.

    A non-zero exit status is reported on stderr but the timing is still
    recorded, so one broken interpreter does not abort the whole benchmark.
    """
    cmdline = '{} -f {} {} >tt.out'.format(awk, program, input_file)
    times = []
    for _ in range(NUM_RUNS):
        elapsed, status = _timed_call(cmdline)
        times.append(elapsed)
        if status != 0:
            print('ERROR status {} from cmd: {}'.format(status, cmdline),
                  file=sys.stderr)
    return min(times)


def _print_separator():
    """Print the dashed rule that frames the table body."""
    print('-'*9 + ' | -----'*len(AWKS))


def _print_header():
    """Print the column titles (AWK names without a leading './')."""
    # Pad the label to the same 9 columns the data rows use.
    print('{:9} '.format('Test'), end='')
    for awk in AWKS:
        display_awk = awk[2:] if awk.startswith('./') else awk
        print('| {:>5} '.format(display_awk), end='')
    print()
    _print_separator()


def _print_geo_means(products, num_products):
    """Print the geometric-mean row.

    products[i] is the product of AWKS[i]'s speeds over the num_products
    tests that were in TESTS_TO_MEAN. When none were run there is no mean
    to compute, so 'n/a' is printed rather than dividing by zero.
    """
    print('**Geo mean** ', end='')
    for product in products:
        if num_products:
            cell = '**{:.2f}**'.format(product ** (1.0/num_products))
        else:
            cell = '**n/a**'
        print(' | ' + cell, end='')
    print()


def _cleanup(repeats_created):
    """Delete tt.out and the repeated input files this run generated."""
    for path in ['tt.out'] + repeats_created:
        # tt.out does not exist if no program was ever run.
        if os.path.exists(path):
            os.remove(path)


def main():
    """Benchmark every program matching PROGRAM_GLOB and print the table."""
    _print_header()

    repeats_created = []
    products = [1] * len(AWKS)
    num_products = 0
    try:
        for program in sorted(glob.glob(PROGRAM_GLOB)):
            # First do a test run with GoAWK to see roughly how long it takes
            warmup = '{} -f {} testdata/foo.td >tt.out'.format(AWKS[0], program)
            elapsed, _ = _timed_call(warmup)

            # If test run took less than MIN_TIME seconds, scale/repeat input
            # file accordingly
            input_file = _scaled_input('testdata/foo.td', elapsed,
                                       repeats_created)

            # Record time taken to run this test, running each NUM_RUNS times
            # and taking the minimum elapsed time
            awk_times = [_best_time(awk, program, input_file) for awk in AWKS]

            # Normalize to One True AWK time = 1.0
            norm_time = awk_times[NORM_INDEX]
            speeds = [norm_time/t for t in awk_times]
            # basename works regardless of the path separator glob returns.
            test_name = os.path.basename(program)
            if test_name in TESTS_TO_MEAN:
                num_products += 1
                for i, speed in enumerate(speeds):
                    products[i] *= speed

            print('{:9}'.format(test_name), end='')
            for speed in speeds:
                print(' | {:5.2f}'.format(speed), end='')
            print()

        _print_separator()
        _print_geo_means(products, num_products)
    finally:
        # Delete temporary files created, even if the run was interrupted
        _cleanup(repeats_created)


if __name__ == '__main__':
    main()