# Source: github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/performance/import_tester/csv_gen.py
"""Generate CSV test data on stdout from a JSON column specification.

The script takes a single command-line argument: a JSON object with a
``cols`` list (each entry has ``name``, ``type`` and an optional
``generator``) and a ``row_count``.  A header row with the column names is
printed first, followed by ``row_count`` data rows.

The ``random`` module is seeded with a fixed value, so every column type
that draws from it produces the same data on every run.
"""
import sys
import json
import uuid
import string
import random

random.seed(0)

# this is dramatically faster than generating a random string for every row.  Generate a buffer
# then select random portions of the buffer.
# NOTE: the buffer is deliberately built with one random.choice() call per character; switching
# to a different sampling call would consume the seeded generator differently and change the
# data produced for a given spec.
random_string_buffer_size = 128*1024
letters = string.ascii_lowercase + string.ascii_uppercase
random_string_buffer = ''.join(random.choice(letters) for i in range(random_string_buffer_size))

# Number of rows buffered in memory before they are written to stdout.
flush_interval = 1000

# Usage text shown when the script is not called with exactly one argument.
usage = """python csv_gen.py '{
    "cols": [
        {"name":"pk", "type":"int", "generator":"sequential"},
        {"name":"c1", "type":"uuid"},
        {"name":"c2", "type":"string", "length":512},
        {"name":"c3", "type":"float"},
        {"name":"c4", "type":"int"}
    ],
    "row_count": 1000000
}'"""


def sequential_int(row_count, col):
    """Return a generator function yielding "0", "1", "2", ... as strings.

    Neither ``row_count`` nor ``col`` is consulted; they are accepted so that
    all generator factories share one signature.
    """
    i = 0

    def f():
        nonlocal i
        x = i
        i += 1
        return str(x)

    return f


def shuffled_sequential_int(row_count, col):
    """Return a generator function yielding 0..row_count-1 in shuffled order.

    Each value is returned exactly once, as a string.  Calling the returned
    function more than ``row_count`` times raises ``IndexError``.
    """
    ids = list(range(row_count))
    random.shuffle(ids)
    i = 0

    def f():
        nonlocal i
        n = ids[i]
        i += 1
        return str(n)

    return f


def random_int(row_count, col):
    """Return a generator function yielding random ints in [0, 2147483647] as strings."""
    min_val = 0
    max_val = 2147483647

    def f():
        return str(random.randint(min_val, max_val))

    return f


def random_uuid(row_count, col):
    """Return a generator function yielding double-quoted version-4 UUID strings.

    The values come from ``uuid.uuid4()`` rather than from the ``random``
    module functions seeded above.
    """
    def f():
        return '"' + str(uuid.uuid4()) + '"'

    return f


def random_float(row_count, col):
    """Return a generator function yielding random floats in [0.0, 1.0) as strings."""
    min_val = 0.0
    max_val = 1.0
    delta = max_val - min_val

    def f():
        fl = random.random()
        fl *= delta
        fl += min_val
        return str(fl)

    return f


def random_string(row_count, col):
    """Return a generator function yielding double-quoted random letter strings.

    Each string is a slice of the pre-built ``random_string_buffer`` whose
    length is chosen uniformly from 0 to 512 inclusive.
    """
    # NOTE(review): the usage example shows a "length" key on string columns,
    # but ``col`` is not read here, so the maximum length is always 512.
    max_length = 512

    def f():
        length = random.randint(0, max_length)
        start = random.randint(0, random_string_buffer_size-length)
        return '"' + random_string_buffer[start:start+length] + '"'

    return f


# Maps a column "type" to the generator factories available for it, keyed by
# the column's "generator" name.
generator_methods = {
    "int": {"random": random_int, "sequential": sequential_int, "shuffled_sequential": shuffled_sequential_int},
    "uuid": {"random": random_uuid},
    "string": {"random": random_string},
    "float": {"random": random_float},
}


def gen_col_methods(row_count, cols):
    """Resolve the column specs into header names and value generators.

    Args:
        row_count: number of rows that will be generated; passed through to
            each generator factory.
        cols: list of column spec dicts.  Each needs ``name`` and ``type``;
            ``generator`` is optional and defaults to ``"random"``.

    Returns:
        A ``(names, methods)`` tuple of two parallel lists: the column names
        and, for each column, a zero-argument function returning the next
        value as a string.

    Exits the process with status 1, after printing a message to stderr, if a
    column has an unknown type or a generator that is not valid for its type.
    """
    names = []
    methods = []
    for col in cols:
        name = col['name']
        typ = col['type']
        generator = col.get('generator', 'random')

        if typ not in generator_methods:
            print("unknown column type '%s' for column '%s'" % (typ, name), file=sys.stderr)
            sys.exit(1)

        generator_methods_for_type = generator_methods[typ]
        if generator not in generator_methods_for_type:
            print("'%s' is not a valid generator type for column '%s'" % (generator, name), file=sys.stderr)
            # Stop here: falling through would fail with a KeyError on the lookup below.
            sys.exit(1)

        names.append(name)
        methods.append(generator_methods_for_type[generator](row_count, col))

    return names, methods


def main(argv=None):
    """Parse the JSON spec from the command line and print the CSV to stdout.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.  Exactly one argument (the JSON spec) is expected.

    Exits with status 1 after printing the usage text to stderr when the
    argument count is wrong.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) != 2:
        # Diagnostics go to stderr so they never end up in redirected CSV output.
        print(usage, file=sys.stderr)
        sys.exit(1)

    spec_json = json.loads(argv[1])
    row_count = spec_json['row_count']
    headers, col_methods = gen_col_methods(row_count, spec_json['cols'])
    print(','.join(headers))

    # Buffer rows and write them in batches to avoid one print call per row.
    lines = []
    for _ in range(row_count):
        lines.append(','.join(m() for m in col_methods))

        if len(lines) >= flush_interval:
            print('\n'.join(lines))
            lines = []

    if lines:
        print('\n'.join(lines))


if __name__ == "__main__":
    main()