# Origin: github.com/pdaian/flashboys2@v0.0.0-20190718175736-b101c35361f0/read_csv.py
"""Group observed transactions into bidding "auctions" and export them.

Reads ``arbitrage_data.csv`` (one row per transaction observation), keeps the
earliest observation of every transaction hash, splits the time-ordered list
into auctions (runs of transactions seen less than 3 seconds apart), merges
auctions that share repeat bidders, and writes the result to
``data/auctions.csv``.
"""
import csv

all_txs = set()  # set of all tx ids seen as hashes
earliest_seen = {}  # hash to earliest line (ordereddict) observed of this instance
all_seen = {}  # hash to each line (ordereddict) observed of this instance
hits = {}  # hash to number of times seen total by all monitors

num_processed = 0
CUTOFF = 10000000000000000000000
#CUTOFF = 1000000


def print_seen_line(seen_item, prev_seen_item, display_payload):
    """Print one tab-separated row describing ``seen_item``.

    Columns: time since ``prev_seen_item``, absolute time, hash, sender,
    nonce, gas price, hit count (from the module-level ``hits``) and,
    when ``display_payload`` is true, the payload.
    """
    # time_seen is divided by 10**9 for display (presumably ns -> s; confirm
    # against the data collector).
    print("%.6f" % ((seen_item['time_seen'] - prev_seen_item['time_seen'])/10**9),
          "%.6f" % (seen_item['time_seen'] / (10 ** 9)),
          seen_item['hash'],
          seen_item['sender'],
          seen_item['account_nonce'],
          seen_item['gas_price'],
          "H:" + str(hits[seen_item['hash']]),
          seen_item['payload'] if display_payload else "",
          sep="\t")


def print_seen_dict(seen_tuple_dictionary):
    """Print every per-monitor seen list, each followed by a separator line."""
    for monitor_ip in seen_tuple_dictionary:
        monitor_seen = seen_tuple_dictionary[monitor_ip]
        print_seen_list(monitor_seen)
        print("-" * 50, "\n")


def print_seen_list(seen_list, display_payload=True, print_first_item=True):
    """Print ``seen_list`` row by row, each row relative to its predecessor.

    An empty list prints nothing (the original indexed ``seen_list[0]``
    unconditionally and raised IndexError).
    """
    if not seen_list:
        return

    if print_first_item:
        # first line doesn't have a prev item, so it is its own predecessor
        print_seen_line(seen_list[0], seen_list[0], display_payload)

    for prev_item, item in zip(seen_list, seen_list[1:]):
        print_seen_line(item, prev_item, display_payload)
        #open(monitor_ip, "a").write(item['hash'] + "\n")


def get_bidder(item):
    """Return the bidder identity of a row: ``(sender, account_nonce)``."""
    return (item['sender'], item['account_nonce'])


def add_bidder_to(auction_participation, bidder, auction_id):
    """Count one more bid by ``bidder`` in ``auction_id`` (mutates the map)."""
    bids_by_auction = auction_participation.setdefault(bidder, {})
    bids_by_auction[auction_id] = bids_by_auction.get(auction_id, 0) + 1


def should_filter_frontier(frontier, bidder_id):
    """Return True when the bid's nonce lags the sender's known frontier.

    ``frontier`` maps a sender address to the highest nonce accepted so far
    and is updated in place for bids that are not filtered.
    """
    bid_addr = bidder_id[0]
    bid_nonce = int(bidder_id[1])
    if bid_addr in frontier:
        # (choose magic number 2 as threshold for out-of-ordering TODO repair)
        if frontier[bid_addr] > bid_nonce + 2:
            return True
        frontier[bid_addr] = max(frontier[bid_addr], bid_nonce)
    else:
        frontier[bid_addr] = bid_nonce
    return False


def prefilter_list(seen_list):
    """Return ``seen_list`` without the rows rejected by the nonce frontier.

    Every input row is also echoed to stdout. ``filter_list.txt`` is still
    read (and therefore must exist), although the address filter that used it
    is currently disabled.
    """
    # Context manager so the handle is closed (the original leaked it).
    with open("filter_list.txt") as filter_file:
        allowed_addrs = filter_file.read().strip().splitlines()
    frontier = {}  # maps address bidding to latest known nonce
    filtered_list = []
    for item in seen_list:
        bidder_id = get_bidder(item)
        print(item)
        #if int(item['gas_price']) < 20000000000 or int(item['gas_limit']) < 100000: # was <= 80
        #if int(item['gas_price']) < 2000 or int(item['gas_limit']) < 100000: # was <= 80
        #    continue
        if should_filter_frontier(frontier, bidder_id):
            continue
        #if not item['sender'].lower() in allowed_addrs:
        #    continue
        filtered_list.append(item)
    return filtered_list


def get_individual_auctions(seen_list):
    """Split a time-sorted, de-duplicated list of rows into auctions.

    Two consecutive rows seen less than 3 seconds apart belong to the same
    auction; the row that precedes the first such pair is the auction's
    opening bid.

    Returns a 4-tuple:
      auctions          -- list of auctions, each a list of rows ("bids")
      non_auction_txs   -- rows that belong to no auction
      auction_bidders   -- per auction, the set of (sender, account_nonce)
      auction_participation -- bidder -> {auction index: number of bids}
    """
    # IMPORTANT: ASSUMES DEDUPING (see map to line below)
    auctions = []  # list of seen lists for output auctions. each transaction represents a "bid"
    auction_bidders = []  # list of set of bidders [bidder is a tuple of (sender, nonce)], indexed like auctions
    # Seed with the first tx: until a second tx arrives within the window it
    # is a non-auction tx. (The original never recorded it at all.)
    non_auction_txs = list(seen_list[:1])
    # garbage_txs = [] # transactions that were a product of syncing (behind the frontier) TODO populate

    auction_participation = {}  # maps bidder tuple(str, str) to {auction_id: int -> num_bid: int}

    curr_auction = []
    curr_bidders = set()
    auction_id = 0
    for prev_item, item in zip(seen_list, seen_list[1:]):
        time_difference = (item['time_seen'] - prev_item['time_seen'])/10**9

        if time_difference < 3:
            # this tx is part of the auction
            bidder_id = get_bidder(item)
            if len(curr_auction) == 0:
                # new auction; previous tx must have triggered
                curr_auction = [prev_item, item]
                # previous tx actually isn't non-auction; pop() instead of
                # slicing avoids copying the whole list on every new auction
                if non_auction_txs:
                    non_auction_txs.pop()
                original_bidder_id = get_bidder(prev_item)
                curr_bidders.add(original_bidder_id)
                curr_bidders.add(bidder_id)
                add_bidder_to(auction_participation, original_bidder_id, auction_id)
            else:
                curr_auction.append(item)
                curr_bidders.add(bidder_id)
            add_bidder_to(auction_participation, bidder_id, auction_id)
        else:
            # tx is not part of an auction
            if len(curr_auction) != 0:
                # some previous auction ended; log and reset
                auctions.append(curr_auction)
                auction_bidders += [curr_bidders]
                curr_auction = []
                curr_bidders = set()
                auction_id += 1
            non_auction_txs.append(item)

    if len(curr_auction) != 0:
        # last straggler auction
        auctions.append(curr_auction)
        auction_bidders += [curr_bidders]
    return auctions, non_auction_txs, auction_bidders, auction_participation


with open('arbitrage_data.csv', 'r') as f:
    reader = csv.DictReader(f)
    for line in reader:
        if line['time_seen'] == 'time_seen':
            # duplicate header line, ignore (happens when combining datasets)
            continue

        if line['gas_price'] == '':
            # [NOTE this prunes all gas-empty bids]
            continue

        # line preprocessing (eg type conversions)
        line['time_seen'] = int(line['time_seen'])
        line['gas_price'] = int(line['gas_price'])
        tx_hash = line['hash']  # named tx_hash so the builtin hash() is not shadowed

        all_txs.add(tx_hash)
        if tx_hash in earliest_seen:
            if earliest_seen[tx_hash]['time_seen'] > line['time_seen']:
                earliest_seen[tx_hash] = line
            #all_seen[tx_hash].append(line)
            hits[tx_hash] += 1
        else:
            #all_seen[tx_hash] = [line]
            earliest_seen[tx_hash] = line
            hits[tx_hash] = 1
        num_processed += 1
        if num_processed > CUTOFF:
            break

seen_times = {}  # monitor ip to list of (time_seen, tx_data) for all txs seen
global_seen = []  # list of (time_first_ever_seen, tx_data) for all txs seen

# comments - disable all_seen for resource reasons (TODO refactor)
#for tx_hash in all_seen:
for tx_hash in earliest_seen:
    # for line in all_seen[tx_hash]:
    #for line in earliest_seen[tx_hash]:
        #monitor_ip = line['monitor_ip']
        #if not monitor_ip in seen_times:
        #    seen_times[monitor_ip] = []
        #seen_times[monitor_ip].append(line)
    global_seen.append(earliest_seen[tx_hash])


print("DONE2")
# sort seen_times and global_seen
for monitor_ip in seen_times:
    seen_times[monitor_ip] = sorted(seen_times[monitor_ip], key=lambda line: line['time_seen'])

global_seen = sorted(global_seen, key=lambda line: line['time_seen'])

print("UNFILTERED GLOBAL LIST")
print_seen_list(global_seen, display_payload=False)
global_seen = prefilter_list(global_seen)

print("FILTERED GLOBAL LIST")
print_seen_list(global_seen, display_payload=False)
auctions, non_auctions, bidders, participation = get_individual_auctions(global_seen)


def postprocess_bid_list(all_bids):
    """Annotate each bid, in place, with deltas to earlier bids.

    Adds ``price_delta``, ``price_percent_delta`` and ``time_delta`` relative
    to the previous bid in the list, and the ``self_*`` equivalents relative
    to the same sender's previous bid. Fields stay ``''`` when there is no
    earlier bid to compare with. Returns ``all_bids``.
    """
    last_bid = None
    last_bids_by_id = {}
    for bid in all_bids:
        # insert blanks for first bids, etc
        bid['price_delta'] = ''
        bid['price_percent_delta'] = ''
        bid['time_delta'] = ''
        bid['self_price_delta'] = ''
        bid['self_price_percent_delta'] = ''
        bid['self_time_delta'] = ''
        sender = bid['sender']
        if last_bid is not None:
            price_delta = bid['gas_price'] - last_bid['gas_price']
            try:
                # percent change relative to the mean of the two prices
                price_percent_delta = (price_delta / (float(bid['gas_price'] + last_bid['gas_price'])/2)) * 100
            except ZeroDivisionError:
                # both prices are zero
                price_percent_delta = 0.0
            # Dividing by the constant 10**9 cannot raise ZeroDivisionError,
            # so no guard is needed here.
            time_delta = (bid['time_seen'] - last_bid['time_seen']) / (10 ** 9)
            price_delta /= (10 ** 9)  # presumably wei -> gwei; confirm units
            bid['price_delta'] = price_delta
            bid['price_percent_delta'] = price_percent_delta
            bid['time_delta'] = time_delta

        if sender in last_bids_by_id:
            last_self_bid = last_bids_by_id[sender]
            price_delta = bid['gas_price'] - last_self_bid['gas_price']
            time_delta = (bid['time_seen'] - last_self_bid['time_seen']) / (10 ** 9)
            try:
                price_percent_delta = (price_delta / (last_self_bid['gas_price'])) * 100
            except ZeroDivisionError:
                # sender's previous bid had a zero gas price
                price_percent_delta = 0.0
            price_delta /= (10 ** 9)
            bid['self_price_delta'] = price_delta
            bid['self_price_percent_delta'] = price_percent_delta
            bid['self_time_delta'] = time_delta
        last_bid = bid
        last_bids_by_id[sender] = bid

    return all_bids


def normalize_auction_ids(participation, auction_list):
    """Merge auctions that share repeat bidders into canonical auctions.

    ``participation`` maps bidder -> {auction index: bid count} and
    ``auction_list`` is the list of auctions those indices refer to. Each
    repeat bidder is assigned to the auction (among those it bid in) with the
    most repeat bids overall. A canonical auction with at least two such
    bidders is emitted with its own bids plus those bidders' bids from
    neighbouring auctions, provided its gas-price spread is at least
    100000000000. Prints diagnostics along the way and returns the list of
    post-processed auctions.
    """
    auctionspans = []
    bids_per_auction = {}
    for bidder in participation:
        bidder_auctions = participation[bidder]
        if sum(bidder_auctions.values()) == 1:
            # not a repeated bid; ignore
            continue

        # keep track of total repeated bids in each auction ID; heaviest ID is the canonical auction
        for auction_id in bidder_auctions:
            if not auction_id in bids_per_auction:
                bids_per_auction[auction_id] = 0
            bids_per_auction[auction_id] += bidder_auctions[auction_id]

        auctionspan = max(bidder_auctions.keys()) - min(bidder_auctions.keys())
        auctionspans.append(auctionspan)
        print(bidder, bidder_auctions, auctionspan)

    # show delta statistics; guarded so an empty list does not crash max(),
    # and +1 so the largest span gets its own row as well
    if auctionspans:
        for i in range(max(auctionspans) + 1):
            print(i, auctionspans.count(i))

    # populate canonical auction list with repeated bidders in each of those auctions
    canonical_bidders = {}  # maps canonical auction ids to list of bidders
    for bidder in participation:
        bidder_auctions = participation[bidder]
        if sum(bidder_auctions.values()) == 1:
            # not a repeated bid; ignore
            continue

        auctionspan = max(bidder_auctions.keys()) - min(bidder_auctions.keys())
        if auctionspan > 1:
            # data quality issues; todo check manually and validate constants
            continue

        canonical_auction = -1
        best_bids = -1
        for auction_id in bidder_auctions:
            auction_bids = bids_per_auction[auction_id]
            # ties go to the auction visited later
            canonical_auction = canonical_auction if auction_bids < best_bids else auction_id
            best_bids = bids_per_auction[canonical_auction]
        if not canonical_auction in canonical_bidders:
            canonical_bidders[canonical_auction] = []
        canonical_bidders[canonical_auction].append(bidder)

    print("CANONICAL LIST")
    for auction_id in canonical_bidders:
        print(auction_id, canonical_bidders[auction_id], len(canonical_bidders[auction_id]))

    normalized_auction_list = []
    # Iterate the ids that actually exist, in ascending order. The original
    # range(0, max(...)) skipped the highest id and crashed on an empty dict.
    for canonical_auction_id in sorted(canonical_bidders):
        auction_bidders = canonical_bidders[canonical_auction_id]
        if len(auction_bidders) < 2:
            continue  # todo check these
        # add all bids out-of-period by accounts that rebid here
        all_bids = []
        auctions_bid_in = set([canonical_auction_id])
        for bidder in auction_bidders:
            for auction in participation[bidder]:
                auctions_bid_in.add(auction)
        # Match out-of-period bids against every bidder of this auction. The
        # original compared only against the loop variable left over from the
        # loop above, i.e. the last bidder.
        rebidders = set(auction_bidders)
        for auction in sorted(auctions_bid_in):
            if auction == canonical_auction_id:
                all_bids += auction_list[auction]  # add all in-period bids
            else:
                for bid in auction_list[auction]:
                    if get_bidder(bid) in rebidders:
                        all_bids.append(bid)
        max_gas_price = max([int(x['gas_price']) for x in all_bids])
        min_gas_price = min([int(x['gas_price']) for x in all_bids])
        if max_gas_price - min_gas_price >= 100000000000:
            all_bids = postprocess_bid_list(all_bids)
            normalized_auction_list.append(all_bids)

    for auction in normalized_auction_list:
        print("NORMD AUCTION:")
        print_seen_list(auction, display_payload=False)
        print("\n\n")

    return normalized_auction_list


for bidder_group in bidders:
    print(bidder_group)
    print("\n\n")

for auction in auctions:
    print("AUCTION:")
    print_seen_list(auction, display_payload=False)
    print("\n\n")

print("NON-AUCTION")
# Print the non-auction transactions. The original passed the leftover loop
# variable `auction`, which re-printed the last auction (or raised NameError
# when there were no auctions).
print_seen_list(non_auctions, display_payload=False)


normalized_auctions = normalize_auction_ids(participation, auctions)


def write_normalized_list(normalized_auctions, output_file):
    """Write all bids of all normalized auctions to ``output_file`` as CSV.

    Each bid dict gains an ``auction_id`` key (its auction's index in
    ``normalized_auctions``), which is also the first CSV column; the other
    columns are the keys of the first bid. With no auctions the file is
    created empty.
    """
    # newline='' is what the csv module requires of files it writes to; the
    # with-block closes the file even if a row fails to serialize.
    with open(output_file, 'w', newline='') as f:
        if not normalized_auctions:
            # no first bid to take the column names from
            return
        w = csv.DictWriter(f, ["auction_id"] + list(normalized_auctions[0][0].keys()))
        w.writeheader()
        for auction_id, auction in enumerate(normalized_auctions):
            for bid in auction:
                bid["auction_id"] = auction_id
            w.writerows(auction)


write_normalized_list(normalized_auctions, "data/auctions.csv")

# NOTE(review): the script stops here with exit status 1; everything below is
# currently unreachable.
exit(1)

unique_seen_times = {}  # duplicate seens removed, structure as seen_times

for monitor_ip in seen_times:
    txs_seen_by_monitor = set()
    unique_seen_times[monitor_ip] = []
    monitor_seen = seen_times[monitor_ip]
    for item in monitor_seen:
        tx_hash = item['hash']
        if tx_hash not in txs_seen_by_monitor and hits[tx_hash] > 4:
            unique_seen_times[monitor_ip].append(item)
            txs_seen_by_monitor.add(tx_hash)

#unique_seen_times = {'35.200.170.118': unique_seen_times['35.200.170.118']}

print_seen_dict(unique_seen_times)