github.com/pdaian/flashboys2@v0.0.0-20190718175736-b101c35361f0/read_csv.py (about)

     1  import csv
     2  
     3  all_txs = set() # set of all tx ids seen as hashes
     4  earliest_seen = {} # hash to earliest line (orderdict) observed of this instance
     5  all_seen = {} # hash to each line (ordereddict) observed of this instance
     6  hits = {} # hash to number of times seen total by all monitors
     7  
     8  num_processed = 0
     9  CUTOFF = 10000000000000000000000
    10  #CUTOFF = 1000000
    11  
    12  def print_seen_line(seen_item, prev_seen_item, display_payload):
    13      print("%.6f" % ((seen_item['time_seen'] - prev_seen_item['time_seen'])/10**9), "%.6f" % (seen_item['time_seen'] / (10 ** 9)), seen_item['hash'], seen_item['sender'], seen_item['account_nonce'], seen_item['gas_price'], "H:" + str(hits[seen_item['hash']]), seen_item['payload'] if display_payload else "", sep="\t")
    14  
    15  def print_seen_dict(seen_tuple_dictionary):
    16      for monitor_ip in seen_tuple_dictionary:
    17          monitor_seen = seen_tuple_dictionary[monitor_ip]
    18          print_seen_list(monitor_seen)
    19          print("-" * 50, "\n")
    20  
    21  def print_seen_list(seen_list, display_payload=True, print_first_item=True):
    22      if print_first_item:
    23          # first line doesn't have a prev item
    24          print_seen_line(seen_list[0], seen_list[0], display_payload)
    25  
    26      for i in range(len(seen_list) - 1):
    27          prev_item = seen_list[i]
    28          item = seen_list[i+1]
    29          print_seen_line(item, prev_item, display_payload)
    30          #open(monitor_ip, "a").write(item['hash'] + "\n")
    31  
    32  def get_bidder(item):
    33      return (item['sender'], item['account_nonce'])
    34  
    35  def add_bidder_to(auction_participation, bidder, auction_id):
    36      if bidder in auction_participation:
    37          if auction_id in auction_participation[bidder]:
    38              auction_participation[bidder][auction_id] += 1
    39          else:
    40              auction_participation[bidder][auction_id] = 1
    41      else:
    42          auction_participation[bidder] = {auction_id: 1}
    43  
    44  def should_filter_frontier(frontier, bidder_id):
    45      bid_addr = bidder_id[0]
    46      bid_nonce = int(bidder_id[1])
    47      if bid_addr in frontier:
    48          if frontier[bid_addr] > bid_nonce + 2: # (choose magic number 2 as threshold for out-of-ordering TODO repair)
    49              return True
    50          frontier[bid_addr] = max(frontier[bid_addr], bid_nonce)
    51      else:
    52          frontier[bid_addr] = bid_nonce
    53      return False
    54  
    55  def prefilter_list(seen_list):
    56      allowed_addrs = open("filter_list.txt").read().strip().splitlines()
    57      frontier = {} # maps address bidding to latest known nonce
    58      filtered_list = []
    59      for item in seen_list:
    60          bidder_id = get_bidder(item)
    61          print(item)
    62          #if int(item['gas_price']) < 20000000000 or int(item['gas_limit']) < 100000: # was <= 80
    63          #if int(item['gas_price']) < 2000 or int(item['gas_limit']) < 100000: # was <= 80
    64          #    continue
    65          if should_filter_frontier(frontier, bidder_id):
    66              continue
    67          #if not item['sender'].lower() in allowed_addrs:
    68          #    continue
    69          filtered_list.append(item)
    70      return filtered_list
    71  
    72  
    73  def get_individual_auctions(seen_list):
    74      # IMPORTANT: ASSUMES DEDUPING (see map to line below)
    75      auctions = [] # list of seen lists for output auctions.  each transaction represents a "bid"
    76      auction_bidders = [] # list of set of bidders [bidder is a tuple of (hash, nonce)] in each of the above auctions, indexed similarly
    77      non_auction_txs = []
    78      # garbage_txs = [] # transactions that were a product of syncing (behind the frontier) TODO populate
    79  
    80      auction_participation = {} # maps bidders to maps of (auction_id : tuple(str,str) to num_bid : int)
    81  
    82      curr_auction = []
    83      curr_bidders = set()
    84      auction_id = 0
    85      for i in range(len(seen_list) - 1):
    86          prev_item = seen_list[i]
    87          item = seen_list[i+1]
    88          time_difference = (item['time_seen'] - prev_item['time_seen'])/10**9
    89  
    90          if time_difference < 3:
    91              # this tx is part of the auction
    92              bidder_id = get_bidder(item)
    93              if len(curr_auction) == 0:
    94                  # new auction; previous tx must have triggered
    95                  curr_auction = [prev_item, item]
    96                  # previous tx actually isn't non-auction
    97                  non_auction_txs = non_auction_txs[:-1]
    98                  original_bidder_id = get_bidder(prev_item)
    99                  curr_bidders.add(original_bidder_id)
   100                  curr_bidders.add(bidder_id)
   101                  add_bidder_to(auction_participation, original_bidder_id, auction_id)
   102              else:
   103                  curr_auction.append(item)
   104                  curr_bidders.add(bidder_id)
   105              add_bidder_to(auction_participation, bidder_id, auction_id)
   106          else:
   107              # tx is not part of an auction
   108              if len(curr_auction) != 0:
   109                  # some previous auction ended; log and reset
   110                  auctions.append(curr_auction)
   111                  auction_bidders += [curr_bidders]
   112                  curr_auction = []
   113                  curr_bidders = set()
   114                  auction_id += 1
   115              non_auction_txs.append(item)
   116  
   117      if len(curr_auction) != 0:
   118          # last straggler auction
   119          auctions.append(curr_auction)
   120          auction_bidders += [curr_bidders]
   121      return auctions, non_auction_txs, auction_bidders, auction_participation
   122  
   123  with open('arbitrage_data.csv', 'r' ) as f:
   124      reader = csv.DictReader(f)
   125      for line in reader:
   126          if line['time_seen'] == 'time_seen':
   127              # duplicate header line, ignore (happens when combining datasets)
   128              continue
   129  
   130          if line['gas_price'] == '':
   131              # [NOTE this prunes all gas-empty bids]
   132              continue
   133  
   134          # line preprocessing (eg type conversions)
   135          line['time_seen'] = int(line['time_seen'])
   136          line['gas_price'] = int(line['gas_price'])
   137          hash = line['hash']
   138  
   139          all_txs.add(hash)
   140          if hash in earliest_seen:
   141              if earliest_seen[hash]['time_seen'] > line['time_seen']:
   142                  earliest_seen[hash] = line
   143              #all_seen[hash].append(line)
   144              hits[hash] += 1
   145          else:
   146              #all_seen[hash] = [line]
   147              earliest_seen[hash] = line
   148              hits[hash] = 1
   149          num_processed += 1
   150          if num_processed > CUTOFF:
   151              break
   152  
   153  seen_times = {} # monitor ip to list of (time_seen, tx_data) for all txs seen
   154  global_seen = [] # list of (time_first_ever_seen, tx_data) for all txs seen
   155  
   156  # comments - disable all_seen for resource reasons (TODO refactor)
   157  #for hash in all_seen:
   158  for hash in earliest_seen:
   159  #    for line in all_seen[hash]:
   160       #for line in earliest_seen[hash]:
   161          #monitor_ip = line['monitor_ip']
   162          #if not monitor_ip in seen_times:
   163          #    seen_times[monitor_ip] = []
   164          #seen_times[monitor_ip].append(line)
   165       global_seen.append(earliest_seen[hash])
   166  
   167  
   168  print("DONE2")
   169  # sort seen_times and global_seen
   170  for monitor_ip in seen_times:
   171      seen_times[monitor_ip] = sorted(seen_times[monitor_ip], key=lambda line: line['time_seen'])
   172  
   173  global_seen = sorted(global_seen, key=lambda line: line['time_seen'])
   174  
   175  print("UNFILTERED GLOBAL LIST")
   176  print_seen_list(global_seen,display_payload=False)
   177  global_seen = prefilter_list(global_seen)
   178  
   179  print("FILTERED GLOBAL LIST")
   180  print_seen_list(global_seen,display_payload=False)
   181  auctions, non_auctions, bidders, participation = get_individual_auctions(global_seen)
   182  
   183  def postprocess_bid_list(all_bids):
   184      last_bid = None
   185      last_bids_by_id = {}
   186      for bid in all_bids:
   187          # insert blanks for first bids, etc
   188          bid['price_delta'] = ''
   189          bid['price_percent_delta'] = ''
   190          bid['time_delta'] = ''
   191          bid['self_price_delta'] = ''
   192          bid['self_price_percent_delta'] = ''
   193          bid['self_time_delta'] = ''
   194          sender = bid['sender']
   195          if last_bid is not None:
   196              price_delta = bid['gas_price'] - last_bid['gas_price']
   197              try:
   198                  price_percent_delta = (price_delta / (float(bid['gas_price'] + last_bid['gas_price'])/2)) * 100
   199              except:
   200                  price_percent_delta = 0.0
   201              try:
   202                  time_delta = (bid['time_seen'] - last_bid['time_seen']) / (10 ** 9)
   203              except:
   204                  # todo when do these division-by-0 cases happen (this and above)?
   205                  time_delta = 0.0
   206              price_delta /=(10 ** 9)
   207              bid['price_delta'] = price_delta
   208              bid['price_percent_delta'] = price_percent_delta
   209              bid['time_delta'] = time_delta
   210  
   211          if sender in last_bids_by_id:
   212              last_self_bid = last_bids_by_id[sender]
   213              price_delta = bid['gas_price'] - last_self_bid['gas_price']
   214              time_delta = (bid['time_seen'] - last_self_bid['time_seen']) / (10 ** 9)
   215              try:
   216                  price_percent_delta = (price_delta / (last_self_bid['gas_price'])) * 100
   217              except:
   218                  price_percent_delta = 0.0
   219              price_delta /=(10 ** 9)
   220              bid['self_price_delta'] = price_delta
   221              bid['self_price_percent_delta'] = price_percent_delta
   222              bid['self_time_delta'] = time_delta
   223          last_bid = bid
   224          last_bids_by_id[sender] = bid
   225  
   226      return all_bids
   227  
   228  def normalize_auction_ids(participation, auction_list):
   229      auctionspans = []
   230      bids_per_auction = {}
   231      for bidder in participation:
   232          bidder_auctions = participation[bidder]
   233          if sum(bidder_auctions.values()) == 1:
   234              continue
   235              # not a repeated bid; ignore
   236  
   237          # keep track of total repeated bids in each auction ID; heaviest ID is the canonical auction
   238          for auction_id in bidder_auctions:
   239              if not auction_id in bids_per_auction:
   240                  bids_per_auction[auction_id] = 0
   241              bids_per_auction[auction_id] += bidder_auctions[auction_id]
   242  
   243          auctionspan = max(bidder_auctions.keys()) - min(bidder_auctions.keys())
   244          auctionspans.append(auctionspan)
   245          print(bidder, bidder_auctions, auctionspan)
   246  
   247      # show delta statistics
   248      for i in range(max(auctionspans)):
   249          print(i, auctionspans.count(i))
   250  
   251      # populate canonical auction list with repeated bidders in each of those auctions
   252      canonical_bidders = {} # maps canonical auction ids to list of bidders
   253      for bidder in participation:
   254          bidder_auctions = participation[bidder]
   255          if sum(bidder_auctions.values()) == 1:
   256              continue
   257              # not a repeated bid; ignore
   258  
   259          auctionspan = max(bidder_auctions.keys()) - min(bidder_auctions.keys())
   260          if auctionspan > 1:
   261              # data quality issues; todo check manually and validate constants
   262              continue
   263  
   264          canonical_auction = -1
   265          best_bids = -1
   266          for auction_id in bidder_auctions:
   267              auction_bids = bids_per_auction[auction_id]
   268              canonical_auction = canonical_auction if auction_bids < best_bids else auction_id
   269              best_bids = bids_per_auction[canonical_auction]
   270          if not canonical_auction in canonical_bidders:
   271              canonical_bidders[canonical_auction] = []
   272          canonical_bidders[canonical_auction].append(bidder)
   273  
   274      print("CANONICAL LIST")
   275      for auction_id in canonical_bidders:
   276          print(auction_id, canonical_bidders[auction_id], len(canonical_bidders[auction_id]))
   277  
   278  
   279      normalized_auction_list = []
   280      for canonical_auction_id in range(0, max(canonical_bidders.keys())):
   281          if canonical_auction_id in canonical_bidders:
   282              if len(canonical_bidders[canonical_auction_id]) < 2:
   283                  continue # todo check these
   284              auction_bidders = canonical_bidders[canonical_auction_id]
   285              # add all bids out-of-period by accounts that rebid here
   286              all_bids = []
   287              auctions_bid_in = set([canonical_auction_id])
   288              for bidder in auction_bidders:
   289                  for auction in participation[bidder]:
   290                      auctions_bid_in.add(auction)
   291              auctions_bid_in = sorted(list(auctions_bid_in))
   292              for auction in auctions_bid_in:
   293                  if auction == canonical_auction_id:
   294                      all_bids += auction_list[auction] # add all in-period bids
   295                  else:
   296                      for bid in auction_list[auction]:
   297                          if bid['sender'] == bidder[0] and bid['account_nonce'] == bidder[1]:
   298                              all_bids.append(bid)
   299              max_gas_price = max([int(x['gas_price']) for x in all_bids])
   300              min_gas_price = min([int(x['gas_price']) for x in all_bids])
   301              if max_gas_price - min_gas_price >= 100000000000:
   302                  all_bids = postprocess_bid_list(all_bids)
   303                  normalized_auction_list.append(all_bids)
   304  
   305      for auction in normalized_auction_list:
   306          print("NORMD AUCTION:")
   307          print_seen_list(auction, display_payload=False)
   308          print("\n\n")
   309  
   310      return normalized_auction_list
   311  
   312  for bidder_group in bidders:
   313      print(bidder_group)
   314      print("\n\n")
   315  
   316  for auction in auctions:
   317      print("AUCTION:")
   318      print_seen_list(auction, display_payload=False)
   319      print("\n\n")
   320  
   321  print("NON-AUCTION")
   322  print_seen_list(auction, display_payload=False)
   323  
   324  
   325  normalized_auctions = normalize_auction_ids(participation, auctions)
   326  
   327  def write_normalized_list(normalized_auctions, output_file):
   328      f = open(output_file, 'w')
   329      w = csv.DictWriter(f, ["auction_id"] + list(normalized_auctions[0][0].keys()))
   330      w.writeheader()
   331      for auction_id in range(len(normalized_auctions)):
   332          auction = normalized_auctions[auction_id]
   333          for bid in auction:
   334              bid["auction_id"] = auction_id
   335          w.writerows(auction)
   336      f.close()
   337  
   338  write_normalized_list(normalized_auctions, "data/auctions.csv")
   339  
   340  exit(1)
   341  
   342  unique_seen_times = {} # duplicate seens removed, structure as seen_times
   343  
   344  for monitor_ip in seen_times:
   345      txs_seen_by_monitor = set()
   346      unique_seen_times[monitor_ip] = []
   347      monitor_seen = seen_times[monitor_ip]
   348      for item in monitor_seen:
   349          hash = item['hash']
   350          if hash not in txs_seen_by_monitor and hits[hash] > 4:
   351              unique_seen_times[monitor_ip].append(item)
   352              txs_seen_by_monitor.add(hash)
   353  
   354  #unique_seen_times = {'35.200.170.118': unique_seen_times['35.200.170.118']}
   355  
   356  print_seen_dict(unique_seen_times)