github.com/web-platform-tests/wpt.fyi@v0.0.0-20240530210107-70cf978996f1/results-processor/processor.py

# Copyright 2018 The WPT Dashboard Project. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil
import sys
import tempfile
import time
import traceback
import zipfile
from urllib.parse import urlparse

import requests
from google.cloud import datastore

import config
import gsutil
import wptreport
from wptscreenshot import WPTScreenshot

_log = logging.getLogger(__name__)


class Processor(object):
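    """Processes a single WPT results upload from start to finish.

    A Processor downloads raw reports, merges them, uploads the merged and
    split results to GCS, and records a TestRun. It is meant to be used as
    a context manager, so that the temporary directories created in
    __enter__ are always cleaned up in __exit__ (see process_report below
    for the full flow).
    """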
    USERNAME = '_processor'
    # Timeout (in seconds) waiting for remote HTTP servers to respond.
    TIMEOUT_WAIT = 10

    def __init__(self):
        # Delay creating Datastore.client so that tests don't need creds.
        self._datastore = None
        self._auth = None
        # Temporary directories to be created in __enter__:
        self._temp_dir = '/tempdir/for/raw/results/screenshots'
        self._upload_dir = '/tempdir/for/split/results'

        # Local paths to downloaded results and screenshots:
        self.results = []
        self.screenshots = []
        # To be loaded/initialized later:
        self.report = wptreport.WPTReport()
        self.test_run_id = 0

    def __enter__(self):
        self._temp_dir = tempfile.mkdtemp()
        self._upload_dir = tempfile.mkdtemp()
        return self

    def __exit__(self, *args):
        shutil.rmtree(self._temp_dir)
        shutil.rmtree(self._upload_dir)

    @property
    def datastore(self):
        """An authenticated Datastore client."""
        if self._datastore is None:
            self._datastore = datastore.Client()
        return self._datastore

    @property
    def auth(self):
        """A (username, password) tuple."""
        if self._auth is None:
            user = self.datastore.get(
                self.datastore.key('Uploader', self.USERNAME))
            self._auth = (user['Username'], user['Password'])
        return self._auth

    @property
    def raw_results_gs_url(self):
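        # The object path (from WPTReport.sha_product_path) embeds the full
        # revision and a checksum of the report, which is what makes
        # raw_results_url usable as a UID in check_existing_run below.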
        return 'gs://{}/{}/report.json'.format(
            config.raw_results_bucket(), self.report.sha_product_path)

    @property
    def raw_results_url(self):
        return gsutil.gs_to_public_url(self.raw_results_gs_url)

    @property
    def results_gs_url(self):
        return 'gs://{}/{}'.format(
            config.results_bucket(), self.report.sha_summary_path)

    @property
    def results_url(self):
        return gsutil.gs_to_public_url(self.results_gs_url)

    def check_existing_run(self):
        """Returns True if an existing run already has raw_results_url.

        This is used to abort early if the result already exists in
        Datastore. It is safe because raw_results_url contains both the full
        revision and a checksum of the report content, which together are
        unique enough to serve as a UID.

        Datastore does not support a query-and-put transaction, so this is
        only a best effort to avoid duplicate runs.
        """
        q = self.datastore.query(kind='TestRun')
        q.add_filter('RawResultsURL', '=', self.raw_results_url)
        q.keys_only()
        run = list(q.fetch(limit=1))
        return len(run) > 0

    @staticmethod
    def known_extension(path):
        """Returns the extension of the path if known, otherwise None."""
        EXT = ('.json.gz', '.txt.gz', '.gz', '.zip', '.json', '.txt')
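        # Compound extensions are listed before their suffixes so that, e.g.,
        # 'report.json.gz' matches '.json.gz' rather than plain '.gz'.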
        for e in EXT:
            if path.endswith(e):
                return e
        return None

    def _download_gcs(self, gcs):
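        """Downloads a gs:// URI into the temp directory.

        Returns the local path to the downloaded file.
        """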
        assert gcs.startswith('gs://')
        ext = self.known_extension(gcs)
        fd, path = tempfile.mkstemp(suffix=ext, dir=self._temp_dir)
        os.close(fd)
        # gsutil will log itself.
        gsutil.copy(gcs, path)
        return path

    def _download_http(self, url):
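        """Downloads an http(s) URL into the temp directory.

        The request is retried once, after a 1-second sleep. Returns the
        local path to the downloaded file, or None if the retry also times
        out or fails with an HTTP error.
        """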
        assert url.startswith('http://') or url.startswith('https://')
        _log.debug("Downloading %s", url)
        try:
            r = requests.get(url, stream=True, timeout=self.TIMEOUT_WAIT)
            r.raise_for_status()
        except requests.RequestException:
            # Sleep 1 second and retry.
            time.sleep(1)
            try:
                r = requests.get(url, stream=True, timeout=self.TIMEOUT_WAIT)
                r.raise_for_status()
            except requests.Timeout:
                _log.error("Timed out fetching: %s", url)
                return None
            except requests.HTTPError:
                _log.error("Failed to fetch (%d): %s", r.status_code, url)
                return None
        ext = (self.known_extension(r.headers.get('Content-Disposition', ''))
               or self.known_extension(url))
        fd, path = tempfile.mkstemp(suffix=ext, dir=self._temp_dir)
        with os.fdopen(fd, mode='wb') as f:
            for chunk in r.iter_content(chunk_size=512*1024):
                f.write(chunk)
        # Closing f will automatically close the underlying fd.
        return path

    def _download_single(self, uri):
        if uri.startswith('gs://'):
            return self._download_gcs(uri)
        return self._download_http(uri)

    def _download_azure(self, azure_url):
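        """Downloads an Azure build artifact and extracts its contents.

        The artifact is a zip file; the wpt_report*.json and
        wpt_screenshot*.txt files inside it are appended to self.results
        and self.screenshots respectively.
        """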
        artifact = self._download_http(azure_url)
        if artifact is None:
            return
        with zipfile.ZipFile(artifact, mode='r') as z:
            for f in z.infolist():
                # ZipInfo.is_dir isn't available in Python 3.5.
                if f.filename.endswith('/'):
                    continue
                path = z.extract(f, path=self._temp_dir)
                if re.match(r'^.*/wpt_report.*\.json$', f.filename):
                    self.results.append(path)
                if re.match(r'^.*/wpt_screenshot.*\.txt$', f.filename):
                    self.screenshots.append(path)

    def download(self, results, screenshots, azure_url):
        """Downloads all necessary inputs.

        Args:
            results: A list of result URIs (gs:// or http(s)://).
            screenshots: A list of screenshot URIs (gs:// or http(s)://).
            azure_url: An HTTP URL to an Azure build artifact.
        """
        if azure_url:
            assert not results
            assert not screenshots
            self._download_azure(azure_url)
            return
        self.results = [
            p for p in (self._download_single(i) for i in results)
            if p is not None]
        self.screenshots = [
            p for p in (self._download_single(i) for i in screenshots)
            if p is not None]

    def load_report(self):
        """Loads and merges all downloaded results."""
        for r in self.results:
            self.report.load_file(r)

    def upload_raw(self):
        """Uploads the merged raw JSON report to GCS."""
        with tempfile.NamedTemporaryFile(
                suffix='.json.gz', dir=self._temp_dir) as temp:
            self.report.serialize_gzip(temp.name)
            gsutil.copy(temp.name, self.raw_results_gs_url, gzipped=True)

    def upload_split(self):
        """Uploads the individual results recursively to GCS."""
        self.report.populate_upload_directory(output_dir=self._upload_dir)

        # 1. Copy [ID]-summary_v2.json.gz
        # to gs://wptd/[SHA]/[ID]-summary_v2.json.gz.
        gsutil.copy(
            os.path.join(self._upload_dir, self.report.sha_summary_path),
            self.results_gs_url,
            gzipped=True)

        # 2. Copy the individual results recursively if there are any (i.e.
        # if the report is not empty).
        results_dir = os.path.join(
            self._upload_dir, self.report.sha_product_path)
        if os.path.exists(results_dir):
            # gs://wptd/[SHA] is guaranteed to exist after 1, so copying foo
            # to gs://wptd/[SHA] will create gs://wptd/[SHA]/foo according
            # to `gsutil cp --help`.
            gsutil.copy(
                results_dir,
                self.results_gs_url[:self.results_gs_url.rfind('/')],
                gzipped=True)

    def create_run(self, run_id, labels, uploader, callback_url=None):
        """Creates a TestRun record.

        Args:
            run_id: The pre-allocated run ID as a string ('0' if
                unallocated).
            labels: A comma-separated string of extra labels.
            uploader: The name of the uploader.
            callback_url: URL of the test run creation API (optional).
        """
        self.test_run_id = wptreport.create_test_run(
            self.report,
            run_id,
            labels,
            uploader,
            self.auth,
            self.results_url,
            self.raw_results_url,
            callback_url)
        assert self.test_run_id

    def update_status(self, run_id, stage, error=None, callback_url=None):
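        """Reports the run's current stage to the /api/status endpoint.

        Args:
            run_id: The pre-allocated run ID ('0' if unallocated).
            stage: A non-empty stage name (e.g. 'WPTFYI_PROCESSING',
                'EMPTY', 'INVALID' or 'DUPLICATE'; see process_report).
            error: An optional error message to attach to the status.
            callback_url: Base URL of the status API (defaults to
                config.project_baseurl()).
        """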
        assert stage, "stage cannot be empty"
        if int(run_id) == 0:
            _log.error('Cannot update run status: missing run_id')
            return
        if callback_url is None:
            callback_url = config.project_baseurl()
        parsed_url = urlparse(callback_url)
        api = '%s://%s/api/status/%s' % (parsed_url.scheme,
                                         parsed_url.netloc,
                                         run_id)
        payload = {'id': int(run_id), 'stage': stage}
        if error:
            payload['error'] = error
        if self.report.run_info.get('revision'):
            payload['full_revision_hash'] = self.report.run_info['revision']
        if self.report.run_info.get('product'):
            payload['browser_name'] = self.report.run_info['product']
        if self.report.run_info.get('browser_version'):
            payload['browser_version'] = \
                self.report.run_info['browser_version']
        if self.report.run_info.get('os'):
            payload['os_name'] = self.report.run_info['os']
        if self.report.run_info.get('os_version'):
            payload['os_version'] = self.report.run_info['os_version']
        try:
            response = requests.patch(api, auth=self.auth, json=payload)
            response.raise_for_status()
            _log.debug('Updated run %s to %s', run_id, stage)
        except requests.RequestException as e:
            _log.error('Cannot update status for run %s: %s', run_id, str(e))

    def run_hooks(self, tasks):
        """Runs post-new-run tasks.

        Args:
            tasks: A list of functions that take a single Processor argument.
        """
        for task in tasks:
            _log.info('Running post-new-run task: %s', task.__name__)
            try:
                task(self)
            except Exception:
                traceback.print_exc()


# ==== Beginning of tasks ====
# Tasks are supposed to be independent; exceptions are ignored (but logged).
# Each task is a function that takes a Processor.

def _upload_screenshots(processor):
    for screenshot in processor.screenshots:
        with WPTScreenshot(screenshot, processor.report.run_info,
                           auth=processor.auth) as s:
            s.process()
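
# A task can be any callable taking a Processor; a new hook is added here
# and passed to run_hooks() in process_report below. A minimal
# (hypothetical) example:
#
#     def _log_new_run(processor):
#         _log.info('Created run %s', processor.test_run_id)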

# ==== End of tasks ====


def process_report(task_id, params):
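    """Handles a results-processing task.

    Args:
        task_id: The ID of the task (used only in the response message).
        params: A MultiDict-like mapping of upload parameters; it must
            support getlist() for the repeatable fields.

    Returns:
        A response string; empty if the task should be dropped.
    """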
    # Mandatory fields (a missing key raises KeyError):
    uploader = params['uploader']
    # Optional fields:
    azure_url = params.get('azure_url')
    run_id = params.get('id', '0')
    callback_url = params.get('callback_url')
    labels = params.get('labels', '')
    # Repeatable fields:
    results = params.getlist('results')
    screenshots = params.getlist('screenshots')

    response = []
    with Processor() as p:
        p.update_status(run_id, 'WPTFYI_PROCESSING', None, callback_url)
        if azure_url:
            _log.info("Downloading Azure results: %s", azure_url)
        else:
            _log.info("Downloading %d results & %d screenshots",
                      len(results), len(screenshots))
        p.download(results, screenshots, azure_url)
        if len(p.results) == 0:
            _log.error("No results successfully downloaded")
            p.update_status(run_id, 'EMPTY', None, callback_url)
            return ''
        try:
            p.load_report()
            # To be deprecated once all reports have all the required metadata.
            p.report.update_metadata(
                revision=params.get('revision'),
                browser_name=params.get('browser_name'),
                browser_version=params.get('browser_version'),
                os_name=params.get('os_name'),
                os_version=params.get('os_version'),
            )
            p.report.finalize()
        except wptreport.WPTReportError:
            etype, e, tb = sys.exc_info()
            e.path = results
            # This will register an error in Stackdriver.
            traceback.print_exception(etype, e, tb)
            p.update_status(run_id, 'INVALID', str(e), callback_url)
            # The input is invalid and there is no point in retrying, so
            # return an empty (but successful) response to drop the task.
            return ''

        if p.check_existing_run():
            _log.warning(
                'Skipping the task because RawResultsURL already exists: %s',
                p.raw_results_url)
            p.update_status(run_id, 'DUPLICATE', None, callback_url)
            return ''
        response.append("{} results loaded from task {}".format(
            len(p.report.results), task_id))

        _log.info("Uploading merged raw report")
        p.upload_raw()
        response.append("raw_results_url: " + p.raw_results_url)

        _log.info("Uploading split results")
        p.upload_split()
        response.append("results_url: " + p.results_url)

        # Check again because the upload takes a long time.
        if p.check_existing_run():
            _log.warning(
                'Skipping the task because RawResultsURL already exists: %s',
                p.raw_results_url)
            p.update_status(run_id, 'DUPLICATE', None, callback_url)
            return ''

        p.create_run(run_id, labels, uploader, callback_url)
        response.append("run ID: {}".format(p.test_run_id))

        p.run_hooks([_upload_screenshots])

    return '\n'.join(response)