# Copyright 2018 The WPT Dashboard Project. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import os
import re
import shutil
import sys
import tempfile
import time
import traceback
import zipfile
from urllib.parse import urlparse

import requests
from google.cloud import datastore

import config
import gsutil
import wptreport
from wptscreenshot import WPTScreenshot

_log = logging.getLogger(__name__)


class Processor(object):
    """Downloads, merges, uploads and registers one set of WPT results.

    Intended to be used as a context manager: entering creates two temporary
    directories (one for downloaded inputs, one for the split results that
    are uploaded), and exiting removes both.
    """

    USERNAME = '_processor'
    # Timeout waiting for remote HTTP servers to respond
    TIMEOUT_WAIT = 10

    def __init__(self):
        # Delay creating Datastore.client so that tests don't need creds.
        self._datastore = None
        self._auth = None
        # Temporary directories to be created in __enter__:
        self._temp_dir = '/tempdir/for/raw/results/screenshots'
        self._upload_dir = '/tempdir/for/split/results'

        # Local paths to downloaded results and screenshots:
        self.results = []
        self.screenshots = []
        # To be loaded/initialized later:
        self.report = wptreport.WPTReport()
        self.test_run_id = 0

    def __enter__(self):
        self._temp_dir = tempfile.mkdtemp()
        self._upload_dir = tempfile.mkdtemp()
        return self

    def __exit__(self, *args):
        # Always attempt to remove the second directory, even if removing
        # the first one raises; otherwise it would be leaked on disk.
        try:
            shutil.rmtree(self._temp_dir)
        finally:
            shutil.rmtree(self._upload_dir)

    @property
    def datastore(self):
        """An authenticated Datastore client."""
        if self._datastore is None:
            self._datastore = datastore.Client()
        return self._datastore

    @property
    def auth(self):
        """A (username, password) tuple."""
        if self._auth is None:
            user = self.datastore.get(
                self.datastore.key('Uploader', self.USERNAME))
            self._auth = (user['Username'], user['Password'])
        return self._auth

    @property
    def raw_results_gs_url(self):
        """The gs:// URL of the merged raw report (report.json)."""
        return 'gs://{}/{}/report.json'.format(
            config.raw_results_bucket(), self.report.sha_product_path)

    @property
    def raw_results_url(self):
        """raw_results_gs_url converted by gsutil.gs_to_public_url."""
        return gsutil.gs_to_public_url(self.raw_results_gs_url)

    @property
    def results_gs_url(self):
        """The gs:// URL of the summary file in the results bucket."""
        return 'gs://{}/{}'.format(
            config.results_bucket(), self.report.sha_summary_path)

    @property
    def results_url(self):
        """results_gs_url converted by gsutil.gs_to_public_url."""
        return gsutil.gs_to_public_url(self.results_gs_url)

    def check_existing_run(self):
        """Returns true if an existing run already has raw_results_url.

        This is used to abort early if the result already exists in Datastore.
        It is safe because raw_results_url contains both the full revision &
        checksum of the report content, unique enough to use as a UID.

        Datastore does not support a query-and-put transaction, so this is
        only a best effort to avoid duplicate runs.
        """
        q = self.datastore.query(kind='TestRun')
        q.add_filter('RawResultsURL', '=', self.raw_results_url)
        q.keys_only()
        run = list(q.fetch(limit=1))
        return len(run) > 0

    @staticmethod
    def known_extension(path):
        """Returns the extension of the path if known, otherwise None."""
        # Order matters: compound extensions must be tried before '.gz'.
        EXT = ('.json.gz', '.txt.gz', '.gz', '.zip', '.json', '.txt')
        for e in EXT:
            if path.endswith(e):
                return e
        return None

    def _download_gcs(self, gcs):
        """Copies a gs:// object into the temp directory.

        Args:
            gcs: A gs:// URI.

        Returns:
            The local path of the downloaded copy.
        """
        assert gcs.startswith('gs://')
        ext = self.known_extension(gcs)
        fd, path = tempfile.mkstemp(suffix=ext, dir=self._temp_dir)
        os.close(fd)
        # gsutil will log itself.
        gsutil.copy(gcs, path)
        return path

    def _download_http(self, url):
        """Downloads an HTTP(S) URL into the temp directory.

        The request is attempted twice, one second apart. If the second
        attempt times out or returns an HTTP error status, the failure is
        logged and None is returned. Any other requests.RequestException
        raised by the second attempt propagates to the caller.

        Args:
            url: An http:// or https:// URL.

        Returns:
            The local path of the downloaded file, or None on failure.
        """
        assert url.startswith('http://') or url.startswith('https://')
        _log.debug("Downloading %s", url)
        try:
            r = requests.get(url, stream=True, timeout=self.TIMEOUT_WAIT)
            r.raise_for_status()
        except requests.RequestException as e:
            # Release the failed streamed response (if there is one) before
            # retrying, so its connection is not left open.
            if e.response is not None:
                e.response.close()
            # Sleep 1 second and retry.
            time.sleep(1)
            try:
                r = requests.get(url, stream=True, timeout=self.TIMEOUT_WAIT)
                r.raise_for_status()
            except requests.Timeout:
                _log.error("Timed out fetching: %s", url)
                return None
            except requests.HTTPError:
                _log.error("Failed to fetch (%d): %s", r.status_code, url)
                r.close()
                return None
        try:
            ext = (self.known_extension(
                       r.headers.get('Content-Disposition', ''))
                   or self.known_extension(url))
            fd, path = tempfile.mkstemp(suffix=ext, dir=self._temp_dir)
            with os.fdopen(fd, mode='wb') as f:
                for chunk in r.iter_content(chunk_size=512*1024):
                    f.write(chunk)
            # Closing f will automatically close the underlying fd.
        finally:
            # The response is streamed, so it has to be closed explicitly
            # (also when writing to disk fails half-way).
            r.close()
        return path

    def _download_single(self, uri):
        """Downloads one gs:// or http(s):// URI; returns the local path.

        The result is None if an HTTP download failed.
        """
        if uri.startswith('gs://'):
            return self._download_gcs(uri)
        return self._download_http(uri)

    def _download_azure(self, azure_url):
        """Downloads an Azure artifact (a zip) and extracts its contents.

        Extracted files whose names match */wpt_report*.json are appended to
        self.results and those matching */wpt_screenshot*.txt are appended to
        self.screenshots. Nothing happens if the download fails.
        """
        artifact = self._download_http(azure_url)
        if artifact is None:
            return
        with zipfile.ZipFile(artifact, mode='r') as z:
            for f in z.infolist():
                # ZipInfo.is_dir isn't available in Python 3.5.
                if f.filename.endswith('/'):
                    continue
                path = z.extract(f, path=self._temp_dir)
                # The two patterns require different suffixes, so at most
                # one of them can match a given file name.
                if re.match(r'^.*/wpt_report.*\.json$', f.filename):
                    self.results.append(path)
                elif re.match(r'^.*/wpt_screenshot.*\.txt$', f.filename):
                    self.screenshots.append(path)

    def download(self, results, screenshots, azure_url):
        """Downloads all necessary inputs.

        Args:
            results: A list of results URIs (gs:// or https?://).
            screenshots: A list of screenshots URIs (gs:// or https?://).
            azure_url: A HTTP URL to an Azure build artifact.
        """
        if azure_url:
            assert not results
            assert not screenshots
            self._download_azure(azure_url)
            return
        # Failed downloads come back as None and are dropped here.
        self.results = [
            p for p in (self._download_single(i) for i in results)
            if p is not None]
        self.screenshots = [
            p for p in (self._download_single(i) for i in screenshots)
            if p is not None]

    def load_report(self):
        """Loads and merges all downloaded results."""
        for r in self.results:
            self.report.load_file(r)

    def upload_raw(self):
        """Uploads the merged raw JSON report to GCS."""
        with tempfile.NamedTemporaryFile(
                suffix='.json.gz', dir=self._temp_dir) as temp:
            self.report.serialize_gzip(temp.name)
            gsutil.copy(temp.name, self.raw_results_gs_url, gzipped=True)

    def upload_split(self):
        """Uploads the individual results recursively to GCS."""
        self.report.populate_upload_directory(output_dir=self._upload_dir)

        # 1. Copy [ID]-summary_v2.json.gz
        # to gs://wptd/[SHA]/[ID]-summary_v2.json.gz.
        gsutil.copy(
            os.path.join(self._upload_dir, self.report.sha_summary_path),
            self.results_gs_url,
            gzipped=True)

        # 2. Copy the individual results recursively if there is any (i.e. if
        # the report is not empty).
        results_dir = os.path.join(
            self._upload_dir, self.report.sha_product_path)
        if os.path.exists(results_dir):
            # gs://wptd/[SHA] is guaranteed to exist after 1, so copying foo to
            # gs://wptd/[SHA] will create gs://wptd/[SHA]/foo according to
            # `gsutil cp --help`.
            gsutil.copy(
                results_dir,
                self.results_gs_url[:self.results_gs_url.rfind('/')],
                gzipped=True)

    def create_run(self, run_id, labels, uploader, callback_url=None):
        """Creates a TestRun record.

        Args:
            run_id: A string of pre-allocated run ID ('0' if unallocated).
            labels: A comma-separated string of extra labels.
            uploader: The name of the uploader.
            callback_url: URL of the test run creation API (optional).
        """
        self.test_run_id = wptreport.create_test_run(
            self.report,
            run_id,
            labels,
            uploader,
            self.auth,
            self.results_url,
            self.raw_results_url,
            callback_url)
        assert self.test_run_id

    def update_status(self, run_id, stage, error=None, callback_url=None):
        """Reports the processing stage of a pending run (best effort).

        Sends a PATCH request to /api/status/[run_id] on the host of
        callback_url. The payload carries the run ID and stage, the error
        (if given), and whichever of revision, product, browser_version, os
        and os_version are set in the report's run_info. Request failures
        are logged and otherwise ignored.

        Args:
            run_id: The pre-allocated run ID; nothing is sent if it is 0.
            stage: The new stage (must not be empty).
            error: An error message to attach (optional).
            callback_url: A URL on the target host; only its scheme and
                host are used. Defaults to config.project_baseurl().
        """
        assert stage, "stage cannot be empty"
        if int(run_id) == 0:
            _log.error('Cannot update run status: missing run_id')
            return
        if callback_url is None:
            callback_url = config.project_baseurl()
        parsed_url = urlparse(callback_url)
        api = '%s://%s/api/status/%s' % (parsed_url.scheme,
                                         parsed_url.netloc,
                                         run_id)
        payload = {'id': int(run_id), 'stage': stage}
        if error:
            payload['error'] = error
        if self.report.run_info.get('revision'):
            payload['full_revision_hash'] = self.report.run_info['revision']
        if self.report.run_info.get('product'):
            payload['browser_name'] = self.report.run_info['product']
        if self.report.run_info.get('browser_version'):
            payload['browser_version'] = \
                self.report.run_info['browser_version']
        if self.report.run_info.get('os'):
            payload['os_name'] = self.report.run_info['os']
        if self.report.run_info.get('os_version'):
            payload['os_version'] = self.report.run_info['os_version']
        try:
            # Without a timeout an unresponsive server would block the whole
            # task forever; a timeout is reported like any other failure.
            response = requests.patch(api, auth=self.auth, json=payload,
                                      timeout=self.TIMEOUT_WAIT)
            response.raise_for_status()
            _log.debug('Updated run %s to %s', run_id, stage)
        except requests.RequestException as e:
            _log.error('Cannot update status for run %s: %s', run_id, str(e))

    def run_hooks(self, tasks):
        """Runs post-new-run tasks.

        Args:
            tasks: A list of functions that take a single Processor argument.
        """
        for task in tasks:
            _log.info('Running post-new-run task: %s', task.__name__)
            try:
                task(self)
            except Exception:
                # Tasks are independent: print the failure and carry on
                # with the remaining ones.
                traceback.print_exc()


# ==== Beginning of tasks ====
# Tasks are supposed to be independent; exceptions are ignored (but logged).
# Each task is a function that takes a Processor.

def _upload_screenshots(processor):
    """Processes every downloaded screenshots file with WPTScreenshot."""
    for screenshot in processor.screenshots:
        with WPTScreenshot(screenshot, processor.report.run_info,
                           auth=processor.auth) as s:
            s.process()

# ==== End of tasks ====


def process_report(task_id, params):
    """Processes one results-upload task from start to finish.

    Downloads the inputs, merges and validates the report, uploads the raw
    and split results, creates the test run and finally runs the
    post-new-run tasks. The run status is updated along the way.

    Args:
        task_id: An identifier of the task, only used in the response text.
        params: The task parameters; must provide get() and getlist()
            (presumably a multi-dict of form fields -- confirm with the
            caller). 'uploader' is mandatory.

    Returns:
        A newline-separated summary of what was done, or an empty string
        when the task is dropped (no results downloaded, invalid report or
        duplicate run).
    """
    # Mandatory fields (will throw if key does not exist):
    uploader = params['uploader']
    # Optional fields:
    azure_url = params.get('azure_url')
    run_id = params.get('id', '0')
    callback_url = params.get('callback_url')
    labels = params.get('labels', '')
    # Repeatable fields
    results = params.getlist('results')
    screenshots = params.getlist('screenshots')

    response = []
    with Processor() as p:
        p.update_status(run_id, 'WPTFYI_PROCESSING', None, callback_url)
        if azure_url:
            _log.info("Downloading Azure results: %s", azure_url)
        else:
            _log.info("Downloading %d results & %d screenshots",
                      len(results), len(screenshots))
        p.download(results, screenshots, azure_url)
        if not p.results:
            _log.error("No results successfully downloaded")
            p.update_status(run_id, 'EMPTY', None, callback_url)
            return ''
        try:
            p.load_report()
            # To be deprecated once all reports have all the required metadata.
            p.report.update_metadata(
                revision=params.get('revision'),
                browser_name=params.get('browser_name'),
                browser_version=params.get('browser_version'),
                os_name=params.get('os_name'),
                os_version=params.get('os_version'),
            )
            p.report.finalize()
        except wptreport.WPTReportError:
            etype, e, tb = sys.exc_info()
            e.path = results
            # This will register an error in Stackdriver.
            traceback.print_exception(etype, e, tb)
            p.update_status(run_id, 'INVALID', str(e), callback_url)
            # The input is invalid and there is no point to retry, so we return
            # an empty (but successful) response to drop the task.
            return ''

        if p.check_existing_run():
            _log.warning(
                'Skipping the task because RawResultsURL already exists: %s',
                p.raw_results_url)
            p.update_status(run_id, 'DUPLICATE', None, callback_url)
            return ''
        response.append("{} results loaded from task {}".format(
            len(p.report.results), task_id))

        _log.info("Uploading merged raw report")
        p.upload_raw()
        response.append("raw_results_url: " + p.raw_results_url)

        _log.info("Uploading split results")
        p.upload_split()
        response.append("results_url: " + p.results_url)

        # Check again because the upload takes a long time.
        if p.check_existing_run():
            _log.warning(
                'Skipping the task because RawResultsURL already exists: %s',
                p.raw_results_url)
            p.update_status(run_id, 'DUPLICATE', None, callback_url)
            return ''

        p.create_run(run_id, labels, uploader, callback_url)
        response.append("run ID: {}".format(p.test_run_id))

        p.run_hooks([_upload_screenshots])

    return '\n'.join(response)