github.com/web-platform-tests/wpt.fyi@v0.0.0-20240530210107-70cf978996f1/results-processor/wptreport.py (about) 1 #!/usr/bin/env python3 2 3 # Copyright 2018 The WPT Dashboard Project. All rights reserved. 4 # Use of this source code is governed by a BSD-style license that can be 5 # found in the LICENSE file. 6 7 import argparse 8 import gzip 9 import hashlib 10 import io 11 import json 12 import logging 13 import os 14 import re 15 import tempfile 16 from datetime import datetime, timezone 17 from typing import ( 18 Any, 19 Callable, 20 Dict, 21 IO, 22 Iterator, 23 List, 24 Optional, 25 Set, 26 Union, 27 cast, 28 ) 29 30 import requests 31 from mypy_extensions import TypedDict 32 33 import config 34 35 DEFAULT_PROJECT = 'wptdashboard' 36 # These are the release channels understood by wpt.fyi. 37 RELEASE_CHANNEL_LABELS = frozenset({'stable', 'beta', 'experimental'}) 38 # Ignore inconsistent browser minor versions for now. 39 # TODO(Hexcles): Remove this when the TC decision task is implemented. 40 IGNORED_CONFLICTS = frozenset({'browser_build_id', 'browser_changeset', 41 'version', 'os_build'}) 42 43 # A map of abbreviations for test statuses. This will be used 44 # to convert test statuses to smaller formats to store in summary files. 45 # NOTE: If a new status abbreviation is added here, the mapping 46 # at webapp/views/wpt-results.js will also require the change. 
STATUS_ABBREVIATIONS = {
    "PASS": "P",
    "OK": "O",
    "FAIL": "F",
    "SKIP": "S",
    "ERROR": "E",
    "NOTRUN": "N",
    "CRASH": "C",
    "TIMEOUT": "T",
    "PRECONDITION_FAILED": "PF"
}

_log = logging.getLogger(__name__)


class RunInfo(TypedDict, total=False):
    """The 'run_info' metadata of a wptreport; all fields are optional."""
    product: str
    browser_version: str
    browser_channel: str
    revision: str
    os: str
    os_version: str


class RawWPTReport(TypedDict, total=False):
    """The top-level structure of a raw wptreport.json file."""
    results: List[Dict]
    run_info: RunInfo
    time_start: float
    time_end: float


class WPTReportError(Exception):
    """Base class for all input-related exceptions."""
    def __init__(self, message: str,
                 path: Optional[Union[str, List[str]]] = None) -> None:
        self.message = message
        self.path = path

    def __str__(self) -> str:
        message = self.message
        if self.path:
            message += " (%s)" % self.path
        return message


class InvalidJSONError(WPTReportError):
    """Raised when a report file is not valid JSON."""
    def __init__(self) -> None:
        super(InvalidJSONError, self).__init__("Invalid JSON")


class MissingMetadataError(WPTReportError):
    """Raised when a required 'run_info' field is absent."""
    def __init__(self, key: str) -> None:
        super(MissingMetadataError, self).__init__(
            "Missing required metadata '%s'" %
            (key,)
        )


class InsufficientDataError(WPTReportError):
    """Raised when a report has no 'results' field."""
    def __init__(self) -> None:
        super(InsufficientDataError, self).__init__("Missing 'results' field")


class ConflictingDataError(WPTReportError):
    """Raised when merged report chunks disagree on a value."""
    def __init__(self, key: str) -> None:
        super(ConflictingDataError, self).__init__(
            "Conflicting '%s' found in the merged report" % (key,)
        )


class BufferedHashsum(object):
    """A simple buffered hash calculator."""

    def __init__(self,
                 hash_ctor: Callable = hashlib.sha1,
                 block_size: int = 1024*1024) -> None:
        """Initializes the calculator.

        Args:
            hash_ctor: A hashlib constructor (defaults to SHA-1).
            block_size: The read-buffer size in bytes (must be positive).
        """
        assert block_size > 0
        self._hash = hash_ctor()
        self._block_size = block_size

    def hash_file(self, fileobj: IO[bytes]) -> None:
        """Updates the hashsum from a given file.

        Calling this method on multiple files is equivalent to computing the
        hash of all the files concatenated together.

        The accumulated hexadecimal digest is available via hashsum(); this
        method itself returns nothing.

        Args:
            fileobj: A file object to hash (must be in binary mode).
        """
        assert not isinstance(fileobj, io.TextIOBase)
        # Read in fixed-size chunks so arbitrarily large files can be
        # hashed without loading them fully into memory.
        buf = fileobj.read(self._block_size)
        while len(buf) > 0:
            self._hash.update(buf)
            buf = fileobj.read(self._block_size)

    def hashsum(self) -> str:
        """Returns the hexadecimal digest of the current hash."""
        return cast(str, self._hash.hexdigest())


class WPTReport(object):
    """An abstraction of wptreport.json with some transformation features."""

    def __init__(self) -> None:
        # Running hash of all raw (decompressed) bytes loaded so far.
        self._hash = BufferedHashsum()
        self._report: RawWPTReport = {
            'results': [],
            'run_info': {},
        }
        # Lazily-built summary cache; see summarize().
        self._summary: Dict[str, Dict[str, Any]] = {}

    def _add_chunk(self, chunk: RawWPTReport) -> None:
        """Merges one loaded report chunk into the in-memory report.

        Results are concatenated; 'run_info' fields are merged with conflict
        detection (conflicting non-ignored fields raise ConflictingDataError);
        'time_start'/'time_end' are merged by taking min/max respectively.
        """
        self._report['results'].extend(chunk['results'])

        def update_property(key: str, source: Dict, target: Dict,
                            conflict_func: Optional[Callable] = None) -> bool:
            """Updates target[key] if source[key] is set.

            If target[key] is already set and different from source[key], we
            have a conflict:
            * If conflict_func is None, a ConflictingDataError is raised.
            * If conflict_func is not None, target[key] =
              conflict_func(target[key], source[key]), and True is returned.

            Returns: False if there is no conflict.
            """
            if key not in source:
                return False
            if key in target and source[key] != target[key]:
                if conflict_func:
                    target[key] = conflict_func(source[key], target[key])
                    return True
                raise ConflictingDataError(key)
            target[key] = source[key]
            return False

        if 'run_info' in chunk:
            conflicts = []
            for key in chunk['run_info']:
                source = cast(Dict, chunk['run_info'])
                target = cast(Dict, self._report['run_info'])

                # We clear the target value as part of update_property;
                # record it here to be used in the conflict report if needed.
                target_value = target[key] if key in target else ""

                conflict = update_property(
                    key, source, target,
                    lambda _1, _2: None,  # Set conflicting fields to None.
                )
                # Delay raising exceptions even when conflicts are not ignored,
                # so that we can set as much metadata as possible.
                if conflict and key not in IGNORED_CONFLICTS:
                    conflicts.append(
                        "%s: [%s, %s]" % (key, source[key], target_value))
            if conflicts:
                raise ConflictingDataError(', '.join(conflicts))

        # Earliest start and latest end across all merged chunks.
        update_property(
            'time_start', cast(Dict, chunk), cast(Dict, self._report), min)
        update_property(
            'time_end', cast(Dict, chunk), cast(Dict, self._report), max)

    def load_file(self, filename: str) -> None:
        """Loads wptreport from a local path.

        Args:
            filename: Filename of the wptreport (the file can be gzipped if
                the extension is ".gz").
        """
        with open(filename, mode='rb') as f:
            if filename.endswith('.gz'):
                self.load_gzip_json(f)
            else:
                self.load_json(f)

    def load_json(self, fileobj: IO[bytes]) -> None:
        """Loads wptreport from a JSON file.

        This method can be called multiple times to load and merge new chunks.

        Args:
            fileobj: A JSON file object (must be in binary mode).

        Raises:
            InvalidJSONError if the file is not valid JSON;
            InsufficientDataError if the file does not contain a results field;
            ConflictingDataError if the current file contains information
            conflicting with existing data (from previous files).
        """
        assert not isinstance(fileobj, io.TextIOBase)
        # Hashing consumes the stream; rewind before decoding JSON below.
        self._hash.hash_file(fileobj)
        fileobj.seek(0)

        # JSON files are always encoded in UTF-8 (RFC 8259).
        with io.TextIOWrapper(fileobj, encoding='utf-8') as text_file:
            try:
                report = json.load(text_file, strict=False)
            except json.JSONDecodeError as e:
                raise InvalidJSONError from e
            # Raise when the 'results' field is not found (an empty results
            # list is accepted as-is).
            if 'results' not in report:
                raise InsufficientDataError
            self._add_chunk(report)

    def load_gzip_json(self, fileobj: IO[bytes]) -> None:
        """Loads wptreport from a gzipped JSON file.

        Args:
            fileobj: A gzip file object.
        """
        # Gzip is always opened in binary mode (in fact, r == rb for gzip).
        with gzip.GzipFile(fileobj=fileobj, mode='rb') as gzip_file:
            self.load_json(cast(IO[bytes], gzip_file))

    def update_metadata(self, revision: str = '',
                        browser_name: str = '', browser_version: str = '',
                        os_name: str = '', os_version: str = '') -> None:
        """Overwrites metadata of the report.

        Only non-empty arguments overwrite the corresponding 'run_info'
        fields; empty-string arguments leave existing values untouched.
        """
        # Unfortunately, the names of the keys don't exactly match.
        if revision:
            self._report['run_info']['revision'] = revision
        if browser_name:
            self._report['run_info']['product'] = browser_name
        if browser_version:
            self._report['run_info']['browser_version'] = browser_version
        if os_name:
            self._report['run_info']['os'] = os_name
        if os_version:
            self._report['run_info']['os_version'] = os_version

    @staticmethod
    def write_json(fileobj: IO[bytes], payload: Any) -> None:
        """Encodes an object to JSON and writes it to the given file.

        Args:
            fileobj: A file object to write to (text or binary mode).
            payload: An object that can be JSON encoded.
        """
        # json.dump only produces ASCII characters by default.
        if isinstance(fileobj, io.TextIOBase):
            json.dump(payload, fileobj)
        else:
            with io.TextIOWrapper(fileobj, encoding='ascii') as text_file:
                json.dump(payload, text_file)

    @staticmethod
    def write_gzip_json(filepath: str, payload: Any) -> None:
        """Encodes an object to JSON and writes it, gzipped, to disk.

        Args:
            filepath: A file path to write to. All intermediate directories
                in the path will be automatically created.
            payload: An object that can be JSON encoded.
        """
        if os.path.dirname(filepath):
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'wb') as f:
            with gzip.GzipFile(fileobj=f, mode='wb') as gz:
                WPTReport.write_json(cast(IO[bytes], gz), payload)

    @property
    def results(self) -> List[Dict]:
        """The 'results' field of the report."""
        return self._report['results']

    @property
    def run_info(self) -> RunInfo:
        """The 'run_info' field of the report."""
        return self._report['run_info']

    def hashsum(self) -> str:
        """Hex checksum of the decompressed, concatenated report."""
        return self._hash.hashsum()

    def summarize(self) -> Dict[str, Dict[str, Any]]:
        """Creates a summary of all the test results.

        The summary will be cached after the first call to this method.

        Each entry maps a test file to {'s': abbreviated harness status,
        'c': [number of passing subtests, total number of subtests]}.

        Returns:
            A summary dictionary.

        Raises:
            ConflictingDataError if a test appears multiple times in results.
        """
        if self._summary:
            return self._summary

        for result in self.results:
            test_file = result['test'].strip()

            if test_file in self._summary:
                raise ConflictingDataError(test_file)

            # Abbreviate the status to store in the summary file.
            status = STATUS_ABBREVIATIONS.get(result['status'],
                                              result['status'])
            self._summary[test_file] = {'s': status, 'c': [0, 0]}

            for subtest in result['subtests']:
                if subtest['status'] == 'PASS':
                    self._summary[test_file]['c'][0] += 1
                self._summary[test_file]['c'][1] += 1
        return self._summary

    def each_result(self) -> Iterator[Any]:
        """Iterates over all the individual test results.

        Returns:
            A generator.
        """
        return (result for result in self.results)

    def write_summary(self, filepath: str) -> None:
        """Writes the summary JSON file to disk.

        Args:
            filepath: A file path to write to.
        """
        self.write_gzip_json(filepath, self.summarize())

    def write_result_directory(self, directory: str) -> None:
        """Writes individual test results to a directory.

        Each result is written gzipped to directory + test path (test paths
        always begin with '/').

        Args:
            directory: The base directory to write to.
        """
        if directory.endswith('/'):
            directory = directory[:-1]
        for result in self.each_result():
            test_file = result['test'].strip()
            assert test_file.startswith('/')
            filepath = directory + test_file
            self.write_gzip_json(filepath, result)

    def product_id(self, separator: str = '-', sanitize: bool = False) -> str:
        """Returns an ID string for the product configuration.

        The ID is product, browser_version, os, optional os_version and the
        first 10 hex digits of the report checksum, joined by `separator`.

        Args:
            separator: A character to separate fields in the ID string.
            sanitize: Whether to sanitize (replace them with underscores)
                characters in the product ID that are not URL-safe.

        Returns:
            A string, the product ID of this run.
        """
        name = separator.join([self.run_info['product'],
                               self.run_info['browser_version'],
                               self.run_info['os']])
        # os_version isn't required.
        if self.run_info.get('os_version'):
            name += separator + self.run_info['os_version']
        hashsum = self.hashsum()
        assert len(hashsum) > 0, 'Missing hashsum of the report'
        name += separator + hashsum[:10]

        if sanitize:
            name = re.sub('[^A-Za-z0-9._-]', '_', name)

        return name

    def populate_upload_directory(self,
                                  output_dir: Optional[str] = None) -> str:
        """Populates a directory suitable for uploading to GCS.

        The directory structure is as follows:
        [output_dir]:
            - [sha][:10]:
                - [product]-summary_v2.json.gz
                - [product]:
                    - (per-test results produced by write_result_directory)

        Args:
            output_dir: A given output directory instead of a temporary one.

        Returns:
            The output directory.
        """
        if not output_dir:
            output_dir = tempfile.mkdtemp()

        self.write_summary(os.path.join(output_dir, self.sha_summary_path))
        self.write_result_directory(
            os.path.join(output_dir, self.sha_product_path))
        return output_dir

    @property
    def sha_product_path(self) -> str:
        """A relative path: sha/product_id

        Raises:
            MissingMetadataError if a required 'run_info' field is missing.
        """
        try:
            return os.path.join(self.run_info['revision'],
                                self.product_id(separator='-', sanitize=True))
        except KeyError as e:
            # str(e) gives the name of the key.
            raise MissingMetadataError(str(e)) from e

    @property
    def sha_summary_path(self) -> str:
        """A relative path: sha/product_id-summary_v2.json.gz"""
        return self.sha_product_path + '-summary_v2.json.gz'

    @property
    def test_run_metadata(self) -> Dict[str, str]:
        """Returns a dict of metadata.

        The dict can be used as the payload for the test run creation API.

        Raises:
            MissingMetadataError if any required metadata is missing.
        """
        # Required fields:
        try:
            payload = {
                'browser_name': self.run_info['product'],
                'browser_version': self.run_info['browser_version'],
                'os_name': self.run_info['os'],
                'revision': self.run_info['revision'][:10],
                'full_revision_hash': self.run_info['revision'],
            }
        except KeyError as e:
            # str(e) gives the name of the key.
            raise MissingMetadataError(str(e)) from e

        # Optional fields:
        if self.run_info.get('os_version'):
            payload['os_version'] = self.run_info['os_version']

        # NOTE(review): despite its name, this treats the input as
        # MILLISECONDS since the epoch (hence the /1000 to get seconds),
        # matching the ms_since_epoch parameter name — confirm against the
        # producers of time_start/time_end.
        def microseconds_to_iso(ms_since_epoch: float) -> str:
            dt = datetime.fromtimestamp(ms_since_epoch / 1000, timezone.utc)
            return dt.isoformat()

        if self._report.get('time_start'):
            payload['time_start'] = microseconds_to_iso(
                self._report['time_start'])
        if self._report.get('time_end'):
            payload['time_end'] = microseconds_to_iso(
                self._report['time_end'])

        return payload

    def normalize_version(self) -> None:
        """Rewrites 'Technology Preview (Release N, ...)' versions to
        'N preview' (e.g. for Safari Technology Preview)."""
        m = re.match(r'Technology Preview \(Release (\d+), (.*)\)',
                     self.run_info.get('browser_version', ''))
        if m:
            self.run_info['browser_version'] = m.group(1) + ' preview'

    def finalize(self) -> None:
        """Checks and finalizes the report.

        Populates all in-memory states (summary & metadata) and raises
        exceptions if any check fails.

        Raises:
            Exceptions inherited from WPTReportError.
        """
        self.summarize()
        # Additional final fixup:
        self.normalize_version()
        # Access two property methods which will raise exceptions if any
        # required field is missing.
        self.sha_product_path
        self.test_run_metadata

    def serialize_gzip(self, filepath: str) -> None:
        """Serializes and gzips the in-memory report to a file.

        Args:
            filepath: A file path to write to.
        """
        self.write_gzip_json(filepath, self._report)


def _channel_to_labels(browser: str, channel: str) -> Set[str]:
    """Maps a browser-specific channel to labels.

    The original channel is always preserved as a label. In addition,
    well-known aliases of browser-specific channels are added.

    This aligns channels to RELEASE_CHANNEL_LABELS so that different browsers
    can be compared meaningfully on wpt.fyi. A few other aliases are added for
    convenience.
    """
    labels = {channel}
    if channel == 'preview':
        # e.g. Safari Technology Preview.
        labels.add('experimental')
    elif channel == 'dev' and browser != 'chrome':
        # e.g. Edge Dev.
        labels.add('experimental')
    elif channel == 'canary' and browser == 'chrome':
        # We only label Chrome Canary as experimental to avoid confusion
        # with Chrome Dev.
        labels.add('experimental')
    elif channel == 'canary' and browser == 'deno':
        # Deno Canary is the experimental channel.
        labels.add('experimental')
    elif channel == 'nightly' and browser != 'chrome':
        # Notably, we don't want to treat Chrome Nightly (Chromium trunk) as
        # experimental, as it would cause confusion with Chrome Canary and Dev.
        labels.add('experimental')

    if channel == 'release':
        # e.g. Edge release
        labels.add('stable')
    if (channel == 'canary' and
            (browser == 'edgechromium' or browser == 'edge')):
        # Edge Canary is almost nightly.
        labels.add('nightly')

    # TODO(DanielRyanSmith): Figure out how we'd like to handle Edge Canary.
    # https://github.com/web-platform-tests/wpt.fyi/issues/1635
    return labels


def prepare_labels(report: WPTReport,
                   labels_str: str,
                   uploader: str) -> Set[str]:
    """Prepares the list of labels for a test run.

    The following labels will be automatically added:
    * The name of the uploader
    * The name of the browser
    * The release channel of the browser (if the uploader doesn't provide one)

    Args:
        report: A WPTReport.
        labels_str: A comma-separated string of labels from the uploader.
        uploader: The name of the uploader.

    Returns:
        A set of strings.
    """
    browser = report.run_info['product']
    # browser_channel is an optional field.
    channel = report.run_info.get('browser_channel')
    labels = set()
    labels.add(browser)
    labels.add(uploader)
    # Empty labels may be generated here, but they will be removed later.
    for label in labels_str.split(','):
        labels.add(label.strip())

    # Add the release channel label.
    if channel:
        labels |= _channel_to_labels(browser, channel)
    elif not (labels & RELEASE_CHANNEL_LABELS):
        # Default to "stable" if no channel label or browser_channel is present
        # TODO(Hexcles): remove this fallback default eventually.
        _log.warning('Test run does not have browser_channel or any channel '
                     'label, assumed stable.')
        labels.add('stable')

    # Remove any empty labels.
    if '' in labels:
        labels.remove('')
    return labels


def normalize_product(report: WPTReport) -> Set[str]:
    """Normalizes the product identifier in the report.

    In addition to modifying the 'product' of the report, this function also
    returns a set of labels that need to be added.

    Args:
        report: A WPTReport

    Returns:
        A set of strings.
    """
    product = report.run_info['product']
    if product == 'edgechromium' or product == 'edge':
        report.run_info['product'] = 'edge'
        return {'edge', 'edgechromium'}
    elif product == 'webkitgtk_minibrowser':
        report.run_info['product'] = 'webkitgtk'
        return {'webkitgtk', 'minibrowser'}
    else:
        return set()


def create_test_run(report: WPTReport, run_id, labels_str: str,
                    uploader: str, auth,
                    results_url: str, raw_results_url: str,
                    callback_url: Optional[str] = None) -> int:
    """Creates a TestRun on the dashboard.

    By posting to the /api/results/create endpoint.

    Args:
        report: A WPTReport.
        run_id: The pre-allocated Datastore ID for this run.
        labels_str: A comma-separated string of labels from the uploader.
        uploader: The name of the uploader.
        auth: A (username, password) tuple for HTTP basic auth.
        results_url: URL of the gzipped summary file. (e.g.
            'https://.../wptd/0123456789/chrome-62.0-linux-summary_v2.json.gz')
        raw_results_url: URL of the raw full report. (e.g.
            'https://.../wptd-results/[FullSHA]/chrome-62.0-linux/report.json')
        callback_url: An alternative API endpoint to POST to; defaults to
            /api/results/create on the configured project base URL.

    Returns:
        The integral ID associated with the created test run.
    """
    if callback_url is None:
        callback_url = config.project_baseurl() + '/api/results/create'
    _log.info('Creating run %s from %s using %s',
              run_id, uploader, callback_url)

    labels = prepare_labels(report, labels_str, uploader)
    assert len(labels) > 0

    labels |= normalize_product(report)

    payload = report.test_run_metadata
    # A run_id of 0 means "let the server allocate an ID".
    if int(run_id) != 0:
        payload['id'] = int(run_id)
    payload['results_url'] = results_url
    payload['raw_results_url'] = raw_results_url
    payload['labels'] = sorted(labels)

    response = requests.post(callback_url, auth=auth, json=payload)
    response.raise_for_status()
    response_data = response.json()
    return response_data['id']


def main() -> None:
    """CLI entry point: parses wptreport files and optionally writes a
    summary and/or a GCS-style upload directory."""
    parser = argparse.ArgumentParser(
        description='Parse and transform JSON wptreport.')
    parser.add_argument('report', metavar='REPORT', type=str, nargs='+',
                        help='path to a JSON wptreport (gzipped files are '
                        'supported as long as the extension is .gz)')
    parser.add_argument('--summary', type=str,
                        help='if specified, write a gzipped JSON summary to '
                        'this file path')
    parser.add_argument('--output-dir', type=str,
                        help='if specified, write both the summary and '
                        'per-test results (all gzipped) to OUTPUT_DIR/SHA/ ,'
                        'suitable for uploading to GCS (please use an '
                        'empty directory)')
    args = parser.parse_args()

    report = WPTReport()
    for r in args.report:
        with open(r, 'rb') as f:
            if r.endswith('.gz'):
                report.load_gzip_json(f)
            else:
                report.load_json(f)

    if args.summary:
        report.write_summary(args.summary)
    if args.output_dir:
        upload_dir = report.populate_upload_directory(
            output_dir=args.output_dir)
        _log.info('Populated: %s', upload_dir)


if __name__ == '__main__':
    _log.setLevel(logging.INFO)
    main()