github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/pandas_docs_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

    17  """A module for running the pandas docs (such as the users guide) against our
    18  dataframe implementation.
    19  
    20  Run as python -m apache_beam.dataframe.pandas_docs_test [getting_started ...]
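
For example, to run only the indexing and groupby pages with four parallel
workers (any substring of an .rst path under the pandas docs tree works as a
filter):

  python -m apache_beam.dataframe.pandas_docs_test -p 4 indexing groupby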
    21  """
    22  
import argparse
import contextlib
import io
import multiprocessing.pool
import os
import sys
import time
import urllib.request
import zipfile

from apache_beam.dataframe import doctests

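# The pandas source tree is cached under ~/.apache_beam/cache so that repeated
# runs reuse the already-downloaded release instead of fetching it again.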
PANDAS_VERSION = '1.1.1'
PANDAS_DIR = os.path.expanduser('~/.apache_beam/cache/pandas-' + PANDAS_VERSION)
PANDAS_DOCS_SOURCE = os.path.join(PANDAS_DIR, 'doc', 'source')


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p',
      '--parallel',
      type=int,
      default=0,
      help='Number of tests to run in parallel. '
      'Defaults to 0, meaning the number of cores on the machine.')
  parser.add_argument('docs', nargs='*')
  args = parser.parse_args()

  if not os.path.exists(PANDAS_DIR):
    # Download the pandas source.
    os.makedirs(os.path.dirname(PANDAS_DIR), exist_ok=True)
    zip_path = PANDAS_DIR + '.zip'
    if not os.path.exists(zip_path):
      url = (
          'https://github.com/pandas-dev/pandas/archive/v%s.zip' %
          PANDAS_VERSION)
      print('Downloading', url)
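      # Write to a temporary file and rename it only once the download has
      # completed, so an interrupted run doesn't leave behind a truncated
      # archive that would be mistaken for a valid one.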
      with urllib.request.urlopen(url) as fin:
        with open(zip_path + '.tmp', 'wb') as fout:
          fout.write(fin.read())
        os.rename(zip_path + '.tmp', zip_path)

    print('Extracting', zip_path)
    with zipfile.ZipFile(zip_path, 'r') as handle:
      handle.extractall(os.path.dirname(PANDAS_DIR))

  tests = args.docs or ['getting_started', 'user_guide']
  paths = []
  filters = []

  # Explicit paths.
  for test in tests:
    if os.path.exists(test):
      paths.append(test)
    else:
      filters.append(test)

  # Treat the remaining arguments as substring filters on the paths of the
  # pandas doc source files.
  for root, _, files in os.walk(PANDAS_DOCS_SOURCE):
    for name in files:
      if name.endswith('.rst'):
        path = os.path.join(root, name)
        if any(filter in path for filter in filters):
          paths.append(path)

  # Cap the worker count at the number of files to be tested.
  parallelism = min(args.parallel or multiprocessing.cpu_count(), len(paths))

  if parallelism > 1:
    pool_map = multiprocessing.pool.Pool(parallelism).imap_unordered
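    # imap_unordered yields each result as soon as its worker finishes, so
    # the progress counter below advances as tests complete rather than in
    # submission order.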
    run_tests = run_tests_capturing_stdout
    # Make sure the slow tests get started first: file size is a cheap proxy
    # for doctest runtime, and the enhancingperf page is known to be slow.
    paths.sort(
        key=lambda path: ('enhancingperf' in path, os.path.getsize(path)),
        reverse=True)
  else:
    pool_map = map
    run_tests = run_tests_streaming_stdout

  # Now run all the tests, accumulating a summary across files.
  running_summary = doctests.Summary()
  for count, (summary, stdout) in enumerate(pool_map(run_tests, paths), 1):
    running_summary += summary
    if stdout:
      print(stdout)
    print(count, '/', len(paths), 'done.')

  print('*' * 72)
  print('Final summary:')
  running_summary.summarize()


def run_tests_capturing_stdout(path):
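  """Runs the doctests for one file, returning (summary, captured stdout).

  Used for parallel runs, where letting workers write directly to stdout
  would interleave their output and make failures painful to debug.
  """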
  with deferred_stdout() as stdout:
    return run_tests(path), stdout()


def run_tests_streaming_stdout(path):
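  """Runs the doctests for one file, streaming output directly to stdout.

  Used when there is no parallelism, so interleaving is not a concern.
  """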
  return run_tests(path), None


def run_tests(path):
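  """Runs the pandas doctests in a single .rst file against our dataframes."""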
  start = time.time()
  with open(path) as f:
    rst = f.read()
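  # wont_implement_ok and not_implemented_ok accept everything ('*') so that
  # examples exercising operations Beam's deferred dataframes will never
  # support, or do not support yet, count as skips rather than failures.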
  res = doctests.test_rst_ipython(
      rst,
      path,
      report=True,
      wont_implement_ok=['*'],
      not_implemented_ok=['*'],
      use_beam=False).summary
  print('Total time for {}: {:.2f} secs'.format(path, time.time() - start))
  return res


@contextlib.contextmanager
def deferred_stdout():
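  """Redirects stdout into a buffer, yielding a function that returns the
  text captured so far."""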
  captured = io.StringIO()
  old_stdout, sys.stdout = sys.stdout, captured
  try:
    yield captured.getvalue
  finally:
    # Restore stdout even if the body raises, so a failing test doesn't
    # silence everything printed afterwards.
    sys.stdout = old_stdout


if __name__ == '__main__':
  main()