github.com/apache/beam/sdks/v2@v2.48.2/python/apache_beam/dataframe/pandas_docs_test.py

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A module for running the pandas docs (such as the users guide) against our
dataframe implementation.

Run as python -m apache_beam.dataframe.pandas_docs_test [getting_started ...]
"""
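# Example invocations, for reference. The doc-set filters and the "-p 4" value
# are illustrative choices; the flags and the default doc sets
# ('getting_started', 'user_guide') are defined below in main().
#
#   python -m apache_beam.dataframe.pandas_docs_test
#   python -m apache_beam.dataframe.pandas_docs_test user_guide -p 4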
import argparse
import contextlib
import io
import multiprocessing
import os
import sys
import time
import urllib.request
import zipfile

from apache_beam.dataframe import doctests

PANDAS_VERSION = '1.1.1'
PANDAS_DIR = os.path.expanduser(
    "~/.apache_beam/cache/pandas-" + PANDAS_VERSION)
PANDAS_DOCS_SOURCE = os.path.join(PANDAS_DIR, 'doc', 'source')


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '-p',
      '--parallel',
      type=int,
      default=0,
      help='Number of tests to run in parallel. '
      'Defaults to 0, meaning the number of cores on the machine.')
  parser.add_argument('docs', nargs='*')
  args = parser.parse_args()

  if not os.path.exists(PANDAS_DIR):
    # Download the pandas source.
    os.makedirs(os.path.dirname(PANDAS_DIR), exist_ok=True)
    zip = os.path.join(PANDAS_DIR + '.zip')
    if not os.path.exists(zip):
      url = (
          'https://github.com/pandas-dev/pandas/archive/v%s.zip' %
          PANDAS_VERSION)
      print('Downloading', url)
      with urllib.request.urlopen(url) as fin:
        with open(zip + '.tmp', 'wb') as fout:
          fout.write(fin.read())
      os.rename(zip + '.tmp', zip)

    print('Extracting', zip)
    with zipfile.ZipFile(zip, 'r') as handle:
      handle.extractall(os.path.dirname(PANDAS_DIR))

  tests = args.docs or ['getting_started', 'user_guide']
  paths = []
  filters = []

  # Explicit paths.
  for test in tests:
    if os.path.exists(test):
      paths.append(test)
    else:
      filters.append(test)

  # Names of pandas source files.
  for root, _, files in os.walk(PANDAS_DOCS_SOURCE):
    for name in files:
      if name.endswith('.rst'):
        path = os.path.join(root, name)
        if any(filter in path for filter in filters):
          paths.append(path)

  # Using a global here is a bit hacky, but avoids pickling issues when used
  # with multiprocessing.
  parallelism = max(args.parallel or multiprocessing.cpu_count(), len(paths))

  if parallelism > 1:
    pool_map = multiprocessing.pool.Pool(parallelism).imap_unordered
    run_tests = run_tests_capturing_stdout
    # Make sure slow tests get started first.
    paths.sort(
        key=lambda path: ('enhancingperf' in path, os.path.getsize(path)),
        reverse=True)
  else:
    pool_map = map
    run_tests = run_tests_streaming_stdout

  # Now run all the tests.
  running_summary = doctests.Summary()
  for count, (summary, stdout) in enumerate(pool_map(run_tests, paths)):
    running_summary += summary
    if stdout:
      print(stdout)
    print(count, '/', len(paths), 'done.')

  print('*' * 72)
  print("Final summary:")
  running_summary.summarize()


def run_tests_capturing_stdout(path):
  with deferred_stdout() as stdout:
    return run_tests(path), stdout()


def run_tests_streaming_stdout(path):
  return run_tests(path), None


def run_tests(path):
  # Optionally capture the stdout as interleaved test errors are painful
  # to debug. On the other hand, if there is no parallelism, let the
  # output be streamed.
  start = time.time()
  with open(path) as f:
    rst = f.read()
  res = doctests.test_rst_ipython(
      rst,
      path,
      report=True,
      wont_implement_ok=['*'],
      not_implemented_ok=['*'],
      use_beam=False).summary
  print("Total time for {}: {:.2f} secs".format(path, time.time() - start))
  return res


@contextlib.contextmanager
def deferred_stdout():
  captured = io.StringIO()
  old_stdout, sys.stdout = sys.stdout, captured
  yield captured.getvalue
  sys.stdout = old_stdout


if __name__ == '__main__':
  main()