#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
A script to pull licenses/notices/source code for Java dependencies.
It generates a CSV file with [dependency_name, url_to_license, license_type, source_included]
"""

import argparse
import csv
import json
import logging
import os
import shutil
import threading
import traceback
import yaml

from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing.pool import ThreadPool
from queue import Queue
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_fixed
from urllib.request import urlopen, Request, URLError, HTTPError

SOURCE_CODE_REQUIRED_LICENSES = ['lgpl', 'gpl', 'cddl', 'mpl', 'gnu', 'mozilla public license']
RETRY_NUM = 9
THREADS = 16


def _record_failed_dep(no_list, dep):
  """Appends `dep` to `no_list` under the shared lock.

  Does nothing when `no_list` is None, i.e. when the caller does not track
  failures for this download.
  """
  if no_list is None:
    return
  with thread_lock:
    no_list.append(dep)


@retry(reraise=True,
       wait=wait_fixed(5),
       stop=stop_after_attempt(RETRY_NUM))
def pull_from_url(file_name, url, dep, no_list=None):
  """Downloads `url` into the local file `file_name`.

  The function is wrapped by tenacity's `retry`: any exception that escapes
  triggers another attempt (fixed wait of 5 between attempts) until RETRY_NUM
  attempts were made. On the last attempt a download failure is not raised;
  instead `dep` is recorded in `no_list` and the function returns.

  Args:
    file_name: path of the file to write the downloaded content to.
    url: location to download from. The literal value 'skip' makes the call a
      no-op. A URL starting with 'file://{}' has the placeholder replaced by
      the module-level `manual_license_path`.
    dep: dependency identifier, used in log messages and recorded in
      `no_list` on failure.
    no_list: optional list collecting dependencies whose download failed
      after all retries. When None, failures are only logged.

  Raises:
    Exception: whatever the download raised, on every attempt but the last
      one, so that tenacity schedules the next attempt.
  """
  if url == 'skip':
    return

  # Replace file path with absolute path to manual licenses
  if url.startswith('file://{}'):
    url = url.format(manual_license_path)
    logging.info('Replaced local file URL with {url} for {dep}'.format(url=url, dep=dep))

  # Take into account opensource.org changes that cause 404 on licenses
  if 'opensource.org' in url and url.endswith('-license.php'):
    url = url.replace('-license.php', '')

  try:
    request = Request(url, headers={
        'User-Agent': 'Apache Beam',
        # MPL license fails to resolve redirects without this header
        # see https://github.com/apache/beam/issues/22394
        'accept-language': 'en-US,en;q=0.9',
    })
    # Both the response and the output file are closed on exit, also on error.
    with urlopen(request) as url_read, open(file_name, 'wb') as temp_write:
      shutil.copyfileobj(url_read, temp_write)
    logging.debug(
        'Successfully pulled {file_name} from {url} for {dep}'.format(
            url=url, file_name=file_name, dep=dep))
  # HTTPError is a subclass of URLError, so it has to be handled first;
  # otherwise the URLError clause would catch it and this branch would be
  # unreachable.
  except HTTPError as e:
    traceback.print_exc()
    if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
      logging.info(
          'Received {code} from {url} for {dep}. Retrying...'.format(
              code=e.code, url=url, dep=dep))
      raise
    else:
      logging.error(
          'Received {code} from {url} for {dep} after {n} retries.'.
          format(code=e.code, url=url, dep=dep, n=RETRY_NUM))
      _record_failed_dep(no_list, dep)
      return
  except URLError as e:
    traceback.print_exc()
    if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
      logging.error('Invalid url for {dep}: {url}. Retrying...'.format(
          url=url, dep=dep))
      raise
    else:
      logging.error(
          'Invalid url for {dep}: {url} after {n} retries.'.format(
              url=url, dep=dep, n=RETRY_NUM))
      _record_failed_dep(no_list, dep)
      return
  except Exception as e:
    traceback.print_exc()
    if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
      logging.error(
          'Error occurred when pull {file_name} from {url} for {dep}. Retrying...'
          .format(url=url, file_name=file_name, dep=dep))
      raise
    else:
      logging.error(
          'Error occurred when pull {file_name} from {url} for {dep} after {n} retries.'
          .format(url=url, file_name=file_name, dep=dep, n=RETRY_NUM))
      _record_failed_dep(no_list, dep)
      return


def pull_source_code(base_url, dir_name, dep):
  """Downloads every sources jar linked from the page at `base_url`.

  Args:
    base_url: URL of a directory listing page, for example
      https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/
    dir_name: local directory the jars are written into.
    dep: dependency identifier, passed on to `pull_from_url`, which records
      it in the module-level `incorrect_source_url` list on failure.

  Raises:
    RuntimeError: if the page links to no sources jar.
    Exception: whatever reading or parsing `base_url` raised.
  """
  try:
    with urlopen(base_url) as response:
      soup = BeautifulSoup(response.read(), "html.parser")
  except Exception:
    logging.error('Error reading source base from {base_url}'.format(base_url=base_url))
    raise
  source_count = 0
  for href in (a["href"] for a in soup.select("a[href]")):
    # download sources jar file only
    if href.endswith('.jar') and 'sources.jar' in href:
      file_name = dir_name + '/' + href
      url = base_url + '/' + href
      logging.debug('Pulling source from {url}'.format(url=url))
      pull_from_url(file_name, url, dep, incorrect_source_url)
      source_count += 1
  if source_count == 0:
    raise RuntimeError('No source found at {base_url}'.format(base_url=base_url))


@retry(reraise=True, stop=stop_after_attempt(3))
def write_to_csv(csv_list):
  """Writes the collected dependency rows to beam_java_dependency_list.csv.

  The file is created in the module-level `output_dir`. The write is retried
  by tenacity (up to 3 attempts) and the last error is re-raised.

  Args:
    csv_list: iterable of dicts with the keys 'dependency_name',
      'url_to_license', 'license_type' and 'source_included'.
  """
  csv_columns = [
      'dependency_name', 'url_to_license', 'license_type', 'source_included'
  ]
  csv_file = "{output_dir}/beam_java_dependency_list.csv".format(
      output_dir=output_dir)
  try:
    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files passed to a writer.
    with open(csv_file, 'w', newline='') as csvfile:
      writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
      writer.writeheader()
      for data in csv_list:
        writer.writerow(data)
  except Exception:
    traceback.print_exc()
    raise


def execute(dep):
  """Pulls license, notice and (when required) sources for one dependency.

  Results are reported through the module-level lists `csv_list`,
  `no_licenses` and `no_license_type`, all guarded by `thread_lock`.

  An example of dep.
  {
      "moduleName": "antlr:antlr",
      "moduleUrl": "http://www.antlr.org/",
      "moduleVersion": "2.7.7",
      "moduleLicense": "BSD License",
      "moduleLicenseUrl": "http://www.antlr.org/license.html"
  }

  Args:
    dep: dict describing the dependency, shaped like the example above.
  """

  name = dep['moduleName'].split(':')[1]
  version = dep['moduleVersion']
  name_version = name + '-' + version
  # javac is not a runtime dependency
  if name == 'javac':
    logging.debug('Skipping %s', name_version)
    return
  # skip self dependencies
  if dep['moduleName'].lower().startswith('beam'):
    logging.debug('Skipping %s', name_version)
    return
  dir_name = '{output_dir}/{name_version}.jar'.format(
      output_dir=output_dir, name_version=name_version)

  # if auto pulled, directory is existing at {output_dir}
  if not os.path.isdir(dir_name):
    os.mkdir(dir_name)
    # pull license: the configured URL wins over the one reported in dep.
    try:
      license_url = dep_config[name][version]['license']
    except (KeyError, TypeError):
      try:
        license_url = dep['moduleLicenseUrl']
      except KeyError:
        # url cannot be found, add to no_licenses and skip to pull.
        with thread_lock:
          no_licenses.append(name_version)
        license_url = 'skip'
    pull_from_url(dir_name + '/LICENSE', license_url, name_version,
                  no_licenses)
    # pull notice: optional, only when one is configured for this version.
    try:
      notice_url = dep_config[name][version]['notice']
    except (KeyError, TypeError):
      notice_url = None
    if notice_url:
      try:
        # No failure list is passed: a missing notice is not reported as an
        # error of the run.
        pull_from_url(dir_name + '/NOTICE', notice_url, name_version)
      except Exception:
        # Best effort: a notice that cannot be pulled must not abort the run.
        logging.warning(
            'Failed to pull NOTICE for %s from %s', name_version, notice_url)
  else:
    try:
      license_url = dep['moduleLicenseUrl']
    except KeyError:
      license_url = ''
    logging.debug(
        'License/notice for {name_version} were pulled automatically.'.
        format(name_version=name_version))

  # get license_type to decide if pull source code.
  try:
    license_type = dep['moduleLicense']
  except KeyError:
    try:
      license_type = dep_config[name][version]['type']
    except (KeyError, TypeError):
      license_type = 'no_license_type'
      with thread_lock:
        no_license_type.append(name_version)

  # pull source code if license_type is one of SOURCE_CODE_REQUIRED_LICENSES.
  if any(x in license_type.lower() for x in SOURCE_CODE_REQUIRED_LICENSES):
    try:
      base_url = dep_config[name][version]['source']
    except (KeyError, TypeError):
      # No configured location: derive the URL from the module coordinates.
      module = dep['moduleName'].split(':')[0].replace('.', '/')
      base_url = maven_url_temp.format(module=module + '/' + name,
                                       version=version)
    pull_source_code(base_url, dir_name, name_version)
    source_included = True
  else:
    source_included = False

  csv_dict = {
      'dependency_name': name_version,
      'url_to_license': license_url,
      'license_type': license_type,
      'source_included': source_included
  }
  with thread_lock:
    csv_list.append(csv_dict)


if __name__ == "__main__":
  start = datetime.now()
  parser = argparse.ArgumentParser()
  parser.add_argument('--license_index', required=True)
  parser.add_argument('--output_dir', required=True)
  parser.add_argument('--dep_url_yaml', required=True)
  parser.add_argument('--manual_license_path', required=True)

  args = parser.parse_args()
  license_index = args.license_index
  output_dir = args.output_dir
  dep_url_yaml = args.dep_url_yaml
  manual_license_path = args.manual_license_path

  logging.getLogger().setLevel(logging.INFO)

  # index.json is generated by Gradle plugin.
  with open(license_index) as f:
    dependencies = json.load(f)

  # NOTE(review): yaml.full_load can build more than plain data types. If the
  # config only holds plain mappings/strings, yaml.safe_load would be the
  # safer choice - confirm before switching.
  with open(dep_url_yaml) as file:
    dep_config = yaml.full_load(file)

  maven_url_temp = 'https://repo1.maven.org/maven2/{module}/{version}'

  # Shared result lists, filled by the worker threads under thread_lock.
  csv_list = []
  no_licenses = []
  no_license_type = []
  incorrect_source_url = []

  logging.info(
      'Pulling license for {num_deps} dependencies using {num_threads} threads.'
      .format(num_deps=len(dependencies['dependencies']),
              num_threads=THREADS))
  thread_lock = threading.Lock()
  # The context manager releases the worker threads once map() has returned.
  with ThreadPool(THREADS) as pool:
    pool.map(execute, dependencies['dependencies'])

  write_to_csv(csv_list)

  error_msg = []
  run_status = 'succeed'
  if no_licenses:
    logging.error(no_licenses)
    how_to = (
        '**************************************** '
        'Licenses were not able to be pulled '
        'automatically for some dependencies. Please search source '
        'code of the dependencies on the internet and add "license" '
        'and "notice" (if available) field to {yaml_file} for each '
        'missing license. Dependency List: [{dep_list}]').format(
            dep_list=','.join(sorted(no_licenses)), yaml_file=dep_url_yaml)
    logging.error(how_to)
    error_msg.append(how_to)
    run_status = 'failed'

  if no_license_type:
    how_to = (
        '**************************************** '
        'License type of some dependencies were not '
        'identified. The license type is used to decide whether the '
        'source code of the dependency should be pulled or not. '
        'Please add "type" field to {yaml_file} for each dependency. '
        'Dependency List: [{dep_list}]').format(
            dep_list=','.join(sorted(no_license_type)),
            yaml_file=dep_url_yaml)
    error_msg.append(how_to)
    run_status = 'failed'

  if incorrect_source_url:
    how_to = (
        '**************************************** '
        'Urls to maven repo for some dependencies '
        'were not able to be generated automatically. Please add '
        '"source" field to {yaml_file} for each dependency. '
        'Dependency List: [{dep_list}]').format(
            dep_list=','.join(sorted(incorrect_source_url)),
            yaml_file=dep_url_yaml)
    error_msg.append(how_to)
    run_status = 'failed'

  end = datetime.now()
  logging.info(
      'pull_licenses_java.py {status}. It took {sec} seconds with {threads} threads.'
      .format(status=run_status,
              sec=(end - start).total_seconds(),
              threads=THREADS))

  if error_msg:
    raise RuntimeError('{n} error(s) occurred.'.format(n=len(error_msg)),
                       error_msg)