github.com/apache/beam/sdks/v2@v2.48.2/java/container/license_scripts/pull_licenses_java.py (about)

     1  #
     2  # Licensed to the Apache Software Foundation (ASF) under one or more
     3  # contributor license agreements.  See the NOTICE file distributed with
     4  # this work for additional information regarding copyright ownership.
     5  # The ASF licenses this file to You under the Apache License, Version 2.0
     6  # (the "License"); you may not use this file except in compliance with
     7  # the License.  You may obtain a copy of the License at
     8  #
     9  #    http://www.apache.org/licenses/LICENSE-2.0
    10  #
    11  # Unless required by applicable law or agreed to in writing, software
    12  # distributed under the License is distributed on an "AS IS" BASIS,
    13  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  # See the License for the specific language governing permissions and
    15  # limitations under the License.
    16  #
    17  """
    18  A script to pull licenses/notices/source code for Java dependencies.
    19  It generates a CSV file with [dependency_name, url_to_license, license_type, source_included]
    20  """
    21  
    22  import argparse
    23  import csv
    24  import json
    25  import logging
    26  import os
    27  import shutil
    28  import threading
    29  import traceback
    30  import yaml
    31  
    32  from bs4 import BeautifulSoup
    33  from datetime import datetime
    34  from multiprocessing.pool import ThreadPool
    35  from queue import Queue
    36  from tenacity import retry
    37  from tenacity import stop_after_attempt
    38  from tenacity import wait_fixed
    39  from urllib.request import urlopen, Request, URLError, HTTPError
    40  
# Lower-case substrings looked for in a dependency's (lower-cased) license
# type; when any of them matches, the dependency's source jars are pulled too.
SOURCE_CODE_REQUIRED_LICENSES = ['lgpl', 'gpl', 'cddl', 'mpl', 'gnu', 'mozilla public license']
# Attempt limit handed to tenacity's stop_after_attempt for each download, and
# the threshold used to tell a retryable attempt from the final one.
RETRY_NUM = 9
# Number of worker threads in the ThreadPool that processes the dependencies.
THREADS = 16
    44  
    45  @retry(reraise=True,
    46         wait=wait_fixed(5),
    47         stop=stop_after_attempt(RETRY_NUM))
    48  def pull_from_url(file_name, url, dep, no_list):
    49      if url == 'skip':
    50          return
    51  
    52      # Replace file path with absolute path to manual licenses
    53      if url.startswith('file://{}'):
    54          url = url.format(manual_license_path)
    55          logging.info('Replaced local file URL with {url} for {dep}'.format(url=url, dep=dep))
    56  
    57      # Take into account opensource.org changes that cause 404 on licenses
    58      if 'opensource.org' in url and url.endswith('-license.php'):
    59          url = url.replace('-license.php', '')
    60  
    61      try:
    62          url_read = urlopen(Request(url, headers={
    63              'User-Agent': 'Apache Beam',
    64              # MPL license fails to resolve redirects without this header
    65              # see https://github.com/apache/beam/issues/22394
    66              'accept-language': 'en-US,en;q=0.9',
    67          }))
    68          with open(file_name, 'wb') as temp_write:
    69              shutil.copyfileobj(url_read, temp_write)
    70          logging.debug(
    71              'Successfully pulled {file_name} from {url} for {dep}'.format(
    72                  url=url, file_name=file_name, dep=dep))
    73      except URLError as e:
    74          traceback.print_exc()
    75          if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
    76              logging.error('Invalid url for {dep}: {url}. Retrying...'.format(
    77                  url=url, dep=dep))
    78              raise
    79          else:
    80              logging.error(
    81                  'Invalid url for {dep}: {url} after {n} retries.'.format(
    82                      url=url, dep=dep, n=RETRY_NUM))
    83              with thread_lock:
    84                  no_list.append(dep)
    85              return
    86      except HTTPError as e:
    87          traceback.print_exc()
    88          if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
    89              logging.info(
    90                  'Received {code} from {url} for {dep}. Retrying...'.format(
    91                      code=e.code, url=url, dep=dep))
    92              raise
    93          else:
    94              logging.error(
    95                  'Received {code} from {url} for {dep} after {n} retries.'.
    96                  format(code=e.code, url=url, dep=dep, n=RETRY_NUM))
    97              with thread_lock:
    98                  no_list.append(dep)
    99              return
   100      except Exception as e:
   101          traceback.print_exc()
   102          if pull_from_url.retry.statistics["attempt_number"] < RETRY_NUM:
   103              logging.error(
   104                  'Error occurred when pull {file_name} from {url} for {dep}. Retrying...'
   105                  .format(url=url, file_name=file_name, dep=dep))
   106              raise
   107          else:
   108              logging.error(
   109                  'Error occurred when pull {file_name} from {url} for {dep} after {n} retries.'
   110                  .format(url=url, file_name=file_name, dep=dep, n=RETRY_NUM))
   111              with thread_lock:
   112                  no_list.append(dep)
   113              return
   114  
   115  
   116  def pull_source_code(base_url, dir_name, dep):
   117      # base_url example: https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/
   118      try:
   119        soup = BeautifulSoup(urlopen(base_url).read(), "html.parser")
   120      except:
   121        logging.error('Error reading source base from {base_url}'.format(base_url=base_url))
   122        raise
   123      source_count = 0
   124      for href in (a["href"] for a in soup.select("a[href]")):
   125          if href.endswith(
   126                  '.jar') and 'sources.jar' in href:  # download sources jar file only
   127              file_name = dir_name + '/' + href
   128              url = base_url + '/' + href
   129              logging.debug('Pulling source from {url}'.format(url=url))
   130              pull_from_url(file_name, url, dep, incorrect_source_url)
   131              source_count = source_count + 1
   132      if source_count == 0:
   133        raise RuntimeError('No source found at {base_url}'.format(base_url=base_url))
   134  
   135  
   136  @retry(reraise=True, stop=stop_after_attempt(3))
   137  def write_to_csv(csv_list):
   138      csv_columns = [
   139          'dependency_name', 'url_to_license', 'license_type', 'source_included'
   140      ]
   141      csv_file = "{output_dir}/beam_java_dependency_list.csv".format(
   142          output_dir=output_dir)
   143      try:
   144          with open(csv_file, 'w') as csvfile:
   145              writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
   146              writer.writeheader()
   147              for data in csv_list:
   148                  writer.writerow(data)
   149      except:
   150          traceback.print_exc()
   151          raise
   152  
   153  
   154  def execute(dep):
   155      '''
   156      An example of dep.
   157      {
   158          "moduleName": "antlr:antlr",
   159          "moduleUrl": "http://www.antlr.org/",
   160          "moduleVersion": "2.7.7",
   161          "moduleLicense": "BSD License",
   162          "moduleLicenseUrl": "http://www.antlr.org/license.html"
   163      }
   164      '''
   165  
   166      name = dep['moduleName'].split(':')[1]
   167      version = dep['moduleVersion']
   168      name_version = name + '-' + version
   169      # javac is not a runtime dependency
   170      if name == 'javac':
   171        logging.debug('Skipping', name_version)
   172        return
   173      # skip self dependencies
   174      if dep['moduleName'].lower().startswith('beam'):
   175        logging.debug('Skipping', name_version)
   176        return
   177      dir_name = '{output_dir}/{name_version}.jar'.format(
   178          output_dir=output_dir, name_version=name_version)
   179  
   180      # if auto pulled, directory is existing at {output_dir}
   181      if not os.path.isdir(dir_name):
   182          os.mkdir(dir_name)
   183          # pull license
   184          try:
   185              license_url = dep_config[name][version]['license']
   186          except:
   187              try:
   188                  license_url = dep['moduleLicenseUrl']
   189              except:
   190                  # url cannot be found, add to no_licenses and skip to pull.
   191                  with thread_lock:
   192                      no_licenses.append(name_version)
   193                  license_url = 'skip'
   194          pull_from_url(dir_name + '/LICENSE', license_url, name_version,
   195                        no_licenses)
   196          # pull notice
   197          try:
   198              notice_url = dep_config[name][version]['notice']
   199              pull_from_url(dir_name + '/NOTICE', notice_url, name_version)
   200          except:
   201              pass
   202      else:
   203          try:
   204              license_url = dep['moduleLicenseUrl']
   205          except:
   206              license_url = ''
   207          logging.debug(
   208              'License/notice for {name_version} were pulled automatically.'.
   209              format(name_version=name_version))
   210  
   211      # get license_type to decide if pull source code.
   212      try:
   213          license_type = dep['moduleLicense']
   214      except:
   215          try:
   216              license_type = dep_config[name][version]['type']
   217          except:
   218              license_type = 'no_license_type'
   219              with thread_lock:
   220                  no_license_type.append(name_version)
   221  
   222      # pull source code if license_type is one of SOURCE_CODE_REQUIRED_LICENSES.
   223      if any(x in license_type.lower() for x in SOURCE_CODE_REQUIRED_LICENSES):
   224          try:
   225              base_url = dep_config[name][version]['source']
   226          except:
   227              module = dep['moduleName'].split(':')[0].replace('.', '/')
   228              base_url = maven_url_temp.format(module=module + '/' + name,
   229                                               version=version)
   230          pull_source_code(base_url, dir_name, name_version)
   231          source_included = True
   232      else:
   233          source_included = False
   234  
   235      csv_dict = {
   236          'dependency_name': name_version,
   237          'url_to_license': license_url,
   238          'license_type': license_type,
   239          'source_included': source_included
   240      }
   241      with thread_lock:
   242          csv_list.append(csv_dict)
   243  
   244  
   245  if __name__ == "__main__":
   246      start = datetime.now()
   247      parser = argparse.ArgumentParser()
   248      parser.add_argument('--license_index', required=True)
   249      parser.add_argument('--output_dir', required=True)
   250      parser.add_argument('--dep_url_yaml', required=True)
   251      parser.add_argument('--manual_license_path', required=True)
   252  
   253      args = parser.parse_args()
   254      license_index = args.license_index
   255      output_dir = args.output_dir
   256      dep_url_yaml = args.dep_url_yaml
   257      manual_license_path = args.manual_license_path
   258  
   259      logging.getLogger().setLevel(logging.INFO)
   260  
   261      # index.json is generated by Gradle plugin.
   262      with open(license_index) as f:
   263          dependencies = json.load(f)
   264  
   265      with open(dep_url_yaml) as file:
   266          dep_config = yaml.full_load(file)
   267  
   268      maven_url_temp = 'https://repo1.maven.org/maven2/{module}/{version}'
   269  
   270      csv_list = []
   271      no_licenses = []
   272      no_license_type = []
   273      incorrect_source_url = []
   274  
   275      logging.info(
   276          'Pulling license for {num_deps} dependencies using {num_threads} threads.'
   277          .format(num_deps=len(dependencies['dependencies']),
   278                  num_threads=THREADS))
   279      thread_lock = threading.Lock()
   280      pool = ThreadPool(THREADS)
   281      pool.map(execute, dependencies['dependencies'])
   282  
   283      write_to_csv(csv_list)
   284  
   285      error_msg = []
   286      run_status = 'succeed'
   287      if no_licenses:
   288          logging.error(no_licenses)
   289          how_to = '**************************************** ' \
   290                   'Licenses were not able to be pulled ' \
   291                   'automatically for some dependencies. Please search source ' \
   292                   'code of the dependencies on the internet and add "license" ' \
   293                   'and "notice" (if available) field to {yaml_file} for each ' \
   294                   'missing license. Dependency List: [{dep_list}]'.format(
   295              dep_list=','.join(sorted(no_licenses)), yaml_file=dep_url_yaml)
   296          logging.error(how_to)
   297          error_msg.append(how_to)
   298          run_status = 'failed'
   299  
   300      if no_license_type:
   301          how_to = '**************************************** ' \
   302                   'License type of some dependencies were not ' \
   303                   'identified. The license type is used to decide whether the ' \
   304                   'source code of the dependency should be pulled or not. ' \
   305                   'Please add "type" field to {yaml_file} for each dependency. ' \
   306                   'Dependency List: [{dep_list}]'.format(
   307              dep_list=','.join(sorted(no_license_type)), yaml_file=dep_url_yaml)
   308          error_msg.append(how_to)
   309          run_status = 'failed'
   310  
   311      if incorrect_source_url:
   312          how_to = '**************************************** ' \
   313                   'Urls to maven repo for some dependencies ' \
   314                   'were not able to be generated automatically. Please add ' \
   315                   '"source" field to {yaml_file} for each dependency. ' \
   316                   'Dependency List: [{dep_list}]'.format(
   317              dep_list=','.join(sorted(incorrect_source_url)),
   318              yaml_file=dep_url_yaml)
   319          error_msg.append(how_to)
   320          run_status = 'failed'
   321  
   322      end = datetime.now()
   323      logging.info(
   324          'pull_licenses_java.py {status}. It took {sec} seconds with {threads} threads.'
   325          .format(status=run_status,
   326                  sec=(end - start).total_seconds(),
   327                  threads=THREADS))
   328  
   329      if error_msg:
   330          raise RuntimeError('{n} error(s) occurred.'.format(n=len(error_msg)),
   331                             error_msg)