github.com/GuanceCloud/cliutils@v1.1.21/copyright.py

github.com/GuanceCloud/cliutils@v1.1.21/copyright.py (about)

     1  #!/usr/bin/env python3
     2  # -*- coding: utf-8 -*-
     3  
     4  # This script was copied from
     5  #  https://github.com/DataDog/datadog-agent/blob/main/tasks/libs/copyright.py
     6  # with some adjustments for this repository's specific needs.
     7  
     8  import re
     9  import subprocess
    10  import sys
    11  import argparse
    12  from pathlib import Path, PurePosixPath
    13  
    14  GLOB_PATTERN = "**/*.go"
    15  
    16  COPYRIGHT_HEADER = """
    17  // Unless explicitly stated otherwise all files in this repository are licensed
    18  // under the MIT License.
    19  // This product includes software developed at Guance Cloud (https://www.guance.com/).
    20  // Copyright 2021-present Guance, Inc.
    21  """.strip()
    22  
    23  COPYRIGHT_REGEX = [
    24      r'^// Unless explicitly stated otherwise all files in this repository are licensed$',
    25      r'^// under the MIT License\.$',
    26      r'^// This product includes software developed at Guance Cloud \(https://www\.guance\.com/\)\.$',
    27      r'^// Copyright 20[1-3][0-9]-([Pp]resent|20[1-3][0-9]) Guance, (Inc|Inmetrics)\.$',
    28  ]
    29  
    30  # These path patterns are excluded from checks
    31  PATH_EXCLUSION_REGEX = [
    32      # These are auto-generated files but without headers to indicate it
    33      '/vendor',
    34      'datakit/plugins/inputs/skywalking/compiled',
    35      '/plugins/externals/ebpf',
    36      '/plugins/inputs/skywalking/v3',
    37      '/internal/win_utils/pdh',
    38      '/internal/obfuscate',
    39      '/internal/msgpack',
    40      '/internal/obfuscate',
    41      '/pipeline/grok',
    42      '/pipeline/core/parser/lex_test.go',
    43      '/pipeline/core/parser/strutil.go',
    44      '/io/cachedata.pb.go',
    45      '/.git/',
    46      '/git/',
    47  ]
    48  
    49  # These header matchers skip enforcement of the rules if found in the first
    50  # line of the file
    51  HEADER_EXCLUSION_REGEX = [
    52      '^// Code generated ',
    53      '^//go:generate ',
    54      '^// AUTOGENERATED FILE: ',
    55      '^// Copyright.* OpenTelemetry Authors',
    56      '^// Copyright.* The Go Authors',
    57      '^// This file includes software developed at CoreOS',
    58      '^// Copyright 2017 Kinvolk',
    59  ]
    60  
    61  
    62  COMPILED_COPYRIGHT_REGEX = [re.compile(regex, re.UNICODE) for regex in COPYRIGHT_REGEX]
    63  COMPILED_PATH_EXCLUSION_REGEX = [re.compile(regex, re.UNICODE) for regex in PATH_EXCLUSION_REGEX]
    64  COMPILED_HEADER_EXCLUSION_REGEX = [re.compile(regex, re.UNICODE) for regex in HEADER_EXCLUSION_REGEX]
    65  
    66  
    67  class CopyrightLinter:
    68      """
    69      This class is used to enforce copyright headers on specified file patterns
    70      """
    71  
    72      def __init__(self, debug=False):
    73          self._debug = debug
    74  
    75      @staticmethod
    76      def _get_repo_dir():
    77          script_dir = PurePosixPath(__file__).parent
    78  
    79          repo_dir = (
    80              subprocess.check_output(
    81                  ['git', 'rev-parse', '--show-toplevel'],
    82                  cwd=script_dir,
    83              )
    84              .decode(sys.stdout.encoding)
    85              .strip()
    86          )
    87  
    88          return PurePosixPath(repo_dir)
    89  
    90      @staticmethod
    91      def _is_excluded_path(filepath, exclude_matchers):
    92          for matcher in exclude_matchers:
    93              if re.search(matcher, filepath.as_posix()):
    94                  return True
    95  
    96          return False
    97  
    98      @staticmethod
    99      def _get_matching_files(root_dir, glob_pattern, exclude=None):
   100          if exclude is None:
   101              exclude = []
   102  
   103          # Glob is a generator so we have to do the counting ourselves
   104          all_matching_files_cnt = 0
   105  
   106          filtered_files = []
   107          for filepath in Path(root_dir).glob(glob_pattern):
   108              all_matching_files_cnt += 1
   109              if not CopyrightLinter._is_excluded_path(filepath, exclude):
   110                  filtered_files.append(filepath)
   111  
   112          excluded_files_cnt = all_matching_files_cnt - len(filtered_files)
   113          print(f"[INFO] Excluding {excluded_files_cnt} files based on path filters!")
   114  
   115          return sorted(filtered_files)
   116  
   117      @staticmethod
   118      def _get_header(filepath):
   119          header = []
   120          with open(filepath, "r") as file_obj:
   121              # We expect a specific header format which should be 4 lines
   122              for _ in range(4):
   123                  header.append(file_obj.readline().strip())
   124  
   125          return header
   126  
   127      @staticmethod
   128      def _is_excluded_header(header, exclude=None):
   129          if exclude is None:
   130              exclude = []
   131  
   132          for matcher in exclude:
   133              if re.search(matcher, header[0]):
   134                  return True
   135  
   136          return False
   137  
   138      def _has_copyright(self, filepath):
   139          header = CopyrightLinter._get_header(filepath)
   140          if header is None:
   141              print("[WARN] Mismatch found! Could not find any content in file!")
   142              return False
   143  
   144          if len(header) > 0 and CopyrightLinter._is_excluded_header(header, exclude=COMPILED_HEADER_EXCLUSION_REGEX):
   145              if self._debug:
   146                  print(f"[INFO] Excluding {filepath} based on header '{header[0]}'")
   147              return True
   148  
   149          if len(header) <= 3:
   150              print("[WARN] Mismatch found! File too small for header stanza!")
   151              return False
   152  
   153          for line_idx, matcher in enumerate(COMPILED_COPYRIGHT_REGEX):
   154              if not re.match(matcher, header[line_idx]):
   155                  print(
   156                      f"[WARN] Mismatch found! Expected '{COPYRIGHT_REGEX[line_idx]}' pattern but got '{header[line_idx]}'"
   157                  )
   158                  return False
   159  
   160          return True
   161  
   162      def _assert_copyrights(self, files):
   163          failing_files = []
   164          for filepath in files:
   165              if self._has_copyright(filepath):
   166                  if self._debug:
   167                      print(f"[ OK ] {filepath}")
   168  
   169                  continue
   170  
   171              print(f"[FAIL] {filepath}")
   172              failing_files.append(filepath)
   173  
   174          total_files = len(files)
   175          if failing_files:
   176              pct_failing = (len(failing_files) / total_files) * 100
   177              print()
   178              print(
   179                  f"FAIL: There are {len(failing_files)} files out of "
   180                  + f"{total_files} ({pct_failing:.2f}%) that are missing the proper copyright!"
   181              )
   182  
   183          return failing_files
   184  
   185      def _prepend_header(self, filepath, dry_run=True):
   186          with open(filepath, 'r+') as file_obj:
   187              existing_content = file_obj.read()
   188  
   189              if dry_run:
   190                  return True
   191  
   192              file_obj.seek(0)
   193              new_content = COPYRIGHT_HEADER + "\n\n" + existing_content
   194              file_obj.write(new_content)
   195  
   196          # Verify result. A problem here is not benign so we stop the whole run.
   197          if not self._has_copyright(filepath):
   198              raise Exception(f"[ERROR] Header prepend failed to produce correct output for {filepath}!")
   199  
   200          return True
   201  
   202      @staticmethod
   203      def _is_build_header(line):
   204          return line.startswith("// +build ") or line.startswith("//+build ") or line.startswith("//go:build ")
   205  
   206      def _is_package_comment_or_nolint(line):
   207          return line.startswith("// Package ") or line.startswith("//nolint")
   208  
   209      def _fix_file_header(self, filepath, dry_run=True):
   210          header = CopyrightLinter._get_header(filepath)
   211  
   212          # Empty file - ignore
   213          if len(header) < 1:
   214              return False
   215  
   216          # If the file starts with a comment and it's not a build comment,
   217          # there is likely a manual fix to the header needed
   218          if header[0].startswith("//") and not CopyrightLinter._is_build_header(header[0]) and not CopyrightLinter._is_package_comment_or_nolint(header[0]):
   219              return False
   220  
   221          if dry_run:
   222              return True
   223  
   224          return self._prepend_header(filepath, dry_run=dry_run)
   225  
   226      def _fix(self, failing_files, dry_run=True):
   227          failing_files_cnt = len(failing_files)
   228          errors = []
   229          for idx, filepath in enumerate(failing_files):
   230              print(f"[INFO] ({idx+1:3d}/{failing_files_cnt:3}) Fixing '{filepath}'...")
   231  
   232              if not self._fix_file_header(filepath, dry_run=dry_run):
   233                  error_message = f"'{filepath}' could not be fixed!"
   234                  print(f"[WARN] ({idx+1:3d}/{failing_files_cnt:3}) {error_message}")
   235                  errors.append(Exception(error_message))
   236  
   237          return errors
   238  
   239      def assert_compliance(self, fix=False, dry_run=True):
   240          """
   241          This method applies the GLOB_PATTERN to the root of the repository and
   242          verifies that all files have the expected copyright header.
   243          """
   244          git_repo_dir = CopyrightLinter._get_repo_dir()
   245  
   246          if self._debug:
   247              print(f"[DEBG] Repo root: {git_repo_dir}")
   248              print(f"[DEBG] Finding all files in {git_repo_dir} matching '{GLOB_PATTERN}'...")
   249  
   250          matching_files = CopyrightLinter._get_matching_files(
   251              git_repo_dir,
   252              GLOB_PATTERN,
   253              exclude=COMPILED_PATH_EXCLUSION_REGEX,
   254          )
   255          print(f"[INFO] Found {len(matching_files)} files matching '{GLOB_PATTERN}'")
   256  
   257          failing_files = self._assert_copyrights(matching_files)
   258          if len(failing_files) > 0:
   259              if not fix:
   260                  print("CHECK: FAIL")
   261                  raise Exception(
   262                      f"Copyright linting found {len(failing_files)} files that did not have the expected header!"
   263                  )
   264  
   265              # If "fix=True", we will attempt to fix the failing files
   266              errors = self._fix(failing_files, dry_run=dry_run)
   267              if errors:
   268                  raise Exception(f"Copyright linter was unable to fix {len(errors)}/{len(failing_files)} files!")
   269  
   270              return
   271  
   272          print("CHECK: OK")
   273  
   274  
   275  if __name__ == '__main__':
   276      parser = argparse.ArgumentParser()
   277      parser.add_argument("--fix", dest="fix", action='store_true', help='auto add copyright to code')
   278      parser.add_argument("--dry-run", dest="dry_run", action='store_true', help='dry run')
   279  
   280      args = parser.parse_args()
   281      #CopyrightLinter(debug=True).assert_compliance(fix=True, dry_run=False)
   282  
   283      print(args)
   284  
   285      CopyrightLinter(debug=True).assert_compliance(fix=args.fix, dry_run=args.dry_run)