github.com/dmaizel/tests@v0.0.0-20210728163746-cae6a2d9cee8/cmd/check-spelling/kata-spell-check.sh

github.com/dmaizel/tests@v0.0.0-20210728163746-cae6a2d9cee8/cmd/check-spelling/kata-spell-check.sh (about)

     1  #!/bin/bash
     2  # Copyright (c) 2019 Intel Corporation
     3  #
     4  # SPDX-License-Identifier: Apache-2.0
     5  #
     6  # Description: spell-check utility.
     7  
     8  [ -n "$DEBUG" ] && set -x
     9  
    10  set -o errexit
    11  set -o pipefail
    12  set -o nounset
    13  
    14  # Ensure we spell check in English
    15  LANG=C
    16  LC_ALL=C
    17  
    18  script_name=${0##*/}
    19  
    20  if [ "$(uname -s)" == "Darwin" ]
    21  then
    22  	# Hunspell dictionaries are a not easily available
    23  	# on this platform it seems.
    24  	echo "INFO: $script_name: OSX not supported - exiting"
    25  	exit 0
    26  fi
    27  
    28  self_dir=$(dirname "$(readlink -f "$0")")
    29  cidir="${self_dir}/../../.ci"
    30  
    31  # Directory containing word lists.
    32  #
    33  # Each file in this directory must:
    34  #
    35  # - Have the ".txt" extension.
    36  # - Contain one word per line.
    37  #
    38  # Additionally, the files may contain blank lines and comments
    39  # (lines beginning with '#').
    40  KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data}
    41  
    42  KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}"
    43  
    44  # Name of dictionary file suitable for using with hunspell(1)
    45  # as a personal dictionary.
    46  KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"
    47  
    48  KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE/.dic/.aff}}"
    49  
    50  # command to remove code from markdown (inline and blocks)
    51  strip_cmd="${cidir}/kata-doc-to-script.sh"
    52  
    53  fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"
    54  
    55  # Name of file containing dictionary rules that apply to the
    56  # KATA_DICT_FILE word list.
    57  rules_file_name="rules.aff"
    58  
    59  # Command to spell check a file
    60  spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"
    61  
    62  # Command to convert a markdown file into plain text
    63  md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"
    64  
    65  KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
    66  dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
    67  rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"
    68  
    69  # Hunspell refers to custom dictionary by their path followed by the name of
    70  # the dictionary (without the file extension).
    71  kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"
    72  
    73  # All project documentation must be written in English,
    74  # with American English taking priority.
    75  #
    76  # We also use a custom dictionary which has to be specified by its
    77  # "directory and name prefix" and which must also be the first specified
    78  # dictionary.
    79  dict_languages="${kata_dict_ref},en_US,en_GB"
    80  
    81  die()
    82  {
    83  	local msg="$*"
    84  	echo >&2 "ERROR: $msg"
    85  	exit 1
    86  }
    87  
    88  info()
    89  {
    90  	local msg="$*"
    91  	echo "INFO: $msg"
    92  }
    93  
    94  warn()
    95  {
    96  	local msg="$*"
    97  	echo >&2 "WARNING: $msg"
    98  }
    99  
   100  make_dictionary()
   101  {
   102  	[ -d "$fragment_dir" ] || die "invalid fragment directory"
   103  	[ -z "$dict_file" ] && die "missing dictionary output file name"
   104  
   105  	# Note: the first field is extracted to allow for inline
   106  	# comments in each fragment. For example:
   107  	#
   108  	#  word # this text describes why the word is in the dictionary.
   109  	#
   110  	local dict
   111  
   112  	dict=$(cat "$fragment_dir"/*.txt |\
   113  		grep -v '^\#' |\
   114  		grep -v '^$' |\
   115  		awk '{print $1}' |\
   116  		sort -u || true)
   117  
   118  	[ -z "$dict" ] && die "generated dictionary is empty"
   119  
   120  	# Now, add in the number of words as a header (required by Hunspell)
   121  	local count
   122  
   123  	count=$(echo "$dict"| wc -l | awk '{print $1}' || true)
   124  	[ -z "$count" ] && die "cannot determine dictionary length"
   125  	[ "$count" -eq 0 ] && die "invalid dictionary length"
   126  
   127  	# Construct the dictionary
   128  	(echo "$count"; echo "$dict") > "$dict_file"
   129  
   130  	cp "${fragment_dir}/${rules_file_name}" "${rules_file}"
   131  }
   132  
   133  spell_check_file()
   134  {
   135  	local file="$1"
   136  
   137  	[ -z "$file" ] && die "need file to check"
   138  	[ -e "$file" ] || die "file does not exist: '$file'"
   139  
   140  	[ -e "$dict_file" ] || make_dictionary
   141  
   142  	info "Spell checking file '$file'"
   143  
   144  	# Determine the pandoc input format.
   145  	local pandoc_input_fmts
   146  	local pandoc_input_fmt
   147  
   148  	local pandoc_input_fmts=$(pandoc --list-input-formats 2>/dev/null || true)
   149  
   150  	if [ -z "$pandoc_input_fmts" ]
   151  	then
   152  		# We're using a very old version of pandoc that doesn't
   153  		# support listing its available input formats, so
   154  		# specify a default.
   155  		pandoc_input_fmt="markdown_github"
   156  	else
   157  		# Pandoc has multiple names for the gfm parser so find one of them
   158  		pandoc_input_fmt=$(echo "$pandoc_input_fmts" |\
   159  			grep -E "gfm|github" |\
   160  			head -1 || true)
   161  	fi
   162  
   163  	[ -z "$pandoc_input_fmt" ] && die "cannot find usable pandoc input format"
   164  
   165  	local stripped_doc
   166  
   167  	local pandoc_doc
   168  	local utf8_free_doc
   169  	local pre_hunspell_doc
   170  	local hunspell_results
   171  	local final_results
   172  
   173  	# First strip out all code blocks and convert all
   174  	# "quoted apostrophe's" ('\'') back into a single apostrophe.
   175  	stripped_doc=$("$strip_cmd" -i "$file" -)
   176  
   177  	# Next, convert the remainder it into plain text to remove the
   178  	# remaining markdown syntax.
   179  	#
   180  	# Before pandoc gets hold of it:
   181  	#
   182  	# - Replace pipes with spaces. This
   183  	#   fixes an issue with old versions of pandoc (Ubuntu 16.04)
   184  	#   which completely mangle tables into nonsense.
   185  	#
   186  	# - Remove empty reference links.
   187  	#
   188  	#   For example, this markdown
   189  	#
   190  	#       blah [`qemu-lite`][qemu-lite] blah.
   191  	#         :
   192  	#       [qemu-lite]: https://...
   193  	#
   194  	#   Gets converted into
   195  	#
   196  	#       blah [][qemu-lite] blah.
   197  	#         :
   198  	#       [qemu-lite]: https://...
   199  	#
   200  	#   And the empty set of square brackets confuses pandoc.
   201  	#
   202  	# After pandoc has processed the data, remove any remaining
   203  	# "inline links" in this format:
   204  	#
   205  	#     [link name](#link-address)
   206  	#
   207  	# This is strictly only required for old versions of pandoc.
   208  
   209  	pandoc_doc=$(echo "$stripped_doc" |\
   210  		tr '|' ' '  |\
   211  		sed 's/\[\]\[[^]]*\]//g' |\
   212  		"$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |\
   213  		sed 's/\[[^]]*\]([^\)]*)//g' || true)
   214  
   215  	# Convert the file into "pure ASCII" by removing all awkward
   216  	# Unicode characters that won't spell check.
   217  	#
   218  	# Necessary since pandoc is "clever" and will convert things like
   219  	# GitHub's colon emojis (such as ":smile:") into the actual utf8
   220  	# character where possible.
   221  	utf8_free_doc=$(echo "$pandoc_doc" | iconv -c -f utf-8 -t ascii)
   222  
   223  	# Next, perform the following simplifications:
   224  	#
   225  	# - Remove URLs.
   226  	# - Remove email addresses.
   227  	# - Replace most punctuation symbols with a space
   228  	#   (excluding a dash (aka hyphen!)
   229  	# - Carefully remove non-hyphen dashes.
   230  	# - Remove GitHub @userids.
   231  	pre_hunspell_doc=$(echo "$utf8_free_doc" |\
   232  		sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |\
   233  		sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |\
   234  		tr '[,\[\]()\*\\/\|=]' ' ' |\
   235  		sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |\
   236  		sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')
   237  
   238  	# Call the spell checker
   239  	hunspell_results=$(echo "$pre_hunspell_doc" | $spell_check_cmd -d "${dict_languages}")
   240  
   241  	# Finally, post-process the hunspell output:
   242  	#
   243  	# - Parse the output to ignore:
   244  	#   - Hunspell banner.
   245  	#   - Correctly spelt words (lines starting with '*', '+' or '-').
   246  	#   - All words containing numbers (like "100MB").
   247  	#   - All words that appear to be acronymns / Abbreviations
   248  	#     (atleast two upper-case letters and which may be plural or
   249  	#     possessive).
   250  	#   - All words that appear to be numbers.
   251  	#   - All possessives and the dreaded isolated "'s" which occurs
   252  	#     for input like this:
   253  	#
   254  	#         `kata-shim`'s
   255  	#
   256  	#     which gets converted by $strip_cmd into simply:
   257  	#
   258  	#         's
   259  	#
   260  	# - Sort output.
   261  
   262  	final_results=$(echo "$hunspell_results" |\
   263  		grep -Evi "(ispell|hunspell)" |\
   264  		grep -Ev '^(\*|\+|-)' |\
   265  		grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |\
   266  		grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |\
   267  		grep -Ev "^. 's" |\
   268  		sort -u || true)
   269  
   270  	local line
   271  	local incorrects
   272  	local near_misses
   273  
   274  	near_misses=$(echo "$final_results" | grep '^&' || true)
   275  	incorrects=$(echo "$final_results" | grep '^\#' | awk '{print $2}' || true)
   276  
   277  	local -i failed=0
   278  
   279  	[ -n "$near_misses" ] && failed+=1
   280  	[ -n "$incorrects" ] && failed+=1
   281  
   282  	echo "$near_misses" | while read -r line
   283  	do
   284  		[ "$line" = "" ] && continue
   285  
   286  		local word
   287  		local possibles
   288  
   289  		word=$(echo "$line" | awk '{print $2}')
   290  		possibles=$(echo "$line" | cut -d: -f2- | sed 's/^ *//g')
   291  
   292  		warn "Word '${word}': did you mean one of the following?: ${possibles}"
   293  	done
   294  
   295  	local incorrect
   296  	for incorrect in $incorrects
   297  	do
   298  		warn "Incorrect word: '$incorrect'"
   299  	done
   300  
   301  	[ "$failed" -gt 0 ] && die "Spell check failed for file: '$file'"
   302  
   303  	info "Spell check successful for file: '$file'"
   304  }
   305  
   306  delete_dictionary()
   307  {
   308  	rm -f "${KATA_DICT_FILE}" "${KATA_RULES_FILE}"
   309  }
   310  
   311  setup()
   312  {
   313  	local cmd
   314  
   315  	for cmd in "$spell_check_cmd" "$md_convert_tool"
   316  	do
   317  		command -v "$cmd" &>/dev/null || die "Need $cmd command"
   318  	done
   319  }
   320  
   321  usage()
   322  {
   323  	cat <<-EOT
   324  	Usage: ${script_name} <command> [arguments]
   325  
   326  	Description: Spell-checking utility.
   327  
   328  	Commands:
   329  
   330  	  check <file> : Spell check the specified file
   331  	                 (implies 'make-dict').
   332  	  delete-dict  : Delete the dictionary.
   333  	  help         : Show this usage.
   334  	  make-dict    : Create the dictionary.
   335  EOT
   336  }
   337  
   338  main()
   339  {
   340  	setup
   341  
   342  	[ -z "${1:-}" ] && usage && echo && die "need command"
   343  
   344  	case "$1" in
   345  		check) shift && spell_check_file "$1" ;;
   346  		delete-dict) delete_dictionary ;;
   347  		help|-h|--help) usage && exit 0 ;;
   348  		make-dict) make_dictionary ;;
   349  		*) die "invalid command: '$1'" ;;
   350  	esac
   351  }
   352  
   353  main "$@"