#!/bin/bash
# Copyright (c) 2019 Intel Corporation
#
# SPDX-License-Identifier: Apache-2.0
#
# Description: spell-check utility.

# Enable command tracing when DEBUG is set to a non-empty value.
# DEBUG may legitimately be unset, so default it to the empty string.
[ -n "${DEBUG:-}" ] && set -x

set -o errexit
set -o pipefail
set -o nounset

# Ensure we spell check in English
#
# NOTE(review): these are plain assignments, so child processes only see
# them if the variables were already exported in the calling environment.
# Confirm whether they should be exported.
LANG=C
LC_ALL=C

script_name=${0##*/}

if [ "$(uname -s)" == "Darwin" ]
then
    # Hunspell dictionaries are not easily available
    # on this platform it seems.
    echo "INFO: $script_name: OSX not supported - exiting"
    exit 0
fi

# Directory containing this script (symbolic links resolved).
self_dir=$(dirname "$(readlink -f "$0")")
cidir="${self_dir}/../../.ci"

# Directory containing word lists.
#
# Each file in this directory must:
#
# - Have the ".txt" extension.
# - Contain one word per line.
#
# Additionally, the files may contain blank lines and comments
# (lines beginning with '#').
KATA_DICT_FRAGMENT_DIR=${KATA_DICT_FRAGMENT_DIR:-data}

KATA_DICT_NAME="${KATA_DICT_NAME:-kata-dictionary}"

# Name of dictionary file suitable for using with hunspell(1)
# as a personal dictionary.
KATA_DICT_FILE="${KATA_DICT_FILE:-${KATA_DICT_NAME}.dic}"

# Name of the rules file: the dictionary file name with its ".dic"
# extension replaced by ".aff". The "%" anchors the match to the end of
# the name so that only the extension is rewritten.
KATA_RULES_FILE="${KATA_RULES_FILE:-${KATA_DICT_FILE/%.dic/.aff}}"

# command to remove code from markdown (inline and blocks)
strip_cmd="${cidir}/kata-doc-to-script.sh"

fragment_dir="${self_dir}/${KATA_DICT_FRAGMENT_DIR}"

# Name of file containing dictionary rules that apply to the
# KATA_DICT_FILE word list.
rules_file_name="rules.aff"

# Command to spell check a file
spell_check_cmd="${KATA_SPELL_CHECK_CMD:-hunspell}"

# Command to convert a markdown file into plain text
md_convert_tool="${KATA_MARKDOWN_CONVERT_TOOL:-pandoc}"

KATA_DICT_DIR="${KATA_DICT_DIR:-${self_dir}}"
dict_file="${KATA_DICT_DIR}/${KATA_DICT_FILE}"
rules_file="${KATA_DICT_DIR}/${KATA_RULES_FILE}"

# Hunspell refers to custom dictionaries by their path followed by the name
# of the dictionary (without the file extension).
kata_dict_ref="${KATA_DICT_DIR}/${KATA_DICT_NAME}"

# All project documentation must be written in English,
# with American English taking priority.
#
# We also use a custom dictionary which has to be specified by its
# "directory and name prefix" and which must also be the first specified
# dictionary.
dict_languages="${kata_dict_ref},en_US,en_GB"

# Print an error message to stderr and exit with status 1.
die()
{
    local msg="$*"
    echo >&2 "ERROR: $msg"
    exit 1
}

# Print an informational message to stdout.
info()
{
    local msg="$*"
    echo "INFO: $msg"
}

# Print a warning message to stderr.
warn()
{
    local msg="$*"
    echo >&2 "WARNING: $msg"
}

# Build the dictionary file ($dict_file) from every "*.txt" fragment in
# $fragment_dir and copy the rules file alongside it as $rules_file.
#
# Globals read: fragment_dir, dict_file, rules_file, rules_file_name.
# Exits (via die) if the fragment directory is missing or no words are found.
make_dictionary()
{
    [ -d "$fragment_dir" ] || die "invalid fragment directory"
    [ -z "$dict_file" ] && die "missing dictionary output file name"

    # Note: the first field is extracted to allow for inline
    # comments in each fragment. For example:
    #
    #   word # this text describes why the word is in the dictionary.
    #
    # Comment lines (beginning with '#') and blank lines are dropped.
    # Lines containing only whitespace are dropped too (the NF test),
    # otherwise they would add an empty entry to the dictionary.
    #
    # The "|| true" turns a failed pipeline (for example no "*.txt"
    # fragments) into an empty result, which is reported just below.
    local dict

    dict=$(awk '/^#/ { next } NF { print $1 }' "$fragment_dir"/*.txt |\
        sort -u || true)

    [ -z "$dict" ] && die "generated dictionary is empty"

    # Now, add in the number of words as a header (required by Hunspell)
    local count

    count=$(printf '%s\n' "$dict" | wc -l | awk '{print $1}' || true)
    [ -z "$count" ] && die "cannot determine dictionary length"
    [ "$count" -eq 0 ] && die "invalid dictionary length"

    # Construct the dictionary
    printf '%s\n%s\n' "$count" "$dict" > "$dict_file"

    cp "${fragment_dir}/${rules_file_name}" "${rules_file}"
}

# Spell check a single markdown file.
#
# Arguments: $1 - path of the file to check.
# Outputs:   one warning per misspelt word on stderr, progress on stdout.
# Exits (via die) if the file is missing or any spelling problem is found.
#
# The dictionary is generated first if $dict_file does not exist.
spell_check_file()
{
    # Default to empty so a missing argument reaches the error message
    # below rather than tripping 'nounset'.
    local file="${1:-}"

    [ -z "$file" ] && die "need file to check"
    [ -e "$file" ] || die "file does not exist: '$file'"

    [ -e "$dict_file" ] || make_dictionary

    info "Spell checking file '$file'"

    # Determine the pandoc input format.
    local pandoc_input_fmts
    local pandoc_input_fmt

    # Query the configured conversion tool (not a hard-coded "pandoc") so
    # that KATA_MARKDOWN_CONVERT_TOOL is honoured consistently.
    pandoc_input_fmts=$("$md_convert_tool" --list-input-formats 2>/dev/null || true)

    if [ -z "$pandoc_input_fmts" ]
    then
        # We're using a very old version of pandoc that doesn't
        # support listing its available input formats, so
        # specify a default.
        pandoc_input_fmt="markdown_github"
    else
        # Pandoc has multiple names for the gfm parser so find one of them
        pandoc_input_fmt=$(printf '%s\n' "$pandoc_input_fmts" |\
            grep -E "gfm|github" |\
            head -1 || true)
    fi

    [ -z "$pandoc_input_fmt" ] && die "cannot find usable pandoc input format"

    local stripped_doc
    local pandoc_doc
    local utf8_free_doc
    local pre_hunspell_doc
    local hunspell_results
    local final_results

    # First strip out all code blocks and convert all
    # "quoted apostrophe's" ('\'') back into a single apostrophe.
    stripped_doc=$("$strip_cmd" -i "$file" -)

    # Next, convert the remainder into plain text to remove the
    # remaining markdown syntax.
    #
    # Before pandoc gets hold of it:
    #
    # - Replace pipes with spaces. This
    #   fixes an issue with old versions of pandoc (Ubuntu 16.04)
    #   which completely mangle tables into nonsense.
    #
    # - Remove empty reference links.
    #
    #   For example, this markdown
    #
    #     blah [`qemu-lite`][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   Gets converted into
    #
    #     blah [][qemu-lite] blah.
    #       :
    #     [qemu-lite]: https://...
    #
    #   And the empty set of square brackets confuses pandoc.
    #
    # After pandoc has processed the data, remove any remaining
    # "inline links" in this format:
    #
    #   [link name](#link-address)
    #
    # This is strictly only required for old versions of pandoc.
    #
    # NOTE(review): the trailing "|| true" also hides a failure of the
    # conversion tool itself, in which case an empty document is checked.
    # Confirm this best-effort behaviour is intended.
    pandoc_doc=$(printf '%s\n' "$stripped_doc" |\
        tr '|' ' ' |\
        sed 's/\[\]\[[^]]*\]//g' |\
        "$md_convert_tool" -f "${pandoc_input_fmt}" -t plain - |\
        sed 's/\[[^]]*\]([^\)]*)//g' || true)

    # Convert the file into "pure ASCII" by removing all awkward
    # Unicode characters that won't spell check.
    #
    # Necessary since pandoc is "clever" and will convert things like
    # GitHub's colon emojis (such as ":smile:") into the actual utf8
    # character where possible.
    utf8_free_doc=$(printf '%s\n' "$pandoc_doc" | iconv -c -f utf-8 -t ascii)

    # Next, perform the following simplifications:
    #
    # - Remove URLs.
    # - Remove email addresses.
    # - Replace most punctuation symbols with a space
    #   (excluding a dash (aka hyphen!)).
    # - Carefully remove non-hyphen dashes.
    # - Remove GitHub @userids.
    pre_hunspell_doc=$(printf '%s\n' "$utf8_free_doc" |\
        sed 's,https*://[^[:space:]()][^[:space:]()]*,,g' |\
        sed -r 's/[a-zA-Z0-9.-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+//g' |\
        tr '[,\[\]()\*\\/\|=]' ' ' |\
        sed -e 's/^ *-//g' -e 's/- $//g' -e 's/ -//g' |\
        sed 's/@[a-zA-Z0-9][a-zA-Z0-9]*\b//g')

    # Call the spell checker
    hunspell_results=$(printf '%s\n' "$pre_hunspell_doc" |\
        "$spell_check_cmd" -d "${dict_languages}")

    # Finally, post-process the hunspell output:
    #
    # - Parse the output to ignore:
    #   - Hunspell banner.
    #   - Correctly spelt words (lines starting with '*', '+' or '-').
    #   - All words containing numbers (like "100MB").
    #   - All words that appear to be acronyms / abbreviations
    #     (at least two upper-case letters and which may be plural or
    #     possessive).
    #   - All words that appear to be numbers.
    #   - All possessives and the dreaded isolated "'s" which occurs
    #     for input like this:
    #
    #       `kata-shim`'s
    #
    #     which gets converted by $strip_cmd into simply:
    #
    #       's
    #
    # - Sort output.
    final_results=$(printf '%s\n' "$hunspell_results" |\
        grep -Evi "(ispell|hunspell)" |\
        grep -Ev '^(\*|\+|-)' |\
        grep -Evi "^(&|#) [^ ]*[0-9][^ ]*" |\
        grep -Ev "^. [A-Z][A-Z][A-Z]*(s|'s)*" |\
        grep -Ev "^. 's" |\
        sort -u || true)

    local line
    local word
    local possibles
    local incorrect
    local incorrects
    local near_misses

    # Lines starting with '&' are misspellings with suggestions; lines
    # starting with '#' are misspellings without any suggestion.
    near_misses=$(printf '%s\n' "$final_results" | grep '^&' || true)
    incorrects=$(printf '%s\n' "$final_results" | grep '^\#' | awk '{print $2}' || true)

    local -i failed=0

    if [ -n "$near_misses" ]
    then
        failed+=1
    fi

    if [ -n "$incorrects" ]
    then
        failed+=1
    fi

    while read -r line
    do
        [ "$line" = "" ] && continue

        # The misspelt word is the second field; the suggestions are
        # everything after the first colon.
        word=$(printf '%s\n' "$line" | awk '{print $2}')
        possibles=$(printf '%s\n' "$line" | cut -d: -f2- | sed 's/^ *//g')

        warn "Word '${word}': did you mean one of the following?: ${possibles}"
    done <<< "$near_misses"

    # Read one word per line rather than relying on unquoted expansion,
    # which would also subject the words to pathname expansion.
    while read -r incorrect
    do
        [ "$incorrect" = "" ] && continue

        warn "Incorrect word: '$incorrect'"
    done <<< "$incorrects"

    [ "$failed" -gt 0 ] && die "Spell check failed for file: '$file'"

    info "Spell check successful for file: '$file'"
}

# Remove the generated dictionary and rules files.
#
# The same paths that make_dictionary() writes to are removed, so the
# command works regardless of the current working directory.
delete_dictionary()
{
    rm -f -- "${dict_file}" "${rules_file}"
}

# Check that the spell checker and the markdown conversion tool are
# available, exiting (via die) if either is missing.
setup()
{
    local cmd

    for cmd in "$spell_check_cmd" "$md_convert_tool"
    do
        command -v "$cmd" &>/dev/null || die "Need $cmd command"
    done
}

# Print the usage statement to stdout.
usage()
{
    cat <<EOT
Usage: ${script_name} <command> [arguments]

Description: Spell-checking utility.

Commands:

  check <file> : Spell check the specified file
                 (implies 'make-dict').
  delete-dict  : Delete the dictionary.
  help         : Show this usage.
  make-dict    : Create the dictionary.
EOT
}

# Entry point: dispatch on the command given as the first argument.
#
# Arguments: $1 - command name, $2 - file to check (for "check" only).
main()
{
    local cmd="${1:-}"

    if [ -z "$cmd" ]
    then
        usage
        echo
        die "need command"
    fi

    case "$cmd" in
        check)
            # Only spell checking needs the external tools, so the
            # other commands work even when they are not installed.
            setup
            # Default to empty so a missing file name produces the
            # "need file to check" error instead of an unbound
            # variable failure.
            spell_check_file "${2:-}"
            ;;
        delete-dict) delete_dictionary ;;
        help|-h|--help) usage && exit 0 ;;
        make-dict) make_dictionary ;;
        *) die "invalid command: '$cmd'" ;;
    esac
}

main "$@"