github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/utility/words.go (about)

     1  // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
     2  //
     3  // This software (Documize Community Edition) is licensed under 
     4  // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
     5  //
     6  // You can operate outside the AGPL restrictions by purchasing
     7  // Documize Enterprise Edition and obtaining a commercial license
     8  // by contacting <sales@documize.com>. 
     9  //
    10  // https://documize.com
    11  
    12  package utility
    13  
    14  import "unicode"
    15  import nethtml "golang.org/x/net/html"
    16  
    17  // Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
    18  // This functionality is provided to enable verification of the text extraction algorithem across different implemntations.
    19  func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
    20  	txt, err := ch.Text(testMode)
    21  	if err != nil {
    22  		return nil, inSqBr, err
    23  	}
    24  	txt = nethtml.UnescapeString(txt)
    25  
    26  	words := []string{""}
    27  
    28  	for _, c := range txt {
    29  		if inSqBr > 0 {
    30  			switch c {
    31  			case ']':
    32  				inSqBr--
    33  			case '[':
    34  				inSqBr++
    35  			}
    36  		} else {
    37  			if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
    38  				if testMode {
    39  					c = ' ' // NOTE only replace with a space here if we are testing
    40  				}
    41  			}
    42  			if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
    43  				if c == '[' {
    44  					inSqBr = 1
    45  					words = append(words, "[") // open square bracket means potentially elided text
    46  					words = append(words, "")
    47  				} else {
    48  					inSqBr = 0
    49  					if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) {
    50  						if words[len(words)-1] == "" {
    51  							words[len(words)-1] = string(c)
    52  						} else {
    53  							words = append(words, string(c))
    54  						}
    55  						words = append(words, "")
    56  					} else {
    57  						if unicode.IsGraphic(c) || unicode.IsSpace(c) {
    58  							if unicode.IsSpace(c) {
    59  								if words[len(words)-1] != "" {
    60  									words = append(words, "")
    61  								}
    62  							} else {
    63  								words[len(words)-1] += string(c)
    64  							}
    65  						}
    66  					}
    67  				}
    68  			}
    69  		}
    70  	}
    71  	if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentance concatanation
    72  		words = append(words, ".")
    73  	}
    74  	return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end
    75  }