github.com/elliott5/community@v0.14.1-0.20160709191136-823126fb026a/wordsmith/utility/words.go (about) 1 // Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved. 2 // 3 // This software (Documize Community Edition) is licensed under 4 // GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html 5 // 6 // You can operate outside the AGPL restrictions by purchasing 7 // Documize Enterprise Edition and obtaining a commercial license 8 // by contacting <sales@documize.com>. 9 // 10 // https://documize.com 11 12 package utility 13 14 import "unicode" 15 import nethtml "golang.org/x/net/html" 16 17 // Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word. 18 // This functionality is provided to enable verification of the text extraction algorithem across different implemntations. 19 func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) { 20 txt, err := ch.Text(testMode) 21 if err != nil { 22 return nil, inSqBr, err 23 } 24 txt = nethtml.UnescapeString(txt) 25 26 words := []string{""} 27 28 for _, c := range txt { 29 if inSqBr > 0 { 30 switch c { 31 case ']': 32 inSqBr-- 33 case '[': 34 inSqBr++ 35 } 36 } else { 37 if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space 38 if testMode { 39 c = ' ' // NOTE only replace with a space here if we are testing 40 } 41 } 42 if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space 43 if c == '[' { 44 inSqBr = 1 45 words = append(words, "[") // open square bracket means potentially elided text 46 words = append(words, "") 47 } else { 48 inSqBr = 0 49 if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) { 50 if words[len(words)-1] == "" { 51 words[len(words)-1] = string(c) 52 } else { 53 words = append(words, string(c)) 54 } 55 words = append(words, "") 56 } else { 57 if unicode.IsGraphic(c) || unicode.IsSpace(c) { 58 if unicode.IsSpace(c) { 59 if words[len(words)-1] != "" { 60 words = append(words, "") 61 } 62 } else { 63 words[len(words)-1] += string(c) 64 } 65 } 66 } 67 } 68 } 69 } 70 } 71 if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentance concatanation 72 words = append(words, ".") 73 } 74 return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end 75 }