github.com/weaviate/weaviate@v1.24.6/modules/text2vec-contextionary/classification/splitter.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package classification
    13  
    14  // TODO: This code is duplicated across weaviate and contextionary, which
    15  // makes changes risky. Can we find a single source of truth for this logic?
    16  
    17  import (
    18  	"strings"
    19  	"unicode"
    20  )
    21  
    22  func newSplitter() *splitter {
    23  	return &splitter{}
    24  }
    25  
    26  type splitter struct{}
    27  
    28  func (s *splitter) Split(corpus string) []string {
    29  	return strings.FieldsFunc(corpus, func(c rune) bool {
    30  		return !unicode.IsLetter(c) && !unicode.IsNumber(c)
    31  	})
    32  }