github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/document/field.go (about)

     1  package document
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"github.com/balzaczyy/golucene/core/analysis"
     7  	. "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
     8  	"github.com/balzaczyy/golucene/core/index/model"
     9  	"io"
    10  	"log"
    11  	"strconv"
    12  )
    13  
    14  // document/Field.java
    15  
    16  type Field struct {
    17  	_type  *FieldType  // Field's type
    18  	_name  string      // Field's name
    19  	_data  interface{} // Field's value
    20  	_boost float32     // Field's boost
    21  
    22  	/*
    23  		Pre-analyzed tokenStream for indexed fields; this is
    24  		separte from fieldsData because you are allowed to
    25  		have both; eg maybe field has a String value but you
    26  		customize how it's tokenized
    27  	*/
    28  	_tokenStream analysis.TokenStream
    29  }
    30  
    31  /* Create field with Reader value. */
    32  func NewFieldFromReader(name string, reader io.RuneReader, ft *FieldType) *Field {
    33  	assert2(name != "", "name cannot be empty")
    34  	assert2(ft != nil, "type can not be nil")
    35  	assert2(reader != nil, "reader cannot be nil")
    36  	assert2(!ft.Stored(), "fields with a Reader value cannot be stored")
    37  	assert2(!ft.Indexed() || ft.Tokenized(), "non-tokenized fields must use String values")
    38  	return &Field{ft, name, reader, 1.0, nil}
    39  }
    40  
    41  // Create field with String value
    42  func NewFieldFromString(name, value string, ft *FieldType) *Field {
    43  	assert2(name != "", "name cannot be empty")
    44  	assert2(value != "", "value cannot be empty")
    45  	assert2(ft.stored || ft.indexed,
    46  		"it doesn't make sense to have a field that is neither indexed nor stored")
    47  	return &Field{_type: ft, _name: name, _data: value, _boost: 1}
    48  }
    49  
    50  func (f *Field) StringValue() string {
    51  	switch f._data.(type) {
    52  	case string:
    53  		return f._data.(string)
    54  	case int:
    55  		return strconv.Itoa(f._data.(int))
    56  	default:
    57  		log.Println("Unknown type", f._data)
    58  		panic("not implemented yet")
    59  	}
    60  }
    61  
    62  func assert2(ok bool, msg string) {
    63  	if !ok {
    64  		panic(msg)
    65  	}
    66  }
    67  
    68  func (f *Field) ReaderValue() io.RuneReader {
    69  	if v, ok := f._data.(io.RuneReader); ok {
    70  		return v
    71  	}
    72  	return nil
    73  }
    74  
    75  func (f *Field) Name() string {
    76  	return f._name
    77  }
    78  
    79  func (f *Field) Boost() float32 {
    80  	return f._boost
    81  }
    82  
    83  func (f *Field) NumericValue() interface{} {
    84  	switch f._data.(type) {
    85  	case int32, int64, float32, float64:
    86  		return f._data
    87  	default:
    88  		return nil
    89  	}
    90  }
    91  
    92  func (f *Field) BinaryValue() []byte {
    93  	if v, ok := f._data.([]byte); ok {
    94  		return v
    95  	}
    96  	return nil
    97  }
    98  
    99  func (f *Field) String() string {
   100  	var buf bytes.Buffer
   101  	fmt.Fprintf(&buf, "%v<%v:", f._type, f._name)
   102  	if f._data != nil {
   103  		fmt.Fprint(&buf, f._data)
   104  	}
   105  	fmt.Fprint(&buf, ">")
   106  	return buf.String()
   107  }
   108  
   109  func (f *Field) FieldType() model.IndexableFieldType {
   110  	return f._type
   111  }
   112  
   113  func (f *Field) TokenStream(analyzer analysis.Analyzer, reuse analysis.TokenStream) (ts analysis.TokenStream, err error) {
   114  	if !f.FieldType().Indexed() {
   115  		return nil, nil
   116  	}
   117  
   118  	if nt := f.FieldType().(*FieldType).NumericType(); nt != NumericType(0) {
   119  		panic("not implemented yet")
   120  	}
   121  
   122  	if !f.FieldType().Tokenized() {
   123  		assert2(f.StringValue() != "", "Non-Tokenized Fields must have a string value")
   124  		if _, ok := reuse.(*StringTokenStream); !ok {
   125  			reuse = newStringTokenStream()
   126  		}
   127  		reuse.(*StringTokenStream).setValue(f.StringValue())
   128  		return reuse, nil
   129  	}
   130  
   131  	if f._tokenStream != nil {
   132  		return f._tokenStream, nil
   133  	} else if f.ReaderValue() != nil {
   134  		return analyzer.TokenStreamForReader(f._name, f.ReaderValue())
   135  	} else if f.StringValue() != "" {
   136  		return analyzer.TokenStreamForString(f._name, f.StringValue())
   137  	}
   138  
   139  	panic(fmt.Sprintf("Field must have either TokenStream, String, Reader, or Number value; got %v", f))
   140  }
   141  
   142  type StringTokenStream struct {
   143  	*analysis.TokenStreamImpl
   144  	termAttribute   CharTermAttribute
   145  	offsetAttribute OffsetAttribute
   146  	used            bool
   147  	value           string
   148  }
   149  
   150  /*
   151  Creates a new TokenStream that returns a string as single token.
   152  
   153  Warning: Does not initialize the value, you must call setValue() afterwards!
   154  */
   155  func newStringTokenStream() *StringTokenStream {
   156  	ans := &StringTokenStream{TokenStreamImpl: analysis.NewTokenStream()}
   157  	ans.termAttribute = ans.Attributes().Add("CharTermAttribute").(CharTermAttribute)
   158  	ans.offsetAttribute = ans.Attributes().Add("OffsetAttribute").(OffsetAttribute)
   159  	return ans
   160  }
   161  
   162  func (ts *StringTokenStream) setValue(value string) {
   163  	ts.value = value
   164  }
   165  
   166  func (ts *StringTokenStream) IncrementToken() (bool, error) {
   167  	if ts.used {
   168  		return false, nil
   169  	}
   170  	ts.Attributes().Clear()
   171  	ts.termAttribute.AppendString(ts.value)
   172  	ts.offsetAttribute.SetOffset(0, len(ts.value))
   173  	ts.used = true
   174  	return true, nil
   175  }
   176  
   177  /* Specifies whether and how a field should be stored. */
   178  type Store int
   179  
   180  /*
   181  Store the original field value in the index. This is useful for short
   182  texts like a document's title which should be displayed with the
   183  results. The value is stored in its original form, i.e. no analyzer
   184  is used before it is stored.
   185  */
   186  const STORE_YES = Store(1)
   187  
   188  /* Do not store the field's value in the index. */
   189  const STORE_NO = Store(2)
   190  
   191  // document/StringField.java
   192  
   193  /* Indexed, not tokenized, omits norms, indexes DOCS_ONLY, not stored. */
   194  var STRING_FIELD_TYPE_NOT_STORED = func() *FieldType {
   195  	ft := newFieldType()
   196  	ft.indexed = true
   197  	ft._omitNorms = true
   198  	ft._indexOptions = model.INDEX_OPT_DOCS_ONLY
   199  	ft._tokenized = false
   200  	ft.frozen = true
   201  	return ft
   202  }()
   203  
   204  /* Indexed, not tokenized, omits norms, indexes DOCS_ONLY, stored */
   205  var STRING_FIELD_TYPE_STORED = func() *FieldType {
   206  	ft := newFieldType()
   207  	ft.indexed = true
   208  	ft._omitNorms = true
   209  	ft._indexOptions = model.INDEX_OPT_DOCS_ONLY
   210  	ft.stored = true
   211  	ft._tokenized = false
   212  	ft.frozen = true
   213  	return ft
   214  }()
   215  
   216  /*
   217  Creates a new field that is indexed but not tokenized: the entire
   218  String value is indexed as a single token. For example, this might be
   219  used for a 'country' field or an 'id' field, or any field that you
   220  intend to use for sorting or access through the field cache.
   221  */
   222  func newStringField(name, value string, stored Store) *Field {
   223  	return NewFieldFromString(name, value, map[Store]*FieldType{
   224  		STORE_YES: STRING_FIELD_TYPE_STORED,
   225  		STORE_NO:  STRING_FIELD_TYPE_NOT_STORED,
   226  	}[stored])
   227  }
   228  
   229  // document/TextField.java
   230  
   231  /* indexed, tokenized, not stored. */
   232  var TEXT_FIELD_TYPE_NOT_STORED = func() *FieldType {
   233  	ft := newFieldType()
   234  	ft.indexed = true
   235  	ft._tokenized = true
   236  	ft.frozen = true
   237  	return ft
   238  }()
   239  
   240  /* indexed, tokenized, stored. */
   241  var TEXT_FIELD_TYPE_STORED = func() *FieldType {
   242  	ft := newFieldType()
   243  	ft.indexed = true
   244  	ft._tokenized = true
   245  	ft.stored = true
   246  	ft.frozen = true
   247  	return ft
   248  }()
   249  
   250  /*
   251  A field that is indexed and tokenized, without term vectors. For
   252  example, this would be used on a 'body' field, that contains the bulk
   253  of a document's text.
   254  */
   255  type TextField struct {
   256  	*Field
   257  }
   258  
   259  /* Creates a new un-stored TextField with Reader value */
   260  func NewTextFieldFromReader(name string, reader io.RuneReader) *TextField {
   261  	return &TextField{
   262  		NewFieldFromReader(name, reader, TEXT_FIELD_TYPE_NOT_STORED),
   263  	}
   264  }
   265  
   266  func NewTextFieldFromString(name, value string, store Store) *TextField {
   267  	return &TextField{NewFieldFromString(name, value, map[Store]*FieldType{
   268  		STORE_YES: TEXT_FIELD_TYPE_STORED,
   269  		STORE_NO:  TEXT_FIELD_TYPE_NOT_STORED,
   270  	}[store])}
   271  }
   272  
   273  // document/StoredField.java
   274  
   275  // Type for a stored-only field.
   276  var STORED_FIELD_TYPE = func() *FieldType {
   277  	ans := newFieldType()
   278  	ans.stored = true
   279  	return ans
   280  }()
   281  
   282  /*
   283  A field whose value is stored so that IndexSearcher.doc()
   284  and IndexReader.document() will return the field and its
   285  value.
   286  */
   287  type StoredField struct {
   288  	*Field
   289  }
   290  
   291  /*
   292  Create a stored-only field with the given binary value.
   293  
   294  NOTE: the provided byte[] is not copied so be sure
   295  not to change it until you're done with this field.
   296  */
   297  // func newStoredField(name string, value []byte) *StoredField {
   298  // 	return &StoredField{newStringField(name, value, STORED_FIELD_TYPE)}
   299  // }