github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/terms.go (about) 1 package index 2 3 import ( 4 "fmt" 5 // "github.com/balzaczyy/golucene/core/analysis/tokenattributes" 6 . "github.com/balzaczyy/golucene/core/index/model" 7 "github.com/balzaczyy/golucene/core/util" 8 // "sort" 9 ) 10 11 // index/Term.java 12 13 /* 14 A Term represents a word from text. This is the unit of search. It is 15 composed of two elements, the text of the word, as a string, and the 16 name of the field that the text occurred in. 17 18 Note that terms may represents more than words from text fields, but 19 also things like dates, email addresses, urls, etc. 20 */ 21 type Term struct { 22 Field string 23 Bytes []byte 24 } 25 26 func NewTermFromBytes(fld string, bytes []byte) *Term { 27 return &Term{fld, bytes} 28 } 29 30 func NewTerm(fld string, text string) *Term { 31 return &Term{fld, []byte(text)} 32 } 33 34 /* 35 Constructs a Term with the given field and empty text. This serves 36 two purposes: 1) reuse of a Term with the same field. 2) pattern for 37 a query. 38 */ 39 func NewEmptyTerm(fld string) *Term { 40 return &Term{fld, nil} 41 } 42 43 type TermSorter []*Term 44 45 func (s TermSorter) Len() int { return len(s) } 46 func (s TermSorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 47 func (s TermSorter) Less(i, j int) bool { 48 if s[i].Field == s[j].Field { 49 return util.UTF8SortedAsUnicodeLess(s[i].Bytes, s[j].Bytes) 50 } 51 return s[i].Field < s[j].Field 52 } 53 54 func (t *Term) String() string { 55 return fmt.Sprintf("%v:%v", t.Field, utf8ToString(t.Bytes)) 56 } 57 58 // TermContext.java 59 60 type TermContext struct { 61 TopReaderContext IndexReaderContext 62 states []TermState 63 DocFreq int 64 TotalTermFreq int64 65 } 66 67 /** 68 * Creates an empty {@link TermContext} from a {@link IndexReaderContext} 69 */ 70 func NewTermContext(ctx IndexReaderContext) *TermContext { 71 // assert ctx != nil && ctx.IsTopLevel 72 var n int 73 if ctx.Leaves() == nil { 74 n = 1 75 } else { 76 n = len(ctx.Leaves()) 77 } 78 return &TermContext{TopReaderContext: ctx, states: make([]TermState, n)} 79 } 80 81 /** 82 * Creates a {@link TermContext} from a top-level {@link IndexReaderContext} and the 83 * given {@link Term}. This method will lookup the given term in all context's leaf readers 84 * and register each of the readers containing the term in the returned {@link TermContext} 85 * using the leaf reader's ordinal. 86 * <p> 87 * Note: the given context must be a top-level context. 88 */ 89 func NewTermContextFromTerm(ctx IndexReaderContext, t *Term) (tc *TermContext, err error) { 90 assert(ctx != nil && ctx.Parent() == nil) 91 perReaderTermState := NewTermContext(ctx) 92 // fmt.Printf("prts.build term=%v\n", t) 93 for _, leaf := range ctx.Leaves() { 94 // fmt.Printf(" r=%v\n", leaf.reader) 95 if fields := leaf.reader.Fields(); fields != nil { 96 if terms := fields.Terms(t.Field); terms != nil { 97 termsEnum := terms.Iterator(nil) 98 ok, err := termsEnum.SeekExact(t.Bytes) 99 if err != nil { 100 return nil, err 101 } 102 if ok { 103 termState, err := termsEnum.TermState() 104 if err != nil { 105 return nil, err 106 } 107 // log.Println(" found") 108 df, err := termsEnum.DocFreq() 109 if err != nil { 110 return nil, err 111 } 112 tf, err := termsEnum.TotalTermFreq() 113 if err != nil { 114 return nil, err 115 } 116 perReaderTermState.register(termState, leaf.Ord, df, tf) 117 } 118 } 119 } 120 } 121 return perReaderTermState, nil 122 } 123 124 func (tc *TermContext) register(state TermState, ord, docFreq int, totalTermFreq int64) { 125 assert2(state != nil, "state must not be nil") 126 assert(ord >= 0 && ord < len(tc.states)) 127 assert2(tc.states[ord] == nil, "state for ord: %v already registered", ord) 128 tc.DocFreq += docFreq 129 if tc.TotalTermFreq >= 0 && totalTermFreq >= 0 { 130 tc.TotalTermFreq += totalTermFreq 131 } else { 132 tc.TotalTermFreq = -1 133 } 134 tc.states[ord] = state 135 } 136 137 func (tc *TermContext) State(ord int) TermState { 138 assert(ord >= 0 && ord < len(tc.states)) 139 return tc.states[ord] 140 } 141 142 type MultiTerms struct { 143 subs []Terms 144 subSlices []ReaderSlice 145 } 146 147 func NewMultiTerms(subs []Terms, subSlices []ReaderSlice) *MultiTerms { 148 // TODO support customized comparator 149 return &MultiTerms{subs, subSlices} 150 } 151 152 func (mt *MultiTerms) Iterator(reuse TermsEnum) TermsEnum { 153 panic("not implemented yet") 154 } 155 156 func (mt *MultiTerms) DocCount() int { 157 sum := 0 158 for _, terms := range mt.subs { 159 if v := terms.DocCount(); v != -1 { 160 sum += v 161 } else { 162 return -1 163 } 164 } 165 return sum 166 } 167 168 func (mt *MultiTerms) SumTotalTermFreq() int64 { 169 var sum int64 170 for _, terms := range mt.subs { 171 if v := terms.SumTotalTermFreq(); v != -1 { 172 sum += v 173 } else { 174 return -1 175 } 176 } 177 return sum 178 } 179 180 func (mt *MultiTerms) SumDocFreq() int64 { 181 var sum int64 182 for _, terms := range mt.subs { 183 if v := terms.SumDocFreq(); v != -1 { 184 sum += v 185 } else { 186 return -1 187 } 188 } 189 return sum 190 }