github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene49/normsConsumer.go (about) 1 package lucene49 2 3 import ( 4 "github.com/balzaczyy/golucene/core/codec" 5 . "github.com/balzaczyy/golucene/core/index/model" 6 "github.com/balzaczyy/golucene/core/store" 7 "github.com/balzaczyy/golucene/core/util" 8 "github.com/balzaczyy/golucene/core/util/packed" 9 "math" 10 ) 11 12 // lucene49/Lucene49NormsConsumer.java 13 14 const ( 15 DELTA_COMPRESSED = 0 16 TABLE_COMPRESSED = 1 17 CONST_COMPRESSED = 2 18 UNCOMPRESSED = 3 19 ) 20 21 type NormsConsumer struct { 22 data, meta store.IndexOutput 23 maxDoc int 24 } 25 26 func newLucene49NormsConsumer(state *SegmentWriteState, 27 dataCodec, dataExtension, metaCodec, metaExtension string) (nc *NormsConsumer, err error) { 28 29 assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(1)) 30 assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(2)) 31 assert(packed.PackedFormat(packed.PACKED_SINGLE_BLOCK).IsSupported(4)) 32 33 nc = &NormsConsumer{maxDoc: state.SegmentInfo.DocCount()} 34 var success = false 35 defer func() { 36 if !success { 37 util.CloseWhileSuppressingError(nc) 38 } 39 }() 40 41 dataName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, dataExtension) 42 if nc.data, err = state.Directory.CreateOutput(dataName, state.Context); err != nil { 43 return nil, err 44 } 45 if err = codec.WriteHeader(nc.data, dataCodec, VERSION_CURRENT); err != nil { 46 return nil, err 47 } 48 metaName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, metaExtension) 49 if nc.meta, err = state.Directory.CreateOutput(metaName, state.Context); err != nil { 50 return nil, err 51 } 52 if err = codec.WriteHeader(nc.meta, metaCodec, VERSION_CURRENT); err != nil { 53 return nil, err 54 } 55 success = true 56 return nc, nil 57 } 58 59 func (nc *NormsConsumer) AddNumericField(field *FieldInfo, 60 iter func() func() (interface{}, bool)) (err error) { 61 62 if err = nc.meta.WriteVInt(field.Number); err != nil { 63 return 64 } 65 minValue, maxValue := int64(math.MaxInt64), int64(math.MinInt64) 66 // TODO: more efficient? 67 uniqueValues := newNormMap() 68 69 count := int64(0) 70 next := iter() 71 for { 72 nv, ok := next() 73 if !ok { 74 break 75 } 76 assert2(nv != nil, "illegal norms data for field %v, got null for value: %v", field.Name, count) 77 v := nv.(int64) 78 79 if v < minValue { 80 minValue = v 81 } 82 if v > maxValue { 83 maxValue = v 84 } 85 86 if uniqueValues != nil && uniqueValues.add(v) && uniqueValues.size > 256 { 87 uniqueValues = nil 88 } 89 90 count++ 91 } 92 assert2(count == int64(nc.maxDoc), 93 "illegal norms data for field %v, expected %v values, got %v", 94 field.Name, nc.maxDoc, count) 95 96 if uniqueValues != nil && uniqueValues.size == 1 { 97 // 0 bpv 98 if err = nc.meta.WriteByte(CONST_COMPRESSED); err != nil { 99 return 100 } 101 if err = nc.meta.WriteLong(minValue); err != nil { 102 return 103 } 104 } else if uniqueValues != nil { 105 // small number of unique values; this is the typical case: 106 // we only use bpv=1,2,4,8 107 format := packed.PackedFormat(packed.PACKED_SINGLE_BLOCK) 108 bitsPerValue := packed.BitsRequired(int64(uniqueValues.size) - 1) 109 if bitsPerValue == 3 { 110 bitsPerValue = 4 111 } else if bitsPerValue > 4 { 112 bitsPerValue = 8 113 } 114 115 if bitsPerValue == 8 && minValue >= 0 && maxValue <= 255 { 116 if err = store.Stream(nc.meta).WriteByte(UNCOMPRESSED). // uncompressed []byte 117 WriteLong(nc.data.FilePointer()). 118 Close(); err != nil { 119 return err 120 } 121 next = iter() 122 for { 123 nv, ok := next() 124 if !ok { 125 break 126 } 127 n := byte(0) 128 if nv != nil { 129 n = byte(nv.(int64)) 130 } 131 if err = nc.data.WriteByte(byte(n)); err != nil { 132 return err 133 } 134 } 135 } else { 136 if err = store.Stream(nc.meta).WriteByte(TABLE_COMPRESSED). // table-compressed 137 WriteLong(nc.data.FilePointer()). 138 Close(); err != nil { 139 return err 140 } 141 if err = nc.data.WriteVInt(packed.VERSION_CURRENT); err != nil { 142 return err 143 } 144 145 decode := uniqueValues.decodeTable() 146 // upgrade to power of two sized array 147 size := 1 << uint(bitsPerValue) 148 if err = nc.data.WriteVInt(int32(size)); err != nil { 149 return err 150 } 151 for _, v := range decode { 152 if err = nc.data.WriteLong(v); err != nil { 153 return err 154 } 155 } 156 for i := len(decode); i < size; i++ { 157 if err = nc.data.WriteLong(0); err != nil { 158 return err 159 } 160 } 161 162 if err = store.Stream(nc.data).WriteVInt(int32(format.Id())). 163 WriteVInt(int32(bitsPerValue)). 164 Close(); err != nil { 165 return err 166 } 167 168 writer := packed.WriterNoHeader(nc.data, format, nc.maxDoc, bitsPerValue, packed.DEFAULT_BUFFER_SIZE) 169 next = iter() 170 for { 171 nv, ok := next() 172 if !ok { 173 break 174 } 175 if err = writer.Add(int64(uniqueValues.ord(nv.(int64)))); err != nil { 176 return err 177 } 178 } 179 if err = writer.Finish(); err != nil { 180 return err 181 } 182 } 183 } else { 184 panic("not implemented yet") 185 } 186 return nil 187 } 188 189 type Longs []int64 190 191 func (a Longs) Len() int { return len(a) } 192 func (a Longs) Less(i, j int) bool { return a[i] < a[j] } 193 func (a Longs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 194 195 func (nc *NormsConsumer) Close() (err error) { 196 var success = false 197 defer func() { 198 if success { 199 err = util.Close(nc.data, nc.meta) 200 } else { 201 util.CloseWhileSuppressingError(nc.data, nc.meta) 202 } 203 }() 204 205 if nc.meta != nil { 206 if err = nc.meta.WriteVInt(-1); err != nil { // write EOF marker 207 return 208 } 209 if err = codec.WriteFooter(nc.meta); err != nil { // write checksum 210 return 211 } 212 } 213 if nc.data != nil { 214 if err = codec.WriteFooter(nc.data); err != nil { // write checksum 215 return 216 } 217 } 218 success = true 219 return nil 220 } 221 222 /* 223 Specialized deduplication of long-ord for norms: 99.99999% of the 224 time this will be a single-byte range. 225 */ 226 type NormMap struct { 227 // we use int16: at most we will add 257 values to this map before its rejected as too big above. 228 singleByteRange []int16 229 other map[int64]int16 230 size int 231 } 232 233 func newNormMap() *NormMap { 234 ans := &NormMap{ 235 singleByteRange: make([]int16, 256), 236 other: make(map[int64]int16), 237 } 238 for i, _ := range ans.singleByteRange { 239 ans.singleByteRange[i] = -1 240 } 241 return ans 242 } 243 244 /* Adds an item to the mapping. Returns true if actually added. */ 245 func (m *NormMap) add(l int64) bool { 246 assert(m.size <= 256) // once we add > 256 values, we nullify the map in addNumericField and don't use this strategy 247 if l >= math.MinInt8 && l <= math.MaxInt8 { 248 index := int(l + 128) 249 if previous := m.singleByteRange[index]; previous < 0 { 250 m.singleByteRange[index] = int16(m.size) 251 m.size++ 252 return true 253 } 254 return false 255 } 256 if _, ok := m.other[l]; !ok { 257 m.other[l] = int16(m.size) 258 m.size++ 259 return true 260 } 261 return false 262 } 263 264 /* Gets the ordinal for a previously added item. */ 265 func (m *NormMap) ord(l int64) int { 266 panic("niy") 267 } 268 269 /* Retrieves the ordinal table for previously added items. */ 270 func (m *NormMap) decodeTable() []int64 { 271 panic("niy") 272 }