github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/store/nrt.go (about) 1 package store 2 3 import ( 4 "fmt" 5 "log" 6 "sync" 7 ) 8 9 const NRT_VERBOSE = false 10 11 // TODO 12 // - let subclass dictate policy...? 13 // - rename to MergeCachingDir? NRTCachingDIR 14 15 /* 16 Wraps a RAMDirectory around any provided delegate directory, to be 17 used during NRT search. 18 19 This class is likely only useful in a near-real-time context, where 20 indexing rate is lowish but reopen rate is highish, resulting in many 21 tiny files being written. This directory keeps such segments (as well 22 as the segments produced by merging them, as long as they are small 23 enough), in RAM. 24 25 This is safe to use: when you app calls IndexWriter.Commit(), all 26 cached files will be flushed from the cached and sync'd. 27 28 Here's a simple example usage: 29 30 fsDir, _ := OpenFSDirectory("/path/to/index") 31 cachedFSDir := NewNRTCachingDirectory(fsDir, 5.0, 60.0) 32 conf := NewIndexWriterConfig(VERSION_49, analyzer) 33 writer := NewIndexWriter(cachedFSDir, conf) 34 35 This will cache all newly flushed segments, all merged whose expected 36 segment size is <= 5 MB, unless the net cached bytes exceeds 60 MB at 37 which point all writes will not be cached (until the net bytes falls 38 below 60 MB). 39 */ 40 type NRTCachingDirectory struct { 41 Directory 42 sync.Locker 43 44 cache *RAMDirectory 45 maxMergeSizeBytes int64 46 maxCachedBytes int64 47 48 doCacheWrite func(name string, context IOContext) bool 49 uncacheLock sync.Locker 50 } 51 52 /* 53 We will cache a newly created output if 1) it's a flush or a merge 54 and the estimated size of the merged segment is <= maxMergedSizeMB, 55 and 2) the total cached bytes is <= maxCachedMB. 56 */ 57 func NewNRTCachingDirectory(delegate Directory, maxMergeSizeMB, maxCachedMB float64) (nrt *NRTCachingDirectory) { 58 nrt = &NRTCachingDirectory{ 59 Directory: delegate, 60 Locker: &sync.Mutex{}, 61 cache: NewRAMDirectory(), 62 maxMergeSizeBytes: int64(maxMergeSizeMB * 1024 * 1024), 63 maxCachedBytes: int64(maxCachedMB * 1024 * 1024), 64 uncacheLock: &sync.Mutex{}, 65 } 66 // Subclass can override this to customize logic; return true if this 67 // file should be written to the RAMDirectory. 68 nrt.doCacheWrite = func(name string, context IOContext) bool { 69 var bytes int64 70 if context.MergeInfo != nil { 71 bytes = context.MergeInfo.EstimatedMergeBytes 72 } else if context.FlushInfo != nil { 73 bytes = context.FlushInfo.EstimatedSegmentSize 74 } 75 if NRT_VERBOSE { 76 log.Printf("CACHE check merge=%v flush=%v size=%v", 77 context.MergeInfo, context.FlushInfo, bytes) 78 } 79 return name != "segments.gen" && 80 bytes <= nrt.maxMergeSizeBytes && 81 bytes+nrt.cache.RamBytesUsed() <= nrt.maxCachedBytes 82 } 83 return 84 } 85 86 func (nrt *NRTCachingDirectory) String() string { 87 return fmt.Sprintf("NRTCachingDirectory(%v; maxCacheMB=%.2f maxMergeSizeMB=%.2f)", 88 nrt.Directory, float32(nrt.maxCachedBytes/1024/1024), 89 float32(nrt.maxMergeSizeBytes/1024/1024)) 90 } 91 92 func (nrt *NRTCachingDirectory) ListAll() (all []string, err error) { 93 nrt.Lock() // synchronized 94 defer nrt.Unlock() 95 files := make(map[string]bool) 96 all, err = nrt.cache.ListAll() 97 if err != nil { 98 return 99 } 100 for _, f := range all { 101 files[f] = true 102 } 103 // LUCENE-1468: our NRTCachingDirectory will actually exist (RAMDir!), 104 // but if the underlying delegate is an FSDir and mkdirs() has not 105 // yet been called, because so far everything is a cached write, 106 // in this case, we don't want to throw a NoSuchDirectoryException 107 all, err = nrt.Directory.ListAll() 108 if err != nil { 109 if _, ok := err.(*NoSuchDirectoryError); ok { 110 // however if there are no cached files, then the directory truly 111 // does not "exist" 112 if len(files) == 0 { 113 return 114 } 115 } else { 116 return 117 } 118 } 119 for _, f := range all { 120 // Cannot do this -- if Lucene calls createOutput but files 121 // already exists then this falsely trips: 122 // assert2(!files[f], fmt.Sprintf("file '%v' is in both dirs", f) 123 files[f] = true 124 } 125 all = make([]string, 0, len(files)) 126 for f, _ := range files { 127 all = append(all, f) 128 } 129 return 130 } 131 132 // Returns how many bytes are being used by the RAMDirectory cache 133 // func (nrt *NRTCachingDirectory) sizeInBytes() int64 { 134 // return nrt.cache.sizeInBytes 135 // } 136 137 func (nrt *NRTCachingDirectory) FileExists(name string) bool { 138 nrt.Lock() // synchronized 139 defer nrt.Unlock() 140 return nrt._fileExists(name) 141 } 142 143 func (nrt *NRTCachingDirectory) _fileExists(name string) bool { 144 return nrt.cache.FileExists(name) || nrt.Directory.FileExists(name) 145 } 146 147 func (nrt *NRTCachingDirectory) DeleteFile(name string) error { 148 assert(nrt.Directory != nil) 149 nrt.Lock() // synchronized 150 defer nrt.Unlock() 151 152 if NRT_VERBOSE { 153 log.Printf("nrtdir.deleteFile name=%v", name) 154 } 155 if nrt.cache.FileExists(name) { 156 return nrt.cache.DeleteFile(name) 157 } else { 158 return nrt.Directory.DeleteFile(name) 159 } 160 } 161 162 func assert2(ok bool, msg string, args ...interface{}) { 163 if !ok { 164 panic(fmt.Sprintf(msg, args...)) 165 } 166 } 167 168 func (nrt *NRTCachingDirectory) FileLength(name string) (length int64, err error) { 169 nrt.Lock() // synchronized 170 defer nrt.Unlock() 171 if nrt.cache.FileExists(name) { 172 return nrt.cache.FileLength(name) 173 } else { 174 return nrt.Directory.FileLength(name) 175 } 176 } 177 178 func (nrt *NRTCachingDirectory) CreateOutput(name string, context IOContext) (out IndexOutput, err error) { 179 if NRT_VERBOSE { 180 log.Printf("nrtdir.createOutput name=%v", name) 181 } 182 if nrt.doCacheWrite(name, context) { 183 if NRT_VERBOSE { 184 log.Println(" to cache") 185 } 186 nrt.Directory.DeleteFile(name) // ignore IO error 187 return nrt.cache.CreateOutput(name, context) 188 } 189 nrt.cache.DeleteFile(name) // ignore IO error 190 return nrt.Directory.CreateOutput(name, context) 191 } 192 193 func (nrt *NRTCachingDirectory) Sync(fileNames []string) (err error) { 194 if NRT_VERBOSE { 195 log.Printf("nrtdir.sync files=%v", fileNames) 196 } 197 for _, fileName := range fileNames { 198 err = nrt.unCache(fileName) 199 if err != nil { 200 return 201 } 202 } 203 return nrt.Directory.Sync(fileNames) 204 } 205 206 func (nrt *NRTCachingDirectory) OpenInput(name string, context IOContext) (in IndexInput, err error) { 207 nrt.Lock() // synchronized 208 defer nrt.Unlock() 209 if NRT_VERBOSE { 210 log.Printf("nrtdir.openInput name=%v", name) 211 } 212 if nrt.cache.FileExists(name) { 213 if NRT_VERBOSE { 214 log.Println(" from cache") 215 } 216 return nrt.cache.OpenInput(name, context) 217 } 218 return nrt.Directory.OpenInput(name, context) 219 } 220 221 // func (nrt *NRTCachingDirectory) CreateSlicer(name string, context IOContext) (slicer IndexInputSlicer, err error) { 222 // nrt.EnsureOpen() 223 // if NRT_VERBOSE { 224 // log.Println("nrtdir.openInput name=%v", name) 225 // } 226 // if nrt.cache.FileExists(name) { 227 // if NRT_VERBOSE { 228 // log.Println(" from cache") 229 // } 230 // return nrt.cache.CreateSlicer(name, context) 231 // } 232 // return nrt.Directory.CreateSlicer(name, context) 233 // } 234 235 // Close this directory, which flushes any cached files to the 236 // delegate and then closes the delegate. 237 func (nrt *NRTCachingDirectory) Close() error { 238 // NOTE: technically we shouldn't have to do this, ie, 239 // IndexWriter should have sync'd all files, but we do 240 // it for defensive reasons... or in case the app is 241 // doing something custom (creating outputs directly w/o 242 // using IndexWriter): 243 all, err := nrt.cache.ListAll() 244 if err != nil { 245 return err 246 } 247 for _, fileName := range all { 248 nrt.unCache(fileName) 249 } 250 err = nrt.cache.Close() 251 if err != nil { 252 return err 253 } 254 return nrt.Directory.Close() 255 } 256 257 func (nrt *NRTCachingDirectory) unCache(fileName string) (err error) { 258 // Only let one goroutine uncache at a time; this only happens 259 // during commit() or close(): 260 nrt.uncacheLock.Lock() 261 defer nrt.uncacheLock.Unlock() 262 263 log.Printf("nrtdir.unCache name=%v", fileName) 264 if !nrt.cache.FileExists(fileName) { 265 // Another goroutine beat us... 266 return 267 } 268 context := IO_CONTEXT_DEFAULT 269 var out IndexOutput 270 out, err = nrt.Directory.CreateOutput(fileName, context) 271 if err != nil { 272 return 273 } 274 defer out.Close() 275 var in IndexInput 276 in, err = nrt.cache.OpenInput(fileName, context) 277 if err != nil { 278 return 279 } 280 defer in.Close() 281 err = out.CopyBytes(in, in.Length()) 282 if err != nil { 283 return 284 } 285 286 nrt.Lock() // Lock order: uncacheLock -> this 287 defer nrt.Unlock() 288 // Must sync here because other sync methods have 289 // if nrt.cache.FileExists(name) { ... } else { ... } 290 return nrt.cache.DeleteFile(fileName) 291 }