github.com/mattermosttest/mattermost-server/v5@v5.0.0-20200917143240-9dfa12e121f9/services/searchengine/bleveengine/indexer/indexing_job.go (about) 1 // Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. 2 // See LICENSE.txt for license information. 3 4 package ebleveengine 5 6 import ( 7 "context" 8 "net/http" 9 "strconv" 10 "time" 11 12 "github.com/mattermost/mattermost-server/v5/app" 13 "github.com/mattermost/mattermost-server/v5/jobs" 14 tjobs "github.com/mattermost/mattermost-server/v5/jobs/interfaces" 15 "github.com/mattermost/mattermost-server/v5/mlog" 16 "github.com/mattermost/mattermost-server/v5/model" 17 "github.com/mattermost/mattermost-server/v5/services/searchengine/bleveengine" 18 ) 19 20 const ( 21 BATCH_SIZE = 1000 22 TIME_BETWEEN_BATCHES = 100 23 ESTIMATED_POST_COUNT = 10000000 24 ESTIMATED_CHANNEL_COUNT = 100000 25 ESTIMATED_USER_COUNT = 10000 26 ) 27 28 func init() { 29 app.RegisterJobsBleveIndexerInterface(func(s *app.Server) tjobs.IndexerJobInterface { 30 return &BleveIndexerInterfaceImpl{s} 31 }) 32 } 33 34 type BleveIndexerInterfaceImpl struct { 35 Server *app.Server 36 } 37 38 type BleveIndexerWorker struct { 39 name string 40 stop chan bool 41 stopped chan bool 42 jobs chan model.Job 43 jobServer *jobs.JobServer 44 45 engine *bleveengine.BleveEngine 46 } 47 48 func (bi *BleveIndexerInterfaceImpl) MakeWorker() model.Worker { 49 if bi.Server.SearchEngine.BleveEngine == nil { 50 return nil 51 } 52 return &BleveIndexerWorker{ 53 name: "BleveIndexer", 54 stop: make(chan bool, 1), 55 stopped: make(chan bool, 1), 56 jobs: make(chan model.Job), 57 jobServer: bi.Server.Jobs, 58 59 engine: bi.Server.SearchEngine.BleveEngine.(*bleveengine.BleveEngine), 60 } 61 } 62 63 type IndexingProgress struct { 64 Now time.Time 65 StartAtTime int64 66 EndAtTime int64 67 LastEntityTime int64 68 TotalPostsCount int64 69 DonePostsCount int64 70 DonePosts bool 71 TotalChannelsCount int64 72 DoneChannelsCount int64 73 DoneChannels bool 74 TotalUsersCount int64 75 DoneUsersCount int64 76 DoneUsers bool 77 } 78 79 func (ip *IndexingProgress) CurrentProgress() int64 { 80 return (ip.DonePostsCount + ip.DoneChannelsCount + ip.DoneUsersCount) * 100 / (ip.TotalPostsCount + ip.TotalChannelsCount + ip.TotalUsersCount) 81 } 82 83 func (ip *IndexingProgress) IsDone() bool { 84 return ip.DonePosts && ip.DoneChannels && ip.DoneUsers 85 } 86 87 func (worker *BleveIndexerWorker) JobChannel() chan<- model.Job { 88 return worker.jobs 89 } 90 91 func (worker *BleveIndexerWorker) Run() { 92 mlog.Debug("Worker Started", mlog.String("workername", worker.name)) 93 94 defer func() { 95 mlog.Debug("Worker: Finished", mlog.String("workername", worker.name)) 96 worker.stopped <- true 97 }() 98 99 for { 100 select { 101 case <-worker.stop: 102 mlog.Debug("Worker: Received stop signal", mlog.String("workername", worker.name)) 103 return 104 case job := <-worker.jobs: 105 mlog.Debug("Worker: Received a new candidate job.", mlog.String("workername", worker.name)) 106 worker.DoJob(&job) 107 } 108 } 109 } 110 111 func (worker *BleveIndexerWorker) Stop() { 112 mlog.Debug("Worker Stopping", mlog.String("workername", worker.name)) 113 worker.stop <- true 114 <-worker.stopped 115 } 116 117 func (worker *BleveIndexerWorker) DoJob(job *model.Job) { 118 claimed, err := worker.jobServer.ClaimJob(job) 119 if err != nil { 120 mlog.Warn("Worker: Error ocurred while trying to claim job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 121 return 122 } 123 if !claimed { 124 return 125 } 126 127 mlog.Info("Worker: Indexing job claimed by worker", mlog.String("workername", worker.name), mlog.String("job_id", job.Id)) 128 129 if !worker.engine.IsActive() { 130 appError := model.NewAppError("BleveIndexerWorker", "bleveengine.indexer.do_job.engine_inactive", nil, "", http.StatusInternalServerError) 131 if err := worker.jobServer.SetJobError(job, appError); err != nil { 132 mlog.Error("Worker: Failed to run job as ") 133 } 134 return 135 } 136 137 progress := IndexingProgress{ 138 Now: time.Now(), 139 DonePosts: false, 140 DoneChannels: false, 141 DoneUsers: false, 142 StartAtTime: 0, 143 EndAtTime: model.GetMillis(), 144 } 145 146 // Extract the start and end times, if they are set. 147 if startString, ok := job.Data["start_time"]; ok { 148 startInt, err := strconv.ParseInt(startString, 10, 64) 149 if err != nil { 150 mlog.Error("Worker: Failed to parse start_time for job", mlog.String("workername", worker.name), mlog.String("start_time", startString), mlog.String("job_id", job.Id), mlog.Err(err)) 151 appError := model.NewAppError("BleveIndexerWorker", "bleveengine.indexer.do_job.parse_start_time.error", nil, err.Error(), http.StatusInternalServerError) 152 if err := worker.jobServer.SetJobError(job, appError); err != nil { 153 mlog.Error("Worker: Failed to set job error", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err), mlog.NamedErr("set_error", appError)) 154 } 155 return 156 } 157 progress.StartAtTime = startInt 158 progress.LastEntityTime = progress.StartAtTime 159 } else { 160 // Set start time to oldest post in the database. 161 oldestPost, err := worker.jobServer.Store.Post().GetOldest() 162 if err != nil { 163 mlog.Error("Worker: Failed to fetch oldest post for job.", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.String("start_time", startString), mlog.Err(err)) 164 appError := model.NewAppError("BleveIndexerWorker", "bleveengine.indexer.do_job.get_oldest_post.error", nil, err.Error(), http.StatusInternalServerError) 165 if err := worker.jobServer.SetJobError(job, appError); err != nil { 166 mlog.Error("Worker: Failed to set job error", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err), mlog.NamedErr("set_error", appError)) 167 } 168 return 169 } 170 progress.StartAtTime = oldestPost.CreateAt 171 progress.LastEntityTime = progress.StartAtTime 172 } 173 174 if endString, ok := job.Data["end_time"]; ok { 175 endInt, err := strconv.ParseInt(endString, 10, 64) 176 if err != nil { 177 mlog.Error("Worker: Failed to parse end_time for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.String("end_time", endString), mlog.Err(err)) 178 appError := model.NewAppError("BleveIndexerWorker", "bleveengine.indexer.do_job.parse_end_time.error", nil, err.Error(), http.StatusInternalServerError) 179 if err := worker.jobServer.SetJobError(job, appError); err != nil { 180 mlog.Error("Worker: Failed to set job errorv", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err), mlog.NamedErr("set_error", appError)) 181 } 182 return 183 } 184 progress.EndAtTime = endInt 185 } 186 187 // Counting all posts may fail or timeout when the posts table is large. If this happens, log a warning, but carry 188 // on with the indexing job anyway. The only issue is that the progress % reporting will be inaccurate. 189 if count, err := worker.jobServer.Store.Post().AnalyticsPostCount("", false, false); err != nil { 190 mlog.Warn("Worker: Failed to fetch total post count for job. An estimated value will be used for progress reporting.", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 191 progress.TotalPostsCount = ESTIMATED_POST_COUNT 192 } else { 193 progress.TotalPostsCount = count 194 } 195 196 // Same possible fail as above can happen when counting channels 197 if count, err := worker.jobServer.Store.Channel().AnalyticsTypeCount("", "O"); err != nil { 198 mlog.Warn("Worker: Failed to fetch total channel count for job. An estimated value will be used for progress reporting.", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 199 progress.TotalChannelsCount = ESTIMATED_CHANNEL_COUNT 200 } else { 201 progress.TotalChannelsCount = count 202 } 203 204 // Same possible fail as above can happen when counting users 205 if count, err := worker.jobServer.Store.User().Count(model.UserCountOptions{}); err != nil { 206 mlog.Warn("Worker: Failed to fetch total user count for job. An estimated value will be used for progress reporting.", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 207 progress.TotalUsersCount = ESTIMATED_USER_COUNT 208 } else { 209 progress.TotalUsersCount = count 210 } 211 212 cancelCtx, cancelCancelWatcher := context.WithCancel(context.Background()) 213 cancelWatcherChan := make(chan interface{}, 1) 214 go worker.jobServer.CancellationWatcher(cancelCtx, job.Id, cancelWatcherChan) 215 216 defer cancelCancelWatcher() 217 218 for { 219 select { 220 case <-cancelWatcherChan: 221 mlog.Info("Worker: Indexing job has been canceled via CancellationWatcher", mlog.String("workername", worker.name), mlog.String("job_id", job.Id)) 222 if err := worker.jobServer.SetJobCanceled(job); err != nil { 223 mlog.Error("Worker: Failed to mark job as cancelled", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 224 } 225 return 226 227 case <-worker.stop: 228 mlog.Info("Worker: Indexing has been canceled via Worker Stop", mlog.String("workername", worker.name), mlog.String("job_id", job.Id)) 229 if err := worker.jobServer.SetJobCanceled(job); err != nil { 230 mlog.Error("Worker: Failed to mark job as canceled", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 231 } 232 return 233 234 case <-time.After(TIME_BETWEEN_BATCHES * time.Millisecond): 235 var err *model.AppError 236 if progress, err = worker.IndexBatch(progress); err != nil { 237 mlog.Error("Worker: Failed to index batch for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 238 if err2 := worker.jobServer.SetJobError(job, err); err2 != nil { 239 mlog.Error("Worker: Failed to set job error", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err2), mlog.NamedErr("set_error", err)) 240 } 241 return 242 } 243 244 if err := worker.jobServer.SetJobProgress(job, progress.CurrentProgress()); err != nil { 245 mlog.Error("Worker: Failed to set progress for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 246 if err2 := worker.jobServer.SetJobError(job, err); err2 != nil { 247 mlog.Error("Worker: Failed to set error for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err2), mlog.NamedErr("set_error", err)) 248 } 249 return 250 } 251 252 if progress.IsDone() { 253 if err := worker.jobServer.SetJobSuccess(job); err != nil { 254 mlog.Error("Worker: Failed to set success for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err)) 255 if err2 := worker.jobServer.SetJobError(job, err); err2 != nil { 256 mlog.Error("Worker: Failed to set error for job", mlog.String("workername", worker.name), mlog.String("job_id", job.Id), mlog.Err(err2), mlog.NamedErr("set_error", err)) 257 } 258 } 259 mlog.Info("Worker: Indexing job finished successfully", mlog.String("workername", worker.name), mlog.String("job_id", job.Id)) 260 return 261 } 262 } 263 } 264 } 265 266 func (worker *BleveIndexerWorker) IndexBatch(progress IndexingProgress) (IndexingProgress, *model.AppError) { 267 if !progress.DonePosts { 268 return worker.IndexPostsBatch(progress) 269 } 270 if !progress.DoneChannels { 271 return worker.IndexChannelsBatch(progress) 272 } 273 if !progress.DoneUsers { 274 return worker.IndexUsersBatch(progress) 275 } 276 return progress, model.NewAppError("BleveIndexerWorker", "bleveengine.indexer.index_batch.nothing_left_to_index.error", nil, "", http.StatusInternalServerError) 277 } 278 279 func (worker *BleveIndexerWorker) IndexPostsBatch(progress IndexingProgress) (IndexingProgress, *model.AppError) { 280 endTime := progress.LastEntityTime + int64(*worker.jobServer.Config().BleveSettings.BulkIndexingTimeWindowSeconds*1000) 281 282 var posts []*model.PostForIndexing 283 284 tries := 0 285 for posts == nil { 286 var err *model.AppError 287 posts, err = worker.jobServer.Store.Post().GetPostsBatchForIndexing(progress.LastEntityTime, endTime, BATCH_SIZE) 288 if err != nil { 289 if tries >= 10 { 290 return progress, err 291 } else { 292 mlog.Warn("Failed to get posts batch for indexing. Retrying.", mlog.Err(err)) 293 294 // Wait a bit before trying again. 295 time.Sleep(15 * time.Second) 296 } 297 } 298 299 tries++ 300 } 301 302 newLastMessageTime, err := worker.BulkIndexPosts(posts, progress) 303 if err != nil { 304 return progress, err 305 } 306 307 // Due to the "endTime" parameter in the store query, we might get an incomplete batch before the end. In this 308 // case, set the "newLastMessageTime" to the endTime so we don't get stuck running the same query in a loop. 309 if len(posts) < BATCH_SIZE { 310 newLastMessageTime = endTime 311 } 312 313 // When to Stop: we index either until we pass a batch of messages where the last 314 // message is created at or after the specified end time when setting up the batch 315 // index, or until two consecutive full batches have the same end time of their final 316 // messages. This second case is safe as long as the assumption that the database 317 // cannot contain more messages with the same CreateAt time than the batch size holds. 318 if progress.EndAtTime <= newLastMessageTime { 319 progress.DonePosts = true 320 progress.LastEntityTime = progress.StartAtTime 321 } else if progress.LastEntityTime == newLastMessageTime && len(posts) == BATCH_SIZE { 322 mlog.Error("More posts with the same CreateAt time were detected than the permitted batch size. Aborting indexing job.", mlog.Int64("CreateAt", newLastMessageTime), mlog.Int("Batch Size", BATCH_SIZE)) 323 progress.DonePosts = true 324 progress.LastEntityTime = progress.StartAtTime 325 } else { 326 progress.LastEntityTime = newLastMessageTime 327 } 328 329 progress.DonePostsCount += int64(len(posts)) 330 331 return progress, nil 332 } 333 334 func (worker *BleveIndexerWorker) BulkIndexPosts(posts []*model.PostForIndexing, progress IndexingProgress) (int64, *model.AppError) { 335 lastCreateAt := int64(0) 336 batch := worker.engine.PostIndex.NewBatch() 337 338 for _, post := range posts { 339 if post.DeleteAt == 0 { 340 searchPost := bleveengine.BLVPostFromPostForIndexing(post) 341 batch.Index(searchPost.Id, searchPost) 342 } else { 343 batch.Delete(post.Id) 344 } 345 346 lastCreateAt = post.CreateAt 347 } 348 349 worker.engine.Mutex.RLock() 350 defer worker.engine.Mutex.RUnlock() 351 352 if err := worker.engine.PostIndex.Batch(batch); err != nil { 353 return 0, model.NewAppError("BleveIndexerWorker.BulkIndexPosts", "bleveengine.indexer.do_job.bulk_index_posts.batch_error", nil, err.Error(), http.StatusInternalServerError) 354 } 355 return lastCreateAt, nil 356 } 357 358 func (worker *BleveIndexerWorker) IndexChannelsBatch(progress IndexingProgress) (IndexingProgress, *model.AppError) { 359 endTime := progress.LastEntityTime + int64(*worker.jobServer.Config().BleveSettings.BulkIndexingTimeWindowSeconds*1000) 360 361 var channels []*model.Channel 362 363 tries := 0 364 for channels == nil { 365 var err *model.AppError 366 channels, err = worker.jobServer.Store.Channel().GetChannelsBatchForIndexing(progress.LastEntityTime, endTime, BATCH_SIZE) 367 if err != nil { 368 if tries >= 10 { 369 return progress, err 370 } 371 372 mlog.Warn("Failed to get channels batch for indexing. Retrying.", mlog.Err(err)) 373 374 // Wait a bit before trying again. 375 time.Sleep(15 * time.Second) 376 } 377 tries++ 378 } 379 380 newLastChannelTime, err := worker.BulkIndexChannels(channels, progress) 381 if err != nil { 382 return progress, err 383 } 384 385 // Due to the "endTime" parameter in the store query, we might get an incomplete batch before the end. In this 386 // case, set the "newLastChannelTime" to the endTime so we don't get stuck running the same query in a loop. 387 if len(channels) < BATCH_SIZE { 388 newLastChannelTime = endTime 389 } 390 391 // When to Stop: we index either until we pass a batch of channels where the last 392 // channel is created at or after the specified end time when setting up the batch 393 // index, or until two consecutive full batches have the same end time of their final 394 // channels. This second case is safe as long as the assumption that the database 395 // cannot contain more channels with the same CreateAt time than the batch size holds. 396 if progress.EndAtTime <= newLastChannelTime { 397 progress.DoneChannels = true 398 progress.LastEntityTime = progress.StartAtTime 399 } else if progress.LastEntityTime == newLastChannelTime && len(channels) == BATCH_SIZE { 400 mlog.Error("More channels with the same CreateAt time were detected than the permitted batch size. Aborting indexing job.", mlog.Int64("CreateAt", newLastChannelTime), mlog.Int("Batch Size", BATCH_SIZE)) 401 progress.DoneChannels = true 402 progress.LastEntityTime = progress.StartAtTime 403 } else { 404 progress.LastEntityTime = newLastChannelTime 405 } 406 407 progress.DoneChannelsCount += int64(len(channels)) 408 409 return progress, nil 410 } 411 412 func (worker *BleveIndexerWorker) BulkIndexChannels(channels []*model.Channel, progress IndexingProgress) (int64, *model.AppError) { 413 lastCreateAt := int64(0) 414 batch := worker.engine.ChannelIndex.NewBatch() 415 416 for _, channel := range channels { 417 if channel.DeleteAt == 0 { 418 searchChannel := bleveengine.BLVChannelFromChannel(channel) 419 batch.Index(searchChannel.Id, searchChannel) 420 } else { 421 batch.Delete(channel.Id) 422 } 423 424 lastCreateAt = channel.CreateAt 425 } 426 427 worker.engine.Mutex.RLock() 428 defer worker.engine.Mutex.RUnlock() 429 430 if err := worker.engine.ChannelIndex.Batch(batch); err != nil { 431 return 0, model.NewAppError("BleveIndexerWorker.BulkIndexChannels", "bleveengine.indexer.do_job.bulk_index_channels.batch_error", nil, err.Error(), http.StatusInternalServerError) 432 } 433 return lastCreateAt, nil 434 } 435 436 func (worker *BleveIndexerWorker) IndexUsersBatch(progress IndexingProgress) (IndexingProgress, *model.AppError) { 437 endTime := progress.LastEntityTime + int64(*worker.jobServer.Config().BleveSettings.BulkIndexingTimeWindowSeconds*1000) 438 439 var users []*model.UserForIndexing 440 441 tries := 0 442 for users == nil { 443 if usersBatch, err := worker.jobServer.Store.User().GetUsersBatchForIndexing(progress.LastEntityTime, endTime, BATCH_SIZE); err != nil { 444 if tries >= 10 { 445 return progress, err 446 } else { 447 mlog.Warn("Failed to get users batch for indexing. Retrying.", mlog.Err(err)) 448 449 // Wait a bit before trying again. 450 time.Sleep(15 * time.Second) 451 } 452 } else { 453 users = usersBatch 454 } 455 456 tries++ 457 } 458 459 newLastUserTime, err := worker.BulkIndexUsers(users, progress) 460 if err != nil { 461 return progress, err 462 } 463 464 // Due to the "endTime" parameter in the store query, we might get an incomplete batch before the end. In this 465 // case, set the "newLastUserTime" to the endTime so we don't get stuck running the same query in a loop. 466 if len(users) < BATCH_SIZE { 467 newLastUserTime = endTime 468 } 469 470 // When to Stop: we index either until we pass a batch of users where the last 471 // user is created at or after the specified end time when setting up the batch 472 // index, or until two consecutive full batches have the same end time of their final 473 // users. This second case is safe as long as the assumption that the database 474 // cannot contain more users with the same CreateAt time than the batch size holds. 475 if progress.EndAtTime <= newLastUserTime { 476 progress.DoneUsers = true 477 progress.LastEntityTime = progress.StartAtTime 478 } else if progress.LastEntityTime == newLastUserTime && len(users) == BATCH_SIZE { 479 mlog.Error("More users with the same CreateAt time were detected than the permitted batch size. Aborting indexing job.", mlog.Int64("CreateAt", newLastUserTime), mlog.Int("Batch Size", BATCH_SIZE)) 480 progress.DoneUsers = true 481 progress.LastEntityTime = progress.StartAtTime 482 } else { 483 progress.LastEntityTime = newLastUserTime 484 } 485 486 progress.DoneUsersCount += int64(len(users)) 487 488 return progress, nil 489 } 490 491 func (worker *BleveIndexerWorker) BulkIndexUsers(users []*model.UserForIndexing, progress IndexingProgress) (int64, *model.AppError) { 492 lastCreateAt := int64(0) 493 batch := worker.engine.UserIndex.NewBatch() 494 495 for _, user := range users { 496 if user.DeleteAt == 0 { 497 searchUser := bleveengine.BLVUserFromUserForIndexing(user) 498 batch.Index(searchUser.Id, searchUser) 499 } else { 500 batch.Delete(user.Id) 501 } 502 503 lastCreateAt = user.CreateAt 504 } 505 506 worker.engine.Mutex.RLock() 507 defer worker.engine.Mutex.RUnlock() 508 509 if err := worker.engine.UserIndex.Batch(batch); err != nil { 510 return 0, model.NewAppError("BleveIndexerWorker.BulkIndexUsers", "bleveengine.indexer.do_job.bulk_index_users.batch_error", nil, err.Error(), http.StatusInternalServerError) 511 } 512 return lastCreateAt, nil 513 }