github.com/oinume/lekcije@v0.0.0-20231017100347-5b4c5eb6ab24/backend/cmd/crawler/main.go

package main

import (
	"context"
	"flag"
	"fmt"
	"io"
	"os"
	"time"

	"github.com/jinzhu/gorm"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"github.com/oinume/lekcije/backend/cli"
	"github.com/oinume/lekcije/backend/crawler"
	"github.com/oinume/lekcije/backend/domain/config"
	"github.com/oinume/lekcije/backend/infrastructure/dmm_eikaiwa"
	"github.com/oinume/lekcije/backend/infrastructure/mysql"
	"github.com/oinume/lekcije/backend/logger"
	"github.com/oinume/lekcije/backend/model"
	"github.com/oinume/lekcije/backend/registry"
)

func main() {
	m := &crawlerMain{
		outStream: os.Stdout,
		errStream: os.Stderr,
	}
	if err := m.run(os.Args); err != nil {
		cli.WriteError(m.errStream, err)
		os.Exit(cli.ExitError)
	}
	os.Exit(cli.ExitOK)
}

type crawlerMain struct {
	outStream io.Writer
	errStream io.Writer
}

func (m *crawlerMain) run(args []string) error {
	flagSet := flag.NewFlagSet("crawler", flag.ContinueOnError)
	flagSet.SetOutput(m.errStream)
	var (
		concurrency     = flagSet.Int("concurrency", 1, "Concurrency of crawler. (default=1)")
		continueOnError = flagSet.Bool("continue", true, "Continue to crawl if any error occurred. (default=true)")
		specifiedIDs    = flagSet.String("ids", "", "Teacher IDs")
		followedOnly    = flagSet.Bool("followedOnly", false, "Crawl followed teachers only")
		all             = flagSet.Bool("all", false, "Crawl all teachers ordered by evaluation")
		newOnly         = flagSet.Bool("new", false, "Crawl all teachers ordered by new")
		interval        = flagSet.Duration("interval", 1*time.Second, "Fetch interval. (default=1s)")
		logLevel        = flagSet.String("log-level", "info", "Log level")
	)
	if err := flagSet.Parse(args[1:]); err != nil {
		return err
	}
	if *followedOnly && *specifiedIDs != "" {
		return fmt.Errorf("cannot specify both -followedOnly and -ids flags")
	}

	config.MustProcessDefault()
	ctx := context.Background()
	startedAt := time.Now().UTC()
	appLogger := logger.NewAppLogger(os.Stderr, logger.NewLevel(*logLevel))
	appLogger.Info("crawler started")
	defer func() {
		elapsed := time.Now().UTC().Sub(startedAt) / time.Millisecond
		appLogger.Info("crawler finished", zap.Int("elapsed", int(elapsed)))
	}()

	db, err := model.OpenDB(config.DefaultVars.DBURL(), 1, config.DefaultVars.DebugSQL)
	if err != nil {
		return err
	}
	defer func() { _ = db.Close() }()

	loader := m.createLoader(db, *specifiedIDs, *followedOnly, *all, *newOnly)
	mCountryList := registry.MustNewMCountryList(ctx, db.DB())
	lessonFetcher := dmm_eikaiwa.NewLessonFetcher(nil, *concurrency, false, mCountryList, appLogger)
	teacherRepo := mysql.NewTeacherRepository(db.DB())
	// Page through teacher IDs; the loader returns an empty cursor when there are no more pages.
	for cursor := loader.GetInitialCursor(); cursor != ""; {
		var teacherIDs []uint32
		var err error
		teacherIDs, cursor, err = loader.Load(cursor)
		if err != nil {
			return err
		}

		// TODO: semaphore
		var g errgroup.Group
		for _, id := range teacherIDs {
			id := id
			g.Go(func() error {
				teacher, _, err := lessonFetcher.Fetch(ctx, uint(id))
				if err != nil {
					if *continueOnError {
						appLogger.Error("Error during LessonFetcher.Fetch", zap.Error(err))
						return nil
					} else {
						return err
					}
				}
				if err := teacherRepo.CreateOrUpdate(ctx, teacher); err != nil {
					if *continueOnError {
						appLogger.Error("Error during TeacherRepository.CreateOrUpdate", zap.Error(err))
						return nil
					} else {
						return err
					}
				}
				// TODO: update lessons

				return nil
			})
		}

		if err := g.Wait(); err != nil {
			return err
		}

		time.Sleep(*interval)
	}

	return nil
}

// createLoader selects a TeacherIDLoader from the flags: explicit -ids first,
// then -followedOnly, -all, -new, falling back to rating order.
func (m *crawlerMain) createLoader(
	db *gorm.DB,
	specifiedIDs string,
	followed bool,
	all bool,
	newOnly bool,
) crawler.TeacherIDLoader {
	var loader crawler.TeacherIDLoader
	if specifiedIDs != "" {
		loader = crawler.NewSpecificTeacherIDLoader(specifiedIDs)
	} else if followed {
		loader = crawler.NewFollowedTeacherIDLoader(db)
	} else if all {
		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByRating, nil)
	} else if newOnly {
		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByNew, nil)
	} else {
		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByRating, nil)
	}
	return loader
}
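
Note on the "TODO: semaphore" above: the errgroup loop starts one goroutine per teacher ID in the page with no upper bound, and the -concurrency flag is only passed to the lesson fetcher. A minimal, self-contained sketch (not part of the repository) of how the loop could bound its own goroutines, assuming the golang.org/x/sync/errgroup version in use provides SetLimit; fetchTeacher is a hypothetical stand-in for the per-teacher Fetch/CreateOrUpdate work:

// Sketch only: bounding per-teacher goroutines with errgroup.SetLimit.
package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

// fetchTeacher is a hypothetical placeholder for lessonFetcher.Fetch +
// teacherRepo.CreateOrUpdate in the crawler loop above.
func fetchTeacher(ctx context.Context, id uint32) error {
	fmt.Println("fetching teacher", id)
	return nil
}

func crawlBounded(ctx context.Context, teacherIDs []uint32, concurrency int) error {
	var g errgroup.Group
	g.SetLimit(concurrency) // at most `concurrency` fetches run at once
	for _, id := range teacherIDs {
		id := id // capture loop variable (pre-Go 1.22 semantics)
		g.Go(func() error {
			return fetchTeacher(ctx, id)
		})
	}
	return g.Wait()
}

func main() {
	ids := []uint32{1, 2, 3, 4, 5}
	if err := crawlBounded(context.Background(), ids, 2); err != nil {
		fmt.Println("error:", err)
	}
}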