github.com/oinume/lekcije@v0.0.0-20231017100347-5b4c5eb6ab24/backend/cmd/crawler/main.go (about)

     1  package main
     2  
     3  import (
     4  	"context"
     5  	"flag"
     6  	"fmt"
     7  	"io"
     8  	"os"
     9  	"time"
    10  
    11  	"github.com/jinzhu/gorm"
    12  	"go.uber.org/zap"
    13  	"golang.org/x/sync/errgroup"
    14  
    15  	"github.com/oinume/lekcije/backend/cli"
    16  	"github.com/oinume/lekcije/backend/crawler"
    17  	"github.com/oinume/lekcije/backend/domain/config"
    18  	"github.com/oinume/lekcije/backend/infrastructure/dmm_eikaiwa"
    19  	"github.com/oinume/lekcije/backend/infrastructure/mysql"
    20  	"github.com/oinume/lekcije/backend/logger"
    21  	"github.com/oinume/lekcije/backend/model"
    22  	"github.com/oinume/lekcije/backend/registry"
    23  )
    24  
    25  func main() {
    26  	m := &crawlerMain{
    27  		outStream: os.Stdout,
    28  		errStream: os.Stderr,
    29  	}
    30  	if err := m.run(os.Args); err != nil {
    31  		cli.WriteError(m.errStream, err)
    32  		os.Exit(cli.ExitError)
    33  	}
    34  	os.Exit(cli.ExitOK)
    35  }
    36  
    37  type crawlerMain struct {
    38  	outStream io.Writer
    39  	errStream io.Writer
    40  }
    41  
    42  func (m *crawlerMain) run(args []string) error {
    43  	flagSet := flag.NewFlagSet("crawler", flag.ContinueOnError)
    44  	flagSet.SetOutput(m.errStream)
    45  	var (
    46  		concurrency     = flagSet.Int("concurrency", 1, "Concurrency of crawler. (default=1)")
    47  		continueOnError = flagSet.Bool("continue", true, "Continue to crawl if any error occurred. (default=true)")
    48  		specifiedIDs    = flagSet.String("ids", "", "Teacher IDs")
    49  		followedOnly    = flagSet.Bool("followedOnly", false, "Crawl followedOnly teachers")
    50  		all             = flagSet.Bool("all", false, "Crawl all teachers ordered by evaluation")
    51  		newOnly         = flagSet.Bool("new", false, "Crawl all teachers ordered by new")
    52  		interval        = flagSet.Duration("interval", 1*time.Second, "Fetch interval. (default=1s)")
    53  		logLevel        = flag.String("log-level", "info", "Log level")
    54  	)
    55  	if err := flagSet.Parse(args[1:]); err != nil {
    56  		return err
    57  	}
    58  	if *followedOnly && *specifiedIDs != "" {
    59  		return fmt.Errorf("can't specify -followedOnly and -ids flags both")
    60  	}
    61  
    62  	config.MustProcessDefault()
    63  	ctx := context.Background()
    64  	startedAt := time.Now().UTC()
    65  	appLogger := logger.NewAppLogger(os.Stderr, logger.NewLevel(*logLevel))
    66  	appLogger.Info("crawler started")
    67  	defer func() {
    68  		elapsed := time.Now().UTC().Sub(startedAt) / time.Millisecond
    69  		appLogger.Info("crawler finished", zap.Int("elapsed", int(elapsed)))
    70  	}()
    71  
    72  	db, err := model.OpenDB(config.DefaultVars.DBURL(), 1, config.DefaultVars.DebugSQL)
    73  	if err != nil {
    74  		return err
    75  	}
    76  	defer func() { _ = db.Close() }()
    77  
    78  	loader := m.createLoader(db, *specifiedIDs, *followedOnly, *all, *newOnly)
    79  	mCountryList := registry.MustNewMCountryList(ctx, db.DB())
    80  	lessonFetcher := dmm_eikaiwa.NewLessonFetcher(nil, *concurrency, false, mCountryList, appLogger)
    81  	teacherRepo := mysql.NewTeacherRepository(db.DB())
    82  	for cursor := loader.GetInitialCursor(); cursor != ""; {
    83  		var teacherIDs []uint32
    84  		var err error
    85  		teacherIDs, cursor, err = loader.Load(cursor)
    86  		if err != nil {
    87  			return err
    88  		}
    89  
    90  		// TODO: semaphore
    91  		var g errgroup.Group
    92  		for _, id := range teacherIDs {
    93  			id := id
    94  			g.Go(func() error {
    95  				teacher, _, err := lessonFetcher.Fetch(ctx, uint(id))
    96  				if err != nil {
    97  					if *continueOnError {
    98  						appLogger.Error("Error during LessonFetcher.Fetch", zap.Error(err))
    99  						return nil
   100  					} else {
   101  						return err
   102  					}
   103  				}
   104  				if err := teacherRepo.CreateOrUpdate(ctx, teacher); err != nil {
   105  					if *continueOnError {
   106  						appLogger.Error("Error during TeacherService.CreateOrUpdate", zap.Error(err))
   107  						return nil
   108  					} else {
   109  						return err
   110  					}
   111  				}
   112  				// TODO: update lessons
   113  
   114  				return nil
   115  			})
   116  		}
   117  
   118  		if err := g.Wait(); err != nil {
   119  			return err
   120  		}
   121  
   122  		time.Sleep(*interval)
   123  	}
   124  
   125  	return nil
   126  }
   127  
   128  func (m *crawlerMain) createLoader(
   129  	db *gorm.DB,
   130  	specifiedIDs string,
   131  	followed bool,
   132  	all bool,
   133  	newOnly bool,
   134  ) crawler.TeacherIDLoader {
   135  	var loader crawler.TeacherIDLoader
   136  	if specifiedIDs != "" {
   137  		loader = crawler.NewSpecificTeacherIDLoader(specifiedIDs)
   138  	} else if followed {
   139  		loader = crawler.NewFollowedTeacherIDLoader(db)
   140  	} else if all {
   141  		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByRating, nil)
   142  	} else if newOnly {
   143  		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByNew, nil)
   144  	} else {
   145  		loader = crawler.NewScrapingTeacherIDLoader(crawler.ByRating, nil)
   146  	}
   147  	return loader
   148  }