github.com/oinume/lekcije@v0.0.0-20231017100347-5b4c5eb6ab24/backend/infrastructure/dmm_eikaiwa/lesson_fetcher.go (about)

     1  package dmm_eikaiwa
     2  
     3  import (
     4  	"context"
     5  	"crypto/tls"
     6  	"fmt"
     7  	"io"
     8  	"net"
     9  	"net/http"
    10  	"net/http/httptrace"
    11  	"regexp"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/Songmu/retry"
    18  	"github.com/ericlagergren/decimal"
    19  	"github.com/morikuni/failure"
    20  	"github.com/volatiletech/sqlboiler/v4/types"
    21  	"go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace"
    22  	"go.opentelemetry.io/otel"
    23  	"go.opentelemetry.io/otel/attribute"
    24  	"go.uber.org/zap"
    25  	"golang.org/x/text/width"
    26  	"gopkg.in/xmlpath.v2"
    27  
    28  	"github.com/oinume/lekcije/backend/domain/config"
    29  	"github.com/oinume/lekcije/backend/domain/repository"
    30  	"github.com/oinume/lekcije/backend/errors"
    31  	"github.com/oinume/lekcije/backend/model"
    32  	"github.com/oinume/lekcije/backend/model2"
    33  )
    34  
    35  const (
    36  	userAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html"
    37  )
    38  
    39  var (
    40  	_                 = fmt.Print
    41  	defaultHTTPClient = &http.Client{
    42  		Timeout:       5 * time.Second,
    43  		CheckRedirect: redirectErrorFunc,
    44  		Transport: &http.Transport{
    45  			MaxIdleConns:        100,
    46  			MaxIdleConnsPerHost: 100,
    47  			Proxy:               http.ProxyFromEnvironment,
    48  			DialContext: (&net.Dialer{
    49  				Timeout:   30 * time.Second,
    50  				KeepAlive: 1200 * time.Second,
    51  			}).DialContext,
    52  			IdleConnTimeout:     1200 * time.Second,
    53  			TLSHandshakeTimeout: 10 * time.Second,
    54  			TLSClientConfig: &tls.Config{
    55  				ClientSessionCache: tls.NewLRUClientSessionCache(100),
    56  			},
    57  			ExpectContinueTimeout: 1 * time.Second,
    58  		},
    59  	}
    60  	redirectErrorFunc = func(req *http.Request, via []*http.Request) error {
    61  		return http.ErrUseLastResponse
    62  	}
    63  	teacherNameXPath = xmlpath.MustCompile(`//div[@class='area-detail']/h1/text()`)
    64  	attributesXPath  = xmlpath.MustCompile(`//div[@class='confirm low']/dl`)
    65  	lessonXPath      = xmlpath.MustCompile(`//ul[@class='oneday']//li`)
    66  	classAttrXPath   = xmlpath.MustCompile(`@class`)
    67  	ratingXPath      = xmlpath.MustCompile(`//p[@class='ui-star-rating-text']/strong/text()`)
    68  	reviewCountXPath = xmlpath.MustCompile(`//p[@class='ui-star-rating-text']/text()`)
    69  	newTeacherXPath  = xmlpath.MustCompile(`//div[@class='favorite-list-box-wrap']/img[@class='new_teacher']`)
    70  )
    71  
    72  type teacherLessons struct {
    73  	teacher *model2.Teacher
    74  	lessons []*model2.Lesson
    75  }
    76  
    77  type lessonFetcher struct {
    78  	httpClient   *http.Client
    79  	semaphore    chan struct{}
    80  	caching      bool
    81  	cache        map[uint]*teacherLessons
    82  	cacheLock    *sync.RWMutex
    83  	logger       *zap.Logger
    84  	mCountryList *model2.MCountryList
    85  }
    86  
    87  func NewLessonFetcher(
    88  	httpClient *http.Client,
    89  	concurrency int,
    90  	caching bool,
    91  	mCountryList *model2.MCountryList,
    92  	log *zap.Logger,
    93  ) repository.LessonFetcher {
    94  	if httpClient == nil {
    95  		httpClient = defaultHTTPClient
    96  	}
    97  	return &lessonFetcher{
    98  		httpClient:   httpClient,
    99  		semaphore:    make(chan struct{}, concurrency),
   100  		caching:      caching,
   101  		cache:        make(map[uint]*teacherLessons, 5000),
   102  		cacheLock:    new(sync.RWMutex),
   103  		mCountryList: mCountryList,
   104  		logger:       log,
   105  	}
   106  }
   107  
   108  func (f *lessonFetcher) Fetch(ctx context.Context, teacherID uint) (*model2.Teacher, []*model2.Lesson, error) {
   109  	ctx, span := otel.Tracer(config.DefaultTracerName).Start(ctx, "LessonFetcher.Fetch")
   110  	span.SetAttributes(attribute.KeyValue{
   111  		Key:   "teacherID",
   112  		Value: attribute.Int64Value(int64(teacherID)),
   113  	})
   114  	defer span.End()
   115  
   116  	f.semaphore <- struct{}{}
   117  	defer func() {
   118  		<-f.semaphore
   119  	}()
   120  
   121  	// Check cache
   122  	if f.caching {
   123  		f.cacheLock.RLock()
   124  		if c, ok := f.cache[teacherID]; ok {
   125  			f.cacheLock.RUnlock()
   126  			return c.teacher, c.lessons, nil
   127  		}
   128  		f.cacheLock.RUnlock()
   129  	}
   130  
   131  	teacher := model2.NewTeacher(teacherID)
   132  	var content io.ReadCloser
   133  	err := retry.Retry(2, 300*time.Millisecond, func() error {
   134  		var err error
   135  		content, err = f.fetchContent(ctx, teacher.URL())
   136  		return err
   137  	})
   138  	defer content.Close()
   139  	if err != nil {
   140  		return nil, nil, err
   141  	}
   142  
   143  	_, lessons, err := f.parseHTML(teacher, content)
   144  	if err != nil {
   145  		return nil, nil, err
   146  	}
   147  	if len(lessons) > 0 {
   148  		teacher.LastLessonAt = lessons[len(lessons)-1].Datetime
   149  	}
   150  	return teacher, lessons, nil
   151  }
   152  
   153  func (f *lessonFetcher) fetchContent(ctx context.Context, url string) (io.ReadCloser, error) {
   154  	clientTrace := otelhttptrace.NewClientTrace(ctx, otelhttptrace.WithoutSubSpans())
   155  	ctx = httptrace.WithClientTrace(ctx, clientTrace)
   156  	nopCloser := io.NopCloser(strings.NewReader(""))
   157  	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
   158  	if err != nil {
   159  		return nopCloser, failure.Wrap(err, failure.Messagef("failed to create HTTP request: url=%v", url))
   160  	}
   161  	req.Header.Set("User-Agent", userAgent)
   162  
   163  	resp, err := f.httpClient.Do(req)
   164  	if err != nil {
   165  		return nopCloser, failure.Wrap(err, failure.Messagef("failed httpClient.Do(): url=%v", url))
   166  	}
   167  
   168  	switch resp.StatusCode {
   169  	case http.StatusOK:
   170  		return resp.Body, nil
   171  	case http.StatusMovedPermanently, http.StatusFound:
   172  		_ = resp.Body.Close()
   173  		return nopCloser, errors.NewNotFoundError(
   174  			errors.WithMessagef("Teacher not found: url=%v, statusCode=%v", url, resp.StatusCode),
   175  		)
   176  	default:
   177  		_ = resp.Body.Close()
   178  		return nopCloser, failure.New(
   179  			errors.Internal,
   180  			failure.Messagef(
   181  				"Unknown error in fetchContent: url=%v, statusCode=%v, status=%v",
   182  				url, resp.StatusCode, resp.Status,
   183  			),
   184  		)
   185  	}
   186  }
   187  
   188  func (f *lessonFetcher) parseHTML(
   189  	teacher *model2.Teacher,
   190  	html io.Reader,
   191  ) (*model2.Teacher, []*model2.Lesson, error) {
   192  	root, err := xmlpath.ParseHTML(html)
   193  	if err != nil {
   194  		return nil, nil, err
   195  	}
   196  
   197  	// teacher name
   198  	if teacherName, ok := teacherNameXPath.String(root); ok {
   199  		teacher.Name = teacherName
   200  	} else {
   201  		return nil, nil, fmt.Errorf("failed to fetch teacher's name: url=%v", teacher.URL())
   202  	}
   203  
   204  	// Nationality, birthday, etc...
   205  	f.parseTeacherAttribute(teacher, root)
   206  	if !teacher.IsJapanese() { // Japanese teachers don't have favorite count
   207  		// FavoriteCount
   208  		f.parseTeacherFavoriteCount(teacher, root)
   209  	}
   210  	// Rating
   211  	f.parseTeacherRating(teacher, root)
   212  	// ReviewCount
   213  	f.parseTeacherReviewCount(teacher, root)
   214  
   215  	dateRegexp := regexp.MustCompile(`([\d]+)月([\d]+)日(.+)`)
   216  	lessons := make([]*model2.Lesson, 0, 1000)
   217  	now := time.Now().In(config.LocalLocation())
   218  	originalDate := time.Now().In(config.LocalLocation()).Truncate(24 * time.Hour)
   219  	date := originalDate
   220  	// lessons
   221  	for iter := lessonXPath.Iter(root); iter.Next(); {
   222  		node := iter.Node()
   223  		timeClass, ok := classAttrXPath.String(node)
   224  		if !ok {
   225  			continue
   226  		}
   227  
   228  		text := strings.Trim(node.String(), " ")
   229  		//fmt.Printf("text = '%v', timeClass = '%v'\n", text, timeClass)
   230  		f.logger.Debug("Scraping as", zap.String("timeClass", timeClass), zap.String("text", text))
   231  
   232  		// blank, available, reserved
   233  		if timeClass == "date" {
   234  			group := dateRegexp.FindStringSubmatch(text)
   235  			if len(group) > 0 {
   236  				month, day := MustInt(group[1]), MustInt(group[2])
   237  				year := date.Year()
   238  				if now.Month() == time.December && month == 1 {
   239  					year = now.Year() + 1
   240  				}
   241  				originalDate = time.Date(
   242  					year, time.Month(month), int(day),
   243  					0, 0, 0, 0,
   244  					config.LocalLocation(),
   245  				)
   246  				date = originalDate
   247  			}
   248  		} else if strings.HasPrefix(timeClass, "t-") && text != "" {
   249  			tmp := strings.Split(timeClass, "-")
   250  			hour, minute := MustInt(tmp[1]), MustInt(tmp[2])
   251  			if hour >= 24 {
   252  				// Convert 24:30 -> 00:30
   253  				hour -= 24
   254  				if date.Unix() == originalDate.Unix() {
   255  					// Set date to next day for 24:30
   256  					date = date.Add(24 * time.Hour)
   257  				}
   258  			}
   259  			dt := time.Date(
   260  				date.Year(), date.Month(), date.Day(),
   261  				hour, minute, 0, 0,
   262  				config.LocalLocation(),
   263  			)
   264  			status := model2.LessonStatuses.MustValueForAlias(text)
   265  			f.logger.Debug(
   266  				"lesson",
   267  				zap.String("dt", dt.Format("2006-01-02 15:04")),
   268  				zap.String("status", model.LessonStatuses.MustName(status)),
   269  			)
   270  			lessons = append(lessons, &model2.Lesson{
   271  				TeacherID: teacher.ID,
   272  				Datetime:  dt,
   273  				Status:    model2.LessonStatuses.MustName(status),
   274  			})
   275  		}
   276  		// TODO: else
   277  	}
   278  
   279  	// Set teacher lesson data to cache
   280  	if f.caching {
   281  		f.cacheLock.Lock()
   282  		f.cache[teacher.ID] = &teacherLessons{teacher: teacher, lessons: lessons}
   283  		f.cacheLock.Unlock()
   284  	}
   285  
   286  	return teacher, lessons, nil
   287  }
   288  
   289  func (f *lessonFetcher) parseTeacherAttribute(teacher *model2.Teacher, rootNode *xmlpath.Node) {
   290  	nameXPath := xmlpath.MustCompile(`./dt`)
   291  	valueXPath := xmlpath.MustCompile(`./dd`)
   292  	for iter := attributesXPath.Iter(rootNode); iter.Next(); {
   293  		node := iter.Node()
   294  		name, ok := nameXPath.String(node)
   295  		if !ok {
   296  			f.logger.Error(
   297  				fmt.Sprintf("Failed to parse teacher value: name=%v", name),
   298  				zap.Uint("teacherID", teacher.ID),
   299  			)
   300  			continue
   301  		}
   302  		value, ok := valueXPath.String(node)
   303  		if !ok {
   304  			f.logger.Error(
   305  				fmt.Sprintf("Failed to parse teacher value: name=%v, value=%v", name, value),
   306  				zap.Uint("teacherID", teacher.ID),
   307  			)
   308  			continue
   309  		}
   310  		if err := f.setTeacherAttribute(teacher, strings.TrimSpace(name), strings.TrimSpace(value)); err != nil {
   311  			f.logger.Error(
   312  				fmt.Sprintf("Failed to setTeacherAttribute: name=%v, value=%v", name, value),
   313  				zap.Uint("teacherID", teacher.ID),
   314  			)
   315  		}
   316  		//fmt.Printf("name = %v, value = %v\n", strings.TrimSpace(name), strings.TrimSpace(value))
   317  	}
   318  	//fmt.Printf("teacher = %+v\n", teacher)
   319  }
   320  
   321  func (f *lessonFetcher) setTeacherAttribute(teacher *model2.Teacher, name string, value string) error {
   322  	switch name {
   323  	case "国籍":
   324  		c, found := f.mCountryList.GetByNameJA(value)
   325  		if !found {
   326  			return errors.NewNotFoundError(errors.WithMessage(fmt.Sprintf("No MCountries for %v", value)))
   327  		}
   328  		teacher.CountryID = int16(c.ID) // TODO: teacher.CountryID must be uint16
   329  	case "誕生日":
   330  		value = width.Narrow.String(value)
   331  		if strings.TrimSpace(value) == "" {
   332  			teacher.Birthday = time.Time{}
   333  		} else {
   334  			t, err := time.Parse("2006-01-02", value)
   335  			if err != nil {
   336  				return err
   337  			}
   338  			teacher.Birthday = t
   339  		}
   340  	case "性別":
   341  		switch value {
   342  		case "男性":
   343  			teacher.Gender = "male" // TODO: enum
   344  		case "女性":
   345  			teacher.Gender = "female"
   346  		default:
   347  			return failure.New(errors.Internal, failure.Messagef("unknown gender for %v", value))
   348  		}
   349  	case "経歴":
   350  		var yoe int
   351  		switch value {
   352  		case "1年未満":
   353  			yoe = 0
   354  		case "3年以上":
   355  			yoe = 4
   356  		default:
   357  			value = strings.Replace(value, "年", "", -1)
   358  			if v, err := strconv.ParseInt(width.Narrow.String(value), 10, 32); err == nil {
   359  				yoe = int(v)
   360  			} else {
   361  				return failure.Wrap(err, failure.Messagef("failed to convert to number: %v", value))
   362  			}
   363  		}
   364  		teacher.YearsOfExperience = int8(yoe) // TODO: teacher.YearsOfExperience must be uint8
   365  	}
   366  	return nil
   367  }
   368  
   369  func (f *lessonFetcher) parseTeacherFavoriteCount(teacher *model2.Teacher, rootNode *xmlpath.Node) {
   370  	favCountXPath := xmlpath.MustCompile(`//span[@id='fav_count']`)
   371  	value, ok := favCountXPath.String(rootNode)
   372  	if !ok {
   373  		f.logger.Error(
   374  			"Failed to parse teacher favorite count",
   375  			zap.Uint("teacherID", uint(teacher.ID)),
   376  		)
   377  		return
   378  	}
   379  	v, err := strconv.ParseUint(value, 10, 32)
   380  	if err != nil {
   381  		f.logger.Error(
   382  			"Failed to parse teacher favorite count. It's not a number",
   383  			zap.Uint("teacherID", uint(teacher.ID)),
   384  		)
   385  		return
   386  	}
   387  	teacher.FavoriteCount = uint(v)
   388  }
   389  
   390  func (f *lessonFetcher) parseTeacherRating(teacher *model2.Teacher, rootNode *xmlpath.Node) {
   391  	value, ok := ratingXPath.String(rootNode)
   392  	if !ok {
   393  		if _, ok := newTeacherXPath.String(rootNode); !ok {
   394  			f.logger.Error(
   395  				"Failed to parse teacher rating",
   396  				zap.Uint("teacherID", teacher.ID),
   397  				zap.String("value", value),
   398  			)
   399  		}
   400  		// Give up to obtain rating
   401  		return
   402  	}
   403  	rating, err := strconv.ParseFloat(value, 32)
   404  	if err != nil {
   405  		f.logger.Error(
   406  			"Failed to parse teacher rating. It's not a number",
   407  			zap.Uint("teacherID", teacher.ID),
   408  		)
   409  		return
   410  	}
   411  	teacher.Rating = types.NullDecimal{Big: decimal.New(int64(rating*100), 2)}
   412  }
   413  
   414  func (f *lessonFetcher) parseTeacherReviewCount(teacher *model2.Teacher, rootNode *xmlpath.Node) {
   415  	value, ok := reviewCountXPath.String(rootNode)
   416  	if !ok {
   417  		if _, ok := newTeacherXPath.String(rootNode); !ok {
   418  			f.logger.Error(
   419  				"Failed to parse teacher review count",
   420  				zap.Uint("teacherID", teacher.ID),
   421  				zap.String("value", value),
   422  			)
   423  		}
   424  		// Give up to obtain rating
   425  		return
   426  	}
   427  	value = strings.TrimPrefix(value, "(")
   428  	value = strings.TrimSuffix(value, ")")
   429  	reviewCount, err := strconv.ParseUint(value, 10, 32)
   430  	if err != nil {
   431  		f.logger.Error(
   432  			"Failed to parse teacher review count. It's not a number",
   433  			zap.Uint("teacherID", teacher.ID),
   434  			zap.String("value", value),
   435  		)
   436  		return
   437  	}
   438  	teacher.ReviewCount = uint(reviewCount)
   439  }
   440  
   441  func (f *lessonFetcher) Close() {
   442  	close(f.semaphore)
   443  }
   444  
   445  func MustInt(s string) int {
   446  	i, err := strconv.ParseInt(s, 10, 32)
   447  	if err != nil {
   448  		panic(err)
   449  	}
   450  	return int(i)
   451  }