github.com/oinume/lekcije@v0.0.0-20231017100347-5b4c5eb6ab24/backend/infrastructure/dmm_eikaiwa/lesson_fetcher.go (about) 1 package dmm_eikaiwa 2 3 import ( 4 "context" 5 "crypto/tls" 6 "fmt" 7 "io" 8 "net" 9 "net/http" 10 "net/http/httptrace" 11 "regexp" 12 "strconv" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/Songmu/retry" 18 "github.com/ericlagergren/decimal" 19 "github.com/morikuni/failure" 20 "github.com/volatiletech/sqlboiler/v4/types" 21 "go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace" 22 "go.opentelemetry.io/otel" 23 "go.opentelemetry.io/otel/attribute" 24 "go.uber.org/zap" 25 "golang.org/x/text/width" 26 "gopkg.in/xmlpath.v2" 27 28 "github.com/oinume/lekcije/backend/domain/config" 29 "github.com/oinume/lekcije/backend/domain/repository" 30 "github.com/oinume/lekcije/backend/errors" 31 "github.com/oinume/lekcije/backend/model" 32 "github.com/oinume/lekcije/backend/model2" 33 ) 34 35 const ( 36 userAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html" 37 ) 38 39 var ( 40 _ = fmt.Print 41 defaultHTTPClient = &http.Client{ 42 Timeout: 5 * time.Second, 43 CheckRedirect: redirectErrorFunc, 44 Transport: &http.Transport{ 45 MaxIdleConns: 100, 46 MaxIdleConnsPerHost: 100, 47 Proxy: http.ProxyFromEnvironment, 48 DialContext: (&net.Dialer{ 49 Timeout: 30 * time.Second, 50 KeepAlive: 1200 * time.Second, 51 }).DialContext, 52 IdleConnTimeout: 1200 * time.Second, 53 TLSHandshakeTimeout: 10 * time.Second, 54 TLSClientConfig: &tls.Config{ 55 ClientSessionCache: tls.NewLRUClientSessionCache(100), 56 }, 57 ExpectContinueTimeout: 1 * time.Second, 58 }, 59 } 60 redirectErrorFunc = func(req *http.Request, via []*http.Request) error { 61 return http.ErrUseLastResponse 62 } 63 teacherNameXPath = xmlpath.MustCompile(`//div[@class='area-detail']/h1/text()`) 64 attributesXPath = xmlpath.MustCompile(`//div[@class='confirm low']/dl`) 65 lessonXPath = xmlpath.MustCompile(`//ul[@class='oneday']//li`) 66 classAttrXPath = xmlpath.MustCompile(`@class`) 67 ratingXPath = xmlpath.MustCompile(`//p[@class='ui-star-rating-text']/strong/text()`) 68 reviewCountXPath = xmlpath.MustCompile(`//p[@class='ui-star-rating-text']/text()`) 69 newTeacherXPath = xmlpath.MustCompile(`//div[@class='favorite-list-box-wrap']/img[@class='new_teacher']`) 70 ) 71 72 type teacherLessons struct { 73 teacher *model2.Teacher 74 lessons []*model2.Lesson 75 } 76 77 type lessonFetcher struct { 78 httpClient *http.Client 79 semaphore chan struct{} 80 caching bool 81 cache map[uint]*teacherLessons 82 cacheLock *sync.RWMutex 83 logger *zap.Logger 84 mCountryList *model2.MCountryList 85 } 86 87 func NewLessonFetcher( 88 httpClient *http.Client, 89 concurrency int, 90 caching bool, 91 mCountryList *model2.MCountryList, 92 log *zap.Logger, 93 ) repository.LessonFetcher { 94 if httpClient == nil { 95 httpClient = defaultHTTPClient 96 } 97 return &lessonFetcher{ 98 httpClient: httpClient, 99 semaphore: make(chan struct{}, concurrency), 100 caching: caching, 101 cache: make(map[uint]*teacherLessons, 5000), 102 cacheLock: new(sync.RWMutex), 103 mCountryList: mCountryList, 104 logger: log, 105 } 106 } 107 108 func (f *lessonFetcher) Fetch(ctx context.Context, teacherID uint) (*model2.Teacher, []*model2.Lesson, error) { 109 ctx, span := otel.Tracer(config.DefaultTracerName).Start(ctx, "LessonFetcher.Fetch") 110 span.SetAttributes(attribute.KeyValue{ 111 Key: "teacherID", 112 Value: attribute.Int64Value(int64(teacherID)), 113 }) 114 defer span.End() 115 116 f.semaphore <- struct{}{} 117 defer func() { 118 <-f.semaphore 119 }() 120 121 // Check cache 122 if f.caching { 123 f.cacheLock.RLock() 124 if c, ok := f.cache[teacherID]; ok { 125 f.cacheLock.RUnlock() 126 return c.teacher, c.lessons, nil 127 } 128 f.cacheLock.RUnlock() 129 } 130 131 teacher := model2.NewTeacher(teacherID) 132 var content io.ReadCloser 133 err := retry.Retry(2, 300*time.Millisecond, func() error { 134 var err error 135 content, err = f.fetchContent(ctx, teacher.URL()) 136 return err 137 }) 138 defer content.Close() 139 if err != nil { 140 return nil, nil, err 141 } 142 143 _, lessons, err := f.parseHTML(teacher, content) 144 if err != nil { 145 return nil, nil, err 146 } 147 if len(lessons) > 0 { 148 teacher.LastLessonAt = lessons[len(lessons)-1].Datetime 149 } 150 return teacher, lessons, nil 151 } 152 153 func (f *lessonFetcher) fetchContent(ctx context.Context, url string) (io.ReadCloser, error) { 154 clientTrace := otelhttptrace.NewClientTrace(ctx, otelhttptrace.WithoutSubSpans()) 155 ctx = httptrace.WithClientTrace(ctx, clientTrace) 156 nopCloser := io.NopCloser(strings.NewReader("")) 157 req, err := http.NewRequestWithContext(ctx, "GET", url, nil) 158 if err != nil { 159 return nopCloser, failure.Wrap(err, failure.Messagef("failed to create HTTP request: url=%v", url)) 160 } 161 req.Header.Set("User-Agent", userAgent) 162 163 resp, err := f.httpClient.Do(req) 164 if err != nil { 165 return nopCloser, failure.Wrap(err, failure.Messagef("failed httpClient.Do(): url=%v", url)) 166 } 167 168 switch resp.StatusCode { 169 case http.StatusOK: 170 return resp.Body, nil 171 case http.StatusMovedPermanently, http.StatusFound: 172 _ = resp.Body.Close() 173 return nopCloser, errors.NewNotFoundError( 174 errors.WithMessagef("Teacher not found: url=%v, statusCode=%v", url, resp.StatusCode), 175 ) 176 default: 177 _ = resp.Body.Close() 178 return nopCloser, failure.New( 179 errors.Internal, 180 failure.Messagef( 181 "Unknown error in fetchContent: url=%v, statusCode=%v, status=%v", 182 url, resp.StatusCode, resp.Status, 183 ), 184 ) 185 } 186 } 187 188 func (f *lessonFetcher) parseHTML( 189 teacher *model2.Teacher, 190 html io.Reader, 191 ) (*model2.Teacher, []*model2.Lesson, error) { 192 root, err := xmlpath.ParseHTML(html) 193 if err != nil { 194 return nil, nil, err 195 } 196 197 // teacher name 198 if teacherName, ok := teacherNameXPath.String(root); ok { 199 teacher.Name = teacherName 200 } else { 201 return nil, nil, fmt.Errorf("failed to fetch teacher's name: url=%v", teacher.URL()) 202 } 203 204 // Nationality, birthday, etc... 205 f.parseTeacherAttribute(teacher, root) 206 if !teacher.IsJapanese() { // Japanese teachers don't have favorite count 207 // FavoriteCount 208 f.parseTeacherFavoriteCount(teacher, root) 209 } 210 // Rating 211 f.parseTeacherRating(teacher, root) 212 // ReviewCount 213 f.parseTeacherReviewCount(teacher, root) 214 215 dateRegexp := regexp.MustCompile(`([\d]+)月([\d]+)日(.+)`) 216 lessons := make([]*model2.Lesson, 0, 1000) 217 now := time.Now().In(config.LocalLocation()) 218 originalDate := time.Now().In(config.LocalLocation()).Truncate(24 * time.Hour) 219 date := originalDate 220 // lessons 221 for iter := lessonXPath.Iter(root); iter.Next(); { 222 node := iter.Node() 223 timeClass, ok := classAttrXPath.String(node) 224 if !ok { 225 continue 226 } 227 228 text := strings.Trim(node.String(), " ") 229 //fmt.Printf("text = '%v', timeClass = '%v'\n", text, timeClass) 230 f.logger.Debug("Scraping as", zap.String("timeClass", timeClass), zap.String("text", text)) 231 232 // blank, available, reserved 233 if timeClass == "date" { 234 group := dateRegexp.FindStringSubmatch(text) 235 if len(group) > 0 { 236 month, day := MustInt(group[1]), MustInt(group[2]) 237 year := date.Year() 238 if now.Month() == time.December && month == 1 { 239 year = now.Year() + 1 240 } 241 originalDate = time.Date( 242 year, time.Month(month), int(day), 243 0, 0, 0, 0, 244 config.LocalLocation(), 245 ) 246 date = originalDate 247 } 248 } else if strings.HasPrefix(timeClass, "t-") && text != "" { 249 tmp := strings.Split(timeClass, "-") 250 hour, minute := MustInt(tmp[1]), MustInt(tmp[2]) 251 if hour >= 24 { 252 // Convert 24:30 -> 00:30 253 hour -= 24 254 if date.Unix() == originalDate.Unix() { 255 // Set date to next day for 24:30 256 date = date.Add(24 * time.Hour) 257 } 258 } 259 dt := time.Date( 260 date.Year(), date.Month(), date.Day(), 261 hour, minute, 0, 0, 262 config.LocalLocation(), 263 ) 264 status := model2.LessonStatuses.MustValueForAlias(text) 265 f.logger.Debug( 266 "lesson", 267 zap.String("dt", dt.Format("2006-01-02 15:04")), 268 zap.String("status", model.LessonStatuses.MustName(status)), 269 ) 270 lessons = append(lessons, &model2.Lesson{ 271 TeacherID: teacher.ID, 272 Datetime: dt, 273 Status: model2.LessonStatuses.MustName(status), 274 }) 275 } 276 // TODO: else 277 } 278 279 // Set teacher lesson data to cache 280 if f.caching { 281 f.cacheLock.Lock() 282 f.cache[teacher.ID] = &teacherLessons{teacher: teacher, lessons: lessons} 283 f.cacheLock.Unlock() 284 } 285 286 return teacher, lessons, nil 287 } 288 289 func (f *lessonFetcher) parseTeacherAttribute(teacher *model2.Teacher, rootNode *xmlpath.Node) { 290 nameXPath := xmlpath.MustCompile(`./dt`) 291 valueXPath := xmlpath.MustCompile(`./dd`) 292 for iter := attributesXPath.Iter(rootNode); iter.Next(); { 293 node := iter.Node() 294 name, ok := nameXPath.String(node) 295 if !ok { 296 f.logger.Error( 297 fmt.Sprintf("Failed to parse teacher value: name=%v", name), 298 zap.Uint("teacherID", teacher.ID), 299 ) 300 continue 301 } 302 value, ok := valueXPath.String(node) 303 if !ok { 304 f.logger.Error( 305 fmt.Sprintf("Failed to parse teacher value: name=%v, value=%v", name, value), 306 zap.Uint("teacherID", teacher.ID), 307 ) 308 continue 309 } 310 if err := f.setTeacherAttribute(teacher, strings.TrimSpace(name), strings.TrimSpace(value)); err != nil { 311 f.logger.Error( 312 fmt.Sprintf("Failed to setTeacherAttribute: name=%v, value=%v", name, value), 313 zap.Uint("teacherID", teacher.ID), 314 ) 315 } 316 //fmt.Printf("name = %v, value = %v\n", strings.TrimSpace(name), strings.TrimSpace(value)) 317 } 318 //fmt.Printf("teacher = %+v\n", teacher) 319 } 320 321 func (f *lessonFetcher) setTeacherAttribute(teacher *model2.Teacher, name string, value string) error { 322 switch name { 323 case "国籍": 324 c, found := f.mCountryList.GetByNameJA(value) 325 if !found { 326 return errors.NewNotFoundError(errors.WithMessage(fmt.Sprintf("No MCountries for %v", value))) 327 } 328 teacher.CountryID = int16(c.ID) // TODO: teacher.CountryID must be uint16 329 case "誕生日": 330 value = width.Narrow.String(value) 331 if strings.TrimSpace(value) == "" { 332 teacher.Birthday = time.Time{} 333 } else { 334 t, err := time.Parse("2006-01-02", value) 335 if err != nil { 336 return err 337 } 338 teacher.Birthday = t 339 } 340 case "性別": 341 switch value { 342 case "男性": 343 teacher.Gender = "male" // TODO: enum 344 case "女性": 345 teacher.Gender = "female" 346 default: 347 return failure.New(errors.Internal, failure.Messagef("unknown gender for %v", value)) 348 } 349 case "経歴": 350 var yoe int 351 switch value { 352 case "1年未満": 353 yoe = 0 354 case "3年以上": 355 yoe = 4 356 default: 357 value = strings.Replace(value, "年", "", -1) 358 if v, err := strconv.ParseInt(width.Narrow.String(value), 10, 32); err == nil { 359 yoe = int(v) 360 } else { 361 return failure.Wrap(err, failure.Messagef("failed to convert to number: %v", value)) 362 } 363 } 364 teacher.YearsOfExperience = int8(yoe) // TODO: teacher.YearsOfExperience must be uint8 365 } 366 return nil 367 } 368 369 func (f *lessonFetcher) parseTeacherFavoriteCount(teacher *model2.Teacher, rootNode *xmlpath.Node) { 370 favCountXPath := xmlpath.MustCompile(`//span[@id='fav_count']`) 371 value, ok := favCountXPath.String(rootNode) 372 if !ok { 373 f.logger.Error( 374 "Failed to parse teacher favorite count", 375 zap.Uint("teacherID", uint(teacher.ID)), 376 ) 377 return 378 } 379 v, err := strconv.ParseUint(value, 10, 32) 380 if err != nil { 381 f.logger.Error( 382 "Failed to parse teacher favorite count. It's not a number", 383 zap.Uint("teacherID", uint(teacher.ID)), 384 ) 385 return 386 } 387 teacher.FavoriteCount = uint(v) 388 } 389 390 func (f *lessonFetcher) parseTeacherRating(teacher *model2.Teacher, rootNode *xmlpath.Node) { 391 value, ok := ratingXPath.String(rootNode) 392 if !ok { 393 if _, ok := newTeacherXPath.String(rootNode); !ok { 394 f.logger.Error( 395 "Failed to parse teacher rating", 396 zap.Uint("teacherID", teacher.ID), 397 zap.String("value", value), 398 ) 399 } 400 // Give up to obtain rating 401 return 402 } 403 rating, err := strconv.ParseFloat(value, 32) 404 if err != nil { 405 f.logger.Error( 406 "Failed to parse teacher rating. It's not a number", 407 zap.Uint("teacherID", teacher.ID), 408 ) 409 return 410 } 411 teacher.Rating = types.NullDecimal{Big: decimal.New(int64(rating*100), 2)} 412 } 413 414 func (f *lessonFetcher) parseTeacherReviewCount(teacher *model2.Teacher, rootNode *xmlpath.Node) { 415 value, ok := reviewCountXPath.String(rootNode) 416 if !ok { 417 if _, ok := newTeacherXPath.String(rootNode); !ok { 418 f.logger.Error( 419 "Failed to parse teacher review count", 420 zap.Uint("teacherID", teacher.ID), 421 zap.String("value", value), 422 ) 423 } 424 // Give up to obtain rating 425 return 426 } 427 value = strings.TrimPrefix(value, "(") 428 value = strings.TrimSuffix(value, ")") 429 reviewCount, err := strconv.ParseUint(value, 10, 32) 430 if err != nil { 431 f.logger.Error( 432 "Failed to parse teacher review count. It's not a number", 433 zap.Uint("teacherID", teacher.ID), 434 zap.String("value", value), 435 ) 436 return 437 } 438 teacher.ReviewCount = uint(reviewCount) 439 } 440 441 func (f *lessonFetcher) Close() { 442 close(f.semaphore) 443 } 444 445 func MustInt(s string) int { 446 i, err := strconv.ParseInt(s, 10, 32) 447 if err != nil { 448 panic(err) 449 } 450 return int(i) 451 }