github.com/angenalZZZ/gofunc@v0.0.0-20210507121333-48ff1be3917b/f/encoding_html.go (about) 1 package f 2 3 import ( 4 "bytes" 5 "fmt" 6 "github.com/PuerkitoBio/goquery" 7 "golang.org/x/net/html" 8 "io" 9 "reflect" 10 "strconv" 11 "strings" 12 "sync" 13 ) 14 15 const ( 16 // https://github.com/andrewstuart/goq 17 // eg. type example struct { Title string `html:"h1"` } 18 // All important settings 19 html2TagName = "html" 20 html2Prefix = '!' 21 html2Ignore = "!ignore" 22 23 // All "Reason" fields within HtmlCannotUnmarshalErr will be constants and part of this list 24 html2NonPointer = "non-pointer value" 25 html2NilValue = "destination argument is nil" 26 html2ArrayLenMismatch = "array length does not match document elements found" 27 html2CustomUnmarshalErr = "a custom unmarshal implementation threw an error" 28 html2TypeConversionErr = "a type conversion error occurred" 29 html2MapKeyUnmarshalErr = "error unmarshal a map key" 30 html2MissingValSelector = "at least one value selector must be passed to use as map index" 31 ) 32 33 // NewHtmlDecoder returns a new decoder given an io.Reader 34 func NewHtmlDecoder(r io.Reader) *HTMLDecoder { 35 d := &HTMLDecoder{} 36 d.doc, d.err = goquery.NewDocumentFromReader(r) 37 return d 38 } 39 40 // NewHtmlSelection is a quick utility function to get a goquery.Selection from a 41 // slice of *html.Node. Useful for performing unmarshal, since the decision 42 // was made to use []*html.Node for maximum flexibility. 43 func NewHtmlSelection(nodes []*html.Node) *goquery.Selection { 44 sel := &goquery.Selection{} 45 return sel.AddNodes(nodes...) 46 } 47 48 // HtmlUnmarshal takes a byte slice and a destination pointer to any 49 // interface{}, and unmarshal the document into the destination based on the 50 // rules above. Any error returned here will likely be of type 51 // HtmlCannotUnmarshalErr, though an initial goquery error will pass through directly. 52 func HtmlUnmarshal(bs []byte, v interface{}) error { 53 d, err := goquery.NewDocumentFromReader(bytes.NewReader(bs)) 54 55 if err != nil { 56 return err 57 } 58 59 return HtmlUnmarshalSelection(d.Selection, v) 60 } 61 62 // HTMLDecoder implements the same API you will see in encoding/xml and 63 // encoding/json except that we do not currently support proper streaming 64 // decoding as it is not supported by goquery upstream. 65 type HTMLDecoder struct { 66 err error 67 doc *goquery.Document 68 } 69 70 // Decode will unmarshal the contents of the decoder when given an instance of 71 // an annotated type as its argument. It will return any errors encountered 72 // during either parsing the document or unmarshal into the given object. 73 func (d *HTMLDecoder) Decode(dest interface{}) error { 74 if d.err != nil { 75 return d.err 76 } 77 if d.doc == nil { 78 return &HtmlCannotUnmarshalErr{ 79 Reason: "resulting document was nil", 80 } 81 } 82 83 return HtmlUnmarshalSelection(d.doc.Selection, dest) 84 } 85 86 // UnmarshalHTMLer allows for custom implementations of unmarshal logic 87 type UnmarshalHTMLer interface { 88 UnmarshalHTML([]*html.Node) error 89 } 90 91 // reflectUnmarshalHTMLer is stolen mostly from pkg/encoding/json/decode.go and removed some 92 // cases (handling `null`) that go doesn't need to handle. 93 func reflectUnmarshalHTMLer(v reflect.Value) (UnmarshalHTMLer, reflect.Value) { 94 if v.Kind() != reflect.Ptr && v.Type().Name() != "" && v.CanAddr() { 95 v = v.Addr() 96 } 97 for { 98 // Load value from interface, but only if the result will be 99 // usefully addressable. 100 if v.Kind() == reflect.Interface && !v.IsNil() { 101 e := v.Elem() 102 if e.Kind() == reflect.Ptr && !e.IsNil() && (e.Elem().Kind() == reflect.Ptr) { 103 v = e 104 continue 105 } 106 } 107 108 if v.Kind() != reflect.Ptr { 109 break 110 } 111 112 if v.IsNil() { 113 v.Set(reflect.New(TypeElem(v.Type()))) 114 } 115 if v.Type().NumMethod() > 0 { 116 if u, ok := v.Interface().(UnmarshalHTMLer); ok { 117 return u, reflect.Value{} 118 } 119 } 120 v = v.Elem() 121 } 122 return nil, v 123 } 124 125 // HtmlCannotUnmarshalErr represents an error returned by the goquery HtmlUnmarshal 126 // and helps consumers in programmatically diagnosing the cause of their error. 127 type HtmlCannotUnmarshalErr struct { 128 Err error 129 Val string 130 FldOrIdx interface{} 131 132 V reflect.Value 133 Reason string 134 } 135 136 // This type is a mid-level abstraction to help understand the error printing logic 137 type html2ErrorChain struct { 138 chain []*HtmlCannotUnmarshalErr 139 val string 140 tail error 141 } 142 143 // tPath returns the type path in the same string format one might use to access 144 // the nested value in go code. This should hopefully help make debugging easier. 145 func (e html2ErrorChain) tPath() string { 146 nest := "" 147 148 for _, err := range e.chain { 149 if err.FldOrIdx != nil { 150 switch nesting := err.FldOrIdx.(type) { 151 case string: 152 switch err.V.Type().Kind() { 153 case reflect.Map: 154 nest += fmt.Sprintf("[%q]", nesting) 155 case reflect.Struct: 156 nest += fmt.Sprintf(".%s", nesting) 157 } 158 case int: 159 nest += fmt.Sprintf("[%d]", nesting) 160 case *int: 161 nest += fmt.Sprintf("[%d]", *nesting) 162 default: 163 fmt.Printf("err.FldOrIdx = %#v\n", err.FldOrIdx) 164 nest += fmt.Sprintf("[%v]", nesting) 165 } 166 } 167 } 168 169 return nest 170 } 171 172 func (e html2ErrorChain) last() *HtmlCannotUnmarshalErr { 173 return e.chain[len(e.chain)-1] 174 } 175 176 // Error gives a human-readable error message for debugging purposes. 177 func (e html2ErrorChain) Error() string { 178 last := e.last() 179 180 // Avoid panic if we cannot get a type name for the Value 181 t := "unknown: invalid value" 182 if last.V.IsValid() { 183 t = last.V.Type().String() 184 } 185 186 msg := "could not unmarshal " 187 188 if e.val != "" { 189 msg += fmt.Sprintf("value %q ", e.val) 190 } 191 192 msg += fmt.Sprintf( 193 "into '%s%s' (type %s): %s", 194 e.chain[0].V.Type(), 195 e.tPath(), 196 t, 197 last.Reason, 198 ) 199 200 // If a generic error was reported elsewhere, report its message last 201 if e.tail != nil { 202 msg = msg + ": " + e.tail.Error() 203 } 204 205 return msg 206 } 207 208 // Traverse e.Err, printing hopefully helpful type info until there are no more 209 // chained errors. 210 func (e *HtmlCannotUnmarshalErr) unwind() *html2ErrorChain { 211 str := &html2ErrorChain{chain: []*HtmlCannotUnmarshalErr{}} 212 for { 213 str.chain = append(str.chain, e) 214 215 if e.Val != "" { 216 str.val = e.Val 217 } 218 219 // Terminal error was of type *HtmlCannotUnmarshalErr and had no children 220 if e.Err == nil { 221 return str 222 } 223 224 if e2, ok := e.Err.(*HtmlCannotUnmarshalErr); ok { 225 e = e2 226 continue 227 } 228 229 // Child error was not a *HtmlCannotUnmarshalErr; print its message 230 str.tail = e.Err 231 return str 232 } 233 } 234 235 func (e *HtmlCannotUnmarshalErr) Error() string { 236 return e.unwind().Error() 237 } 238 239 type html2ValFunc func(*goquery.Selection) string 240 241 type html2QueryTag string 242 243 func (tag html2QueryTag) preprocess(s *goquery.Selection) *goquery.Selection { 244 arr := strings.Split(string(tag), ",") 245 var offset int 246 for len(arr)-1 > offset && arr[offset][0] == html2Prefix { 247 m := arr[offset][1:] 248 v := reflect.ValueOf(s).MethodByName(m) 249 if !v.IsValid() { 250 return s 251 } 252 253 result := v.Call(nil) 254 255 if sel, ok := result[0].Interface().(*goquery.Selection); ok { 256 s = sel 257 } 258 offset++ 259 } 260 return s 261 } 262 263 func (tag html2QueryTag) selector(which int) string { 264 arr := strings.Split(string(tag), ",") 265 if which > len(arr)-1 { 266 return "" 267 } 268 var offset int 269 for len(arr) > offset && arr[offset][0] == html2Prefix { 270 offset++ 271 } 272 return arr[which+offset] 273 } 274 275 var ( 276 html2TextVal html2ValFunc = func(s *goquery.Selection) string { 277 return strings.TrimSpace(s.Text()) 278 } 279 html2Val = func(s *goquery.Selection) string { 280 str, _ := s.Html() 281 return strings.TrimSpace(str) 282 } 283 284 html2vfMut = sync.Mutex{} 285 html2vfCache = map[html2QueryTag]html2ValFunc{} 286 ) 287 288 func html2AttrFunc(attr string) html2ValFunc { 289 return func(s *goquery.Selection) string { 290 str, _ := s.Attr(attr) 291 return str 292 } 293 } 294 295 func (tag html2QueryTag) valFunc() html2ValFunc { 296 html2vfMut.Lock() 297 defer html2vfMut.Unlock() 298 299 if fn := html2vfCache[tag]; fn != nil { 300 return fn 301 } 302 303 srcArr := strings.Split(string(tag), ",") 304 if len(srcArr) < 2 { 305 html2vfCache[tag] = html2TextVal 306 return html2TextVal 307 } 308 309 src := srcArr[1] 310 311 var f html2ValFunc 312 switch { 313 case src[0] == '[': 314 // [someattr] will return value of .Attr("someattr") 315 attr := src[1 : len(src)-1] 316 f = html2AttrFunc(attr) 317 case src == "html": 318 f = html2Val 319 case src == "text": 320 f = html2TextVal 321 default: 322 f = html2TextVal 323 } 324 325 html2vfCache[tag] = f 326 return f 327 } 328 329 // popVal should allow us to handle arbitrarily nested maps as well as the 330 // cleanly handling the possible of map[literal]literal by just delegating 331 // back to `html2UnmarshalByType`. 332 func (tag html2QueryTag) popVal() html2QueryTag { 333 arr := strings.Split(string(tag), ",") 334 if len(arr) < 2 { 335 return tag 336 } 337 newA := []string{arr[0]} 338 newA = append(newA, arr[2:]...) 339 340 return html2QueryTag(strings.Join(newA, ",")) 341 } 342 343 func html2WrapUnErr(err error, v reflect.Value) error { 344 if err == nil { 345 return nil 346 } 347 348 return &HtmlCannotUnmarshalErr{ 349 V: v, 350 Reason: html2CustomUnmarshalErr, 351 Err: err, 352 } 353 } 354 355 // HtmlUnmarshalSelection will unmarshal a goquery.Selection into an interface 356 // appropriately and with goquery tags. 357 func HtmlUnmarshalSelection(s *goquery.Selection, face interface{}) error { 358 v := reflect.ValueOf(face) 359 360 // Must come before v.IsNil() else IsNil panics on NonPointer value 361 if v.Kind() != reflect.Ptr { 362 return &HtmlCannotUnmarshalErr{V: v, Reason: html2NonPointer} 363 } 364 365 if face == nil || v.IsNil() { 366 return &HtmlCannotUnmarshalErr{V: v, Reason: html2NilValue} 367 } 368 369 u, v := reflectUnmarshalHTMLer(v) 370 371 if u != nil { 372 return html2WrapUnErr(u.UnmarshalHTML(s.Nodes), v) 373 } 374 375 return html2UnmarshalByType(s, v, "") 376 } 377 378 func html2UnmarshalByType(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error { 379 u, v := reflectUnmarshalHTMLer(v) 380 381 if u != nil { 382 return html2WrapUnErr(u.UnmarshalHTML(s.Nodes), v) 383 } 384 385 // Handle special cases where we can just set the value directly 386 switch val := v.Interface().(type) { 387 case []*html.Node: 388 val = append(val, s.Nodes...) 389 v.Set(reflect.ValueOf(val)) 390 return nil 391 } 392 393 t := v.Type() 394 395 switch t.Kind() { 396 case reflect.Struct: 397 return html2UnmarshalStruct(s, v) 398 case reflect.Slice: 399 return html2UnmarshalSlice(s, v, tag) 400 case reflect.Array: 401 return html2UnmarshalArray(s, v, tag) 402 case reflect.Map: 403 return html2UnmarshalMap(s, v, tag) 404 default: 405 vf := tag.valFunc() 406 str := vf(s) 407 err := html2UnmarshalLiteral(str, v) 408 if err != nil { 409 return &HtmlCannotUnmarshalErr{ 410 V: v, 411 Reason: html2TypeConversionErr, 412 Err: err, 413 Val: str, 414 } 415 } 416 return nil 417 } 418 } 419 420 func html2UnmarshalLiteral(s string, v reflect.Value) error { 421 t := v.Type() 422 423 switch t.Kind() { 424 case reflect.Interface: 425 if t.NumMethod() == 0 { 426 // For empty interfaces, just set to a string 427 nv := reflect.New(reflect.TypeOf(s)).Elem() 428 nv.Set(reflect.ValueOf(s)) 429 v.Set(nv) 430 } 431 case reflect.Bool: 432 i, err := strconv.ParseBool(s) 433 if err != nil { 434 return err 435 } 436 v.SetBool(i) 437 case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: 438 i, err := strconv.ParseInt(s, 10, 64) 439 if err != nil { 440 return err 441 } 442 v.SetInt(i) 443 case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: 444 i, err := strconv.ParseUint(s, 10, 64) 445 if err != nil { 446 return err 447 } 448 v.SetUint(i) 449 case reflect.Float32, reflect.Float64: 450 i, err := strconv.ParseFloat(s, 64) 451 if err != nil { 452 return err 453 } 454 v.SetFloat(i) 455 case reflect.String: 456 v.SetString(s) 457 } 458 return nil 459 } 460 461 func html2UnmarshalStruct(s *goquery.Selection, v reflect.Value) error { 462 t := v.Type() 463 464 for i := 0; i < t.NumField(); i++ { 465 tag := html2QueryTag(t.Field(i).Tag.Get(html2TagName)) 466 467 if tag == html2Ignore { 468 continue 469 } 470 471 // If tag is empty and the object doesn't implement Unmarshaler, skip 472 if tag == "" { 473 if u, _ := reflectUnmarshalHTMLer(v.Field(i)); u == nil { 474 continue 475 } 476 } 477 478 sel := tag.preprocess(s) 479 if tag != "" { 480 selStr := tag.selector(0) 481 sel = sel.Find(selStr) 482 } 483 484 err := html2UnmarshalByType(sel, v.Field(i), tag) 485 if err != nil { 486 return &HtmlCannotUnmarshalErr{ 487 Reason: html2TypeConversionErr, 488 Err: err, 489 V: v, 490 FldOrIdx: t.Field(i).Name, 491 } 492 } 493 } 494 return nil 495 } 496 497 func html2UnmarshalArray(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error { 498 if v.Type().Len() != len(s.Nodes) { 499 return &HtmlCannotUnmarshalErr{ 500 Reason: html2ArrayLenMismatch, 501 V: v, 502 } 503 } 504 505 for i := 0; i < v.Type().Len(); i++ { 506 err := html2UnmarshalByType(s.Eq(i), v.Index(i), tag) 507 if err != nil { 508 return &HtmlCannotUnmarshalErr{ 509 Reason: html2TypeConversionErr, 510 Err: err, 511 V: v, 512 FldOrIdx: i, 513 } 514 } 515 } 516 517 return nil 518 } 519 520 func html2UnmarshalSlice(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error { 521 slice := v 522 eleT := v.Type().Elem() 523 524 for i := 0; i < s.Length(); i++ { 525 newV := reflect.New(TypeElem(eleT)) 526 527 err := html2UnmarshalByType(s.Eq(i), newV, tag) 528 529 if err != nil { 530 return &HtmlCannotUnmarshalErr{ 531 Reason: html2TypeConversionErr, 532 Err: err, 533 V: v, 534 FldOrIdx: i, 535 } 536 } 537 538 if eleT.Kind() != reflect.Ptr { 539 newV = newV.Elem() 540 } 541 542 v = reflect.Append(v, newV) 543 } 544 545 slice.Set(v) 546 return nil 547 } 548 549 func html2ChildrenUntilMatch(s *goquery.Selection, sel string) *goquery.Selection { 550 orig := s 551 s = s.Children() 552 for s.Length() != 0 && s.Filter(sel).Length() == 0 { 553 s = s.Children() 554 } 555 if s.Length() == 0 { 556 return orig 557 } 558 return s.Filter(sel) 559 } 560 561 func html2UnmarshalMap(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error { 562 // Make new map here because indirect for some Reason doesn't help us out 563 if v.IsNil() { 564 v.Set(reflect.MakeMap(v.Type())) 565 } 566 567 keyT, eleT := v.Type().Key(), v.Type().Elem() 568 569 if tag.selector(1) == "" { 570 // We need minimum one value selector to determine the map key 571 return &HtmlCannotUnmarshalErr{ 572 Reason: html2MissingValSelector, 573 V: v, 574 } 575 } 576 577 valTag := tag 578 579 // Find children at the same level that match the given selector 580 s = html2ChildrenUntilMatch(s, tag.selector(1)) 581 // Then augment the selector we will pass down to the next unmarshal step 582 valTag = valTag.popVal() 583 584 var err error 585 s.EachWithBreak(func(_ int, subS *goquery.Selection) bool { 586 newK, newV := reflect.New(TypeElem(keyT)), reflect.New(TypeElem(eleT)) 587 588 err = html2UnmarshalByType(subS, newK, tag) 589 if err != nil { 590 err = &HtmlCannotUnmarshalErr{ 591 Reason: html2MapKeyUnmarshalErr, 592 V: v, 593 Err: err, 594 FldOrIdx: newK.Interface(), 595 Val: valTag.valFunc()(subS), 596 } 597 return false 598 } 599 600 err = html2UnmarshalByType(subS, newV, valTag) 601 if err != nil { 602 return false 603 } 604 605 if eleT.Kind() != reflect.Ptr { 606 newV = newV.Elem() 607 } 608 if keyT.Kind() != reflect.Ptr { 609 newK = newK.Elem() 610 } 611 612 v.SetMapIndex(newK, newV) 613 614 return true 615 }) 616 617 if err != nil { 618 return &HtmlCannotUnmarshalErr{ 619 Reason: html2TypeConversionErr, 620 Err: err, 621 V: v, 622 } 623 } 624 625 return nil 626 }