github.com/angenalZZZ/gofunc@v0.0.0-20210507121333-48ff1be3917b/f/encoding_html.go (about)

     1  package f
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"github.com/PuerkitoBio/goquery"
     7  	"golang.org/x/net/html"
     8  	"io"
     9  	"reflect"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  )
    14  
    15  const (
    16  	// https://github.com/andrewstuart/goq
    17  	// eg. type example struct { Title string `html:"h1"` }
    18  	// All important settings
    19  	html2TagName = "html"
    20  	html2Prefix  = '!'
    21  	html2Ignore  = "!ignore"
    22  
    23  	// All "Reason" fields within HtmlCannotUnmarshalErr will be constants and part of this list
    24  	html2NonPointer         = "non-pointer value"
    25  	html2NilValue           = "destination argument is nil"
    26  	html2ArrayLenMismatch   = "array length does not match document elements found"
    27  	html2CustomUnmarshalErr = "a custom unmarshal implementation threw an error"
    28  	html2TypeConversionErr  = "a type conversion error occurred"
    29  	html2MapKeyUnmarshalErr = "error unmarshal a map key"
    30  	html2MissingValSelector = "at least one value selector must be passed to use as map index"
    31  )
    32  
    33  // NewHtmlDecoder returns a new decoder given an io.Reader
    34  func NewHtmlDecoder(r io.Reader) *HTMLDecoder {
    35  	d := &HTMLDecoder{}
    36  	d.doc, d.err = goquery.NewDocumentFromReader(r)
    37  	return d
    38  }
    39  
    40  // NewHtmlSelection is a quick utility function to get a goquery.Selection from a
    41  // slice of *html.Node. Useful for performing unmarshal, since the decision
    42  // was made to use []*html.Node for maximum flexibility.
    43  func NewHtmlSelection(nodes []*html.Node) *goquery.Selection {
    44  	sel := &goquery.Selection{}
    45  	return sel.AddNodes(nodes...)
    46  }
    47  
    48  // HtmlUnmarshal takes a byte slice and a destination pointer to any
    49  // interface{}, and unmarshal the document into the destination based on the
    50  // rules above. Any error returned here will likely be of type
    51  // HtmlCannotUnmarshalErr, though an initial goquery error will pass through directly.
    52  func HtmlUnmarshal(bs []byte, v interface{}) error {
    53  	d, err := goquery.NewDocumentFromReader(bytes.NewReader(bs))
    54  
    55  	if err != nil {
    56  		return err
    57  	}
    58  
    59  	return HtmlUnmarshalSelection(d.Selection, v)
    60  }
    61  
    62  // HTMLDecoder implements the same API you will see in encoding/xml and
    63  // encoding/json except that we do not currently support proper streaming
    64  // decoding as it is not supported by goquery upstream.
    65  type HTMLDecoder struct {
    66  	err error
    67  	doc *goquery.Document
    68  }
    69  
    70  // Decode will unmarshal the contents of the decoder when given an instance of
    71  // an annotated type as its argument. It will return any errors encountered
    72  // during either parsing the document or unmarshal into the given object.
    73  func (d *HTMLDecoder) Decode(dest interface{}) error {
    74  	if d.err != nil {
    75  		return d.err
    76  	}
    77  	if d.doc == nil {
    78  		return &HtmlCannotUnmarshalErr{
    79  			Reason: "resulting document was nil",
    80  		}
    81  	}
    82  
    83  	return HtmlUnmarshalSelection(d.doc.Selection, dest)
    84  }
    85  
    86  // UnmarshalHTMLer allows for custom implementations of unmarshal logic
    87  type UnmarshalHTMLer interface {
    88  	UnmarshalHTML([]*html.Node) error
    89  }
    90  
    91  // reflectUnmarshalHTMLer is stolen mostly from pkg/encoding/json/decode.go and removed some
    92  // cases (handling `null`) that go doesn't need to handle.
    93  func reflectUnmarshalHTMLer(v reflect.Value) (UnmarshalHTMLer, reflect.Value) {
    94  	if v.Kind() != reflect.Ptr && v.Type().Name() != "" && v.CanAddr() {
    95  		v = v.Addr()
    96  	}
    97  	for {
    98  		// Load value from interface, but only if the result will be
    99  		// usefully addressable.
   100  		if v.Kind() == reflect.Interface && !v.IsNil() {
   101  			e := v.Elem()
   102  			if e.Kind() == reflect.Ptr && !e.IsNil() && (e.Elem().Kind() == reflect.Ptr) {
   103  				v = e
   104  				continue
   105  			}
   106  		}
   107  
   108  		if v.Kind() != reflect.Ptr {
   109  			break
   110  		}
   111  
   112  		if v.IsNil() {
   113  			v.Set(reflect.New(TypeElem(v.Type())))
   114  		}
   115  		if v.Type().NumMethod() > 0 {
   116  			if u, ok := v.Interface().(UnmarshalHTMLer); ok {
   117  				return u, reflect.Value{}
   118  			}
   119  		}
   120  		v = v.Elem()
   121  	}
   122  	return nil, v
   123  }
   124  
   125  // HtmlCannotUnmarshalErr represents an error returned by the goquery HtmlUnmarshal
   126  // and helps consumers in programmatically diagnosing the cause of their error.
   127  type HtmlCannotUnmarshalErr struct {
   128  	Err      error
   129  	Val      string
   130  	FldOrIdx interface{}
   131  
   132  	V      reflect.Value
   133  	Reason string
   134  }
   135  
   136  // This type is a mid-level abstraction to help understand the error printing logic
   137  type html2ErrorChain struct {
   138  	chain []*HtmlCannotUnmarshalErr
   139  	val   string
   140  	tail  error
   141  }
   142  
   143  // tPath returns the type path in the same string format one might use to access
   144  // the nested value in go code. This should hopefully help make debugging easier.
   145  func (e html2ErrorChain) tPath() string {
   146  	nest := ""
   147  
   148  	for _, err := range e.chain {
   149  		if err.FldOrIdx != nil {
   150  			switch nesting := err.FldOrIdx.(type) {
   151  			case string:
   152  				switch err.V.Type().Kind() {
   153  				case reflect.Map:
   154  					nest += fmt.Sprintf("[%q]", nesting)
   155  				case reflect.Struct:
   156  					nest += fmt.Sprintf(".%s", nesting)
   157  				}
   158  			case int:
   159  				nest += fmt.Sprintf("[%d]", nesting)
   160  			case *int:
   161  				nest += fmt.Sprintf("[%d]", *nesting)
   162  			default:
   163  				fmt.Printf("err.FldOrIdx = %#v\n", err.FldOrIdx)
   164  				nest += fmt.Sprintf("[%v]", nesting)
   165  			}
   166  		}
   167  	}
   168  
   169  	return nest
   170  }
   171  
   172  func (e html2ErrorChain) last() *HtmlCannotUnmarshalErr {
   173  	return e.chain[len(e.chain)-1]
   174  }
   175  
   176  // Error gives a human-readable error message for debugging purposes.
   177  func (e html2ErrorChain) Error() string {
   178  	last := e.last()
   179  
   180  	// Avoid panic if we cannot get a type name for the Value
   181  	t := "unknown: invalid value"
   182  	if last.V.IsValid() {
   183  		t = last.V.Type().String()
   184  	}
   185  
   186  	msg := "could not unmarshal "
   187  
   188  	if e.val != "" {
   189  		msg += fmt.Sprintf("value %q ", e.val)
   190  	}
   191  
   192  	msg += fmt.Sprintf(
   193  		"into '%s%s' (type %s): %s",
   194  		e.chain[0].V.Type(),
   195  		e.tPath(),
   196  		t,
   197  		last.Reason,
   198  	)
   199  
   200  	// If a generic error was reported elsewhere, report its message last
   201  	if e.tail != nil {
   202  		msg = msg + ": " + e.tail.Error()
   203  	}
   204  
   205  	return msg
   206  }
   207  
   208  // Traverse e.Err, printing hopefully helpful type info until there are no more
   209  // chained errors.
   210  func (e *HtmlCannotUnmarshalErr) unwind() *html2ErrorChain {
   211  	str := &html2ErrorChain{chain: []*HtmlCannotUnmarshalErr{}}
   212  	for {
   213  		str.chain = append(str.chain, e)
   214  
   215  		if e.Val != "" {
   216  			str.val = e.Val
   217  		}
   218  
   219  		// Terminal error was of type *HtmlCannotUnmarshalErr and had no children
   220  		if e.Err == nil {
   221  			return str
   222  		}
   223  
   224  		if e2, ok := e.Err.(*HtmlCannotUnmarshalErr); ok {
   225  			e = e2
   226  			continue
   227  		}
   228  
   229  		// Child error was not a *HtmlCannotUnmarshalErr; print its message
   230  		str.tail = e.Err
   231  		return str
   232  	}
   233  }
   234  
   235  func (e *HtmlCannotUnmarshalErr) Error() string {
   236  	return e.unwind().Error()
   237  }
   238  
   239  type html2ValFunc func(*goquery.Selection) string
   240  
   241  type html2QueryTag string
   242  
   243  func (tag html2QueryTag) preprocess(s *goquery.Selection) *goquery.Selection {
   244  	arr := strings.Split(string(tag), ",")
   245  	var offset int
   246  	for len(arr)-1 > offset && arr[offset][0] == html2Prefix {
   247  		m := arr[offset][1:]
   248  		v := reflect.ValueOf(s).MethodByName(m)
   249  		if !v.IsValid() {
   250  			return s
   251  		}
   252  
   253  		result := v.Call(nil)
   254  
   255  		if sel, ok := result[0].Interface().(*goquery.Selection); ok {
   256  			s = sel
   257  		}
   258  		offset++
   259  	}
   260  	return s
   261  }
   262  
   263  func (tag html2QueryTag) selector(which int) string {
   264  	arr := strings.Split(string(tag), ",")
   265  	if which > len(arr)-1 {
   266  		return ""
   267  	}
   268  	var offset int
   269  	for len(arr) > offset && arr[offset][0] == html2Prefix {
   270  		offset++
   271  	}
   272  	return arr[which+offset]
   273  }
   274  
   275  var (
   276  	html2TextVal html2ValFunc = func(s *goquery.Selection) string {
   277  		return strings.TrimSpace(s.Text())
   278  	}
   279  	html2Val = func(s *goquery.Selection) string {
   280  		str, _ := s.Html()
   281  		return strings.TrimSpace(str)
   282  	}
   283  
   284  	html2vfMut   = sync.Mutex{}
   285  	html2vfCache = map[html2QueryTag]html2ValFunc{}
   286  )
   287  
   288  func html2AttrFunc(attr string) html2ValFunc {
   289  	return func(s *goquery.Selection) string {
   290  		str, _ := s.Attr(attr)
   291  		return str
   292  	}
   293  }
   294  
   295  func (tag html2QueryTag) valFunc() html2ValFunc {
   296  	html2vfMut.Lock()
   297  	defer html2vfMut.Unlock()
   298  
   299  	if fn := html2vfCache[tag]; fn != nil {
   300  		return fn
   301  	}
   302  
   303  	srcArr := strings.Split(string(tag), ",")
   304  	if len(srcArr) < 2 {
   305  		html2vfCache[tag] = html2TextVal
   306  		return html2TextVal
   307  	}
   308  
   309  	src := srcArr[1]
   310  
   311  	var f html2ValFunc
   312  	switch {
   313  	case src[0] == '[':
   314  		// [someattr] will return value of .Attr("someattr")
   315  		attr := src[1 : len(src)-1]
   316  		f = html2AttrFunc(attr)
   317  	case src == "html":
   318  		f = html2Val
   319  	case src == "text":
   320  		f = html2TextVal
   321  	default:
   322  		f = html2TextVal
   323  	}
   324  
   325  	html2vfCache[tag] = f
   326  	return f
   327  }
   328  
   329  // popVal should allow us to handle arbitrarily nested maps as well as the
   330  // cleanly handling the possible of map[literal]literal by just delegating
   331  // back to `html2UnmarshalByType`.
   332  func (tag html2QueryTag) popVal() html2QueryTag {
   333  	arr := strings.Split(string(tag), ",")
   334  	if len(arr) < 2 {
   335  		return tag
   336  	}
   337  	newA := []string{arr[0]}
   338  	newA = append(newA, arr[2:]...)
   339  
   340  	return html2QueryTag(strings.Join(newA, ","))
   341  }
   342  
   343  func html2WrapUnErr(err error, v reflect.Value) error {
   344  	if err == nil {
   345  		return nil
   346  	}
   347  
   348  	return &HtmlCannotUnmarshalErr{
   349  		V:      v,
   350  		Reason: html2CustomUnmarshalErr,
   351  		Err:    err,
   352  	}
   353  }
   354  
   355  // HtmlUnmarshalSelection will unmarshal a goquery.Selection into an interface
   356  // appropriately and with goquery tags.
   357  func HtmlUnmarshalSelection(s *goquery.Selection, face interface{}) error {
   358  	v := reflect.ValueOf(face)
   359  
   360  	// Must come before v.IsNil() else IsNil panics on NonPointer value
   361  	if v.Kind() != reflect.Ptr {
   362  		return &HtmlCannotUnmarshalErr{V: v, Reason: html2NonPointer}
   363  	}
   364  
   365  	if face == nil || v.IsNil() {
   366  		return &HtmlCannotUnmarshalErr{V: v, Reason: html2NilValue}
   367  	}
   368  
   369  	u, v := reflectUnmarshalHTMLer(v)
   370  
   371  	if u != nil {
   372  		return html2WrapUnErr(u.UnmarshalHTML(s.Nodes), v)
   373  	}
   374  
   375  	return html2UnmarshalByType(s, v, "")
   376  }
   377  
   378  func html2UnmarshalByType(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error {
   379  	u, v := reflectUnmarshalHTMLer(v)
   380  
   381  	if u != nil {
   382  		return html2WrapUnErr(u.UnmarshalHTML(s.Nodes), v)
   383  	}
   384  
   385  	// Handle special cases where we can just set the value directly
   386  	switch val := v.Interface().(type) {
   387  	case []*html.Node:
   388  		val = append(val, s.Nodes...)
   389  		v.Set(reflect.ValueOf(val))
   390  		return nil
   391  	}
   392  
   393  	t := v.Type()
   394  
   395  	switch t.Kind() {
   396  	case reflect.Struct:
   397  		return html2UnmarshalStruct(s, v)
   398  	case reflect.Slice:
   399  		return html2UnmarshalSlice(s, v, tag)
   400  	case reflect.Array:
   401  		return html2UnmarshalArray(s, v, tag)
   402  	case reflect.Map:
   403  		return html2UnmarshalMap(s, v, tag)
   404  	default:
   405  		vf := tag.valFunc()
   406  		str := vf(s)
   407  		err := html2UnmarshalLiteral(str, v)
   408  		if err != nil {
   409  			return &HtmlCannotUnmarshalErr{
   410  				V:      v,
   411  				Reason: html2TypeConversionErr,
   412  				Err:    err,
   413  				Val:    str,
   414  			}
   415  		}
   416  		return nil
   417  	}
   418  }
   419  
   420  func html2UnmarshalLiteral(s string, v reflect.Value) error {
   421  	t := v.Type()
   422  
   423  	switch t.Kind() {
   424  	case reflect.Interface:
   425  		if t.NumMethod() == 0 {
   426  			// For empty interfaces, just set to a string
   427  			nv := reflect.New(reflect.TypeOf(s)).Elem()
   428  			nv.Set(reflect.ValueOf(s))
   429  			v.Set(nv)
   430  		}
   431  	case reflect.Bool:
   432  		i, err := strconv.ParseBool(s)
   433  		if err != nil {
   434  			return err
   435  		}
   436  		v.SetBool(i)
   437  	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
   438  		i, err := strconv.ParseInt(s, 10, 64)
   439  		if err != nil {
   440  			return err
   441  		}
   442  		v.SetInt(i)
   443  	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
   444  		i, err := strconv.ParseUint(s, 10, 64)
   445  		if err != nil {
   446  			return err
   447  		}
   448  		v.SetUint(i)
   449  	case reflect.Float32, reflect.Float64:
   450  		i, err := strconv.ParseFloat(s, 64)
   451  		if err != nil {
   452  			return err
   453  		}
   454  		v.SetFloat(i)
   455  	case reflect.String:
   456  		v.SetString(s)
   457  	}
   458  	return nil
   459  }
   460  
   461  func html2UnmarshalStruct(s *goquery.Selection, v reflect.Value) error {
   462  	t := v.Type()
   463  
   464  	for i := 0; i < t.NumField(); i++ {
   465  		tag := html2QueryTag(t.Field(i).Tag.Get(html2TagName))
   466  
   467  		if tag == html2Ignore {
   468  			continue
   469  		}
   470  
   471  		// If tag is empty and the object doesn't implement Unmarshaler, skip
   472  		if tag == "" {
   473  			if u, _ := reflectUnmarshalHTMLer(v.Field(i)); u == nil {
   474  				continue
   475  			}
   476  		}
   477  
   478  		sel := tag.preprocess(s)
   479  		if tag != "" {
   480  			selStr := tag.selector(0)
   481  			sel = sel.Find(selStr)
   482  		}
   483  
   484  		err := html2UnmarshalByType(sel, v.Field(i), tag)
   485  		if err != nil {
   486  			return &HtmlCannotUnmarshalErr{
   487  				Reason:   html2TypeConversionErr,
   488  				Err:      err,
   489  				V:        v,
   490  				FldOrIdx: t.Field(i).Name,
   491  			}
   492  		}
   493  	}
   494  	return nil
   495  }
   496  
   497  func html2UnmarshalArray(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error {
   498  	if v.Type().Len() != len(s.Nodes) {
   499  		return &HtmlCannotUnmarshalErr{
   500  			Reason: html2ArrayLenMismatch,
   501  			V:      v,
   502  		}
   503  	}
   504  
   505  	for i := 0; i < v.Type().Len(); i++ {
   506  		err := html2UnmarshalByType(s.Eq(i), v.Index(i), tag)
   507  		if err != nil {
   508  			return &HtmlCannotUnmarshalErr{
   509  				Reason:   html2TypeConversionErr,
   510  				Err:      err,
   511  				V:        v,
   512  				FldOrIdx: i,
   513  			}
   514  		}
   515  	}
   516  
   517  	return nil
   518  }
   519  
   520  func html2UnmarshalSlice(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error {
   521  	slice := v
   522  	eleT := v.Type().Elem()
   523  
   524  	for i := 0; i < s.Length(); i++ {
   525  		newV := reflect.New(TypeElem(eleT))
   526  
   527  		err := html2UnmarshalByType(s.Eq(i), newV, tag)
   528  
   529  		if err != nil {
   530  			return &HtmlCannotUnmarshalErr{
   531  				Reason:   html2TypeConversionErr,
   532  				Err:      err,
   533  				V:        v,
   534  				FldOrIdx: i,
   535  			}
   536  		}
   537  
   538  		if eleT.Kind() != reflect.Ptr {
   539  			newV = newV.Elem()
   540  		}
   541  
   542  		v = reflect.Append(v, newV)
   543  	}
   544  
   545  	slice.Set(v)
   546  	return nil
   547  }
   548  
   549  func html2ChildrenUntilMatch(s *goquery.Selection, sel string) *goquery.Selection {
   550  	orig := s
   551  	s = s.Children()
   552  	for s.Length() != 0 && s.Filter(sel).Length() == 0 {
   553  		s = s.Children()
   554  	}
   555  	if s.Length() == 0 {
   556  		return orig
   557  	}
   558  	return s.Filter(sel)
   559  }
   560  
   561  func html2UnmarshalMap(s *goquery.Selection, v reflect.Value, tag html2QueryTag) error {
   562  	// Make new map here because indirect for some Reason doesn't help us out
   563  	if v.IsNil() {
   564  		v.Set(reflect.MakeMap(v.Type()))
   565  	}
   566  
   567  	keyT, eleT := v.Type().Key(), v.Type().Elem()
   568  
   569  	if tag.selector(1) == "" {
   570  		// We need minimum one value selector to determine the map key
   571  		return &HtmlCannotUnmarshalErr{
   572  			Reason: html2MissingValSelector,
   573  			V:      v,
   574  		}
   575  	}
   576  
   577  	valTag := tag
   578  
   579  	// Find children at the same level that match the given selector
   580  	s = html2ChildrenUntilMatch(s, tag.selector(1))
   581  	// Then augment the selector we will pass down to the next unmarshal step
   582  	valTag = valTag.popVal()
   583  
   584  	var err error
   585  	s.EachWithBreak(func(_ int, subS *goquery.Selection) bool {
   586  		newK, newV := reflect.New(TypeElem(keyT)), reflect.New(TypeElem(eleT))
   587  
   588  		err = html2UnmarshalByType(subS, newK, tag)
   589  		if err != nil {
   590  			err = &HtmlCannotUnmarshalErr{
   591  				Reason:   html2MapKeyUnmarshalErr,
   592  				V:        v,
   593  				Err:      err,
   594  				FldOrIdx: newK.Interface(),
   595  				Val:      valTag.valFunc()(subS),
   596  			}
   597  			return false
   598  		}
   599  
   600  		err = html2UnmarshalByType(subS, newV, valTag)
   601  		if err != nil {
   602  			return false
   603  		}
   604  
   605  		if eleT.Kind() != reflect.Ptr {
   606  			newV = newV.Elem()
   607  		}
   608  		if keyT.Kind() != reflect.Ptr {
   609  			newK = newK.Elem()
   610  		}
   611  
   612  		v.SetMapIndex(newK, newV)
   613  
   614  		return true
   615  	})
   616  
   617  	if err != nil {
   618  		return &HtmlCannotUnmarshalErr{
   619  			Reason: html2TypeConversionErr,
   620  			Err:    err,
   621  			V:      v,
   622  		}
   623  	}
   624  
   625  	return nil
   626  }