github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/pkg/importer/feed/parse.go (about)

     1  /*
     2  Copyright 2014 The Camlistore Authors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package feed
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/xml"
    22  	"fmt"
    23  	"html"
    24  	"log"
    25  	"net/url"
    26  	"strings"
    27  	"time"
    28  
    29  	"camlistore.org/pkg/importer/feed/atom"
    30  	"camlistore.org/pkg/importer/feed/rdf"
    31  	"camlistore.org/pkg/importer/feed/rss"
    32  	"camlistore.org/third_party/code.google.com/p/go-charset/charset"
    33  	_ "camlistore.org/third_party/code.google.com/p/go-charset/data"
    34  )
    35  
// feed is the parser-neutral representation of a syndication feed,
// produced from Atom, RSS, or RDF input by the parse* functions below.
type feed struct {
	Title   string    // feed-level title
	Updated time.Time // feed-level update time, when the source supplies one
	Link    string    // link to the feed's site (resolved/normalized in parseFix)
	Items   []*item   // individual entries
}
    42  
// item is a single feed entry, normalized across the Atom/RSS/RDF parsers.
type item struct {
	ID           string    // unique id; backfilled from Link or Title in parseFix if empty
	Title        string    // plain-text title (HTML entities unescaped)
	Link         string    // entry link; resolved to absolute form in parseFix
	Created      time.Time // time this item was first seen by us (set in parseFix)
	Published    time.Time // publication time from the source feed
	Updated      time.Time // last-updated time from the source feed
	Author       string    // author name, when present
	Content      string    // entry body (content or description/summary)
	MediaContent string    // audio enclosure URL, when present (RSS only)
}
    54  
    55  func parseFeed(body []byte, feedURL string) (*feed, error) {
    56  	var f *feed
    57  	var atomerr, rsserr, rdferr error
    58  	f, atomerr = parseAtom(body)
    59  	if f == nil {
    60  		f, rsserr = parseRSS(body)
    61  	}
    62  	if f == nil {
    63  		f, rdferr = parseRDF(body)
    64  	}
    65  	if f == nil {
    66  		log.Printf("atom parse error: %s", atomerr.Error())
    67  		log.Printf("xml parse error: %s", rsserr.Error())
    68  		log.Printf("rdf parse error: %s", rdferr.Error())
    69  		return nil, fmt.Errorf("Could not parse feed data")
    70  	}
    71  	return f, nil
    72  }
    73  
// parseAtom decodes body as an Atom feed and converts it to the internal
// feed representation. Relative links are resolved against the feed's
// xml:base, and each entry's links against that entry's own xml:base
// (falling back to the feed base).
func parseAtom(body []byte) (*feed, error) {
	var f feed
	var a atom.Feed
	d := xml.NewDecoder(bytes.NewReader(body))
	// Support feeds whose XML prolog declares a non-UTF-8 encoding.
	d.CharsetReader = charset.NewReader
	if err := d.Decode(&a); err != nil {
		return nil, err
	}
	f.Title = a.Title
	if t, err := parseDate(string(a.Updated)); err == nil {
		f.Updated = t
	}
	// Feed-level base URL for resolving relative links; fall back to an
	// empty URL when xml:base is absent or unparsable.
	fb, err := url.Parse(a.XMLBase)
	if err != nil {
		fb, _ = url.Parse("")
	}
	if len(a.Link) > 0 {
		f.Link = findBestAtomLink(a.Link)
		if l, err := fb.Parse(f.Link); err == nil {
			f.Link = l.String()
		}
	}

	for _, i := range a.Entry {
		// Entries may carry their own xml:base, resolved relative to the
		// feed base; on error, reuse the feed base unchanged.
		eb, err := fb.Parse(i.XMLBase)
		if err != nil {
			eb = fb
		}
		st := item{
			ID:    i.ID,
			Title: atomTitle(i.Title),
		}
		if t, err := parseDate(string(i.Updated)); err == nil {
			st.Updated = t
		}
		if t, err := parseDate(string(i.Published)); err == nil {
			st.Published = t
		}
		if len(i.Link) > 0 {
			st.Link = findBestAtomLink(i.Link)
			if l, err := eb.Parse(st.Link); err == nil {
				st.Link = l.String()
			}
		}
		if i.Author != nil {
			st.Author = i.Author.Name
		}
		// Prefer the full content element; fall back to its raw inner XML
		// when the parsed body is only whitespace, then to the summary.
		if i.Content != nil {
			if len(strings.TrimSpace(i.Content.Body)) != 0 {
				st.Content = i.Content.Body
			} else if len(i.Content.InnerXML) != 0 {
				st.Content = i.Content.InnerXML
			}
		} else if i.Summary != nil {
			st.Content = i.Summary.Body
		}
		f.Items = append(f.Items, &st)
	}
	return &f, nil
}
   134  
// parseRSS decodes body as an RSS feed and converts it to the internal
// feed representation.
func parseRSS(body []byte) (*feed, error) {
	var f feed
	var r rss.RSS
	d := xml.NewDecoder(bytes.NewReader(body))
	// Support feeds whose XML prolog declares a non-UTF-8 encoding.
	d.CharsetReader = charset.NewReader
	// NOTE(review): assigns elements without an explicit namespace to a
	// synthetic "DefaultSpace" namespace — presumably so the rss struct's
	// XML tags can distinguish un-namespaced elements from namespaced
	// extensions; confirm against the rss package's field tags.
	d.DefaultSpace = "DefaultSpace"
	if err := d.Decode(&r); err != nil {
		return nil, err
	}
	f.Title = r.Title
	// Prefer lastBuildDate over pubDate for the feed timestamp.
	if t, err := parseDate(r.LastBuildDate, r.PubDate); err == nil {
		f.Updated = t
	}
	f.Link = r.BaseLink()

	for _, i := range r.Items {
		st := item{
			Link:   i.Link,
			Author: i.Author,
		}
		// Prefer full content over the description when both are present.
		if i.Content != "" {
			st.Content = i.Content
		} else if i.Description != "" {
			st.Content = i.Description
		}
		if i.Title != "" {
			st.Title = i.Title
		} else if i.Description != "" {
			st.Title = i.Description
		}
		// When the description doubled as both title and content, drop the
		// title so the text is not shown twice.
		if st.Content == st.Title {
			st.Title = ""
		}
		st.Title = textTitle(st.Title)
		if i.Guid != nil {
			st.ID = i.Guid.Guid
		}
		// Pick up podcast audio from an enclosure, else from media:content.
		if i.Enclosure != nil && strings.HasPrefix(i.Enclosure.Type, "audio/") {
			st.MediaContent = i.Enclosure.Url
		} else if i.Media != nil && strings.HasPrefix(i.Media.Type, "audio/") {
			st.MediaContent = i.Media.URL
		}
		if t, err := parseDate(i.PubDate, i.Date, i.Published); err == nil {
			st.Published = t
			st.Updated = t
		}
		f.Items = append(f.Items, &st)
	}

	return &f, nil
}
   186  
   187  func parseRDF(body []byte) (*feed, error) {
   188  	var f feed
   189  	var rd rdf.RDF
   190  	d := xml.NewDecoder(bytes.NewReader(body))
   191  	d.CharsetReader = charset.NewReader
   192  	if err := d.Decode(&rd); err != nil {
   193  		return nil, err
   194  	}
   195  	if rd.Channel != nil {
   196  		f.Title = rd.Channel.Title
   197  		f.Link = rd.Channel.Link
   198  		if t, err := parseDate(rd.Channel.Date); err == nil {
   199  			f.Updated = t
   200  		}
   201  	}
   202  
   203  	for _, i := range rd.Item {
   204  		st := item{
   205  			ID:     i.About,
   206  			Title:  textTitle(i.Title),
   207  			Link:   i.Link,
   208  			Author: i.Creator,
   209  		}
   210  		if len(i.Description) > 0 {
   211  			st.Content = html.UnescapeString(i.Description)
   212  		} else if len(i.Content) > 0 {
   213  			st.Content = html.UnescapeString(i.Content)
   214  		}
   215  		if t, err := parseDate(i.Date); err == nil {
   216  			st.Published = t
   217  			st.Updated = t
   218  		}
   219  		f.Items = append(f.Items, &st)
   220  	}
   221  
   222  	return &f, nil
   223  }
   224  
   225  func textTitle(t string) string {
   226  	return html.UnescapeString(t)
   227  }
   228  
// atomTitle extracts a plain-text title from an Atom text construct,
// tolerating a nil element (returns "").
func atomTitle(t *atom.Text) string {
	if t == nil {
		return ""
	}
	if t.Type == "html" {
		// HTML-typed titles should additionally have their tags stripped,
		// as goread does; the sanitizer dependency is unavailable here, so
		// this branch deliberately falls through to plain unescaping.
		// see: https://github.com/mjibson/goread/blob/59aec794f3ef87b36c1bac029438c33a6aa6d8d3/utils.go#L533
		//return html.UnescapeString(sanitizer.StripTags(t.Body))
	}
	return textTitle(t.Body)
}
   239  
   240  func findBestAtomLink(links []atom.Link) string {
   241  	getScore := func(l atom.Link) int {
   242  		switch {
   243  		case l.Rel == "hub":
   244  			return 0
   245  		case l.Rel == "alternate" && l.Type == "text/html":
   246  			return 5
   247  		case l.Type == "text/html":
   248  			return 4
   249  		case l.Rel == "self":
   250  			return 2
   251  		case l.Rel == "":
   252  			return 3
   253  		default:
   254  			return 1
   255  		}
   256  	}
   257  
   258  	var bestlink string
   259  	bestscore := -1
   260  	for _, l := range links {
   261  		score := getScore(l)
   262  		if score > bestscore {
   263  			bestlink = l.Href
   264  			bestscore = score
   265  		}
   266  	}
   267  
   268  	return bestlink
   269  }
   270  
// parseFix normalizes a freshly parsed feed in place: trims whitespace,
// resolves the feed link against feedURL, backfills missing item IDs and
// links, resolves item links against the feed link, and stamps each item's
// Created time with now. Items for which no ID can be derived are dropped.
// The error result is currently always nil.
func parseFix(f *feed, feedURL string) (*feed, error) {
	f.Link = strings.TrimSpace(f.Link)
	f.Title = html.UnescapeString(strings.TrimSpace(f.Title))

	// Resolve the feed's link relative to the URL it was fetched from.
	if u, err := url.Parse(feedURL); err == nil {
		if ul, err := u.Parse(f.Link); err == nil {
			f.Link = ul.String()
		}
	}
	// base is used below to absolutize item links; on failure it is nil
	// and item links are left as-is.
	base, err := url.Parse(f.Link)
	if err != nil {
		log.Printf("unable to parse link: %v", f.Link)
	}

	var nss []*item
	now := time.Now()
	for _, s := range f.Items {
		s.Created = now
		s.Link = strings.TrimSpace(s.Link)
		// Derive an ID from the link, then the title; items with neither
		// cannot be deduplicated and are skipped entirely.
		if s.ID == "" {
			if s.Link != "" {
				s.ID = s.Link
			} else if s.Title != "" {
				s.ID = s.Title
			} else {
				log.Printf("item has no id: %v", s)
				continue
			}
		}
		// if a story doesn't have a link, see if its id is a URL
		if s.Link == "" {
			if u, err := url.Parse(s.ID); err == nil {
				s.Link = u.String()
			}
		}
		if base != nil && s.Link != "" {
			link, err := base.Parse(s.Link)
			if err == nil {
				s.Link = link.String()
			} else {
				log.Printf("unable to resolve link: %v", s.Link)
			}
		}
		nss = append(nss, s)
	}
	f.Items = nss

	return f, nil
}
   320  
// dateFormats lists the date/time layouts tried in order by parseDate,
// covering the many malformed and non-standard timestamps seen in
// real-world feeds — presumably collected from feeds in the wild (cf. the
// goread project referenced in atomTitle); standard library layouts come
// last.
var dateFormats = []string{
	"01-02-2006",
	"01/02/2006",
	"01/02/2006 - 15:04",
	"01/02/2006 15:04:05 MST",
	"01/02/2006 3:04 PM",
	"02-01-2006",
	"02/01/2006",
	"02.01.2006 -0700",
	"02/01/2006 - 15:04",
	"02.01.2006 15:04",
	"02/01/2006 15:04:05",
	"02.01.2006 15:04:05",
	"02-01-2006 15:04:05 MST",
	"02/01/2006 15:04 MST",
	"02 Jan 2006",
	"02 Jan 2006 15:04:05",
	"02 Jan 2006 15:04:05 -0700",
	"02 Jan 2006 15:04:05 MST",
	"02 Jan 2006 15:04:05 UT",
	"02 Jan 2006 15:04 MST",
	"02 Monday, Jan 2006 15:04",
	"06-1-2 15:04",
	"06/1/2 15:04",
	"1/2/2006",
	"1/2/2006 15:04:05 MST",
	"1/2/2006 3:04:05 PM",
	"1/2/2006 3:04:05 PM MST",
	"15:04 02.01.2006 -0700",
	"2006-01-02",
	"2006/01/02",
	"2006-01-02 00:00:00.0 15:04:05.0 -0700",
	"2006-01-02 15:04",
	"2006-01-02 15:04:05 -0700",
	"2006-01-02 15:04:05-07:00",
	"2006-01-02 15:04:05-0700",
	"2006-01-02 15:04:05 MST",
	"2006-01-02 15:04:05Z",
	"2006-01-02 at 15:04:05",
	"2006-01-02T15:04:05",
	"2006-01-02T15:04:05:00",
	"2006-01-02T15:04:05 -0700",
	"2006-01-02T15:04:05-07:00",
	"2006-01-02T15:04:05-0700",
	"2006-01-02T15:04:05:-0700",
	"2006-01-02T15:04:05-07:00:00",
	"2006-01-02T15:04:05Z",
	"2006-01-02T15:04-07:00",
	"2006-01-02T15:04Z",
	"2006-1-02T15:04:05Z",
	"2006-1-2",
	"2006-1-2 15:04:05",
	"2006-1-2T15:04:05Z",
	"2006 January 02",
	"2-1-2006",
	"2/1/2006",
	"2.1.2006 15:04:05",
	"2 Jan 2006",
	"2 Jan 2006 15:04:05 -0700",
	"2 Jan 2006 15:04:05 MST",
	"2 Jan 2006 15:04:05 Z",
	"2 January 2006",
	"2 January 2006 15:04:05 -0700",
	"2 January 2006 15:04:05 MST",
	"6-1-2 15:04",
	"6/1/2 15:04",
	"Jan 02, 2006",
	"Jan 02 2006 03:04:05PM",
	"Jan 2, 2006",
	"Jan 2, 2006 15:04:05 MST",
	"Jan 2, 2006 3:04:05 PM",
	"Jan 2, 2006 3:04:05 PM MST",
	"January 02, 2006",
	"January 02, 2006 03:04 PM",
	"January 02, 2006 15:04",
	"January 02, 2006 15:04:05 MST",
	"January 2, 2006",
	"January 2, 2006 03:04 PM",
	"January 2, 2006 15:04:05",
	"January 2, 2006 15:04:05 MST",
	"January 2, 2006, 3:04 p.m.",
	"January 2, 2006 3:04 PM",
	"Mon, 02 Jan 06 15:04:05 MST",
	"Mon, 02 Jan 2006",
	"Mon, 02 Jan 2006 15:04:05",
	"Mon, 02 Jan 2006 15:04:05 00",
	"Mon, 02 Jan 2006 15:04:05 -07",
	"Mon 02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 --0700",
	"Mon, 02 Jan 2006 15:04:05 -07:00",
	"Mon, 02 Jan 2006 15:04:05 -0700",
	"Mon,02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 GMT-0700",
	"Mon , 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05MST",
	"Mon, 02 Jan 2006, 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST -0700",
	"Mon, 02 Jan 2006 15:04:05 MST-07:00",
	"Mon, 02 Jan 2006 15:04:05 UT",
	"Mon, 02 Jan 2006 15:04:05 Z",
	"Mon, 02 Jan 2006 15:04 -0700",
	"Mon, 02 Jan 2006 15:04 MST",
	"Mon,02 Jan 2006 15:04 MST",
	"Mon, 02 Jan 2006 15 -0700",
	"Mon, 02 Jan 2006 3:04:05 PM MST",
	"Mon, 02 January 2006",
	"Mon,02 January 2006 14:04:05 MST",
	"Mon, 2006-01-02 15:04",
	"Mon, 2 Jan 06 15:04:05 -0700",
	"Mon, 2 Jan 06 15:04:05 MST",
	"Mon, 2 Jan 15:04:05 MST",
	"Mon, 2 Jan 2006",
	"Mon,2 Jan 2006",
	"Mon, 2 Jan 2006 15:04",
	"Mon, 2 Jan 2006 15:04:05",
	"Mon, 2 Jan 2006 15:04:05 -0700",
	"Mon, 2 Jan 2006 15:04:05-0700",
	"Mon, 2 Jan 2006 15:04:05 -0700 MST",
	"mon,2 Jan 2006 15:04:05 MST",
	"Mon 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05MST",
	"Mon, 2 Jan 2006 15:04:05 UT",
	"Mon, 2 Jan 2006 15:04 -0700",
	"Mon, 2 Jan 2006, 15:04 -0700",
	"Mon, 2 Jan 2006 15:04 MST",
	"Mon, 2, Jan 2006 15:4",
	"Mon, 2 Jan 2006 15:4:5 -0700 GMT",
	"Mon, 2 Jan 2006 15:4:5 MST",
	"Mon, 2 Jan 2006 3:04:05 PM -0700",
	"Mon, 2 January 2006",
	"Mon, 2 January 2006 15:04:05 -0700",
	"Mon, 2 January 2006 15:04:05 MST",
	"Mon, 2 January 2006, 15:04:05 MST",
	"Mon, 2 January 2006, 15:04 -0700",
	"Mon, 2 January 2006 15:04 MST",
	"Monday, 02 January 2006 15:04:05",
	"Monday, 02 January 2006 15:04:05 -0700",
	"Monday, 02 January 2006 15:04:05 MST",
	"Monday, 2 Jan 2006 15:04:05 -0700",
	"Monday, 2 Jan 2006 15:04:05 MST",
	"Monday, 2 January 2006 15:04:05 -0700",
	"Monday, 2 January 2006 15:04:05 MST",
	"Monday, January 02, 2006",
	"Monday, January 2, 2006",
	"Monday, January 2, 2006 03:04 PM",
	"Monday, January 2, 2006 15:04:05 MST",
	"Mon Jan 02 2006 15:04:05 -0700",
	"Mon, Jan 02,2006 15:04:05 MST",
	"Mon Jan 02, 2006 3:04 pm",
	"Mon Jan 2 15:04:05 2006 MST",
	"Mon Jan 2 15:04 2006",
	"Mon, Jan 2 2006 15:04:05 -0700",
	"Mon, Jan 2 2006 15:04:05 -700",
	"Mon, Jan 2, 2006 15:04:05 MST",
	"Mon, Jan 2 2006 15:04 MST",
	"Mon, Jan 2, 2006 15:04 MST",
	"Mon, January 02, 2006 15:04:05 MST",
	"Mon, January 02, 2006, 15:04:05 MST",
	"Mon, January 2 2006 15:04:05 -0700",
	"Updated January 2, 2006",
	time.ANSIC,
	time.RFC1123,
	time.RFC1123Z,
	time.RFC3339,
	time.RFC822,
	time.RFC822Z,
	time.RFC850,
	time.RubyDate,
	time.UnixDate,
}
   493  
   494  func parseDate(ds ...string) (t time.Time, err error) {
   495  	for _, d := range ds {
   496  		d = strings.TrimSpace(d)
   497  		if d == "" {
   498  			continue
   499  		}
   500  		for _, f := range dateFormats {
   501  			if t, err = time.Parse(f, d); err == nil {
   502  				return
   503  			}
   504  		}
   505  	}
   506  	err = fmt.Errorf("could not parse dates: %v", strings.Join(ds, ", "))
   507  	return
   508  }