github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/pkg/importer/feed/feed.go (about)

     1  /*
     2  Copyright 2014 The Camlistore Authors
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8       http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package feed implements an importer for RSS, Atom, and RDF feeds.
    18  package feed
    19  
    20  import (
    21  	"bytes"
    22  	"fmt"
    23  	"html/template"
    24  	"io"
    25  	"io/ioutil"
    26  	"log"
    27  	"net/http"
    28  	"net/url"
    29  	"strings"
    30  	"sync"
    31  
    32  	"camlistore.org/pkg/blob"
    33  	"camlistore.org/pkg/context"
    34  	"camlistore.org/pkg/httputil"
    35  	"camlistore.org/pkg/importer"
    36  	"camlistore.org/pkg/schema"
    37  	"camlistore.org/third_party/code.google.com/p/go.net/html"
    38  	"camlistore.org/third_party/code.google.com/p/go.net/html/atom"
    39  )
    40  
    41  const (
    42  	// Permanode attributes on account node:
    43  	acctAttrFeedURL = "feedURL"
    44  )
    45  
    46  func init() {
    47  	importer.Register("feed", &imp{
    48  		urlFileRef: make(map[string]blob.Ref),
    49  	})
    50  }
    51  
// imp is the importer implementation registered under the name "feed".
// A single instance is created in init and shared by every run, which is
// why the urlFileRef cache is protected by mu.
type imp struct {
	mu         sync.Mutex          // guards following
	urlFileRef map[string]blob.Ref // url to file schema blob

	importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters
}
    58  
    59  func (im *imp) NeedsAPIKey() bool { return false }
    60  
    61  func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) {
    62  	if acctNode.Attr(acctAttrFeedURL) != "" {
    63  		return true, nil
    64  	}
    65  	return false, nil
    66  }
    67  
    68  func (im *imp) SummarizeAccount(acct *importer.Object) string {
    69  	ok, err := im.IsAccountReady(acct)
    70  	if err != nil {
    71  		return "Not configured; error = " + err.Error()
    72  	}
    73  	if !ok {
    74  		return "Not configured"
    75  	}
    76  	return fmt.Sprintf("feed %s", acct.Attr(acctAttrFeedURL))
    77  }
    78  
// A run is our state for a given run of the importer.
// It embeds the per-run importer.RunContext and keeps a pointer back to the
// shared imp so that run methods can reach the importer-wide cache.
type run struct {
	*importer.RunContext
	im *imp
}
    84  
    85  func (im *imp) Run(ctx *importer.RunContext) error {
    86  	r := &run{
    87  		RunContext: ctx,
    88  		im:         im,
    89  	}
    90  
    91  	if err := r.importFeed(); err != nil {
    92  		return err
    93  	}
    94  	return nil
    95  }
    96  
    97  func (r *run) importFeed() error {
    98  	feedURL, err := url.Parse(r.RunContext.AccountNode().Attr(acctAttrFeedURL))
    99  	if err != nil {
   100  		return err
   101  	}
   102  	body, err := doGet(r.Context, feedURL.String())
   103  	if err != nil {
   104  		return err
   105  	}
   106  	if auto, err := autoDiscover(body); err == nil {
   107  		if autoURL, err := url.Parse(auto); err == nil {
   108  			if autoURL.Scheme == "" {
   109  				autoURL.Scheme = feedURL.Scheme
   110  			}
   111  			if autoURL.Host == "" {
   112  				autoURL.Host = feedURL.Host
   113  			}
   114  			body, err = doGet(r.Context, autoURL.String())
   115  			if err != nil {
   116  				return err
   117  			}
   118  		}
   119  	}
   120  	feed, err := parseFeed(body, feedURL.String())
   121  	if err != nil {
   122  		return err
   123  	}
   124  	itemsNode, err := r.getTopLevelNode("items", "Items")
   125  	if err != nil {
   126  		return err
   127  	}
   128  	for _, item := range feed.Items {
   129  		if err := r.importItem(itemsNode, item); err != nil {
   130  			log.Printf("Feed importer: error importing item %s %v", item.ID, err)
   131  			continue
   132  		}
   133  	}
   134  	return nil
   135  }
   136  
   137  func (r *run) importItem(parent *importer.Object, item *item) error {
   138  	itemNode, err := parent.ChildPathObject(item.ID)
   139  	if err != nil {
   140  		return err
   141  	}
   142  	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), "", bytes.NewBufferString(item.Content))
   143  	if err != nil {
   144  		return err
   145  	}
   146  	if err := itemNode.SetAttrs(
   147  		"feedItemId", item.ID,
   148  		"camliNodeType", "feed:item",
   149  		"title", item.Title,
   150  		"link", item.Link,
   151  		"author", item.Author,
   152  		"camliContent", fileRef.String(),
   153  		"feedMediaContentURL", item.MediaContent,
   154  	); err != nil {
   155  		return err
   156  	}
   157  	return nil
   158  }
   159  
   160  func (r *run) getTopLevelNode(path string, title string) (*importer.Object, error) {
   161  	childObject, err := r.RootNode().ChildPathObject(path)
   162  	if err != nil {
   163  		return nil, err
   164  	}
   165  
   166  	if err := childObject.SetAttr("title", title); err != nil {
   167  		return nil, err
   168  	}
   169  	return childObject, nil
   170  }
   171  
   172  // autodiscover takes an HTML document and returns the autodiscovered feed
   173  // URL. Returns an error if there is no such URL.
   174  func autoDiscover(body []byte) (feedURL string, err error) {
   175  	r := bytes.NewReader(body)
   176  	z := html.NewTokenizer(r)
   177  	for {
   178  		if z.Next() == html.ErrorToken {
   179  			break
   180  		}
   181  		t := z.Token()
   182  		switch t.DataAtom {
   183  		case atom.Link:
   184  			if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
   185  				attrs := make(map[string]string)
   186  				for _, a := range t.Attr {
   187  					attrs[a.Key] = a.Val
   188  				}
   189  				if attrs["rel"] == "alternate" && attrs["href"] != "" &&
   190  					(attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
   191  					return attrs["href"], nil
   192  				}
   193  			}
   194  		}
   195  	}
   196  	return "", fmt.Errorf("No feed link found")
   197  }
   198  
   199  func doGet(ctx *context.Context, url string) ([]byte, error) {
   200  	req, err := http.NewRequest("GET", url, nil)
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  	res, err := ctx.HTTPClient().Do(req)
   205  	if err != nil {
   206  		log.Printf("Error fetching %s: %v", url, err)
   207  		return nil, err
   208  	}
   209  	defer httputil.CloseBody(res.Body)
   210  	if res.StatusCode != http.StatusOK {
   211  		return nil, fmt.Errorf("Get request on %s failed with: %s", url, res.Status)
   212  	}
   213  	return ioutil.ReadAll(io.LimitReader(res.Body, 8<<20))
   214  }
   215  
   216  // urlFileRef slurps urlstr from the net, writes to a file and returns its
   217  // fileref or "" on error
   218  func (r *run) urlFileRef(urlstr string) string {
   219  	if urlstr == "" {
   220  		return ""
   221  	}
   222  	im := r.im
   223  	im.mu.Lock()
   224  	if br, ok := im.urlFileRef[urlstr]; ok {
   225  		im.mu.Unlock()
   226  		return br.String()
   227  	}
   228  	im.mu.Unlock()
   229  
   230  	res, err := r.Host.HTTPClient().Get(urlstr)
   231  	if err != nil {
   232  		log.Printf("couldn't get file: %v", err)
   233  		return ""
   234  	}
   235  	defer res.Body.Close()
   236  
   237  	filename := urlstr[strings.LastIndex(urlstr, "/")+1:]
   238  	fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
   239  	if err != nil {
   240  		log.Printf("couldn't write file: %v", err)
   241  		return ""
   242  	}
   243  
   244  	im.mu.Lock()
   245  	defer im.mu.Unlock()
   246  	im.urlFileRef[urlstr] = fileRef
   247  	return fileRef.String()
   248  }
   249  
   250  func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
   251  	return tmpl.ExecuteTemplate(w, "serveSetup", ctx)
   252  }
   253  
// tmpl holds the HTML templates used by the setup pages. The "serveSetup"
// template renders a form that submits, via GET to the data's CallbackURL,
// the account permanode ref as "acct" and the entered feed address as
// "feedURL". Parsing happens at package initialization and panics on a
// malformed template.
var tmpl = template.Must(template.New("root").Parse(`
{{define "serveSetup"}}
<h1>Configuring Feed</h1>
<form method="get" action="{{.CallbackURL}}">
  <input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}">
  <table border=0 cellpadding=3>
  <tr><td align=right>Feed URL</td><td><input name="feedURL" size=50></td></tr>
  <tr><td align=right></td><td><input type="submit" value="Add"></td></tr>
  </table>
</form>
{{end}}
`))
   266  
   267  func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
   268  	u := r.FormValue("feedURL")
   269  	if u == "" {
   270  		http.Error(w, "Expected a feed URL", 400)
   271  		return
   272  	}
   273  	feed, err := url.Parse(u)
   274  	if err != nil {
   275  		httputil.ServeError(w, r, err)
   276  		return
   277  	}
   278  	if feed.Scheme == "" {
   279  		feed.Scheme = "http"
   280  	}
   281  	if err := ctx.AccountNode.SetAttrs(
   282  		acctAttrFeedURL, feed.String(),
   283  	); err != nil {
   284  		httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err))
   285  		return
   286  	}
   287  	http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
   288  }