github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/pkg/importer/feed/feed.go (about) 1 /* 2 Copyright 2014 The Camlistore Authors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 // Package feed implements an importer for RSS, Atom, and RDF feeds. 18 package feed 19 20 import ( 21 "bytes" 22 "fmt" 23 "html/template" 24 "io" 25 "io/ioutil" 26 "log" 27 "net/http" 28 "net/url" 29 "strings" 30 "sync" 31 32 "camlistore.org/pkg/blob" 33 "camlistore.org/pkg/context" 34 "camlistore.org/pkg/httputil" 35 "camlistore.org/pkg/importer" 36 "camlistore.org/pkg/schema" 37 "camlistore.org/third_party/code.google.com/p/go.net/html" 38 "camlistore.org/third_party/code.google.com/p/go.net/html/atom" 39 ) 40 41 const ( 42 // Permanode attributes on account node: 43 acctAttrFeedURL = "feedURL" 44 ) 45 46 func init() { 47 importer.Register("feed", &imp{ 48 urlFileRef: make(map[string]blob.Ref), 49 }) 50 } 51 52 type imp struct { 53 mu sync.Mutex // guards following 54 urlFileRef map[string]blob.Ref // url to file schema blob 55 56 importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters 57 } 58 59 func (im *imp) NeedsAPIKey() bool { return false } 60 61 func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) { 62 if acctNode.Attr(acctAttrFeedURL) != "" { 63 return true, nil 64 } 65 return false, nil 66 } 67 68 func (im *imp) SummarizeAccount(acct *importer.Object) string { 69 ok, err := im.IsAccountReady(acct) 70 if err != nil { 71 return "Not configured; error = " + err.Error() 72 } 73 if !ok { 74 return "Not configured" 75 } 76 return fmt.Sprintf("feed %s", acct.Attr(acctAttrFeedURL)) 77 } 78 79 // A run is our state for a given run of the importer. 80 type run struct { 81 *importer.RunContext 82 im *imp 83 } 84 85 func (im *imp) Run(ctx *importer.RunContext) error { 86 r := &run{ 87 RunContext: ctx, 88 im: im, 89 } 90 91 if err := r.importFeed(); err != nil { 92 return err 93 } 94 return nil 95 } 96 97 func (r *run) importFeed() error { 98 feedURL, err := url.Parse(r.RunContext.AccountNode().Attr(acctAttrFeedURL)) 99 if err != nil { 100 return err 101 } 102 body, err := doGet(r.Context, feedURL.String()) 103 if err != nil { 104 return err 105 } 106 if auto, err := autoDiscover(body); err == nil { 107 if autoURL, err := url.Parse(auto); err == nil { 108 if autoURL.Scheme == "" { 109 autoURL.Scheme = feedURL.Scheme 110 } 111 if autoURL.Host == "" { 112 autoURL.Host = feedURL.Host 113 } 114 body, err = doGet(r.Context, autoURL.String()) 115 if err != nil { 116 return err 117 } 118 } 119 } 120 feed, err := parseFeed(body, feedURL.String()) 121 if err != nil { 122 return err 123 } 124 itemsNode, err := r.getTopLevelNode("items", "Items") 125 if err != nil { 126 return err 127 } 128 for _, item := range feed.Items { 129 if err := r.importItem(itemsNode, item); err != nil { 130 log.Printf("Feed importer: error importing item %s %v", item.ID, err) 131 continue 132 } 133 } 134 return nil 135 } 136 137 func (r *run) importItem(parent *importer.Object, item *item) error { 138 itemNode, err := parent.ChildPathObject(item.ID) 139 if err != nil { 140 return err 141 } 142 fileRef, err := schema.WriteFileFromReader(r.Host.Target(), "", bytes.NewBufferString(item.Content)) 143 if err != nil { 144 return err 145 } 146 if err := itemNode.SetAttrs( 147 "feedItemId", item.ID, 148 "camliNodeType", "feed:item", 149 "title", item.Title, 150 "link", item.Link, 151 "author", item.Author, 152 "camliContent", fileRef.String(), 153 "feedMediaContentURL", item.MediaContent, 154 ); err != nil { 155 return err 156 } 157 return nil 158 } 159 160 func (r *run) getTopLevelNode(path string, title string) (*importer.Object, error) { 161 childObject, err := r.RootNode().ChildPathObject(path) 162 if err != nil { 163 return nil, err 164 } 165 166 if err := childObject.SetAttr("title", title); err != nil { 167 return nil, err 168 } 169 return childObject, nil 170 } 171 172 // autodiscover takes an HTML document and returns the autodiscovered feed 173 // URL. Returns an error if there is no such URL. 174 func autoDiscover(body []byte) (feedURL string, err error) { 175 r := bytes.NewReader(body) 176 z := html.NewTokenizer(r) 177 for { 178 if z.Next() == html.ErrorToken { 179 break 180 } 181 t := z.Token() 182 switch t.DataAtom { 183 case atom.Link: 184 if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken { 185 attrs := make(map[string]string) 186 for _, a := range t.Attr { 187 attrs[a.Key] = a.Val 188 } 189 if attrs["rel"] == "alternate" && attrs["href"] != "" && 190 (attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") { 191 return attrs["href"], nil 192 } 193 } 194 } 195 } 196 return "", fmt.Errorf("No feed link found") 197 } 198 199 func doGet(ctx *context.Context, url string) ([]byte, error) { 200 req, err := http.NewRequest("GET", url, nil) 201 if err != nil { 202 return nil, err 203 } 204 res, err := ctx.HTTPClient().Do(req) 205 if err != nil { 206 log.Printf("Error fetching %s: %v", url, err) 207 return nil, err 208 } 209 defer httputil.CloseBody(res.Body) 210 if res.StatusCode != http.StatusOK { 211 return nil, fmt.Errorf("Get request on %s failed with: %s", url, res.Status) 212 } 213 return ioutil.ReadAll(io.LimitReader(res.Body, 8<<20)) 214 } 215 216 // urlFileRef slurps urlstr from the net, writes to a file and returns its 217 // fileref or "" on error 218 func (r *run) urlFileRef(urlstr string) string { 219 if urlstr == "" { 220 return "" 221 } 222 im := r.im 223 im.mu.Lock() 224 if br, ok := im.urlFileRef[urlstr]; ok { 225 im.mu.Unlock() 226 return br.String() 227 } 228 im.mu.Unlock() 229 230 res, err := r.Host.HTTPClient().Get(urlstr) 231 if err != nil { 232 log.Printf("couldn't get file: %v", err) 233 return "" 234 } 235 defer res.Body.Close() 236 237 filename := urlstr[strings.LastIndex(urlstr, "/")+1:] 238 fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body) 239 if err != nil { 240 log.Printf("couldn't write file: %v", err) 241 return "" 242 } 243 244 im.mu.Lock() 245 defer im.mu.Unlock() 246 im.urlFileRef[urlstr] = fileRef 247 return fileRef.String() 248 } 249 250 func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error { 251 return tmpl.ExecuteTemplate(w, "serveSetup", ctx) 252 } 253 254 var tmpl = template.Must(template.New("root").Parse(` 255 {{define "serveSetup"}} 256 <h1>Configuring Feed</h1> 257 <form method="get" action="{{.CallbackURL}}"> 258 <input type="hidden" name="acct" value="{{.AccountNode.PermanodeRef}}"> 259 <table border=0 cellpadding=3> 260 <tr><td align=right>Feed URL</td><td><input name="feedURL" size=50></td></tr> 261 <tr><td align=right></td><td><input type="submit" value="Add"></td></tr> 262 </table> 263 </form> 264 {{end}} 265 `)) 266 267 func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) { 268 u := r.FormValue("feedURL") 269 if u == "" { 270 http.Error(w, "Expected a feed URL", 400) 271 return 272 } 273 feed, err := url.Parse(u) 274 if err != nil { 275 httputil.ServeError(w, r, err) 276 return 277 } 278 if feed.Scheme == "" { 279 feed.Scheme = "http" 280 } 281 if err := ctx.AccountNode.SetAttrs( 282 acctAttrFeedURL, feed.String(), 283 ); err != nil { 284 httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err)) 285 return 286 } 287 http.Redirect(w, r, ctx.AccountURL(), http.StatusFound) 288 }