github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/pkg/importer/feed/parse.go (about) 1 /* 2 Copyright 2014 The Camlistore Authors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package feed 18 19 import ( 20 "bytes" 21 "encoding/xml" 22 "fmt" 23 "html" 24 "log" 25 "net/url" 26 "strings" 27 "time" 28 29 "camlistore.org/pkg/importer/feed/atom" 30 "camlistore.org/pkg/importer/feed/rdf" 31 "camlistore.org/pkg/importer/feed/rss" 32 "camlistore.org/third_party/code.google.com/p/go-charset/charset" 33 _ "camlistore.org/third_party/code.google.com/p/go-charset/data" 34 ) 35 36 type feed struct { 37 Title string 38 Updated time.Time 39 Link string 40 Items []*item 41 } 42 43 type item struct { 44 ID string 45 Title string 46 Link string 47 Created time.Time 48 Published time.Time 49 Updated time.Time 50 Author string 51 Content string 52 MediaContent string 53 } 54 55 func parseFeed(body []byte, feedURL string) (*feed, error) { 56 var f *feed 57 var atomerr, rsserr, rdferr error 58 f, atomerr = parseAtom(body) 59 if f == nil { 60 f, rsserr = parseRSS(body) 61 } 62 if f == nil { 63 f, rdferr = parseRDF(body) 64 } 65 if f == nil { 66 log.Printf("atom parse error: %s", atomerr.Error()) 67 log.Printf("xml parse error: %s", rsserr.Error()) 68 log.Printf("rdf parse error: %s", rdferr.Error()) 69 return nil, fmt.Errorf("Could not parse feed data") 70 } 71 return f, nil 72 } 73 74 func parseAtom(body []byte) (*feed, error) { 75 var f feed 76 var a atom.Feed 77 d := xml.NewDecoder(bytes.NewReader(body)) 78 d.CharsetReader = charset.NewReader 79 if err := d.Decode(&a); err != nil { 80 return nil, err 81 } 82 f.Title = a.Title 83 if t, err := parseDate(string(a.Updated)); err == nil { 84 f.Updated = t 85 } 86 fb, err := url.Parse(a.XMLBase) 87 if err != nil { 88 fb, _ = url.Parse("") 89 } 90 if len(a.Link) > 0 { 91 f.Link = findBestAtomLink(a.Link) 92 if l, err := fb.Parse(f.Link); err == nil { 93 f.Link = l.String() 94 } 95 } 96 97 for _, i := range a.Entry { 98 eb, err := fb.Parse(i.XMLBase) 99 if err != nil { 100 eb = fb 101 } 102 st := item{ 103 ID: i.ID, 104 Title: atomTitle(i.Title), 105 } 106 if t, err := parseDate(string(i.Updated)); err == nil { 107 st.Updated = t 108 } 109 if t, err := parseDate(string(i.Published)); err == nil { 110 st.Published = t 111 } 112 if len(i.Link) > 0 { 113 st.Link = findBestAtomLink(i.Link) 114 if l, err := eb.Parse(st.Link); err == nil { 115 st.Link = l.String() 116 } 117 } 118 if i.Author != nil { 119 st.Author = i.Author.Name 120 } 121 if i.Content != nil { 122 if len(strings.TrimSpace(i.Content.Body)) != 0 { 123 st.Content = i.Content.Body 124 } else if len(i.Content.InnerXML) != 0 { 125 st.Content = i.Content.InnerXML 126 } 127 } else if i.Summary != nil { 128 st.Content = i.Summary.Body 129 } 130 f.Items = append(f.Items, &st) 131 } 132 return &f, nil 133 } 134 135 func parseRSS(body []byte) (*feed, error) { 136 var f feed 137 var r rss.RSS 138 d := xml.NewDecoder(bytes.NewReader(body)) 139 d.CharsetReader = charset.NewReader 140 d.DefaultSpace = "DefaultSpace" 141 if err := d.Decode(&r); err != nil { 142 return nil, err 143 } 144 f.Title = r.Title 145 if t, err := parseDate(r.LastBuildDate, r.PubDate); err == nil { 146 f.Updated = t 147 } 148 f.Link = r.BaseLink() 149 150 for _, i := range r.Items { 151 st := item{ 152 Link: i.Link, 153 Author: i.Author, 154 } 155 if i.Content != "" { 156 st.Content = i.Content 157 } else if i.Description != "" { 158 st.Content = i.Description 159 } 160 if i.Title != "" { 161 st.Title = i.Title 162 } else if i.Description != "" { 163 st.Title = i.Description 164 } 165 if st.Content == st.Title { 166 st.Title = "" 167 } 168 st.Title = textTitle(st.Title) 169 if i.Guid != nil { 170 st.ID = i.Guid.Guid 171 } 172 if i.Enclosure != nil && strings.HasPrefix(i.Enclosure.Type, "audio/") { 173 st.MediaContent = i.Enclosure.Url 174 } else if i.Media != nil && strings.HasPrefix(i.Media.Type, "audio/") { 175 st.MediaContent = i.Media.URL 176 } 177 if t, err := parseDate(i.PubDate, i.Date, i.Published); err == nil { 178 st.Published = t 179 st.Updated = t 180 } 181 f.Items = append(f.Items, &st) 182 } 183 184 return &f, nil 185 } 186 187 func parseRDF(body []byte) (*feed, error) { 188 var f feed 189 var rd rdf.RDF 190 d := xml.NewDecoder(bytes.NewReader(body)) 191 d.CharsetReader = charset.NewReader 192 if err := d.Decode(&rd); err != nil { 193 return nil, err 194 } 195 if rd.Channel != nil { 196 f.Title = rd.Channel.Title 197 f.Link = rd.Channel.Link 198 if t, err := parseDate(rd.Channel.Date); err == nil { 199 f.Updated = t 200 } 201 } 202 203 for _, i := range rd.Item { 204 st := item{ 205 ID: i.About, 206 Title: textTitle(i.Title), 207 Link: i.Link, 208 Author: i.Creator, 209 } 210 if len(i.Description) > 0 { 211 st.Content = html.UnescapeString(i.Description) 212 } else if len(i.Content) > 0 { 213 st.Content = html.UnescapeString(i.Content) 214 } 215 if t, err := parseDate(i.Date); err == nil { 216 st.Published = t 217 st.Updated = t 218 } 219 f.Items = append(f.Items, &st) 220 } 221 222 return &f, nil 223 } 224 225 func textTitle(t string) string { 226 return html.UnescapeString(t) 227 } 228 229 func atomTitle(t *atom.Text) string { 230 if t == nil { 231 return "" 232 } 233 if t.Type == "html" { 234 // see: https://github.com/mjibson/goread/blob/59aec794f3ef87b36c1bac029438c33a6aa6d8d3/utils.go#L533 235 //return html.UnescapeString(sanitizer.StripTags(t.Body)) 236 } 237 return textTitle(t.Body) 238 } 239 240 func findBestAtomLink(links []atom.Link) string { 241 getScore := func(l atom.Link) int { 242 switch { 243 case l.Rel == "hub": 244 return 0 245 case l.Rel == "alternate" && l.Type == "text/html": 246 return 5 247 case l.Type == "text/html": 248 return 4 249 case l.Rel == "self": 250 return 2 251 case l.Rel == "": 252 return 3 253 default: 254 return 1 255 } 256 } 257 258 var bestlink string 259 bestscore := -1 260 for _, l := range links { 261 score := getScore(l) 262 if score > bestscore { 263 bestlink = l.Href 264 bestscore = score 265 } 266 } 267 268 return bestlink 269 } 270 271 func parseFix(f *feed, feedURL string) (*feed, error) { 272 f.Link = strings.TrimSpace(f.Link) 273 f.Title = html.UnescapeString(strings.TrimSpace(f.Title)) 274 275 if u, err := url.Parse(feedURL); err == nil { 276 if ul, err := u.Parse(f.Link); err == nil { 277 f.Link = ul.String() 278 } 279 } 280 base, err := url.Parse(f.Link) 281 if err != nil { 282 log.Printf("unable to parse link: %v", f.Link) 283 } 284 285 var nss []*item 286 now := time.Now() 287 for _, s := range f.Items { 288 s.Created = now 289 s.Link = strings.TrimSpace(s.Link) 290 if s.ID == "" { 291 if s.Link != "" { 292 s.ID = s.Link 293 } else if s.Title != "" { 294 s.ID = s.Title 295 } else { 296 log.Printf("item has no id: %v", s) 297 continue 298 } 299 } 300 // if a story doesn't have a link, see if its id is a URL 301 if s.Link == "" { 302 if u, err := url.Parse(s.ID); err == nil { 303 s.Link = u.String() 304 } 305 } 306 if base != nil && s.Link != "" { 307 link, err := base.Parse(s.Link) 308 if err == nil { 309 s.Link = link.String() 310 } else { 311 log.Printf("unable to resolve link: %v", s.Link) 312 } 313 } 314 nss = append(nss, s) 315 } 316 f.Items = nss 317 318 return f, nil 319 } 320 321 var dateFormats = []string{ 322 "01-02-2006", 323 "01/02/2006", 324 "01/02/2006 - 15:04", 325 "01/02/2006 15:04:05 MST", 326 "01/02/2006 3:04 PM", 327 "02-01-2006", 328 "02/01/2006", 329 "02.01.2006 -0700", 330 "02/01/2006 - 15:04", 331 "02.01.2006 15:04", 332 "02/01/2006 15:04:05", 333 "02.01.2006 15:04:05", 334 "02-01-2006 15:04:05 MST", 335 "02/01/2006 15:04 MST", 336 "02 Jan 2006", 337 "02 Jan 2006 15:04:05", 338 "02 Jan 2006 15:04:05 -0700", 339 "02 Jan 2006 15:04:05 MST", 340 "02 Jan 2006 15:04:05 UT", 341 "02 Jan 2006 15:04 MST", 342 "02 Monday, Jan 2006 15:04", 343 "06-1-2 15:04", 344 "06/1/2 15:04", 345 "1/2/2006", 346 "1/2/2006 15:04:05 MST", 347 "1/2/2006 3:04:05 PM", 348 "1/2/2006 3:04:05 PM MST", 349 "15:04 02.01.2006 -0700", 350 "2006-01-02", 351 "2006/01/02", 352 "2006-01-02 00:00:00.0 15:04:05.0 -0700", 353 "2006-01-02 15:04", 354 "2006-01-02 15:04:05 -0700", 355 "2006-01-02 15:04:05-07:00", 356 "2006-01-02 15:04:05-0700", 357 "2006-01-02 15:04:05 MST", 358 "2006-01-02 15:04:05Z", 359 "2006-01-02 at 15:04:05", 360 "2006-01-02T15:04:05", 361 "2006-01-02T15:04:05:00", 362 "2006-01-02T15:04:05 -0700", 363 "2006-01-02T15:04:05-07:00", 364 "2006-01-02T15:04:05-0700", 365 "2006-01-02T15:04:05:-0700", 366 "2006-01-02T15:04:05-07:00:00", 367 "2006-01-02T15:04:05Z", 368 "2006-01-02T15:04-07:00", 369 "2006-01-02T15:04Z", 370 "2006-1-02T15:04:05Z", 371 "2006-1-2", 372 "2006-1-2 15:04:05", 373 "2006-1-2T15:04:05Z", 374 "2006 January 02", 375 "2-1-2006", 376 "2/1/2006", 377 "2.1.2006 15:04:05", 378 "2 Jan 2006", 379 "2 Jan 2006 15:04:05 -0700", 380 "2 Jan 2006 15:04:05 MST", 381 "2 Jan 2006 15:04:05 Z", 382 "2 January 2006", 383 "2 January 2006 15:04:05 -0700", 384 "2 January 2006 15:04:05 MST", 385 "6-1-2 15:04", 386 "6/1/2 15:04", 387 "Jan 02, 2006", 388 "Jan 02 2006 03:04:05PM", 389 "Jan 2, 2006", 390 "Jan 2, 2006 15:04:05 MST", 391 "Jan 2, 2006 3:04:05 PM", 392 "Jan 2, 2006 3:04:05 PM MST", 393 "January 02, 2006", 394 "January 02, 2006 03:04 PM", 395 "January 02, 2006 15:04", 396 "January 02, 2006 15:04:05 MST", 397 "January 2, 2006", 398 "January 2, 2006 03:04 PM", 399 "January 2, 2006 15:04:05", 400 "January 2, 2006 15:04:05 MST", 401 "January 2, 2006, 3:04 p.m.", 402 "January 2, 2006 3:04 PM", 403 "Mon, 02 Jan 06 15:04:05 MST", 404 "Mon, 02 Jan 2006", 405 "Mon, 02 Jan 2006 15:04:05", 406 "Mon, 02 Jan 2006 15:04:05 00", 407 "Mon, 02 Jan 2006 15:04:05 -07", 408 "Mon 02 Jan 2006 15:04:05 -0700", 409 "Mon, 02 Jan 2006 15:04:05 --0700", 410 "Mon, 02 Jan 2006 15:04:05 -07:00", 411 "Mon, 02 Jan 2006 15:04:05 -0700", 412 "Mon,02 Jan 2006 15:04:05 -0700", 413 "Mon, 02 Jan 2006 15:04:05 GMT-0700", 414 "Mon , 02 Jan 2006 15:04:05 MST", 415 "Mon, 02 Jan 2006 15:04:05 MST", 416 "Mon, 02 Jan 2006 15:04:05MST", 417 "Mon, 02 Jan 2006, 15:04:05 MST", 418 "Mon, 02 Jan 2006 15:04:05 MST -0700", 419 "Mon, 02 Jan 2006 15:04:05 MST-07:00", 420 "Mon, 02 Jan 2006 15:04:05 UT", 421 "Mon, 02 Jan 2006 15:04:05 Z", 422 "Mon, 02 Jan 2006 15:04 -0700", 423 "Mon, 02 Jan 2006 15:04 MST", 424 "Mon,02 Jan 2006 15:04 MST", 425 "Mon, 02 Jan 2006 15 -0700", 426 "Mon, 02 Jan 2006 3:04:05 PM MST", 427 "Mon, 02 January 2006", 428 "Mon,02 January 2006 14:04:05 MST", 429 "Mon, 2006-01-02 15:04", 430 "Mon, 2 Jan 06 15:04:05 -0700", 431 "Mon, 2 Jan 06 15:04:05 MST", 432 "Mon, 2 Jan 15:04:05 MST", 433 "Mon, 2 Jan 2006", 434 "Mon,2 Jan 2006", 435 "Mon, 2 Jan 2006 15:04", 436 "Mon, 2 Jan 2006 15:04:05", 437 "Mon, 2 Jan 2006 15:04:05 -0700", 438 "Mon, 2 Jan 2006 15:04:05-0700", 439 "Mon, 2 Jan 2006 15:04:05 -0700 MST", 440 "mon,2 Jan 2006 15:04:05 MST", 441 "Mon 2 Jan 2006 15:04:05 MST", 442 "Mon, 2 Jan 2006 15:04:05 MST", 443 "Mon, 2 Jan 2006 15:04:05MST", 444 "Mon, 2 Jan 2006 15:04:05 UT", 445 "Mon, 2 Jan 2006 15:04 -0700", 446 "Mon, 2 Jan 2006, 15:04 -0700", 447 "Mon, 2 Jan 2006 15:04 MST", 448 "Mon, 2, Jan 2006 15:4", 449 "Mon, 2 Jan 2006 15:4:5 -0700 GMT", 450 "Mon, 2 Jan 2006 15:4:5 MST", 451 "Mon, 2 Jan 2006 3:04:05 PM -0700", 452 "Mon, 2 January 2006", 453 "Mon, 2 January 2006 15:04:05 -0700", 454 "Mon, 2 January 2006 15:04:05 MST", 455 "Mon, 2 January 2006, 15:04:05 MST", 456 "Mon, 2 January 2006, 15:04 -0700", 457 "Mon, 2 January 2006 15:04 MST", 458 "Monday, 02 January 2006 15:04:05", 459 "Monday, 02 January 2006 15:04:05 -0700", 460 "Monday, 02 January 2006 15:04:05 MST", 461 "Monday, 2 Jan 2006 15:04:05 -0700", 462 "Monday, 2 Jan 2006 15:04:05 MST", 463 "Monday, 2 January 2006 15:04:05 -0700", 464 "Monday, 2 January 2006 15:04:05 MST", 465 "Monday, January 02, 2006", 466 "Monday, January 2, 2006", 467 "Monday, January 2, 2006 03:04 PM", 468 "Monday, January 2, 2006 15:04:05 MST", 469 "Mon Jan 02 2006 15:04:05 -0700", 470 "Mon, Jan 02,2006 15:04:05 MST", 471 "Mon Jan 02, 2006 3:04 pm", 472 "Mon Jan 2 15:04:05 2006 MST", 473 "Mon Jan 2 15:04 2006", 474 "Mon, Jan 2 2006 15:04:05 -0700", 475 "Mon, Jan 2 2006 15:04:05 -700", 476 "Mon, Jan 2, 2006 15:04:05 MST", 477 "Mon, Jan 2 2006 15:04 MST", 478 "Mon, Jan 2, 2006 15:04 MST", 479 "Mon, January 02, 2006 15:04:05 MST", 480 "Mon, January 02, 2006, 15:04:05 MST", 481 "Mon, January 2 2006 15:04:05 -0700", 482 "Updated January 2, 2006", 483 time.ANSIC, 484 time.RFC1123, 485 time.RFC1123Z, 486 time.RFC3339, 487 time.RFC822, 488 time.RFC822Z, 489 time.RFC850, 490 time.RubyDate, 491 time.UnixDate, 492 } 493 494 func parseDate(ds ...string) (t time.Time, err error) { 495 for _, d := range ds { 496 d = strings.TrimSpace(d) 497 if d == "" { 498 continue 499 } 500 for _, f := range dateFormats { 501 if t, err = time.Parse(f, d); err == nil { 502 return 503 } 504 } 505 } 506 err = fmt.Errorf("could not parse dates: %v", strings.Join(ds, ", ")) 507 return 508 }