# rss_agent.rb

require 'digest'
require 'rss'

require 'feed-normalizer'
  3. module Agents
  4. class RssAgent < Agent
  5. include WebRequestConcern
  6. cannot_receive_events!
  7. default_schedule "every_1d"
  8. description do
  9. <<-MD
  10. This Agent consumes RSS feeds and emits events when they change.
  11. (If you want to *output* an RSS feed, use the DataOutputAgent. Also, you can technically parse RSS and XML feeds
  12. with the WebsiteAgent as well. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).)
  13. Options:
  14. * `url` - The URL of the RSS feed.
  15. * `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s' `clean!` method to cleanup HTML in the feed. Set to `true` to use.
  16. * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
  17. MD
  18. end
  19. def default_options
  20. {
  21. 'expected_update_period_in_days' => "5",
  22. 'clean' => 'false',
  23. 'url' => "https://github.com/cantino/huginn/commits/master.atom"
  24. }
  25. end
  26. def working?
  27. event_created_within?((interpolated['expected_update_period_in_days'].presence || 10).to_i) && !recent_error_logs?
  28. end
  29. def validate_options
  30. errors.add(:base, "url is required") unless options['url'].present?
  31. unless options['expected_update_period_in_days'].present? && options['expected_update_period_in_days'].to_i > 0
  32. errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
  33. end
  34. validate_web_request_options!
  35. end
  36. def check
  37. response = faraday.get(interpolated['url'])
  38. if response.success?
  39. feed = FeedNormalizer::FeedNormalizer.parse(response.body)
  40. feed.clean! if interpolated['clean'] == 'true'
  41. created_event_count = 0
  42. feed.entries.each do |entry|
  43. if check_and_track(entry.id)
  44. created_event_count += 1
  45. create_event(:payload => {
  46. :id => entry.id,
  47. :date_published => entry.date_published,
  48. :last_updated => entry.last_updated,
  49. :urls => entry.urls,
  50. :description => entry.description,
  51. :content => entry.content,
  52. :title => entry.title,
  53. :authors => entry.authors,
  54. :categories => entry.categories
  55. })
  56. end
  57. end
  58. log "Fetched #{interpolated['url']} and created #{created_event_count} event(s)."
  59. else
  60. error "Failed to fetch #{interpolated['url']}: #{response.inspect}"
  61. end
  62. end
  63. protected
  64. def check_and_track(entry_id)
  65. memory['seen_ids'] ||= []
  66. if memory['seen_ids'].include?(entry_id)
  67. false
  68. else
  69. memory['seen_ids'].unshift entry_id
  70. memory['seen_ids'].pop if memory['seen_ids'].length > 500
  71. true
  72. end
  73. end
  74. end
  75. end