# rss_agent.rb
  require 'digest'
  require 'rss'
  require 'feed-normalizer'
  module Agents
    # Agent that fetches one or more RSS/Atom feeds over HTTP, parses them with
    # feed-normalizer and emits one event per feed entry whose id has not been
    # seen before. Seen ids are remembered in `memory['seen_ids']`.
    class RssAgent < Agent
      include WebRequestConcern

      cannot_receive_events!
      can_dry_run!
      default_schedule "every_1d"

      # Ordering used for emitted events when no `events_order` option is set.
      DEFAULT_EVENTS_ORDER = [['{{date_published}}', 'time'], ['{{last_updated}}', 'time']].freeze

      # Maximum number of entry ids retained in `memory['seen_ids']`.
      MAX_SEEN_IDS = 500

      description do
        <<-MD
          The RSS Agent consumes RSS feeds and emits events when they change.

          This Agent is fairly simple, using [feed-normalizer](https://github.com/aasmith/feed-normalizer) as a base. For complex feeds
          with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).

          If you want to *output* an RSS feed, use the DataOutputAgent.

          Options:

          * `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates).
          * `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s' `clean!` method to cleanup HTML in the feed. Set to `true` to use.
          * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
          * `headers` - When present, it should be a hash of headers to send with the request.
          * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
          * `disable_ssl_verification` - Set to `true` to disable ssl verification.
          * `disable_url_encoding` - Set to `true` to disable url encoding.
          * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
          * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
          * `max_events_per_run` - Limit number of events created (items parsed) per run for feed.

          # Ordering Events

          #{description_events_order}

          In this Agent, the default value for `events_order` is `#{DEFAULT_EVENTS_ORDER.to_json}`.
        MD
      end

      # Options a freshly created agent starts with.
      #
      # @return [Hash{String => String}]
      def default_options
        {
          'expected_update_period_in_days' => "5",
          'clean' => 'false',
          'url' => "https://github.com/cantino/huginn/commits/master.atom"
        }
      end

      event_description <<-MD
        Events look like:

            {
              "id": "829f845279611d7925146725317b868d",
              "date_published": "2014-09-11 01:30:00 -0700",
              "last_updated": "Thu, 11 Sep 2014 01:30:00 -0700",
              "url": "http://example.com/...",
              "urls": [ "http://example.com/..." ],
              "description": "Some description",
              "content": "Some content",
              "title": "Some title",
              "authors": [ ... ],
              "categories": [ ... ]
            }
      MD

      # The agent counts as working when an event was created within the
      # expected update period (10 days when the option is blank) and there
      # are no recent error logs.
      #
      # @return [Boolean]
      def working?
        expected_days = (interpolated['expected_update_period_in_days'].presence || 10).to_i
        event_created_within?(expected_days) && !recent_error_logs?
      end

      # Adds validation errors for a missing `url` or a missing/non-positive
      # `expected_update_period_in_days`, then runs the web-request and
      # events-order validations.
      def validate_options
        errors.add(:base, "url is required") unless options['url'].present?

        period = options['expected_update_period_in_days']
        unless period.present? && period.to_i > 0
          errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
        end

        validate_web_request_options!
        validate_events_order
      end

      # The inherited events order, or DEFAULT_EVENTS_ORDER when that is blank.
      def events_order
        super.presence || DEFAULT_EVENTS_ORDER
      end

      # Checks every configured feed; `url` may be a single URL or an array.
      def check
        Array(interpolated['url']).each do |url|
          check_url(url)
        end
      end

      protected

      # Fetches one feed, parses it and creates an event for every entry whose
      # id has not been tracked yet. Failures are reported through `error`
      # rather than raised, so one broken feed does not stop the others.
      #
      # @param url [String] feed URL to fetch
      def check_url(url)
        response = faraday.get(url)
        unless response.success?
          error "Failed to fetch #{url}: #{response.inspect}"
          return
        end

        feed = FeedNormalizer::FeedNormalizer.parse(response.body, loose: true)
        # Guard against a nil result so an unparseable body is reported as
        # such instead of as a NoMethodError on nil.
        if feed.nil?
          error "Failed to parse #{url}: the response body is not a recognizable RSS or Atom feed"
          return
        end
        feed.clean! if boolify(interpolated['clean'])

        # A blank, zero or non-numeric option means "no limit". The limit is
        # applied to the sorted entries before de-duplication, i.e. it caps
        # the number of items considered per run.
        max_events = (interpolated['max_events_per_run'].presence || 0).to_i
        events = sort_events(feed_to_events(feed))
        events = events.first(max_events) if max_events > 0

        created_event_count = 0
        events.each do |event|
          next unless check_and_track(event.payload[:id])

          created_event_count += 1
          create_event(event)
        end

        log "Fetched #{url} and created #{created_event_count} event(s)."
      rescue => e
        error "Failed to fetch #{url} with message '#{e.message}': #{e.backtrace}"
      end

      # Identifier used for de-duplication: the entry's own id when present,
      # otherwise an MD5 digest of its content. When the content is nil the
      # digest falls back to the description, url, title and finally the
      # empty string, so an entry without content no longer raises.
      #
      # @param entry [#id, #content, #description, #url, #title]
      # @return [String]
      def get_entry_id(entry)
        entry.id.presence ||
          Digest::MD5.hexdigest((entry.content || entry.description || entry.url || entry.title).to_s)
      end

      # Records an entry id as seen.
      #
      # @param entry_id [String]
      # @return [Boolean] true when the id was new (and is now tracked),
      #   false when it had already been seen
      def check_and_track(entry_id)
        seen_ids = (memory['seen_ids'] ||= [])
        return false if seen_ids.include?(entry_id)

        # Newest ids go to the front; the oldest are dropped from the back.
        seen_ids.unshift(entry_id)
        overflow = seen_ids.length - MAX_SEEN_IDS
        seen_ids.pop(overflow) if overflow > 0
        true
      end

      # Wraps every feed entry in a new Event carrying the entry's fields.
      #
      # @param feed [#entries] parsed feed
      # @return [Array<Event>]
      def feed_to_events(feed)
        feed.entries.map do |entry|
          Event.new(payload: {
            id: get_entry_id(entry),
            date_published: entry.date_published,
            last_updated: entry.last_updated,
            url: entry.url,
            urls: entry.urls,
            description: entry.description,
            content: entry.content,
            title: entry.title,
            authors: entry.authors,
            categories: entry.categories
          })
        end
      end
    end
  end