# rss_agent.rb
module Agents
  # Polls RSS/Atom feeds with Feedjira and emits one event per new entry.
  class RssAgent < Agent
    include WebRequestConcern

    # This agent only polls on a schedule; it cannot consume other agents' events.
    cannot_receive_events!
    can_dry_run!
    default_schedule "every_1d"

    # Degrade gracefully (report the missing gem) when Feedjira is not installed.
    gem_dependency_check { defined?(Feedjira::Feed) }

    # Default events_order: sort by publish time, then by last-update time.
    DEFAULT_EVENTS_ORDER = [['{{date_published}}', 'time'], ['{{last_updated}}', 'time']]

    description do
      <<-MD
        The RSS Agent consumes RSS feeds and emits events when they change.

        This agent, using [Feedjira](https://github.com/feedjira/feedjira) as a base, can parse various types of RSS and Atom feeds and has some special handlers for FeedBurner, iTunes RSS, and so on. However, supported fields are limited by its general and abstract nature. For complex feeds with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).

        If you want to *output* an RSS feed, use the DataOutputAgent.

        Options:

          * `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates).
          * `include_feed_info` - Set to `true` to include feed information in each event.
          * `clean` - Set to `true` to sanitize `description` and `content` as HTML fragments, removing unknown/unsafe elements and attributes.
          * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
          * `headers` - When present, it should be a hash of headers to send with the request.
          * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
          * `disable_ssl_verification` - Set to `true` to disable ssl verification.
          * `disable_url_encoding` - Set to `true` to disable url encoding.
          * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
          * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
          * `max_events_per_run` - Limit number of events created (items parsed) per run for feed.

        # Ordering Events

        #{description_events_order}

        In this Agent, the default value for `events_order` is `#{DEFAULT_EVENTS_ORDER.to_json}`.
      MD
    end
  31. def default_options
  32. {
  33. 'expected_update_period_in_days' => "5",
  34. 'clean' => 'false',
  35. 'url' => "https://github.com/cantino/huginn/commits/master.atom"
  36. }
  37. end
# Documents the payload shape of emitted events (rendered in the UI).
event_description <<-MD
  Events look like:

      {
        "feed": {
          "id": "...",
          "type": "atom",
          "generator": "...",
          "url": "http://example.com/",
          "links": [
            { "href": "http://example.com/", "rel": "alternate", "type": "text/html" },
            { "href": "http://example.com/index.atom", "rel": "self", "type": "application/atom+xml" }
          ],
          "title": "Some site title",
          "description": "Some site description",
          "copyright": "...",
          "icon": "http://example.com/icon.png",
          "authors": [ "..." ],

          "date_published": "2014-09-11T01:30:00-07:00",
          "last_updated": "2014-09-11T01:30:00-07:00"
        },
        "id": "829f845279611d7925146725317b868d",
        "url": "http://example.com/...",
        "urls": [ "http://example.com/..." ],
        "links": [
          { "href": "http://example.com/...", "rel": "alternate" },
        ],
        "title": "Some title",
        "description": "Some description",
        "content": "Some content",
        "authors": [ "Some Author <email@address>" ],
        "categories": [ "..." ],
        "enclosure": {
          "url" => "http://example.com/file.mp3", "type" => "audio/mpeg", "length" => "123456789"
        },
        "date_published": "2014-09-11T01:30:00-0700",
        "last_updated": "2014-09-11T01:30:00-0700"
      }

  Some notes:

  - The `feed` key is present only if `include_feed_info` is set to true.

  - Each element in `authors` is a string normalized in the format "*name* <*email*> (*url*)", where each space-separated part is optional.

  - Timestamps are converted to the ISO 8601 format.
MD
  80. def working?
  81. event_created_within?((interpolated['expected_update_period_in_days'].presence || 10).to_i) && !recent_error_logs?
  82. end
  83. def validate_options
  84. errors.add(:base, "url is required") unless options['url'].present?
  85. unless options['expected_update_period_in_days'].present? && options['expected_update_period_in_days'].to_i > 0
  86. errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
  87. end
  88. validate_web_request_options!
  89. validate_events_order
  90. end
  91. def events_order(key = SortableEvents::EVENTS_ORDER_KEY)
  92. if key == SortableEvents::EVENTS_ORDER_KEY
  93. super.presence || DEFAULT_EVENTS_ORDER
  94. else
  95. raise ArgumentError, "unsupported key: #{key}"
  96. end
  97. end
  98. def check
  99. check_urls(Array(interpolated['url']))
  100. end
  101. protected
  102. def check_urls(urls)
  103. new_events = []
  104. max_events = (interpolated['max_events_per_run'].presence || 0).to_i
  105. urls.each do |url|
  106. begin
  107. response = faraday.get(url)
  108. if response.success?
  109. feed = Feedjira::Feed.parse(response.body)
  110. new_events.concat feed_to_events(feed)
  111. else
  112. error "Failed to fetch #{url}: #{response.inspect}"
  113. end
  114. rescue => e
  115. error "Failed to fetch #{url} with message '#{e.message}': #{e.backtrace}"
  116. end
  117. end
  118. created_event_count = 0
  119. sort_events(new_events).each.with_index do |event, index|
  120. entry_id = event.payload[:id]
  121. if check_and_track(entry_id)
  122. unless max_events && max_events > 0 && index >= max_events
  123. created_event_count += 1
  124. create_event(event)
  125. end
  126. end
  127. end
  128. log "Fetched #{urls.to_sentence} and created #{created_event_count} event(s)."
  129. end
  130. def check_and_track(entry_id)
  131. memory['seen_ids'] ||= []
  132. if memory['seen_ids'].include?(entry_id)
  133. false
  134. else
  135. memory['seen_ids'].unshift entry_id
  136. memory['seen_ids'].pop if memory['seen_ids'].length > 500
  137. true
  138. end
  139. end
  140. unless dependencies_missing?
  141. require 'feedjira_extension'
  142. end
  143. def feed_data(feed)
  144. type =
  145. case feed.class.name
  146. when /Atom/
  147. 'atom'
  148. else
  149. 'rss'
  150. end
  151. {
  152. id: feed.feed_id,
  153. type: type,
  154. url: feed.url,
  155. links: feed.links,
  156. title: feed.title,
  157. description: feed.description,
  158. copyright: feed.copyright,
  159. generator: feed.generator,
  160. icon: feed.icon,
  161. authors: feed.authors,
  162. date_published: feed.date_published,
  163. last_updated: feed.last_updated,
  164. }
  165. end
  166. def entry_data(entry)
  167. {
  168. id: entry.id,
  169. url: entry.url,
  170. urls: entry.links.map(&:href),
  171. links: entry.links,
  172. title: entry.title,
  173. description: clean_fragment(entry.summary),
  174. content: clean_fragment(entry.content || entry.summary),
  175. image: entry.try(:image),
  176. enclosure: entry.enclosure,
  177. authors: entry.authors,
  178. categories: Array(entry.try(:categories)),
  179. date_published: entry.date_published,
  180. last_updated: entry.last_updated,
  181. }
  182. end
  183. def feed_to_events(feed)
  184. payload_base = {}
  185. if boolify(interpolated['include_feed_info'])
  186. payload_base[:feed] = feed_data(feed)
  187. end
  188. feed.entries.map { |entry|
  189. Event.new(payload: payload_base.merge(entry_data(entry)))
  190. }
  191. end
  192. def clean_fragment(fragment)
  193. if boolify(interpolated['clean']) && fragment.present?
  194. Loofah.scrub_fragment(fragment, :prune).to_s
  195. else
  196. fragment
  197. end
  198. end
  199. end
  200. end