# rss_agent.rb

module Agents
  class RssAgent < Agent
    include WebRequestConcern

    cannot_receive_events!
    can_dry_run!

    default_schedule "every_1d"

    gem_dependency_check { defined?(Feedjira::Feed) }
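
    # By default, order events by publish date and then by last-updated
    # date, both parsed as times (see events_order below).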
    DEFAULT_EVENTS_ORDER = [['{{date_published}}', 'time'], ['{{last_updated}}', 'time']]

    description do
      <<-MD
        The RSS Agent consumes RSS feeds and emits events when they change.

        This agent, using [Feedjira](https://github.com/feedjira/feedjira) as a base, can parse various types of RSS and Atom feeds and has some special handlers for FeedBurner, iTunes RSS, and so on. However, supported fields are limited by its general and abstract nature. For complex feeds with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).

        If you want to *output* an RSS feed, use the DataOutputAgent.

        Options:

        * `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds are considered duplicates).
        * `include_feed_info` - Set to `true` to include feed information in each event.
        * `clean` - Set to `true` to sanitize `description` and `content` as HTML fragments, removing unknown/unsafe elements and attributes.
        * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
        * `headers` - When present, it should be a hash of headers to send with the request.
        * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
        * `disable_ssl_verification` - Set to `true` to disable SSL verification.
        * `disable_url_encoding` - Set to `true` to disable URL encoding.
        * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid, or wrong charset in the Content-Type header. Note that text content without a charset is assumed to be encoded in UTF-8 (not ISO-8859-1).
        * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
        * `max_events_per_run` - A limit on the number of events created (items parsed) per run of this Agent.
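
        For example, a minimal configuration might look like this (the feed URL and option values below are illustrative, not defaults):

            {
              "url": "https://example.com/feed.atom",
              "expected_update_period_in_days": "5",
              "clean": "true",
              "max_events_per_run": "25"
            }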

        # Ordering Events

        #{description_events_order}

        In this Agent, the default value for `events_order` is `#{DEFAULT_EVENTS_ORDER.to_json}`.
      MD
    end

    def default_options
      {
        'expected_update_period_in_days' => "5",
        'clean' => 'false',
        'url' => "https://github.com/cantino/huginn/commits/master.atom"
      }
    end

    event_description <<-MD
      Events look like:

          {
            "feed": {
              "id": "...",
              "type": "atom",
              "generator": "...",
              "url": "http://example.com/",
              "links": [
                { "href": "http://example.com/", "rel": "alternate", "type": "text/html" },
                { "href": "http://example.com/index.atom", "rel": "self", "type": "application/atom+xml" }
              ],
              "title": "Some site title",
              "description": "Some site description",
              "copyright": "...",
              "icon": "http://example.com/icon.png",
              "authors": [ "..." ],
              "date_published": "2014-09-11T01:30:00-07:00",
              "last_updated": "2014-09-11T01:30:00-07:00"
            },
            "id": "829f845279611d7925146725317b868d",
            "url": "http://example.com/...",
            "urls": [ "http://example.com/..." ],
            "links": [
              { "href": "http://example.com/...", "rel": "alternate" }
            ],
            "title": "Some title",
            "description": "Some description",
            "content": "Some content",
            "authors": [ "Some Author <email@address>" ],
            "categories": [ "..." ],
            "image": "http://example.com/...",
            "enclosure": {
              "url": "http://example.com/file.mp3", "type": "audio/mpeg", "length": "123456789"
            },
            "date_published": "2014-09-11T01:30:00-07:00",
            "last_updated": "2014-09-11T01:30:00-07:00"
          }

      Some notes:

      - The `feed` key is present only if `include_feed_info` is set to `true`.
      - Each element in `authors` is a string normalized in the format "*name* <*email*> (*url*)", where each space-separated part is optional.
      - Timestamps are converted to the ISO 8601 format.
    MD
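
    # The Agent reports itself as working if it has created an event within
    # the expected update period and has no recent error logs.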
    def working?
      event_created_within?((interpolated['expected_update_period_in_days'].presence || 10).to_i) && !recent_error_logs?
    end

    def validate_options
      errors.add(:base, "url is required") unless options['url'].present?

      unless options['expected_update_period_in_days'].present? && options['expected_update_period_in_days'].to_i > 0
        errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
      end

      validate_web_request_options!
      validate_events_order
    end
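
    # Return the configured `events_order`, falling back to
    # DEFAULT_EVENTS_ORDER when the option is not set.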
    def events_order(key = SortableEvents::EVENTS_ORDER_KEY)
      if key == SortableEvents::EVENTS_ORDER_KEY
        super.presence || DEFAULT_EVENTS_ORDER
      else
        raise ArgumentError, "unsupported key: #{key}"
      end
    end

    def check
      check_urls(Array(interpolated['url']))
    end

    protected
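
    # Fetch each URL, parse the response with Feedjira, then sort, dedupe,
    # and emit the resulting events, honoring `max_events_per_run`.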
    def check_urls(urls)
      new_events = []
      max_events = (interpolated['max_events_per_run'].presence || 0).to_i

      urls.each do |url|
        begin
          response = faraday.get(url)
          if response.success?
            feed = Feedjira::Feed.parse(preprocessed_body(response))
            new_events.concat feed_to_events(feed)
          else
            error "Failed to fetch #{url}: #{response.inspect}"
          end
        rescue => e
          error "Failed to fetch #{url} with message '#{e.message}': #{e.backtrace}"
        end
      end

      created_event_count = 0
      sort_events(new_events).each.with_index do |event, index|
        entry_id = event.payload[:id]
        if check_and_track(entry_id)
          unless max_events && max_events > 0 && index >= max_events
            created_event_count += 1
            create_event(event)
          end
        end
      end
      log "Fetched #{urls.to_sentence} and created #{created_event_count} event(s)."
    end
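
    # Remember up to 500 recently seen entry ids in memory; return true
    # only the first time a given id is seen.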
    def check_and_track(entry_id)
      memory['seen_ids'] ||= []
      if memory['seen_ids'].include?(entry_id)
        false
      else
        memory['seen_ids'].unshift entry_id
        memory['seen_ids'].pop if memory['seen_ids'].length > 500
        true
      end
    end

    unless dependencies_missing?
      require 'feedjira_extension'
    end
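
    # If the response declares a charset, strip the encoding attribute from
    # the XML declaration so the parser does not re-detect the encoding;
    # otherwise leave the body alone so the SAX parser can detect it.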
    def preprocessed_body(response)
      body = response.body
      case body.encoding
      when Encoding::ASCII_8BIT
        # Encoding is unknown from the Content-Type, so let the SAX
        # parser detect it from the content.
      else
        # Encoding is already known, so do not let the parser detect
        # it from the XML declaration in the content.
        body.sub!(/(<\?xml(?:\s+\w+\s*=\s*(['"]).*?\2)*)\s+encoding\s*=\s*(['"]).*?\3/, '\\1')
      end
      body
    end
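
    # Build the `feed` hash that is merged into each event when
    # `include_feed_info` is enabled.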
    def feed_data(feed)
      type =
        case feed.class.name
        when /Atom/
          'atom'
        else
          'rss'
        end

      {
        id: feed.feed_id,
        type: type,
        url: feed.url,
        links: feed.links,
        title: feed.title,
        description: feed.description,
        copyright: feed.copyright,
        generator: feed.generator,
        icon: feed.icon,
        authors: feed.authors,
        date_published: feed.date_published,
        last_updated: feed.last_updated,
      }
    end
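
    # Map a parsed feed entry to an event payload.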
    def entry_data(entry)
      {
        id: entry.id,
        url: entry.url,
        urls: entry.links.map(&:href),
        links: entry.links,
        title: entry.title,
        description: clean_fragment(entry.summary),
        content: clean_fragment(entry.content || entry.summary),
        image: entry.try(:image),
        enclosure: entry.enclosure,
        authors: entry.authors,
        categories: Array(entry.try(:categories)),
        date_published: entry.date_published,
        last_updated: entry.last_updated,
      }
    end
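
    # Convert parsed feed entries into (unsaved) Events, optionally
    # prepending the feed info.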
    def feed_to_events(feed)
      payload_base = {}

      if boolify(interpolated['include_feed_info'])
        payload_base[:feed] = feed_data(feed)
      end

      feed.entries.map { |entry|
        Event.new(payload: payload_base.merge(entry_data(entry)))
      }
    end
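
    # When `clean` is enabled, sanitize an HTML fragment with Loofah,
    # pruning unknown/unsafe elements and attributes.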
    def clean_fragment(fragment)
      if boolify(interpolated['clean']) && fragment.present?
        Loofah.scrub_fragment(fragment, :prune).to_s
      else
        fragment
      end
    end
  end
end