rss_agent.rb 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. module Agents
  2. class RssAgent < Agent
  include WebRequestConcern

  # Class-level agent declarations.
  cannot_receive_events!
  can_dry_run!
  default_schedule 'every_1d'

  # Dependency check: truthy only when the Feedjira constant is defined.
  gem_dependency_check do
    defined?(Feedjira)
  end

  # Ordering used by #events_order when no explicit ordering is configured:
  # first by `date_published`, then by `last_updated`, both compared as times.
  DEFAULT_EVENTS_ORDER = [
    ['{{date_published}}', 'time'],
    ['{{last_updated}}', 'time']
  ]
  # Markdown description of the agent and its options.
  #
  # Paragraphs, the option list and the "Ordering Events" heading are
  # separated by blank lines so that each one is parsed as its own Markdown
  # block instead of collapsing into a single run-on paragraph.
  description do
    <<~MD
      The RSS Agent consumes RSS feeds and emits events when they change.

      This agent, using [Feedjira](https://github.com/feedjira/feedjira) as a base, can parse various types of RSS and Atom feeds and has some special handlers for FeedBurner, iTunes RSS, and so on. However, supported fields are limited by its general and abstract nature. For complex feeds with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/huginn/huginn/wiki/Agent-configuration-examples#itunes-trailers).

      If you want to *output* an RSS feed, use the DataOutputAgent.

      Options:

      * `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates).
      * `include_feed_info` - Set to `true` to include feed information in each event.
      * `clean` - Set to `true` to sanitize `description` and `content` as HTML fragments, removing unknown/unsafe elements and attributes.
      * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
      * `headers` - When present, it should be a hash of headers to send with the request.
      * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
      * `disable_ssl_verification` - Set to `true` to disable ssl verification.
      * `disable_url_encoding` - Set to `true` to disable url encoding.
      * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1).
      * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
      * `max_events_per_run` - Limit number of events created (items parsed) per run for feed.
      * `remembered_id_count` - Number of IDs to keep track of and avoid re-emitting (default: 500).

      # Ordering Events

      #{description_events_order}

      In this Agent, the default value for `events_order` is `#{DEFAULT_EVENTS_ORDER.to_json}`.
    MD
  end
  # Default option values for this agent.
  #
  # @return [Hash{String => String}] a fresh hash on every call
  def default_options
    defaults = {}
    defaults['expected_update_period_in_days'] = '5'
    defaults['clean'] = 'false'
    defaults['url'] = 'https://github.com/huginn/huginn/commits/master.atom'
    defaults
  end
  # Markdown documentation of the emitted event payload.
  #
  # The sample payload is indented by four spaces so Markdown treats it as a
  # code block, and it is kept valid JSON (colon separators, no trailing
  # commas) so it can be copied as-is.
  event_description <<~MD
    Events look like:

        {
          "feed": {
            "id": "...",
            "type": "atom",
            "generator": "...",
            "url": "http://example.com/",
            "links": [
              { "href": "http://example.com/", "rel": "alternate", "type": "text/html" },
              { "href": "http://example.com/index.atom", "rel": "self", "type": "application/atom+xml" }
            ],
            "title": "Some site title",
            "description": "Some site description",
            "copyright": "...",
            "icon": "http://example.com/icon.png",
            "authors": [ "..." ],
            "itunes_block": "no",
            "itunes_categories": [
              "Technology", "Gadgets",
              "TV & Film",
              "Arts", "Food"
            ],
            "itunes_complete": "yes",
            "itunes_explicit": "yes",
            "itunes_image": "http://...",
            "itunes_new_feed_url": "http://...",
            "itunes_owners": [ "John Doe <john.doe@example.com>" ],
            "itunes_subtitle": "...",
            "itunes_summary": "...",
            "language": "en-US",
            "date_published": "2014-09-11T01:30:00-07:00",
            "last_updated": "2014-09-11T01:30:00-07:00"
          },
          "id": "829f845279611d7925146725317b868d",
          "url": "http://example.com/...",
          "urls": [ "http://example.com/..." ],
          "links": [
            { "href": "http://example.com/...", "rel": "alternate" }
          ],
          "title": "Some title",
          "description": "Some description",
          "content": "Some content",
          "authors": [ "Some Author <email@address>" ],
          "categories": [ "..." ],
          "image": "http://example.com/...",
          "enclosure": {
            "url": "http://example.com/file.mp3", "type": "audio/mpeg", "length": "123456789"
          },
          "itunes_block": "no",
          "itunes_closed_captioned": "yes",
          "itunes_duration": "04:34",
          "itunes_explicit": "yes",
          "itunes_image": "http://...",
          "itunes_order": "1",
          "itunes_subtitle": "...",
          "itunes_summary": "...",
          "date_published": "2014-09-11T01:30:00-0700",
          "last_updated": "2014-09-11T01:30:00-0700"
        }

    Some notes:

    - The `feed` key is present only if `include_feed_info` is set to true.
    - The keys starting with `itunes_`, and `language` are only present when the feed is a podcast. See [Podcasts Connect Help](https://help.apple.com/itc/podcasts_connect/#/itcb54353390) for details.
    - Each element in `authors` and `itunes_owners` is a string normalized in the format "*name* <*email*> (*url*)", where each space-separated part is optional.
    - Timestamps are converted to the ISO 8601 format.
  MD
  # Health check: an event must have been created within the expected update
  # period, and there must be no recent error logs.
  #
  # The period comes from the `expected_update_period_in_days` option and
  # falls back to 10 when that option is blank.
  def working?
    period_in_days = (interpolated['expected_update_period_in_days'].presence || 10).to_i

    event_created_within?(period_in_days) && !recent_error_logs?
  end
  # Validates the agent options, adding a message to `errors` for each problem:
  #
  # - `url` must be present;
  # - `expected_update_period_in_days` must be present and greater than zero;
  # - `remembered_id_count`, when given, must be at least 1.
  #
  # The web request options and the events order are validated afterwards.
  def validate_options
    errors.add(:base, "url is required") unless options['url'].present?

    update_period = options['expected_update_period_in_days']
    period_ok = update_period.present? && update_period.to_i > 0
    unless period_ok
      errors.add(
        :base,
        "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working"
      )
    end

    id_count = options['remembered_id_count']
    if id_count.present? && id_count.to_i < 1
      errors.add(
        :base,
        "Please provide 'remembered_id_count' as a number bigger than 0 indicating how many IDs should be saved to distinguish between new and old IDs in RSS feeds. Delete option to use default (500)."
      )
    end

    validate_web_request_options!
    validate_events_order
  end
  # Returns the configured events order, falling back to
  # DEFAULT_EVENTS_ORDER when the inherited value is blank.
  #
  # @param key [String] only SortableEvents::EVENTS_ORDER_KEY is supported
  # @raise [ArgumentError] for any other key
  def events_order(key = SortableEvents::EVENTS_ORDER_KEY)
    raise ArgumentError, "unsupported key: #{key}" unless key == SortableEvents::EVENTS_ORDER_KEY

    super.presence || DEFAULT_EVENTS_ORDER
  end
  # Scheduled entry point: checks every configured feed URL.
  # The `url` option may be a single URL or an array of URLs.
  def check
    feed_urls = Array(interpolated['url'])
    check_urls(feed_urls)
  end
  131. protected
  # Fetches and parses every feed in `urls`, then creates events for the
  # entries that have not been seen before.
  #
  # A feed that cannot be fetched or parsed is reported through `error` and
  # skipped; the remaining feeds are still processed.
  #
  # When `max_events_per_run` is a positive number, only entries within the
  # first that-many positions of the sorted list can become events. Every
  # entry is passed to #check_and_track regardless of the limit, so entries
  # beyond it are still remembered as seen.
  #
  # @param urls [Array<String>] feed URLs to fetch
  def check_urls(urls)
    limit = (interpolated['max_events_per_run'].presence || 0).to_i

    candidates = urls.flat_map do |url|
      response = faraday.get(url)
      if response.success?
        feed_to_events(Feedjira.parse(preprocessed_body(response)))
      else
        error "Failed to fetch #{url}: #{response.inspect}"
        []
      end
    rescue StandardError => e
      error "Failed to fetch #{url} with message '#{e.message}': #{e.backtrace}"
      []
    end

    fresh_events = sort_events(candidates).select.with_index { |event, position|
      # Tracking must happen first: it records the id even for entries that
      # fall outside the per-run limit.
      unseen = check_and_track(event.payload[:id])
      within_limit = limit <= 0 || position < limit
      unseen && within_limit
    }

    create_events(fresh_events)
    log "Fetched #{urls.to_sentence} and created #{fresh_events.size} event(s)."
  end
  # Number of entry ids kept in memory for duplicate detection.
  #
  # @return [Integer] the `remembered_id_count` option, or 500 when it is blank
  def remembered_id_count
    configured = options['remembered_id_count'].presence
    configured ? configured.to_i : 500
  end
  # Records `entry_id` as seen unless it is already remembered.
  #
  # New ids are put at the front of `memory['seen_ids']`; once the list grows
  # past #remembered_id_count, the oldest ids are dropped from the end.
  #
  # @param entry_id [Object] the id of a feed entry
  # @return [Boolean] true when the id was new, false when already remembered
  def check_and_track(entry_id)
    memory['seen_ids'] ||= []
    seen = memory['seen_ids']
    return false if seen.include?(entry_id)

    seen.unshift(entry_id)
    overflow = seen.length - remembered_id_count
    seen.pop(overflow) if overflow > 0
    true
  end
  # Load the Feedjira extension only when the agent's dependencies are present.
  require 'feedjira_extension' unless dependencies_missing?
  # Returns the response body prepared for feed parsing.
  #
  # A binary (ASCII-8BIT) body is returned untouched: its encoding is unknown
  # from the Content-Type, so the SAX parser detects it from the content. For
  # any other encoding the charset is already known, so the `encoding`
  # attribute is removed from a leading XML declaration to keep the parser
  # from detecting it there.
  #
  # The body string is modified in place.
  #
  # @param response [#body] the HTTP response
  # @return [String] the same body object
  def preprocessed_body(response)
    response.body.tap do |text|
      next if text.encoding == Encoding::ASCII_8BIT

      text.sub!(/(?<noenc>\A\u{FEFF}?\s*<\?xml(?:\s+\w+(?<av>\s*=\s*(?:'[^']*'|"[^"]*")))*?)\s+encoding\g<av>/,
                '\\k<noenc>')
    end
  end
  # Builds the feed-level information hash for a parsed feed.
  #
  # `type` is 'atom' when the parser class name contains "Atom" and 'rss'
  # otherwise. Any iTunes-specific fields from #itunes_feed_data are appended.
  #
  # @param feed [Object] a parsed feed
  # @return [Hash{Symbol => Object}]
  def feed_data(feed)
    kind = /Atom/.match?(feed.class.name) ? 'atom' : 'rss'

    info = { id: feed.feed_id, type: kind }
    %i[
      url links title description copyright generator icon authors
      date_published last_updated
    ].each { |field| info[field] = feed.public_send(field) }

    info.merge(itunes_feed_data(feed))
  end
  # Collects the podcast-specific feed attributes that have a non-blank value.
  #
  # Returns an empty hash unless `feed` is a Feedjira::Parser::ITunesRSS.
  # `itunes_summary` is passed through #clean_fragment; the other values are
  # copied as they are.
  #
  # @param feed [Object] a parsed feed
  # @return [Hash{Symbol => Object}]
  def itunes_feed_data(feed)
    return {} unless feed.is_a?(Feedjira::Parser::ITunesRSS)

    %i[
      itunes_block
      itunes_categories
      itunes_complete
      itunes_explicit
      itunes_image
      itunes_new_feed_url
      itunes_owners
      itunes_subtitle
      itunes_summary
      language
    ].each_with_object({}) do |attr, data|
      value = feed.try(attr).presence
      next unless value

      data[attr] = attr == :itunes_summary ? clean_fragment(value) : value
    end
  end
  # Builds the event payload fields for a single feed entry.
  #
  # - `urls` is the entry URL followed by every link href, without duplicates.
  # - `description` and `content` go through #clean_fragment; `content` falls
  #   back to the summary when the entry has no content.
  # - `image` and `categories` are optional on the entry (read via `try`).
  # - iTunes-specific fields from #itunes_entry_data are appended.
  #
  # @param entry [Object] a parsed feed entry
  # @return [Hash{Symbol => Object}]
  def entry_data(entry)
    link_hrefs = entry.links.map(&:href)
    body = entry.content || entry.summary

    payload = {
      id: entry.id,
      url: entry.url,
      urls: Array(entry.url) | link_hrefs,
      links: entry.links,
      title: entry.title,
      description: clean_fragment(entry.summary),
      content: clean_fragment(body),
      image: entry.try(:image),
      enclosure: entry.enclosure,
      authors: entry.authors,
      categories: Array(entry.try(:categories)),
      date_published: entry.date_published,
      last_updated: entry.last_updated
    }

    payload.merge(itunes_entry_data(entry))
  end
  # Collects the podcast-specific entry attributes that have a non-blank value.
  #
  # Returns an empty hash unless `entry` is a Feedjira::Parser::ITunesRSSItem.
  #
  # @param entry [Object] a parsed feed entry
  # @return [Hash{Symbol => Object}]
  def itunes_entry_data(entry)
    return {} unless entry.is_a?(Feedjira::Parser::ITunesRSSItem)

    pairs = %i[
      itunes_block
      itunes_closed_captioned
      itunes_duration
      itunes_explicit
      itunes_image
      itunes_order
      itunes_subtitle
      itunes_summary
    ].filter_map do |attr|
      value = entry.try(attr).presence
      [attr, value] if value
    end

    pairs.to_h
  end
  # Converts every entry of a parsed feed into an unsaved Event.
  #
  # When the `include_feed_info` option is truthy, each payload also carries
  # the feed-level data under the `:feed` key.
  #
  # @param feed [#entries] a parsed feed
  # @return [Array<Event>]
  def feed_to_events(feed)
    shared = boolify(interpolated['include_feed_info']) ? { feed: feed_data(feed) } : {}

    feed.entries.map do |entry|
      Event.new(payload: shared.merge(entry_data(entry)))
    end
  end
  # Sanitizes an HTML fragment when the `clean` option is truthy.
  #
  # Blank fragments, and all fragments when cleaning is off, are returned
  # unchanged.
  #
  # @param fragment [String, nil]
  # @return [String, nil]
  def clean_fragment(fragment)
    return fragment unless boolify(interpolated['clean']) && fragment.present?

    Loofah.scrub_fragment(fragment, :prune).to_s
  end
  290. end
  291. end