data_output_agent.rb 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. module Agents
  2. class DataOutputAgent < Agent
  3. include WebRequestConcern
  4. cannot_be_scheduled!
  5. cannot_create_events!
  # User-facing Markdown description of this agent. The text is built lazily
  # inside the block, so the interpolations (ENV['DOMAIN'], the web request
  # route helper, `description_events_order` and DEFAULT_EVENTS_ORDER) are
  # evaluated when the block runs rather than when the class body is loaded.
  description do
    <<~MD
      The Data Output Agent outputs received events as either RSS or JSON. Use it to output a public or private stream of Huginn data.
      This Agent will output data at:
      `https://#{ENV['DOMAIN']}#{Rails.application.routes.url_helpers.web_requests_path(agent_id: ':id', user_id:, secret: ':secret', format: :xml)}`
      where `:secret` is one of the allowed secrets specified in your options and the extension can be `xml` or `json`.
      You can setup multiple secrets so that you can individually authorize external systems to
      access your Huginn data.
      Options:
      * `secrets` - An array of tokens that the requestor must provide for light-weight authentication.
      * `expected_receive_period_in_days` - How often you expect data to be received by this Agent from other Agents.
      * `template` - A JSON object representing a mapping between item output keys and incoming event values. Use [Liquid](https://github.com/huginn/huginn/wiki/Formatting-Events-using-Liquid) to format the values. Values of the `link`, `title`, `description` and `icon` keys will be put into the \\<channel\\> section of RSS output. Value of the `self` key will be used as URL for this feed itself, which is useful when you serve it via reverse proxy. The `item` key will be repeated for every Event. The `pubDate` key for each item will have the creation time of the Event unless given.
      * `events_to_show` - The number of events to output in RSS or JSON. (default: `40`)
      * `ttl` - A value for the \\<ttl\\> element in RSS output. (default: `60`)
      * `ns_dc` - Add [DCMI Metadata Terms namespace](http://purl.org/dc/elements/1.1/) in output xml
      * `ns_media` - Add [yahoo media namespace](https://en.wikipedia.org/wiki/Media_RSS) in output xml
      * `ns_itunes` - Add [itunes compatible namespace](http://lists.apple.com/archives/syndication-dev/2005/Nov/msg00002.html) in output xml
      * `rss_content_type` - Content-Type for RSS output (default: `application/rss+xml`)
      * `response_headers` - An object with any custom response headers. (example: `{"Access-Control-Allow-Origin": "*"}`)
      * `push_hubs` - Set to a list of PubSubHubbub endpoints you want to publish an update to every time this agent receives an event. (default: none) Popular hubs include [Superfeedr](https://pubsubhubbub.superfeedr.com/) and [Google](https://pubsubhubbub.appspot.com/). Note that publishing updates will make your feed URL known to the public, so if you want to keep it secret, set up a reverse proxy to serve your feed via a safe URL and specify it in `template.self`.
      If you'd like to output RSS tags with attributes, such as `enclosure`, use something like the following in your `template`:
      "enclosure": {
      "_attributes": {
      "url": "{{media_url}}",
      "length": "1234456789",
      "type": "audio/mpeg"
      }
      },
      "another_tag": {
      "_attributes": {
      "key": "value",
      "another_key": "another_value"
      },
      "_contents": "tag contents (can be an object for nesting)"
      }
      # Ordering events
      #{description_events_order('events')}
      DataOutputAgent will select the last `events_to_show` entries of its received events sorted in the order specified by `events_order`, which is defaulted to the event creation time.
      So, if you have multiple source agents that may create many events in a run, you may want to either increase `events_to_show` to have a larger "window", or specify the `events_order` option to an appropriate value (like `date_published`) so events from various sources are properly mixed in the resulted feed.
      There is also an option `events_list_order` that only controls the order of events listed in the final output, without attempting to maintain a total order of received events. It has the same format as `events_order` and is defaulted to `#{Utils.jsonify(DEFAULT_EVENTS_ORDER['events_list_order'])}` so the selected events are listed in reverse order like most popular RSS feeds list their articles.
      # Liquid Templating
      In [Liquid](https://github.com/huginn/huginn/wiki/Formatting-Events-using-Liquid) templating, the following variable is available:
      * `events`: An array of events being output, sorted in the given order, up to `events_to_show` in number. For example, if source events contain a site title in the `site_title` key, you can refer to it in `template.title` by putting `{{events.first.site_title}}`.
    MD
  end
  # Options a freshly created agent starts with: a single placeholder secret,
  # a two-day receive window and a sample XKCD feed template.
  def default_options
    item_template = {
      "title" => "{{title}}",
      "description" => "Secret hovertext: {{hovertext}}",
      "link" => "{{url}}"
    }

    feed_template = {
      "title" => "XKCD comics as a feed",
      "description" => "This is a feed of recent XKCD comics, generated by Huginn",
      "item" => item_template
    }

    {
      "secrets" => ["a-secret-key"],
      "expected_receive_period_in_days" => 2,
      "template" => feed_template,
      "ns_media" => "true"
    }
  end
  # Whether the agent counts as working: it has received something, the last
  # receipt falls within `expected_receive_period_in_days`, and there are no
  # recent error logs. Returns nil when nothing has ever been received.
  def working?
    return unless last_receive_at

    cutoff = options['expected_receive_period_in_days'].to_i.days.ago
    return false unless last_receive_at > cutoff

    !recent_error_logs?
  end
  # Validates the agent options, adding one message to `errors` (on :base)
  # for every problem found. Nothing is raised for malformed input.
  #
  # Checks performed:
  # * `secrets` must be a non-empty Array of Strings without `/` or `.`.
  # * `expected_receive_period_in_days` must be present and positive.
  # * `template` must be a Hash whose `item` entry is a non-empty Hash.
  # * `push_hubs`, when given, must be an Array of Strings; entries that
  #   contain `{` are treated as Liquid templates and are not URL-parsed.
  def validate_options
    secrets = options['secrets']
    if secrets.is_a?(Array) && !secrets.empty?
      secrets.each do |secret|
        case secret
        when %r{[/.]}
          errors.add(:base, "secret may not contain a slash or dot")
        when String
          # acceptable secret, nothing to report
        else
          errors.add(:base, "secret must be a string")
        end
      end
    else
      errors.add(:base, "Please specify one or more secrets for 'authenticating' incoming feed requests")
    end

    receive_period = options['expected_receive_period_in_days']
    unless receive_period.present? && receive_period.to_i > 0
      errors.add(:base,
                 "Please provide 'expected_receive_period_in_days' to indicate how many days can pass before this Agent is considered to be not working")
    end

    # Check the type of `template` before indexing into it: indexing an
    # Array (or Integer) with a String key raises TypeError, which would
    # abort validation instead of reporting a proper error message.
    template = options['template']
    item = template.is_a?(Hash) ? template['item'] : nil
    unless item.is_a?(Hash) && !item.empty?
      errors.add(:base, "Please provide template and template.item")
    end

    case options['push_hubs']
    when nil
      # push_hubs is optional
    when Array
      options['push_hubs'].each do |hub|
        case hub
        when /\{/
          # Liquid templating
        when String
          begin
            URI.parse(hub)
          rescue URI::Error
            errors.add(:base, "invalid URL found in push_hubs")
            break
          end
        else
          errors.add(:base, "push_hubs must be an array of endpoint URLs")
          break
        end
      end
    else
      errors.add(:base, "push_hubs must be an array")
    end
  end
  # Number of events to include in the output; falls back to 40 when the
  # `events_to_show` option is blank.
  def events_to_show
    configured = interpolated['events_to_show'].presence
    (configured || 40).to_i
  end
  # Value for the RSS <ttl> element; falls back to 60 when the `ttl` option
  # is blank.
  def feed_ttl
    configured = interpolated['ttl'].presence
    (configured || 60).to_i
  end
  # Feed title: `template.title` when it is set, otherwise a default built
  # from the agent's name.
  def feed_title
    custom_title = interpolated['template']['title'].presence
    return custom_title if custom_title

    "#{name} Event Feed"
  end
  # Link for the feed: `template.link` when it is set, otherwise an https
  # URL built from ENV['DOMAIN'].
  def feed_link
    custom_link = interpolated['template']['link'].presence
    return custom_link if custom_link

    "https://#{ENV['DOMAIN']}"
  end
  # URL of this feed itself. `template.self` wins when it is set; otherwise
  # the URL is `feed_link` followed by the web request path for this agent.
  #
  # @param options [Hash] reads :secret and :format for the generated path
  #   (note: this parameter shadows the agent's `options` inside the method)
  def feed_url(options = {})
    explicit_url = interpolated['template']['self'].presence
    return explicit_url if explicit_url

    base = feed_link
    path = Rails.application.routes.url_helpers.web_requests_path(
      agent_id: id || ':id',
      user_id:,
      secret: options[:secret],
      format: options[:format]
    )
    base + path
  end
  # Icon URL for the feed: `template.icon` when it is set, otherwise
  # `/favicon.ico` under `feed_link`.
  def feed_icon
    custom_icon = interpolated['template']['icon'].presence
    return custom_icon if custom_icon

    feed_link + '/favicon.ico'
  end
  # <itunes:image> element pointing at the feed icon, or nil when the
  # `ns_itunes` option is not enabled.
  def itunes_icon
    return unless boolify(interpolated['ns_itunes'])

    # encode(xml: :attr) escapes the value and wraps it in double quotes.
    "<itunes:image href=#{feed_icon.encode(xml: :attr)} />"
  end
  # Feed description: `template.description` when it is set, otherwise a
  # default sentence naming this agent.
  def feed_description
    custom_description = interpolated['template']['description'].presence
    return custom_description if custom_description

    "A feed of Events received by the '#{name}' Huginn Agent"
  end
  # Content-Type used for RSS responses; falls back to `application/rss+xml`
  # when the `rss_content_type` option is blank.
  def rss_content_type
    configured = interpolated['rss_content_type'].presence
    return configured if configured

    'application/rss+xml'
  end
  # Space-separated xmlns attributes for the <rss> root element. The Atom
  # namespace is always present; the dc, media and itunes namespaces are
  # appended (in that order) when the matching `ns_*` option is enabled.
  def xml_namespace
    optional_namespaces = {
      'ns_dc' => 'xmlns:dc="http://purl.org/dc/elements/1.1/"',
      'ns_media' => 'xmlns:media="http://search.yahoo.com/mrss/"',
      'ns_itunes' => 'xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd"'
    }

    enabled = optional_namespaces.select { |option, _declaration| boolify(interpolated[option]) }.values
    ['xmlns:atom="http://www.w3.org/2005/Atom"', *enabled].join(' ')
  end
  # Configured hub endpoints from the `push_hubs` option, or an empty array
  # when the option is blank.
  def push_hubs
    configured_hubs = interpolated['push_hubs'].presence
    return configured_hubs if configured_hubs

    []
  end
  # Fallback ordering rules, keyed by option name, used by `events_order`
  # when the corresponding option is not configured. The hash is frozen so
  # the shared default table cannot be reassigned into by accident.
  DEFAULT_EVENTS_ORDER = {
    'events_order' => nil,
    'events_list_order' => [["{{_index_}}", "number", true]],
  }.freeze
  # Ordering rules for the given option key: whatever the inherited
  # implementation returns, or the entry from DEFAULT_EVENTS_ORDER when that
  # result is nil/false.
  def events_order(key = SortableEvents::EVENTS_ORDER_KEY)
    configured_order = super
    return configured_order if configured_order

    DEFAULT_EVENTS_ORDER[key]
  end
  # Returns the events to output: the received events sorted with
  # `sort_events` and cut down to the last `events_to_show` entries.
  #
  # The selection is cached in `memory` under :event_ids, :events_order,
  # :events_to_show and :last_event_id. With `reload` false and a usable
  # cache, only the cached event ids are loaded. With `reload` true (or when
  # the cache is unusable) newer events are fetched and the cache is updated.
  #
  # @param reload [Boolean] whether to look for new events and rewrite the cache
  # @return [Array] the selected events
  def latest_events(reload = false)
    # The local variable shadows the `received_events` method from here on.
    received_events = received_events().reorder(id: :asc)
    events =
      # The cache is usable only if ids were stored, the ordering rules are
      # unchanged, and it was built for at least as many events as needed now.
      if (event_ids = memory[:event_ids]) &&
          memory[:events_order] == events_order &&
          memory[:events_to_show] >= events_to_show
        received_events.where(id: event_ids).to_a
      else
        # Cache unusable: forget the high-water mark and rebuild from scratch.
        memory[:last_event_id] = nil
        reload = true
        []
      end
    if reload
      memory[:events_order] = events_order
      memory[:events_to_show] = events_to_show
      new_events =
        if last_event_id = memory[:last_event_id]
          # Incremental update: only events with an id above the last one seen.
          received_events.where(Event.arel_table[:id].gt(last_event_id)).to_a
        else
          source_ids.flat_map { |source_id|
            # dig twice as many events as the number of
            # `events_to_show`
            received_events.where(agent_id: source_id)
              .last(2 * events_to_show)
          }.sort_by(&:id)
        end
      unless new_events.empty?
        memory[:last_event_id] = new_events.last.id
        events.concat(new_events)
      end
    end
    events = sort_events(events).last(events_to_show)
    if reload
      # Remember which events made the cut so later calls can load just those.
      memory[:event_ids] = events.map(&:id)
    end
    events
  end
  # Serves the feed for an incoming web request.
  #
  # The request is rejected with status 401 unless params['secret'] is one of
  # the configured secrets. Otherwise the latest events are ordered by
  # 'events_list_order', each one is rendered through `template.item`, and
  # the result is returned as JSON (when `format` matches /json/) or as an
  # RSS document.
  #
  # @param params [Hash] request parameters; 'secret' is read here
  # @param method [Object] not used by this method
  # @param format [String] requested format, matched against /json/
  # @return [Array] [body, status] on rejection, otherwise
  #   [body, 200, content type, custom response headers or nil]
  def receive_web_request(params, method, format)
    unless interpolated['secrets'].include?(params['secret'])
      if format =~ /json/
        return [{ error: "Not Authorized" }, 401]
      else
        return ["Not Authorized", 401]
      end
    end
    source_events = sort_events(latest_events, 'events_list_order')
    interpolate_with('events' => source_events) do
      items = source_events.map do |event|
        # Block-local variable: inside this block it shadows the
        # `interpolated` method; outside the block the method is used again.
        interpolated = interpolate_options(options['template']['item'], event)
        # Every item gets a non-permalink guid, defaulting to the event id.
        interpolated['guid'] = {
          '_attributes' => { 'isPermaLink' => 'false' },
          '_contents' => interpolated['guid'].presence || event.id
        }
        date_string = interpolated['pubDate'].to_s
        # Fall back to the event's creation time when pubDate is missing or
        # cannot be parsed.
        date =
          begin
            Time.zone.parse(date_string) # may return nil
          rescue StandardError => e
            error "Error parsing a \"pubDate\" value \"#{date_string}\": #{e.message}"
            nil
          end || event.created_at
        interpolated['pubDate'] = date.rfc2822.to_s
        interpolated
      end
      now = Time.now
      if format =~ /json/
        content = {
          'title' => feed_title,
          'description' => feed_description,
          'pubDate' => now,
          'items' => simplify_item_for_json(items)
        }
        return [content, 200, "application/json", interpolated['response_headers'].presence]
      else
        # One <atom:link rel="hub"> element per configured hub.
        hub_links = push_hubs.map { |hub|
          <<-XML
<atom:link rel="hub" href=#{hub.encode(xml: :attr)}/>
          XML
        }.join
        items = items_to_xml(items)
        return [<<~XML, 200, rss_content_type, interpolated['response_headers'].presence]
          <?xml version="1.0" encoding="UTF-8" ?>
          <rss version="2.0" #{xml_namespace}>
          <channel>
          <atom:link href=#{feed_url(secret: params['secret'], format: :xml).encode(xml: :attr)} rel="self" type="application/rss+xml" />
          <atom:icon>#{feed_icon.encode(xml: :text)}</atom:icon>
          #{itunes_icon}
          #{hub_links}
          <title>#{feed_title.encode(xml: :text)}</title>
          <description>#{feed_description.encode(xml: :text)}</description>
          <link>#{feed_link.encode(xml: :text)}</link>
          <lastBuildDate>#{now.rfc2822.to_s.encode(xml: :text)}</lastBuildDate>
          <pubDate>#{now.rfc2822.to_s.encode(xml: :text)}</pubDate>
          <ttl>#{feed_ttl}</ttl>
          #{items}
          </channel>
          </rss>
        XML
      end
    end
  end
  # Handles newly received events: refreshes the cached event selection and
  # then announces the feed URL (built with the first configured secret) to
  # every configured hub.
  #
  # @param incoming_events [Array] not read directly; `latest_events(true)`
  #   does the reloading
  def receive(incoming_events)
    announced_url = feed_url(secret: interpolated['secrets'].first, format: :xml)

    # Reload new events and update cache
    latest_events(true)

    push_hubs.each { |hub| push_to_hub(hub, announced_url) }
  end
  281. private
  # A tag with attributes. `simplify_item_for_xml` builds these for template
  # entries that carry `_attributes` / `_contents`.
  class XMLNode
    def initialize(tag_name, attributes, contents)
      @tag_name, @attributes, @contents = tag_name, attributes, contents
    end

    # Emits this node through options[:builder]. Hash contents are written
    # as child tags inside the element; anything else is passed to the
    # builder as the element's content.
    def to_xml(options)
      builder = options[:builder]
      return builder.tag!(@tag_name, @attributes, @contents) unless @contents.is_a?(Hash)

      builder.tag!(@tag_name, @attributes) do
        @contents.each_pair do |key, value|
          # A fresh options hash is built for every child.
          ActiveSupport::XmlMini.to_tag(key, value, options.merge(skip_instruct: true))
        end
      end
    end
  end
  # Recursively converts a template item for XML output. Hash values that
  # carry '_attributes' or '_contents' become XMLNode objects, other hash
  # values are converted recursively, and arrays are converted element by
  # element. Non-hash values inside a hash are kept untouched.
  def simplify_item_for_xml(item)
    case item
    when Hash
      item.each_with_object({}) do |(tag, node), converted|
        converted[tag] =
          if !node.is_a?(Hash)
            node
          elsif node.key?('_attributes') || node.key?('_contents')
            XMLNode.new(tag, node['_attributes'], simplify_item_for_xml(node['_contents']))
          else
            simplify_item_for_xml(node)
          end
      end
    when Array
      item.map { |element| simplify_item_for_xml(element) }
    else
      item
    end
  end
  # Recursively converts a template item for JSON output. A hash value that
  # carries '_attributes' or '_contents' is flattened: hash contents are
  # converted recursively, any other truthy contents are stored under
  # "contents", and the attributes are merged on top. Arrays are converted
  # element by element; non-hash values inside a hash are kept untouched.
  def simplify_item_for_json(item)
    case item
    when Hash
      item.each_with_object({}) do |(name, node), result|
        result[name] =
          if !node.is_a?(Hash)
            node
          elsif node.key?('_attributes') || node.key?('_contents')
            inner = node['_contents']
            base =
              if inner.is_a?(Hash)
                simplify_item_for_json(inner)
              elsif inner
                { "contents" => inner }
              else
                {}
              end
            base.merge(node['_attributes'] || {})
          else
            simplify_item_for_json(node)
          end
      end
    when Array
      item.map { |element| simplify_item_for_json(element) }
    else
      item
    end
  end
  # Renders the template items as an XML fragment for the RSS channel.
  #
  # The items are converted with `simplify_item_for_xml`, serialized with
  # `to_xml` under a temporary "items" root (one-space indent per level, no
  # XML declaration, no type attributes), and the resulting text is then
  # cleaned up with three regular-expression passes, noted inline below.
  #
  # @param items [Array] interpolated item hashes
  # @return [String] the XML text after the cleanup passes
  def items_to_xml(items)
    simplify_item_for_xml(items)
      .to_xml(skip_types: true, root: "items", skip_instruct: true, indent: 1)
      .gsub(%r{
        (?<indent> ^\ + ) < (?<tagname> [^> ]+ ) > \n
        (?<children>
          (?: \k<indent> \ < \k<tagname> (?:\ [^>]*)? > [^<>]*? </ \k<tagname> > \n )+
        )
        \k<indent> </ \k<tagname> > \n
      }mx) { $~[:children].gsub(/^ /, '') } # delete redundant nesting of array elements
      .gsub(%r{
        (?<indent> ^\ + ) < [^> ]+ /> \n
      }mx, '') # delete empty elements
      .gsub(%r{^</?items>\n}, '') # drop the opening and closing <items> root lines
  end
  # Announces `url` to a single hub by POSTing `hub.mode=publish` and
  # `hub.url`. Endpoints that do not parse as an HTTP(S) URI are reported via
  # `error` and skipped. Nothing is sent during a dry run, and a failed POST
  # is reported via `error` rather than raised.
  #
  # @param hub [String] hub endpoint URL
  # @param url [String] feed URL to announce
  def push_to_hub(hub, url)
    endpoint =
      begin
        URI.parse(hub)
      rescue URI::Error
        nil
      end

    unless endpoint.is_a?(URI::HTTP)
      error "Invalid push endpoint: #{hub}"
      return
    end

    log "Pushing #{url} to #{endpoint}"
    return if dry_run?

    payload = {
      'hub.mode' => 'publish',
      'hub.url' => url
    }

    begin
      faraday.post endpoint, payload
    rescue StandardError => e
      error "Push failed: #{e.message}"
    end
  end
  384. end
  385. end