# rss_agent_spec.rb

  1. require 'rails_helper'
  2. describe Agents::RssAgent do
  3. before do
  4. @valid_options = {
  5. 'expected_update_period_in_days' => "2",
  6. 'url' => "https://github.com/cantino/huginn/commits/master.atom",
  7. }
  8. stub_request(:any, /github.com/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")), :status => 200)
  9. stub_request(:any, /bad.github.com/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")).gsub(/<link [^>]+\/>/, '<link/>'), status: 200)
  10. stub_request(:any, /SlickdealsnetFP/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/slickdeals.atom")), :status => 200)
  11. stub_request(:any, /onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")), status: 200)
  12. stub_request(:any, /bad.onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")).gsub(/(?<=<link>)[^<]*/, ''), status: 200)
  13. end
  14. let(:agent) do
  15. _agent = Agents::RssAgent.new(:name => "rss feed", :options => @valid_options)
  16. _agent.user = users(:bob)
  17. _agent.save!
  18. _agent
  19. end
  20. it_behaves_like WebRequestConcern
  21. describe "validations" do
  22. it "should validate the presence of url" do
  23. agent.options['url'] = "http://google.com"
  24. expect(agent).to be_valid
  25. agent.options['url'] = ["http://google.com", "http://yahoo.com"]
  26. expect(agent).to be_valid
  27. agent.options['url'] = ""
  28. expect(agent).not_to be_valid
  29. agent.options['url'] = nil
  30. expect(agent).not_to be_valid
  31. end
  32. it "should validate the presence and numericality of expected_update_period_in_days" do
  33. agent.options['expected_update_period_in_days'] = "5"
  34. expect(agent).to be_valid
  35. agent.options['expected_update_period_in_days'] = "wut?"
  36. expect(agent).not_to be_valid
  37. agent.options['expected_update_period_in_days'] = 0
  38. expect(agent).not_to be_valid
  39. agent.options['expected_update_period_in_days'] = nil
  40. expect(agent).not_to be_valid
  41. agent.options['expected_update_period_in_days'] = ""
  42. expect(agent).not_to be_valid
  43. end
  44. end
  45. describe "emitting RSS events" do
  46. it "should emit items as events for an Atom feed" do
  47. agent.options['include_feed_info'] = true
  48. expect {
  49. agent.check
  50. }.to change { agent.events.count }.by(20)
  51. first, *, last = agent.events.last(20)
  52. [first, last].each do |event|
  53. expect(first.payload['feed']).to include({
  54. "type" => "atom",
  55. "title" => "Recent Commits to huginn:master",
  56. "url" => "https://github.com/cantino/huginn/commits/master",
  57. "links" => [
  58. {
  59. "type" => "text/html",
  60. "rel" => "alternate",
  61. "href" => "https://github.com/cantino/huginn/commits/master",
  62. },
  63. {
  64. "type" => "application/atom+xml",
  65. "rel" => "self",
  66. "href" => "https://github.com/cantino/huginn/commits/master.atom",
  67. },
  68. ],
  69. })
  70. end
  71. expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0")
  72. expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0"])
  73. expect(first.payload['links']).to eq([
  74. {
  75. "href" => "https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0",
  76. "rel" => "alternate",
  77. "type" => "text/html",
  78. }
  79. ])
  80. expect(first.payload['authors']).to eq(["cantino (https://github.com/cantino)"])
  81. expect(first.payload['date_published']).to be_nil
  82. expect(first.payload['last_updated']).to eq("2014-07-16T22:26:22-07:00")
  83. expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af")
  84. expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af"])
  85. expect(last.payload['links']).to eq([
  86. {
  87. "href" => "https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af",
  88. "rel" => "alternate",
  89. "type" => "text/html",
  90. }
  91. ])
  92. expect(last.payload['authors']).to eq(["CloCkWeRX (https://github.com/CloCkWeRX)"])
  93. expect(last.payload['date_published']).to be_nil
  94. expect(last.payload['last_updated']).to eq("2014-07-01T16:37:47+09:30")
  95. end
  96. it "should emit items as events in the order specified in the events_order option" do
  97. expect {
  98. agent.options['events_order'] = ['{{title | replace_regex: "^[[:space:]]+", "" }}']
  99. agent.check
  100. }.to change { agent.events.count }.by(20)
  101. first, *, last = agent.events.last(20)
  102. expect(first.payload['title'].strip).to eq('upgrade rails and gems')
  103. expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01")
  104. expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01"])
  105. expect(last.payload['title'].strip).to eq('Dashed line in a diagram indicates propagate_immediately being false.')
  106. expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535")
  107. expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535"])
  108. end
  109. it "should emit items as events for a FeedBurner RSS 2.0 feed" do
  110. agent.options['url'] = "http://feeds.feedburner.com/SlickdealsnetFP?format=atom" # This is actually RSS 2.0 w/ Atom extension
  111. agent.options['include_feed_info'] = true
  112. agent.save!
  113. expect {
  114. agent.check
  115. }.to change { agent.events.count }.by(79)
  116. first, *, last = agent.events.last(79)
  117. expect(first.payload['feed']).to include({
  118. "type" => "rss",
  119. "title" => "SlickDeals.net",
  120. "description" => "Slick online shopping deals.",
  121. "url" => "http://slickdeals.net/",
  122. })
  123. # Feedjira extracts feedburner:origLink
  124. expect(first.payload['url']).to eq("http://slickdeals.net/permadeal/130160/green-man-gaming---pc-games-tomb-raider-game-of-the-year-6-hitman-absolution-elite-edition")
  125. expect(last.payload['feed']).to include({
  126. "type" => "rss",
  127. "title" => "SlickDeals.net",
  128. "description" => "Slick online shopping deals.",
  129. "url" => "http://slickdeals.net/",
  130. })
  131. expect(last.payload['url']).to eq("http://slickdeals.net/permadeal/129980/amazon---rearth-ringke-fusion-bumper-hybrid-case-for-iphone-6")
  132. end
  133. it "should track ids and not re-emit the same item when seen again" do
  134. agent.check
  135. expect(agent.memory['seen_ids']).to eq(agent.events.map {|e| e.payload['id'] })
  136. newest_id = agent.memory['seen_ids'][0]
  137. expect(agent.events.first.payload['id']).to eq(newest_id)
  138. agent.memory['seen_ids'] = agent.memory['seen_ids'][1..-1] # forget the newest id
  139. expect {
  140. agent.check
  141. }.to change { agent.events.count }.by(1)
  142. expect(agent.events.first.payload['id']).to eq(newest_id)
  143. expect(agent.memory['seen_ids'][0]).to eq(newest_id)
  144. end
  145. it "should truncate the seen_ids in memory at 500 items" do
  146. agent.memory['seen_ids'] = ['x'] * 490
  147. agent.check
  148. expect(agent.memory['seen_ids'].length).to eq(500)
  149. end
  150. it "should support an array of URLs" do
  151. agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom", "http://feeds.feedburner.com/SlickdealsnetFP?format=atom"]
  152. agent.save!
  153. expect {
  154. agent.check
  155. }.to change { agent.events.count }.by(20 + 79)
  156. end
  157. it "should fetch one event per run" do
  158. agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"]
  159. agent.options['max_events_per_run'] = 1
  160. agent.check
  161. expect(agent.events.count).to eq(1)
  162. end
  163. it "should fetch all events per run" do
  164. agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"]
  165. # <= 0 should ignore option and get all
  166. agent.options['max_events_per_run'] = 0
  167. agent.check
  168. expect(agent.events.count).to eq(20)
  169. agent.options['max_events_per_run'] = -1
  170. expect {
  171. agent.check
  172. }.to_not change { agent.events.count }
  173. end
  174. end
  175. context "when no ids are available" do
  176. before do
  177. @valid_options['url'] = 'http://feeds.feedburner.com/SlickdealsnetFP?format=atom'
  178. end
  179. it "calculates content MD5 sums" do
  180. expect {
  181. agent.check
  182. }.to change { agent.events.count }.by(79)
  183. expect(agent.memory['seen_ids']).to eq(agent.events.map {|e| Digest::MD5.hexdigest(e.payload['content']) })
  184. end
  185. end
  186. context "parsing feeds" do
  187. before do
  188. @valid_options['url'] = 'http://onethingwell.org/rss'
  189. end
  190. it "captures timestamps normalized in the ISO 8601 format" do
  191. agent.check
  192. first, *, third = agent.events.take(3)
  193. expect(first.payload['date_published']).to eq('2015-08-20T17:00:10+01:00')
  194. expect(third.payload['date_published']).to eq('2015-08-20T13:00:07+01:00')
  195. end
  196. it "captures multiple categories" do
  197. agent.check
  198. first, *, third = agent.events.take(3)
  199. expect(first.payload['categories']).to eq(["csv", "crossplatform", "utilities"])
  200. expect(third.payload['categories']).to eq(["web"])
  201. end
  202. it "sanitizes HTML content" do
  203. agent.options['clean'] = true
  204. agent.check
  205. event = agent.events.last
  206. expect(event.payload['content']).to eq('<a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote>')
  207. expect(event.payload['description']).to eq('<a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote>')
  208. end
  209. it "captures an enclosure" do
  210. agent.check
  211. event = agent.events.fourth
  212. expect(event.payload['enclosure']).to eq({ "url" => "http://c.1tw.org/images/2015/itsy.png", "type" => "image/png", "length" => "48249" })
  213. expect(event.payload['image']).to eq("http://c.1tw.org/images/2015/itsy.png")
  214. end
  215. it "ignores an empty author" do
  216. agent.check
  217. event = agent.events.first
  218. expect(event.payload['authors']).to eq([])
  219. end
  220. context 'with an empty link in RSS' do
  221. before do
  222. @valid_options['url'] = 'http://bad.onethingwell.org/rss'
  223. end
  224. it "does not leak :no_buffer" do
  225. agent.check
  226. event = agent.events.first
  227. expect(event.payload['links']).to eq([])
  228. end
  229. end
  230. context 'with an empty link in RSS' do
  231. before do
  232. @valid_options['url'] = "https://bad.github.com/cantino/huginn/commits/master.atom"
  233. end
  234. it "does not leak :no_buffer" do
  235. agent.check
  236. event = agent.events.first
  237. expect(event.payload['links']).to eq([])
  238. end
  239. end
  240. end
  241. describe 'logging errors with the feed url' do
  242. it 'includes the feed URL when an exception is raised' do
  243. mock(Feedjira::Feed).parse(anything) { raise StandardError.new("Some error!") }
  244. expect(lambda {
  245. agent.check
  246. }).not_to raise_error
  247. expect(agent.logs.last.message).to match(%r[Failed to fetch https://github.com])
  248. end
  249. end
  250. end