require 'rails_helper' describe Agents::RssAgent do before do @valid_options = { 'expected_update_period_in_days' => "2", 'url' => "https://github.com/cantino/huginn/commits/master.atom", } stub_request(:any, /github.com/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")), :status => 200) stub_request(:any, /bad.github.com/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/github_rss.atom")).gsub(/]+\/>/, ''), status: 200) stub_request(:any, /SlickdealsnetFP/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/slickdeals.atom")), :status => 200) stub_request(:any, /onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")), status: 200) stub_request(:any, /bad.onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")).gsub(/(?<=)[^<]*/, ''), status: 200) stub_request(:any, /iso-8859-1/).to_return(body: File.binread(Rails.root.join("spec/data_fixtures/iso-8859-1.rss")), headers: { 'Content-Type' => 'application/rss+xml; charset=ISO-8859-1' }, status: 200) stub_request(:any, /podcast/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/podcast.rss")), status: 200) stub_request(:any, /youtube/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/youtube.xml")), status: 200) end let(:agent) do _agent = Agents::RssAgent.new(:name => "rss feed", :options => @valid_options) _agent.user = users(:bob) _agent.save! _agent end it_behaves_like WebRequestConcern describe "validations" do it "should validate the presence of url" do agent.options['url'] = "http://google.com" expect(agent).to be_valid agent.options['url'] = ["http://google.com", "http://yahoo.com"] expect(agent).to be_valid agent.options['url'] = "" expect(agent).not_to be_valid agent.options['url'] = nil expect(agent).not_to be_valid end it "should validate the presence and numericality of expected_update_period_in_days" do agent.options['expected_update_period_in_days'] = "5" expect(agent).to be_valid agent.options['expected_update_period_in_days'] = "wut?" expect(agent).not_to be_valid agent.options['expected_update_period_in_days'] = 0 expect(agent).not_to be_valid agent.options['expected_update_period_in_days'] = nil expect(agent).not_to be_valid agent.options['expected_update_period_in_days'] = "" expect(agent).not_to be_valid end end describe "emitting RSS events" do it "should emit items as events for an Atom feed" do agent.options['include_feed_info'] = true agent.options['include_sort_info'] = true expect { agent.check }.to change { agent.events.count }.by(20) first, *, last = agent.events.last(20) [first, last].each do |event| expect(event.payload['feed']).to include({ "type" => "atom", "title" => "Recent Commits to huginn:master", "url" => "https://github.com/cantino/huginn/commits/master", "links" => [ { "type" => "text/html", "rel" => "alternate", "href" => "https://github.com/cantino/huginn/commits/master", }, { "type" => "application/atom+xml", "rel" => "self", "href" => "https://github.com/cantino/huginn/commits/master.atom", }, ], }) end expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0") expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0"]) expect(first.payload['links']).to eq([ { "href" => "https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0", "rel" => "alternate", "type" => "text/html", } ]) expect(first.payload['authors']).to eq(["cantino (https://github.com/cantino)"]) expect(first.payload['date_published']).to be_nil expect(first.payload['last_updated']).to eq("2014-07-16T22:26:22-07:00") expect(first.payload['sort_info']).to eq({ 'position' => 20, 'count' => 20 }) expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af") expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af"]) expect(last.payload['links']).to eq([ { "href" => "https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af", "rel" => "alternate", "type" => "text/html", } ]) expect(last.payload['authors']).to eq(["CloCkWeRX (https://github.com/CloCkWeRX)"]) expect(last.payload['date_published']).to be_nil expect(last.payload['last_updated']).to eq("2014-07-01T16:37:47+09:30") expect(last.payload['sort_info']).to eq({ 'position' => 1, 'count' => 20 }) end it "should emit items as events in the order specified in the events_order option" do expect { agent.options['events_order'] = ['{{title | replace_regex: "^[[:space:]]+", "" }}'] agent.options['include_sort_info'] = true agent.check }.to change { agent.events.count }.by(20) first, *, last = agent.events.last(20) expect(first.payload['title'].strip).to eq('upgrade rails and gems') expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01") expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01"]) expect(first.payload['sort_info']).to eq({ 'position' => 20, 'count' => 20 }) expect(last.payload['title'].strip).to eq('Dashed line in a diagram indicates propagate_immediately being false.') expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535") expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535"]) expect(last.payload['sort_info']).to eq({ 'position' => 1, 'count' => 20 }) end it "should emit items as events for a FeedBurner RSS 2.0 feed" do agent.options['url'] = "http://feeds.feedburner.com/SlickdealsnetFP?format=atom" # This is actually RSS 2.0 w/ Atom extension agent.options['include_feed_info'] = true agent.save! expect { agent.check }.to change { agent.events.count }.by(79) first, *, last = agent.events.last(79) expect(first.payload['feed']).to include({ "type" => "rss", "title" => "SlickDeals.net", "description" => "Slick online shopping deals.", "url" => "http://slickdeals.net/", }) # Feedjira extracts feedburner:origLink expect(first.payload['url']).to eq("http://slickdeals.net/permadeal/130160/green-man-gaming---pc-games-tomb-raider-game-of-the-year-6-hitman-absolution-elite-edition") expect(last.payload['feed']).to include({ "type" => "rss", "title" => "SlickDeals.net", "description" => "Slick online shopping deals.", "url" => "http://slickdeals.net/", }) expect(last.payload['url']).to eq("http://slickdeals.net/permadeal/129980/amazon---rearth-ringke-fusion-bumper-hybrid-case-for-iphone-6") end it "should track ids and not re-emit the same item when seen again" do agent.check expect(agent.memory['seen_ids']).to eq(agent.events.map {|e| e.payload['id'] }) newest_id = agent.memory['seen_ids'][0] expect(agent.events.first.payload['id']).to eq(newest_id) agent.memory['seen_ids'] = agent.memory['seen_ids'][1..-1] # forget the newest id expect { agent.check }.to change { agent.events.count }.by(1) expect(agent.events.first.payload['id']).to eq(newest_id) expect(agent.memory['seen_ids'][0]).to eq(newest_id) end it "should truncate the seen_ids in memory at 500 items per default" do agent.memory['seen_ids'] = ['x'] * 490 agent.check expect(agent.memory['seen_ids'].length).to eq(500) end it "should truncate the seen_ids in memory at amount of items configured in options" do agent.options['remembered_id_count'] = "600" agent.memory['seen_ids'] = ['x'] * 590 agent.check expect(agent.memory['seen_ids'].length).to eq(600) end it "should truncate the seen_ids after configuring a lower limit of items when check is executed" do agent.memory['seen_ids'] = ['x'] * 600 agent.options['remembered_id_count'] = "400" expect(agent.memory['seen_ids'].length).to eq(600) agent.check expect(agent.memory['seen_ids'].length).to eq(400) end it "should truncate the seen_ids at default after removing custom limit" do agent.options['remembered_id_count'] = "600" agent.memory['seen_ids'] = ['x'] * 590 agent.check expect(agent.memory['seen_ids'].length).to eq(600) agent.options.delete('remembered_id_count') agent.memory['seen_ids'] = ['x'] * 590 agent.check expect(agent.memory['seen_ids'].length).to eq(500) end it "should support an array of URLs" do agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom", "http://feeds.feedburner.com/SlickdealsnetFP?format=atom"] agent.save! expect { agent.check }.to change { agent.events.count }.by(20 + 79) end it "should fetch one event per run" do agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"] agent.options['max_events_per_run'] = 1 agent.check expect(agent.events.count).to eq(1) end it "should fetch all events per run" do agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"] # <= 0 should ignore option and get all agent.options['max_events_per_run'] = 0 agent.check expect(agent.events.count).to eq(20) agent.options['max_events_per_run'] = -1 expect { agent.check }.to_not change { agent.events.count } end end context "when no ids are available" do before do @valid_options['url'] = 'http://feeds.feedburner.com/SlickdealsnetFP?format=atom' end it "calculates content MD5 sums" do expect { agent.check }.to change { agent.events.count }.by(79) expect(agent.memory['seen_ids']).to eq(agent.events.map {|e| Digest::MD5.hexdigest(e.payload['content']) }) end end context "parsing feeds" do before do @valid_options['url'] = 'http://onethingwell.org/rss' end it "captures timestamps normalized in the ISO 8601 format" do agent.check first, *, third = agent.events.take(3) expect(first.payload['date_published']).to eq('2015-08-20T17:00:10+01:00') expect(third.payload['date_published']).to eq('2015-08-20T13:00:07+01:00') end it "captures multiple categories" do agent.check first, *, third = agent.events.take(3) expect(first.payload['categories']).to eq(["csv", "crossplatform", "utilities"]) expect(third.payload['categories']).to eq(["web"]) end it "sanitizes HTML content" do agent.options['clean'] = true agent.check event = agent.events.last expect(event.payload['content']).to eq('Showgoers:
') expect(event.payload['description']).to eq('Showgoers:Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.
') end it "captures an enclosure" do agent.check event = agent.events.fourth expect(event.payload['enclosure']).to eq({ "url" => "http://c.1tw.org/images/2015/itsy.png", "type" => "image/png", "length" => "48249" }) expect(event.payload['image']).to eq("http://c.1tw.org/images/2015/itsy.png") end it "ignores an empty author" do agent.check event = agent.events.first expect(event.payload['authors']).to eq([]) end context 'with an empty link in RSS' do before do @valid_options['url'] = 'http://bad.onethingwell.org/rss' end it "does not leak :no_buffer" do agent.check event = agent.events.first expect(event.payload['links']).to eq([]) end end context 'with an empty link in RSS' do before do @valid_options['url'] = "https://bad.github.com/cantino/huginn/commits/master.atom" end it "does not leak :no_buffer" do agent.check event = agent.events.first expect(event.payload['links']).to eq([]) end end context 'with the encoding declared in both headers and the content' do before do @valid_options['url'] = 'http://example.org/iso-8859-1.rss' end it "decodes the content properly" do agent.check event = agent.events.first expect(event.payload['title']).to eq('Mëkanïk Zaïn') end it "decodes the content properly with force_encoding specified" do @valid_options['force_encoding'] = 'iso-8859-1' agent.check event = agent.events.first expect(event.payload['title']).to eq('Mëkanïk Zaïn') end end context 'with podcast elements' do before do @valid_options['url'] = 'http://example.com/podcast.rss' @valid_options['include_feed_info'] = true end let :feed_info do { "id" => nil, "type" => "rss", "url" => "http://www.example.com/podcasts/everything/index.html", "links" => [ { "href" => "http://www.example.com/podcasts/everything/index.html" } ], "title" => "All About Everything", "description" => "All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our podcast in the Podcasts app or in the iTunes Store", "copyright" => "℗ & © 2014 John Doe & Family", "generator" => nil, "icon" => nil, "authors" => [ "John Doe" ], "date_published" => nil, "last_updated" => nil, "itunes_categories" => [ "Technology", "Gadgets", "TV & Film", "Arts", "Food" ], "itunes_complete" => "yes", "itunes_explicit" => "no", "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything.jpg", "itunes_owners" => ["John DoeShowgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.