require 'rails_helper'

# Specs for Agents::RssAgent: option validation, event emission from Atom and
# RSS feeds, seen-id bookkeeping, feed parsing details (timestamps, categories,
# enclosures, encodings, podcast and YouTube feeds), and error logging.
# Every feed request made by these examples is stubbed with a fixture file from
# spec/data_fixtures.
describe Agents::RssAgent do
  before do
    # Baseline options; nested contexts mutate this hash (mostly 'url') before
    # the lazily-built `agent` is first referenced.
    @valid_options = {
      'expected_update_period_in_days' => "2",
      'url' => "https://github.com/cantino/huginn/commits/master.atom",
    }

    read_fixture = ->(name) { File.read(Rails.root.join("spec/data_fixtures/#{name}")) }

    # NOTE(review): the broad /github.com/ and /onethingwell.org/ patterns also
    # match the bad.* hosts; the bad.* stubs are registered after them and this
    # presumably relies on later stubs taking precedence -- confirm against the
    # stubbing library's documentation before reordering.
    stub_request(:any, /github.com/).to_return(body: read_fixture.call("github_rss.atom"), status: 200)
    # Same Atom fixture, but every `<link .../>` element is emptied.
    stub_request(:any, /bad.github.com/).to_return(
      body: read_fixture.call("github_rss.atom").gsub(/<link [^>]+\/>/, '<link/>'),
      status: 200
    )
    stub_request(:any, /SlickdealsnetFP/).to_return(body: read_fixture.call("slickdeals.atom"), status: 200)
    stub_request(:any, /onethingwell.org/).to_return(body: read_fixture.call("onethingwell.rss"), status: 200)
    # Same RSS fixture, but the text of every `<link>` element is removed.
    stub_request(:any, /bad.onethingwell.org/).to_return(
      body: read_fixture.call("onethingwell.rss").gsub(/(?<=<link>)[^<]*/, ''),
      status: 200
    )
    # Read as raw bytes so the ISO-8859-1 payload is served untouched.
    stub_request(:any, /iso-8859-1/).to_return(
      body: File.binread(Rails.root.join("spec/data_fixtures/iso-8859-1.rss")),
      headers: { 'Content-Type' => 'application/rss+xml; charset=ISO-8859-1' },
      status: 200
    )
    stub_request(:any, /podcast/).to_return(body: read_fixture.call("podcast.rss"), status: 200)
    stub_request(:any, /youtube/).to_return(body: read_fixture.call("youtube.xml"), status: 200)
  end

  # A persisted agent owned by the `bob` fixture user, built from @valid_options.
  let(:agent) do
    Agents::RssAgent.new(name: "rss feed", options: @valid_options).tap do |rss_agent|
      rss_agent.user = users(:bob)
      rss_agent.save!
    end
  end

  it_behaves_like WebRequestConcern

  describe "validations" do
    it "should validate the presence of url" do
      agent.options['url'] = "http://google.com"
      expect(agent).to be_valid

      agent.options['url'] = ["http://google.com", "http://yahoo.com"]
      expect(agent).to be_valid

      agent.options['url'] = ""
      expect(agent).not_to be_valid

      agent.options['url'] = nil
      expect(agent).not_to be_valid
    end

    it "should validate the presence and numericality of expected_update_period_in_days" do
      agent.options['expected_update_period_in_days'] = "5"
      expect(agent).to be_valid

      agent.options['expected_update_period_in_days'] = "wut?"
      expect(agent).not_to be_valid

      agent.options['expected_update_period_in_days'] = 0
      expect(agent).not_to be_valid

      agent.options['expected_update_period_in_days'] = nil
      expect(agent).not_to be_valid

      agent.options['expected_update_period_in_days'] = ""
      expect(agent).not_to be_valid
    end
  end

  describe "emitting RSS events" do
    it "should emit items as events for an Atom feed" do
      agent.options['include_feed_info'] = true
      agent.options['include_sort_info'] = true

      expect {
        agent.check
      }.to change { agent.events.count }.by(20)

      first, *, last = agent.events.last(20)

      [first, last].each do |event|
        expect(event.payload['feed']).to include({
          "type" => "atom",
          "title" => "Recent Commits to huginn:master",
          "url" => "https://github.com/cantino/huginn/commits/master",
          "links" => [
            {
              "type" => "text/html",
              "rel" => "alternate",
              "href" => "https://github.com/cantino/huginn/commits/master",
            },
            {
              "type" => "application/atom+xml",
              "rel" => "self",
              "href" => "https://github.com/cantino/huginn/commits/master.atom",
            },
          ],
        })
      end

      expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0")
      expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0"])
      expect(first.payload['links']).to eq([
        {
          "href" => "https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0",
          "rel" => "alternate",
          "type" => "text/html",
        }
      ])
      expect(first.payload['authors']).to eq(["cantino (https://github.com/cantino)"])
      expect(first.payload['date_published']).to be_nil
      expect(first.payload['last_updated']).to eq("2014-07-16T22:26:22-07:00")
      expect(first.payload['sort_info']).to eq({ 'position' => 20, 'count' => 20 })

      expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af")
      expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af"])
      expect(last.payload['links']).to eq([
        {
          "href" => "https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af",
          "rel" => "alternate",
          "type" => "text/html",
        }
      ])
      expect(last.payload['authors']).to eq(["CloCkWeRX (https://github.com/CloCkWeRX)"])
      expect(last.payload['date_published']).to be_nil
      expect(last.payload['last_updated']).to eq("2014-07-01T16:37:47+09:30")
      expect(last.payload['sort_info']).to eq({ 'position' => 1, 'count' => 20 })
    end

    it "should emit items as events in the order specified in the events_order option" do
      expect {
        agent.options['events_order'] = ['{{title | replace_regex: "^[[:space:]]+", "" }}']
        agent.options['include_sort_info'] = true
        agent.check
      }.to change { agent.events.count }.by(20)

      first, *, last = agent.events.last(20)

      expect(first.payload['title'].strip).to eq('upgrade rails and gems')
      expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01")
      expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/87a7abda23a82305d7050ac0bb400ce36c863d01"])
      expect(first.payload['sort_info']).to eq({ 'position' => 20, 'count' => 20 })

      expect(last.payload['title'].strip).to eq('Dashed line in a diagram indicates propagate_immediately being false.')
      expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535")
      expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535"])
      expect(last.payload['sort_info']).to eq({ 'position' => 1, 'count' => 20 })
    end

    it "should emit items as events for a FeedBurner RSS 2.0 feed" do
      agent.options['url'] = "http://feeds.feedburner.com/SlickdealsnetFP?format=atom" # This is actually RSS 2.0 w/ Atom extension
      agent.options['include_feed_info'] = true
      agent.save!

      expect {
        agent.check
      }.to change { agent.events.count }.by(79)

      first, *, last = agent.events.last(79)

      # Both ends of the batch must carry the same feed-level information.
      [first, last].each do |event|
        expect(event.payload['feed']).to include({
          "type" => "rss",
          "title" => "SlickDeals.net",
          "description" => "Slick online shopping deals.",
          "url" => "http://slickdeals.net/",
        })
      end

      # Feedjira extracts feedburner:origLink
      expect(first.payload['url']).to eq("http://slickdeals.net/permadeal/130160/green-man-gaming---pc-games-tomb-raider-game-of-the-year-6-hitman-absolution-elite-edition")
      expect(last.payload['url']).to eq("http://slickdeals.net/permadeal/129980/amazon---rearth-ringke-fusion-bumper-hybrid-case-for-iphone-6")
    end

    it "should track ids and not re-emit the same item when seen again" do
      agent.check
      expect(agent.memory['seen_ids']).to eq(agent.events.map { |e| e.payload['id'] })

      newest_id = agent.memory['seen_ids'][0]
      expect(agent.events.first.payload['id']).to eq(newest_id)

      agent.memory['seen_ids'] = agent.memory['seen_ids'][1..-1] # forget the newest id

      expect {
        agent.check
      }.to change { agent.events.count }.by(1)

      expect(agent.events.first.payload['id']).to eq(newest_id)
      expect(agent.memory['seen_ids'][0]).to eq(newest_id)
    end

    it "should truncate the seen_ids in memory at 500 items per default" do
      agent.memory['seen_ids'] = ['x'] * 490
      agent.check
      expect(agent.memory['seen_ids'].length).to eq(500)
    end

    it "should truncate the seen_ids in memory at amount of items configured in options" do
      agent.options['remembered_id_count'] = "600"
      agent.memory['seen_ids'] = ['x'] * 590
      agent.check
      expect(agent.memory['seen_ids'].length).to eq(600)
    end

    it "should truncate the seen_ids after configuring a lower limit of items when check is executed" do
      agent.memory['seen_ids'] = ['x'] * 600
      agent.options['remembered_id_count'] = "400"
      expect(agent.memory['seen_ids'].length).to eq(600)
      agent.check
      expect(agent.memory['seen_ids'].length).to eq(400)
    end

    it "should truncate the seen_ids at default after removing custom limit" do
      agent.options['remembered_id_count'] = "600"
      agent.memory['seen_ids'] = ['x'] * 590
      agent.check
      expect(agent.memory['seen_ids'].length).to eq(600)

      agent.options.delete('remembered_id_count')
      agent.memory['seen_ids'] = ['x'] * 590
      agent.check
      expect(agent.memory['seen_ids'].length).to eq(500)
    end

    it "should support an array of URLs" do
      agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom", "http://feeds.feedburner.com/SlickdealsnetFP?format=atom"]
      agent.save!

      expect {
        agent.check
      }.to change { agent.events.count }.by(20 + 79)
    end

    it "should fetch one event per run" do
      agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"]
      agent.options['max_events_per_run'] = 1
      agent.check
      expect(agent.events.count).to eq(1)
    end

    it "should fetch all events per run" do
      agent.options['url'] = ["https://github.com/cantino/huginn/commits/master.atom"]

      # <= 0 should ignore option and get all
      agent.options['max_events_per_run'] = 0
      agent.check
      expect(agent.events.count).to eq(20)

      agent.options['max_events_per_run'] = -1
      expect {
        agent.check
      }.not_to change { agent.events.count }
    end
  end

  context "when no ids are available" do
    before do
      @valid_options['url'] = 'http://feeds.feedburner.com/SlickdealsnetFP?format=atom'
    end

    it "calculates content MD5 sums" do
      expect {
        agent.check
      }.to change { agent.events.count }.by(79)

      expect(agent.memory['seen_ids']).to eq(agent.events.map { |e| Digest::MD5.hexdigest(e.payload['content']) })
    end
  end

  context "parsing feeds" do
    before do
      @valid_options['url'] = 'http://onethingwell.org/rss'
    end

    it "captures timestamps normalized in the ISO 8601 format" do
      agent.check
      first, *, third = agent.events.take(3)
      expect(first.payload['date_published']).to eq('2015-08-20T17:00:10+01:00')
      expect(third.payload['date_published']).to eq('2015-08-20T13:00:07+01:00')
    end

    it "captures multiple categories" do
      agent.check
      first, *, third = agent.events.take(3)
      expect(first.payload['categories']).to eq(["csv", "crossplatform", "utilities"])
      expect(third.payload['categories']).to eq(["web"])
    end

    it "sanitizes HTML content" do
      agent.options['clean'] = true
      agent.check
      event = agent.events.last

      # 'content' and 'description' are expected to hold the same cleaned markup.
      cleaned_html = '<a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote>'
      expect(event.payload['content']).to eq(cleaned_html)
      expect(event.payload['description']).to eq(cleaned_html)
    end

    it "captures an enclosure" do
      agent.check
      event = agent.events.fourth
      expect(event.payload['enclosure']).to eq({ "url" => "http://c.1tw.org/images/2015/itsy.png", "type" => "image/png", "length" => "48249" })
      expect(event.payload['image']).to eq("http://c.1tw.org/images/2015/itsy.png")
    end

    it "ignores an empty author" do
      agent.check
      event = agent.events.first
      expect(event.payload['authors']).to eq([])
    end

    context 'with an empty link in RSS' do
      before do
        @valid_options['url'] = 'http://bad.onethingwell.org/rss'
      end

      it "does not leak :no_buffer" do
        agent.check
        event = agent.events.first
        expect(event.payload['links']).to eq([])
      end
    end

    # Atom counterpart of the context above: uses the Atom fixture whose
    # `<link .../>` elements were emptied by the bad.github.com stub.
    context 'with an empty link in Atom' do
      before do
        @valid_options['url'] = "https://bad.github.com/cantino/huginn/commits/master.atom"
      end

      it "does not leak :no_buffer" do
        agent.check
        event = agent.events.first
        expect(event.payload['links']).to eq([])
      end
    end

    context 'with the encoding declared in both headers and the content' do
      before do
        @valid_options['url'] = 'http://example.org/iso-8859-1.rss'
      end

      it "decodes the content properly" do
        agent.check
        event = agent.events.first
        expect(event.payload['title']).to eq('Mëkanïk Zaïn')
      end

      it "decodes the content properly with force_encoding specified" do
        @valid_options['force_encoding'] = 'iso-8859-1'
        agent.check
        event = agent.events.first
        expect(event.payload['title']).to eq('Mëkanïk Zaïn')
      end
    end

    context 'with podcast elements' do
      before do
        @valid_options['url'] = 'http://example.com/podcast.rss'
        @valid_options['include_feed_info'] = true
      end

      # Feed-level payload expected under the "feed" key of every podcast event.
      let(:feed_info) do
        {
          "id" => nil,
          "type" => "rss",
          "url" => "http://www.example.com/podcasts/everything/index.html",
          "links" => [ { "href" => "http://www.example.com/podcasts/everything/index.html" } ],
          "title" => "All About Everything",
          "description" => "All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our podcast in the Podcasts app or in the iTunes Store",
          "copyright" => "℗ & © 2014 John Doe & Family",
          "generator" => nil,
          "icon" => nil,
          "authors" => [
            "John Doe"
          ],
          "date_published" => nil,
          "last_updated" => nil,
          "itunes_categories" => [
            "Technology", "Gadgets",
            "TV & Film",
            "Arts", "Food"
          ],
          "itunes_complete" => "yes",
          "itunes_explicit" => "no",
          "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything.jpg",
          "itunes_owners" => ["John Doe <john.doe@example.com>"],
          "itunes_subtitle" => "A show about everything",
          "itunes_summary" => "All About Everything is a show about everything. Each week we dive into any subject known to man and talk about it as much as we can. Look for our podcast in the Podcasts app or in the iTunes Store",
          "language" => "en-us"
        }
      end

      it "is parsed correctly" do
        expect {
          agent.check
        }.to change { agent.events.count }.by(4)

        expect(agent.events.map(&:payload)).to match([
          {
            "feed" => feed_info,
            "id" => "http://example.com/podcasts/archive/aae20140601.mp3",
            "url" => "http://example.com/podcasts/archive/aae20140601.mp3",
            "urls" => ["http://example.com/podcasts/archive/aae20140601.mp3"],
            "links" => [],
            "title" => "Red,Whine, & Blue",
            "description" => nil,
            "content" => nil,
            "image" => nil,
            "enclosure" => {
              "url" => "http://example.com/podcasts/everything/AllAboutEverythingEpisode4.mp3",
              "type" => "audio/mpeg",
              "length" => "498537"
            },
            "authors" => ["<Various>"],
            "categories" => [],
            "date_published" => "2016-03-11T01:15:00+00:00",
            "last_updated" => "2016-03-11T01:15:00+00:00",
            "itunes_duration" => "03:59",
            "itunes_explicit" => "no",
            "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything/Episode4.jpg",
            "itunes_subtitle" => "Red + Blue != Purple",
            "itunes_summary" => "This week we talk about surviving in a Red state if you are a Blue person. Or vice versa."
          },
          {
            "feed" => feed_info,
            "id" => "http://example.com/podcasts/archive/aae20140697.m4v",
            "url" => "http://example.com/podcasts/archive/aae20140697.m4v",
            "urls" => ["http://example.com/podcasts/archive/aae20140697.m4v"],
            "links" => [],
            "title" => "The Best Chili",
            "description" => nil,
            "content" => nil,
            "image" => nil,
            "enclosure" => {
              "url" => "http://example.com/podcasts/everything/AllAboutEverythingEpisode2.m4v",
              "type" => "video/x-m4v",
              "length" => "5650889"
            },
            "authors" => ["Jane Doe"],
            "categories" => [],
            "date_published" => "2016-03-10T02:00:00-07:00",
            "last_updated" => "2016-03-10T02:00:00-07:00",
            "itunes_closed_captioned" => "Yes",
            "itunes_duration" => "04:34",
            "itunes_explicit" => "no",
            "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything/Episode3.jpg",
            "itunes_subtitle" => "Jane and Eric",
            "itunes_summary" => "This week we talk about the best Chili in the world. Which chili is better?"
          },
          {
            "feed" => feed_info,
            "id" => "http://example.com/podcasts/archive/aae20140608.mp4",
            "url" => "http://example.com/podcasts/archive/aae20140608.mp4",
            "urls" => ["http://example.com/podcasts/archive/aae20140608.mp4"],
            "links" => [],
            "title" => "Socket Wrench Shootout",
            "description" => nil,
            "content" => nil,
            "image" => nil,
            "enclosure" => {
              "url" => "http://example.com/podcasts/everything/AllAboutEverythingEpisode2.mp4",
              "type" => "video/mp4",
              "length" => "5650889"
            },
            "authors" => ["Jane Doe"],
            "categories" => [],
            "date_published" => "2016-03-09T13:00:00-05:00",
            "last_updated" => "2016-03-09T13:00:00-05:00",
            "itunes_duration" => "04:34",
            "itunes_explicit" => "no",
            "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything/Episode2.jpg",
            "itunes_subtitle" => "Comparing socket wrenches is fun!",
            "itunes_summary" => "This week we talk about metric vs. Old English socket wrenches. Which one is better? Do you really need both? Get all of your answers here."
          },
          {
            "feed" => feed_info,
            "id" => "http://example.com/podcasts/archive/aae20140615.m4a",
            "url" => "http://example.com/podcasts/archive/aae20140615.m4a",
            "urls" => ["http://example.com/podcasts/archive/aae20140615.m4a"],
            "links" => [],
            "title" => "Shake Shake Shake Your Spices",
            "description" => nil,
            "content" => nil,
            "image" => nil,
            "enclosure" => {
              "url" => "http://example.com/podcasts/everything/AllAboutEverythingEpisode3.m4a",
              "type" => "audio/x-m4a",
              "length" => "8727310"
            },
            "authors" => ["John Doe"],
            "categories" => [],
            "date_published" => "2016-03-08T12:00:00+00:00",
            "last_updated" => "2016-03-08T12:00:00+00:00",
            "itunes_duration" => "07:04",
            "itunes_explicit" => "no",
            "itunes_image" => "http://example.com/podcasts/everything/AllAboutEverything/Episode1.jpg",
            "itunes_subtitle" => "A short primer on table spices",
            "itunes_summary" => "This week we talk about <a href=\"https://itunes/apple.com/us/book/antique-trader-salt-pepper/id429691295?mt=11\">salt and pepper shakers</a>, comparing and contrasting pour rates, construction materials, and overall aesthetics. Come and join the party!"
          }
        ])
      end
    end

    context 'of YouTube' do
      before do
        @valid_options['url'] = 'http://example.com/youtube.xml'
        @valid_options['include_feed_info'] = true
      end

      it "is parsed correctly" do
        expect {
          agent.check
        }.to change { agent.events.count }.by(15)

        expect(agent.events.first.payload).to match({
          "feed" => {
            "id" => "yt:channel:UCoTLdfNePDQzvdEgIToLIUg",
            "type" => "atom",
            "url" => "https://www.youtube.com/channel/UCoTLdfNePDQzvdEgIToLIUg",
            "links" => [
              { "href" => "http://www.youtube.com/feeds/videos.xml?channel_id=UCoTLdfNePDQzvdEgIToLIUg", "rel" => "self" },
              { "href" => "https://www.youtube.com/channel/UCoTLdfNePDQzvdEgIToLIUg", "rel" => "alternate" }
            ],
            "title" => "SecDSM",
            "description" => nil,
            "copyright" => nil,
            "generator" => nil,
            "icon" => nil,
            "authors" => ["SecDSM (https://www.youtube.com/channel/UCoTLdfNePDQzvdEgIToLIUg)"],
            "date_published" => "2016-07-28T18:46:21+00:00",
            "last_updated" => "2016-07-28T18:46:21+00:00"
          },
          "id" => "yt:video:OCs1E0vP7Oc",
          "authors" => ["SecDSM (https://www.youtube.com/channel/UCoTLdfNePDQzvdEgIToLIUg)"],
          "categories" => [],
          "content" => nil,
          "date_published" => "2017-06-15T02:36:17+00:00",
          "description" => nil,
          "enclosure" => nil,
          "image" => nil,
          "last_updated" => "2017-06-15T02:36:17+00:00",
          "links" => [
            { "href" => "https://www.youtube.com/watch?v=OCs1E0vP7Oc", "rel" => "alternate" }
          ],
          "title" => "SecDSM 2017 March - Talk 01",
          "url" => "https://www.youtube.com/watch?v=OCs1E0vP7Oc",
          "urls" => ["https://www.youtube.com/watch?v=OCs1E0vP7Oc"]
        })
      end
    end
  end

  describe 'logging errors with the feed url' do
    it 'includes the feed URL when an exception is raised' do
      # Force the parser to fail so the agent's error path is exercised.
      expect(Feedjira).to receive(:parse).with(anything).and_raise(StandardError, "Some error!")

      expect {
        agent.check
      }.not_to raise_error

      expect(agent.logs.last.message).to match(%r[Failed to fetch https://github.com])
    end
  end
end