9 years ago · 3469ed2a5e
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -18,14 +18,13 @@ module Agents
 
				 
			
 
				       Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all`, `on_change`, or `merge` (if fetching based on an Event, see below).
			
 
				 
			
 
				-      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
			
 
				+      The `url` option can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).
			
 
				 
			
 
				       The WebsiteAgent can also scrape based on incoming events.
			
 
				 
			
 
				-      * If the Event contains a `url` key, that URL will be fetched.
			
 
				-      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
			
 
				-      * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
			
 
				-      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
			
 
				+      * Set the `url_from_event` option to a Liquid template to generate the url to access based on the Event.  (To fetch the url in the Event's `url` key, for example, set `url_from_event` to `{{ url }}`.)
			
 
				+      * Alternatively, set `data_from_event` to a Liquid template to use data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
			
 
				+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with new values.
			
 
				 
			
 
				       # Supported Document Types
			
 
				 
			
@@ -343,7 +342,7 @@ module Agents
 
				               if url_template = options['url_from_event'].presence
			
 
				                 interpolate_options(url_template)
			
 
				               else
			
 
				-                event.payload['url'].presence || interpolated['url']
			
 
				+                interpolated['url']
			
 
				               end
			
 
				             check_urls(url_to_scrape, existing_payload)
			
 
				           end
			
--- a/db/migrate/20160108221620_website_agent_does_not_use_event_url.rb
+++ b/db/migrate/20160108221620_website_agent_does_not_use_event_url.rb
@@ -0,0 +1,22 @@
 
				+class WebsiteAgentDoesNotUseEventUrl < ActiveRecord::Migration
			
 
				+  def up
			
 
				+    # Until this migration, if a WebsiteAgent received Events and did not have a `url_from_event` option set,
			
 
				+    # it would use the `url` from the Event's payload.  If the Event did not have a `url` in its payload, the
			
 
				+    # WebsiteAgent would do nothing. This migration assumes that if someone has wired a WebsiteAgent to receive Events
			
 
				+    # and has not set `url_from_event` or `data_from_event`, they were trying to use the Event's `url` payload, so we
			
 
				+    # set `url_from_event` to `{{ url }}` for them.
			
 
				+    Agents::WebsiteAgent.find_each do |agent|
			
 
				+      next unless agent.sources.count > 0
			
 
				+
			
 
				+      if !agent.options['data_from_event'].present? && !agent.options['url_from_event'].present?
			
 
				+        agent.options['url_from_event'] = '{{ url }}'
			
 
				+        agent.save!
			
 
				+        puts ">> Setting `url_from_event` on WebsiteAgent##{agent.id} to {{ url }} because it is wired"
			
 
				+        puts ">> to receive Events, and the WebsiteAgent no longer uses the Event's `url` value directly."
			
 
				+      end
			
 
				+    end
			
 
				+  end
			
 
				+
			
 
				+  def down
			
 
				+  end
			
 
				+end
			
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -768,20 +768,13 @@ fire: hot
 
				           @event = Event.new
			
 
				           @event.agent = agents(:bob_rain_notifier_agent)
			
 
				           @event.payload = {
			
 
				-            'url' => 'http://xkcd.com',
			
 
				+            'url' => 'http://foo.com',
			
 
				             'link' => 'Random'
			
 
				           }
			
 
				         end
			
 
				 
			
 
				-        it "should scrape from the url element in incoming event payload" do
			
 
				-          expect {
			
 
				-            @checker.options = @valid_options
			
 
				-            @checker.receive([@event])
			
 
				-          }.to change { Event.count }.by(1)
			
 
				-        end
			
 
				-
			
 
				-        it "should use url_from_event as url to scrape if it exists when receiving an event" do
			
 
				-          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
			
 
				+        it "should use url_from_event as the url to scrape" do
			
 
				+          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
			
 
				 
			
 
				           @checker.options = @valid_options.merge(
			
 
				             'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
			
@@ -791,9 +784,16 @@ fire: hot
 
				           expect(stub).to have_been_requested
			
 
				         end
			
 
				 
			
 
				+        it "should use the Agent's `url` option if url_from_event is not set" do
			
 
				+          expect {
			
 
				+            @checker.options = @valid_options
			
 
				+            @checker.receive([@event])
			
 
				+          }.to change { Event.count }.by(1)
			
 
				+        end
			
 
				+
			
 
				         it "should allow url_from_event to be an array of urls" do
			
 
				-          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
			
 
				-          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
			
 
				+          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
			
 
				+          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Ffoo.com')
			
 
				 
			
 
				           @checker.options = @valid_options.merge(
			
 
				             'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
			
@@ -805,7 +805,10 @@ fire: hot
 
				         end
			
 
				 
			
 
				         it "should interpolate values from incoming event payload" do
			
 
				+          stub_request(:any, /foo/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), status: 200)
			
 
				+
			
 
				           expect {
			
 
				+            @valid_options['url_from_event'] = '{{ url }}'
			
 
				             @valid_options['extract'] = {
			
 
				               'from' => {
			
 
				                 'xpath' => '*[1]',
			
@@ -821,7 +824,7 @@ fire: hot
 
				           }.to change { Event.count }.by(1)
			
 
				 
			
 
				           expect(Event.last.payload).to eq({
			
 
				-            'from' => 'http://xkcd.com',
			
 
				+            'from' => 'http://foo.com',
			
 
				             'to' => 'http://dynamic.xkcd.com/random/comic/',
			
 
				           })
			
 
				         end
			
@@ -1075,7 +1078,6 @@ fire: hot
 
				         event = @events[6]
			
 
				         expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
			
 
				       end
			
 
				-
			
 
				     end
			
 
				   end
			
 
				 end