Browse Source

Add a migration to set url_from_event and remove direct usage of Event `url` payload value

Andrew Cantino 9 years ago
parent
commit
3469ed2a5e

+ 5 - 6
app/models/agents/website_agent.rb

@@ -18,14 +18,13 @@ module Agents
 
       Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all`, `on_change`, or `merge` (if fetching based on an Event, see below).
 
-      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape)
+      The `url` option can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).
 
       The WebsiteAgent can also scrape based on incoming events.
 
-      * If the Event contains a `url` key, that URL will be fetched.
-      * For more control, you can set the `url_from_event` option and it will be used as a Liquid template to generate the url to access based on the Event.
-      * If you set `data_from_event` to a Liquid template, it will be used to generate the data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
-      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with the new values.
+      * Set the `url_from_event` option to a Liquid template to generate the url to access based on the Event.  (To fetch the url in the Event's `url` key, for example, set `url_from_event` to `{{ url }}`.)
+      * Alternatively, set `data_from_event` to a Liquid template to use data directly without fetching any URL.  (For example, set it to `{{ html }}` to use HTML contained in the `html` key of the incoming Event.)
+      * If you specify `merge` for the `mode` option, Huginn will retain the old payload and update it with new values.
 
       # Supported Document Types
 
@@ -343,7 +342,7 @@ module Agents
               if url_template = options['url_from_event'].presence
                 interpolate_options(url_template)
               else
-                event.payload['url'].presence || interpolated['url']
+                interpolated['url']
               end
             check_urls(url_to_scrape, existing_payload)
           end

+ 22 - 0
db/migrate/20160108221620_website_agent_does_not_use_event_url.rb

@@ -0,0 +1,22 @@
+class WebsiteAgentDoesNotUseEventUrl < ActiveRecord::Migration
+  def up
+    # Until this migration, if a WebsiteAgent received Events and did not have a `url_from_event` option set,
+    # it would use the `url` from the Event's payload.  If the Event did not have a `url` in its payload, the
+    # WebsiteAgent would do nothing. This migration assumes that if someone has wired a WebsiteAgent to receive Events
+    # and has not set `url_from_event` or `data_from_event`, they were trying to use the Event's `url` payload, so we
+    # set `url_from_event` to `{{ url }}` for them.
+    Agents::WebsiteAgent.find_each do |agent|
+      next unless agent.sources.count > 0
+
+      if !agent.options['data_from_event'].present? && !agent.options['url_from_event'].present?
+        agent.options['url_from_event'] = '{{ url }}'
+        agent.save!
+        puts ">> Setting `url_from_event` on WebsiteAgent##{agent.id} to {{ url }} because it is wired"
+        puts ">> to receive Events, and the WebsiteAgent no longer uses the Event's `url` value directly."
+      end
+    end
+  end
+
+  def down
+  end
+end

+ 16 - 14
spec/models/agents/website_agent_spec.rb

@@ -768,20 +768,13 @@ fire: hot
           @event = Event.new
           @event.agent = agents(:bob_rain_notifier_agent)
           @event.payload = {
-            'url' => 'http://xkcd.com',
+            'url' => 'http://foo.com',
             'link' => 'Random'
           }
         end
 
-        it "should scrape from the url element in incoming event payload" do
-          expect {
-            @checker.options = @valid_options
-            @checker.receive([@event])
-          }.to change { Event.count }.by(1)
-        end
-
-        it "should use url_from_event as url to scrape if it exists when receiving an event" do
-          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
+        it "should use url_from_event as the url to scrape" do
+          stub = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
 
           @checker.options = @valid_options.merge(
             'url_from_event' => 'http://example.org/?url={{url | uri_escape}}'
@@ -791,9 +784,16 @@ fire: hot
           expect(stub).to have_been_requested
         end
 
+        it "should use the Agent's `url` option if url_from_event is not set" do
+          expect {
+            @checker.options = @valid_options
+            @checker.receive([@event])
+          }.to change { Event.count }.by(1)
+        end
+
         it "should allow url_from_event to be an array of urls" do
-          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Fxkcd.com')
-          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Fxkcd.com')
+          stub1 = stub_request(:any, 'http://example.org/?url=http%3A%2F%2Ffoo.com')
+          stub2 = stub_request(:any, 'http://google.org/?url=http%3A%2F%2Ffoo.com')
 
           @checker.options = @valid_options.merge(
             'url_from_event' => ['http://example.org/?url={{url | uri_escape}}', 'http://google.org/?url={{url | uri_escape}}']
@@ -805,7 +805,10 @@ fire: hot
         end
 
         it "should interpolate values from incoming event payload" do
+          stub_request(:any, /foo/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/xkcd.html")), status: 200)
+
           expect {
+            @valid_options['url_from_event'] = '{{ url }}'
             @valid_options['extract'] = {
               'from' => {
                 'xpath' => '*[1]',
@@ -821,7 +824,7 @@ fire: hot
           }.to change { Event.count }.by(1)
 
           expect(Event.last.payload).to eq({
-            'from' => 'http://xkcd.com',
+            'from' => 'http://foo.com',
             'to' => 'http://dynamic.xkcd.com/random/comic/',
           })
         end
@@ -1075,7 +1078,6 @@ fire: hot
         event = @events[6]
         expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
       end
-
     end
   end
 end