8 ani în urmă · 7ed40a6901
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -42,12 +42,12 @@ module Agents
 
				           "extract": {
			
 
				             "url": { "css": "#comic img", "value": "@src" },
			
 
				             "title": { "css": "#comic img", "value": "@title" },
			
 
				-            "body_text": { "css": "div.main", "value": ".//text()" }
			
 
				+            "body_text": { "css": "div.main", "value": "string(.)" }
			
 
				           }
			
 
				 
			
 
				-      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `.//text()` extracts all the enclosed text. To extract the innerHTML, use `./node()`; and to extract the outer HTML, use  `.`.
			
 
				+      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated, without entity escaping (e.g. `&` is returned as-is rather than as `&amp;`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`.
			
 
				 
			
 
				-      You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from formatted numbers, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
			
 
				+      You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from formatted numbers, etc.  Instead of passing `string(.)` to these functions, you can just pass `.`, as in `normalize-space(.)` and `translate(., ',', '')`.
			
 
				 
			
 
				       Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless the top-level option `use_namespaces` is set to `true`.
			
 
				 
			
--- a/data/default_scenario.json
+++ b/data/default_scenario.json
@@ -104,11 +104,11 @@
 
				         "extract": {
			
 
				           "title": {
			
 
				             "css": "item title",
			
 
				-            "value": ".//text()"
			
 
				+            "value": "string(.)"
			
 
				           },
			
 
				           "url": {
			
 
				             "css": "item link",
			
 
				-            "value": ".//text()"
			
 
				+            "value": "string(.)"
			
 
				           }
			
 
				         }
			
 
				       },
			
--- a/db/migrate/20140723110551_adopt_xpath_in_website_agent.rb
+++ b/db/migrate/20140723110551_adopt_xpath_in_website_agent.rb
@@ -15,7 +15,7 @@ class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
 
				       agent.options['extract'].each { |name, extraction|
			
 
				         case
			
 
				         when extraction.delete('text')
			
 
				-          extraction['value'] = './/text()'
			
 
				+          extraction['value'] = 'string(.)'
			
 
				         when attr = extraction.delete('attr')
			
 
				           extraction['value'] = "@#{attr}"
			
 
				         end
			
--- a/spec/data_fixtures/xkcd.html
+++ b/spec/data_fixtures/xkcd.html
@@ -27,7 +27,7 @@
 
				     <div id="topRight">
			
 
				         <div id="masthead">
			
 
				             <span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
			
 
				-            <span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
			
 
				+            <span id="slogan">A webcomic of romance,<br/> sarcasm, math, &amp; language.</span>
			
 
				         </div>
			
 
				         <div id="news">
			
 
				             <a href="http://store.xkcd.com/"><img src="//imgs.xkcd.com/store_news/xkcd_store_rip_43.png" title="the xkcd store" alt="the xkcd store" /></a>
			
--- a/spec/fixtures/agents.yml
+++ b/spec/fixtures/agents.yml
@@ -10,8 +10,8 @@ jane_website_agent:
 
				                  :expected_update_period_in_days => 2,
			
 
				                  :mode => :on_change,
			
 
				                  :extract => {
			
 
				-                     :title => {:css => "item title", :value => './/text()'},
			
 
				-                     :url => {:css => "item link", :value => './/text()'}
			
 
				+                     :title => {:css => "item title", :value => 'string(.)'},
			
 
				+                     :url => {:css => "item link", :value => 'string(.)'}
			
 
				                  }
			
 
				                }.to_json.inspect %>
			
 
				 
			
--- a/spec/models/agent_spec.rb
+++ b/spec/models/agent_spec.rb
@@ -947,7 +947,7 @@ describe AgentDrop do
 
				         mode: 'on_change',
			
 
				         extract: {
			
 
				           url: { css: '[id^=strip_enlarged_] img', value: '@src' },
			
 
				-          title: { css: '.STR_DateStrip', value: './/text()' },
			
 
				+          title: { css: '.STR_DateStrip', value: 'string(.)' },
			
 
				         },
			
 
				       },
			
 
				       schedule: 'every_12h',
			
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -507,7 +507,7 @@ describe Agents::WebsiteAgent do
 
				         expect(event.payload['num_links']).to eq("9")
			
 
				       end
			
 
				 
			
 
				-      it "should return all texts concatenated if XPath returns many text nodes" do
			
 
				+      it "should return everything concatenated if XPath returns many nodes" do
			
 
				         rel_site = {
			
 
				           'name' => "XKCD",
			
 
				           'expected_update_period_in_days' => 2,
			
@@ -523,7 +523,26 @@ describe Agents::WebsiteAgent do
 
				         rel.save!
			
 
				         rel.check
			
 
				         event = Event.last
			
 
				-        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, and language.")
			
 
				+        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, &amp; language.")
			
 
				+      end
			
 
				+
			
 
				+      it "should return a string value returned by XPath" do
			
 
				+        rel_site = {
			
 
				+          'name' => "XKCD",
			
 
				+          'expected_update_period_in_days' => 2,
			
 
				+          'type' => "html",
			
 
				+          'url' => "http://xkcd.com",
			
 
				+          'mode' => "on_change",
			
 
				+          'extract' => {
			
 
				+            'slogan' => {'css' => "#slogan", 'value' => "string(.)"}
			
 
				+          }
			
 
				+        }
			
 
				+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
			
 
				+        rel.user = users(:bob)
			
 
				+        rel.save!
			
 
				+        rel.check
			
 
				+        event = Event.last
			
 
				+        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, & language.")
			
 
				       end
			
 
				 
			
 
				       it "should interpolate _response_" do
			
@@ -653,9 +672,9 @@ describe Agents::WebsiteAgent do
 
				             'url' => 'http://example.com/cdata_rss.atom',
			
 
				             'mode' => 'on_change',
			
 
				             'extract' => {
			
 
				-              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => './/text()'},
			
 
				-              'title' => { 'xpath' => '/feed/entry/title', 'value' => './/text()' },
			
 
				-              'content' => { 'xpath' => '/feed/entry/content', 'value' => './/text()' },
			
 
				+              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => 'string(.)'},
			
 
				+              'title' => { 'xpath' => '/feed/entry/title', 'value' => 'string(.)' },
			
 
				+              'content' => { 'xpath' => '/feed/entry/content', 'value' => 'string(.)' },
			
 
				             }
			
 
				           }, keep_events_for: 2.days)
			
 
				           @checker.user = users(:bob)
			
@@ -1001,8 +1020,8 @@ fire: hot
 
				               'type' => 'html',
			
 
				               'data_from_event' => '{{ some_object.some_data }}',
			
 
				               'extract' => {
			
 
				-                'title' => { 'css' => ".title", 'value' => ".//text()" },
			
 
				-                'body' => { 'css' => "div span.body", 'value' => ".//text()" }
			
 
				+                'title' => { 'css' => ".title", 'value' => "string(.)" },
			
 
				+                'body' => { 'css' => "div span.body", 'value' => "string(.)" }
			
 
				               }
			
 
				             )
			
 
				           end