Răsfoiți Sursa

Use the XPath expression `string(.)` instead of `.//text()`

That is the correct way to extract a raw string with all text nodes
concatenated without entity escaping.
Akinori MUSHA 8 ani în urmă
părinte
comite
7ed40a6901

+ 3 - 3
app/models/agents/website_agent.rb

@@ -42,12 +42,12 @@ module Agents
           "extract": {
             "url": { "css": "#comic img", "value": "@src" },
             "title": { "css": "#comic img", "value": "@title" },
-            "body_text": { "css": "div.main", "value": ".//text()" }
+            "body_text": { "css": "div.main", "value": "string(.)" }
           }
 
-      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `.//text()` extracts all the enclosed text. To extract the innerHTML, use `./node()`; and to extract the outer HTML, use  `.`.
+      "@_attr_" is the XPath expression to extract the value of an attribute named _attr_ from a node, and `string(.)` gives a string with all the enclosed text nodes concatenated without entity escaping (so `&` is returned as `&`, not as `&amp;`). To extract the innerHTML, use `./node()`; and to extract the outer HTML, use `.`.
 
-      You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from formatted numbers, etc.  Note that these functions take a string, not a node set, so what you may think would be written as `normalize-space(.//text())` should actually be `normalize-space(.)`.
+      You can also use [XPath functions](http://www.w3.org/TR/xpath/#section-String-Functions) like `normalize-space` to strip and squeeze whitespace, `substring-after` to extract part of a text, and `translate` to remove commas from formatted numbers, etc.  Instead of passing `string(.)` to these functions, you can just pass `.` like `normalize-space(.)` and `translate(., ',', '')`.
 
       Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions, all namespaces are stripped from the document unless the top-level option `use_namespaces` is set to `true`.
 

+ 2 - 2
data/default_scenario.json

@@ -104,11 +104,11 @@
         "extract": {
           "title": {
             "css": "item title",
-            "value": ".//text()"
+            "value": "string(.)"
           },
           "url": {
             "css": "item link",
-            "value": ".//text()"
+            "value": "string(.)"
           }
         }
       },

+ 1 - 1
db/migrate/20140723110551_adopt_xpath_in_website_agent.rb

@@ -15,7 +15,7 @@ class AdoptXpathInWebsiteAgent < ActiveRecord::Migration
       agent.options['extract'].each { |name, extraction|
         case
         when extraction.delete('text')
-          extraction['value'] = './/text()'
+          extraction['value'] = 'string(.)'
         when attr = extraction.delete('attr')
           extraction['value'] = "@#{attr}"
         end

+ 1 - 1
spec/data_fixtures/xkcd.html

@@ -27,7 +27,7 @@
     <div id="topRight">
         <div id="masthead">
             <span><a href="/"><img src="http://imgs.xkcd.com/static/terrible_small_logo.png" alt="xkcd.com logo" height="83" width="185"/></a></span>
-            <span id="slogan">A webcomic of romance,<br/> sarcasm, math, and language.</span>
+            <span id="slogan">A webcomic of romance,<br/> sarcasm, math, &amp; language.</span>
         </div>
         <div id="news">
             <a href="http://store.xkcd.com/"><img src="//imgs.xkcd.com/store_news/xkcd_store_rip_43.png" title="the xkcd store" alt="the xkcd store" /></a>

+ 2 - 2
spec/fixtures/agents.yml

@@ -10,8 +10,8 @@ jane_website_agent:
                  :expected_update_period_in_days => 2,
                  :mode => :on_change,
                  :extract => {
-                     :title => {:css => "item title", :value => './/text()'},
-                     :url => {:css => "item link", :value => './/text()'}
+                     :title => {:css => "item title", :value => 'string(.)'},
+                     :url => {:css => "item link", :value => 'string(.)'}
                  }
                }.to_json.inspect %>
 

+ 1 - 1
spec/models/agent_spec.rb

@@ -947,7 +947,7 @@ describe AgentDrop do
         mode: 'on_change',
         extract: {
           url: { css: '[id^=strip_enlarged_] img', value: '@src' },
-          title: { css: '.STR_DateStrip', value: './/text()' },
+          title: { css: '.STR_DateStrip', value: 'string(.)' },
         },
       },
       schedule: 'every_12h',

+ 26 - 7
spec/models/agents/website_agent_spec.rb

@@ -507,7 +507,7 @@ describe Agents::WebsiteAgent do
         expect(event.payload['num_links']).to eq("9")
       end
 
-      it "should return all texts concatenated if XPath returns many text nodes" do
+      it "should return everything concatenated if XPath returns many nodes" do
         rel_site = {
           'name' => "XKCD",
           'expected_update_period_in_days' => 2,
@@ -523,7 +523,26 @@ describe Agents::WebsiteAgent do
         rel.save!
         rel.check
         event = Event.last
-        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, and language.")
+        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, &amp; language.")
+      end
+
+      it "should return a string value returned by XPath" do
+        rel_site = {
+          'name' => "XKCD",
+          'expected_update_period_in_days' => 2,
+          'type' => "html",
+          'url' => "http://xkcd.com",
+          'mode' => "on_change",
+          'extract' => {
+            'slogan' => {'css' => "#slogan", 'value' => "string(.)"}
+          }
+        }
+        rel = Agents::WebsiteAgent.new(:name => "xkcd", :options => rel_site)
+        rel.user = users(:bob)
+        rel.save!
+        rel.check
+        event = Event.last
+        expect(event.payload['slogan']).to eq("A webcomic of romance, sarcasm, math, & language.")
       end
 
       it "should interpolate _response_" do
@@ -653,9 +672,9 @@ describe Agents::WebsiteAgent do
             'url' => 'http://example.com/cdata_rss.atom',
             'mode' => 'on_change',
             'extract' => {
-              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => './/text()'},
-              'title' => { 'xpath' => '/feed/entry/title', 'value' => './/text()' },
-              'content' => { 'xpath' => '/feed/entry/content', 'value' => './/text()' },
+              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => 'string(.)'},
+              'title' => { 'xpath' => '/feed/entry/title', 'value' => 'string(.)' },
+              'content' => { 'xpath' => '/feed/entry/content', 'value' => 'string(.)' },
             }
           }, keep_events_for: 2.days)
           @checker.user = users(:bob)
@@ -1001,8 +1020,8 @@ fire: hot
               'type' => 'html',
               'data_from_event' => '{{ some_object.some_data }}',
               'extract' => {
-                'title' => { 'css' => ".title", 'value' => ".//text()" },
-                'body' => { 'css' => "div span.body", 'value' => ".//text()" }
+                'title' => { 'css' => ".title", 'value' => "string(.)" },
+                'body' => { 'css' => "div span.body", 'value' => "string(.)" }
               }
             )
           end