Browse Source

stripping cdata from the value

oroce 9 years ago
parent
commit
8afca6fed0

+ 10 - 1
app/models/agents/website_agent.rb

@@ -46,6 +46,8 @@ module Agents
 
       Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document unless a toplevel option `use_namespaces` is set to true.
 
+      If the extracted value is wrapped in a CDATA section (i.e. `<![CDATA[content]]>`), set `strip_cdata` to true to get only the `content` part.
+
       # Scraping JSON
 
       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
@@ -439,7 +441,14 @@ module Agents
         case nodes
         when Nokogiri::XML::NodeSet
           result = nodes.map { |node|
-            case value = node.xpath(extraction_details['value'] || '.')
+            value = node.xpath(extraction_details['value'] || '.')
+            if extraction_details['strip_cdata']
+              child = value.first
+              if child.cdata?
+                value = child.text
+              end
+            end
+            case value
             when Float
               # Node#xpath() returns any numeric value as float;
               # convert it to integer as appropriate.

File diff suppressed because it is too large
+ 92 - 0
spec/data_fixtures/cdata_rss.atom


+ 35 - 0
spec/models/agents/website_agent_spec.rb

@@ -529,6 +529,41 @@ describe Agents::WebsiteAgent do
         end
       end
 
+      describe "XML with cdata" do
+        before do
+          stub_request(:any, /cdata_rss/).to_return(
+            body: File.read(Rails.root.join("spec/data_fixtures/cdata_rss.atom")),
+            status: 200
+          )
+
+          @checker = Agents::WebsiteAgent.new(name: 'cdata', options: {
+            'name' => 'CDATA',
+            'expected_update_period_in_days' => '2',
+            'type' => 'xml',
+            'url' => 'http://example.com/cdata_rss.atom',
+            'mode' => 'on_change',
+            'extract' => {
+              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => './/text()', 'strip_cdata' => true },
+              'title' => { 'xpath' => '/feed/entry/title', 'value' => './/text()', 'strip_cdata' => false },
+              'content' => { 'xpath' => '/feed/entry/content', 'value' => './/text()' },
+            }
+          }, keep_events_for: 2.days)
+          @checker.user = users(:bob)
+          @checker.save!
+        end
+
+        it "works with XPath" do
+          expect {
+            @checker.check
+          }.to change { Event.count }.by(10)
+          event = Event.last
+          expect(event.payload['author']).to eq('bill98')
+          expect(event.payload['title']).to eq('<![CDATA[Help: Rainmeter Skins • Test if Today is Between 2 Dates]]>')
+          expect(event.payload['content']).to start_with('<![CDATA[Can I ')
+        end
+
+      end
+
       describe "JSON" do
         it "works with paths" do
           json = {

Some files were not shown because too many files changed in this diff