9 years ago · 8afca6fed0
--- a/app/models/agents/website_agent.rb
+++ b/app/models/agents/website_agent.rb
@@ -46,6 +46,8 @@ module Agents
 
				 
			
 
				       Beware that when parsing an XML document (i.e. `type` is `xml`) using `xpath` expressions all namespaces are stripped from the document unless a toplevel option `use_namespaces` is set to true.
			
 
				 
			
 
				+      If the extracted value is wrapped in a CDATA section (`<![CDATA[content]]>`), set `strip_cdata` to true to get just the `content` part.
			
 
				+
			
 
				       # Scraping JSON
			
 
				 
			
 
				       When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about.  For example:
			
@@ -439,7 +441,14 @@ module Agents
 
				         case nodes
			
 
				         when Nokogiri::XML::NodeSet
			
 
				           result = nodes.map { |node|
			
 
				-            case value = node.xpath(extraction_details['value'] || '.')
			
 
				+            value = node.xpath(extraction_details['value'] || '.')
			
 
				+            if extraction_details['strip_cdata']
			
 
				+              child = value.first
			
 
				+              if child.cdata?
			
 
				+                value = child.text
			
 
				+              end
			
 
				+            end
			
 
				+            case value
			
 
				             when Float
			
 
				               # Node#xpath() returns any numeric value as float;
			
 
				               # convert it to integer as appropriate.
			
--- a/spec/data_fixtures/cdata_rss.atom
+++ b/spec/data_fixtures/cdata_rss.atom
--- a/spec/models/agents/website_agent_spec.rb
+++ b/spec/models/agents/website_agent_spec.rb
@@ -529,6 +529,41 @@ describe Agents::WebsiteAgent do
 
				         end
			
 
				       end
			
 
				 
			
 
				+      describe "XML with cdata" do  # Exercises the per-field 'strip_cdata' extraction option on an XML feed.
			
 
				+        before do
			
 
				+          stub_request(:any, /cdata_rss/).to_return(  # any request whose URL matches /cdata_rss/ is answered with the fixture file
			
 
				+            body: File.read(Rails.root.join("spec/data_fixtures/cdata_rss.atom")),
			
 
				+            status: 200
			
 
				+          )
			
 
				+
			
 
				+          @checker = Agents::WebsiteAgent.new(name: 'cdata', options: {
			
 
				+            'name' => 'CDATA',
			
 
				+            'expected_update_period_in_days' => '2',
			
 
				+            'type' => 'xml',
			
 
				+            'url' => 'http://example.com/cdata_rss.atom',
			
 
				+            'mode' => 'on_change',
			
 
				+            'extract' => {
			
 
				+              'author' => { 'xpath' => '/feed/entry/author/name', 'value' => './/text()', 'strip_cdata' => true },  # strip_cdata on: asserted below to come back as bare text
			
 
				+              'title' => { 'xpath' => '/feed/entry/title', 'value' => './/text()', 'strip_cdata' => false },  # strip_cdata explicitly off: CDATA wrapper asserted to be kept
			
 
				+              'content' => { 'xpath' => '/feed/entry/content', 'value' => './/text()' },  # strip_cdata omitted: asserted to behave like the 'false' case
			
 
				+            }
			
 
				+          }, keep_events_for: 2.days)
			
 
				+          @checker.user = users(:bob)
			
 
				+          @checker.save!
			
 
				+        end
			
 
				+
			
 
				+        it "works with XPath" do
			
 
				+          expect {
			
 
				+            @checker.check
			
 
				+          }.to change { Event.count }.by(10)  # presumably one event per entry in the fixture feed -- confirm against cdata_rss.atom
			
 
				+          event = Event.last  # the assertions below look only at the last Event record
			
 
				+          expect(event.payload['author']).to eq('bill98')  # unwrapped value ('strip_cdata' => true)
			
 
				+          expect(event.payload['title']).to eq('<![CDATA[Help: Rainmeter Skins • Test if Today is Between 2 Dates]]>')  # wrapper kept ('strip_cdata' => false)
			
 
				+          expect(event.payload['content']).to start_with('<![CDATA[Can I ')  # wrapper kept (option not set)
			
 
				+        end
			
 
				+
			
 
				+      end
			
 
				+
			
 
				       describe "JSON" do
			
 
				         it "works with paths" do
			
 
				           json = {