Sfoglia il codice sorgente

Fix a double-decoding problem in RssAgent

The SAX parser Feedjira uses (Nokogiri::XML::SAX) tries to detect the
encoding of a document from its content even if the encoding is already
known and given.  As a result, content is decoded twice, once by
WebRequestConcern and again by the SAX parser, if its encoding is
declared in both the Content-Type header and the XML declaration.

This commit makes RssAgent remove the `encoding` attribute from the
XML declaration of a document if the encoding is already known from
the Content-Type header.

Fixes #1797.
Akinori MUSHA 8 anni fa
parent
commit
0b3700999b

+ 15 - 1
app/models/agents/rss_agent.rb

@@ -132,7 +132,7 @@ module Agents
         begin
           response = faraday.get(url)
           if response.success?
-            feed = Feedjira::Feed.parse(response.body)
+            feed = Feedjira::Feed.parse(preprocessed_body(response))
             new_events.concat feed_to_events(feed)
           else
             error "Failed to fetch #{url}: #{response.inspect}"
@@ -170,6 +170,20 @@ module Agents
       require 'feedjira_extension'
     end
 
+    def preprocessed_body(response)
+      body = response.body
+      case body.encoding
+      when Encoding::ASCII_8BIT
+        # Encoding is unknown from the Content-Type, so let the SAX
+        # parser detect it from the content.
+      else
+        # Encoding is already known, so do not let the parser detect
+        # it from the XML declaration in the content.
+        body.sub!(/(<\?xml(?:\s+\w+\s*=\s*(['"]).*?\2)*)\s+encoding\s*=\s*(['"]).*?\3/, '\\1')
+      end
+      body
+    end
+
     def feed_data(feed)
       type =
         case feed.class.name

+ 13 - 0
spec/data_fixtures/iso-8859-1.rss

@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="iso-8859-1" ?>
+<rss xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
+    <channel>
+        <title>Zeuhl</title>
+        <link>http://example.net/</link>
+        <item>
+            <title>Mëkanïk Zaïn</title>
+            <link>http://example.net/post/1</link>
+            <guid>http://example.net/post/1</guid>
+            <pubDate>Mon, 21 Nov 2016 17:00:10 +0100</pubDate>
+        </item>
+    </channel>
+</rss>

+ 13 - 0
spec/models/agents/rss_agent_spec.rb

@@ -12,6 +12,7 @@ describe Agents::RssAgent do
     stub_request(:any, /SlickdealsnetFP/).to_return(:body => File.read(Rails.root.join("spec/data_fixtures/slickdeals.atom")), :status => 200)
     stub_request(:any, /onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")), status: 200)
     stub_request(:any, /bad.onethingwell.org/).to_return(body: File.read(Rails.root.join("spec/data_fixtures/onethingwell.rss")).gsub(/(?<=<link>)[^<]*/, ''), status: 200)
+    stub_request(:any, /iso-8859-1/).to_return(body: File.binread(Rails.root.join("spec/data_fixtures/iso-8859-1.rss")), headers: { 'Content-Type' => 'application/rss+xml; charset=ISO-8859-1' }, status: 200)
   end
 
   let(:agent) do
@@ -283,6 +284,18 @@ describe Agents::RssAgent do
         expect(event.payload['links']).to eq([])
       end
     end
+
+    context 'with the encoding declared in both headers and the content' do
+      before do
+        @valid_options['url'] = 'http://example.org/iso-8859-1.rss'
+      end
+
+      it "decodes the content properly" do
+        agent.check
+        event = agent.events.first
+        expect(event.payload['title']).to eq('Mëkanïk Zaïn')
+      end
+    end
   end
 
   describe 'logging errors with the feed url' do