Browse Source

Improve Utils.normalize_uri (#1719)

* Improve Utils.normalize_uri

Globally replacing generally unsafe characters in a URL would not fix
invalid authorities and paths, so use Addressable::URI to normalize them
when necessary.

This should fix #1701.

* Remove an unused function

* Fix the test case to make sure an IPv6 address is supported
Akinori MUSHA cách đây 8 năm
mục cha
commit
7e79d576b5
3 tập tin đã thay đổi với 33 bổ sung và 8 xóa
  1. 24 5
      lib/utils.rb
  2. 2 1
      spec/data_fixtures/urlTest.html
  3. 7 2
      spec/models/agents/website_agent_spec.rb

+ 24 - 5
lib/utils.rb

@@ -1,5 +1,6 @@
 require 'jsonpath'
 require 'cgi'
+require 'addressable/uri'
 
 module Utils
   def self.unindent(s)
@@ -25,11 +26,29 @@ module Utils
     begin
       URI(uri)
     rescue URI::Error
-      URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
-            unsafe.bytes.each_with_object(String.new) { |uc, s|
-              s << sprintf('%%%02X', uc)
-            }
-          }.force_encoding(Encoding::US_ASCII))
+      begin
+        URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
+              unsafe.bytes.each_with_object(String.new) { |uc, s|
+                s << sprintf('%%%02X', uc)
+              }
+            }.force_encoding(Encoding::US_ASCII))
+      rescue URI::Error => e
+        begin
+          auri = Addressable::URI.parse(uri.to_s)
+        rescue
+          # Do not leak Addressable::URI::InvalidURIError which
+          # callers might not expect.
+          raise e
+        else
+          # Addressable::URI#normalize! modifies the query and
+          # fragment components beyond escaping unsafe characters, so
+          # avoid using it.  Otherwise `?a[]=%2F` would be normalized
+          # as `?a%5B%5D=/`, for example.
+          auri.site = auri.normalized_site
+          auri.path = auri.normalized_path
+          URI(auri.to_s)
+        end
+      end
     end
   end
 

+ 2 - 1
spec/data_fixtures/urlTest.html

@@ -12,6 +12,7 @@
             <li><a href="https://www.google.ca/search?q=위키백과:대문">unicode param</a></li>
             <li><a href="http://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded url</a></li>
             <li><a href="https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8">percent encoded param</a></li>
+            <li><a href="http://[::1]/path[]?query[]=foo">brackets</a></li>
         </ul>
     </body>
-</html>
+</html>

+ 7 - 2
spec/models/agents/website_agent_spec.rb

@@ -1105,8 +1105,8 @@ fire: hot
 
     describe "#check" do
       before do
-        expect { @checker.check }.to change { Event.count }.by(7)
-        @events = Event.last(7)
+        expect { @checker.check }.to change { Event.count }.by(8)
+        @events = Event.last(8)
       end
 
       it "should check hostname" do
@@ -1143,6 +1143,11 @@ fire: hot
         event = @events[6]
         expect(event.payload['url']).to eq("https://www.google.ca/search?q=%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8")
       end
+
+      it "should check url with unescaped brackets in the path component" do
+        event = @events[7]
+        expect(event.payload['url']).to eq("http://[::1]/path%5B%5D?query[]=foo")
+      end
     end
   end
 end