Browse Source

Improve Utils.normalize_uri()

- Always try Addressable::URI.parse() when URI.parse() fails.  This
  allows handling unencoded IDN hostnames properly as a side effect.

- Do not double encode the percent sign even if a URI contains unsafe
  characters.
Akinori MUSHA 5 years ago
parent
commit
a7497c1a67
2 changed files with 39 additions and 25 deletions
  1. 31 25
      lib/utils.rb
  2. 8 0
      spec/lib/utils_spec.rb

+ 31 - 25
lib/utils.rb

@@ -1,5 +1,6 @@
 require 'jsonpath'
 require 'cgi'
+require 'uri'
 require 'addressable/uri'
 
 module Utils
@@ -22,34 +23,39 @@ module Utils
     end
   end
 
-  def self.normalize_uri(uri)
-    begin
-      URI(uri)
-    rescue URI::Error
+  class << self
+    def normalize_uri(uri)
+      URI.parse(uri)
+    rescue URI::Error => e
       begin
-        URI(uri.to_s.gsub(/[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
-              unsafe.bytes.each_with_object(String.new) { |uc, s|
-                s << sprintf('%%%02X', uc)
-              }
-            }.force_encoding(Encoding::US_ASCII))
-      rescue URI::Error => e
-        begin
-          auri = Addressable::URI.parse(uri.to_s)
-        rescue
-          # Do not leak Addressable::URI::InvalidURIError which
-          # callers might not expect.
-          raise e
-        else
-          # Addressable::URI#normalize! modifies the query and
-          # fragment components beyond escaping unsafe characters, so
-          # avoid using it.  Otherwise `?a[]=%2F` would be normalized
-          # as `?a%5B%5D=/`, for example.
-          auri.site = auri.normalized_site
-          auri.path = auri.normalized_path
-          URI(auri.to_s)
-        end
+        auri = Addressable::URI.parse(uri.to_s)
+      rescue StandardError
+        # Do not leak Addressable::URI::InvalidURIError which
+        # callers might not expect.
+        raise e
+      else
+        # Addressable::URI#normalize! modifies the query and
+        # fragment components beyond escaping unsafe characters, so
+        # avoid using it.  Otherwise `?a[]=%2F` would be normalized
+        # as `?a%5B%5D=/`, for example.
+        auri.site = auri.normalized_site
+        auri.path = auri.normalized_path
+        auri.query &&= escape_uri_unsafe_characters(auri.query)
+        auri.fragment &&= escape_uri_unsafe_characters(auri.fragment)
+
+        URI.parse(auri.to_s)
       end
     end
+
+    private
+
+    def escape_uri_unsafe_characters(string)
+      string.gsub(/(?!%[\h\H]{2})[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]]+/) { |unsafe|
+        unsafe.bytes.each_with_object(String.new) { |uc, s|
+          s << '%%%02X' % uc
+        }
+      }.force_encoding(Encoding::US_ASCII)
+    end
   end
 
   def self.interpolate_jsonpaths(value, data, options = {})

+ 8 - 0
spec/lib/utils_spec.rb

@@ -196,4 +196,12 @@ describe Utils do
       expect(Utils.if_present(argument, :to_i)).to eq(1)
     end
   end
+
+  describe ".normalize_uri" do
+    it 'should accept a URI with an IDN hostname, malformed path, query, and fragment parts' do
+      uri = Utils.normalize_uri("http://\u{3042}/\u{3042}?a[]=%2F&b=\u{3042}#100%")
+      expect(uri).to be_a(URI::HTTP)
+      expect(uri.to_s).to eq "http://xn--l8j/%E3%81%82?a[]=%2F&b=%E3%81%82#100%25"
+    end
+  end
 end