Implement CGI.url_encode and CGI.url_decode

byroot · byroot · commit c2729c7f333d · 2022-08-16T10:48:34.000+02:00
[Feature #18822]

Ruby is somewhat missing an RFC 3986 compliant escape method.
diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c
@@ -200,7 +200,7 @@ url_unreserved_char(unsigned char c)
 }
 
 static VALUE
-optimized_escape(VALUE str)
+optimized_escape(VALUE str, int plus_escape)
 {
     long i, len, beg = 0;
     VALUE dest = 0;
@@ -220,7 +220,7 @@ optimized_escape(VALUE str)
             rb_str_cat(dest, cstr + beg, i - beg);
             beg = i + 1;
 
-            if (c == ' ') {
+            if (plus_escape && c == ' ') {
                 rb_str_cat_cstr(dest, "+");
             }
             else {
@@ -242,7 +242,7 @@ optimized_escape(VALUE str)
 }
 
 static VALUE
-optimized_unescape(VALUE str, VALUE encoding)
+optimized_unescape(VALUE str, VALUE encoding, int unescape_plus)
 {
     long i, len, beg = 0;
     VALUE dest = 0;
@@ -265,7 +265,7 @@ optimized_unescape(VALUE str, VALUE encoding)
                       | char_to_number(cstr[i+2]));
             clen = 2;
         }
-        else if (c == '+') {
+        else if (unescape_plus && c == '+') {
             buf[0] = ' ';
         }
         else {
@@ -348,7 +348,7 @@ cgiesc_unescape_html(VALUE self, VALUE str)
  *  call-seq:
  *     CGI.escape(string) -> string
  *
- *  Returns URL-escaped string.
+ *  Returns URL-escaped string (+application/x-www-form-urlencoded+).
  *
  */
 static VALUE
@@ -357,7 +357,7 @@ cgiesc_escape(VALUE self, VALUE str)
     StringValue(str);
 
     if (rb_enc_str_asciicompat_p(str)) {
-        return optimized_escape(str);
+        return optimized_escape(str, 1);
     }
     else {
         return rb_call_super(1, &str);
@@ -376,7 +376,7 @@ accept_charset(int argc, VALUE *argv, VALUE self)
  *  call-seq:
  *     CGI.unescape(string, encoding=@@accept_charset) -> string
  *
- *  Returns URL-unescaped string.
+ *  Returns URL-unescaped string (+application/x-www-form-urlencoded+).
  *
  */
 static VALUE
@@ -388,7 +388,50 @@ cgiesc_unescape(int argc, VALUE *argv, VALUE self)
 
     if (rb_enc_str_asciicompat_p(str)) {
         VALUE enc = accept_charset(argc-1, argv+1, self);
-        return optimized_unescape(str, enc);
+        return optimized_unescape(str, enc, 1);
+    }
+    else {
+        return rb_call_super(argc, argv);
+    }
+}
+
+/*
+ *  call-seq:
+ *     CGI.escapeURIComponent(string) -> string
+ *
+ *  Returns URL-escaped string following RFC 3986.
+ *
+ */
+static VALUE
+cgiesc_escape_uri_component(VALUE self, VALUE str)
+{
+    StringValue(str);
+
+    if (rb_enc_str_asciicompat_p(str)) {
+        return optimized_escape(str, 0);
+    }
+    else {
+        return rb_call_super(1, &str);
+    }
+}
+
+/*
+ *  call-seq:
+ *     CGI.unescapeURIComponent(string, encoding=@@accept_charset) -> string
+ *
+ *  Returns URL-unescaped string following RFC 3986.
+ *
+ */
+static VALUE
+cgiesc_unescape_uri_component(int argc, VALUE *argv, VALUE self)
+{
+    VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
+
+    StringValue(str);
+
+    if (rb_enc_str_asciicompat_p(str)) {
+        VALUE enc = accept_charset(argc-1, argv+1, self);
+        return optimized_unescape(str, enc, 0);
     }
     else {
         return rb_call_super(argc, argv);
@@ -414,6 +457,8 @@ InitVM_escape(void)
     rb_mUtil   = rb_define_module_under(rb_cCGI, "Util");
     rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
     rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
+    rb_define_method(rb_mEscape, "escapeURIComponent", cgiesc_escape_uri_component, 1);
+    rb_define_method(rb_mEscape, "unescapeURIComponent", cgiesc_unescape_uri_component, -1);
     rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
     rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
     rb_prepend_module(rb_mUtil, rb_mEscape);
diff --git a/lib/cgi/util.rb b/lib/cgi/util.rb
@@ -5,24 +5,57 @@ module Util; end
   extend Util
 end
 module CGI::Util
-  @@accept_charset="UTF-8" unless defined?(@@accept_charset)
-  # URL-encode a string.
+  @@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset)
+
+  # URL-encode a string into application/x-www-form-urlencoded.
+  # Space characters (+" "+) are encoded with plus signs (+"+"+)
   #   url_encoded_string = CGI.escape("'Stop!' said Fred")
   #      # => "%27Stop%21%27+said+Fred"
   def escape(string)
     encoding = string.encoding
-    string.b.gsub(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
+    buffer = string.b
+    buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
       '%' + m.unpack('H2' * m.bytesize).join('%').upcase
-    end.tr(' ', '+').force_encoding(encoding)
+    end
+    buffer.tr!(' ', '+')
+    buffer.force_encoding(encoding)
   end
 
-  # URL-decode a string with encoding(optional).
+  # URL-decode an application/x-www-form-urlencoded string with encoding(optional).
   #   string = CGI.unescape("%27Stop%21%27+said+Fred")
   #      # => "'Stop!' said Fred"
-  def unescape(string,encoding=@@accept_charset)
-    str=string.tr('+', ' ').b.gsub(/((?:%[0-9a-fA-F]{2})+)/) do |m|
+  def unescape(string, encoding = @@accept_charset)
+    str = string.tr('+', ' ')
+    str = str.b
+    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
+      [m.delete('%')].pack('H*')
+    end
+    str.force_encoding(encoding)
+    str.valid_encoding? ? str : str.force_encoding(string.encoding)
+  end
+
+  # URL-encode a string following RFC 3986
+  # Space characters (+" "+) are encoded with (+"%20"+)
+  #   url_encoded_string = CGI.escape("'Stop!' said Fred")
+  #      # => "%27Stop%21%27%20said%20Fred"
+  def escapeURIComponent(string)
+    encoding = string.encoding
+    buffer = string.b
+    buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m|
+      '%' + m.unpack('H2' * m.bytesize).join('%').upcase
+    end
+    buffer.force_encoding(encoding)
+  end
+
+  # URL-decode a string following RFC 3986 with encoding(optional).
+  #   string = CGI.unescape("%27Stop%21%27+said%20Fred")
+  #      # => "'Stop!'+said Fred"
+  def unescapeURIComponent(string, encoding = @@accept_charset)
+    str = string.b
+    str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
       [m.delete('%')].pack('H*')
-    end.force_encoding(encoding)
+    end
+    str.force_encoding(encoding)
     str.valid_encoding? ? str : str.force_encoding(string.encoding)
   end
 
diff --git a/test/cgi/test_cgi_util.rb b/test/cgi/test_cgi_util.rb
@@ -23,7 +23,6 @@ def teardown
     ENV.update(@environ)
   end
 
-
   def test_cgi_escape
     assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escape(@str1))
     assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escape(@str1).ascii_only?) if defined?(::Encoding)
@@ -70,6 +69,54 @@ def test_cgi_unescape_accept_charset
     end;
   end
 
+  def test_cgi_escapeURIComponent
+    assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escapeURIComponent(@str1))
+    assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escapeURIComponent(@str1).ascii_only?) if defined?(::Encoding)
+  end
+
+  def test_cgi_escapeURIComponent_with_unreserved_characters
+    assert_equal("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~",
+                 CGI.escapeURIComponent("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"),
+                 "should not encode any unreserved characters, as per RFC3986 Section 2.3")
+  end
+
+  def test_cgi_escapeURIComponent_with_invalid_byte_sequence
+    assert_equal('%C0%3C%3C', CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8")))
+  end
+
+  def test_cgi_escapeURIComponent_preserve_encoding
+    assert_equal(Encoding::US_ASCII, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("US-ASCII")).encoding)
+    assert_equal(Encoding::ASCII_8BIT, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("ASCII-8BIT")).encoding)
+    assert_equal(Encoding::UTF_8, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8")).encoding)
+  end
+
+  def test_cgi_unescapeURIComponent
+    str = CGI.unescapeURIComponent('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93')
+    assert_equal(@str1, str)
+    return unless defined?(::Encoding)
+
+    assert_equal("foo+bar", CGI.unescapeURIComponent("foo+bar"))
+
+    assert_equal(@str1.encoding, str.encoding)
+    assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.unescapeURIComponent("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2"))
+  end
+
+  def test_cgi_unescapeURIComponent_preserve_encoding
+    assert_equal(Encoding::US_ASCII, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("US-ASCII")).encoding)
+    assert_equal(Encoding::ASCII_8BIT, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("ASCII-8BIT")).encoding)
+    assert_equal(Encoding::UTF_8, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("UTF-8")).encoding)
+  end
+
+  def test_cgi_unescapeURIComponent_accept_charset
+    return unless defined?(::Encoding)
+
+    assert_raise(TypeError) {CGI.unescapeURIComponent('', nil)}
+    assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}")
+    begin;
+      assert_equal("", CGI.unescapeURIComponent(''))
+    end;
+  end
+
   def test_cgi_pretty
     assert_equal("<HTML>\n  <BODY>\n  </BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>"))
     assert_equal("<HTML>\n\t<BODY>\n\t</BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>","\t"))