Skip to content

Commit c2729c7

Browse files
committed
Implement CGI.url_encode and CGI.url_decode (added in this diff as CGI.escapeURIComponent and CGI.unescapeURIComponent)
[Feature #18822] Ruby is somewhat missing an RFC 3986 compliant escape method.
1 parent 4bb04e9 commit c2729c7

File tree

3 files changed

+142
-17
lines changed

3 files changed

+142
-17
lines changed

ext/cgi/escape/escape.c

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ url_unreserved_char(unsigned char c)
200200
}
201201

202202
static VALUE
203-
optimized_escape(VALUE str)
203+
optimized_escape(VALUE str, int plus_escape)
204204
{
205205
long i, len, beg = 0;
206206
VALUE dest = 0;
@@ -220,7 +220,7 @@ optimized_escape(VALUE str)
220220
rb_str_cat(dest, cstr + beg, i - beg);
221221
beg = i + 1;
222222

223-
if (c == ' ') {
223+
if (plus_escape && c == ' ') {
224224
rb_str_cat_cstr(dest, "+");
225225
}
226226
else {
@@ -242,7 +242,7 @@ optimized_escape(VALUE str)
242242
}
243243

244244
static VALUE
245-
optimized_unescape(VALUE str, VALUE encoding)
245+
optimized_unescape(VALUE str, VALUE encoding, int unescape_plus)
246246
{
247247
long i, len, beg = 0;
248248
VALUE dest = 0;
@@ -265,7 +265,7 @@ optimized_unescape(VALUE str, VALUE encoding)
265265
| char_to_number(cstr[i+2]));
266266
clen = 2;
267267
}
268-
else if (c == '+') {
268+
else if (unescape_plus && c == '+') {
269269
buf[0] = ' ';
270270
}
271271
else {
@@ -348,7 +348,7 @@ cgiesc_unescape_html(VALUE self, VALUE str)
348348
* call-seq:
349349
* CGI.escape(string) -> string
350350
*
351-
* Returns URL-escaped string.
351+
* Returns URL-escaped string (+application/x-www-form-urlencoded+).
352352
*
353353
*/
354354
static VALUE
@@ -357,7 +357,7 @@ cgiesc_escape(VALUE self, VALUE str)
357357
StringValue(str);
358358

359359
if (rb_enc_str_asciicompat_p(str)) {
360-
return optimized_escape(str);
360+
return optimized_escape(str, 1);
361361
}
362362
else {
363363
return rb_call_super(1, &str);
@@ -376,7 +376,7 @@ accept_charset(int argc, VALUE *argv, VALUE self)
376376
* call-seq:
377377
* CGI.unescape(string, encoding=@@accept_charset) -> string
378378
*
379-
* Returns URL-unescaped string.
379+
* Returns URL-unescaped string (+application/x-www-form-urlencoded+).
380380
*
381381
*/
382382
static VALUE
@@ -388,7 +388,50 @@ cgiesc_unescape(int argc, VALUE *argv, VALUE self)
388388

389389
if (rb_enc_str_asciicompat_p(str)) {
390390
VALUE enc = accept_charset(argc-1, argv+1, self);
391-
return optimized_unescape(str, enc);
391+
return optimized_unescape(str, enc, 1);
392+
}
393+
else {
394+
return rb_call_super(argc, argv);
395+
}
396+
}
397+
398+
/*
399+
* call-seq:
400+
* CGI.escapeURIComponent(string) -> string
401+
*
402+
* Returns URL-escaped string following RFC 3986.
403+
*
404+
*/
405+
static VALUE
406+
cgiesc_escape_uri_component(VALUE self, VALUE str)
407+
{
408+
StringValue(str);
409+
410+
if (rb_enc_str_asciicompat_p(str)) {
411+
return optimized_escape(str, 0);
412+
}
413+
else {
414+
return rb_call_super(1, &str);
415+
}
416+
}
417+
418+
/*
419+
* call-seq:
420+
* CGI.unescapeURIComponent(string, encoding=@@accept_charset) -> string
421+
*
422+
* Returns URL-unescaped string following RFC 3986.
423+
*
424+
*/
425+
static VALUE
426+
cgiesc_unescape_uri_component(int argc, VALUE *argv, VALUE self)
427+
{
428+
VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
429+
430+
StringValue(str);
431+
432+
if (rb_enc_str_asciicompat_p(str)) {
433+
VALUE enc = accept_charset(argc-1, argv+1, self);
434+
return optimized_unescape(str, enc, 0);
392435
}
393436
else {
394437
return rb_call_super(argc, argv);
@@ -414,6 +457,8 @@ InitVM_escape(void)
414457
rb_mUtil = rb_define_module_under(rb_cCGI, "Util");
415458
rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
416459
rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
460+
rb_define_method(rb_mEscape, "escapeURIComponent", cgiesc_escape_uri_component, 1);
461+
rb_define_method(rb_mEscape, "unescapeURIComponent", cgiesc_unescape_uri_component, -1);
417462
rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
418463
rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
419464
rb_prepend_module(rb_mUtil, rb_mEscape);

lib/cgi/util.rb

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,57 @@ module Util; end
55
extend Util
66
end
77
module CGI::Util
8-
@@accept_charset="UTF-8" unless defined?(@@accept_charset)
9-
# URL-encode a string.
8+
@@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset)
9+
10+
# URL-encode a string into application/x-www-form-urlencoded.
11+
# Space characters (+" "+) are encoded with plus signs (+"+"+)
1012
# url_encoded_string = CGI.escape("'Stop!' said Fred")
1113
# # => "%27Stop%21%27+said+Fred"
1214
def escape(string)
1315
encoding = string.encoding
14-
string.b.gsub(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
16+
buffer = string.b
17+
buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
1518
'%' + m.unpack('H2' * m.bytesize).join('%').upcase
16-
end.tr(' ', '+').force_encoding(encoding)
19+
end
20+
buffer.tr!(' ', '+')
21+
buffer.force_encoding(encoding)
1722
end
1823

19-
# URL-decode a string with encoding(optional).
24+
# URL-decode an application/x-www-form-urlencoded string with encoding(optional).
2025
# string = CGI.unescape("%27Stop%21%27+said+Fred")
2126
# # => "'Stop!' said Fred"
22-
def unescape(string,encoding=@@accept_charset)
23-
str=string.tr('+', ' ').b.gsub(/((?:%[0-9a-fA-F]{2})+)/) do |m|
27+
def unescape(string, encoding = @@accept_charset)
28+
str = string.tr('+', ' ')
29+
str = str.b
30+
str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
31+
[m.delete('%')].pack('H*')
32+
end
33+
str.force_encoding(encoding)
34+
str.valid_encoding? ? str : str.force_encoding(string.encoding)
35+
end
36+
37+
# URL-encode a string following RFC 3986
38+
# Space characters (+" "+) are encoded as percent-escapes (+"%20"+)
39+
# url_encoded_string = CGI.escapeURIComponent("'Stop!' said Fred")
40+
# # => "%27Stop%21%27%20said%20Fred"
41+
def escapeURIComponent(string)
42+
encoding = string.encoding
43+
buffer = string.b
44+
buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m|
45+
'%' + m.unpack('H2' * m.bytesize).join('%').upcase
46+
end
47+
buffer.force_encoding(encoding)
48+
end
49+
50+
# URL-decode a string following RFC 3986 with encoding(optional).
51+
# string = CGI.unescapeURIComponent("%27Stop%21%27+said%20Fred")
52+
# # => "'Stop!'+said Fred"
53+
def unescapeURIComponent(string, encoding = @@accept_charset)
54+
str = string.b
55+
str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
2456
[m.delete('%')].pack('H*')
25-
end.force_encoding(encoding)
57+
end
58+
str.force_encoding(encoding)
2659
str.valid_encoding? ? str : str.force_encoding(string.encoding)
2760
end
2861

test/cgi/test_cgi_util.rb

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def teardown
2323
ENV.update(@environ)
2424
end
2525

26-
2726
def test_cgi_escape
2827
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escape(@str1))
2928
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escape(@str1).ascii_only?) if defined?(::Encoding)
@@ -70,6 +69,54 @@ def test_cgi_unescape_accept_charset
7069
end;
7170
end
7271

72+
def test_cgi_escapeURIComponent
73+
assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escapeURIComponent(@str1))
74+
assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escapeURIComponent(@str1).ascii_only?) if defined?(::Encoding)
75+
end
76+
77+
def test_cgi_escapeURIComponent_with_unreserved_characters
78+
assert_equal("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~",
79+
CGI.escapeURIComponent("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"),
80+
"should not encode any unreserved characters, as per RFC3986 Section 2.3")
81+
end
82+
83+
def test_cgi_escapeURIComponent_with_invalid_byte_sequence
84+
assert_equal('%C0%3C%3C', CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8")))
85+
end
86+
87+
def test_cgi_escapeURIComponent_preserve_encoding
88+
assert_equal(Encoding::US_ASCII, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("US-ASCII")).encoding)
89+
assert_equal(Encoding::ASCII_8BIT, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("ASCII-8BIT")).encoding)
90+
assert_equal(Encoding::UTF_8, CGI.escapeURIComponent("\xC0\<\<".dup.force_encoding("UTF-8")).encoding)
91+
end
92+
93+
def test_cgi_unescapeURIComponent
94+
str = CGI.unescapeURIComponent('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93')
95+
assert_equal(@str1, str)
96+
return unless defined?(::Encoding)
97+
98+
assert_equal("foo+bar", CGI.unescapeURIComponent("foo+bar"))
99+
100+
assert_equal(@str1.encoding, str.encoding)
101+
assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.unescapeURIComponent("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2"))
102+
end
103+
104+
def test_cgi_unescapeURIComponent_preserve_encoding
105+
assert_equal(Encoding::US_ASCII, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("US-ASCII")).encoding)
106+
assert_equal(Encoding::ASCII_8BIT, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("ASCII-8BIT")).encoding)
107+
assert_equal(Encoding::UTF_8, CGI.unescapeURIComponent("%C0%3C%3C".dup.force_encoding("UTF-8")).encoding)
108+
end
109+
110+
def test_cgi_unescapeURIComponent_accept_charset
111+
return unless defined?(::Encoding)
112+
113+
assert_raise(TypeError) {CGI.unescapeURIComponent('', nil)}
114+
assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}")
115+
begin;
116+
assert_equal("", CGI.unescapeURIComponent(''))
117+
end;
118+
end
119+
73120
def test_cgi_pretty
74121
assert_equal("<HTML>\n <BODY>\n </BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>"))
75122
assert_equal("<HTML>\n\t<BODY>\n\t</BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>","\t"))

0 commit comments

Comments
 (0)