Skip to content

Commit 42bd7db

Browse files
committed
Escape control chars even if emitting UTF8 open-source-parsers#1176
1 parent 75b360a commit 42bd7db

File tree

2 files changed

+77
-24
lines changed

2 files changed

+77
-24
lines changed

src/lib_json/json_writer.cpp

+27-24
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
262262
return result;
263263
}
264264

265+
/// Appends `ch` to `result` unescaped, as a single `char`.
/// The value is narrowed with static_cast<char>, so only its low byte is kept.
static void appendRaw(String& result, unsigned ch) {
266+
result += static_cast<char>(ch);
267+
}
268+
269+
/// Appends `ch` to `result` as an escape sequence: a backslash and 'u'
/// followed by the string returned by toHex16Bit(ch).
/// NOTE(review): toHex16Bit is defined outside this view; presumably it yields
/// four hex digits for a 16-bit value -- confirm before passing larger values.
static void appendHex(String& result, unsigned ch) {
270+
result.append("\\u").append(toHex16Bit(ch));
271+
}
272+
265273
static String valueToQuotedStringN(const char* value, unsigned length,
266274
bool emitUTF8 = false) {
267275
if (value == nullptr)
@@ -309,31 +317,25 @@ static String valueToQuotedStringN(const char* value, unsigned length,
309317
// Should add a flag to allow this compatibility mode and prevent this
310318
// sequence from occurring.
311319
default: {
320+
unsigned codepoint;
312321
if (emitUTF8) {
313-
result += *c;
322+
codepoint = static_cast<unsigned char>(*c);
314323
} else {
315-
unsigned int codepoint = utf8ToCodepoint(c, end);
316-
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
317-
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
318-
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
319-
// don't escape non-control characters
320-
// (short escape sequence are applied above)
321-
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
322-
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
323-
result += static_cast<char>(codepoint);
324-
} else if (codepoint <
325-
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
326-
// Multilingual Plane
327-
result += "\\u";
328-
result += toHex16Bit(codepoint);
329-
} else { // codepoint is not in Basic Multilingual Plane
330-
// convert to surrogate pair first
331-
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
332-
result += "\\u";
333-
result += toHex16Bit((codepoint >> 10) + 0xD800);
334-
result += "\\u";
335-
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
336-
}
324+
codepoint = utf8ToCodepoint(c, end); // modifies `c`
325+
}
326+
327+
if (codepoint < 0x20) {
328+
appendHex(result, codepoint);
329+
} else if (codepoint < 0x80 || emitUTF8) {
330+
appendRaw(result, codepoint);
331+
} else if (codepoint < 0x10000) {
332+
// Basic Multilingual Plane
333+
appendHex(result, codepoint);
334+
} else {
335+
// Extended Unicode. Encode 20 bits as a surrogate pair.
336+
codepoint -= 0x10000;
337+
appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
338+
appendHex(result, 0xdc00 + (codepoint & 0x3ff));
337339
}
338340
} break;
339341
}
@@ -864,7 +866,8 @@ struct CommentStyle {
864866
/// Decide whether to write comments.
865867
enum Enum {
866868
None, ///< Drop all comments.
867-
Most, ///< Recover odd behavior of previous versions (not implemented yet).
869+
Most, ///< Recover odd behavior of previous versions (not implemented
870+
///< yet).
868871
All ///< Keep all comments.
869872
};
870873
};

src/test_lib_json/main.cpp

+50
Original file line numberDiff line numberDiff line change
@@ -2640,6 +2640,56 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
26402640
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
26412641
}
26422642

2643+
// Control characters (byte values below 0x20) must be escaped even when the
// writer is configured with emitUTF8 = true. Every single-byte value
// 0x00..0xff is written as a one-character string and the output is compared
// with the expected escaped (or verbatim) form.
2644+
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
// Expected long-form escape for `ch`: backslash, 'u', then the low 16 bits of
// `ch` as four lowercase hex digits, most significant nibble first.
2645+
auto uEscape = [](unsigned ch) {
2646+
static const char h[] = "0123456789abcdef";
2647+
std::string r = "\\u";
2648+
r += h[(ch >> (3 * 4)) & 0xf];
2649+
r += h[(ch >> (2 * 4)) & 0xf];
2650+
r += h[(ch >> (1 * 4)) & 0xf];
2651+
r += h[(ch >> (0 * 4)) & 0xf];
2652+
return r;
2653+
};
// Expected two-character short escape for the characters that have one
// (quote, backslash, \b, \f, \n, \r, \t); nullptr for every other value.
2654+
auto shortEscape = [](unsigned ch) -> const char* {
2655+
switch (ch) {
2656+
case '\"':
2657+
return "\\\"";
2658+
case '\\':
2659+
return "\\\\";
2660+
case '\b':
2661+
return "\\b";
2662+
case '\f':
2663+
return "\\f";
2664+
case '\n':
2665+
return "\\n";
2666+
case '\r':
2667+
return "\\r";
2668+
case '\t':
2669+
return "\\t";
2670+
default:
2671+
return nullptr;
2672+
}
2673+
};
2674+
2675+
Json::StreamWriterBuilder b;
// Exercise the emitUTF8 writer setting, the mode this test is about.
2676+
b.settings_["emitUTF8"] = true;
2677+
2678+
for (unsigned i = 0; i != 0x100; ++i) {
// One-element string holding the byte under test.
2679+
std::string raw({static_cast<char>(i)});
// Default expectation: the byte is emitted verbatim.
2680+
std::string esc = raw;
2681+
if (i < 0x20)
2682+
esc = uEscape(i);
// Checked after the long form so that a short escape, when one exists,
// overrides it.
2683+
if (const char* shEsc = shortEscape(i))
2684+
esc = shEsc;
2685+
Json::Value root;
2686+
root["test"] = raw;
// Expected document: a single member "test" whose value is `esc`.
2687+
JSONTEST_ASSERT_STRING_EQUAL(
2688+
std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
2689+
Json::writeString(b, root));
2690+
}
2691+
}
2692+
26432693
struct ReaderTest : JsonTest::TestCase {
26442694
void setStrictMode() {
26452695
reader = std::unique_ptr<Json::Reader>(

0 commit comments

Comments
 (0)