Skip to content

Commit c161f4a

Browse files
authored
Escape control chars even if emitting UTF8 (#1178)
* Escape control chars even if emitting UTF8. See #1176. Fixes #1175.
* Address review comments.
* Fix test by stopping early enough to punt on UTF-8 input.
1 parent 75b360a commit c161f4a

File tree

2 files changed

+89
-22
lines changed

2 files changed

+89
-22
lines changed

Diff for: src/lib_json/json_writer.cpp

+27-22
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,14 @@ static String toHex16Bit(unsigned int x) {
262262
return result;
263263
}
264264

265+
// Appends `ch` to `result` unescaped, as a single character. The value is
// narrowed to `char` with a static_cast before being appended.
static void appendRaw(String& result, unsigned ch) {
  const char narrowed = static_cast<char>(ch);
  result.push_back(narrowed);
}
268+
269+
// Appends `ch` to `result` in escaped form: a backslash and the letter 'u',
// followed by whatever String toHex16Bit(ch) returns.
static void appendHex(String& result, unsigned ch) {
  result += "\\u";
  result += toHex16Bit(ch);
}
272+
265273
static String valueToQuotedStringN(const char* value, unsigned length,
266274
bool emitUTF8 = false) {
267275
if (value == nullptr)
@@ -310,29 +318,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
310318
// sequence from occurring.
311319
default: {
312320
if (emitUTF8) {
313-
result += *c;
321+
unsigned codepoint = static_cast<unsigned char>(*c);
322+
if (codepoint < 0x20) {
323+
appendHex(result, codepoint);
324+
} else {
325+
appendRaw(result, codepoint);
326+
}
314327
} else {
315-
unsigned int codepoint = utf8ToCodepoint(c, end);
316-
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
317-
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
318-
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
319-
// don't escape non-control characters
320-
// (short escape sequence are applied above)
321-
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
322-
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
323-
result += static_cast<char>(codepoint);
324-
} else if (codepoint <
325-
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
326-
// Multilingual Plane
327-
result += "\\u";
328-
result += toHex16Bit(codepoint);
329-
} else { // codepoint is not in Basic Multilingual Plane
330-
// convert to surrogate pair first
331-
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
332-
result += "\\u";
333-
result += toHex16Bit((codepoint >> 10) + 0xD800);
334-
result += "\\u";
335-
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
328+
unsigned codepoint = utf8ToCodepoint(c, end); // modifies `c`
329+
if (codepoint < 0x20) {
330+
appendHex(result, codepoint);
331+
} else if (codepoint < 0x80) {
332+
appendRaw(result, codepoint);
333+
} else if (codepoint < 0x10000) {
334+
// Basic Multilingual Plane
335+
appendHex(result, codepoint);
336+
} else {
337+
// Extended Unicode. Encode 20 bits as a surrogate pair.
338+
codepoint -= 0x10000;
339+
appendHex(result, 0xd800 + ((codepoint >> 10) & 0x3ff));
340+
appendHex(result, 0xdc00 + (codepoint & 0x3ff));
336341
}
337342
}
338343
} break;

Diff for: src/test_lib_json/main.cpp

+62
Original file line numberDiff line numberDiff line change
@@ -2640,6 +2640,68 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
26402640
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
26412641
}
26422642

2643+
// Control chars should be escaped regardless of UTF-8 input encoding.
2644+
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, escapeControlCharacters) {
2645+
auto uEscape = [](unsigned ch) {
2646+
static const char h[] = "0123456789abcdef";
2647+
std::string r = "\\u";
2648+
r += h[(ch >> (3 * 4)) & 0xf];
2649+
r += h[(ch >> (2 * 4)) & 0xf];
2650+
r += h[(ch >> (1 * 4)) & 0xf];
2651+
r += h[(ch >> (0 * 4)) & 0xf];
2652+
return r;
2653+
};
2654+
auto shortEscape = [](unsigned ch) -> const char* {
2655+
switch (ch) {
2656+
case '\"':
2657+
return "\\\"";
2658+
case '\\':
2659+
return "\\\\";
2660+
case '\b':
2661+
return "\\b";
2662+
case '\f':
2663+
return "\\f";
2664+
case '\n':
2665+
return "\\n";
2666+
case '\r':
2667+
return "\\r";
2668+
case '\t':
2669+
return "\\t";
2670+
default:
2671+
return nullptr;
2672+
}
2673+
};
2674+
2675+
Json::StreamWriterBuilder b;
2676+
2677+
for (bool emitUTF8 : {true, false}) {
2678+
b.settings_["emitUTF8"] = emitUTF8;
2679+
2680+
for (unsigned i = 0; i != 0x100; ++i) {
2681+
if (!emitUTF8 && i >= 0x80)
2682+
break; // The algorithm would try to parse UTF-8, so stop here.
2683+
2684+
std::string raw({static_cast<char>(i)});
2685+
std::string esc = raw;
2686+
if (i < 0x20)
2687+
esc = uEscape(i);
2688+
if (const char* shEsc = shortEscape(i))
2689+
esc = shEsc;
2690+
2691+
// std::cout << "emit=" << emitUTF8 << ", i=" << std::hex << i << std::dec
2692+
// << std::endl;
2693+
2694+
Json::Value root;
2695+
root["test"] = raw;
2696+
JSONTEST_ASSERT_STRING_EQUAL(
2697+
std::string("{\n\t\"test\" : \"").append(esc).append("\"\n}"),
2698+
Json::writeString(b, root))
2699+
<< ", emit=" << emitUTF8 << ", i=" << i << ", raw=\"" << raw << "\""
2700+
<< ", esc=\"" << esc << "\"";
2701+
}
2702+
}
2703+
}
2704+
26432705
struct ReaderTest : JsonTest::TestCase {
26442706
void setStrictMode() {
26452707
reader = std::unique_ptr<Json::Reader>(

0 commit comments

Comments
 (0)