From 6973244664a27b070f60937cb1a803f28a9a8966 Mon Sep 17 00:00:00 2001 From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com> Date: Tue, 19 May 2020 16:14:22 +0300 Subject: [PATCH 1/4] Fixed control chars escaping with enabled emitUTF8 --- src/lib_json/json_tool.h | 3 +++ src/lib_json/json_writer.cpp | 7 ++++++- src/test_lib_json/main.cpp | 24 ++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/lib_json/json_tool.h b/src/lib_json/json_tool.h index 2d7b7d9a0..b0e04f1a4 100644 --- a/src/lib_json/json_tool.h +++ b/src/lib_json/json_tool.h @@ -64,6 +64,9 @@ static inline String codePointToUTF8(unsigned int cp) { return result; } +/// Returns true if ch is a control character (in range [1,31]). +static inline bool isControlCharacter(char ch) { return ch > 0 && ch <= 0x1F; } + enum { /// Constant that specify the size of the buffer that must be passed to /// uintToString. diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 56ee65ef6..236fa70a8 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -310,7 +310,12 @@ static String valueToQuotedStringN(const char* value, unsigned length, // sequence from occurring. default: { if (emitUTF8) { - result += *c; + if (isControlCharacter(*c)) { + result += "\\u"; + result += toHex16Bit(*c); + } else { + result += *c; + } } else { unsigned int codepoint = utf8ToCodepoint(c, end); const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index 73850cfd8..224f8516a 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -2640,6 +2640,30 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) { "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}"); } +JSONTEST_FIXTURE_LOCAL(StreamWriterTest, controlChars) { + // Create a Json value containing UTF-8 string with some chars that need + // escape (tab,newline, control chars). + const Json::String expected( + "{\n\t\"test\" : " + "\"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000b\\f\\" + "r\\u000e\\u000f\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017" + "\\u0018\\u0019\\u001a\\u001b\\u001c\\u001d\\u001e\\u001f\"\n}"); + + // Create a Json value containing string with controls chars that need escape. + Json::Value root; + root["test"] = "\x1\x2\x3\x4\x5\x6\x7\b\t\n\xB\f\r\xE\xF\x10\x11\x12\x13\x14" + "\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"; + + Json::StreamWriterBuilder b; + JSONTEST_ASSERT(Json::writeString(b, root) == expected); + + b.settings_["emitUTF8"] = true; + JSONTEST_ASSERT(Json::writeString(b, root) == expected); + + b.settings_["emitUTF8"] = false; + JSONTEST_ASSERT(Json::writeString(b, root) == expected); +} + struct ReaderTest : JsonTest::TestCase { void setStrictMode() { reader = std::unique_ptr<Json::Reader>( From d268648c52f4bae82df2156742f609667bd48b8c Mon Sep 17 00:00:00 2001 From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com> Date: Wed, 20 May 2020 17:50:17 +0300 Subject: [PATCH 2/4] Code-review fixes --- src/lib_json/json_tool.h | 3 --- src/lib_json/json_writer.cpp | 46 +++++++++++++++--------------------- src/test_lib_json/main.cpp | 2 +- 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/src/lib_json/json_tool.h b/src/lib_json/json_tool.h index b0e04f1a4..2d7b7d9a0 100644 --- a/src/lib_json/json_tool.h +++ b/src/lib_json/json_tool.h @@ -64,9 +64,6 @@ static inline String codePointToUTF8(unsigned int cp) { return result; } -/// Returns true if ch is a control character (in range [1,31]). -static inline bool isControlCharacter(char ch) { return ch > 0 && ch <= 0x1F; } - enum { /// Constant that specify the size of the buffer that must be passed to /// uintToString. diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 236fa70a8..d3d759827 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -309,36 +309,28 @@ static String valueToQuotedStringN(const char* value, unsigned length, // Should add a flag to allow this compatibility mode and prevent this // sequence from occurring. default: { - if (emitUTF8) { - if (isControlCharacter(*c)) { - result += "\\u"; - result += toHex16Bit(*c); + unsigned int codepoint = + emitUTF8 ? static_cast<unsigned char>(*c) : utf8ToCodepoint(c, end); + const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; + const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F; + const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000; + + if ((codepoint < FIRST_NON_CONTROL_CODEPOINT) || + (!emitUTF8 && (codepoint >= LAST_NON_CONTROL_CODEPOINT))) { + auto appendHexChar = [&result](unsigned ch) { + result.append("\\u").append(toHex16Bit(ch)); + }; + if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) { + // codepoint is in Basic Multilingual Plane + appendHexChar(codepoint); } else { - result += *c; - } - } else { - unsigned int codepoint = utf8ToCodepoint(c, end); - const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; - const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F; - const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000; - // don't escape non-control characters - // (short escape sequence are applied above) - if (FIRST_NON_CONTROL_CODEPOINT <= codepoint && - codepoint <= LAST_NON_CONTROL_CODEPOINT) { - result += static_cast<char>(codepoint); - } else if (codepoint < - FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic - // Multilingual Plane - result += "\\u"; - result += toHex16Bit(codepoint); - } else { // codepoint is not in Basic Multilingual Plane - // convert to surrogate pair first + // codepoint is not in Basic Multilingual Plane codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT; - result += "\\u"; - result += toHex16Bit((codepoint >> 10) + 0xD800); - result += "\\u"; - result += toHex16Bit((codepoint & 0x3FF) + 0xDC00); + appendHexChar(0xD800 + (codepoint >> 10)); + appendHexChar(0xDC00 + (codepoint & 0x3FF)); } + } else { + result += *c; } } break; } diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp index 224f8516a..27c340378 100644 --- a/src/test_lib_json/main.cpp +++ b/src/test_lib_json/main.cpp @@ -2642,7 +2642,7 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) { JSONTEST_FIXTURE_LOCAL(StreamWriterTest, controlChars) { // Create a Json value containing UTF-8 string with some chars that need - // escape (tab,newline, control chars). + // escape (tab,newline,control chars). const Json::String expected( "{\n\t\"test\" : " "\"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000b\\f\\" From 8a1417e27beb8733ae15326e0f57848023d4fe76 Mon Sep 17 00:00:00 2001 From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com> Date: Thu, 21 May 2020 02:48:38 +0300 Subject: [PATCH 3/4] Code-review fixes 2 --- src/lib_json/json_writer.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index d3d759827..965f02f7c 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -309,28 +309,26 @@ static String valueToQuotedStringN(const char* value, unsigned length, // Should add a flag to allow this compatibility mode and prevent this // sequence from occurring. default: { - unsigned int codepoint = - emitUTF8 ? static_cast<unsigned char>(*c) : utf8ToCodepoint(c, end); - const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; - const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F; - const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000; - - if ((codepoint < FIRST_NON_CONTROL_CODEPOINT) || - (!emitUTF8 && (codepoint >= LAST_NON_CONTROL_CODEPOINT))) { - auto appendHexChar = [&result](unsigned ch) { - result.append("\\u").append(toHex16Bit(ch)); - }; - if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) { + const auto appendHexChar = [&result](unsigned ch) { + result.append("\\u").append(toHex16Bit(ch)); + }; + + unsigned codepoint = static_cast<unsigned>(*c); + if (codepoint > 0x7F && !emitUTF8) { + codepoint = utf8ToCodepoint(c, end); + if (codepoint < 0x10000) { // codepoint is in Basic Multilingual Plane appendHexChar(codepoint); } else { // codepoint is not in Basic Multilingual Plane - codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT; + codepoint -= 0x10000; appendHexChar(0xD800 + (codepoint >> 10)); appendHexChar(0xDC00 + (codepoint & 0x3FF)); } + } else if (codepoint < 0x20) { + appendHexChar(codepoint); } else { - result += *c; + result += static_cast<char>(codepoint); } } break; } From 0c6deca542117fae99ad512a6327637cb27e07d5 Mon Sep 17 00:00:00 2001 From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com> Date: Thu, 21 May 2020 11:32:36 +0300 Subject: [PATCH 4/4] Code-review fixes 3 --- src/lib_json/json_writer.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp index 965f02f7c..a3be0bb04 100644 --- a/src/lib_json/json_writer.cpp +++ b/src/lib_json/json_writer.cpp @@ -309,23 +309,28 @@ static String valueToQuotedStringN(const char* value, unsigned length, // Should add a flag to allow this compatibility mode and prevent this // sequence from occurring. default: { + const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20; + const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F; + const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000; + const auto appendHexChar = [&result](unsigned ch) { result.append("\\u").append(toHex16Bit(ch)); }; - unsigned codepoint = static_cast<unsigned>(*c); - if (codepoint > 0x7F && !emitUTF8) { + unsigned int codepoint = static_cast<unsigned>(*c); + if (codepoint > LAST_NON_CONTROL_CODEPOINT && !emitUTF8) { codepoint = utf8ToCodepoint(c, end); - if (codepoint < 0x10000) { + if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic Multilingual Plane appendHexChar(codepoint); } else { // codepoint is not in Basic Multilingual Plane - codepoint -= 0x10000; + // convert to surrogate pair first + codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT; appendHexChar(0xD800 + (codepoint >> 10)); appendHexChar(0xDC00 + (codepoint & 0x3FF)); } - } else if (codepoint < 0x20) { + } else if (codepoint < FIRST_NON_CONTROL_CODEPOINT) { appendHexChar(codepoint); } else { result += static_cast<char>(codepoint);