From 6973244664a27b070f60937cb1a803f28a9a8966 Mon Sep 17 00:00:00 2001
From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com>
Date: Tue, 19 May 2020 16:14:22 +0300
Subject: [PATCH 1/4] Fixed control chars escaping with enabled emitUTF8

---
 src/lib_json/json_tool.h     |  3 +++
 src/lib_json/json_writer.cpp |  7 ++++++-
 src/test_lib_json/main.cpp   | 24 ++++++++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/lib_json/json_tool.h b/src/lib_json/json_tool.h
index 2d7b7d9a0..b0e04f1a4 100644
--- a/src/lib_json/json_tool.h
+++ b/src/lib_json/json_tool.h
@@ -64,6 +64,9 @@ static inline String codePointToUTF8(unsigned int cp) {
   return result;
 }
 
+/// Returns true if ch is a control character (in range [1,31]).
+static inline bool isControlCharacter(char ch) { return ch > 0 && ch <= 0x1F; }
+
 enum {
   /// Constant that specify the size of the buffer that must be passed to
   /// uintToString.
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 56ee65ef6..236fa70a8 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -310,7 +310,12 @@ static String valueToQuotedStringN(const char* value, unsigned length,
     // sequence from occurring.
     default: {
       if (emitUTF8) {
-        result += *c;
+        if (isControlCharacter(*c)) {
+          result += "\\u";
+          result += toHex16Bit(*c);
+        } else {
+          result += *c;
+        }
       } else {
         unsigned int codepoint = utf8ToCodepoint(c, end);
         const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp
index 73850cfd8..224f8516a 100644
--- a/src/test_lib_json/main.cpp
+++ b/src/test_lib_json/main.cpp
@@ -2640,6 +2640,30 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
                   "\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
 }
 
+JSONTEST_FIXTURE_LOCAL(StreamWriterTest, controlChars) {
+  // Create a Json value containing UTF-8 string with some chars that need
+  // escape (tab,newline, control chars).
+  const Json::String expected(
+      "{\n\t\"test\" : "
+      "\"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000b\\f\\"
+      "r\\u000e\\u000f\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017"
+      "\\u0018\\u0019\\u001a\\u001b\\u001c\\u001d\\u001e\\u001f\"\n}");
+
+  // Create a Json value containing string with controls chars that need escape.
+  Json::Value root;
+  root["test"] = "\x1\x2\x3\x4\x5\x6\x7\b\t\n\xB\f\r\xE\xF\x10\x11\x12\x13\x14"
+                 "\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F";
+
+  Json::StreamWriterBuilder b;
+  JSONTEST_ASSERT(Json::writeString(b, root) == expected);
+
+  b.settings_["emitUTF8"] = true;
+  JSONTEST_ASSERT(Json::writeString(b, root) == expected);
+
+  b.settings_["emitUTF8"] = false;
+  JSONTEST_ASSERT(Json::writeString(b, root) == expected);
+}
+
 struct ReaderTest : JsonTest::TestCase {
   void setStrictMode() {
     reader = std::unique_ptr<Json::Reader>(

From d268648c52f4bae82df2156742f609667bd48b8c Mon Sep 17 00:00:00 2001
From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com>
Date: Wed, 20 May 2020 17:50:17 +0300
Subject: [PATCH 2/4] Code-review fixes

---
 src/lib_json/json_tool.h     |  3 ---
 src/lib_json/json_writer.cpp | 46 +++++++++++++++---------------------
 src/test_lib_json/main.cpp   |  2 +-
 3 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/src/lib_json/json_tool.h b/src/lib_json/json_tool.h
index b0e04f1a4..2d7b7d9a0 100644
--- a/src/lib_json/json_tool.h
+++ b/src/lib_json/json_tool.h
@@ -64,9 +64,6 @@ static inline String codePointToUTF8(unsigned int cp) {
   return result;
 }
 
-/// Returns true if ch is a control character (in range [1,31]).
-static inline bool isControlCharacter(char ch) { return ch > 0 && ch <= 0x1F; }
-
 enum {
   /// Constant that specify the size of the buffer that must be passed to
   /// uintToString.
diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 236fa70a8..d3d759827 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -309,36 +309,28 @@ static String valueToQuotedStringN(const char* value, unsigned length,
     // Should add a flag to allow this compatibility mode and prevent this
     // sequence from occurring.
     default: {
-      if (emitUTF8) {
-        if (isControlCharacter(*c)) {
-          result += "\\u";
-          result += toHex16Bit(*c);
+      unsigned int codepoint =
+          emitUTF8 ? static_cast<unsigned char>(*c) : utf8ToCodepoint(c, end);
+      const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
+      const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
+      const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
+
+      if ((codepoint < FIRST_NON_CONTROL_CODEPOINT) ||
+          (!emitUTF8 && (codepoint >= LAST_NON_CONTROL_CODEPOINT))) {
+        auto appendHexChar = [&result](unsigned ch) {
+          result.append("\\u").append(toHex16Bit(ch));
+        };
+        if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) {
+          // codepoint is in Basic Multilingual Plane
+          appendHexChar(codepoint);
         } else {
-          result += *c;
-        }
-      } else {
-        unsigned int codepoint = utf8ToCodepoint(c, end);
-        const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
-        const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
-        const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
-        // don't escape non-control characters
-        // (short escape sequence are applied above)
-        if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
-            codepoint <= LAST_NON_CONTROL_CODEPOINT) {
-          result += static_cast<char>(codepoint);
-        } else if (codepoint <
-                   FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
-                                                     // Multilingual Plane
-          result += "\\u";
-          result += toHex16Bit(codepoint);
-        } else { // codepoint is not in Basic Multilingual Plane
-                 // convert to surrogate pair first
+          // codepoint is not in Basic Multilingual Plane
           codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
-          result += "\\u";
-          result += toHex16Bit((codepoint >> 10) + 0xD800);
-          result += "\\u";
-          result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
+          appendHexChar(0xD800 + (codepoint >> 10));
+          appendHexChar(0xDC00 + (codepoint & 0x3FF));
         }
+      } else {
+        result += *c;
       }
     } break;
     }
diff --git a/src/test_lib_json/main.cpp b/src/test_lib_json/main.cpp
index 224f8516a..27c340378 100644
--- a/src/test_lib_json/main.cpp
+++ b/src/test_lib_json/main.cpp
@@ -2642,7 +2642,7 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
 
 JSONTEST_FIXTURE_LOCAL(StreamWriterTest, controlChars) {
   // Create a Json value containing UTF-8 string with some chars that need
-  // escape (tab,newline, control chars).
+  // escape (tab,newline,control chars).
   const Json::String expected(
       "{\n\t\"test\" : "
       "\"\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000b\\f\\"

From 8a1417e27beb8733ae15326e0f57848023d4fe76 Mon Sep 17 00:00:00 2001
From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com>
Date: Thu, 21 May 2020 02:48:38 +0300
Subject: [PATCH 3/4] Code-review fixes 2

---
 src/lib_json/json_writer.cpp | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index d3d759827..965f02f7c 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -309,28 +309,26 @@ static String valueToQuotedStringN(const char* value, unsigned length,
     // Should add a flag to allow this compatibility mode and prevent this
     // sequence from occurring.
     default: {
-      unsigned int codepoint =
-          emitUTF8 ? static_cast<unsigned char>(*c) : utf8ToCodepoint(c, end);
-      const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
-      const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
-      const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
-
-      if ((codepoint < FIRST_NON_CONTROL_CODEPOINT) ||
-          (!emitUTF8 && (codepoint >= LAST_NON_CONTROL_CODEPOINT))) {
-        auto appendHexChar = [&result](unsigned ch) {
-          result.append("\\u").append(toHex16Bit(ch));
-        };
-        if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) {
+      const auto appendHexChar = [&result](unsigned ch) {
+        result.append("\\u").append(toHex16Bit(ch));
+      };
+
+      unsigned codepoint = static_cast<unsigned>(*c);
+      if (codepoint > 0x7F && !emitUTF8) {
+        codepoint = utf8ToCodepoint(c, end);
+        if (codepoint < 0x10000) {
           // codepoint is in Basic Multilingual Plane
           appendHexChar(codepoint);
         } else {
           // codepoint is not in Basic Multilingual Plane
-          codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
+          codepoint -= 0x10000;
           appendHexChar(0xD800 + (codepoint >> 10));
           appendHexChar(0xDC00 + (codepoint & 0x3FF));
         }
+      } else if (codepoint < 0x20) {
+        appendHexChar(codepoint);
       } else {
-        result += *c;
+        result += static_cast<char>(codepoint);
       }
     } break;
     }

From 0c6deca542117fae99ad512a6327637cb27e07d5 Mon Sep 17 00:00:00 2001
From: Nikolay Baklicharov <nikolay.baklicharov@gmail.com>
Date: Thu, 21 May 2020 11:32:36 +0300
Subject: [PATCH 4/4] Code-review fixes 3

---
 src/lib_json/json_writer.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/lib_json/json_writer.cpp b/src/lib_json/json_writer.cpp
index 965f02f7c..a3be0bb04 100644
--- a/src/lib_json/json_writer.cpp
+++ b/src/lib_json/json_writer.cpp
@@ -309,23 +309,28 @@ static String valueToQuotedStringN(const char* value, unsigned length,
     // Should add a flag to allow this compatibility mode and prevent this
     // sequence from occurring.
     default: {
+      const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
+      const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
+      const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
+
       const auto appendHexChar = [&result](unsigned ch) {
         result.append("\\u").append(toHex16Bit(ch));
       };
 
-      unsigned codepoint = static_cast<unsigned>(*c);
-      if (codepoint > 0x7F && !emitUTF8) {
+      unsigned int codepoint = static_cast<unsigned>(*c);
+      if (codepoint > LAST_NON_CONTROL_CODEPOINT && !emitUTF8) {
         codepoint = utf8ToCodepoint(c, end);
-        if (codepoint < 0x10000) {
+        if (codepoint < FIRST_SURROGATE_PAIR_CODEPOINT) {
           // codepoint is in Basic Multilingual Plane
           appendHexChar(codepoint);
         } else {
           // codepoint is not in Basic Multilingual Plane
-          codepoint -= 0x10000;
+          // convert to surrogate pair first
+          codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
           appendHexChar(0xD800 + (codepoint >> 10));
           appendHexChar(0xDC00 + (codepoint & 0x3FF));
         }
-      } else if (codepoint < 0x20) {
+      } else if (codepoint < FIRST_NON_CONTROL_CODEPOINT) {
         appendHexChar(codepoint);
       } else {
         result += static_cast<char>(codepoint);