Skip to content

Commit a955529

Browse files
nicolaswilsonbaylesj
authored and committed
Added emitUTF8 setting. (#1045)
* Added emitUTF8 setting to emit UTF8 format JSON.
* Added a test for emitUTF8, with it in default, on and off states.
* Review comments addressed.
* Merged master into my branch & resolved conflicts.
* Fix clang-format errors.
* Fix clang-format errors.
* Fixed clang-format errors.
* Fixed clang-format errors.
1 parent f59ac2a commit a955529

File tree

2 files changed

+75
-29
lines changed

2 files changed

+75
-29
lines changed

src/lib_json/json_writer.cpp

+46-29
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,8 @@ static String toHex16Bit(unsigned int x) {
264264
return result;
265265
}
266266

267-
static String valueToQuotedStringN(const char* value, unsigned length) {
267+
static String valueToQuotedStringN(const char* value, unsigned length,
268+
bool emitUTF8 = false) {
268269
if (value == nullptr)
269270
return "";
270271

@@ -310,21 +311,31 @@ static String valueToQuotedStringN(const char* value, unsigned length) {
310311
// Should add a flag to allow this compatibility mode and prevent this
311312
// sequence from occurring.
312313
default: {
313-
unsigned int cp = utf8ToCodepoint(c, end);
314-
// don't escape non-control characters
315-
// (short escape sequence are applied above)
316-
if (cp < 0x80 && cp >= 0x20)
317-
result += static_cast<char>(cp);
318-
else if (cp < 0x10000) { // codepoint is in Basic Multilingual Plane
319-
result += "\\u";
320-
result += toHex16Bit(cp);
321-
} else { // codepoint is not in Basic Multilingual Plane
322-
// convert to surrogate pair first
323-
cp -= 0x10000;
324-
result += "\\u";
325-
result += toHex16Bit((cp >> 10) + 0xD800);
326-
result += "\\u";
327-
result += toHex16Bit((cp & 0x3FF) + 0xDC00);
314+
if (emitUTF8) {
315+
result += *c;
316+
} else {
317+
unsigned int codepoint = utf8ToCodepoint(c, end);
318+
const unsigned int FIRST_NON_CONTROL_CODEPOINT = 0x20;
319+
const unsigned int LAST_NON_CONTROL_CODEPOINT = 0x7F;
320+
const unsigned int FIRST_SURROGATE_PAIR_CODEPOINT = 0x10000;
321+
// don't escape non-control characters
322+
// (short escape sequence are applied above)
323+
if (FIRST_NON_CONTROL_CODEPOINT <= codepoint &&
324+
codepoint <= LAST_NON_CONTROL_CODEPOINT) {
325+
result += static_cast<char>(codepoint);
326+
} else if (codepoint <
327+
FIRST_SURROGATE_PAIR_CODEPOINT) { // codepoint is in Basic
328+
// Multilingual Plane
329+
result += "\\u";
330+
result += toHex16Bit(codepoint);
331+
} else { // codepoint is not in Basic Multilingual Plane
332+
// convert to surrogate pair first
333+
codepoint -= FIRST_SURROGATE_PAIR_CODEPOINT;
334+
result += "\\u";
335+
result += toHex16Bit((codepoint >> 10) + 0xD800);
336+
result += "\\u";
337+
result += toHex16Bit((codepoint & 0x3FF) + 0xDC00);
338+
}
328339
}
329340
} break;
330341
}
@@ -864,7 +875,8 @@ struct BuiltStyledStreamWriter : public StreamWriter {
864875
BuiltStyledStreamWriter(String indentation, CommentStyle::Enum cs,
865876
String colonSymbol, String nullSymbol,
866877
String endingLineFeedSymbol, bool useSpecialFloats,
867-
unsigned int precision, PrecisionType precisionType);
878+
bool emitUTF8, unsigned int precision,
879+
PrecisionType precisionType);
868880
int write(Value const& root, OStream* sout) override;
869881

870882
private:
@@ -893,19 +905,20 @@ struct BuiltStyledStreamWriter : public StreamWriter {
893905
bool addChildValues_ : 1;
894906
bool indented_ : 1;
895907
bool useSpecialFloats_ : 1;
908+
bool emitUTF8_ : 1;
896909
unsigned int precision_;
897910
PrecisionType precisionType_;
898911
};
899912
BuiltStyledStreamWriter::BuiltStyledStreamWriter(
900913
String indentation, CommentStyle::Enum cs, String colonSymbol,
901914
String nullSymbol, String endingLineFeedSymbol, bool useSpecialFloats,
902-
unsigned int precision, PrecisionType precisionType)
915+
bool emitUTF8, unsigned int precision, PrecisionType precisionType)
903916
: rightMargin_(74), indentation_(std::move(indentation)), cs_(cs),
904917
colonSymbol_(std::move(colonSymbol)), nullSymbol_(std::move(nullSymbol)),
905918
endingLineFeedSymbol_(std::move(endingLineFeedSymbol)),
906919
addChildValues_(false), indented_(false),
907-
useSpecialFloats_(useSpecialFloats), precision_(precision),
908-
precisionType_(precisionType) {}
920+
useSpecialFloats_(useSpecialFloats), emitUTF8_(emitUTF8),
921+
precision_(precision), precisionType_(precisionType) {}
909922
int BuiltStyledStreamWriter::write(Value const& root, OStream* sout) {
910923
sout_ = sout;
911924
addChildValues_ = false;
@@ -942,7 +955,8 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
942955
char const* end;
943956
bool ok = value.getString(&str, &end);
944957
if (ok)
945-
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str)));
958+
pushValue(valueToQuotedStringN(str, static_cast<unsigned>(end - str),
959+
emitUTF8_));
946960
else
947961
pushValue("");
948962
break;
@@ -966,7 +980,7 @@ void BuiltStyledStreamWriter::writeValue(Value const& value) {
966980
Value const& childValue = value[name];
967981
writeCommentBeforeValue(childValue);
968982
writeWithIndent(valueToQuotedStringN(
969-
name.data(), static_cast<unsigned>(name.length())));
983+
name.data(), static_cast<unsigned>(name.length()), emitUTF8_));
970984
*sout_ << colonSymbol_;
971985
writeValue(childValue);
972986
if (++it == members.end()) {
@@ -1142,12 +1156,13 @@ StreamWriter::Factory::~Factory() = default;
11421156
StreamWriterBuilder::StreamWriterBuilder() { setDefaults(&settings_); }
11431157
StreamWriterBuilder::~StreamWriterBuilder() = default;
11441158
StreamWriter* StreamWriterBuilder::newStreamWriter() const {
1145-
String indentation = settings_["indentation"].asString();
1146-
String cs_str = settings_["commentStyle"].asString();
1147-
String pt_str = settings_["precisionType"].asString();
1148-
bool eyc = settings_["enableYAMLCompatibility"].asBool();
1149-
bool dnp = settings_["dropNullPlaceholders"].asBool();
1150-
bool usf = settings_["useSpecialFloats"].asBool();
1159+
const String indentation = settings_["indentation"].asString();
1160+
const String cs_str = settings_["commentStyle"].asString();
1161+
const String pt_str = settings_["precisionType"].asString();
1162+
const bool eyc = settings_["enableYAMLCompatibility"].asBool();
1163+
const bool dnp = settings_["dropNullPlaceholders"].asBool();
1164+
const bool usf = settings_["useSpecialFloats"].asBool();
1165+
const bool emitUTF8 = settings_["emitUTF8"].asBool();
11511166
unsigned int pre = settings_["precision"].asUInt();
11521167
CommentStyle::Enum cs = CommentStyle::All;
11531168
if (cs_str == "All") {
@@ -1179,7 +1194,7 @@ StreamWriter* StreamWriterBuilder::newStreamWriter() const {
11791194
pre = 17;
11801195
String endingLineFeedSymbol;
11811196
return new BuiltStyledStreamWriter(indentation, cs, colonSymbol, nullSymbol,
1182-
endingLineFeedSymbol, usf, pre,
1197+
endingLineFeedSymbol, usf, emitUTF8, pre,
11831198
precisionType);
11841199
}
11851200
static void getValidWriterKeys(std::set<String>* valid_keys) {
@@ -1189,6 +1204,7 @@ static void getValidWriterKeys(std::set<String>* valid_keys) {
11891204
valid_keys->insert("enableYAMLCompatibility");
11901205
valid_keys->insert("dropNullPlaceholders");
11911206
valid_keys->insert("useSpecialFloats");
1207+
valid_keys->insert("emitUTF8");
11921208
valid_keys->insert("precision");
11931209
valid_keys->insert("precisionType");
11941210
}
@@ -1220,6 +1236,7 @@ void StreamWriterBuilder::setDefaults(Json::Value* settings) {
12201236
(*settings)["enableYAMLCompatibility"] = false;
12211237
(*settings)["dropNullPlaceholders"] = false;
12221238
(*settings)["useSpecialFloats"] = false;
1239+
(*settings)["emitUTF8"] = false;
12231240
(*settings)["precision"] = 17;
12241241
(*settings)["precisionType"] = "significant";
12251242
//! [StreamWriterBuilderDefaults]

src/test_lib_json/main.cpp

+29
Original file line numberDiff line numberDiff line change
@@ -2481,6 +2481,35 @@ JSONTEST_FIXTURE_LOCAL(StreamWriterTest, writeZeroes) {
24812481
}
24822482
}
24832483

2484+
JSONTEST_FIXTURE_LOCAL(StreamWriterTest, unicode) {
2485+
// Create a Json value containing UTF-8 string with some chars that need
2486+
// escape (tab,newline).
2487+
Json::Value root;
2488+
root["test"] = "\t\n\xF0\x91\xA2\xA1\x3D\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7";
2489+
2490+
Json::StreamWriterBuilder b;
2491+
2492+
// Default settings - should be unicode escaped.
2493+
JSONTEST_ASSERT(Json::writeString(b, root) ==
2494+
"{\n\t\"test\" : "
2495+
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
2496+
2497+
b.settings_["emitUTF8"] = true;
2498+
2499+
// Should not be unicode escaped.
2500+
JSONTEST_ASSERT(
2501+
Json::writeString(b, root) ==
2502+
"{\n\t\"test\" : "
2503+
"\"\\t\\n\xF0\x91\xA2\xA1=\xC4\xB3\xF0\x9B\x84\x9B\xEF\xBD\xA7\"\n}");
2504+
2505+
b.settings_["emitUTF8"] = false;
2506+
2507+
// Should be unicode escaped.
2508+
JSONTEST_ASSERT(Json::writeString(b, root) ==
2509+
"{\n\t\"test\" : "
2510+
"\"\\t\\n\\ud806\\udca1=\\u0133\\ud82c\\udd1b\\uff67\"\n}");
2511+
}
2512+
24842513
struct ReaderTest : JsonTest::TestCase {};
24852514

24862515
JSONTEST_FIXTURE_LOCAL(ReaderTest, parseWithNoErrors) {

0 commit comments

Comments
 (0)