Skip to content

Commit 98dad7f

Browse files
committed
refs #18, unicode error raise can now be enabled with define GHC_RAISE_UNICODE_ERRORS
1 parent 2969bad commit 98dad7f

File tree

2 files changed

+75
-3
lines changed

2 files changed

+75
-3
lines changed

include/ghc/filesystem.hpp

+63-1
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,11 @@
166166
// as ghc::filesystem::string_type.
167167
// #define GHC_WIN_WSTRING_STRING_TYPE
168168
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
169+
// Raise errors/exceptions when invalid unicode codepoints or UTF-8 sequences are found,
170+
// instead of replacing them with the unicode replacement character (U+FFFD).
171+
// #define GHC_RAISE_UNICODE_ERRORS
172+
//- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
173+
169174
// ghc::filesystem version in decimal (major * 10000 + minor * 100 + patch)
170175
#define GHC_FILESYSTEM_VERSION 10199L
171176

@@ -1209,7 +1214,11 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
12091214
str.push_back(static_cast<char>((unicode & 0x3f) + 128));
12101215
}
12111216
else {
1217+
#ifdef GHC_RAISE_UNICODE_ERRORS
1218+
throw filesystem_error("Illegal code point for unicode character.", str, std::make_error_code(std::errc::illegal_byte_sequence));
1219+
#else
12121220
appendUTF8(str, 0xfffd);
1221+
#endif
12131222
}
12141223
}
12151224

@@ -1228,6 +1237,22 @@ GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t frag
12281237
return state == S_RJCT ? static_cast<unsigned>(S_RJCT) : static_cast<unsigned>((utf8_state_info[category + 16] >> (state << 2)) & 0xf);
12291238
}
12301239

1240+
GHC_INLINE bool validUtf8(const std::string& utf8String)  // Checks utf8String byte by byte; returns false on a rejected or unfinished sequence, true otherwise.
1241+
{
1242+
std::string::const_iterator iter = utf8String.begin();
1243+
unsigned utf8_state = S_STRT;  // decoding starts in the S_STRT state
1244+
std::uint32_t codepoint = 0;  // handed to consumeUtf8Fragment on every step; its value is never read in this function
1245+
while (iter < utf8String.end()) {
1246+
if ((utf8_state = consumeUtf8Fragment(utf8_state, (uint8_t)*iter++, codepoint)) == S_RJCT) {  // feed the next byte (as uint8_t) and advance; S_RJCT ends the scan at once
1247+
return false;
1248+
}
1249+
}
1250+
if (utf8_state) {  // non-zero state after the last byte: presumably a truncated multi-byte sequence (assumes S_STRT == 0 -- TODO confirm)
1251+
return false;
1252+
}
1253+
return true;  // every byte was consumed without rejection and the final state is zero
1254+
}
1255+
12311256
} // namespace detail
12321257

12331258
#endif
@@ -1261,13 +1286,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
12611286
codepoint = 0;
12621287
}
12631288
else if (utf8_state == S_RJCT) {
1289+
#ifdef GHC_RAISE_UNICODE_ERRORS
1290+
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
1291+
#else
12641292
result += (typename StringType::value_type)0xfffd;
12651293
utf8_state = S_STRT;
12661294
codepoint = 0;
1295+
#endif
12671296
}
12681297
}
12691298
if (utf8_state) {
1299+
#ifdef GHC_RAISE_UNICODE_ERRORS
1300+
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
1301+
#else
12701302
result += (typename StringType::value_type)0xfffd;
1303+
#endif
12711304
}
12721305
return result;
12731306
}
@@ -1286,13 +1319,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
12861319
codepoint = 0;
12871320
}
12881321
else if (utf8_state == S_RJCT) {
1322+
#ifdef GHC_RAISE_UNICODE_ERRORS
1323+
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
1324+
#else
12891325
result += (typename StringType::value_type)0xfffd;
12901326
utf8_state = S_STRT;
12911327
codepoint = 0;
1328+
#endif
12921329
}
12931330
}
12941331
if (utf8_state) {
1332+
#ifdef GHC_RAISE_UNICODE_ERRORS
1333+
throw filesystem_error("Illegal byte sequence for unicode character.", utf8String, std::make_error_code(std::errc::illegal_byte_sequence));
1334+
#else
12951335
result += (typename StringType::value_type)0xfffd;
1336+
#endif
12961337
}
12971338
return result;
12981339
}
@@ -1315,10 +1356,14 @@ inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicode
13151356
appendUTF8(result, (char32_t(c) << 10) + *iter - 0x35fdc00);
13161357
}
13171358
else {
1359+
#ifdef GHC_RAISE_UNICODE_ERRORS
1360+
throw filesystem_error("Illegal code point for unicode character.", result, std::make_error_code(std::errc::illegal_byte_sequence));
1361+
#else
13181362
appendUTF8(result, 0xfffd);
13191363
if(iter == unicodeString.end()) {
13201364
break;
13211365
}
1366+
#endif
13221367
}
13231368
}
13241369
else {
@@ -1359,6 +1404,13 @@ GHC_INLINE bool startsWith(const std::string& what, const std::string& with)
13591404

13601405
GHC_INLINE void path::postprocess_path_with_format(path::impl_string_type& p, path::format fmt)
13611406
{
1407+
#ifdef GHC_RAISE_UNICODE_ERRORS
1408+
if(!detail::validUtf8(p)) {
1409+
path t;
1410+
t._path = p;
1411+
throw filesystem_error("Illegal byte sequence for unicode character.", t, std::make_error_code(std::errc::illegal_byte_sequence));
1412+
}
1413+
#endif
13621414
switch (fmt) {
13631415
#ifndef GHC_OS_WINDOWS
13641416
case path::auto_format:
@@ -4658,10 +4710,20 @@ class directory_iterator::impl
46584710
do {
46594711
if (FindNextFileW(_dirHandle, &_findData)) {
46604712
_current = _base;
4661-
_current.append_name(detail::toUtf8(_findData.cFileName).c_str());
4713+
try {
4714+
_current.append_name(detail::toUtf8(_findData.cFileName).c_str());
4715+
}
4716+
catch(filesystem_error& fe) {
4717+
ec = fe.code();
4718+
return;
4719+
}
46624720
copyToDirEntry(ec);
46634721
}
46644722
else {
4723+
auto err = ::GetLastError();
4724+
if(err != ERROR_NO_MORE_FILES) {
4725+
_ec = ec = std::error_code(err, std::system_category());
4726+
}
46654727
FindClose(_dirHandle);
46664728
_dirHandle = INVALID_HANDLE_VALUE;
46674729
_current = filesystem::path();

test/filesystem_test.cpp

+12-2
Original file line numberDiff line numberDiff line change
@@ -320,18 +320,28 @@ TEST_CASE("fs::detail::fromUtf8", "[filesystem][fs.detail.utf8]")
320320
CHECK(fs::detail::toUtf8(std::wstring(L"foobar")) == "foobar");
321321
CHECK(fs::detail::toUtf8(std::wstring(L"föobar")).length() == 7);
322322
CHECK(fs::detail::toUtf8(std::wstring(L"föobar")) == u8"föobar");
323-
323+
324+
#ifdef GHC_RAISE_UNICODE_ERRORS
325+
CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")), fs::filesystem_error);
326+
CHECK_THROWS_AS(fs::detail::fromUtf8<std::u16string>(std::string("\xc3")), fs::filesystem_error);
327+
#else
324328
CHECK(std::u16string(2,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xed\xa0\x80")));
325329
CHECK(std::u16string(1,0xfffd) == fs::detail::fromUtf8<std::u16string>(std::string("\xc3")));
330+
#endif
326331
}
327332

328333
TEST_CASE("fs::detail::toUtf8", "[filesystem][fs.detail.utf8]")
329334
{
335+
std::string t;
330336
CHECK(std::string("\xc3\xa4/\xe2\x82\xac\xf0\x9d\x84\x9e") == fs::detail::toUtf8(std::u16string(u"\u00E4/\u20AC\U0001D11E")));
337+
#ifdef GHC_RAISE_UNICODE_ERRORS
338+
CHECK_THROWS_AS(fs::detail::toUtf8(std::u16string(1, 0xd800)), fs::filesystem_error);
339+
CHECK_THROWS_AS(fs::detail::appendUTF8(t, 0x200000), fs::filesystem_error);
340+
#else
331341
CHECK(std::string("\xEF\xBF\xBD") == fs::detail::toUtf8(std::u16string(1, 0xd800)));
332-
std::string t;
333342
fs::detail::appendUTF8(t, 0x200000);
334343
CHECK(std::string("\xEF\xBF\xBD") == t);
344+
#endif
335345
}
336346
#endif
337347

0 commit comments

Comments
 (0)