166
166
// as ghc::filesystem::string_type.
167
167
// #define GHC_WIN_WSTRING_STRING_TYPE
168
168
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
169
+ // Raise errors/exceptions when invalid unicode codepoints or UTF-8 sequences are found,
170
+ // instead of replacing them with the unicode replacement character (U+FFFD).
171
+ // #define GHC_RAISE_UNICODE_ERRORS
172
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
173
+
169
174
// ghc::filesystem version in decimal (major * 10000 + minor * 100 + patch)
170
175
#define GHC_FILESYSTEM_VERSION 10199L
171
176
@@ -1209,7 +1214,11 @@ GHC_INLINE void appendUTF8(std::string& str, uint32_t unicode)
1209
1214
str.push_back (static_cast <char >((unicode & 0x3f ) + 128 ));
1210
1215
}
1211
1216
else {
1217
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1218
+ throw filesystem_error (" Illegal code point for unicode character." , str, std::make_error_code (std::errc::illegal_byte_sequence));
1219
+ #else
1212
1220
appendUTF8 (str, 0xfffd );
1221
+ #endif
1213
1222
}
1214
1223
}
1215
1224
@@ -1228,6 +1237,22 @@ GHC_INLINE unsigned consumeUtf8Fragment(const unsigned state, const uint8_t frag
1228
1237
return state == S_RJCT ? static_cast <unsigned >(S_RJCT) : static_cast <unsigned >((utf8_state_info[category + 16 ] >> (state << 2 )) & 0xf );
1229
1238
}
1230
1239
1240
+ GHC_INLINE bool validUtf8 (const std::string& utf8String)
1241
+ {
1242
+ std::string::const_iterator iter = utf8String.begin ();
1243
+ unsigned utf8_state = S_STRT;
1244
+ std::uint32_t codepoint = 0 ;
1245
+ while (iter < utf8String.end ()) {
1246
+ if ((utf8_state = consumeUtf8Fragment (utf8_state, (uint8_t )*iter++, codepoint)) == S_RJCT) {
1247
+ return false ;
1248
+ }
1249
+ }
1250
+ if (utf8_state) {
1251
+ return false ;
1252
+ }
1253
+ return true ;
1254
+ }
1255
+
1231
1256
} // namespace detail
1232
1257
1233
1258
#endif
@@ -1261,13 +1286,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
1261
1286
codepoint = 0 ;
1262
1287
}
1263
1288
else if (utf8_state == S_RJCT) {
1289
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1290
+ throw filesystem_error (" Illegal byte sequence for unicode character." , utf8String, std::make_error_code (std::errc::illegal_byte_sequence));
1291
+ #else
1264
1292
result += (typename StringType::value_type)0xfffd ;
1265
1293
utf8_state = S_STRT;
1266
1294
codepoint = 0 ;
1295
+ #endif
1267
1296
}
1268
1297
}
1269
1298
if (utf8_state) {
1299
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1300
+ throw filesystem_error (" Illegal byte sequence for unicode character." , utf8String, std::make_error_code (std::errc::illegal_byte_sequence));
1301
+ #else
1270
1302
result += (typename StringType::value_type)0xfffd ;
1303
+ #endif
1271
1304
}
1272
1305
return result;
1273
1306
}
@@ -1286,13 +1319,21 @@ inline StringType fromUtf8(const std::string& utf8String, const typename StringT
1286
1319
codepoint = 0 ;
1287
1320
}
1288
1321
else if (utf8_state == S_RJCT) {
1322
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1323
+ throw filesystem_error (" Illegal byte sequence for unicode character." , utf8String, std::make_error_code (std::errc::illegal_byte_sequence));
1324
+ #else
1289
1325
result += (typename StringType::value_type)0xfffd ;
1290
1326
utf8_state = S_STRT;
1291
1327
codepoint = 0 ;
1328
+ #endif
1292
1329
}
1293
1330
}
1294
1331
if (utf8_state) {
1332
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1333
+ throw filesystem_error (" Illegal byte sequence for unicode character." , utf8String, std::make_error_code (std::errc::illegal_byte_sequence));
1334
+ #else
1295
1335
result += (typename StringType::value_type)0xfffd ;
1336
+ #endif
1296
1337
}
1297
1338
return result;
1298
1339
}
@@ -1315,10 +1356,14 @@ inline std::string toUtf8(const std::basic_string<charT, traits, Alloc>& unicode
1315
1356
appendUTF8 (result, (char32_t (c) << 10 ) + *iter - 0x35fdc00 );
1316
1357
}
1317
1358
else {
1359
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1360
+ throw filesystem_error (" Illegal code point for unicode character." , result, std::make_error_code (std::errc::illegal_byte_sequence));
1361
+ #else
1318
1362
appendUTF8 (result, 0xfffd );
1319
1363
if (iter == unicodeString.end ()) {
1320
1364
break ;
1321
1365
}
1366
+ #endif
1322
1367
}
1323
1368
}
1324
1369
else {
@@ -1359,6 +1404,13 @@ GHC_INLINE bool startsWith(const std::string& what, const std::string& with)
1359
1404
1360
1405
GHC_INLINE void path::postprocess_path_with_format (path::impl_string_type& p, path::format fmt)
1361
1406
{
1407
+ #ifdef GHC_RAISE_UNICODE_ERRORS
1408
+ if (!detail::validUtf8 (p)) {
1409
+ path t;
1410
+ t._path = p;
1411
+ throw filesystem_error (" Illegal byte sequence for unicode character." , t, std::make_error_code (std::errc::illegal_byte_sequence));
1412
+ }
1413
+ #endif
1362
1414
switch (fmt) {
1363
1415
#ifndef GHC_OS_WINDOWS
1364
1416
case path::auto_format:
@@ -4658,10 +4710,20 @@ class directory_iterator::impl
4658
4710
do {
4659
4711
if (FindNextFileW (_dirHandle, &_findData)) {
4660
4712
_current = _base;
4661
- _current.append_name (detail::toUtf8 (_findData.cFileName ).c_str ());
4713
+ try {
4714
+ _current.append_name (detail::toUtf8 (_findData.cFileName ).c_str ());
4715
+ }
4716
+ catch (filesystem_error& fe) {
4717
+ ec = fe.code ();
4718
+ return ;
4719
+ }
4662
4720
copyToDirEntry (ec);
4663
4721
}
4664
4722
else {
4723
+ auto err = ::GetLastError ();
4724
+ if (err != ERROR_NO_MORE_FILES) {
4725
+ _ec = ec = std::error_code (err, std::system_category ());
4726
+ }
4665
4727
FindClose (_dirHandle);
4666
4728
_dirHandle = INVALID_HANDLE_VALUE;
4667
4729
_current = filesystem::path ();
0 commit comments