Skip to content

Commit a9ee8e4

Browse files
authored
Create an EncodingConverter class with both iconv and icu support. (llvm#138893)
This patch adds a wrapper class called EncodingConverter for ConverterEBCDIC. This class is then extended to support the ICU library or iconv library. The ICU library currently takes priority over the iconv library.

Relevant RFCs:
https://discourse.llvm.org/t/rfc-adding-a-charset-converter-to-the-llvm-support-library/69795
https://discourse.llvm.org/t/rfc-enabling-fexec-charset-support-to-llvm-and-clang-reposting/71512

Stacked PR to enable fexec-charset that depends on this: llvm#138895

See old PR for review and commit history: llvm#74516
1 parent cbcfe66 commit a9ee8e4

File tree

9 files changed

+853
-2
lines changed

9 files changed

+853
-2
lines changed

llvm/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,10 @@ else()
592592
option(LLVM_ENABLE_THREADS "Use threads if available." ON)
593593
endif()
594594

595+
# Tri-state switch for ICU-backed text encoding conversion; defaults to OFF.
# ON uses ICU only when it can be found, while FORCE_ON turns a failed lookup
# into a configuration error. ICU and iconv must not both be FORCE_ON.
set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
596+
597+
# Tri-state switch for iconv-backed text encoding conversion; defaults to OFF.
# iconv is only probed when ICU was not selected, and only a built-in iconv is
# accepted. FORCE_ON turns a failed lookup into a configuration error.
set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
598+
595599
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
596600

597601
set(LLVM_ENABLE_ZSTD "ON" CACHE STRING "Use zstd for compression/decompression if available. Can be ON, OFF, or FORCE_ON")

llvm/cmake/config-ix.cmake

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,41 @@ if(LLVM_HAS_LOGF128)
294294
set(LLVM_HAS_LOGF128 "${HAS_LOGF128}")
295295
endif()
296296

297+
# Only one conversion backend can be mandatory: reject a configuration that
# forces both ICU and iconv.
if(LLVM_ENABLE_ICU STREQUAL FORCE_ON)
  if(LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
    message(FATAL_ERROR "LLVM_ENABLE_ICU and LLVM_ENABLE_ICONV should not both be FORCE_ON")
  endif()
endif()
300+
301+
# Check for ICU. Only allow an optional, dynamic link for ICU so we don't impact LLVM's licensing.
302+
# Probe for ICU unless iconv has been made mandatory.
#
# Result: HAVE_ICU mirrors ICU_FOUND after the lookup. When LLVM_ENABLE_ICU is
# FORCE_ON, a failed lookup aborts configuration with a dedicated message.
if(LLVM_ENABLE_ICU AND NOT ("${LLVM_ENABLE_ICONV}" STREQUAL "FORCE_ON"))
  # Temporarily narrow the library search to the shared-library suffix so that
  # only a dynamic ICU can be picked up; the saved list is restored below.
  set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
  set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")

  # A single, non-REQUIRED lookup serves both ON and FORCE_ON. With REQUIRED,
  # find_package() would abort by itself and the tailored diagnostic below
  # could never be reached.
  find_package(ICU COMPONENTS uc i18n)

  set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})

  if("${LLVM_ENABLE_ICU}" STREQUAL "FORCE_ON" AND NOT ICU_FOUND)
    message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
  endif()

  set(HAVE_ICU ${ICU_FOUND})
endif()
316+
317+
# Check only for builtin iconv to avoid licensing issues.
318+
# Probe for iconv only when it is enabled and ICU was not selected.
# HAVE_ICONV is set to 1 solely for an iconv that is both found and built in.
if(LLVM_ENABLE_ICONV AND NOT HAVE_ICU)
  if(NOT LLVM_ENABLE_ICONV STREQUAL FORCE_ON)
    # Optional lookup: a miss simply leaves HAVE_ICONV unset.
    find_package(Iconv)
  else()
    # Mandatory lookup: a found but non-built-in iconv is rejected as well.
    find_package(Iconv REQUIRED)
    if(NOT (Iconv_FOUND AND Iconv_IS_BUILT_IN))
      message(FATAL_ERROR "Failed to configure iconv, but LLVM_ENABLE_ICONV is FORCE_ON")
    endif()
  endif()

  if(Iconv_FOUND AND Iconv_IS_BUILT_IN)
    set(HAVE_ICONV 1)
  endif()
endif()
331+
297332
# function checks
298333
check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
299334
find_package(Backtrace)

llvm/include/llvm/Config/config.h.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,12 @@
236236
/* Have host's ___chkstk_ms */
237237
#cmakedefine HAVE____CHKSTK_MS ${HAVE____CHKSTK_MS}
238238

239+
/* Define to 1 if the ICU library is available, and to 0 otherwise */
240+
#cmakedefine01 HAVE_ICU
241+
242+
/* Define to 1 if the iconv library is available, and to 0 otherwise */
243+
#cmakedefine01 HAVE_ICONV
244+
239245
/* Linker version detected at compile time. */
240246
#cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}"
241247

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
//===-- TextEncoding.h - Text encoding conversion class -----------*- C++ -*-=//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// \file
10+
/// This file provides a utility class to convert between different character
11+
/// set encodings.
12+
///
13+
//===----------------------------------------------------------------------===//
14+
15+
#ifndef LLVM_SUPPORT_TEXT_ENCODING_H
16+
#define LLVM_SUPPORT_TEXT_ENCODING_H
17+
18+
#include "llvm/ADT/SmallString.h"
19+
#include "llvm/ADT/StringRef.h"
20+
#include "llvm/Config/config.h"
21+
#include "llvm/Support/ErrorOr.h"
22+
23+
#include <string>
24+
#include <system_error>
25+
26+
namespace llvm {
27+
28+
template <typename T> class SmallVectorImpl;
29+
30+
namespace details {
31+
class TextEncodingConverterImplBase {
32+
33+
private:
34+
/// Converts a string.
35+
/// \param[in] Source source string
36+
/// \param[out] Result container for converted string
37+
/// \return error code in case something went wrong
38+
///
39+
/// The following error codes can occur, among others:
40+
/// - std::errc::argument_list_too_long: The result requires more than
41+
/// std::numeric_limits<size_t>::max() bytes.
42+
/// - std::errc::illegal_byte_sequence: The input contains an invalid
43+
/// multibyte sequence.
44+
/// - std::errc::invalid_argument: The input contains an incomplete
45+
/// multibyte sequence.
46+
///
47+
/// If the destination encoding is stateful, the shift state will be set
48+
/// to the initial state.
49+
///
50+
/// In case of an error, the result string contains the successfully converted
51+
/// part of the input string.
52+
///
53+
virtual std::error_code convertString(StringRef Source,
54+
SmallVectorImpl<char> &Result) = 0;
55+
56+
/// Resets the converter to the initial state.
57+
virtual void reset() = 0;
58+
59+
public:
60+
virtual ~TextEncodingConverterImplBase() = default;
61+
62+
/// Converts a string and resets the converter to the initial state.
63+
std::error_code convert(StringRef Source, SmallVectorImpl<char> &Result) {
64+
auto EC = convertString(Source, Result);
65+
reset();
66+
return EC;
67+
}
68+
};
69+
} // namespace details
70+
71+
// Names inspired by https://wg21.link/p1885.
72+
enum class TextEncoding {
73+
/// UTF-8 character set encoding.
74+
UTF8,
75+
76+
/// IBM EBCDIC 1047 character set encoding.
77+
IBM1047
78+
};
79+
80+
/// Utility class to convert between different character encodings.
81+
class TextEncodingConverter {
82+
std::unique_ptr<details::TextEncodingConverterImplBase> Converter;
83+
84+
TextEncodingConverter(
85+
std::unique_ptr<details::TextEncodingConverterImplBase> Converter)
86+
: Converter(std::move(Converter)) {}
87+
88+
public:
89+
/// Creates a TextEncodingConverter instance.
90+
/// Returns std::errc::invalid_argument in case the requested conversion is
91+
/// not supported.
92+
/// \param[in] From the source character encoding
93+
/// \param[in] To the target character encoding
94+
/// \return a TextEncodingConverter instance or an error code
95+
static ErrorOr<TextEncodingConverter> create(TextEncoding From,
96+
TextEncoding To);
97+
98+
/// Creates a TextEncodingConverter instance.
99+
/// Returns std::errc::invalid_argument in case the requested conversion is
100+
/// not supported.
101+
/// \param[in] From name of the source character encoding
102+
/// \param[in] To name of the target character encoding
103+
/// \return a TextEncodingConverter instance or an error code
104+
static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To);
105+
106+
TextEncodingConverter(const TextEncodingConverter &) = delete;
107+
TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
108+
109+
TextEncodingConverter(TextEncodingConverter &&Other)
110+
: Converter(std::move(Other.Converter)) {}
111+
112+
TextEncodingConverter &operator=(TextEncodingConverter &&Other) {
113+
if (this != &Other)
114+
Converter = std::move(Other.Converter);
115+
return *this;
116+
}
117+
118+
~TextEncodingConverter() = default;
119+
120+
/// Converts a string.
121+
/// \param[in] Source source string
122+
/// \param[out] Result container for converted string
123+
/// \return error code in case something went wrong
124+
std::error_code convert(StringRef Source,
125+
SmallVectorImpl<char> &Result) const {
126+
return Converter->convert(Source, Result);
127+
}
128+
129+
ErrorOr<std::string> convert(StringRef Source) const {
130+
SmallString<100> Result;
131+
auto EC = Converter->convert(Source, Result);
132+
if (!EC)
133+
return std::string(Result);
134+
return EC;
135+
}
136+
};
137+
138+
} // namespace llvm
139+
140+
#endif

llvm/lib/Support/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ add_llvm_component_library(LLVMSupport
257257
SuffixTree.cpp
258258
SystemUtils.cpp
259259
TarWriter.cpp
260+
TextEncoding.cpp
260261
ThreadPool.cpp
261262
TimeProfiler.cpp
262263
Timer.cpp
@@ -316,6 +317,14 @@ add_llvm_component_library(LLVMSupport
316317
Demangle
317318
)
318319

320+
# ICU is an external library: when the configure step found it, link it into
# LLVMSupport privately.
if(ICU_FOUND)
  target_link_libraries(LLVMSupport PRIVATE ${ICU_LIBRARIES})
endif()
327+
319328
set(llvm_system_libs ${system_libs})
320329

321330
# This block is only needed for llvm-config. When we deprecate llvm-config and

0 commit comments

Comments
 (0)