Skip to content

Commit 09c9a31

Browse files
committed
Html Reader Non-UTF8 Charsets
Fix PHPOffice#3995. Fix PHPOffice#866. Fix PHPOffice#1681.

PHP's DOM `loadHTML` defaults to character set ISO-8859-1, but our data is UTF-8. So the Html Reader alters its HTML so that `loadHTML` will not misinterpret characters outside the ASCII range. This works for UTF-8, but breaks other charsets. However, `loadHTML` uses the correct non-default charset when the charset is specified in a meta tag, or when the HTML starts with a BOM. So it is sufficient for us to alter the non-ASCII characters only when (a) the data does not start with a BOM, and (b) there is no charset tag.

This will allow us to use:
- UTF-8 files or snippets without BOM, with or without charset
- UTF-8 files with BOM (charset should not be specified and will be ignored if it is)
- UTF-16 files with BOM (charset should not be specified and will be ignored if it is)
- all charsets which are ASCII-compatible for 0x00-0x7f when the charset is declared. This applies to ASCII itself, many Windows and Mac charsets, all of ISO-8859, and most CJK and other language-specific charsets.

We cannot use:
- UTF-16BE or UTF-16LE declared in a meta tag
- UTF-32, with or without a BOM (the browser recommendation is to not support UTF-32, and most browsers do not support it)
- charsets that are unknown to `loadHTML` or are not ASCII-compatible (EBCDIC?)

I will note that the way I detect the `charset` attribute is imperfect (e.g. it might find it in text rather than in a meta tag). I think we'd need to write a browser to get it perfect. Anyhow, it is about the same as XmlScanner's attempt to find the `encoding` attribute, and, if it's good enough there, it ought to be good enough here.
1 parent c56a583 commit 09c9a31

12 files changed

+248
-12
lines changed

src/PhpSpreadsheet/Reader/Html.php

+23-12
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ class Html extends BaseReader
3030
*/
3131
const TEST_SAMPLE_SIZE = 2048;
3232

33+
/**
 * Matches a byte-order mark at the very start of the data:
 * FE FF (UTF-16 big-endian), FF FE (UTF-16 little-endian) or EF BB BF (UTF-8).
 */
private const STARTS_WITH_BOM = '/^(?:\xfe\xff|\xff\xfe|\xEF\xBB\xBF)/';

/**
 * Heuristic for "this markup declares its own charset".
 *
 * A word boundary is used rather than a literal leading space so that the
 * common `content="text/html;charset=..."` form (no space after the semicolon)
 * and a tab or newline before the attribute are recognised as well; optional
 * whitespace is allowed before the equals sign. The match is deliberately
 * loose and can still be fooled by the word appearing in ordinary text.
 */
private const DECLARES_CHARSET = '/\bcharset\s*=/i';
36+
3337
/**
3438
* Input encoding.
3539
*/
@@ -144,6 +148,9 @@ public function canRead(string $filename): bool
144148
}
145149

146150
$beginning = $this->readBeginning();
151+
if (preg_match(self::STARTS_WITH_BOM, $beginning)) {
152+
return true;
153+
}
147154
$startWithTag = self::startsWithTag($beginning);
148155
$containsTags = self::containsTags($beginning);
149156
$endsWithTag = self::endsWithTag($this->readEnding());
@@ -638,12 +645,7 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp
638645
// Reload the HTML file into the DOM object
639646
try {
640647
$convert = $this->getSecurityScannerOrThrow()->scanFile($filename);
641-
$lowend = "\u{80}";
642-
$highend = "\u{10ffff}";
643-
$regexp = "/[$lowend-$highend]/u";
644-
/** @var callable $callback */
645-
$callback = [self::class, 'replaceNonAscii'];
646-
$convert = preg_replace_callback($regexp, $callback, $convert);
648+
$convert = $this->replaceNonAsciiIfNeeded($convert);
647649
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
648650
} catch (Throwable $e) {
649651
$loaded = false;
@@ -736,6 +738,20 @@ private static function replaceNonAscii(array $matches): string
736738
return '&#' . mb_ord($matches[0], 'UTF-8') . ';';
737739
}
738740

741+
/**
 * Prepare markup for DOMDocument::loadHTML by turning every character
 * outside the ASCII range into a numeric character reference, unless the
 * markup already tells the parser which encoding to use.
 *
 * The markup is returned untouched when it starts with a byte-order mark
 * or appears to declare a charset; otherwise each character from U+0080
 * upwards is passed through self::replaceNonAscii().
 *
 * @param string $convert markup that is about to be handed to loadHTML
 *
 * @return null|string the markup to load, or null when preg_replace_callback
 *                     reports a failure (callers treat null as "not loaded")
 */
private function replaceNonAsciiIfNeeded(string $convert): ?string
{
    // Same test order as before: the BOM check runs first and short-circuits.
    $encodingIsKnown = preg_match(self::STARTS_WITH_BOM, $convert) === 1
        || preg_match(self::DECLARES_CHARSET, $convert) === 1;
    if ($encodingIsKnown) {
        return $convert;
    }

    // Every code point from U+0080 through U+10FFFF, matched in UTF-8 mode.
    $nonAsciiPattern = "/[\u{80}-\u{10ffff}]/u";
    /** @var callable $toEntity */
    $toEntity = [self::class, 'replaceNonAscii'];

    return preg_replace_callback($nonAsciiPattern, $toEntity, $convert);
}
754+
739755
/**
740756
* Spreadsheet from content.
741757
*/
@@ -747,12 +763,7 @@ public function loadFromString(string $content, ?Spreadsheet $spreadsheet = null
747763
// Reload the HTML file into the DOM object
748764
try {
749765
$convert = $this->getSecurityScannerOrThrow()->scan($content);
750-
$lowend = "\u{80}";
751-
$highend = "\u{10ffff}";
752-
$regexp = "/[$lowend-$highend]/u";
753-
/** @var callable $callback */
754-
$callback = [self::class, 'replaceNonAscii'];
755-
$convert = preg_replace_callback($regexp, $callback, $convert);
766+
$convert = $this->replaceNonAsciiIfNeeded($convert);
756767
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
757768
} catch (Throwable $e) {
758769
$loaded = false;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
6+
7+
use PhpOffice\PhpSpreadsheet\Reader\Exception as ReaderException;
8+
use PhpOffice\PhpSpreadsheet\Reader\Html;
9+
use PHPUnit\Framework\TestCase;
10+
11+
class HtmlCharsetTest extends TestCase
12+
{
13+
/**
14+
* @dataProvider providerCharset
15+
*/
16+
public function testCharset(string $filename, string $expectedResult): void
17+
{
18+
if ($expectedResult === 'exception') {
19+
$this->expectException(ReaderException::class);
20+
$this->expectExceptionMessage('Failed to load');
21+
}
22+
$directory = 'tests/data/Reader/Html';
23+
$reader = new Html();
24+
$spreadsheet = $reader->load("$directory/$filename");
25+
$sheet = $spreadsheet->getActiveSheet();
26+
self::assertSame($expectedResult, $sheet->getCell('A1')->getValue());
27+
$spreadsheet->disconnectWorksheets();
28+
}
29+
30+
public static function providerCharset(): array
31+
{
32+
return [
33+
['charset.ISO-8859-1.html', 'À1'],
34+
['charset.ISO-8859-1.html4.html', 'À1'],
35+
['charset.ISO-8859-2.html', 'Ŕ1'],
36+
['charset.nocharset.html', 'À1'],
37+
['charset.UTF-8.html', 'À1'],
38+
['charset.UTF-8.bom.html', 'À1'],
39+
['charset.UTF-16.bebom.html', 'À1'],
40+
['charset.UTF-16.lebom.html', 'À1'],
41+
['charset.gb18030.html', '电视机'],
42+
['charset.unknown.html', 'exception'],
43+
];
44+
}
45+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='ISO-8859-1'>
5+
<title>ISO-8859-1</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
2+
<html lang='en'>
3+
<head>
4+
<meta http-equiv="Content-Type" content="text/html; CHARSET=ISO-8859-1">
5+
<title>ISO-8859-1 Html4 Doctype and Meta</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='ISO-8859-2'>
5+
<title>ISO-8859-2</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
860 Bytes
Binary file not shown.
860 Bytes
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
 <!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<title>UTF-8</title>
5+
</head>
6+
<body>
7+
<table>
8+
<tbody>
9+
<tr>
10+
<td>À1</td>
11+
<td>B1</td>
12+
<td>ç1</td>
13+
<td>D1</td>
14+
</tr>
15+
<tr>
16+
<td>Ã2</td>
17+
<td>B2</td>
18+
<td>C2</td>
19+
<td>Ð2</td>
20+
</tr>
21+
</tbody>
22+
</table>
23+
</body>
24+
</html>
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='UTF-8'>
5+
<title>UTF-8</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<html>
2+
<head>
3+
<meta http-equiv="Content-Type" content="text/html; charset=gb18030">
4+
<title>gb18030</title>
5+
</head>
6+
<body>
7+
<table>
8+
<tbody>
9+
<tr>
10+
<td>µçÊÓ»ú</td>
11+
</tr>
12+
</tbody>
13+
</table>
14+
</body>
15+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<table>
2+
<tbody>
3+
<tr>
4+
<td>À1</td>
5+
<td>B1</td>
6+
<td>ç1</td>
7+
<td>D1</td>
8+
</tr>
9+
<tr>
10+
<td>Ã2</td>
11+
<td>B2</td>
12+
<td>C2</td>
13+
<td>Ð2</td>
14+
</tr>
15+
</tbody>
16+
</table>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='unknown'>
5+
<title>UTF-8</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>

0 commit comments

Comments
 (0)