Html Reader Non-UTF8 Charsets

oleibman · oleibman · commit 09c9a310a03f · 2024-05-06T16:43:23.000-07:00
Fix PHPOffice#3995. Fix PHPOffice#866. Fix PHPOffice#1681. PHP's DOM `loadHTML` defaults to character set ISO-8859-1, but our data is UTF-8. So the Html Reader alters its HTML so that `loadHTML` will not misinterpret characters outside the ASCII range. This works for UTF-8, but breaks other charsets. However, `loadHTML` uses the correct non-default charset when the charset is specified in a meta tag, or when the HTML starts with a BOM. So, it is sufficient for us to alter the non-ASCII characters only when (a) the data does not start with a BOM, and (b) there is no charset tag. This will allow us to use: - UTF-8 files or snippets without BOM, with or without a charset declaration; - UTF-8 files with BOM (charset should not be specified and will be ignored if it is); - UTF-16 files with BOM (charset should not be specified and will be ignored if it is); - all charsets which are ASCII-compatible for 0x00-0x7f when the charset is declared. This applies to ASCII itself, many Windows and Mac charsets, all of ISO-8859, and most CJK and other-language-specific charsets. We cannot use: - UTF-16BE or UTF-16LE declared in a meta tag; - UTF-32, with or without a BOM (browser recommendation is to not support UTF-32, and most browsers do not support it); - unknown (to `loadHTML`) or non-ASCII-compatible charsets (EBCDIC?). I will note that the way I detect the `charset` attribute is imperfect (e.g. it might find it in text rather than in a meta tag). I think we'd need to write a browser to get it perfect. Anyhow, it is about the same as XmlScanner's attempt to find the `encoding` attribute, and, if it's good enough there, it ought to be good enough here.
diff --git a/src/PhpSpreadsheet/Reader/Html.php b/src/PhpSpreadsheet/Reader/Html.php
@@ -30,6 +30,10 @@ class Html extends BaseReader
      */
     const TEST_SAMPLE_SIZE = 2048;
 
+    /**
+     * Matches data that begins with a byte order mark:
+     * FE FF (UTF-16 big-endian), FF FE (UTF-16 little-endian) or EF BB BF (UTF-8).
+     */
+    private const STARTS_WITH_BOM = '/^(?:\xfe\xff|\xff\xfe|\xEF\xBB\xBF)/';
+
+    /**
+     * Matches a "charset=" declaration anywhere in the data, case-insensitively.
+     *
+     * A word boundary is used rather than a literal leading space so that
+     * forms such as content="text/html;charset=..." or a tab/newline before
+     * "charset" are recognised as well. The check is deliberately simple and
+     * may also match the word in ordinary text rather than in a meta tag.
+     */
+    private const DECLARES_CHARSET = '/\bcharset=/i';
+
     /**
      * Input encoding.
      */
@@ -144,6 +148,9 @@ public function canRead(string $filename): bool
         }
 
         $beginning = $this->readBeginning();
+        if (preg_match(self::STARTS_WITH_BOM, $beginning)) {
+            return true;
+        }
         $startWithTag = self::startsWithTag($beginning);
         $containsTags = self::containsTags($beginning);
         $endsWithTag = self::endsWithTag($this->readEnding());
@@ -638,12 +645,7 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp
         // Reload the HTML file into the DOM object
         try {
             $convert = $this->getSecurityScannerOrThrow()->scanFile($filename);
-            $lowend = "\u{80}";
-            $highend = "\u{10ffff}";
-            $regexp = "/[$lowend-$highend]/u";
-            /** @var callable $callback */
-            $callback = [self::class, 'replaceNonAscii'];
-            $convert = preg_replace_callback($regexp, $callback, $convert);
+            $convert = $this->replaceNonAsciiIfNeeded($convert);
             $loaded = ($convert === null) ? false : $dom->loadHTML($convert);
         } catch (Throwable $e) {
             $loaded = false;
@@ -736,6 +738,20 @@ private static function replaceNonAscii(array $matches): string
         return '&#' . mb_ord($matches[0], 'UTF-8') . ';';
     }
 
+    /**
+     * Replace every character outside the ASCII range with a numeric
+     * character reference, unless the data starts with a BOM or contains
+     * a charset declaration, in which case it is returned unchanged.
+     *
+     * @return ?string the resulting data; null when the replacement fails
+     */
+    private function replaceNonAsciiIfNeeded(string $convert): ?string
+    {
+        $startsWithBom = preg_match(self::STARTS_WITH_BOM, $convert) === 1;
+        if ($startsWithBom || preg_match(self::DECLARES_CHARSET, $convert) === 1) {
+            // The data announces its own encoding, so leave it untouched.
+            return $convert;
+        }
+
+        /** @var callable $entityEncoder */
+        $entityEncoder = [self::class, 'replaceNonAscii'];
+
+        return preg_replace_callback("/[\u{80}-\u{10ffff}]/u", $entityEncoder, $convert);
+    }
+
     /**
      * Spreadsheet from content.
      */
@@ -747,12 +763,7 @@ public function loadFromString(string $content, ?Spreadsheet $spreadsheet = null
         //    Reload the HTML file into the DOM object
         try {
             $convert = $this->getSecurityScannerOrThrow()->scan($content);
-            $lowend = "\u{80}";
-            $highend = "\u{10ffff}";
-            $regexp = "/[$lowend-$highend]/u";
-            /** @var callable $callback */
-            $callback = [self::class, 'replaceNonAscii'];
-            $convert = preg_replace_callback($regexp, $callback, $convert);
+            $convert = $this->replaceNonAsciiIfNeeded($convert);
             $loaded = ($convert === null) ? false : $dom->loadHTML($convert);
         } catch (Throwable $e) {
             $loaded = false;
diff --git a/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php b/tests/PhpSpreadsheetTests/Reader/Html/HtmlCharsetTest.php
@@ -0,0 +1,45 @@
+<?php
+
+declare(strict_types=1);
+
+namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
+
+use PhpOffice\PhpSpreadsheet\Reader\Exception as ReaderException;
+use PhpOffice\PhpSpreadsheet\Reader\Html;
+use PHPUnit\Framework\TestCase;
+
+/**
+ * Checks that the Html reader yields the expected text in cell A1 for sample
+ * files in several character sets, with and without a BOM or charset declaration.
+ */
+class HtmlCharsetTest extends TestCase
+{
+    /**
+     * Load one sample file and compare cell A1 with the expected text.
+     *
+     * An expected result of 'exception' means that loading must fail with a
+     * ReaderException whose message contains 'Failed to load'.
+     *
+     * @dataProvider providerCharset
+     */
+    public function testCharset(string $filename, string $expectedResult): void
+    {
+        if ($expectedResult === 'exception') {
+            $this->expectException(ReaderException::class);
+            $this->expectExceptionMessage('Failed to load');
+        }
+        // The sample files are stored under "HTML" (upper case); the path must
+        // match that case exactly or the test fails on case-sensitive filesystems.
+        $directory = 'tests/data/Reader/HTML';
+        $reader = new Html();
+        $spreadsheet = $reader->load("$directory/$filename");
+        $sheet = $spreadsheet->getActiveSheet();
+        self::assertSame($expectedResult, $sheet->getCell('A1')->getValue());
+        $spreadsheet->disconnectWorksheets();
+    }
+
+    /**
+     * Data sets of [sample file name, expected value of A1 or 'exception'].
+     */
+    public static function providerCharset(): array
+    {
+        return [
+            ['charset.ISO-8859-1.html', 'À1'],
+            ['charset.ISO-8859-1.html4.html', 'À1'],
+            ['charset.ISO-8859-2.html', 'Ŕ1'],
+            ['charset.nocharset.html', 'À1'],
+            ['charset.UTF-8.html', 'À1'],
+            ['charset.UTF-8.bom.html', 'À1'],
+            ['charset.UTF-16.bebom.html', 'À1'],
+            ['charset.UTF-16.lebom.html', 'À1'],
+            ['charset.gb18030.html', '电视机'],
+            ['charset.unknown.html', 'exception'],
+        ];
+    }
+}
diff --git a/tests/data/Reader/HTML/charset.ISO-8859-1.html b/tests/data/Reader/HTML/charset.ISO-8859-1.html
@@ -0,0 +1,25 @@
+    <!DOCTYPE html>
+    <html lang='en'>
+    <head>
+    <meta charset='ISO-8859-1'>
+    <title>ISO-8859-1</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>�1</td>
+            <td>B1</td>
+            <td>�1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>�2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>�2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>
diff --git a/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html b/tests/data/Reader/HTML/charset.ISO-8859-1.html4.html
@@ -0,0 +1,25 @@
+    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+    <html lang='en'>
+    <head>
+    <meta http-equiv="Content-Type" content="text/html; CHARSET=ISO-8859-1">
+    <title>ISO-8859-1 Html4 Doctype and Meta</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>�1</td>
+            <td>B1</td>
+            <td>�1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>�2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>�2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>
diff --git a/tests/data/Reader/HTML/charset.ISO-8859-2.html b/tests/data/Reader/HTML/charset.ISO-8859-2.html
@@ -0,0 +1,25 @@
+    <!DOCTYPE html>
+    <html lang='en'>
+    <head>
+    <meta charset='ISO-8859-2'>
+    <title>ISO-8859-2</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>�1</td>
+            <td>B1</td>
+            <td>�1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>�2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>�2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>
diff --git a/tests/data/Reader/HTML/charset.UTF-16.bebom.html b/tests/data/Reader/HTML/charset.UTF-16.bebom.html
diff --git a/tests/data/Reader/HTML/charset.UTF-16.lebom.html b/tests/data/Reader/HTML/charset.UTF-16.lebom.html
diff --git a/tests/data/Reader/HTML/charset.UTF-8.bom.html b/tests/data/Reader/HTML/charset.UTF-8.bom.html
@@ -0,0 +1,24 @@
+﻿    <!DOCTYPE html>
+    <html lang='en'>
+    <head>
+    <title>UTF-8</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>À1</td>
+            <td>B1</td>
+            <td>ç1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>Ã2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>Ð2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>
diff --git a/tests/data/Reader/HTML/charset.UTF-8.html b/tests/data/Reader/HTML/charset.UTF-8.html
@@ -0,0 +1,25 @@
+    <!DOCTYPE html>
+    <html lang='en'>
+    <head>
+    <meta charset='UTF-8'>
+    <title>UTF-8</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>À1</td>
+            <td>B1</td>
+            <td>ç1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>Ã2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>Ð2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>
diff --git a/tests/data/Reader/HTML/charset.gb18030.html b/tests/data/Reader/HTML/charset.gb18030.html
@@ -0,0 +1,15 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=gb18030">
+<title>gb18030</title>
+</head>
+<body>
+    <table>
+        <tbody>
+        <tr>
+            <td>���ӻ�</td>
+        </tr>
+        </tbody>
+    </table>
+</body>
+</html>
diff --git a/tests/data/Reader/HTML/charset.nocharset.html b/tests/data/Reader/HTML/charset.nocharset.html
@@ -0,0 +1,16 @@
+<table>
+    <tbody>
+    <tr>
+        <td>À1</td>
+        <td>B1</td>
+        <td>ç1</td>
+        <td>D1</td>
+    </tr>
+    <tr>
+        <td>Ã2</td>
+        <td>B2</td>
+        <td>C2</td>
+        <td>Ð2</td>
+    </tr>
+    </tbody>
+</table>
diff --git a/tests/data/Reader/HTML/charset.unknown.html b/tests/data/Reader/HTML/charset.unknown.html
@@ -0,0 +1,25 @@
+    <!DOCTYPE html>
+    <html lang='en'>
+    <head>
+    <meta charset='unknown'>
+    <title>UTF-8</title>
+    </head>
+    <body>
+    <table>
+        <tbody>
+        <tr>
+            <td>À1</td>
+            <td>B1</td>
+            <td>ç1</td>
+            <td>D1</td>
+        </tr>
+        <tr>
+            <td>Ã2</td>
+            <td>B2</td>
+            <td>C2</td>
+            <td>Ð2</td>
+        </tr>
+        </tbody>
+    </table>
+    </body>
+    </html>