Skip to content

Commit e768cb0

Browse files
authored
CSV - Guess Encoding, Handle Null-string Escape (#1717)
* CSV - Guess Encoding, Handle Null-string Escape This is in response to issue #1647 (detect CSV character encoding). First, my tests with mb_detect_encoding indicate that it doesn't work well enough; regardless, users can always do that on their own if they deem it useful. Rolling my own is also troublesome, but I can at least: a. Check for BOM (UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE). b. Do some heuristic tests for each of the above encodings. c. Fall back to a user-specified encoding (default CP1252) if a and b don't yield a result. I think this is probably useful enough to include, and relatively easy to expand if other potential encodings should be considered. Starting with PHP 7.4, fgetcsv allows an empty (null) string to be specified as the escape character. This is a much better choice than the PHP (and PhpSpreadsheet) default of backslash in that it handles the file in the same manner as Excel does. There is one statement in Reader/CSV which would be adversely affected if the caller so specified (building a regular expression under the assumption that the escape character is a single character). Fix that statement appropriately and add tests.
1 parent 607d347 commit e768cb0

15 files changed

+170
-8
lines changed

docs/topics/reading-and-writing-to-file.md

+18
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,24 @@ $reader->setSheetIndex(0);
458458

459459
$spreadsheet = $reader->load("sample.csv");
460460
```
461+
You may also let PhpSpreadsheet attempt to guess the input encoding.
462+
It will do so based on a test for BOM (UTF-8, UTF-16BE, UTF-16LE, UTF-32BE,
463+
or UTF-32LE),
464+
or by doing heuristic tests for those encodings, falling back to a
465+
caller-specified encoding (default is CP1252) if all of those tests fail.
466+
467+
```php
468+
$reader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();
469+
$encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding('sample.csv');
470+
// or, e.g. $encoding = \PhpOffice\PhpSpreadsheet\Reader\Csv::guessEncoding(
471+
// 'sample.csv', 'ISO-8859-2');
472+
$reader->setInputEncoding($encoding);
473+
$reader->setDelimiter(';');
474+
$reader->setEnclosure('');
475+
$reader->setSheetIndex(0);
476+
477+
$spreadsheet = $reader->load('sample.csv');
478+
```
461479

462480
#### Read a specific worksheet
463481

src/PhpSpreadsheet/Reader/Csv.php

+80-8
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,21 @@
99

1010
class Csv extends BaseReader
1111
{
12+
const UTF8_BOM = "\xEF\xBB\xBF";
13+
const UTF8_BOM_LEN = 3;
14+
const UTF16BE_BOM = "\xfe\xff";
15+
const UTF16BE_BOM_LEN = 2;
16+
const UTF16BE_LF = "\x00\x0a";
17+
const UTF16LE_BOM = "\xff\xfe";
18+
const UTF16LE_BOM_LEN = 2;
19+
const UTF16LE_LF = "\x0a\x00";
20+
const UTF32BE_BOM = "\x00\x00\xfe\xff";
21+
const UTF32BE_BOM_LEN = 4;
22+
const UTF32BE_LF = "\x00\x00\x00\x0a";
23+
const UTF32LE_BOM = "\xff\xfe\x00\x00";
24+
const UTF32LE_BOM_LEN = 4;
25+
const UTF32LE_LF = "\x0a\x00\x00\x00";
26+
1227
/**
1328
* Input encoding.
1429
*
@@ -90,12 +105,8 @@ protected function skipBOM(): void
90105
{
91106
rewind($this->fileHandle);
92107

93-
switch ($this->inputEncoding) {
94-
case 'UTF-8':
95-
fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ?
96-
fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0);
97-
98-
break;
108+
if (fgets($this->fileHandle, self::UTF8_BOM_LEN + 1) !== self::UTF8_BOM) {
109+
rewind($this->fileHandle);
99110
}
100111
}
101112

@@ -213,7 +224,9 @@ function ($sum, $value) use ($median) {
213224
private function getNextLine()
214225
{
215226
$line = '';
216-
$enclosure = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')' . preg_quote($this->enclosure, '/');
227+
$enclosure = ($this->escapeCharacter === '' ? ''
228+
: ('(?<!' . preg_quote($this->escapeCharacter, '/') . ')'))
229+
. preg_quote($this->enclosure, '/');
217230

218231
do {
219232
// Get the next line in the file
@@ -307,7 +320,7 @@ private function openFileOrMemory($pFilename): void
307320
$this->fileHandle = fopen('php://memory', 'r+b');
308321
$data = StringHelper::convertEncoding($entireFile, 'UTF-8', $this->inputEncoding);
309322
fwrite($this->fileHandle, $data);
310-
rewind($this->fileHandle);
323+
$this->skipBOM();
311324
}
312325
}
313326

@@ -531,4 +544,63 @@ public function canRead($pFilename)
531544

532545
return in_array($type, $supportedTypes, true);
533546
}
547+
548+
private static function guessEncodingTestNoBom(string &$encoding, string &$contents, string $compare, string $setEncoding): void
549+
{
550+
if ($encoding === '') {
551+
$pos = strpos($contents, $compare);
552+
if ($pos !== false && $pos % strlen($compare) === 0) {
553+
$encoding = $setEncoding;
554+
}
555+
}
556+
}
557+
558+
private static function guessEncodingNoBom(string $filename): string
559+
{
560+
$encoding = '';
561+
$contents = file_get_contents($filename);
562+
self::guessEncodingTestNoBom($encoding, $contents, self::UTF32BE_LF, 'UTF-32BE');
563+
self::guessEncodingTestNoBom($encoding, $contents, self::UTF32LE_LF, 'UTF-32LE');
564+
self::guessEncodingTestNoBom($encoding, $contents, self::UTF16BE_LF, 'UTF-16BE');
565+
self::guessEncodingTestNoBom($encoding, $contents, self::UTF16LE_LF, 'UTF-16LE');
566+
if ($encoding === '' && preg_match('//u', $contents) === 1) {
567+
$encoding = 'UTF-8';
568+
}
569+
570+
return $encoding;
571+
}
572+
573+
private static function guessEncodingTestBom(string &$encoding, string $first4, string $compare, string $setEncoding): void
574+
{
575+
if ($encoding === '') {
576+
if ($compare === substr($first4, 0, strlen($compare))) {
577+
$encoding = $setEncoding;
578+
}
579+
}
580+
}
581+
582+
private static function guessEncodingBom(string $filename): string
583+
{
584+
$encoding = '';
585+
$first4 = file_get_contents($filename, false, null, 0, 4);
586+
if ($first4 !== false) {
587+
self::guessEncodingTestBom($encoding, $first4, self::UTF8_BOM, 'UTF-8');
588+
self::guessEncodingTestBom($encoding, $first4, self::UTF16BE_BOM, 'UTF-16BE');
589+
self::guessEncodingTestBom($encoding, $first4, self::UTF32BE_BOM, 'UTF-32BE');
590+
self::guessEncodingTestBom($encoding, $first4, self::UTF32LE_BOM, 'UTF-32LE');
591+
self::guessEncodingTestBom($encoding, $first4, self::UTF16LE_BOM, 'UTF-16LE');
592+
}
593+
594+
return $encoding;
595+
}
596+
597+
public static function guessEncoding(string $filename, string $dflt = 'CP1252'): string
598+
{
599+
$encoding = self::guessEncodingBom($filename);
600+
if ($encoding === '') {
601+
$encoding = self::guessEncodingNoBom($filename);
602+
}
603+
604+
return ($encoding === '') ? $dflt : $encoding;
605+
}
534606
}

tests/PhpSpreadsheetTests/Reader/CsvTest.php

+62
Original file line numberDiff line numberDiff line change
@@ -275,4 +275,66 @@ public function testReadNonexistentFileName(): void
275275
$reader = new Csv();
276276
$reader->load('tests/data/Reader/CSV/encoding.utf8.csvxxx');
277277
}
278+
279+
/**
280+
* @dataProvider providerEscapes
281+
*/
282+
public function testInferSeparator(string $escape, string $delimiter): void
283+
{
284+
$reader = new Csv();
285+
$reader->setEscapeCharacter($escape);
286+
$filename = 'tests/data/Reader/CSV/escape.csv';
287+
$reader->listWorksheetInfo($filename);
288+
self::assertEquals($delimiter, $reader->getDelimiter());
289+
}
290+
291+
public function providerEscapes()
292+
{
293+
return [
294+
['\\', ';'],
295+
["\x0", ','],
296+
[(version_compare(PHP_VERSION, '7.4') < 0) ? "\x0" : '', ','],
297+
];
298+
}
299+
300+
/**
301+
* @dataProvider providerGuessEncoding
302+
*/
303+
public function testGuessEncoding(string $filename): void
304+
{
305+
$reader = new Csv();
306+
$reader->setInputEncoding(Csv::guessEncoding($filename));
307+
$spreadsheet = $reader->load($filename);
308+
$sheet = $spreadsheet->getActiveSheet();
309+
self::assertEquals('première', $sheet->getCell('A1')->getValue());
310+
self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
311+
}
312+
313+
public function providerGuessEncoding()
314+
{
315+
return [
316+
['tests/data/Reader/CSV/premiere.utf8.csv'],
317+
['tests/data/Reader/CSV/premiere.utf8bom.csv'],
318+
['tests/data/Reader/CSV/premiere.utf16be.csv'],
319+
['tests/data/Reader/CSV/premiere.utf16bebom.csv'],
320+
['tests/data/Reader/CSV/premiere.utf16le.csv'],
321+
['tests/data/Reader/CSV/premiere.utf16lebom.csv'],
322+
['tests/data/Reader/CSV/premiere.utf32be.csv'],
323+
['tests/data/Reader/CSV/premiere.utf32bebom.csv'],
324+
['tests/data/Reader/CSV/premiere.utf32le.csv'],
325+
['tests/data/Reader/CSV/premiere.utf32lebom.csv'],
326+
['tests/data/Reader/CSV/premiere.win1252.csv'],
327+
];
328+
}
329+
330+
public function testGuessEncodingDefltIso2(): void
331+
{
332+
$filename = 'tests/data/Reader/CSV/premiere.win1252.csv';
333+
$reader = new Csv();
334+
$reader->setInputEncoding(Csv::guessEncoding($filename, 'ISO-8859-2'));
335+
$spreadsheet = $reader->load($filename);
336+
$sheet = $spreadsheet->getActiveSheet();
337+
self::assertEquals('premičre', $sheet->getCell('A1')->getValue());
338+
self::assertEquals('sixičme', $sheet->getCell('C2')->getValue());
339+
}
278340
}

tests/data/Reader/CSV/escape.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
2+
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\",d
3+
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
4+
a\"hello;hello;hello;\",b\"hello;hello;hello;\",c\"\hello;hello;hello;\"
112 Bytes
Binary file not shown.
114 Bytes
Binary file not shown.
112 Bytes
Binary file not shown.
114 Bytes
Binary file not shown.
224 Bytes
Binary file not shown.
228 Bytes
Binary file not shown.
224 Bytes
Binary file not shown.
228 Bytes
Binary file not shown.
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
première,second,troisième
2+
Quatrième,cinquième,sixième
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
première,second,troisième
2+
Quatrième,cinquième,sixième
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
première,second,troisième
2+
Quatrième,cinquième,sixième

0 commit comments

Comments
 (0)