Skip to content

Commit 096e193

Browse files
committed
Improved Performance for toArray
There are severe inefficiencies in `toArray` (or, to be more precise, `rangeToArray`, which toArray calls). For small ranges, the inefficiencies hardly matter, and, for large ranges, the code was likely to exhaust memory before the execution speed mattered. But, with PR PHPOffice#3834 now merged into the codebase, the memory problem is much lessened, and there is now merit in looking into performance. The code currently accesses each cell in the range. This PR changes it to access only cells which have been initialized - fewer cells processed, and many fewer function calls for those that are. For ranges which are densely filled, there will be little difference. However, for ranges which are lightly filled (a very common occurrence in issues which are raised for this project), the effect will be staggering. The sample file which accompanied 3834 saw an enormous reduction in memory consumption from 20GB to 32MB when toArray was used on it. The same file still took well over an hour to process even after the merge; after this PR, it is processed in approximately a second. While testing this change, it became apparent that Collection/Cells does not handle column XFD (highest allowed) correctly. That is corrected and new tests added.
1 parent 9f87d7c commit 096e193

File tree

3 files changed

+135
-22
lines changed

3 files changed

+135
-22
lines changed

src/PhpSpreadsheet/Collection/Cells.php

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,18 @@ public function getSortedCoordinates(): array
144144
return array_keys($this->index);
145145
}
146146

147+
/**
148+
* Get a sorted list of all cell coordinates currently held in the collection by index (16384*row+column).
149+
*
150+
* @return int[]
151+
*/
152+
public function getSortedCoordinatesInt(): array
153+
{
154+
asort($this->index);
155+
156+
return array_values($this->index);
157+
}
158+
147159
/**
148160
* Return the cell coordinate of the currently active cell object.
149161
*
@@ -188,9 +200,9 @@ public function getHighestRowAndColumn(): array
188200
// Lookup highest column and highest row
189201
$maxRow = $maxColumn = 1;
190202
foreach ($this->index as $coordinate) {
191-
$row = (int) floor($coordinate / self::MAX_COLUMN_ID) + 1;
203+
$row = (int) floor(($coordinate - 1) / self::MAX_COLUMN_ID) + 1;
192204
$maxRow = ($maxRow > $row) ? $maxRow : $row;
193-
$column = $coordinate % self::MAX_COLUMN_ID;
205+
$column = ($coordinate % self::MAX_COLUMN_ID) ?: self::MAX_COLUMN_ID;
194206
$maxColumn = ($maxColumn > $column) ? $maxColumn : $column;
195207
}
196208

@@ -226,7 +238,7 @@ public function getHighestColumn($row = null)
226238
if ($coordinate < $fromRow || $coordinate >= $toRow) {
227239
continue;
228240
}
229-
$column = $coordinate % self::MAX_COLUMN_ID;
241+
$column = ($coordinate % self::MAX_COLUMN_ID) ?: self::MAX_COLUMN_ID;
230242
$maxColumn = $maxColumn > $column ? $maxColumn : $column;
231243
}
232244

src/PhpSpreadsheet/Worksheet/Worksheet.php

Lines changed: 41 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3203,6 +3203,9 @@ protected function cellToArray(Cell $cell, bool $calculateFormulas, bool $format
32033203
return $returnValue;
32043204
}
32053205

3206+
/** @var array<string, bool> */
3207+
private array $hiddenColumns;
3208+
32063209
/**
32073210
* Create array from a range of cells.
32083211
*
@@ -3231,33 +3234,50 @@ public function rangeToArray(
32313234
$minRow = $rangeStart[1];
32323235
$maxCol = Coordinate::stringFromColumnIndex($rangeEnd[0]);
32333236
$maxRow = $rangeEnd[1];
3237+
$minColInt = $rangeStart[0];
3238+
$maxColInt = $rangeEnd[0];
32343239

32353240
++$maxCol;
32363241
$nullRow = $this->buildNullRow($nullValue, $minCol, $maxCol, $returnCellRef, $ignoreHidden);
3242+
$hideColumns = !empty($this->hiddenColumns);
32373243

3244+
$keys = $this->cellCollection?->getSortedCoordinatesInt() ?? [];
3245+
$keyIndex = 0;
3246+
$keysCount = count($keys);
32383247
// Loop through rows
3239-
$r = -1;
32403248
for ($row = $minRow; $row <= $maxRow; ++$row) {
3241-
if (($ignoreHidden === true) && ($this->getRowDimension($row)->getVisible() === false)) {
3249+
if (($ignoreHidden === true) && ($this->isRowVisible($row) === false)) {
32423250
continue;
32433251
}
3244-
$rowRef = $returnCellRef ? $row : ++$r;
3252+
$rowRef = $returnCellRef ? $row : ($row - $minRow);
32453253
$returnValue[$rowRef] = $nullRow;
3246-
$c = -1;
3247-
// Loop through columns in the current row
3248-
for ($col = $minCol; $col !== $maxCol; ++$col) {
3249-
if (($ignoreHidden === true) && ($this->getColumnDimension($col)->getVisible() === false)) {
3250-
continue;
3251-
}
3252-
$columnRef = $returnCellRef ? $col : ++$c;
3253-
// Using getCell() will create a new cell if it doesn't already exist. We don't want that to happen
3254-
// so we test and retrieve directly against cellCollection
3255-
$cell = $this->getCellCollection()->get("{$col}{$row}");
3256-
if ($cell !== null) {
3257-
$returnValue[$rowRef][$columnRef] = $this->cellToArray($cell, $calculateFormulas, $formatData, $nullValue);
3254+
3255+
$index = ($row - 1) * AddressRange::MAX_COLUMN_INT + 1;
3256+
$indexPlus = $index + AddressRange::MAX_COLUMN_INT - 1;
3257+
while ($keyIndex < $keysCount && $keys[$keyIndex] < $index) {
3258+
++$keyIndex;
3259+
}
3260+
while ($keyIndex < $keysCount && $keys[$keyIndex] <= $indexPlus) {
3261+
$key = $keys[$keyIndex];
3262+
$thisRow = intdiv($key - 1, AddressRange::MAX_COLUMN_INT) + 1;
3263+
$thisCol = ($key % AddressRange::MAX_COLUMN_INT) ?: AddressRange::MAX_COLUMN_INT;
3264+
if ($thisCol >= $minColInt && $thisCol <= $maxColInt) {
3265+
$col = Coordinate::stringFromColumnIndex($thisCol);
3266+
if ($hideColumns === false || !isset($this->hiddenColumns[$col])) {
3267+
$columnRef = $returnCellRef ? $col : ($thisCol - $minColInt);
3268+
$cell = $this->cellCollection?->get("{$col}{$thisRow}");
3269+
if ($cell !== null) {
3270+
$value = $this->cellToArray($cell, $calculateFormulas, $formatData, $nullValue);
3271+
if ($value !== $nullValue) {
3272+
$returnValue[$rowRef][$columnRef] = $value;
3273+
}
3274+
}
3275+
}
32583276
}
3277+
++$keyIndex;
32593278
}
32603279
}
3280+
unset($this->hiddenColumns);
32613281

32623282
// Return
32633283
return $returnValue;
@@ -3281,14 +3301,16 @@ private function buildNullRow(
32813301
bool $returnCellRef,
32823302
bool $ignoreHidden,
32833303
): array {
3304+
$this->hiddenColumns = [];
32843305
$nullRow = [];
32853306
$c = -1;
32863307
for ($col = $minCol; $col !== $maxCol; ++$col) {
3287-
if (($ignoreHidden === true) && ($this->getColumnDimension($col)->getVisible() === false)) {
3288-
continue;
3308+
if ($ignoreHidden === true && $this->columnDimensionExists($col) && $this->getColumnDimension($col)->getVisible() === false) {
3309+
$this->hiddenColumns[$col] = true;
3310+
} else {
3311+
$columnRef = $returnCellRef ? $col : ++$c;
3312+
$nullRow[$columnRef] = $nullValue;
32893313
}
3290-
$columnRef = $returnCellRef ? $col : ++$c;
3291-
$nullRow[$columnRef] = $nullValue;
32923314
}
32933315

32943316
return $nullRow;
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetTests\Worksheet;
6+
7+
use PhpOffice\PhpSpreadsheet\Spreadsheet;
8+
use PHPUnit\Framework\TestCase;
9+
10+
class ToArrayTest extends TestCase
11+
{
12+
public static function testToArray(): void
13+
{
14+
$spreadsheet = new Spreadsheet();
15+
$sheet = $spreadsheet->getActiveSheet();
16+
$inputArray = [
17+
['a', 'b', null, 'd', 'e'],
18+
[],
19+
[null, null, null, null, 'f'],
20+
[],
21+
[1, 0, null, 4],
22+
];
23+
24+
$sheet->fromArray($inputArray, null, 'A1', true);
25+
26+
$expected1 = [
27+
['a', 'b', null, 'd', 'e'],
28+
[null, null, null, null, null],
29+
[null, null, null, null, 'f'],
30+
[null, null, null, null, null],
31+
[1, 0, null, 4, null],
32+
];
33+
$result1 = $sheet->toArray(null, false, false, false, false);
34+
self::assertSame($expected1, $result1);
35+
36+
$expected2 = [
37+
1 => ['A' => 'a', 'B' => 'b', 'C' => null, 'D' => 'd', 'E' => 'e'],
38+
2 => ['A' => null, 'B' => null, 'C' => null, 'D' => null, 'E' => null],
39+
3 => ['A' => null, 'B' => null, 'C' => null, 'D' => null, 'E' => 'f'],
40+
4 => ['A' => null, 'B' => null, 'C' => null, 'D' => null, 'E' => null],
41+
5 => ['A' => 1, 'B' => 0, 'C' => null, 'D' => 4, 'E' => null],
42+
];
43+
$result2 = $sheet->toArray(null, false, false, true, false);
44+
self::assertSame($expected2, $result2);
45+
46+
$expected3 = [
47+
[null, null, null, null],
48+
[null, null, null, 'f'],
49+
[null, null, null, null],
50+
[0, null, 4, null],
51+
];
52+
$result3 = $sheet->rangeToArray('B2:E5', null, false, false, false, false);
53+
self::assertSame($expected3, $result3);
54+
55+
$expected4 = [
56+
2 => ['B' => null, 'C' => null, 'D' => null, 'E' => null],
57+
3 => ['B' => null, 'C' => null, 'D' => null, 'E' => 'f'],
58+
4 => ['B' => null, 'C' => null, 'D' => null, 'E' => null],
59+
5 => ['B' => 0, 'C' => null, 'D' => 4, 'E' => null],
60+
];
61+
$result4 = $sheet->rangeToArray('B2:E5', null, false, false, true, false);
62+
self::assertSame($expected4, $result4);
63+
64+
$spreadsheet->disconnectWorksheets();
65+
}
66+
67+
public static function testMaxCol(): void
68+
{
69+
$spreadsheet = new Spreadsheet();
70+
$sheet = $spreadsheet->getActiveSheet();
71+
$sheet->getCell('A1')->setValue('start');
72+
$sheet->getCell('XFD1')->setValue('end');
73+
$array = $sheet->toArray(null, false, false, false, false);
74+
self::assertCount(1, $array);
75+
self::assertCount(16384, $array[0]);
76+
self::assertSame('start', $array[0][0]);
77+
self::assertSame('end', $array[0][16383]);
78+
}
79+
}

0 commit comments

Comments
 (0)