Csv.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. <?php
  2. namespace PhpOffice\PhpSpreadsheet\Reader;
  3. use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
  4. use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
  5. use PhpOffice\PhpSpreadsheet\Spreadsheet;
  6. class Csv extends BaseReader
  7. {
  8. /**
  9. * Input encoding.
  10. *
  11. * @var string
  12. */
  13. private $inputEncoding = 'UTF-8';
  14. /**
  15. * Delimiter.
  16. *
  17. * @var string
  18. */
  19. private $delimiter;
  20. /**
  21. * Enclosure.
  22. *
  23. * @var string
  24. */
  25. private $enclosure = '"';
  26. /**
  27. * Sheet index to read.
  28. *
  29. * @var int
  30. */
  31. private $sheetIndex = 0;
  32. /**
  33. * Load rows contiguously.
  34. *
  35. * @var bool
  36. */
  37. private $contiguous = false;
  38. /**
  39. * Row counter for loading rows contiguously.
  40. *
  41. * @var int
  42. */
  43. private $contiguousRow = -1;
  44. /**
  45. * The character that can escape the enclosure.
  46. *
  47. * @var string
  48. */
  49. private $escapeCharacter = '\\';
  50. /**
  51. * Create a new CSV Reader instance.
  52. */
  53. public function __construct()
  54. {
  55. $this->readFilter = new DefaultReadFilter();
  56. }
  57. /**
  58. * Set input encoding.
  59. *
  60. * @param string $pValue Input encoding, eg: 'UTF-8'
  61. *
  62. * @return Csv
  63. */
  64. public function setInputEncoding($pValue)
  65. {
  66. $this->inputEncoding = $pValue;
  67. return $this;
  68. }
  69. /**
  70. * Get input encoding.
  71. *
  72. * @return string
  73. */
  74. public function getInputEncoding()
  75. {
  76. return $this->inputEncoding;
  77. }
  78. /**
  79. * Move filepointer past any BOM marker.
  80. */
  81. protected function skipBOM()
  82. {
  83. rewind($this->fileHandle);
  84. switch ($this->inputEncoding) {
  85. case 'UTF-8':
  86. fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ?
  87. fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0);
  88. break;
  89. case 'UTF-16LE':
  90. fgets($this->fileHandle, 3) == "\xFF\xFE" ?
  91. fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
  92. break;
  93. case 'UTF-16BE':
  94. fgets($this->fileHandle, 3) == "\xFE\xFF" ?
  95. fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
  96. break;
  97. case 'UTF-32LE':
  98. fgets($this->fileHandle, 5) == "\xFF\xFE\x00\x00" ?
  99. fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
  100. break;
  101. case 'UTF-32BE':
  102. fgets($this->fileHandle, 5) == "\x00\x00\xFE\xFF" ?
  103. fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
  104. break;
  105. default:
  106. break;
  107. }
  108. }
  109. /**
  110. * Identify any separator that is explicitly set in the file.
  111. */
  112. protected function checkSeparator()
  113. {
  114. $line = fgets($this->fileHandle);
  115. if ($line === false) {
  116. return;
  117. }
  118. if ((strlen(trim($line, "\r\n")) == 5) && (stripos($line, 'sep=') === 0)) {
  119. $this->delimiter = substr($line, 4, 1);
  120. return;
  121. }
  122. return $this->skipBOM();
  123. }
  124. /**
  125. * Infer the separator if it isn't explicitly set in the file or specified by the user.
  126. */
  127. protected function inferSeparator()
  128. {
  129. if ($this->delimiter !== null) {
  130. return;
  131. }
  132. $potentialDelimiters = [',', ';', "\t", '|', ':', ' ', '~'];
  133. $counts = [];
  134. foreach ($potentialDelimiters as $delimiter) {
  135. $counts[$delimiter] = [];
  136. }
  137. // Count how many times each of the potential delimiters appears in each line
  138. $numberLines = 0;
  139. while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) {
  140. $countLine = [];
  141. for ($i = strlen($line) - 1; $i >= 0; --$i) {
  142. $char = $line[$i];
  143. if (isset($counts[$char])) {
  144. if (!isset($countLine[$char])) {
  145. $countLine[$char] = 0;
  146. }
  147. ++$countLine[$char];
  148. }
  149. }
  150. foreach ($potentialDelimiters as $delimiter) {
  151. $counts[$delimiter][] = isset($countLine[$delimiter])
  152. ? $countLine[$delimiter]
  153. : 0;
  154. }
  155. }
  156. // If number of lines is 0, nothing to infer : fall back to the default
  157. if ($numberLines === 0) {
  158. $this->delimiter = reset($potentialDelimiters);
  159. return $this->skipBOM();
  160. }
  161. // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
  162. $meanSquareDeviations = [];
  163. $middleIdx = floor(($numberLines - 1) / 2);
  164. foreach ($potentialDelimiters as $delimiter) {
  165. $series = $counts[$delimiter];
  166. sort($series);
  167. $median = ($numberLines % 2)
  168. ? $series[$middleIdx]
  169. : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
  170. if ($median === 0) {
  171. continue;
  172. }
  173. $meanSquareDeviations[$delimiter] = array_reduce(
  174. $series,
  175. function ($sum, $value) use ($median) {
  176. return $sum + pow($value - $median, 2);
  177. }
  178. ) / count($series);
  179. }
  180. // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
  181. $min = INF;
  182. foreach ($potentialDelimiters as $delimiter) {
  183. if (!isset($meanSquareDeviations[$delimiter])) {
  184. continue;
  185. }
  186. if ($meanSquareDeviations[$delimiter] < $min) {
  187. $min = $meanSquareDeviations[$delimiter];
  188. $this->delimiter = $delimiter;
  189. }
  190. }
  191. // If no delimiter could be detected, fall back to the default
  192. if ($this->delimiter === null) {
  193. $this->delimiter = reset($potentialDelimiters);
  194. }
  195. return $this->skipBOM();
  196. }
  197. /**
  198. * Get the next full line from the file.
  199. *
  200. * @param string $line
  201. *
  202. * @return bool|string
  203. */
  204. private function getNextLine($line = '')
  205. {
  206. // Get the next line in the file
  207. $newLine = fgets($this->fileHandle);
  208. // Return false if there is no next line
  209. if ($newLine === false) {
  210. return false;
  211. }
  212. // Add the new line to the line passed in
  213. $line = $line . $newLine;
  214. // Drop everything that is enclosed to avoid counting false positives in enclosures
  215. $enclosure = '(?<!' . preg_quote($this->escapeCharacter, '/') . ')'
  216. . preg_quote($this->enclosure, '/');
  217. $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/Us', '', $line);
  218. // See if we have any enclosures left in the line
  219. // if we still have an enclosure then we need to read the next line as well
  220. if (preg_match('/(' . $enclosure . ')/', $line) > 0) {
  221. $line = $this->getNextLine($line);
  222. }
  223. return $line;
  224. }
  225. /**
  226. * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
  227. *
  228. * @param string $pFilename
  229. *
  230. * @throws Exception
  231. *
  232. * @return array
  233. */
  234. public function listWorksheetInfo($pFilename)
  235. {
  236. // Open file
  237. if (!$this->canRead($pFilename)) {
  238. throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
  239. }
  240. $this->openFile($pFilename);
  241. $fileHandle = $this->fileHandle;
  242. // Skip BOM, if any
  243. $this->skipBOM();
  244. $this->checkSeparator();
  245. $this->inferSeparator();
  246. $worksheetInfo = [];
  247. $worksheetInfo[0]['worksheetName'] = 'Worksheet';
  248. $worksheetInfo[0]['lastColumnLetter'] = 'A';
  249. $worksheetInfo[0]['lastColumnIndex'] = 0;
  250. $worksheetInfo[0]['totalRows'] = 0;
  251. $worksheetInfo[0]['totalColumns'] = 0;
  252. // Loop through each line of the file in turn
  253. while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) {
  254. ++$worksheetInfo[0]['totalRows'];
  255. $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1);
  256. }
  257. $worksheetInfo[0]['lastColumnLetter'] = Coordinate::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex'] + 1);
  258. $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;
  259. // Close file
  260. fclose($fileHandle);
  261. return $worksheetInfo;
  262. }
  263. /**
  264. * Loads Spreadsheet from file.
  265. *
  266. * @param string $pFilename
  267. *
  268. * @throws Exception
  269. *
  270. * @return Spreadsheet
  271. */
  272. public function load($pFilename)
  273. {
  274. // Create new Spreadsheet
  275. $spreadsheet = new Spreadsheet();
  276. // Load into this instance
  277. return $this->loadIntoExisting($pFilename, $spreadsheet);
  278. }
  279. /**
  280. * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
  281. *
  282. * @param string $pFilename
  283. * @param Spreadsheet $spreadsheet
  284. *
  285. * @throws Exception
  286. *
  287. * @return Spreadsheet
  288. */
  289. public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
  290. {
  291. $lineEnding = ini_get('auto_detect_line_endings');
  292. ini_set('auto_detect_line_endings', true);
  293. // Open file
  294. if (!$this->canRead($pFilename)) {
  295. throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
  296. }
  297. $this->openFile($pFilename);
  298. $fileHandle = $this->fileHandle;
  299. // Skip BOM, if any
  300. $this->skipBOM();
  301. $this->checkSeparator();
  302. $this->inferSeparator();
  303. // Create new PhpSpreadsheet object
  304. while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
  305. $spreadsheet->createSheet();
  306. }
  307. $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex);
  308. // Set our starting row based on whether we're in contiguous mode or not
  309. $currentRow = 1;
  310. if ($this->contiguous) {
  311. $currentRow = ($this->contiguousRow == -1) ? $sheet->getHighestRow() : $this->contiguousRow;
  312. }
  313. // Loop through each line of the file in turn
  314. while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) {
  315. $columnLetter = 'A';
  316. foreach ($rowData as $rowDatum) {
  317. if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) {
  318. // Convert encoding if necessary
  319. if ($this->inputEncoding !== 'UTF-8') {
  320. $rowDatum = StringHelper::convertEncoding($rowDatum, 'UTF-8', $this->inputEncoding);
  321. }
  322. // Set cell value
  323. $sheet->getCell($columnLetter . $currentRow)->setValue($rowDatum);
  324. }
  325. ++$columnLetter;
  326. }
  327. ++$currentRow;
  328. }
  329. // Close file
  330. fclose($fileHandle);
  331. if ($this->contiguous) {
  332. $this->contiguousRow = $currentRow;
  333. }
  334. ini_set('auto_detect_line_endings', $lineEnding);
  335. // Return
  336. return $spreadsheet;
  337. }
  338. /**
  339. * Get delimiter.
  340. *
  341. * @return string
  342. */
  343. public function getDelimiter()
  344. {
  345. return $this->delimiter;
  346. }
  347. /**
  348. * Set delimiter.
  349. *
  350. * @param string $delimiter Delimiter, eg: ','
  351. *
  352. * @return CSV
  353. */
  354. public function setDelimiter($delimiter)
  355. {
  356. $this->delimiter = $delimiter;
  357. return $this;
  358. }
  359. /**
  360. * Get enclosure.
  361. *
  362. * @return string
  363. */
  364. public function getEnclosure()
  365. {
  366. return $this->enclosure;
  367. }
  368. /**
  369. * Set enclosure.
  370. *
  371. * @param string $enclosure Enclosure, defaults to "
  372. *
  373. * @return CSV
  374. */
  375. public function setEnclosure($enclosure)
  376. {
  377. if ($enclosure == '') {
  378. $enclosure = '"';
  379. }
  380. $this->enclosure = $enclosure;
  381. return $this;
  382. }
  383. /**
  384. * Get sheet index.
  385. *
  386. * @return int
  387. */
  388. public function getSheetIndex()
  389. {
  390. return $this->sheetIndex;
  391. }
  392. /**
  393. * Set sheet index.
  394. *
  395. * @param int $pValue Sheet index
  396. *
  397. * @return CSV
  398. */
  399. public function setSheetIndex($pValue)
  400. {
  401. $this->sheetIndex = $pValue;
  402. return $this;
  403. }
  404. /**
  405. * Set Contiguous.
  406. *
  407. * @param bool $contiguous
  408. *
  409. * @return Csv
  410. */
  411. public function setContiguous($contiguous)
  412. {
  413. $this->contiguous = (bool) $contiguous;
  414. if (!$contiguous) {
  415. $this->contiguousRow = -1;
  416. }
  417. return $this;
  418. }
  419. /**
  420. * Get Contiguous.
  421. *
  422. * @return bool
  423. */
  424. public function getContiguous()
  425. {
  426. return $this->contiguous;
  427. }
  428. /**
  429. * Set escape backslashes.
  430. *
  431. * @param string $escapeCharacter
  432. *
  433. * @return $this
  434. */
  435. public function setEscapeCharacter($escapeCharacter)
  436. {
  437. $this->escapeCharacter = $escapeCharacter;
  438. return $this;
  439. }
  440. /**
  441. * Get escape backslashes.
  442. *
  443. * @return string
  444. */
  445. public function getEscapeCharacter()
  446. {
  447. return $this->escapeCharacter;
  448. }
  449. /**
  450. * Can the current IReader read the file?
  451. *
  452. * @param string $pFilename
  453. *
  454. * @return bool
  455. */
  456. public function canRead($pFilename)
  457. {
  458. // Check if file exists
  459. try {
  460. $this->openFile($pFilename);
  461. } catch (Exception $e) {
  462. return false;
  463. }
  464. fclose($this->fileHandle);
  465. // Trust file extension if any
  466. if (strtolower(pathinfo($pFilename, PATHINFO_EXTENSION)) === 'csv') {
  467. return true;
  468. }
  469. // Attempt to guess mimetype
  470. $type = mime_content_type($pFilename);
  471. $supportedTypes = [
  472. 'text/csv',
  473. 'text/plain',
  474. 'inode/x-empty',
  475. ];
  476. return in_array($type, $supportedTypes, true);
  477. }
  478. }