File: /home/imensosw/www/imenso.co/payment/count/class.doccounter.php
<?php
/*
* A collection of simple tools for analysing
* .PDF, .DOCX, .DOC and .TXT docs.
*
* Copyright (C) 2016-2017
* Joseph Blurton (http://github.com/joeblurton)
* And other contributors (see attrib below)
*
* Version 1.0.2
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* ATTRIBUTIONS
*
* PageCount_PDF and
* PageCount_DOCX by Whiteflash
* http://stackoverflow.com/questions/5540886/extract-text-from-doc-and-docx/
*
* Paragraph tweak by JoshB
* http://stackoverflow.com/questions/5607594/find-linebreaks-in-a-docx-file-using-php
*
* read_word_doc by
* Davinder Singh
* http://stackoverflow.com/questions/7358637/reading-doc-file-in-php
*
* Jonny 5's simple word splitter
* http://php.net/manual/en/function.str-word-count.php#107363
*
* Line Count method by K2xL
* http://stackoverflow.com/questions/7955402/count-lines-in-a-posted-string
*
* RTFTOOLS by
* Christian Vigh
* https://github.com/christian-vigh-phpclasses/RtfTools
*
* PDF Parser by
* Smalot GPL 3
* https://github.com/smalot/pdfparser
*/
/**
 * Reports word, line and page counts for .doc, .docx, .pdf and .txt files.
 *
 * Typical use: call setFile() with a path, then getInfo() to obtain an
 * object carrying `format`, `wordCount`, `lineCount` and `pageCount`.
 *
 * Paths are resolved against the current working directory first (the
 * historical behaviour); when nothing exists there, the path is used as
 * given, which also makes absolute paths work.
 *
 * PDF text extraction relies on Smalot\PdfParser (loaded through
 * vendor/autoload.php) and page counting on FPDF/FPDI (loaded from lib/).
 */
class DocCounter {
    // Path of the document to analyse, exactly as passed to setFile().
    private $file;
    // Extension of $file as reported by pathinfo() (original letter case).
    private $filetype;

    /**
     * Select the document to analyse and remember its extension.
     *
     * @param string $filename Path of the document.
     */
    public function setFile($filename)
    {
        $this->file = $filename;
        $this->filetype = pathinfo($this->file, PATHINFO_EXTENSION);
    }

    /**
     * @return string Path previously given to setFile().
     */
    public function getFile()
    {
        return $this->file;
    }

    /**
     * Build the information object for the current file.
     *
     * The extension is matched case-insensitively, so "PDF" is handled like
     * "pdf"; `format` still carries the extension exactly as found. For an
     * unknown extension all three counters hold the string
     * "unsupported file format".
     *
     * @return stdClass Object with format, wordCount, lineCount, pageCount.
     */
    public function getInfo()
    {
        $obj = new stdClass();
        $obj->format = $this->filetype;
        $obj->wordCount = null;
        $obj->lineCount = null;
        $obj->pageCount = null;

        // Each document is converted to text once and the text is reused
        // for every counter that needs it.
        switch (strtolower((string) $this->filetype))
        {
            case "doc":
                // Count on the plain text, not on the nl2br() output of
                // read_doc_file(): the inserted "<br />" tags would be
                // counted as words and rendered into the page-count PDF.
                $doc = $this->extractDocText();
                $obj->wordCount = $this->str_word_count_utf8($doc);
                $obj->lineCount = $this->lineCount($doc);
                $obj->pageCount = $this->pageCount($doc);
                break;
            case "docx":
                $docx = $this->docx2text();
                $obj->wordCount = $this->str_word_count_utf8($docx);
                $obj->lineCount = $this->lineCount($docx);
                $obj->pageCount = $this->PageCount_DOCX();
                break;
            case "pdf":
                $pdfText = $this->pdf2text();
                $obj->wordCount = $this->str_word_count_utf8($pdfText);
                $obj->lineCount = $this->lineCount($pdfText);
                $obj->pageCount = $this->PageCount_PDF();
                break;
            case "txt":
                $textContents = file_get_contents($this->resolvePath($this->file));
                if ($textContents === false) {
                    // Unreadable file: treat as empty text.
                    $textContents = "";
                }
                $obj->wordCount = $this->str_word_count_utf8($textContents);
                $obj->lineCount = $this->lineCount($textContents);
                $obj->pageCount = $this->pageCount($textContents);
                break;
            default:
                $obj->wordCount = "unsupported file format";
                $obj->lineCount = "unsupported file format";
                $obj->pageCount = "unsupported file format";
        }
        return $obj;
    }

    /**
     * Convert a Word .doc file to text with HTML line breaks.
     *
     * @return string|null Extracted text passed through nl2br(), or null
     *                     when the file is missing or cannot be opened.
     */
    function read_doc_file() {
        $plain = $this->extractDocText();
        if ($plain === null) {
            return null;
        }
        return nl2br($plain);
    }

    /**
     * Count words in a string.
     *
     * A word is a run of Unicode letters, digits or apostrophes. Empty
     * pieces produced by leading or trailing separators are not counted,
     * so an empty string yields 0.
     *
     * @param string|null $str Text to analyse.
     * @return int Number of words.
     */
    function str_word_count_utf8($str) {
        $str = (string) $str;
        $words = preg_split('~[^\p{L}\p{N}\']+~u', $str, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // The /u modifier rejects invalid UTF-8; retry in byte mode so
            // text in a legacy 8-bit encoding still gets a count.
            $words = preg_split('~[^\p{L}\p{N}\']+~', $str, -1, PREG_SPLIT_NO_EMPTY);
        }
        return ($words === false) ? 0 : count($words);
    }

    /**
     * Convert a Word .docx file to text.
     *
     * @return string Text of word/document.xml, or "" on failure.
     */
    function docx2text()
    {
        return $this->readZippedXML($this->file, "word/document.xml");
    }

    /**
     * Read one XML entry from a zip archive and return it without markup.
     *
     * Each closing </w:p> tag is replaced by "\r\n" before the tags are
     * stripped.
     *
     * @param string $archiveFile Path of the zip (docx) archive.
     * @param string $dataFile    Name of the entry inside the archive.
     * @return string Tag-free text, or "" when the archive or entry cannot
     *                be read or the XML cannot be parsed.
     */
    function readZippedXML($archiveFile, $dataFile)
    {
        $zip = new ZipArchive;
        if (true !== $zip->open($this->resolvePath($archiveFile))) {
            return "";
        }

        $data = false;
        $index = $zip->locateName($dataFile);
        if ($index !== false) {
            $data = $zip->getFromIndex($index);
        }
        $zip->close();

        if ($data === false || $data === "") {
            return "";
        }

        // The archive is untrusted input: entity substitution (LIBXML_NOENT)
        // and XInclude are deliberately NOT enabled, and LIBXML_NONET forbids
        // network access, so a crafted document cannot pull in local files
        // or remote resources. Parse errors and warnings stay suppressed.
        $xml = new DOMDocument();
        if (!$xml->loadXML($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
            return "";
        }
        $xmldata = $xml->saveXML();
        // Newline replacement: one line break per paragraph end.
        $xmldata = str_replace("</w:p>", "\r\n", $xmldata);
        // Return data without XML formatting tags
        return strip_tags($xmldata);
    }

    /**
     * Convert a Word .doc file to text by scanning its raw bytes.
     *
     * The content is split on carriage returns; pieces that are empty or
     * contain a NUL byte are dropped, the rest are joined with a trailing
     * space each, and characters outside a small allowed set are removed.
     *
     * @return string Filtered text, or "" when the file cannot be read.
     */
    function read_doc()
    {
        $f = $this->resolvePath($this->file);
        if (!is_file($f) || !is_readable($f)) {
            return "";
        }
        $contents = file_get_contents($f);
        if ($contents === false) {
            return "";
        }

        $outtext = "";
        foreach (explode(chr(0x0D), $contents) as $thisline)
        {
            if (strlen($thisline) == 0 || strpos($thisline, chr(0x00)) !== false) {
                continue;
            }
            $outtext .= $thisline." ";
        }
        $outtext = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/","",$outtext);
        return $outtext;
    }

    /**
     * Convert a PDF file to text using Smalot\PdfParser.
     *
     * @return string|null Extracted text, or null when the file is missing.
     */
    function pdf2text()
    {
        $f = $this->resolvePath($this->file);
        if (!file_exists($f)) {
            return null;
        }
        // include_once: the autoloader only needs to be loaded one time.
        include_once('vendor/autoload.php');
        $parser = new \Smalot\PdfParser\Parser();
        $pdf = $parser->parseFile($f);
        return $pdf->getText();
    }

    /**
     * Page count of a .docx file, read from the <Pages> element of
     * docProps/app.xml.
     *
     * @return int Page count, or 0 when the archive, the metadata entry or
     *             its XML cannot be read.
     */
    function PageCount_DOCX()
    {
        $zip = new ZipArchive();
        if ($zip->open($this->resolvePath($this->file)) !== true) {
            return 0;
        }

        $data = false;
        $index = $zip->locateName('docProps/app.xml');
        if ($index !== false) {
            $data = $zip->getFromIndex($index);
        }
        // Close the archive whether or not the entry was found.
        $zip->close();

        if (!is_string($data) || $data === "") {
            return 0;
        }

        try {
            $xml = new SimpleXMLElement($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
        } catch (Exception $e) {
            // Malformed metadata: report "unknown" as 0 pages.
            return 0;
        }
        return intval((string) $xml->Pages);
    }

    /**
     * Page count of a PDF file, as returned by FPDI::setSourceFile().
     *
     * @return int Page count, or 0 when the file is missing.
     */
    function PageCount_PDF()
    {
        $f = $this->resolvePath($this->file);
        $pageCount = 0;
        if (file_exists($f)) {
            require_once('lib/fpdf/fpdf.php');
            require_once('lib/fpdi/fpdi.php');
            $pdf = new FPDI();
            $pageCount = $pdf->setSourceFile($f); // returns page count
        }
        return $pageCount;
    }

    /**
     * Estimate the page count of plain text.
     *
     * The text is laid out with FPDF (Times 12pt, 5-unit line height, default
     * page settings), written to a temporary PDF, and that PDF's pages are
     * counted with FPDI.
     *
     * @param string|null $text Text to lay out.
     * @return int Number of pages of the generated PDF.
     * @throws Exception Re-thrown from FPDF/FPDI after the temporary file
     *                   has been removed.
     */
    function pageCount($text)
    {
        require_once('lib/fpdf/fpdf.php');
        require_once('lib/fpdi/fpdi.php');

        $pdf = new FPDF();
        $pdf->AddPage();
        $pdf->SetFont('Times','',12);
        $pdf->MultiCell(0,5,(string) $text);

        // A unique temporary file avoids concurrent requests overwriting or
        // deleting each other's "tmp.pdf" in the working directory.
        $filename = tempnam(sys_get_temp_dir(), 'doccounter_');
        if ($filename === false) {
            // Could not create a temp file: fall back to the legacy name.
            $filename = "tmp.pdf";
        }

        try {
            $pdf->Output($filename,'F');
            $reader = new FPDI();
            $pageCount = $reader->setSourceFile($filename);
        } catch (Exception $e) {
            // Do not leave the temporary file behind on failure.
            if (file_exists($filename)) {
                unlink($filename);
            }
            throw $e;
        }

        if (file_exists($filename)) {
            unlink($filename);
        }
        return $pageCount;
    }

    /**
     * Count lines in a string.
     *
     * "\r\n", "\n" and "\r" each count as a single line break, so CRLF text
     * is not counted twice. The result is the number of breaks plus one.
     *
     * @param string|null $text Text to analyse.
     * @return int Number of lines.
     */
    function lineCount($text)
    {
        $lines_arr = preg_split('/\r\n|\n|\r/', (string) $text);
        return ($lines_arr === false) ? 0 : count($lines_arr);
    }

    /**
     * Resolve a document path.
     *
     * Keeps the historical behaviour of prefixing the current working
     * directory, and falls back to the path as given when nothing exists at
     * the prefixed location but the given path itself exists.
     */
    private function resolvePath($file)
    {
        $candidate = getcwd()."/".$file;
        if (!file_exists($candidate) && file_exists($file)) {
            return $file;
        }
        return $candidate;
    }

    /**
     * Extract the text stream of a Word .doc file without HTML markup.
     *
     * Reads a 0xA00-byte header, derives the text length from the four
     * bytes at offsets 0x21C-0x21F, then reads that many bytes and converts
     * them to UTF-8.
     *
     * @return string|null Text ("" when the header is too short or the
     *                     computed length is not positive), or null when
     *                     the file is missing or cannot be opened.
     */
    private function extractDocText()
    {
        $f = $this->resolvePath($this->file);
        if (!file_exists($f)) {
            return null;
        }
        $fh = fopen($f, 'r');
        if ($fh === false) {
            return null;
        }

        $text = "";
        $headers = fread($fh, 0xA00);
        // The length bytes live at 0x21C..0x21F; skip files too short to
        // contain them instead of indexing past the end of the buffer.
        if ($headers !== false && strlen($headers) >= 0x220) {
            // 1 = (ord(n)*1) ; Document has from 0 to 255 characters
            $n1 = ( ord($headers[0x21C]) - 1 );
            // 1 = ((ord(n)-8)*256) ; Document has from 256 to 63743 characters
            $n2 = ( ( ord($headers[0x21D]) - 8 ) * 256 );
            // 1 = ((ord(n)*256)*256) ; Document has from 63744 to 16775423 characters
            $n3 = ( ( ord($headers[0x21E]) * 256 ) * 256 );
            // 1 = (((ord(n)*256)*256)*256) ; Document has from 16775424 to 4294965504 characters
            $n4 = ( ( ( ord($headers[0x21F]) * 256 ) * 256 ) * 256 );
            // Total length of text in the document
            $textLength = ($n1 + $n2 + $n3 + $n4);

            // fread() rejects a non-positive length.
            if ($textLength > 0) {
                $raw = fread($fh, $textLength);
                if ($raw !== false && $raw !== "") {
                    $text = mb_convert_encoding($raw, 'UTF-8');
                }
            }
        }
        fclose($fh);
        return $text;
    }
}
?>