source-class-Com.Tecnick.Pdf.Parser.Parser

It appears that you are using ad-blocking software. The cost of running this website is covered by advertisements. If you like it, please feel free to donate a small amount of money to secure the future of this website.
  1: <?php
  2: /**
  3:  * Parser.php
  4:  *
  5:  * @since       2011-05-23
  6:  * @category    Library
  7:  * @package     PdfParser
  8:  * @author      Nicola Asuni <info@tecnick.com>
  9:  * @copyright   2011-2015 Nicola Asuni - Tecnick.com LTD
 10:  * @license     http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
 11:  * @link        https://github.com/tecnickcom/tc-lib-pdf-parser
 12:  *
 13:  * This file is part of tc-lib-pdf-parser software library.
 14:  */
 15: 
 16: namespace Com\Tecnick\Pdf\Parser;
 17: 
 18: use \Com\Tecnick\Pdf\Parser\Exception as PPException;
 19: 
 20: /**
 21:  * Com\Tecnick\Pdf\Parser\Parser
 22:  *
 23:  * PHP class for parsing PDF documents.
 24:  *
 25:  * @since       2011-05-23
 26:  * @category    Library
 27:  * @package     PdfParser
 28:  * @author      Nicola Asuni <info@tecnick.com>
 29:  * @copyright   2011-2015 Nicola Asuni - Tecnick.com LTD
 30:  * @license     http://www.gnu.org/copyleft/lesser.html GNU-LGPL v3 (see LICENSE.TXT)
 31:  * @link        https://github.com/tecnickcom/tc-lib-pdf-parser
 32:  */
 33: class Parser extends \Com\Tecnick\Pdf\Parser\Process\Xref
 34: {
 35:     /**
 36:      * Raw content of the PDF document.
 37:      *
 38:      * @var string
 39:      */
 40:     protected $pdfdata = '';
 41: 
 42:     /**
 43:      * Array of PDF objects.
 44:      *
 45:      * @var array
 46:      */
 47:     protected $objects = array();
 48: 
 49:     /**
 50:      * Array of configuration parameters.
 51:      *
 52:      * @var array
 53:      */
 54:     private $cfg = array(
 55:         'ignore_filter_errors'  => false,
 56:     );
 57: 
 58:     /**
 59:      * Initialize the PDF parser
 60:      *
 61:      * @param array $cfg   Array of configuration parameters:
 62:      *          'ignore_filter_decoding_errors'  : if true ignore filter decoding errors;
 63:      *          'ignore_missing_filter_decoders' : if true ignore missing filter decoding errors.
 64:      */
 65:     public function __construct($cfg = array())
 66:     {
 67:         if (isset($cfg['ignore_filter_errors'])) {
 68:             $this->cfg['ignore_filter_errors'] = (bool)$cfg['ignore_filter_errors'];
 69:         }
 70:     }
 71: 
 72:     /**
 73:      * Parse a PDF document into an array of objects
 74:      *
 75:      * @param string $data PDF data to parse.
 76:      */
 77:     public function parse($data)
 78:     {
 79:         if (empty($data)) {
 80:             throw new PPException('Empty PDF data.');
 81:         }
 82:         // find the pdf header starting position
 83:         if (($trimpos = strpos($data, '%PDF-')) === false) {
 84:             throw new PPException('Invalid PDF data: missing %PDF header.');
 85:         }
 86:         // get PDF content string
 87:         $this->pdfdata = substr($data, $trimpos);
 88:         // get xref and trailer data
 89:         $this->xref = $this->getXrefData();
 90:         // parse all document objects
 91:         $this->objects = array();
 92:         foreach ($this->xref['xref'] as $obj => $offset) {
 93:             if (!isset($this->objects[$obj]) && ($offset > 0)) {
 94:                 // decode objects with positive offset
 95:                 $this->objects[$obj] = $this->getIndirectObject($obj, $offset, true);
 96:             }
 97:         }
 98:         // release some memory
 99:         unset($this->pdfdata);
100:         return array($this->xref, $this->objects);
101:     }
102: 
103:     /**
104:      * Get content of indirect object.
105:      *
106:      * @param string $obj_ref  Object number and generation number separated by underscore character.
107:      * @param int    $offset   Object offset.
108:      * @param bool   $decoding If true decode streams.
109:      *
110:      * @return array Object data.
111:      */
112:     protected function getIndirectObject($obj_ref, $offset = 0, $decoding = true)
113:     {
114:         $obj = explode('_', $obj_ref);
115:         if (($obj === false) || (count($obj) != 2)) {
116:             throw new PPException('Invalid object reference: '.$obj);
117:         }
118:         $objref = $obj[0].' '.$obj[1].' obj';
119:         // ignore leading zeros
120:         $offset += strspn($this->pdfdata, '0', $offset);
121:         if (strpos($this->pdfdata, $objref, $offset) != $offset) {
122:             // an indirect reference to an undefined object shall be considered a reference to the null object
123:             return array('null', 'null', $offset);
124:         }
125:         // starting position of object content
126:         $offset += strlen($objref);
127:         // return raw object content
128:         return $this->getRawIndirectObject($offset, $decoding);
129:     }
130: 
131:     /**
132:      * Get content of indirect object.
133:      *
134:      * @param string $obj_ref  Object number and generation number separated by underscore character.
135:      * @param int    $offset   Object offset.
136:      * @param bool   $decoding If true decode streams.
137:      *
138:      * @return array Object data.
139:      */
140:     protected function getRawIndirectObject($offset, $decoding)
141:     {
142:         // get array of object content
143:         $objdata = array();
144:         $idx = 0; // object main index
145:         do {
146:             $oldoffset = $offset;
147:             // get element
148:             $element = $this->getRawObject($offset);
149:             $offset = $element[2];
150:             // decode stream using stream's dictionary information
151:             if ($decoding
152:                 && ($element[0] == 'stream')
153:                 && (isset($objdata[($idx - 1)][0]))
154:                 && ($objdata[($idx - 1)][0] == '<<')
155:             ) {
156:                 $element[3] = $this->decodeStream($objdata[($idx - 1)][1], $element[1]);
157:             }
158:             $objdata[$idx] = $element;
159:             ++$idx;
160:         } while (($element[0] != 'endobj') && ($offset != $oldoffset));
161:         // remove closing delimiter
162:         array_pop($objdata);
163:         // return raw object content
164:         return $objdata;
165:     }
166: 
167:     /**
168:      * Get the content of object, resolving indect object reference if necessary.
169:      *
170:      * @param string $obj Object value.
171:      *
172:      * @return array Object data.
173:      */
174:     protected function getObjectVal($obj)
175:     {
176:         if ($obj[0] == 'objref') {
177:             // reference to indirect object
178:             if (isset($this->objects[$obj[1]])) {
179:                 // this object has been already parsed
180:                 return $this->objects[$obj[1]];
181:             } elseif (isset($this->xref[$obj[1]])) {
182:                 // parse new object
183:                 $this->objects[$obj[1]] = $this->getIndirectObject($obj[1], $this->xref[$obj[1]], false);
184:                 return $this->objects[$obj[1]];
185:             }
186:         }
187:         return $obj;
188:     }
189: 
190:     /**
191:      * Decode the specified stream.
192:      *
193:      * @param array  $sdic   Stream's dictionary array.
194:      * @param string $stream Stream to decode.
195:      *
196:      * @return array Decoded stream data and remaining filters.
197:      */
198:     protected function decodeStream($sdic, $stream)
199:     {
200:         // get stream length and filters
201:         $slength = strlen($stream);
202:         if ($slength <= 0) {
203:             return array('', array());
204:         }
205:         $filters = array();
206:         foreach ($sdic as $key => $val) {
207:             if ($val[0] == '/') {
208:                 if (($val[1] == 'Length') && (isset($sdic[($key + 1)])) && ($sdic[($key + 1)][0] == 'numeric')) {
209:                     // get declared stream length
210:                     $this->getDeclaredStreamLength($stream, $slength, $sdic, $key);
211:                 } elseif (($val[1] == 'Filter') && (isset($sdic[($key + 1)]))) {
212:                     $filters = $this->getFilters($filters, $sdic, $key);
213:                 }
214:             }
215:         }
216:         return $this->getDecodedStream($filters, $stream);
217:     }
218: 
219:     /**
220:      * Get Filters
221:      *
222:      * @param string $stream  Stream
223:      * @param int    $slength Stream length
224:      * @param array  $sdic    Stream's dictionary array.
225:      * @param int    $key     Index
226:      *
227:      * @return array Array of filters
228:      */
229:     protected function getDeclaredStreamLength(&$stream, &$slength, $sdic, $key)
230:     {
231:         // get declared stream length
232:         $declength = intval($sdic[($key + 1)][1]);
233:         if ($declength < $slength) {
234:             $stream = substr($stream, 0, $declength);
235:             $slength = $declength;
236:         }
237:     }
238: 
239:     /**
240:      * Get Filters
241:      *
242:      * @param array $filters Array of Filters
243:      * @param array $sdic    Stream's dictionary array.
244:      * @param int   $key     Index
245:      *
246:      * @return array Array of filters
247:      */
248:     protected function getFilters($filters, $sdic, $key)
249:     {
250:         // resolve indirect object
251:         $objval = $this->getObjectVal($sdic[($key + 1)]);
252:         if ($objval[0] == '/') {
253:             // single filter
254:             $filters[] = $objval[1];
255:         } elseif ($objval[0] == '[') {
256:             // array of filters
257:             foreach ($objval[1] as $flt) {
258:                 if ($flt[0] == '/') {
259:                     $filters[] = $flt[1];
260:                 }
261:             }
262:         }
263:         return $filters;
264:     }
265: 
266:     /**
267:      * Decode the specified stream.
268:      *
269:      * @param array  $filters Array of decoding filters to apply
270:      * @param string $stream  Stream to decode.
271:      *
272:      * @return array Decoded stream data and remaining filters.
273:      */
274:     protected function getDecodedStream($filters, $stream)
275:     {
276:         // decode the stream
277:         $errorfilters = array();
278:         try {
279:             $filter = new \Com\Tecnick\Pdf\Filter\Filter;
280:             $stream = $filter->decodeAll($filters, $stream);
281:         } catch (\Com\Tecnick\Pdf\Filter\Exception $e) {
282:             if ($this->cfg['ignore_filter_errors']) {
283:                 $errorfilters = $filters;
284:             } else {
285:                 throw new PPException($e->getMessage());
286:             }
287:         }
288:         return array($stream, $errorfilters);
289:     }
290: }
291: 
 

© 2004-2017 – Nicola Asuni - Tecnick.com - All rights reserved.
about - disclaimer - privacy