1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sdext.hxx"
26
27 #include <stdio.h>
28 #include <sal/main.h>
29 #include <osl/file.h>
30 #include <osl/thread.h>
31 #include <rtl/alloc.h>
32 #include <rtl/ustring.hxx>
33 #include <rtl/strbuf.hxx>
34
35 #include "pdfparse.hxx"
36
37 using namespace rtl;
38 using namespace pdfparse;
39
/** Print the command line usage summary to stdout.

    @param pExe
    name of the executable, inserted into each of the five usage lines
    (main passes argv[0])
*/
void printHelp( const char* pExe )
{
    // Text fixes compared to the previous help: the "-f" description now
    // closes its parenthesis and the "-o" syntax spells the second
    // generation number as <g1>, like <g0>.
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    " %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    " -h, --help: show help\n"
    " -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    " and prints the mimetype found to stdout\n"
    " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    " object numbers, where object number and generation number are separated by \':\'\n"
    " an omitted generation number defaults to 0\n"
    " -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
60
61 class FileEmitContext : public EmitContext
62 {
63 oslFileHandle m_aHandle;
64 oslFileHandle m_aReadHandle;
65 unsigned int m_nReadLen;
66
67 void openReadFile( const char* pOrigName );
68
69 public:
70 FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
71 virtual ~FileEmitContext();
72
73 virtual bool write( const void* pBuf, unsigned int nLen ) throw();
74 virtual unsigned int getCurPos() throw();
75 virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw();
76 virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw();
77 };
78
FileEmitContext(const char * pFileName,const char * pOrigName,const PDFContainer * pTop)79 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
80 : EmitContext( pTop ),
81 m_aHandle( NULL ),
82 m_aReadHandle( NULL ),
83 m_nReadLen( 0 )
84 {
85 OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) );
86 OUString aURL;
87 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
88 {
89 fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
90 return;
91 }
92
93 if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
94 {
95 if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
96 {
97 fprintf( stderr, "could not truncate %s\n", pFileName );
98 osl_closeFile( m_aHandle );
99 m_aHandle = NULL;
100 }
101 }
102 else if( osl_openFile( aURL.pData, &m_aHandle,
103 osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
104 {
105 fprintf( stderr, "could not open %s\n", pFileName );
106 return;
107 }
108 m_bDeflate = true;
109
110 openReadFile( pOrigName );
111 }
112
~FileEmitContext()113 FileEmitContext::~FileEmitContext()
114 {
115 if( m_aHandle )
116 osl_closeFile( m_aHandle );
117 if( m_aReadHandle )
118 osl_closeFile( m_aReadHandle );
119 }
120
openReadFile(const char * pInFile)121 void FileEmitContext::openReadFile( const char* pInFile )
122 {
123 OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) );
124 OUString aURL;
125 if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
126 {
127 fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
128 return;
129 }
130
131 if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
132 {
133 fprintf( stderr, "could not open %s\n", pInFile );
134 return;
135 }
136
137 if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
138 {
139 fprintf( stderr, "could not seek to end of %s\n", pInFile );
140 osl_closeFile( m_aReadHandle );
141 return;
142 }
143
144 sal_uInt64 nFileSize = 0;
145 if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
146 {
147 fprintf( stderr, "could not get end pos of %s\n", pInFile );
148 osl_closeFile( m_aReadHandle );
149 return;
150 }
151
152 m_nReadLen = static_cast<unsigned int>(nFileSize);
153 }
154
write(const void * pBuf,unsigned int nLen)155 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw()
156 {
157 if( ! m_aHandle )
158 return false;
159
160 sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
161 sal_uInt64 nWritten = 0;
162 return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
163 && nWrite == nWritten;
164 }
165
getCurPos()166 unsigned int FileEmitContext::getCurPos() throw()
167 {
168 sal_uInt64 nFileSize = 0;
169 if( m_aHandle )
170 {
171 if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
172 nFileSize = 0;
173 }
174 return static_cast<unsigned int>(nFileSize);
175 }
176
copyOrigBytes(unsigned int nOrigOffset,unsigned int nLen)177 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
178 {
179 if( nOrigOffset + nLen > m_nReadLen )
180 return false;
181
182 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
183 {
184 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
185 return false;
186 }
187 void* pBuf = rtl_allocateMemory( nLen );
188 if( ! pBuf )
189 return false;
190 sal_uInt64 nBytesRead = 0;
191 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
192 || nBytesRead != static_cast<sal_uInt64>(nLen) )
193 {
194 fprintf( stderr, "could not read %u bytes\n", nLen );
195 rtl_freeMemory( pBuf );
196 return false;
197 }
198 bool bRet = write( pBuf, nLen );
199 rtl_freeMemory( pBuf );
200 return bRet;
201 }
202
readOrigBytes(unsigned int nOrigOffset,unsigned int nLen,void * pBuf)203 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
204 {
205 if( nOrigOffset + nLen > m_nReadLen )
206 return 0;
207
208 if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
209 {
210 fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
211 return 0;
212 }
213 sal_uInt64 nBytesRead = 0;
214 if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
215 return 0;
216 return static_cast<unsigned int>(nBytesRead);
217 }
218
219 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
220
handleFile(const char * pInFile,const char * pOutFile,const char * pPassword,PDFFileHdl pHdl)221 int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
222 {
223
224 PDFReader aParser;
225 int nRet = 0;
226 PDFEntry* pEntry = aParser.read( pInFile );
227 if( pEntry )
228 {
229 PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry);
230 if( pPDFFile )
231 {
232 fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
233 if( pPassword )
234 fprintf( stdout, "password %s\n",
235 pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
236 nRet = pHdl( pInFile, pOutFile, pPDFFile );
237 }
238 else
239 nRet = 20;
240 delete pEntry;
241 }
242 return nRet;
243 }
244
write_unzipFile(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)245 int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
246 {
247 FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
248 aContext.m_bDecrypt = pPDFFile->isEncrypted();
249 pPDFFile->emit(aContext);
250 return 0;
251 }
252
write_addStreamArray(const char * pOutFile,PDFArray * pStreams,PDFFile * pPDFFile,const char * pInFile)253 int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
254 {
255 int nRet = 0;
256 unsigned int nArrayElements = pStreams->m_aSubElements.size();
257 for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
258 {
259 PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]);
260 PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]);
261 if( ! pMimeType )
262 fprintf( stderr, "error: no mimetype element\n" );
263 if( ! pStreamRef )
264 fprintf( stderr, "error: no stream ref element\n" );
265 if( pMimeType && pStreamRef )
266 {
267 fprintf( stdout, "found stream %d %d with mimetype %s\n",
268 pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
269 pMimeType->m_aName.getStr() );
270 PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
271 if( pObject )
272 {
273 rtl::OStringBuffer aOutStream( pOutFile );
274 aOutStream.append( "_stream_" );
275 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
276 aOutStream.append( "_" );
277 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
278 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
279 aContext.m_bDecrypt = pPDFFile->isEncrypted();
280 pObject->writeStream( aContext, pPDFFile );
281 }
282 else
283 {
284 fprintf( stderr, "object not found\n" );
285 nRet = 121;
286 }
287 }
288 else
289 nRet = 120;
290 }
291 return nRet;
292 }
293
write_addStreams(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)294 int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
295 {
296 // find all trailers
297 int nRet = 0;
298 unsigned int nElements = pPDFFile->m_aSubElements.size();
299 for( unsigned i = 0; i < nElements && nRet == 0; i++ )
300 {
301 PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]);
302 if( pTrailer && pTrailer->m_pDict )
303 {
304 // search for AdditionalStreams entry
305 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream;
306 add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
307 if( add_stream != pTrailer->m_pDict->m_aMap.end() )
308 {
309 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
310 if( pStreams )
311 nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
312 }
313 }
314 }
315 return nRet;
316 }
317
write_fonts(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)318 int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
319 {
320 int nRet = 0;
321 unsigned int nElements = i_pPDFFile->m_aSubElements.size();
322 for( unsigned i = 0; i < nElements && nRet == 0; i++ )
323 {
324 // search FontDescriptors
325 PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]);
326 if( ! pObj )
327 continue;
328 PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
329 if( ! pDict )
330 continue;
331
332 std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it =
333 pDict->m_aMap.find( "Type" );
334 if( map_it == pDict->m_aMap.end() )
335 continue;
336
337 PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
338 if( ! pName )
339 continue;
340 if( ! pName->m_aName.equals( "FontDescriptor" ) )
341 continue;
342
343 // the font name will be helpful, also there must be one in
344 // a font descriptor
345 map_it = pDict->m_aMap.find( "FontName" );
346 if( map_it == pDict->m_aMap.end() )
347 continue;
348 pName = dynamic_cast<PDFName*>(map_it->second);
349 if( ! pName )
350 continue;
351 rtl::OString aFontName( pName->m_aName );
352
353 PDFObjectRef* pStreamRef = 0;
354 const char* pFileType = NULL;
355 // we have a font descriptor, try for a type 1 font
356 map_it = pDict->m_aMap.find( "FontFile" );
357 if( map_it != pDict->m_aMap.end() )
358 {
359 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
360 if( pStreamRef )
361 pFileType = "pfa";
362 }
363
364 // perhaps it's a truetype file ?
365 if( ! pStreamRef )
366 {
367 map_it = pDict->m_aMap.find( "FontFile2" );
368 if( map_it != pDict->m_aMap.end() )
369 {
370 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
371 if( pStreamRef )
372 pFileType = "ttf";
373 }
374 }
375
376 if( ! pStreamRef )
377 continue;
378
379 PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
380 if( ! pStream )
381 continue;
382
383 rtl::OStringBuffer aOutStream( i_pOutFile );
384 aOutStream.append( "_font_" );
385 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
386 aOutStream.append( "_" );
387 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
388 aOutStream.append( "_" );
389 aOutStream.append( aFontName );
390 if( pFileType )
391 {
392 aOutStream.append( "." );
393 aOutStream.append( pFileType );
394 }
395 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
396 aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
397 pStream->writeStream( aContext, i_pPDFFile );
398 }
399 return nRet;
400 }
401
402 std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
403
write_objects(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)404 int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
405 {
406 int nRet = 0;
407 unsigned int nElements = s_aEmitObjects.size();
408 for( unsigned i = 0; i < nElements && nRet == 0; i++ )
409 {
410 sal_Int32 nObject = s_aEmitObjects[i].first;
411 sal_Int32 nGeneration = s_aEmitObjects[i].second;
412 PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
413 if( ! pStream )
414 {
415 fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration );
416 continue;
417 }
418
419 rtl::OStringBuffer aOutStream( i_pOutFile );
420 aOutStream.append( "_stream_" );
421 aOutStream.append( nObject );
422 aOutStream.append( "_" );
423 aOutStream.append( nGeneration );
424 FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
425 aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
426 pStream->writeStream( aContext, i_pPDFFile );
427 }
428 return nRet;
429 }
430
/** Program entry: parse the command line, then run the selected handler.

    The first argument that is not an option is the input file, the second
    one the output file; further ones are ignored. Without an output file the
    name is derived from the input file: a trailing ".pdf" (any case) is
    stripped and "_unzip.pdf" is appended.

    Exit codes produced here: 0 after printing help, 1 for an unrecognized
    option or an option lacking its argument, 10 if no input file was given,
    11 if the input file name is empty and no output file was given.
    Otherwise the result of handleFile is returned.
*/
SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
{
    const char* pInFile = NULL;
    const char* pOutFile = NULL;
    const char* pPassword = NULL;
    OStringBuffer aOutFile( 256 );
    PDFFileHdl aHdl = write_unzipFile;

    for( int nArg = 1; nArg < argc; nArg++ )
    {
        if( argv[nArg][0] == '-' )
        {
            if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
                ! rtl_str_compare( "--password" , argv[nArg] ) )
            {
                if( nArg == argc-1 )
                {
                    fprintf( stderr, "no password given\n" );
                    return 1;
                }
                nArg++;
                pPassword = argv[nArg];
            }
            else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
                     ! rtl_str_compare( "--help", argv[nArg] ) )
            {
                printHelp( argv[0] );
                return 0;
            }
            else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
                     ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
            {
                aHdl = write_addStreams;
            }
            else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
                     ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
            {
                aHdl = write_fonts;
            }
            else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
                     ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
            {
                // handled like a missing password; previously the option
                // was accepted and nothing was extracted
                if( nArg == argc-1 )
                {
                    fprintf( stderr, "no object list given\n" );
                    return 1;
                }
                aHdl = write_objects;
                nArg++;

                // argument syntax: <obj>[:<gen>][,<obj>[:<gen>]...]
                rtl::OString aObjs( argv[nArg] );
                sal_Int32 nIndex = 0;
                while( nIndex != -1 )
                {
                    rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) );
                    sal_Int32 nObject = 0;
                    sal_Int32 nGeneration = 0;
                    sal_Int32 nGenIndex = 0;
                    nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
                    // nGenIndex is -1 when the token had no ':' part
                    if( nGenIndex != -1 )
                        nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
                    s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
                }
            }
            else
            {
                fprintf( stderr, "unrecognized option \"%s\"\n",
                         argv[nArg] );
                printHelp( argv[0] );
                return 1;
            }
        }
        else if( pInFile == NULL )
            pInFile = argv[nArg];
        else if( pOutFile == NULL )
            pOutFile = argv[nArg];
    }
    if( ! pInFile )
    {
        fprintf( stderr, "no input file given\n" );
        return 10;
    }
    if( ! pOutFile )
    {
        OString aFile( pInFile );
        if( aFile.getLength() > 0 )
        {
            // Strip a trailing ".pdf"; any other name is used in full.
            // Names of four characters or less used to be dropped here,
            // leaving just "_unzip.pdf".
            if( aFile.getLength() > 4 &&
                aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) )
                aOutFile.append( pInFile, aFile.getLength() - 4 );
            else
                aOutFile.append( aFile );
            aOutFile.append( "_unzip.pdf" );
            pOutFile = aOutFile.getStr();
        }
        else
        {
            fprintf( stderr, "no output file given\n" );
            return 11;
        }
    }

    return handleFile( pInFile, pOutFile, pPassword, aHdl );
}
534
535