1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sdext.hxx"
26 
27 #include <stdio.h>
28 #include <sal/main.h>
29 #include <osl/file.h>
30 #include <osl/thread.h>
31 #include <rtl/alloc.h>
32 #include <rtl/ustring.hxx>
33 #include <rtl/strbuf.hxx>
34 
35 #include "pdfparse.hxx"
36 
37 using namespace rtl;
38 using namespace pdfparse;
39 
/** Print the command line usage summary to stdout.

    @param pExe
    name of the executable; it is substituted into each of the five
    usage lines
*/
void printHelp( const char* pExe )
{
    fprintf( stdout,
    "USAGE: %s [-h,--help]\n"
    "       %s [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-a, --extract-add-streams> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-f, --extract-fonts> [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "       %s <-o, --extract-objects> <o0>[:<g0>][,<o1>[:<g1>][,...]] [-pw, --password <password>] <inputfile> [<outputfile>]\n"
    "  -h, --help: show help\n"
    "  -a, --extract-add-streams: extracts additional streams to outputfile_object\n"
    "      and prints the mimetype found to stdout\n"
    "  -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported)\n"
    "  -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n"
    "      object numbers, where object number and generation number are separated by \':\'\n"
    "      an omitted generation number defaults to 0\n"
    "  -pw, --password: use password for decryption\n"
    "\n"
    "note: -f, -a, -o and normal unzip operation are mutually exclusive\n"
    , pExe, pExe, pExe, pExe, pExe );
}
60 
61 class FileEmitContext : public EmitContext
62 {
63     oslFileHandle m_aHandle;
64     oslFileHandle m_aReadHandle;
65     unsigned int  m_nReadLen;
66 
67     void openReadFile( const char* pOrigName );
68 
69     public:
70     FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop );
71     virtual ~FileEmitContext();
72 
73     virtual bool write( const void* pBuf, unsigned int nLen ) throw();
74     virtual unsigned int getCurPos() throw();
75     virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw();
76     virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw();
77 };
78 
FileEmitContext(const char * pFileName,const char * pOrigName,const PDFContainer * pTop)79 FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop )
80     : EmitContext( pTop ),
81       m_aHandle( NULL ),
82       m_aReadHandle( NULL ),
83       m_nReadLen( 0 )
84 {
85     OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) );
86     OUString aURL;
87     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
88     {
89         fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName );
90         return;
91     }
92 
93     if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None )
94     {
95         if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None )
96         {
97             fprintf( stderr, "could not truncate %s\n", pFileName );
98             osl_closeFile( m_aHandle );
99             m_aHandle = NULL;
100         }
101     }
102     else if( osl_openFile( aURL.pData, &m_aHandle,
103             osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None )
104     {
105         fprintf( stderr, "could not open %s\n", pFileName );
106         return;
107     }
108     m_bDeflate = true;
109 
110     openReadFile( pOrigName );
111 }
112 
~FileEmitContext()113 FileEmitContext::~FileEmitContext()
114 {
115     if( m_aHandle )
116         osl_closeFile( m_aHandle );
117     if( m_aReadHandle )
118         osl_closeFile( m_aReadHandle );
119 }
120 
openReadFile(const char * pInFile)121 void FileEmitContext::openReadFile( const char* pInFile )
122 {
123     OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) );
124     OUString aURL;
125     if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None )
126     {
127         fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile );
128         return;
129     }
130 
131     if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None )
132     {
133         fprintf( stderr, "could not open %s\n", pInFile );
134         return;
135     }
136 
137     if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None )
138     {
139         fprintf( stderr, "could not seek to end of %s\n", pInFile );
140         osl_closeFile( m_aReadHandle );
141         return;
142     }
143 
144     sal_uInt64 nFileSize = 0;
145     if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None )
146     {
147         fprintf( stderr, "could not get end pos of %s\n", pInFile );
148         osl_closeFile( m_aReadHandle );
149         return;
150     }
151 
152     m_nReadLen = static_cast<unsigned int>(nFileSize);
153 }
154 
write(const void * pBuf,unsigned int nLen)155 bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw()
156 {
157     if( ! m_aHandle )
158         return false;
159 
160     sal_uInt64 nWrite = static_cast<sal_uInt64>(nLen);
161     sal_uInt64 nWritten = 0;
162     return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None)
163            && nWrite == nWritten;
164 }
165 
getCurPos()166 unsigned int FileEmitContext::getCurPos() throw()
167 {
168     sal_uInt64 nFileSize = 0;
169     if( m_aHandle )
170     {
171         if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None )
172             nFileSize = 0;
173     }
174     return static_cast<unsigned int>(nFileSize);
175 }
176 
copyOrigBytes(unsigned int nOrigOffset,unsigned int nLen)177 bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw()
178 {
179     if( nOrigOffset + nLen > m_nReadLen )
180         return false;
181 
182     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
183     {
184         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
185         return false;
186     }
187     void* pBuf = rtl_allocateMemory( nLen );
188     if( ! pBuf )
189         return false;
190     sal_uInt64 nBytesRead = 0;
191     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None
192         || nBytesRead != static_cast<sal_uInt64>(nLen) )
193     {
194         fprintf( stderr, "could not read %u bytes\n", nLen );
195         rtl_freeMemory( pBuf );
196         return false;
197     }
198     bool bRet = write( pBuf, nLen );
199     rtl_freeMemory( pBuf );
200     return bRet;
201 }
202 
readOrigBytes(unsigned int nOrigOffset,unsigned int nLen,void * pBuf)203 unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw()
204 {
205     if( nOrigOffset + nLen > m_nReadLen )
206         return 0;
207 
208     if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None )
209     {
210         fprintf( stderr, "could not seek to offset %u\n", nOrigOffset );
211         return 0;
212     }
213     sal_uInt64 nBytesRead = 0;
214     if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None )
215         return 0;
216     return static_cast<unsigned int>(nBytesRead);
217 }
218 
219 typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*);
220 
handleFile(const char * pInFile,const char * pOutFile,const char * pPassword,PDFFileHdl pHdl)221 int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl )
222 {
223 
224     PDFReader aParser;
225     int nRet = 0;
226     PDFEntry* pEntry = aParser.read( pInFile );
227     if( pEntry )
228     {
229         PDFFile* pPDFFile = dynamic_cast<PDFFile*>(pEntry);
230         if( pPDFFile )
231         {
232             fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" );
233             if( pPassword )
234                 fprintf( stdout, "password %s\n",
235                          pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" );
236             nRet = pHdl( pInFile, pOutFile, pPDFFile );
237         }
238         else
239             nRet = 20;
240         delete pEntry;
241     }
242     return nRet;
243 }
244 
write_unzipFile(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)245 int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
246 {
247     FileEmitContext aContext( pOutFile, pInFile, pPDFFile );
248     aContext.m_bDecrypt = pPDFFile->isEncrypted();
249     pPDFFile->emit(aContext);
250     return 0;
251 }
252 
write_addStreamArray(const char * pOutFile,PDFArray * pStreams,PDFFile * pPDFFile,const char * pInFile)253 int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile )
254 {
255     int nRet = 0;
256     unsigned int nArrayElements = pStreams->m_aSubElements.size();
257     for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ )
258     {
259         PDFName* pMimeType = dynamic_cast<PDFName*>(pStreams->m_aSubElements[i]);
260         PDFObjectRef* pStreamRef = dynamic_cast<PDFObjectRef*>(pStreams->m_aSubElements[i+1]);
261         if( ! pMimeType )
262             fprintf( stderr, "error: no mimetype element\n" );
263         if( ! pStreamRef )
264             fprintf( stderr, "error: no stream ref element\n" );
265         if( pMimeType && pStreamRef )
266         {
267             fprintf( stdout, "found stream %d %d with mimetype %s\n",
268                      pStreamRef->m_nNumber, pStreamRef->m_nGeneration,
269                      pMimeType->m_aName.getStr() );
270             PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration );
271             if( pObject )
272             {
273                 rtl::OStringBuffer aOutStream( pOutFile );
274                 aOutStream.append( "_stream_" );
275                 aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
276                 aOutStream.append( "_" );
277                 aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
278                 FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile );
279                 aContext.m_bDecrypt = pPDFFile->isEncrypted();
280                 pObject->writeStream( aContext, pPDFFile );
281             }
282             else
283             {
284                 fprintf( stderr, "object not found\n" );
285                 nRet = 121;
286             }
287         }
288         else
289             nRet = 120;
290     }
291     return nRet;
292 }
293 
write_addStreams(const char * pInFile,const char * pOutFile,PDFFile * pPDFFile)294 int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile )
295 {
296     // find all trailers
297     int nRet = 0;
298     unsigned int nElements = pPDFFile->m_aSubElements.size();
299     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
300     {
301         PDFTrailer* pTrailer = dynamic_cast<PDFTrailer*>(pPDFFile->m_aSubElements[i]);
302         if( pTrailer && pTrailer->m_pDict )
303         {
304             // search for AdditionalStreams entry
305             std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator add_stream;
306             add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" );
307             if( add_stream != pTrailer->m_pDict->m_aMap.end() )
308             {
309                 PDFArray* pStreams = dynamic_cast<PDFArray*>(add_stream->second);
310                 if( pStreams )
311                     nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile );
312             }
313         }
314     }
315     return nRet;
316 }
317 
write_fonts(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)318 int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
319 {
320     int nRet = 0;
321     unsigned int nElements = i_pPDFFile->m_aSubElements.size();
322     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
323     {
324         // search FontDescriptors
325         PDFObject* pObj = dynamic_cast<PDFObject*>(i_pPDFFile->m_aSubElements[i]);
326         if( ! pObj )
327             continue;
328         PDFDict* pDict = dynamic_cast<PDFDict*>(pObj->m_pObject);
329         if( ! pDict )
330             continue;
331 
332         std::hash_map<rtl::OString,PDFEntry*,rtl::OStringHash>::iterator map_it =
333                 pDict->m_aMap.find( "Type" );
334         if( map_it == pDict->m_aMap.end() )
335             continue;
336 
337         PDFName* pName = dynamic_cast<PDFName*>(map_it->second);
338         if( ! pName )
339             continue;
340         if( ! pName->m_aName.equals( "FontDescriptor" ) )
341             continue;
342 
343         // the font name will be helpful, also there must be one in
344         // a font descriptor
345         map_it = pDict->m_aMap.find( "FontName" );
346         if( map_it == pDict->m_aMap.end() )
347             continue;
348         pName = dynamic_cast<PDFName*>(map_it->second);
349         if( ! pName )
350             continue;
351         rtl::OString aFontName( pName->m_aName );
352 
353         PDFObjectRef* pStreamRef = 0;
354         const char* pFileType = NULL;
355         // we have a font descriptor, try for a type 1 font
356         map_it = pDict->m_aMap.find( "FontFile" );
357         if( map_it != pDict->m_aMap.end() )
358         {
359             pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
360             if( pStreamRef )
361                 pFileType = "pfa";
362         }
363 
364         // perhaps it's a truetype file ?
365         if( ! pStreamRef )
366         {
367             map_it  = pDict->m_aMap.find( "FontFile2" );
368             if( map_it != pDict->m_aMap.end() )
369             {
370                 pStreamRef = dynamic_cast<PDFObjectRef*>(map_it->second);
371                 if( pStreamRef )
372                     pFileType = "ttf";
373             }
374         }
375 
376         if( ! pStreamRef )
377             continue;
378 
379         PDFObject* pStream = i_pPDFFile->findObject( pStreamRef );
380         if( ! pStream )
381             continue;
382 
383         rtl::OStringBuffer aOutStream( i_pOutFile );
384         aOutStream.append( "_font_" );
385         aOutStream.append( sal_Int32(pStreamRef->m_nNumber) );
386         aOutStream.append( "_" );
387         aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) );
388         aOutStream.append( "_" );
389         aOutStream.append( aFontName );
390         if( pFileType )
391         {
392             aOutStream.append( "." );
393             aOutStream.append( pFileType );
394         }
395         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
396         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
397         pStream->writeStream( aContext, i_pPDFFile );
398     }
399     return nRet;
400 }
401 
402 std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects;
403 
write_objects(const char * i_pInFile,const char * i_pOutFile,PDFFile * i_pPDFFile)404 int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile )
405 {
406     int nRet = 0;
407     unsigned int nElements = s_aEmitObjects.size();
408     for( unsigned i = 0; i < nElements && nRet == 0; i++ )
409     {
410         sal_Int32 nObject     = s_aEmitObjects[i].first;
411         sal_Int32 nGeneration = s_aEmitObjects[i].second;
412         PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration );
413         if( ! pStream )
414         {
415             fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration );
416             continue;
417         }
418 
419         rtl::OStringBuffer aOutStream( i_pOutFile );
420         aOutStream.append( "_stream_" );
421         aOutStream.append( nObject );
422         aOutStream.append( "_" );
423         aOutStream.append( nGeneration );
424         FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile );
425         aContext.m_bDecrypt = i_pPDFFile->isEncrypted();
426         pStream->writeStream( aContext, i_pPDFFile );
427     }
428     return nRet;
429 }
430 
SAL_IMPLEMENT_MAIN_WITH_ARGS(argc,argv)431 SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv )
432 {
433     const char* pInFile = NULL;
434     const char* pOutFile = NULL;
435     const char* pPassword = NULL;
436     OStringBuffer aOutFile( 256 );
437     PDFFileHdl aHdl = write_unzipFile;
438 
439     for( int nArg = 1; nArg < argc; nArg++ )
440     {
441         if( argv[nArg][0] == '-' )
442         {
443             if( ! rtl_str_compare( "-pw", argv[nArg] ) ||
444                 ! rtl_str_compare( "--password" , argv[nArg] ) )
445             {
446                 if( nArg == argc-1 )
447                 {
448                     fprintf( stderr, "no password given\n" );
449                     return 1;
450                 }
451                 nArg++;
452                 pPassword = argv[nArg];
453             }
454             else if( ! rtl_str_compare( "-h", argv[nArg] ) ||
455                 ! rtl_str_compare( "--help", argv[nArg] ) )
456             {
457                 printHelp( argv[0] );
458                 return 0;
459             }
460             else if( ! rtl_str_compare( "-a", argv[nArg] ) ||
461                 ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) )
462             {
463                 aHdl = write_addStreams;
464             }
465             else if( ! rtl_str_compare( "-f", argv[nArg] ) ||
466                 ! rtl_str_compare( "--extract-fonts", argv[nArg] ) )
467             {
468                 aHdl = write_fonts;
469             }
470             else if( ! rtl_str_compare( "-o", argv[nArg] ) ||
471                 ! rtl_str_compare( "--extract-objects", argv[nArg] ) )
472             {
473                 aHdl = write_objects;
474                 nArg++;
475                 if( nArg < argc )
476                 {
477                     rtl::OString aObjs( argv[nArg] );
478                     sal_Int32 nIndex = 0;
479                     while( nIndex != -1 )
480                     {
481                         rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) );
482                         sal_Int32 nObject = 0;
483                         sal_Int32 nGeneration = 0;
484                         sal_Int32 nGenIndex = 0;
485                         nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32();
486                         if( nGenIndex != -1 )
487                             nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32();
488                         s_aEmitObjects.push_back( std::pair<sal_Int32,sal_Int32>(nObject,nGeneration) );
489                     }
490                 }
491             }
492             else
493             {
494                 fprintf( stderr, "unrecognized option \"%s\"\n",
495                          argv[nArg] );
496                 printHelp( argv[0] );
497                 return 1;
498             }
499         }
500         else if( pInFile == NULL )
501             pInFile = argv[nArg];
502         else if( pOutFile == NULL )
503             pOutFile = argv[nArg];
504     }
505     if( ! pInFile )
506     {
507         fprintf( stderr, "no input file given\n" );
508         return 10;
509     }
510     if( ! pOutFile )
511     {
512         OString aFile( pInFile );
513         if( aFile.getLength() > 0 )
514         {
515             if( aFile.getLength() > 4 )
516             {
517                 if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) )
518                     aOutFile.append( pInFile, aFile.getLength() - 4 );
519                 else
520                     aOutFile.append( aFile );
521             }
522             aOutFile.append( "_unzip.pdf" );
523             pOutFile = aOutFile.getStr();
524         }
525         else
526         {
527             fprintf( stderr, "no output file given\n" );
528             return 11;
529         }
530     }
531 
532     return handleFile( pInFile, pOutFile, pPassword, aHdl );
533 }
534 
535