/************************************************************** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *************************************************************/ // MARKER(update_precomp.py): autogen include statement, do not remove #include "precompiled_sdext.hxx" #include #include #include #include #include #include #include #include "pdfparse.hxx" using namespace rtl; using namespace pdfparse; void printHelp( const char* pExe ) { fprintf( stdout, "USAGE: %s [-h,--help]\n" " %s [-pw, --password ] []\n" " %s <-a, --extract-add-streams> [-pw, --password ] []\n" " %s <-f, --extract-fonts> [-pw, --password ] []\n" " %s <-o, --extract-objects> [:][,[:g1][,...]] [-pw, --password ] []\n" " -h, --help: show help\n" " -a, --extract-add-streams: extracts additional streams to outputfile_object\n" " and prints the mimetype found to stdout\n" " -f, --extract-fonts: extracts fonts (currently only type1 and truetype are supported\n" " -o, --extract-objects: extracts object streams, the syntax of the argument is comma separated\n" " object numbers, where object number and generation number are separated by \':\'\n" " an omitted generation number defaults to 0\n" " -pw, --password: use password for decryption\n" "\n" "note: -f, -a, -o and normal unzip operation are mutually exclusive\n" , pExe, pExe, pExe, pExe, pExe ); } class FileEmitContext : public EmitContext { oslFileHandle m_aHandle; oslFileHandle m_aReadHandle; unsigned int m_nReadLen; void openReadFile( const char* pOrigName ); public: FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ); virtual ~FileEmitContext(); virtual bool write( const void* pBuf, unsigned int nLen ) throw(); virtual unsigned int getCurPos() throw(); virtual bool copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw(); virtual unsigned int readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw(); }; FileEmitContext::FileEmitContext( const char* pFileName, const char* pOrigName, const PDFContainer* pTop ) : EmitContext( pTop ), m_aHandle( NULL ), m_aReadHandle( NULL ), m_nReadLen( 0 ) { OUString aSysFile( OStringToOUString( OString( pFileName ), osl_getThreadTextEncoding() ) ); OUString aURL; if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) { fprintf( stderr, "filename conversion \"%s\" failed\n", pFileName ); return; } if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write ) == osl_File_E_None ) { if( osl_setFileSize( m_aHandle, 0 ) != osl_File_E_None ) { fprintf( stderr, "could not truncate %s\n", pFileName ); osl_closeFile( m_aHandle ); m_aHandle = NULL; } } else if( osl_openFile( aURL.pData, &m_aHandle, osl_File_OpenFlag_Write |osl_File_OpenFlag_Create ) != osl_File_E_None ) { fprintf( stderr, "could not open %s\n", pFileName ); return; } m_bDeflate = true; openReadFile( pOrigName ); } FileEmitContext::~FileEmitContext() { if( m_aHandle ) osl_closeFile( m_aHandle ); if( m_aReadHandle ) osl_closeFile( m_aReadHandle ); } void FileEmitContext::openReadFile( const char* pInFile ) { OUString aSysFile( OStringToOUString( OString( pInFile ), osl_getThreadTextEncoding() ) ); OUString aURL; if( osl_getFileURLFromSystemPath( aSysFile.pData, &aURL.pData ) != osl_File_E_None ) { fprintf( stderr, "filename conversion \"%s\" failed\n", pInFile ); return; } if( osl_openFile( aURL.pData, &m_aReadHandle, osl_File_OpenFlag_Read ) != osl_File_E_None ) { fprintf( stderr, "could not open %s\n", pInFile ); return; } if( osl_setFilePos( m_aReadHandle, osl_Pos_End, 0 ) != osl_File_E_None ) { fprintf( stderr, "could not seek to end of %s\n", pInFile ); osl_closeFile( m_aReadHandle ); return; } sal_uInt64 nFileSize = 0; if( osl_getFilePos( m_aReadHandle, &nFileSize ) != osl_File_E_None ) { fprintf( stderr, "could not get end pos of %s\n", pInFile ); osl_closeFile( m_aReadHandle ); return; } m_nReadLen = static_cast(nFileSize); } bool FileEmitContext::write( const void* pBuf, unsigned int nLen ) throw() { if( ! m_aHandle ) return false; sal_uInt64 nWrite = static_cast(nLen); sal_uInt64 nWritten = 0; return (osl_writeFile( m_aHandle, pBuf, nWrite, &nWritten ) == osl_File_E_None) && nWrite == nWritten; } unsigned int FileEmitContext::getCurPos() throw() { sal_uInt64 nFileSize = 0; if( m_aHandle ) { if( osl_getFilePos( m_aHandle, &nFileSize ) != osl_File_E_None ) nFileSize = 0; } return static_cast(nFileSize); } bool FileEmitContext::copyOrigBytes( unsigned int nOrigOffset, unsigned int nLen ) throw() { if( nOrigOffset + nLen > m_nReadLen ) return false; if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) { fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); return false; } void* pBuf = rtl_allocateMemory( nLen ); if( ! pBuf ) return false; sal_uInt64 nBytesRead = 0; if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None || nBytesRead != static_cast(nLen) ) { fprintf( stderr, "could not read %u bytes\n", nLen ); rtl_freeMemory( pBuf ); return false; } bool bRet = write( pBuf, nLen ); rtl_freeMemory( pBuf ); return bRet; } unsigned int FileEmitContext::readOrigBytes( unsigned int nOrigOffset, unsigned int nLen, void* pBuf ) throw() { if( nOrigOffset + nLen > m_nReadLen ) return 0; if( osl_setFilePos( m_aReadHandle, osl_Pos_Absolut, nOrigOffset ) != osl_File_E_None ) { fprintf( stderr, "could not seek to offset %u\n", nOrigOffset ); return 0; } sal_uInt64 nBytesRead = 0; if( osl_readFile( m_aReadHandle, pBuf, nLen, &nBytesRead ) != osl_File_E_None ) return 0; return static_cast(nBytesRead); } typedef int(*PDFFileHdl)(const char*, const char*, PDFFile*); int handleFile( const char* pInFile, const char* pOutFile, const char* pPassword, PDFFileHdl pHdl ) { PDFReader aParser; int nRet = 0; PDFEntry* pEntry = aParser.read( pInFile ); if( pEntry ) { PDFFile* pPDFFile = dynamic_cast(pEntry); if( pPDFFile ) { fprintf( stdout, "have a %s PDF file\n", pPDFFile->isEncrypted() ? "encrypted" : "unencrypted" ); if( pPassword ) fprintf( stdout, "password %s\n", pPDFFile->setupDecryptionData( pPassword ) ? "matches" : "does not match" ); nRet = pHdl( pInFile, pOutFile, pPDFFile ); } else nRet = 20; delete pEntry; } return nRet; } int write_unzipFile( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) { FileEmitContext aContext( pOutFile, pInFile, pPDFFile ); aContext.m_bDecrypt = pPDFFile->isEncrypted(); pPDFFile->emit(aContext); return 0; } int write_addStreamArray( const char* pOutFile, PDFArray* pStreams, PDFFile* pPDFFile, const char* pInFile ) { int nRet = 0; unsigned int nArrayElements = pStreams->m_aSubElements.size(); for( unsigned int i = 0; i < nArrayElements-1 && nRet == 0; i++ ) { PDFName* pMimeType = dynamic_cast(pStreams->m_aSubElements[i]); PDFObjectRef* pStreamRef = dynamic_cast(pStreams->m_aSubElements[i+1]); if( ! pMimeType ) fprintf( stderr, "error: no mimetype element\n" ); if( ! pStreamRef ) fprintf( stderr, "error: no stream ref element\n" ); if( pMimeType && pStreamRef ) { fprintf( stdout, "found stream %d %d with mimetype %s\n", pStreamRef->m_nNumber, pStreamRef->m_nGeneration, pMimeType->m_aName.getStr() ); PDFObject* pObject = pPDFFile->findObject( pStreamRef->m_nNumber, pStreamRef->m_nGeneration ); if( pObject ) { rtl::OStringBuffer aOutStream( pOutFile ); aOutStream.append( "_stream_" ); aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); aOutStream.append( "_" ); aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); FileEmitContext aContext( aOutStream.getStr(), pInFile, pPDFFile ); aContext.m_bDecrypt = pPDFFile->isEncrypted(); pObject->writeStream( aContext, pPDFFile ); } else { fprintf( stderr, "object not found\n" ); nRet = 121; } } else nRet = 120; } return nRet; } int write_addStreams( const char* pInFile, const char* pOutFile, PDFFile* pPDFFile ) { // find all trailers int nRet = 0; unsigned int nElements = pPDFFile->m_aSubElements.size(); for( unsigned i = 0; i < nElements && nRet == 0; i++ ) { PDFTrailer* pTrailer = dynamic_cast(pPDFFile->m_aSubElements[i]); if( pTrailer && pTrailer->m_pDict ) { // search for AdditionalStreams entry std::hash_map::iterator add_stream; add_stream = pTrailer->m_pDict->m_aMap.find( "AdditionalStreams" ); if( add_stream != pTrailer->m_pDict->m_aMap.end() ) { PDFArray* pStreams = dynamic_cast(add_stream->second); if( pStreams ) nRet = write_addStreamArray( pOutFile, pStreams, pPDFFile, pInFile ); } } } return nRet; } int write_fonts( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) { int nRet = 0; unsigned int nElements = i_pPDFFile->m_aSubElements.size(); for( unsigned i = 0; i < nElements && nRet == 0; i++ ) { // search FontDescriptors PDFObject* pObj = dynamic_cast(i_pPDFFile->m_aSubElements[i]); if( ! pObj ) continue; PDFDict* pDict = dynamic_cast(pObj->m_pObject); if( ! pDict ) continue; std::hash_map::iterator map_it = pDict->m_aMap.find( "Type" ); if( map_it == pDict->m_aMap.end() ) continue; PDFName* pName = dynamic_cast(map_it->second); if( ! pName ) continue; if( ! pName->m_aName.equals( "FontDescriptor" ) ) continue; // the font name will be helpful, also there must be one in // a font descriptor map_it = pDict->m_aMap.find( "FontName" ); if( map_it == pDict->m_aMap.end() ) continue; pName = dynamic_cast(map_it->second); if( ! pName ) continue; rtl::OString aFontName( pName->m_aName ); PDFObjectRef* pStreamRef = 0; const char* pFileType = NULL; // we have a font descriptor, try for a type 1 font map_it = pDict->m_aMap.find( "FontFile" ); if( map_it != pDict->m_aMap.end() ) { pStreamRef = dynamic_cast(map_it->second); if( pStreamRef ) pFileType = "pfa"; } // perhaps it's a truetype file ? if( ! pStreamRef ) { map_it = pDict->m_aMap.find( "FontFile2" ); if( map_it != pDict->m_aMap.end() ) { pStreamRef = dynamic_cast(map_it->second); if( pStreamRef ) pFileType = "ttf"; } } if( ! pStreamRef ) continue; PDFObject* pStream = i_pPDFFile->findObject( pStreamRef ); if( ! pStream ) continue; rtl::OStringBuffer aOutStream( i_pOutFile ); aOutStream.append( "_font_" ); aOutStream.append( sal_Int32(pStreamRef->m_nNumber) ); aOutStream.append( "_" ); aOutStream.append( sal_Int32(pStreamRef->m_nGeneration) ); aOutStream.append( "_" ); aOutStream.append( aFontName ); if( pFileType ) { aOutStream.append( "." ); aOutStream.append( pFileType ); } FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); pStream->writeStream( aContext, i_pPDFFile ); } return nRet; } std::vector< std::pair< sal_Int32, sal_Int32 > > s_aEmitObjects; int write_objects( const char* i_pInFile, const char* i_pOutFile, PDFFile* i_pPDFFile ) { int nRet = 0; unsigned int nElements = s_aEmitObjects.size(); for( unsigned i = 0; i < nElements && nRet == 0; i++ ) { sal_Int32 nObject = s_aEmitObjects[i].first; sal_Int32 nGeneration = s_aEmitObjects[i].second; PDFObject* pStream = i_pPDFFile->findObject( nObject, nGeneration ); if( ! pStream ) { fprintf( stderr, "object %d %d not found !\n", (int)nObject, (int)nGeneration ); continue; } rtl::OStringBuffer aOutStream( i_pOutFile ); aOutStream.append( "_stream_" ); aOutStream.append( nObject ); aOutStream.append( "_" ); aOutStream.append( nGeneration ); FileEmitContext aContext( aOutStream.getStr(), i_pInFile, i_pPDFFile ); aContext.m_bDecrypt = i_pPDFFile->isEncrypted(); pStream->writeStream( aContext, i_pPDFFile ); } return nRet; } SAL_IMPLEMENT_MAIN_WITH_ARGS( argc, argv ) { const char* pInFile = NULL; const char* pOutFile = NULL; const char* pPassword = NULL; OStringBuffer aOutFile( 256 ); PDFFileHdl aHdl = write_unzipFile; for( int nArg = 1; nArg < argc; nArg++ ) { if( argv[nArg][0] == '-' ) { if( ! rtl_str_compare( "-pw", argv[nArg] ) || ! rtl_str_compare( "--password" , argv[nArg] ) ) { if( nArg == argc-1 ) { fprintf( stderr, "no password given\n" ); return 1; } nArg++; pPassword = argv[nArg]; } else if( ! rtl_str_compare( "-h", argv[nArg] ) || ! rtl_str_compare( "--help", argv[nArg] ) ) { printHelp( argv[0] ); return 0; } else if( ! rtl_str_compare( "-a", argv[nArg] ) || ! rtl_str_compare( "--extract-add-streams", argv[nArg] ) ) { aHdl = write_addStreams; } else if( ! rtl_str_compare( "-f", argv[nArg] ) || ! rtl_str_compare( "--extract-fonts", argv[nArg] ) ) { aHdl = write_fonts; } else if( ! rtl_str_compare( "-o", argv[nArg] ) || ! rtl_str_compare( "--extract-objects", argv[nArg] ) ) { aHdl = write_objects; nArg++; if( nArg < argc ) { rtl::OString aObjs( argv[nArg] ); sal_Int32 nIndex = 0; while( nIndex != -1 ) { rtl::OString aToken( aObjs.getToken( 0, ',', nIndex ) ); sal_Int32 nObject = 0; sal_Int32 nGeneration = 0; sal_Int32 nGenIndex = 0; nObject = aToken.getToken( 0, ':', nGenIndex ).toInt32(); if( nGenIndex != -1 ) nGeneration = aToken.getToken( 0, ':', nGenIndex ).toInt32(); s_aEmitObjects.push_back( std::pair(nObject,nGeneration) ); } } } else { fprintf( stderr, "unrecognized option \"%s\"\n", argv[nArg] ); printHelp( argv[0] ); return 1; } } else if( pInFile == NULL ) pInFile = argv[nArg]; else if( pOutFile == NULL ) pOutFile = argv[nArg]; } if( ! pInFile ) { fprintf( stderr, "no input file given\n" ); return 10; } if( ! pOutFile ) { OString aFile( pInFile ); if( aFile.getLength() > 0 ) { if( aFile.getLength() > 4 ) { if( aFile.matchIgnoreAsciiCase( OString( ".pdf" ), aFile.getLength()-4 ) ) aOutFile.append( pInFile, aFile.getLength() - 4 ); else aOutFile.append( aFile ); } aOutFile.append( "_unzip.pdf" ); pOutFile = aOutFile.getStr(); } else { fprintf( stderr, "no output file given\n" ); return 11; } } return handleFile( pInFile, pOutFile, pPassword, aHdl ); }