1 #include "HtmlFmtFlt.hxx" 2 3 #include <rtl/string.h> 4 5 #include <string> 6 #include <sstream> 7 #include <vector> 8 #include <iomanip> 9 10 #include <boost/assert.hpp> 11 12 using namespace com::sun::star::uno; 13 14 //------------------------------------------------------------------------------ 15 // converts the openoffice text/html clipboard format to the HTML Format 16 // well known under MS Windows 17 // the MS HTML Format has a header before the real html data 18 // 19 // Version:1.0 Version number of the clipboard. Staring is 0.9 20 // StartHTML: Byte count from the beginning of the clipboard to the start 21 // of the context, or -1 if no context 22 // EndHTML: Byte count from the beginning of the clipboard to the end 23 // of the context, or -1 if no context 24 // StartFragment: Byte count from the beginning of the clipboard to the 25 // start of the fragment 26 // EndFragment: Byte count from the beginning of the clipboard to the 27 // end of the fragment 28 // StartSelection: Byte count from the beginning of the clipboard to the 29 // start of the selection 30 // EndSelection: Byte count from the beginning of the clipboard to the 31 // end of the selection 32 // 33 // StartSelection and EndSelection are optional 34 // The fragment should be preceded and followed by the HTML comments 35 // <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the 36 // text 37 //------------------------------------------------------------------------------ 38 39 namespace // private 40 { 41 std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment) 42 { 43 std::ostringstream htmlHeader; 44 htmlHeader << "Version:1.0" << '\r' << '\n'; 45 htmlHeader << "StartHTML:" << std::setw(10) << std::setfill('0') << std::dec << startHtml << '\r' << '\n'; 46 htmlHeader << "EndHTML:" << std::setw(10) << std::setfill('0') << std::dec << endHtml << '\r' << '\n'; 47 htmlHeader << "StartFragment:" << std::setw(10) << std::setfill('0') << std::dec << startFragment << '\r' << '\n'; 48 htmlHeader << "EndFragment:" << std::setw(10) << std::setfill('0') << std::dec << endFragment << '\r' << '\n'; 49 return htmlHeader.str(); 50 } 51 52 } // namespace private 53 54 55 // the office allways writes the start and end html tag in upper cases and 56 // without spaces both tags don't allow parameters 57 const std::string TAG_HTML = std::string("<HTML>"); 58 const std::string TAG_END_HTML = std::string("</HTML>"); 59 60 // The body tag may have parameters so we need to search for the 61 // closing '>' manually e.g. <BODY param> #92840# 62 const std::string TAG_BODY = std::string("<BODY"); 63 const std::string TAG_END_BODY = std::string("</BODY"); 64 65 Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml) 66 { 67 OSL_ASSERT(aTextHtml.getLength() > 0); 68 69 if (!(aTextHtml.getLength() > 0)) 70 return Sequence<sal_Int8>(); 71 72 // fill the buffer with dummy values to calc the exact length 73 std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0); 74 size_t lHtmlFormatHeader = dummyHtmlHeader.length(); 75 76 std::string textHtml( 77 reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()), 78 reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength()); 79 80 std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so 81 std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>? 82 83 // The body tag may have parameters so we need to search for the 84 // closing '>' manually e.g. <BODY param> #92840# 85 std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1; 86 std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader; 87 88 std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment); 89 htmlFormat += textHtml; 90 91 Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0' 92 rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength()); 93 94 rtl_copyMemory( 95 static_cast<void*>(byteSequence.getArray()), 96 static_cast<const void*>(htmlFormat.c_str()), 97 htmlFormat.length()); 98 99 return byteSequence; 100 } 101 102 const char* HtmlStartTag = "<html"; 103 104 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat) 105 { 106 BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided"); 107 108 Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat); 109 sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray()); 110 sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1; 111 const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag); 112 113 BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all"); 114 115 // It doesn't seem to be HTML? Well then simply return what has been 116 // provided in non-debug builds 117 if (htmlStartTag == NULL) 118 { 119 return aHTMLFormat; 120 } 121 122 sal_Int32 len = dataEnd - htmlStartTag; 123 Sequence<sal_Int8> plainHtmlData(len); 124 125 rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len); 126 127 return plainHtmlData; 128 } 129 130 /* A simple format detection. We are just comparing the first few bytes 131 of the provided byte sequence to see whether or not it is the MS 132 Office Html format. If it shows that this is not reliable enough we 133 can improve this 134 */ 135 const char HtmlFormatStart[] = "Version:"; 136 int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1); 137 138 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence) 139 { 140 if (aHtmlSequence.getLength() < HtmlFormatStartLen) 141 return false; 142 143 return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart, 144 HtmlFormatStartLen, 145 reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()), 146 HtmlFormatStartLen) == 0; 147 } 148