1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 #include "HtmlFmtFlt.hxx"
23 
24 #include <rtl/string.h>
25 
26 #include <string>
27 #include <sstream>
28 #include <vector>
29 #include <iomanip>
30 
31 #include <boost/assert.hpp>
32 
33 using namespace com::sun::star::uno;
34 
35 //------------------------------------------------------------------------------
36 // converts the openoffice text/html clipboard format to the HTML Format
37 // well known under MS Windows
38 // the MS HTML Format has a header before the real html data
39 //
// Version:1.0		Version number of the clipboard. Starting version is 0.9
41 // StartHTML:		Byte count from the beginning of the clipboard to the start
42 //					of the context, or -1 if no context
43 // EndHTML:			Byte count from the beginning of the clipboard to the end
44 //					of the context, or -1 if no context
45 // StartFragment:	Byte count from the beginning of the clipboard to the
46 //					start of the fragment
47 // EndFragment:		Byte count from the beginning of the clipboard to the
48 //					end of the fragment
49 // StartSelection:	Byte count from the beginning of the clipboard to the
50 //					start of the selection
51 // EndSelection:	Byte count from the beginning of the clipboard to the
52 //					end of the selection
53 //
54 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
58 //------------------------------------------------------------------------------
59 
60 namespace // private
61 {
// Builds the textual header of the MS Windows "HTML Format" clipboard format.
//
// The header consists of a "Version:1.0" line followed by the StartHTML,
// EndHTML, StartFragment and EndFragment lines, in that order. Every offset
// is written as a decimal number zero padded to 10 digits and every line is
// terminated by CR LF, so the header length does not depend on the values
// (as long as they fit into 10 digits).
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    const char* const fieldNames[] = { "StartHTML:", "EndHTML:", "StartFragment:", "EndFragment:" };
    const size_t fieldValues[] = { startHtml, endHtml, startFragment, endFragment };
    const size_t fieldCount = sizeof(fieldValues) / sizeof(fieldValues[0]);

    std::ostringstream header;
    header << "Version:1.0" << "\r\n";

    // fill character and number base are sticky, the field width is not
    header << std::setfill('0') << std::dec;
    for (size_t i = 0; i < fieldCount; ++i)
    {
        header << fieldNames[i] << std::setw(10) << fieldValues[i] << "\r\n";
    }

    return header.str();
}
72 
73 } // namespace private
74 
75 
// The office always writes the opening and closing html tag in upper case
// and without spaces; neither tag carries parameters, so the complete tags
// can be searched for literally.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tags may carry parameters, e.g. <BODY param>, so only the tag
// prefix is stored here and the closing '>' has to be searched for
// separately. #92840#
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
85 
TextHtmlToHTMLFormat(Sequence<sal_Int8> & aTextHtml)86 Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
87 {
88 	OSL_ASSERT(aTextHtml.getLength() > 0);
89 
90 	if (!(aTextHtml.getLength() > 0))
91 		return Sequence<sal_Int8>();
92 
93 	// fill the buffer with dummy values to calc the exact length
94     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
95 	size_t lHtmlFormatHeader = dummyHtmlHeader.length();
96 
97 	std::string textHtml(
98 	    reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
99 		reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
100 
101 	std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
102 	std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
103 
104 	// The body tag may have parameters so we need to search for the
105 	// closing '>' manually e.g. <BODY param> #92840#
106 	std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
107 	std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
108 
109 	std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
110 	htmlFormat += textHtml;
111 
112 	Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
113 	rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
114 
115 	rtl_copyMemory(
116 		static_cast<void*>(byteSequence.getArray()),
117 		static_cast<const void*>(htmlFormat.c_str()),
118 		htmlFormat.length());
119 
120 	return byteSequence;
121 }
122 
123 const char* HtmlStartTag = "<html";
124 
HTMLFormatToTextHtml(const Sequence<sal_Int8> & aHTMLFormat)125 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
126 {
127   BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
128 
129   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
130   sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
131   sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
132   const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
133 
134   BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
135 
136   // It doesn't seem to be HTML? Well then simply return what has been
137   // provided in non-debug builds
138   if (htmlStartTag == NULL)
139 	{
140 	return aHTMLFormat;
141 	}
142 
143   sal_Int32 len = dataEnd - htmlStartTag;
144   Sequence<sal_Int8> plainHtmlData(len);
145 
146   rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
147 
148   return plainHtmlData;
149 }
150 
151 /* A simple format detection. We are just comparing the first few bytes
152    of the provided byte sequence to see whether or not it is the MS
153    Office Html format. If it shows that this is not reliable enough we
154    can improve this
155 */
156 const char HtmlFormatStart[] = "Version:";
157 int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
158 
isHTMLFormat(const Sequence<sal_Int8> & aHtmlSequence)159 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
160 {
161   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
162 	return false;
163 
164   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
165 												   HtmlFormatStartLen,
166 												   reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
167 												   HtmlFormatStartLen) == 0;
168 }
169