xref: /trunk/main/svtools/source/svhtml/parhtml.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir 
28*cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
29*cdf0e10cSrcweir #include "precompiled_svtools.hxx"
30*cdf0e10cSrcweir 
31*cdf0e10cSrcweir #include <ctype.h>
32*cdf0e10cSrcweir #include <stdio.h>
33*cdf0e10cSrcweir #include <tools/stream.hxx>
34*cdf0e10cSrcweir #include <tools/debug.hxx>
35*cdf0e10cSrcweir #include <tools/color.hxx>
36*cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
37*cdf0e10cSrcweir #include <rtl/strbuf.hxx>
38*cdf0e10cSrcweir #ifndef _SVSTDARR_HXX
39*cdf0e10cSrcweir #define _SVSTDARR_ULONGS
40*cdf0e10cSrcweir #include <svl/svstdarr.hxx>
41*cdf0e10cSrcweir #endif
42*cdf0e10cSrcweir 
43*cdf0e10cSrcweir #include <tools/tenccvt.hxx>
44*cdf0e10cSrcweir #include <tools/datetime.hxx>
45*cdf0e10cSrcweir #include <svl/inettype.hxx>
46*cdf0e10cSrcweir #include <comphelper/string.hxx>
47*cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp>
48*cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp>
49*cdf0e10cSrcweir 
50*cdf0e10cSrcweir #include <svtools/parhtml.hxx>
51*cdf0e10cSrcweir #include <svtools/htmltokn.h>
52*cdf0e10cSrcweir #include <svtools/htmlkywd.hxx>
53*cdf0e10cSrcweir 
54*cdf0e10cSrcweir 
55*cdf0e10cSrcweir using namespace ::com::sun::star;
56*cdf0e10cSrcweir 
57*cdf0e10cSrcweir 
58*cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L );
59*cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ];
60*cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 );
61*cdf0e10cSrcweir 
62*cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L );
63*cdf0e10cSrcweir 
64*cdf0e10cSrcweir /*  */
65*cdf0e10cSrcweir 
66*cdf0e10cSrcweir // Tabellen zum Umwandeln von Options-Werten in Strings
67*cdf0e10cSrcweir 
68*cdf0e10cSrcweir // <INPUT TYPE=xxx>
69*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] =
70*cdf0e10cSrcweir {
71*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_text,      HTML_IT_TEXT        },
72*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_password,  HTML_IT_PASSWORD    },
73*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTML_IT_CHECKBOX    },
74*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTML_IT_RADIO       },
75*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_range,     HTML_IT_RANGE       },
76*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTML_IT_SCRIBBLE    },
77*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_file,      HTML_IT_FILE        },
78*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTML_IT_HIDDEN      },
79*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTML_IT_SUBMIT      },
80*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_image,     HTML_IT_IMAGE       },
81*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTML_IT_RESET       },
82*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_button,    HTML_IT_BUTTON      },
83*cdf0e10cSrcweir     { 0,                    0                   }
84*cdf0e10cSrcweir };
85*cdf0e10cSrcweir 
86*cdf0e10cSrcweir // <TABLE FRAME=xxx>
87*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] =
88*cdf0e10cSrcweir {
89*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_void,  HTML_TF_VOID    },
90*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE   },
91*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW   },
92*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_hsides,    HTML_TF_HSIDES  },
93*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_lhs,       HTML_TF_LHS     },
94*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_rhs,       HTML_TF_RHS     },
95*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_vsides,    HTML_TF_VSIDES  },
96*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_box,       HTML_TF_BOX     },
97*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_border,    HTML_TF_BOX     },
98*cdf0e10cSrcweir     { 0,                0               }
99*cdf0e10cSrcweir };
100*cdf0e10cSrcweir 
101*cdf0e10cSrcweir // <TABLE RULES=xxx>
102*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] =
103*cdf0e10cSrcweir {
104*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_none,  HTML_TR_NONE    },
105*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_groups,    HTML_TR_GROUPS  },
106*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_rows,  HTML_TR_ROWS    },
107*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_cols,  HTML_TR_COLS    },
108*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_all,       HTML_TR_ALL     },
109*cdf0e10cSrcweir     { 0,                0               }
110*cdf0e10cSrcweir };
111*cdf0e10cSrcweir 
112*cdf0e10cSrcweir 
113*cdf0e10cSrcweir SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr)
114*cdf0e10cSrcweir 
115*cdf0e10cSrcweir /*  */
116*cdf0e10cSrcweir 
117*cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
118*cdf0e10cSrcweir {
119*cdf0e10cSrcweir     sal_uInt16 nValue = nDflt;
120*cdf0e10cSrcweir 
121*cdf0e10cSrcweir     while( pOptEnums->pName )
122*cdf0e10cSrcweir         if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
123*cdf0e10cSrcweir             break;
124*cdf0e10cSrcweir         else
125*cdf0e10cSrcweir             pOptEnums++;
126*cdf0e10cSrcweir 
127*cdf0e10cSrcweir     if( pOptEnums->pName )
128*cdf0e10cSrcweir         nValue = pOptEnums->nValue;
129*cdf0e10cSrcweir 
130*cdf0e10cSrcweir     return nValue;
131*cdf0e10cSrcweir }
132*cdf0e10cSrcweir 
133*cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
134*cdf0e10cSrcweir {
135*cdf0e10cSrcweir     while( pOptEnums->pName )
136*cdf0e10cSrcweir     {
137*cdf0e10cSrcweir         if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
138*cdf0e10cSrcweir             break;
139*cdf0e10cSrcweir         else
140*cdf0e10cSrcweir             pOptEnums++;
141*cdf0e10cSrcweir     }
142*cdf0e10cSrcweir 
143*cdf0e10cSrcweir     const sal_Char *pName = pOptEnums->pName;
144*cdf0e10cSrcweir     if( pName )
145*cdf0e10cSrcweir         rEnum = pOptEnums->nValue;
146*cdf0e10cSrcweir 
147*cdf0e10cSrcweir     return (pName != 0);
148*cdf0e10cSrcweir }
149*cdf0e10cSrcweir 
150*cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken,
151*cdf0e10cSrcweir                         const String& rValue )
152*cdf0e10cSrcweir     : aValue(rValue)
153*cdf0e10cSrcweir     , aToken(rToken)
154*cdf0e10cSrcweir     , nToken( nTok )
155*cdf0e10cSrcweir {
156*cdf0e10cSrcweir     DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
157*cdf0e10cSrcweir         "HTMLOption: unbekanntes Token" );
158*cdf0e10cSrcweir }
159*cdf0e10cSrcweir 
160*cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const
161*cdf0e10cSrcweir {
162*cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
163*cdf0e10cSrcweir                  nToken<HTML_OPTION_NUMBER_END) ||
164*cdf0e10cSrcweir                 (nToken>=HTML_OPTION_CONTEXT_START &&
165*cdf0e10cSrcweir                  nToken<HTML_OPTION_CONTEXT_END) ||
166*cdf0e10cSrcweir                 nToken==HTML_O_VALUE,
167*cdf0e10cSrcweir         "GetNumber: Option ist nicht numerisch" );
168*cdf0e10cSrcweir     String aTmp( aValue );
169*cdf0e10cSrcweir     aTmp.EraseLeadingChars();
170*cdf0e10cSrcweir     sal_Int32 nTmp = aTmp.ToInt32();
171*cdf0e10cSrcweir     return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
172*cdf0e10cSrcweir }
173*cdf0e10cSrcweir 
174*cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const
175*cdf0e10cSrcweir {
176*cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
177*cdf0e10cSrcweir                 (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
178*cdf0e10cSrcweir         "GetSNumber: Option ist nicht numerisch" );
179*cdf0e10cSrcweir     String aTmp( aValue );
180*cdf0e10cSrcweir     aTmp.EraseLeadingChars();
181*cdf0e10cSrcweir     return aTmp.ToInt32();
182*cdf0e10cSrcweir }
183*cdf0e10cSrcweir 
184*cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const
185*cdf0e10cSrcweir {
186*cdf0e10cSrcweir     if( rLongs.Count() )
187*cdf0e10cSrcweir         rLongs.Remove( 0, rLongs.Count() );
188*cdf0e10cSrcweir 
189*cdf0e10cSrcweir     if( bSpaceDelim )
190*cdf0e10cSrcweir     {
191*cdf0e10cSrcweir         // das ist ein sehr stark vereinfachter Scanner. Er sucht einfach
192*cdf0e10cSrcweir         // alle Tiffern aus dem String
193*cdf0e10cSrcweir         sal_Bool bInNum = sal_False;
194*cdf0e10cSrcweir         sal_uLong nNum = 0;
195*cdf0e10cSrcweir         for( xub_StrLen i=0; i<aValue.Len(); i++ )
196*cdf0e10cSrcweir         {
197*cdf0e10cSrcweir             register sal_Unicode c = aValue.GetChar( i );
198*cdf0e10cSrcweir             if( c>='0' && c<='9' )
199*cdf0e10cSrcweir             {
200*cdf0e10cSrcweir                 nNum *= 10;
201*cdf0e10cSrcweir                 nNum += (c - '0');
202*cdf0e10cSrcweir                 bInNum = sal_True;
203*cdf0e10cSrcweir             }
204*cdf0e10cSrcweir             else if( bInNum )
205*cdf0e10cSrcweir             {
206*cdf0e10cSrcweir                 rLongs.Insert( nNum, rLongs.Count() );
207*cdf0e10cSrcweir                 bInNum = sal_False;
208*cdf0e10cSrcweir                 nNum = 0;
209*cdf0e10cSrcweir             }
210*cdf0e10cSrcweir         }
211*cdf0e10cSrcweir         if( bInNum )
212*cdf0e10cSrcweir         {
213*cdf0e10cSrcweir             rLongs.Insert( nNum, rLongs.Count() );
214*cdf0e10cSrcweir         }
215*cdf0e10cSrcweir     }
216*cdf0e10cSrcweir     else
217*cdf0e10cSrcweir     {
218*cdf0e10cSrcweir         // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet
219*cdf0e10cSrcweir         // und auch mal eine 0 eingefuegt
220*cdf0e10cSrcweir         xub_StrLen nPos = 0;
221*cdf0e10cSrcweir         while( nPos < aValue.Len() )
222*cdf0e10cSrcweir         {
223*cdf0e10cSrcweir             register sal_Unicode c;
224*cdf0e10cSrcweir             while( nPos < aValue.Len() &&
225*cdf0e10cSrcweir                    ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' ||
226*cdf0e10cSrcweir                    c == '\n' || c== '\r' ) )
227*cdf0e10cSrcweir                 nPos++;
228*cdf0e10cSrcweir 
229*cdf0e10cSrcweir             if( nPos==aValue.Len() )
230*cdf0e10cSrcweir                 rLongs.Insert( sal_uLong(0), rLongs.Count() );
231*cdf0e10cSrcweir             else
232*cdf0e10cSrcweir             {
233*cdf0e10cSrcweir                 xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos );
234*cdf0e10cSrcweir                 if( STRING_NOTFOUND==nEnd )
235*cdf0e10cSrcweir                 {
236*cdf0e10cSrcweir                     sal_Int32 nTmp = aValue.Copy(nPos).ToInt32();
237*cdf0e10cSrcweir                     rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
238*cdf0e10cSrcweir                                    rLongs.Count() );
239*cdf0e10cSrcweir                     nPos = aValue.Len();
240*cdf0e10cSrcweir                 }
241*cdf0e10cSrcweir                 else
242*cdf0e10cSrcweir                 {
243*cdf0e10cSrcweir                     sal_Int32 nTmp =
244*cdf0e10cSrcweir                         aValue.Copy(nPos,nEnd-nPos).ToInt32();
245*cdf0e10cSrcweir                     rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
246*cdf0e10cSrcweir                                    rLongs.Count() );
247*cdf0e10cSrcweir                     nPos = nEnd+1;
248*cdf0e10cSrcweir                 }
249*cdf0e10cSrcweir             }
250*cdf0e10cSrcweir         }
251*cdf0e10cSrcweir     }
252*cdf0e10cSrcweir }
253*cdf0e10cSrcweir 
254*cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const
255*cdf0e10cSrcweir {
256*cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
257*cdf0e10cSrcweir         "GetColor: Option spezifiziert keine Farbe" );
258*cdf0e10cSrcweir 
259*cdf0e10cSrcweir     String aTmp( aValue );
260*cdf0e10cSrcweir     aTmp.ToUpperAscii();
261*cdf0e10cSrcweir     sal_uLong nColor = ULONG_MAX;
262*cdf0e10cSrcweir     if( '#'!=aTmp.GetChar( 0 ) )
263*cdf0e10cSrcweir         nColor = GetHTMLColor( aTmp );
264*cdf0e10cSrcweir 
265*cdf0e10cSrcweir     if( ULONG_MAX == nColor )
266*cdf0e10cSrcweir     {
267*cdf0e10cSrcweir         nColor = 0;
268*cdf0e10cSrcweir         xub_StrLen nPos = 0;
269*cdf0e10cSrcweir         for( sal_uInt32 i=0; i<6; i++ )
270*cdf0e10cSrcweir         {
271*cdf0e10cSrcweir             // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt,
272*cdf0e10cSrcweir             // maximal drei Zeichen, die kleiner als '0' sind werden
273*cdf0e10cSrcweir             // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich
274*cdf0e10cSrcweir             // irgendwelche HTML-Autoren noch so einfallen lassen...
275*cdf0e10cSrcweir             register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ )
276*cdf0e10cSrcweir                                                      : '0';
277*cdf0e10cSrcweir             if( c < '0' )
278*cdf0e10cSrcweir             {
279*cdf0e10cSrcweir                 c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
280*cdf0e10cSrcweir                 if( c < '0' )
281*cdf0e10cSrcweir                     c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
282*cdf0e10cSrcweir             }
283*cdf0e10cSrcweir             nColor *= 16;
284*cdf0e10cSrcweir             if( c >= '0' && c <= '9' )
285*cdf0e10cSrcweir                 nColor += (c - 48);
286*cdf0e10cSrcweir             else if( c >= 'A' && c <= 'F' )
287*cdf0e10cSrcweir                 nColor += (c - 55);
288*cdf0e10cSrcweir         }
289*cdf0e10cSrcweir     }
290*cdf0e10cSrcweir 
291*cdf0e10cSrcweir     rColor.SetRed(   (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
292*cdf0e10cSrcweir     rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
293*cdf0e10cSrcweir     rColor.SetBlue(  (sal_uInt8)(nColor & 0x000000ff) );
294*cdf0e10cSrcweir }
295*cdf0e10cSrcweir 
296*cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const
297*cdf0e10cSrcweir {
298*cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" );
299*cdf0e10cSrcweir     return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
300*cdf0e10cSrcweir }
301*cdf0e10cSrcweir 
302*cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const
303*cdf0e10cSrcweir {
304*cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" );
305*cdf0e10cSrcweir     return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
306*cdf0e10cSrcweir }
307*cdf0e10cSrcweir 
308*cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const
309*cdf0e10cSrcweir {
310*cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" );
311*cdf0e10cSrcweir     return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
312*cdf0e10cSrcweir }
313*cdf0e10cSrcweir 
314*cdf0e10cSrcweir /*  */
315*cdf0e10cSrcweir 
316*cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc )
317*cdf0e10cSrcweir     : SvParser( rIn )
318*cdf0e10cSrcweir {
319*cdf0e10cSrcweir     bNewDoc = bReadNewDoc;
320*cdf0e10cSrcweir     bReadListing = bReadXMP = bReadPRE = bReadTextArea =
321*cdf0e10cSrcweir         bReadScript = bReadStyle =
322*cdf0e10cSrcweir         bEndTokenFound = bIsInBody = bReadNextChar =
323*cdf0e10cSrcweir         bReadComment = sal_False;
324*cdf0e10cSrcweir     bIsInHeader = sal_True;
325*cdf0e10cSrcweir     pOptions = new HTMLOptions;
326*cdf0e10cSrcweir }
327*cdf0e10cSrcweir 
328*cdf0e10cSrcweir HTMLParser::~HTMLParser()
329*cdf0e10cSrcweir {
330*cdf0e10cSrcweir     if( pOptions && pOptions->Count() )
331*cdf0e10cSrcweir         pOptions->DeleteAndDestroy( 0, pOptions->Count() );
332*cdf0e10cSrcweir     delete pOptions;
333*cdf0e10cSrcweir }
334*cdf0e10cSrcweir 
335*cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser()
336*cdf0e10cSrcweir {
337*cdf0e10cSrcweir     eState = SVPAR_WORKING;
338*cdf0e10cSrcweir     nNextCh = GetNextChar();
339*cdf0e10cSrcweir     SaveState( 0 );
340*cdf0e10cSrcweir 
341*cdf0e10cSrcweir     nPre_LinePos = 0;
342*cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
343*cdf0e10cSrcweir 
344*cdf0e10cSrcweir     AddRef();
345*cdf0e10cSrcweir     Continue( 0 );
346*cdf0e10cSrcweir     if( SVPAR_PENDING != eState )
347*cdf0e10cSrcweir         ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
348*cdf0e10cSrcweir 
349*cdf0e10cSrcweir     return eState;
350*cdf0e10cSrcweir }
351*cdf0e10cSrcweir 
352*cdf0e10cSrcweir void HTMLParser::Continue( int nToken )
353*cdf0e10cSrcweir {
354*cdf0e10cSrcweir     if( !nToken )
355*cdf0e10cSrcweir         nToken = GetNextToken();
356*cdf0e10cSrcweir 
357*cdf0e10cSrcweir     while( IsParserWorking() )
358*cdf0e10cSrcweir     {
359*cdf0e10cSrcweir         SaveState( nToken );
360*cdf0e10cSrcweir         nToken = FilterToken( nToken );
361*cdf0e10cSrcweir 
362*cdf0e10cSrcweir         if( nToken )
363*cdf0e10cSrcweir             NextToken( nToken );
364*cdf0e10cSrcweir 
365*cdf0e10cSrcweir         if( IsParserWorking() )
366*cdf0e10cSrcweir             SaveState( 0 );         // bis hierhin abgearbeitet,
367*cdf0e10cSrcweir                                     // weiter mit neuem Token!
368*cdf0e10cSrcweir         nToken = GetNextToken();
369*cdf0e10cSrcweir     }
370*cdf0e10cSrcweir }
371*cdf0e10cSrcweir 
372*cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken )
373*cdf0e10cSrcweir {
374*cdf0e10cSrcweir     switch( nToken )
375*cdf0e10cSrcweir     {
376*cdf0e10cSrcweir     case sal_Unicode(EOF):
377*cdf0e10cSrcweir         nToken = 0;
378*cdf0e10cSrcweir         break;          // nicht verschicken
379*cdf0e10cSrcweir 
380*cdf0e10cSrcweir     case HTML_HEAD_OFF:
381*cdf0e10cSrcweir         bIsInBody = sal_True;
382*cdf0e10cSrcweir     case HTML_HEAD_ON:
383*cdf0e10cSrcweir         bIsInHeader = HTML_HEAD_ON == nToken;
384*cdf0e10cSrcweir         break;
385*cdf0e10cSrcweir 
386*cdf0e10cSrcweir     case HTML_BODY_ON:
387*cdf0e10cSrcweir     case HTML_FRAMESET_ON:
388*cdf0e10cSrcweir         bIsInHeader = sal_False;
389*cdf0e10cSrcweir         bIsInBody = HTML_BODY_ON == nToken;
390*cdf0e10cSrcweir         break;
391*cdf0e10cSrcweir 
392*cdf0e10cSrcweir     case HTML_BODY_OFF:
393*cdf0e10cSrcweir         bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False;
394*cdf0e10cSrcweir         break;
395*cdf0e10cSrcweir 
396*cdf0e10cSrcweir     case HTML_HTML_OFF:
397*cdf0e10cSrcweir         nToken = 0;
398*cdf0e10cSrcweir         bReadPRE = bReadListing = bReadXMP = sal_False;
399*cdf0e10cSrcweir         break;      // HTML_ON wurde auch nicht verschickt !
400*cdf0e10cSrcweir 
401*cdf0e10cSrcweir     case HTML_PREFORMTXT_ON:
402*cdf0e10cSrcweir         StartPRE();
403*cdf0e10cSrcweir         break;
404*cdf0e10cSrcweir 
405*cdf0e10cSrcweir     case HTML_PREFORMTXT_OFF:
406*cdf0e10cSrcweir         FinishPRE();
407*cdf0e10cSrcweir         break;
408*cdf0e10cSrcweir 
409*cdf0e10cSrcweir     case HTML_LISTING_ON:
410*cdf0e10cSrcweir         StartListing();
411*cdf0e10cSrcweir         break;
412*cdf0e10cSrcweir 
413*cdf0e10cSrcweir     case HTML_LISTING_OFF:
414*cdf0e10cSrcweir         FinishListing();
415*cdf0e10cSrcweir         break;
416*cdf0e10cSrcweir 
417*cdf0e10cSrcweir     case HTML_XMP_ON:
418*cdf0e10cSrcweir         StartXMP();
419*cdf0e10cSrcweir         break;
420*cdf0e10cSrcweir 
421*cdf0e10cSrcweir     case HTML_XMP_OFF:
422*cdf0e10cSrcweir         FinishXMP();
423*cdf0e10cSrcweir         break;
424*cdf0e10cSrcweir 
425*cdf0e10cSrcweir     default:
426*cdf0e10cSrcweir         if( bReadPRE )
427*cdf0e10cSrcweir             nToken = FilterPRE( nToken );
428*cdf0e10cSrcweir         else if( bReadListing )
429*cdf0e10cSrcweir             nToken = FilterListing( nToken );
430*cdf0e10cSrcweir         else if( bReadXMP )
431*cdf0e10cSrcweir             nToken = FilterXMP( nToken );
432*cdf0e10cSrcweir 
433*cdf0e10cSrcweir         break;
434*cdf0e10cSrcweir     }
435*cdf0e10cSrcweir 
436*cdf0e10cSrcweir     return nToken;
437*cdf0e10cSrcweir }
438*cdf0e10cSrcweir 
// Locale-independent character classification for the scanner.
// Every use of the parameter is parenthesised so that an expression argument
// (e.g. a conditional or an assignment) is classified as a whole.
// Note: the argument is still evaluated more than once, so do not pass
// expressions with side effects.
#define HTML_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define HTML_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
#define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) )
// blank plus the control characters 0x09..0x0d
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
// everything from 32 upwards except 127
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
// --> OD 2006-07-26 #138464#
#define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f') )
// <--
446*cdf0e10cSrcweir // <--
447*cdf0e10cSrcweir 
448*cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak )
449*cdf0e10cSrcweir {
450*cdf0e10cSrcweir     ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
451*cdf0e10cSrcweir     int bWeiter = sal_True;
452*cdf0e10cSrcweir     int bEqSignFound = sal_False;
453*cdf0e10cSrcweir     sal_Unicode cQuote = 0U;
454*cdf0e10cSrcweir 
455*cdf0e10cSrcweir     while( bWeiter && IsParserWorking() )
456*cdf0e10cSrcweir     {
457*cdf0e10cSrcweir         int bNextCh = sal_True;
458*cdf0e10cSrcweir         switch( nNextCh )
459*cdf0e10cSrcweir         {
460*cdf0e10cSrcweir         case '&':
461*cdf0e10cSrcweir             bEqSignFound = sal_False;
462*cdf0e10cSrcweir             if( bReadXMP )
463*cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)'&' );
464*cdf0e10cSrcweir             else
465*cdf0e10cSrcweir             {
466*cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
467*cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
468*cdf0e10cSrcweir 
469*cdf0e10cSrcweir                 sal_Unicode cChar = 0U;
470*cdf0e10cSrcweir                 if( '#' == (nNextCh = GetNextChar()) )
471*cdf0e10cSrcweir                 {
472*cdf0e10cSrcweir                     nNextCh = GetNextChar();
473*cdf0e10cSrcweir                     // --> OD 2006-07-26 #138464#
474*cdf0e10cSrcweir                     // consider hexadecimal digits
475*cdf0e10cSrcweir                     const sal_Bool bIsHex( 'x' == nNextCh );
476*cdf0e10cSrcweir                     const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
477*cdf0e10cSrcweir                     if ( bIsDecOrHex )
478*cdf0e10cSrcweir                     {
479*cdf0e10cSrcweir                         if ( bIsHex )
480*cdf0e10cSrcweir                         {
481*cdf0e10cSrcweir                             nNextCh = GetNextChar();
482*cdf0e10cSrcweir                             while ( HTML_ISHEXDIGIT(nNextCh) )
483*cdf0e10cSrcweir                             {
484*cdf0e10cSrcweir                                 cChar = cChar * 16U +
485*cdf0e10cSrcweir                                         ( nNextCh <= '9'
486*cdf0e10cSrcweir                                           ? sal_Unicode( nNextCh - '0' )
487*cdf0e10cSrcweir                                           : ( nNextCh <= 'F'
488*cdf0e10cSrcweir                                               ? sal_Unicode( nNextCh - 'A' + 10 )
489*cdf0e10cSrcweir                                               : sal_Unicode( nNextCh - 'a' + 10 ) ) );
490*cdf0e10cSrcweir                                 nNextCh = GetNextChar();
491*cdf0e10cSrcweir                             }
492*cdf0e10cSrcweir                         }
493*cdf0e10cSrcweir                         else
494*cdf0e10cSrcweir                         {
495*cdf0e10cSrcweir                             do
496*cdf0e10cSrcweir                             {
497*cdf0e10cSrcweir                                 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
498*cdf0e10cSrcweir                                 nNextCh = GetNextChar();
499*cdf0e10cSrcweir                             }
500*cdf0e10cSrcweir                             while( HTML_ISDIGIT(nNextCh) );
501*cdf0e10cSrcweir                         }
502*cdf0e10cSrcweir 
503*cdf0e10cSrcweir                         if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
504*cdf0e10cSrcweir                             RTL_TEXTENCODING_UCS2 != eSrcEnc &&
505*cdf0e10cSrcweir                             RTL_TEXTENCODING_UTF8 != eSrcEnc &&
506*cdf0e10cSrcweir                             cChar < 256 )
507*cdf0e10cSrcweir                         {
508*cdf0e10cSrcweir                             sal_Unicode cOrig = cChar;
509*cdf0e10cSrcweir                             cChar = ByteString::ConvertToUnicode(
510*cdf0e10cSrcweir                                             (sal_Char)cChar, eSrcEnc );
511*cdf0e10cSrcweir                             if( 0U == cChar )
512*cdf0e10cSrcweir                             {
513*cdf0e10cSrcweir                                 // #73398#: If the character could not be
514*cdf0e10cSrcweir                                 // converted, because a conversion is not
515*cdf0e10cSrcweir                                 // available, do no conversion at all.
516*cdf0e10cSrcweir                                 cChar = cOrig;
517*cdf0e10cSrcweir                             }
518*cdf0e10cSrcweir                         }
519*cdf0e10cSrcweir                     }
520*cdf0e10cSrcweir                     // <--
521*cdf0e10cSrcweir                     else
522*cdf0e10cSrcweir                         nNextCh = 0U;
523*cdf0e10cSrcweir                 }
524*cdf0e10cSrcweir                 else if( HTML_ISALPHA( nNextCh ) )
525*cdf0e10cSrcweir                 {
526*cdf0e10cSrcweir                     ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
527*cdf0e10cSrcweir                     xub_StrLen nPos = 0L;
528*cdf0e10cSrcweir                     do
529*cdf0e10cSrcweir                     {
530*cdf0e10cSrcweir                         sEntityBuffer.append( nNextCh );
531*cdf0e10cSrcweir                         nPos++;
532*cdf0e10cSrcweir                         nNextCh = GetNextChar();
533*cdf0e10cSrcweir                     }
534*cdf0e10cSrcweir                     while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
535*cdf0e10cSrcweir                            !rInput.IsEof() );
536*cdf0e10cSrcweir 
537*cdf0e10cSrcweir                     if( IsParserWorking() && !rInput.IsEof() )
538*cdf0e10cSrcweir                     {
539*cdf0e10cSrcweir                         String sEntity( sEntityBuffer.getStr(), nPos );
540*cdf0e10cSrcweir                         cChar = GetHTMLCharName( sEntity );
541*cdf0e10cSrcweir 
542*cdf0e10cSrcweir                         // nicht gefunden ( == 0 ), dann Klartext
543*cdf0e10cSrcweir                         // oder ein Zeichen das als Attribut eingefuegt
544*cdf0e10cSrcweir                         // wird
545*cdf0e10cSrcweir                         if( 0U == cChar && ';' != nNextCh )
546*cdf0e10cSrcweir                         {
547*cdf0e10cSrcweir                             DBG_ASSERT( rInput.Tell() - nStreamPos ==
548*cdf0e10cSrcweir                                         (sal_uLong)(nPos+1L)*GetCharSize(),
549*cdf0e10cSrcweir                                         "UTF-8 geht hier schief" );
550*cdf0e10cSrcweir                             for( xub_StrLen i=nPos-1L; i>1L; i-- )
551*cdf0e10cSrcweir                             {
552*cdf0e10cSrcweir                                 nNextCh = sEntityBuffer[i];
553*cdf0e10cSrcweir                                 sEntityBuffer.setLength( i );
554*cdf0e10cSrcweir                                 sEntity.Assign( sEntityBuffer.getStr(), i );
555*cdf0e10cSrcweir                                 cChar = GetHTMLCharName( sEntity );
556*cdf0e10cSrcweir                                 if( cChar )
557*cdf0e10cSrcweir                                 {
558*cdf0e10cSrcweir                                     rInput.SeekRel( -(long)
559*cdf0e10cSrcweir                                             ((nPos-i)*GetCharSize()) );
560*cdf0e10cSrcweir                                     nlLinePos -= sal_uInt32(nPos-i);
561*cdf0e10cSrcweir                                     nPos = i;
562*cdf0e10cSrcweir                                     ClearTxtConvContext();
563*cdf0e10cSrcweir                                     break;
564*cdf0e10cSrcweir                                 }
565*cdf0e10cSrcweir                             }
566*cdf0e10cSrcweir                         }
567*cdf0e10cSrcweir 
568*cdf0e10cSrcweir                         if( !cChar )        // unbekanntes Zeichen?
569*cdf0e10cSrcweir                         {
570*cdf0e10cSrcweir                             // dann im Stream zurueck, das '&' als Zeichen
571*cdf0e10cSrcweir                             // einfuegen und mit dem nachfolgenden Zeichen
572*cdf0e10cSrcweir                             // wieder aufsetzen
573*cdf0e10cSrcweir                             sTmpBuffer.append( (sal_Unicode)'&' );
574*cdf0e10cSrcweir 
575*cdf0e10cSrcweir //                          rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
576*cdf0e10cSrcweir //                          nlLinePos -= nPos;
577*cdf0e10cSrcweir                             DBG_ASSERT( rInput.Tell()-nStreamPos ==
578*cdf0e10cSrcweir                                         (sal_uLong)(nPos+1)*GetCharSize(),
579*cdf0e10cSrcweir                                         "Falsche Stream-Position" );
580*cdf0e10cSrcweir                             DBG_ASSERT( nlLinePos-nLinePos ==
581*cdf0e10cSrcweir                                         (sal_uLong)(nPos+1),
582*cdf0e10cSrcweir                                         "Falsche Zeilen-Position" );
583*cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
584*cdf0e10cSrcweir                             nlLinePos = nLinePos;
585*cdf0e10cSrcweir                             ClearTxtConvContext();
586*cdf0e10cSrcweir                             break;
587*cdf0e10cSrcweir                         }
588*cdf0e10cSrcweir 
589*cdf0e10cSrcweir                         // 1 == Non Breaking Space
590*cdf0e10cSrcweir                         // 2 == SoftHyphen
591*cdf0e10cSrcweir 
592*cdf0e10cSrcweir                         if( cChar < 3U )
593*cdf0e10cSrcweir                         {
594*cdf0e10cSrcweir                             if( '>' == cBreak )
595*cdf0e10cSrcweir                             {
596*cdf0e10cSrcweir                                 // Wenn der Inhalt eines Tags gelesen wird,
597*cdf0e10cSrcweir                                 // muessen wir ein Space bzw. - daraus machen
598*cdf0e10cSrcweir                                 switch( cChar )
599*cdf0e10cSrcweir                                 {
600*cdf0e10cSrcweir                                 case 1U: cChar = ' '; break;
601*cdf0e10cSrcweir                                 case 2U: cChar = '-'; break;
602*cdf0e10cSrcweir                                 default:
603*cdf0e10cSrcweir                                     DBG_ASSERT( cChar==1U,
604*cdf0e10cSrcweir                             "\0x00 sollte doch schon laengt abgefangen sein!" );
605*cdf0e10cSrcweir                                     break;
606*cdf0e10cSrcweir                                 }
607*cdf0e10cSrcweir                             }
608*cdf0e10cSrcweir                             else
609*cdf0e10cSrcweir                             {
610*cdf0e10cSrcweir                                 // Wenn kein Tag gescannt wird, enstprechendes
611*cdf0e10cSrcweir                                 // Token zurueckgeben
612*cdf0e10cSrcweir                                 aToken +=
613*cdf0e10cSrcweir                                     String( sTmpBuffer.makeStringAndClear() );
614*cdf0e10cSrcweir                                 if( cChar )
615*cdf0e10cSrcweir                                 {
616*cdf0e10cSrcweir                                     if( aToken.Len() )
617*cdf0e10cSrcweir                                     {
618*cdf0e10cSrcweir                                         // mit dem Zeichen wieder aufsetzen
619*cdf0e10cSrcweir                                         nNextCh = '&';
620*cdf0e10cSrcweir //                                      rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
621*cdf0e10cSrcweir //                                      nlLinePos -= nPos;
622*cdf0e10cSrcweir                                         DBG_ASSERT( rInput.Tell()-nStreamPos ==
623*cdf0e10cSrcweir                                                     (sal_uLong)(nPos+1)*GetCharSize(),
624*cdf0e10cSrcweir                                                     "Falsche Stream-Position" );
625*cdf0e10cSrcweir                                         DBG_ASSERT( nlLinePos-nLinePos ==
626*cdf0e10cSrcweir                                                     (sal_uLong)(nPos+1),
627*cdf0e10cSrcweir                                                     "Falsche Zeilen-Position" );
628*cdf0e10cSrcweir                                         rInput.Seek( nStreamPos );
629*cdf0e10cSrcweir                                         nlLinePos = nLinePos;
630*cdf0e10cSrcweir                                         ClearTxtConvContext();
631*cdf0e10cSrcweir                                         return HTML_TEXTTOKEN;
632*cdf0e10cSrcweir                                     }
633*cdf0e10cSrcweir 
634*cdf0e10cSrcweir                                     // Hack: _GetNextChar soll nicht das
635*cdf0e10cSrcweir                                     // naechste Zeichen lesen
636*cdf0e10cSrcweir                                     if( ';' != nNextCh )
637*cdf0e10cSrcweir                                         aToken += ' ';
638*cdf0e10cSrcweir                                     if( 1U == cChar )
639*cdf0e10cSrcweir                                         return HTML_NONBREAKSPACE;
640*cdf0e10cSrcweir                                     if( 2U == cChar )
641*cdf0e10cSrcweir                                         return HTML_SOFTHYPH;
642*cdf0e10cSrcweir                                 }
643*cdf0e10cSrcweir                                 aToken += (sal_Unicode)'&';
644*cdf0e10cSrcweir                                 aToken +=
645*cdf0e10cSrcweir                                     String(sEntityBuffer.makeStringAndClear());
646*cdf0e10cSrcweir                                 break;
647*cdf0e10cSrcweir                             }
648*cdf0e10cSrcweir                         }
649*cdf0e10cSrcweir                     }
650*cdf0e10cSrcweir                     else
651*cdf0e10cSrcweir                         nNextCh = 0U;
652*cdf0e10cSrcweir                 }
653*cdf0e10cSrcweir                 // MIB 03/02/2000: &{...};-JavaScript-Macros are not
654*cdf0e10cSrcweir                 // supported any longer.
655*cdf0e10cSrcweir                 else if( IsParserWorking() )
656*cdf0e10cSrcweir                 {
657*cdf0e10cSrcweir                     sTmpBuffer.append( (sal_Unicode)'&' );
658*cdf0e10cSrcweir                     bNextCh = sal_False;
659*cdf0e10cSrcweir                     break;
660*cdf0e10cSrcweir                 }
661*cdf0e10cSrcweir 
662*cdf0e10cSrcweir                 bNextCh = (';' == nNextCh);
663*cdf0e10cSrcweir                 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
664*cdf0e10cSrcweir                                     cChar=='\"' || cChar==' ') )
665*cdf0e10cSrcweir                 {
666*cdf0e10cSrcweir                     // ' und " mussen innerhalb von Tags mit einem
667*cdf0e10cSrcweir                     // gekennzeichnet werden, um sie von ' und " als Klammern
668*cdf0e10cSrcweir                     // um Optionen zu unterscheiden. Logischerweise muss
669*cdf0e10cSrcweir                     // deshalb auch ein \ gekeenzeichnet werden. Ausserdem
670*cdf0e10cSrcweir                     // schuetzen wir ein Space, weil es kein Trennzeichen
671*cdf0e10cSrcweir                     // zwischen Optionen ist.
672*cdf0e10cSrcweir                     sTmpBuffer.append( (sal_Unicode)'\\' );
673*cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
674*cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
675*cdf0e10cSrcweir                 }
676*cdf0e10cSrcweir                 if( IsParserWorking() )
677*cdf0e10cSrcweir                 {
678*cdf0e10cSrcweir                     if( cChar )
679*cdf0e10cSrcweir                         sTmpBuffer.append( cChar );
680*cdf0e10cSrcweir                 }
681*cdf0e10cSrcweir                 else if( SVPAR_PENDING==eState && '>'!=cBreak )
682*cdf0e10cSrcweir                 {
683*cdf0e10cSrcweir                     // Mit dem '&' Zeichen wieder aufsetzen, der Rest
684*cdf0e10cSrcweir                     // wird als Texttoken zurueckgegeben.
685*cdf0e10cSrcweir                     if( aToken.Len() || sTmpBuffer.getLength() )
686*cdf0e10cSrcweir                     {
687*cdf0e10cSrcweir                         // Der bisherige Text wird von _GetNextChar()
688*cdf0e10cSrcweir                         // zurueckgegeben und beim naechsten Aufruf wird
689*cdf0e10cSrcweir                         // ein neues Zeichen gelesen. Also muessen wir uns
690*cdf0e10cSrcweir                         // noch vor das & stellen.
691*cdf0e10cSrcweir                         nNextCh = 0U;
692*cdf0e10cSrcweir                         rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
693*cdf0e10cSrcweir                         nlLinePos = nLinePos-1;
694*cdf0e10cSrcweir                         ClearTxtConvContext();
695*cdf0e10cSrcweir                         bReadNextChar = sal_True;
696*cdf0e10cSrcweir                     }
697*cdf0e10cSrcweir                     bNextCh = sal_False;
698*cdf0e10cSrcweir                 }
699*cdf0e10cSrcweir             }
700*cdf0e10cSrcweir             break;
701*cdf0e10cSrcweir         case '=':
702*cdf0e10cSrcweir             if( '>'==cBreak && !cQuote )
703*cdf0e10cSrcweir                 bEqSignFound = sal_True;
704*cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
705*cdf0e10cSrcweir             break;
706*cdf0e10cSrcweir 
707*cdf0e10cSrcweir         case '\\':
708*cdf0e10cSrcweir             if( '>'==cBreak )
709*cdf0e10cSrcweir             {
710*cdf0e10cSrcweir                 // Innerhalb von Tags kennzeichnen
711*cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)'\\' );
712*cdf0e10cSrcweir                 if( MAX_LEN == sTmpBuffer.getLength() )
713*cdf0e10cSrcweir                     aToken += String(sTmpBuffer.makeStringAndClear());
714*cdf0e10cSrcweir             }
715*cdf0e10cSrcweir             sTmpBuffer.append( (sal_Unicode)'\\' );
716*cdf0e10cSrcweir             break;
717*cdf0e10cSrcweir 
718*cdf0e10cSrcweir         case '\"':
719*cdf0e10cSrcweir         case '\'':
720*cdf0e10cSrcweir             if( '>'==cBreak )
721*cdf0e10cSrcweir             {
722*cdf0e10cSrcweir                 if( bEqSignFound )
723*cdf0e10cSrcweir                     cQuote = nNextCh;
724*cdf0e10cSrcweir                 else if( cQuote && (cQuote==nNextCh ) )
725*cdf0e10cSrcweir                     cQuote = 0U;
726*cdf0e10cSrcweir             }
727*cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
728*cdf0e10cSrcweir             bEqSignFound = sal_False;
729*cdf0e10cSrcweir             break;
730*cdf0e10cSrcweir 
731*cdf0e10cSrcweir         case sal_Unicode(EOF):
732*cdf0e10cSrcweir             if( rInput.IsEof() )
733*cdf0e10cSrcweir             {
734*cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird
735*cdf0e10cSrcweir // abc&auml;<EOF> nicht angezeigt, also lassen wir das in Zukunft.
736*cdf0e10cSrcweir //              if( '>' != cBreak )
737*cdf0e10cSrcweir //                  eState = SVPAR_ACCEPTED;
738*cdf0e10cSrcweir                 bWeiter = sal_False;
739*cdf0e10cSrcweir             }
740*cdf0e10cSrcweir             else
741*cdf0e10cSrcweir             {
742*cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
743*cdf0e10cSrcweir             }
744*cdf0e10cSrcweir             break;
745*cdf0e10cSrcweir 
746*cdf0e10cSrcweir         case '<':
747*cdf0e10cSrcweir             bEqSignFound = sal_False;
748*cdf0e10cSrcweir             if( '>'==cBreak )
749*cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
750*cdf0e10cSrcweir             else
751*cdf0e10cSrcweir                 bWeiter = sal_False;        // Abbrechen, String zusammen
752*cdf0e10cSrcweir             break;
753*cdf0e10cSrcweir 
754*cdf0e10cSrcweir         case '\f':
755*cdf0e10cSrcweir             if( '>' == cBreak )
756*cdf0e10cSrcweir             {
757*cdf0e10cSrcweir                 // Beim Scannen von Optionen wie ein Space behandeln
758*cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)' ' );
759*cdf0e10cSrcweir             }
760*cdf0e10cSrcweir             else
761*cdf0e10cSrcweir             {
762*cdf0e10cSrcweir                 // sonst wird es ein eigenes Token
763*cdf0e10cSrcweir                 bWeiter = sal_False;
764*cdf0e10cSrcweir             }
765*cdf0e10cSrcweir             break;
766*cdf0e10cSrcweir 
767*cdf0e10cSrcweir         case '\r':
768*cdf0e10cSrcweir         case '\n':
769*cdf0e10cSrcweir             if( '>'==cBreak )
770*cdf0e10cSrcweir             {
771*cdf0e10cSrcweir                 // #26979# cr/lf in Tag wird in _GetNextToken() behandeln
772*cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
773*cdf0e10cSrcweir                 break;
774*cdf0e10cSrcweir             }
775*cdf0e10cSrcweir             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
776*cdf0e10cSrcweir             {
777*cdf0e10cSrcweir                 bWeiter = sal_False;
778*cdf0e10cSrcweir                 break;
779*cdf0e10cSrcweir             }
780*cdf0e10cSrcweir             // Bug 18984: CR-LF -> Blank
781*cdf0e10cSrcweir             //      Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln
782*cdf0e10cSrcweir             // kein break!!
783*cdf0e10cSrcweir         case '\t':
784*cdf0e10cSrcweir             if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
785*cdf0e10cSrcweir             {
786*cdf0e10cSrcweir                 // In <PRE>: Tabs nach oben durchreichen
787*cdf0e10cSrcweir                 bWeiter = sal_False;
788*cdf0e10cSrcweir                 break;
789*cdf0e10cSrcweir             }
790*cdf0e10cSrcweir             // kein break
791*cdf0e10cSrcweir         case '\x0b':
792*cdf0e10cSrcweir             if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
793*cdf0e10cSrcweir                 '>'!=cBreak )
794*cdf0e10cSrcweir             {
795*cdf0e10cSrcweir                 break;
796*cdf0e10cSrcweir             }
797*cdf0e10cSrcweir             nNextCh = ' ';
798*cdf0e10cSrcweir             // kein break;
799*cdf0e10cSrcweir         case ' ':
800*cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
801*cdf0e10cSrcweir             if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
802*cdf0e10cSrcweir                                 !bReadPRE && !bReadTextArea) )
803*cdf0e10cSrcweir             {
804*cdf0e10cSrcweir                 // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln
805*cdf0e10cSrcweir                 do {
806*cdf0e10cSrcweir                     if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
807*cdf0e10cSrcweir                         rInput.IsEof() )
808*cdf0e10cSrcweir                     {
809*cdf0e10cSrcweir                         if( aToken.Len() || sTmpBuffer.getLength() > 1L )
810*cdf0e10cSrcweir                         {
811*cdf0e10cSrcweir                             // ausser den Blanks wurde noch etwas geselen
812*cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
813*cdf0e10cSrcweir                             return HTML_TEXTTOKEN;
814*cdf0e10cSrcweir                         }
815*cdf0e10cSrcweir                         else
816*cdf0e10cSrcweir                             // nur Blanks gelesen: dann darf kein Text
817*cdf0e10cSrcweir                             // mehr zurueckgegeben werden und _GetNextToken
818*cdf0e10cSrcweir                             // muss auf EOF laufen
819*cdf0e10cSrcweir                             return 0;
820*cdf0e10cSrcweir                     }
821*cdf0e10cSrcweir                 } while ( ' ' == nNextCh || '\t' == nNextCh ||
822*cdf0e10cSrcweir                           '\r' == nNextCh || '\n' == nNextCh ||
823*cdf0e10cSrcweir                           '\x0b' == nNextCh );
824*cdf0e10cSrcweir                 bNextCh = sal_False;
825*cdf0e10cSrcweir             }
826*cdf0e10cSrcweir             break;
827*cdf0e10cSrcweir 
828*cdf0e10cSrcweir         default:
829*cdf0e10cSrcweir             bEqSignFound = sal_False;
830*cdf0e10cSrcweir             if( (nNextCh==cBreak && !cQuote) ||
831*cdf0e10cSrcweir                 (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
832*cdf0e10cSrcweir                 bWeiter = sal_False;
833*cdf0e10cSrcweir             else
834*cdf0e10cSrcweir             {
835*cdf0e10cSrcweir                 do {
836*cdf0e10cSrcweir                     // alle anderen Zeichen kommen in den Text
837*cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
838*cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
839*cdf0e10cSrcweir                     {
840*cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
841*cdf0e10cSrcweir                         if( (sal_uLong(aToken.Len()) + MAX_LEN) >
842*cdf0e10cSrcweir                                 sal_uLong(STRING_MAXLEN & ~1 ) )
843*cdf0e10cSrcweir                         {
844*cdf0e10cSrcweir                             nNextCh = GetNextChar();
845*cdf0e10cSrcweir                             return HTML_TEXTTOKEN;
846*cdf0e10cSrcweir                         }
847*cdf0e10cSrcweir                     }
848*cdf0e10cSrcweir                     if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
849*cdf0e10cSrcweir                           rInput.IsEof() ) ||
850*cdf0e10cSrcweir                         !IsParserWorking() )
851*cdf0e10cSrcweir                     {
852*cdf0e10cSrcweir                         if( sTmpBuffer.getLength() )
853*cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
854*cdf0e10cSrcweir                         return HTML_TEXTTOKEN;
855*cdf0e10cSrcweir                     }
856*cdf0e10cSrcweir                 } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
857*cdf0e10cSrcweir                 bNextCh = sal_False;
858*cdf0e10cSrcweir             }
859*cdf0e10cSrcweir         }
860*cdf0e10cSrcweir 
861*cdf0e10cSrcweir         if( MAX_LEN == sTmpBuffer.getLength() )
862*cdf0e10cSrcweir             aToken += String(sTmpBuffer.makeStringAndClear());
863*cdf0e10cSrcweir 
864*cdf0e10cSrcweir         if( bWeiter && bNextCh )
865*cdf0e10cSrcweir             nNextCh = GetNextChar();
866*cdf0e10cSrcweir     }
867*cdf0e10cSrcweir 
868*cdf0e10cSrcweir     if( sTmpBuffer.getLength() )
869*cdf0e10cSrcweir         aToken += String(sTmpBuffer.makeStringAndClear());
870*cdf0e10cSrcweir 
871*cdf0e10cSrcweir     return HTML_TEXTTOKEN;
872*cdf0e10cSrcweir }
873*cdf0e10cSrcweir 
// Reads the next chunk of raw (unparsed) data into aToken.
//
// _GetNextToken() calls this while bReadScript or bReadStyle is set or
// aEndToken is non-empty. Starting with the current character nNextCh,
// characters are collected in sTmpBuffer and flushed into aToken; a line end
// ('\r', "\r\n" or '\n') terminates the current chunk.
//
// A '<' starts a look-ahead for the terminating tag:
//   - while reading a script or a custom end token: "</" followed by the
//     script keyword (bReadScript) or by aEndToken, unless a comment is
//     currently being read (bReadComment);
//   - otherwise (style sheet): "</" followed by the style or head keyword,
//     or "<" followed by the body keyword.
// When the terminating tag is found, the stream is rewound to the saved
// position and nNextCh is reset to '<', so the tag itself is not consumed
// here.
//
// Returns HTML_RAWDATA for a collected chunk (which may be empty), or 0 when
//   - the terminating tag had already been found by the previous call
//     (bEndTokenFound),
//   - the terminating tag is found before any data was collected while
//     bReadStyle or bReadScript is set,
//   - end of file is reached before any data was collected, or
//   - IsParserWorking() is false when the loop ends.
874*cdf0e10cSrcweir int HTMLParser::_GetNextRawToken()
875*cdf0e10cSrcweir {
876*cdf0e10cSrcweir     ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
877*cdf0e10cSrcweir 
878*cdf0e10cSrcweir     if( bEndTokenFound )
879*cdf0e10cSrcweir     {
880*cdf0e10cSrcweir         // the end token was already found during the previous call,
881*cdf0e10cSrcweir         // so there is no need to search for it again
882*cdf0e10cSrcweir         bReadScript = sal_False;
883*cdf0e10cSrcweir         bReadStyle = sal_False;
884*cdf0e10cSrcweir         aEndToken.Erase();
885*cdf0e10cSrcweir         bEndTokenFound = sal_False;
886*cdf0e10cSrcweir 
887*cdf0e10cSrcweir         return 0;
888*cdf0e10cSrcweir     }
889*cdf0e10cSrcweir 
890*cdf0e10cSrcweir     // by default HTML_RAWDATA is returned
891*cdf0e10cSrcweir     int bWeiter = sal_True; // "weiter" = continue: cleared to leave the loop
892*cdf0e10cSrcweir     int nToken = HTML_RAWDATA;
893*cdf0e10cSrcweir     SaveState( 0 ); // NOTE(review): presumably snapshots the parser state so a pending parse can resume here - confirm
894*cdf0e10cSrcweir     while( bWeiter && IsParserWorking() )
895*cdf0e10cSrcweir     {
896*cdf0e10cSrcweir         int bNextCh = sal_True; // cleared when nNextCh already holds the next unprocessed character
897*cdf0e10cSrcweir         switch( nNextCh )
898*cdf0e10cSrcweir         {
899*cdf0e10cSrcweir         case '<':
900*cdf0e10cSrcweir             {
901*cdf0e10cSrcweir                 // maybe the end has been reached
902*cdf0e10cSrcweir 
903*cdf0e10cSrcweir                 // first save what has been read so far
904*cdf0e10cSrcweir                 aToken += String(sTmpBuffer.makeStringAndClear());
905*cdf0e10cSrcweir 
906*cdf0e10cSrcweir                 // and remember the position in the stream
907*cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
908*cdf0e10cSrcweir                 sal_uLong nLineNr = GetLineNr();
909*cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
910*cdf0e10cSrcweir 
911*cdf0e10cSrcweir                 // start of an end token?
912*cdf0e10cSrcweir                 int bOffState = sal_False;
913*cdf0e10cSrcweir                 if( '/' == (nNextCh = GetNextChar()) )
914*cdf0e10cSrcweir                 {
915*cdf0e10cSrcweir                     bOffState = sal_True;
916*cdf0e10cSrcweir                     nNextCh = GetNextChar();
917*cdf0e10cSrcweir                 }
918*cdf0e10cSrcweir                 else if( '!' == nNextCh )
919*cdf0e10cSrcweir                 {
920*cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
921*cdf0e10cSrcweir                     nNextCh = GetNextChar();
922*cdf0e10cSrcweir                 }
923*cdf0e10cSrcweir 
924*cdf0e10cSrcweir                 // now read the letters (and '-') that follow, at most until the buffer holds MAX_LEN characters
925*cdf0e10cSrcweir                 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
926*cdf0e10cSrcweir                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
927*cdf0e10cSrcweir                 {
928*cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
929*cdf0e10cSrcweir                     nNextCh = GetNextChar();
930*cdf0e10cSrcweir                 }
931*cdf0e10cSrcweir 
932*cdf0e10cSrcweir                 String aTok( sTmpBuffer.getStr(), // upper-cased copy of the candidate tag name; sTmpBuffer itself is left untouched
933*cdf0e10cSrcweir                              sal::static_int_cast< xub_StrLen >(
934*cdf0e10cSrcweir                                  sTmpBuffer.getLength()) );
935*cdf0e10cSrcweir                 aTok.ToUpperAscii();
936*cdf0e10cSrcweir                 sal_Bool bDone = sal_False;
937*cdf0e10cSrcweir                 if( bReadScript || aEndToken.Len() )
938*cdf0e10cSrcweir                 {
939*cdf0e10cSrcweir                     if( !bReadComment )
940*cdf0e10cSrcweir                     {
941*cdf0e10cSrcweir                         if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 ) // only the first 3 characters are compared; presumably the constant is "!--" - confirm
942*cdf0e10cSrcweir                                 == COMPARE_EQUAL )
943*cdf0e10cSrcweir                         {
944*cdf0e10cSrcweir                             bReadComment = sal_True;
945*cdf0e10cSrcweir                         }
946*cdf0e10cSrcweir                         else
947*cdf0e10cSrcweir                         {
948*cdf0e10cSrcweir                             // a script has to end with "</SCRIPT>", but for
949*cdf0e10cSrcweir                             // safety reasons we are not that strict about
950*cdf0e10cSrcweir                             // the ">" for now
951*cdf0e10cSrcweir                             bDone = bOffState && // '>'==nNextCh &&
952*cdf0e10cSrcweir                             COMPARE_EQUAL == ( bReadScript
953*cdf0e10cSrcweir                                 ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
954*cdf0e10cSrcweir                                 : aTok.CompareTo(aEndToken) );
955*cdf0e10cSrcweir                         }
956*cdf0e10cSrcweir                     }
957*cdf0e10cSrcweir                     if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
958*cdf0e10cSrcweir                         aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
959*cdf0e10cSrcweir                     {
960*cdf0e10cSrcweir                         // a comment of the form <!-----> ends here
961*cdf0e10cSrcweir                         bReadComment = sal_False;
962*cdf0e10cSrcweir                     }
963*cdf0e10cSrcweir                 }
964*cdf0e10cSrcweir                 else
965*cdf0e10cSrcweir                 {
966*cdf0e10cSrcweir                     // a style sheet can end with </STYLE>, </HEAD> or
967*cdf0e10cSrcweir                     // <BODY>
968*cdf0e10cSrcweir                     if( bOffState )
969*cdf0e10cSrcweir                         bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
970*cdf0e10cSrcweir                                     == COMPARE_EQUAL ||
971*cdf0e10cSrcweir                                 aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
972*cdf0e10cSrcweir                                     == COMPARE_EQUAL;
973*cdf0e10cSrcweir                     else
974*cdf0e10cSrcweir                         bDone =
975*cdf0e10cSrcweir                             aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
976*cdf0e10cSrcweir                 }
977*cdf0e10cSrcweir 
978*cdf0e10cSrcweir                 if( bDone )
979*cdf0e10cSrcweir                 {
980*cdf0e10cSrcweir                     // that's it; the string read so far may now have to
981*cdf0e10cSrcweir                     // be returned, and afterwards parsing continues
982*cdf0e10cSrcweir                     // normally
983*cdf0e10cSrcweir 
984*cdf0e10cSrcweir                     bWeiter = sal_False;
985*cdf0e10cSrcweir 
986*cdf0e10cSrcweir                     // nToken==0 means that _GetNextToken continues reading right away
987*cdf0e10cSrcweir                     if( !aToken.Len() && (bReadStyle || bReadScript) )
988*cdf0e10cSrcweir                     {
989*cdf0e10cSrcweir                         // the environment can be ended immediately and
990*cdf0e10cSrcweir                         // the end token can be parsed
991*cdf0e10cSrcweir                         bReadScript = sal_False;
992*cdf0e10cSrcweir                         bReadStyle = sal_False;
993*cdf0e10cSrcweir                         aEndToken.Erase();
994*cdf0e10cSrcweir                         nToken = 0;
995*cdf0e10cSrcweir                     }
996*cdf0e10cSrcweir                     else
997*cdf0e10cSrcweir                     {
998*cdf0e10cSrcweir                         // bReadScript/bReadStyle have to be kept alive;
999*cdf0e10cSrcweir                         // the end token can only be parsed on the
1000*cdf0e10cSrcweir                         // next call
1001*cdf0e10cSrcweir                         bEndTokenFound = sal_True;
1002*cdf0e10cSrcweir                     }
1003*cdf0e10cSrcweir 
1004*cdf0e10cSrcweir                     // now rewind the stream to the position saved while '<' was the current character
1005*cdf0e10cSrcweir                     rInput.Seek( nStreamPos );
1006*cdf0e10cSrcweir                     SetLineNr( nLineNr );
1007*cdf0e10cSrcweir                     SetLinePos( nLinePos );
1008*cdf0e10cSrcweir                     ClearTxtConvContext();
1009*cdf0e10cSrcweir                     nNextCh = '<';
1010*cdf0e10cSrcweir 
1011*cdf0e10cSrcweir                     // the tag name must not be appended to the token
1012*cdf0e10cSrcweir                     sTmpBuffer.setLength( 0L );
1013*cdf0e10cSrcweir                 }
1014*cdf0e10cSrcweir                 else
1015*cdf0e10cSrcweir                 {
1016*cdf0e10cSrcweir                     // remember "</"; everything else is still in the buffer
1017*cdf0e10cSrcweir                     aToken += (sal_Unicode)'<';
1018*cdf0e10cSrcweir                     if( bOffState )
1019*cdf0e10cSrcweir                         aToken += (sal_Unicode)'/';
1020*cdf0e10cSrcweir 
1021*cdf0e10cSrcweir                     bNextCh = sal_False; // nNextCh is the first character after the name and has not been processed yet
1022*cdf0e10cSrcweir                 }
1023*cdf0e10cSrcweir             }
1024*cdf0e10cSrcweir             break;
1025*cdf0e10cSrcweir         case '-':
1026*cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
1027*cdf0e10cSrcweir             if( bReadComment )
1028*cdf0e10cSrcweir             {
1029*cdf0e10cSrcweir                 sal_Bool bTwoMinus = sal_False; // set once at least two consecutive '-' have been seen
1030*cdf0e10cSrcweir                 nNextCh = GetNextChar();
1031*cdf0e10cSrcweir                 while( '-' == nNextCh && IsParserWorking() )
1032*cdf0e10cSrcweir                 {
1033*cdf0e10cSrcweir                     bTwoMinus = sal_True;
1034*cdf0e10cSrcweir 
1035*cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
1036*cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
1037*cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
1038*cdf0e10cSrcweir                     nNextCh = GetNextChar();
1039*cdf0e10cSrcweir                 }
1040*cdf0e10cSrcweir 
1041*cdf0e10cSrcweir                 if( '>' == nNextCh && IsParserWorking() && bTwoMinus ) // "-->" (two or more '-' followed by '>') ends the comment
1042*cdf0e10cSrcweir                     bReadComment = sal_False;
1043*cdf0e10cSrcweir 
1044*cdf0e10cSrcweir                 bNextCh = sal_False; // the look-ahead character in nNextCh is handled by the next loop iteration
1045*cdf0e10cSrcweir             }
1046*cdf0e10cSrcweir             break;
1047*cdf0e10cSrcweir 
1048*cdf0e10cSrcweir         case '\r':
1049*cdf0e10cSrcweir             // \r\n? ends the current text token (even if it is empty)
1050*cdf0e10cSrcweir             nNextCh = GetNextChar();
1051*cdf0e10cSrcweir             if( nNextCh=='\n' )
1052*cdf0e10cSrcweir                 nNextCh = GetNextChar();
1053*cdf0e10cSrcweir             bWeiter = sal_False;
1054*cdf0e10cSrcweir             break;
1055*cdf0e10cSrcweir         case '\n':
1056*cdf0e10cSrcweir             // \n ends the current text token (even if it is empty)
1057*cdf0e10cSrcweir             nNextCh = GetNextChar();
1058*cdf0e10cSrcweir             bWeiter = sal_False;
1059*cdf0e10cSrcweir             break;
1060*cdf0e10cSrcweir         case sal_Unicode(EOF):
1061*cdf0e10cSrcweir             // EOF ends the current text token and behaves as if an
1062*cdf0e10cSrcweir             // end token had been read
1063*cdf0e10cSrcweir             if( rInput.IsEof() )
1064*cdf0e10cSrcweir             {
1065*cdf0e10cSrcweir                 bWeiter = sal_False;
1066*cdf0e10cSrcweir                 if( aToken.Len() || sTmpBuffer.getLength() )
1067*cdf0e10cSrcweir                 {
1068*cdf0e10cSrcweir                     bEndTokenFound = sal_True; // data is pending: return it now, the next call resets the flags and returns 0
1069*cdf0e10cSrcweir                 }
1070*cdf0e10cSrcweir                 else
1071*cdf0e10cSrcweir                 {
1072*cdf0e10cSrcweir                     bReadScript = sal_False;
1073*cdf0e10cSrcweir                     bReadStyle = sal_False;
1074*cdf0e10cSrcweir                     aEndToken.Erase();
1075*cdf0e10cSrcweir                     nToken = 0;
1076*cdf0e10cSrcweir                 }
1077*cdf0e10cSrcweir                 break;
1078*cdf0e10cSrcweir             }
1079*cdf0e10cSrcweir             // no break: the character value equals EOF but the stream is not at its end, so treat it as ordinary data
1080*cdf0e10cSrcweir         default:
1081*cdf0e10cSrcweir             // all other characters end up in the buffer
1082*cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
1083*cdf0e10cSrcweir             break;
1084*cdf0e10cSrcweir         }
1085*cdf0e10cSrcweir 
1086*cdf0e10cSrcweir         if( (!bWeiter && sTmpBuffer.getLength() > 0L) || // flush when the loop is about to end with pending data, or when the buffer holds MAX_LEN characters
1087*cdf0e10cSrcweir             MAX_LEN == sTmpBuffer.getLength() )
1088*cdf0e10cSrcweir             aToken += String(sTmpBuffer.makeStringAndClear());
1089*cdf0e10cSrcweir 
1090*cdf0e10cSrcweir         if( bWeiter && bNextCh )
1091*cdf0e10cSrcweir             nNextCh = GetNextChar();
1092*cdf0e10cSrcweir     }
1093*cdf0e10cSrcweir 
1094*cdf0e10cSrcweir     if( IsParserWorking() )
1095*cdf0e10cSrcweir         SaveState( 0 );
1096*cdf0e10cSrcweir     else
1097*cdf0e10cSrcweir         nToken = 0; // the parser stopped working while scanning: report no token
1098*cdf0e10cSrcweir 
1099*cdf0e10cSrcweir     return nToken;
1100*cdf0e10cSrcweir }
1101*cdf0e10cSrcweir 
1102*cdf0e10cSrcweir // scanne das naechste Token,
1103*cdf0e10cSrcweir int __EXPORT HTMLParser::_GetNextToken()
1104*cdf0e10cSrcweir {
1105*cdf0e10cSrcweir     int nRet = 0;
1106*cdf0e10cSrcweir     sSaveToken.Erase();
1107*cdf0e10cSrcweir 
1108*cdf0e10cSrcweir     // die Optionen loeschen
1109*cdf0e10cSrcweir     if( pOptions->Count() )
1110*cdf0e10cSrcweir         pOptions->DeleteAndDestroy( 0, pOptions->Count() );
1111*cdf0e10cSrcweir 
1112*cdf0e10cSrcweir     if( !IsParserWorking() )        // wenn schon Fehler, dann nicht weiter!
1113*cdf0e10cSrcweir         return 0;
1114*cdf0e10cSrcweir 
1115*cdf0e10cSrcweir     sal_Bool bReadNextCharSave = bReadNextChar;
1116*cdf0e10cSrcweir     if( bReadNextChar )
1117*cdf0e10cSrcweir     {
1118*cdf0e10cSrcweir         DBG_ASSERT( !bEndTokenFound,
1119*cdf0e10cSrcweir                     "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
1120*cdf0e10cSrcweir         nNextCh = GetNextChar();
1121*cdf0e10cSrcweir         if( !IsParserWorking() )        // wenn schon Fehler, dann nicht weiter!
1122*cdf0e10cSrcweir             return 0;
1123*cdf0e10cSrcweir         bReadNextChar = sal_False;
1124*cdf0e10cSrcweir     }
1125*cdf0e10cSrcweir 
1126*cdf0e10cSrcweir     if( bReadScript || bReadStyle || aEndToken.Len() )
1127*cdf0e10cSrcweir     {
1128*cdf0e10cSrcweir         nRet = _GetNextRawToken();
1129*cdf0e10cSrcweir         if( nRet || !IsParserWorking() )
1130*cdf0e10cSrcweir             return nRet;
1131*cdf0e10cSrcweir     }
1132*cdf0e10cSrcweir 
1133*cdf0e10cSrcweir     do {
1134*cdf0e10cSrcweir         int bNextCh = sal_True;
1135*cdf0e10cSrcweir         switch( nNextCh )
1136*cdf0e10cSrcweir         {
1137*cdf0e10cSrcweir         case '<':
1138*cdf0e10cSrcweir             {
1139*cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
1140*cdf0e10cSrcweir                 sal_uLong nLineNr = GetLineNr();
1141*cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
1142*cdf0e10cSrcweir 
1143*cdf0e10cSrcweir                 int bOffState = sal_False;
1144*cdf0e10cSrcweir                 if( '/' == (nNextCh = GetNextChar()) )
1145*cdf0e10cSrcweir                 {
1146*cdf0e10cSrcweir                     bOffState = sal_True;
1147*cdf0e10cSrcweir                     nNextCh = GetNextChar();
1148*cdf0e10cSrcweir                 }
1149*cdf0e10cSrcweir                 if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
1150*cdf0e10cSrcweir                 {
1151*cdf0e10cSrcweir                     ::rtl::OUStringBuffer sTmpBuffer;
1152*cdf0e10cSrcweir                     do {
1153*cdf0e10cSrcweir                         sTmpBuffer.append( nNextCh );
1154*cdf0e10cSrcweir                         if( MAX_LEN == sTmpBuffer.getLength() )
1155*cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
1156*cdf0e10cSrcweir                         nNextCh = GetNextChar();
1157*cdf0e10cSrcweir                     } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1158*cdf0e10cSrcweir                              IsParserWorking() && !rInput.IsEof() );
1159*cdf0e10cSrcweir 
1160*cdf0e10cSrcweir                     if( sTmpBuffer.getLength() )
1161*cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
1162*cdf0e10cSrcweir 
1163*cdf0e10cSrcweir                     // Blanks ueberlesen
1164*cdf0e10cSrcweir                     while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1165*cdf0e10cSrcweir                         nNextCh = GetNextChar();
1166*cdf0e10cSrcweir 
1167*cdf0e10cSrcweir                     if( !IsParserWorking() )
1168*cdf0e10cSrcweir                     {
1169*cdf0e10cSrcweir                         if( SVPAR_PENDING == eState )
1170*cdf0e10cSrcweir                             bReadNextChar = bReadNextCharSave;
1171*cdf0e10cSrcweir                         break;
1172*cdf0e10cSrcweir                     }
1173*cdf0e10cSrcweir 
1174*cdf0e10cSrcweir                     // suche das Token in der Tabelle:
1175*cdf0e10cSrcweir                     sSaveToken = aToken;
1176*cdf0e10cSrcweir                     aToken.ToUpperAscii();
1177*cdf0e10cSrcweir                     if( 0 == (nRet = GetHTMLToken( aToken )) )
1178*cdf0e10cSrcweir                         // Unknown Control
1179*cdf0e10cSrcweir                         nRet = HTML_UNKNOWNCONTROL_ON;
1180*cdf0e10cSrcweir 
1181*cdf0e10cSrcweir                     // Wenn es ein Token zum ausschalten ist ...
1182*cdf0e10cSrcweir                     if( bOffState )
1183*cdf0e10cSrcweir                     {
1184*cdf0e10cSrcweir                          if( HTML_TOKEN_ONOFF & nRet )
1185*cdf0e10cSrcweir                          {
1186*cdf0e10cSrcweir                             // und es ein Off-Token gibt, das daraus machen
1187*cdf0e10cSrcweir                             ++nRet;
1188*cdf0e10cSrcweir                          }
1189*cdf0e10cSrcweir                          else if( HTML_LINEBREAK!=nRet )
1190*cdf0e10cSrcweir                          {
1191*cdf0e10cSrcweir                             // und es kein Off-Token gibt, ein unbekanntes
1192*cdf0e10cSrcweir                             // Token daraus machen (ausser </BR>, das wird
1193*cdf0e10cSrcweir                             // wie <BR> behandelt
1194*cdf0e10cSrcweir                             nRet = HTML_UNKNOWNCONTROL_OFF;
1195*cdf0e10cSrcweir                          }
1196*cdf0e10cSrcweir                     }
1197*cdf0e10cSrcweir 
1198*cdf0e10cSrcweir                     if( nRet == HTML_COMMENT )
1199*cdf0e10cSrcweir                     {
1200*cdf0e10cSrcweir                         // fix: sSaveToken wegen Gross-/Kleinschreibung
1201*cdf0e10cSrcweir                         // als Anfang des Kommentars benutzen und ein
1202*cdf0e10cSrcweir                         // Space anhaengen.
1203*cdf0e10cSrcweir                         aToken = sSaveToken;
1204*cdf0e10cSrcweir                         if( '>'!=nNextCh )
1205*cdf0e10cSrcweir                             aToken += (sal_Unicode)' ';
1206*cdf0e10cSrcweir                         sal_uLong nCStreamPos = 0;
1207*cdf0e10cSrcweir                         sal_uLong nCLineNr = 0;
1208*cdf0e10cSrcweir                         sal_uLong nCLinePos = 0;
1209*cdf0e10cSrcweir                         xub_StrLen nCStrLen = 0;
1210*cdf0e10cSrcweir 
1211*cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1212*cdf0e10cSrcweir                         // bis zum schliessenden --> lesen. wenn keins gefunden
1213*cdf0e10cSrcweir                         // wurde beim der ersten > wieder aufsetzen
1214*cdf0e10cSrcweir                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
1215*cdf0e10cSrcweir                         {
1216*cdf0e10cSrcweir                             if( '>'==nNextCh )
1217*cdf0e10cSrcweir                             {
1218*cdf0e10cSrcweir                                 if( !nCStreamPos )
1219*cdf0e10cSrcweir                                 {
1220*cdf0e10cSrcweir                                     nCStreamPos = rInput.Tell();
1221*cdf0e10cSrcweir                                     nCStrLen = aToken.Len();
1222*cdf0e10cSrcweir                                     nCLineNr = GetLineNr();
1223*cdf0e10cSrcweir                                     nCLinePos = GetLinePos();
1224*cdf0e10cSrcweir                                 }
1225*cdf0e10cSrcweir                                 bDone = aToken.Len() >= 2 &&
1226*cdf0e10cSrcweir                                         aToken.Copy(aToken.Len()-2,2).
1227*cdf0e10cSrcweir                                                         EqualsAscii( "--" );
1228*cdf0e10cSrcweir                                 if( !bDone )
1229*cdf0e10cSrcweir                                 aToken += nNextCh;
1230*cdf0e10cSrcweir                             }
1231*cdf0e10cSrcweir                             else
1232*cdf0e10cSrcweir                                 aToken += nNextCh;
1233*cdf0e10cSrcweir                             if( !bDone )
1234*cdf0e10cSrcweir                                 nNextCh = GetNextChar();
1235*cdf0e10cSrcweir                         }
1236*cdf0e10cSrcweir                         if( !bDone && IsParserWorking() && nCStreamPos )
1237*cdf0e10cSrcweir                         {
1238*cdf0e10cSrcweir                             rInput.Seek( nCStreamPos );
1239*cdf0e10cSrcweir                             SetLineNr( nCLineNr );
1240*cdf0e10cSrcweir                             SetLinePos( nCLinePos );
1241*cdf0e10cSrcweir                             ClearTxtConvContext();
1242*cdf0e10cSrcweir                             aToken.Erase( nCStrLen );
1243*cdf0e10cSrcweir                             nNextCh = '>';
1244*cdf0e10cSrcweir                         }
1245*cdf0e10cSrcweir                     }
1246*cdf0e10cSrcweir                     else
1247*cdf0e10cSrcweir                     {
1248*cdf0e10cSrcweir                         // den TokenString koennen wir jetzt verwerfen
1249*cdf0e10cSrcweir                         aToken.Erase();
1250*cdf0e10cSrcweir                     }
1251*cdf0e10cSrcweir 
1252*cdf0e10cSrcweir                     // dann lesen wir mal alles bis zur schliessenden '>'
1253*cdf0e10cSrcweir                     if( '>' != nNextCh && IsParserWorking() )
1254*cdf0e10cSrcweir                     {
1255*cdf0e10cSrcweir                         ScanText( '>' );
1256*cdf0e10cSrcweir                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1257*cdf0e10cSrcweir                         {
1258*cdf0e10cSrcweir                             // zurueck hinter die < gehen  und dort neu
1259*cdf0e10cSrcweir                             // aufsetzen, das < als Text zurueckgeben
1260*cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
1261*cdf0e10cSrcweir                             SetLineNr( nLineNr );
1262*cdf0e10cSrcweir                             SetLinePos( nLinePos );
1263*cdf0e10cSrcweir                             ClearTxtConvContext();
1264*cdf0e10cSrcweir 
1265*cdf0e10cSrcweir                             aToken = '<';
1266*cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1267*cdf0e10cSrcweir                             nNextCh = GetNextChar();
1268*cdf0e10cSrcweir                             bNextCh = sal_False;
1269*cdf0e10cSrcweir                             break;
1270*cdf0e10cSrcweir                         }
1271*cdf0e10cSrcweir                     }
1272*cdf0e10cSrcweir                     if( SVPAR_PENDING == eState )
1273*cdf0e10cSrcweir                         bReadNextChar = bReadNextCharSave;
1274*cdf0e10cSrcweir                 }
1275*cdf0e10cSrcweir                 else
1276*cdf0e10cSrcweir                 {
1277*cdf0e10cSrcweir                     if( bOffState )
1278*cdf0e10cSrcweir                     {
1279*cdf0e10cSrcweir                         // einfach alles wegschmeissen
1280*cdf0e10cSrcweir                         ScanText( '>' );
1281*cdf0e10cSrcweir                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1282*cdf0e10cSrcweir                         {
1283*cdf0e10cSrcweir                             // zurueck hinter die < gehen  und dort neu
1284*cdf0e10cSrcweir                             // aufsetzen, das < als Text zurueckgeben
1285*cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
1286*cdf0e10cSrcweir                             SetLineNr( nLineNr );
1287*cdf0e10cSrcweir                             SetLinePos( nLinePos );
1288*cdf0e10cSrcweir                             ClearTxtConvContext();
1289*cdf0e10cSrcweir 
1290*cdf0e10cSrcweir                             aToken = '<';
1291*cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1292*cdf0e10cSrcweir                             nNextCh = GetNextChar();
1293*cdf0e10cSrcweir                             bNextCh = sal_False;
1294*cdf0e10cSrcweir                             break;
1295*cdf0e10cSrcweir                         }
1296*cdf0e10cSrcweir                         if( SVPAR_PENDING == eState )
1297*cdf0e10cSrcweir                             bReadNextChar = bReadNextCharSave;
1298*cdf0e10cSrcweir                         aToken.Erase();
1299*cdf0e10cSrcweir                     }
1300*cdf0e10cSrcweir                     else if( '%' == nNextCh )
1301*cdf0e10cSrcweir                     {
1302*cdf0e10cSrcweir                         nRet = HTML_UNKNOWNCONTROL_ON;
1303*cdf0e10cSrcweir 
1304*cdf0e10cSrcweir                         sal_uLong nCStreamPos = rInput.Tell();
1305*cdf0e10cSrcweir                         sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1306*cdf0e10cSrcweir 
1307*cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1308*cdf0e10cSrcweir                         // bis zum schliessenden %> lesen. wenn keins gefunden
1309*cdf0e10cSrcweir                         // wurde beim der ersten > wieder aufsetzen
1310*cdf0e10cSrcweir                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
1311*cdf0e10cSrcweir                         {
1312*cdf0e10cSrcweir                             bDone = '>'==nNextCh && aToken.Len() >= 1 &&
1313*cdf0e10cSrcweir                                     '%' == aToken.GetChar( aToken.Len()-1 );
1314*cdf0e10cSrcweir                             if( !bDone )
1315*cdf0e10cSrcweir                             {
1316*cdf0e10cSrcweir                                 aToken += nNextCh;
1317*cdf0e10cSrcweir                                 nNextCh = GetNextChar();
1318*cdf0e10cSrcweir                             }
1319*cdf0e10cSrcweir                         }
1320*cdf0e10cSrcweir                         if( !bDone && IsParserWorking() )
1321*cdf0e10cSrcweir                         {
1322*cdf0e10cSrcweir                             rInput.Seek( nCStreamPos );
1323*cdf0e10cSrcweir                             SetLineNr( nCLineNr );
1324*cdf0e10cSrcweir                             SetLinePos( nCLinePos );
1325*cdf0e10cSrcweir                             ClearTxtConvContext();
1326*cdf0e10cSrcweir                             aToken.AssignAscii( "<%", 2 );
1327*cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1328*cdf0e10cSrcweir                             break;
1329*cdf0e10cSrcweir                         }
1330*cdf0e10cSrcweir                         if( IsParserWorking() )
1331*cdf0e10cSrcweir                         {
1332*cdf0e10cSrcweir                             sSaveToken = aToken;
1333*cdf0e10cSrcweir                             aToken.Erase();
1334*cdf0e10cSrcweir                         }
1335*cdf0e10cSrcweir                     }
1336*cdf0e10cSrcweir                     else
1337*cdf0e10cSrcweir                     {
1338*cdf0e10cSrcweir                         aToken = '<';
1339*cdf0e10cSrcweir                         nRet = HTML_TEXTTOKEN;
1340*cdf0e10cSrcweir                         bNextCh = sal_False;
1341*cdf0e10cSrcweir                         break;
1342*cdf0e10cSrcweir                     }
1343*cdf0e10cSrcweir                 }
1344*cdf0e10cSrcweir 
1345*cdf0e10cSrcweir                 if( IsParserWorking() )
1346*cdf0e10cSrcweir                 {
1347*cdf0e10cSrcweir                     bNextCh = '>' == nNextCh;
1348*cdf0e10cSrcweir                     switch( nRet )
1349*cdf0e10cSrcweir                     {
1350*cdf0e10cSrcweir                     case HTML_TEXTAREA_ON:
1351*cdf0e10cSrcweir                         bReadTextArea = sal_True;
1352*cdf0e10cSrcweir                         break;
1353*cdf0e10cSrcweir                     case HTML_TEXTAREA_OFF:
1354*cdf0e10cSrcweir                         bReadTextArea = sal_False;
1355*cdf0e10cSrcweir                         break;
1356*cdf0e10cSrcweir                     case HTML_SCRIPT_ON:
1357*cdf0e10cSrcweir                         if( !bReadTextArea )
1358*cdf0e10cSrcweir                             bReadScript = sal_True;
1359*cdf0e10cSrcweir                         break;
1360*cdf0e10cSrcweir                     case HTML_SCRIPT_OFF:
1361*cdf0e10cSrcweir                         if( !bReadTextArea )
1362*cdf0e10cSrcweir                         {
1363*cdf0e10cSrcweir                             bReadScript = sal_False;
1364*cdf0e10cSrcweir                             // JavaScript kann den Stream veraendern
1365*cdf0e10cSrcweir                             // also muss das letzte Zeichen nochmals
1366*cdf0e10cSrcweir                             // gelesen werden
1367*cdf0e10cSrcweir                             bReadNextChar = sal_True;
1368*cdf0e10cSrcweir                             bNextCh = sal_False;
1369*cdf0e10cSrcweir                         }
1370*cdf0e10cSrcweir                         break;
1371*cdf0e10cSrcweir 
1372*cdf0e10cSrcweir                     case HTML_STYLE_ON:
1373*cdf0e10cSrcweir                         bReadStyle = sal_True;
1374*cdf0e10cSrcweir                         break;
1375*cdf0e10cSrcweir                     case HTML_STYLE_OFF:
1376*cdf0e10cSrcweir                         bReadStyle = sal_False;
1377*cdf0e10cSrcweir                         break;
1378*cdf0e10cSrcweir                     }
1379*cdf0e10cSrcweir 
1380*cdf0e10cSrcweir                 }
1381*cdf0e10cSrcweir             }
1382*cdf0e10cSrcweir             break;
1383*cdf0e10cSrcweir 
1384*cdf0e10cSrcweir         case sal_Unicode(EOF):
1385*cdf0e10cSrcweir             if( rInput.IsEof() )
1386*cdf0e10cSrcweir             {
1387*cdf0e10cSrcweir                 eState = SVPAR_ACCEPTED;
1388*cdf0e10cSrcweir                 nRet = nNextCh;
1389*cdf0e10cSrcweir             }
1390*cdf0e10cSrcweir             else
1391*cdf0e10cSrcweir             {
1392*cdf0e10cSrcweir                 // normalen Text lesen
1393*cdf0e10cSrcweir                 goto scan_text;
1394*cdf0e10cSrcweir             }
1395*cdf0e10cSrcweir             break;
1396*cdf0e10cSrcweir 
1397*cdf0e10cSrcweir         case '\f':
1398*cdf0e10cSrcweir             // Form-Feeds werden jetzt extra nach oben gereicht
1399*cdf0e10cSrcweir             nRet = HTML_LINEFEEDCHAR; // !!! eigentlich FORMFEEDCHAR
1400*cdf0e10cSrcweir             break;
1401*cdf0e10cSrcweir 
1402*cdf0e10cSrcweir         case '\n':
1403*cdf0e10cSrcweir         case '\r':
1404*cdf0e10cSrcweir             if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1405*cdf0e10cSrcweir             {
1406*cdf0e10cSrcweir                 sal_Unicode c = GetNextChar();
1407*cdf0e10cSrcweir                 if( ( '\n' != nNextCh || '\r' != c ) &&
1408*cdf0e10cSrcweir                     ( '\r' != nNextCh || '\n' != c ) )
1409*cdf0e10cSrcweir                 {
1410*cdf0e10cSrcweir                     bNextCh = sal_False;
1411*cdf0e10cSrcweir                     nNextCh = c;
1412*cdf0e10cSrcweir                 }
1413*cdf0e10cSrcweir                 nRet = HTML_NEWPARA;
1414*cdf0e10cSrcweir                 break;
1415*cdf0e10cSrcweir             }
1416*cdf0e10cSrcweir             // kein break !
1417*cdf0e10cSrcweir         case '\t':
1418*cdf0e10cSrcweir             if( bReadPRE )
1419*cdf0e10cSrcweir             {
1420*cdf0e10cSrcweir                 nRet = HTML_TABCHAR;
1421*cdf0e10cSrcweir                 break;
1422*cdf0e10cSrcweir             }
1423*cdf0e10cSrcweir             // kein break !
1424*cdf0e10cSrcweir         case ' ':
1425*cdf0e10cSrcweir             // kein break !
1426*cdf0e10cSrcweir         default:
1427*cdf0e10cSrcweir 
1428*cdf0e10cSrcweir scan_text:
1429*cdf0e10cSrcweir             // es folgt "normaler" Text
1430*cdf0e10cSrcweir             nRet = ScanText();
1431*cdf0e10cSrcweir             bNextCh = 0 == aToken.Len();
1432*cdf0e10cSrcweir 
1433*cdf0e10cSrcweir             // der Text sollte noch verarbeitet werden
1434*cdf0e10cSrcweir             if( !bNextCh && eState == SVPAR_PENDING )
1435*cdf0e10cSrcweir             {
1436*cdf0e10cSrcweir                 eState = SVPAR_WORKING;
1437*cdf0e10cSrcweir                 bReadNextChar = sal_True;
1438*cdf0e10cSrcweir             }
1439*cdf0e10cSrcweir 
1440*cdf0e10cSrcweir             break;
1441*cdf0e10cSrcweir         }
1442*cdf0e10cSrcweir 
1443*cdf0e10cSrcweir         if( bNextCh && SVPAR_WORKING == eState )
1444*cdf0e10cSrcweir         {
1445*cdf0e10cSrcweir             nNextCh = GetNextChar();
1446*cdf0e10cSrcweir             if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1447*cdf0e10cSrcweir             {
1448*cdf0e10cSrcweir                 bReadNextChar = sal_True;
1449*cdf0e10cSrcweir                 eState = SVPAR_WORKING;
1450*cdf0e10cSrcweir             }
1451*cdf0e10cSrcweir         }
1452*cdf0e10cSrcweir 
1453*cdf0e10cSrcweir     } while( !nRet && SVPAR_WORKING == eState );
1454*cdf0e10cSrcweir 
1455*cdf0e10cSrcweir     if( SVPAR_PENDING == eState )
1456*cdf0e10cSrcweir         nRet = -1;      // irgendwas ungueltiges
1457*cdf0e10cSrcweir 
1458*cdf0e10cSrcweir     return nRet;
1459*cdf0e10cSrcweir }
1460*cdf0e10cSrcweir 
// Removes escaping backslashes from aToken in place.
//
// A backslash that does not directly follow an already removed (escaping)
// backslash is erased; the character that comes after it is kept unchanged,
// even if it is a backslash itself. Two consecutive backslashes therefore
// collapse into one, and a single backslash at the very end is dropped.
1461*cdf0e10cSrcweir void HTMLParser::UnescapeToken()
1462*cdf0e10cSrcweir {
1463*cdf0e10cSrcweir     xub_StrLen nPos=0;
1464*cdf0e10cSrcweir 
1465*cdf0e10cSrcweir     sal_Bool bEscape = sal_False;   // sal_True right after an escaping backslash was erased
1466*cdf0e10cSrcweir     while( nPos < aToken.Len() )
1467*cdf0e10cSrcweir     {
1468*cdf0e10cSrcweir         sal_Bool bOldEscape = bEscape;   // was the previous step an erased backslash?
1469*cdf0e10cSrcweir         bEscape = sal_False;
1470*cdf0e10cSrcweir         if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
1471*cdf0e10cSrcweir         {
1472*cdf0e10cSrcweir             aToken.Erase( nPos, 1 );   // nPos stays put: the following character moves into this slot
1473*cdf0e10cSrcweir             bEscape = sal_True;
1474*cdf0e10cSrcweir         }
1475*cdf0e10cSrcweir         else
1476*cdf0e10cSrcweir         {
1477*cdf0e10cSrcweir             nPos++;   // ordinary or escaped character: keep it
1478*cdf0e10cSrcweir         }
1479*cdf0e10cSrcweir     }
1480*cdf0e10cSrcweir }
1481*cdf0e10cSrcweir 
1482*cdf0e10cSrcweir // fetch the options
//
// Splits aToken into name[=value] pairs and appends one HTMLOption per pair
// to pOptions, which is also the return value.
//
// - If pOptions already contains entries, it is returned unchanged and
//   aToken is not scanned again.
// - An option starts at a character accepted by HTML_ISALPHA; its name runs
//   up to the next '=', white space, non-printable character or end of token.
//   Any other character between options is skipped.
// - The name is looked up with GetHTMLOption() on an upper-cased copy; the
//   HTMLOption itself receives the name in its original spelling.
// - A value may be quoted with " or ' (terminated by the same, unescaped
//   quote character) or unquoted (terminated by an unescaped space, a tab,
//   CR, LF or a non-printable character). A backslash escapes the following
//   character; escaping backslashes are erased from aToken.
// - An option without '=' or with an empty value is stored with an empty
//   value string.
//
// pNoConvertToken: optional. CR/LF inside a quoted value are erased unless
//   the option token lies in [HTML_OPTION_SCRIPT_START,
//   HTML_OPTION_SCRIPT_END) or equals *pNoConvertToken.
//
// NOTE: although the method is const, it modifies aToken by casting away
// constness when it erases backslashes and CR/LF characters.
1483*cdf0e10cSrcweir const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
1484*cdf0e10cSrcweir {
1485*cdf0e10cSrcweir     // if the options for the current token have already been
1486*cdf0e10cSrcweir     // fetched once, we return them again
1487*cdf0e10cSrcweir     if( pOptions->Count() )
1488*cdf0e10cSrcweir         return pOptions;
1489*cdf0e10cSrcweir 
1490*cdf0e10cSrcweir     xub_StrLen nPos = 0;
1491*cdf0e10cSrcweir     while( nPos < aToken.Len() )
1492*cdf0e10cSrcweir     {
1493*cdf0e10cSrcweir         // a letter? Then an option starts here
1494*cdf0e10cSrcweir         if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
1495*cdf0e10cSrcweir         {
1496*cdf0e10cSrcweir             int nToken;
1497*cdf0e10cSrcweir             String aValue;
1498*cdf0e10cSrcweir             xub_StrLen nStt = nPos;
1499*cdf0e10cSrcweir             sal_Unicode cChar = 0;
1500*cdf0e10cSrcweir 
1501*cdf0e10cSrcweir             // Strictly speaking only very specific characters are allowed
1502*cdf0e10cSrcweir             // here. Netscape, however, only looks for "=" and spaces (see
1503*cdf0e10cSrcweir             // Mozilla: PA_FetchRequestedNameValues in
1504*cdf0e10cSrcweir             // lipparse/pa_mdl.c
1505*cdf0e10cSrcweir //          while( nPos < aToken.Len() &&
1506*cdf0e10cSrcweir //                  ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
1507*cdf0e10cSrcweir             while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
1508*cdf0e10cSrcweir                    HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1509*cdf0e10cSrcweir                 nPos++;
1510*cdf0e10cSrcweir 
1511*cdf0e10cSrcweir             String sName( aToken.Copy( nStt, nPos-nStt ) );
1512*cdf0e10cSrcweir 
1513*cdf0e10cSrcweir //JP 23.03.97: the plug-ins want the token names in their "original" form,
1514*cdf0e10cSrcweir //              so convert to upper case only for the lookup
1515*cdf0e10cSrcweir             String sNameUpperCase( sName );
1516*cdf0e10cSrcweir             sNameUpperCase.ToUpperAscii();
1517*cdf0e10cSrcweir 
1518*cdf0e10cSrcweir             nToken = GetHTMLOption( sNameUpperCase ); // the name is complete
1519*cdf0e10cSrcweir             DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1520*cdf0e10cSrcweir                         "GetOption: unbekannte HTML-Option" );
1521*cdf0e10cSrcweir             sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1522*cdf0e10cSrcweir                                nToken >= HTML_OPTION_SCRIPT_END) &&
1523*cdf0e10cSrcweir                               (!pNoConvertToken || nToken != *pNoConvertToken);   // only consulted for quoted values
1524*cdf0e10cSrcweir 
1525*cdf0e10cSrcweir             while( nPos < aToken.Len() &&
1526*cdf0e10cSrcweir                    ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1527*cdf0e10cSrcweir                      HTML_ISSPACE(cChar) ) )
1528*cdf0e10cSrcweir                 nPos++;   // skip white space / non-printables between name and '='
1529*cdf0e10cSrcweir 
1530*cdf0e10cSrcweir             // does the option also have a value?
1531*cdf0e10cSrcweir             if( nPos!=aToken.Len() && '='==cChar )
1532*cdf0e10cSrcweir             {
1533*cdf0e10cSrcweir                 nPos++;
1534*cdf0e10cSrcweir 
1535*cdf0e10cSrcweir                 while( nPos < aToken.Len() &&
1536*cdf0e10cSrcweir                         ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1537*cdf0e10cSrcweir                           ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1538*cdf0e10cSrcweir                     nPos++;   // skip white space / non-printables between '=' and the value
1539*cdf0e10cSrcweir 
1540*cdf0e10cSrcweir                 if( nPos != aToken.Len() )
1541*cdf0e10cSrcweir                 {
1542*cdf0e10cSrcweir                     xub_StrLen nLen = 0;   // number of characters kept for the value
1543*cdf0e10cSrcweir                     nStt = nPos;
1544*cdf0e10cSrcweir                     if( ('"'==cChar) || ('\'')==cChar )
1545*cdf0e10cSrcweir                     {
1546*cdf0e10cSrcweir                         sal_Unicode cEnd = cChar;   // the value ends at the same quote character
1547*cdf0e10cSrcweir                         nPos++; nStt++;   // step over the opening quote
1548*cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1549*cdf0e10cSrcweir                         sal_Bool bEscape = sal_False;
1550*cdf0e10cSrcweir                         while( nPos < aToken.Len() && !bDone )
1551*cdf0e10cSrcweir                         {
1552*cdf0e10cSrcweir                             sal_Bool bOldEscape = bEscape;   // previous step erased an escaping backslash
1553*cdf0e10cSrcweir                             bEscape = sal_False;
1554*cdf0e10cSrcweir                             cChar = aToken.GetChar(nPos);
1555*cdf0e10cSrcweir                             switch( cChar )
1556*cdf0e10cSrcweir                             {
1557*cdf0e10cSrcweir                             case '\r':
1558*cdf0e10cSrcweir                             case '\n':
1559*cdf0e10cSrcweir                                 if( bStripCRLF )
1560*cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );   // drop the line break from the token itself
1561*cdf0e10cSrcweir                                 else
1562*cdf0e10cSrcweir                                     nPos++, nLen++;
1563*cdf0e10cSrcweir                                 break;
1564*cdf0e10cSrcweir                             case '\\':
1565*cdf0e10cSrcweir                                 if( bOldEscape )
1566*cdf0e10cSrcweir                                 {
1567*cdf0e10cSrcweir                                     nPos++, nLen++;   // escaped backslash: keep it
1568*cdf0e10cSrcweir                                 }
1569*cdf0e10cSrcweir                                 else
1570*cdf0e10cSrcweir                                 {
1571*cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );   // escaping backslash: remove it, nPos stays
1572*cdf0e10cSrcweir                                     bEscape = sal_True;
1573*cdf0e10cSrcweir                                 }
1574*cdf0e10cSrcweir                                 break;
1575*cdf0e10cSrcweir                             case '"':
1576*cdf0e10cSrcweir                             case '\'':
1577*cdf0e10cSrcweir                                 bDone = !bOldEscape && cChar==cEnd;   // only an unescaped, matching quote ends the value
1578*cdf0e10cSrcweir                                 if( !bDone )
1579*cdf0e10cSrcweir                                     nPos++, nLen++;
1580*cdf0e10cSrcweir                                 break;
1581*cdf0e10cSrcweir                             default:
1582*cdf0e10cSrcweir                                 nPos++, nLen++;
1583*cdf0e10cSrcweir                                 break;
1584*cdf0e10cSrcweir                             }
1585*cdf0e10cSrcweir                         }
1586*cdf0e10cSrcweir                         if( nPos!=aToken.Len() )
1587*cdf0e10cSrcweir                             nPos++;   // step over the closing quote
1588*cdf0e10cSrcweir                     }
1589*cdf0e10cSrcweir                     else
1590*cdf0e10cSrcweir                     {
1591*cdf0e10cSrcweir                         // here we are somewhat more lax than the
1592*cdf0e10cSrcweir                         // standard and allow anything printable
1593*cdf0e10cSrcweir                         sal_Bool bEscape = sal_False;
1594*cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1595*cdf0e10cSrcweir                         while( nPos < aToken.Len() && !bDone )
1596*cdf0e10cSrcweir                         {
1597*cdf0e10cSrcweir                             sal_Bool bOldEscape = bEscape;   // previous step erased an escaping backslash
1598*cdf0e10cSrcweir                             bEscape = sal_False;
1599*cdf0e10cSrcweir                             sal_Unicode c = aToken.GetChar(nPos);
1600*cdf0e10cSrcweir                             switch( c )
1601*cdf0e10cSrcweir                             {
1602*cdf0e10cSrcweir                             case ' ':
1603*cdf0e10cSrcweir                                 bDone = !bOldEscape;   // an escaped space belongs to the value
1604*cdf0e10cSrcweir                                 if( !bDone )
1605*cdf0e10cSrcweir                                     nPos++, nLen++;
1606*cdf0e10cSrcweir                                 break;
1607*cdf0e10cSrcweir 
1608*cdf0e10cSrcweir                             case '\t':
1609*cdf0e10cSrcweir                             case '\r':
1610*cdf0e10cSrcweir                             case '\n':
1611*cdf0e10cSrcweir                                 bDone = sal_True;   // these always end an unquoted value, escaped or not
1612*cdf0e10cSrcweir                                 break;
1613*cdf0e10cSrcweir 
1614*cdf0e10cSrcweir                             case '\\':
1615*cdf0e10cSrcweir                                 if( bOldEscape )
1616*cdf0e10cSrcweir                                 {
1617*cdf0e10cSrcweir                                     nPos++, nLen++;   // escaped backslash: keep it
1618*cdf0e10cSrcweir                                 }
1619*cdf0e10cSrcweir                                 else
1620*cdf0e10cSrcweir                                 {
1621*cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );   // escaping backslash: remove it, nPos stays
1622*cdf0e10cSrcweir                                     bEscape = sal_True;
1623*cdf0e10cSrcweir                                 }
1624*cdf0e10cSrcweir                                 break;
1625*cdf0e10cSrcweir 
1626*cdf0e10cSrcweir                             default:
1627*cdf0e10cSrcweir                                 if( HTML_ISPRINTABLE( c ) )
1628*cdf0e10cSrcweir                                     nPos++, nLen++;
1629*cdf0e10cSrcweir                                 else
1630*cdf0e10cSrcweir                                     bDone = sal_True;
1631*cdf0e10cSrcweir                                 break;
1632*cdf0e10cSrcweir                             }
1633*cdf0e10cSrcweir                         }
1634*cdf0e10cSrcweir                     }
1635*cdf0e10cSrcweir 
1636*cdf0e10cSrcweir                     if( nLen )
1637*cdf0e10cSrcweir                         aValue = aToken.Copy( nStt, nLen );   // erased characters are already gone from aToken
1638*cdf0e10cSrcweir                 }
1639*cdf0e10cSrcweir             }
1640*cdf0e10cSrcweir 
1641*cdf0e10cSrcweir             // we know the token and can store it
1642*cdf0e10cSrcweir             HTMLOption *pOption =
1643*cdf0e10cSrcweir                 new HTMLOption(
1644*cdf0e10cSrcweir                     sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );
1645*cdf0e10cSrcweir 
1646*cdf0e10cSrcweir             pOptions->Insert( pOption, pOptions->Count() );   // append at the end
1647*cdf0e10cSrcweir 
1648*cdf0e10cSrcweir         }
1649*cdf0e10cSrcweir         else
1650*cdf0e10cSrcweir             // ignore white space and unexpected characters
1651*cdf0e10cSrcweir             nPos++;
1652*cdf0e10cSrcweir     }
1653*cdf0e10cSrcweir 
1654*cdf0e10cSrcweir     return pOptions;
1655*cdf0e10cSrcweir }
1656*cdf0e10cSrcweir 
1657*cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken )
1658*cdf0e10cSrcweir {
1659*cdf0e10cSrcweir     switch( nToken )
1660*cdf0e10cSrcweir     {
1661*cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR
1662*cdf0e10cSrcweir     // diese werden laut Definition zu LFs
1663*cdf0e10cSrcweir     case HTML_PARABREAK_ON:
1664*cdf0e10cSrcweir     case HTML_LINEBREAK:
1665*cdf0e10cSrcweir         nToken = HTML_NEWPARA;
1666*cdf0e10cSrcweir #else
1667*cdf0e10cSrcweir     // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung
1668*cdf0e10cSrcweir     case HTML_PARABREAK_ON:
1669*cdf0e10cSrcweir         nToken = HTML_LINEBREAK;
1670*cdf0e10cSrcweir     case HTML_LINEBREAK:
1671*cdf0e10cSrcweir #endif
1672*cdf0e10cSrcweir     case HTML_NEWPARA:
1673*cdf0e10cSrcweir         nPre_LinePos = 0;
1674*cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1675*cdf0e10cSrcweir             nToken = 0;
1676*cdf0e10cSrcweir         break;
1677*cdf0e10cSrcweir 
1678*cdf0e10cSrcweir     case HTML_TABCHAR:
1679*cdf0e10cSrcweir         {
1680*cdf0e10cSrcweir             xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >(
1681*cdf0e10cSrcweir                 8 - (nPre_LinePos % 8));
1682*cdf0e10cSrcweir             DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" );
1683*cdf0e10cSrcweir             aToken.Expand( nSpaces, ' ' );
1684*cdf0e10cSrcweir             nPre_LinePos += nSpaces;
1685*cdf0e10cSrcweir             nToken = HTML_TEXTTOKEN;
1686*cdf0e10cSrcweir         }
1687*cdf0e10cSrcweir         break;
1688*cdf0e10cSrcweir     // diese bleiben erhalten
1689*cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1690*cdf0e10cSrcweir         nPre_LinePos += aToken.Len();
1691*cdf0e10cSrcweir         break;
1692*cdf0e10cSrcweir 
1693*cdf0e10cSrcweir     case HTML_SELECT_ON:
1694*cdf0e10cSrcweir     case HTML_SELECT_OFF:
1695*cdf0e10cSrcweir     case HTML_BODY_ON:
1696*cdf0e10cSrcweir     case HTML_FORM_ON:
1697*cdf0e10cSrcweir     case HTML_FORM_OFF:
1698*cdf0e10cSrcweir     case HTML_INPUT:
1699*cdf0e10cSrcweir     case HTML_OPTION:
1700*cdf0e10cSrcweir     case HTML_TEXTAREA_ON:
1701*cdf0e10cSrcweir     case HTML_TEXTAREA_OFF:
1702*cdf0e10cSrcweir 
1703*cdf0e10cSrcweir     case HTML_IMAGE:
1704*cdf0e10cSrcweir     case HTML_APPLET_ON:
1705*cdf0e10cSrcweir     case HTML_APPLET_OFF:
1706*cdf0e10cSrcweir     case HTML_PARAM:
1707*cdf0e10cSrcweir     case HTML_EMBED:
1708*cdf0e10cSrcweir 
1709*cdf0e10cSrcweir     case HTML_HEAD1_ON:
1710*cdf0e10cSrcweir     case HTML_HEAD1_OFF:
1711*cdf0e10cSrcweir     case HTML_HEAD2_ON:
1712*cdf0e10cSrcweir     case HTML_HEAD2_OFF:
1713*cdf0e10cSrcweir     case HTML_HEAD3_ON:
1714*cdf0e10cSrcweir     case HTML_HEAD3_OFF:
1715*cdf0e10cSrcweir     case HTML_HEAD4_ON:
1716*cdf0e10cSrcweir     case HTML_HEAD4_OFF:
1717*cdf0e10cSrcweir     case HTML_HEAD5_ON:
1718*cdf0e10cSrcweir     case HTML_HEAD5_OFF:
1719*cdf0e10cSrcweir     case HTML_HEAD6_ON:
1720*cdf0e10cSrcweir     case HTML_HEAD6_OFF:
1721*cdf0e10cSrcweir     case HTML_BLOCKQUOTE_ON:
1722*cdf0e10cSrcweir     case HTML_BLOCKQUOTE_OFF:
1723*cdf0e10cSrcweir     case HTML_ADDRESS_ON:
1724*cdf0e10cSrcweir     case HTML_ADDRESS_OFF:
1725*cdf0e10cSrcweir     case HTML_HORZRULE:
1726*cdf0e10cSrcweir 
1727*cdf0e10cSrcweir     case HTML_CENTER_ON:
1728*cdf0e10cSrcweir     case HTML_CENTER_OFF:
1729*cdf0e10cSrcweir     case HTML_DIVISION_ON:
1730*cdf0e10cSrcweir     case HTML_DIVISION_OFF:
1731*cdf0e10cSrcweir 
1732*cdf0e10cSrcweir     case HTML_SCRIPT_ON:
1733*cdf0e10cSrcweir     case HTML_SCRIPT_OFF:
1734*cdf0e10cSrcweir     case HTML_RAWDATA:
1735*cdf0e10cSrcweir 
1736*cdf0e10cSrcweir     case HTML_TABLE_ON:
1737*cdf0e10cSrcweir     case HTML_TABLE_OFF:
1738*cdf0e10cSrcweir     case HTML_CAPTION_ON:
1739*cdf0e10cSrcweir     case HTML_CAPTION_OFF:
1740*cdf0e10cSrcweir     case HTML_COLGROUP_ON:
1741*cdf0e10cSrcweir     case HTML_COLGROUP_OFF:
1742*cdf0e10cSrcweir     case HTML_COL_ON:
1743*cdf0e10cSrcweir     case HTML_COL_OFF:
1744*cdf0e10cSrcweir     case HTML_THEAD_ON:
1745*cdf0e10cSrcweir     case HTML_THEAD_OFF:
1746*cdf0e10cSrcweir     case HTML_TFOOT_ON:
1747*cdf0e10cSrcweir     case HTML_TFOOT_OFF:
1748*cdf0e10cSrcweir     case HTML_TBODY_ON:
1749*cdf0e10cSrcweir     case HTML_TBODY_OFF:
1750*cdf0e10cSrcweir     case HTML_TABLEROW_ON:
1751*cdf0e10cSrcweir     case HTML_TABLEROW_OFF:
1752*cdf0e10cSrcweir     case HTML_TABLEDATA_ON:
1753*cdf0e10cSrcweir     case HTML_TABLEDATA_OFF:
1754*cdf0e10cSrcweir     case HTML_TABLEHEADER_ON:
1755*cdf0e10cSrcweir     case HTML_TABLEHEADER_OFF:
1756*cdf0e10cSrcweir 
1757*cdf0e10cSrcweir     case HTML_ANCHOR_ON:
1758*cdf0e10cSrcweir     case HTML_ANCHOR_OFF:
1759*cdf0e10cSrcweir     case HTML_BOLD_ON:
1760*cdf0e10cSrcweir     case HTML_BOLD_OFF:
1761*cdf0e10cSrcweir     case HTML_ITALIC_ON:
1762*cdf0e10cSrcweir     case HTML_ITALIC_OFF:
1763*cdf0e10cSrcweir     case HTML_STRIKE_ON:
1764*cdf0e10cSrcweir     case HTML_STRIKE_OFF:
1765*cdf0e10cSrcweir     case HTML_STRIKETHROUGH_ON:
1766*cdf0e10cSrcweir     case HTML_STRIKETHROUGH_OFF:
1767*cdf0e10cSrcweir     case HTML_UNDERLINE_ON:
1768*cdf0e10cSrcweir     case HTML_UNDERLINE_OFF:
1769*cdf0e10cSrcweir     case HTML_BASEFONT_ON:
1770*cdf0e10cSrcweir     case HTML_BASEFONT_OFF:
1771*cdf0e10cSrcweir     case HTML_FONT_ON:
1772*cdf0e10cSrcweir     case HTML_FONT_OFF:
1773*cdf0e10cSrcweir     case HTML_BLINK_ON:
1774*cdf0e10cSrcweir     case HTML_BLINK_OFF:
1775*cdf0e10cSrcweir     case HTML_SPAN_ON:
1776*cdf0e10cSrcweir     case HTML_SPAN_OFF:
1777*cdf0e10cSrcweir     case HTML_SUBSCRIPT_ON:
1778*cdf0e10cSrcweir     case HTML_SUBSCRIPT_OFF:
1779*cdf0e10cSrcweir     case HTML_SUPERSCRIPT_ON:
1780*cdf0e10cSrcweir     case HTML_SUPERSCRIPT_OFF:
1781*cdf0e10cSrcweir     case HTML_BIGPRINT_ON:
1782*cdf0e10cSrcweir     case HTML_BIGPRINT_OFF:
1783*cdf0e10cSrcweir     case HTML_SMALLPRINT_OFF:
1784*cdf0e10cSrcweir     case HTML_SMALLPRINT_ON:
1785*cdf0e10cSrcweir 
1786*cdf0e10cSrcweir     case HTML_EMPHASIS_ON:
1787*cdf0e10cSrcweir     case HTML_EMPHASIS_OFF:
1788*cdf0e10cSrcweir     case HTML_CITIATION_ON:
1789*cdf0e10cSrcweir     case HTML_CITIATION_OFF:
1790*cdf0e10cSrcweir     case HTML_STRONG_ON:
1791*cdf0e10cSrcweir     case HTML_STRONG_OFF:
1792*cdf0e10cSrcweir     case HTML_CODE_ON:
1793*cdf0e10cSrcweir     case HTML_CODE_OFF:
1794*cdf0e10cSrcweir     case HTML_SAMPLE_ON:
1795*cdf0e10cSrcweir     case HTML_SAMPLE_OFF:
1796*cdf0e10cSrcweir     case HTML_KEYBOARD_ON:
1797*cdf0e10cSrcweir     case HTML_KEYBOARD_OFF:
1798*cdf0e10cSrcweir     case HTML_VARIABLE_ON:
1799*cdf0e10cSrcweir     case HTML_VARIABLE_OFF:
1800*cdf0e10cSrcweir     case HTML_DEFINSTANCE_ON:
1801*cdf0e10cSrcweir     case HTML_DEFINSTANCE_OFF:
1802*cdf0e10cSrcweir     case HTML_SHORTQUOTE_ON:
1803*cdf0e10cSrcweir     case HTML_SHORTQUOTE_OFF:
1804*cdf0e10cSrcweir     case HTML_LANGUAGE_ON:
1805*cdf0e10cSrcweir     case HTML_LANGUAGE_OFF:
1806*cdf0e10cSrcweir     case HTML_AUTHOR_ON:
1807*cdf0e10cSrcweir     case HTML_AUTHOR_OFF:
1808*cdf0e10cSrcweir     case HTML_PERSON_ON:
1809*cdf0e10cSrcweir     case HTML_PERSON_OFF:
1810*cdf0e10cSrcweir     case HTML_ACRONYM_ON:
1811*cdf0e10cSrcweir     case HTML_ACRONYM_OFF:
1812*cdf0e10cSrcweir     case HTML_ABBREVIATION_ON:
1813*cdf0e10cSrcweir     case HTML_ABBREVIATION_OFF:
1814*cdf0e10cSrcweir     case HTML_INSERTEDTEXT_ON:
1815*cdf0e10cSrcweir     case HTML_INSERTEDTEXT_OFF:
1816*cdf0e10cSrcweir     case HTML_DELETEDTEXT_ON:
1817*cdf0e10cSrcweir     case HTML_DELETEDTEXT_OFF:
1818*cdf0e10cSrcweir     case HTML_TELETYPE_ON:
1819*cdf0e10cSrcweir     case HTML_TELETYPE_OFF:
1820*cdf0e10cSrcweir 
1821*cdf0e10cSrcweir         break;
1822*cdf0e10cSrcweir 
1823*cdf0e10cSrcweir     // der Rest wird als unbekanntes Token behandelt
1824*cdf0e10cSrcweir     default:
1825*cdf0e10cSrcweir         if( nToken )
1826*cdf0e10cSrcweir         {
1827*cdf0e10cSrcweir             nToken =
1828*cdf0e10cSrcweir                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1829*cdf0e10cSrcweir                     ? HTML_UNKNOWNCONTROL_OFF
1830*cdf0e10cSrcweir                     : HTML_UNKNOWNCONTROL_ON );
1831*cdf0e10cSrcweir         }
1832*cdf0e10cSrcweir         break;
1833*cdf0e10cSrcweir     }
1834*cdf0e10cSrcweir 
1835*cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1836*cdf0e10cSrcweir 
1837*cdf0e10cSrcweir     return nToken;
1838*cdf0e10cSrcweir }
1839*cdf0e10cSrcweir 
1840*cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken )
1841*cdf0e10cSrcweir {
1842*cdf0e10cSrcweir     switch( nToken )
1843*cdf0e10cSrcweir     {
1844*cdf0e10cSrcweir     case HTML_NEWPARA:
1845*cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1846*cdf0e10cSrcweir             nToken = 0;
1847*cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1848*cdf0e10cSrcweir     case HTML_NONBREAKSPACE:
1849*cdf0e10cSrcweir     case HTML_SOFTHYPH:
1850*cdf0e10cSrcweir         break;              // bleiben erhalten
1851*cdf0e10cSrcweir 
1852*cdf0e10cSrcweir     default:
1853*cdf0e10cSrcweir         if( nToken )
1854*cdf0e10cSrcweir         {
1855*cdf0e10cSrcweir             if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1856*cdf0e10cSrcweir             {
1857*cdf0e10cSrcweir                 sSaveToken.Insert( '<', 0 );
1858*cdf0e10cSrcweir                 sSaveToken.Insert( '/', 1 );
1859*cdf0e10cSrcweir             }
1860*cdf0e10cSrcweir             else
1861*cdf0e10cSrcweir                 sSaveToken.Insert( '<', 0 );
1862*cdf0e10cSrcweir             if( aToken.Len() )
1863*cdf0e10cSrcweir             {
1864*cdf0e10cSrcweir                 UnescapeToken();
1865*cdf0e10cSrcweir                 sSaveToken += (sal_Unicode)' ';
1866*cdf0e10cSrcweir                 aToken.Insert( sSaveToken, 0 );
1867*cdf0e10cSrcweir             }
1868*cdf0e10cSrcweir             else
1869*cdf0e10cSrcweir                 aToken = sSaveToken;
1870*cdf0e10cSrcweir             aToken += (sal_Unicode)'>';
1871*cdf0e10cSrcweir             nToken = HTML_TEXTTOKEN;
1872*cdf0e10cSrcweir         }
1873*cdf0e10cSrcweir         break;
1874*cdf0e10cSrcweir     }
1875*cdf0e10cSrcweir 
1876*cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1877*cdf0e10cSrcweir 
1878*cdf0e10cSrcweir     return nToken;
1879*cdf0e10cSrcweir }
1880*cdf0e10cSrcweir 
1881*cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken )
1882*cdf0e10cSrcweir {
1883*cdf0e10cSrcweir     switch( nToken )
1884*cdf0e10cSrcweir     {
1885*cdf0e10cSrcweir     case HTML_NEWPARA:
1886*cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1887*cdf0e10cSrcweir             nToken = 0;
1888*cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1889*cdf0e10cSrcweir     case HTML_NONBREAKSPACE:
1890*cdf0e10cSrcweir     case HTML_SOFTHYPH:
1891*cdf0e10cSrcweir         break;      // bleiben erhalten
1892*cdf0e10cSrcweir 
1893*cdf0e10cSrcweir     default:
1894*cdf0e10cSrcweir         if( nToken )
1895*cdf0e10cSrcweir         {
1896*cdf0e10cSrcweir             nToken =
1897*cdf0e10cSrcweir                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1898*cdf0e10cSrcweir                     ? HTML_UNKNOWNCONTROL_OFF
1899*cdf0e10cSrcweir                     : HTML_UNKNOWNCONTROL_ON );
1900*cdf0e10cSrcweir         }
1901*cdf0e10cSrcweir         break;
1902*cdf0e10cSrcweir     }
1903*cdf0e10cSrcweir 
1904*cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1905*cdf0e10cSrcweir 
1906*cdf0e10cSrcweir     return nToken;
1907*cdf0e10cSrcweir }
1908*cdf0e10cSrcweir 
1909*cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
1910*cdf0e10cSrcweir                                    sal_Bool bSwitchToUCS2,
1911*cdf0e10cSrcweir                                    rtl_TextEncoding eEnc )
1912*cdf0e10cSrcweir {
1913*cdf0e10cSrcweir     // Einer der folgenden regulaeren Ausdrucke muss sich auf den String
1914*cdf0e10cSrcweir     // anwenden lassen, damit das Dok ein HTML-Dokument ist.
1915*cdf0e10cSrcweir     //
1916*cdf0e10cSrcweir     // ^[^<]*<[^ \t]*[> \t]
1917*cdf0e10cSrcweir     //        -------
1918*cdf0e10cSrcweir     // ^<!
1919*cdf0e10cSrcweir     //
1920*cdf0e10cSrcweir     // wobei der unterstrichene Teilausdruck einem HTML-Token
1921*cdf0e10cSrcweir     // ensprechen muss
1922*cdf0e10cSrcweir 
1923*cdf0e10cSrcweir     ByteString sCmp;
1924*cdf0e10cSrcweir     sal_Bool bUCS2B = sal_False;
1925*cdf0e10cSrcweir     if( bSwitchToUCS2 )
1926*cdf0e10cSrcweir     {
1927*cdf0e10cSrcweir         if( 0xfeU == (sal_uChar)pHeader[0] &&
1928*cdf0e10cSrcweir             0xffU == (sal_uChar)pHeader[1] )
1929*cdf0e10cSrcweir         {
1930*cdf0e10cSrcweir             eEnc = RTL_TEXTENCODING_UCS2;
1931*cdf0e10cSrcweir             bUCS2B = sal_True;
1932*cdf0e10cSrcweir         }
1933*cdf0e10cSrcweir         else if( 0xffU == (sal_uChar)pHeader[0] &&
1934*cdf0e10cSrcweir                  0xfeU == (sal_uChar)pHeader[1] )
1935*cdf0e10cSrcweir         {
1936*cdf0e10cSrcweir             eEnc = RTL_TEXTENCODING_UCS2;
1937*cdf0e10cSrcweir         }
1938*cdf0e10cSrcweir     }
1939*cdf0e10cSrcweir     if
1940*cdf0e10cSrcweir        (
1941*cdf0e10cSrcweir         RTL_TEXTENCODING_UCS2 == eEnc &&
1942*cdf0e10cSrcweir         (
1943*cdf0e10cSrcweir          (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) ||
1944*cdf0e10cSrcweir          (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1])
1945*cdf0e10cSrcweir         )
1946*cdf0e10cSrcweir        )
1947*cdf0e10cSrcweir     {
1948*cdf0e10cSrcweir         if( 0xfe == (sal_uChar)pHeader[0] )
1949*cdf0e10cSrcweir             bUCS2B = sal_True;
1950*cdf0e10cSrcweir 
1951*cdf0e10cSrcweir         xub_StrLen nLen;
1952*cdf0e10cSrcweir         for( nLen = 2;
1953*cdf0e10cSrcweir              pHeader[nLen] != 0 || pHeader[nLen+1] != 0;
1954*cdf0e10cSrcweir              nLen+=2 )
1955*cdf0e10cSrcweir             ;
1956*cdf0e10cSrcweir 
1957*cdf0e10cSrcweir         ::rtl::OStringBuffer sTmp( (nLen - 2)/2 );
1958*cdf0e10cSrcweir         for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 )
1959*cdf0e10cSrcweir         {
1960*cdf0e10cSrcweir             sal_Unicode cUC;
1961*cdf0e10cSrcweir             if( bUCS2B )
1962*cdf0e10cSrcweir                 cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
1963*cdf0e10cSrcweir             else
1964*cdf0e10cSrcweir                 cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
1965*cdf0e10cSrcweir             if( 0U == cUC )
1966*cdf0e10cSrcweir                 break;
1967*cdf0e10cSrcweir 
1968*cdf0e10cSrcweir             sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
1969*cdf0e10cSrcweir         }
1970*cdf0e10cSrcweir         sCmp = ByteString( sTmp.makeStringAndClear() );
1971*cdf0e10cSrcweir     }
1972*cdf0e10cSrcweir     else
1973*cdf0e10cSrcweir     {
1974*cdf0e10cSrcweir         sCmp = (sal_Char *)pHeader;
1975*cdf0e10cSrcweir     }
1976*cdf0e10cSrcweir 
1977*cdf0e10cSrcweir     sCmp.ToUpperAscii();
1978*cdf0e10cSrcweir 
1979*cdf0e10cSrcweir     // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen
1980*cdf0e10cSrcweir     xub_StrLen nStart = sCmp.Search( '<' );
1981*cdf0e10cSrcweir     if( STRING_NOTFOUND  == nStart )
1982*cdf0e10cSrcweir         return sal_False;
1983*cdf0e10cSrcweir     nStart++;
1984*cdf0e10cSrcweir 
1985*cdf0e10cSrcweir     // danach duerfen beliebige andere Zeichen bis zu einem blank oder
1986*cdf0e10cSrcweir     // '>' kommen
1987*cdf0e10cSrcweir     sal_Char c;
1988*cdf0e10cSrcweir     xub_StrLen nPos;
1989*cdf0e10cSrcweir     for( nPos = nStart; nPos<sCmp.Len(); nPos++ )
1990*cdf0e10cSrcweir     {
1991*cdf0e10cSrcweir         if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) )
1992*cdf0e10cSrcweir             break;
1993*cdf0e10cSrcweir     }
1994*cdf0e10cSrcweir 
1995*cdf0e10cSrcweir     // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML
1996*cdf0e10cSrcweir     if( nPos==nStart )
1997*cdf0e10cSrcweir         return sal_False;
1998*cdf0e10cSrcweir 
1999*cdf0e10cSrcweir     // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes
2000*cdf0e10cSrcweir     // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht
2001*cdf0e10cSrcweir     // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML
2002*cdf0e10cSrcweir     // interpretiert.
2003*cdf0e10cSrcweir     String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US );
2004*cdf0e10cSrcweir     int nTok = GetHTMLToken( sTest );
2005*cdf0e10cSrcweir     if( 0 != nTok && HTML_DIRLIST_ON != nTok )
2006*cdf0e10cSrcweir         return sal_True;
2007*cdf0e10cSrcweir 
2008*cdf0e10cSrcweir     // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#)
2009*cdf0e10cSrcweir     if( nStart == 1 && '!' == sCmp.GetChar( 1 ) )
2010*cdf0e10cSrcweir         return sal_True;
2011*cdf0e10cSrcweir 
2012*cdf0e10cSrcweir     // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen
2013*cdf0e10cSrcweir     nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html );
2014*cdf0e10cSrcweir     if( nStart!=STRING_NOTFOUND &&
2015*cdf0e10cSrcweir         nStart>0 && '<'==sCmp.GetChar(nStart-1) &&
2016*cdf0e10cSrcweir         nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) )
2017*cdf0e10cSrcweir         return sal_True;
2018*cdf0e10cSrcweir 
2019*cdf0e10cSrcweir     // sonst ist es wohl doch eher kein HTML-Dokument
2020*cdf0e10cSrcweir     return sal_False;
2021*cdf0e10cSrcweir }
2022*cdf0e10cSrcweir 
2023*cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL )
2024*cdf0e10cSrcweir {
2025*cdf0e10cSrcweir     if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) ||
2026*cdf0e10cSrcweir         rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL )
2027*cdf0e10cSrcweir         return sal_False;
2028*cdf0e10cSrcweir 
2029*cdf0e10cSrcweir     sal_Bool bFound = sal_False;
2030*cdf0e10cSrcweir 
2031*cdf0e10cSrcweir     if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL )
2032*cdf0e10cSrcweir     {
2033*cdf0e10cSrcweir         String aName( rURL.Copy(16) );
2034*cdf0e10cSrcweir         switch( aName.GetChar(0) )
2035*cdf0e10cSrcweir         {
2036*cdf0e10cSrcweir         case 'b':
2037*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary );
2038*cdf0e10cSrcweir             break;
2039*cdf0e10cSrcweir         case 'i':
2040*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) ||
2041*cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index );
2042*cdf0e10cSrcweir             break;
2043*cdf0e10cSrcweir         case 'm':
2044*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) ||
2045*cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie );
2046*cdf0e10cSrcweir             break;
2047*cdf0e10cSrcweir         case 's':
2048*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound );
2049*cdf0e10cSrcweir             break;
2050*cdf0e10cSrcweir         case 't':
2051*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) ||
2052*cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text );
2053*cdf0e10cSrcweir             break;
2054*cdf0e10cSrcweir         case 'u':
2055*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown );
2056*cdf0e10cSrcweir             break;
2057*cdf0e10cSrcweir         }
2058*cdf0e10cSrcweir     }
2059*cdf0e10cSrcweir     else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL )
2060*cdf0e10cSrcweir     {
2061*cdf0e10cSrcweir         String aName( rURL.Copy(14) );
2062*cdf0e10cSrcweir         switch( aName.GetChar(0) )
2063*cdf0e10cSrcweir         {
2064*cdf0e10cSrcweir         case 'b':
2065*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata );
2066*cdf0e10cSrcweir             break;
2067*cdf0e10cSrcweir         case 'd':
2068*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed );
2069*cdf0e10cSrcweir             break;
2070*cdf0e10cSrcweir         case 'e':
2071*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed );
2072*cdf0e10cSrcweir             break;
2073*cdf0e10cSrcweir         case 'i':
2074*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure );
2075*cdf0e10cSrcweir             break;
2076*cdf0e10cSrcweir         case 'n':
2077*cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound );
2078*cdf0e10cSrcweir             break;
2079*cdf0e10cSrcweir         }
2080*cdf0e10cSrcweir     }
2081*cdf0e10cSrcweir     if( bFound )
2082*cdf0e10cSrcweir     {
2083*cdf0e10cSrcweir         String sTmp ( rURL );
2084*cdf0e10cSrcweir         rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image );
2085*cdf0e10cSrcweir         rURL.Append( sTmp );
2086*cdf0e10cSrcweir     }
2087*cdf0e10cSrcweir 
2088*cdf0e10cSrcweir     return bFound;
2089*cdf0e10cSrcweir }
2090*cdf0e10cSrcweir 
// The two forwarders below are compiled only when USED is defined.
#ifdef USED
// Forwards to SvParser::SaveState() without adding any behaviour.
void HTMLParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

// Forwards to SvParser::RestoreState() without adding any behaviour.
void HTMLParser::RestoreState()
{
    SvParser::RestoreState();
}
#endif
2102*cdf0e10cSrcweir 
2103*cdf0e10cSrcweir 
// Actions selected by the NAME / HTTP-EQUIV value of a <META> tag
// (see aHTMLMetaNameTable and HTMLParser::ParseMetaOptionsImpl).
enum eHtmlMetas {
    HTML_META_NONE           = 0,   // no known name matched
    HTML_META_AUTHOR         = 1,
    HTML_META_DESCRIPTION    = 2,
    HTML_META_KEYWORDS       = 3,
    HTML_META_REFRESH        = 4,
    HTML_META_CLASSIFICATION = 5,
    HTML_META_CREATED        = 6,
    HTML_META_CHANGEDBY      = 7,
    HTML_META_CHANGED        = 8,
    HTML_META_GENERATOR      = 9,
    HTML_META_SDFOOTNOTE     = 10,
    HTML_META_SDENDNOTE      = 11,
    HTML_META_CONTENT_TYPE   = 12
};
2119*cdf0e10cSrcweir 
2120*cdf0e10cSrcweir // <META NAME=xxx>
2121*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] =
2122*cdf0e10cSrcweir {
2123*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_author,        HTML_META_AUTHOR        },
2124*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changed,       HTML_META_CHANGED       },
2125*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HTML_META_CHANGEDBY     },
2126*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
2127*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HTML_META_CONTENT_TYPE  },
2128*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_created,       HTML_META_CREATED       },
2129*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_description,   HTML_META_DESCRIPTION   },
2130*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HTML_META_KEYWORDS      },
2131*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_generator,     HTML_META_GENERATOR     },
2132*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HTML_META_REFRESH       },
2133*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HTML_META_SDENDNOTE     },
2134*cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HTML_META_SDFOOTNOTE    },
2135*cdf0e10cSrcweir     { 0,                                          0                       }
2136*cdf0e10cSrcweir };
2137*cdf0e10cSrcweir 
2138*cdf0e10cSrcweir 
2139*cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & )
2140*cdf0e10cSrcweir {
2141*cdf0e10cSrcweir }
2142*cdf0e10cSrcweir 
2143*cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl(
2144*cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2145*cdf0e10cSrcweir         SvKeyValueIterator *i_pHTTPHeader,
2146*cdf0e10cSrcweir         const HTMLOptions *i_pOptions,
2147*cdf0e10cSrcweir         rtl_TextEncoding& o_rEnc )
2148*cdf0e10cSrcweir {
2149*cdf0e10cSrcweir     String aName, aContent;
2150*cdf0e10cSrcweir     sal_uInt16 nAction = HTML_META_NONE;
2151*cdf0e10cSrcweir     bool bHTTPEquiv = false, bChanged = false;
2152*cdf0e10cSrcweir 
2153*cdf0e10cSrcweir     for ( sal_uInt16 i = i_pOptions->Count(); i; )
2154*cdf0e10cSrcweir     {
2155*cdf0e10cSrcweir         const HTMLOption *pOption = (*i_pOptions)[ --i ];
2156*cdf0e10cSrcweir         switch ( pOption->GetToken() )
2157*cdf0e10cSrcweir         {
2158*cdf0e10cSrcweir             case HTML_O_NAME:
2159*cdf0e10cSrcweir                 aName = pOption->GetString();
2160*cdf0e10cSrcweir                 if ( HTML_META_NONE==nAction )
2161*cdf0e10cSrcweir                 {
2162*cdf0e10cSrcweir                     pOption->GetEnum( nAction, aHTMLMetaNameTable );
2163*cdf0e10cSrcweir                 }
2164*cdf0e10cSrcweir                 break;
2165*cdf0e10cSrcweir             case HTML_O_HTTPEQUIV:
2166*cdf0e10cSrcweir                 aName = pOption->GetString();
2167*cdf0e10cSrcweir                 pOption->GetEnum( nAction, aHTMLMetaNameTable );
2168*cdf0e10cSrcweir                 bHTTPEquiv = true;
2169*cdf0e10cSrcweir                 break;
2170*cdf0e10cSrcweir             case HTML_O_CONTENT:
2171*cdf0e10cSrcweir                 aContent = pOption->GetString();
2172*cdf0e10cSrcweir                 break;
2173*cdf0e10cSrcweir         }
2174*cdf0e10cSrcweir     }
2175*cdf0e10cSrcweir 
2176*cdf0e10cSrcweir     if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
2177*cdf0e10cSrcweir     {
2178*cdf0e10cSrcweir         // if it is not a Description, remove CRs and LFs from CONTENT
2179*cdf0e10cSrcweir         aContent.EraseAllChars( _CR );
2180*cdf0e10cSrcweir         aContent.EraseAllChars( _LF );
2181*cdf0e10cSrcweir     }
2182*cdf0e10cSrcweir     else
2183*cdf0e10cSrcweir     {
2184*cdf0e10cSrcweir         // convert line endings for Description
2185*cdf0e10cSrcweir         aContent.ConvertLineEnd();
2186*cdf0e10cSrcweir     }
2187*cdf0e10cSrcweir 
2188*cdf0e10cSrcweir 
2189*cdf0e10cSrcweir     if ( bHTTPEquiv && i_pHTTPHeader )
2190*cdf0e10cSrcweir     {
2191*cdf0e10cSrcweir         // #57232#: Netscape seems to just ignore a closing ", so we do too
2192*cdf0e10cSrcweir         if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) )
2193*cdf0e10cSrcweir         {
2194*cdf0e10cSrcweir             aContent.Erase( aContent.Len() - 1 );
2195*cdf0e10cSrcweir         }
2196*cdf0e10cSrcweir         SvKeyValue aKeyValue( aName, aContent );
2197*cdf0e10cSrcweir         i_pHTTPHeader->Append( aKeyValue );
2198*cdf0e10cSrcweir     }
2199*cdf0e10cSrcweir 
2200*cdf0e10cSrcweir     switch ( nAction )
2201*cdf0e10cSrcweir     {
2202*cdf0e10cSrcweir         case HTML_META_AUTHOR:
2203*cdf0e10cSrcweir             if (i_xDocProps.is()) {
2204*cdf0e10cSrcweir                 i_xDocProps->setAuthor( aContent );
2205*cdf0e10cSrcweir                 bChanged = true;
2206*cdf0e10cSrcweir             }
2207*cdf0e10cSrcweir             break;
2208*cdf0e10cSrcweir         case HTML_META_DESCRIPTION:
2209*cdf0e10cSrcweir             if (i_xDocProps.is()) {
2210*cdf0e10cSrcweir                 i_xDocProps->setDescription( aContent );
2211*cdf0e10cSrcweir                 bChanged = true;
2212*cdf0e10cSrcweir             }
2213*cdf0e10cSrcweir             break;
2214*cdf0e10cSrcweir         case HTML_META_KEYWORDS:
2215*cdf0e10cSrcweir             if (i_xDocProps.is()) {
2216*cdf0e10cSrcweir                 i_xDocProps->setKeywords(
2217*cdf0e10cSrcweir                     ::comphelper::string::convertCommaSeparated(aContent));
2218*cdf0e10cSrcweir                 bChanged = true;
2219*cdf0e10cSrcweir             }
2220*cdf0e10cSrcweir             break;
2221*cdf0e10cSrcweir         case HTML_META_CLASSIFICATION:
2222*cdf0e10cSrcweir             if (i_xDocProps.is()) {
2223*cdf0e10cSrcweir                 i_xDocProps->setSubject( aContent );
2224*cdf0e10cSrcweir                 bChanged = true;
2225*cdf0e10cSrcweir             }
2226*cdf0e10cSrcweir             break;
2227*cdf0e10cSrcweir 
2228*cdf0e10cSrcweir         case HTML_META_CHANGEDBY:
2229*cdf0e10cSrcweir             if (i_xDocProps.is()) {
2230*cdf0e10cSrcweir                 i_xDocProps->setModifiedBy( aContent );
2231*cdf0e10cSrcweir             }
2232*cdf0e10cSrcweir             break;
2233*cdf0e10cSrcweir 
2234*cdf0e10cSrcweir         case HTML_META_CREATED:
2235*cdf0e10cSrcweir         case HTML_META_CHANGED:
2236*cdf0e10cSrcweir             if ( i_xDocProps.is() && aContent.Len() &&
2237*cdf0e10cSrcweir                  aContent.GetTokenCount() == 2 )
2238*cdf0e10cSrcweir             {
2239*cdf0e10cSrcweir                 Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() );
2240*cdf0e10cSrcweir                 Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() );
2241*cdf0e10cSrcweir                 DateTime aDateTime( aDate, aTime );
2242*cdf0e10cSrcweir                 ::util::DateTime uDT(aDateTime.Get100Sec(),
2243*cdf0e10cSrcweir                     aDateTime.GetSec(), aDateTime.GetMin(),
2244*cdf0e10cSrcweir                     aDateTime.GetHour(), aDateTime.GetDay(),
2245*cdf0e10cSrcweir                     aDateTime.GetMonth(), aDateTime.GetYear());
2246*cdf0e10cSrcweir                 if ( HTML_META_CREATED==nAction )
2247*cdf0e10cSrcweir                     i_xDocProps->setCreationDate( uDT );
2248*cdf0e10cSrcweir                 else
2249*cdf0e10cSrcweir                     i_xDocProps->setModificationDate( uDT );
2250*cdf0e10cSrcweir                 bChanged = true;
2251*cdf0e10cSrcweir             }
2252*cdf0e10cSrcweir             break;
2253*cdf0e10cSrcweir 
2254*cdf0e10cSrcweir         case HTML_META_REFRESH:
2255*cdf0e10cSrcweir             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2256*cdf0e10cSrcweir         "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2257*cdf0e10cSrcweir             break;
2258*cdf0e10cSrcweir 
2259*cdf0e10cSrcweir         case HTML_META_CONTENT_TYPE:
2260*cdf0e10cSrcweir             if ( aContent.Len() )
2261*cdf0e10cSrcweir             {
2262*cdf0e10cSrcweir                 o_rEnc = GetEncodingByMIME( aContent );
2263*cdf0e10cSrcweir             }
2264*cdf0e10cSrcweir             break;
2265*cdf0e10cSrcweir 
2266*cdf0e10cSrcweir         case HTML_META_NONE:
2267*cdf0e10cSrcweir             if ( !bHTTPEquiv )
2268*cdf0e10cSrcweir             {
2269*cdf0e10cSrcweir                 if (i_xDocProps.is())
2270*cdf0e10cSrcweir                 {
2271*cdf0e10cSrcweir                     uno::Reference<beans::XPropertyContainer> xUDProps
2272*cdf0e10cSrcweir                         = i_xDocProps->getUserDefinedProperties();
2273*cdf0e10cSrcweir                     try {
2274*cdf0e10cSrcweir                         xUDProps->addProperty(aName,
2275*cdf0e10cSrcweir                             beans::PropertyAttribute::REMOVEABLE,
2276*cdf0e10cSrcweir                             uno::makeAny(::rtl::OUString(aContent)));
2277*cdf0e10cSrcweir                         AddMetaUserDefined(aName);
2278*cdf0e10cSrcweir                         bChanged = true;
2279*cdf0e10cSrcweir                     } catch (uno::Exception &) {
2280*cdf0e10cSrcweir                         // ignore
2281*cdf0e10cSrcweir                     }
2282*cdf0e10cSrcweir                 }
2283*cdf0e10cSrcweir             }
2284*cdf0e10cSrcweir             break;
2285*cdf0e10cSrcweir         default:
2286*cdf0e10cSrcweir             break;
2287*cdf0e10cSrcweir     }
2288*cdf0e10cSrcweir 
2289*cdf0e10cSrcweir     return bChanged;
2290*cdf0e10cSrcweir }
2291*cdf0e10cSrcweir 
2292*cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions(
2293*cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2294*cdf0e10cSrcweir         SvKeyValueIterator *i_pHeader )
2295*cdf0e10cSrcweir {
2296*cdf0e10cSrcweir     sal_uInt16 nContentOption = HTML_O_CONTENT;
2297*cdf0e10cSrcweir     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2298*cdf0e10cSrcweir 
2299*cdf0e10cSrcweir     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2300*cdf0e10cSrcweir                       GetOptions(&nContentOption),
2301*cdf0e10cSrcweir                       eEnc );
2302*cdf0e10cSrcweir 
2303*cdf0e10cSrcweir     // If the encoding is set by a META tag, it may only overwrite the
2304*cdf0e10cSrcweir     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2305*cdf0e10cSrcweir     // encodings. Everything else cannot lead to reasonable results.
2306*cdf0e10cSrcweir     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2307*cdf0e10cSrcweir         rtl_isOctetTextEncoding( eEnc ) &&
2308*cdf0e10cSrcweir         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2309*cdf0e10cSrcweir     {
2310*cdf0e10cSrcweir         eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973#
2311*cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2312*cdf0e10cSrcweir     }
2313*cdf0e10cSrcweir 
2314*cdf0e10cSrcweir     return bRet;
2315*cdf0e10cSrcweir }
2316*cdf0e10cSrcweir 
2317*cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime )
2318*cdf0e10cSrcweir {
2319*cdf0e10cSrcweir     ByteString sType;
2320*cdf0e10cSrcweir     ByteString sSubType;
2321*cdf0e10cSrcweir     INetContentTypeParameterList aParameters;
2322*cdf0e10cSrcweir     ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US );
2323*cdf0e10cSrcweir     if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters))
2324*cdf0e10cSrcweir     {
2325*cdf0e10cSrcweir         const INetContentTypeParameter * pCharset
2326*cdf0e10cSrcweir             = aParameters.find("charset");
2327*cdf0e10cSrcweir         if (pCharset != 0)
2328*cdf0e10cSrcweir         {
2329*cdf0e10cSrcweir             ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US );
2330*cdf0e10cSrcweir             return GetExtendedCompatibilityTextEncoding(
2331*cdf0e10cSrcweir                     rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) );
2332*cdf0e10cSrcweir         }
2333*cdf0e10cSrcweir     }
2334*cdf0e10cSrcweir     return RTL_TEXTENCODING_DONTKNOW;
2335*cdf0e10cSrcweir }
2336*cdf0e10cSrcweir 
2337*cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2338*cdf0e10cSrcweir {
2339*cdf0e10cSrcweir     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2340*cdf0e10cSrcweir     if( pHTTPHeader )
2341*cdf0e10cSrcweir     {
2342*cdf0e10cSrcweir         SvKeyValue aKV;
2343*cdf0e10cSrcweir         for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2344*cdf0e10cSrcweir              bCont = pHTTPHeader->GetNext( aKV ) )
2345*cdf0e10cSrcweir         {
2346*cdf0e10cSrcweir             if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2347*cdf0e10cSrcweir             {
2348*cdf0e10cSrcweir                 if( aKV.GetValue().Len() )
2349*cdf0e10cSrcweir                 {
2350*cdf0e10cSrcweir                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2351*cdf0e10cSrcweir                 }
2352*cdf0e10cSrcweir             }
2353*cdf0e10cSrcweir         }
2354*cdf0e10cSrcweir     }
2355*cdf0e10cSrcweir     return eRet;
2356*cdf0e10cSrcweir }
2357*cdf0e10cSrcweir 
2358*cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader(
2359*cdf0e10cSrcweir                                 SvKeyValueIterator *pHTTPHeader )
2360*cdf0e10cSrcweir {
2361*cdf0e10cSrcweir     sal_Bool bRet = sal_False;
2362*cdf0e10cSrcweir     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2363*cdf0e10cSrcweir     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2364*cdf0e10cSrcweir     {
2365*cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2366*cdf0e10cSrcweir         bRet = sal_True;
2367*cdf0e10cSrcweir     }
2368*cdf0e10cSrcweir     return bRet;
2369*cdf0e10cSrcweir }
2370*cdf0e10cSrcweir 
2371*cdf0e10cSrcweir 
2372