1*cdf0e10cSrcweir /************************************************************************* 2*cdf0e10cSrcweir * 3*cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4*cdf0e10cSrcweir * 5*cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6*cdf0e10cSrcweir * 7*cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8*cdf0e10cSrcweir * 9*cdf0e10cSrcweir * This file is part of OpenOffice.org. 10*cdf0e10cSrcweir * 11*cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12*cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13*cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14*cdf0e10cSrcweir * 15*cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16*cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17*cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18*cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19*cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20*cdf0e10cSrcweir * 21*cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22*cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23*cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24*cdf0e10cSrcweir * for a copy of the LGPLv3 License. 
25*cdf0e10cSrcweir * 26*cdf0e10cSrcweir ************************************************************************/ 27*cdf0e10cSrcweir 28*cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 29*cdf0e10cSrcweir #include "precompiled_svtools.hxx" 30*cdf0e10cSrcweir 31*cdf0e10cSrcweir #include <ctype.h> 32*cdf0e10cSrcweir #include <stdio.h> 33*cdf0e10cSrcweir #include <tools/stream.hxx> 34*cdf0e10cSrcweir #include <tools/debug.hxx> 35*cdf0e10cSrcweir #include <tools/color.hxx> 36*cdf0e10cSrcweir #include <rtl/ustrbuf.hxx> 37*cdf0e10cSrcweir #include <rtl/strbuf.hxx> 38*cdf0e10cSrcweir #ifndef _SVSTDARR_HXX 39*cdf0e10cSrcweir #define _SVSTDARR_ULONGS 40*cdf0e10cSrcweir #include <svl/svstdarr.hxx> 41*cdf0e10cSrcweir #endif 42*cdf0e10cSrcweir 43*cdf0e10cSrcweir #include <tools/tenccvt.hxx> 44*cdf0e10cSrcweir #include <tools/datetime.hxx> 45*cdf0e10cSrcweir #include <svl/inettype.hxx> 46*cdf0e10cSrcweir #include <comphelper/string.hxx> 47*cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp> 48*cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp> 49*cdf0e10cSrcweir 50*cdf0e10cSrcweir #include <svtools/parhtml.hxx> 51*cdf0e10cSrcweir #include <svtools/htmltokn.h> 52*cdf0e10cSrcweir #include <svtools/htmlkywd.hxx> 53*cdf0e10cSrcweir 54*cdf0e10cSrcweir 55*cdf0e10cSrcweir using namespace ::com::sun::star; 56*cdf0e10cSrcweir 57*cdf0e10cSrcweir 58*cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L ); 59*cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ]; 60*cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 ); 61*cdf0e10cSrcweir 62*cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L ); 63*cdf0e10cSrcweir 64*cdf0e10cSrcweir /* */ 65*cdf0e10cSrcweir 66*cdf0e10cSrcweir // Tabellen zum Umwandeln von Options-Werten in Strings 67*cdf0e10cSrcweir 68*cdf0e10cSrcweir // <INPUT TYPE=xxx> 69*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] = 70*cdf0e10cSrcweir { 71*cdf0e10cSrcweir { 
OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT }, 72*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD }, 73*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX }, 74*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO }, 75*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE }, 76*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE }, 77*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE }, 78*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN }, 79*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT }, 80*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE }, 81*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET }, 82*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON }, 83*cdf0e10cSrcweir { 0, 0 } 84*cdf0e10cSrcweir }; 85*cdf0e10cSrcweir 86*cdf0e10cSrcweir // <TABLE FRAME=xxx> 87*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] = 88*cdf0e10cSrcweir { 89*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID }, 90*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE }, 91*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW }, 92*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES }, 93*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS }, 94*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS }, 95*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES }, 96*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX }, 97*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX }, 98*cdf0e10cSrcweir { 0, 0 } 99*cdf0e10cSrcweir }; 100*cdf0e10cSrcweir 101*cdf0e10cSrcweir // <TABLE RULES=xxx> 102*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] = 103*cdf0e10cSrcweir { 104*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE }, 
105*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS }, 106*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS }, 107*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS }, 108*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL }, 109*cdf0e10cSrcweir { 0, 0 } 110*cdf0e10cSrcweir }; 111*cdf0e10cSrcweir 112*cdf0e10cSrcweir 113*cdf0e10cSrcweir SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr) 114*cdf0e10cSrcweir 115*cdf0e10cSrcweir /* */ 116*cdf0e10cSrcweir 117*cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const 118*cdf0e10cSrcweir { 119*cdf0e10cSrcweir sal_uInt16 nValue = nDflt; 120*cdf0e10cSrcweir 121*cdf0e10cSrcweir while( pOptEnums->pName ) 122*cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 123*cdf0e10cSrcweir break; 124*cdf0e10cSrcweir else 125*cdf0e10cSrcweir pOptEnums++; 126*cdf0e10cSrcweir 127*cdf0e10cSrcweir if( pOptEnums->pName ) 128*cdf0e10cSrcweir nValue = pOptEnums->nValue; 129*cdf0e10cSrcweir 130*cdf0e10cSrcweir return nValue; 131*cdf0e10cSrcweir } 132*cdf0e10cSrcweir 133*cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const 134*cdf0e10cSrcweir { 135*cdf0e10cSrcweir while( pOptEnums->pName ) 136*cdf0e10cSrcweir { 137*cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 138*cdf0e10cSrcweir break; 139*cdf0e10cSrcweir else 140*cdf0e10cSrcweir pOptEnums++; 141*cdf0e10cSrcweir } 142*cdf0e10cSrcweir 143*cdf0e10cSrcweir const sal_Char *pName = pOptEnums->pName; 144*cdf0e10cSrcweir if( pName ) 145*cdf0e10cSrcweir rEnum = pOptEnums->nValue; 146*cdf0e10cSrcweir 147*cdf0e10cSrcweir return (pName != 0); 148*cdf0e10cSrcweir } 149*cdf0e10cSrcweir 150*cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken, 151*cdf0e10cSrcweir const String& rValue ) 152*cdf0e10cSrcweir : aValue(rValue) 153*cdf0e10cSrcweir , aToken(rToken) 154*cdf0e10cSrcweir , nToken( nTok 
) 155*cdf0e10cSrcweir { 156*cdf0e10cSrcweir DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END, 157*cdf0e10cSrcweir "HTMLOption: unbekanntes Token" ); 158*cdf0e10cSrcweir } 159*cdf0e10cSrcweir 160*cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const 161*cdf0e10cSrcweir { 162*cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && 163*cdf0e10cSrcweir nToken<HTML_OPTION_NUMBER_END) || 164*cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && 165*cdf0e10cSrcweir nToken<HTML_OPTION_CONTEXT_END) || 166*cdf0e10cSrcweir nToken==HTML_O_VALUE, 167*cdf0e10cSrcweir "GetNumber: Option ist nicht numerisch" ); 168*cdf0e10cSrcweir String aTmp( aValue ); 169*cdf0e10cSrcweir aTmp.EraseLeadingChars(); 170*cdf0e10cSrcweir sal_Int32 nTmp = aTmp.ToInt32(); 171*cdf0e10cSrcweir return nTmp >= 0 ? (sal_uInt32)nTmp : 0; 172*cdf0e10cSrcweir } 173*cdf0e10cSrcweir 174*cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const 175*cdf0e10cSrcweir { 176*cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) || 177*cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END), 178*cdf0e10cSrcweir "GetSNumber: Option ist nicht numerisch" ); 179*cdf0e10cSrcweir String aTmp( aValue ); 180*cdf0e10cSrcweir aTmp.EraseLeadingChars(); 181*cdf0e10cSrcweir return aTmp.ToInt32(); 182*cdf0e10cSrcweir } 183*cdf0e10cSrcweir 184*cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const 185*cdf0e10cSrcweir { 186*cdf0e10cSrcweir if( rLongs.Count() ) 187*cdf0e10cSrcweir rLongs.Remove( 0, rLongs.Count() ); 188*cdf0e10cSrcweir 189*cdf0e10cSrcweir if( bSpaceDelim ) 190*cdf0e10cSrcweir { 191*cdf0e10cSrcweir // das ist ein sehr stark vereinfachter Scanner. 
Er sucht einfach 192*cdf0e10cSrcweir // alle Tiffern aus dem String 193*cdf0e10cSrcweir sal_Bool bInNum = sal_False; 194*cdf0e10cSrcweir sal_uLong nNum = 0; 195*cdf0e10cSrcweir for( xub_StrLen i=0; i<aValue.Len(); i++ ) 196*cdf0e10cSrcweir { 197*cdf0e10cSrcweir register sal_Unicode c = aValue.GetChar( i ); 198*cdf0e10cSrcweir if( c>='0' && c<='9' ) 199*cdf0e10cSrcweir { 200*cdf0e10cSrcweir nNum *= 10; 201*cdf0e10cSrcweir nNum += (c - '0'); 202*cdf0e10cSrcweir bInNum = sal_True; 203*cdf0e10cSrcweir } 204*cdf0e10cSrcweir else if( bInNum ) 205*cdf0e10cSrcweir { 206*cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 207*cdf0e10cSrcweir bInNum = sal_False; 208*cdf0e10cSrcweir nNum = 0; 209*cdf0e10cSrcweir } 210*cdf0e10cSrcweir } 211*cdf0e10cSrcweir if( bInNum ) 212*cdf0e10cSrcweir { 213*cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 214*cdf0e10cSrcweir } 215*cdf0e10cSrcweir } 216*cdf0e10cSrcweir else 217*cdf0e10cSrcweir { 218*cdf0e10cSrcweir // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet 219*cdf0e10cSrcweir // und auch mal eine 0 eingefuegt 220*cdf0e10cSrcweir xub_StrLen nPos = 0; 221*cdf0e10cSrcweir while( nPos < aValue.Len() ) 222*cdf0e10cSrcweir { 223*cdf0e10cSrcweir register sal_Unicode c; 224*cdf0e10cSrcweir while( nPos < aValue.Len() && 225*cdf0e10cSrcweir ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' || 226*cdf0e10cSrcweir c == '\n' || c== '\r' ) ) 227*cdf0e10cSrcweir nPos++; 228*cdf0e10cSrcweir 229*cdf0e10cSrcweir if( nPos==aValue.Len() ) 230*cdf0e10cSrcweir rLongs.Insert( sal_uLong(0), rLongs.Count() ); 231*cdf0e10cSrcweir else 232*cdf0e10cSrcweir { 233*cdf0e10cSrcweir xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos ); 234*cdf0e10cSrcweir if( STRING_NOTFOUND==nEnd ) 235*cdf0e10cSrcweir { 236*cdf0e10cSrcweir sal_Int32 nTmp = aValue.Copy(nPos).ToInt32(); 237*cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? 
(sal_uInt32)nTmp : 0, 238*cdf0e10cSrcweir rLongs.Count() ); 239*cdf0e10cSrcweir nPos = aValue.Len(); 240*cdf0e10cSrcweir } 241*cdf0e10cSrcweir else 242*cdf0e10cSrcweir { 243*cdf0e10cSrcweir sal_Int32 nTmp = 244*cdf0e10cSrcweir aValue.Copy(nPos,nEnd-nPos).ToInt32(); 245*cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0, 246*cdf0e10cSrcweir rLongs.Count() ); 247*cdf0e10cSrcweir nPos = nEnd+1; 248*cdf0e10cSrcweir } 249*cdf0e10cSrcweir } 250*cdf0e10cSrcweir } 251*cdf0e10cSrcweir } 252*cdf0e10cSrcweir } 253*cdf0e10cSrcweir 254*cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const 255*cdf0e10cSrcweir { 256*cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE, 257*cdf0e10cSrcweir "GetColor: Option spezifiziert keine Farbe" ); 258*cdf0e10cSrcweir 259*cdf0e10cSrcweir String aTmp( aValue ); 260*cdf0e10cSrcweir aTmp.ToUpperAscii(); 261*cdf0e10cSrcweir sal_uLong nColor = ULONG_MAX; 262*cdf0e10cSrcweir if( '#'!=aTmp.GetChar( 0 ) ) 263*cdf0e10cSrcweir nColor = GetHTMLColor( aTmp ); 264*cdf0e10cSrcweir 265*cdf0e10cSrcweir if( ULONG_MAX == nColor ) 266*cdf0e10cSrcweir { 267*cdf0e10cSrcweir nColor = 0; 268*cdf0e10cSrcweir xub_StrLen nPos = 0; 269*cdf0e10cSrcweir for( sal_uInt32 i=0; i<6; i++ ) 270*cdf0e10cSrcweir { 271*cdf0e10cSrcweir // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt, 272*cdf0e10cSrcweir // maximal drei Zeichen, die kleiner als '0' sind werden 273*cdf0e10cSrcweir // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich 274*cdf0e10cSrcweir // irgendwelche HTML-Autoren noch so einfallen lassen... 275*cdf0e10cSrcweir register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ ) 276*cdf0e10cSrcweir : '0'; 277*cdf0e10cSrcweir if( c < '0' ) 278*cdf0e10cSrcweir { 279*cdf0e10cSrcweir c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0'; 280*cdf0e10cSrcweir if( c < '0' ) 281*cdf0e10cSrcweir c = nPos<aTmp.Len() ? 
aTmp.GetChar(nPos++) : '0'; 282*cdf0e10cSrcweir } 283*cdf0e10cSrcweir nColor *= 16; 284*cdf0e10cSrcweir if( c >= '0' && c <= '9' ) 285*cdf0e10cSrcweir nColor += (c - 48); 286*cdf0e10cSrcweir else if( c >= 'A' && c <= 'F' ) 287*cdf0e10cSrcweir nColor += (c - 55); 288*cdf0e10cSrcweir } 289*cdf0e10cSrcweir } 290*cdf0e10cSrcweir 291*cdf0e10cSrcweir rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) ); 292*cdf0e10cSrcweir rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8)); 293*cdf0e10cSrcweir rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) ); 294*cdf0e10cSrcweir } 295*cdf0e10cSrcweir 296*cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const 297*cdf0e10cSrcweir { 298*cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" ); 299*cdf0e10cSrcweir return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT ); 300*cdf0e10cSrcweir } 301*cdf0e10cSrcweir 302*cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const 303*cdf0e10cSrcweir { 304*cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" ); 305*cdf0e10cSrcweir return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID ); 306*cdf0e10cSrcweir } 307*cdf0e10cSrcweir 308*cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const 309*cdf0e10cSrcweir { 310*cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" ); 311*cdf0e10cSrcweir return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE ); 312*cdf0e10cSrcweir } 313*cdf0e10cSrcweir 314*cdf0e10cSrcweir /* */ 315*cdf0e10cSrcweir 316*cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc ) 317*cdf0e10cSrcweir : SvParser( rIn ) 318*cdf0e10cSrcweir { 319*cdf0e10cSrcweir bNewDoc = bReadNewDoc; 320*cdf0e10cSrcweir bReadListing = bReadXMP = bReadPRE = bReadTextArea = 321*cdf0e10cSrcweir bReadScript = bReadStyle = 322*cdf0e10cSrcweir bEndTokenFound = bIsInBody = bReadNextChar = 323*cdf0e10cSrcweir bReadComment = sal_False; 
324*cdf0e10cSrcweir bIsInHeader = sal_True; 325*cdf0e10cSrcweir pOptions = new HTMLOptions; 326*cdf0e10cSrcweir } 327*cdf0e10cSrcweir 328*cdf0e10cSrcweir HTMLParser::~HTMLParser() 329*cdf0e10cSrcweir { 330*cdf0e10cSrcweir if( pOptions && pOptions->Count() ) 331*cdf0e10cSrcweir pOptions->DeleteAndDestroy( 0, pOptions->Count() ); 332*cdf0e10cSrcweir delete pOptions; 333*cdf0e10cSrcweir } 334*cdf0e10cSrcweir 335*cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser() 336*cdf0e10cSrcweir { 337*cdf0e10cSrcweir eState = SVPAR_WORKING; 338*cdf0e10cSrcweir nNextCh = GetNextChar(); 339*cdf0e10cSrcweir SaveState( 0 ); 340*cdf0e10cSrcweir 341*cdf0e10cSrcweir nPre_LinePos = 0; 342*cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 343*cdf0e10cSrcweir 344*cdf0e10cSrcweir AddRef(); 345*cdf0e10cSrcweir Continue( 0 ); 346*cdf0e10cSrcweir if( SVPAR_PENDING != eState ) 347*cdf0e10cSrcweir ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 348*cdf0e10cSrcweir 349*cdf0e10cSrcweir return eState; 350*cdf0e10cSrcweir } 351*cdf0e10cSrcweir 352*cdf0e10cSrcweir void HTMLParser::Continue( int nToken ) 353*cdf0e10cSrcweir { 354*cdf0e10cSrcweir if( !nToken ) 355*cdf0e10cSrcweir nToken = GetNextToken(); 356*cdf0e10cSrcweir 357*cdf0e10cSrcweir while( IsParserWorking() ) 358*cdf0e10cSrcweir { 359*cdf0e10cSrcweir SaveState( nToken ); 360*cdf0e10cSrcweir nToken = FilterToken( nToken ); 361*cdf0e10cSrcweir 362*cdf0e10cSrcweir if( nToken ) 363*cdf0e10cSrcweir NextToken( nToken ); 364*cdf0e10cSrcweir 365*cdf0e10cSrcweir if( IsParserWorking() ) 366*cdf0e10cSrcweir SaveState( 0 ); // bis hierhin abgearbeitet, 367*cdf0e10cSrcweir // weiter mit neuem Token! 
368*cdf0e10cSrcweir nToken = GetNextToken(); 369*cdf0e10cSrcweir } 370*cdf0e10cSrcweir } 371*cdf0e10cSrcweir 372*cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken ) 373*cdf0e10cSrcweir { 374*cdf0e10cSrcweir switch( nToken ) 375*cdf0e10cSrcweir { 376*cdf0e10cSrcweir case sal_Unicode(EOF): 377*cdf0e10cSrcweir nToken = 0; 378*cdf0e10cSrcweir break; // nicht verschicken 379*cdf0e10cSrcweir 380*cdf0e10cSrcweir case HTML_HEAD_OFF: 381*cdf0e10cSrcweir bIsInBody = sal_True; 382*cdf0e10cSrcweir case HTML_HEAD_ON: 383*cdf0e10cSrcweir bIsInHeader = HTML_HEAD_ON == nToken; 384*cdf0e10cSrcweir break; 385*cdf0e10cSrcweir 386*cdf0e10cSrcweir case HTML_BODY_ON: 387*cdf0e10cSrcweir case HTML_FRAMESET_ON: 388*cdf0e10cSrcweir bIsInHeader = sal_False; 389*cdf0e10cSrcweir bIsInBody = HTML_BODY_ON == nToken; 390*cdf0e10cSrcweir break; 391*cdf0e10cSrcweir 392*cdf0e10cSrcweir case HTML_BODY_OFF: 393*cdf0e10cSrcweir bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False; 394*cdf0e10cSrcweir break; 395*cdf0e10cSrcweir 396*cdf0e10cSrcweir case HTML_HTML_OFF: 397*cdf0e10cSrcweir nToken = 0; 398*cdf0e10cSrcweir bReadPRE = bReadListing = bReadXMP = sal_False; 399*cdf0e10cSrcweir break; // HTML_ON wurde auch nicht verschickt ! 
400*cdf0e10cSrcweir 401*cdf0e10cSrcweir case HTML_PREFORMTXT_ON: 402*cdf0e10cSrcweir StartPRE(); 403*cdf0e10cSrcweir break; 404*cdf0e10cSrcweir 405*cdf0e10cSrcweir case HTML_PREFORMTXT_OFF: 406*cdf0e10cSrcweir FinishPRE(); 407*cdf0e10cSrcweir break; 408*cdf0e10cSrcweir 409*cdf0e10cSrcweir case HTML_LISTING_ON: 410*cdf0e10cSrcweir StartListing(); 411*cdf0e10cSrcweir break; 412*cdf0e10cSrcweir 413*cdf0e10cSrcweir case HTML_LISTING_OFF: 414*cdf0e10cSrcweir FinishListing(); 415*cdf0e10cSrcweir break; 416*cdf0e10cSrcweir 417*cdf0e10cSrcweir case HTML_XMP_ON: 418*cdf0e10cSrcweir StartXMP(); 419*cdf0e10cSrcweir break; 420*cdf0e10cSrcweir 421*cdf0e10cSrcweir case HTML_XMP_OFF: 422*cdf0e10cSrcweir FinishXMP(); 423*cdf0e10cSrcweir break; 424*cdf0e10cSrcweir 425*cdf0e10cSrcweir default: 426*cdf0e10cSrcweir if( bReadPRE ) 427*cdf0e10cSrcweir nToken = FilterPRE( nToken ); 428*cdf0e10cSrcweir else if( bReadListing ) 429*cdf0e10cSrcweir nToken = FilterListing( nToken ); 430*cdf0e10cSrcweir else if( bReadXMP ) 431*cdf0e10cSrcweir nToken = FilterXMP( nToken ); 432*cdf0e10cSrcweir 433*cdf0e10cSrcweir break; 434*cdf0e10cSrcweir } 435*cdf0e10cSrcweir 436*cdf0e10cSrcweir return nToken; 437*cdf0e10cSrcweir } 438*cdf0e10cSrcweir 439*cdf0e10cSrcweir #define HTML_ISDIGIT( c ) (c >= '0' && c <= '9') 440*cdf0e10cSrcweir #define HTML_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 441*cdf0e10cSrcweir #define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) ) 442*cdf0e10cSrcweir #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) ) 443*cdf0e10cSrcweir #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127) 444*cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 445*cdf0e10cSrcweir #define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') ) 446*cdf0e10cSrcweir // <-- 447*cdf0e10cSrcweir 448*cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak ) 449*cdf0e10cSrcweir { 450*cdf0e10cSrcweir ::rtl::OUStringBuffer 
sTmpBuffer( MAX_LEN ); 451*cdf0e10cSrcweir int bWeiter = sal_True; 452*cdf0e10cSrcweir int bEqSignFound = sal_False; 453*cdf0e10cSrcweir sal_Unicode cQuote = 0U; 454*cdf0e10cSrcweir 455*cdf0e10cSrcweir while( bWeiter && IsParserWorking() ) 456*cdf0e10cSrcweir { 457*cdf0e10cSrcweir int bNextCh = sal_True; 458*cdf0e10cSrcweir switch( nNextCh ) 459*cdf0e10cSrcweir { 460*cdf0e10cSrcweir case '&': 461*cdf0e10cSrcweir bEqSignFound = sal_False; 462*cdf0e10cSrcweir if( bReadXMP ) 463*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 464*cdf0e10cSrcweir else 465*cdf0e10cSrcweir { 466*cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell(); 467*cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos(); 468*cdf0e10cSrcweir 469*cdf0e10cSrcweir sal_Unicode cChar = 0U; 470*cdf0e10cSrcweir if( '#' == (nNextCh = GetNextChar()) ) 471*cdf0e10cSrcweir { 472*cdf0e10cSrcweir nNextCh = GetNextChar(); 473*cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 474*cdf0e10cSrcweir // consider hexadecimal digits 475*cdf0e10cSrcweir const sal_Bool bIsHex( 'x' == nNextCh ); 476*cdf0e10cSrcweir const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) ); 477*cdf0e10cSrcweir if ( bIsDecOrHex ) 478*cdf0e10cSrcweir { 479*cdf0e10cSrcweir if ( bIsHex ) 480*cdf0e10cSrcweir { 481*cdf0e10cSrcweir nNextCh = GetNextChar(); 482*cdf0e10cSrcweir while ( HTML_ISHEXDIGIT(nNextCh) ) 483*cdf0e10cSrcweir { 484*cdf0e10cSrcweir cChar = cChar * 16U + 485*cdf0e10cSrcweir ( nNextCh <= '9' 486*cdf0e10cSrcweir ? sal_Unicode( nNextCh - '0' ) 487*cdf0e10cSrcweir : ( nNextCh <= 'F' 488*cdf0e10cSrcweir ? 
sal_Unicode( nNextCh - 'A' + 10 ) 489*cdf0e10cSrcweir : sal_Unicode( nNextCh - 'a' + 10 ) ) ); 490*cdf0e10cSrcweir nNextCh = GetNextChar(); 491*cdf0e10cSrcweir } 492*cdf0e10cSrcweir } 493*cdf0e10cSrcweir else 494*cdf0e10cSrcweir { 495*cdf0e10cSrcweir do 496*cdf0e10cSrcweir { 497*cdf0e10cSrcweir cChar = cChar * 10U + sal_Unicode( nNextCh - '0'); 498*cdf0e10cSrcweir nNextCh = GetNextChar(); 499*cdf0e10cSrcweir } 500*cdf0e10cSrcweir while( HTML_ISDIGIT(nNextCh) ); 501*cdf0e10cSrcweir } 502*cdf0e10cSrcweir 503*cdf0e10cSrcweir if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc && 504*cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 != eSrcEnc && 505*cdf0e10cSrcweir RTL_TEXTENCODING_UTF8 != eSrcEnc && 506*cdf0e10cSrcweir cChar < 256 ) 507*cdf0e10cSrcweir { 508*cdf0e10cSrcweir sal_Unicode cOrig = cChar; 509*cdf0e10cSrcweir cChar = ByteString::ConvertToUnicode( 510*cdf0e10cSrcweir (sal_Char)cChar, eSrcEnc ); 511*cdf0e10cSrcweir if( 0U == cChar ) 512*cdf0e10cSrcweir { 513*cdf0e10cSrcweir // #73398#: If the character could not be 514*cdf0e10cSrcweir // converted, because a conversion is not 515*cdf0e10cSrcweir // available, do no conversion at all. 
516*cdf0e10cSrcweir cChar = cOrig; 517*cdf0e10cSrcweir } 518*cdf0e10cSrcweir } 519*cdf0e10cSrcweir } 520*cdf0e10cSrcweir // <-- 521*cdf0e10cSrcweir else 522*cdf0e10cSrcweir nNextCh = 0U; 523*cdf0e10cSrcweir } 524*cdf0e10cSrcweir else if( HTML_ISALPHA( nNextCh ) ) 525*cdf0e10cSrcweir { 526*cdf0e10cSrcweir ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN ); 527*cdf0e10cSrcweir xub_StrLen nPos = 0L; 528*cdf0e10cSrcweir do 529*cdf0e10cSrcweir { 530*cdf0e10cSrcweir sEntityBuffer.append( nNextCh ); 531*cdf0e10cSrcweir nPos++; 532*cdf0e10cSrcweir nNextCh = GetNextChar(); 533*cdf0e10cSrcweir } 534*cdf0e10cSrcweir while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) && 535*cdf0e10cSrcweir !rInput.IsEof() ); 536*cdf0e10cSrcweir 537*cdf0e10cSrcweir if( IsParserWorking() && !rInput.IsEof() ) 538*cdf0e10cSrcweir { 539*cdf0e10cSrcweir String sEntity( sEntityBuffer.getStr(), nPos ); 540*cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 541*cdf0e10cSrcweir 542*cdf0e10cSrcweir // nicht gefunden ( == 0 ), dann Klartext 543*cdf0e10cSrcweir // oder ein Zeichen das als Attribut eingefuegt 544*cdf0e10cSrcweir // wird 545*cdf0e10cSrcweir if( 0U == cChar && ';' != nNextCh ) 546*cdf0e10cSrcweir { 547*cdf0e10cSrcweir DBG_ASSERT( rInput.Tell() - nStreamPos == 548*cdf0e10cSrcweir (sal_uLong)(nPos+1L)*GetCharSize(), 549*cdf0e10cSrcweir "UTF-8 geht hier schief" ); 550*cdf0e10cSrcweir for( xub_StrLen i=nPos-1L; i>1L; i-- ) 551*cdf0e10cSrcweir { 552*cdf0e10cSrcweir nNextCh = sEntityBuffer[i]; 553*cdf0e10cSrcweir sEntityBuffer.setLength( i ); 554*cdf0e10cSrcweir sEntity.Assign( sEntityBuffer.getStr(), i ); 555*cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 556*cdf0e10cSrcweir if( cChar ) 557*cdf0e10cSrcweir { 558*cdf0e10cSrcweir rInput.SeekRel( -(long) 559*cdf0e10cSrcweir ((nPos-i)*GetCharSize()) ); 560*cdf0e10cSrcweir nlLinePos -= sal_uInt32(nPos-i); 561*cdf0e10cSrcweir nPos = i; 562*cdf0e10cSrcweir ClearTxtConvContext(); 563*cdf0e10cSrcweir break; 564*cdf0e10cSrcweir } 
565*cdf0e10cSrcweir } 566*cdf0e10cSrcweir } 567*cdf0e10cSrcweir 568*cdf0e10cSrcweir if( !cChar ) // unbekanntes Zeichen? 569*cdf0e10cSrcweir { 570*cdf0e10cSrcweir // dann im Stream zurueck, das '&' als Zeichen 571*cdf0e10cSrcweir // einfuegen und mit dem nachfolgenden Zeichen 572*cdf0e10cSrcweir // wieder aufsetzen 573*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 574*cdf0e10cSrcweir 575*cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 576*cdf0e10cSrcweir // nlLinePos -= nPos; 577*cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 578*cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 579*cdf0e10cSrcweir "Falsche Stream-Position" ); 580*cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 581*cdf0e10cSrcweir (sal_uLong)(nPos+1), 582*cdf0e10cSrcweir "Falsche Zeilen-Position" ); 583*cdf0e10cSrcweir rInput.Seek( nStreamPos ); 584*cdf0e10cSrcweir nlLinePos = nLinePos; 585*cdf0e10cSrcweir ClearTxtConvContext(); 586*cdf0e10cSrcweir break; 587*cdf0e10cSrcweir } 588*cdf0e10cSrcweir 589*cdf0e10cSrcweir // 1 == Non Breaking Space 590*cdf0e10cSrcweir // 2 == SoftHyphen 591*cdf0e10cSrcweir 592*cdf0e10cSrcweir if( cChar < 3U ) 593*cdf0e10cSrcweir { 594*cdf0e10cSrcweir if( '>' == cBreak ) 595*cdf0e10cSrcweir { 596*cdf0e10cSrcweir // Wenn der Inhalt eines Tags gelesen wird, 597*cdf0e10cSrcweir // muessen wir ein Space bzw. - daraus machen 598*cdf0e10cSrcweir switch( cChar ) 599*cdf0e10cSrcweir { 600*cdf0e10cSrcweir case 1U: cChar = ' '; break; 601*cdf0e10cSrcweir case 2U: cChar = '-'; break; 602*cdf0e10cSrcweir default: 603*cdf0e10cSrcweir DBG_ASSERT( cChar==1U, 604*cdf0e10cSrcweir "\0x00 sollte doch schon laengt abgefangen sein!" 
); 605*cdf0e10cSrcweir break; 606*cdf0e10cSrcweir } 607*cdf0e10cSrcweir } 608*cdf0e10cSrcweir else 609*cdf0e10cSrcweir { 610*cdf0e10cSrcweir // Wenn kein Tag gescannt wird, enstprechendes 611*cdf0e10cSrcweir // Token zurueckgeben 612*cdf0e10cSrcweir aToken += 613*cdf0e10cSrcweir String( sTmpBuffer.makeStringAndClear() ); 614*cdf0e10cSrcweir if( cChar ) 615*cdf0e10cSrcweir { 616*cdf0e10cSrcweir if( aToken.Len() ) 617*cdf0e10cSrcweir { 618*cdf0e10cSrcweir // mit dem Zeichen wieder aufsetzen 619*cdf0e10cSrcweir nNextCh = '&'; 620*cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 621*cdf0e10cSrcweir // nlLinePos -= nPos; 622*cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 623*cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 624*cdf0e10cSrcweir "Falsche Stream-Position" ); 625*cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 626*cdf0e10cSrcweir (sal_uLong)(nPos+1), 627*cdf0e10cSrcweir "Falsche Zeilen-Position" ); 628*cdf0e10cSrcweir rInput.Seek( nStreamPos ); 629*cdf0e10cSrcweir nlLinePos = nLinePos; 630*cdf0e10cSrcweir ClearTxtConvContext(); 631*cdf0e10cSrcweir return HTML_TEXTTOKEN; 632*cdf0e10cSrcweir } 633*cdf0e10cSrcweir 634*cdf0e10cSrcweir // Hack: _GetNextChar soll nicht das 635*cdf0e10cSrcweir // naechste Zeichen lesen 636*cdf0e10cSrcweir if( ';' != nNextCh ) 637*cdf0e10cSrcweir aToken += ' '; 638*cdf0e10cSrcweir if( 1U == cChar ) 639*cdf0e10cSrcweir return HTML_NONBREAKSPACE; 640*cdf0e10cSrcweir if( 2U == cChar ) 641*cdf0e10cSrcweir return HTML_SOFTHYPH; 642*cdf0e10cSrcweir } 643*cdf0e10cSrcweir aToken += (sal_Unicode)'&'; 644*cdf0e10cSrcweir aToken += 645*cdf0e10cSrcweir String(sEntityBuffer.makeStringAndClear()); 646*cdf0e10cSrcweir break; 647*cdf0e10cSrcweir } 648*cdf0e10cSrcweir } 649*cdf0e10cSrcweir } 650*cdf0e10cSrcweir else 651*cdf0e10cSrcweir nNextCh = 0U; 652*cdf0e10cSrcweir } 653*cdf0e10cSrcweir // MIB 03/02/2000: &{...};-JavaScript-Macros are not 654*cdf0e10cSrcweir // supported any longer. 
655*cdf0e10cSrcweir else if( IsParserWorking() ) 656*cdf0e10cSrcweir { 657*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 658*cdf0e10cSrcweir bNextCh = sal_False; 659*cdf0e10cSrcweir break; 660*cdf0e10cSrcweir } 661*cdf0e10cSrcweir 662*cdf0e10cSrcweir bNextCh = (';' == nNextCh); 663*cdf0e10cSrcweir if( cBreak=='>' && (cChar=='\\' || cChar=='\'' || 664*cdf0e10cSrcweir cChar=='\"' || cChar==' ') ) 665*cdf0e10cSrcweir { 666*cdf0e10cSrcweir // ' und " mussen innerhalb von Tags mit einem 667*cdf0e10cSrcweir // gekennzeichnet werden, um sie von ' und " als Klammern 668*cdf0e10cSrcweir // um Optionen zu unterscheiden. Logischerweise muss 669*cdf0e10cSrcweir // deshalb auch ein \ gekeenzeichnet werden. Ausserdem 670*cdf0e10cSrcweir // schuetzen wir ein Space, weil es kein Trennzeichen 671*cdf0e10cSrcweir // zwischen Optionen ist. 672*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 673*cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 674*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 675*cdf0e10cSrcweir } 676*cdf0e10cSrcweir if( IsParserWorking() ) 677*cdf0e10cSrcweir { 678*cdf0e10cSrcweir if( cChar ) 679*cdf0e10cSrcweir sTmpBuffer.append( cChar ); 680*cdf0e10cSrcweir } 681*cdf0e10cSrcweir else if( SVPAR_PENDING==eState && '>'!=cBreak ) 682*cdf0e10cSrcweir { 683*cdf0e10cSrcweir // Mit dem '&' Zeichen wieder aufsetzen, der Rest 684*cdf0e10cSrcweir // wird als Texttoken zurueckgegeben. 685*cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() ) 686*cdf0e10cSrcweir { 687*cdf0e10cSrcweir // Der bisherige Text wird von _GetNextChar() 688*cdf0e10cSrcweir // zurueckgegeben und beim naechsten Aufruf wird 689*cdf0e10cSrcweir // ein neues Zeichen gelesen. Also muessen wir uns 690*cdf0e10cSrcweir // noch vor das & stellen. 
691*cdf0e10cSrcweir nNextCh = 0U; 692*cdf0e10cSrcweir rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() ); 693*cdf0e10cSrcweir nlLinePos = nLinePos-1; 694*cdf0e10cSrcweir ClearTxtConvContext(); 695*cdf0e10cSrcweir bReadNextChar = sal_True; 696*cdf0e10cSrcweir } 697*cdf0e10cSrcweir bNextCh = sal_False; 698*cdf0e10cSrcweir } 699*cdf0e10cSrcweir } 700*cdf0e10cSrcweir break; 701*cdf0e10cSrcweir case '=': 702*cdf0e10cSrcweir if( '>'==cBreak && !cQuote ) 703*cdf0e10cSrcweir bEqSignFound = sal_True; 704*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 705*cdf0e10cSrcweir break; 706*cdf0e10cSrcweir 707*cdf0e10cSrcweir case '\\': 708*cdf0e10cSrcweir if( '>'==cBreak ) 709*cdf0e10cSrcweir { 710*cdf0e10cSrcweir // Innerhalb von Tags kennzeichnen 711*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 712*cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 713*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 714*cdf0e10cSrcweir } 715*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 716*cdf0e10cSrcweir break; 717*cdf0e10cSrcweir 718*cdf0e10cSrcweir case '\"': 719*cdf0e10cSrcweir case '\'': 720*cdf0e10cSrcweir if( '>'==cBreak ) 721*cdf0e10cSrcweir { 722*cdf0e10cSrcweir if( bEqSignFound ) 723*cdf0e10cSrcweir cQuote = nNextCh; 724*cdf0e10cSrcweir else if( cQuote && (cQuote==nNextCh ) ) 725*cdf0e10cSrcweir cQuote = 0U; 726*cdf0e10cSrcweir } 727*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 728*cdf0e10cSrcweir bEqSignFound = sal_False; 729*cdf0e10cSrcweir break; 730*cdf0e10cSrcweir 731*cdf0e10cSrcweir case sal_Unicode(EOF): 732*cdf0e10cSrcweir if( rInput.IsEof() ) 733*cdf0e10cSrcweir { 734*cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird 735*cdf0e10cSrcweir // abcä<EOF> nicht angezeigt, also lassen wir das in Zukunft. 
736*cdf0e10cSrcweir // if( '>' != cBreak ) 737*cdf0e10cSrcweir // eState = SVPAR_ACCEPTED; 738*cdf0e10cSrcweir bWeiter = sal_False; 739*cdf0e10cSrcweir } 740*cdf0e10cSrcweir else 741*cdf0e10cSrcweir { 742*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 743*cdf0e10cSrcweir } 744*cdf0e10cSrcweir break; 745*cdf0e10cSrcweir 746*cdf0e10cSrcweir case '<': 747*cdf0e10cSrcweir bEqSignFound = sal_False; 748*cdf0e10cSrcweir if( '>'==cBreak ) 749*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 750*cdf0e10cSrcweir else 751*cdf0e10cSrcweir bWeiter = sal_False; // Abbrechen, String zusammen 752*cdf0e10cSrcweir break; 753*cdf0e10cSrcweir 754*cdf0e10cSrcweir case '\f': 755*cdf0e10cSrcweir if( '>' == cBreak ) 756*cdf0e10cSrcweir { 757*cdf0e10cSrcweir // Beim Scannen von Optionen wie ein Space behandeln 758*cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)' ' ); 759*cdf0e10cSrcweir } 760*cdf0e10cSrcweir else 761*cdf0e10cSrcweir { 762*cdf0e10cSrcweir // sonst wird es ein eigenes Token 763*cdf0e10cSrcweir bWeiter = sal_False; 764*cdf0e10cSrcweir } 765*cdf0e10cSrcweir break; 766*cdf0e10cSrcweir 767*cdf0e10cSrcweir case '\r': 768*cdf0e10cSrcweir case '\n': 769*cdf0e10cSrcweir if( '>'==cBreak ) 770*cdf0e10cSrcweir { 771*cdf0e10cSrcweir // #26979# cr/lf in Tag wird in _GetNextToken() behandeln 772*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 773*cdf0e10cSrcweir break; 774*cdf0e10cSrcweir } 775*cdf0e10cSrcweir else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) 776*cdf0e10cSrcweir { 777*cdf0e10cSrcweir bWeiter = sal_False; 778*cdf0e10cSrcweir break; 779*cdf0e10cSrcweir } 780*cdf0e10cSrcweir // Bug 18984: CR-LF -> Blank 781*cdf0e10cSrcweir // Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln 782*cdf0e10cSrcweir // kein break!! 
783*cdf0e10cSrcweir case '\t': 784*cdf0e10cSrcweir if( '\t'==nNextCh && bReadPRE && '>'!=cBreak ) 785*cdf0e10cSrcweir { 786*cdf0e10cSrcweir // In <PRE>: Tabs nach oben durchreichen 787*cdf0e10cSrcweir bWeiter = sal_False; 788*cdf0e10cSrcweir break; 789*cdf0e10cSrcweir } 790*cdf0e10cSrcweir // kein break 791*cdf0e10cSrcweir case '\x0b': 792*cdf0e10cSrcweir if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && 793*cdf0e10cSrcweir '>'!=cBreak ) 794*cdf0e10cSrcweir { 795*cdf0e10cSrcweir break; 796*cdf0e10cSrcweir } 797*cdf0e10cSrcweir nNextCh = ' '; 798*cdf0e10cSrcweir // kein break; 799*cdf0e10cSrcweir case ' ': 800*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 801*cdf0e10cSrcweir if( '>'!=cBreak && (!bReadListing && !bReadXMP && 802*cdf0e10cSrcweir !bReadPRE && !bReadTextArea) ) 803*cdf0e10cSrcweir { 804*cdf0e10cSrcweir // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln 805*cdf0e10cSrcweir do { 806*cdf0e10cSrcweir if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && 807*cdf0e10cSrcweir rInput.IsEof() ) 808*cdf0e10cSrcweir { 809*cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() > 1L ) 810*cdf0e10cSrcweir { 811*cdf0e10cSrcweir // ausser den Blanks wurde noch etwas geselen 812*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 813*cdf0e10cSrcweir return HTML_TEXTTOKEN; 814*cdf0e10cSrcweir } 815*cdf0e10cSrcweir else 816*cdf0e10cSrcweir // nur Blanks gelesen: dann darf kein Text 817*cdf0e10cSrcweir // mehr zurueckgegeben werden und _GetNextToken 818*cdf0e10cSrcweir // muss auf EOF laufen 819*cdf0e10cSrcweir return 0; 820*cdf0e10cSrcweir } 821*cdf0e10cSrcweir } while ( ' ' == nNextCh || '\t' == nNextCh || 822*cdf0e10cSrcweir '\r' == nNextCh || '\n' == nNextCh || 823*cdf0e10cSrcweir '\x0b' == nNextCh ); 824*cdf0e10cSrcweir bNextCh = sal_False; 825*cdf0e10cSrcweir } 826*cdf0e10cSrcweir break; 827*cdf0e10cSrcweir 828*cdf0e10cSrcweir default: 829*cdf0e10cSrcweir bEqSignFound = sal_False; 830*cdf0e10cSrcweir if( (nNextCh==cBreak && 
!cQuote) || 831*cdf0e10cSrcweir (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 )) 832*cdf0e10cSrcweir bWeiter = sal_False; 833*cdf0e10cSrcweir else 834*cdf0e10cSrcweir { 835*cdf0e10cSrcweir do { 836*cdf0e10cSrcweir // alle anderen Zeichen kommen in den Text 837*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 838*cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 839*cdf0e10cSrcweir { 840*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 841*cdf0e10cSrcweir if( (sal_uLong(aToken.Len()) + MAX_LEN) > 842*cdf0e10cSrcweir sal_uLong(STRING_MAXLEN & ~1 ) ) 843*cdf0e10cSrcweir { 844*cdf0e10cSrcweir nNextCh = GetNextChar(); 845*cdf0e10cSrcweir return HTML_TEXTTOKEN; 846*cdf0e10cSrcweir } 847*cdf0e10cSrcweir } 848*cdf0e10cSrcweir if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && 849*cdf0e10cSrcweir rInput.IsEof() ) || 850*cdf0e10cSrcweir !IsParserWorking() ) 851*cdf0e10cSrcweir { 852*cdf0e10cSrcweir if( sTmpBuffer.getLength() ) 853*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 854*cdf0e10cSrcweir return HTML_TEXTTOKEN; 855*cdf0e10cSrcweir } 856*cdf0e10cSrcweir } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) ); 857*cdf0e10cSrcweir bNextCh = sal_False; 858*cdf0e10cSrcweir } 859*cdf0e10cSrcweir } 860*cdf0e10cSrcweir 861*cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 862*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 863*cdf0e10cSrcweir 864*cdf0e10cSrcweir if( bWeiter && bNextCh ) 865*cdf0e10cSrcweir nNextCh = GetNextChar(); 866*cdf0e10cSrcweir } 867*cdf0e10cSrcweir 868*cdf0e10cSrcweir if( sTmpBuffer.getLength() ) 869*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 870*cdf0e10cSrcweir 871*cdf0e10cSrcweir return HTML_TEXTTOKEN; 872*cdf0e10cSrcweir } 873*cdf0e10cSrcweir 874*cdf0e10cSrcweir int HTMLParser::_GetNextRawToken() 875*cdf0e10cSrcweir { 876*cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN ); 877*cdf0e10cSrcweir 878*cdf0e10cSrcweir if( 
bEndTokenFound ) 879*cdf0e10cSrcweir { 880*cdf0e10cSrcweir // beim letzten Aufruf haben wir das End-Token bereits gefunden, 881*cdf0e10cSrcweir // deshalb muessen wir es nicht noch einmal suchen 882*cdf0e10cSrcweir bReadScript = sal_False; 883*cdf0e10cSrcweir bReadStyle = sal_False; 884*cdf0e10cSrcweir aEndToken.Erase(); 885*cdf0e10cSrcweir bEndTokenFound = sal_False; 886*cdf0e10cSrcweir 887*cdf0e10cSrcweir return 0; 888*cdf0e10cSrcweir } 889*cdf0e10cSrcweir 890*cdf0e10cSrcweir // per default geben wir HTML_RAWDATA zurueck 891*cdf0e10cSrcweir int bWeiter = sal_True; 892*cdf0e10cSrcweir int nToken = HTML_RAWDATA; 893*cdf0e10cSrcweir SaveState( 0 ); 894*cdf0e10cSrcweir while( bWeiter && IsParserWorking() ) 895*cdf0e10cSrcweir { 896*cdf0e10cSrcweir int bNextCh = sal_True; 897*cdf0e10cSrcweir switch( nNextCh ) 898*cdf0e10cSrcweir { 899*cdf0e10cSrcweir case '<': 900*cdf0e10cSrcweir { 901*cdf0e10cSrcweir // Vielleicht haben wir das Ende erreicht 902*cdf0e10cSrcweir 903*cdf0e10cSrcweir // das bisher gelesene erstmal retten 904*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 905*cdf0e10cSrcweir 906*cdf0e10cSrcweir // und die Position im Stream merken 907*cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell(); 908*cdf0e10cSrcweir sal_uLong nLineNr = GetLineNr(); 909*cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos(); 910*cdf0e10cSrcweir 911*cdf0e10cSrcweir // Start eines End-Token? 912*cdf0e10cSrcweir int bOffState = sal_False; 913*cdf0e10cSrcweir if( '/' == (nNextCh = GetNextChar()) ) 914*cdf0e10cSrcweir { 915*cdf0e10cSrcweir bOffState = sal_True; 916*cdf0e10cSrcweir nNextCh = GetNextChar(); 917*cdf0e10cSrcweir } 918*cdf0e10cSrcweir else if( '!' 
== nNextCh ) 919*cdf0e10cSrcweir { 920*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 921*cdf0e10cSrcweir nNextCh = GetNextChar(); 922*cdf0e10cSrcweir } 923*cdf0e10cSrcweir 924*cdf0e10cSrcweir // jetzt die Buchstaben danach lesen 925*cdf0e10cSrcweir while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) && 926*cdf0e10cSrcweir IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN ) 927*cdf0e10cSrcweir { 928*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 929*cdf0e10cSrcweir nNextCh = GetNextChar(); 930*cdf0e10cSrcweir } 931*cdf0e10cSrcweir 932*cdf0e10cSrcweir String aTok( sTmpBuffer.getStr(), 933*cdf0e10cSrcweir sal::static_int_cast< xub_StrLen >( 934*cdf0e10cSrcweir sTmpBuffer.getLength()) ); 935*cdf0e10cSrcweir aTok.ToUpperAscii(); 936*cdf0e10cSrcweir sal_Bool bDone = sal_False; 937*cdf0e10cSrcweir if( bReadScript || aEndToken.Len() ) 938*cdf0e10cSrcweir { 939*cdf0e10cSrcweir if( !bReadComment ) 940*cdf0e10cSrcweir { 941*cdf0e10cSrcweir if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 ) 942*cdf0e10cSrcweir == COMPARE_EQUAL ) 943*cdf0e10cSrcweir { 944*cdf0e10cSrcweir bReadComment = sal_True; 945*cdf0e10cSrcweir } 946*cdf0e10cSrcweir else 947*cdf0e10cSrcweir { 948*cdf0e10cSrcweir // ein Script muss mit "</SCRIPT>" aufhoehren, wobei 949*cdf0e10cSrcweir // wir es mit dem ">" aus sicherheitsgruenden 950*cdf0e10cSrcweir // erstmal nicht so genau nehmen 951*cdf0e10cSrcweir bDone = bOffState && // '>'==nNextCh && 952*cdf0e10cSrcweir COMPARE_EQUAL == ( bReadScript 953*cdf0e10cSrcweir ? 
aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script) 954*cdf0e10cSrcweir : aTok.CompareTo(aEndToken) ); 955*cdf0e10cSrcweir } 956*cdf0e10cSrcweir } 957*cdf0e10cSrcweir if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 && 958*cdf0e10cSrcweir aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) ) 959*cdf0e10cSrcweir { 960*cdf0e10cSrcweir // hier ist ein Kommentar der Art <!-----> zuende 961*cdf0e10cSrcweir bReadComment = sal_False; 962*cdf0e10cSrcweir } 963*cdf0e10cSrcweir } 964*cdf0e10cSrcweir else 965*cdf0e10cSrcweir { 966*cdf0e10cSrcweir // ein Style-Sheet kann mit </STYLE>, </HEAD> oder 967*cdf0e10cSrcweir // <BODY> aughoehren 968*cdf0e10cSrcweir if( bOffState ) 969*cdf0e10cSrcweir bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style) 970*cdf0e10cSrcweir == COMPARE_EQUAL || 971*cdf0e10cSrcweir aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head) 972*cdf0e10cSrcweir == COMPARE_EQUAL; 973*cdf0e10cSrcweir else 974*cdf0e10cSrcweir bDone = 975*cdf0e10cSrcweir aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL; 976*cdf0e10cSrcweir } 977*cdf0e10cSrcweir 978*cdf0e10cSrcweir if( bDone ) 979*cdf0e10cSrcweir { 980*cdf0e10cSrcweir // das war's, jetzt muessen wir gegebenenfalls den 981*cdf0e10cSrcweir // bisher gelesenen String zurueckgeben und dnach normal 982*cdf0e10cSrcweir // weitermachen 983*cdf0e10cSrcweir 984*cdf0e10cSrcweir bWeiter = sal_False; 985*cdf0e10cSrcweir 986*cdf0e10cSrcweir // nToken==0 heisst, dass _GetNextToken gleich weiterliest 987*cdf0e10cSrcweir if( !aToken.Len() && (bReadStyle || bReadScript) ) 988*cdf0e10cSrcweir { 989*cdf0e10cSrcweir // wir koennen sofort die Umgebung beeden und 990*cdf0e10cSrcweir // das End-Token parsen 991*cdf0e10cSrcweir bReadScript = sal_False; 992*cdf0e10cSrcweir bReadStyle = sal_False; 993*cdf0e10cSrcweir aEndToken.Erase(); 994*cdf0e10cSrcweir nToken = 0; 995*cdf0e10cSrcweir } 996*cdf0e10cSrcweir else 997*cdf0e10cSrcweir { 998*cdf0e10cSrcweir // wir muessen bReadScript/bReadStyle noch am 999*cdf0e10cSrcweir // 
Leben lassen und koennen erst beim naechsten 1000*cdf0e10cSrcweir // mal das End-Token Parsen 1001*cdf0e10cSrcweir bEndTokenFound = sal_True; 1002*cdf0e10cSrcweir } 1003*cdf0e10cSrcweir 1004*cdf0e10cSrcweir // jetzt fahren wir im Stream auf das '<' zurueck 1005*cdf0e10cSrcweir rInput.Seek( nStreamPos ); 1006*cdf0e10cSrcweir SetLineNr( nLineNr ); 1007*cdf0e10cSrcweir SetLinePos( nLinePos ); 1008*cdf0e10cSrcweir ClearTxtConvContext(); 1009*cdf0e10cSrcweir nNextCh = '<'; 1010*cdf0e10cSrcweir 1011*cdf0e10cSrcweir // den String wollen wir nicht an das Token haengen 1012*cdf0e10cSrcweir sTmpBuffer.setLength( 0L ); 1013*cdf0e10cSrcweir } 1014*cdf0e10cSrcweir else 1015*cdf0e10cSrcweir { 1016*cdf0e10cSrcweir // "</" merken, alles andere steht noch im buffer 1017*cdf0e10cSrcweir aToken += (sal_Unicode)'<'; 1018*cdf0e10cSrcweir if( bOffState ) 1019*cdf0e10cSrcweir aToken += (sal_Unicode)'/'; 1020*cdf0e10cSrcweir 1021*cdf0e10cSrcweir bNextCh = sal_False; 1022*cdf0e10cSrcweir } 1023*cdf0e10cSrcweir } 1024*cdf0e10cSrcweir break; 1025*cdf0e10cSrcweir case '-': 1026*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 1027*cdf0e10cSrcweir if( bReadComment ) 1028*cdf0e10cSrcweir { 1029*cdf0e10cSrcweir sal_Bool bTwoMinus = sal_False; 1030*cdf0e10cSrcweir nNextCh = GetNextChar(); 1031*cdf0e10cSrcweir while( '-' == nNextCh && IsParserWorking() ) 1032*cdf0e10cSrcweir { 1033*cdf0e10cSrcweir bTwoMinus = sal_True; 1034*cdf0e10cSrcweir 1035*cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 1036*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 1037*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 1038*cdf0e10cSrcweir nNextCh = GetNextChar(); 1039*cdf0e10cSrcweir } 1040*cdf0e10cSrcweir 1041*cdf0e10cSrcweir if( '>' == nNextCh && IsParserWorking() && bTwoMinus ) 1042*cdf0e10cSrcweir bReadComment = sal_False; 1043*cdf0e10cSrcweir 1044*cdf0e10cSrcweir bNextCh = sal_False; 1045*cdf0e10cSrcweir } 1046*cdf0e10cSrcweir break; 1047*cdf0e10cSrcweir 1048*cdf0e10cSrcweir case '\r': 
1049*cdf0e10cSrcweir // \r\n? beendet das aktuelle Text-Token (auch wenn es leer ist) 1050*cdf0e10cSrcweir nNextCh = GetNextChar(); 1051*cdf0e10cSrcweir if( nNextCh=='\n' ) 1052*cdf0e10cSrcweir nNextCh = GetNextChar(); 1053*cdf0e10cSrcweir bWeiter = sal_False; 1054*cdf0e10cSrcweir break; 1055*cdf0e10cSrcweir case '\n': 1056*cdf0e10cSrcweir // \n beendet das aktuelle Text-Token (auch wenn es leer ist) 1057*cdf0e10cSrcweir nNextCh = GetNextChar(); 1058*cdf0e10cSrcweir bWeiter = sal_False; 1059*cdf0e10cSrcweir break; 1060*cdf0e10cSrcweir case sal_Unicode(EOF): 1061*cdf0e10cSrcweir // eof beendet das aktuelle Text-Token und tut so, als ob 1062*cdf0e10cSrcweir // ein End-Token gelesen wurde 1063*cdf0e10cSrcweir if( rInput.IsEof() ) 1064*cdf0e10cSrcweir { 1065*cdf0e10cSrcweir bWeiter = sal_False; 1066*cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() ) 1067*cdf0e10cSrcweir { 1068*cdf0e10cSrcweir bEndTokenFound = sal_True; 1069*cdf0e10cSrcweir } 1070*cdf0e10cSrcweir else 1071*cdf0e10cSrcweir { 1072*cdf0e10cSrcweir bReadScript = sal_False; 1073*cdf0e10cSrcweir bReadStyle = sal_False; 1074*cdf0e10cSrcweir aEndToken.Erase(); 1075*cdf0e10cSrcweir nToken = 0; 1076*cdf0e10cSrcweir } 1077*cdf0e10cSrcweir break; 1078*cdf0e10cSrcweir } 1079*cdf0e10cSrcweir // kein break 1080*cdf0e10cSrcweir default: 1081*cdf0e10cSrcweir // alle anderen Zeichen landen im Buffer 1082*cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 1083*cdf0e10cSrcweir break; 1084*cdf0e10cSrcweir } 1085*cdf0e10cSrcweir 1086*cdf0e10cSrcweir if( (!bWeiter && sTmpBuffer.getLength() > 0L) || 1087*cdf0e10cSrcweir MAX_LEN == sTmpBuffer.getLength() ) 1088*cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 1089*cdf0e10cSrcweir 1090*cdf0e10cSrcweir if( bWeiter && bNextCh ) 1091*cdf0e10cSrcweir nNextCh = GetNextChar(); 1092*cdf0e10cSrcweir } 1093*cdf0e10cSrcweir 1094*cdf0e10cSrcweir if( IsParserWorking() ) 1095*cdf0e10cSrcweir SaveState( 0 ); 1096*cdf0e10cSrcweir else 1097*cdf0e10cSrcweir nToken = 0; 
// Scans the next token from the input.
//
// Clears sSaveToken and the option list first. While a raw section is
// active (bReadScript, bReadStyle or a non-empty aEndToken) the work is
// delegated to _GetNextRawToken(). Otherwise the current character nNextCh
// selects between tag parsing ('<'), end of file, form feed, line breaks,
// tabs and ordinary text (ScanText()).
//
// Returns the token id, 0 if the parser is not working, or -1 if eState is
// SVPAR_PENDING when the scan ends.
int __EXPORT HTMLParser::_GetNextToken()
{
    int nRet = 0;
    sSaveToken.Erase();

    // delete the options of the previous token
    if( pOptions->Count() )
        pOptions->DeleteAndDestroy( 0, pOptions->Count() );

    if( !IsParserWorking() )        // already in an error state: do not go on
        return 0;

    // remembered so that bReadNextChar can be restored if the parser
    // turns out to be pending in the middle of a tag
    sal_Bool bReadNextCharSave = bReadNextChar;
    if( bReadNextChar )
    {
        DBG_ASSERT( !bEndTokenFound,
                    "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
        nNextCh = GetNextChar();
        if( !IsParserWorking() )    // already in an error state: do not go on
            return 0;
        bReadNextChar = sal_False;
    }

    if( bReadScript || bReadStyle || aEndToken.Len() )
    {
        // raw mode: a result of 0 means "continue with normal scanning"
        nRet = _GetNextRawToken();
        if( nRet || !IsParserWorking() )
            return nRet;
    }

    do {
        // whether the next character is fetched at the end of this round
        int bNextCh = sal_True;
        switch( nNextCh )
        {
        case '<':
            {
                // position right behind the '<', used to re-scan it as text
                sal_uLong nStreamPos = rInput.Tell();
                sal_uLong nLineNr = GetLineNr();
                sal_uLong nLinePos = GetLinePos();

                int bOffState = sal_False;
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = sal_True;
                    nNextCh = GetNextChar();
                }
                if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
                {
                    // read the tag name up to '>' or white space
                    ::rtl::OUStringBuffer sTmpBuffer;
                    do {
                        sTmpBuffer.append( nNextCh );
                        if( MAX_LEN == sTmpBuffer.getLength() )
                            aToken += String(sTmpBuffer.makeStringAndClear());
                        nNextCh = GetNextChar();
                    } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
                             IsParserWorking() && !rInput.IsEof() );

                    if( sTmpBuffer.getLength() )
                        aToken += String(sTmpBuffer.makeStringAndClear());

                    // skip blanks
                    while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
                        nNextCh = GetNextChar();

                    if( !IsParserWorking() )
                    {
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        break;
                    }

                    // look the token up in the table:
                    sSaveToken = aToken;
                    aToken.ToUpperAscii();
                    if( 0 == (nRet = GetHTMLToken( aToken )) )
                        // Unknown Control
                        nRet = HTML_UNKNOWNCONTROL_ON;

                    // If it is a token that switches something off ...
                    if( bOffState )
                    {
                        if( HTML_TOKEN_ONOFF & nRet )
                        {
                            // ... and an off token exists, turn it into that
                            ++nRet;
                        }
                        else if( HTML_LINEBREAK!=nRet )
                        {
                            // ... and no off token exists, turn it into an
                            // unknown token (except </BR>, which is treated
                            // like <BR>)
                            nRet = HTML_UNKNOWNCONTROL_OFF;
                        }
                    }

                    if( nRet == HTML_COMMENT )
                    {
                        // fix: use sSaveToken as the start of the comment to
                        // keep the original upper/lower case, and append a
                        // space.
                        aToken = sSaveToken;
                        if( '>'!=nNextCh )
                            aToken += (sal_Unicode)' ';
                        // position and token length at the first '>' seen
                        sal_uLong nCStreamPos = 0;
                        sal_uLong nCLineNr = 0;
                        sal_uLong nCLinePos = 0;
                        xub_StrLen nCStrLen = 0;

                        sal_Bool bDone = sal_False;
                        // Read up to the closing "-->". If none is found,
                        // resume at the first '>'.
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            if( '>'==nNextCh )
                            {
                                if( !nCStreamPos )
                                {
                                    nCStreamPos = rInput.Tell();
                                    nCStrLen = aToken.Len();
                                    nCLineNr = GetLineNr();
                                    nCLinePos = GetLinePos();
                                }
                                bDone = aToken.Len() >= 2 &&
                                        aToken.Copy(aToken.Len()-2,2).
                                                        EqualsAscii( "--" );
                                if( !bDone )
                                    aToken += nNextCh;
                            }
                            else
                                aToken += nNextCh;
                            if( !bDone )
                                nNextCh = GetNextChar();
                        }
                        if( !bDone && IsParserWorking() && nCStreamPos )
                        {
                            // no "-->" found: go back to the first '>' and
                            // cut the token to the length it had there
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.Erase( nCStrLen );
                            nNextCh = '>';
                        }
                    }
                    else
                    {
                        // the token string can be discarded now
                        aToken.Erase();
                    }

                    // now read everything up to the closing '>'
                    if( '>' != nNextCh && IsParserWorking() )
                    {
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // go back to just behind the '<', resume there
                            // and return the '<' as text
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;
                        }
                    }
                    if( SVPAR_PENDING == eState )
                        bReadNextChar = bReadNextCharSave;
                }
                else
                {
                    if( bOffState )
                    {
                        // "</" not followed by a letter or '!': simply throw
                        // everything away
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // go back to just behind the '<', resume there
                            // and return the '<' as text
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;
                        }
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        aToken.Erase();
                    }
                    else if( '%' == nNextCh )
                    {
                        nRet = HTML_UNKNOWNCONTROL_ON;

                        sal_uLong nCStreamPos = rInput.Tell();
                        sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();

                        sal_Bool bDone = sal_False;
                        // Read up to the closing "%>". If none is found,
                        // resume at the position saved above.
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            bDone = '>'==nNextCh && aToken.Len() >= 1 &&
                                    '%' == aToken.GetChar( aToken.Len()-1 );
                            if( !bDone )
                            {
                                aToken += nNextCh;
                                nNextCh = GetNextChar();
                            }
                        }
                        if( !bDone && IsParserWorking() )
                        {
                            // no "%>" found: return "<%" as text
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.AssignAscii( "<%", 2 );
                            nRet = HTML_TEXTTOKEN;
                            break;
                        }
                        if( IsParserWorking() )
                        {
                            sSaveToken = aToken;
                            aToken.Erase();
                        }
                    }
                    else
                    {
                        // a '<' that does not start a tag is plain text
                        aToken = '<';
                        nRet = HTML_TEXTTOKEN;
                        bNextCh = sal_False;
                        break;
                    }
                }

                if( IsParserWorking() )
                {
                    // only fetch a new character if the tag's closing '>'
                    // is the current one
                    bNextCh = '>' == nNextCh;
                    switch( nRet )
                    {
                    case HTML_TEXTAREA_ON:
                        bReadTextArea = sal_True;
                        break;
                    case HTML_TEXTAREA_OFF:
                        bReadTextArea = sal_False;
                        break;
                    case HTML_SCRIPT_ON:
                        if( !bReadTextArea )
                            bReadScript = sal_True;
                        break;
                    case HTML_SCRIPT_OFF:
                        if( !bReadTextArea )
                        {
                            bReadScript = sal_False;
                            // JavaScript can modify the stream, so the last
                            // character has to be read once more
                            bReadNextChar = sal_True;
                            bNextCh = sal_False;
                        }
                        break;

                    case HTML_STYLE_ON:
                        bReadStyle = sal_True;
                        break;
                    case HTML_STYLE_OFF:
                        bReadStyle = sal_False;
                        break;
                    }

                }
            }
            break;

        case sal_Unicode(EOF):
            if( rInput.IsEof() )
            {
                eState = SVPAR_ACCEPTED;
                nRet = nNextCh;
            }
            else
            {
                // not a real end of file: read normal text
                goto scan_text;
            }
            break;

        case '\f':
            // form feeds are now passed upwards separately
            nRet = HTML_LINEFEEDCHAR; // !!! should really be FORMFEEDCHAR
            break;

        case '\n':
        case '\r':
            if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
            {
                // a "\n\r" or "\r\n" pair counts as a single line break;
                // any other following character is kept as current character
                sal_Unicode c = GetNextChar();
                if( ( '\n' != nNextCh || '\r' != c ) &&
                    ( '\r' != nNextCh || '\n' != c ) )
                {
                    bNextCh = sal_False;
                    nNextCh = c;
                }
                nRet = HTML_NEWPARA;
                break;
            }
            // no break! (intentional fall through)
        case '\t':
            if( bReadPRE )
            {
                nRet = HTML_TABCHAR;
                break;
            }
            // no break! (intentional fall through)
        case ' ':
            // no break! (intentional fall through)
        default:

scan_text:
            // "normal" text follows
            nRet = ScanText();
            bNextCh = 0 == aToken.Len();

            // the text should still be processed
            if( !bNextCh && eState == SVPAR_PENDING )
            {
                eState = SVPAR_WORKING;
                bReadNextChar = sal_True;
            }

            break;
        }

        if( bNextCh && SVPAR_WORKING == eState )
        {
            nNextCh = GetNextChar();
            // a token other than text was completed before the parser went
            // pending: deliver it and read the character on the next call
            if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
            {
                bReadNextChar = sal_True;
                eState = SVPAR_WORKING;
            }
        }

    } while( !nRet && SVPAR_WORKING == eState );

    if( SVPAR_PENDING == eState )
        nRet = -1;      // something invalid

    return nRet;
}
SVPAR_WORKING == eState ); 1454*cdf0e10cSrcweir 1455*cdf0e10cSrcweir if( SVPAR_PENDING == eState ) 1456*cdf0e10cSrcweir nRet = -1; // irgendwas ungueltiges 1457*cdf0e10cSrcweir 1458*cdf0e10cSrcweir return nRet; 1459*cdf0e10cSrcweir } 1460*cdf0e10cSrcweir 1461*cdf0e10cSrcweir void HTMLParser::UnescapeToken() 1462*cdf0e10cSrcweir { 1463*cdf0e10cSrcweir xub_StrLen nPos=0; 1464*cdf0e10cSrcweir 1465*cdf0e10cSrcweir sal_Bool bEscape = sal_False; 1466*cdf0e10cSrcweir while( nPos < aToken.Len() ) 1467*cdf0e10cSrcweir { 1468*cdf0e10cSrcweir sal_Bool bOldEscape = bEscape; 1469*cdf0e10cSrcweir bEscape = sal_False; 1470*cdf0e10cSrcweir if( '\\'==aToken.GetChar(nPos) && !bOldEscape ) 1471*cdf0e10cSrcweir { 1472*cdf0e10cSrcweir aToken.Erase( nPos, 1 ); 1473*cdf0e10cSrcweir bEscape = sal_True; 1474*cdf0e10cSrcweir } 1475*cdf0e10cSrcweir else 1476*cdf0e10cSrcweir { 1477*cdf0e10cSrcweir nPos++; 1478*cdf0e10cSrcweir } 1479*cdf0e10cSrcweir } 1480*cdf0e10cSrcweir } 1481*cdf0e10cSrcweir 1482*cdf0e10cSrcweir // hole die Optionen 1483*cdf0e10cSrcweir const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const 1484*cdf0e10cSrcweir { 1485*cdf0e10cSrcweir // wenn die Option fuer das aktuelle Token schon einmal 1486*cdf0e10cSrcweir // geholt wurden, geben wir sie noch einmal zurueck 1487*cdf0e10cSrcweir if( pOptions->Count() ) 1488*cdf0e10cSrcweir return pOptions; 1489*cdf0e10cSrcweir 1490*cdf0e10cSrcweir xub_StrLen nPos = 0; 1491*cdf0e10cSrcweir while( nPos < aToken.Len() ) 1492*cdf0e10cSrcweir { 1493*cdf0e10cSrcweir // ein Zeichen ? Dann faengt hier eine Option an 1494*cdf0e10cSrcweir if( HTML_ISALPHA( aToken.GetChar(nPos) ) ) 1495*cdf0e10cSrcweir { 1496*cdf0e10cSrcweir int nToken; 1497*cdf0e10cSrcweir String aValue; 1498*cdf0e10cSrcweir xub_StrLen nStt = nPos; 1499*cdf0e10cSrcweir sal_Unicode cChar = 0; 1500*cdf0e10cSrcweir 1501*cdf0e10cSrcweir // Eigentlich sind hier nur ganz bestimmte Zeichen erlaubt. 
1502*cdf0e10cSrcweir // Netscape achtet aber nur auf "=" und Leerzeichen (siehe 1503*cdf0e10cSrcweir // Mozilla: PA_FetchRequestedNameValues in 1504*cdf0e10cSrcweir // lipparse/pa_mdl.c 1505*cdf0e10cSrcweir // while( nPos < aToken.Len() && 1506*cdf0e10cSrcweir // ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) ) 1507*cdf0e10cSrcweir while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) && 1508*cdf0e10cSrcweir HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) ) 1509*cdf0e10cSrcweir nPos++; 1510*cdf0e10cSrcweir 1511*cdf0e10cSrcweir String sName( aToken.Copy( nStt, nPos-nStt ) ); 1512*cdf0e10cSrcweir 1513*cdf0e10cSrcweir //JP 23.03.97: die PlugIns wollen die TokenName im "Original" haben 1514*cdf0e10cSrcweir // also nur fuers Suchen in UpperCase wandeln 1515*cdf0e10cSrcweir String sNameUpperCase( sName ); 1516*cdf0e10cSrcweir sNameUpperCase.ToUpperAscii(); 1517*cdf0e10cSrcweir 1518*cdf0e10cSrcweir nToken = GetHTMLOption( sNameUpperCase ); // der Name ist fertig 1519*cdf0e10cSrcweir DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN, 1520*cdf0e10cSrcweir "GetOption: unbekannte HTML-Option" ); 1521*cdf0e10cSrcweir sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START || 1522*cdf0e10cSrcweir nToken >= HTML_OPTION_SCRIPT_END) && 1523*cdf0e10cSrcweir (!pNoConvertToken || nToken != *pNoConvertToken); 1524*cdf0e10cSrcweir 1525*cdf0e10cSrcweir while( nPos < aToken.Len() && 1526*cdf0e10cSrcweir ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) || 1527*cdf0e10cSrcweir HTML_ISSPACE(cChar) ) ) 1528*cdf0e10cSrcweir nPos++; 1529*cdf0e10cSrcweir 1530*cdf0e10cSrcweir // hat die Option auch einen Wert? 
1531*cdf0e10cSrcweir if( nPos!=aToken.Len() && '='==cChar ) 1532*cdf0e10cSrcweir { 1533*cdf0e10cSrcweir nPos++; 1534*cdf0e10cSrcweir 1535*cdf0e10cSrcweir while( nPos < aToken.Len() && 1536*cdf0e10cSrcweir ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) || 1537*cdf0e10cSrcweir ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) ) 1538*cdf0e10cSrcweir nPos++; 1539*cdf0e10cSrcweir 1540*cdf0e10cSrcweir if( nPos != aToken.Len() ) 1541*cdf0e10cSrcweir { 1542*cdf0e10cSrcweir xub_StrLen nLen = 0; 1543*cdf0e10cSrcweir nStt = nPos; 1544*cdf0e10cSrcweir if( ('"'==cChar) || ('\'')==cChar ) 1545*cdf0e10cSrcweir { 1546*cdf0e10cSrcweir sal_Unicode cEnd = cChar; 1547*cdf0e10cSrcweir nPos++; nStt++; 1548*cdf0e10cSrcweir sal_Bool bDone = sal_False; 1549*cdf0e10cSrcweir sal_Bool bEscape = sal_False; 1550*cdf0e10cSrcweir while( nPos < aToken.Len() && !bDone ) 1551*cdf0e10cSrcweir { 1552*cdf0e10cSrcweir sal_Bool bOldEscape = bEscape; 1553*cdf0e10cSrcweir bEscape = sal_False; 1554*cdf0e10cSrcweir cChar = aToken.GetChar(nPos); 1555*cdf0e10cSrcweir switch( cChar ) 1556*cdf0e10cSrcweir { 1557*cdf0e10cSrcweir case '\r': 1558*cdf0e10cSrcweir case '\n': 1559*cdf0e10cSrcweir if( bStripCRLF ) 1560*cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 ); 1561*cdf0e10cSrcweir else 1562*cdf0e10cSrcweir nPos++, nLen++; 1563*cdf0e10cSrcweir break; 1564*cdf0e10cSrcweir case '\\': 1565*cdf0e10cSrcweir if( bOldEscape ) 1566*cdf0e10cSrcweir { 1567*cdf0e10cSrcweir nPos++, nLen++; 1568*cdf0e10cSrcweir } 1569*cdf0e10cSrcweir else 1570*cdf0e10cSrcweir { 1571*cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 ); 1572*cdf0e10cSrcweir bEscape = sal_True; 1573*cdf0e10cSrcweir } 1574*cdf0e10cSrcweir break; 1575*cdf0e10cSrcweir case '"': 1576*cdf0e10cSrcweir case '\'': 1577*cdf0e10cSrcweir bDone = !bOldEscape && cChar==cEnd; 1578*cdf0e10cSrcweir if( !bDone ) 1579*cdf0e10cSrcweir nPos++, nLen++; 1580*cdf0e10cSrcweir break; 1581*cdf0e10cSrcweir default: 1582*cdf0e10cSrcweir nPos++, nLen++; 1583*cdf0e10cSrcweir 
break; 1584*cdf0e10cSrcweir } 1585*cdf0e10cSrcweir } 1586*cdf0e10cSrcweir if( nPos!=aToken.Len() ) 1587*cdf0e10cSrcweir nPos++; 1588*cdf0e10cSrcweir } 1589*cdf0e10cSrcweir else 1590*cdf0e10cSrcweir { 1591*cdf0e10cSrcweir // hier sind wir etwas laxer als der 1592*cdf0e10cSrcweir // Standard und erlauben alles druckbare 1593*cdf0e10cSrcweir sal_Bool bEscape = sal_False; 1594*cdf0e10cSrcweir sal_Bool bDone = sal_False; 1595*cdf0e10cSrcweir while( nPos < aToken.Len() && !bDone ) 1596*cdf0e10cSrcweir { 1597*cdf0e10cSrcweir sal_Bool bOldEscape = bEscape; 1598*cdf0e10cSrcweir bEscape = sal_False; 1599*cdf0e10cSrcweir sal_Unicode c = aToken.GetChar(nPos); 1600*cdf0e10cSrcweir switch( c ) 1601*cdf0e10cSrcweir { 1602*cdf0e10cSrcweir case ' ': 1603*cdf0e10cSrcweir bDone = !bOldEscape; 1604*cdf0e10cSrcweir if( !bDone ) 1605*cdf0e10cSrcweir nPos++, nLen++; 1606*cdf0e10cSrcweir break; 1607*cdf0e10cSrcweir 1608*cdf0e10cSrcweir case '\t': 1609*cdf0e10cSrcweir case '\r': 1610*cdf0e10cSrcweir case '\n': 1611*cdf0e10cSrcweir bDone = sal_True; 1612*cdf0e10cSrcweir break; 1613*cdf0e10cSrcweir 1614*cdf0e10cSrcweir case '\\': 1615*cdf0e10cSrcweir if( bOldEscape ) 1616*cdf0e10cSrcweir { 1617*cdf0e10cSrcweir nPos++, nLen++; 1618*cdf0e10cSrcweir } 1619*cdf0e10cSrcweir else 1620*cdf0e10cSrcweir { 1621*cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 ); 1622*cdf0e10cSrcweir bEscape = sal_True; 1623*cdf0e10cSrcweir } 1624*cdf0e10cSrcweir break; 1625*cdf0e10cSrcweir 1626*cdf0e10cSrcweir default: 1627*cdf0e10cSrcweir if( HTML_ISPRINTABLE( c ) ) 1628*cdf0e10cSrcweir nPos++, nLen++; 1629*cdf0e10cSrcweir else 1630*cdf0e10cSrcweir bDone = sal_True; 1631*cdf0e10cSrcweir break; 1632*cdf0e10cSrcweir } 1633*cdf0e10cSrcweir } 1634*cdf0e10cSrcweir } 1635*cdf0e10cSrcweir 1636*cdf0e10cSrcweir if( nLen ) 1637*cdf0e10cSrcweir aValue = aToken.Copy( nStt, nLen ); 1638*cdf0e10cSrcweir } 1639*cdf0e10cSrcweir } 1640*cdf0e10cSrcweir 1641*cdf0e10cSrcweir // Wir kennen das Token und koennen es Speichern 
1642*cdf0e10cSrcweir HTMLOption *pOption = 1643*cdf0e10cSrcweir new HTMLOption( 1644*cdf0e10cSrcweir sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue ); 1645*cdf0e10cSrcweir 1646*cdf0e10cSrcweir pOptions->Insert( pOption, pOptions->Count() ); 1647*cdf0e10cSrcweir 1648*cdf0e10cSrcweir } 1649*cdf0e10cSrcweir else 1650*cdf0e10cSrcweir // white space un unerwartete Zeichen ignorieren wie 1651*cdf0e10cSrcweir nPos++; 1652*cdf0e10cSrcweir } 1653*cdf0e10cSrcweir 1654*cdf0e10cSrcweir return pOptions; 1655*cdf0e10cSrcweir } 1656*cdf0e10cSrcweir 1657*cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken ) 1658*cdf0e10cSrcweir { 1659*cdf0e10cSrcweir switch( nToken ) 1660*cdf0e10cSrcweir { 1661*cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR 1662*cdf0e10cSrcweir // diese werden laut Definition zu LFs 1663*cdf0e10cSrcweir case HTML_PARABREAK_ON: 1664*cdf0e10cSrcweir case HTML_LINEBREAK: 1665*cdf0e10cSrcweir nToken = HTML_NEWPARA; 1666*cdf0e10cSrcweir #else 1667*cdf0e10cSrcweir // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung 1668*cdf0e10cSrcweir case HTML_PARABREAK_ON: 1669*cdf0e10cSrcweir nToken = HTML_LINEBREAK; 1670*cdf0e10cSrcweir case HTML_LINEBREAK: 1671*cdf0e10cSrcweir #endif 1672*cdf0e10cSrcweir case HTML_NEWPARA: 1673*cdf0e10cSrcweir nPre_LinePos = 0; 1674*cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1675*cdf0e10cSrcweir nToken = 0; 1676*cdf0e10cSrcweir break; 1677*cdf0e10cSrcweir 1678*cdf0e10cSrcweir case HTML_TABCHAR: 1679*cdf0e10cSrcweir { 1680*cdf0e10cSrcweir xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >( 1681*cdf0e10cSrcweir 8 - (nPre_LinePos % 8)); 1682*cdf0e10cSrcweir DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" 
); 1683*cdf0e10cSrcweir aToken.Expand( nSpaces, ' ' ); 1684*cdf0e10cSrcweir nPre_LinePos += nSpaces; 1685*cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1686*cdf0e10cSrcweir } 1687*cdf0e10cSrcweir break; 1688*cdf0e10cSrcweir // diese bleiben erhalten 1689*cdf0e10cSrcweir case HTML_TEXTTOKEN: 1690*cdf0e10cSrcweir nPre_LinePos += aToken.Len(); 1691*cdf0e10cSrcweir break; 1692*cdf0e10cSrcweir 1693*cdf0e10cSrcweir case HTML_SELECT_ON: 1694*cdf0e10cSrcweir case HTML_SELECT_OFF: 1695*cdf0e10cSrcweir case HTML_BODY_ON: 1696*cdf0e10cSrcweir case HTML_FORM_ON: 1697*cdf0e10cSrcweir case HTML_FORM_OFF: 1698*cdf0e10cSrcweir case HTML_INPUT: 1699*cdf0e10cSrcweir case HTML_OPTION: 1700*cdf0e10cSrcweir case HTML_TEXTAREA_ON: 1701*cdf0e10cSrcweir case HTML_TEXTAREA_OFF: 1702*cdf0e10cSrcweir 1703*cdf0e10cSrcweir case HTML_IMAGE: 1704*cdf0e10cSrcweir case HTML_APPLET_ON: 1705*cdf0e10cSrcweir case HTML_APPLET_OFF: 1706*cdf0e10cSrcweir case HTML_PARAM: 1707*cdf0e10cSrcweir case HTML_EMBED: 1708*cdf0e10cSrcweir 1709*cdf0e10cSrcweir case HTML_HEAD1_ON: 1710*cdf0e10cSrcweir case HTML_HEAD1_OFF: 1711*cdf0e10cSrcweir case HTML_HEAD2_ON: 1712*cdf0e10cSrcweir case HTML_HEAD2_OFF: 1713*cdf0e10cSrcweir case HTML_HEAD3_ON: 1714*cdf0e10cSrcweir case HTML_HEAD3_OFF: 1715*cdf0e10cSrcweir case HTML_HEAD4_ON: 1716*cdf0e10cSrcweir case HTML_HEAD4_OFF: 1717*cdf0e10cSrcweir case HTML_HEAD5_ON: 1718*cdf0e10cSrcweir case HTML_HEAD5_OFF: 1719*cdf0e10cSrcweir case HTML_HEAD6_ON: 1720*cdf0e10cSrcweir case HTML_HEAD6_OFF: 1721*cdf0e10cSrcweir case HTML_BLOCKQUOTE_ON: 1722*cdf0e10cSrcweir case HTML_BLOCKQUOTE_OFF: 1723*cdf0e10cSrcweir case HTML_ADDRESS_ON: 1724*cdf0e10cSrcweir case HTML_ADDRESS_OFF: 1725*cdf0e10cSrcweir case HTML_HORZRULE: 1726*cdf0e10cSrcweir 1727*cdf0e10cSrcweir case HTML_CENTER_ON: 1728*cdf0e10cSrcweir case HTML_CENTER_OFF: 1729*cdf0e10cSrcweir case HTML_DIVISION_ON: 1730*cdf0e10cSrcweir case HTML_DIVISION_OFF: 1731*cdf0e10cSrcweir 1732*cdf0e10cSrcweir case HTML_SCRIPT_ON: 1733*cdf0e10cSrcweir 
case HTML_SCRIPT_OFF: 1734*cdf0e10cSrcweir case HTML_RAWDATA: 1735*cdf0e10cSrcweir 1736*cdf0e10cSrcweir case HTML_TABLE_ON: 1737*cdf0e10cSrcweir case HTML_TABLE_OFF: 1738*cdf0e10cSrcweir case HTML_CAPTION_ON: 1739*cdf0e10cSrcweir case HTML_CAPTION_OFF: 1740*cdf0e10cSrcweir case HTML_COLGROUP_ON: 1741*cdf0e10cSrcweir case HTML_COLGROUP_OFF: 1742*cdf0e10cSrcweir case HTML_COL_ON: 1743*cdf0e10cSrcweir case HTML_COL_OFF: 1744*cdf0e10cSrcweir case HTML_THEAD_ON: 1745*cdf0e10cSrcweir case HTML_THEAD_OFF: 1746*cdf0e10cSrcweir case HTML_TFOOT_ON: 1747*cdf0e10cSrcweir case HTML_TFOOT_OFF: 1748*cdf0e10cSrcweir case HTML_TBODY_ON: 1749*cdf0e10cSrcweir case HTML_TBODY_OFF: 1750*cdf0e10cSrcweir case HTML_TABLEROW_ON: 1751*cdf0e10cSrcweir case HTML_TABLEROW_OFF: 1752*cdf0e10cSrcweir case HTML_TABLEDATA_ON: 1753*cdf0e10cSrcweir case HTML_TABLEDATA_OFF: 1754*cdf0e10cSrcweir case HTML_TABLEHEADER_ON: 1755*cdf0e10cSrcweir case HTML_TABLEHEADER_OFF: 1756*cdf0e10cSrcweir 1757*cdf0e10cSrcweir case HTML_ANCHOR_ON: 1758*cdf0e10cSrcweir case HTML_ANCHOR_OFF: 1759*cdf0e10cSrcweir case HTML_BOLD_ON: 1760*cdf0e10cSrcweir case HTML_BOLD_OFF: 1761*cdf0e10cSrcweir case HTML_ITALIC_ON: 1762*cdf0e10cSrcweir case HTML_ITALIC_OFF: 1763*cdf0e10cSrcweir case HTML_STRIKE_ON: 1764*cdf0e10cSrcweir case HTML_STRIKE_OFF: 1765*cdf0e10cSrcweir case HTML_STRIKETHROUGH_ON: 1766*cdf0e10cSrcweir case HTML_STRIKETHROUGH_OFF: 1767*cdf0e10cSrcweir case HTML_UNDERLINE_ON: 1768*cdf0e10cSrcweir case HTML_UNDERLINE_OFF: 1769*cdf0e10cSrcweir case HTML_BASEFONT_ON: 1770*cdf0e10cSrcweir case HTML_BASEFONT_OFF: 1771*cdf0e10cSrcweir case HTML_FONT_ON: 1772*cdf0e10cSrcweir case HTML_FONT_OFF: 1773*cdf0e10cSrcweir case HTML_BLINK_ON: 1774*cdf0e10cSrcweir case HTML_BLINK_OFF: 1775*cdf0e10cSrcweir case HTML_SPAN_ON: 1776*cdf0e10cSrcweir case HTML_SPAN_OFF: 1777*cdf0e10cSrcweir case HTML_SUBSCRIPT_ON: 1778*cdf0e10cSrcweir case HTML_SUBSCRIPT_OFF: 1779*cdf0e10cSrcweir case HTML_SUPERSCRIPT_ON: 1780*cdf0e10cSrcweir case 
HTML_SUPERSCRIPT_OFF: 1781*cdf0e10cSrcweir case HTML_BIGPRINT_ON: 1782*cdf0e10cSrcweir case HTML_BIGPRINT_OFF: 1783*cdf0e10cSrcweir case HTML_SMALLPRINT_OFF: 1784*cdf0e10cSrcweir case HTML_SMALLPRINT_ON: 1785*cdf0e10cSrcweir 1786*cdf0e10cSrcweir case HTML_EMPHASIS_ON: 1787*cdf0e10cSrcweir case HTML_EMPHASIS_OFF: 1788*cdf0e10cSrcweir case HTML_CITIATION_ON: 1789*cdf0e10cSrcweir case HTML_CITIATION_OFF: 1790*cdf0e10cSrcweir case HTML_STRONG_ON: 1791*cdf0e10cSrcweir case HTML_STRONG_OFF: 1792*cdf0e10cSrcweir case HTML_CODE_ON: 1793*cdf0e10cSrcweir case HTML_CODE_OFF: 1794*cdf0e10cSrcweir case HTML_SAMPLE_ON: 1795*cdf0e10cSrcweir case HTML_SAMPLE_OFF: 1796*cdf0e10cSrcweir case HTML_KEYBOARD_ON: 1797*cdf0e10cSrcweir case HTML_KEYBOARD_OFF: 1798*cdf0e10cSrcweir case HTML_VARIABLE_ON: 1799*cdf0e10cSrcweir case HTML_VARIABLE_OFF: 1800*cdf0e10cSrcweir case HTML_DEFINSTANCE_ON: 1801*cdf0e10cSrcweir case HTML_DEFINSTANCE_OFF: 1802*cdf0e10cSrcweir case HTML_SHORTQUOTE_ON: 1803*cdf0e10cSrcweir case HTML_SHORTQUOTE_OFF: 1804*cdf0e10cSrcweir case HTML_LANGUAGE_ON: 1805*cdf0e10cSrcweir case HTML_LANGUAGE_OFF: 1806*cdf0e10cSrcweir case HTML_AUTHOR_ON: 1807*cdf0e10cSrcweir case HTML_AUTHOR_OFF: 1808*cdf0e10cSrcweir case HTML_PERSON_ON: 1809*cdf0e10cSrcweir case HTML_PERSON_OFF: 1810*cdf0e10cSrcweir case HTML_ACRONYM_ON: 1811*cdf0e10cSrcweir case HTML_ACRONYM_OFF: 1812*cdf0e10cSrcweir case HTML_ABBREVIATION_ON: 1813*cdf0e10cSrcweir case HTML_ABBREVIATION_OFF: 1814*cdf0e10cSrcweir case HTML_INSERTEDTEXT_ON: 1815*cdf0e10cSrcweir case HTML_INSERTEDTEXT_OFF: 1816*cdf0e10cSrcweir case HTML_DELETEDTEXT_ON: 1817*cdf0e10cSrcweir case HTML_DELETEDTEXT_OFF: 1818*cdf0e10cSrcweir case HTML_TELETYPE_ON: 1819*cdf0e10cSrcweir case HTML_TELETYPE_OFF: 1820*cdf0e10cSrcweir 1821*cdf0e10cSrcweir break; 1822*cdf0e10cSrcweir 1823*cdf0e10cSrcweir // der Rest wird als unbekanntes Token behandelt 1824*cdf0e10cSrcweir default: 1825*cdf0e10cSrcweir if( nToken ) 1826*cdf0e10cSrcweir { 1827*cdf0e10cSrcweir 
nToken = 1828*cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1829*cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF 1830*cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1831*cdf0e10cSrcweir } 1832*cdf0e10cSrcweir break; 1833*cdf0e10cSrcweir } 1834*cdf0e10cSrcweir 1835*cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1836*cdf0e10cSrcweir 1837*cdf0e10cSrcweir return nToken; 1838*cdf0e10cSrcweir } 1839*cdf0e10cSrcweir 1840*cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken ) 1841*cdf0e10cSrcweir { 1842*cdf0e10cSrcweir switch( nToken ) 1843*cdf0e10cSrcweir { 1844*cdf0e10cSrcweir case HTML_NEWPARA: 1845*cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1846*cdf0e10cSrcweir nToken = 0; 1847*cdf0e10cSrcweir case HTML_TEXTTOKEN: 1848*cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1849*cdf0e10cSrcweir case HTML_SOFTHYPH: 1850*cdf0e10cSrcweir break; // bleiben erhalten 1851*cdf0e10cSrcweir 1852*cdf0e10cSrcweir default: 1853*cdf0e10cSrcweir if( nToken ) 1854*cdf0e10cSrcweir { 1855*cdf0e10cSrcweir if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) ) 1856*cdf0e10cSrcweir { 1857*cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1858*cdf0e10cSrcweir sSaveToken.Insert( '/', 1 ); 1859*cdf0e10cSrcweir } 1860*cdf0e10cSrcweir else 1861*cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1862*cdf0e10cSrcweir if( aToken.Len() ) 1863*cdf0e10cSrcweir { 1864*cdf0e10cSrcweir UnescapeToken(); 1865*cdf0e10cSrcweir sSaveToken += (sal_Unicode)' '; 1866*cdf0e10cSrcweir aToken.Insert( sSaveToken, 0 ); 1867*cdf0e10cSrcweir } 1868*cdf0e10cSrcweir else 1869*cdf0e10cSrcweir aToken = sSaveToken; 1870*cdf0e10cSrcweir aToken += (sal_Unicode)'>'; 1871*cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1872*cdf0e10cSrcweir } 1873*cdf0e10cSrcweir break; 1874*cdf0e10cSrcweir } 1875*cdf0e10cSrcweir 1876*cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1877*cdf0e10cSrcweir 1878*cdf0e10cSrcweir return nToken; 1879*cdf0e10cSrcweir } 1880*cdf0e10cSrcweir 1881*cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken ) 1882*cdf0e10cSrcweir { 
1883*cdf0e10cSrcweir switch( nToken ) 1884*cdf0e10cSrcweir { 1885*cdf0e10cSrcweir case HTML_NEWPARA: 1886*cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1887*cdf0e10cSrcweir nToken = 0; 1888*cdf0e10cSrcweir case HTML_TEXTTOKEN: 1889*cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1890*cdf0e10cSrcweir case HTML_SOFTHYPH: 1891*cdf0e10cSrcweir break; // bleiben erhalten 1892*cdf0e10cSrcweir 1893*cdf0e10cSrcweir default: 1894*cdf0e10cSrcweir if( nToken ) 1895*cdf0e10cSrcweir { 1896*cdf0e10cSrcweir nToken = 1897*cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1898*cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF 1899*cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1900*cdf0e10cSrcweir } 1901*cdf0e10cSrcweir break; 1902*cdf0e10cSrcweir } 1903*cdf0e10cSrcweir 1904*cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1905*cdf0e10cSrcweir 1906*cdf0e10cSrcweir return nToken; 1907*cdf0e10cSrcweir } 1908*cdf0e10cSrcweir 1909*cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader, 1910*cdf0e10cSrcweir sal_Bool bSwitchToUCS2, 1911*cdf0e10cSrcweir rtl_TextEncoding eEnc ) 1912*cdf0e10cSrcweir { 1913*cdf0e10cSrcweir // Einer der folgenden regulaeren Ausdrucke muss sich auf den String 1914*cdf0e10cSrcweir // anwenden lassen, damit das Dok ein HTML-Dokument ist. 1915*cdf0e10cSrcweir // 1916*cdf0e10cSrcweir // ^[^<]*<[^ \t]*[> \t] 1917*cdf0e10cSrcweir // ------- 1918*cdf0e10cSrcweir // ^<! 
1919*cdf0e10cSrcweir // 1920*cdf0e10cSrcweir // wobei der unterstrichene Teilausdruck einem HTML-Token 1921*cdf0e10cSrcweir // ensprechen muss 1922*cdf0e10cSrcweir 1923*cdf0e10cSrcweir ByteString sCmp; 1924*cdf0e10cSrcweir sal_Bool bUCS2B = sal_False; 1925*cdf0e10cSrcweir if( bSwitchToUCS2 ) 1926*cdf0e10cSrcweir { 1927*cdf0e10cSrcweir if( 0xfeU == (sal_uChar)pHeader[0] && 1928*cdf0e10cSrcweir 0xffU == (sal_uChar)pHeader[1] ) 1929*cdf0e10cSrcweir { 1930*cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1931*cdf0e10cSrcweir bUCS2B = sal_True; 1932*cdf0e10cSrcweir } 1933*cdf0e10cSrcweir else if( 0xffU == (sal_uChar)pHeader[0] && 1934*cdf0e10cSrcweir 0xfeU == (sal_uChar)pHeader[1] ) 1935*cdf0e10cSrcweir { 1936*cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1937*cdf0e10cSrcweir } 1938*cdf0e10cSrcweir } 1939*cdf0e10cSrcweir if 1940*cdf0e10cSrcweir ( 1941*cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 == eEnc && 1942*cdf0e10cSrcweir ( 1943*cdf0e10cSrcweir (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) || 1944*cdf0e10cSrcweir (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1]) 1945*cdf0e10cSrcweir ) 1946*cdf0e10cSrcweir ) 1947*cdf0e10cSrcweir { 1948*cdf0e10cSrcweir if( 0xfe == (sal_uChar)pHeader[0] ) 1949*cdf0e10cSrcweir bUCS2B = sal_True; 1950*cdf0e10cSrcweir 1951*cdf0e10cSrcweir xub_StrLen nLen; 1952*cdf0e10cSrcweir for( nLen = 2; 1953*cdf0e10cSrcweir pHeader[nLen] != 0 || pHeader[nLen+1] != 0; 1954*cdf0e10cSrcweir nLen+=2 ) 1955*cdf0e10cSrcweir ; 1956*cdf0e10cSrcweir 1957*cdf0e10cSrcweir ::rtl::OStringBuffer sTmp( (nLen - 2)/2 ); 1958*cdf0e10cSrcweir for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 ) 1959*cdf0e10cSrcweir { 1960*cdf0e10cSrcweir sal_Unicode cUC; 1961*cdf0e10cSrcweir if( bUCS2B ) 1962*cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1]; 1963*cdf0e10cSrcweir else 1964*cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos]; 1965*cdf0e10cSrcweir if( 0U == cUC ) 1966*cdf0e10cSrcweir break; 
1967*cdf0e10cSrcweir 1968*cdf0e10cSrcweir sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' ); 1969*cdf0e10cSrcweir } 1970*cdf0e10cSrcweir sCmp = ByteString( sTmp.makeStringAndClear() ); 1971*cdf0e10cSrcweir } 1972*cdf0e10cSrcweir else 1973*cdf0e10cSrcweir { 1974*cdf0e10cSrcweir sCmp = (sal_Char *)pHeader; 1975*cdf0e10cSrcweir } 1976*cdf0e10cSrcweir 1977*cdf0e10cSrcweir sCmp.ToUpperAscii(); 1978*cdf0e10cSrcweir 1979*cdf0e10cSrcweir // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen 1980*cdf0e10cSrcweir xub_StrLen nStart = sCmp.Search( '<' ); 1981*cdf0e10cSrcweir if( STRING_NOTFOUND == nStart ) 1982*cdf0e10cSrcweir return sal_False; 1983*cdf0e10cSrcweir nStart++; 1984*cdf0e10cSrcweir 1985*cdf0e10cSrcweir // danach duerfen beliebige andere Zeichen bis zu einem blank oder 1986*cdf0e10cSrcweir // '>' kommen 1987*cdf0e10cSrcweir sal_Char c; 1988*cdf0e10cSrcweir xub_StrLen nPos; 1989*cdf0e10cSrcweir for( nPos = nStart; nPos<sCmp.Len(); nPos++ ) 1990*cdf0e10cSrcweir { 1991*cdf0e10cSrcweir if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) ) 1992*cdf0e10cSrcweir break; 1993*cdf0e10cSrcweir } 1994*cdf0e10cSrcweir 1995*cdf0e10cSrcweir // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML 1996*cdf0e10cSrcweir if( nPos==nStart ) 1997*cdf0e10cSrcweir return sal_False; 1998*cdf0e10cSrcweir 1999*cdf0e10cSrcweir // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes 2000*cdf0e10cSrcweir // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht 2001*cdf0e10cSrcweir // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML 2002*cdf0e10cSrcweir // interpretiert. 2003*cdf0e10cSrcweir String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US ); 2004*cdf0e10cSrcweir int nTok = GetHTMLToken( sTest ); 2005*cdf0e10cSrcweir if( 0 != nTok && HTML_DIRLIST_ON != nTok ) 2006*cdf0e10cSrcweir return sal_True; 2007*cdf0e10cSrcweir 2008*cdf0e10cSrcweir // oder es handelt sich um ein "<!" 
ganz am Anfang der Datei (fix #27092#) 2009*cdf0e10cSrcweir if( nStart == 1 && '!' == sCmp.GetChar( 1 ) ) 2010*cdf0e10cSrcweir return sal_True; 2011*cdf0e10cSrcweir 2012*cdf0e10cSrcweir // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen 2013*cdf0e10cSrcweir nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html ); 2014*cdf0e10cSrcweir if( nStart!=STRING_NOTFOUND && 2015*cdf0e10cSrcweir nStart>0 && '<'==sCmp.GetChar(nStart-1) && 2016*cdf0e10cSrcweir nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) ) 2017*cdf0e10cSrcweir return sal_True; 2018*cdf0e10cSrcweir 2019*cdf0e10cSrcweir // sonst ist es wohl doch eher kein HTML-Dokument 2020*cdf0e10cSrcweir return sal_False; 2021*cdf0e10cSrcweir } 2022*cdf0e10cSrcweir 2023*cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL ) 2024*cdf0e10cSrcweir { 2025*cdf0e10cSrcweir if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) || 2026*cdf0e10cSrcweir rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL ) 2027*cdf0e10cSrcweir return sal_False; 2028*cdf0e10cSrcweir 2029*cdf0e10cSrcweir sal_Bool bFound = sal_False; 2030*cdf0e10cSrcweir 2031*cdf0e10cSrcweir if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL ) 2032*cdf0e10cSrcweir { 2033*cdf0e10cSrcweir String aName( rURL.Copy(16) ); 2034*cdf0e10cSrcweir switch( aName.GetChar(0) ) 2035*cdf0e10cSrcweir { 2036*cdf0e10cSrcweir case 'b': 2037*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary ); 2038*cdf0e10cSrcweir break; 2039*cdf0e10cSrcweir case 'i': 2040*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) || 2041*cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index ); 2042*cdf0e10cSrcweir break; 2043*cdf0e10cSrcweir case 'm': 2044*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) || 2045*cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie ); 
2046*cdf0e10cSrcweir break; 2047*cdf0e10cSrcweir case 's': 2048*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound ); 2049*cdf0e10cSrcweir break; 2050*cdf0e10cSrcweir case 't': 2051*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) || 2052*cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text ); 2053*cdf0e10cSrcweir break; 2054*cdf0e10cSrcweir case 'u': 2055*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown ); 2056*cdf0e10cSrcweir break; 2057*cdf0e10cSrcweir } 2058*cdf0e10cSrcweir } 2059*cdf0e10cSrcweir else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL ) 2060*cdf0e10cSrcweir { 2061*cdf0e10cSrcweir String aName( rURL.Copy(14) ); 2062*cdf0e10cSrcweir switch( aName.GetChar(0) ) 2063*cdf0e10cSrcweir { 2064*cdf0e10cSrcweir case 'b': 2065*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata ); 2066*cdf0e10cSrcweir break; 2067*cdf0e10cSrcweir case 'd': 2068*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed ); 2069*cdf0e10cSrcweir break; 2070*cdf0e10cSrcweir case 'e': 2071*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed ); 2072*cdf0e10cSrcweir break; 2073*cdf0e10cSrcweir case 'i': 2074*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure ); 2075*cdf0e10cSrcweir break; 2076*cdf0e10cSrcweir case 'n': 2077*cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound ); 2078*cdf0e10cSrcweir break; 2079*cdf0e10cSrcweir } 2080*cdf0e10cSrcweir } 2081*cdf0e10cSrcweir if( bFound ) 2082*cdf0e10cSrcweir { 2083*cdf0e10cSrcweir String sTmp ( rURL ); 2084*cdf0e10cSrcweir rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image ); 2085*cdf0e10cSrcweir rURL.Append( sTmp ); 2086*cdf0e10cSrcweir } 2087*cdf0e10cSrcweir 2088*cdf0e10cSrcweir return bFound; 
2089*cdf0e10cSrcweir } 2090*cdf0e10cSrcweir 2091*cdf0e10cSrcweir #ifdef USED 2092*cdf0e10cSrcweir void HTMLParser::SaveState( int nToken ) 2093*cdf0e10cSrcweir { 2094*cdf0e10cSrcweir SvParser::SaveState( nToken ); 2095*cdf0e10cSrcweir } 2096*cdf0e10cSrcweir 2097*cdf0e10cSrcweir void HTMLParser::RestoreState() 2098*cdf0e10cSrcweir { 2099*cdf0e10cSrcweir SvParser::RestoreState(); 2100*cdf0e10cSrcweir } 2101*cdf0e10cSrcweir #endif 2102*cdf0e10cSrcweir 2103*cdf0e10cSrcweir 2104*cdf0e10cSrcweir enum eHtmlMetas { 2105*cdf0e10cSrcweir HTML_META_NONE = 0, 2106*cdf0e10cSrcweir HTML_META_AUTHOR, 2107*cdf0e10cSrcweir HTML_META_DESCRIPTION, 2108*cdf0e10cSrcweir HTML_META_KEYWORDS, 2109*cdf0e10cSrcweir HTML_META_REFRESH, 2110*cdf0e10cSrcweir HTML_META_CLASSIFICATION, 2111*cdf0e10cSrcweir HTML_META_CREATED, 2112*cdf0e10cSrcweir HTML_META_CHANGEDBY, 2113*cdf0e10cSrcweir HTML_META_CHANGED, 2114*cdf0e10cSrcweir HTML_META_GENERATOR, 2115*cdf0e10cSrcweir HTML_META_SDFOOTNOTE, 2116*cdf0e10cSrcweir HTML_META_SDENDNOTE, 2117*cdf0e10cSrcweir HTML_META_CONTENT_TYPE 2118*cdf0e10cSrcweir }; 2119*cdf0e10cSrcweir 2120*cdf0e10cSrcweir // <META NAME=xxx> 2121*cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] = 2122*cdf0e10cSrcweir { 2123*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR }, 2124*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED }, 2125*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY }, 2126*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION}, 2127*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE }, 2128*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED }, 2129*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION }, 2130*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS }, 2131*cdf0e10cSrcweir { 
OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR }, 2132*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH }, 2133*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE }, 2134*cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE }, 2135*cdf0e10cSrcweir { 0, 0 } 2136*cdf0e10cSrcweir }; 2137*cdf0e10cSrcweir 2138*cdf0e10cSrcweir 2139*cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & ) 2140*cdf0e10cSrcweir { 2141*cdf0e10cSrcweir } 2142*cdf0e10cSrcweir 2143*cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl( 2144*cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2145*cdf0e10cSrcweir SvKeyValueIterator *i_pHTTPHeader, 2146*cdf0e10cSrcweir const HTMLOptions *i_pOptions, 2147*cdf0e10cSrcweir rtl_TextEncoding& o_rEnc ) 2148*cdf0e10cSrcweir { 2149*cdf0e10cSrcweir String aName, aContent; 2150*cdf0e10cSrcweir sal_uInt16 nAction = HTML_META_NONE; 2151*cdf0e10cSrcweir bool bHTTPEquiv = false, bChanged = false; 2152*cdf0e10cSrcweir 2153*cdf0e10cSrcweir for ( sal_uInt16 i = i_pOptions->Count(); i; ) 2154*cdf0e10cSrcweir { 2155*cdf0e10cSrcweir const HTMLOption *pOption = (*i_pOptions)[ --i ]; 2156*cdf0e10cSrcweir switch ( pOption->GetToken() ) 2157*cdf0e10cSrcweir { 2158*cdf0e10cSrcweir case HTML_O_NAME: 2159*cdf0e10cSrcweir aName = pOption->GetString(); 2160*cdf0e10cSrcweir if ( HTML_META_NONE==nAction ) 2161*cdf0e10cSrcweir { 2162*cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2163*cdf0e10cSrcweir } 2164*cdf0e10cSrcweir break; 2165*cdf0e10cSrcweir case HTML_O_HTTPEQUIV: 2166*cdf0e10cSrcweir aName = pOption->GetString(); 2167*cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2168*cdf0e10cSrcweir bHTTPEquiv = true; 2169*cdf0e10cSrcweir break; 2170*cdf0e10cSrcweir case HTML_O_CONTENT: 2171*cdf0e10cSrcweir aContent = pOption->GetString(); 2172*cdf0e10cSrcweir break; 2173*cdf0e10cSrcweir } 
2174*cdf0e10cSrcweir } 2175*cdf0e10cSrcweir 2176*cdf0e10cSrcweir if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction ) 2177*cdf0e10cSrcweir { 2178*cdf0e10cSrcweir // if it is not a Description, remove CRs and LFs from CONTENT 2179*cdf0e10cSrcweir aContent.EraseAllChars( _CR ); 2180*cdf0e10cSrcweir aContent.EraseAllChars( _LF ); 2181*cdf0e10cSrcweir } 2182*cdf0e10cSrcweir else 2183*cdf0e10cSrcweir { 2184*cdf0e10cSrcweir // convert line endings for Description 2185*cdf0e10cSrcweir aContent.ConvertLineEnd(); 2186*cdf0e10cSrcweir } 2187*cdf0e10cSrcweir 2188*cdf0e10cSrcweir 2189*cdf0e10cSrcweir if ( bHTTPEquiv && i_pHTTPHeader ) 2190*cdf0e10cSrcweir { 2191*cdf0e10cSrcweir // #57232#: Netscape seems to just ignore a closing ", so we do too 2192*cdf0e10cSrcweir if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) ) 2193*cdf0e10cSrcweir { 2194*cdf0e10cSrcweir aContent.Erase( aContent.Len() - 1 ); 2195*cdf0e10cSrcweir } 2196*cdf0e10cSrcweir SvKeyValue aKeyValue( aName, aContent ); 2197*cdf0e10cSrcweir i_pHTTPHeader->Append( aKeyValue ); 2198*cdf0e10cSrcweir } 2199*cdf0e10cSrcweir 2200*cdf0e10cSrcweir switch ( nAction ) 2201*cdf0e10cSrcweir { 2202*cdf0e10cSrcweir case HTML_META_AUTHOR: 2203*cdf0e10cSrcweir if (i_xDocProps.is()) { 2204*cdf0e10cSrcweir i_xDocProps->setAuthor( aContent ); 2205*cdf0e10cSrcweir bChanged = true; 2206*cdf0e10cSrcweir } 2207*cdf0e10cSrcweir break; 2208*cdf0e10cSrcweir case HTML_META_DESCRIPTION: 2209*cdf0e10cSrcweir if (i_xDocProps.is()) { 2210*cdf0e10cSrcweir i_xDocProps->setDescription( aContent ); 2211*cdf0e10cSrcweir bChanged = true; 2212*cdf0e10cSrcweir } 2213*cdf0e10cSrcweir break; 2214*cdf0e10cSrcweir case HTML_META_KEYWORDS: 2215*cdf0e10cSrcweir if (i_xDocProps.is()) { 2216*cdf0e10cSrcweir i_xDocProps->setKeywords( 2217*cdf0e10cSrcweir ::comphelper::string::convertCommaSeparated(aContent)); 2218*cdf0e10cSrcweir bChanged = true; 2219*cdf0e10cSrcweir } 2220*cdf0e10cSrcweir break; 2221*cdf0e10cSrcweir case 
HTML_META_CLASSIFICATION: 2222*cdf0e10cSrcweir if (i_xDocProps.is()) { 2223*cdf0e10cSrcweir i_xDocProps->setSubject( aContent ); 2224*cdf0e10cSrcweir bChanged = true; 2225*cdf0e10cSrcweir } 2226*cdf0e10cSrcweir break; 2227*cdf0e10cSrcweir 2228*cdf0e10cSrcweir case HTML_META_CHANGEDBY: 2229*cdf0e10cSrcweir if (i_xDocProps.is()) { 2230*cdf0e10cSrcweir i_xDocProps->setModifiedBy( aContent ); 2231*cdf0e10cSrcweir } 2232*cdf0e10cSrcweir break; 2233*cdf0e10cSrcweir 2234*cdf0e10cSrcweir case HTML_META_CREATED: 2235*cdf0e10cSrcweir case HTML_META_CHANGED: 2236*cdf0e10cSrcweir if ( i_xDocProps.is() && aContent.Len() && 2237*cdf0e10cSrcweir aContent.GetTokenCount() == 2 ) 2238*cdf0e10cSrcweir { 2239*cdf0e10cSrcweir Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() ); 2240*cdf0e10cSrcweir Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() ); 2241*cdf0e10cSrcweir DateTime aDateTime( aDate, aTime ); 2242*cdf0e10cSrcweir ::util::DateTime uDT(aDateTime.Get100Sec(), 2243*cdf0e10cSrcweir aDateTime.GetSec(), aDateTime.GetMin(), 2244*cdf0e10cSrcweir aDateTime.GetHour(), aDateTime.GetDay(), 2245*cdf0e10cSrcweir aDateTime.GetMonth(), aDateTime.GetYear()); 2246*cdf0e10cSrcweir if ( HTML_META_CREATED==nAction ) 2247*cdf0e10cSrcweir i_xDocProps->setCreationDate( uDT ); 2248*cdf0e10cSrcweir else 2249*cdf0e10cSrcweir i_xDocProps->setModificationDate( uDT ); 2250*cdf0e10cSrcweir bChanged = true; 2251*cdf0e10cSrcweir } 2252*cdf0e10cSrcweir break; 2253*cdf0e10cSrcweir 2254*cdf0e10cSrcweir case HTML_META_REFRESH: 2255*cdf0e10cSrcweir DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, 2256*cdf0e10cSrcweir "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" ); 2257*cdf0e10cSrcweir break; 2258*cdf0e10cSrcweir 2259*cdf0e10cSrcweir case HTML_META_CONTENT_TYPE: 2260*cdf0e10cSrcweir if ( aContent.Len() ) 2261*cdf0e10cSrcweir { 2262*cdf0e10cSrcweir o_rEnc = GetEncodingByMIME( aContent ); 2263*cdf0e10cSrcweir } 2264*cdf0e10cSrcweir break; 2265*cdf0e10cSrcweir 2266*cdf0e10cSrcweir case 
HTML_META_NONE: 2267*cdf0e10cSrcweir if ( !bHTTPEquiv ) 2268*cdf0e10cSrcweir { 2269*cdf0e10cSrcweir if (i_xDocProps.is()) 2270*cdf0e10cSrcweir { 2271*cdf0e10cSrcweir uno::Reference<beans::XPropertyContainer> xUDProps 2272*cdf0e10cSrcweir = i_xDocProps->getUserDefinedProperties(); 2273*cdf0e10cSrcweir try { 2274*cdf0e10cSrcweir xUDProps->addProperty(aName, 2275*cdf0e10cSrcweir beans::PropertyAttribute::REMOVEABLE, 2276*cdf0e10cSrcweir uno::makeAny(::rtl::OUString(aContent))); 2277*cdf0e10cSrcweir AddMetaUserDefined(aName); 2278*cdf0e10cSrcweir bChanged = true; 2279*cdf0e10cSrcweir } catch (uno::Exception &) { 2280*cdf0e10cSrcweir // ignore 2281*cdf0e10cSrcweir } 2282*cdf0e10cSrcweir } 2283*cdf0e10cSrcweir } 2284*cdf0e10cSrcweir break; 2285*cdf0e10cSrcweir default: 2286*cdf0e10cSrcweir break; 2287*cdf0e10cSrcweir } 2288*cdf0e10cSrcweir 2289*cdf0e10cSrcweir return bChanged; 2290*cdf0e10cSrcweir } 2291*cdf0e10cSrcweir 2292*cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions( 2293*cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2294*cdf0e10cSrcweir SvKeyValueIterator *i_pHeader ) 2295*cdf0e10cSrcweir { 2296*cdf0e10cSrcweir sal_uInt16 nContentOption = HTML_O_CONTENT; 2297*cdf0e10cSrcweir rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; 2298*cdf0e10cSrcweir 2299*cdf0e10cSrcweir bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, 2300*cdf0e10cSrcweir GetOptions(&nContentOption), 2301*cdf0e10cSrcweir eEnc ); 2302*cdf0e10cSrcweir 2303*cdf0e10cSrcweir // If the encoding is set by a META tag, it may only overwrite the 2304*cdf0e10cSrcweir // current encoding if both, the current and the new encoding, are 1-sal_uInt8 2305*cdf0e10cSrcweir // encodings. Everything else cannot lead to reasonable results. 
2306*cdf0e10cSrcweir if (RTL_TEXTENCODING_DONTKNOW != eEnc && 2307*cdf0e10cSrcweir rtl_isOctetTextEncoding( eEnc ) && 2308*cdf0e10cSrcweir rtl_isOctetTextEncoding( GetSrcEncoding() ) ) 2309*cdf0e10cSrcweir { 2310*cdf0e10cSrcweir eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973# 2311*cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2312*cdf0e10cSrcweir } 2313*cdf0e10cSrcweir 2314*cdf0e10cSrcweir return bRet; 2315*cdf0e10cSrcweir } 2316*cdf0e10cSrcweir 2317*cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime ) 2318*cdf0e10cSrcweir { 2319*cdf0e10cSrcweir ByteString sType; 2320*cdf0e10cSrcweir ByteString sSubType; 2321*cdf0e10cSrcweir INetContentTypeParameterList aParameters; 2322*cdf0e10cSrcweir ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); 2323*cdf0e10cSrcweir if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) 2324*cdf0e10cSrcweir { 2325*cdf0e10cSrcweir const INetContentTypeParameter * pCharset 2326*cdf0e10cSrcweir = aParameters.find("charset"); 2327*cdf0e10cSrcweir if (pCharset != 0) 2328*cdf0e10cSrcweir { 2329*cdf0e10cSrcweir ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); 2330*cdf0e10cSrcweir return GetExtendedCompatibilityTextEncoding( 2331*cdf0e10cSrcweir rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) ); 2332*cdf0e10cSrcweir } 2333*cdf0e10cSrcweir } 2334*cdf0e10cSrcweir return RTL_TEXTENCODING_DONTKNOW; 2335*cdf0e10cSrcweir } 2336*cdf0e10cSrcweir 2337*cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader ) 2338*cdf0e10cSrcweir { 2339*cdf0e10cSrcweir rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; 2340*cdf0e10cSrcweir if( pHTTPHeader ) 2341*cdf0e10cSrcweir { 2342*cdf0e10cSrcweir SvKeyValue aKV; 2343*cdf0e10cSrcweir for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont; 2344*cdf0e10cSrcweir bCont = pHTTPHeader->GetNext( aKV ) ) 2345*cdf0e10cSrcweir { 2346*cdf0e10cSrcweir if( aKV.GetKey().EqualsIgnoreCaseAscii( 
OOO_STRING_SVTOOLS_HTML_META_content_type ) ) 2347*cdf0e10cSrcweir { 2348*cdf0e10cSrcweir if( aKV.GetValue().Len() ) 2349*cdf0e10cSrcweir { 2350*cdf0e10cSrcweir eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() ); 2351*cdf0e10cSrcweir } 2352*cdf0e10cSrcweir } 2353*cdf0e10cSrcweir } 2354*cdf0e10cSrcweir } 2355*cdf0e10cSrcweir return eRet; 2356*cdf0e10cSrcweir } 2357*cdf0e10cSrcweir 2358*cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader( 2359*cdf0e10cSrcweir SvKeyValueIterator *pHTTPHeader ) 2360*cdf0e10cSrcweir { 2361*cdf0e10cSrcweir sal_Bool bRet = sal_False; 2362*cdf0e10cSrcweir rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader ); 2363*cdf0e10cSrcweir if(RTL_TEXTENCODING_DONTKNOW != eEnc) 2364*cdf0e10cSrcweir { 2365*cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2366*cdf0e10cSrcweir bRet = sal_True; 2367*cdf0e10cSrcweir } 2368*cdf0e10cSrcweir return bRet; 2369*cdf0e10cSrcweir } 2370*cdf0e10cSrcweir 2371*cdf0e10cSrcweir 2372