1cdf0e10cSrcweir /************************************************************************* 2cdf0e10cSrcweir * 3cdf0e10cSrcweir * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4cdf0e10cSrcweir * 5cdf0e10cSrcweir * Copyright 2000, 2010 Oracle and/or its affiliates. 6cdf0e10cSrcweir * 7cdf0e10cSrcweir * OpenOffice.org - a multi-platform office productivity suite 8cdf0e10cSrcweir * 9cdf0e10cSrcweir * This file is part of OpenOffice.org. 10cdf0e10cSrcweir * 11cdf0e10cSrcweir * OpenOffice.org is free software: you can redistribute it and/or modify 12cdf0e10cSrcweir * it under the terms of the GNU Lesser General Public License version 3 13cdf0e10cSrcweir * only, as published by the Free Software Foundation. 14cdf0e10cSrcweir * 15cdf0e10cSrcweir * OpenOffice.org is distributed in the hope that it will be useful, 16cdf0e10cSrcweir * but WITHOUT ANY WARRANTY; without even the implied warranty of 17cdf0e10cSrcweir * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18cdf0e10cSrcweir * GNU Lesser General Public License version 3 for more details 19cdf0e10cSrcweir * (a copy is included in the LICENSE file that accompanied this code). 20cdf0e10cSrcweir * 21cdf0e10cSrcweir * You should have received a copy of the GNU Lesser General Public License 22cdf0e10cSrcweir * version 3 along with OpenOffice.org. If not, see 23cdf0e10cSrcweir * <http://www.openoffice.org/license.html> 24cdf0e10cSrcweir * for a copy of the LGPLv3 License. 
25cdf0e10cSrcweir * 26cdf0e10cSrcweir ************************************************************************/ 27cdf0e10cSrcweir 28cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 29cdf0e10cSrcweir #include "precompiled_svtools.hxx" 30cdf0e10cSrcweir 31cdf0e10cSrcweir #include <ctype.h> 32cdf0e10cSrcweir #include <stdio.h> 33cdf0e10cSrcweir #include <tools/stream.hxx> 34cdf0e10cSrcweir #include <tools/debug.hxx> 35cdf0e10cSrcweir #include <tools/color.hxx> 36cdf0e10cSrcweir #include <rtl/ustrbuf.hxx> 37cdf0e10cSrcweir #include <rtl/strbuf.hxx> 38cdf0e10cSrcweir #ifndef _SVSTDARR_HXX 39cdf0e10cSrcweir #define _SVSTDARR_ULONGS 40cdf0e10cSrcweir #include <svl/svstdarr.hxx> 41cdf0e10cSrcweir #endif 42cdf0e10cSrcweir 43cdf0e10cSrcweir #include <tools/tenccvt.hxx> 44cdf0e10cSrcweir #include <tools/datetime.hxx> 45cdf0e10cSrcweir #include <svl/inettype.hxx> 46cdf0e10cSrcweir #include <comphelper/string.hxx> 47cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp> 48cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp> 49cdf0e10cSrcweir 50cdf0e10cSrcweir #include <svtools/parhtml.hxx> 51cdf0e10cSrcweir #include <svtools/htmltokn.h> 52cdf0e10cSrcweir #include <svtools/htmlkywd.hxx> 53cdf0e10cSrcweir 54cdf0e10cSrcweir 55cdf0e10cSrcweir using namespace ::com::sun::star; 56cdf0e10cSrcweir 57cdf0e10cSrcweir 58cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L ); 59cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ]; 60cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 ); 61cdf0e10cSrcweir 62cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L ); 63cdf0e10cSrcweir 64cdf0e10cSrcweir /* */ 65cdf0e10cSrcweir 66cdf0e10cSrcweir // Tabellen zum Umwandeln von Options-Werten in Strings 67cdf0e10cSrcweir 68cdf0e10cSrcweir // <INPUT TYPE=xxx> 69cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] = 70cdf0e10cSrcweir { 71cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT }, 
72cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD }, 73cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX }, 74cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO }, 75cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE }, 76cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE }, 77cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE }, 78cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN }, 79cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT }, 80cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE }, 81cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET }, 82cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON }, 83cdf0e10cSrcweir { 0, 0 } 84cdf0e10cSrcweir }; 85cdf0e10cSrcweir 86cdf0e10cSrcweir // <TABLE FRAME=xxx> 87cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] = 88cdf0e10cSrcweir { 89cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID }, 90cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE }, 91cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW }, 92cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES }, 93cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS }, 94cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS }, 95cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES }, 96cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX }, 97cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX }, 98cdf0e10cSrcweir { 0, 0 } 99cdf0e10cSrcweir }; 100cdf0e10cSrcweir 101cdf0e10cSrcweir // <TABLE RULES=xxx> 102cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] = 103cdf0e10cSrcweir { 104cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE }, 105cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS }, 
106cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS }, 107cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS }, 108cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL }, 109cdf0e10cSrcweir { 0, 0 } 110cdf0e10cSrcweir }; 111cdf0e10cSrcweir 112cdf0e10cSrcweir 113cdf0e10cSrcweir SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr) 114cdf0e10cSrcweir 115cdf0e10cSrcweir /* */ 116cdf0e10cSrcweir 117cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const 118cdf0e10cSrcweir { 119cdf0e10cSrcweir sal_uInt16 nValue = nDflt; 120cdf0e10cSrcweir 121cdf0e10cSrcweir while( pOptEnums->pName ) 122cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 123cdf0e10cSrcweir break; 124cdf0e10cSrcweir else 125cdf0e10cSrcweir pOptEnums++; 126cdf0e10cSrcweir 127cdf0e10cSrcweir if( pOptEnums->pName ) 128cdf0e10cSrcweir nValue = pOptEnums->nValue; 129cdf0e10cSrcweir 130cdf0e10cSrcweir return nValue; 131cdf0e10cSrcweir } 132cdf0e10cSrcweir 133cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const 134cdf0e10cSrcweir { 135cdf0e10cSrcweir while( pOptEnums->pName ) 136cdf0e10cSrcweir { 137cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 138cdf0e10cSrcweir break; 139cdf0e10cSrcweir else 140cdf0e10cSrcweir pOptEnums++; 141cdf0e10cSrcweir } 142cdf0e10cSrcweir 143cdf0e10cSrcweir const sal_Char *pName = pOptEnums->pName; 144cdf0e10cSrcweir if( pName ) 145cdf0e10cSrcweir rEnum = pOptEnums->nValue; 146cdf0e10cSrcweir 147cdf0e10cSrcweir return (pName != 0); 148cdf0e10cSrcweir } 149cdf0e10cSrcweir 150cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken, 151cdf0e10cSrcweir const String& rValue ) 152cdf0e10cSrcweir : aValue(rValue) 153cdf0e10cSrcweir , aToken(rToken) 154cdf0e10cSrcweir , nToken( nTok ) 155cdf0e10cSrcweir { 156cdf0e10cSrcweir DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END, 
157cdf0e10cSrcweir "HTMLOption: unbekanntes Token" ); 158cdf0e10cSrcweir } 159cdf0e10cSrcweir 160cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const 161cdf0e10cSrcweir { 162cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && 163cdf0e10cSrcweir nToken<HTML_OPTION_NUMBER_END) || 164cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && 165cdf0e10cSrcweir nToken<HTML_OPTION_CONTEXT_END) || 166cdf0e10cSrcweir nToken==HTML_O_VALUE, 167cdf0e10cSrcweir "GetNumber: Option ist nicht numerisch" ); 168cdf0e10cSrcweir String aTmp( aValue ); 169cdf0e10cSrcweir aTmp.EraseLeadingChars(); 170cdf0e10cSrcweir sal_Int32 nTmp = aTmp.ToInt32(); 171cdf0e10cSrcweir return nTmp >= 0 ? (sal_uInt32)nTmp : 0; 172cdf0e10cSrcweir } 173cdf0e10cSrcweir 174cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const 175cdf0e10cSrcweir { 176cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) || 177cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END), 178cdf0e10cSrcweir "GetSNumber: Option ist nicht numerisch" ); 179cdf0e10cSrcweir String aTmp( aValue ); 180cdf0e10cSrcweir aTmp.EraseLeadingChars(); 181cdf0e10cSrcweir return aTmp.ToInt32(); 182cdf0e10cSrcweir } 183cdf0e10cSrcweir 184cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const 185cdf0e10cSrcweir { 186cdf0e10cSrcweir if( rLongs.Count() ) 187cdf0e10cSrcweir rLongs.Remove( 0, rLongs.Count() ); 188cdf0e10cSrcweir 189cdf0e10cSrcweir if( bSpaceDelim ) 190cdf0e10cSrcweir { 191cdf0e10cSrcweir // das ist ein sehr stark vereinfachter Scanner. 
Er sucht einfach 192cdf0e10cSrcweir // alle Tiffern aus dem String 193cdf0e10cSrcweir sal_Bool bInNum = sal_False; 194cdf0e10cSrcweir sal_uLong nNum = 0; 195cdf0e10cSrcweir for( xub_StrLen i=0; i<aValue.Len(); i++ ) 196cdf0e10cSrcweir { 197cdf0e10cSrcweir register sal_Unicode c = aValue.GetChar( i ); 198cdf0e10cSrcweir if( c>='0' && c<='9' ) 199cdf0e10cSrcweir { 200cdf0e10cSrcweir nNum *= 10; 201cdf0e10cSrcweir nNum += (c - '0'); 202cdf0e10cSrcweir bInNum = sal_True; 203cdf0e10cSrcweir } 204cdf0e10cSrcweir else if( bInNum ) 205cdf0e10cSrcweir { 206cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 207cdf0e10cSrcweir bInNum = sal_False; 208cdf0e10cSrcweir nNum = 0; 209cdf0e10cSrcweir } 210cdf0e10cSrcweir } 211cdf0e10cSrcweir if( bInNum ) 212cdf0e10cSrcweir { 213cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 214cdf0e10cSrcweir } 215cdf0e10cSrcweir } 216cdf0e10cSrcweir else 217cdf0e10cSrcweir { 218cdf0e10cSrcweir // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet 219cdf0e10cSrcweir // und auch mal eine 0 eingefuegt 220cdf0e10cSrcweir xub_StrLen nPos = 0; 221cdf0e10cSrcweir while( nPos < aValue.Len() ) 222cdf0e10cSrcweir { 223cdf0e10cSrcweir register sal_Unicode c; 224cdf0e10cSrcweir while( nPos < aValue.Len() && 225cdf0e10cSrcweir ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' || 226cdf0e10cSrcweir c == '\n' || c== '\r' ) ) 227cdf0e10cSrcweir nPos++; 228cdf0e10cSrcweir 229cdf0e10cSrcweir if( nPos==aValue.Len() ) 230cdf0e10cSrcweir rLongs.Insert( sal_uLong(0), rLongs.Count() ); 231cdf0e10cSrcweir else 232cdf0e10cSrcweir { 233cdf0e10cSrcweir xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos ); 234cdf0e10cSrcweir if( STRING_NOTFOUND==nEnd ) 235cdf0e10cSrcweir { 236cdf0e10cSrcweir sal_Int32 nTmp = aValue.Copy(nPos).ToInt32(); 237cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? 
(sal_uInt32)nTmp : 0, 238cdf0e10cSrcweir rLongs.Count() ); 239cdf0e10cSrcweir nPos = aValue.Len(); 240cdf0e10cSrcweir } 241cdf0e10cSrcweir else 242cdf0e10cSrcweir { 243cdf0e10cSrcweir sal_Int32 nTmp = 244cdf0e10cSrcweir aValue.Copy(nPos,nEnd-nPos).ToInt32(); 245cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0, 246cdf0e10cSrcweir rLongs.Count() ); 247cdf0e10cSrcweir nPos = nEnd+1; 248cdf0e10cSrcweir } 249cdf0e10cSrcweir } 250cdf0e10cSrcweir } 251cdf0e10cSrcweir } 252cdf0e10cSrcweir } 253cdf0e10cSrcweir 254cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const 255cdf0e10cSrcweir { 256cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE, 257cdf0e10cSrcweir "GetColor: Option spezifiziert keine Farbe" ); 258cdf0e10cSrcweir 259cdf0e10cSrcweir String aTmp( aValue ); 260cdf0e10cSrcweir aTmp.ToUpperAscii(); 261cdf0e10cSrcweir sal_uLong nColor = ULONG_MAX; 262cdf0e10cSrcweir if( '#'!=aTmp.GetChar( 0 ) ) 263cdf0e10cSrcweir nColor = GetHTMLColor( aTmp ); 264cdf0e10cSrcweir 265cdf0e10cSrcweir if( ULONG_MAX == nColor ) 266cdf0e10cSrcweir { 267cdf0e10cSrcweir nColor = 0; 268cdf0e10cSrcweir xub_StrLen nPos = 0; 269cdf0e10cSrcweir for( sal_uInt32 i=0; i<6; i++ ) 270cdf0e10cSrcweir { 271cdf0e10cSrcweir // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt, 272cdf0e10cSrcweir // maximal drei Zeichen, die kleiner als '0' sind werden 273cdf0e10cSrcweir // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich 274cdf0e10cSrcweir // irgendwelche HTML-Autoren noch so einfallen lassen... 275cdf0e10cSrcweir register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ ) 276cdf0e10cSrcweir : '0'; 277cdf0e10cSrcweir if( c < '0' ) 278cdf0e10cSrcweir { 279cdf0e10cSrcweir c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0'; 280cdf0e10cSrcweir if( c < '0' ) 281cdf0e10cSrcweir c = nPos<aTmp.Len() ? 
aTmp.GetChar(nPos++) : '0'; 282cdf0e10cSrcweir } 283cdf0e10cSrcweir nColor *= 16; 284cdf0e10cSrcweir if( c >= '0' && c <= '9' ) 285cdf0e10cSrcweir nColor += (c - 48); 286cdf0e10cSrcweir else if( c >= 'A' && c <= 'F' ) 287cdf0e10cSrcweir nColor += (c - 55); 288cdf0e10cSrcweir } 289cdf0e10cSrcweir } 290cdf0e10cSrcweir 291cdf0e10cSrcweir rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) ); 292cdf0e10cSrcweir rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8)); 293cdf0e10cSrcweir rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) ); 294cdf0e10cSrcweir } 295cdf0e10cSrcweir 296cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const 297cdf0e10cSrcweir { 298cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" ); 299cdf0e10cSrcweir return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT ); 300cdf0e10cSrcweir } 301cdf0e10cSrcweir 302cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const 303cdf0e10cSrcweir { 304cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" ); 305cdf0e10cSrcweir return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID ); 306cdf0e10cSrcweir } 307cdf0e10cSrcweir 308cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const 309cdf0e10cSrcweir { 310cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" ); 311cdf0e10cSrcweir return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE ); 312cdf0e10cSrcweir } 313cdf0e10cSrcweir 314cdf0e10cSrcweir /* */ 315cdf0e10cSrcweir 316cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc ) 317cdf0e10cSrcweir : SvParser( rIn ) 318cdf0e10cSrcweir { 319cdf0e10cSrcweir bNewDoc = bReadNewDoc; 320cdf0e10cSrcweir bReadListing = bReadXMP = bReadPRE = bReadTextArea = 321cdf0e10cSrcweir bReadScript = bReadStyle = 322cdf0e10cSrcweir bEndTokenFound = bIsInBody = bReadNextChar = 323cdf0e10cSrcweir bReadComment = sal_False; 324cdf0e10cSrcweir bIsInHeader = sal_True; 
325cdf0e10cSrcweir pOptions = new HTMLOptions; 326*8d621361SPedro Giffuni 327*8d621361SPedro Giffuni //#i76649, default to UTF-8 for HTML unless we know differently 328*8d621361SPedro Giffuni SetSrcEncoding(RTL_TEXTENCODING_UTF8); 329cdf0e10cSrcweir } 330cdf0e10cSrcweir 331cdf0e10cSrcweir HTMLParser::~HTMLParser() 332cdf0e10cSrcweir { 333cdf0e10cSrcweir if( pOptions && pOptions->Count() ) 334cdf0e10cSrcweir pOptions->DeleteAndDestroy( 0, pOptions->Count() ); 335cdf0e10cSrcweir delete pOptions; 336cdf0e10cSrcweir } 337cdf0e10cSrcweir 338cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser() 339cdf0e10cSrcweir { 340cdf0e10cSrcweir eState = SVPAR_WORKING; 341cdf0e10cSrcweir nNextCh = GetNextChar(); 342cdf0e10cSrcweir SaveState( 0 ); 343cdf0e10cSrcweir 344cdf0e10cSrcweir nPre_LinePos = 0; 345cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 346cdf0e10cSrcweir 347cdf0e10cSrcweir AddRef(); 348cdf0e10cSrcweir Continue( 0 ); 349cdf0e10cSrcweir if( SVPAR_PENDING != eState ) 350cdf0e10cSrcweir ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 351cdf0e10cSrcweir 352cdf0e10cSrcweir return eState; 353cdf0e10cSrcweir } 354cdf0e10cSrcweir 355cdf0e10cSrcweir void HTMLParser::Continue( int nToken ) 356cdf0e10cSrcweir { 357cdf0e10cSrcweir if( !nToken ) 358cdf0e10cSrcweir nToken = GetNextToken(); 359cdf0e10cSrcweir 360cdf0e10cSrcweir while( IsParserWorking() ) 361cdf0e10cSrcweir { 362cdf0e10cSrcweir SaveState( nToken ); 363cdf0e10cSrcweir nToken = FilterToken( nToken ); 364cdf0e10cSrcweir 365cdf0e10cSrcweir if( nToken ) 366cdf0e10cSrcweir NextToken( nToken ); 367cdf0e10cSrcweir 368cdf0e10cSrcweir if( IsParserWorking() ) 369cdf0e10cSrcweir SaveState( 0 ); // bis hierhin abgearbeitet, 370cdf0e10cSrcweir // weiter mit neuem Token! 
371cdf0e10cSrcweir nToken = GetNextToken(); 372cdf0e10cSrcweir } 373cdf0e10cSrcweir } 374cdf0e10cSrcweir 375cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken ) 376cdf0e10cSrcweir { 377cdf0e10cSrcweir switch( nToken ) 378cdf0e10cSrcweir { 379cdf0e10cSrcweir case sal_Unicode(EOF): 380cdf0e10cSrcweir nToken = 0; 381cdf0e10cSrcweir break; // nicht verschicken 382cdf0e10cSrcweir 383cdf0e10cSrcweir case HTML_HEAD_OFF: 384cdf0e10cSrcweir bIsInBody = sal_True; 385cdf0e10cSrcweir case HTML_HEAD_ON: 386cdf0e10cSrcweir bIsInHeader = HTML_HEAD_ON == nToken; 387cdf0e10cSrcweir break; 388cdf0e10cSrcweir 389cdf0e10cSrcweir case HTML_BODY_ON: 390cdf0e10cSrcweir case HTML_FRAMESET_ON: 391cdf0e10cSrcweir bIsInHeader = sal_False; 392cdf0e10cSrcweir bIsInBody = HTML_BODY_ON == nToken; 393cdf0e10cSrcweir break; 394cdf0e10cSrcweir 395cdf0e10cSrcweir case HTML_BODY_OFF: 396cdf0e10cSrcweir bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False; 397cdf0e10cSrcweir break; 398cdf0e10cSrcweir 399cdf0e10cSrcweir case HTML_HTML_OFF: 400cdf0e10cSrcweir nToken = 0; 401cdf0e10cSrcweir bReadPRE = bReadListing = bReadXMP = sal_False; 402cdf0e10cSrcweir break; // HTML_ON wurde auch nicht verschickt ! 
403cdf0e10cSrcweir 404cdf0e10cSrcweir case HTML_PREFORMTXT_ON: 405cdf0e10cSrcweir StartPRE(); 406cdf0e10cSrcweir break; 407cdf0e10cSrcweir 408cdf0e10cSrcweir case HTML_PREFORMTXT_OFF: 409cdf0e10cSrcweir FinishPRE(); 410cdf0e10cSrcweir break; 411cdf0e10cSrcweir 412cdf0e10cSrcweir case HTML_LISTING_ON: 413cdf0e10cSrcweir StartListing(); 414cdf0e10cSrcweir break; 415cdf0e10cSrcweir 416cdf0e10cSrcweir case HTML_LISTING_OFF: 417cdf0e10cSrcweir FinishListing(); 418cdf0e10cSrcweir break; 419cdf0e10cSrcweir 420cdf0e10cSrcweir case HTML_XMP_ON: 421cdf0e10cSrcweir StartXMP(); 422cdf0e10cSrcweir break; 423cdf0e10cSrcweir 424cdf0e10cSrcweir case HTML_XMP_OFF: 425cdf0e10cSrcweir FinishXMP(); 426cdf0e10cSrcweir break; 427cdf0e10cSrcweir 428cdf0e10cSrcweir default: 429cdf0e10cSrcweir if( bReadPRE ) 430cdf0e10cSrcweir nToken = FilterPRE( nToken ); 431cdf0e10cSrcweir else if( bReadListing ) 432cdf0e10cSrcweir nToken = FilterListing( nToken ); 433cdf0e10cSrcweir else if( bReadXMP ) 434cdf0e10cSrcweir nToken = FilterXMP( nToken ); 435cdf0e10cSrcweir 436cdf0e10cSrcweir break; 437cdf0e10cSrcweir } 438cdf0e10cSrcweir 439cdf0e10cSrcweir return nToken; 440cdf0e10cSrcweir } 441cdf0e10cSrcweir 442cdf0e10cSrcweir #define HTML_ISDIGIT( c ) (c >= '0' && c <= '9') 443cdf0e10cSrcweir #define HTML_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 444cdf0e10cSrcweir #define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) ) 445cdf0e10cSrcweir #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) ) 446cdf0e10cSrcweir #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127) 447cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 448cdf0e10cSrcweir #define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') ) 449cdf0e10cSrcweir // <-- 450cdf0e10cSrcweir 451cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak ) 452cdf0e10cSrcweir { 453cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN ); 454cdf0e10cSrcweir int bWeiter = 
sal_True; 455cdf0e10cSrcweir int bEqSignFound = sal_False; 456cdf0e10cSrcweir sal_Unicode cQuote = 0U; 457cdf0e10cSrcweir 458cdf0e10cSrcweir while( bWeiter && IsParserWorking() ) 459cdf0e10cSrcweir { 460cdf0e10cSrcweir int bNextCh = sal_True; 461cdf0e10cSrcweir switch( nNextCh ) 462cdf0e10cSrcweir { 463cdf0e10cSrcweir case '&': 464cdf0e10cSrcweir bEqSignFound = sal_False; 465cdf0e10cSrcweir if( bReadXMP ) 466cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 467cdf0e10cSrcweir else 468cdf0e10cSrcweir { 469cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell(); 470cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos(); 471cdf0e10cSrcweir 472cdf0e10cSrcweir sal_Unicode cChar = 0U; 473cdf0e10cSrcweir if( '#' == (nNextCh = GetNextChar()) ) 474cdf0e10cSrcweir { 475cdf0e10cSrcweir nNextCh = GetNextChar(); 476cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 477cdf0e10cSrcweir // consider hexadecimal digits 478cdf0e10cSrcweir const sal_Bool bIsHex( 'x' == nNextCh ); 479cdf0e10cSrcweir const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) ); 480cdf0e10cSrcweir if ( bIsDecOrHex ) 481cdf0e10cSrcweir { 482cdf0e10cSrcweir if ( bIsHex ) 483cdf0e10cSrcweir { 484cdf0e10cSrcweir nNextCh = GetNextChar(); 485cdf0e10cSrcweir while ( HTML_ISHEXDIGIT(nNextCh) ) 486cdf0e10cSrcweir { 487cdf0e10cSrcweir cChar = cChar * 16U + 488cdf0e10cSrcweir ( nNextCh <= '9' 489cdf0e10cSrcweir ? sal_Unicode( nNextCh - '0' ) 490cdf0e10cSrcweir : ( nNextCh <= 'F' 491cdf0e10cSrcweir ? 
sal_Unicode( nNextCh - 'A' + 10 ) 492cdf0e10cSrcweir : sal_Unicode( nNextCh - 'a' + 10 ) ) ); 493cdf0e10cSrcweir nNextCh = GetNextChar(); 494cdf0e10cSrcweir } 495cdf0e10cSrcweir } 496cdf0e10cSrcweir else 497cdf0e10cSrcweir { 498cdf0e10cSrcweir do 499cdf0e10cSrcweir { 500cdf0e10cSrcweir cChar = cChar * 10U + sal_Unicode( nNextCh - '0'); 501cdf0e10cSrcweir nNextCh = GetNextChar(); 502cdf0e10cSrcweir } 503cdf0e10cSrcweir while( HTML_ISDIGIT(nNextCh) ); 504cdf0e10cSrcweir } 505cdf0e10cSrcweir 506cdf0e10cSrcweir if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc && 507cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 != eSrcEnc && 508cdf0e10cSrcweir RTL_TEXTENCODING_UTF8 != eSrcEnc && 509cdf0e10cSrcweir cChar < 256 ) 510cdf0e10cSrcweir { 511cdf0e10cSrcweir sal_Unicode cOrig = cChar; 512cdf0e10cSrcweir cChar = ByteString::ConvertToUnicode( 513cdf0e10cSrcweir (sal_Char)cChar, eSrcEnc ); 514cdf0e10cSrcweir if( 0U == cChar ) 515cdf0e10cSrcweir { 516cdf0e10cSrcweir // #73398#: If the character could not be 517cdf0e10cSrcweir // converted, because a conversion is not 518cdf0e10cSrcweir // available, do no conversion at all. 
519cdf0e10cSrcweir cChar = cOrig; 520cdf0e10cSrcweir } 521cdf0e10cSrcweir } 522cdf0e10cSrcweir } 523cdf0e10cSrcweir // <-- 524cdf0e10cSrcweir else 525cdf0e10cSrcweir nNextCh = 0U; 526cdf0e10cSrcweir } 527cdf0e10cSrcweir else if( HTML_ISALPHA( nNextCh ) ) 528cdf0e10cSrcweir { 529cdf0e10cSrcweir ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN ); 530cdf0e10cSrcweir xub_StrLen nPos = 0L; 531cdf0e10cSrcweir do 532cdf0e10cSrcweir { 533cdf0e10cSrcweir sEntityBuffer.append( nNextCh ); 534cdf0e10cSrcweir nPos++; 535cdf0e10cSrcweir nNextCh = GetNextChar(); 536cdf0e10cSrcweir } 537cdf0e10cSrcweir while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) && 538cdf0e10cSrcweir !rInput.IsEof() ); 539cdf0e10cSrcweir 540cdf0e10cSrcweir if( IsParserWorking() && !rInput.IsEof() ) 541cdf0e10cSrcweir { 542cdf0e10cSrcweir String sEntity( sEntityBuffer.getStr(), nPos ); 543cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 544cdf0e10cSrcweir 545cdf0e10cSrcweir // nicht gefunden ( == 0 ), dann Klartext 546cdf0e10cSrcweir // oder ein Zeichen das als Attribut eingefuegt 547cdf0e10cSrcweir // wird 548cdf0e10cSrcweir if( 0U == cChar && ';' != nNextCh ) 549cdf0e10cSrcweir { 550cdf0e10cSrcweir DBG_ASSERT( rInput.Tell() - nStreamPos == 551cdf0e10cSrcweir (sal_uLong)(nPos+1L)*GetCharSize(), 552cdf0e10cSrcweir "UTF-8 geht hier schief" ); 553cdf0e10cSrcweir for( xub_StrLen i=nPos-1L; i>1L; i-- ) 554cdf0e10cSrcweir { 555cdf0e10cSrcweir nNextCh = sEntityBuffer[i]; 556cdf0e10cSrcweir sEntityBuffer.setLength( i ); 557cdf0e10cSrcweir sEntity.Assign( sEntityBuffer.getStr(), i ); 558cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 559cdf0e10cSrcweir if( cChar ) 560cdf0e10cSrcweir { 561cdf0e10cSrcweir rInput.SeekRel( -(long) 562cdf0e10cSrcweir ((nPos-i)*GetCharSize()) ); 563cdf0e10cSrcweir nlLinePos -= sal_uInt32(nPos-i); 564cdf0e10cSrcweir nPos = i; 565cdf0e10cSrcweir ClearTxtConvContext(); 566cdf0e10cSrcweir break; 567cdf0e10cSrcweir } 568cdf0e10cSrcweir } 569cdf0e10cSrcweir } 570cdf0e10cSrcweir 
571cdf0e10cSrcweir if( !cChar ) // unbekanntes Zeichen? 572cdf0e10cSrcweir { 573cdf0e10cSrcweir // dann im Stream zurueck, das '&' als Zeichen 574cdf0e10cSrcweir // einfuegen und mit dem nachfolgenden Zeichen 575cdf0e10cSrcweir // wieder aufsetzen 576cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 577cdf0e10cSrcweir 578cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 579cdf0e10cSrcweir // nlLinePos -= nPos; 580cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 581cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 582cdf0e10cSrcweir "Falsche Stream-Position" ); 583cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 584cdf0e10cSrcweir (sal_uLong)(nPos+1), 585cdf0e10cSrcweir "Falsche Zeilen-Position" ); 586cdf0e10cSrcweir rInput.Seek( nStreamPos ); 587cdf0e10cSrcweir nlLinePos = nLinePos; 588cdf0e10cSrcweir ClearTxtConvContext(); 589cdf0e10cSrcweir break; 590cdf0e10cSrcweir } 591cdf0e10cSrcweir 592cdf0e10cSrcweir // 1 == Non Breaking Space 593cdf0e10cSrcweir // 2 == SoftHyphen 594cdf0e10cSrcweir 595cdf0e10cSrcweir if( cChar < 3U ) 596cdf0e10cSrcweir { 597cdf0e10cSrcweir if( '>' == cBreak ) 598cdf0e10cSrcweir { 599cdf0e10cSrcweir // Wenn der Inhalt eines Tags gelesen wird, 600cdf0e10cSrcweir // muessen wir ein Space bzw. - daraus machen 601cdf0e10cSrcweir switch( cChar ) 602cdf0e10cSrcweir { 603cdf0e10cSrcweir case 1U: cChar = ' '; break; 604cdf0e10cSrcweir case 2U: cChar = '-'; break; 605cdf0e10cSrcweir default: 606cdf0e10cSrcweir DBG_ASSERT( cChar==1U, 607cdf0e10cSrcweir "\0x00 sollte doch schon laengt abgefangen sein!" 
); 608cdf0e10cSrcweir break; 609cdf0e10cSrcweir } 610cdf0e10cSrcweir } 611cdf0e10cSrcweir else 612cdf0e10cSrcweir { 613cdf0e10cSrcweir // Wenn kein Tag gescannt wird, enstprechendes 614cdf0e10cSrcweir // Token zurueckgeben 615cdf0e10cSrcweir aToken += 616cdf0e10cSrcweir String( sTmpBuffer.makeStringAndClear() ); 617cdf0e10cSrcweir if( cChar ) 618cdf0e10cSrcweir { 619cdf0e10cSrcweir if( aToken.Len() ) 620cdf0e10cSrcweir { 621cdf0e10cSrcweir // mit dem Zeichen wieder aufsetzen 622cdf0e10cSrcweir nNextCh = '&'; 623cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 624cdf0e10cSrcweir // nlLinePos -= nPos; 625cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 626cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 627cdf0e10cSrcweir "Falsche Stream-Position" ); 628cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 629cdf0e10cSrcweir (sal_uLong)(nPos+1), 630cdf0e10cSrcweir "Falsche Zeilen-Position" ); 631cdf0e10cSrcweir rInput.Seek( nStreamPos ); 632cdf0e10cSrcweir nlLinePos = nLinePos; 633cdf0e10cSrcweir ClearTxtConvContext(); 634cdf0e10cSrcweir return HTML_TEXTTOKEN; 635cdf0e10cSrcweir } 636cdf0e10cSrcweir 637cdf0e10cSrcweir // Hack: _GetNextChar soll nicht das 638cdf0e10cSrcweir // naechste Zeichen lesen 639cdf0e10cSrcweir if( ';' != nNextCh ) 640cdf0e10cSrcweir aToken += ' '; 641cdf0e10cSrcweir if( 1U == cChar ) 642cdf0e10cSrcweir return HTML_NONBREAKSPACE; 643cdf0e10cSrcweir if( 2U == cChar ) 644cdf0e10cSrcweir return HTML_SOFTHYPH; 645cdf0e10cSrcweir } 646cdf0e10cSrcweir aToken += (sal_Unicode)'&'; 647cdf0e10cSrcweir aToken += 648cdf0e10cSrcweir String(sEntityBuffer.makeStringAndClear()); 649cdf0e10cSrcweir break; 650cdf0e10cSrcweir } 651cdf0e10cSrcweir } 652cdf0e10cSrcweir } 653cdf0e10cSrcweir else 654cdf0e10cSrcweir nNextCh = 0U; 655cdf0e10cSrcweir } 656cdf0e10cSrcweir // MIB 03/02/2000: &{...};-JavaScript-Macros are not 657cdf0e10cSrcweir // supported any longer. 
658cdf0e10cSrcweir else if( IsParserWorking() ) 659cdf0e10cSrcweir { 660cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 661cdf0e10cSrcweir bNextCh = sal_False; 662cdf0e10cSrcweir break; 663cdf0e10cSrcweir } 664cdf0e10cSrcweir 665cdf0e10cSrcweir bNextCh = (';' == nNextCh); 666cdf0e10cSrcweir if( cBreak=='>' && (cChar=='\\' || cChar=='\'' || 667cdf0e10cSrcweir cChar=='\"' || cChar==' ') ) 668cdf0e10cSrcweir { 669cdf0e10cSrcweir // ' und " mussen innerhalb von Tags mit einem 670cdf0e10cSrcweir // gekennzeichnet werden, um sie von ' und " als Klammern 671cdf0e10cSrcweir // um Optionen zu unterscheiden. Logischerweise muss 672cdf0e10cSrcweir // deshalb auch ein \ gekeenzeichnet werden. Ausserdem 673cdf0e10cSrcweir // schuetzen wir ein Space, weil es kein Trennzeichen 674cdf0e10cSrcweir // zwischen Optionen ist. 675cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 676cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 677cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 678cdf0e10cSrcweir } 679cdf0e10cSrcweir if( IsParserWorking() ) 680cdf0e10cSrcweir { 681cdf0e10cSrcweir if( cChar ) 682cdf0e10cSrcweir sTmpBuffer.append( cChar ); 683cdf0e10cSrcweir } 684cdf0e10cSrcweir else if( SVPAR_PENDING==eState && '>'!=cBreak ) 685cdf0e10cSrcweir { 686cdf0e10cSrcweir // Mit dem '&' Zeichen wieder aufsetzen, der Rest 687cdf0e10cSrcweir // wird als Texttoken zurueckgegeben. 688cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() ) 689cdf0e10cSrcweir { 690cdf0e10cSrcweir // Der bisherige Text wird von _GetNextChar() 691cdf0e10cSrcweir // zurueckgegeben und beim naechsten Aufruf wird 692cdf0e10cSrcweir // ein neues Zeichen gelesen. Also muessen wir uns 693cdf0e10cSrcweir // noch vor das & stellen. 
694cdf0e10cSrcweir nNextCh = 0U; 695cdf0e10cSrcweir rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() ); 696cdf0e10cSrcweir nlLinePos = nLinePos-1; 697cdf0e10cSrcweir ClearTxtConvContext(); 698cdf0e10cSrcweir bReadNextChar = sal_True; 699cdf0e10cSrcweir } 700cdf0e10cSrcweir bNextCh = sal_False; 701cdf0e10cSrcweir } 702cdf0e10cSrcweir } 703cdf0e10cSrcweir break; 704cdf0e10cSrcweir case '=': 705cdf0e10cSrcweir if( '>'==cBreak && !cQuote ) 706cdf0e10cSrcweir bEqSignFound = sal_True; 707cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 708cdf0e10cSrcweir break; 709cdf0e10cSrcweir 710cdf0e10cSrcweir case '\\': 711cdf0e10cSrcweir if( '>'==cBreak ) 712cdf0e10cSrcweir { 713cdf0e10cSrcweir // Innerhalb von Tags kennzeichnen 714cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 715cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 716cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 717cdf0e10cSrcweir } 718cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 719cdf0e10cSrcweir break; 720cdf0e10cSrcweir 721cdf0e10cSrcweir case '\"': 722cdf0e10cSrcweir case '\'': 723cdf0e10cSrcweir if( '>'==cBreak ) 724cdf0e10cSrcweir { 725cdf0e10cSrcweir if( bEqSignFound ) 726cdf0e10cSrcweir cQuote = nNextCh; 727cdf0e10cSrcweir else if( cQuote && (cQuote==nNextCh ) ) 728cdf0e10cSrcweir cQuote = 0U; 729cdf0e10cSrcweir } 730cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 731cdf0e10cSrcweir bEqSignFound = sal_False; 732cdf0e10cSrcweir break; 733cdf0e10cSrcweir 734cdf0e10cSrcweir case sal_Unicode(EOF): 735cdf0e10cSrcweir if( rInput.IsEof() ) 736cdf0e10cSrcweir { 737cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird 738cdf0e10cSrcweir // abcä<EOF> nicht angezeigt, also lassen wir das in Zukunft. 
739cdf0e10cSrcweir // if( '>' != cBreak ) 740cdf0e10cSrcweir // eState = SVPAR_ACCEPTED; 741cdf0e10cSrcweir bWeiter = sal_False; 742cdf0e10cSrcweir } 743cdf0e10cSrcweir else 744cdf0e10cSrcweir { 745cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 746cdf0e10cSrcweir } 747cdf0e10cSrcweir break; 748cdf0e10cSrcweir 749cdf0e10cSrcweir case '<': 750cdf0e10cSrcweir bEqSignFound = sal_False; 751cdf0e10cSrcweir if( '>'==cBreak ) 752cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 753cdf0e10cSrcweir else 754cdf0e10cSrcweir bWeiter = sal_False; // Abbrechen, String zusammen 755cdf0e10cSrcweir break; 756cdf0e10cSrcweir 757cdf0e10cSrcweir case '\f': 758cdf0e10cSrcweir if( '>' == cBreak ) 759cdf0e10cSrcweir { 760cdf0e10cSrcweir // Beim Scannen von Optionen wie ein Space behandeln 761cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)' ' ); 762cdf0e10cSrcweir } 763cdf0e10cSrcweir else 764cdf0e10cSrcweir { 765cdf0e10cSrcweir // sonst wird es ein eigenes Token 766cdf0e10cSrcweir bWeiter = sal_False; 767cdf0e10cSrcweir } 768cdf0e10cSrcweir break; 769cdf0e10cSrcweir 770cdf0e10cSrcweir case '\r': 771cdf0e10cSrcweir case '\n': 772cdf0e10cSrcweir if( '>'==cBreak ) 773cdf0e10cSrcweir { 774cdf0e10cSrcweir // #26979# cr/lf in Tag wird in _GetNextToken() behandeln 775cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 776cdf0e10cSrcweir break; 777cdf0e10cSrcweir } 778cdf0e10cSrcweir else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) 779cdf0e10cSrcweir { 780cdf0e10cSrcweir bWeiter = sal_False; 781cdf0e10cSrcweir break; 782cdf0e10cSrcweir } 783cdf0e10cSrcweir // Bug 18984: CR-LF -> Blank 784cdf0e10cSrcweir // Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln 785cdf0e10cSrcweir // kein break!! 
786cdf0e10cSrcweir case '\t': 787cdf0e10cSrcweir if( '\t'==nNextCh && bReadPRE && '>'!=cBreak ) 788cdf0e10cSrcweir { 789cdf0e10cSrcweir // In <PRE>: Tabs nach oben durchreichen 790cdf0e10cSrcweir bWeiter = sal_False; 791cdf0e10cSrcweir break; 792cdf0e10cSrcweir } 793cdf0e10cSrcweir // kein break 794cdf0e10cSrcweir case '\x0b': 795cdf0e10cSrcweir if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && 796cdf0e10cSrcweir '>'!=cBreak ) 797cdf0e10cSrcweir { 798cdf0e10cSrcweir break; 799cdf0e10cSrcweir } 800cdf0e10cSrcweir nNextCh = ' '; 801cdf0e10cSrcweir // kein break; 802cdf0e10cSrcweir case ' ': 803cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 804cdf0e10cSrcweir if( '>'!=cBreak && (!bReadListing && !bReadXMP && 805cdf0e10cSrcweir !bReadPRE && !bReadTextArea) ) 806cdf0e10cSrcweir { 807cdf0e10cSrcweir // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln 808cdf0e10cSrcweir do { 809cdf0e10cSrcweir if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && 810cdf0e10cSrcweir rInput.IsEof() ) 811cdf0e10cSrcweir { 812cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() > 1L ) 813cdf0e10cSrcweir { 814cdf0e10cSrcweir // ausser den Blanks wurde noch etwas geselen 815cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 816cdf0e10cSrcweir return HTML_TEXTTOKEN; 817cdf0e10cSrcweir } 818cdf0e10cSrcweir else 819cdf0e10cSrcweir // nur Blanks gelesen: dann darf kein Text 820cdf0e10cSrcweir // mehr zurueckgegeben werden und _GetNextToken 821cdf0e10cSrcweir // muss auf EOF laufen 822cdf0e10cSrcweir return 0; 823cdf0e10cSrcweir } 824cdf0e10cSrcweir } while ( ' ' == nNextCh || '\t' == nNextCh || 825cdf0e10cSrcweir '\r' == nNextCh || '\n' == nNextCh || 826cdf0e10cSrcweir '\x0b' == nNextCh ); 827cdf0e10cSrcweir bNextCh = sal_False; 828cdf0e10cSrcweir } 829cdf0e10cSrcweir break; 830cdf0e10cSrcweir 831cdf0e10cSrcweir default: 832cdf0e10cSrcweir bEqSignFound = sal_False; 833cdf0e10cSrcweir if( (nNextCh==cBreak && !cQuote) || 834cdf0e10cSrcweir 
(sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
                bWeiter = sal_False;
            else
            {
                do {
                    // all other characters go into the text
                    sTmpBuffer.append( nNextCh );
                    if( MAX_LEN == sTmpBuffer.getLength() )
                    {
                        aToken += String(sTmpBuffer.makeStringAndClear());
                        if( (sal_uLong(aToken.Len()) + MAX_LEN) >
                                sal_uLong(STRING_MAXLEN & ~1 ) )
                        {
                            nNextCh = GetNextChar();
                            return HTML_TEXTTOKEN;
                        }
                    }
                    if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
                          rInput.IsEof() ) ||
                        !IsParserWorking() )
                    {
                        if( sTmpBuffer.getLength() )
                            aToken += String(sTmpBuffer.makeStringAndClear());
                        return HTML_TEXTTOKEN;
                    }
                } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
                bNextCh = sal_False;
            }
        }

        if( MAX_LEN == sTmpBuffer.getLength() )
            aToken += String(sTmpBuffer.makeStringAndClear());

        if( bWeiter && bNextCh )
            nNextCh = GetNextChar();
    }

    if( sTmpBuffer.getLength() )
        aToken += String(sTmpBuffer.makeStringAndClear());

    return HTML_TEXTTOKEN;
}

// Reads one piece of raw (unparsed) data into aToken. _GetNextToken() calls
// this while bReadScript, bReadStyle or a non-empty aEndToken is set.
//
// The current token ends at a line end (\r, \r\n or \n), at the end of the
// input, or in front of a '<' that starts the terminating tag. In the last
// case the stream is rewound so that nNextCh is '<' again.
//
// Returns HTML_RAWDATA by default. Returns 0 when
//  - the terminating tag was already found by the previous call
//    (bEndTokenFound), in which case the raw-data flags are reset here,
//  - the terminating tag or the end of the input is reached while nothing
//    has been collected (see the individual branches for the exact
//    conditions), or
//  - the parser is no longer working when the loop is left.
int HTMLParser::_GetNextRawToken()
{
    ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );

    if( bEndTokenFound )
    {
        // the end token was already found by the previous call,
        // so there is no need to search for it again
        bReadScript = sal_False;
        bReadStyle = sal_False;
        aEndToken.Erase();
        bEndTokenFound = sal_False;

        return 0;
    }

    // by default HTML_RAWDATA is returned
    int bWeiter = sal_True;     // "continue" flag of the scanning loop
    int nToken = HTML_RAWDATA;
    SaveState( 0 );
    while( bWeiter && IsParserWorking() )
    {
        // cleared by branches that have already put the next character to be
        // examined into nNextCh
        int bNextCh = sal_True;
        switch( nNextCh )
        {
        case '<':
            {
                // maybe the end has been reached

                // first save what has been read so far
                aToken += String(sTmpBuffer.makeStringAndClear());

                // and remember the position in the stream
                sal_uLong nStreamPos = rInput.Tell();
                sal_uLong nLineNr = GetLineNr();
                sal_uLong nLinePos = GetLinePos();

                // start of an end token?
                int bOffState = sal_False;
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = sal_True;
                    nNextCh = GetNextChar();
                }
                else if( '!' == nNextCh )
                {
                    // keep the '!' in the buffer: it becomes part of aTok,
                    // which is compared with the comment keyword below
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                // now read the letters (and '-') that follow
                while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
                       IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
                {
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                // upper-cased copy of the candidate tag name; sTmpBuffer
                // itself keeps the characters as they were read
                String aTok( sTmpBuffer.getStr(),
                             sal::static_int_cast< xub_StrLen >(
                                 sTmpBuffer.getLength()) );
                aTok.ToUpperAscii();
                sal_Bool bDone = sal_False;
                if( bReadScript || aEndToken.Len() )
                {
                    if( !bReadComment )
                    {
                        // only the first 3 characters are compared with the
                        // comment keyword
                        if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
                                == COMPARE_EQUAL )
                        {
                            bReadComment = sal_True;
                        }
                        else
                        {
                            // a script has to end with "</SCRIPT>", but for
                            // safety reasons we are not that strict about
                            // the ">" for the time being
                            bDone = bOffState && // '>'==nNextCh &&
                            COMPARE_EQUAL == ( bReadScript
                                ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
                                : aTok.CompareTo(aEndToken) );
                        }
                    }
                    if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
                        aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
                    {
                        // a comment of the kind <!-----> ends here
                        bReadComment = sal_False;
                    }
                }
                else
                {
                    // a style sheet can end with </STYLE>, </HEAD> or
                    // <BODY>
                    if( bOffState )
                        bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
                                    == COMPARE_EQUAL ||
                                aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
                                    == COMPARE_EQUAL;
                    else
                        bDone =
                            aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
                }

                if( bDone )
                {
                    // that's it; now the string read so far has to be
                    // returned if there is one, and afterwards parsing
                    // continues normally

                    bWeiter = sal_False;

                    // nToken==0 means that _GetNextToken continues reading
                    // immediately
                    if( !aToken.Len() && (bReadStyle || bReadScript) )
                    {
                        // the environment can be ended right away and
                        // the end token can be parsed
                        bReadScript = sal_False;
                        bReadStyle = sal_False;
                        aEndToken.Erase();
                        nToken = 0;
                    }
                    else
                    {
                        // bReadScript/bReadStyle have to be kept alive
                        // and the end token can only be parsed on the
                        // next call
                        bEndTokenFound = sal_True;
                    }

                    // now move back in the stream to the '<'
                    rInput.Seek( nStreamPos );
                    SetLineNr( nLineNr );
                    SetLinePos( nLinePos );
                    ClearTxtConvContext();
                    nNextCh = '<';

                    // the tag name must not be appended to the token
                    sTmpBuffer.setLength( 0L );
                }
                else
                {
                    // remember "</", everything else is still in the buffer
                    aToken += (sal_Unicode)'<';
                    if( bOffState )
                        aToken += (sal_Unicode)'/';

                    // nNextCh already holds the first character after the
                    // scanned name, so it must not be skipped
                    bNextCh = sal_False;
                }
            }
            break;
        case '-':
            sTmpBuffer.append( nNextCh );
            if( bReadComment )
            {
                // inside a comment: a run of at least two '-' directly
                // followed by '>' ends the comment
                sal_Bool bTwoMinus = sal_False;
                nNextCh = GetNextChar();
                while( '-' == nNextCh && IsParserWorking() )
                {
                    bTwoMinus = sal_True;

                    if( MAX_LEN == sTmpBuffer.getLength() )
                        aToken += String(sTmpBuffer.makeStringAndClear());
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
                    bReadComment = sal_False;

                // the character after the run has already been read
                bNextCh = sal_False;
            }
            break;

        case '\r':
            // \r\n? terminates the current text token (even if it is empty)
            nNextCh = GetNextChar();
            if( nNextCh=='\n' )
                nNextCh = GetNextChar();
            bWeiter = sal_False;
            break;
        case '\n':
            // \n terminates the current text token (even if it is empty)
            nNextCh = GetNextChar();
            bWeiter = sal_False;
            break;
        case sal_Unicode(EOF):
            // eof terminates the current text token and behaves as if
            // an end token had been read
            if( rInput.IsEof() )
            {
                bWeiter = sal_False;
                if( aToken.Len() || sTmpBuffer.getLength() )
                {
                    bEndTokenFound = sal_True;
                }
                else
                {
                    bReadScript = sal_False;
                    bReadStyle = sal_False;
                    aEndToken.Erase();
                    nToken = 0;
                }
                break;
            }
            // no break: the stream is not at its end, so the character is
            // treated like any other character
        default:
            // all other characters end up in the buffer
            sTmpBuffer.append( nNextCh );
            break;
        }

        // flush the buffer into aToken when the token is finished or the
        // buffer has reached MAX_LEN characters
        if( (!bWeiter && sTmpBuffer.getLength() > 0L) ||
            MAX_LEN == sTmpBuffer.getLength() )
            aToken += String(sTmpBuffer.makeStringAndClear());

        if( bWeiter && bNextCh )
            nNextCh = GetNextChar();
    }

    if( IsParserWorking() )
        SaveState( 0 );
    else
        nToken = 0;

    return nToken;
}

// Scans the next token.
//
// Clears sSaveToken and the cached options, then reads from nNextCh onwards:
//  - while bReadScript, bReadStyle or a non-empty aEndToken is set, the work
//    is delegated to _GetNextRawToken() first; its result is returned unless
//    it is 0 while the parser is still working,
//  - '<' starts a tag, a comment, a "<% ... %>" section, or is returned as
//    text if what follows is none of these,
//  - everything else is either a special character token (form feed, line
//    end, tab) or text read by ScanText().
//
// Returns the token id; 0 if the parser is not working; -1 if the parser
// state is SVPAR_PENDING when the function is left. At the end of the input
// eState is set to SVPAR_ACCEPTED and the EOF character value is returned.
int __EXPORT HTMLParser::_GetNextToken()
{
    int nRet = 0;
    sSaveToken.Erase();

    // delete the options
    if( pOptions->Count() )
        pOptions->DeleteAndDestroy( 0, pOptions->Count() );

    if( !IsParserWorking() )        // if there is already an error, do not continue!
        return 0;

    // remembered so that the flag can be restored if the state turns to
    // SVPAR_PENDING in the middle of a tag
    sal_Bool bReadNextCharSave = bReadNextChar;
    if( bReadNextChar )
    {
        DBG_ASSERT( !bEndTokenFound,
                    "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
        nNextCh = GetNextChar();
        if( !IsParserWorking() )    // if there is already an error, do not continue!
            return 0;
        bReadNextChar = sal_False;
    }

    if( bReadScript || bReadStyle || aEndToken.Len() )
    {
        nRet = _GetNextRawToken();
        if( nRet || !IsParserWorking() )
            return nRet;
    }

    do {
        // whether the next character has to be read after the switch
        int bNextCh = sal_True;
        switch( nNextCh )
        {
        case '<':
            {
                // position right behind the '<', used to rewind if the tag
                // is never closed
                sal_uLong nStreamPos = rInput.Tell();
                sal_uLong nLineNr = GetLineNr();
                sal_uLong nLinePos = GetLinePos();

                int bOffState = sal_False;
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = sal_True;
                    nNextCh = GetNextChar();
                }
                if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
                {
                    // read the tag name up to '>' or white space
                    ::rtl::OUStringBuffer sTmpBuffer;
                    do {
                        sTmpBuffer.append( nNextCh );
                        if( MAX_LEN == sTmpBuffer.getLength() )
                            aToken += String(sTmpBuffer.makeStringAndClear());
                        nNextCh = GetNextChar();
                    } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
                             IsParserWorking() && !rInput.IsEof() );

                    if( sTmpBuffer.getLength() )
                        aToken += String(sTmpBuffer.makeStringAndClear());

                    // skip blanks
                    while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
                        nNextCh = GetNextChar();

                    if( !IsParserWorking() )
                    {
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        break;  // leaves the switch
                    }

                    // look up the token in the table:
                    sSaveToken = aToken;
                    aToken.ToUpperAscii();
                    if( 0 == (nRet = GetHTMLToken( aToken )) )
                        // Unknown Control
                        nRet = HTML_UNKNOWNCONTROL_ON;

                    // if it is a token for switching something off ...
                    if( bOffState )
                    {
                         if( HTML_TOKEN_ONOFF & nRet )
                         {
                            // and there is an off token, turn it into that
                            ++nRet;
                         }
                         else if( HTML_LINEBREAK!=nRet )
                         {
                            // and there is no off token, turn it into an
                            // unknown token (except </BR>, which is
                            // treated like <BR>)
                            nRet = HTML_UNKNOWNCONTROL_OFF;
                         }
                    }

                    if( nRet == HTML_COMMENT )
                    {
                        // fix: because of upper/lower case, use sSaveToken
                        // as the beginning of the comment and append a
                        // space.
                        aToken = sSaveToken;
                        if( '>'!=nNextCh )
                            aToken += (sal_Unicode)' ';
                        // state at the first '>' seen inside the comment
                        sal_uLong nCStreamPos = 0;
                        sal_uLong nCLineNr = 0;
                        sal_uLong nCLinePos = 0;
                        xub_StrLen nCStrLen = 0;

                        sal_Bool bDone = sal_False;
                        // read up to the closing -->. If none is found,
                        // resume at the first >
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            if( '>'==nNextCh )
                            {
                                if( !nCStreamPos )
                                {
                                    nCStreamPos = rInput.Tell();
                                    nCStrLen = aToken.Len();
                                    nCLineNr = GetLineNr();
                                    nCLinePos = GetLinePos();
                                }
                                // the comment is finished if the text
                                // collected so far ends with "--"
                                bDone = aToken.Len() >= 2 &&
                                        aToken.Copy(aToken.Len()-2,2).
                                                        EqualsAscii( "--" );
                                if( !bDone )
                                aToken += nNextCh;
                            }
                            else
                                aToken += nNextCh;
                            if( !bDone )
                                nNextCh = GetNextChar();
                        }
                        if( !bDone && IsParserWorking() && nCStreamPos )
                        {
                            // no "-->" found: go back to the first '>' and
                            // cut the token to what had been read up to there
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.Erase( nCStrLen );
                            nNextCh = '>';
                        }
                    }
                    else
                    {
                        // the token string can be discarded now
                        aToken.Erase();
                    }

                    // now read everything up to the closing '>'
                    if( '>' != nNextCh && IsParserWorking() )
                    {
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // go back to the position behind the < and
                            // resume there, return the < as text
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;  // leaves the switch
                        }
                    }
                    if( SVPAR_PENDING == eState )
                        bReadNextChar = bReadNextCharSave;
                }
                else
                {
                    if( bOffState )
                    {
                        // simply throw everything away
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // go back to the position behind the < and
                            // resume there, return the < as text
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;  // leaves the switch
                        }
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        aToken.Erase();
                    }
                    else if( '%' == nNextCh )
                    {
                        nRet = HTML_UNKNOWNCONTROL_ON;

                        sal_uLong nCStreamPos = rInput.Tell();
                        sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();

                        sal_Bool bDone = sal_False;
                        // read up to the closing %>. If none is found,
                        // resume at the first >
                        // NOTE(review): the rewind below goes to the position
                        // saved right behind the '%', not to a '>' -- confirm
                        // which of the two is intended
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            bDone = '>'==nNextCh && aToken.Len() >= 1 &&
                                    '%' == aToken.GetChar( aToken.Len()-1 );
                            if( !bDone )
                            {
                                aToken += nNextCh;
                                nNextCh = GetNextChar();
                            }
                        }
                        if( !bDone && IsParserWorking() )
                        {
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.AssignAscii( "<%", 2 );
                            nRet = HTML_TEXTTOKEN;
                            break;  // leaves the switch
                        }
                        if( IsParserWorking() )
                        {
                            sSaveToken = aToken;
                            aToken.Erase();
                        }
                    }
                    else
                    {
                        // neither a tag nor "<%": return the '<' as text
                        aToken = '<';
                        nRet = HTML_TEXTTOKEN;
                        bNextCh = sal_False;
                        break;  // leaves the switch
                    }
                }

                if( IsParserWorking() )
                {
                    // only a closing '>' has to be skipped afterwards
                    bNextCh = '>' == nNextCh;
                    // keep track of the environments that change how the
                    // following input is read
                    switch( nRet )
                    {
                    case HTML_TEXTAREA_ON:
                        bReadTextArea = sal_True;
                        break;
                    case HTML_TEXTAREA_OFF:
                        bReadTextArea = sal_False;
                        break;
                    case HTML_SCRIPT_ON:
                        if( !bReadTextArea )
                            bReadScript = sal_True;
                        break;
                    case HTML_SCRIPT_OFF:
                        if( !bReadTextArea )
                        {
                            bReadScript = sal_False;
                            // JavaScript can modify the stream,
                            // so the last character has to be read
                            // once more
                            bReadNextChar = sal_True;
                            bNextCh = sal_False;
                        }
                        break;

                    case HTML_STYLE_ON:
                        bReadStyle = sal_True;
                        break;
                    case HTML_STYLE_OFF:
                        bReadStyle = sal_False;
                        break;
                    }

                }
            }
            break;

        case sal_Unicode(EOF):
            if( rInput.IsEof() )
            {
                eState = SVPAR_ACCEPTED;
                nRet = nNextCh;
            }
            else
            {
                // read normal text
                goto scan_text;
            }
            break;

        case '\f':
            // form feeds are now passed upwards separately
            nRet = HTML_LINEFEEDCHAR; // !!! should actually be FORMFEEDCHAR
            break;

        case '\n':
        case '\r':
            if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
            {
                // a "\r\n" or "\n\r" pair counts as one line end; any other
                // following character is kept as the next character
                sal_Unicode c = GetNextChar();
                if( ( '\n' != nNextCh || '\r' != c ) &&
                    ( '\r' != nNextCh || '\n' != c ) )
                {
                    bNextCh = sal_False;
                    nNextCh = c;
                }
                nRet = HTML_NEWPARA;
                break;
            }
            // no break !
        case '\t':
            if( bReadPRE )
            {
                nRet = HTML_TABCHAR;
                break;
            }
            // no break !
        case ' ':
            // no break !
        default:

scan_text:
            // "normal" text follows
            nRet = ScanText();
            bNextCh = 0 == aToken.Len();

            // the text should still be processed
            if( !bNextCh && eState == SVPAR_PENDING )
            {
                eState = SVPAR_WORKING;
                bReadNextChar = sal_True;
            }

            break;
        }

        if( bNextCh && SVPAR_WORKING == eState )
        {
            nNextCh = GetNextChar();
            // a complete non-text token has been recognised: hand it out
            // now and read the next character on the next call
            if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
            {
                bReadNextChar = sal_True;
                eState = SVPAR_WORKING;
            }
        }

    } while( !nRet && SVPAR_WORKING == eState );

    if( SVPAR_PENDING == eState )
        nRet = -1;      // something invalid

    return nRet;
}

// Removes escaping backslashes from aToken in place: a backslash that is not
// itself escaped is erased, and the character that follows it is kept
// unchanged (so an escaped backslash yields a single backslash).
void HTMLParser::UnescapeToken()
{
    xub_StrLen nPos=0;

    sal_Bool bEscape = sal_False;
    while( nPos < aToken.Len() )
    {
        sal_Bool bOldEscape = bEscape;
        bEscape = sal_False;
        if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
        {
aToken.Erase( nPos, 1 );    // drop the backslash; nPos now refers to the escaped character
            bEscape = sal_True;
        }
        else
        {
            nPos++;
        }
    }
}

// Gets the options.
//
// Splits aToken into "name" / "name=value" pairs and stores one HTMLOption
// per pair in pOptions. If pOptions is not empty, it is returned unchanged.
//
// pNoConvertToken may be 0. If it is given, \r and \n are not stripped from
// the quoted value of the option whose id equals *pNoConvertToken. They are
// never stripped for ids in [HTML_OPTION_SCRIPT_START,
// HTML_OPTION_SCRIPT_END).
//
// Although the method is const, escaping backslashes (and stripped \r / \n)
// are erased from aToken itself while the values are scanned.
//
// Returns pOptions.
const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
{
    // if the options for the current token have already been fetched
    // once, they are returned again
    if( pOptions->Count() )
        return pOptions;

    xub_StrLen nPos = 0;
    while( nPos < aToken.Len() )
    {
        // a letter? Then an option starts here
        if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
        {
            int nToken;
            String aValue;
            xub_StrLen nStt = nPos;
            sal_Unicode cChar = 0;

            // Strictly speaking only very specific characters are allowed
            // here. But Netscape only pays attention to "=" and spaces (see
            // Mozilla: PA_FetchRequestedNameValues in
            // lipparse/pa_mdl.c
            //          while( nPos < aToken.Len() &&
            //                  ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
            while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
                   HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
                nPos++;

            String sName( aToken.Copy( nStt, nPos-nStt ) );

//JP 23.03.97: the plug-ins want the token names in their "original" form,
//              so convert to upper case only for the lookup
            String sNameUpperCase( sName );
            sNameUpperCase.ToUpperAscii();

            nToken = GetHTMLOption( sNameUpperCase );  // the name is complete
            DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
                        "GetOption: unbekannte HTML-Option" );
            sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
                               nToken >= HTML_OPTION_SCRIPT_END) &&
                              (!pNoConvertToken || nToken != *pNoConvertToken);

            // skip white space and non-printable characters after the name;
            // cChar ends up holding the last character examined
            while( nPos < aToken.Len() &&
                   ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
                     HTML_ISSPACE(cChar) ) )
                nPos++;

            // does the option have a value as well?
            if( nPos!=aToken.Len() && '='==cChar )
            {
                nPos++;

                // skip white space and non-printable characters after '='
                while( nPos < aToken.Len() &&
                        ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
                          ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
                    nPos++;

                if( nPos != aToken.Len() )
                {
                    // nStt / nLen delimit the value inside aToken
                    xub_StrLen nLen = 0;
                    nStt = nPos;
                    if( ('"'==cChar) || ('\'')==cChar )
                    {
                        // quoted value: read up to the matching, unescaped
                        // quote character
                        sal_Unicode cEnd = cChar;
                        nPos++; nStt++;
                        sal_Bool bDone = sal_False;
                        sal_Bool bEscape = sal_False;
                        while( nPos < aToken.Len() && !bDone )
                        {
                            sal_Bool bOldEscape = bEscape;
                            bEscape = sal_False;
                            cChar = aToken.GetChar(nPos);
                            switch( cChar )
                            {
                            case '\r':
                            case '\n':
                                if( bStripCRLF )
                                    ((String &)aToken).Erase( nPos, 1 );
                                else
                                    nPos++, nLen++;
                                break;
                            case '\\':
                                if( bOldEscape )
                                {
                                    // escaped backslash: keep it
                                    nPos++, nLen++;
                                }
                                else
                                {
                                    // escaping backslash: remove it and
                                    // remember it for the next character
                                    ((String &)aToken).Erase( nPos, 1 );
                                    bEscape = sal_True;
                                }
                                break;
                            case '"':
                            case '\'':
                                bDone = !bOldEscape && cChar==cEnd;
                                if( !bDone )
                                    nPos++, nLen++;
                                break;
                            default:
                                nPos++, nLen++;
                                break;
                            }
                        }
                        // step over the closing quote
                        if( nPos!=aToken.Len() )
                            nPos++;
                    }
                    else
                    {
                        // here we are somewhat more lax than the
                        // standard and allow everything printable
                        sal_Bool bEscape = sal_False;
                        sal_Bool bDone = sal_False;
                        while( nPos < aToken.Len() && !bDone )
                        {
                            sal_Bool bOldEscape = bEscape;
                            bEscape = sal_False;
                            sal_Unicode c = aToken.GetChar(nPos);
                            switch( c )
                            {
                            case ' ':
                                // only an unescaped space ends the value
                                bDone = !bOldEscape;
                                if( !bDone )
                                    nPos++, nLen++;
                                break;

                            case '\t':
                            case '\r':
                            case '\n':
                                bDone = sal_True;
                                break;

                            case '\\':
                                if( bOldEscape )
                                {
                                    nPos++, nLen++;
                                }
                                else
                                {
                                    ((String &)aToken).Erase( nPos, 1 );
                                    bEscape = sal_True;
                                }
                                break;

                            default:
                                if( HTML_ISPRINTABLE( c ) )
                                    nPos++, nLen++;
                                else
                                    bDone = sal_True;
                                break;
                            }
                        }
                    }

                    if( nLen )
                        aValue = aToken.Copy( nStt, nLen );
                }
            }

            // the token is known and can be stored
            HTMLOption *pOption =
                new HTMLOption(
                    sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );

            pOptions->Insert( pOption, pOptions->Count() );

        }
        else
            // ignore white space and unexpected characters
            nPos++;
    }

    return pOptions;
}

int HTMLParser::FilterPRE( int nToken )
{
    switch( nToken )
    {
#ifdef HTML_BEHAVIOUR
    // by definition these become LFs
    case HTML_PARABREAK_ON:
    case HTML_LINEBREAK:
        nToken = HTML_NEWPARA;
#else
    // in Netscape, however, they only take effect in non-empty paragraphs
    case HTML_PARABREAK_ON:
        nToken = HTML_LINEBREAK;
    case HTML_LINEBREAK:
#endif
    case HTML_NEWPARA:
        nPre_LinePos = 0;
        if( bPre_IgnoreNewPara )
            nToken = 0;
        break;

    case HTML_TABCHAR:
        {
            xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >(
                8 - (nPre_LinePos % 8));
            DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?"
); 1686cdf0e10cSrcweir aToken.Expand( nSpaces, ' ' ); 1687cdf0e10cSrcweir nPre_LinePos += nSpaces; 1688cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1689cdf0e10cSrcweir } 1690cdf0e10cSrcweir break; 1691cdf0e10cSrcweir // diese bleiben erhalten 1692cdf0e10cSrcweir case HTML_TEXTTOKEN: 1693cdf0e10cSrcweir nPre_LinePos += aToken.Len(); 1694cdf0e10cSrcweir break; 1695cdf0e10cSrcweir 1696cdf0e10cSrcweir case HTML_SELECT_ON: 1697cdf0e10cSrcweir case HTML_SELECT_OFF: 1698cdf0e10cSrcweir case HTML_BODY_ON: 1699cdf0e10cSrcweir case HTML_FORM_ON: 1700cdf0e10cSrcweir case HTML_FORM_OFF: 1701cdf0e10cSrcweir case HTML_INPUT: 1702cdf0e10cSrcweir case HTML_OPTION: 1703cdf0e10cSrcweir case HTML_TEXTAREA_ON: 1704cdf0e10cSrcweir case HTML_TEXTAREA_OFF: 1705cdf0e10cSrcweir 1706cdf0e10cSrcweir case HTML_IMAGE: 1707cdf0e10cSrcweir case HTML_APPLET_ON: 1708cdf0e10cSrcweir case HTML_APPLET_OFF: 1709cdf0e10cSrcweir case HTML_PARAM: 1710cdf0e10cSrcweir case HTML_EMBED: 1711cdf0e10cSrcweir 1712cdf0e10cSrcweir case HTML_HEAD1_ON: 1713cdf0e10cSrcweir case HTML_HEAD1_OFF: 1714cdf0e10cSrcweir case HTML_HEAD2_ON: 1715cdf0e10cSrcweir case HTML_HEAD2_OFF: 1716cdf0e10cSrcweir case HTML_HEAD3_ON: 1717cdf0e10cSrcweir case HTML_HEAD3_OFF: 1718cdf0e10cSrcweir case HTML_HEAD4_ON: 1719cdf0e10cSrcweir case HTML_HEAD4_OFF: 1720cdf0e10cSrcweir case HTML_HEAD5_ON: 1721cdf0e10cSrcweir case HTML_HEAD5_OFF: 1722cdf0e10cSrcweir case HTML_HEAD6_ON: 1723cdf0e10cSrcweir case HTML_HEAD6_OFF: 1724cdf0e10cSrcweir case HTML_BLOCKQUOTE_ON: 1725cdf0e10cSrcweir case HTML_BLOCKQUOTE_OFF: 1726cdf0e10cSrcweir case HTML_ADDRESS_ON: 1727cdf0e10cSrcweir case HTML_ADDRESS_OFF: 1728cdf0e10cSrcweir case HTML_HORZRULE: 1729cdf0e10cSrcweir 1730cdf0e10cSrcweir case HTML_CENTER_ON: 1731cdf0e10cSrcweir case HTML_CENTER_OFF: 1732cdf0e10cSrcweir case HTML_DIVISION_ON: 1733cdf0e10cSrcweir case HTML_DIVISION_OFF: 1734cdf0e10cSrcweir 1735cdf0e10cSrcweir case HTML_SCRIPT_ON: 1736cdf0e10cSrcweir case HTML_SCRIPT_OFF: 1737cdf0e10cSrcweir case 
HTML_RAWDATA: 1738cdf0e10cSrcweir 1739cdf0e10cSrcweir case HTML_TABLE_ON: 1740cdf0e10cSrcweir case HTML_TABLE_OFF: 1741cdf0e10cSrcweir case HTML_CAPTION_ON: 1742cdf0e10cSrcweir case HTML_CAPTION_OFF: 1743cdf0e10cSrcweir case HTML_COLGROUP_ON: 1744cdf0e10cSrcweir case HTML_COLGROUP_OFF: 1745cdf0e10cSrcweir case HTML_COL_ON: 1746cdf0e10cSrcweir case HTML_COL_OFF: 1747cdf0e10cSrcweir case HTML_THEAD_ON: 1748cdf0e10cSrcweir case HTML_THEAD_OFF: 1749cdf0e10cSrcweir case HTML_TFOOT_ON: 1750cdf0e10cSrcweir case HTML_TFOOT_OFF: 1751cdf0e10cSrcweir case HTML_TBODY_ON: 1752cdf0e10cSrcweir case HTML_TBODY_OFF: 1753cdf0e10cSrcweir case HTML_TABLEROW_ON: 1754cdf0e10cSrcweir case HTML_TABLEROW_OFF: 1755cdf0e10cSrcweir case HTML_TABLEDATA_ON: 1756cdf0e10cSrcweir case HTML_TABLEDATA_OFF: 1757cdf0e10cSrcweir case HTML_TABLEHEADER_ON: 1758cdf0e10cSrcweir case HTML_TABLEHEADER_OFF: 1759cdf0e10cSrcweir 1760cdf0e10cSrcweir case HTML_ANCHOR_ON: 1761cdf0e10cSrcweir case HTML_ANCHOR_OFF: 1762cdf0e10cSrcweir case HTML_BOLD_ON: 1763cdf0e10cSrcweir case HTML_BOLD_OFF: 1764cdf0e10cSrcweir case HTML_ITALIC_ON: 1765cdf0e10cSrcweir case HTML_ITALIC_OFF: 1766cdf0e10cSrcweir case HTML_STRIKE_ON: 1767cdf0e10cSrcweir case HTML_STRIKE_OFF: 1768cdf0e10cSrcweir case HTML_STRIKETHROUGH_ON: 1769cdf0e10cSrcweir case HTML_STRIKETHROUGH_OFF: 1770cdf0e10cSrcweir case HTML_UNDERLINE_ON: 1771cdf0e10cSrcweir case HTML_UNDERLINE_OFF: 1772cdf0e10cSrcweir case HTML_BASEFONT_ON: 1773cdf0e10cSrcweir case HTML_BASEFONT_OFF: 1774cdf0e10cSrcweir case HTML_FONT_ON: 1775cdf0e10cSrcweir case HTML_FONT_OFF: 1776cdf0e10cSrcweir case HTML_BLINK_ON: 1777cdf0e10cSrcweir case HTML_BLINK_OFF: 1778cdf0e10cSrcweir case HTML_SPAN_ON: 1779cdf0e10cSrcweir case HTML_SPAN_OFF: 1780cdf0e10cSrcweir case HTML_SUBSCRIPT_ON: 1781cdf0e10cSrcweir case HTML_SUBSCRIPT_OFF: 1782cdf0e10cSrcweir case HTML_SUPERSCRIPT_ON: 1783cdf0e10cSrcweir case HTML_SUPERSCRIPT_OFF: 1784cdf0e10cSrcweir case HTML_BIGPRINT_ON: 1785cdf0e10cSrcweir case 
HTML_BIGPRINT_OFF: 1786cdf0e10cSrcweir case HTML_SMALLPRINT_OFF: 1787cdf0e10cSrcweir case HTML_SMALLPRINT_ON: 1788cdf0e10cSrcweir 1789cdf0e10cSrcweir case HTML_EMPHASIS_ON: 1790cdf0e10cSrcweir case HTML_EMPHASIS_OFF: 1791cdf0e10cSrcweir case HTML_CITIATION_ON: 1792cdf0e10cSrcweir case HTML_CITIATION_OFF: 1793cdf0e10cSrcweir case HTML_STRONG_ON: 1794cdf0e10cSrcweir case HTML_STRONG_OFF: 1795cdf0e10cSrcweir case HTML_CODE_ON: 1796cdf0e10cSrcweir case HTML_CODE_OFF: 1797cdf0e10cSrcweir case HTML_SAMPLE_ON: 1798cdf0e10cSrcweir case HTML_SAMPLE_OFF: 1799cdf0e10cSrcweir case HTML_KEYBOARD_ON: 1800cdf0e10cSrcweir case HTML_KEYBOARD_OFF: 1801cdf0e10cSrcweir case HTML_VARIABLE_ON: 1802cdf0e10cSrcweir case HTML_VARIABLE_OFF: 1803cdf0e10cSrcweir case HTML_DEFINSTANCE_ON: 1804cdf0e10cSrcweir case HTML_DEFINSTANCE_OFF: 1805cdf0e10cSrcweir case HTML_SHORTQUOTE_ON: 1806cdf0e10cSrcweir case HTML_SHORTQUOTE_OFF: 1807cdf0e10cSrcweir case HTML_LANGUAGE_ON: 1808cdf0e10cSrcweir case HTML_LANGUAGE_OFF: 1809cdf0e10cSrcweir case HTML_AUTHOR_ON: 1810cdf0e10cSrcweir case HTML_AUTHOR_OFF: 1811cdf0e10cSrcweir case HTML_PERSON_ON: 1812cdf0e10cSrcweir case HTML_PERSON_OFF: 1813cdf0e10cSrcweir case HTML_ACRONYM_ON: 1814cdf0e10cSrcweir case HTML_ACRONYM_OFF: 1815cdf0e10cSrcweir case HTML_ABBREVIATION_ON: 1816cdf0e10cSrcweir case HTML_ABBREVIATION_OFF: 1817cdf0e10cSrcweir case HTML_INSERTEDTEXT_ON: 1818cdf0e10cSrcweir case HTML_INSERTEDTEXT_OFF: 1819cdf0e10cSrcweir case HTML_DELETEDTEXT_ON: 1820cdf0e10cSrcweir case HTML_DELETEDTEXT_OFF: 1821cdf0e10cSrcweir case HTML_TELETYPE_ON: 1822cdf0e10cSrcweir case HTML_TELETYPE_OFF: 1823cdf0e10cSrcweir 1824cdf0e10cSrcweir break; 1825cdf0e10cSrcweir 1826cdf0e10cSrcweir // der Rest wird als unbekanntes Token behandelt 1827cdf0e10cSrcweir default: 1828cdf0e10cSrcweir if( nToken ) 1829cdf0e10cSrcweir { 1830cdf0e10cSrcweir nToken = 1831cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1832cdf0e10cSrcweir ? 
HTML_UNKNOWNCONTROL_OFF 1833cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1834cdf0e10cSrcweir } 1835cdf0e10cSrcweir break; 1836cdf0e10cSrcweir } 1837cdf0e10cSrcweir 1838cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1839cdf0e10cSrcweir 1840cdf0e10cSrcweir return nToken; 1841cdf0e10cSrcweir } 1842cdf0e10cSrcweir 1843cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken ) 1844cdf0e10cSrcweir { 1845cdf0e10cSrcweir switch( nToken ) 1846cdf0e10cSrcweir { 1847cdf0e10cSrcweir case HTML_NEWPARA: 1848cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1849cdf0e10cSrcweir nToken = 0; 1850cdf0e10cSrcweir case HTML_TEXTTOKEN: 1851cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1852cdf0e10cSrcweir case HTML_SOFTHYPH: 1853cdf0e10cSrcweir break; // bleiben erhalten 1854cdf0e10cSrcweir 1855cdf0e10cSrcweir default: 1856cdf0e10cSrcweir if( nToken ) 1857cdf0e10cSrcweir { 1858cdf0e10cSrcweir if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) ) 1859cdf0e10cSrcweir { 1860cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1861cdf0e10cSrcweir sSaveToken.Insert( '/', 1 ); 1862cdf0e10cSrcweir } 1863cdf0e10cSrcweir else 1864cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1865cdf0e10cSrcweir if( aToken.Len() ) 1866cdf0e10cSrcweir { 1867cdf0e10cSrcweir UnescapeToken(); 1868cdf0e10cSrcweir sSaveToken += (sal_Unicode)' '; 1869cdf0e10cSrcweir aToken.Insert( sSaveToken, 0 ); 1870cdf0e10cSrcweir } 1871cdf0e10cSrcweir else 1872cdf0e10cSrcweir aToken = sSaveToken; 1873cdf0e10cSrcweir aToken += (sal_Unicode)'>'; 1874cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1875cdf0e10cSrcweir } 1876cdf0e10cSrcweir break; 1877cdf0e10cSrcweir } 1878cdf0e10cSrcweir 1879cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1880cdf0e10cSrcweir 1881cdf0e10cSrcweir return nToken; 1882cdf0e10cSrcweir } 1883cdf0e10cSrcweir 1884cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken ) 1885cdf0e10cSrcweir { 1886cdf0e10cSrcweir switch( nToken ) 1887cdf0e10cSrcweir { 1888cdf0e10cSrcweir case HTML_NEWPARA: 1889cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 
1890cdf0e10cSrcweir nToken = 0; 1891cdf0e10cSrcweir case HTML_TEXTTOKEN: 1892cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1893cdf0e10cSrcweir case HTML_SOFTHYPH: 1894cdf0e10cSrcweir break; // bleiben erhalten 1895cdf0e10cSrcweir 1896cdf0e10cSrcweir default: 1897cdf0e10cSrcweir if( nToken ) 1898cdf0e10cSrcweir { 1899cdf0e10cSrcweir nToken = 1900cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1901cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF 1902cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1903cdf0e10cSrcweir } 1904cdf0e10cSrcweir break; 1905cdf0e10cSrcweir } 1906cdf0e10cSrcweir 1907cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1908cdf0e10cSrcweir 1909cdf0e10cSrcweir return nToken; 1910cdf0e10cSrcweir } 1911cdf0e10cSrcweir 1912cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader, 1913cdf0e10cSrcweir sal_Bool bSwitchToUCS2, 1914cdf0e10cSrcweir rtl_TextEncoding eEnc ) 1915cdf0e10cSrcweir { 1916cdf0e10cSrcweir // Einer der folgenden regulaeren Ausdrucke muss sich auf den String 1917cdf0e10cSrcweir // anwenden lassen, damit das Dok ein HTML-Dokument ist. 1918cdf0e10cSrcweir // 1919cdf0e10cSrcweir // ^[^<]*<[^ \t]*[> \t] 1920cdf0e10cSrcweir // ------- 1921cdf0e10cSrcweir // ^<! 
1922cdf0e10cSrcweir // 1923cdf0e10cSrcweir // wobei der unterstrichene Teilausdruck einem HTML-Token 1924cdf0e10cSrcweir // ensprechen muss 1925cdf0e10cSrcweir 1926cdf0e10cSrcweir ByteString sCmp; 1927cdf0e10cSrcweir sal_Bool bUCS2B = sal_False; 1928cdf0e10cSrcweir if( bSwitchToUCS2 ) 1929cdf0e10cSrcweir { 1930cdf0e10cSrcweir if( 0xfeU == (sal_uChar)pHeader[0] && 1931cdf0e10cSrcweir 0xffU == (sal_uChar)pHeader[1] ) 1932cdf0e10cSrcweir { 1933cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1934cdf0e10cSrcweir bUCS2B = sal_True; 1935cdf0e10cSrcweir } 1936cdf0e10cSrcweir else if( 0xffU == (sal_uChar)pHeader[0] && 1937cdf0e10cSrcweir 0xfeU == (sal_uChar)pHeader[1] ) 1938cdf0e10cSrcweir { 1939cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1940cdf0e10cSrcweir } 1941cdf0e10cSrcweir } 1942cdf0e10cSrcweir if 1943cdf0e10cSrcweir ( 1944cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 == eEnc && 1945cdf0e10cSrcweir ( 1946cdf0e10cSrcweir (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) || 1947cdf0e10cSrcweir (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1]) 1948cdf0e10cSrcweir ) 1949cdf0e10cSrcweir ) 1950cdf0e10cSrcweir { 1951cdf0e10cSrcweir if( 0xfe == (sal_uChar)pHeader[0] ) 1952cdf0e10cSrcweir bUCS2B = sal_True; 1953cdf0e10cSrcweir 1954cdf0e10cSrcweir xub_StrLen nLen; 1955cdf0e10cSrcweir for( nLen = 2; 1956cdf0e10cSrcweir pHeader[nLen] != 0 || pHeader[nLen+1] != 0; 1957cdf0e10cSrcweir nLen+=2 ) 1958cdf0e10cSrcweir ; 1959cdf0e10cSrcweir 1960cdf0e10cSrcweir ::rtl::OStringBuffer sTmp( (nLen - 2)/2 ); 1961cdf0e10cSrcweir for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 ) 1962cdf0e10cSrcweir { 1963cdf0e10cSrcweir sal_Unicode cUC; 1964cdf0e10cSrcweir if( bUCS2B ) 1965cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1]; 1966cdf0e10cSrcweir else 1967cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos]; 1968cdf0e10cSrcweir if( 0U == cUC ) 1969cdf0e10cSrcweir break; 1970cdf0e10cSrcweir 1971cdf0e10cSrcweir sTmp.append( cUC < 
256U ? (sal_Char)cUC : '.' ); 1972cdf0e10cSrcweir } 1973cdf0e10cSrcweir sCmp = ByteString( sTmp.makeStringAndClear() ); 1974cdf0e10cSrcweir } 1975cdf0e10cSrcweir else 1976cdf0e10cSrcweir { 1977cdf0e10cSrcweir sCmp = (sal_Char *)pHeader; 1978cdf0e10cSrcweir } 1979cdf0e10cSrcweir 1980cdf0e10cSrcweir sCmp.ToUpperAscii(); 1981cdf0e10cSrcweir 1982cdf0e10cSrcweir // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen 1983cdf0e10cSrcweir xub_StrLen nStart = sCmp.Search( '<' ); 1984cdf0e10cSrcweir if( STRING_NOTFOUND == nStart ) 1985cdf0e10cSrcweir return sal_False; 1986cdf0e10cSrcweir nStart++; 1987cdf0e10cSrcweir 1988cdf0e10cSrcweir // danach duerfen beliebige andere Zeichen bis zu einem blank oder 1989cdf0e10cSrcweir // '>' kommen 1990cdf0e10cSrcweir sal_Char c; 1991cdf0e10cSrcweir xub_StrLen nPos; 1992cdf0e10cSrcweir for( nPos = nStart; nPos<sCmp.Len(); nPos++ ) 1993cdf0e10cSrcweir { 1994cdf0e10cSrcweir if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) ) 1995cdf0e10cSrcweir break; 1996cdf0e10cSrcweir } 1997cdf0e10cSrcweir 1998cdf0e10cSrcweir // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML 1999cdf0e10cSrcweir if( nPos==nStart ) 2000cdf0e10cSrcweir return sal_False; 2001cdf0e10cSrcweir 2002cdf0e10cSrcweir // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes 2003cdf0e10cSrcweir // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht 2004cdf0e10cSrcweir // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML 2005cdf0e10cSrcweir // interpretiert. 2006cdf0e10cSrcweir String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US ); 2007cdf0e10cSrcweir int nTok = GetHTMLToken( sTest ); 2008cdf0e10cSrcweir if( 0 != nTok && HTML_DIRLIST_ON != nTok ) 2009cdf0e10cSrcweir return sal_True; 2010cdf0e10cSrcweir 2011cdf0e10cSrcweir // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#) 2012cdf0e10cSrcweir if( nStart == 1 && '!' 
== sCmp.GetChar( 1 ) ) 2013cdf0e10cSrcweir return sal_True; 2014cdf0e10cSrcweir 2015cdf0e10cSrcweir // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen 2016cdf0e10cSrcweir nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html ); 2017cdf0e10cSrcweir if( nStart!=STRING_NOTFOUND && 2018cdf0e10cSrcweir nStart>0 && '<'==sCmp.GetChar(nStart-1) && 2019cdf0e10cSrcweir nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) ) 2020cdf0e10cSrcweir return sal_True; 2021cdf0e10cSrcweir 2022cdf0e10cSrcweir // sonst ist es wohl doch eher kein HTML-Dokument 2023cdf0e10cSrcweir return sal_False; 2024cdf0e10cSrcweir } 2025cdf0e10cSrcweir 2026cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL ) 2027cdf0e10cSrcweir { 2028cdf0e10cSrcweir if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) || 2029cdf0e10cSrcweir rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL ) 2030cdf0e10cSrcweir return sal_False; 2031cdf0e10cSrcweir 2032cdf0e10cSrcweir sal_Bool bFound = sal_False; 2033cdf0e10cSrcweir 2034cdf0e10cSrcweir if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL ) 2035cdf0e10cSrcweir { 2036cdf0e10cSrcweir String aName( rURL.Copy(16) ); 2037cdf0e10cSrcweir switch( aName.GetChar(0) ) 2038cdf0e10cSrcweir { 2039cdf0e10cSrcweir case 'b': 2040cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary ); 2041cdf0e10cSrcweir break; 2042cdf0e10cSrcweir case 'i': 2043cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) || 2044cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index ); 2045cdf0e10cSrcweir break; 2046cdf0e10cSrcweir case 'm': 2047cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) || 2048cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie ); 2049cdf0e10cSrcweir break; 2050cdf0e10cSrcweir case 's': 2051cdf0e10cSrcweir bFound = aName.EqualsAscii( 
OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound ); 2052cdf0e10cSrcweir break; 2053cdf0e10cSrcweir case 't': 2054cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) || 2055cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text ); 2056cdf0e10cSrcweir break; 2057cdf0e10cSrcweir case 'u': 2058cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown ); 2059cdf0e10cSrcweir break; 2060cdf0e10cSrcweir } 2061cdf0e10cSrcweir } 2062cdf0e10cSrcweir else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL ) 2063cdf0e10cSrcweir { 2064cdf0e10cSrcweir String aName( rURL.Copy(14) ); 2065cdf0e10cSrcweir switch( aName.GetChar(0) ) 2066cdf0e10cSrcweir { 2067cdf0e10cSrcweir case 'b': 2068cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata ); 2069cdf0e10cSrcweir break; 2070cdf0e10cSrcweir case 'd': 2071cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed ); 2072cdf0e10cSrcweir break; 2073cdf0e10cSrcweir case 'e': 2074cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed ); 2075cdf0e10cSrcweir break; 2076cdf0e10cSrcweir case 'i': 2077cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure ); 2078cdf0e10cSrcweir break; 2079cdf0e10cSrcweir case 'n': 2080cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound ); 2081cdf0e10cSrcweir break; 2082cdf0e10cSrcweir } 2083cdf0e10cSrcweir } 2084cdf0e10cSrcweir if( bFound ) 2085cdf0e10cSrcweir { 2086cdf0e10cSrcweir String sTmp ( rURL ); 2087cdf0e10cSrcweir rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image ); 2088cdf0e10cSrcweir rURL.Append( sTmp ); 2089cdf0e10cSrcweir } 2090cdf0e10cSrcweir 2091cdf0e10cSrcweir return bFound; 2092cdf0e10cSrcweir } 2093cdf0e10cSrcweir 2094cdf0e10cSrcweir #ifdef USED 2095cdf0e10cSrcweir void HTMLParser::SaveState( int nToken ) 2096cdf0e10cSrcweir { 
2097cdf0e10cSrcweir SvParser::SaveState( nToken ); 2098cdf0e10cSrcweir } 2099cdf0e10cSrcweir 2100cdf0e10cSrcweir void HTMLParser::RestoreState() 2101cdf0e10cSrcweir { 2102cdf0e10cSrcweir SvParser::RestoreState(); 2103cdf0e10cSrcweir } 2104cdf0e10cSrcweir #endif 2105cdf0e10cSrcweir 2106cdf0e10cSrcweir 2107cdf0e10cSrcweir enum eHtmlMetas { 2108cdf0e10cSrcweir HTML_META_NONE = 0, 2109cdf0e10cSrcweir HTML_META_AUTHOR, 2110cdf0e10cSrcweir HTML_META_DESCRIPTION, 2111cdf0e10cSrcweir HTML_META_KEYWORDS, 2112cdf0e10cSrcweir HTML_META_REFRESH, 2113cdf0e10cSrcweir HTML_META_CLASSIFICATION, 2114cdf0e10cSrcweir HTML_META_CREATED, 2115cdf0e10cSrcweir HTML_META_CHANGEDBY, 2116cdf0e10cSrcweir HTML_META_CHANGED, 2117cdf0e10cSrcweir HTML_META_GENERATOR, 2118cdf0e10cSrcweir HTML_META_SDFOOTNOTE, 2119cdf0e10cSrcweir HTML_META_SDENDNOTE, 2120cdf0e10cSrcweir HTML_META_CONTENT_TYPE 2121cdf0e10cSrcweir }; 2122cdf0e10cSrcweir 2123cdf0e10cSrcweir // <META NAME=xxx> 2124cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] = 2125cdf0e10cSrcweir { 2126cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR }, 2127cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED }, 2128cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY }, 2129cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION}, 2130cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE }, 2131cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED }, 2132cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION }, 2133cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS }, 2134cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR }, 2135cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH }, 2136cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE }, 
2137cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE }, 2138cdf0e10cSrcweir { 0, 0 } 2139cdf0e10cSrcweir }; 2140cdf0e10cSrcweir 2141cdf0e10cSrcweir 2142cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & ) 2143cdf0e10cSrcweir { 2144cdf0e10cSrcweir } 2145cdf0e10cSrcweir 2146cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl( 2147cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2148cdf0e10cSrcweir SvKeyValueIterator *i_pHTTPHeader, 2149cdf0e10cSrcweir const HTMLOptions *i_pOptions, 2150cdf0e10cSrcweir rtl_TextEncoding& o_rEnc ) 2151cdf0e10cSrcweir { 2152cdf0e10cSrcweir String aName, aContent; 2153cdf0e10cSrcweir sal_uInt16 nAction = HTML_META_NONE; 2154cdf0e10cSrcweir bool bHTTPEquiv = false, bChanged = false; 2155cdf0e10cSrcweir 2156cdf0e10cSrcweir for ( sal_uInt16 i = i_pOptions->Count(); i; ) 2157cdf0e10cSrcweir { 2158cdf0e10cSrcweir const HTMLOption *pOption = (*i_pOptions)[ --i ]; 2159cdf0e10cSrcweir switch ( pOption->GetToken() ) 2160cdf0e10cSrcweir { 2161cdf0e10cSrcweir case HTML_O_NAME: 2162cdf0e10cSrcweir aName = pOption->GetString(); 2163cdf0e10cSrcweir if ( HTML_META_NONE==nAction ) 2164cdf0e10cSrcweir { 2165cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2166cdf0e10cSrcweir } 2167cdf0e10cSrcweir break; 2168cdf0e10cSrcweir case HTML_O_HTTPEQUIV: 2169cdf0e10cSrcweir aName = pOption->GetString(); 2170cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2171cdf0e10cSrcweir bHTTPEquiv = true; 2172cdf0e10cSrcweir break; 2173cdf0e10cSrcweir case HTML_O_CONTENT: 2174cdf0e10cSrcweir aContent = pOption->GetString(); 2175cdf0e10cSrcweir break; 2176cdf0e10cSrcweir } 2177cdf0e10cSrcweir } 2178cdf0e10cSrcweir 2179cdf0e10cSrcweir if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction ) 2180cdf0e10cSrcweir { 2181cdf0e10cSrcweir // if it is not a Description, remove CRs and LFs from CONTENT 2182cdf0e10cSrcweir aContent.EraseAllChars( _CR ); 
2183cdf0e10cSrcweir aContent.EraseAllChars( _LF ); 2184cdf0e10cSrcweir } 2185cdf0e10cSrcweir else 2186cdf0e10cSrcweir { 2187cdf0e10cSrcweir // convert line endings for Description 2188cdf0e10cSrcweir aContent.ConvertLineEnd(); 2189cdf0e10cSrcweir } 2190cdf0e10cSrcweir 2191cdf0e10cSrcweir 2192cdf0e10cSrcweir if ( bHTTPEquiv && i_pHTTPHeader ) 2193cdf0e10cSrcweir { 2194cdf0e10cSrcweir // #57232#: Netscape seems to just ignore a closing ", so we do too 2195cdf0e10cSrcweir if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) ) 2196cdf0e10cSrcweir { 2197cdf0e10cSrcweir aContent.Erase( aContent.Len() - 1 ); 2198cdf0e10cSrcweir } 2199cdf0e10cSrcweir SvKeyValue aKeyValue( aName, aContent ); 2200cdf0e10cSrcweir i_pHTTPHeader->Append( aKeyValue ); 2201cdf0e10cSrcweir } 2202cdf0e10cSrcweir 2203cdf0e10cSrcweir switch ( nAction ) 2204cdf0e10cSrcweir { 2205cdf0e10cSrcweir case HTML_META_AUTHOR: 2206cdf0e10cSrcweir if (i_xDocProps.is()) { 2207cdf0e10cSrcweir i_xDocProps->setAuthor( aContent ); 2208cdf0e10cSrcweir bChanged = true; 2209cdf0e10cSrcweir } 2210cdf0e10cSrcweir break; 2211cdf0e10cSrcweir case HTML_META_DESCRIPTION: 2212cdf0e10cSrcweir if (i_xDocProps.is()) { 2213cdf0e10cSrcweir i_xDocProps->setDescription( aContent ); 2214cdf0e10cSrcweir bChanged = true; 2215cdf0e10cSrcweir } 2216cdf0e10cSrcweir break; 2217cdf0e10cSrcweir case HTML_META_KEYWORDS: 2218cdf0e10cSrcweir if (i_xDocProps.is()) { 2219cdf0e10cSrcweir i_xDocProps->setKeywords( 2220cdf0e10cSrcweir ::comphelper::string::convertCommaSeparated(aContent)); 2221cdf0e10cSrcweir bChanged = true; 2222cdf0e10cSrcweir } 2223cdf0e10cSrcweir break; 2224cdf0e10cSrcweir case HTML_META_CLASSIFICATION: 2225cdf0e10cSrcweir if (i_xDocProps.is()) { 2226cdf0e10cSrcweir i_xDocProps->setSubject( aContent ); 2227cdf0e10cSrcweir bChanged = true; 2228cdf0e10cSrcweir } 2229cdf0e10cSrcweir break; 2230cdf0e10cSrcweir 2231cdf0e10cSrcweir case HTML_META_CHANGEDBY: 2232cdf0e10cSrcweir if (i_xDocProps.is()) { 2233cdf0e10cSrcweir 
i_xDocProps->setModifiedBy( aContent ); 2234cdf0e10cSrcweir } 2235cdf0e10cSrcweir break; 2236cdf0e10cSrcweir 2237cdf0e10cSrcweir case HTML_META_CREATED: 2238cdf0e10cSrcweir case HTML_META_CHANGED: 2239cdf0e10cSrcweir if ( i_xDocProps.is() && aContent.Len() && 2240cdf0e10cSrcweir aContent.GetTokenCount() == 2 ) 2241cdf0e10cSrcweir { 2242cdf0e10cSrcweir Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() ); 2243cdf0e10cSrcweir Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() ); 2244cdf0e10cSrcweir DateTime aDateTime( aDate, aTime ); 2245cdf0e10cSrcweir ::util::DateTime uDT(aDateTime.Get100Sec(), 2246cdf0e10cSrcweir aDateTime.GetSec(), aDateTime.GetMin(), 2247cdf0e10cSrcweir aDateTime.GetHour(), aDateTime.GetDay(), 2248cdf0e10cSrcweir aDateTime.GetMonth(), aDateTime.GetYear()); 2249cdf0e10cSrcweir if ( HTML_META_CREATED==nAction ) 2250cdf0e10cSrcweir i_xDocProps->setCreationDate( uDT ); 2251cdf0e10cSrcweir else 2252cdf0e10cSrcweir i_xDocProps->setModificationDate( uDT ); 2253cdf0e10cSrcweir bChanged = true; 2254cdf0e10cSrcweir } 2255cdf0e10cSrcweir break; 2256cdf0e10cSrcweir 2257cdf0e10cSrcweir case HTML_META_REFRESH: 2258cdf0e10cSrcweir DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, 2259cdf0e10cSrcweir "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" ); 2260cdf0e10cSrcweir break; 2261cdf0e10cSrcweir 2262cdf0e10cSrcweir case HTML_META_CONTENT_TYPE: 2263cdf0e10cSrcweir if ( aContent.Len() ) 2264cdf0e10cSrcweir { 2265cdf0e10cSrcweir o_rEnc = GetEncodingByMIME( aContent ); 2266cdf0e10cSrcweir } 2267cdf0e10cSrcweir break; 2268cdf0e10cSrcweir 2269cdf0e10cSrcweir case HTML_META_NONE: 2270cdf0e10cSrcweir if ( !bHTTPEquiv ) 2271cdf0e10cSrcweir { 2272cdf0e10cSrcweir if (i_xDocProps.is()) 2273cdf0e10cSrcweir { 2274cdf0e10cSrcweir uno::Reference<beans::XPropertyContainer> xUDProps 2275cdf0e10cSrcweir = i_xDocProps->getUserDefinedProperties(); 2276cdf0e10cSrcweir try { 2277cdf0e10cSrcweir xUDProps->addProperty(aName, 2278cdf0e10cSrcweir 
beans::PropertyAttribute::REMOVEABLE, 2279cdf0e10cSrcweir uno::makeAny(::rtl::OUString(aContent))); 2280cdf0e10cSrcweir AddMetaUserDefined(aName); 2281cdf0e10cSrcweir bChanged = true; 2282cdf0e10cSrcweir } catch (uno::Exception &) { 2283cdf0e10cSrcweir // ignore 2284cdf0e10cSrcweir } 2285cdf0e10cSrcweir } 2286cdf0e10cSrcweir } 2287cdf0e10cSrcweir break; 2288cdf0e10cSrcweir default: 2289cdf0e10cSrcweir break; 2290cdf0e10cSrcweir } 2291cdf0e10cSrcweir 2292cdf0e10cSrcweir return bChanged; 2293cdf0e10cSrcweir } 2294cdf0e10cSrcweir 2295cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions( 2296cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2297cdf0e10cSrcweir SvKeyValueIterator *i_pHeader ) 2298cdf0e10cSrcweir { 2299cdf0e10cSrcweir sal_uInt16 nContentOption = HTML_O_CONTENT; 2300cdf0e10cSrcweir rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; 2301cdf0e10cSrcweir 2302cdf0e10cSrcweir bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, 2303cdf0e10cSrcweir GetOptions(&nContentOption), 2304cdf0e10cSrcweir eEnc ); 2305cdf0e10cSrcweir 2306cdf0e10cSrcweir // If the encoding is set by a META tag, it may only overwrite the 2307cdf0e10cSrcweir // current encoding if both, the current and the new encoding, are 1-sal_uInt8 2308cdf0e10cSrcweir // encodings. Everything else cannot lead to reasonable results. 
2309cdf0e10cSrcweir if (RTL_TEXTENCODING_DONTKNOW != eEnc && 2310cdf0e10cSrcweir rtl_isOctetTextEncoding( eEnc ) && 2311cdf0e10cSrcweir rtl_isOctetTextEncoding( GetSrcEncoding() ) ) 2312cdf0e10cSrcweir { 2313cdf0e10cSrcweir eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973# 2314cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2315cdf0e10cSrcweir } 2316cdf0e10cSrcweir 2317cdf0e10cSrcweir return bRet; 2318cdf0e10cSrcweir } 2319cdf0e10cSrcweir 2320cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime ) 2321cdf0e10cSrcweir { 2322cdf0e10cSrcweir ByteString sType; 2323cdf0e10cSrcweir ByteString sSubType; 2324cdf0e10cSrcweir INetContentTypeParameterList aParameters; 2325cdf0e10cSrcweir ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); 2326cdf0e10cSrcweir if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) 2327cdf0e10cSrcweir { 2328cdf0e10cSrcweir const INetContentTypeParameter * pCharset 2329cdf0e10cSrcweir = aParameters.find("charset"); 2330cdf0e10cSrcweir if (pCharset != 0) 2331cdf0e10cSrcweir { 2332cdf0e10cSrcweir ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); 2333cdf0e10cSrcweir return GetExtendedCompatibilityTextEncoding( 2334cdf0e10cSrcweir rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) ); 2335cdf0e10cSrcweir } 2336cdf0e10cSrcweir } 2337cdf0e10cSrcweir return RTL_TEXTENCODING_DONTKNOW; 2338cdf0e10cSrcweir } 2339cdf0e10cSrcweir 2340cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader ) 2341cdf0e10cSrcweir { 2342cdf0e10cSrcweir rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; 2343cdf0e10cSrcweir if( pHTTPHeader ) 2344cdf0e10cSrcweir { 2345cdf0e10cSrcweir SvKeyValue aKV; 2346cdf0e10cSrcweir for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont; 2347cdf0e10cSrcweir bCont = pHTTPHeader->GetNext( aKV ) ) 2348cdf0e10cSrcweir { 2349cdf0e10cSrcweir if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) ) 
2350cdf0e10cSrcweir { 2351cdf0e10cSrcweir if( aKV.GetValue().Len() ) 2352cdf0e10cSrcweir { 2353cdf0e10cSrcweir eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() ); 2354cdf0e10cSrcweir } 2355cdf0e10cSrcweir } 2356cdf0e10cSrcweir } 2357cdf0e10cSrcweir } 2358cdf0e10cSrcweir return eRet; 2359cdf0e10cSrcweir } 2360cdf0e10cSrcweir 2361cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader( 2362cdf0e10cSrcweir SvKeyValueIterator *pHTTPHeader ) 2363cdf0e10cSrcweir { 2364cdf0e10cSrcweir sal_Bool bRet = sal_False; 2365cdf0e10cSrcweir rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader ); 2366cdf0e10cSrcweir if(RTL_TEXTENCODING_DONTKNOW != eEnc) 2367cdf0e10cSrcweir { 2368cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2369cdf0e10cSrcweir bRet = sal_True; 2370cdf0e10cSrcweir } 2371cdf0e10cSrcweir return bRet; 2372cdf0e10cSrcweir } 2373cdf0e10cSrcweir 2374cdf0e10cSrcweir 2375