khtml Library API Documentation

htmltokenizer.cpp

00001 /*
00002     This file is part of the KDE libraries
00003 
00004     Copyright (C) 1997 Martin Jones (mjones@kde.org)
00005               (C) 1997 Torben Weis (weis@kde.org)
00006               (C) 1998 Waldo Bastian (bastian@kde.org)
00007               (C) 1999 Lars Knoll (knoll@kde.org)
00008               (C) 1999 Antti Koivisto (koivisto@kde.org)
00009               (C) 2001-2003 Dirk Mueller (mueller@kde.org)
00010               (C) 2002 Apple Computer, Inc.
00011 
00012     This library is free software; you can redistribute it and/or
00013     modify it under the terms of the GNU Library General Public
00014     License as published by the Free Software Foundation; either
00015     version 2 of the License, or (at your option) any later version.
00016 
00017     This library is distributed in the hope that it will be useful,
00018     but WITHOUT ANY WARRANTY; without even the implied warranty of
00019     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00020     Library General Public License for more details.
00021 
00022     You should have received a copy of the GNU Library General Public License
00023     along with this library; see the file COPYING.LIB.  If not, write to
00024     the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00025     Boston, MA 02111-1307, USA.
00026 */
00027 //----------------------------------------------------------------------------
00028 //
00029 // KDE HTML Widget - Tokenizers
00030 // $Id: htmltokenizer.cpp,v 1.248.2.6 2003/04/15 01:45:19 mueller Exp $
00031 
00032 //#define TOKEN_DEBUG 1
00033 //#define TOKEN_DEBUG 2
00034 
00035 #ifdef HAVE_CONFIG_H
00036 #include "config.h"
00037 #endif
00038 
00039 //#include <string.h>
00040 #include "html/htmltokenizer.h"
00041 #include "html/html_documentimpl.h"
00042 #include "html/htmlparser.h"
00043 #include "html/dtd.h"
00044 
00045 #include "misc/loader.h"
00046 #include "misc/htmlhashes.h"
00047 
00048 #include "khtmlview.h"
00049 #include "khtml_part.h"
00050 #include "xml/dom_docimpl.h"
00051 #include "css/csshelper.h"
00052 #include "ecma/kjs_proxy.h"
00053 #include <kcharsets.h>
00054 #include <kglobal.h>
00055 #include <ctype.h>
00056 #include <assert.h>
00057 #include <qvariant.h>
00058 #include <kdebug.h>
00059 #include <stdlib.h>
00060 
00061 #include "kentities.c"
00062 
00063 using namespace khtml;
00064 
00065 static const QChar commentStart [] = { '<','!','-','-', QChar::null };
00066 
00067 static const char scriptEnd [] = "</script";
00068 static const char xmpEnd [] = "</xmp";
00069 static const char styleEnd [] =  "</style";
00070 static const char textareaEnd [] = "</textarea";
00071 static const char titleEnd [] = "</title";
00072 
      // Raw C-heap helpers for QChar vectors. The memory comes straight from
      // malloc()/realloc(), so no QChar constructor is run on the elements, and
      // it must be released with KHTML_DELETE_QCHAR_VEC (free), never delete[].
      //
      // KHTML_ALLOC_QCHAR_VEC(N)      yields a new block of N QChars.
      // KHTML_REALLOC_QCHAR_VEC(P, N) resizes the block held in the lvalue P to
      //                               N QChars, stores the result back into P
      //                               and also yields it, so it can be used as
      //                               a statement or on the right of '='.
      // KHTML_DELETE_QCHAR_VEC(P)     frees a block obtained from the above.
00073 #define KHTML_ALLOC_QCHAR_VEC( N ) ( (QChar*) malloc( sizeof(QChar)*( N ) ) )
00074 #define KHTML_REALLOC_QCHAR_VEC(P, N ) ( (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) )
00075 #define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
00076 
00077 // Full support for MS Windows extensions to Latin-1.
00078 // Technically these extensions should only be activated for pages
00079 // marked "windows-1252" or "cp1252", but
00080 // in the standard Microsoft way, these extensions infect hundreds of thousands
00081 // of web pages.  Note that people with non-latin-1 Microsoft extensions
00082 // are SOL.
00083 //
00084 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
00085 //      http://www.bbsinc.com/iso8859.html
00086 //      http://www.obviously.com/
00087 //
00088 // There may be better equivalents
      //
      // fixUpChar(x) rewrites x in place when x.row() is 0 and x.cell() is one
      // of the values listed below (all within 0x80-0x9F); every other character
      // is left untouched. Some replacements are real Unicode code points
      // (e.g. 0x80 -> 0x20AC), others are plain ASCII stand-ins
      // (e.g. 0x93 and 0x94 -> '"'). 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no case
      // and fall into the default branch.
      //
      // NOTE(review): the macro expands to a bare `if` statement and evaluates x
      // more than once, so pass a side-effect-free lvalue and do not use it as
      // the unbraced body of an if/else.
00089 #if 0
00090 #define fixUpChar(x)
00091 #else
00092 #define fixUpChar(x) \
00093             if (!(x).row() ) { \
00094                 switch ((x).cell()) \
00095                 { \
00096                 /* ALL of these should be changed to Unicode SOON */ \
00097                 case 0x80: (x) = 0x20ac; break; \
00098                 case 0x82: (x) = ',';    break; \
00099                 case 0x83: (x) = 0x0192; break; \
00100                 case 0x84: (x) = '"';    break; \
00101                 case 0x85: (x) = 0x2026; break; \
00102                 case 0x86: (x) = 0x2020; break; \
00103                 case 0x87: (x) = 0x2021; break; \
00104                 case 0x88: (x) = 0x02C6; break; \
00105                 case 0x89: (x) = 0x2030; break; \
00106                 case 0x8A: (x) = 0x0160; break; \
00107                 case 0x8b: (x) = '<';    break; \
00108                 case 0x8C: (x) = 0x0152; break; \
00109 \
00110                 case 0x8E: (x) = 0x017D; break; \
00111 \
00112 \
00113                 case 0x91: (x) = '\'';   break; \
00114                 case 0x92: (x) = '\'';   break; \
00115                 case 0x93: (x) = '"';    break; \
00116                 case 0x94: (x) = '"';    break; \
00117                 case 0x95: (x) = '*';    break; \
00118                 case 0x96: (x) = '-';    break; \
00119                 case 0x97: (x) = '-';    break; \
00120                 case 0x98: (x) = '~';    break; \
00121                 case 0x99: (x) = 0x2122; break; \
00122                 case 0x9A: (x) = 0x0161; break; \
00123                 case 0x9b: (x) = '>';    break; \
00124                 case 0x9C: (x) = 0x0153; break; \
00125 \
00126                 case 0x9E: (x) = 0x017E; break; \
00127                 case 0x9F: (x) = 0x0178; break; \
00128                 default: break; \
00129                 } \
00130             }
00131 #endif
00132 
00133 // ----------------------------------------------------------------------------
00134 
00135 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view)
00136 {
          // View-backed tokenizer: remembers _view and creates the KHTMLParser
          // that receives the tokens, handing it the view and the document.

          // reset() below asserts that no script is running and nothing is on hold.
00137     m_executingScript = 0;
00138     onHold = false;

          // No output or script buffer yet; the output buffer is allocated by begin().
00139     buffer = scriptCode = 0;
00140     scriptCodeResync = 0;
00141     scriptCodeMaxSize = 0;
00142     scriptCodeSize = 0;

00143     view = _view;
00144     charsets = KGlobal::charsets();
00145     parser = new KHTMLParser(_view, _doc);
00146     reset();
00147 }
00148 
00149 HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i)
00150 {
          // Fragment tokenizer: there is no view (view stays 0) and the
          // KHTMLParser is created for the document fragment i instead.

          // reset() below asserts that no script is running and nothing is on hold.
00151     m_executingScript = 0;
00152     onHold = false;

          // No output or script buffer yet; the output buffer is allocated by begin().
00153     buffer = scriptCode = 0;
00154     scriptCodeResync = 0;
00155     scriptCodeMaxSize = 0;
00156     scriptCodeSize = 0;

00157     view = 0;
00158     charsets = KGlobal::charsets();
00159     parser = new KHTMLParser( i, _doc );
00160     reset();
00161 }
00162 
00163 void HTMLTokenizer::reset()
00164 {
          // Releases everything the tokenizer holds: queued external scripts,
          // the output buffer, the script buffer and the current token.
          // Must not be called while a script executes or while on hold.
00165     assert(m_executingScript == 0);
00166     assert(onHold == false);

          // Drop our reference on every script still waiting in the queue.
00167     for ( ; !cachedScript.isEmpty(); )
00168         cachedScript.dequeue()->deref(this);

          // Output buffer and its write cursor.
00169     if ( buffer != 0 )
00170         KHTML_DELETE_QCHAR_VEC(buffer);
00171     dest = 0;
00172     buffer = 0;
00173     size = 0;

          // Script/special-content buffer and its bookkeeping.
00174     if ( scriptCode != 0 )
00175         KHTML_DELETE_QCHAR_VEC(scriptCode);
00176     scriptCode = 0;
00177     scriptCodeResync = 0;
00178     scriptCodeMaxSize = 0;
00179     scriptCodeSize = 0;

00180     currToken.reset();
00181 }
00183 
00184 void HTMLTokenizer::begin()
00185 {
          // Puts the tokenizer into its initial state for a new input stream.

          // Clear the two conditions reset() asserts on, then release old state.
00186     m_executingScript = 0;
00187     onHold = false;
00188     reset();

          // Fresh output buffer: 255 QChars are allocated while `size` is set to 254.
00189     buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
00190     dest = buffer;
00191     size = 254;

          // State machines start idle.
00192     tag = NoTag;
00193     Entity = NoEntity;
00194     tquote = NoQuote;
00195     pending = NonePending;
00196     discard = NoneDiscard;

          // Element and mode flags are all cleared.
00197     pre = plaintext = xmp = textarea = title = select = false;
00198     script = style = comment = server = processingInstruction = false;
00199     escaped = skipLF = startTag = noMoreData = false;
00200     brokenComments = brokenServer = false;

          // Positions and line counters restart from zero.
00201     prePos = 0;
00202     searchCount = 0;
00203     lineno = 0;
00204     scriptStartLineno = 0;
00205     tagStartLineno = 0;
00206 }
00220 
00221 void HTMLTokenizer::processListing(DOMStringIt list)
00222 {
          // Copies the characters of 'list' into the output buffer (via dest),
          // normalising line ends and routing spaces/tabs/newlines through the
          // 'pending' mechanism. 'pre' is forced on for the duration of the call
          // (unless we are inside <style>) and restored before returning.
00223     bool old_pre = pre;
00224 
00225     // This function adds the listing 'list' as
00226     // preformatted text-tokens to the token-collection
00227     // thereby converting TABs.
          // (The TAB expansion itself is presumably done by addPending(), which
          // is not visible here.)
00228     if(!style) pre = true;
00229     prePos = 0;
00230 
00231     while ( list.length() )
00232     {
              // Reserve room for up to 3*TAB_SIZE characters before writing.
00233         checkBuffer(3*TAB_SIZE);
00234 
              // A pending "skip the LF of a CRLF" only applies if the next
              // character really is '\n'; otherwise forget about it.
00235         if (skipLF && ( *list != '\n' ))
00236         {
00237             skipLF = false;
00238         }
00239 
00240         if (skipLF)
00241         {
                  // This is the '\n' following a '\r': swallow it.
00242             skipLF = false;
00243             ++list;
00244         }
00245         else if (( *list == '\n' ) || ( *list == '\r' ))
00246         {
00247             if (discard == LFDiscard)
00248             {
00249                 // Ignore this LF
00250                 discard = NoneDiscard; // We have discarded 1 LF
00251             }
00252             else
00253             {
00254                 // Process this LF
                      // Flush whatever whitespace was pending, then remember the LF.
00255                 if (pending)
00256                     addPending();
00257                 pending = LFPending;
00258             }
00259             /* Check for MS-DOS CRLF sequence */
00260             if (*list == '\r')
00261             {
00262                 skipLF = true;
00263             }
00264             ++list;
00265         }
00266         else if (( *list == ' ' ) || ( *list == '\t'))
00267         {
                  // Whitespace is not written immediately; it becomes the new
                  // pending item after the previous one has been flushed.
00268             if (pending)
00269                 addPending();
00270             if (*list == ' ')
00271                 pending = SpacePending;
00272             else
00273                 pending = TabPending;
00274 
00275             ++list;
00276         }
00277         else
00278         {
                  // Ordinary character: cancels any LF discard, flushes pending
                  // whitespace and is copied verbatim.
00279             discard = NoneDiscard;
00280             if (pending)
00281                 addPending();
00282 
00283             prePos++;
00284             *dest++ = *list;
00285             ++list;
00286         }
00287 
00288     }
00289 
          // A trailing space or tab is still emitted; a trailing LF is dropped.
00290     if ((pending == SpacePending) || (pending == TabPending))
00291         addPending();
00292     else
00293         pending = NonePending;
00294 
00295     prePos = 0;
00296     pre = old_pre;
00297 }
00298 
00299 void HTMLTokenizer::parseSpecial(DOMStringIt &src)
00300 {
          // Collects the raw content of exactly one of <xmp>, <textarea>,
          // <title>, <style> or <script> into scriptCode until the matching
          // end tag (searchStopper) has been seen and closed with '>'.
          // Returns when the end tag was handled or when src is exhausted; in
          // the latter case the member state lets a later call continue.
00301     assert( textarea || title || !Entity );
00302     assert( !tag );
00303     assert( xmp+textarea+title+style+script == 1 );
00304     if (script)
00305         scriptStartLineno = lineno+src.lineCount();
00306 
          // Still inside a comment from a previous call: finish that first.
00307     if ( comment ) parseComment( src );
00308 
00309     while ( src.length() ) {
00310         checkScriptBuffer();
00311         unsigned char ch = src->latin1();
              // "<!-" already collected and the next char is '-': a comment
              // starts here (only honoured for script/style content).
00312         if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && QConstString( scriptCode+scriptCodeSize-3, 3 ).string() == "<!-" ) {
00313             comment = true;
00314             parseComment( src );
00315             continue;
00316         }
              // An end tag was recognised earlier (scriptCodeResync set) and
              // this is its closing '>' outside of any quotes.
00317         if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
00318             ++src;
                  // Cut the collected text back to just before the end tag.
00319             scriptCodeSize = scriptCodeResync-1;
00320             scriptCodeResync = 0;
00321             scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
00322             if ( script )
00323                 scriptHandler();
00324             else {
                      // Emit the collected text, then the matching close tag.
00325                 processListing(DOMStringIt(scriptCode, scriptCodeSize));
00326                 processToken();
00327                 if ( style )         { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
00328                 else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
00329                 else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
00330                 else if ( xmp )  { currToken.id = ID_XMP + ID_CLOSE_TAG; }
00331                 processToken();
                      // NOTE(review): 'style' appears twice in this chain; the
                      // second assignment is redundant.
00332                 style = script = style = textarea = title = xmp = false;
00333                 tquote = NoQuote;
00334                 scriptCodeSize = scriptCodeResync = 0;
00335             }
00336             return;
00337         }
00338         // possible end of tagname, lets check.
              // The tail of scriptCode is compared with searchStopper (find()
              // returning 0 means it matches at the start; the 'false' argument
              // is presumably "case-insensitive" - confirm against QString::find).
00339         if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
00340              scriptCodeSize >= searchStopperLen &&
00341              !QConstString( scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen ).string().find( searchStopper, 0, false )) {
                  // Remember (1-based) where the end tag starts; src is not
                  // advanced, so ch is re-examined on the next iteration.
00342             scriptCodeResync = scriptCodeSize-searchStopperLen+1;
00343             tquote = NoQuote;
00344             continue;
00345         }
              // Inside the end tag: track quotes so a quoted '>' does not end it.
              // A line break resets the quote state.
00346         if ( scriptCodeResync && !escaped ) {
00347             if(ch == '\"')
00348                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
00349             else if(ch == '\'')
00350                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
00351             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
00352                 tquote = NoQuote;
00353         }
              // A backslash escapes the following character only; two
              // backslashes in a row cancel each other out.
00354         escaped = ( !escaped && ch == '\\' );
00355         if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
                  // Entities are resolved inside <textarea> and <title>; the
                  // entity parser writes directly into scriptCode.
00356             QChar *scriptCodeDest = scriptCode+scriptCodeSize;
00357             ++src;
00358             parseEntity(src,scriptCodeDest,true);
00359             scriptCodeSize = scriptCodeDest-scriptCode;
00360         }
00361         else {
00362             scriptCode[ scriptCodeSize++ ] = *src;
00363             ++src;
00364         }
00365     }
00366 }
00367 
00368 void HTMLTokenizer::scriptHandler()
00369 {
          // Called once the closing tag of a <script> element has been seen.
          // Emits the script text and the </script> token, then either requests
          // the external script named by scriptSrc or executes the inline code,
          // and finally stitches the not-yet-tokenized input back together.

          // Take over the src attribute value and clear it for the next script.
00370     QString currentScriptSrc = scriptSrc;
00371     scriptSrc = QString::null;
00372 
00373     processListing(DOMStringIt(scriptCode, scriptCodeSize));
          // Copy of what processListing() put into the output buffer; this is
          // the text that gets executed for an inline script.
00374     QString exScript( buffer, dest-buffer );
00375 
00376     processToken();
00377     currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
00378     processToken();
00379 
00380     QString prependingSrc;
00381 
00382     if ( !parser->skipMode() ) {
00383         CachedScript* cs = 0;
00384 
00385         // forget what we just got, load from src url instead
00386         if ( !currentScriptSrc.isEmpty() &&
00387              (cs = parser->doc()->docLoader()->requestScript(currentScriptSrc, scriptSrcCharset) ))
00388             cachedScript.enqueue(cs);
00389 
00390         if (cs) {
                  // External script: park the remaining input in pendingSrc and
                  // register with the cached script via ref(this).
00391             pendingSrc.prepend( QString(src.current(), src.length() ) );
00392             setSrc(QString::null);
00393             scriptCodeSize = scriptCodeResync = 0;
00394             cs->ref(this);
00395 
00396         }
00397         else if (currentScriptSrc.isEmpty() && view && javascript ) {
                  // Inline script: set the remaining input aside, run the code,
                  // and re-attach the input below.
00398             if ( !m_executingScript )
00399                 pendingSrc.prepend( QString( src.current(), src.length() ) ); // deep copy - again
00400             else
00401                 prependingSrc = QString( src.current(), src.length() ); // deep copy
00402 
00403             setSrc(QString::null);
00404             scriptCodeSize = scriptCodeResync = 0;
00405             scriptExecution( exScript, QString::null, tagStartLineno /*scriptStartLineno*/ );
00406         }
00407     }
00408 
00409     script = false;
00410     scriptCodeSize = scriptCodeResync = 0;
00411 
00412     if ( !m_executingScript && cachedScript.isEmpty() ) {
00413         // kdDebug( 6036 ) << "adding pending Output to parsed string" << endl;
              // Nothing is running or loading: current input followed by the
              // parked input becomes the new source.
00414         QString newStr = QString(src.current(), src.length());
00415         newStr += pendingSrc;
00416         setSrc(newStr);
00417         pendingSrc = QString::null;
00418     }
00419     else if ( !prependingSrc.isEmpty() )
              // We were called from within a running script: feed the input that
              // was set aside back through write().
00420         write( prependingSrc, false );
00421 }
00422 
00423 void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
00424                                      int baseLine)
00425 {
          // Runs the script text `str` through the part's executeScript().
          // A null scriptURL means "use the URL of the view's document".
          // m_executingScript is raised for the duration of the call, and the
          // `script` flag is cleared meanwhile and restored afterwards.
00426     const bool wasInScript = script;
00427     ++m_executingScript;
00428     script = false;

00429     QString url( scriptURL );
00430     if ( url.isNull() ) {
00431         DocumentImpl *doc = static_cast<DocumentImpl*>(view->part()->document().handle());
00432         url = doc->URL();
00433     }

00434     view->part()->executeScript(url,baseLine,Node(),str);

00435     --m_executingScript;
00436     script = wasInScript;
00437 }
00439 
00440 void HTMLTokenizer::parseComment(DOMStringIt &src)
00441 {
          // Consumes characters of a comment, appending each one to scriptCode,
          // until the terminating '>' is found or src runs out. On termination
          // 'comment' is cleared; if src runs out first, 'comment' stays set.
          // Make sure scriptCode can take everything that is left in src.
00442     checkScriptBuffer(src.length());
00443     while ( src.length() ) {
00444         scriptCode[ scriptCodeSize++ ] = *src;
00445 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00446         qDebug("comment is now: *%s*",
00447                QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
00448 #endif
              // The comment ends at '>' when either it is preceded by "--", or
              // brokenComments is set and we are not inside script/style (then
              // any '>' ends it).
00449         if (src->unicode() == '>' &&
00450             ( ( brokenComments && !( script || style ) ) ||
00451               ( scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' &&
00452                 scriptCode[scriptCodeSize-2] == '-' ) ) ) {
00453             ++src;
                  // Outside script/xmp/textarea/style the comment text is
                  // dropped (or turned into tokens when COMMENTS_IN_DOM is
                  // defined); inside them it stays in scriptCode as content.
00454             if ( !( script || xmp || textarea || style) ) {
00455 #ifdef COMMENTS_IN_DOM
00456                 checkScriptBuffer();
00457                 scriptCode[ scriptCodeSize ] = 0;
00458                 scriptCode[ scriptCodeSize + 1 ] = 0;
00459                 currToken.id = ID_COMMENT;
00460                 processListing(DOMStringIt(scriptCode, scriptCodeSize - 2));
00461                 processToken();
00462                 currToken.id = ID_COMMENT + ID_CLOSE_TAG;
00463                 processToken();
00464 #endif
00465                 scriptCodeSize = 0;
00466             }
00467             comment = false;
00468             return; // Finished parsing comment
00469         }
00470         ++src;
00471     }
00472 }
00473 
00474 void HTMLTokenizer::parseServer(DOMStringIt &src)
00475 {
          // Skips a server-side include: characters are appended to scriptCode
          // until a '>' directly preceded by '%' is seen. At that point the
          // collected text is discarded and `server` is cleared. If src runs
          // out first, `server` stays set.
00476     checkScriptBuffer(src.length());
00477     for ( ; src.length(); ++src ) {
00478         scriptCode[ scriptCodeSize++ ] = *src;
00479         const bool closesInclude = src->unicode() == '>' &&
00480             scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%';
00481         if ( !closesInclude )
00482             continue;
              // Step over the '>' and throw the collected include away.
00483         ++src;
00484         server = false;
00485         scriptCodeSize = 0;
00486         return;
00487     }
00488 }
00489 
00490 void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src)
00491 {
          // Skips over a processing instruction without emitting anything.
          // Quote state is tracked in tquote so that a '>' inside quotes does
          // not end the instruction, unless it is directly preceded by '?'.
          // NOTE: oldchar is local, so a '?' that ended the previous chunk is
          // not remembered across calls.
00492     char oldchar = 0;
00493     while ( src.length() )
00494     {
00495         unsigned char chbegin = src->latin1();
              // A quote character closes a quote of its own kind; otherwise it
              // switches tquote to its kind.
00496         if(chbegin == '\'') {
00497             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
00498         }
00499         else if(chbegin == '\"') {
00500             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
00501         }
00502         // Look for '?>'
00503         // some crappy sites omit the "?" before it, so
00504         // we look for an unquoted '>' instead. (IE compatible)
00505         else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
00506         {
00507             // We got the terminating '>' (either "?>" or an unquoted '>')
00508             processingInstruction = false;
00509             ++src;
                  // Request that the line feed following the instruction be discarded.
00510             discard=LFDiscard;
00511             return; // Finished parsing processing instruction
00512         }
00513         ++src;
00514         oldchar = chbegin;
00515     }
00516 }
00517 
00518 void HTMLTokenizer::parseText(DOMStringIt &src)
00519 {
          // Copies src to the output buffer unchanged except for line ends:
          // "\r\n", a lone "\r" and a lone "\n" each become a single '\n'.
          // skipLF is a member, so a CRLF pair split across two calls is still
          // collapsed into one '\n'.
00520     for ( ; src.length(); ++src ) {
00521         // make sure there is room for one more character
00522         checkBuffer();
00523 
00524         // only compared against ASCII characters, so latin1 is sufficient
00525         const unsigned char c = src->latin1();
00526 
00527         if ( skipLF ) {
00528             // the previous character was '\r'
00529             skipLF = false;
00530             if ( c == '\n' )
00531                 continue;           // second half of CRLF, already emitted
00532         }
00533 
00534         switch ( c ) {
00535         case '\r':
00536             skipLF = true;
00537             // fall through: a CR is written as LF
00538         case '\n':
00539             *dest++ = '\n';
00540             break;
00541         default:
00542             *dest++ = *src;
00543             break;
00544         }
00545     }
00546 }
00552 
00553 
00554 void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start)
00555 {
          // Parses a character reference whose leading '&' has already been
          // consumed by the caller.
          //   src   - input; advanced past everything that was consumed.
          //   dest  - output cursor; only written (and advanced) when the
          //           reference is unknown and is emitted as literal text.
          //   start - true to begin a new reference, false to continue one.
          // Progress is kept in the members Entity, cBuffer/cBufferPos and
          // EntityChar, so when src runs out the loop simply ends and a later
          // call with start == false can pick up where this one stopped.
00556     if( start )
00557     {
00558         cBufferPos = 0;
00559         Entity = SearchEntity;
00560     }
00561 
00562     while( src.length() )
00563     {
00564         ushort cc = src->unicode();
00565         switch(Entity) {
00566         case NoEntity:
00567             return;
00568 
00569             break;
00570         case SearchEntity:
                  // '#' introduces a numeric reference, anything else a name.
00571             if(cc == '#') {
00572                 cBuffer[cBufferPos++] = cc;
00573                 ++src;
00574                 Entity = NumericSearch;
00575             }
00576             else
00577                 Entity = EntityName;
00578 
00579             break;
00580 
00581         case NumericSearch:
                  // "&#x" / "&#X" -> hexadecimal, "&#<digit>" -> decimal,
                  // anything else ends the reference right away.
00582             if(cc == 'x' || cc == 'X') {
00583                 cBuffer[cBufferPos++] = cc;
00584                 ++src;
00585                 Entity = Hexadecimal;
00586             }
00587             else if(cc >= '0' && cc <= '9')
00588                 Entity = Decimal;
00589             else
00590                 Entity = SearchSemicolon;
00591 
00592             break;
00593 
00594         case Hexadecimal:
00595         {
                  // Accumulate hex digits into EntityChar; at most 9 characters
                  // in total are collected in cBuffer.
00596             int uc = EntityChar.unicode();
00597             int ll = kMin(src.length(), 9-cBufferPos);
00598             while(ll--) {
00599                 QChar csrc(src->lower());
00600                 cc = csrc.cell();
00601 
00602                 if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
00603                     Entity = SearchSemicolon;
00604                     break;
00605                 }
00606                 uc = uc*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
00607                 cBuffer[cBufferPos++] = cc;
00608                 ++src;
00609             }
00610             EntityChar = QChar(uc);
00611             if(cBufferPos == 9) Entity = SearchSemicolon;
00612             break;
00613         }
00614         case Decimal:
00615         {
                  // Same as above for decimal digits.
00616             int uc = EntityChar.unicode();
00617             int ll = kMin(src.length(), 9-cBufferPos);
00618             while(ll--) {
00619                 cc = src->cell();
00620 
00621                 if(src->row() || !(cc >= '0' && cc <= '9')) {
00622                     Entity = SearchSemicolon;
00623                     break;
00624                 }
00625 
00626                 uc = uc * 10 + (cc - '0');
00627                 cBuffer[cBufferPos++] = cc;
00628                 ++src;
00629             }
00630             EntityChar = QChar(uc);
00631             if(cBufferPos == 9)  Entity = SearchSemicolon;
00632             break;
00633         }
00634         case EntityName:
00635         {
                  // Collect up to 9 ASCII letters/digits of the entity name.
00636             int ll = kMin(src.length(), 9-cBufferPos);
00637             while(ll--) {
00638                 QChar csrc = *src;
00639                 cc = csrc.cell();
00640 
00641                 if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
00642                                    (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
00643                     Entity = SearchSemicolon;
00644                     break;
00645                 }
00646 
00647                 cBuffer[cBufferPos++] = cc;
00648                 ++src;
00649             }
00650             if(cBufferPos == 9) Entity = SearchSemicolon;
00651             if(Entity == SearchSemicolon) {
00652                 if(cBufferPos > 1) {
00653                     const entity *e = findEntity(cBuffer, cBufferPos);
00654                     if(e)
00655                         EntityChar = e->code;
00656 
00657                     // be IE compatible
                          // Inside a tag, an entity above 255 only counts when
                          // it is terminated by ';'.
                          // NOTE(review): src may already be exhausted here
                          // (when the 9-character limit was hit) - confirm that
                          // dereferencing it is safe.
00658                     if(tag && EntityChar.unicode() > 255 && *src != ';')
00659                         EntityChar = QChar::null;
00660                 }
00661             }
00662             else
00663                 break;
                  // No break when the name is complete: execution deliberately
                  // falls through into SearchSemicolon below.
00664         }
00665         case SearchSemicolon:
00666 
00667             //kdDebug( 6036 ) << "ENTITY " << EntityChar.unicode() << ", " << res << endl;
00668 
                  // Map Windows-1252 specials in the 0x80-0x9F range.
00669             fixUpChar(EntityChar);
00670 
00671             if ( EntityChar != QChar::null ) {
                      // NOTE(review): checkBuffer() is called here while writes
                      // go through the 'dest' parameter; parseSpecial() passes a
                      // pointer into scriptCode - confirm the right buffer is
                      // being checked in that case.
00672                 checkBuffer();
00673                 // Just insert it
                      // The optional ';' is consumed and the resolved character
                      // is pushed back onto src rather than written to dest.
00674                 if (*src == ';')
00675                     ++src;
00676 
00677                 src.push( EntityChar );
00678             } else {
00679 #ifdef TOKEN_DEBUG
00680                 kdDebug( 6036 ) << "unknown entity!" << endl;
00681 #endif
                      // '&' plus at most 9 collected characters.
00682                 checkBuffer(10);
00683                 // ignore the sequence, add it to the buffer as plaintext
00684                 *dest++ = '&';
00685                 for(unsigned int i = 0; i < cBufferPos; i++)
00686                     dest[i] = cBuffer[i];
00687                 dest += cBufferPos;
                      // NOTE(review): redundant, Entity is reset again below.
00688                 Entity = NoEntity;
00689                 if (pre)
00690                     prePos += cBufferPos+1;
00691             }
00692 
00693             Entity = NoEntity;
00694             EntityChar = QChar::null;
00695             return;
00696         };
00697     }
00698 }
00699 
00700 void HTMLTokenizer::parseTag(DOMStringIt &src)
00701 {
00702     assert(!Entity );
00703 
00704     while ( src.length() )
00705     {
00706         checkBuffer();
00707 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00708         uint l = 0;
00709         while(l < src.length() && (*(src.current()+l)).latin1() != '>')
00710             l++;
00711         qDebug("src is now: *%s*, tquote: %d",
00712                QConstString((QChar*)src.current(), l).string().latin1(), tquote);
00713 #endif
00714         switch(tag) {
00715         case NoTag:
00716         {
00717             return;
00718         }
00719         case TagName:
00720         {
00721 #if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
00722             qDebug("TagName");
00723 #endif
00724             if (searchCount > 0)
00725             {
00726                 if (*src == commentStart[searchCount])
00727                 {
00728                     searchCount++;
00729                     if (searchCount == 4)
00730                     {
00731 #ifdef TOKEN_DEBUG
00732                         kdDebug( 6036 ) << "Found comment" << endl;
00733 #endif
00734                         // Found '<!--' sequence
00735                         ++src;
00736                         dest = buffer; // ignore the previous part of this tag
00737                         tag = NoTag;
00738 
00739                         comment = true;
00740                         // push what we parsed so far upon the stack. helps for <!-->
00741                         checkScriptBuffer();
00742                         scriptCode[0] = scriptCode[1] = '-';
00743                         scriptCodeSize = 2;
00744                         parseComment(src);
00745                         return; // Finished parsing tag!
00746                     }
00747                     // cuts of high part, is okay
00748                     cBuffer[cBufferPos++] = src->cell();
00749                     ++src;
00750                     break;
00751                 }
00752                 else
00753                     searchCount = 0; // Stop looking for '<!--' sequence
00754             }
00755 
00756             bool finish = false;
00757             unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00758             while(ll--) {
00759                 ushort curchar = *src;
00760                 if(curchar <= ' ' || curchar == '>' ) {
00761                     finish = true;
00762                     break;
00763                 }
00764                 // this is a nasty performance trick. will work for the A-Z
00765                 // characters, but not for others. if it contains one,
00766                 // we fail anyway
00767                 char cc = curchar;
00768                 cBuffer[cBufferPos++] = cc | 0x20;
00769                 ++src;
00770             }
00771 
00772             // Disadvantage: we add the possible rest of the tag
00773             // as attribute names. ### judge if this causes problems
00774             if(finish || CBUFLEN == cBufferPos) {
00775                 bool beginTag;
00776                 char* ptr = cBuffer;
00777                 unsigned int len = cBufferPos;
00778                 cBuffer[cBufferPos] = '\0';
00779                 if ((cBufferPos > 0) && (*ptr == '/'))
00780                 {
00781                     // End Tag
00782                     beginTag = false;
00783                     ptr++;
00784                     len--;
00785                 }
00786                 else
00787                     // Start Tag
00788                     beginTag = true;
00789                 // Accept empty xml tags like <br/>
00790                 if(len > 1 && ptr[len-1] == '/' ) {
00791                     ptr[--len] = '\0';
00792                     // if its like <br/> and not like <input/ value=foo>, take it as flat
00793                     if (*src == '>')
00794                         currToken.flat = true;
00795                 }
00796 
00797                 uint tagID = khtml::getTagID(ptr, len);
00798                 if (!tagID) {
00799 #ifdef TOKEN_DEBUG
00800                     QCString tmp(ptr, len+1);
00801                     kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;
00802 #endif
00803                     dest = buffer;
00804                 }
00805                 else
00806                 {
00807 #ifdef TOKEN_DEBUG
00808                     QCString tmp(ptr, len+1);
00809                     kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
00810 #endif
00811                     currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
00812                     dest = buffer;
00813                 }
00814                 tag = SearchAttribute;
00815                 cBufferPos = 0;
00816             }
00817             break;
00818         }
00819         case SearchAttribute:
00820         {
00821 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00822                 qDebug("SearchAttribute");
00823 #endif
00824             bool atespace = false;
00825             ushort curchar;
00826             while(src.length()) {
00827                 curchar = *src;
00828                 if(curchar > ' ') {
00829                     if(curchar == '>')
00830                         tag = SearchEnd;
00831                     else if(atespace && (curchar == '\'' || curchar == '"'))
00832                     {
00833                         tag = SearchValue;
00834                         *dest++ = 0;
00835                         attrName = QString::null;
00836                     }
00837                     else
00838                         tag = AttributeName;
00839 
00840                     cBufferPos = 0;
00841                     break;
00842                 }
00843                 atespace = true;
00844                 ++src;
00845             }
00846             break;
00847         }
00848         case AttributeName:
00849         {
00850 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00851                 qDebug("AttributeName");
00852 #endif
00853             ushort curchar;
00854             int ll = kMin(src.length(), CBUFLEN-cBufferPos);
00855 
00856             while(ll--) {
00857                 curchar = *src;
00858                 if(curchar <= '>') {
00859                     if(curchar <= ' ' || curchar == '=' || curchar == '>') {
00860                         unsigned int a;
00861                         cBuffer[cBufferPos] = '\0';
00862                         a = khtml::getAttrID(cBuffer, cBufferPos);
00863                         if ( !a )
00864                             attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00865 
00866                         dest = buffer;
00867                         *dest++ = a;
00868 #ifdef TOKEN_DEBUG
00869                         if (!a || (cBufferPos && *cBuffer == '!'))
00870                             kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
00871                         else
00872                             kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
00873 #endif
00874                         // did we just get />
00875                         if (!a && cBufferPos == 1 && *cBuffer == '/' && curchar == '>')
00876                             currToken.flat = true;
00877 
00878                         tag = SearchEqual;
00879                         break;
00880                     }
00881                 }
00882                 cBuffer[cBufferPos++] = (char) curchar | 0x20;
00883                 ++src;
00884             }
00885             if ( cBufferPos == CBUFLEN ) {
00886                 cBuffer[cBufferPos] = '\0';
00887                 attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
00888                 dest = buffer;
00889                 *dest++ = 0;
00890                 tag = SearchEqual;
00891             }
00892             break;
00893         }
00894         case SearchEqual:
00895         {
00896 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00897                 qDebug("SearchEqual");
00898 #endif
00899             ushort curchar;
00900             bool atespace = false;
00901             while(src.length()) {
00902                 curchar = src->unicode();
00903                 if(curchar > ' ') {
00904                     if(curchar == '=') {
00905 #ifdef TOKEN_DEBUG
00906                         kdDebug(6036) << "found equal" << endl;
00907 #endif
00908                         tag = SearchValue;
00909                         ++src;
00910                     }
00911                     else if(atespace && (curchar == '\'' || curchar == '"'))
00912                     {
00913                         tag = SearchValue;
00914                         *dest++ = 0;
00915                         attrName = QString::null;
00916                     }
00917                     else {
00918                         DOMString v("");
00919                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00920                         dest = buffer;
00921                         tag = SearchAttribute;
00922                     }
00923                     break;
00924                 }
00925                 atespace = true;
00926                 ++src;
00927             }
00928             break;
00929         }
00930         case SearchValue:
00931         {
00932             ushort curchar;
00933             while(src.length()) {
00934                 curchar = src->unicode();
00935                 if(curchar > ' ') {
00936                     if(( curchar == '\'' || curchar == '\"' )) {
00937                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
00938                         tag = QuotedValue;
00939                         ++src;
00940                     } else
00941                         tag = Value;
00942 
00943                     break;
00944                 }
00945                 ++src;
00946             }
00947             break;
00948         }
00949         case QuotedValue:
00950         {
00951 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00952                 qDebug("QuotedValue");
00953 #endif
00954             ushort curchar;
00955             while(src.length()) {
00956                 checkBuffer();
00957 
00958                 curchar = src->unicode();
00959                 if(curchar <= '\'' && !src.escaped()) {
00960                     // ### attributes like '&{blaa....};' are supposed to be treated as jscript.
00961                     if ( curchar == '&' )
00962                     {
00963                         ++src;
00964                         parseEntity(src, dest, true);
00965                         break;
00966                     }
00967                     else if ( (tquote == SingleQuote && curchar == '\'') ||
00968                               (tquote == DoubleQuote && curchar == '\"') )
00969                     {
00970                         // some <input type=hidden> rely on trailing spaces. argh
00971                         while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
00972                             dest--; // remove trailing newlines
00973                         DOMString v(buffer+1, dest-buffer-1);
00974                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
00975 
00976                         dest = buffer;
00977                         tag = SearchAttribute;
00978                         tquote = NoQuote;
00979                         ++src;
00980                         break;
00981                     }
00982                 }
00983                 *dest++ = *src;
00984                 ++src;
00985             }
00986             break;
00987         }
00988         case Value:
00989         {
00990 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
00991             qDebug("Value");
00992 #endif
00993             ushort curchar;
00994             while(src.length()) {
00995                 checkBuffer();
00996                 curchar = src->unicode();
00997                 if(curchar <= '>' && !src.escaped()) {
00998                     // parse Entities
00999                     if ( curchar == '&' )
01000                     {
01001                         ++src;
01002                         parseEntity(src, dest, true);
01003                         break;
01004                     }
01005                     // no quotes. Every space means end of value
01006                     // '/' does not delimit in IE!
01007                     if ( curchar <= ' ' || curchar == '>' )
01008                     {
01009                         DOMString v(buffer+1, dest-buffer-1);
01010                         currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
01011                         dest = buffer;
01012                         tag = SearchAttribute;
01013                         break;
01014                     }
01015                 }
01016 
01017                 *dest++ = *src;
01018                 ++src;
01019             }
01020             break;
01021         }
01022         case SearchEnd:
01023         {
01024 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
01025                 qDebug("SearchEnd");
01026 #endif
01027             while(src.length()) {
01028                 if(*src == '>')
01029                     break;
01030 
01031                 if (*src == '/')
01032                     currToken.flat = true;
01033 
01034                 ++src;
01035             }
01036             if(!src.length() && *src != '>') break;
01037 
01038             searchCount = 0; // Stop looking for '<!--' sequence
01039             tag = NoTag;
01040             tquote = NoQuote;
01041             ++src;
01042 
01043             if ( !currToken.id ) //stop if tag is unknown
01044                 return;
01045 
01046             uint tagID = currToken.id;
01047 #if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
01048             kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
01049 #endif
01050             bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG);
01051 
01052             if(tagID >= ID_CLOSE_TAG)
01053                 tagID -= ID_CLOSE_TAG;
01054             else if ( beginTag && tagID == ID_SCRIPT ) {
01055                 AttributeImpl* a = 0;
01056                 scriptSrc = scriptSrcCharset = QString::null;
01057                 if ( currToken.attrs && /* potentially have a ATTR_SRC ? */
01058                      parser->doc()->view()->part()->jScriptEnabled() && /* jscript allowed at all? */
01059                      view /* are we a regular tokenizer or just for innerHTML ? */
01060                     ) {
01061                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
01062                         scriptSrc = parser->doc()->completeURL(khtml::parseURL( a->value() ).string() );
01063                     if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
01064                         scriptSrcCharset = a->value().string().stripWhiteSpace();
01065                     if ( scriptSrcCharset.isEmpty() )
01066                         scriptSrcCharset = parser->doc()->view()->part()->encoding();
01067                     if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE )))
01068                         a = currToken.attrs->getAttributeItem(ATTR_TYPE);
01069                 }
01070                 javascript = true;
01071                 if( a ) {
01072                     QString lang = a->value().string();
01073                     lang = lang.lower();
01074                     if( !lang.contains("javascript") &&
01075                         !lang.contains("ecmascript") &&
01076                         !lang.contains("livescript") &&
01077                         !lang.contains("jscript") )
01078                         javascript = false;
01079                 }
01080             }
01081 
01082             processToken();
01083 
01084             // lets see if we're still in parsing mood for spaces
01085             pre = parser->preMode();
01086 
01087             switch( tagID ) {
01088             case ID_PRE:
01089                 prePos = 0;
01090                 break;
01091             case ID_SCRIPT:
01092                 if (beginTag) {
01093                     searchStopper = scriptEnd;
01094                     searchStopperLen = 8;
01095                     script = true;
01096                     parseSpecial(src);
01097                 }
01098                 break;
01099             case ID_STYLE:
01100                 if (beginTag) {
01101                     searchStopper = styleEnd;
01102                     searchStopperLen = 7;
01103                     style = true;
01104                     parseSpecial(src);
01105                 }
01106                 break;
01107             case ID_TEXTAREA:
01108                 if(beginTag) {
01109                     searchStopper = textareaEnd;
01110                     searchStopperLen = 10;
01111                     textarea = true;
01112                     discard = AllDiscard;
01113                     parseSpecial(src);
01114                 }
01115                 break;
01116             case ID_TITLE:
01117                 if (beginTag) {
01118                     searchStopper = titleEnd;
01119                     searchStopperLen = 7;
01120                     title = true;
01121                     parseSpecial(src);
01122                 }
01123                 break;
01124             case ID_XMP:
01125                 if (beginTag) {
01126                     searchStopper = xmpEnd;
01127                     searchStopperLen = 5;
01128                     xmp = true;
01129                     parseSpecial(src);
01130                 }
01131                 break;
01132             case ID_SELECT:
01133                 select = beginTag;
01134                 break;
01135             case ID_PLAINTEXT:
01136                 plaintext = beginTag;
01137                 break;
01138             }
01139             return; // Finished parsing tag!
01140         }
01141         } // end switch
01142     }
01143     return;
01144 }
01145 
01146 void HTMLTokenizer::addPending()
01147 {
01148     if ( select && !(comment || script))
01149     {
01150         *dest++ = ' ';
01151     }
01152     else if ( textarea )
01153     {
01154         switch(pending) {
01155         case LFPending:  *dest++ = '\n'; prePos = 0; break;
01156         case SpacePending: *dest++ = ' '; ++prePos; break;
01157         case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break;
01158         case NonePending:
01159             assert(0);
01160         }
01161     }
01162     else if ( pre )
01163     {
01164         int p;
01165 
01166         switch (pending)
01167         {
01168         case SpacePending:
01169             // Insert a breaking space
01170             *dest++ = QChar(' ');
01171             prePos++;
01172             break;
01173 
01174         case LFPending:
01175             *dest = '\n';
01176             dest++;
01177             prePos = 0;
01178             break;
01179 
01180         case TabPending:
01181             p = TAB_SIZE - ( prePos % TAB_SIZE );
01182             for ( int x = 0; x < p; x++ )
01183                 *dest++ = QChar(' ');
01184             prePos += p;
01185             break;
01186 
01187         case NonePending:
01188             assert(0);
01189             break;
01190         }
01191     }
01192     else
01193     {
01194         *dest++ = ' ';
01195     }
01196 
01197     pending = NonePending;
01198 }
01199 
01200 void HTMLTokenizer::write( const QString &str, bool appendData )
01201 {
          // Tokenizes the chunk 'str'.
          //  - Does nothing once 'buffer' is null.
          //  - While a script is executing (and appendData is set), or while
          //    external scripts are still queued in cachedScript, the text is
          //    only appended to pendingSrc for later.
          //  - While onHold, the text is appended to the not yet consumed
          //    input and nothing is parsed.
          // Otherwise the input is consumed position by position; the state
          // flags decide which sub-parser handles the current position.
01202 #ifdef TOKEN_DEBUG
01203     kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
01204 #endif
01205 
01206     if ( !buffer )
01207         return;
01208 
01209     if ( ( m_executingScript && appendData ) ||
01210          ( !m_executingScript && cachedScript.count() ) ) {
01211         // don't parse; we will do this later
01212         pendingSrc += str;
01213         return;
01214     }
01215 
01216     if ( onHold ) {
              // keep the unconsumed rest of the current input and append the new data
01217         QString rest = QString( src.current(), src.length() );
01218         rest += str;
01219         setSrc(rest);
01220         return;
01221     }
01222     else
01223         setSrc(str);
01224 
01225 //     if (Entity)
01226 //         parseEntity(src, dest);
01227 
01228     while ( src.length() )
01229     {
01230         // do we need to enlarge the buffer?
01231         checkBuffer();
01232 
01233         ushort cc = src->unicode();
01234 
              // skipLF is set after a '\r' (see below); it only survives if the next char is '\n'
01235         if (skipLF && (cc != '\n'))
01236             skipLF = false;
01237 
01238         if (skipLF) {
                  // second half of a CR LF pair: drop it
01239             skipLF = false;
01240             ++src;
01241         }
01242         else if ( Entity )
01243             parseEntity( src, dest );
01244         else if ( plaintext )
01245             parseText( src );
01246         else if (script)
01247             parseSpecial(src);
01248         else if (style)
01249             parseSpecial(src);
01250         else if (xmp)
01251             parseSpecial(src);
01252         else if (textarea)
01253             parseSpecial(src);
01254         else if (title)
01255             parseSpecial(src);
01256         else if (comment)
01257             parseComment(src);
01258         else if (server)
01259             parseServer(src);
01260         else if (processingInstruction)
01261             parseProcessingInstruction(src);
01262         else if (tag)
01263             parseTag(src);
01264         else if ( startTag )
01265         {
                  // the previous character was '<'; cc is the character following it
01266             startTag = false;
01267 
01268             switch(cc) {
01269             case '/':
                      // src is not advanced here, so parseTag() below still sees the '/'
01270                 break;
01271             case '!':
01272             {
01273                 // <!-- comment -->
01274                 searchCount = 1; // Look for '<!--' sequence to start comment
01275 
01276                 break;
01277             }
01278             case '?':
01279             {
01280                 // xml processing instruction
01281                 processingInstruction = true;
01282                 tquote = NoQuote;
01283                 parseProcessingInstruction(src);
01284                 continue;
01285 
01286                 break; // not reached: the continue above already restarted the loop
01287             }
01288             case '%':
01289                 if (!brokenServer) {
01290                     // <% server stuff, handle as comment %>
01291                     server = true;
01292                     tquote = NoQuote;
01293                     parseServer(src);
01294                     continue;
01295                 }
01296                 // else fall through
01297             default:
01298             {
01299                 if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
01300                 {
01301                     // Start of a Start-Tag
01302                 }
01303                 else
01304                 {
01305                     // Invalid tag
01306                     // Add as is
01307                     if (pending)
01308                         addPending();
01309                     *dest = '<';
01310                     dest++;
                          // src was not advanced: cc is handled again on the next
                          // iteration, now with startTag cleared
01311                     continue;
01312                 }
01313             }
01314             }; // end case
01315 
01316             if ( pending ) {
01317                 // pre context always gets its spaces/linefeeds
01318                 if ( pre )
01319                     addPending();
01320                 // only add in existing inline context or if
01321                 // we just started one, i.e. we're about to insert real text
01322                 else if ( !parser->selectMode() &&
01323                           ( !parser->noSpaces() || dest > buffer )) {
01324                     addPending();
01325             discard = AllDiscard;
01326                 }
01327                 // just forget it
01328                 else
01329                     pending = NonePending;
01330             }
01331 
                  // emit the text collected so far before the tag itself is parsed
01332             processToken();
01333 
01334             cBufferPos = 0;
01335             tag = TagName;
01336             parseTag(src);
01337         }
01338         else if ( cc == '&' && !src.escaped())
01339         {
01340             ++src;
01341             if ( pending )
01342                 addPending();
01343             parseEntity(src, dest, true);
01344         }
01345         else if ( cc == '<' && !src.escaped())
01346         {
                  // remember where the tag starts; the character after '<' is
                  // classified in the startTag branch on the next iteration
01347             tagStartLineno = lineno+src.lineCount();
01348             ++src;
01349             startTag = true;
01350         }
01351         else if (( cc == '\n' ) || ( cc == '\r' ))
01352         {
01353             if ( pre || textarea)
01354             {
01355                 if (discard == LFDiscard || discard == AllDiscard)
01356                 {
01357                     // Ignore this LF
01358                     discard = NoneDiscard; // We have discarded 1 LF
01359                 }
01360                 else
01361                 {
01362                     // Process this LF
01363                     if (pending)
01364                         addPending();
01365                     pending = LFPending;
01366                 }
01367             }
01368             else
01369             {
01370                 if (discard == LFDiscard)
01371                 {
01372                     // Ignore this LF
01373                     discard = NoneDiscard; // We have discarded 1 LF
01374                 }
01375                 else if(discard == AllDiscard)
01376                 {
                          // keep discarding: the state is left at AllDiscard
01377                 }
01378                 else
01379                 {
01380                     // Process this LF
01381                     if (pending == NonePending)
01382                         pending = LFPending;
01383                 }
01384             }
01385             /* Check for MS-DOS CRLF sequence */
01386             if (cc == '\r')
01387             {
01388                 skipLF = true;
01389             }
01390             ++src;
01391         }
01392         else if (( cc == ' ' ) || ( cc == '\t' ))
01393         {
01394             if ( pre || textarea)
01395             {
01396                 if (discard == SpaceDiscard || discard == AllDiscard)
01397                 {
01398                     // Ignore this space/tab
01399                     discard = NoneDiscard; // We have discarded 1 space/tab
01400                 }
01401                 else {
01402                     if (pending)
01403                         addPending();
01404                     if (cc == ' ')
01405                         pending = SpacePending;
01406                     else
01407                         pending = TabPending;
01408                 }
01409             }
01410             else
01411             {
01412                 if(discard == SpaceDiscard)
01413                     discard = NoneDiscard;
01414                 else if(discard == AllDiscard)
01415                 { }
01416                 else
01417                     pending = SpacePending;
01418             }
01419             ++src;
01420         }
01421         else
01422         {
                  // ordinary character: flush pending whitespace, then copy it
01423             if (pending)
01424                 addPending();
01425 
01426             discard = NoneDiscard;
01427             if ( pre )
01428             {
01429                 prePos++;
01430             }
01431             *dest = *src;
01432             fixUpChar( *dest );
01433             ++dest;
01434             ++src;
01435         }
01436     }
01437     _src = QString::null;
01438 
01439     if (noMoreData && cachedScript.isEmpty() && !m_executingScript )
01440         end(); // this actually causes us to be deleted
01441 }
01442 
01443 void HTMLTokenizer::end()
01444 {
01445     if ( buffer == 0 ) {
01446         emit finishedParsing();
01447         return;
01448     }
01449 
01450     // parseTag is using the buffer for different matters
01451     if ( !tag )
01452         processToken();
01453 
01454     if(buffer)
01455         KHTML_DELETE_QCHAR_VEC(buffer);
01456 
01457     if(scriptCode)
01458         KHTML_DELETE_QCHAR_VEC(scriptCode);
01459 
01460     scriptCode = 0;
01461     scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01462     buffer = 0;
01463     emit finishedParsing();
01464 }
01465 
01466 void HTMLTokenizer::finish()
01467 {
01468     // do this as long as we don't find matching comment ends
01469     while((comment || server) && scriptCode && scriptCodeSize)
01470     {
01471         // we've found an unmatched comment start
01472         if (comment)
01473             brokenComments = true;
01474         else
01475             brokenServer = true;
01476         checkScriptBuffer();
01477         scriptCode[ scriptCodeSize ] = 0;
01478         scriptCode[ scriptCodeSize + 1 ] = 0;
01479         int pos;
01480         QString food;
01481         if (script || style) {
01482             food.setUnicode(scriptCode, scriptCodeSize);
01483         }
01484         else if (server) {
01485             food = "<";
01486             food += QString(scriptCode, scriptCodeSize);
01487         }
01488         else {
01489             pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
01490             food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); // deep copy
01491         }
01492         KHTML_DELETE_QCHAR_VEC(scriptCode);
01493         scriptCode = 0;
01494         scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
01495         comment = server = false;
01496         if ( !food.isEmpty() )
01497             write(food, true);
01498     }
01499     // this indicates we will not recieve any more data... but if we are waiting on
01500     // an external script to load, we can't finish parsing until that is done
01501     noMoreData = true;
01502     if (cachedScript.isEmpty() && !m_executingScript && !onHold)
01503         end(); // this actually causes us to be deleted
01504 }
01505 
01506 void HTMLTokenizer::processToken()
01507 {
01508     KJSProxy *jsProxy = view ? view->part()->jScript() : 0L;
01509     if (jsProxy)
01510         jsProxy->setEventHandlerLineno(tagStartLineno);
01511     if ( dest > buffer )
01512     {
01513 #ifdef TOKEN_DEBUG
01514         if(currToken.id) {
01515             qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
01516             assert(0);
01517         }
01518 
01519 #endif
01520         currToken.text = new DOMStringImpl( buffer, dest - buffer );
01521         currToken.text->ref();
01522         currToken.id = ID_TEXT;
01523     }
01524     else if(!currToken.id) {
01525         currToken.reset();
01526         if (jsProxy)
01527             jsProxy->setEventHandlerLineno(lineno+src.lineCount());
01528         return;
01529     }
01530 
01531     dest = buffer;
01532 
01533 #ifdef TOKEN_DEBUG
01534     QString name = getTagName(currToken.id).string();
01535     QString text;
01536     if(currToken.text)
01537         text = QConstString(currToken.text->s, currToken.text->l).string();
01538 
01539     kdDebug( 6036 ) << "Token --> " << name << "   id = " << currToken.id << endl;
01540     if (currToken.flat)
01541         kdDebug( 6036 ) << "Token is FLAT!" << endl;
01542     if(!text.isNull())
01543         kdDebug( 6036 ) << "text: \"" << text << "\"" << endl;
01544     unsigned long l = currToken.attrs ? currToken.attrs->length() : 0;
01545     if(l) {
01546         kdDebug( 6036 ) << "Attributes: " << l << endl;
01547         for (unsigned long i = 0; i < l; ++i) {
01548             AttributeImpl* c = currToken.attrs->attributeItem(i);
01549             kdDebug( 6036 ) << "    " << c->id() << " " << parser->doc()->getDocument()->attrName(c->id()).string()
01550                             << "=\"" << c->value().string() << "\"" << endl;
01551         }
01552     }
01553     kdDebug( 6036 ) << endl;
01554 #endif
01555     // pass the token over to the parser, the parser DOES NOT delete the token
01556     parser->parseToken(&currToken);
01557 
01558     if ( currToken.flat && currToken.id != ID_TEXT && !parser->noSpaces() )
01559     discard = NoneDiscard;
01560     else if ( parser->selectMode() )
01561         discard = AllDiscard;
01562 
01563     currToken.reset();
01564     if (jsProxy)
01565         jsProxy->setEventHandlerLineno(0);
01566 }
01567 
01568 
01569 HTMLTokenizer::~HTMLTokenizer()
01570 {
          // Run reset() first, then destroy the object held in 'parser'.
01571     reset();
01572     delete parser;
01573 }
01574 
01575 
01576 void HTMLTokenizer::enlargeBuffer(int len)
01577 {
01578     int newsize = kMax(size*2, size+len);
01579     int oldoffs = (dest - buffer);
01580 
01581     buffer = (QChar*)realloc(buffer, newsize*sizeof(QChar));
01582     dest = buffer + oldoffs;
01583     size = newsize;
01584 }
01585 
01586 void HTMLTokenizer::enlargeScriptBuffer(int len)
01587 {
01588     int newsize = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);
01589     scriptCode = (QChar*)realloc(scriptCode, newsize*sizeof(QChar));
01590     scriptCodeMaxSize = newsize;
01591 }
01592 
01593 void HTMLTokenizer::notifyFinished(CachedObject* /*finishedObj*/)
01594 {
          // Runs the queued external scripts in queue order for as long as the
          // script at the head of cachedScript is loaded. After each script,
          // unless we are inside parseScript() ('script' set), the input that
          // was parked in pendingSrc is tokenized via write().
          // The queue must not be empty on entry.
01595     assert(!cachedScript.isEmpty());
01596     bool done = false;
01597     while (!done && cachedScript.head()->isLoaded()) {
01598 #ifdef TOKEN_DEBUG
01599         kdDebug( 6036 ) << "Finished loading an external script" << endl;
01600 #endif
01601         CachedScript* cs = cachedScript.dequeue();
              // 'done' is a local on purpose: it is still valid when the members are not (see below)
01602         done = cachedScript.isEmpty();
01603         DOMString scriptSource = cs->script();
01604 #ifdef TOKEN_DEBUG
01605         kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
01606 #endif
01607         setSrc(QString::null);
01608 
01609         // make sure we forget about the script before we execute the new one
01610         // infinite recursion might happen otherwise
              // copy the URL first: cs is not touched again after deref()
01611         QString cachedScriptUrl( cs->url().string() );
01612         cs->deref(this);
01613 
01614     scriptExecution( scriptSource.string(), cachedScriptUrl );
01615 
01616         // 'script' is true when we are called synchronously from
01617         // parseScript(). In that case parseScript() will take care
01618         // of 'scriptOutput'.
01619         if ( !script ) {
                  // take pendingSrc over into a local and clear the member before parsing it
01620             QString rest = pendingSrc;
01621             pendingSrc = QString::null;
01622             write(rest, false);
01623             // we might be deleted at this point, do not
01624             // access any members.
                  // NOTE(review): when 'done' is false the loop condition reads
                  // cachedScript again - confirm that is safe after write().
01625         }
01626     }
01627 }
01628 
01629 void HTMLTokenizer::setSrc(const QString& source)
01630 {
          // Replaces the input being tokenized with 'source'.
          // The line count of the old input is added to 'lineno' before the
          // old iterator is replaced.
01631     lineno += src.lineCount();
          // Store a copy in the member _src; the iterator is built over that
          // member, not over the caller's argument.
01632     _src = source;
01633     src = DOMStringIt(_src);
01634 }
01635 
01636 void HTMLTokenizer::setOnHold(bool _onHold)
01637 {
01638     if (onHold == _onHold) return;
01639     onHold = _onHold;
01640     if (onHold)
01641         setSrc(QString(src.current(), src.length())); // ### deep copy
01642 }
01643 
KDE Logo
This file is part of the documentation for kdelibs Version 3.1.3.
Documentation copyright © 1996-2002 the KDE developers.
Generated on Thu Apr 7 04:48:27 2005 by doxygen 1.3.5 written by Dimitri van Heesch, © 1997-2001