Commit 6e4fd7ba, authored by Andras Timar

use u_isalpha() from ICU instead of home-grown solution

plus German comments were translated

Change-Id: Id9ff5d4835e4ea224c9e6232a1762822aa833d37
parent 4ab3d5bb
......@@ -18,13 +18,12 @@
*/
#include <unicode/uchar.h>
#include <svtools/syntaxhighlight.hxx>
#include <unotools/charclass.hxx>
#include <comphelper/string.hxx>
// ##########################################################################
// ATTENTION: all these words needs to be in small caps
// ATTENTION: all these words need to be in lower case
// ##########################################################################
static const char* strListBasicKeyWords[] = {
"access",
......@@ -232,111 +231,15 @@ extern "C" int CDECL compare_strings( const void *arg1, const void *arg2 )
namespace
{
// Decides whether a sal_Unicode character is a letter. Characters below 256
// are answered from a lookup table filled by the constructor; all others
// are passed to isLetterUnicode().
class LetterTable
{
// One flag per character value 0..255; true means "is a letter".
bool IsLetterTab[256];
public:
LetterTable( void );
// Returns the table entry for c < 256, otherwise isLetterUnicode( c ).
inline bool isLetter( sal_Unicode c )
{
bool bRet = (c < 256) ? IsLetterTab[c] : isLetterUnicode( c );
return bRet;
}
bool isLetterUnicode( sal_Unicode c );
};
// Returns whether c is an alphabetic character.
// NOTE(review): this listing is a diff shown without +/- markers, so the
// earlier implementation (ASCII test followed by a LetterTable lookup) and
// the `return u_isalpha(c);` line appear together. Read literally, the final
// return is unreachable. Presumably only the u_isalpha() call remains after
// the commit -- confirm against the repository.
static bool isAlpha(sal_Unicode c)
{
if (comphelper::string::isalphaAscii(c))
return true;
static LetterTable aLetterTable;
return aLetterTable.isLetter(c);
return u_isalpha(c);
}
}
// Builds the lookup table: every entry is first cleared, then the accented
// letters in the range 0xC0..0xFF are flagged. 0xD7 and 0xF7 (multiplication
// and division sign in Latin-1) are deliberately left unset, and no entry
// below 0xC0 is set here.
LetterTable::LetterTable( void )
{
for( int i = 0 ; i < 256 ; ++i )
IsLetterTab[i] = false;
IsLetterTab[0xC0] = true; // À, CAPITAL LETTER A WITH GRAVE ACCENT
IsLetterTab[0xC1] = true; // Á, CAPITAL LETTER A WITH ACUTE ACCENT
IsLetterTab[0xC2] = true; // Â, CAPITAL LETTER A WITH CIRCUMFLEX ACCENT
IsLetterTab[0xC3] = true; // Ã, CAPITAL LETTER A WITH TILDE
IsLetterTab[0xC4] = true; // Ä, CAPITAL LETTER A WITH DIAERESIS
IsLetterTab[0xC5] = true; // Å, CAPITAL LETTER A WITH RING ABOVE
IsLetterTab[0xC6] = true; // Æ, CAPITAL LIGATURE AE
IsLetterTab[0xC7] = true; // Ç, CAPITAL LETTER C WITH CEDILLA
IsLetterTab[0xC8] = true; // È, CAPITAL LETTER E WITH GRAVE ACCENT
IsLetterTab[0xC9] = true; // É, CAPITAL LETTER E WITH ACUTE ACCENT
IsLetterTab[0xCA] = true; // Ê, CAPITAL LETTER E WITH CIRCUMFLEX ACCENT
IsLetterTab[0xCB] = true; // Ë, CAPITAL LETTER E WITH DIAERESIS
IsLetterTab[0xCC] = true; // Ì, CAPITAL LETTER I WITH GRAVE ACCENT
IsLetterTab[0xCD] = true; // Í, CAPITAL LETTER I WITH ACUTE ACCENT
IsLetterTab[0xCE] = true; // Î, CAPITAL LETTER I WITH CIRCUMFLEX ACCENT
IsLetterTab[0xCF] = true; // Ï, CAPITAL LETTER I WITH DIAERESIS
IsLetterTab[0xD0] = true; // Ð, CAPITAL LETTER ETH
IsLetterTab[0xD1] = true; // Ñ, CAPITAL LETTER N WITH TILDE
IsLetterTab[0xD2] = true; // Ò, CAPITAL LETTER O WITH GRAVE ACCENT
IsLetterTab[0xD3] = true; // Ó, CAPITAL LETTER O WITH ACUTE ACCENT
IsLetterTab[0xD4] = true; // Ô, CAPITAL LETTER O WITH CIRCUMFLEX ACCENT
IsLetterTab[0xD5] = true; // Õ, CAPITAL LETTER O WITH TILDE
IsLetterTab[0xD6] = true; // Ö, CAPITAL LETTER O WITH DIAERESIS
IsLetterTab[0xD8] = true; // Ø, CAPITAL LETTER O WITH STROKE
IsLetterTab[0xD9] = true; // Ù, CAPITAL LETTER U WITH GRAVE ACCENT
IsLetterTab[0xDA] = true; // Ú, CAPITAL LETTER U WITH ACUTE ACCENT
IsLetterTab[0xDB] = true; // Û, CAPITAL LETTER U WITH CIRCUMFLEX ACCENT
IsLetterTab[0xDC] = true; // Ü, CAPITAL LETTER U WITH DIAERESIS
IsLetterTab[0xDD] = true; // Ý, CAPITAL LETTER Y WITH ACUTE ACCENT
IsLetterTab[0xDE] = true; // Þ, CAPITAL LETTER THORN
IsLetterTab[0xDF] = true; // ß, SMALL LETTER SHARP S
IsLetterTab[0xE0] = true; // à, SMALL LETTER A WITH GRAVE ACCENT
IsLetterTab[0xE1] = true; // á, SMALL LETTER A WITH ACUTE ACCENT
IsLetterTab[0xE2] = true; // â, SMALL LETTER A WITH CIRCUMFLEX ACCENT
IsLetterTab[0xE3] = true; // ã, SMALL LETTER A WITH TILDE
IsLetterTab[0xE4] = true; // ä, SMALL LETTER A WITH DIAERESIS
IsLetterTab[0xE5] = true; // å, SMALL LETTER A WITH RING ABOVE
IsLetterTab[0xE6] = true; // æ, SMALL LIGATURE AE
IsLetterTab[0xE7] = true; // ç, SMALL LETTER C WITH CEDILLA
IsLetterTab[0xE8] = true; // è, SMALL LETTER E WITH GRAVE ACCENT
IsLetterTab[0xE9] = true; // é, SMALL LETTER E WITH ACUTE ACCENT
IsLetterTab[0xEA] = true; // ê, SMALL LETTER E WITH CIRCUMFLEX ACCENT
IsLetterTab[0xEB] = true; // ë, SMALL LETTER E WITH DIAERESIS
IsLetterTab[0xEC] = true; // ì, SMALL LETTER I WITH GRAVE ACCENT
IsLetterTab[0xED] = true; // í, SMALL LETTER I WITH ACUTE ACCENT
IsLetterTab[0xEE] = true; // î, SMALL LETTER I WITH CIRCUMFLEX ACCENT
IsLetterTab[0xEF] = true; // ï, SMALL LETTER I WITH DIAERESIS
IsLetterTab[0xF0] = true; // ð, SMALL LETTER ETH
IsLetterTab[0xF1] = true; // ñ, SMALL LETTER N WITH TILDE
IsLetterTab[0xF2] = true; // ò, SMALL LETTER O WITH GRAVE ACCENT
IsLetterTab[0xF3] = true; // ó, SMALL LETTER O WITH ACUTE ACCENT
IsLetterTab[0xF4] = true; // ô, SMALL LETTER O WITH CIRCUMFLEX ACCENT
IsLetterTab[0xF5] = true; // õ, SMALL LETTER O WITH TILDE
IsLetterTab[0xF6] = true; // ö, SMALL LETTER O WITH DIAERESIS
IsLetterTab[0xF8] = true; // ø, SMALL LETTER O WITH OBLIQUE BAR
IsLetterTab[0xF9] = true; // ù, SMALL LETTER U WITH GRAVE ACCENT
IsLetterTab[0xFA] = true; // ú, SMALL LETTER U WITH ACUTE ACCENT
IsLetterTab[0xFB] = true; // û, SMALL LETTER U WITH CIRCUMFLEX ACCENT
IsLetterTab[0xFC] = true; // ü, SMALL LETTER U WITH DIAERESIS
IsLetterTab[0xFD] = true; // ý, SMALL LETTER Y WITH ACUTE ACCENT
IsLetterTab[0xFE] = true; // þ, SMALL LETTER THORN
IsLetterTab[0xFF] = true; // ÿ, SMALL LETTER Y WITH DIAERESIS
}
// Letter test that isLetter() falls back to for c >= 256: asks a CharClass
// constructed with the application's language tag whether c is a letter.
bool LetterTable::isLetterUnicode( sal_Unicode c )
{
// Created on the first call and reused afterwards; no matching delete is
// visible in this function.
static CharClass* pCharClass = NULL;
if( pCharClass == NULL )
pCharClass = new CharClass( Application::GetSettings().GetLanguageTag() );
// CharClass::isLetter is called with a string and a position, so c is
// wrapped in an OUString and tested at index 0.
rtl::OUString aStr( c );
bool bRet = pCharClass->isLetter( aStr, 0 );
return bRet;
}
// Hilfsfunktion: Zeichen-Flag Testen
// Helper function: test character flag
sal_Bool SimpleTokenizer_Impl::testCharFlags( sal_Unicode c, sal_uInt16 nTestFlags )
{
bool bRet = false;
......@@ -358,24 +261,20 @@ void SimpleTokenizer_Impl::setKeyWords( const char** ppKeyWords, sal_uInt16 nCou
nKeyWordCount = nCount;
}
// Neues Token holen
sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
/*out*/const sal_Unicode*& rpStartPos, /*out*/const sal_Unicode*& rpEndPos )
{
reType = TT_UNKNOWN;
// Position merken
rpStartPos = mpActualPos;
// Zeichen untersuchen
sal_Unicode c = peekChar();
if( c == CHAR_EOF )
return sal_False;
// Zeichen lesen
getChar();
//*** Alle Moeglichkeiten durchgehen ***
//*** Go through all possibilities ***
// Space?
if ( (testCharFlags( c, CHAR_SPACE ) == sal_True) )
{
......@@ -401,7 +300,7 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
reType = TT_IDENTIFIER;
// Schluesselwort-Tabelle
// Keyword table
if (ppListKeyWords != NULL)
{
int nCount = mpActualPos - rpStartPos;
......@@ -429,7 +328,7 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
if (aByteStr.equalsL(RTL_CONSTASCII_STRINGPARAM("rem")))
{
// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
// Remove all characters until end of line or EOF
sal_Unicode cPeek = peekChar();
while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
{
......@@ -456,7 +355,7 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
sal_Bool bIdentifierChar;
do
{
// Naechstes Zeichen holen
// Get next character
c = peekChar();
bIdentifierChar = isAlpha(c);
if( bIdentifierChar )
......@@ -471,7 +370,7 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
sal_Unicode cPeekNext = peekChar();
if (cPeekNext=='-')
{
// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
// Remove all characters until end of line or EOF
while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
{
getChar();
......@@ -485,7 +384,7 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
sal_Unicode cPeekNext = peekChar();
if (cPeekNext=='/')
{
// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
// Remove all characters until end of line or EOF
while( cPeekNext != CHAR_EOF && testCharFlags( cPeekNext, CHAR_EOL ) == sal_False )
{
getChar();
......@@ -496,12 +395,12 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
}
else
{
// Kommentar ?
// Comment?
if ( c == '\'' )
{
c = getChar(); // '/' entfernen
c = getChar();
// Alle Zeichen bis Zeilen-Ende oder EOF entfernen
// Remove all characters until end of line or EOF
sal_Unicode cPeek = c;
while( cPeek != CHAR_EOF && testCharFlags( cPeek, CHAR_EOL ) == sal_False )
{
......@@ -529,36 +428,36 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
reType = TT_OPERATOR;
}
// Zahl?
// Number?
else if( testCharFlags( c, CHAR_START_NUMBER ) == sal_True )
{
reType = TT_NUMBER;
// Zahlensystem, 10 = normal, wird bei Oct/Hex geaendert
// Number system, 10 = normal, it is changed for Oct/Hex
int nRadix = 10;
// Ist es eine Hex- oder Oct-Zahl?
// Is it an Oct or a Hex number?
if( c == '&' )
{
// Octal?
if( peekChar() == 'o' || peekChar() == 'O' )
{
// o entfernen
// remove o
getChar();
nRadix = 8; // Octal-Basis
nRadix = 8; // Octal base
// Alle Ziffern einlesen
// Read all numbers
while( testCharFlags( peekChar(), CHAR_IN_OCT_NUMBER ) )
c = getChar();
}
// Hex?
// Hexadecimal?
else if( peekChar() == 'h' || peekChar() == 'H' )
{
// x entfernen
// remove h
getChar();
nRadix = 16; // Hex-Basis
nRadix = 16; // Hexadecimal base
// Alle Ziffern einlesen und puffern
// Read all numbers
while( testCharFlags( peekChar(), CHAR_IN_HEX_NUMBER ) )
c = getChar();
}
......@@ -568,38 +467,36 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
}
}
// Wenn nicht Oct oder Hex als double ansehen
// When it is not Oct or Hex, then it is double
if( reType == TT_NUMBER && nRadix == 10 )
{
// Flag, ob das letzte Zeichen ein Exponent war
// Flag if the last character is an exponent
sal_Bool bAfterExpChar = sal_False;
// Alle Ziffern einlesen
// Read all numbers
while( testCharFlags( peekChar(), CHAR_IN_NUMBER ) ||
(bAfterExpChar && peekChar() == '+' ) ||
(bAfterExpChar && peekChar() == '-' ) )
// Nach Exponent auch +/- OK
// After exponent +/- are OK, too
{
c = getChar(); // Zeichen lesen
c = getChar();
bAfterExpChar = ( c == 'e' || c == 'E' );
}
}
// reType = TT_NUMBER;
}
// String?
else if( testCharFlags( c, CHAR_START_STRING ) == sal_True )
{
// Merken, welches Zeichen den String eroeffnet hat
// Remember which character has opened the string
sal_Unicode cEndString = c;
if( c == '[' )
cEndString = ']';
// Alle Ziffern einlesen und puffern
// Read all characters
while( peekChar() != cEndString )
{
// #58846 EOF vor getChar() abfangen, damit EOF micht verloren geht
// Detect EOF before getChar(), so we do not lose EOF
if( peekChar() == CHAR_EOF )
{
// ERROR: unterminated string literal
......@@ -615,7 +512,6 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
}
}
// Zeichen lesen
if( reType != TT_ERROR )
{
getChar();
......@@ -626,25 +522,24 @@ sal_Bool SimpleTokenizer_Impl::getNextToken( /*out*/TokenTypes& reType,
}
}
// Zeilenende?
// End of line?
else if( testCharFlags( c, CHAR_EOL ) == sal_True )
{
// Falls ein weiteres anderes EOL-Char folgt, weg damit
// If another EOL character comes, read it
sal_Unicode cNext = peekChar();
if( cNext != c && testCharFlags( cNext, CHAR_EOL ) == sal_True )
getChar();
// Positions-Daten auf Zeilen-Beginn setzen
// Set position data at the line start
nCol = 0;
nLine++;
reType = TT_EOL;
}
// Alles andere bleibt TT_UNKNOWN
// All other will remain TT_UNKNOWN
// End-Position eintragen
// Save end position
rpEndPos = mpActualPos;
return sal_True;
}
......@@ -653,49 +548,47 @@ SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLangua
{
memset( aCharTypeTab, 0, sizeof( aCharTypeTab ) );
// Zeichen-Tabelle fuellen
// Fill character table
sal_uInt16 i;
// Zulaessige Zeichen fuer Identifier
// Allowed characters for identifiers
sal_uInt16 nHelpMask = (sal_uInt16)( CHAR_START_IDENTIFIER | CHAR_IN_IDENTIFIER );
for( i = 'a' ; i <= 'z' ; i++ )
aCharTypeTab[i] |= nHelpMask;
for( i = 'A' ; i <= 'Z' ; i++ )
aCharTypeTab[i] |= nHelpMask;
// '_' extra eintragen
aCharTypeTab[(int)'_'] |= nHelpMask;
// AB 23.6.97: '$' ist auch erlaubt
aCharTypeTab[(int)'$'] |= nHelpMask;
// Ziffern (Identifier und Number ist moeglich)
// Digit (can be identifier and number)
nHelpMask = (sal_uInt16)( CHAR_IN_IDENTIFIER | CHAR_START_NUMBER |
CHAR_IN_NUMBER | CHAR_IN_HEX_NUMBER );
for( i = '0' ; i <= '9' ; i++ )
aCharTypeTab[i] |= nHelpMask;
// e und E sowie . von Hand ergaenzen
// Add e, E, . and & here manually
aCharTypeTab[(int)'e'] |= CHAR_IN_NUMBER;
aCharTypeTab[(int)'E'] |= CHAR_IN_NUMBER;
aCharTypeTab[(int)'.'] |= (sal_uInt16)( CHAR_IN_NUMBER | CHAR_START_NUMBER );
aCharTypeTab[(int)'&'] |= CHAR_START_NUMBER;
// Hex-Ziffern
// Hexadecimal digit
for( i = 'a' ; i <= 'f' ; i++ )
aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
for( i = 'A' ; i <= 'F' ; i++ )
aCharTypeTab[i] |= CHAR_IN_HEX_NUMBER;
// Oct-Ziffern
// Octal digit
for( i = '0' ; i <= '7' ; i++ )
aCharTypeTab[i] |= CHAR_IN_OCT_NUMBER;
// String-Beginn/End-Zeichen
// String literal start/end characters
aCharTypeTab[(int)'\''] |= CHAR_START_STRING;
aCharTypeTab[(int)'\"'] |= CHAR_START_STRING;
aCharTypeTab[(int)'['] |= CHAR_START_STRING;
aCharTypeTab[(int)'`'] |= CHAR_START_STRING;
// Operator-Zeichen
// Operator characters
aCharTypeTab[(int)'!'] |= CHAR_OPERATOR;
aCharTypeTab[(int)'%'] |= CHAR_OPERATOR;
// aCharTypeTab[(int)'&'] |= CHAR_OPERATOR; Removed because of #i14140
......@@ -724,7 +617,7 @@ SimpleTokenizer_Impl::SimpleTokenizer_Impl( HighlighterLanguage aLang ): aLangua
aCharTypeTab[(int)' ' ] |= CHAR_SPACE;
aCharTypeTab[(int)'\t'] |= CHAR_SPACE;
// Zeilen-Ende-Zeichen
// End of line characters
aCharTypeTab[(int)'\r'] |= CHAR_EOL;
aCharTypeTab[(int)'\n'] |= CHAR_EOL;
......@@ -743,22 +636,21 @@ SimpleTokenizer_Impl* getSimpleTokenizer( void )
return pSimpleTokenizer;
}
// Heraussuchen der jeweils naechsten Funktion aus einem JavaScript-Modul
sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String* aSource )
{
// Position auf den Anfang des Source-Strings setzen
// Set the position to the beginning of the source string
mpStringBegin = mpActualPos = aSource->GetBuffer();
// Zeile und Spalte initialisieren
// Initialize row and column
nLine = nParseLine;
nCol = 0L;
// Variablen fuer die Out-Parameter
// Variables for the out parameter
TokenTypes eType;
const sal_Unicode* pStartPos;
const sal_Unicode* pEndPos;
// Schleife ueber alle Tokens
// Loop over all the tokens
sal_uInt16 nTokenCount = 0;
while( getNextToken( eType, pStartPos, pEndPos ) )
nTokenCount++;
......@@ -769,19 +661,19 @@ sal_uInt16 SimpleTokenizer_Impl::parseLine( sal_uInt32 nParseLine, const String*
void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const String& rLine,
/*out*/HighlightPortions& portions )
{
// Position auf den Anfang des Source-Strings setzen
// Set the position to the beginning of the source string
mpStringBegin = mpActualPos = rLine.GetBuffer();
// Zeile und Spalte initialisieren
// Initialize row and column
nLine = nParseLine;
nCol = 0L;
// Variablen fuer die Out-Parameter
// Variables for the out parameter
TokenTypes eType;
const sal_Unicode* pStartPos;
const sal_Unicode* pEndPos;
// Schleife ueber alle Tokens
// Loop over all the tokens
while( getNextToken( eType, pStartPos, pEndPos ) )
{
HighlightPortion portion;
......@@ -795,9 +687,6 @@ void SimpleTokenizer_Impl::getHighlightPortions( sal_uInt32 nParseLine, const St
}
//////////////////////////////////////////////////////////////////////////
// Implementierung des SyntaxHighlighter
SyntaxHighlighter::SyntaxHighlighter()
{
m_pSimpleTokenizer = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment