Kaydet (Commit) bebbe972 authored tarafından Caolán McNamara's avatar Caolán McNamara

wtratree unbuilt

üst f2306b94
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* Copyright 2000, 2010 Oracle and/or its affiliates.
*
* OpenOffice.org - a multi-platform office productivity suite
*
* This file is part of OpenOffice.org.
*
* OpenOffice.org is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3
* only, as published by the Free Software Foundation.
*
* OpenOffice.org is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License version 3 for more details
* (a copy is included in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU Lesser General Public License
* version 3 along with OpenOffice.org. If not, see
* <http://www.openoffice.org/license.html>
* for a copy of the LGPLv3 License.
*
************************************************************************/
#ifndef TX3_WTRATREE_HXX
#define TX3_WTRATREE_HXX
// USED
// Base Classes
// Components
// Parameters
#include <tools/string.hxx>
/// NOTE(review): not referenced anywhere in the code visible here; presumably a
/// count of result kinds - confirm before relying on it.
const INT16 C_NR_OF_WTT_RESULTS = 5;
/// Number of distinct values an 8-bit character can take. Used as the size of
/// the character-to-branch table WordTransTree::cChar2Branch.
const INT16 C_NR_OF_POSSIBLE_CHARS = 256;
/// Unsigned character type; the input and output arrays are handled as u_char
/// so that a character can be used directly as a table index.
typedef unsigned char u_char;
/// Shorthand for a pointer to constant plain chars (C string).
typedef const char * constr;
/// Node type of the parsing tree; only used through pointers in this header.
class WTT_Node;
/** @task
    This class implements the functionality that class WordTransformer
    offers.
    WordTransformer depends on this class, but NOT the other way round!

    The word pairs to be exchanged are stored in a parsing tree of
    WTT_Node objects (see AddWordPair()). A text is processed by calling
    InitTransformation() once and then TransformNextToken() until
    TextEndReached() returns true; afterwards Output() yields the result.

    The object owns its output array and its parsing tree through plain
    pointers and releases both in the destructor. Copying an object would
    therefore release them twice, so copying is disabled.
**/
class WordTransTree
{
  public:
    /// Outcome of the most recent call to TransformNextToken().
    enum E_Result
    {
        OK = 0,             ///< Token was handled without problems.
        HOTKEY_LOST,        ///< The hotkey letter of the token does not occur in the replacing string.
        OUTPUT_OVERFLOW     ///< The output array is too small for the result.
    };

    // LIFECYCLE
                        WordTransTree(
                            CharSet             i_nWorkingCharSet = RTL_TEXTENCODING_MS_1252);
    /// Rebuilds the character classification table for the given character set.
    void                SetCharSet(
                            CharSet             i_nWorkingCharSet);
                        ~WordTransTree();

    /// Registers a word that is to be exchanged by i_sReplaceString.
    /// An empty i_sOldString is ignored.
    void                AddWordPair(
                            const ByteString &  i_sOldString,
                            const ByteString &  i_sReplaceString );

    // OPERATIONS
    /// Sets the text to work on and makes sure the output array has at
    /// least i_nOutputMaxLength bytes.
    void                InitTransformation(
                            const char *        i_sInput,           /// [!=0], a range of i_nInputLength must be valid memory for read.
                            UINT32              i_nInputLength,
                            UINT32              i_nOutputMaxLength = STRING_MAXLEN - 12 );
    /// Processes one token of the input and returns the same value as
    /// CurResult() does afterwards.
    E_Result            TransformNextToken();

    // INQUIRY
    /// True, when the read position is at the end of the input.
    sal_Bool            TextEndReached() const;
    /// The output text, once the text end is reached; an empty string before.
    const char *        Output() const;

    // These functions are valid between two calls of
    //   TransformNextToken():
    E_Result            CurResult() const;
    ByteString          CurReplacedString() const;
    ByteString          CurReplacingString() const;
    char                CurHotkey() const;

  private:
    // Copying is disabled (declared, but intentionally never defined):
    // both objects would delete the same sOutput and dpParsingTreeTop.
                        WordTransTree(
                            const WordTransTree & );
    WordTransTree &     operator=(
                            const WordTransTree & );

    // SERVICE FUNCTIONS
    /// Looks up the parsing tree branch that belongs to a character.
    UINT8               CalculateBranch(
                            u_char              i_cInputChar ) const;

    void                Handle_Hotkey();
    void                Handle_TokenToKeep();
    void                Handle_TokenToTransform();

    // DATA
    //   Fixed data
    const u_char *      sInput;                 // Not owned; set by InitTransformation().
    UINT32              nInputLength;
    const u_char *      pInputEnd;              // One behind the last input char.

    u_char *            sOutput;                // DYN (owned, deleted in the destructor)
    UINT32              nOutputMaxLength;       // Allocated size of sOutput.

    WTT_Node *          dpParsingTreeTop;       // DYN (owned, deleted in the destructor)
    WTT_Node *          pUnknownAlpha;          // Deleted as part of the parsing tree.
    u_char              cChar2Branch[C_NR_OF_POSSIBLE_CHARS];   // Maps each char to its branch.
    u_char              c_AE, c_OE, c_UE, c_ae, c_oe, c_ue;     // Umlauts in the working charset.

    //   Working data
    const u_char *      pInputCurTokenStart;
    const u_char *      pInputPosition;
    u_char *            pOutputPosition;
    WTT_Node *          pCurParseNode;

    //   Data which are valid only after a completed call to TransformNextToken()
    E_Result            eCurResult;
    u_char              cCurHotkey;             // Letter which is used as hotkey
    u_char              cCurHotkeySign;         // Letter which is used to assign hotkey ('~' or '&').
};
/// Tells whether the read position has arrived at the end of the input.
inline sal_Bool
WordTransTree::TextEndReached() const
{
    return pInputEnd == pInputPosition;
}
inline const char *
WordTransTree::Output() const
{ return TextEndReached() ? (constr) sOutput : ""; }
/// Result code stored for the token handled last.
inline WordTransTree::E_Result
WordTransTree::CurResult() const
{
    return this->eCurResult;
}
/// Copy of the input range from the start of the current token up to
/// (not including) the current read position.
inline ByteString
WordTransTree::CurReplacedString() const
{
    const u_char * const pTokenBegin = pInputCurTokenStart;
    return ByteString( (constr) pTokenBegin, pInputPosition - pTokenBegin );
}
inline char
WordTransTree::CurHotkey() const
{ return cCurHotkey; }
/// Table lookup: the parsing tree branch assigned to i_cInputChar.
inline UINT8
WordTransTree::CalculateBranch(u_char i_cInputChar) const
{
    const u_char * const pBranchTable = cChar2Branch;
    return pBranchTable[i_cInputChar];
}
#endif
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* Copyright 2000, 2010 Oracle and/or its affiliates.
*
* OpenOffice.org - a multi-platform office productivity suite
*
* This file is part of OpenOffice.org.
*
* OpenOffice.org is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License version 3
* only, as published by the Free Software Foundation.
*
* OpenOffice.org is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License version 3 for more details
* (a copy is included in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU Lesser General Public License
* version 3 along with OpenOffice.org. If not, see
* <http://www.openoffice.org/license.html>
* for a copy of the LGPLv3 License.
*
************************************************************************/
// MARKER(update_precomp.py): autogen include statement, do not remove
#include "precompiled_l10ntools.hxx"
#include "wtratree.hxx"
/** @ATTENTION
For reasons of speed, class WordTransTree works with two plain
char arrays, sOutput and sInput, instead of safe containers or
streams. So be extremely careful when changing this code!
**/
// NOT FULLY DECLARED SERVICES
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "wtranode.hxx"
// Branch numbers of the parsing tree. WordTransTree::SetCharSet() assigns one
// of them to every possible character.
// NOTE(review): only used in the constructor, where the backslash node gets a
// null branch for it; exact meaning to be confirmed against wtranode.hxx.
const BRANCH_T BR_END = 0;
// Default branch of every character that is neither a letter nor a hotkey sign.
const BRANCH_T BR_NONALPHA = 1;
// Branch of the hotkey signs '~' and '&'.
const BRANCH_T BR_HOTKEY = 2;
// NOTE(review): the constructor builds a node for this branch, but the visible
// SetCharSet() maps no character to it - check whether '\\' should be.
const BRANCH_T BR_BACKSLASH = 3;
// 'a'/'A' map to this branch, the other letters up to 'z'/'Z' to the following ones.
const BRANCH_T BR_ALPHABASE = 4; /// @ATTENTION All branches not valid for words must be smaller than this value!
// Branches of the German umlauts (upper and lower case share one branch) and of sharp s.
const BRANCH_T BR_AE = 30;
const BRANCH_T BR_OE = 31;
const BRANCH_T BR_UE = 32;
const BRANCH_T BR_SZ = 33;
// Number of branches; the constructor checks it against C_NR_OF_BRANCHES.
const BRANCH_T BR_MAX = 34; /// @ATTENTION Must be updated always!
// Value given to the top node of the parsing tree.
const BRANCH_T BR_START = 0;
/** Creates the fixed part of the parsing tree (top node, the node for
    unknown alphabetic words, the non-alpha node and the backslash node)
    and sets up the character table for i_nWorkingCharSet.
    Terminates the program if the branch constants of this file do not
    agree with C_BR_ALPHABASE / C_NR_OF_BRANCHES.
**/
WordTransTree::WordTransTree(CharSet i_nWorkingCharSet)
    :   sInput(0),
        nInputLength(0),
        pInputEnd(0),
        sOutput(0),
        nOutputMaxLength(0),
        dpParsingTreeTop(0),
        pUnknownAlpha(0),
        // cChar2Branch is filled by SetCharSet() below
        c_AE(u_char('\xC4')), c_OE(u_char('\xD6')), c_UE(u_char('\xDC')),
        c_ae(u_char('\xE4')), c_oe(u_char('\xF6')), c_ue(u_char('\xFC')),
        pInputCurTokenStart(0),
        pInputPosition(0),
        pOutputPosition(0),
        pCurParseNode(0),
        eCurResult(OK),
        cCurHotkey(0),
        cCurHotkeySign(u_char('~'))
{
    // Node for alphabetic words that are not in the tree. Every letter
    // branch leads back to the node itself. It is deleted as part of the
    // parsing tree.
    pUnknownAlpha = new WTT_Node(BR_ALPHABASE,0,0);
    UINT8 nLetterBranch = BR_ALPHABASE;
    while ( nLetterBranch < C_NR_OF_BRANCHES )
    {
        pUnknownAlpha->SetBranch(nLetterBranch,pUnknownAlpha);
        nLetterBranch++;
    }

    // Top node of the tree:
    dpParsingTreeTop = new WTT_Node(BR_START,0,pUnknownAlpha);

    // Node for runs of non-alphabetic characters, reachable from the top
    // node and from itself:
    WTT_Node * dpNonAlphaNode = new WTT_Node(BR_NONALPHA,0,0);
    dpNonAlphaNode->SetBranch(BR_NONALPHA,dpNonAlphaNode);
    dpParsingTreeTop->SetBranch(BR_NONALPHA,dpNonAlphaNode);

    // Node for a backslash, reachable from the top node and from the
    // non-alpha node:
    WTT_Node * dpBackslashNode = new WTT_Node(BR_BACKSLASH,dpNonAlphaNode,dpNonAlphaNode);
    dpBackslashNode->SetBranch(BR_END,0);
    dpParsingTreeTop->SetBranch(BR_BACKSLASH,dpBackslashNode);
    dpNonAlphaNode->SetBranch(BR_BACKSLASH,dpBackslashNode);

    // Character table:
    SetCharSet(i_nWorkingCharSet);

    // The branch constants of this file have to match those the node
    // class was compiled with.
    const bool bConstantsAgree = C_BR_ALPHABASE == BR_ALPHABASE
                                 && C_NR_OF_BRANCHES == BR_MAX;
    if ( !bConstantsAgree )
    {
        fprintf(stderr, "Assertion failed: file %s line %d.", __FILE__, __LINE__);
        exit(1);
    }
}
/** Fills the character-to-branch table for the given character set.
    Everything is a non-alpha character first; then the letters a-z / A-Z,
    the umlauts and sharp s (converted from MS-1252 into the working
    character set) and the hotkey signs '~' and '&' get their branches.
    The converted umlauts are also remembered in c_AE ... c_ue.
**/
void
WordTransTree::SetCharSet(CharSet i_nWorkingCharSet)
{
    // AE OE UE ae oe ue sz, given in MS-1252 and converted in place:
    ByteString sConvert("\xC4\xD6\xDC\xE4\xF6\xFC\xDF");
    const ByteString & rConverted = sConvert.Convert(RTL_TEXTENCODING_MS_1252, i_nWorkingCharSet);
    const u_char * pConvert = (const u_char * ) ( rConverted.GetBuffer() );

    // Default for all characters:
    for ( u_char * pEntry = cChar2Branch;
          pEntry != cChar2Branch + C_NR_OF_POSSIBLE_CHARS;
          ++pEntry )
    {
        *pEntry = BR_NONALPHA;
    }

    // Letters, lower and upper case share their branch:
    for ( INT16 nLetter = 0; nLetter <= 'z' - 'a'; ++nLetter )
    {
        cChar2Branch['a' + nLetter] = BR_ALPHABASE + nLetter;
        cChar2Branch['A' + nLetter] = BR_ALPHABASE + nLetter;
    }

    // Umlauts and sharp s, in the order of sConvert. The order is kept on
    // purpose: should two converted characters coincide, the later one wins.
    static const BRANCH_T aSpecialBranches[7]
        = { BR_AE, BR_OE, BR_UE, BR_AE, BR_OE, BR_UE, BR_SZ };
    for ( INT16 nSpecial = 0; nSpecial < 7; ++nSpecial )
    {
        cChar2Branch[pConvert[nSpecial]] = aSpecialBranches[nSpecial];
    }

    // Hotkey signs:
    cChar2Branch[u_char('~')] = BR_HOTKEY;
    cChar2Branch[u_char('&')] = BR_HOTKEY;

    // Umlauts of the working character set:
    c_AE = pConvert[0];
    c_OE = pConvert[1];
    c_UE = pConvert[2];
    c_ae = pConvert[3];
    c_oe = pConvert[4];
    c_ue = pConvert[5];
}
/// Releases the parsing tree and the output array.
WordTransTree::~WordTransTree()
{
    delete dpParsingTreeTop;
    // No null check needed: deleting a null pointer does nothing.
    delete [] sOutput;
}
void
WordTransTree::AddWordPair( const ByteString & i_sOldString,
const ByteString & i_sReplaceString )
{
if (i_sOldString.Len() == 0)
return;
pCurParseNode = dpParsingTreeTop;
WTT_Node * pBranch = 0;
char cBranch = 0;
for ( constr pOld = i_sOldString.GetBuffer();
*pOld != 0;
pOld++ )
{
cBranch = CalculateBranch(*pOld);
pBranch = pCurParseNode->GetNextNode(cBranch);
if (pBranch == 0 || pBranch == pUnknownAlpha)
{
pBranch = new WTT_Node(cBranch,0,pUnknownAlpha);
pCurParseNode->SetBranch(cBranch,pBranch);
}
pCurParseNode = pBranch;
} // end for
pCurParseNode->SetAsTokenToReplace(i_sReplaceString);
}
void
WordTransTree::InitTransformation( const char * i_sInput,
UINT32 i_nInputLength,
UINT32 i_nOutputMaxLength )
{
sInput = (const u_char *)i_sInput;
nInputLength = i_nInputLength;
pInputEnd = &sInput[i_nInputLength];
pInputCurTokenStart = sInput;
pInputPosition = sInput;
if (nOutputMaxLength < i_nOutputMaxLength)
{
if (sOutput != 0)
delete [] sOutput;
sOutput = new unsigned char[i_nOutputMaxLength];
nOutputMaxLength = i_nOutputMaxLength;
}
pOutputPosition = sOutput;
}
/** Reads the next token from the input and passes it to
Handle_TokenToKeep() or Handle_TokenToTransform(), depending on the
token type of the parse node that was reached.
pInputCurTokenStart and pCurParseNode are updated just when
starting this function. After its end they must not be changed
till this function is called again.
Outside this function pInputPosition and pOutputPosition are both
on the first not transformed char in their respective array.
@return eCurResult, the value CurResult() reports afterwards.
**/
WordTransTree::E_Result
WordTransTree::TransformNextToken()
{
// A new token starts at the current read position; reset the per-token state.
pInputCurTokenStart = pInputPosition;
pCurParseNode = dpParsingTreeTop;
cCurHotkey = 0;
eCurResult = OK;
WTT_Node * pBranch = 0;
UINT8 cBranch = 0;
// Follow the parsing tree character by character. (The loop
// initialisation repeats the assignment already made above.)
for ( pCurParseNode = dpParsingTreeTop;
pInputPosition != pInputEnd;
++pInputPosition )
{
cBranch = CalculateBranch(*pInputPosition);
pBranch = pCurParseNode->GetNextNode( cBranch );
if (pBranch != 0)
{
// The current node has a follow-up node for this character.
pCurParseNode = pBranch;
}
else
{
if (cBranch == BR_HOTKEY) // current letter is '~' or '&'.
{
// Logic of the following. There are 9 possible cases -
// A = alphabetic letter, NA = non alphabetic, TB = token begin,
// Eot = end of text:
// 1. A~A set hotkey to following letter, continue
// 2. A~NA token end
// 3. A~Eot token end
// 4. NA~A token end
// 5. NA~NA continue
// 6. NA~Eot continue
// 7. TB~A set hotkey to following letter, continue
// 8. TB~NA continue
// 9. TB~Eot continue
// bNext and bPrev are true, if there are alphabetic letters:
sal_Bool bNext = pInputPosition + 1 != pInputEnd
? CalculateBranch(pInputPosition[1]) >= BR_ALPHABASE
: sal_False;
sal_Bool bPrev = pCurParseNode->Value() >= BR_ALPHABASE;
if ( bNext && (bPrev || pCurParseNode == dpParsingTreeTop) )
{ // case 1. and 7.
// The hotkey sign is skipped; pCurParseNode stays where it is.
Handle_Hotkey();
continue;
}
else if (!bPrev && !bNext)
{ // case 5.,6.,8.,9.
continue;
}
// Case 2.,3.,4. :
// so this should be handled as an end of a token.
}
// Token end: the current character is not consumed, it starts the next token.
if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
{
Handle_TokenToKeep();
return eCurResult;
}
else
{
Handle_TokenToTransform();
return eCurResult;
} // endif (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
} // endif (pBranch == 0) else
} // end for
// If here, the text end is reached
if (pCurParseNode->TokenType() == WTT_Node::token_to_keep)
{
Handle_TokenToKeep();
return eCurResult;
}
else
{
Handle_TokenToTransform();
return eCurResult;
}
}
/// The replace string stored at the parse node of the current token.
ByteString
WordTransTree::CurReplacingString() const
{
    WTT_Node * const pTokenNode = pCurParseNode;
    return pTokenNode->ReplaceString();
}
void
WordTransTree::Handle_Hotkey()
{
if (cCurHotkey == 0) // Avoid to replace the first found hotkey by
// a later one - though this shouldn't happen anyway.
{
cCurHotkey = (pInputPosition+1) != pInputEnd ? pInputPosition[1] : 0;
cCurHotkeySign = *pInputPosition;
}
}
void
WordTransTree::Handle_TokenToKeep()
{
UINT32 nTokenLength = pInputPosition-pInputCurTokenStart;
memcpy(pOutputPosition,pInputCurTokenStart,nTokenLength);
pOutputPosition += nTokenLength;
*pOutputPosition = '\0';
}
void
WordTransTree::Handle_TokenToTransform()
{
sal_Bool bHaveHotkey = CalculateBranch(cCurHotkey) >= BR_ALPHABASE;
const ByteString & rReplace = pCurParseNode->ReplaceString();
// Find position of hotkey in replace-string:
sal_uInt16 nHotkeyPos = bHaveHotkey
? rReplace.Search(char(cCurHotkey))
: STRING_NOTFOUND;
if (nHotkeyPos == STRING_NOTFOUND && bHaveHotkey)
{
if (cCurHotkey < 128)
{
if (islower(cCurHotkey))
nHotkeyPos = rReplace.Search(toupper(char(cCurHotkey)));
else
nHotkeyPos = rReplace.Search(tolower(char(cCurHotkey)));
}
else // cCurHotkey >= 128
{
if (cCurHotkey == c_ae)
nHotkeyPos = rReplace.Search(char(c_AE));
else if (cCurHotkey == c_oe)
nHotkeyPos = rReplace.Search(char(c_OE));
else if (cCurHotkey == c_ue)
nHotkeyPos = rReplace.Search(char(c_UE));
else if (cCurHotkey == c_AE)
nHotkeyPos = rReplace.Search(char(c_ae));
else if (cCurHotkey == c_OE)
nHotkeyPos = rReplace.Search(char(c_oe));
else if (cCurHotkey == c_UE)
nHotkeyPos = rReplace.Search(char(c_ue));
} // endif (cCurHotkey < 128) else
if (nHotkeyPos == STRING_NOTFOUND)
{
eCurResult = HOTKEY_LOST;
bHaveHotkey = sal_False;
}
} // endif (nHotkeyPos == STRING_NOT_FOUND && bHaveHotkey)
UINT32 nOutputTokenLength = rReplace.Len() + (bHaveHotkey ? 1 : 0);
if (bHaveHotkey)
{
memcpy( pOutputPosition,
pCurParseNode->ReplaceString().GetBuffer(),
nHotkeyPos );
*(pOutputPosition + nHotkeyPos) = cCurHotkeySign;
memcpy( pOutputPosition + nHotkeyPos + 1,
pCurParseNode->ReplaceString().GetBuffer() + nHotkeyPos,
nOutputTokenLength - nHotkeyPos - 1);
}
else
{
memcpy( pOutputPosition,
pCurParseNode->ReplaceString().GetBuffer(),
nOutputTokenLength );
}
// Convert first letter into upper if necessary:
u_char cInStart = CalculateBranch(*pInputCurTokenStart) == BR_HOTKEY
? pInputCurTokenStart[1]
: pInputCurTokenStart[0] ;
u_char * pOutStart = nHotkeyPos == 0
? pOutputPosition + 1
: pOutputPosition ;
if (isupper(cInStart) || cInStart > 127)
{ // Possibly cInStart is upper character:
if (isupper(cInStart) || cInStart == c_AE || cInStart == c_OE || cInStart == c_UE)
{ // Surely cInStart is upper character:
u_char cOutStart = *pOutStart;
if (cOutStart < 128)
*pOutStart = toupper(cOutStart);
else if (cOutStart == c_ae)
*pOutStart = c_AE;
else if (cOutStart == c_oe)
*pOutStart = c_OE;
else if (cOutStart == c_ue)
*pOutStart = c_UE;
}
} // endif (isupper(cInStart) || cInStart > 127)
pOutputPosition += nOutputTokenLength;
*pOutputPosition = '\0';
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment