Kaydet (Commit) a346dfcc authored tarafından Stephan Bergmann's avatar Stephan Bergmann

tdf#70833: IDNA support when exporting hyperlinks to PDF

Any URLs using non-ASCII IDNA syntax need to be resolved to ASCII-only, as PDF
URI Action's URI needs to be "encoded in 7-bit ASCII."

Introduce URIHelper::resolveIdnaHost (svl/urihelper.hxx). It internally uses
icu::IDNA, which requires bumping the minimum --with-system-icu version from
4.2 to 4.6; as a consequence, ICU_RECLASSIFIED_CLOSE_PARENTHESIS is now always true.

Change-Id: I0e20d9a20ed2b869fba0cc7c969721411db590b3
Reviewed-on: https://gerrit.libreoffice.org/19669
Reviewed-by: Stephan Bergmann <sbergman@redhat.com>
Tested-by: Stephan Bergmann <sbergman@redhat.com>
üst b0515107
......@@ -273,7 +273,6 @@ export ICU_CFLAGS=$(gb_SPACE)@ICU_CFLAGS@
export ICU_LIBS=$(gb_SPACE)@ICU_LIBS@
export ICU_MAJOR=@ICU_MAJOR@
export ICU_MINOR=@ICU_MINOR@
export ICU_RECLASSIFIED_CLOSE_PARENTHESIS=@ICU_RECLASSIFIED_CLOSE_PARENTHESIS@
export ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER=@ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER@
export ICU_RECLASSIFIED_HEBREW_LETTER=@ICU_RECLASSIFIED_HEBREW_LETTER@
export ICU_RECLASSIFIED_PREPEND_SET_EMPTY=@ICU_RECLASSIFIED_PREPEND_SET_EMPTY@
......
......@@ -8949,7 +8949,6 @@ SYSTEM_GENCMN=
ICU_MAJOR=56
ICU_MINOR=1
ICU_RECLASSIFIED_CLOSE_PARENTHESIS="TRUE"
ICU_RECLASSIFIED_PREPEND_SET_EMPTY="TRUE"
ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER="TRUE"
ICU_RECLASSIFIED_HEBREW_LETTER="TRUE"
......@@ -8974,10 +8973,10 @@ if test "$with_system_icu" = "yes"; then
ICU_MAJOR=`echo $ICU_VERSION | cut -d"." -f1`
ICU_MINOR=`echo $ICU_VERSION | cut -d"." -f2`
if test "$ICU_MAJOR" -ge "49" -o \( "$ICU_MAJOR" = "4" -a "$ICU_MINOR" -ge "2" \); then
if test "$ICU_MAJOR" -ge "49" -o \( "$ICU_MAJOR" = "4" -a "$ICU_MINOR" -ge "6" \); then
AC_MSG_RESULT([OK, $ICU_VERSION])
else
AC_MSG_ERROR([not suitable, only >= 4.2 supported currently])
AC_MSG_ERROR([not suitable, only >= 4.6 supported currently])
fi
fi
......@@ -9013,11 +9012,6 @@ You can use --with-system-icu-for-build=force to use it anyway.])
if test -z "$SYSTEM_GENCMN"; then
AC_MSG_ERROR([\'gencmn\' not found in \$PATH, install the icu development tool \'gencmn\'])
fi
if test "$ICU_MAJOR" -ge "49" -o \( "$ICU_MAJOR" = "4" -a "$ICU_MINOR" -ge "4" \); then
ICU_RECLASSIFIED_CLOSE_PARENTHESIS="TRUE"
else
ICU_RECLASSIFIED_CLOSE_PARENTHESIS=
fi
if test "$ICU_MAJOR" -ge "49"; then
ICU_RECLASSIFIED_PREPEND_SET_EMPTY="TRUE"
ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER="TRUE"
......@@ -9055,7 +9049,6 @@ AC_SUBST(SYSTEM_GENCCODE)
AC_SUBST(SYSTEM_GENCMN)
AC_SUBST(ICU_MAJOR)
AC_SUBST(ICU_MINOR)
AC_SUBST(ICU_RECLASSIFIED_CLOSE_PARENTHESIS)
AC_SUBST(ICU_RECLASSIFIED_PREPEND_SET_EMPTY)
AC_SUBST(ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER)
AC_SUBST(ICU_RECLASSIFIED_HEBREW_LETTER)
......
......@@ -98,8 +98,7 @@ $(i18npool_BIDIR)/%.brk : $(i18npool_BIDIR)/%.txt $(call gb_ExternalExecutable_g
# sed substitution...
$(i18npool_BIDIR)/%.txt : \
$(SRCDIR)/i18npool/source/breakiterator/data/%.txt | $(i18npool_BIDIR)/.dir
sed -e ': dummy' \
$(if $(ICU_RECLASSIFIED_CLOSE_PARENTHESIS),-e "s#\[:LineBreak = Close_Punctuation:\]#\[& \[:LineBreak = Close_Parenthesis:\]\]#") \
sed -e "s#\[:LineBreak = Close_Punctuation:\]#\[& \[:LineBreak = Close_Parenthesis:\]\]#" \
$(if $(ICU_RECLASSIFIED_CONDITIONAL_JAPANESE_STARTER),,\
-e '/\[:LineBreak = Conditional_Japanese_Starter:\]/d' \
-e 's# $$CJ##' \
......
......@@ -152,6 +152,23 @@ SVL_DLLPUBLIC OUString removePassword(OUString const & rURI,
INetURLObject::EncodeMechanism eEncodeMechanism = INetURLObject::WAS_ENCODED,
INetURLObject::DecodeMechanism eDecodeMechanism = INetURLObject::DECODE_TO_IURI,
rtl_TextEncoding eCharset = RTL_TEXTENCODING_UTF8);
/** Resolve a URL's host component domain name in IDNA syntax to plain DNS
syntax.

For details, see RFC 5890 "Internationalized Domain Names for Applications
(IDNA): Definitions and Document Framework."

@param url An arbitrary string, should be a URI.

@return If the input matches the syntax of a hierarchical URL, and it has
a host component that matches the IDNA2008 domain name syntax, and that
domain name contains any U-labels, return a version of the input URL with
the host component resolved to plain DNS syntax. Otherwise, return the
input unchanged.
*/
SVL_DLLPUBLIC OUString resolveIdnaHost(OUString const & url);
}
#endif // INCLUDED_SVL_URIHELPER_HXX
......
......@@ -21,6 +21,8 @@ $(eval $(call gb_Library_Library,svl))
$(eval $(call gb_Library_use_externals,svl,\
boost_headers \
icu_headers \
icuuc \
mdds_headers \
libxml2 \
))
......
......@@ -198,9 +198,12 @@ public:
void testFindFirstURLInText();
void testResolveIdnaHost();
CPPUNIT_TEST_SUITE(Test);
CPPUNIT_TEST(testNormalizedMakeRelative);
CPPUNIT_TEST(testFindFirstURLInText);
CPPUNIT_TEST(testResolveIdnaHost);
CPPUNIT_TEST(finish);
CPPUNIT_TEST_SUITE_END();
......@@ -423,6 +426,66 @@ void Test::testFindFirstURLInText() {
}
}
void Test::testResolveIdnaHost() {
OUString input;
input.clear();
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("Foo.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://Muenchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://-M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://M\xC3\xBCnchen-.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://xn--M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://xy--M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://-bar.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://bar-.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://xn--bar.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
input = OUString::fromUtf8("foo://xy--bar.M\xC3\xBCnchen.de");
CPPUNIT_ASSERT_EQUAL(input, URIHelper::resolveIdnaHost(input));
CPPUNIT_ASSERT_EQUAL(
OUString::fromUtf8("foo://M\xC3\xBCnchen@xn--mnchen-3ya.de"),
URIHelper::resolveIdnaHost(
OUString::fromUtf8("foo://M\xC3\xBCnchen@M\xC3\xBCnchen.de")));
CPPUNIT_ASSERT_EQUAL(
OUString::fromUtf8("foo://xn--mnchen-3ya.de."),
URIHelper::resolveIdnaHost(
OUString::fromUtf8("foo://M\xC3\xBCnchen.de.")));
CPPUNIT_ASSERT_EQUAL(
OUString::fromUtf8("Foo://bar@xn--mnchen-3ya.de:123/?bar#baz"),
URIHelper::resolveIdnaHost(
OUString::fromUtf8("Foo://bar@M\xC3\xBCnchen.de:123/?bar#baz")));
CPPUNIT_ASSERT_EQUAL(
OUString::fromUtf8("foo://xn--mnchen-3ya.de"),
URIHelper::resolveIdnaHost(
OUString::fromUtf8("foo://Mu\xCC\x88nchen.de")));
}
css::uno::Reference< css::uno::XComponentContext > Test::m_context;
CPPUNIT_TEST_SUITE_REGISTRATION(Test);
......
......@@ -17,6 +17,10 @@
* the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
#include <sal/config.h>
#include <unicode/idna.h>
#include <svl/urihelper.hxx>
#include <com/sun/star/ucb/Command.hpp>
#include <com/sun/star/ucb/IllegalIdentifierException.hpp>
......@@ -725,4 +729,68 @@ OUString URIHelper::removePassword(OUString const & rURI,
aObj.GetURLNoPass(eDecodeMechanism, eCharset);
}
/** Resolve the host part of a URL from IDNA syntax to ASCII-only DNS syntax.

    The input is parsed as a URI reference. If it has no authority, if the
    host part contains only ASCII characters, or if ICU reports any error
    while converting the host, the input is returned unchanged. Otherwise the
    URL is rebuilt from scheme, authority (with only the host replaced by its
    ASCII form), path, query and fragment.

    @param url An arbitrary string, should be a URI.

    @return The URL with its host converted to ASCII, or the unmodified input.
*/
OUString URIHelper::resolveIdnaHost(OUString const & url) {
    css::uno::Reference<css::uri::XUriReference> uri(
        css::uri::UriReferenceFactory::create(
            comphelper::getProcessComponentContext())
        ->parse(url));
    if (!(uri.is() && uri->hasAuthority())) {
        return url;
    }
    auto auth(uri->getAuthority());
    if (auth.isEmpty()) {
        // E.g. "foo:///path": there is no host to resolve.
        return url;
    }
    // The host occupies the half-open range [hostStart, hostEnd) of auth:
    // everything after an optional "userinfo@" and before an optional
    // ":port" (a colon followed by zero or more ASCII digits).
    sal_Int32 hostStart = auth.indexOf('@') + 1;
    sal_Int32 hostEnd = auth.getLength();
    while (hostEnd > hostStart && rtl::isAsciiDigit(auth[hostEnd - 1])) {
        --hostEnd;
    }
    if (hostEnd > hostStart && auth[hostEnd - 1] == ':') {
        // Exclude the colon introducing the port from the host.
        --hostEnd;
    } else {
        // The trailing digits (if any) were not a port but part of the host.
        hostEnd = auth.getLength();
    }
    auto asciiOnly = true;
    for (auto i = hostStart; i < hostEnd; ++i) {
        if (!rtl::isAscii(auth[i])) {
            asciiOnly = false;
            break;
        }
    }
    if (asciiOnly) {
        // Avoid icu::IDNA case normalization in purely non-IDNA domain names:
        return url;
    }
    UErrorCode e = U_ZERO_ERROR;
    std::unique_ptr<icu::IDNA> idna(
        icu::IDNA::createUTS46Instance(
            (UIDNA_USE_STD3_RULES | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ
             | UIDNA_CHECK_CONTEXTO),
            e));
    if (U_FAILURE(e)) {
        // NOTE(review): the "vcl.gdi" log area looks out of place for code
        // living in svl -- confirm which area should be used here.
        SAL_WARN("vcl.gdi", "icu::IDNA::createUTS46Instance " << e);
        return url;
    }
    icu::UnicodeString ascii;
    icu::IDNAInfo info;
    idna->nameToASCII(
        icu::UnicodeString(
            reinterpret_cast<UChar const *>(auth.getStr() + hostStart),
            hostEnd - hostStart),
        ascii, info, e);
    if (U_FAILURE(e) || info.hasErrors()) {
        // Not a valid IDNA name; leave the URL as it is.
        return url;
    }
    // Reassemble: scheme "://" [userinfo "@"] ascii-host [":" port] path
    // ["?" query] ["#" fragment].
    OUStringBuffer buf(uri->getScheme());
    buf.append("://").append(auth.getStr(), hostStart);
    buf.append(
        reinterpret_cast<sal_Unicode const *>(ascii.getBuffer()),
        ascii.length());
    buf.append(auth.getStr() + hostEnd, auth.getLength() - hostEnd)
        .append(uri->getPath());
    if (uri->hasQuery()) {
        buf.append('?').append(uri->getQuery());
    }
    if (uri->hasFragment()) {
        buf.append('#').append(uri->getFragment());
    }
    return buf.makeStringAndClear();
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
......@@ -44,6 +44,7 @@
#include <rtl/crc.h>
#include <rtl/digest.h>
#include <rtl/ustrbuf.hxx>
#include <svl/urihelper.hxx>
#include <tools/debug.hxx>
#include <tools/fract.hxx>
#include <tools/stream.hxx>
......@@ -4495,8 +4496,10 @@ we check in the following sequence:
// are the correct one!!
// extract target file type
auto url(URIHelper::resolveIdnaHost(rLink.m_aURL));
INetURLObject aDocumentURL( m_aContext.BaseURL );
INetURLObject aTargetURL( rLink.m_aURL );
INetURLObject aTargetURL( url );
bool bSetGoToRMode = false;
bool bTargetHasPDFExtension = false;
INetProtocol eTargetProtocol = aTargetURL.GetProtocol();
......@@ -4507,7 +4510,7 @@ we check in the following sequence:
// getting the needed URL information from the current document path
if( eTargetProtocol == INetProtocol::NotValid )
{
if( rLink.m_aURL.getLength() > 4 && rLink.m_aURL.startsWith("\\\\\\\\"))
if( url.getLength() > 4 && url.startsWith("\\\\\\\\"))
{
bIsUNCPath = true;
}
......@@ -4516,7 +4519,7 @@ we check in the following sequence:
INetURLObject aNewBase( aDocumentURL );//duplicate document URL
aNewBase.removeSegment(); //remove last segment from it, obtaining the base URL of the
//target document
aNewBase.insertName( rLink.m_aURL );
aNewBase.insertName( url );
aTargetURL = aNewBase;//reassign the new target URL
//recompute the target protocol, with the new URL
//normal URL processing resumes
......@@ -4564,7 +4567,7 @@ we check in the following sequence:
{
aLine.append( "/Launch/Win<</F" );
// INetURLObject is not good with UNC paths, use original path
appendLiteralStringEncrypt( rLink.m_aURL, rLink.m_nObject, aLine, osl_getThreadTextEncoding() );
appendLiteralStringEncrypt( url, rLink.m_nObject, aLine, osl_getThreadTextEncoding() );
aLine.append( ">>" );
}
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment