Resolves: #i120442# fix html-flavored copy+paste of unicode surrogate pairs

Patch-by: Chen Peng
Found-by: Yan Ji
Review-by and minor modifications: Herbert Duerr
Change-Id: Id100b90367bc1b1603e22337b67b55bb1ffb40d4

Resolves: #i120442# fix html-flavored copy+paste of unicode surrogate pairs
Patch-by: Chen Peng
Found-by: Yan Ji
Review-by and minor modifications: Herbert Duerr
Change-Id: Id100b90367bc1b1603e22337b67b55bb1ffb40d4
d47b5698 · Herbert Dürr · Caolán McNamara · 4d128516 · d47b5698
Kaydet (Commit) d47b5698 authored Agu 08, 2012 tarafından Herbert Dürr Kaydeden (commit) Caolán McNamara Mar 07, 2013
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 9 deletions

htmlatr.cxx sw/source/filter/html/htmlatr.cxx +29 -9

No files found.
--- a/sw/source/filter/html/htmlatr.cxx
+++ b/sw/source/filter/html/htmlatr.cxx
@@ -2474,9 +2474,22 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt, const SwCntntNode& rNode )

            if( bOutChar )
            {
-                sal_Unicode c = rStr.GetChar( nStrPos );
-                // versuche nach ungefaehr 255 Zeichen eine neue Zeile zu
-                // beginnen, aber nicht in PRE und nur bei Spaces
+                // #i120442#: get the UTF-32 codepoint by converting an eventual UTF-16 unicode surrogate pair
+                sal_uInt64 c = rStr.GetChar( nStrPos );
+                if( nStrPos < nEnde - 1 )
+                {
+                    const sal_Unicode d = rStr.GetChar( nStrPos + 1 );
+                    if( (c >= 0xd800 && c <= 0xdbff) && (d >= 0xdc00 && d <= 0xdfff) )
+                    {
+                        sal_uInt64 templow = d&0x03ff;
+                        sal_uInt64 temphi = ((c&0x03ff) + 0x0040)<<10;
+                        c = temphi|templow;
+                        nStrPos++;
+                    }
+                }
+
+                // try to split a line after about 255 characters
+                // at a space character unless in a PRE-context
                if( ' '==c && !rHTMLWrt.nLastParaToken  )
                {
                    xub_StrLen nLineLen;
@@ -2488,7 +2501,7 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt, const SwCntntNode& rNode )
                    xub_StrLen nWordLen = rStr.Search( ' ', nStrPos+1 );
                    if( nWordLen == STRING_NOTFOUND )
                        nWordLen = nEnde;
-                    nWordLen = nWordLen - nStrPos;
+                    nWordLen -= nStrPos;

                    if( nLineLen >= rHTMLWrt.nWhishLineLen ||
                        (nLineLen+nWordLen) >= rHTMLWrt.nWhishLineLen )
@@ -2508,13 +2521,20 @@ Writer& OutHTML_SwTxtNode( Writer& rWrt, const SwCntntNode& rNode )
                        HTMLOutFuncs::FlushToAscii( rWrt.Strm(), aContext );
                        HTMLOutFuncs::Out_AsciiTag( rWrt.Strm(), OOO_STRING_SVTOOLS_HTML_linebreak );
                    }
+                    // #i120442#: if c is outside the unicode base plane output it as "&#******;"
+                    else if( c > 0xffff)
+                    {
+                        OUString sOut("&#");
+                        sOut += OUString::number( (sal_uInt64)c );
+                        sOut += ";";
+                        rWrt.Strm() << sOut.getStr();
+                    }
                    else
-                        HTMLOutFuncs::Out_Char( rWrt.Strm(), c, aContext, &rHTMLWrt.aNonConvertableCharacters );
+                        HTMLOutFuncs::Out_Char( rWrt.Strm(), (sal_Unicode)c, aContext, &rHTMLWrt.aNonConvertableCharacters );

-                    // Wenn das letzte Zeichen eines Absatzed ein harter
-                    // Zeilen-Umbruch ist brauchen wir noch ein <BR> mehr, weil
-                    // Netscape & Co in diesem Fall fuer den naechsten Absatz
-                    // nicht in die naechste Zeile gehen.
+                    // if a paragraph's last character is a hard line break
+                    // then we need to add an extra <br>
+                    // because browsers like Mozilla wouldn't add a line for the next paragraph
                    bWriteBreak = (0x0a == c) &&
                                  (HTML_PREFORMTXT_ON != rHTMLWrt.nLastParaToken);
                }