From 490b5abeda6d26dc3a478eb755ba3915c5e4fb7e Mon Sep 17 00:00:00 2001 From: Max Fortun Date: Wed, 24 Jan 2024 13:44:22 -0500 Subject: [PATCH 1/3] updated --- .../org/apache/xml/serializer/ToStream.java | 44 ++++++++++++++----- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java index 6d94582c9..237c1a803 100644 --- a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java +++ b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java @@ -175,6 +175,10 @@ abstract public class ToStream extends SerializerBase */ private boolean m_expandDTDEntities = true; + /** + * Track multibyte character in order to serialize when the whole byte sequence is available. + */ + private char m_highUTF16Surrogate = 0; /** * Default constructor @@ -1595,23 +1599,41 @@ else if (m_encodingInfo.isInEncoding(ch)) { // not in the normal ASCII range, we also // just leave it get added on to the clean characters } - else if (Encodings.isHighUTF16Surrogate(ch) && i < end-1 && Encodings.isLowUTF16Surrogate(chars[i+1])) { - // So, this is a (valid) surrogate pair - if (! m_encodingInfo.isInEncoding(ch, chars[i+1])) { - int codepoint = Encodings.toCodePoint(ch, chars[i+1]); - writeOutCleanChars(chars, i, lastDirtyCharProcessed); - writer.write("&#"); - writer.write(Integer.toString(codepoint)); - writer.write(';'); - lastDirtyCharProcessed = i+1; - } - i++; // skip the low surrogate, too + else if (Encodings.isHighUTF16Surrogate(ch)) { + // Store for later processing. We may be at the end of a buffer, + // and must wait till low surrogate arrives + // before we can do anything with this. + writeOutCleanChars(chars, i, lastDirtyCharProcessed); + m_highUTF16Surrogate = ch; + lastDirtyCharProcessed = i; + } + else if (m_highUTF16Surrogate != 0 && Encodings.isLowUTF16Surrogate(ch)) { + // The complete utf16 byte sequence is now available and may be serialized. + int codepoint = Encodings.toCodePoint(m_highUTF16Surrogate, ch); + if (! m_encodingInfo.isInEncoding(m_highUTF16Surrogate, ch)) { + writeOutCleanChars(chars, i, lastDirtyCharProcessed); + writer.write("&#"); + writer.write(Integer.toString(codepoint)); + writer.write(';'); + } else { + writer.write(m_highUTF16Surrogate); + writer.write(ch); + } + lastDirtyCharProcessed = i; + m_highUTF16Surrogate = 0; } else { // This is a fallback plan, we get here if the // encoding doesn't contain ch and it's not part // of a surrogate pair // The right thing is to write out an entity + if(m_highUTF16Surrogate != 0) { + writer.write("&#"); + writer.write(Integer.toString(m_highUTF16Surrogate)); + writer.write(';'); + m_highUTF16Surrogate = 0; + } + writeOutCleanChars(chars, i, lastDirtyCharProcessed); writer.write("&#"); writer.write(Integer.toString(ch)); From e92e1707cae600c1b318b6236cf55c85ad0280fe Mon Sep 17 00:00:00 2001 From: Max Fortun Date: Wed, 24 Jan 2024 13:54:39 -0500 Subject: [PATCH 2/3] updated --- .../src/main/java/org/apache/xml/serializer/ToStream.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java index 237c1a803..3e5a79408 100644 --- a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java +++ b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java @@ -1609,9 +1609,9 @@ else if (Encodings.isHighUTF16Surrogate(ch)) { } else if (m_highUTF16Surrogate != 0 && Encodings.isLowUTF16Surrogate(ch)) { // The complete utf16 byte sequence is now available and may be serialized. - int codepoint = Encodings.toCodePoint(m_highUTF16Surrogate, ch); + writeOutCleanChars(chars, i, lastDirtyCharProcessed); if (! m_encodingInfo.isInEncoding(m_highUTF16Surrogate, ch)) { - writeOutCleanChars(chars, i, lastDirtyCharProcessed); + int codepoint = Encodings.toCodePoint(m_highUTF16Surrogate, ch); writer.write("&#"); writer.write(Integer.toString(codepoint)); writer.write(';'); From 0006abf07732d2608d1bdcbc81bcabe1276bdbb3 Mon Sep 17 00:00:00 2001 From: Max Fortun Date: Wed, 24 Jan 2024 13:59:44 -0500 Subject: [PATCH 3/3] updated --- serializer/src/main/java/org/apache/xml/serializer/ToStream.java | 1 - 1 file changed, 1 deletion(-) diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java index 3e5a79408..4de3ff891 100644 --- a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java +++ b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java @@ -1609,7 +1609,6 @@ else if (Encodings.isHighUTF16Surrogate(ch)) { } else if (m_highUTF16Surrogate != 0 && Encodings.isLowUTF16Surrogate(ch)) { // The complete utf16 byte sequence is now available and may be serialized. - writeOutCleanChars(chars, i, lastDirtyCharProcessed); if (! m_encodingInfo.isInEncoding(m_highUTF16Surrogate, ch)) { int codepoint = Encodings.toCodePoint(m_highUTF16Surrogate, ch); writer.write("&#");