Add a new UTF-8 decoder, similar to the encoder we've just added

Like before, this is taken from the existing QUrl code and is optimized for
ASCII handling (for the same reasons). And like previously, make
QString::fromUtf8 use a stateless version of the codec, which is faster.

There's a small change in behavior in the decoding: we insert a U+FFFD for
each byte that cannot be decoded properly. Previously, it would "eat" all bad
high-bit bytes and replace them all with one single U+FFFD. Either behavior is
allowed by the UTF-8 specifications, even though this new behavior will cause
misalignment in the Markus Kuhn sample UTF-8 text.

Change-Id: Ib1b1f0b4291293bab345acaf376e00204ed87565
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Thiago Macieira 2013-10-20 17:43:46 +01:00 committed by The Qt Project
parent d51130cc3a
commit 8dd47e34b9
5 changed files with 226 additions and 136 deletions

View File

@ -128,114 +128,117 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
return rstr;
}
// Stateless UTF-8 -> UTF-16 conversion of the \a len bytes starting at
// \a chars. Every lead byte that QUtf8Functions::fromUtf8 rejects (negative
// return value) is replaced by QChar::ReplacementCharacter and decoding
// resumes at the byte that follows it.
QString QUtf8::convertToUnicode(const char *chars, int len)
{
    // worst-case sizing; the string is cut down to the real length at the end
    QString decoded(len + 1, Qt::Uninitialized);
    ushort *const first = reinterpret_cast<ushort *>(const_cast<QChar *>(decoded.constData()));
    ushort *out = first;

    const uchar *in = reinterpret_cast<const uchar *>(chars);
    const uchar *const inEnd = in + len;

    for (; in < inEnd; ) {
        const uchar lead = *in++;
        // on success fromUtf8 has already written the output and moved `in`
        if (QUtf8Functions::fromUtf8<QUtf8BaseTraits>(lead, out, in, inEnd) < 0)
            *out++ = QChar::ReplacementCharacter;
    }

    decoded.truncate(out - first);
    return decoded;
}
// Stateful overload of QUtf8::convertToUnicode: takes the input bytes
// (\a chars, \a len) plus an optional QTextCodec::ConverterState.
// NOTE(review): this span comes from a rendered diff that shows removed and
// added lines together without +/- markers (for instance `result`, `ch` and
// `invalid` are each declared twice below), so it is not compilable as shown.
// The comments added here describe individual statements only; confirm the
// overall flow against the real source file.
QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::ConverterState *state)
{
bool headerdone = false;
ushort replacement = QChar::ReplacementCharacter;
int need = 0;
int error = -1;
uint uc = 0;
uint min_uc = 0;
int invalid = 0;
int res;
uchar ch = 0;
QString result(need + len + 1, Qt::Uninitialized); // worst case
ushort *dst = reinterpret_cast<ushort *>(const_cast<QChar *>(result.constData()));
const uchar *src = reinterpret_cast<const uchar *>(chars);
const uchar *end = src + len;
// read the flags and any pending data recorded in the caller-supplied state
if (state) {
if (state->flags & QTextCodec::IgnoreHeader)
headerdone = true;
if (state->flags & QTextCodec::ConvertInvalidToNull)
replacement = QChar::Null;
need = state->remainingChars;
if (need) {
uc = state->state_data[0];
min_uc = state->state_data[1];
if (state->remainingChars) {
// handle incoming state first
uchar remainingCharsData[4]; // longest UTF-8 sequence possible
int remainingCharsCount = state->remainingChars;
// copy no more than still fits in the scratch buffer and no more than src holds
int newCharsToCopy = qMin<int>(sizeof(remainingCharsData) - remainingCharsCount, end - src);
memset(remainingCharsData, 0, sizeof(remainingCharsData));
memcpy(remainingCharsData, &state->state_data[0], remainingCharsCount);
memcpy(remainingCharsData + remainingCharsCount, src, newCharsToCopy);
// the first scratch byte is passed as the lead byte; decoding continues from the second
const uchar *begin = &remainingCharsData[1];
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(remainingCharsData[0], dst, begin,
static_cast<const uchar *>(remainingCharsData) + remainingCharsCount + newCharsToCopy);
if (res == QUtf8BaseTraits::EndOfString) {
// if we got EndOfString again, then there were too few bytes in src;
// copy to our state and return
state->remainingChars = remainingCharsCount + newCharsToCopy;
memcpy(&state->state_data[0], remainingCharsData, state->remainingChars);
return QString();
} else if (res == QUtf8BaseTraits::Error) {
++invalid;
*dst++ = replacement;
} else if (!headerdone && res >= 0) {
// eat the UTF-8 BOM
headerdone = true;
if (dst[-1] == 0xfeff)
--dst;
}
// adjust src now that we have maybe consumed a few chars
//Q_ASSERT(res > remainingCharsCount)
src += res - remainingCharsCount;
}
}
// 0xef 0xbb 0xbf is the byte sequence tested for as the byte order mark
if (!headerdone && len > 3
&& (uchar)chars[0] == 0xef && (uchar)chars[1] == 0xbb && (uchar)chars[2] == 0xbf) {
// starts with a byte order mark
chars += 3;
len -= 3;
headerdone = true;
}
QString result(need + len + 1, Qt::Uninitialized); // worst case
ushort *qch = (ushort *)result.unicode();
uchar ch;
int invalid = 0;
for (int i = 0; i < len; ++i) {
ch = chars[i];
if (need) {
if ((ch&0xc0) == 0x80) {
uc = (uc << 6) | (ch & 0x3f);
--need;
if (!need) {
// utf-8 bom composes into 0xfeff code point
if (!headerdone && uc == 0xfeff) {
// don't do anything, just skip the BOM
} else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
// surrogate pair
Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
*qch++ = QChar::highSurrogate(uc);
*qch++ = QChar::lowSurrogate(uc);
} else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
// error: overlong sequence, UTF16 surrogate or non-character
*qch++ = replacement;
++invalid;
} else {
*qch++ = uc;
}
headerdone = true;
}
} else {
// error
i = error;
*qch++ = replacement;
++invalid;
need = 0;
headerdone = true;
}
} else {
if (ch < 128) {
*qch++ = ushort(ch);
headerdone = true;
} else if ((ch & 0xe0) == 0xc0) {
uc = ch & 0x1f;
need = 1;
error = i;
min_uc = 0x80;
headerdone = true;
} else if ((ch & 0xf0) == 0xe0) {
uc = ch & 0x0f;
need = 2;
error = i;
min_uc = 0x800;
} else if ((ch&0xf8) == 0xf0) {
uc = ch & 0x07;
need = 3;
error = i;
min_uc = 0x10000;
headerdone = true;
} else {
// error
*qch++ = replacement;
++invalid;
headerdone = true;
}
// main body, stateless decoding
res = 0;
// the loop stops at the end of input or on the first negative result
while (res >= 0 && src < end) {
ch = *src++;
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
if (!headerdone && res >= 0) {
headerdone = true;
// eat the UTF-8 BOM
if (dst[-1] == 0xfeff)
--dst;
}
}
if (!state && need > 0) {
// unterminated UTF sequence
for (int i = error; i < len; ++i) {
*qch++ = replacement;
if (res == QUtf8BaseTraits::Error) {
// resetting res to 0 satisfies the `res >= 0` loop condition above again
res = 0;
++invalid;
*dst++ = replacement;
}
}
result.truncate(qch - (ushort *)result.unicode());
if (!state && res == QUtf8BaseTraits::EndOfString) {
// unterminated UTF sequence
*dst++ = QChar::ReplacementCharacter;
while (src++ < end)
*dst++ = QChar::ReplacementCharacter;
}
result.truncate(dst - (ushort *)result.unicode());
// write the outcome of this call back into the caller-supplied state
if (state) {
state->invalidChars += invalid;
state->remainingChars = need;
if (headerdone)
state->flags |= QTextCodec::IgnoreHeader;
state->state_data[0] = need ? uc : 0;
state->state_data[1] = need ? min_uc : 0;
if (res == QUtf8BaseTraits::EndOfString) {
--src; // unread the byte in ch
state->remainingChars = end - src;
memcpy(&state->state_data[0], src, end - src);
} else {
state->remainingChars = 0;
}
}
return result;
}

View File

@ -169,6 +169,110 @@ namespace QUtf8Functions
Traits::appendByte(dst, 0x80 | (u & 0x3f));
return 0;
}
/// Tells whether the two most significant bits of \a b are 1 and 0
/// (bit pattern 10xxxxxx).
inline bool isContinuationByte(uchar b)
{
    return (b >> 6) == 0x2;
}
/// Decodes the single UTF-8 sequence whose first byte \a b has already been
/// read; \a src refers to the byte after \a b and \a end to the end of input.
///
/// On success the decoded value is appended through \a dst as UTF-16, \a src
/// is advanced past the continuation bytes and the length of the sequence in
/// bytes (counting \a b) is returned.
///
/// On failure a negative value is returned - Traits::Error or
/// Traits::EndOfString - and neither \a dst nor \a src has been modified.
template <typename Traits, typename OutputPtr, typename InputPtr> inline
int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
{
    // fast path: US-ASCII maps straight to one UTF-16 unit
    if (!Traits::skipAsciiHandling && b < 0x80) {
        Traits::appendUtf16(dst, b);
        return 1;
    }

    // 0x80..0xBF are not lead bytes, and 0xC0/0xC1 could only start
    // overlong two-byte sequences
    if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1))
        return Traits::Error;

    // U+10FFFF, the last code point, is "\xF4\x8F\xBF\xBF" in UTF-8,
    // so no lead byte above 0xF4 exists
    if (b >= 0xf5)
        return Traits::Error;

    int seqLength;      // total bytes in the sequence, lead byte included
    uint lowestValid;   // smallest code point that needs this many bytes
    uint codePoint;     // accumulator for the decoded value
    if (b >= 0xf0) {
        seqLength = 4;
        lowestValid = 0x10000;
        codePoint = b & 0x07;
    } else if (b >= 0xe0) {
        seqLength = 3;
        lowestValid = 0x800;
        codePoint = b & 0x0f;
    } else {
        seqLength = 2;
        lowestValid = 0x80;
        codePoint = b & 0x1f;
    }

    const int trailing = seqLength - 1;
    const int remaining = Traits::availableBytes(src, end);
    if (Q_UNLIKELY(remaining < trailing)) {
        // Too little input for the whole sequence. If any byte that is
        // present is not a continuation byte this is an error, not
        // merely a truncated sequence.
        for (int i = 0; i < remaining && i < 3; ++i) {
            if (!isContinuationByte(Traits::peekByte(src, i)))
                return Traits::Error;
        }
        return Traits::EndOfString;
    }

    // fold in the continuation bytes, six payload bits apiece
    for (int i = 0; i < trailing; ++i) {
        const uchar next = Traits::peekByte(src, i);
        if (!isContinuationByte(next))
            return Traits::Error;
        codePoint = (codePoint << 6) | (next & 0x3f);
    }

    // sanity checks on the decoded value, skipped for trusted input
    if (!Traits::isTrusted) {
        if (codePoint < lowestValid                     // overlong encoding
                || QChar::isSurrogate(codePoint)
                || codePoint > QChar::LastValidCodePoint)
            return Traits::Error;
        if (!Traits::allowNonCharacters && QChar::isNonCharacter(codePoint))
            return Traits::Error;
    }

    // emit UTF-16: a surrogate pair when required, otherwise a single unit
    if (QChar::requiresSurrogates(codePoint))
        Traits::appendUcs4(dst, codePoint);
    else
        Traits::appendUtf16(dst, ushort(codePoint));

    Traits::advanceByte(src, trailing);
    return seqLength;
}
}
enum DataEndianness
@ -180,6 +284,7 @@ enum DataEndianness
struct QUtf8
{
static QString convertToUnicode(const char *, int);
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
static QByteArray convertFromUnicode(const QChar *, int);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);

View File

@ -4317,7 +4317,7 @@ QString QString::fromUtf8_helper(const char *str, int size)
return QString();
Q_ASSERT(size != -1);
return QUtf8::convertToUnicode(str, size, 0);
return QUtf8::convertToUnicode(str, size);
}
/*!

View File

@ -456,7 +456,7 @@ void tst_QTextCodec::flagF7808080() const
//QVERIFY(!codec->canEncode(QChar(0x1C0000)));
QTextCodec::ConverterState state(QTextCodec::ConvertInvalidToNull);
QVERIFY(codec->toUnicode(input.constData(), input.length(), &state) == QChar(0));
QCOMPARE(codec->toUnicode(input.constData(), input.length(), &state), QString(input.size(), QChar(0)));
}
void tst_QTextCodec::nonFlaggedEFBFBF() const
@ -689,8 +689,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xbf);
utf8 += char(0xbf);
utf8 += char(0xbf);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.2.4") << utf8 << str << -1;
// 2.2.5 U+03FFFFFF (not a valid Unicode character)
@ -755,8 +754,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0x90);
utf8 += char(0x80);
utf8 += char(0x80);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 2.3.5") << utf8 << str << -1;
// 3.1.1
@ -1244,7 +1242,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8.clear();
utf8 += char(0xc0);
utf8 += char(0xaf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.1") << utf8 << str << -1;
// 4.1.2
@ -1252,7 +1250,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xe0);
utf8 += char(0x80);
utf8 += char(0xaf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.2") << utf8 << str << -1;
// 4.1.3
@ -1261,7 +1259,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0x80);
utf8 += char(0x80);
utf8 += char(0xaf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.1.3") << utf8 << str << -1;
// 4.1.4
@ -1289,7 +1287,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8.clear();
utf8 += char(0xc1);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.1") << utf8 << str << -1;
// 4.2.2
@ -1297,7 +1295,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xe0);
utf8 += char(0x9f);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.2") << utf8 << str << -1;
// 4.2.3
@ -1306,7 +1304,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0x8f);
utf8 += char(0xbf);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.2.3") << utf8 << str << -1;
// 4.2.4
@ -1334,7 +1332,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8.clear();
utf8 += char(0xc0);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.1") << utf8 << str << -1;
// 4.3.2
@ -1342,7 +1340,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xe0);
utf8 += char(0x80);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.2") << utf8 << str << -1;
// 4.3.3
@ -1351,7 +1349,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0x80);
utf8 += char(0x80);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 4.3.3") << utf8 << str << -1;
// 4.3.4
@ -1380,7 +1378,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xa0);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.1") << utf8 << str << -1;
// 5.1.2
@ -1388,7 +1386,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xad);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.2") << utf8 << str << -1;
// 5.1.3
@ -1396,7 +1394,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xae);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.3") << utf8 << str << -1;
// 5.1.4
@ -1404,7 +1402,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xaf);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.4") << utf8 << str << -1;
// 5.1.5
@ -1412,7 +1410,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xb0);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.5") << utf8 << str << -1;
// 5.1.6
@ -1420,7 +1418,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbe);
utf8 += char(0x80);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.6") << utf8 << str << -1;
// 5.1.7
@ -1428,7 +1426,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbf);
utf8 += char(0xbf);
str = QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.1.7") << utf8 << str << -1;
// 5.2.1
@ -1439,9 +1437,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xb0);
utf8 += char(0x80);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.1") << utf8 << str << -1;
// 5.2.2
@ -1452,9 +1448,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbf);
utf8 += char(0xbf);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.2") << utf8 << str << -1;
// 5.2.3
@ -1465,9 +1459,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xb0);
utf8 += char(0x80);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.3") << utf8 << str << -1;
// 5.2.4
@ -1478,9 +1470,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbf);
utf8 += char(0xbf);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.4") << utf8 << str << -1;
// 5.2.5
@ -1491,9 +1481,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xb0);
utf8 += char(0x80);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.5") << utf8 << str << -1;
// 5.2.6
@ -1504,9 +1492,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbf);
utf8 += char(0xbf);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.6") << utf8 << str << -1;
// 5.2.7
@ -1517,9 +1503,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xb0);
utf8 += char(0x80);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.7") << utf8 << str << -1;
// 5.2.8
@ -1530,9 +1514,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xed);
utf8 += char(0xbf);
utf8 += char(0xbf);
str.clear();
str += QChar(QChar::ReplacementCharacter);
str += QChar(QChar::ReplacementCharacter);
str = fromInvalidUtf8Sequence(utf8);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.2.8") << utf8 << str << -1;
// 5.3.1 - non-character code
@ -1541,7 +1523,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xbf);
utf8 += char(0xbe);
//str = QChar(QChar::ReplacementCharacter);
str = QString::fromUtf8(utf8);
str = QChar(0xfffe);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.1") << utf8 << str << -1;
// 5.3.2 - non-character code
@ -1550,7 +1532,7 @@ void tst_QTextCodec::utf8Codec_data()
utf8 += char(0xbf);
utf8 += char(0xbf);
//str = QChar(QChar::ReplacementCharacter);
str = QString::fromUtf8(utf8);
str = QChar(0xffff);
QTest::newRow("http://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html 5.3.2") << utf8 << str << -1;
}

View File

@ -1,6 +1,6 @@
setDocumentLocator(locator={columnNumber=1, lineNumber=1})
startDocument()
startElement(namespaceURI="", localName="doc", qName="doc", atts=[])
characters(ch="<22>")
characters(ch="<22><EFBFBD><EFBFBD><EFBFBD>")
endElement(namespaceURI="", localName="doc", qName="doc")
endDocument()