// locale/test/test_utf.cpp (C++, 274 lines, 7.6 KiB)

//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
// Distributed under the Boost Software License, Version 1.0.
// https://www.boost.org/LICENSE_1_0.txt
#include <boost/locale/utf.hpp>
#include <boost/locale/util/string.hpp>
#include "boostLocale/test/tools.hpp"
#include "boostLocale/test/unit_test.hpp"
#include <boost/detail/workaround.hpp>
#include <cstring>
using namespace boost::locale::utf;
// Builds a zero-terminated sequence holding the single 32-bit unit `a`.
// The result lives in a function-local static buffer, so every call
// overwrites the sequence returned by the previous one.
const std::uint32_t* u32_seq(std::uint32_t a)
{
    static std::uint32_t storage[2];
    std::uint32_t* out = storage;
    *out++ = a;
    *out = 0; // terminator
    return storage;
}
// Builds a zero-terminated sequence holding the single 16-bit unit `a`.
// The result lives in a function-local static buffer, so every call
// overwrites the sequence returned by the previous one.
const std::uint16_t* u16_seq(std::uint16_t a)
{
    static std::uint16_t storage[2];
    std::uint16_t* out = storage;
    *out++ = a;
    *out = 0; // terminator
    return storage;
}
// Builds a zero-terminated sequence holding the two 16-bit units `a`, `b`
// (in that order). The result lives in a function-local static buffer, so
// every call overwrites the sequence returned by the previous one.
const std::uint16_t* u16_seq(std::uint16_t a, std::uint16_t b)
{
    static std::uint16_t storage[3];
    std::uint16_t* out = storage;
    *out++ = a;
    *out++ = b;
    *out = 0; // terminator
    return storage;
}
// Builds a zero-terminated char16_t sequence holding the single unit `a`.
// The result lives in a function-local static buffer, so every call
// overwrites the sequence returned by the previous one.
const char16_t* c16_seq(std::uint16_t a)
{
    static char16_t storage[2];
    char16_t* out = storage;
    *out++ = static_cast<char16_t>(a);
    *out = 0; // terminator
    return storage;
}
// Builds a zero-terminated char32_t sequence holding the single unit `a`.
// The result lives in a function-local static buffer, so every call
// overwrites the sequence returned by the previous one.
const char32_t* c32_seq(std::uint32_t a)
{
    static char32_t storage[2];
    char32_t* out = storage;
    *out++ = static_cast<char32_t>(a);
    *out = 0; // terminator
    return storage;
}
// Decodes the zero-terminated sequence `s` with utf_traits<CharType> and
// checks that the result equals `codepoint`. `codepoint` may be a real code
// point or one of the `illegal` / `incomplete` markers.
//
// For a real code point the lead/trail classification of every unit,
// width(), trail_length() and decode_valid() are cross-checked as well.
template<typename CharType>
void test_from_utf(const CharType* const s, unsigned codepoint)
{
    using tr = utf_traits<CharType>;
    static_assert(tr::max_width == 4 / sizeof(CharType), "Wrong max_width");

    const CharType* const end = boost::locale::util::str_end(s);
    const CharType* cur = s;
    TEST_EQ(tr::decode(cur, end), codepoint);

    // Unless the input was rejected, decode() must have consumed all of it.
    if(codepoint != illegal)
        TEST(cur == end);

    if(codepoint == incomplete) {
        // Either the input is empty, or its first unit announces at least as
        // many trail units as there are units in the whole input.
        TEST(*s == 0 || tr::trail_length(*s) > 0);
        TEST_GE(tr::trail_length(*s), end - s);
    }

    // The remaining checks only make sense for a real code point.
    if(codepoint == incomplete || codepoint == illegal)
        return;

    // First unit is a lead unit, every following unit is a trail unit.
    TEST(tr::is_lead(*s));
    TEST(!tr::is_trail(*s));
    for(cur = s + 1; cur != end; ++cur) {
        TEST(tr::is_trail(*cur));
        TEST(!tr::is_lead(*cur));
    }

    // Reported widths must agree with the actual length of the sequence.
    TEST_EQ(tr::width(codepoint), end - s);
    TEST_EQ(tr::trail_length(*s), tr::width(codepoint) - 1);

    // The unchecked decoder must give the same answer and stop at the end.
    cur = s;
    TEST_EQ(tr::decode_valid(cur), codepoint);
    TEST(cur == end);
}
// Encodes `codepoint` with utf_traits<CharType> and checks that the produced
// units are exactly the zero-terminated sequence `str`.
template<typename CharType>
void test_to_utf(const CharType* str, unsigned codepoint)
{
    const CharType* const end = boost::locale::util::str_end(str);

    // Pre-fill with a non-zero value so that a write past the returned
    // position can be detected below.
    CharType buf[5] = {1, 1, 1, 1, 1};
    CharType* p = utf_traits<CharType>::encode(codepoint, buf);

    // Same number of units as the reference sequence ...
    TEST_EQ(end - str, p - buf);
    // ... the unit right after the output still holds the fill value ...
    TEST(*p);
    *p = 0;
    // ... and the units themselves match.
    TEST_EQ(memcmp(str, buf, sizeof(CharType) * (end - str)), 0);
}
template<typename CharType>
void test_valid_utf(const CharType* str, unsigned codepoint)
{
test_from_utf(str, codepoint);
test_to_utf(str, codepoint);
}
// UTF-8 test suite: valid boundary values, out-of-range values, sequences
// with the wrong length, encoded surrogates and truncated input.
void test_utf8()
{
    // A literal byte sequence together with the expected decode result.
    struct utf8_case {
        const char* bytes;
        unsigned expected;
    };

    std::cout << "- Test UTF-8\n";

    std::cout << "-- Correct" << std::endl;
    const utf8_case round_trip_cases[] = {
      {"\x7f", 0x7f},
      {"\xc2\x80", 0x80},
      {"\xdf\xbf", 0x7ff},
      {"\xe0\xa0\x80", 0x800},
      {"\xef\xbf\xbf", 0xffff},
      {"\xf0\x90\x80\x80", 0x10000},
      {"\xf4\x8f\xbf\xbf", 0x10ffff},
    };
    for(const utf8_case& c : round_trip_cases)
        test_valid_utf(c.bytes, c.expected);

    // Sanity-check the makeN helpers themselves: their output must decode
    // back to the value it was built from.
    test_from_utf(make2(0x80), 0x80);
    test_from_utf(make2(0x7ff), 0x7ff);
    test_from_utf(make3(0x800), 0x800);
    test_from_utf(make3(0xffff), 0xffff);
    test_from_utf(make4(0x10000), 0x10000);
    test_from_utf(make4(0x10ffff), 0x10ffff);

    std::cout << "-- Too big" << std::endl;
    const char* const too_big_cases[] = {
      "\xf4\x9f\x80\x80",         // 11 0000
      "\xfb\xbf\xbf\xbf",         // 3ff ffff
      "\xf8\x90\x80\x80\x80",     // 400 0000
      "\xfd\xbf\xbf\xbf\xbf\xbf", // 7fff ffff
    };
    for(const char* const bytes : too_big_cases)
        test_from_utf(bytes, illegal);

    std::cout << "-- Invalid length" << std::endl;
    test_from_utf(make2(0), illegal);
    test_from_utf(make3(0), illegal);
    test_from_utf(make4(0), illegal);
    test_from_utf(make2(0x7f), illegal);
    test_from_utf(make3(0x7f), illegal);
    test_from_utf(make4(0x7f), illegal);
    test_from_utf(make3(0x80), illegal);
    test_from_utf(make4(0x80), illegal);
    test_from_utf(make3(0x7ff), illegal);
    test_from_utf(make4(0x7ff), illegal);
    test_from_utf(make4(0x8000), illegal);
    test_from_utf(make4(0xffff), illegal);
    test_from_utf(make4(0x110000), illegal);
    test_from_utf(make4(0x1fffff), illegal);

    std::cout << "-- Invalid surrogate" << std::endl;
    test_from_utf(make3(0xd800), illegal);
    test_from_utf(make3(0xdbff), illegal);
    test_from_utf(make3(0xdc00), illegal);
    test_from_utf(make3(0xdfff), illegal);
    test_from_utf(make4(0xd800), illegal);
    test_from_utf(make4(0xdbff), illegal);
    test_from_utf(make4(0xdc00), illegal);
    test_from_utf(make4(0xdfff), illegal);

    std::cout << "-- Incomplete" << std::endl;
    // Note the second entry: a lone "\x80" is expected to be illegal, not
    // incomplete.
    const utf8_case truncated_cases[] = {
      {"", incomplete},
      {"\x80", illegal},
      {"\xc2", incomplete},
      {"\xdf", incomplete},
      {"\xe0", incomplete},
      {"\xe0\xa0", incomplete},
      {"\xef\xbf", incomplete},
      {"\xef", incomplete},
      {"\xf0\x90\x80", incomplete},
      {"\xf0\x90", incomplete},
      {"\xf0", incomplete},
      {"\xf4\x8f\xbf", incomplete},
      {"\xf4\x8f", incomplete},
      {"\xf4", incomplete},
    };
    for(const utf8_case& c : truncated_cases)
        test_from_utf(c.bytes, c.expected);
}
// UTF-16 test suite, run once on std::uint16_t sequences and once on
// char16_t sequences.
void test_utf16()
{
    // Units that are expected to be rejected when they appear on their own.
    const std::uint16_t rejected_single_units[] = {0xDFFF, 0xDC00};
    // Inputs for which the decoder is expected to ask for more data.
    const std::uint16_t unfinished_units[] = {0, 0xD800, 0xDBFF};
    // Code points that fit in a single unit.
    const std::uint16_t single_unit_values[] = {0x10, 0xffff};

    std::cout << "- Test UTF-16\n";

    std::cout << "-- Correct" << std::endl;
    for(const std::uint16_t value : single_unit_values)
        test_valid_utf(u16_seq(value), value);
    test_valid_utf(u16_seq(0xD800, 0xDC00), 0x10000);
    test_valid_utf(u16_seq(0xDBFF, 0xDFFF), 0x10FFFF);

    std::cout << "-- Invalid surrogate" << std::endl;
    for(const std::uint16_t unit : rejected_single_units)
        test_from_utf(u16_seq(unit), illegal);

    std::cout << "-- Incomplete" << std::endl;
    for(const std::uint16_t unit : unfinished_units)
        test_from_utf(u16_seq(unit), incomplete);

    std::cout << "-- Test char16_t" << std::endl;
    // GCC older than 5 gets the \x spelling of the two BMP literals.
#if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000)
    test_valid_utf(u"\x0010", 0x10);
    test_valid_utf(u"\xffff", 0xffff);
#else
    test_valid_utf(u"\u0010", 0x10);
    test_valid_utf(u"\uffff", 0xffff);
#endif
    test_valid_utf(u"\U00010000", 0x10000);
    test_valid_utf(u"\U0010FFFF", 0x10FFFF);
    for(const std::uint16_t unit : rejected_single_units)
        test_from_utf(c16_seq(unit), illegal);
}
// UTF-32 test suite, run once on std::uint32_t sequences and once on
// char32_t sequences.
void test_utf32()
{
    // Values that must round-trip unchanged.
    const std::uint32_t valid_values[] = {0x10, 0xffff, 0x10000, 0x10ffff};
    // Values expected to be rejected: the surrogate boundaries plus 0x110000.
    const std::uint32_t rejected_values[] = {0xD800, 0xDBFF, 0xDFFF, 0xDC00, 0x110000};

    std::cout << "- Test UTF-32\n";

    std::cout << "-- Correct" << std::endl;
    for(const std::uint32_t value : valid_values)
        test_valid_utf(u32_seq(value), value);

    std::cout << "-- Invalid surrogate" << std::endl;
    for(const std::uint32_t value : rejected_values)
        test_from_utf(u32_seq(value), illegal);

    std::cout << "-- Incomplete" << std::endl;
    // An empty sequence (terminator only) is reported as incomplete.
    test_from_utf(u32_seq(0), incomplete);

    std::cout << "-- Test char32_t" << std::endl;
    // GCC older than 5 gets the \x spelling of the first literal.
#if BOOST_WORKAROUND(BOOST_GCC_VERSION, < 50000)
    test_valid_utf(U"\x0010", 0x10);
#else
    test_valid_utf(U"\U00000010", 0x10);
#endif
    test_valid_utf(U"\U0000ffff", 0xffff);
    test_valid_utf(U"\U00010000", 0x10000);
    test_valid_utf(U"\U0010ffff", 0x10ffff);
    for(const std::uint32_t value : rejected_values)
        test_from_utf(c32_seq(value), illegal);
}
// Test entry point: runs the UTF-8, UTF-16 and UTF-32 suites in that order.
// The command-line arguments are not used.
void test_main(int /*argc*/, char** /*argv*/)
{
    void (*const suites[])() = {test_utf8, test_utf16, test_utf32};
    for(void (*const run_suite)() : suites)
        run_suite();
}
// boostinspect:noascii