psemek/libs/util/source/unicode.cpp

#include <psemek/util/unicode.hpp>

#include <codecvt>
#include <locale>

namespace psemek::util
{

	// Standard-library converter between UTF-8 byte strings and UTF-32 strings,
	// used by to_utf8 / from_utf8 below.
	// NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
	// since C++17; expect deprecation warnings on newer toolchains.
	using converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

	std::string to_utf8(std::u32string const & str)
	{
		return converter{}.to_bytes(str);
	}

	std::u32string from_utf8(std::string const & str)
	{
		return converter{}.from_bytes(str);
	}

	// True when c has its top bit clear (pattern 0xxxxxxx), i.e. it is a
	// complete single-byte sequence.
	static bool is_1_byte(char c)
	{
		auto const byte = static_cast<unsigned char>(c);
		return byte < 0x80u;
	}

	// True when c matches the pattern 110xxxxx, the lead byte of a two-byte
	// sequence.
	static bool is_2_byte(char c)
	{
		auto const byte = static_cast<unsigned char>(c);
		return (byte >> 5) == 0b110;
	}

	// True when c matches the pattern 1110xxxx, the lead byte of a three-byte
	// sequence.
	static bool is_3_byte(char c)
	{
		auto const byte = static_cast<unsigned char>(c);
		return (byte >> 4) == 0b1110;
	}

	// True when c matches the pattern 11110xxx, the lead byte of a four-byte
	// sequence.
	static bool is_4_byte(char c)
	{
		auto const byte = static_cast<unsigned char>(c);
		return (byte >> 3) == 0b11110;
	}

	// True when c matches the pattern 10xxxxxx, i.e. it is a continuation
	// (non-leading) byte of a multi-byte sequence.
	static bool is_middle_byte(char c)
	{
		auto const byte = static_cast<unsigned char>(c);
		return (byte >> 6) == 0b10;
	}

	// Debug-only validation that *ptr is a UTF-8 continuation byte.
	//
	// When PSEMEK_DEBUG is defined, throws invalid_utf8(ptr) if the byte is not
	// of the form 10xxxxxx. In other builds this is a no-op; the parameter is
	// marked [[maybe_unused]] instead of being "used" through a discarded cast
	// (the previous `(void*)ptr;` cast away const and left an unused-value
	// expression rather than a proper void cast).
	static void assert_middle([[maybe_unused]] char const * ptr)
	{
#ifdef PSEMEK_DEBUG
		if (!is_middle_byte(*ptr))
			throw invalid_utf8(ptr);
#endif
	}

	// Pre-increment: advances ptr past the sequence that starts at ptr[0].
	//
	// The step (1 to 4 bytes) is chosen from the lead byte alone; the
	// continuation bytes are not inspected here.
	//
	// Throws invalid_utf8 when ptr[0] is not a valid lead byte (a continuation
	// byte or 0xF8..0xFF). Previously such a byte left ptr unchanged, so a loop
	// that only increments would never terminate; throwing matches what
	// operator * does for the same input.
	utf8_iterator & utf8_iterator::operator ++()
	{
		if (is_1_byte(ptr[0]))
			ptr += 1;
		else if (is_2_byte(ptr[0]))
			ptr += 2;
		else if (is_3_byte(ptr[0]))
			ptr += 3;
		else if (is_4_byte(ptr[0]))
			ptr += 4;
		else
			throw invalid_utf8(ptr);
		return *this;
	}

	// Post-increment: advances this iterator via the pre-increment operator and
	// returns a copy of the iterator as it was before the advance.
	utf8_iterator utf8_iterator::operator ++(int)
	{
		utf8_iterator snapshot(*this);
		this->operator ++();
		return snapshot;
	}

	// Decodes and returns the code point whose UTF-8 sequence starts at ptr.
	//
	// Bit layout: the lead byte carries the MOST significant payload bits and
	// each continuation byte contributes six further, less significant bits.
	// The lead payload is therefore shifted left by 6 bits per continuation
	// byte that follows it, and the last continuation byte is not shifted.
	// (The previous code placed the lead byte in the low bits and shifted the
	// continuation bytes up, which decoded every multi-byte sequence wrongly.)
	//
	// Throws invalid_utf8 when ptr[0] is not a valid lead byte. Continuation
	// bytes are validated only through assert_middle, which checks them only
	// when PSEMEK_DEBUG is defined. Overlong encodings and surrogate values are
	// not rejected, and up to three bytes after ptr[0] are read without any
	// bounds check.
	char32_t utf8_iterator::operator *() const
	{
		if (is_1_byte(ptr[0]))
		{
			// 0xxxxxxx: the byte is the code point.
			return ptr[0];
		}
		else if (is_2_byte(ptr[0]))
		{
			// 110aaaaa 10bbbbbb -> aaaaabbbbbb
			assert_middle(ptr + 1);
			return ((ptr[0] & 0b11111) << 6) | (ptr[1] & 0b111111);
		}
		else if (is_3_byte(ptr[0]))
		{
			// 1110aaaa 10bbbbbb 10cccccc -> aaaabbbbbbcccccc
			assert_middle(ptr + 1);
			assert_middle(ptr + 2);
			return ((ptr[0] & 0b1111) << 12) | ((ptr[1] & 0b111111) << 6) | (ptr[2] & 0b111111);
		}
		else if (is_4_byte(ptr[0]))
		{
			// 11110aaa 10bbbbbb 10cccccc 10dddddd -> aaabbbbbbccccccdddddd
			assert_middle(ptr + 1);
			assert_middle(ptr + 2);
			assert_middle(ptr + 3);
			return ((ptr[0] & 0b111) << 18) | ((ptr[1] & 0b111111) << 12) | ((ptr[2] & 0b111111) << 6) | (ptr[3] & 0b111111);
		}
		else
			throw invalid_utf8(ptr);
	}

	// Returns a fixed, statically allocated description of the error; the
	// text does not depend on the state of the exception object.
	char const * invalid_utf8::what() const noexcept
	{
		static constexpr char message[] = "Invalid UTF-8 string";
		return message;
	}

}