psemek/libs/util/source/unicode.cpp

#include <psemek/util/unicode.hpp>

#include <codecvt>
#include <locale>

namespace psemek::util
{

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif

	// Converts a UTF-32 string into its UTF-8 encoding.
	std::string to_utf8(std::u32string const & str)
	{
		// Reserve the worst case up front: append_utf8 writes at most four
		// bytes per code point.
		std::string encoded(str.size() * 4, '\0');

		char * const begin = encoded.data();
		char * cursor = begin;
		for (char32_t const code_point : str)
			cursor = append_utf8(code_point, cursor);

		// Drop the part of the buffer that was never written to.
		encoded.resize(static_cast<std::string::size_type>(cursor - begin));
		return encoded;
	}

	// Decodes a UTF-8 string into a sequence of UTF-32 code points.
	std::u32string from_utf8(std::string const & str)
	{
		std::u32string decoded;

		// A code point occupies at least one byte, so the byte count is an
		// upper bound on the number of decoded code points.
		decoded.reserve(str.size());

		for (char32_t const code_point : utf8_range(str))
			decoded += code_point;

		return decoded;
	}

	// Writes the UTF-8 encoding of code point `c` starting at `out` and
	// returns the pointer one past the last byte written.
	//
	// Between one and four bytes are written; the caller must provide room
	// for them.
	//
	// Values above U+10FFFF are not Unicode code points and cannot be
	// represented in four UTF-8 bytes (their high bits would corrupt the
	// lead byte), so they are encoded as U+FFFD REPLACEMENT CHARACTER.
	// Surrogate values (U+D800..U+DFFF) are still encoded as plain
	// three-byte sequences, which utf8_iterator::operator * decodes back
	// unchanged.
	char * append_utf8(char32_t c, char * out)
	{
		if (c > 0x10FFFF)
			c = 0xFFFD;

		if (c < 0x80)
		{
			// 0xxxxxxx
			*out++ = static_cast<char>(c);
		}
		else if (c < 0x800)
		{
			// 110xxxxx 10xxxxxx
			*out++ = static_cast<char>(0b11000000 | (c >> 6));
			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
		}
		else if (c < 0x10000)
		{
			// 1110xxxx 10xxxxxx 10xxxxxx
			*out++ = static_cast<char>(0b11100000 | (c >> 12));
			*out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
		}
		else
		{
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			*out++ = static_cast<char>(0b11110000 | (c >> 18));
			*out++ = static_cast<char>(0b10000000 | ((c >> 12) & 0b00111111));
			*out++ = static_cast<char>(0b10000000 | ((c >> 6) & 0b00111111));
			*out++ = static_cast<char>(0b10000000 | (c & 0b00111111));
		}

		return out;
	}

#ifdef __clang__
#pragma clang diagnostic pop
#endif

	// True when `c` has its top bit clear (pattern 0xxxxxxx), i.e. it is a
	// complete one-byte sequence.
	static bool is_1_byte(char c)
	{
		return static_cast<unsigned char>(c) < 0x80;
	}

	// True when `c` matches 110xxxxx, the lead byte of a two-byte sequence.
	static bool is_2_byte(char c)
	{
		return (static_cast<unsigned char>(c) >> 5) == 0b110;
	}

	// True when `c` matches 1110xxxx, the lead byte of a three-byte sequence.
	static bool is_3_byte(char c)
	{
		return (static_cast<unsigned char>(c) >> 4) == 0b1110;
	}

	// True when `c` matches 11110xxx, the lead byte of a four-byte sequence.
	static bool is_4_byte(char c)
	{
		return (static_cast<unsigned char>(c) >> 3) == 0b11110;
	}

	// True when `c` matches 10xxxxxx, i.e. it is a continuation byte rather
	// than the first byte of a sequence.
	static bool is_middle_byte(char c)
	{
		return (static_cast<unsigned char>(c) >> 6) == 0b10;
	}

	// Checks that the byte at `ptr` is a continuation byte.
	// When PSEMEK_DEBUG is defined a failed check throws invalid_utf8;
	// otherwise the result of the check is discarded.
	static void assert_middle(char const * ptr)
	{
		bool const is_continuation = is_middle_byte(*ptr);
#ifdef PSEMEK_DEBUG
		if (!is_continuation)
			throw invalid_utf8(ptr);
#else
		(void)is_continuation;
#endif
	}

	// Pre-increment: moves to the first byte of the next encoded code point.
	//
	// The step size is taken from the current lead byte alone; the
	// continuation bytes that are skipped are not inspected here.
	//
	// Throws invalid_utf8 if the current byte cannot start a sequence
	// (a continuation byte or 0xF8..0xFF). Without this the iterator would
	// stay in place and an advance-only loop would never terminate; it also
	// matches what operator * does for the same byte.
	utf8_iterator & utf8_iterator::operator ++()
	{
		if (is_1_byte(ptr[0]))
			ptr += 1;
		else if (is_2_byte(ptr[0]))
			ptr += 2;
		else if (is_3_byte(ptr[0]))
			ptr += 3;
		else if (is_4_byte(ptr[0]))
			ptr += 4;
		else
			throw invalid_utf8(ptr);
		return *this;
	}

	// Post-increment: advances this iterator and returns its previous value.
	utf8_iterator utf8_iterator::operator ++(int)
	{
		utf8_iterator previous(*this);
		operator ++();
		return previous;
	}

	// Pre-decrement: steps back at least one byte, then keeps stepping back
	// while the current byte is a continuation byte, so the iterator ends up
	// on the nearest preceding non-continuation byte.
	utf8_iterator & utf8_iterator::operator --()
	{
		do
		{
			--ptr;
		}
		while (is_middle_byte(*ptr));

		return *this;
	}

	// Post-decrement: moves this iterator back and returns its previous value.
	utf8_iterator utf8_iterator::operator --(int)
	{
		utf8_iterator previous(*this);
		operator --();
		return previous;
	}

	// Decodes the code point whose encoding starts at the current position.
	//
	// The sequence length is chosen from the lead byte. Continuation bytes
	// go through assert_middle, which only rejects them when PSEMEK_DEBUG is
	// defined. A lead byte that matches none of the four patterns throws
	// invalid_utf8.
	char32_t utf8_iterator::operator *() const
	{
		auto const lead = ptr[0];

		// 0xxxxxxx: the byte is the code point.
		if (is_1_byte(lead))
			return lead;

		// 110xxxxx 10xxxxxx
		if (is_2_byte(lead))
		{
			assert_middle(ptr + 1);

			char32_t const high = char32_t(lead & 0b11111);
			char32_t const low = char32_t(ptr[1] & 0b111111);
			return (high << 6) | low;
		}

		// 1110xxxx 10xxxxxx 10xxxxxx
		if (is_3_byte(lead))
		{
			assert_middle(ptr + 1);
			assert_middle(ptr + 2);

			char32_t const high = char32_t(lead & 0b1111);
			char32_t const mid = char32_t(ptr[1] & 0b111111);
			char32_t const low = char32_t(ptr[2] & 0b111111);
			return (high << 12) | (mid << 6) | low;
		}

		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		if (is_4_byte(lead))
		{
			assert_middle(ptr + 1);
			assert_middle(ptr + 2);
			assert_middle(ptr + 3);

			char32_t const high = char32_t(lead & 0b111);
			char32_t const upper_mid = char32_t(ptr[1] & 0b111111);
			char32_t const lower_mid = char32_t(ptr[2] & 0b111111);
			char32_t const low = char32_t(ptr[3] & 0b111111);
			return (high << 18) | (upper_mid << 12) | (lower_mid << 6) | low;
		}

		// Not a valid first byte of a sequence.
		throw invalid_utf8(ptr);
	}

}