Add utf8 iterator & range

2021-02-23 20:13:33 +03:00 · 2021-02-23 20:13:33 +03:00 · 7520901f30
commit 7520901f30
parent 95cc39421d
2 changed files with 164 additions and 0 deletions
--- a/libs/util/include/psemek/util/unicode.hpp
+++ b/libs/util/include/psemek/util/unicode.hpp
@ -1,6 +1,8 @@
 #pragma once

 #include <string>
+#include <string_view>
+#include <iterator>
+#include <exception>
+#include <cstddef>

 namespace psemek::util
 {
@ -8,4 +10,78 @@ namespace psemek::util
 	std::string to_utf8(std::u32string const & str);
 	std::u32string from_utf8(std::string const & str);

+	// Input iterator that walks a UTF-8 encoded byte sequence one code point
+	// at a time.
+	//
+	// The iterator is a plain aggregate around a raw byte pointer: it does not
+	// own the bytes and does not know where the sequence ends, so the caller
+	// must stop at an end iterator and must not advance past it.
+	struct utf8_iterator
+	{
+		using value_type = char32_t;
+		using pointer = void;
+		// operator * yields the decoded code point by value, so the
+		// "reference" type is the value type itself rather than void.
+		using reference = char32_t;
+		using difference_type = std::ptrdiff_t;
+		using iterator_category = std::input_iterator_tag;
+
+		// First byte of the current code point. Null by default so that
+		// default-constructed iterators hold a well-defined value and
+		// compare equal to each other.
+		char const * ptr = nullptr;
+
+		// Moves to the first byte of the next code point.
+		utf8_iterator & operator ++();
+		// Post-increment: advances and returns the previous position.
+		utf8_iterator operator ++(int);
+		// Decodes the code point starting at `ptr`.
+		char32_t operator *() const;
+	};
+
+	// Two iterators are equal when they refer to the same byte address.
+	inline bool operator == (utf8_iterator const & i1, utf8_iterator const & i2)
+	{
+		char const * const lhs = i1.ptr;
+		char const * const rhs = i2.ptr;
+		return lhs == rhs;
+	}
+
+	// Two iterators differ when they refer to different byte addresses.
+	inline bool operator != (utf8_iterator const & i1, utf8_iterator const & i2)
+	{
+		char const * const lhs = i1.ptr;
+		char const * const rhs = i2.ptr;
+		return !(lhs == rhs);
+	}
+
+	// Non-owning range over UTF-8 encoded bytes. begin()/end() return
+	// utf8_iterator values, so the text can be traversed code point by code
+	// point. Only a view of the bytes is stored; the underlying storage must
+	// outlive the range.
+	struct utf8_range
+	{
+		std::string_view str;
+
+		// Views the NUL-terminated string starting at `begin`.
+		utf8_range(char const * begin)
+			: str(begin)
+		{}
+
+		// Views the bytes in [begin, end).
+		utf8_range(char const * begin, char const * end)
+			: str(begin, static_cast<std::size_t>(end - begin))
+		{}
+
+		// Views exactly the bytes covered by `str`.
+		utf8_range(std::string_view str)
+			: str(str)
+		{}
+
+		utf8_range(utf8_range const &) = default;
+
+		// Iterator positioned on the first byte of the view.
+		auto begin() const
+		{
+			char const * const first = str.data();
+			return utf8_iterator{first};
+		}
+
+		// Iterator positioned one past the last byte of the view.
+		auto end() const
+		{
+			char const * const first = str.data();
+			return utf8_iterator{first + str.size()};
+		}
+
+		// Number of iterator steps from begin() to end(), i.e. the number of
+		// code points rather than bytes. Computed by walking the whole range.
+		std::size_t size() const
+		{
+			auto const steps = std::distance(begin(), end());
+			return static_cast<std::size_t>(steps);
+		}
+	};
+
+	// Exception reporting malformed UTF-8 input. It carries the pointer it was
+	// constructed with (the pointer itself is stored, the bytes are not
+	// copied).
+	struct invalid_utf8
+		: std::exception
+	{
+		// `data` is kept as-is and later exposed through data().
+		invalid_utf8(char const * data)
+			: data_(data)
+		{}
+
+		// Fixed, human-readable description of the error.
+		char const * what() const noexcept override;
+
+		// The pointer passed to the constructor.
+		char const * data() const
+		{
+			return data_;
+		}
+
+	private:
+		char const * data_;
+	};
+
 }
--- a/libs/util/source/unicode.cpp
+++ b/libs/util/source/unicode.cpp
@ -18,4 +18,92 @@ namespace psemek::util
 		return converter{}.from_bytes(str);
 	}

+	// True when `c` is a complete single-byte sequence (bit pattern 0xxxxxxx).
+	static bool is_1_byte(char c)
+	{
+		auto const byte = static_cast<unsigned char>(c);
+		return (byte >> 7) == 0b0;
+	}
+
+	// True when `c` is the lead byte of a two-byte sequence (110xxxxx).
+	static bool is_2_byte(char c)
+	{
+		auto const byte = static_cast<unsigned char>(c);
+		return (byte >> 5) == 0b110;
+	}
+
+	// True when `c` is the lead byte of a three-byte sequence (1110xxxx).
+	static bool is_3_byte(char c)
+	{
+		auto const byte = static_cast<unsigned char>(c);
+		return (byte >> 4) == 0b1110;
+	}
+
+	// True when `c` is the lead byte of a four-byte sequence (11110xxx).
+	static bool is_4_byte(char c)
+	{
+		auto const byte = static_cast<unsigned char>(c);
+		return (byte >> 3) == 0b11110;
+	}
+
+	// True when `c` is a continuation byte of a multi-byte sequence (10xxxxxx).
+	static bool is_middle_byte(char c)
+	{
+		auto const byte = static_cast<unsigned char>(c);
+		return (byte >> 6) == 0b10;
+	}
+
+	// Debug-only check that `*ptr` is a UTF-8 continuation byte (10xxxxxx).
+	//
+	// When PSEMEK_DEBUG is defined, throws invalid_utf8 carrying `ptr` if the
+	// byte is not a continuation byte. Without PSEMEK_DEBUG the function does
+	// nothing, so malformed continuation bytes are not detected in such builds.
+	static void assert_middle(char const * ptr)
+	{
+#ifdef PSEMEK_DEBUG
+		if (!is_middle_byte(*ptr))
+			throw invalid_utf8(ptr);
+#else
+		// Explicitly discard the parameter so it does not count as unused.
+		static_cast<void>(ptr);
+#endif
+	}
+
+	// Advances to the first byte of the next code point.
+	//
+	// The step (1 to 4 bytes) is derived from the current lead byte alone;
+	// continuation bytes are not inspected and no bounds are checked.
+	//
+	// Throws invalid_utf8 (carrying the current position) when the current
+	// byte is not a valid lead byte, instead of leaving the iterator in place,
+	// which would make any loop over the sequence run forever.
+	utf8_iterator & utf8_iterator::operator ++()
+	{
+		char const lead = ptr[0];
+
+		if (is_1_byte(lead))
+			ptr += 1;
+		else if (is_2_byte(lead))
+			ptr += 2;
+		else if (is_3_byte(lead))
+			ptr += 3;
+		else if (is_4_byte(lead))
+			ptr += 4;
+		else
+			throw invalid_utf8(ptr);
+
+		return *this;
+	}
+
+	// Post-increment: advances this iterator via the pre-increment operator
+	// and returns a copy of the position it held before advancing.
+	utf8_iterator utf8_iterator::operator ++(int)
+	{
+		utf8_iterator previous = *this;
+		operator ++();
+		return previous;
+	}
+
+	// Decodes the code point whose encoding starts at `ptr`.
+	//
+	// The sequence length is taken from the lead byte. The lead byte supplies
+	// the most significant bits of the result and every continuation byte
+	// contributes six further bits, most significant first.
+	//
+	// Continuation bytes are validated through assert_middle, which only
+	// checks when PSEMEK_DEBUG is defined. Up to three bytes after `ptr` are
+	// read without bounds checks. Overlong encodings, surrogates and values
+	// above U+10FFFF are not rejected.
+	//
+	// Throws invalid_utf8 (carrying `ptr`) when the lead byte does not start a
+	// valid sequence.
+	char32_t utf8_iterator::operator *() const
+	{
+		// The low six bits of a continuation byte carry its payload.
+		auto const payload = [](char c)
+		{
+			return static_cast<char32_t>(c & 0b00111111);
+		};
+
+		if (is_1_byte(ptr[0]))
+		{
+			return static_cast<char32_t>(ptr[0]);
+		}
+		else if (is_2_byte(ptr[0]))
+		{
+			assert_middle(ptr + 1);
+			return (static_cast<char32_t>(ptr[0] & 0b00011111) << 6)
+				| payload(ptr[1]);
+		}
+		else if (is_3_byte(ptr[0]))
+		{
+			assert_middle(ptr + 1);
+			assert_middle(ptr + 2);
+			return (static_cast<char32_t>(ptr[0] & 0b00001111) << 12)
+				| (payload(ptr[1]) << 6)
+				| payload(ptr[2]);
+		}
+		else if (is_4_byte(ptr[0]))
+		{
+			assert_middle(ptr + 1);
+			assert_middle(ptr + 2);
+			assert_middle(ptr + 3);
+			return (static_cast<char32_t>(ptr[0] & 0b00000111) << 18)
+				| (payload(ptr[1]) << 12)
+				| (payload(ptr[2]) << 6)
+				| payload(ptr[3]);
+		}
+		else
+			throw invalid_utf8(ptr);
+	}
+
+	// Returns a fixed description; the stored data pointer is not included.
+	char const * invalid_utf8::what() const noexcept
+	{
+		static constexpr char message[] = "Invalid UTF-8 string";
+		return message;
+	}
+
 }