From 02f028b0c97c3de745b279b736523896e71d6597 Mon Sep 17 00:00:00 2001
From: lisyarus
Date: Sun, 1 Nov 2020 22:02:32 +0300
Subject: [PATCH] Add parser combinator library (wip) with an example

---
Review notes (everything up to the first "diff --git" is ignored by git am):

 * The reviewed copy of this patch had lost its line breaks, indentation and
   every <...> token. #include targets, template parameter lists and explicit
   template arguments below are reconstructed from usage and must be checked
   against the repository; in particular <psemek/util/to_string.hpp>, the
   default "Tag = void", "integer<int>" in the example and the whitespace of
   the todo.md context lines are assumptions.
 * The "index" lines still carry the blob ids of the original commit.
 * Fixes relative to the original commit:
   - parse_error built what() from a moved-from message; what() now overrides
   - parser::parse reports a zero-based column on every line
   - concat_impl / one_of_impl value-initialize their error
   - alpha no longer passes a possibly negative char to std::isalpha
   - end / peek are inline variables like the other primitives
   - integer<T> / real<T> accumulate their digits in T instead of int

 examples/parser.cpp                           | 101 +++++++
 libs/parser/CMakeLists.txt                    |   6 +
 .../include/psemek/parser/combinators.hpp     | 303 ++++++++++++++++++
 libs/parser/include/psemek/parser/parser.hpp  | 150 +++++++++
 .../include/psemek/parser/primitives.hpp      | 134 ++++++++
 todo.md                                       |   3 +
 6 files changed, 697 insertions(+)
 create mode 100644 examples/parser.cpp
 create mode 100644 libs/parser/CMakeLists.txt
 create mode 100644 libs/parser/include/psemek/parser/combinators.hpp
 create mode 100644 libs/parser/include/psemek/parser/parser.hpp
 create mode 100644 libs/parser/include/psemek/parser/primitives.hpp

diff --git a/examples/parser.cpp b/examples/parser.cpp
new file mode 100644
index 00000000..2a84a4b6
--- /dev/null
+++ b/examples/parser.cpp
@@ -0,0 +1,101 @@
+#include <psemek/parser/combinators.hpp>
+#include <psemek/parser/primitives.hpp>
+
+#include <cstddef>
+#include <iostream>
+#include <optional>
+#include <tuple>
+#include <utility>
+#include <variant>
+#include <vector>
+
+// Stream printers for the value types the parsers in this example produce.
+// Printers are declared before the templates that call them so that ordinary
+// (non-ADL) lookup can find them.
+
+template <typename Stream>
+Stream & operator << (Stream & s, std::monostate)
+{
+	return s << "()";
+}
+
+template <typename Stream>
+Stream & operator << (Stream & s, psemek::parser::end_token)
+{
+	return s << "(end)";
+}
+
+template <typename Stream>
+Stream & operator << (Stream & s, psemek::parser::ws_token)
+{
+	return s << "(ws)";
+}
+
+template <typename Stream>
+Stream & operator << (Stream & s, psemek::parser::newline_token)
+{
+	return s << "(newline)";
+}
+
+template <typename Stream, typename T>
+Stream & operator << (Stream & s, std::optional<T> const & x)
+{
+	if (x)
+		return s << *x;
+	return s << "(none)";
+}
+
+template <typename Stream, typename T>
+Stream & operator << (Stream & s, std::vector<T> const & v)
+{
+	s << '[';
+	for (std::size_t i = 0; i < v.size(); ++i)
+	{
+		if (i > 0) s << ", ";
+		s << v[i];
+	}
+	return s << ']';
+}
+
+template <typename Stream, typename T, std::size_t ... I>
+void print_tuple (Stream & s, T const & t, std::index_sequence<I...>)
+{
+	((s << (I == 0 ? "" : ", ") << std::get<I>(t)), ...);
+}
+
+template <typename Stream, typename ... Ts>
+Stream & operator << (Stream & s, std::tuple<Ts...> const & t)
+{
+	s << '(';
+	print_tuple(s, t, std::make_index_sequence<sizeof...(Ts)>{});
+	return s << ')';
+}
+
+template <typename Stream, typename ... Ts>
+Stream & operator << (Stream & s, std::variant<Ts...> const & v)
+{
+	auto visitor = [&s](auto const & x){ s << x; };
+	std::visit(visitor, v);
+	return s;
+}
+
+int main()
+{
+	using namespace psemek::parser;
+
+	// Parses "<int> <+|-> <int>" and keeps the two operands and the operator.
+	auto const p = map(concat(integer<int>, ws, one_of(ch('+'), ch('-')), ws, integer<int>), [](auto const & t){
+		auto id = [](auto x){ return x; };
+		return std::make_tuple(std::get<0>(t), std::visit(id, std::get<2>(t)), std::get<4>(t));
+	});
+
+	try
+	{
+		std::cout << p.parse("45 + 67") << '\n';
+	}
+	catch (parse_error const & e)
+	{
+		std::cerr << e.what() << '\n';
+		return 1;
+	}
+}
diff --git a/libs/parser/CMakeLists.txt b/libs/parser/CMakeLists.txt
new file mode 100644
index 00000000..55e9e814
--- /dev/null
+++ b/libs/parser/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB_RECURSE PSEMEK_PARSER_HEADERS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "include/*.hpp")
+file(GLOB_RECURSE PSEMEK_PARSER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "source/*.cpp")
+
+add_library(psemek-parser ${PSEMEK_PARSER_HEADERS} ${PSEMEK_PARSER_SOURCES})
+target_include_directories(psemek-parser PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
+target_link_libraries(psemek-parser PUBLIC psemek-util)
diff --git a/libs/parser/include/psemek/parser/combinators.hpp b/libs/parser/include/psemek/parser/combinators.hpp
new file mode 100644
index 00000000..0cacc527
--- /dev/null
+++ b/libs/parser/include/psemek/parser/combinators.hpp
@@ -0,0 +1,303 @@
+#pragma once
+
+#include <psemek/parser/parser.hpp>
+
+#include <cstddef>
+#include <optional>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <variant>
+#include <vector>
+
+// Parser combinators. A parser that fails is expected to leave the buffer
+// position untouched; combinators that consumed input before failing rewind
+// it themselves.
+
+namespace psemek::parser
+{
+
+	namespace detail
+	{
+
+		struct concat_tag{};
+
+		// Recursion end: every parser of the sequence has succeeded.
+		template <std::size_t I, typename Buffer, typename Res>
+		bool concat_helper(std::integral_constant<std::size_t, I>, Buffer &, Res &, error &)
+		{
+			return true;
+		}
+
+		// Applies parser number I and stores its value in slot I of res, then
+		// continues with the remaining parsers. On failure stores the error in e.
+		template <std::size_t I, typename Buffer, typename Res, typename P, typename ... Ps>
+		bool concat_helper(std::integral_constant<std::size_t, I>, Buffer & buf, Res & res, error & e, P const & p, Ps const & ... ps)
+		{
+			auto r = p.apply(buf);
+			if (r.index() == 1)
+			{
+				e = std::get<1>(r);
+				return false;
+			}
+			std::get<I>(res) = std::move(std::get<0>(r));
+			return concat_helper(std::integral_constant<std::size_t, I + 1>{}, buf, res, e, ps...);
+		}
+
+		// Runs all parsers in order; rewinds the buffer if any of them fails.
+		template <typename Buffer, typename ... Ps>
+		auto concat_impl(Buffer & buf, Ps const & ... ps)
+			-> result<std::tuple<decltype(result_type(ps, buf))...>>
+		{
+			std::tuple<decltype(result_type(ps, buf))...> res;
+			error e{};
+			auto it = buf.it;
+			if (!concat_helper(std::integral_constant<std::size_t, 0>{}, buf, res, e, ps...))
+			{
+				buf.it = it;
+				return e;
+			}
+			return res;
+		}
+
+		struct one_of_tag{};
+
+		// Recursion end: no alternative matched.
+		template <std::size_t I, typename Buffer, typename Res>
+		bool one_of_helper(std::integral_constant<std::size_t, I>, Buffer &, Res &, error &)
+		{
+			return false;
+		}
+
+		// Tries alternative number I; on success stores its value as variant
+		// alternative I of res. Otherwise records the error and tries the next one.
+		template <std::size_t I, typename Buffer, typename Res, typename P, typename ... Ps>
+		bool one_of_helper(std::integral_constant<std::size_t, I>, Buffer & buf, Res & res, error & e, P const & p, Ps const & ... ps)
+		{
+			auto r = p.apply(buf);
+			if (r.index() == 0)
+			{
+				res.template emplace<I>(std::move(std::get<0>(r)));
+				return true;
+			}
+			e = std::get<1>(r);
+			return one_of_helper(std::integral_constant<std::size_t, I + 1>{}, buf, res, e, ps...);
+		}
+
+		// Returns the value of the first matching alternative, or the error of
+		// the last alternative if none matched.
+		template <typename Buffer, typename ... Ps>
+		auto one_of_impl(Buffer & buf, Ps const & ... ps)
+			-> result<std::variant<decltype(result_type(ps, buf))...>>
+		{
+			std::variant<decltype(result_type(ps, buf))...> res;
+			error e{};
+			auto it = buf.it;
+			if (!one_of_helper(std::integral_constant<std::size_t, 0>{}, buf, res, e, ps...))
+			{
+				buf.it = it;
+				return e;
+			}
+			return res;
+		}
+
+	}
+
+	// Applies f to the value produced by p; errors of p pass through unchanged.
+	template <typename P, typename F>
+	auto map(P && p, F && f)
+	{
+		return make_parser([p = std::forward<P>(p), f = std::forward<F>(f)](auto & buffer)
+			-> result<decltype(f(detail::result_type(p, buffer)))>
+		{
+			auto res = p.apply(buffer);
+			if (res.index() == 1)
+				return std::get<1>(res);
+			return f(std::get<0>(res));
+		});
+	}
+
+	// Succeeds with p's value only if the predicate f accepts it; otherwise
+	// rewinds and fails with message ("guard failed" when none is given).
+	template <typename P, typename F>
+	auto guard(P && p, F && f, std::string message = {})
+	{
+		if (message.empty())
+			message = "guard failed";
+
+		return make_parser([p = std::forward<P>(p), f = std::forward<F>(f), message = std::move(message)](auto & buffer)
+			-> result<decltype(detail::result_type(p, buffer))>
+		{
+			auto it = buffer.it;
+			auto res = p.apply(buffer);
+			if (res.index() == 1)
+				return std::get<1>(res);
+			if (!f(std::get<0>(res)))
+			{
+				buffer.it = it;
+				// The error is reported at the rewound position and borrows the
+				// message text from this parser object.
+				return error{buffer.offset(), message.data()};
+			}
+			return std::get<0>(res);
+		});
+	}
+
+	// Optional parser: yields std::nullopt instead of failing. Never fails.
+	template <typename P>
+	auto maybe(P && p)
+	{
+		return make_parser([p = std::forward<P>(p)](auto & buffer)
+			-> result<std::optional<decltype(detail::result_type(p, buffer))>>
+		{
+			auto res = p.apply(buffer);
+			if (res.index() == 1)
+				return std::nullopt;
+			return std::get<0>(res);
+		});
+	}
+
+	// Repeats p between min_count and max_count times (no upper bound when
+	// max_count is empty) and collects the values. Fails, rewinding the buffer,
+	// when fewer than min_count repetitions matched. Throws grammar_error if p
+	// succeeds without consuming input, since the loop could never finish.
+	template <typename P>
+	auto from_to(P && p, std::size_t min_count, std::optional<std::size_t> max_count, std::string message = {})
+	{
+		if (message.empty())
+			message = "(unknown)";
+
+		auto msg = "expected at least " + std::to_string(min_count) + " " + message;
+
+		return make_parser([p = std::forward<P>(p), min_count, max_count, msg = std::move(msg)](auto & buffer)
+			-> result<std::vector<decltype(detail::result_type(p, buffer))>>
+		{
+			auto it = buffer.it;
+
+			std::vector<decltype(detail::result_type(p, buffer))> res;
+			while (true)
+			{
+				if (max_count && res.size() >= *max_count) break;
+
+				auto pos = buffer.it;
+				auto r = p.apply(buffer);
+				if (r.index() == 1) break;
+				if (buffer.it == pos)
+					throw grammar_error("infinite loop");
+				res.push_back(std::move(std::get<0>(r)));
+			}
+
+			if (res.size() < min_count)
+			{
+				buffer.it = it;
+				return error{buffer.offset(), msg.data()};
+			}
+
+			return res;
+		});
+	}
+
+	// Zero or more repetitions of p; never fails.
+	template <typename P>
+	auto many(P && p)
+	{
+		return from_to(std::forward<P>(p), 0, std::nullopt);
+	}
+
+	// At least count repetitions of p.
+	template <typename P>
+	auto at_least(P && p, std::size_t count, std::string message = {})
+	{
+		return from_to(std::forward<P>(p), count, std::nullopt, std::move(message));
+	}
+
+	// Exactly count repetitions of p (stops after count matches).
+	template <typename P>
+	auto exactly(P && p, std::size_t count, std::string message = {})
+	{
+		return from_to(std::forward<P>(p), count, count, std::move(message));
+	}
+
+	// Sequence: succeeds with a tuple of all values if every parser succeeds.
+	template <typename ... Ps>
+	auto concat(Ps && ... ps)
+	{
+		return make_parser([... ps = std::forward<Ps>(ps)](auto & buffer)
+			-> result<std::tuple<decltype(detail::result_type(ps, buffer))...>>
+		{
+			return detail::concat_impl(buffer, ps...);
+		}, detail::concat_tag{});
+	}
+
+	// Ordered choice: succeeds with a variant holding the first matching value.
+	template <typename ... Ps>
+	auto one_of(Ps && ... ps)
+	{
+		return make_parser([... ps = std::forward<Ps>(ps)](auto & buffer)
+			-> result<std::variant<decltype(detail::result_type(ps, buffer))...>>
+		{
+			return detail::one_of_impl(buffer, ps...);
+		}, detail::one_of_tag{});
+	}
+
+	// Left fold with an explicit initial value: applies p until it fails,
+	// combining values as accum = f(accum, value). Never fails; yields a when p
+	// does not match at all. Throws grammar_error if p matches empty input.
+	template <typename P, typename F, typename A>
+	auto fold(P && p, F && f, A && a)
+	{
+		return make_parser([p = std::forward<P>(p), f = std::forward<F>(f), a = std::forward<A>(a)](auto & buffer)
+			-> result<std::remove_cvref_t<A>>
+		{
+			auto accum = a;
+			while (true)
+			{
+				auto pos = buffer.it;
+				auto res = p.apply(buffer);
+				if (res.index() == 1)
+					return accum;
+				if (pos == buffer.it)
+					throw grammar_error("infinite loop");
+
+				accum = f(accum, std::get<0>(res));
+			}
+		});
+	}
+
+	// Left fold seeded with the first value of p: fails if p does not match
+	// at least once, then keeps combining while p matches.
+	template <typename P, typename F>
+	auto fold(P && p, F && f)
+	{
+		return make_parser([p = std::forward<P>(p), f = std::forward<F>(f)](auto & buffer)
+			-> result<decltype(detail::result_type(p, buffer))>
+		{
+			auto res0 = p.apply(buffer);
+			if (res0.index() == 1)
+				return std::get<1>(res0);
+			auto accum = std::move(std::get<0>(res0));
+			while (true)
+			{
+				auto pos = buffer.it;
+				auto res = p.apply(buffer);
+				if (res.index() == 1)
+					return accum;
+				if (pos == buffer.it)
+					throw grammar_error("infinite loop");
+
+				accum = f(std::move(accum), std::move(std::get<0>(res)));
+			}
+		});
+	}
+
+	struct skip_token{};
+
+	// Consumes input while p matches and discards the values.
+	template <typename P>
+	auto skip_while(P && p)
+	{
+		return fold(std::forward<P>(p), [](auto const &, auto const &){ return skip_token{}; }, skip_token{});
+	}
+
+}
diff --git a/libs/parser/include/psemek/parser/parser.hpp b/libs/parser/include/psemek/parser/parser.hpp
new file mode 100644
index 00000000..8e9c62bf
--- /dev/null
+++ b/libs/parser/include/psemek/parser/parser.hpp
@@ -0,0 +1,150 @@
+#pragma once
+
+#include <psemek/util/to_string.hpp>
+
+#include <cstddef>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <variant>
+
+namespace psemek::parser
+{
+
+	// Thrown by parser::parse when the input does not match. line() and
+	// character() are zero-based.
+	struct parse_error
+		: std::runtime_error
+	{
+		parse_error(std::string message, std::size_t line, std::size_t character)
+			// message is copied, not moved: what_ below still needs it.
+			: std::runtime_error(message)
+			, line_{line}
+			, character_{character}
+			, what_{util::to_string(message, " at ", line, "#", character)}
+		{}
+
+		std::size_t line() const { return line_; }
+		std::size_t character() const { return character_; }
+
+		const char * what() const noexcept override { return what_.data(); }
+
+	private:
+		std::size_t line_;
+		std::size_t character_;
+		std::string what_;
+	};
+
+	// Thrown when a grammar is malformed, e.g. a repetition of a parser that
+	// can succeed without consuming input.
+	struct grammar_error
+		: std::runtime_error
+	{
+		grammar_error(std::string message)
+			: std::runtime_error(std::move(message))
+		{}
+	};
+
+	// Parse failure: position in the input plus a message. The message is
+	// borrowed, not owned; it points at a literal or into the parser object.
+	struct error
+	{
+		std::ptrdiff_t offset;
+		char const * message;
+	};
+
+	// Outcome of a parser: alternative 0 is the value, alternative 1 the error.
+	template <typename T>
+	using result = std::variant<T, error>;
+
+	namespace detail
+	{
+
+		// Declaration only, for use inside decltype: the value type of p.
+		template <typename P, typename B>
+		auto result_type(P const & p, B & b) -> std::remove_cvref_t<decltype(std::get<0>(p.apply(b)))>;
+
+		// Cursor over the input range [begin, end); it is the current position.
+		template <typename Iterator>
+		struct buffer
+		{
+			using char_type = std::remove_cvref_t<decltype(*std::declval<Iterator>())>;
+
+			buffer(Iterator begin, Iterator end)
+				: begin{begin}
+				, end{end}
+				, it{begin}
+			{}
+
+			Iterator begin;
+			Iterator end;
+			Iterator it;
+
+			std::ptrdiff_t offset() const { return it - begin; }
+		};
+
+		// Wraps a callable taking a buffer and returning result<T>. Tag lets
+		// combinators mark the parsers they create (e.g. concat_tag).
+		template <typename P, typename Tag = void>
+		struct parser
+		{
+			using tag = Tag;
+
+			P p;
+
+			template <typename Buffer>
+			auto apply(Buffer & buf) const
+			{
+				return p(buf);
+			}
+
+			// Runs the parser over text and returns the parsed value. Trailing
+			// input is not an error unless the grammar ends with the end parser.
+			// Throws parse_error carrying the zero-based line and column.
+			auto parse(std::string_view text) const
+			{
+				buffer<char const *> buf{text.data(), text.data() + text.size()};
+				auto res = apply(buf);
+				if (res.index() == 1)
+				{
+					auto const & e = std::get<1>(res);
+					auto const offset = static_cast<std::size_t>(e.offset);
+
+					// Count the newlines before the error and remember where the
+					// line containing it starts.
+					std::size_t line = 0;
+					std::size_t line_start = 0;
+					for (std::size_t i = 0; i < offset; ++i)
+					{
+						if (buf.begin[i] == '\n')
+						{
+							++line;
+							line_start = i + 1;
+						}
+					}
+
+					throw parse_error(e.message, line, offset - line_start);
+				}
+				return std::get<0>(res);
+			}
+		};
+
+	}
+
+	// Wraps a callable into a parser without a tag.
+	template <typename P>
+	auto make_parser(P && p)
+	{
+		return detail::parser<std::remove_cvref_t<P>>{std::forward<P>(p)};
+	}
+
+	// Wraps a callable into a parser marked with Tag.
+	template <typename P, typename Tag>
+	auto make_parser(P && p, Tag)
+	{
+		return detail::parser<std::remove_cvref_t<P>, Tag>{std::forward<P>(p)};
+	}
+
+}
diff --git a/libs/parser/include/psemek/parser/primitives.hpp b/libs/parser/include/psemek/parser/primitives.hpp
new file mode 100644
index 00000000..d5634b89
--- /dev/null
+++ b/libs/parser/include/psemek/parser/primitives.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include <psemek/parser/combinators.hpp>
+#include <psemek/parser/parser.hpp>
+
+#include <cctype>
+#include <cstddef>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+namespace psemek::parser
+{
+
+	struct end_token{};
+
+	// Succeeds only at the end of the input.
+	inline const auto end = make_parser([](auto & buffer)
+		-> result<end_token>
+	{
+		if (buffer.it == buffer.end)
+			return end_token{};
+		return error{buffer.offset(), "unexpected trailing data"};
+	});
+
+	// Always succeeds with a copy of t and consumes nothing.
+	template <typename T>
+	auto pure(T && t)
+	{
+		return make_parser([t = std::forward<T>(t)](auto &)
+			-> result<std::remove_cvref_t<T>>
+		{
+			return t;
+		});
+	}
+
+	// Reads one character. NOTE: despite the name this consumes the character;
+	// ch() and the guard-based parsers below depend on that.
+	inline const auto peek = make_parser([](auto & buffer)
+		-> result<typename std::remove_cvref_t<decltype(buffer)>::char_type>
+	{
+		if (buffer.it == buffer.end)
+			return error{buffer.offset(), "unexpected end"};
+		return *buffer.it++;
+	});
+
+	// Matches exactly the character x.
+	inline auto ch(char x)
+	{
+		return guard(peek, [x](auto const & y){ return x == y; }, std::string("expected \"") + std::string(1, x) + std::string("\""));
+	}
+
+	// Matches the literal string s and yields it; consumes nothing on failure.
+	inline auto str(std::string s)
+	{
+		auto msg = std::string("expected \"") + s + std::string("\"");
+		return make_parser([s = std::move(s), msg = std::move(msg)](auto & buffer)
+			-> result<std::string>
+		{
+			auto it = buffer.it;
+			std::size_t i = 0;
+
+			while (it != buffer.end && i < s.size())
+			{
+				if (*it != s[i])
+					return error{buffer.offset(), msg.data()};
+
+				++it;
+				++i;
+			}
+
+			if (i < s.size())
+				return error{buffer.offset(), "unexpected end"};
+
+			buffer.it = it;
+			return s;
+		});
+	}
+
+	struct ws_token{};
+
+	// Zero or more spaces or tabs; never fails.
+	inline auto ws = map(many(one_of(ch(' '), ch('\t'))), [](auto const &){ return ws_token{}; });
+
+	struct newline_token{};
+
+	inline auto newline = map(ch('\n'), [](auto const &){ return newline_token{}; });
+
+	// The cast keeps std::isalpha defined for negative char values.
+	inline auto alpha = guard(peek, [](auto c){ return std::isalpha(static_cast<unsigned char>(c)) != 0; });
+
+	// A decimal digit, yielded as its numeric value 0..9.
+	inline auto digit = map(guard(peek, [](auto c){ return '0' <= c && c <= '9'; }), [](char c){ return c - '0'; });
+
+	// TODO: overflow check for integers
+
+	// Optionally signed decimal integer. Digits are converted to T before
+	// folding so that the value is accumulated in T.
+	template <typename T>
+	inline auto integer = map(
+		concat(
+			maybe(ch('-')),
+			fold(map(digit, [](int d){ return static_cast<T>(d); }), [](T s, T d){ return static_cast<T>(10 * s + d); })
+		),
+		[](auto const & t){ return std::get<1>(t) * (std::get<0>(t) ? -1 : 1); }
+	);
+
+	// Optionally signed decimal number with an optional fractional part.
+	template <typename T>
+	inline auto real = map(
+		concat(
+			maybe(ch('-')),
+			// Digits are converted to T first so that the fold accumulates in T
+			// rather than in int.
+			fold(map(digit, [](int d){ return static_cast<T>(d); }), [](T s, T d){ return 10 * s + d; }),
+			maybe(
+				concat(
+					ch('.'),
+					// first = fraction so far, second = weight of the last digit.
+					fold(digit, [](auto p, int d){
+						return std::make_pair(p.first + d * p.second / 10, p.second / 10);
+					}, std::make_pair(T{0}, T{1}))
+				)
+			)
+		),
+		[](auto const & t){
+			T sign = (std::get<0>(t) ? -1 : 1);
+			T i = std::get<1>(t);
+			T f = (std::get<2>(t) ? std::get<1>(*std::get<2>(t)).first : 0);
+			return sign * (i + f);
+		}
+	);
+
+}
diff --git a/todo.md b/todo.md
index 5bc466b5..6ab3e7e1 100644
--- a/todo.md
+++ b/todo.md
@@ -15,3 +15,6 @@
   * transparent objects
   * get rid of diffuse materials
   * find a better specular model (Blinn-Phong seems to over-shine the specular highlight)
+* parser
+  * overflow checks for number parsers
+  * recursive parsers