lib: Add mlibc

Signed-off-by: Ian Moffett <ian@osmora.org>
author: Ian Moffett <ian@osmora.org> 2024-03-07 17:28:00 -0500
committer: Ian Moffett <ian@osmora.org> 2024-03-07 17:28:32 -0500
commit: bd5969fc876a10b18613302db7087ef3c40f18e1 (patch)
tree: 7c2b8619afe902abf99570df2873fbdf40a4d1a1 /lib/mlibc/options/internal/generic/charcode.cpp
parent: a95b38b1b92b172e6cc4e8e56a88a30cc65907b0 (diff)
1 files changed, 244 insertions, 0 deletions
diff --git a/lib/mlibc/options/internal/generic/charcode.cpp b/lib/mlibc/options/internal/generic/charcode.cpp
new file mode 100644
index 0000000..e09d5cd
--- /dev/null
+++ b/lib/mlibc/options/internal/generic/charcode.cpp
@@ -0,0 +1,244 @@
+
+#include <bits/ensure.h>
+#include <frg/string.hpp>
+#include <mlibc/charcode.hpp>
+#include <mlibc/debug.hpp>
+
+namespace mlibc {
+
+// Character code description for UTF-8.
+//
+// decode_state is an incremental decoder that consumes one code unit (byte) per call.
+// encode_state currently only handles code points that fit into a single code unit.
+struct utf8_charcode {
+	static constexpr bool preserves_7bit_units = true;
+	static constexpr bool has_shift_states = false;
+
+	struct decode_state {
+		decode_state()
+		: _progress{0}, _cpoint{0} { }
+
+		// Number of continuation units that are still expected for the current code point.
+		// Zero means that cpoint() holds a completely decoded code point.
+		auto progress() const { return _progress; }
+		// Code point bits accumulated so far; only complete if progress() == 0.
+		auto cpoint() const { return _cpoint; }
+
+		// Consumes the single code unit at seq.it.
+		// On success, advances seq.it by one unit and returns charcode_error::null.
+		// If the unit cannot appear at this position of a UTF-8 sequence, returns
+		// charcode_error::illegal_input and leaves both seq.it and the decoder state untouched.
+		charcode_error operator() (code_seq<const char> &seq) {
+			auto uc = static_cast<unsigned char>(*seq.it);
+			if(!_progress) {
+				if(!(uc & 0b1000'0000)) {
+					// ASCII-compatible.
+					_cpoint = uc;
+				}else if((uc & 0b1110'0000) == 0b1100'0000) {
+					// Lead unit of a two-unit sequence: 5 payload bits.
+					_cpoint = uc & 0b1'1111;
+					_progress = 1;
+				}else if((uc & 0b1111'0000) == 0b1110'0000) {
+					// Lead unit of a three-unit sequence: 4 payload bits.
+					_cpoint = uc & 0b1111;
+					_progress = 2;
+				}else if((uc & 0b1111'1000) == 0b1111'0000) {
+					// Lead unit of a four-unit sequence: 3 payload bits.
+					_cpoint = uc & 0b111;
+					_progress = 3;
+				}else{
+					// If the highest two bits are 0b10, this is the second (or later) unit.
+					// Units with highest five bits = 0b11111 do not occur in valid UTF-8.
+					__ensure((uc & 0b1100'0000) == 0b1000'0000
+							|| (uc & 0b1111'1000) == 0b1111'1000);
+					return charcode_error::illegal_input;
+				}
+			}else{
+				// Inside a multi-unit sequence, only continuation units (0b10xx'xxxx) are valid.
+				// Malformed input depends on the data we are given and is not a libc bug,
+				// so report it to the caller (like for invalid lead units) instead of asserting.
+				if((uc & 0b1100'0000) != 0b1000'0000)
+					return charcode_error::illegal_input;
+				// Each continuation unit contributes its low 6 bits.
+				_cpoint = (_cpoint << 6) | (uc & 0x3F);
+				--_progress;
+			}
+			++seq.it;
+			return charcode_error::null;
+		}
+
+	private:
+		int _progress;
+		codepoint _cpoint;
+	};
+
+	struct encode_state {
+		// Encodes a single character from wseq + the current state and stores it in nseq.
+		// Advances both wseq.it and nseq.it by one element and returns charcode_error::null.
+		// Only code points <= 0x7F are supported for now; larger values trip the __ensure below.
+		// TODO: Convert decode_state to the same strategy.
+		charcode_error operator() (code_seq<char> &nseq, code_seq<const codepoint> &wseq) {
+			auto wc = *wseq.it;
+			__ensure(wc <= 0x7F && "utf8_charcode cannot encode multibyte chars yet");
+			*nseq.it = wc;
+			++wseq.it;
+			++nseq.it;
+			return charcode_error::null;
+		}
+	};
+};
+
+// Out-of-line definition of the (defaulted) polymorphic_charcode destructor.
+polymorphic_charcode::~polymorphic_charcode() = default;
+
+// Adapts a statically typed character code G (e.g., utf8_charcode) to the virtual
+// polymorphic_charcode interface.
+//
+// For *decoding, this class assumes that:
+// - G::decode_state has members progress() and cpoint().
+// - G::decode_state::progress() >= 0 at all times.
+//   TODO: This will be needed on platforms like Windows, where wchar_t is UTF-16.
+//   TODO: There, we can use negative __mlibc_mbstate::progress to represent encoding to UTF-16.
+// - If G::decode_state::progress() == 0, the code point (given by cpoint())
+//   was decoded successfully.
+template<typename G>
+struct polymorphic_charcode_adapter : polymorphic_charcode {
+	polymorphic_charcode_adapter()
+	: polymorphic_charcode{G::preserves_7bit_units, G::has_shift_states} { }
+
+	// Decodes code units from nseq into code points that are stored to wseq.
+	// nseq.it is only advanced past completely decoded code points.
+	// Decoding stops (with charcode_error::null) at a null code point; the null itself is
+	// consumed from nseq but not stored to wseq.
+	// Returns charcode_error::input_underflow if nseq ends in the middle of a code point,
+	// or the decoder's error if a code unit is rejected.
+	charcode_error decode(code_seq<const char> &nseq, code_seq<codepoint> &wseq,
+			__mlibc_mbstate &st) override {
+		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+		return _decode_units(nseq,
+				[&] { return static_cast<bool>(wseq); },
+				[&] (auto cp) {
+					*wseq.it = cp;
+					++wseq.it;
+				});
+	}
+
+	// Same as decode() but stores the decoded code points as wchar_t.
+	charcode_error decode_wtranscode(code_seq<const char> &nseq, code_seq<wchar_t> &wseq,
+			__mlibc_mbstate &st) override {
+		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+		return _decode_units(nseq,
+				[&] { return static_cast<bool>(wseq); },
+				[&] (auto cp) {
+					*wseq.it = cp;
+					++wseq.it;
+				});
+	}
+
+	// Same as decode_wtranscode() but only counts the decoded (non-null) code points in *n
+	// instead of storing them. As there is no output buffer, only the end of nseq,
+	// a null code point or an error stops the loop.
+	charcode_error decode_wtranscode_length(code_seq<const char> &nseq, size_t *n,
+			__mlibc_mbstate &st) override {
+		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+		*n = 0;
+		return _decode_units(nseq,
+				[] { return true; },
+				[n] (auto) { ++(*n); });
+	}
+
+	// Encodes wide characters from wseq into code units that are stored to nseq.
+	// Stops (with charcode_error::null) at a null wide character, which is neither
+	// consumed nor encoded. nseq.it and wseq.it are only advanced for characters
+	// that were encoded without error.
+	charcode_error encode_wtranscode(code_seq<char> &nseq, code_seq<const wchar_t> &wseq,
+			__mlibc_mbstate &st) override {
+		__ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().
+
+		code_seq<char> encode_nseq = nseq;
+		typename G::encode_state es;
+
+		while(encode_nseq && wseq) {
+			codepoint cp = *wseq.it;
+			if(!cp)
+				return charcode_error::null;
+
+			// The encoder gets a one-element sequence, i.e., exactly this code point.
+			code_seq<const codepoint> cps{&cp, &cp + 1};
+			if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
+				// Retry the same input character; nothing is committed yet.
+				continue;
+			}else if(e != charcode_error::null) {
+				return e;
+			}
+			__ensure(cps.it == cps.end);
+			++wseq.it;
+
+			// "Commit" produced code units (as there was no encode error).
+			nseq.it = encode_nseq.it;
+		}
+
+		// Uncommitted units can only remain if the loop ended after a "dirty" retry.
+		if(encode_nseq.it != nseq.it)
+			return charcode_error::output_overflow;
+		return charcode_error::null;
+	}
+
+	// Same as encode_wtranscode() but encodes into a scratch buffer and only counts in *n.
+	// NOTE(review): *n is incremented once per encoded wide character, not once per
+	// produced code unit. Both counts agree as long as the encoder emits a single unit
+	// per character; confirm which one callers expect before adding multi-unit encoding.
+	charcode_error encode_wtranscode_length(code_seq<const wchar_t> &wseq, size_t *n,
+			__mlibc_mbstate &st) override {
+		__ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().
+
+		typename G::encode_state es;
+
+		*n = 0;
+		while(wseq) {
+			// Scratch output; the encoded units themselves are discarded.
+			char temp[4];
+			code_seq<char> encode_nseq{temp, temp + 4};
+			codepoint cp = *wseq.it;
+			if(!cp)
+				return charcode_error::null;
+			// Encode the next code point.
+			code_seq<const codepoint> cps{&cp, &cp + 1};
+			if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
+				continue;
+			}else if(e != charcode_error::null) {
+				return e;
+			}
+
+			++(*n);
+			++wseq.it;
+		}
+
+		return charcode_error::null;
+	}
+
+private:
+	// Shared loop of the decode*() functions.
+	// has_room() is queried before each code unit is consumed and tells whether another
+	// code point can still be accepted; emit(cp) receives each decoded non-null code point.
+	template<typename HasRoom, typename Emit>
+	charcode_error _decode_units(code_seq<const char> &nseq, HasRoom has_room, Emit emit) {
+		// Work on a copy such that nseq only advances past complete code points.
+		code_seq<const char> pending = nseq;
+		typename G::decode_state ds;
+
+		while(pending && has_room()) {
+			// Consume the next code unit.
+			if(auto e = ds(pending); e != charcode_error::null)
+				return e;
+
+			// More units are required to complete this code point.
+			if(ds.progress())
+				continue;
+
+			// "Commit" consumed code units (as there was no decode error).
+			nseq.it = pending.it;
+			auto cp = ds.cpoint();
+			if(!cp) // Stop on null characters.
+				return charcode_error::null;
+			// Produce a new code point.
+			emit(cp);
+		}
+
+		if(ds.progress())
+			return charcode_error::input_underflow;
+		return charcode_error::null;
+	}
+};
+
+// Returns the character code that is used for multibyte (char) strings.
+// The adapter is a function-local static, so every call yields the same object.
+polymorphic_charcode *current_charcode() {
+	static polymorphic_charcode_adapter<utf8_charcode> utf8_adapter;
+	polymorphic_charcode *active = &utf8_adapter;
+	return active;
+}
+
+// Converts the wide character nc to a code point that is stored to wc.
+// The value is carried over unchanged; the function always returns charcode_error::null.
+charcode_error wide_charcode::promote(wchar_t nc, codepoint &wc) {
+	// TODO: Allow non-identity encodings of wchar_t.
+	wc = static_cast<codepoint>(nc);
+	return charcode_error::null;
+}
+
+// Returns the character code that is used for wide (wchar_t) strings.
+// The object is a function-local static, so every call yields the same pointer.
+wide_charcode *platform_wide_charcode() {
+	static wide_charcode platform_instance;
+	wide_charcode *wcc = &platform_instance;
+	return wcc;
+}
+
+} // namespace mlibc
+
author	Ian Moffett <ian@osmora.org>	2024-03-07 17:28:00 -0500
committer	Ian Moffett <ian@osmora.org>	2024-03-07 17:28:32 -0500
commit	bd5969fc876a10b18613302db7087ef3c40f18e1 (patch)
tree	7c2b8619afe902abf99570df2873fbdf40a4d1a1 /lib/mlibc/options/internal/generic/charcode.cpp
parent	a95b38b1b92b172e6cc4e8e56a88a30cc65907b0 (diff)