aboutsummaryrefslogtreecommitdiff
path: root/lib/mlibc/options/internal/generic/charcode.cpp
diff options
context:
space:
mode:
authorIan Moffett <ian@osmora.org>2024-03-07 17:28:00 -0500
committerIan Moffett <ian@osmora.org>2024-03-07 17:28:32 -0500
commitbd5969fc876a10b18613302db7087ef3c40f18e1 (patch)
tree7c2b8619afe902abf99570df2873fbdf40a4d1a1 /lib/mlibc/options/internal/generic/charcode.cpp
parenta95b38b1b92b172e6cc4e8e56a88a30cc65907b0 (diff)
lib: Add mlibc
Signed-off-by: Ian Moffett <ian@osmora.org>
Diffstat (limited to 'lib/mlibc/options/internal/generic/charcode.cpp')
-rw-r--r--lib/mlibc/options/internal/generic/charcode.cpp244
1 files changed, 244 insertions, 0 deletions
diff --git a/lib/mlibc/options/internal/generic/charcode.cpp b/lib/mlibc/options/internal/generic/charcode.cpp
new file mode 100644
index 0000000..e09d5cd
--- /dev/null
+++ b/lib/mlibc/options/internal/generic/charcode.cpp
@@ -0,0 +1,244 @@
+
+#include <bits/ensure.h>
+#include <frg/string.hpp>
+#include <mlibc/charcode.hpp>
+#include <mlibc/debug.hpp>
+
+namespace mlibc {
+
+struct utf8_charcode {
+ static constexpr bool preserves_7bit_units = true;
+ static constexpr bool has_shift_states = false;
+
+ struct decode_state {
+ decode_state()
+ : _progress{0}, _cpoint{0} { }
+
+ auto progress() { return _progress; }
+ auto cpoint() { return _cpoint; }
+
+ charcode_error operator() (code_seq<const char> &seq) {
+ auto uc = static_cast<unsigned char>(*seq.it);
+ if(!_progress) {
+ if(!(uc & 0b1000'0000)) {
+ // ASCII-compatible.
+ _cpoint = uc;
+ }else if((uc & 0b1110'0000) == 0b1100'0000) {
+ _cpoint = uc & 0b1'1111;
+ _progress = 1;
+ }else if((uc & 0b1111'0000) == 0b1110'0000) {
+ _cpoint = uc & 0b1111;
+ _progress = 2;
+ }else if((uc & 0b1111'1000) == 0b1111'0000) {
+ _cpoint = uc & 0b111;
+ _progress = 3;
+ }else{
+ // If the highest two bits are 0b10, this is the second (or later) unit.
+ // Units with highest five bits = 0b11111 do not occur in valid UTF-8.
+ __ensure((uc & 0b1100'0000) == 0b1000'0000
+ || (uc & 0b1111'1000) == 0b1111'1000);
+ return charcode_error::illegal_input;
+ }
+ }else{
+ // TODO: Return an error.
+ __ensure((uc & 0b1100'0000) == 0b1000'0000);
+ _cpoint = (_cpoint << 6) | (uc & 0x3F);
+ --_progress;
+ }
+ ++seq.it;
+ return charcode_error::null;
+ }
+
+ private:
+ int _progress;
+ codepoint _cpoint;
+ };
+
+ struct encode_state {
+ // Encodes a single character from wseq + the current state and stores it in nseq.
+ // TODO: Convert decode_state to the same strategy.
+ charcode_error operator() (code_seq<char> &nseq, code_seq<const codepoint> &wseq) {
+ auto wc = *wseq.it;
+ __ensure(wc <= 0x7F && "utf8_charcode cannot encode multibyte chars yet");
+ *nseq.it = wc;
+ ++wseq.it;
+ ++nseq.it;
+ return charcode_error::null;
+ }
+ };
+};
+
+polymorphic_charcode::~polymorphic_charcode() = default;
+
+// For *decoding, this class assumes that:
+// - G::decode_state has members progress() and cpoint().
+// - G::decode_state::progress() >= 0 at all times.
+// TODO: This will be needed on platforms like Windows, where wchar_t is UTF-16.
+// TODO: There, we can use negative __mlibc_mbstate::progress to represent encoding to UTF-16.
+// - If G::decode_state::progress() == 0, the code point (given by cpoint())
+// was decoded successfully.
+template<typename G>
+struct polymorphic_charcode_adapter : polymorphic_charcode {
+ polymorphic_charcode_adapter()
+ : polymorphic_charcode{G::preserves_7bit_units, G::has_shift_states} { }
+
+ charcode_error decode(code_seq<const char> &nseq, code_seq<codepoint> &wseq,
+ __mlibc_mbstate &st) override {
+ __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+ code_seq<const char> decode_nseq = nseq;
+ typename G::decode_state ds;
+
+ while(decode_nseq && wseq) {
+ // Consume the next code unit.
+ if(auto e = ds(decode_nseq); e != charcode_error::null)
+ return e;
+
+ // Produce a new code point.
+ if(!ds.progress()) {
+ // "Commit" consumed code units (as there was no decode error).
+ nseq.it = decode_nseq.it;
+ if(!ds.cpoint()) // Stop on null characters.
+ return charcode_error::null;
+ *wseq.it = ds.cpoint();
+ ++wseq.it;
+ }
+ }
+
+ if(ds.progress())
+ return charcode_error::input_underflow;
+ return charcode_error::null;
+ }
+
+ charcode_error decode_wtranscode(code_seq<const char> &nseq, code_seq<wchar_t> &wseq,
+ __mlibc_mbstate &st) override {
+ __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+ code_seq<const char> decode_nseq = nseq;
+ typename G::decode_state ds;
+
+ while(decode_nseq && wseq) {
+ // Consume the next code unit.
+ if(auto e = ds(decode_nseq); e != charcode_error::null)
+ return e;
+
+ // Produce a new code point.
+ if(!ds.progress()) {
+ nseq.it = decode_nseq.it;
+ // "Commit" consumed code units (as there was no decode error).
+ if(!ds.cpoint()) // Stop on null characters.
+ return charcode_error::null;
+ *wseq.it = ds.cpoint();
+ ++wseq.it;
+ }
+ }
+
+ if(ds.progress())
+ return charcode_error::input_underflow;
+ return charcode_error::null;
+ }
+
+ charcode_error decode_wtranscode_length(code_seq<const char> &nseq, size_t *n,
+ __mlibc_mbstate &st) override {
+ __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
+
+ code_seq<const char> decode_nseq = nseq;
+ typename G::decode_state ds;
+
+ *n = 0;
+ while(decode_nseq) {
+ // Consume the next code unit.
+ if(auto e = ds(decode_nseq); e != charcode_error::null)
+ return e;
+
+ if(!ds.progress()) {
+ nseq.it = decode_nseq.it;
+ // "Commit" consumed code units (as there was no decode error).
+ if(!ds.cpoint()) // Stop on null code points.
+ return charcode_error::null;
+ ++(*n);
+ }
+ }
+
+ if(ds.progress())
+ return charcode_error::input_underflow;
+ return charcode_error::null;
+ }
+
+ charcode_error encode_wtranscode(code_seq<char> &nseq, code_seq<const wchar_t> &wseq,
+ __mlibc_mbstate &st) override {
+ __ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().
+
+ code_seq<char> encode_nseq = nseq;
+ typename G::encode_state es;
+
+ while(encode_nseq && wseq) {
+ codepoint cp = *wseq.it;
+ if(!cp)
+ return charcode_error::null;
+
+ code_seq<const codepoint> cps{&cp, &cp + 1};
+ if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
+ continue;
+ }else if(e != charcode_error::null) {
+ return e;
+ }
+ __ensure(cps.it == cps.end);
+ ++wseq.it;
+
+ // "Commit" produced code units (as there was no encode error).
+ nseq.it = encode_nseq.it;
+ }
+
+ if(encode_nseq.it != nseq.it)
+ return charcode_error::output_overflow;
+ return charcode_error::null;
+ }
+
+ charcode_error encode_wtranscode_length(code_seq<const wchar_t> &wseq, size_t *n,
+ __mlibc_mbstate &st) override {
+ __ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().
+
+ typename G::encode_state es;
+
+ *n = 0;
+ while(wseq) {
+ char temp[4];
+ code_seq<char> encode_nseq{temp, temp + 4};
+ codepoint cp = *wseq.it;
+ if(!cp)
+ return charcode_error::null;
+ // Consume the next code unit.
+ code_seq<const codepoint> cps{&cp, &cp + 1};
+ if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
+ continue;
+ }else if(e != charcode_error::null) {
+ return e;
+ }
+
+ ++(*n);
+ ++wseq.it;
+ }
+
+ return charcode_error::null;
+ }
+};
+
+polymorphic_charcode *current_charcode() {
+ static polymorphic_charcode_adapter<utf8_charcode> global_charcode;
+ return &global_charcode;
+}
+
+charcode_error wide_charcode::promote(wchar_t nc, codepoint &wc) {
+ // TODO: Allow non-identity encodings of wchar_t.
+ wc = nc;
+ return charcode_error::null;
+}
+
+wide_charcode *platform_wide_charcode() {
+ static wide_charcode global_wide_charcode;
+ return &global_wide_charcode;
+}
+
+} // namespace mlibc
+