3 files changed, 461 insertions, 0 deletions
diff --git a/compiler/lexer/char_info.c b/compiler/lexer/char_info.c
new file mode 100644
index 0000000..97ecb7d
--- /dev/null
+++ b/compiler/lexer/char_info.c
@@ -0,0 +1,106 @@
+/*
+ * Character info table.
+ * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
+ * Provided under the BSD 3-Clause license.
+ */
+
+#include "lexer/char_info.h"
+
+uint16_t char_info[256] = { /* flag word per byte value; entries 128..255 are left zero */
+    /*
+        NUL SOH STX ETX
+        EOT ENQ ACK BEL
+    */
+    0           , 0           , 0           , 0           ,
+    0           , 0           , 0           , 0           ,
+    /*
+        BS  TAB LF  VT
+        FF  CR  SO  SI      (CR is horizontal whitespace; LF, VT and FF are vertical)
+    */
+    0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_VERT_WS,
+    CHAR_VERT_WS, CHAR_HORZ_WS, 0           , 0           ,
+    /*
+        DLE DC1 DC2 DC3
+        DC4 NAK SYN ETB
+    */
+    0           , 0           , 0           , 0           ,
+    0           , 0           , 0           , 0           ,
+    /*
+        CAN EM  SUB ESC
+        FS  GS  RS  US
+    */
+    0           , 0           , 0           , 0           ,
+    0           , 0           , 0           , 0           ,
+    /*
+        SP ! " #
+        $  % & '
+    */
+    CHAR_HORZ_WS, CHAR_EXCLAIM, 0           , 0           ,
+    0           , CHAR_PERCENT, CHAR_AMPER  , 0           ,
+    /*
+        ( ) * +
+        , - . /
+    */
+    CHAR_LPAREN , CHAR_RPAREN , CHAR_STAR   , CHAR_PLUS   ,
+    CHAR_COMMA  , CHAR_MINUS  , CHAR_DOT    , CHAR_SLASH  ,
+    /*
+        0 1 2 3
+        4 5 6 7
+    */
+    CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
+    CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  , CHAR_DIGIT  ,
+    /*
+        8 9 : ;
+        < = > ?
+    */
+    CHAR_DIGIT  , CHAR_DIGIT  , CHAR_COLON  , CHAR_SEMI   ,
+    CHAR_LESS   , CHAR_EQUALS , CHAR_GREATER, 0           ,
+    /*
+        @ A B C
+        D E F G
+    */
+    0           , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER ,
+    CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER  ,
+    /*
+        H I J K
+        L M N O
+    */
+    CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+    CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+    /*
+        P Q R S
+        T U V W
+    */
+    CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+    CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  ,
+    /*
+        X Y Z [
+        \ ] ^ _
+    */
+    CHAR_UPPER  , CHAR_UPPER  , CHAR_UPPER  , CHAR_LBRACK ,
+    0           , CHAR_RBRACK , CHAR_CARET  , 0           ,
+    /*
+        ` a b c
+        d e f g
+    */
+    0           , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER ,
+    CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER  ,
+    /*
+        h i j k
+        l m n o
+    */
+    CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+    CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+    /*
+        p q r s
+        t u v w
+    */
+    CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+    CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  ,
+    /*
+        x y z {
+        | } ~ DEL
+    */
+    CHAR_LOWER  , CHAR_LOWER  , CHAR_LOWER  , CHAR_LBRACE ,
+    CHAR_PIPE   , CHAR_RBRACE , CHAR_TILDE  , 0           ,
+};
diff --git a/compiler/lexer/keywords.c b/compiler/lexer/keywords.c
new file mode 100644
index 0000000..0e95048
--- /dev/null
+++ b/compiler/lexer/keywords.c
@@ -0,0 +1,73 @@
+/*
+ * Keyword hashmap.
+ * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
+ * Provided under the BSD 3-Clause license.
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hash.h"
+#include "hashmap.h"
+#include "debug.h"
+#include "lexer/keywords.h"
+
+#define HASHMAP_ROWS 8
+
+struct keyword {
+    struct hashmap_entry hashmap_entry; /* keep first: keywords_find() casts hashmap_find()'s result */
+    const char *name;                   /* spelling, compared on lookup */
+    size_t len;                         /* strlen(name) */
+    token_kind_t value;                 /* token kind reported for this keyword */
+};
+
+static bool initialized = false;
+static struct list keywords_rows[HASHMAP_ROWS];
+static struct hashmap keywords;
+
+/* Register `name` (which must outlive the map) as a keyword of kind `value`. */
+static void
+add_keyword(char *name, token_kind_t value)
+{
+    struct keyword *kwd = malloc(sizeof *kwd);
+    if (kwd == NULL) {
+        abort(); /* out of memory during one-time setup; never dereference NULL */
+    }
+    kwd->name = name;
+    kwd->len = strlen(name);
+    kwd->value = value;
+    kwd->hashmap_entry.hash = hash_data(name, kwd->len);
+    hashmap_add(&keywords, &kwd->hashmap_entry);
+}
+
+/* Return the keyword kind for tok's text, or TK_UNKNOWN when it is not a keyword. */
+token_kind_t
+keywords_find(struct token *tok)
+{
+    struct keyword *kwd = (struct keyword*)hashmap_find(&keywords, tok->hash);
+    /* A hash match alone is not proof: confirm the length and the spelling too. */
+    if (kwd == NULL || kwd->len != tok->len) {
+        return TK_UNKNOWN;
+    }
+    return memcmp(kwd->name, tok->pos, tok->len) == 0 ? kwd->value : TK_UNKNOWN;
+}
+
+/* One-time setup of the keyword map; repeated calls return immediately. */
+void
+keywords_init(void)
+{
+    if (!initialized) {
+        debug("Initializing keywords...\n");
+
+        /* Hook up the static row storage, then initialize the map. */
+        keywords.n_rows = HASHMAP_ROWS;
+        keywords.rows = keywords_rows;
+        hashmap_init(&keywords);
+
+        /* Register every keyword. */
+        add_keyword("type", TK_TYPE);
+        add_keyword("enum", TK_ENUM);
+        add_keyword("struct", TK_STRUCT);
+        initialized = true;
+    }
+}
diff --git a/compiler/lexer/lexer.c b/compiler/lexer/lexer.c
new file mode 100644
index 0000000..06f7e89
--- /dev/null
+++ b/compiler/lexer/lexer.c
@@ -0,0 +1,282 @@
+/*
+ * Quark lexer (lexical analyzer).
+ * Turns source code into tokens.
+ * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
+ * Provided under the BSD 3-Clause license.
+ */
+
+#include "debug.h"
+#include "lexer.h"
+#include "lexer/char_info.h"
+#include "lexer/keywords.h"
+
+/* Skip whitespace at ctx->pos; each vertical-whitespace byte starts a new line. */
+static void
+skip_ignored(struct lexer *ctx)
+{
+    /* Index as unsigned char: a negative plain char would read before the table. */
+    for (; char_info[(unsigned char)*ctx->pos] & CHAR_WHITESPACE; ctx->pos++) {
+        if (char_info[(unsigned char)*ctx->pos] & CHAR_VERT_WS) {
+            ctx->line++;
+            ctx->line_start = ctx->pos + 1;
+        }
+    }
+}
+
+/* Parse a hexadecimal literal into tok->value; ctx->pos starts at the 2-char prefix. */
+static void
+parse_num_hex(struct lexer *ctx, struct token *tok)
+{
+    tok->value = 0;
+    for (ctx->pos += 2; char_info[(unsigned char)*ctx->pos] & CHAR_HEX; ctx->pos++) {
+        int digit = *ctx->pos - '0';
+        if (char_info[(unsigned char)*ctx->pos] == CHAR_XLOWER) {
+            digit = *ctx->pos - 'a' + 0xa;
+        } else if (char_info[(unsigned char)*ctx->pos] == CHAR_XUPPER) {
+            digit = *ctx->pos - 'A' + 0xa;
+        }
+        tok->value = (tok->value << 4) | digit; /* make room, then add the new nibble */
+    }
+}
+
+/* Parse a decimal literal into tok->value; ctx->pos starts at its first digit. */
+static void
+parse_num_dec(struct lexer *ctx, struct token *tok)
+{
+    /* Start from zero and consume every digit, including the first one. */
+    tok->value = 0;
+    while (char_info[(unsigned char)*ctx->pos] & CHAR_DIGIT) {
+        tok->value = tok->value * 10 + (*ctx->pos++ - '0');
+    }
+}
+
+/* Parse a binary literal into tok->value; ctx->pos starts at the 2-char prefix. */
+static void
+parse_num_bin(struct lexer *ctx, struct token *tok)
+{
+    tok->value = 0;
+    for (ctx->pos += 2; *ctx->pos == '0' || *ctx->pos == '1'; ctx->pos++) {
+        tok->value <<= 1;
+        tok->value |= *ctx->pos - '0';
+    }
+}
+
+/*
+ * Lex an identifier or keyword at tok->pos; fills tok->len, tok->hash and tok->kind.
+ */
+static void
+lex_ident(struct lexer *ctx, struct token *tok)
+{
+    /* The caller already accepted the first character; scan the rest. */
+    do {
+        ctx->pos++;
+    } while (char_info[(unsigned char)*ctx->pos] & CHAR_ALNUM || *ctx->pos == '_');
+
+    tok->len = (size_t)(ctx->pos - tok->pos);
+    tok->hash = hash_data(tok->pos, tok->len);
+    /* Keywords win; any other name is a plain identifier. */
+    tok->kind = keywords_find(tok);
+    if (tok->kind == TK_UNKNOWN) {
+        tok->kind = TK_IDENTIFIER;
+    }
+}
+
+/*
+ * Lex an operator at ctx->pos, preferring the longest spelling that matches.
+ * Sets tok->kind and tok->len, then moves ctx->pos past the operator.
+ */
+static void
+lex_oper(struct lexer *ctx, struct token *tok)
+{
+    tok->len = 1;
+    switch (*ctx->pos) {
+    case '+':
+        if (ctx->pos[1] == '+') {
+            tok->kind = TK_PLUS_PLUS;
+            tok->len = 2;
+        } else if (ctx->pos[1] == '=') {
+            tok->kind = TK_PLUS_EQUALS;
+            tok->len = 2;
+        } else {
+            tok->kind = TK_PLUS;
+        }
+        break;
+    case '-':
+        /* One if/else chain: "->" must not fall into the plain '-' branch. */
+        if (ctx->pos[1] == '>') {
+            tok->kind = TK_ARROW;
+            tok->len = 2;
+        } else if (ctx->pos[1] == '-') {
+            tok->kind = TK_MINUS_MINUS;
+            tok->len = 2;
+        } else if (ctx->pos[1] == '=') {
+            tok->kind = TK_MINUS_EQUALS;
+            tok->len = 2;
+        } else {
+            tok->kind = TK_MINUS;
+        }
+        break;
+    case '<':
+        if (ctx->pos[1] == '=') {
+            tok->kind = TK_LESS_THAN_EQUALS;
+            tok->len = 2;
+        } else if (ctx->pos[1] == '<') {
+            if (ctx->pos[2] == '=') {
+                tok->kind = TK_SHIFT_LEFT_EQUALS;
+                tok->len = 3;
+            } else {
+                tok->kind = TK_SHIFT_LEFT;
+                tok->len = 2;
+            }
+        } else {
+            tok->kind = TK_LESS_THAN;
+        }
+        break;
+    case '>':
+        if (ctx->pos[1] == '=') {
+            tok->kind = TK_GREATER_THAN_EQUALS;
+            tok->len = 2;
+        } else if (ctx->pos[1] == '>') {
+            if (ctx->pos[2] == '=') {
+                tok->kind = TK_SHIFT_RIGHT_EQUALS;
+                tok->len = 3;
+            } else {
+                tok->kind = TK_SHIFT_RIGHT;
+                tok->len = 2;
+            }
+        } else {
+            tok->kind = TK_GREATER_THAN;
+        }
+        break;
+    default:
+        /* Plain kind comes from the table; the "op=" form is taken to be the next kind. */
+        tok->kind = char_info[(unsigned char)*ctx->pos] >> CHAR_OPER_SHIFT;
+        if (ctx->pos[1] == '=') {
+            tok->kind++;
+            tok->len = 2;
+        }
+        break;
+    }
+
+    ctx->pos += tok->len;
+}
+
+/* Lex a string literal; tok->len is the offset of its terminator from tok->pos. */
+static void
+lex_str(struct lexer *ctx, struct token *tok)
+{
+    /* Stop at the closing quote, or at the NUL if the literal is unterminated. */
+    for (ctx->pos++; *ctx->pos != '"' && *ctx->pos != '\0'; ctx->pos++) {
+        if (*ctx->pos == '\\' && ctx->pos[1] != '\0') {
+            ctx->pos++; /* skip the escaped character, whatever it is */
+        }
+    }
+
+    tok->kind = TK_STRING;
+    tok->len = (size_t)(ctx->pos - tok->pos);
+    if (*ctx->pos == '"') {
+        ctx->pos++; /* consume the closing quote */
+    }
+}
+
+/* Lex a character literal; tok->len is the offset of its terminator from tok->pos. */
+static void
+lex_char(struct lexer *ctx, struct token *tok)
+{
+    /* Stop at the closing quote, or at the NUL if the literal is unterminated. */
+    for (ctx->pos++; *ctx->pos != '\'' && *ctx->pos != '\0'; ctx->pos++) {
+        if (*ctx->pos == '\\' && ctx->pos[1] != '\0') {
+            ctx->pos++; /* skip the escaped character, whatever it is */
+        }
+    }
+
+    tok->kind = TK_CHARACTER;
+    tok->len = (size_t)(ctx->pos - tok->pos);
+    if (*ctx->pos == '\'') {
+        ctx->pos++; /* consume the closing quote */
+    }
+}
+
+/*
+ * Read the next token from ctx into tok.
+ * A NULL tok is ignored; a NULL ctx yields TK_UNKNOWN.
+ * A byte that starts no token is returned as a one-byte TK_UNKNOWN
+ * so that repeated calls always make progress.
+ */
+void
+lexer_next(struct lexer *ctx, struct token *tok)
+{
+    uint16_t info;
+
+    if (tok == NULL) {
+        return;
+    }
+
+    if (ctx == NULL) {
+        tok->kind = TK_UNKNOWN;
+        return;
+    }
+
+    skip_ignored(ctx);
+
+    /* Initialize token */
+    tok->kind = TK_UNKNOWN;
+    tok->pos = ctx->pos;
+    tok->len = 0;
+    tok->line = ctx->line;
+    tok->col = (int)(tok->pos - ctx->line_start) + 1;
+
+    /* Index as unsigned char so bytes >= 0x80 stay inside the table. */
+    info = char_info[(unsigned char)*ctx->pos];
+
+    if (info & CHAR_ALPHA || *ctx->pos == '_') {
+        lex_ident(ctx, tok);
+    } else if (info & CHAR_SINGLE) {
+        /* Single-character token: its kind is stored in the table entry. */
+        tok->kind = info >> CHAR_SINGLE_SHIFT;
+        tok->len = 1;
+        ctx->pos++;
+    } else if (info & CHAR_OPER) {
+        lex_oper(ctx, tok);
+    } else if (info & CHAR_DIGIT) {
+        tok->kind = TK_NUMBER;
+
+        /* The 0x / 0b prefix selects the radix; anything else is decimal. */
+        if (*ctx->pos == '0' && ctx->pos[1] == 'x') {
+            parse_num_hex(ctx, tok);
+        } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') {
+            parse_num_bin(ctx, tok);
+        } else {
+            parse_num_dec(ctx, tok);
+        }
+
+        tok->len = (size_t)(ctx->pos - tok->pos);
+    } else if (*ctx->pos == '"') {
+        lex_str(ctx, tok);
+    } else if (*ctx->pos == '\'') {
+        lex_char(ctx, tok);
+    } else if (*ctx->pos == '\0') {
+        /* End of input: stay on the terminator so later calls repeat TK_EOF. */
+        tok->kind = TK_EOF;
+    } else {
+        /* Unrecognized byte: consume it as a one-byte TK_UNKNOWN token. */
+        tok->len = 1;
+        ctx->pos++;
+    }
+}
+
+/* Point ctx at the start of `source` (line 1) and make sure keywords are set up. */
+void
+lexer_init(struct lexer *ctx, char *source)
+{
+    if (ctx != NULL && source != NULL) {
+        debug("Initializing lexer...\n");
+
+        /* Begin at the first byte, which is also the start of line 1. */
+        ctx->line = 1;
+        ctx->line_start = source;
+        ctx->pos = source;
+
+        keywords_init();
+    }
+}