From a515dfb3b8f8e999362db7a6b52b3104c03b750a Mon Sep 17 00:00:00 2001
From: Ian Moffett <ian@osmora.org>
Date: Fri, 1 Nov 2024 23:46:08 -0400
Subject: Import quark sources

Signed-off-by: Ian Moffett <ian@osmora.org>
---
 compiler/lexer/lexer.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
 create mode 100644 compiler/lexer/lexer.c

(limited to 'compiler/lexer/lexer.c')

diff --git a/compiler/lexer/lexer.c b/compiler/lexer/lexer.c
new file mode 100644
index 0000000..06f7e89
--- /dev/null
+++ b/compiler/lexer/lexer.c
@@ -0,0 +1,282 @@
+/*
+ * Quark lexer (lexical analyzer).
+ * Turns source code into tokens.
+ * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
+ * Provided under the BSD 3-Clause license.
+ */
+
+#include "debug.h"
+#include "lexer.h"
+#include "lexer/char_info.h"
+#include "lexer/keywords.h"
+
+/*
+ * Advance ctx->pos past every character flagged CHAR_WHITESPACE.
+ *
+ * Characters that are also flagged CHAR_VERT_WS bump ctx->line and
+ * move ctx->line_start to the byte right after them, so that column
+ * numbers computed later are relative to the new line.
+ */
+static void
+skip_ignored(struct lexer *ctx)
+{
+    for (; char_info[(int)*ctx->pos] & CHAR_WHITESPACE; ctx->pos++) {
+        if (!(char_info[(int)*ctx->pos] & CHAR_VERT_WS)) {
+            continue;
+        }
+
+        /* Vertical whitespace: a new line begins at the next byte */
+        ctx->line_start = ctx->pos + 1;
+        ctx->line++;
+    }
+}
+
+/*
+ * Parse a hexadecimal literal into tok->value.
+ *
+ * On entry ctx->pos points at the first character of the two-character
+ * prefix (lexer_next() only calls this for "0x").  The prefix is
+ * skipped and every following character flagged CHAR_HEX is folded
+ * into tok->value, most significant digit first.  On return ctx->pos
+ * points at the first character that is not part of the literal.
+ *
+ * A prefix with no digits after it yields 0.  Overflow of tok->value
+ * is not detected.
+ */
+static void
+parse_num_hex(struct lexer *ctx, struct token *tok)
+{
+    int digit;
+    char c;
+
+    ctx->pos += 2;
+    tok->value = 0;
+    while (char_info[(int)*ctx->pos] & CHAR_HEX) {
+        c = *ctx->pos++;
+
+        /*
+         * Derive the digit from the character itself rather than
+         * comparing the whole char_info entry against a single flag,
+         * which fails as soon as the entry carries any other flag.
+         */
+        if (c >= 'a' && c <= 'f') {
+            digit = c - 'a' + 0xa;
+        } else if (c >= 'A' && c <= 'F') {
+            digit = c - 'A' + 0xa;
+        } else {
+            digit = c - '0';
+        }
+
+        /* Make room for the new nibble before merging it in */
+        tok->value <<= 4;
+        tok->value |= digit;
+    }
+}
+
+/*
+ * Parse a decimal literal into tok->value.
+ *
+ * On entry ctx->pos points at the first digit (lexer_next() only calls
+ * this when the current character is flagged CHAR_DIGIT).  Every
+ * consecutive digit is accumulated, most significant first.  On return
+ * ctx->pos points at the first character that is not a digit.
+ *
+ * Overflow of tok->value is not detected.
+ */
+static void
+parse_num_dec(struct lexer *ctx, struct token *tok)
+{
+    /*
+     * Start from zero and let the loop consume the first digit too;
+     * seeding the value after advancing would read the second
+     * character and count it twice.
+     */
+    tok->value = 0;
+    while (char_info[(int)*ctx->pos] & CHAR_DIGIT) {
+        tok->value *= 10;
+        tok->value += *ctx->pos++ - '0';
+    }
+}
+
+/*
+ * Parse a binary literal into tok->value.
+ *
+ * The first two characters at ctx->pos are skipped as the prefix
+ * (lexer_next() calls this for "0b"), then each following '0' or '1'
+ * is shifted into tok->value, most significant bit first.  On return
+ * ctx->pos points at the first character that is neither '0' nor '1'.
+ */
+static void
+parse_num_bin(struct lexer *ctx, struct token *tok)
+{
+    /* Step over the two-character prefix */
+    ctx->pos += 2;
+
+    for (tok->value = 0; *ctx->pos == '0' || *ctx->pos == '1'; ctx->pos++) {
+        tok->value = (tok->value << 1) | (*ctx->pos - '0');
+    }
+}
+
+/*
+ * Lex an identifier or keyword starting at tok->pos.
+ *
+ * The first character is consumed unconditionally (lexer_next() has
+ * already checked it); after that, characters flagged CHAR_ALNUM and
+ * underscores are consumed.  tok->len and tok->hash are filled in from
+ * the consumed text, then keywords_find() decides the token kind.  If
+ * it reports TK_UNKNOWN the token is a plain TK_IDENTIFIER.
+ */
+static void
+lex_ident(struct lexer *ctx, struct token *tok)
+{
+    /* Walk to the first character that cannot continue an identifier */
+    do {
+        ctx->pos++;
+    } while ((char_info[(int)*ctx->pos] & CHAR_ALNUM) || *ctx->pos == '_');
+
+    tok->len = (size_t)(ctx->pos - tok->pos);
+    tok->hash = hash_data(tok->pos, tok->len);
+
+    /* Keywords keep the kind reported by the lookup */
+    tok->kind = keywords_find(tok);
+    if (tok->kind != TK_UNKNOWN) {
+        return;
+    }
+
+    /* Not a keyword, so it is an ordinary identifier */
+    tok->kind = TK_IDENTIFIER;
+}
+
+/*
+ * Lex an operator starting at ctx->pos.
+ *
+ * '+', '-', '<' and '>' are handled explicitly because they have
+ * forms that are more than "operator" or "operator followed by '='"
+ * ("++", "--", "->", "<<", ">>", "<<=", ">>=").  Every other operator
+ * character takes its kind from the char_info table.
+ *
+ * Sets tok->kind and tok->len, then advances ctx->pos by tok->len.
+ */
+static void
+lex_oper(struct lexer *ctx, struct token *tok)
+{
+    char next = ctx->pos[1];
+
+    tok->len = 1;
+
+    switch (*ctx->pos) {
+    case '+':
+        if (next == '+') {
+            tok->kind = TK_PLUS_PLUS;
+            tok->len = 2;
+        } else if (next == '=') {
+            tok->kind = TK_PLUS_EQUALS;
+            tok->len = 2;
+        } else {
+            tok->kind = TK_PLUS;
+        }
+
+        break;
+    case '-':
+        /*
+         * This must be a single if/else-if chain: with a separate
+         * `if` after the arrow case, "->" falls into the final else
+         * and is reported as TK_MINUS.
+         */
+        if (next == '>') {
+            tok->kind = TK_ARROW;
+            tok->len = 2;
+        } else if (next == '-') {
+            tok->kind = TK_MINUS_MINUS;
+            tok->len = 2;
+        } else if (next == '=') {
+            tok->kind = TK_MINUS_EQUALS;
+            tok->len = 2;
+        } else {
+            tok->kind = TK_MINUS;
+        }
+
+        break;
+    case '<':
+        if (next == '=') {
+            tok->kind = TK_LESS_THAN_EQUALS;
+            tok->len = 2;
+        } else if (next == '<') {
+            /* ctx->pos[2] is only read once ctx->pos[1] is known non-NUL */
+            if (ctx->pos[2] == '=') {
+                tok->kind = TK_SHIFT_LEFT_EQUALS;
+                tok->len = 3;
+            } else {
+                tok->kind = TK_SHIFT_LEFT;
+                tok->len = 2;
+            }
+        } else {
+            tok->kind = TK_LESS_THAN;
+        }
+
+        break;
+    case '>':
+        if (next == '=') {
+            tok->kind = TK_GREATER_THAN_EQUALS;
+            tok->len = 2;
+        } else if (next == '>') {
+            if (ctx->pos[2] == '=') {
+                tok->kind = TK_SHIFT_RIGHT_EQUALS;
+                tok->len = 3;
+            } else {
+                tok->kind = TK_SHIFT_RIGHT;
+                tok->len = 2;
+            }
+        } else {
+            tok->kind = TK_GREATER_THAN;
+        }
+
+        break;
+    default:
+        /*
+         * The token kind is stored in the char_info entry above
+         * CHAR_OPER_SHIFT.  Incrementing it for a trailing '='
+         * presumably relies on each "operator-equals" kind directly
+         * following its base operator in the token kind enum --
+         * NOTE(review): confirm against the token definitions.
+         */
+        tok->kind = char_info[(int)*ctx->pos] >> CHAR_OPER_SHIFT;
+        if (next == '=') {
+            tok->kind++;
+            tok->len = 2;
+        }
+
+        break;
+    }
+
+    ctx->pos += tok->len;
+}
+
+/*
+ * Lex a string literal whose opening '"' is at tok->pos.
+ *
+ * Scans to the matching closing quote.  A backslash escapes whatever
+ * character follows it, so `\"` does not end the string and `\\`
+ * directly before the closing quote does not hide it.
+ *
+ * On success tok->kind is TK_STRING and ctx->pos points just past the
+ * closing quote.  tok->len is one less than the number of bytes
+ * consumed.  NOTE(review): that length covers the opening quote but
+ * not the closing one; confirm this is what consumers expect.
+ *
+ * If the end of the source is reached before a closing quote, the
+ * scan stops at the NUL terminator instead of running off the buffer.
+ * tok->kind is left as TK_UNKNOWN, tok->len covers everything consumed
+ * and the next call to lexer_next() reports TK_EOF.
+ */
+static void
+lex_str(struct lexer *ctx, struct token *tok)
+{
+    /* Find end of string */
+    ctx->pos++;
+    while (*ctx->pos != '"' && *ctx->pos != '\0') {
+        /* Skip the escaped character, but never step over the NUL */
+        if (*ctx->pos == '\\' && ctx->pos[1] != '\0') {
+            ctx->pos++;
+        }
+
+        ctx->pos++;
+    }
+
+    if (*ctx->pos == '\0') {
+        /* Unterminated string */
+        tok->kind = TK_UNKNOWN;
+        tok->len = (size_t)(ctx->pos - tok->pos);
+        return;
+    }
+    ctx->pos++;
+
+    tok->kind = TK_STRING;
+    tok->len = (size_t)(ctx->pos - tok->pos) - 1;
+}
+
+/*
+ * Lex a character literal whose opening '\'' is at tok->pos.
+ *
+ * Scans to the matching closing quote.  A backslash escapes whatever
+ * character follows it, so `\'` does not end the literal and `\\`
+ * directly before the closing quote does not hide it.  The number of
+ * characters between the quotes is not validated here.
+ *
+ * On success tok->kind is TK_CHARACTER and ctx->pos points just past
+ * the closing quote.  tok->len is one less than the number of bytes
+ * consumed.  NOTE(review): that length covers the opening quote but
+ * not the closing one; confirm this is what consumers expect.
+ *
+ * If the end of the source is reached before a closing quote, the
+ * scan stops at the NUL terminator instead of running off the buffer.
+ * tok->kind is left as TK_UNKNOWN, tok->len covers everything consumed
+ * and the next call to lexer_next() reports TK_EOF.
+ */
+static void
+lex_char(struct lexer *ctx, struct token *tok)
+{
+    /* Find end of character */
+    ctx->pos++;
+    while (*ctx->pos != '\'' && *ctx->pos != '\0') {
+        /* Skip the escaped character, but never step over the NUL */
+        if (*ctx->pos == '\\' && ctx->pos[1] != '\0') {
+            ctx->pos++;
+        }
+
+        ctx->pos++;
+    }
+
+    if (*ctx->pos == '\0') {
+        /* Unterminated character literal */
+        tok->kind = TK_UNKNOWN;
+        tok->len = (size_t)(ctx->pos - tok->pos);
+        return;
+    }
+    ctx->pos++;
+
+    tok->kind = TK_CHARACTER;
+    tok->len = (size_t)(ctx->pos - tok->pos) - 1;
+}
+
+/*
+ * Read the next token from the source into *tok.
+ *
+ * Whitespace is skipped first.  tok->pos, tok->line and tok->col
+ * (1-based, relative to ctx->line_start) describe where the token
+ * begins; tok->kind and tok->len describe what was consumed.  For
+ * TK_NUMBER tokens tok->value holds the parsed value.
+ *
+ * ctx: Lexer state.  If NULL, tok->kind is set to TK_UNKNOWN and
+ *      nothing else in *tok is touched.
+ * tok: Token to fill in.  If NULL the call does nothing.
+ *
+ * At the end of the source the token is TK_EOF with a length of 0 and
+ * ctx->pos is not advanced, so further calls keep returning TK_EOF.
+ * A character that starts no known token is consumed and returned as
+ * a TK_UNKNOWN token of length 1, so a caller that keeps calling
+ * always makes progress toward TK_EOF.
+ */
+void
+lexer_next(struct lexer *ctx, struct token *tok)
+{
+    if (tok == NULL) {
+        return;
+    }
+
+    if (ctx == NULL) {
+        tok->kind = TK_UNKNOWN;
+        return;
+    }
+
+    skip_ignored(ctx);
+
+    /* Initialize token */
+    tok->kind = TK_UNKNOWN;
+    tok->pos = ctx->pos;
+    tok->len = 0;
+    tok->line = ctx->line;
+    tok->col = (int)(tok->pos - ctx->line_start) + 1;
+
+    if (char_info[(int)*ctx->pos] & CHAR_ALPHA || *ctx->pos == '_') {
+        lex_ident(ctx, tok);
+        return;
+    }
+
+    /* Single-character tokens carry their kind in the char_info entry */
+    if (char_info[(int)*ctx->pos] & CHAR_SINGLE) {
+        tok->kind = char_info[(int)*ctx->pos] >> CHAR_SINGLE_SHIFT;
+        tok->len = 1;
+        ctx->pos++;
+        return;
+    }
+
+    if (char_info[(int)*ctx->pos] & CHAR_OPER) {
+        lex_oper(ctx, tok);
+        return;
+    }
+
+    if (char_info[(int)*ctx->pos] & CHAR_DIGIT) {
+        tok->kind = TK_NUMBER;
+
+        /* Only the lowercase "0x" and "0b" prefixes are recognized */
+        if (*ctx->pos == '0' && ctx->pos[1] == 'x') {
+            parse_num_hex(ctx, tok);
+        } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') {
+            parse_num_bin(ctx, tok);
+        } else {
+            parse_num_dec(ctx, tok);
+        }
+
+        tok->len = (size_t)(ctx->pos - tok->pos);
+        return;
+    }
+
+    if (*ctx->pos == '"') {
+        lex_str(ctx, tok);
+        return;
+    }
+
+    if (*ctx->pos == '\'') {
+        lex_char(ctx, tok);
+        return;
+    }
+
+    if (*ctx->pos == '\0') {
+        tok->kind = TK_EOF;
+        return;
+    }
+
+    /*
+     * Unrecognized character: consume it.  Leaving ctx->pos where it
+     * is would hand back the same TK_UNKNOWN token on every call.
+     */
+    tok->len = 1;
+    ctx->pos++;
+}
+
+/*
+ * Prepare a lexer to read tokens from `source`.
+ *
+ * ctx:    Lexer state to initialize.
+ * source: Text to lex.  The lexer keeps pointers into it and scans
+ *         until it finds a NUL byte.
+ *
+ * Does nothing if either argument is NULL.  Otherwise the position and
+ * line start are set to the beginning of `source`, the line counter
+ * starts at 1 and keywords_init() is called.
+ */
+void
+lexer_init(struct lexer *ctx, char *source)
+{
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (source == NULL) {
+        return;
+    }
+
+    debug("Initializing lexer...\n");
+
+    /* Line 1 starts at the very first byte of the source */
+    ctx->line = 1;
+    ctx->line_start = source;
+    ctx->pos = source;
+
+    keywords_init();
+}
-- 
cgit v1.2.3