summary | refs | log | tree | commit | diff
path: root/compiler/lexer/lexer.c
diff options
context:
space:
mode:
author Ian Moffett <ian@osmora.org> 2024-11-01 23:46:08 -0400
committer Ian Moffett <ian@osmora.org> 2024-11-01 23:46:08 -0400
commit a515dfb3b8f8e999362db7a6b52b3104c03b750a (patch)
tree d0180f0cbc39d9c3e367af30791ad774e4d419ff /compiler/lexer/lexer.c
Import quark sources
Signed-off-by: Ian Moffett <ian@osmora.org>
Diffstat (limited to 'compiler/lexer/lexer.c')
-rw-r--r-- compiler/lexer/lexer.c 282
1 files changed, 282 insertions, 0 deletions
diff --git a/compiler/lexer/lexer.c b/compiler/lexer/lexer.c
new file mode 100644
index 0000000..06f7e89
--- /dev/null
+++ b/compiler/lexer/lexer.c
@@ -0,0 +1,282 @@
+/*
+ * Quark lexer (lexical analyzer).
+ * Turns source code into tokens.
+ * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
+ * Provided under the BSD 3-Clause license.
+ */
+
+#include "debug.h"
+#include "lexer.h"
+#include "lexer/char_info.h"
+#include "lexer/keywords.h"
+
+static void
+skip_ignored(struct lexer *ctx)
+{
+ while (char_info[(int)*ctx->pos] & CHAR_WHITESPACE) {
+ if (char_info[(int)*ctx->pos] & CHAR_VERT_WS) {
+ ctx->line++;
+ ctx->line_start = ctx->pos + 1;
+ }
+
+ ctx->pos++;
+ }
+}
+
+static void
+parse_num_hex(struct lexer *ctx, struct token *tok)
+{
+ ctx->pos += 2;
+ tok->value = 0;
+ while (char_info[(int)*ctx->pos] & CHAR_HEX) {
+ if (char_info[(int)*ctx->pos] == CHAR_XLOWER) {
+ tok->value |= *ctx->pos++ - 'a' + 0xa;
+ } else if (char_info[(int)*ctx->pos] == CHAR_XUPPER) {
+ tok->value |= *ctx->pos++ - 'A' + 0xa;
+ } else {
+ tok->value |= *ctx->pos++ - '0';
+ }
+ }
+}
+
+static void
+parse_num_dec(struct lexer *ctx, struct token *tok)
+{
+ ctx->pos++;
+ tok->value = *ctx->pos - '0';
+ while (char_info[(int)*ctx->pos] & CHAR_DIGIT) {
+ tok->value *= 10;
+ tok->value += *ctx->pos++ - '0';
+ }
+}
+
+static void
+parse_num_bin(struct lexer *ctx, struct token *tok)
+{
+ ctx->pos += 2;
+ tok->value = 0;
+ while (*ctx->pos == '0' || *ctx->pos == '1') {
+ tok->value <<= 1;
+ tok->value |= *ctx->pos++ - '0';
+ }
+}
+
+static void
+lex_ident(struct lexer *ctx, struct token *tok)
+{
+ /* Find end of identifier */
+ ctx->pos++;
+ while (char_info[(int)*ctx->pos] & CHAR_ALNUM || *ctx->pos == '_') {
+ ctx->pos++;
+ }
+
+ tok->len = (size_t)(ctx->pos - tok->pos);
+ tok->hash = hash_data(tok->pos, tok->len);
+
+ /* Determine if this is a keyword or just an identifier */
+ tok->kind = keywords_find(tok);
+ if (tok->kind == TK_UNKNOWN) {
+ tok->kind = TK_IDENTIFIER;
+ return;
+ }
+}
+
+static void
+lex_oper(struct lexer *ctx, struct token *tok)
+{
+ tok->len = 1;
+
+ switch (*ctx->pos) {
+ case '+':
+ if (ctx->pos[1] == '+') {
+ tok->kind = TK_PLUS_PLUS;
+ tok->len = 2;
+ } else if (ctx->pos[1] == '=') {
+ tok->kind = TK_PLUS_EQUALS;
+ tok->len = 2;
+ } else {
+ tok->kind = TK_PLUS;
+ }
+
+ break;
+ case '-':
+ if (ctx->pos[1] == '>') {
+ tok->kind = TK_ARROW;
+ tok->len = 2;
+ } if (ctx->pos[1] == '-') {
+ tok->kind = TK_MINUS_MINUS;
+ tok->len = 2;
+ } else if (ctx->pos[1] == '=') {
+ tok->kind = TK_MINUS_EQUALS;
+ tok->len = 2;
+ } else {
+ tok->kind = TK_MINUS;
+ }
+
+ break;
+ case '<':
+ if (ctx->pos[1] == '=') {
+ tok->kind = TK_LESS_THAN_EQUALS;
+ tok->len = 2;
+ } else if (ctx->pos[1] == '<') {
+ if (ctx->pos[2] == '=') {
+ tok->kind = TK_SHIFT_LEFT_EQUALS;
+ tok->len = 3;
+ } else {
+ tok->kind = TK_SHIFT_LEFT;
+ tok->len = 2;
+ }
+ } else {
+ tok->kind = TK_LESS_THAN;
+ }
+
+ break;
+ case '>':
+ if (ctx->pos[1] == '=') {
+ tok->kind = TK_GREATER_THAN_EQUALS;
+ tok->len = 2;
+ } else if (ctx->pos[1] == '>') {
+ if (ctx->pos[2] == '=') {
+ tok->kind = TK_SHIFT_RIGHT_EQUALS;
+ tok->len = 3;
+ } else {
+ tok->kind = TK_SHIFT_RIGHT;
+ tok->len = 2;
+ }
+ } else {
+ tok->kind = TK_GREATER_THAN;
+ }
+
+ break;
+ default:
+ tok->kind = char_info[(int)*ctx->pos] >> CHAR_OPER_SHIFT;
+ if (ctx->pos[1] == '=') {
+ tok->kind++;
+ tok->len = 2;
+ }
+
+ break;
+ }
+
+ ctx->pos += tok->len;
+}
+
+static void
+lex_str(struct lexer *ctx, struct token *tok)
+{
+ /* Find end of string */
+ ctx->pos++;
+ while (*ctx->pos != '"') {
+ if (*ctx->pos == '\\' && ctx->pos[1] == '\"') {
+ ctx->pos++;
+ }
+
+ ctx->pos++;
+ }
+ ctx->pos++;
+
+ tok->kind = TK_STRING;
+ tok->len = (size_t)(ctx->pos - tok->pos) - 1;
+}
+
+static void
+lex_char(struct lexer *ctx, struct token *tok)
+{
+ /* Find end of character */
+ ctx->pos++;
+ while (*ctx->pos != '\'') {
+ if (*ctx->pos == '\\' && ctx->pos[1] == '\'') {
+ ctx->pos++;
+ }
+
+ ctx->pos++;
+ }
+ ctx->pos++;
+
+ tok->kind = TK_CHARACTER;
+ tok->len = (size_t)(ctx->pos - tok->pos) - 1;
+}
+
+void
+lexer_next(struct lexer *ctx, struct token *tok)
+{
+ if (tok == NULL) {
+ return;
+ }
+
+ if (ctx == NULL) {
+ tok->kind = TK_UNKNOWN;
+ return;
+ }
+
+ skip_ignored(ctx);
+
+ /* Initialize token */
+ tok->kind = TK_UNKNOWN;
+ tok->pos = ctx->pos;
+ tok->line = ctx->line;
+ tok->col = (int)(tok->pos - ctx->line_start) + 1;
+
+ if (char_info[(int)*ctx->pos] & CHAR_ALPHA || *ctx->pos == '_') {
+ lex_ident(ctx, tok);
+ return;
+ }
+
+ if (char_info[(int)*ctx->pos] & CHAR_SINGLE) {
+ tok->kind = char_info[(int)*ctx->pos] >> CHAR_SINGLE_SHIFT;
+ tok->len = 1;
+ ctx->pos++;
+ return;
+ }
+
+ if (char_info[(int)*ctx->pos] & CHAR_OPER) {
+ lex_oper(ctx, tok);
+ return;
+ }
+
+ if (char_info[(int)*ctx->pos] & CHAR_DIGIT) {
+ tok->kind = TK_NUMBER;
+
+ if (*ctx->pos == '0' && ctx->pos[1] == 'x') {
+ parse_num_hex(ctx, tok);
+ } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') {
+ parse_num_bin(ctx, tok);
+ } else {
+ parse_num_dec(ctx, tok);
+ }
+
+ tok->len = (size_t)(ctx->pos - tok->pos);
+ return;
+ }
+
+ if (*ctx->pos == '"') {
+ lex_str(ctx, tok);
+ return;
+ }
+
+ if (*ctx->pos == '\'') {
+ lex_char(ctx, tok);
+ return;
+ }
+
+ if (*ctx->pos == '\0') {
+ tok->kind = TK_EOF;
+ return;
+ }
+}
+
+void
+lexer_init(struct lexer *ctx, char *source)
+{
+ if (ctx == NULL || source == NULL) {
+ return;
+ }
+
+ debug("Initializing lexer...\n");
+
+ ctx->pos = source;
+ ctx->line_start = ctx->pos;
+ ctx->line = 1;
+
+ keywords_init();
+}