From a515dfb3b8f8e999362db7a6b52b3104c03b750a Mon Sep 17 00:00:00 2001 From: Ian Moffett Date: Fri, 1 Nov 2024 23:46:08 -0400 Subject: Import quark sources Signed-off-by: Ian Moffett --- compiler/lexer/lexer.c | 282 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 282 insertions(+) create mode 100644 compiler/lexer/lexer.c (limited to 'compiler/lexer/lexer.c') diff --git a/compiler/lexer/lexer.c b/compiler/lexer/lexer.c new file mode 100644 index 0000000..06f7e89 --- /dev/null +++ b/compiler/lexer/lexer.c @@ -0,0 +1,282 @@ +/* + * Quark lexer (lexical analyzer). + * Turns source code into tokens. + * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team. + * Provided under the BSD 3-Clause license. + */ + +#include "debug.h" +#include "lexer.h" +#include "lexer/char_info.h" +#include "lexer/keywords.h" + +static void +skip_ignored(struct lexer *ctx) +{ + while (char_info[(int)*ctx->pos] & CHAR_WHITESPACE) { + if (char_info[(int)*ctx->pos] & CHAR_VERT_WS) { + ctx->line++; + ctx->line_start = ctx->pos + 1; + } + + ctx->pos++; + } +} + +static void +parse_num_hex(struct lexer *ctx, struct token *tok) +{ + ctx->pos += 2; + tok->value = 0; + while (char_info[(int)*ctx->pos] & CHAR_HEX) { + if (char_info[(int)*ctx->pos] == CHAR_XLOWER) { + tok->value |= *ctx->pos++ - 'a' + 0xa; + } else if (char_info[(int)*ctx->pos] == CHAR_XUPPER) { + tok->value |= *ctx->pos++ - 'A' + 0xa; + } else { + tok->value |= *ctx->pos++ - '0'; + } + } +} + +static void +parse_num_dec(struct lexer *ctx, struct token *tok) +{ + ctx->pos++; + tok->value = *ctx->pos - '0'; + while (char_info[(int)*ctx->pos] & CHAR_DIGIT) { + tok->value *= 10; + tok->value += *ctx->pos++ - '0'; + } +} + +static void +parse_num_bin(struct lexer *ctx, struct token *tok) +{ + ctx->pos += 2; + tok->value = 0; + while (*ctx->pos == '0' || *ctx->pos == '1') { + tok->value <<= 1; + tok->value |= *ctx->pos++ - '0'; + } +} + +static void +lex_ident(struct lexer *ctx, struct token *tok) +{ + /* Find end of identifier */ + ctx->pos++; + while (char_info[(int)*ctx->pos] & CHAR_ALNUM || *ctx->pos == '_') { + ctx->pos++; + } + + tok->len = (size_t)(ctx->pos - tok->pos); + tok->hash = hash_data(tok->pos, tok->len); + + /* Determine if this is a keyword or just an identifier */ + tok->kind = keywords_find(tok); + if (tok->kind == TK_UNKNOWN) { + tok->kind = TK_IDENTIFIER; + return; + } +} + +static void +lex_oper(struct lexer *ctx, struct token *tok) +{ + tok->len = 1; + + switch (*ctx->pos) { + case '+': + if (ctx->pos[1] == '+') { + tok->kind = TK_PLUS_PLUS; + tok->len = 2; + } else if (ctx->pos[1] == '=') { + tok->kind = TK_PLUS_EQUALS; + tok->len = 2; + } else { + tok->kind = TK_PLUS; + } + + break; + case '-': + if (ctx->pos[1] == '>') { + tok->kind = TK_ARROW; + tok->len = 2; + } if (ctx->pos[1] == '-') { + tok->kind = TK_MINUS_MINUS; + tok->len = 2; + } else if (ctx->pos[1] == '=') { + tok->kind = TK_MINUS_EQUALS; + tok->len = 2; + } else { + tok->kind = TK_MINUS; + } + + break; + case '<': + if (ctx->pos[1] == '=') { + tok->kind = TK_LESS_THAN_EQUALS; + tok->len = 2; + } else if (ctx->pos[1] == '<') { + if (ctx->pos[2] == '=') { + tok->kind = TK_SHIFT_LEFT_EQUALS; + tok->len = 3; + } else { + tok->kind = TK_SHIFT_LEFT; + tok->len = 2; + } + } else { + tok->kind = TK_LESS_THAN; + } + + break; + case '>': + if (ctx->pos[1] == '=') { + tok->kind = TK_GREATER_THAN_EQUALS; + tok->len = 2; + } else if (ctx->pos[1] == '>') { + if (ctx->pos[2] == '=') { + tok->kind = TK_SHIFT_RIGHT_EQUALS; + tok->len = 3; + } else { + tok->kind = TK_SHIFT_RIGHT; + tok->len = 2; + } + } else { + tok->kind = TK_GREATER_THAN; + } + + break; + default: + tok->kind = char_info[(int)*ctx->pos] >> CHAR_OPER_SHIFT; + if (ctx->pos[1] == '=') { + tok->kind++; + tok->len = 2; + } + + break; + } + + ctx->pos += tok->len; +} + +static void +lex_str(struct lexer *ctx, struct token *tok) +{ + /* Find end of string */ + ctx->pos++; + while (*ctx->pos != '"') { + if (*ctx->pos == '\\' && ctx->pos[1] == '\"') { + ctx->pos++; + } + + ctx->pos++; + } + ctx->pos++; + + tok->kind = TK_STRING; + tok->len = (size_t)(ctx->pos - tok->pos) - 1; +} + +static void +lex_char(struct lexer *ctx, struct token *tok) +{ + /* Find end of character */ + ctx->pos++; + while (*ctx->pos != '\'') { + if (*ctx->pos == '\\' && ctx->pos[1] == '\'') { + ctx->pos++; + } + + ctx->pos++; + } + ctx->pos++; + + tok->kind = TK_CHARACTER; + tok->len = (size_t)(ctx->pos - tok->pos) - 1; +} + +void +lexer_next(struct lexer *ctx, struct token *tok) +{ + if (tok == NULL) { + return; + } + + if (ctx == NULL) { + tok->kind = TK_UNKNOWN; + return; + } + + skip_ignored(ctx); + + /* Initialize token */ + tok->kind = TK_UNKNOWN; + tok->pos = ctx->pos; + tok->line = ctx->line; + tok->col = (int)(tok->pos - ctx->line_start) + 1; + + if (char_info[(int)*ctx->pos] & CHAR_ALPHA || *ctx->pos == '_') { + lex_ident(ctx, tok); + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_SINGLE) { + tok->kind = char_info[(int)*ctx->pos] >> CHAR_SINGLE_SHIFT; + tok->len = 1; + ctx->pos++; + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_OPER) { + lex_oper(ctx, tok); + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_DIGIT) { + tok->kind = TK_NUMBER; + + if (*ctx->pos == '0' && ctx->pos[1] == 'x') { + parse_num_hex(ctx, tok); + } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') { + parse_num_bin(ctx, tok); + } else { + parse_num_dec(ctx, tok); + } + + tok->len = (size_t)(ctx->pos - tok->pos); + return; + } + + if (*ctx->pos == '"') { + lex_str(ctx, tok); + return; + } + + if (*ctx->pos == '\'') { + lex_char(ctx, tok); + return; + } + + if (*ctx->pos == '\0') { + tok->kind = TK_EOF; + return; + } +} + +void +lexer_init(struct lexer *ctx, char *source) +{ + if (ctx == NULL || source == NULL) { + return; + } + + debug("Initializing lexer...\n"); + + ctx->pos = source; + ctx->line_start = ctx->pos; + ctx->line = 1; + + keywords_init(); +} -- cgit v1.2.3