diff options
Diffstat (limited to 'compiler/lexer')
-rw-r--r-- | compiler/lexer/char_info.c | 106 | ||||
-rw-r--r-- | compiler/lexer/keywords.c | 73 | ||||
-rw-r--r-- | compiler/lexer/lexer.c | 282 |
3 files changed, 461 insertions, 0 deletions
diff --git a/compiler/lexer/char_info.c b/compiler/lexer/char_info.c new file mode 100644 index 0000000..97ecb7d --- /dev/null +++ b/compiler/lexer/char_info.c @@ -0,0 +1,106 @@ +/* + * Character info table. + * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team. + * Provided under the BSD 3-Clause license. + */ + +#include "lexer/char_info.h" + +uint16_t char_info[256] = { + /* + NUL SOH STX ETX + EOT ENQ ACK BEL + */ + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + /* + BS TAB LF VT + FF CR SO SI + */ + 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_VERT_WS, + CHAR_VERT_WS, CHAR_HORZ_WS, 0 , 0 , + /* + DLE DC1 DC2 DC3 + DC4 NAK SYN ETB + */ + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + /* + CAN EM SUB ESC + FS GS RS US + */ + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , + /* + ! " # + $ % & ' + */ + CHAR_HORZ_WS, CHAR_EXCLAIM, 0 , 0 , + 0 , CHAR_PERCENT, CHAR_AMPER , 0 , + /* + ( ) * + + , - . / + */ + CHAR_LPAREN , CHAR_RPAREN , CHAR_STAR , CHAR_PLUS , + CHAR_COMMA , CHAR_MINUS , CHAR_DOT , CHAR_SLASH , + /* + 0 1 2 3 + 4 5 6 7 + */ + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , CHAR_DIGIT , + /* + 8 9 : ; + < = > ? + */ + CHAR_DIGIT , CHAR_DIGIT , CHAR_COLON , CHAR_SEMI , + CHAR_LESS , CHAR_EQUALS , CHAR_GREATER, 0 , + /* + @ A B C + D E F G + */ + 0 , CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , + CHAR_XUPPER , CHAR_XUPPER , CHAR_XUPPER , CHAR_UPPER , + /* + H I J K + L M N O + */ + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + /* + P Q R S + T U V W + */ + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , + /* + X Y Z [ + \ ] ^ _ + */ + CHAR_UPPER , CHAR_UPPER , CHAR_UPPER , CHAR_LBRACK , + 0 , CHAR_RBRACK , CHAR_CARET , 0 , + /* + ` a b c + d e f g + */ + 0 , CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , + CHAR_XLOWER , CHAR_XLOWER , CHAR_XLOWER , CHAR_LOWER , + /* + h i j k + l m n o + */ + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + /* + p q r s + t u v w + */ + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , + /* + x y z { + | } ~ + */ + CHAR_LOWER , CHAR_LOWER , CHAR_LOWER , CHAR_LBRACE , + CHAR_PIPE , CHAR_RBRACE , CHAR_TILDE , 0 , +}; diff --git a/compiler/lexer/keywords.c b/compiler/lexer/keywords.c new file mode 100644 index 0000000..0e95048 --- /dev/null +++ b/compiler/lexer/keywords.c @@ -0,0 +1,73 @@ +/* + * Keyword hashmap. + * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team. + * Provided under the BSD 3-Clause license. + */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include "hash.h" +#include "hashmap.h" +#include "debug.h" +#include "lexer/keywords.h" + +#define HASHMAP_ROWS 8 + +struct keyword { + struct hashmap_entry hashmap_entry; + size_t len; + token_kind_t value; +}; + +static bool initialized = false; +static struct list keywords_rows[HASHMAP_ROWS]; +static struct hashmap keywords; + +static void +add_keyword(char *name, token_kind_t value) +{ + size_t len; + struct keyword *kwd; + + len = strlen(name); + kwd = malloc(sizeof(struct keyword)); + kwd->hashmap_entry.hash = hash_data(name, len); + kwd->len = len; + kwd->value = value; + + hashmap_add(&keywords, &kwd->hashmap_entry); +} + +token_kind_t +keywords_find(struct token *tok) +{ + struct keyword *kwd; + + kwd = (struct keyword*)hashmap_find(&keywords, tok->hash); + if (kwd == NULL || kwd->len != tok->len) { + return TK_UNKNOWN; + } + + return kwd->value; +} + +void +keywords_init(void) +{ + if (initialized) { + return; + } + + debug("Initializing keywords...\n"); + + keywords.rows = keywords_rows; + keywords.n_rows = HASHMAP_ROWS; + hashmap_init(&keywords); + + add_keyword("type", TK_TYPE); + add_keyword("enum", TK_ENUM); + add_keyword("struct", TK_STRUCT); + + initialized = true; +} diff --git a/compiler/lexer/lexer.c b/compiler/lexer/lexer.c new file mode 100644 index 0000000..06f7e89 --- /dev/null +++ b/compiler/lexer/lexer.c @@ -0,0 +1,282 @@ +/* + * Quark lexer (lexical analyzer). + * Turns source code into tokens. + * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team. + * Provided under the BSD 3-Clause license. + */ + +#include "debug.h" +#include "lexer.h" +#include "lexer/char_info.h" +#include "lexer/keywords.h" + +static void +skip_ignored(struct lexer *ctx) +{ + while (char_info[(int)*ctx->pos] & CHAR_WHITESPACE) { + if (char_info[(int)*ctx->pos] & CHAR_VERT_WS) { + ctx->line++; + ctx->line_start = ctx->pos + 1; + } + + ctx->pos++; + } +} + +static void +parse_num_hex(struct lexer *ctx, struct token *tok) +{ + ctx->pos += 2; + tok->value = 0; + while (char_info[(int)*ctx->pos] & CHAR_HEX) { + if (char_info[(int)*ctx->pos] == CHAR_XLOWER) { + tok->value |= *ctx->pos++ - 'a' + 0xa; + } else if (char_info[(int)*ctx->pos] == CHAR_XUPPER) { + tok->value |= *ctx->pos++ - 'A' + 0xa; + } else { + tok->value |= *ctx->pos++ - '0'; + } + } +} + +static void +parse_num_dec(struct lexer *ctx, struct token *tok) +{ + ctx->pos++; + tok->value = *ctx->pos - '0'; + while (char_info[(int)*ctx->pos] & CHAR_DIGIT) { + tok->value *= 10; + tok->value += *ctx->pos++ - '0'; + } +} + +static void +parse_num_bin(struct lexer *ctx, struct token *tok) +{ + ctx->pos += 2; + tok->value = 0; + while (*ctx->pos == '0' || *ctx->pos == '1') { + tok->value <<= 1; + tok->value |= *ctx->pos++ - '0'; + } +} + +static void +lex_ident(struct lexer *ctx, struct token *tok) +{ + /* Find end of identifier */ + ctx->pos++; + while (char_info[(int)*ctx->pos] & CHAR_ALNUM || *ctx->pos == '_') { + ctx->pos++; + } + + tok->len = (size_t)(ctx->pos - tok->pos); + tok->hash = hash_data(tok->pos, tok->len); + + /* Determine if this is a keyword or just an identifier */ + tok->kind = keywords_find(tok); + if (tok->kind == TK_UNKNOWN) { + tok->kind = TK_IDENTIFIER; + return; + } +} + +static void +lex_oper(struct lexer *ctx, struct token *tok) +{ + tok->len = 1; + + switch (*ctx->pos) { + case '+': + if (ctx->pos[1] == '+') { + tok->kind = TK_PLUS_PLUS; + tok->len = 2; + } else if (ctx->pos[1] == '=') { + tok->kind = TK_PLUS_EQUALS; + tok->len = 2; + } else { + tok->kind = TK_PLUS; + } + + break; + case '-': + if (ctx->pos[1] == '>') { + tok->kind = TK_ARROW; + tok->len = 2; + } if (ctx->pos[1] == '-') { + tok->kind = TK_MINUS_MINUS; + tok->len = 2; + } else if (ctx->pos[1] == '=') { + tok->kind = TK_MINUS_EQUALS; + tok->len = 2; + } else { + tok->kind = TK_MINUS; + } + + break; + case '<': + if (ctx->pos[1] == '=') { + tok->kind = TK_LESS_THAN_EQUALS; + tok->len = 2; + } else if (ctx->pos[1] == '<') { + if (ctx->pos[2] == '=') { + tok->kind = TK_SHIFT_LEFT_EQUALS; + tok->len = 3; + } else { + tok->kind = TK_SHIFT_LEFT; + tok->len = 2; + } + } else { + tok->kind = TK_LESS_THAN; + } + + break; + case '>': + if (ctx->pos[1] == '=') { + tok->kind = TK_GREATER_THAN_EQUALS; + tok->len = 2; + } else if (ctx->pos[1] == '>') { + if (ctx->pos[2] == '=') { + tok->kind = TK_SHIFT_RIGHT_EQUALS; + tok->len = 3; + } else { + tok->kind = TK_SHIFT_RIGHT; + tok->len = 2; + } + } else { + tok->kind = TK_GREATER_THAN; + } + + break; + default: + tok->kind = char_info[(int)*ctx->pos] >> CHAR_OPER_SHIFT; + if (ctx->pos[1] == '=') { + tok->kind++; + tok->len = 2; + } + + break; + } + + ctx->pos += tok->len; +} + +static void +lex_str(struct lexer *ctx, struct token *tok) +{ + /* Find end of string */ + ctx->pos++; + while (*ctx->pos != '"') { + if (*ctx->pos == '\\' && ctx->pos[1] == '\"') { + ctx->pos++; + } + + ctx->pos++; + } + ctx->pos++; + + tok->kind = TK_STRING; + tok->len = (size_t)(ctx->pos - tok->pos) - 1; +} + +static void +lex_char(struct lexer *ctx, struct token *tok) +{ + /* Find end of character */ + ctx->pos++; + while (*ctx->pos != '\'') { + if (*ctx->pos == '\\' && ctx->pos[1] == '\'') { + ctx->pos++; + } + + ctx->pos++; + } + ctx->pos++; + + tok->kind = TK_CHARACTER; + tok->len = (size_t)(ctx->pos - tok->pos) - 1; +} + +void +lexer_next(struct lexer *ctx, struct token *tok) +{ + if (tok == NULL) { + return; + } + + if (ctx == NULL) { + tok->kind = TK_UNKNOWN; + return; + } + + skip_ignored(ctx); + + /* Initialize token */ + tok->kind = TK_UNKNOWN; + tok->pos = ctx->pos; + tok->line = ctx->line; + tok->col = (int)(tok->pos - ctx->line_start) + 1; + + if (char_info[(int)*ctx->pos] & CHAR_ALPHA || *ctx->pos == '_') { + lex_ident(ctx, tok); + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_SINGLE) { + tok->kind = char_info[(int)*ctx->pos] >> CHAR_SINGLE_SHIFT; + tok->len = 1; + ctx->pos++; + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_OPER) { + lex_oper(ctx, tok); + return; + } + + if (char_info[(int)*ctx->pos] & CHAR_DIGIT) { + tok->kind = TK_NUMBER; + + if (*ctx->pos == '0' && ctx->pos[1] == 'x') { + parse_num_hex(ctx, tok); + } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') { + parse_num_bin(ctx, tok); + } else { + parse_num_dec(ctx, tok); + } + + tok->len = (size_t)(ctx->pos - tok->pos); + return; + } + + if (*ctx->pos == '"') { + lex_str(ctx, tok); + return; + } + + if (*ctx->pos == '\'') { + lex_char(ctx, tok); + return; + } + + if (*ctx->pos == '\0') { + tok->kind = TK_EOF; + return; + } +} + +void +lexer_init(struct lexer *ctx, char *source) +{ + if (ctx == NULL || source == NULL) { + return; + } + + debug("Initializing lexer...\n"); + + ctx->pos = source; + ctx->line_start = ctx->pos; + ctx->line = 1; + + keywords_init(); +} |