/*
 * Quark lexer (lexical analyzer).
 * Turns source code into tokens.
 * Copyright (c) 2023-2024, Quinn Stephens and the OSMORA team.
 * Provided under the BSD 3-Clause license.
 */

#include "debug.h"
#include "lexer.h"
#include "lexer/char_info.h"
#include "lexer/keywords.h"

/*
 * Look up the classification flags of a source character.
 *
 * The character is converted to unsigned char first so that bytes
 * >= 0x80 cannot turn into a negative array index where plain char
 * is signed.
 * NOTE(review): assumes char_info has an entry for every unsigned
 * char value - confirm against lexer/char_info.h.
 */
#define LEX_CHAR_INFO(c) (char_info[(unsigned char)(c)])

/*
 * Advance past whitespace, keeping the line counter and the
 * start-of-line pointer up to date for column calculation.
 */
static void skip_ignored(struct lexer *ctx)
{
    while (LEX_CHAR_INFO(*ctx->pos) & CHAR_WHITESPACE) {
        if (LEX_CHAR_INFO(*ctx->pos) & CHAR_VERT_WS) {
            ctx->line++;
            ctx->line_start = ctx->pos + 1;
        }

        ctx->pos++;
    }
}

/*
 * Parse a hexadecimal literal. On entry ctx->pos points at the
 * leading "0x"; on exit it points just past the last hex digit and
 * tok->value holds the parsed value.
 */
static void parse_num_hex(struct lexer *ctx, struct token *tok)
{
    /* Skip the "0x" prefix */
    ctx->pos += 2;
    tok->value = 0;

    while (LEX_CHAR_INFO(*ctx->pos) & CHAR_HEX) {
        char c = *ctx->pos++;
        unsigned int digit;

        if (c >= 'a' && c <= 'f') {
            digit = (unsigned int)(c - 'a') + 0xa;
        } else if (c >= 'A' && c <= 'F') {
            digit = (unsigned int)(c - 'A') + 0xa;
        } else {
            digit = (unsigned int)(c - '0');
        }

        /* Make room for the new nibble before merging it in */
        tok->value <<= 4;
        tok->value |= digit;
    }
}

/*
 * Parse a decimal literal. On entry ctx->pos points at the first
 * digit; on exit it points just past the last digit and tok->value
 * holds the parsed value.
 */
static void parse_num_dec(struct lexer *ctx, struct token *tok)
{
    tok->value = 0;

    while (LEX_CHAR_INFO(*ctx->pos) & CHAR_DIGIT) {
        tok->value *= 10;
        tok->value += *ctx->pos++ - '0';
    }
}

/*
 * Parse a binary literal. On entry ctx->pos points at the leading
 * "0b"; on exit it points just past the last binary digit and
 * tok->value holds the parsed value.
 */
static void parse_num_bin(struct lexer *ctx, struct token *tok)
{
    /* Skip the "0b" prefix */
    ctx->pos += 2;
    tok->value = 0;

    while (*ctx->pos == '0' || *ctx->pos == '1') {
        tok->value <<= 1;
        tok->value |= *ctx->pos++ - '0';
    }
}

/*
 * Lex an identifier or keyword starting at tok->pos. Sets tok->len,
 * tok->hash and tok->kind (the keyword's kind, or TK_IDENTIFIER when
 * keywords_find() reports TK_UNKNOWN).
 */
static void lex_ident(struct lexer *ctx, struct token *tok)
{
    /* Find end of identifier; the first character was checked by the caller */
    ctx->pos++;
    while ((LEX_CHAR_INFO(*ctx->pos) & CHAR_ALNUM) || *ctx->pos == '_') {
        ctx->pos++;
    }

    tok->len = (size_t)(ctx->pos - tok->pos);
    tok->hash = hash_data(tok->pos, tok->len);

    /* Determine if this is a keyword or just an identifier */
    tok->kind = keywords_find(tok);
    if (tok->kind == TK_UNKNOWN) {
        tok->kind = TK_IDENTIFIER;
    }
}

/*
 * Lex an operator of one to three characters starting at ctx->pos.
 * Sets tok->kind and tok->len and advances ctx->pos past the operator.
 */
static void lex_oper(struct lexer *ctx, struct token *tok)
{
    const char next = ctx->pos[1];

    tok->len = 1;

    switch (*ctx->pos) {
    case '+':
        if (next == '+') {
            tok->kind = TK_PLUS_PLUS;
            tok->len = 2;
        } else if (next == '=') {
            tok->kind = TK_PLUS_EQUALS;
            tok->len = 2;
        } else {
            tok->kind = TK_PLUS;
        }
        break;

    case '-':
        /* All four alternatives are mutually exclusive */
        if (next == '>') {
            tok->kind = TK_ARROW;
            tok->len = 2;
        } else if (next == '-') {
            tok->kind = TK_MINUS_MINUS;
            tok->len = 2;
        } else if (next == '=') {
            tok->kind = TK_MINUS_EQUALS;
            tok->len = 2;
        } else {
            tok->kind = TK_MINUS;
        }
        break;

    case '<':
        if (next == '=') {
            tok->kind = TK_LESS_THAN_EQUALS;
            tok->len = 2;
        } else if (next == '<') {
            /* next is not NUL here, so reading pos[2] stays in bounds */
            if (ctx->pos[2] == '=') {
                tok->kind = TK_SHIFT_LEFT_EQUALS;
                tok->len = 3;
            } else {
                tok->kind = TK_SHIFT_LEFT;
                tok->len = 2;
            }
        } else {
            tok->kind = TK_LESS_THAN;
        }
        break;

    case '>':
        if (next == '=') {
            tok->kind = TK_GREATER_THAN_EQUALS;
            tok->len = 2;
        } else if (next == '>') {
            if (ctx->pos[2] == '=') {
                tok->kind = TK_SHIFT_RIGHT_EQUALS;
                tok->len = 3;
            } else {
                tok->kind = TK_SHIFT_RIGHT;
                tok->len = 2;
            }
        } else {
            tok->kind = TK_GREATER_THAN;
        }
        break;

    default:
        /*
         * The base token kind is stored in the upper bits of the
         * character's table entry. A trailing '=' selects the kind
         * value immediately after it.
         * NOTE(review): relies on each "<op>=" kind directly following
         * its "<op>" kind in the token enum - confirm.
         */
        tok->kind = LEX_CHAR_INFO(*ctx->pos) >> CHAR_OPER_SHIFT;
        if (next == '=') {
            tok->kind++;
            tok->len = 2;
        }
        break;
    }

    ctx->pos += tok->len;
}

/*
 * Scan a quoted literal delimited by `quote`. On entry ctx->pos
 * points at the opening quote. tok->len is set to the number of
 * characters from the opening quote up to (not including) the closing
 * quote. On exit ctx->pos is just past the closing quote, or at the
 * terminating NUL if the literal is unterminated.
 */
static void lex_quoted(struct lexer *ctx, struct token *tok, char quote)
{
    /* Skip the opening quote */
    ctx->pos++;

    while (*ctx->pos != quote && *ctx->pos != '\0') {
        /*
         * A backslash escapes whatever follows it, so an escaped quote
         * or an escaped backslash can never end the literal early.
         */
        if (*ctx->pos == '\\' && ctx->pos[1] != '\0') {
            ctx->pos++;
        }

        ctx->pos++;
    }

    tok->len = (size_t)(ctx->pos - tok->pos);

    /* Consume the closing quote, but never step over the end of input */
    if (*ctx->pos == quote) {
        ctx->pos++;
    }
}

/*
 * Lex a double-quoted string literal starting at ctx->pos.
 */
static void lex_str(struct lexer *ctx, struct token *tok)
{
    lex_quoted(ctx, tok, '"');
    tok->kind = TK_STRING;
}

/*
 * Lex a single-quoted character literal starting at ctx->pos.
 */
static void lex_char(struct lexer *ctx, struct token *tok)
{
    lex_quoted(ctx, tok, '\'');
    tok->kind = TK_CHARACTER;
}

/*
 * Produce the next token from the source.
 *
 * ctx: lexer state; if NULL, tok->kind is set to TK_UNKNOWN and
 *      nothing else is touched.
 * tok: receives the token; if NULL, the call does nothing.
 *
 * Leading whitespace is skipped. At end of input the token kind is
 * TK_EOF with length 0. A character that starts no known token is
 * returned as a one-character TK_UNKNOWN token and consumed, so
 * repeated calls always make progress.
 */
void lexer_next(struct lexer *ctx, struct token *tok)
{
    if (tok == NULL) {
        return;
    }

    if (ctx == NULL) {
        tok->kind = TK_UNKNOWN;
        return;
    }

    skip_ignored(ctx);

    /* Initialize token */
    tok->kind = TK_UNKNOWN;
    tok->pos = ctx->pos;
    tok->len = 0;
    tok->line = ctx->line;
    tok->col = (int)(tok->pos - ctx->line_start) + 1;

    if ((LEX_CHAR_INFO(*ctx->pos) & CHAR_ALPHA) || *ctx->pos == '_') {
        lex_ident(ctx, tok);
        return;
    }

    if (LEX_CHAR_INFO(*ctx->pos) & CHAR_SINGLE) {
        /* Single-character tokens carry their kind in the table entry */
        tok->kind = LEX_CHAR_INFO(*ctx->pos) >> CHAR_SINGLE_SHIFT;
        tok->len = 1;
        ctx->pos++;
        return;
    }

    if (LEX_CHAR_INFO(*ctx->pos) & CHAR_OPER) {
        lex_oper(ctx, tok);
        return;
    }

    if (LEX_CHAR_INFO(*ctx->pos) & CHAR_DIGIT) {
        tok->kind = TK_NUMBER;

        if (*ctx->pos == '0' && ctx->pos[1] == 'x') {
            parse_num_hex(ctx, tok);
        } else if (*ctx->pos == '0' && ctx->pos[1] == 'b') {
            parse_num_bin(ctx, tok);
        } else {
            parse_num_dec(ctx, tok);
        }

        tok->len = (size_t)(ctx->pos - tok->pos);
        return;
    }

    if (*ctx->pos == '"') {
        lex_str(ctx, tok);
        return;
    }

    if (*ctx->pos == '\'') {
        lex_char(ctx, tok);
        return;
    }

    if (*ctx->pos == '\0') {
        tok->kind = TK_EOF;
        return;
    }

    /* Unrecognized character: hand it back as TK_UNKNOWN and move on */
    tok->len = 1;
    ctx->pos++;
}

/*
 * Prepare a lexer to read from a NUL-terminated source buffer and
 * initialize the keyword table. Does nothing if either argument is
 * NULL.
 */
void lexer_init(struct lexer *ctx, char *source)
{
    if (ctx == NULL || source == NULL) {
        return;
    }

    debug("Initializing lexer...\n");

    ctx->pos = source;
    ctx->line_start = ctx->pos;
    ctx->line = 1;

    keywords_init();
}