/* * Copyright (c) 2025 Ian Marco Moffett and L5 engineers * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Description: Pirho compiler lexer * Author: Ian Marco Moffett */ #include #include #include #include #include #include #define pr_trace(fmt, ...) printf("pirho.lex: " fmt, ##__VA_ARGS__) #define pr_error(fmt, ...) printf("pirho.lex: error: " fmt, ##__VA_ARGS__) /* Max identifier length */ #define MAX_ID_LEN 32 /* Just some helpers */ #define to_lower(c) ((c) | 0x20) #define is_alpha(c) (to_lower(c) >= 'a' && to_lower(c) <= 'z') #define is_num(c) ((c) >= '0' && (c) <= '9') #define is_space(c) ((c) == ' ' || (c) == '\t' || (c) == '\f' || (c) == '\n') /* * Pop a character from the input work * * @work: Input work to pop from */ static char lex_pop(struct np_work *work) { struct lexer_state *lex_st; char c; if (work == NULL) { return '\0'; } /* * First we check the cache, if there is a char, * grab and clear it. */ if (work->ccache != '\0') { c = work->ccache; work->ccache = '\0'; return c; } /* Don't overflow the source file */ lex_st = &work->lex_st; if (lex_st->source_idx >= work->source_size) { return '\0'; } return work->source[lex_st->source_idx++]; } /* * Compare a token with existing integer types (used internally) */ static void lex_cmp_itype(const char *tokstr, struct lex_token *res) { switch (*tokstr) { case 'u': if (strcmp(tokstr, TOKEN_U8) == 0) { res->token = TT_U8; break; } if (strcmp(tokstr, TOKEN_U16) == 0) { res->token = TT_U16; break; } if (strcmp(tokstr, TOKEN_U32) == 0) { res->token = TT_U32; break; } if (strcmp(tokstr, TOKEN_U64) == 0) { res->token = TT_U64; break; } break; case 'i': if (strcmp(tokstr, TOKEN_I8) == 0) { res->token = TT_I8; break; } if (strcmp(tokstr, TOKEN_I16) == 0) { res->token = TT_I16; break; } if (strcmp(tokstr, TOKEN_I32) == 0) { res->token = TT_I32; break; } if (strcmp(tokstr, TOKEN_I64) == 0) { res->token = TT_I64; break; } break; } } /* * Compare a token with existing tokens (used internally) */ static int lex_cmptok(char *tokstr, struct lex_token *res) { switch (*tokstr) { case 'r': if (strcmp(tokstr, TOKEN_RETURN) == 0) { res->token = TT_RETURN; } return 0; case 'b': if (strcmp(tokstr, TOKEN_BEGIN) == 0) { res->token = TT_BEGIN; } return 0; case 'e': if (strcmp(tokstr, TOKEN_END) == 0) { res->token = TT_END; } return 0; case 'p': if (strcmp(tokstr, TOKEN_PROC) == 0) { res->token = TT_PROC; } return 0; case 'u': case 'i': lex_cmp_itype(tokstr, res); return 0; } res->token = TT_IDENT; return 0; } /* * Match a token with a set of known tokens * * @work: Input work * @c: Current character * @res: Token result * * Returns zero on success */ static int lex_matchstr(struct np_work *work, char c, struct lex_token *res) { char id[MAX_ID_LEN + 1]; size_t id_idx = 0; int error; if (work == NULL || res == NULL) { return -EINVAL; } /* Grab the identifier */ do { if (id_idx >= sizeof(id) - 1) { pr_error("identifier too long!\n"); return -1; } if (!is_alpha(c) && !is_num(c)) { work->ccache = c; break; } id[id_idx++] = c; } while ((c = lex_pop(work)) != 0); id[id_idx] = '\0'; res->val_str = ptrbox_strdup(id, work->work_mem); /* Match the tokens */ error = lex_cmptok(id, res); if (error < 0) { pr_error("invalid indentifier '%s'\n", id); } return error; } /* * Scan arithmetic operators and return the token type * (tt_t) on success * * @work: Input work * @c: Character to check * * Returns a less than zero value on failure */ static int lex_arithop(struct np_work *work, char c, struct lex_token *res) { switch (c) { case '*': res->token = TT_STAR; break; case '-': res->token = TT_MINUS; break; case '+': res->token = TT_PLUS; break; case '/': res->token = TT_SLASH; break; default: return -1; } return res->token; } /* * Scan compare operators and return the token type * (tt_t) on success * * @work: Input work * @c: Character to check * * Returns a less than zero value on failure */ static int lex_cmpop(struct np_work *work, char c, struct lex_token *res) { switch (c) { case '>': res->token = TT_GT; break; case '<': res->token = TT_LT; break; default: return -1; } return res->token; } /* * Parse a number and get a token value * * @work: Input work * @c: First character [digit] * @res: Result */ static int lex_nomnum(struct np_work *work, char c, struct lex_token *res) { uint64_t num = 0; if (work == NULL || res == NULL) { return -EINVAL; } while (is_num(c)) { num = num * 10 + (c - '0'); c = lex_pop(work); } res->token = TT_NUMBER; res->val = num; return 0; } /* * Nom a token */ int lex_nom(struct np_work *work, struct lex_token *res) { struct lexer_state *lex_st; int error = 0; char c; if (work == NULL || res == NULL) { return -EINVAL; } lex_st = &work->lex_st; res->token = TT_NONE; /* Skip all whitespace */ while ((c = lex_pop(work)) != 0) { if (c == '\n') { ++work->line_no; } if (is_space(c)) { continue; } break; } /* Match the token type */ switch (c) { case '\0': return LEX_EOF; case '(': res->token = TT_LPAREN; break; case ')': res->token = TT_RPAREN; break; case ',': res->token = TT_COMMA; break; case '=': res->token = TT_EQUALS; break; case '[': res->token = TT_LBRACK; break; case ']': res->token = TT_RBRACK; break; case ':': res->token = TT_COLON; break; default: if (is_num(c)) { lex_nomnum(work, c, res); break; } if (lex_arithop(work, c, res) >= 0) break; if (lex_cmpop(work, c, res) >= 0) break; /* Stuff like '1var_name' is invalid */ if (!is_alpha(c)) { pr_error("unexpected token '%c'\n", c); return -1; } error = lex_matchstr(work, c, res); if (error == 0) { break; } error = -1; break; } return error; } int lex_init(struct lexer_state *state, struct np_work *work) { if (state == NULL) { return -EINVAL; } memset(state, 0, sizeof(*state)); state->work = work; return 0; }