/* src/lexer.c (blob 623dd391c4d08bfa19078b3e62db87232b1c9a9f) */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include <lcq/pit/utils.h>
#include <lcq/pit/lexer.h>
#include <lcq/pit/types.h>

/*
 * Printable names for lexer token kinds, indexed by token value and read by
 * pit_lex_token_name().
 *
 * The initializers are positional; the bracketed comment on each line records
 * which enumerator that slot is meant to describe. The enum declaration is not
 * visible in this file, so the order below has to be kept in step with it by
 * hand.
 *
 * NOTE(review): pit_lex_next() can return PIT_LEX_TOKEN_ERROR, which has no
 * entry listed here. If that enumerator is below PIT_LEX_TOKEN__SENTINEL its
 * slot is left NULL by the partial initializer -- confirm against the enum.
 */
const char *PIT_LEX_TOKEN_NAMES[PIT_LEX_TOKEN__SENTINEL] = {
    /* [PIT_LEX_TOKEN_EOF] = */ "eof",
    /* [PIT_LEX_TOKEN_LPAREN] = */ "lparen",
    /* [PIT_LEX_TOKEN_RPAREN] = */ "rparen",
    /* [PIT_LEX_TOKEN_LSQUARE] = */ "lsquare",
    /* [PIT_LEX_TOKEN_RSQUARE] = */ "rsquare",
    /* [PIT_LEX_TOKEN_DOT] = */ "dot",
    /* [PIT_LEX_TOKEN_QUOTE] = */ "quote",
    /* [PIT_LEX_TOKEN_INTEGER_LITERAL] = */ "integer_literal",
    /* [PIT_LEX_TOKEN_STRING_LITERAL] = */ "string_literal",
    /* [PIT_LEX_TOKEN_SYMBOL] = */ "symbol",
};

/*
 * Return the printable name of token kind `t`.
 *
 * The lookup is bounds-checked against the size of PIT_LEX_TOKEN_NAMES, and
 * slots of that table that were never given a name are detected, so the
 * result is always a valid string: values without a name yield "unknown"
 * instead of an out-of-bounds read or a NULL pointer.
 */
const char *pit_lex_token_name(pit_lex_token t) {
    const size_t count = sizeof PIT_LEX_TOKEN_NAMES / sizeof PIT_LEX_TOKEN_NAMES[0];
    const char *name = NULL;

    if ((int) t >= 0 && (size_t) t < count) {
        name = PIT_LEX_TOKEN_NAMES[t];
    }
    return name != NULL ? name : "unknown";
}

/* True when `st` is non-NULL and its read cursor (`end`) is still before `len`. */
static bool is_more_input(pit_lexer *st) {
    if (!st) {
        return false;
    }
    return st->end < st->len;
}

/*
 * Return 1 if `c` may continue a symbol, 0 otherwise.
 *
 * A symbol character is any printable, non-whitespace character other than
 * the delimiters ( ) . ' and ".
 *
 * The lexer hands this function plain `char` values widened to int, which can
 * be negative for bytes >= 0x80. <ctype.h> classification is only defined for
 * unsigned char values (and EOF), so the value is normalised before use.
 *
 * NOTE(review): '[', ']' and ';' are not excluded here even though
 * pit_lex_next() treats them specially at the start of a token, so a symbol
 * may contain them -- confirm that this is intended.
 */
static int is_symchar(int c) {
    const int uc = (unsigned char) c;

    switch (c) {
    case '(':
    case ')':
    case '.':
    case '\'':
    case '"':
        return 0;
    default:
        break;
    }
    return isprint(uc) && !isspace(uc);
}

/*
 * Return 1 if `c` is one of 0-9, a-f or A-F, and 0 otherwise.
 *
 * Written with explicit range checks instead of isdigit() so that negative
 * values (a plain `char` byte >= 0x80 widened to int) are handled without
 * relying on <ctype.h>, whose functions are undefined for such arguments.
 */
static int is_hexdigit(int c) {
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'a' && c <= 'f') {
        return 1;
    }
    if (c >= 'A' && c <= 'F') {
        return 1;
    }
    return 0;
}


/* Return the character under the cursor without consuming it; 0 when no input remains. */
static char peek(pit_lexer *st) {
    return is_more_input(st) ? st->input[st->end] : 0;
}

/*
 * Consume and return the character under the cursor, or return 0 without
 * changing any state when no input remains.
 *
 * Position tracking: a newline bumps `line` and resets `column` to 0; any
 * other character moves `column` forward by one.
 */
static char advance(pit_lexer *st) {
    if (!is_more_input(st)) {
        return 0;
    }

    const char consumed = st->input[st->end];
    st->end += 1;

    if (consumed != '\n') {
        st->column += 1;
        return consumed;
    }

    st->line += 1;
    st->column = 0;
    return consumed;
}

/*
 * Consume the next character only if predicate `f` accepts it.
 * Returns true when a character was consumed, false when the cursor was left alone.
 */
static bool match(pit_lexer *st, int (*f)(int)) {
    const int accepted = f(peek(st));
    if (!accepted) {
        return false;
    }
    advance(st);
    return true;
}

/*
 * Initialise `ret` to lex the NUL-terminated string `buf`.
 * The input length is taken from strlen(buf); the buffer is referenced, not copied.
 * The cursor starts at offset 0, line 1, column 0, with no error recorded.
 */
void pit_lex_cstr(pit_lexer *ret, char *buf) {
    const i64 length = (i64) strlen(buf);

    ret->input = buf;
    ret->len = length;

    /* token start and cursor both begin at the first byte */
    ret->start = 0;
    ret->end = 0;

    ret->start_line = 1;
    ret->line = 1;
    ret->start_column = 0;
    ret->column = 0;

    ret->error = NULL;
}

/*
 * Initialise `ret` to lex `len` bytes starting at `buf`.
 * The buffer is referenced, not copied. The cursor starts at offset 0,
 * line 1, column 0, with no error recorded.
 */
void pit_lex_bytes(pit_lexer *ret, char *buf, i64 len) {
    ret->input = buf;
    ret->len = len;

    /* token start and cursor both begin at the first byte */
    ret->start = 0;
    ret->end = 0;

    ret->start_line = 1;
    ret->line = 1;
    ret->start_column = 0;
    ret->column = 0;

    ret->error = NULL;
}
/*
 * Initialise `ret` to lex the entire contents of the file at `path`.
 *
 * The file is read into a freshly allocated buffer which is then handed to
 * pit_lex_bytes(); this function does not free that buffer on success. One
 * extra zero byte is allocated past the contents so the buffer is never
 * zero-sized and is NUL-terminated.
 *
 * On any failure (open, seek/tell, allocation, short read) every resource
 * acquired so far is released, pit_panic() is called with a message naming
 * the path, and `ret` is left untouched.
 */
void pit_lex_file(pit_lexer *ret, char *path) {
    /* Binary mode: the size reported by ftell() must equal the number of
       bytes fread() delivers, which text mode does not guarantee. */
    FILE *f = fopen(path, "rb");
    if (f == NULL) {
        pit_panic("failed to open file for lexing: %s", path);
        return;
    }

    long size = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        size = ftell(f);
    }
    if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
        fclose(f);
        pit_panic("failed to read file for lexing: %s", path);
        return;
    }

    const size_t nbytes = (size_t) size;
    char *buf = calloc(nbytes + 1, sizeof(char));
    if (buf == NULL) {
        fclose(f);
        pit_panic("failed to allocate buffer for lexing: %s", path);
        return;
    }

    const size_t nread = fread(buf, sizeof(char), nbytes, f);
    fclose(f);
    if (nread != nbytes) {
        free(buf);
        pit_panic("failed to read file for lexing: %s", path);
        return;
    }

    pit_lex_bytes(ret, buf, (i64) size);
}

/*
 * Scan and return the next token from `st`.
 *
 * On entry the token start markers (`start`, `start_line`, `start_column`)
 * are set to the current cursor; on return `end` is one past the last
 * character of the token. Whitespace and `;` comments (which run to the end
 * of the line) are skipped by restarting the scan.
 *
 * Returns:
 *   - PIT_LEX_TOKEN_EOF when advance() yields 0 (no input left, or a NUL byte);
 *   - a punctuation token for ( ) [ ] . and ';
 *   - PIT_LEX_TOKEN_STRING_LITERAL for a double-quoted string, where a
 *     backslash causes the following character to be skipped;
 *   - PIT_LEX_TOKEN_INTEGER_LITERAL for a token starting with a digit;
 *   - PIT_LEX_TOKEN_SYMBOL for anything else;
 *   - PIT_LEX_TOKEN_ERROR, with `st->error` set, for an unterminated string.
 */
pit_lex_token pit_lex_next(pit_lexer *st) {
restart:
    st->start = st->end;
    st->start_line = st->line;
    st->start_column = st->column;
    char c = advance(st);
    switch (c) {
    case 0: return PIT_LEX_TOKEN_EOF;
    case ';': while (is_more_input(st) && advance(st) != '\n'); goto restart;
    case '(': return PIT_LEX_TOKEN_LPAREN;
    case ')': return PIT_LEX_TOKEN_RPAREN;
    case '[': return PIT_LEX_TOKEN_LSQUARE;
    case ']': return PIT_LEX_TOKEN_RSQUARE;
    case '.': return PIT_LEX_TOKEN_DOT;
    case '\'': return PIT_LEX_TOKEN_QUOTE;
    case '"':
        while (peek(st) != '"') {
            if (peek(st) == '\\') advance(st); /* skip escaped characters */
            if (!advance(st)) {
                /* advance() returned 0 before the closing quote was seen */
                st->error = "unterminated string";
                return PIT_LEX_TOKEN_ERROR;
            }
        }
        advance(st); /* consume the closing quote */
        return PIT_LEX_TOKEN_STRING_LITERAL;
    default: {
        /* `c` is a plain char and may be negative for bytes >= 0x80;
           <ctype.h> functions are only defined for unsigned char values. */
        const unsigned char uc = (unsigned char) c;
        if (isspace(uc)) goto restart;
        if (isdigit(uc)) {
            if (c == '0') {
                /* A leading 0 is either a base prefix (0x / 0o / 0b) or a
                   complete literal on its own. */
                int next = peek(st);
                if (next != 'x' && next != 'o' && next != 'b') return PIT_LEX_TOKEN_INTEGER_LITERAL;
                advance(st); /* skip base specifier */
            }
            /* Digits are not validated against the base here: any run of
               hexadecimal digits is accepted. */
            while (match(st, is_hexdigit)) {}
            return PIT_LEX_TOKEN_INTEGER_LITERAL;
        } else {
            while (match(st, is_symchar)) {}
            return PIT_LEX_TOKEN_SYMBOL;
        }
    }
    }
}