scanner.c
1 #include <assert.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <errno.h>
6 #include <sys/types.h>
7 
8 #include <kuroko/kuroko.h>
9 #include <kuroko/scanner.h>
10 
11 KrkScanner krk_initScanner(const char * src) {
12  KrkScanner scanner;
13  scanner.start = src;
14  scanner.cur = src;
15  scanner.line = 1;
16  scanner.linePtr = src;
17  scanner.startOfLine = 1;
18  scanner.hasUnget = 0;
19  return scanner;
20 }
21 
22 static int isAtEnd(const KrkScanner * scanner) {
23  return *scanner->cur == '\0';
24 }
25 
26 static void nextLine(KrkScanner * scanner) {
27  scanner->line++;
28  scanner->linePtr = scanner->cur;
29 }
30 
31 static KrkToken makeToken(const KrkScanner * scanner, KrkTokenType type) {
32  return (KrkToken){
33  .type = type,
34  .start = scanner->start,
35  .length = (type == TOKEN_EOL) ? 0 : (size_t)(scanner->cur - scanner->start),
36  .line = scanner->line,
37  .linePtr = scanner->linePtr,
38  .literalWidth = (type == TOKEN_EOL) ? 0 : (size_t)(scanner->cur - scanner->start),
39  .col = (scanner->start - scanner->linePtr) + 1,
40  };
41 }
42 
43 static KrkToken errorToken(const KrkScanner * scanner, const char * errorStr) {
44  ssize_t column = (scanner->linePtr < scanner->start) ? scanner->start - scanner->linePtr : 0;
45  ssize_t width = (scanner->start < scanner->cur) ? scanner->cur - scanner->start : 0;
46  return (KrkToken){
47  .type = TOKEN_ERROR,
48  .start = errorStr,
49  .length = strlen(errorStr),
50  .line = scanner->line,
51  .linePtr = scanner->linePtr,
52  .literalWidth = (size_t)(width),
53  .col = column + 1,
54  };
55 }
56 
57 static char advance(KrkScanner * scanner) {
58  return (*scanner->cur == '\0') ? '\0' : *(scanner->cur++);
59 }
60 
61 static int match(KrkScanner * scanner, char expected) {
62  if (isAtEnd(scanner)) return 0;
63  if (*scanner->cur != expected) return 0;
64  scanner->cur++;
65  return 1;
66 }
67 
68 static char peek(const KrkScanner * scanner) {
69  return *scanner->cur;
70 }
71 
72 static char peekNext(const KrkScanner * scanner, int n) {
73  if (isAtEnd(scanner)) return '\0';
74  for (int i = 0; i < n; ++i) if (scanner->cur[i] == '\0') return '\0';
75  return scanner->cur[n];
76 }
77 
78 static void skipWhitespace(KrkScanner * scanner) {
79  for (;;) {
80  char c = peek(scanner);
81  switch (c) {
82  case ' ':
83  case '\t':
84  advance(scanner);
85  break;
86  default:
87  return;
88  }
89  }
90 }
91 
92 static KrkToken makeIndentation(KrkScanner * scanner) {
93  char reject = (peek(scanner) == ' ') ? '\t' : ' ';
94  while (!isAtEnd(scanner) && (peek(scanner) == ' ' || peek(scanner) == '\t')) advance(scanner);
95  if (isAtEnd(scanner)) return makeToken(scanner, TOKEN_EOF);
96  for (const char * start = scanner->start; start < scanner->cur; start++) {
97  if (*start == reject) return errorToken(scanner, "Invalid mix of indentation.");
98  }
99  KrkToken out = makeToken(scanner, TOKEN_INDENTATION);
100  if (reject == ' ') out.length *= 8;
101  if (peek(scanner) == '#' || peek(scanner) == '\n') {
102  while (!isAtEnd(scanner) && peek(scanner) != '\n') advance(scanner);
103  scanner->startOfLine = 1;
104  return makeToken(scanner, TOKEN_RETRY);
105  }
106  return out;
107 }
108 
109 static KrkToken string(KrkScanner * scanner, char quoteMark) {
110  if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark) {
111  advance(scanner); advance(scanner);
112  /* Big string */
113  while (!isAtEnd(scanner)) {
114  if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark && peekNext(scanner, 2) == quoteMark) {
115  advance(scanner);
116  advance(scanner);
117  advance(scanner);
118  return makeToken(scanner, TOKEN_BIG_STRING);
119  }
120 
121  if (peek(scanner) == '\\') advance(scanner);
122  if (peek(scanner) == '\n') {
123  advance(scanner);
124  nextLine(scanner);
125  }
126  else advance(scanner);
127  }
128  if (isAtEnd(scanner)) return errorToken(scanner, "Unterminated string.");
129  }
130  while (peek(scanner) != quoteMark && !isAtEnd(scanner)) {
131  if (peek(scanner) == '\n') return errorToken(scanner, "Unterminated string.");
132  if (peek(scanner) == '\\') advance(scanner);
133  if (peek(scanner) == '\n') {
134  advance(scanner);
135  nextLine(scanner);
136  }
137  else advance(scanner);
138  }
139 
140  if (isAtEnd(scanner)) return errorToken(scanner, "Unterminated string.");
141 
142  assert(peek(scanner) == quoteMark);
143  advance(scanner);
144 
145  return makeToken(scanner, TOKEN_STRING);
146 }
147 
/* Returns 1 if 'c' is an ASCII decimal digit ('0'..'9'), 0 otherwise. */
static int isDigit(char c) {
    if (c < '0') {
        return 0;
    }
    return c <= '9';
}
151 
152 static KrkToken number(KrkScanner * scanner, char c) {
153  if (c == '0') {
154  if (peek(scanner) == 'x' || peek(scanner) == 'X') {
155  /* Hexadecimal */
156  advance(scanner);
157  while (isDigit(peek(scanner)) || (peek(scanner) >= 'a' && peek(scanner) <= 'f') ||
158  (peek(scanner) >= 'A' && peek(scanner) <= 'F') || (peek(scanner) == '_')) advance(scanner);
159  return makeToken(scanner, TOKEN_NUMBER);
160  } else if (peek(scanner) == 'b' || peek(scanner) == 'B') {
161  /* Binary */
162  advance(scanner);
163  while (peek(scanner) == '0' || peek(scanner) == '1' || (peek(scanner) == '_')) advance(scanner);
164  return makeToken(scanner, TOKEN_NUMBER);
165  } if (peek(scanner) == 'o' || peek(scanner) == 'O') {
166  /* Octal - must be 0o, none of those silly 0123 things */
167  advance(scanner);
168  while ((peek(scanner) >= '0' && peek(scanner) <= '7') || (peek(scanner) == '_')) advance(scanner);
169  return makeToken(scanner, TOKEN_NUMBER);
170  }
171  /* Otherwise, decimal and maybe 0.123 floating */
172  }
173 
174  /* Decimal */
175  while (isDigit(peek(scanner)) || peek(scanner) == '_') advance(scanner);
176 
177  /* Floating point */
178  if (peek(scanner) == '.' && isDigit(peekNext(scanner, 1))) {
179  advance(scanner);
180  while (isDigit(peek(scanner))) advance(scanner);
181  }
182 
183  if (peek(scanner) == 'e' || peek(scanner) == 'E') {
184  advance(scanner);
185  if (peek(scanner) == '+' || peek(scanner) == '-') advance(scanner);
186  while (isDigit(peek(scanner))) advance(scanner);
187  }
188 
189  return makeToken(scanner, TOKEN_NUMBER);
190 }
191 
/*
 * Returns 1 if 'c' may start an ASCII identifier: a letter a-z / A-Z or an
 * underscore. Returns 0 otherwise.
 */
static int isAlpha(char c) {
    if (c == '_') {
        return 1;
    }
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    return c >= 'A' && c <= 'Z';
}
195 
196 static int _checkKeyword(KrkScanner * scanner, size_t start, const char * rest, KrkTokenType type) {
197  size_t length = strlen(rest);
198  if ((size_t)(scanner->cur - scanner->start) == start + length &&
199  memcmp(scanner->start + start, rest, length) == 0) return type;
200  return TOKEN_IDENTIFIER;
201 }
202 
203 #define checkKeyword(a,b,c) _checkKeyword(scanner,a,b,c)
204 
205 static KrkTokenType identifierType(KrkScanner * scanner) {
206 #define MORE(i) (scanner->cur - scanner->start > i)
207  switch (*scanner->start) {
208  case 'a': if (MORE(1)) switch(scanner->start[1]) {
209  case 'n': return checkKeyword(2, "d", TOKEN_AND);
210  case 'w': return checkKeyword(2, "ait", TOKEN_AWAIT);
211  case 's': if (MORE(2)) {
212  switch (scanner->start[2]) {
213  case 's': return checkKeyword(3, "ert", TOKEN_ASSERT);
214  case 'y': return checkKeyword(3, "nc", TOKEN_ASYNC);
215  }
216  break;
217  } else {
218  return checkKeyword(2, "", TOKEN_AS);
219  }
220  } break;
221  case 'b': if (MORE(1)) return checkKeyword(1, "reak", TOKEN_BREAK);
222  else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_B;
223  break;
224  case 'c': if (MORE(1)) switch(scanner->start[1]) {
225  case 'l': return checkKeyword(2, "ass", TOKEN_CLASS);
226  case 'o': return checkKeyword(2, "ntinue", TOKEN_CONTINUE);
227  } break;
228  case 'd': if (MORE(1)) switch(scanner->start[1]) {
229  case 'e': if (MORE(2)) switch (scanner->start[2]) {
230  case 'f': return checkKeyword(3, "", TOKEN_DEF);
231  case 'l': return checkKeyword(3, "", TOKEN_DEL);
232  } break;
233  } break;
234  case 'e': if (MORE(1)) switch(scanner->start[1]) {
235  case 'l': if (MORE(2)) switch(scanner->start[2]) {
236  case 's': return checkKeyword(3,"e", TOKEN_ELSE);
237  case 'i': return checkKeyword(3,"f", TOKEN_ELIF);
238  } break;
239  case 'x': return checkKeyword(2, "cept", TOKEN_EXCEPT);
240  } break;
241  case 'f': if (MORE(1)) switch(scanner->start[1]) {
242  case 'i': return checkKeyword(2, "nally", TOKEN_FINALLY);
243  case 'o': return checkKeyword(2, "r", TOKEN_FOR);
244  case 'r': return checkKeyword(2, "om", TOKEN_FROM);
245  } else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_F;
246  break;
247  case 'F': return checkKeyword(1, "alse", TOKEN_FALSE);
248  case 'i': if (MORE(1)) switch (scanner->start[1]) {
249  case 'f': return checkKeyword(2, "", TOKEN_IF);
250  case 'n': return checkKeyword(2, "", TOKEN_IN);
251  case 'm': return checkKeyword(2, "port", TOKEN_IMPORT);
252  case 's': return checkKeyword(2, "", TOKEN_IS);
253  } break;
254  case 'l': if (MORE(1)) switch (scanner->start[1]) {
255  case 'a': return checkKeyword(2, "mbda", TOKEN_LAMBDA);
256  case 'e': return checkKeyword(2, "t", TOKEN_LET);
257  } break;
258  case 'n': return checkKeyword(1, "ot", TOKEN_NOT);
259  case 'N': return checkKeyword(1, "one", TOKEN_NONE);
260  case 'o': return checkKeyword(1, "r", TOKEN_OR);
261  case 'p': return checkKeyword(1, "ass", TOKEN_PASS);
262  case 'r': if (MORE(1)) switch (scanner->start[1]) {
263  case 'e': return checkKeyword(2, "turn", TOKEN_RETURN);
264  case 'a': return checkKeyword(2, "ise", TOKEN_RAISE);
265  } else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_R;
266  break;
267  case 's': return checkKeyword(1, "uper", TOKEN_SUPER);
268  case 't': return checkKeyword(1, "ry", TOKEN_TRY);
269  case 'T': return checkKeyword(1, "rue", TOKEN_TRUE);
270  case 'w': if (MORE(1)) switch(scanner->start[1]) {
271  case 'h': return checkKeyword(2, "ile", TOKEN_WHILE);
272  case 'i': return checkKeyword(2, "th", TOKEN_WITH);
273  } break;
274  case 'y': return checkKeyword(1, "ield", TOKEN_YIELD);
275  }
276  return TOKEN_IDENTIFIER;
277 }
278 
279 static KrkToken identifier(KrkScanner * scanner) {
280  while (isAlpha(peek(scanner)) || isDigit(peek(scanner)) || (unsigned char)peek(scanner) > 0x7F) advance(scanner);
281 
282  return makeToken(scanner, identifierType(scanner));
283 }
284 
285 void krk_ungetToken(KrkScanner * scanner, KrkToken token) {
286  if (scanner->hasUnget) {
287  abort();
288  }
289  scanner->hasUnget = 1;
290  scanner->unget = token;
291 }
292 
294  return *scanner;
295 }
296 
297 void krk_rewindScanner(KrkScanner * scanner, KrkScanner other) {
298  *scanner = other;
299 }
300 
302 
303  if (scanner->hasUnget) {
304  scanner->hasUnget = 0;
305  return scanner->unget;
306  }
307 
308  /* If at start of line, do thing */
309  if (scanner->startOfLine && (peek(scanner) == ' ' || peek(scanner) == '\t')) {
310  scanner->start = scanner->cur;
311  scanner->startOfLine = 0;
312  return makeIndentation(scanner);
313  }
314 
315  /* Eat whitespace */
316  skipWhitespace(scanner);
317 
318  /* Skip comments */
319  if (peek(scanner) == '#') while (peek(scanner) != '\n' && !isAtEnd(scanner)) advance(scanner);
320 
321  scanner->start = scanner->cur;
322  if (isAtEnd(scanner)) return makeToken(scanner, TOKEN_EOF);
323 
324  char c = advance(scanner);
325 
326  if (c == '\n') {
327  KrkToken out;
328  if (scanner->startOfLine) {
329  /* Ignore completely blank lines */
330  out = makeToken(scanner, TOKEN_RETRY);
331  } else {
332  scanner->startOfLine = 1;
333  out = makeToken(scanner, TOKEN_EOL);
334  }
335  nextLine(scanner);
336  return out;
337  }
338 
339  if (c == '\\' && peek(scanner) == '\n') {
340  advance(scanner);
341  nextLine(scanner);
342  return makeToken(scanner, TOKEN_RETRY);
343  }
344 
345  /* Not indentation, not a linefeed on an empty line, must be not be start of line any more */
346  scanner->startOfLine = 0;
347 
348  if (isAlpha(c) || (unsigned char)c > 0x7F) return identifier(scanner);
349  if (isDigit(c)) return number(scanner, c);
350 
351  switch (c) {
352  case '(': return makeToken(scanner, TOKEN_LEFT_PAREN);
353  case ')': return makeToken(scanner, TOKEN_RIGHT_PAREN);
354  case '{': return makeToken(scanner, TOKEN_LEFT_BRACE);
355  case '}': return makeToken(scanner, TOKEN_RIGHT_BRACE);
356  case '[': return makeToken(scanner, TOKEN_LEFT_SQUARE);
357  case ']': return makeToken(scanner, TOKEN_RIGHT_SQUARE);
358  case ',': return makeToken(scanner, TOKEN_COMMA);
359  case ';': return makeToken(scanner, TOKEN_SEMICOLON);
360  case '~': return makeToken(scanner, TOKEN_TILDE);
361  case '.': return makeToken(scanner, peek(scanner) == '.' ? (peekNext(scanner,1) == '.' ? (advance(scanner), advance(scanner), TOKEN_ELLIPSIS) : TOKEN_DOT) : TOKEN_DOT);
362 
363  case ':': return makeToken(scanner, match(scanner, '=') ? TOKEN_WALRUS : TOKEN_COLON);
364  case '!': return makeToken(scanner, match(scanner, '=') ? TOKEN_BANG_EQUAL : TOKEN_BANG);
365  case '=': return makeToken(scanner, match(scanner, '=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL);
366  case '<': return makeToken(scanner, match(scanner, '=') ? TOKEN_LESS_EQUAL : (match(scanner, '<') ? (match(scanner, '=') ? TOKEN_LSHIFT_EQUAL : TOKEN_LEFT_SHIFT) : TOKEN_LESS));
367  case '>': return makeToken(scanner, match(scanner, '=') ? TOKEN_GREATER_EQUAL : (match(scanner, '>') ? (match(scanner, '=') ? TOKEN_RSHIFT_EQUAL : TOKEN_RIGHT_SHIFT) : TOKEN_GREATER));
368  case '-': return makeToken(scanner, match(scanner, '=') ? TOKEN_MINUS_EQUAL : (match(scanner, '-') ? TOKEN_MINUS_MINUS : (match(scanner, '>') ? TOKEN_ARROW : TOKEN_MINUS)));
369  case '+': return makeToken(scanner, match(scanner, '=') ? TOKEN_PLUS_EQUAL : (match(scanner, '+') ? TOKEN_PLUS_PLUS : TOKEN_PLUS));
370  case '^': return makeToken(scanner, match(scanner, '=') ? TOKEN_CARET_EQUAL : TOKEN_CARET);
371  case '|': return makeToken(scanner, match(scanner, '=') ? TOKEN_PIPE_EQUAL : TOKEN_PIPE);
372  case '&': return makeToken(scanner, match(scanner, '=') ? TOKEN_AMP_EQUAL : TOKEN_AMPERSAND);
373  case '/': return makeToken(scanner, match(scanner, '=') ? TOKEN_SOLIDUS_EQUAL : (match(scanner, '/') ? (match(scanner, '=') ? TOKEN_DSOLIDUS_EQUAL : TOKEN_DOUBLE_SOLIDUS) : TOKEN_SOLIDUS));
374  case '*': return makeToken(scanner, match(scanner, '=') ? TOKEN_ASTERISK_EQUAL: (match(scanner, '*') ? (match(scanner, '=') ? TOKEN_POW_EQUAL : TOKEN_POW) : TOKEN_ASTERISK));
375  case '%': return makeToken(scanner, match(scanner, '=') ? TOKEN_MODULO_EQUAL : TOKEN_MODULO);
376  case '@': return makeToken(scanner, match(scanner, '=') ? TOKEN_AT_EQUAL : TOKEN_AT);
377 
378  case '"': return string(scanner, '"');
379  case '\'': return string(scanner, '\'');
380  }
381 
382  return errorToken(scanner, "Unexpected character.");
383 }
384 
Top-level header with configuration macros.
Definitions used by the token scanner.
void krk_ungetToken(KrkScanner *, KrkToken token)
Push a token back to the scanner to be reprocessed.
Definition: scanner.c:285
KrkToken krk_scanToken(KrkScanner *)
Read the next token from the scanner.
Definition: scanner.c:301
KrkScanner krk_initScanner(const char *src)
Initialize a scanner to read tokens from 'src'.
Definition: scanner.c:11
KrkScanner krk_tellScanner(KrkScanner *)
Retrieve a copy of the current scanner state.
Definition: scanner.c:293
void krk_rewindScanner(KrkScanner *, KrkScanner to)
Rewind the scanner to a previous state.
Definition: scanner.c:297
Token scanner state.
Definition: scanner.h:140
A token from the scanner.
Definition: scanner.h:124