scanner.c
1 #include <assert.h>
2 #include <stdlib.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <errno.h>
6 #include <sys/types.h>
7 
8 #include <kuroko/kuroko.h>
9 #include <kuroko/scanner.h>
10 
11 KrkScanner krk_initScanner(const char * src) {
12  KrkScanner scanner;
13  scanner.start = src;
14  scanner.cur = src;
15  scanner.line = 1;
16  scanner.linePtr = src;
17  scanner.startOfLine = 1;
18  scanner.hasUnget = 0;
19  return scanner;
20 }
21 
22 static int isAtEnd(const KrkScanner * scanner) {
23  return *scanner->cur == '\0';
24 }
25 
26 static void nextLine(KrkScanner * scanner) {
27  scanner->line++;
28  scanner->linePtr = scanner->cur;
29 }
30 
31 static KrkToken makeToken(const KrkScanner * scanner, KrkTokenType type) {
32  return (KrkToken){
33  .type = type,
34  .start = scanner->start,
35  .length = (type == TOKEN_EOL) ? 0 : (size_t)(scanner->cur - scanner->start),
36  .line = scanner->line,
37  .linePtr = scanner->linePtr,
38  .literalWidth = (type == TOKEN_EOL) ? 0 : (size_t)(scanner->cur - scanner->start),
39  .col = (scanner->start - scanner->linePtr) + 1,
40  };
41 }
42 
43 static KrkToken errorToken(const KrkScanner * scanner, const char * errorStr) {
44  ssize_t column = (scanner->linePtr < scanner->start) ? scanner->start - scanner->linePtr : 0;
45  ssize_t width = (scanner->start < scanner->cur) ? scanner->cur - scanner->start : 0;
46  return (KrkToken){
47  .type = TOKEN_ERROR,
48  .start = errorStr,
49  .length = strlen(errorStr),
50  .line = scanner->line,
51  .linePtr = scanner->linePtr,
52  .literalWidth = (size_t)(width),
53  .col = column + 1,
54  };
55 }
56 
57 static char advance(KrkScanner * scanner) {
58  return (*scanner->cur == '\0') ? '\0' : *(scanner->cur++);
59 }
60 
61 static int match(KrkScanner * scanner, char expected) {
62  if (isAtEnd(scanner)) return 0;
63  if (*scanner->cur != expected) return 0;
64  scanner->cur++;
65  return 1;
66 }
67 
68 static char peek(const KrkScanner * scanner) {
69  return *scanner->cur;
70 }
71 
72 static char peekNext(const KrkScanner * scanner, int n) {
73  if (isAtEnd(scanner)) return '\0';
74  for (int i = 0; i < n; ++i) if (scanner->cur[i] == '\0') return '\0';
75  return scanner->cur[n];
76 }
77 
78 static void skipWhitespace(KrkScanner * scanner) {
79  for (;;) {
80  char c = peek(scanner);
81  switch (c) {
82  case ' ':
83  case '\t':
84  advance(scanner);
85  break;
86  default:
87  return;
88  }
89  }
90 }
91 
92 static KrkToken makeIndentation(KrkScanner * scanner) {
93  char reject = (peek(scanner) == ' ') ? '\t' : ' ';
94  while (!isAtEnd(scanner) && (peek(scanner) == ' ' || peek(scanner) == '\t')) advance(scanner);
95  if (isAtEnd(scanner)) return makeToken(scanner, TOKEN_EOF);
96  for (const char * start = scanner->start; start < scanner->cur; start++) {
97  if (*start == reject) return errorToken(scanner, "Invalid mix of indentation.");
98  }
99  KrkToken out = makeToken(scanner, TOKEN_INDENTATION);
100  if (reject == ' ') out.length *= 8;
101  if (peek(scanner) == '#' || peek(scanner) == '\n') {
102  while (!isAtEnd(scanner) && peek(scanner) != '\n') advance(scanner);
103  scanner->startOfLine = 1;
104  return makeToken(scanner, TOKEN_RETRY);
105  }
106  return out;
107 }
108 
109 static KrkToken string(KrkScanner * scanner, char quoteMark) {
110  if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark) {
111  advance(scanner); advance(scanner);
112  /* Big string */
113  while (!isAtEnd(scanner)) {
114  if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark && peekNext(scanner, 2) == quoteMark) {
115  advance(scanner);
116  advance(scanner);
117  advance(scanner);
118  return makeToken(scanner, TOKEN_BIG_STRING);
119  }
120 
121  if (peek(scanner) == '\\') advance(scanner);
122  if (peek(scanner) == '\n') {
123  advance(scanner);
124  nextLine(scanner);
125  }
126  else advance(scanner);
127  }
128  if (isAtEnd(scanner)) return errorToken(scanner, "Unterminated string.");
129  }
130  while (peek(scanner) != quoteMark && !isAtEnd(scanner)) {
131  if (peek(scanner) == '\n') return errorToken(scanner, "Unterminated string.");
132  if (peek(scanner) == '\\') advance(scanner);
133  if (peek(scanner) == '\n') {
134  advance(scanner);
135  nextLine(scanner);
136  }
137  else advance(scanner);
138  }
139 
140  if (isAtEnd(scanner)) return errorToken(scanner, "Unterminated string.");
141 
142  assert(peek(scanner) == quoteMark);
143  advance(scanner);
144 
145  return makeToken(scanner, TOKEN_STRING);
146 }
147 
/* Return 1 if 'c' is an ASCII decimal digit, 0 otherwise. */
static int isDigit(char c) {
    return !(c < '0' || c > '9');
}
151 
152 static KrkToken number(KrkScanner * scanner, char c) {
153  if (c == '0') {
154  if (peek(scanner) == 'x' || peek(scanner) == 'X') {
155  /* Hexadecimal */
156  advance(scanner);
157  while (isDigit(peek(scanner)) || (peek(scanner) >= 'a' && peek(scanner) <= 'f') ||
158  (peek(scanner) >= 'A' && peek(scanner) <= 'F') || (peek(scanner) == '_')) advance(scanner);
159  return makeToken(scanner, TOKEN_NUMBER);
160  } else if (peek(scanner) == 'b' || peek(scanner) == 'B') {
161  /* Binary */
162  advance(scanner);
163  while (peek(scanner) == '0' || peek(scanner) == '1' || (peek(scanner) == '_')) advance(scanner);
164  return makeToken(scanner, TOKEN_NUMBER);
165  } if (peek(scanner) == 'o' || peek(scanner) == 'O') {
166  /* Octal - must be 0o, none of those silly 0123 things */
167  advance(scanner);
168  while ((peek(scanner) >= '0' && peek(scanner) <= '7') || (peek(scanner) == '_')) advance(scanner);
169  return makeToken(scanner, TOKEN_NUMBER);
170  }
171  /* Otherwise, decimal and maybe 0.123 floating */
172  }
173 
174  /* Decimal */
175  while (isDigit(peek(scanner)) || peek(scanner) == '_') advance(scanner);
176 
177  /* Floating point */
178  if (peek(scanner) == '.' && isDigit(peekNext(scanner, 1))) {
179  advance(scanner);
180  while (isDigit(peek(scanner))) advance(scanner);
181  }
182 
183  return makeToken(scanner, TOKEN_NUMBER);
184 }
185 
/* Return 1 if 'c' is an ASCII letter or an underscore, 0 otherwise. */
static int isAlpha(char c) {
    if (c == '_') return 1;
    if (c >= 'a' && c <= 'z') return 1;
    return c >= 'A' && c <= 'Z';
}
189 
190 static int _checkKeyword(KrkScanner * scanner, size_t start, const char * rest, KrkTokenType type) {
191  size_t length = strlen(rest);
192  if ((size_t)(scanner->cur - scanner->start) == start + length &&
193  memcmp(scanner->start + start, rest, length) == 0) return type;
194  return TOKEN_IDENTIFIER;
195 }
196 
197 #define checkKeyword(a,b,c) _checkKeyword(scanner,a,b,c)
198 
199 static KrkTokenType identifierType(KrkScanner * scanner) {
200 #define MORE(i) (scanner->cur - scanner->start > i)
201  switch (*scanner->start) {
202  case 'a': if (MORE(1)) switch(scanner->start[1]) {
203  case 'n': return checkKeyword(2, "d", TOKEN_AND);
204  case 'w': return checkKeyword(2, "ait", TOKEN_AWAIT);
205  case 's': if (MORE(2)) {
206  switch (scanner->start[2]) {
207  case 's': return checkKeyword(3, "ert", TOKEN_ASSERT);
208  case 'y': return checkKeyword(3, "nc", TOKEN_ASYNC);
209  }
210  break;
211  } else {
212  return checkKeyword(2, "", TOKEN_AS);
213  }
214  } break;
215  case 'b': if (MORE(1)) return checkKeyword(1, "reak", TOKEN_BREAK);
216  else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_B;
217  break;
218  case 'c': if (MORE(1)) switch(scanner->start[1]) {
219  case 'l': return checkKeyword(2, "ass", TOKEN_CLASS);
220  case 'o': return checkKeyword(2, "ntinue", TOKEN_CONTINUE);
221  } break;
222  case 'd': if (MORE(1)) switch(scanner->start[1]) {
223  case 'e': if (MORE(2)) switch (scanner->start[2]) {
224  case 'f': return checkKeyword(3, "", TOKEN_DEF);
225  case 'l': return checkKeyword(3, "", TOKEN_DEL);
226  } break;
227  } break;
228  case 'e': if (MORE(1)) switch(scanner->start[1]) {
229  case 'l': if (MORE(2)) switch(scanner->start[2]) {
230  case 's': return checkKeyword(3,"e", TOKEN_ELSE);
231  case 'i': return checkKeyword(3,"f", TOKEN_ELIF);
232  } break;
233  case 'x': return checkKeyword(2, "cept", TOKEN_EXCEPT);
234  } break;
235  case 'f': if (MORE(1)) switch(scanner->start[1]) {
236  case 'i': return checkKeyword(2, "nally", TOKEN_FINALLY);
237  case 'o': return checkKeyword(2, "r", TOKEN_FOR);
238  case 'r': return checkKeyword(2, "om", TOKEN_FROM);
239  } else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_F;
240  break;
241  case 'F': return checkKeyword(1, "alse", TOKEN_FALSE);
242  case 'i': if (MORE(1)) switch (scanner->start[1]) {
243  case 'f': return checkKeyword(2, "", TOKEN_IF);
244  case 'n': return checkKeyword(2, "", TOKEN_IN);
245  case 'm': return checkKeyword(2, "port", TOKEN_IMPORT);
246  case 's': return checkKeyword(2, "", TOKEN_IS);
247  } break;
248  case 'l': if (MORE(1)) switch (scanner->start[1]) {
249  case 'a': return checkKeyword(2, "mbda", TOKEN_LAMBDA);
250  case 'e': return checkKeyword(2, "t", TOKEN_LET);
251  } break;
252  case 'n': return checkKeyword(1, "ot", TOKEN_NOT);
253  case 'N': return checkKeyword(1, "one", TOKEN_NONE);
254  case 'o': return checkKeyword(1, "r", TOKEN_OR);
255  case 'p': return checkKeyword(1, "ass", TOKEN_PASS);
256  case 'r': if (MORE(1)) switch (scanner->start[1]) {
257  case 'e': return checkKeyword(2, "turn", TOKEN_RETURN);
258  case 'a': return checkKeyword(2, "ise", TOKEN_RAISE);
259  } else if (scanner->start[1] == '\'' || scanner->start[1] == '"') return TOKEN_PREFIX_R;
260  break;
261  case 's': return checkKeyword(1, "uper", TOKEN_SUPER);
262  case 't': return checkKeyword(1, "ry", TOKEN_TRY);
263  case 'T': return checkKeyword(1, "rue", TOKEN_TRUE);
264  case 'w': if (MORE(1)) switch(scanner->start[1]) {
265  case 'h': return checkKeyword(2, "ile", TOKEN_WHILE);
266  case 'i': return checkKeyword(2, "th", TOKEN_WITH);
267  } break;
268  case 'y': return checkKeyword(1, "ield", TOKEN_YIELD);
269  }
270  return TOKEN_IDENTIFIER;
271 }
272 
273 static KrkToken identifier(KrkScanner * scanner) {
274  while (isAlpha(peek(scanner)) || isDigit(peek(scanner)) || (unsigned char)peek(scanner) > 0x7F) advance(scanner);
275 
276  return makeToken(scanner, identifierType(scanner));
277 }
278 
279 void krk_ungetToken(KrkScanner * scanner, KrkToken token) {
280  if (scanner->hasUnget) {
281  abort();
282  }
283  scanner->hasUnget = 1;
284  scanner->unget = token;
285 }
286 
288  return *scanner;
289 }
290 
291 void krk_rewindScanner(KrkScanner * scanner, KrkScanner other) {
292  *scanner = other;
293 }
294 
296 
297  if (scanner->hasUnget) {
298  scanner->hasUnget = 0;
299  return scanner->unget;
300  }
301 
302  /* If at start of line, do thing */
303  if (scanner->startOfLine && (peek(scanner) == ' ' || peek(scanner) == '\t')) {
304  scanner->start = scanner->cur;
305  scanner->startOfLine = 0;
306  return makeIndentation(scanner);
307  }
308 
309  /* Eat whitespace */
310  skipWhitespace(scanner);
311 
312  /* Skip comments */
313  if (peek(scanner) == '#') while (peek(scanner) != '\n' && !isAtEnd(scanner)) advance(scanner);
314 
315  scanner->start = scanner->cur;
316  if (isAtEnd(scanner)) return makeToken(scanner, TOKEN_EOF);
317 
318  char c = advance(scanner);
319 
320  if (c == '\n') {
321  KrkToken out;
322  if (scanner->startOfLine) {
323  /* Ignore completely blank lines */
324  out = makeToken(scanner, TOKEN_RETRY);
325  } else {
326  scanner->startOfLine = 1;
327  out = makeToken(scanner, TOKEN_EOL);
328  }
329  nextLine(scanner);
330  return out;
331  }
332 
333  if (c == '\\' && peek(scanner) == '\n') {
334  advance(scanner);
335  nextLine(scanner);
336  return makeToken(scanner, TOKEN_RETRY);
337  }
338 
339  /* Not indentation, not a linefeed on an empty line, must be not be start of line any more */
340  scanner->startOfLine = 0;
341 
342  if (isAlpha(c) || (unsigned char)c > 0x7F) return identifier(scanner);
343  if (isDigit(c)) return number(scanner, c);
344 
345  switch (c) {
346  case '(': return makeToken(scanner, TOKEN_LEFT_PAREN);
347  case ')': return makeToken(scanner, TOKEN_RIGHT_PAREN);
348  case '{': return makeToken(scanner, TOKEN_LEFT_BRACE);
349  case '}': return makeToken(scanner, TOKEN_RIGHT_BRACE);
350  case '[': return makeToken(scanner, TOKEN_LEFT_SQUARE);
351  case ']': return makeToken(scanner, TOKEN_RIGHT_SQUARE);
352  case ',': return makeToken(scanner, TOKEN_COMMA);
353  case ';': return makeToken(scanner, TOKEN_SEMICOLON);
354  case '~': return makeToken(scanner, TOKEN_TILDE);
355  case '.': return makeToken(scanner, peek(scanner) == '.' ? (peekNext(scanner,1) == '.' ? (advance(scanner), advance(scanner), TOKEN_ELLIPSIS) : TOKEN_DOT) : TOKEN_DOT);
356 
357  case ':': return makeToken(scanner, match(scanner, '=') ? TOKEN_WALRUS : TOKEN_COLON);
358  case '!': return makeToken(scanner, match(scanner, '=') ? TOKEN_BANG_EQUAL : TOKEN_BANG);
359  case '=': return makeToken(scanner, match(scanner, '=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL);
360  case '<': return makeToken(scanner, match(scanner, '=') ? TOKEN_LESS_EQUAL : (match(scanner, '<') ? (match(scanner, '=') ? TOKEN_LSHIFT_EQUAL : TOKEN_LEFT_SHIFT) : TOKEN_LESS));
361  case '>': return makeToken(scanner, match(scanner, '=') ? TOKEN_GREATER_EQUAL : (match(scanner, '>') ? (match(scanner, '=') ? TOKEN_RSHIFT_EQUAL : TOKEN_RIGHT_SHIFT) : TOKEN_GREATER));
362  case '-': return makeToken(scanner, match(scanner, '=') ? TOKEN_MINUS_EQUAL : (match(scanner, '-') ? TOKEN_MINUS_MINUS : (match(scanner, '>') ? TOKEN_ARROW : TOKEN_MINUS)));
363  case '+': return makeToken(scanner, match(scanner, '=') ? TOKEN_PLUS_EQUAL : (match(scanner, '+') ? TOKEN_PLUS_PLUS : TOKEN_PLUS));
364  case '^': return makeToken(scanner, match(scanner, '=') ? TOKEN_CARET_EQUAL : TOKEN_CARET);
365  case '|': return makeToken(scanner, match(scanner, '=') ? TOKEN_PIPE_EQUAL : TOKEN_PIPE);
366  case '&': return makeToken(scanner, match(scanner, '=') ? TOKEN_AMP_EQUAL : TOKEN_AMPERSAND);
367  case '/': return makeToken(scanner, match(scanner, '=') ? TOKEN_SOLIDUS_EQUAL : (match(scanner, '/') ? (match(scanner, '=') ? TOKEN_DSOLIDUS_EQUAL : TOKEN_DOUBLE_SOLIDUS) : TOKEN_SOLIDUS));
368  case '*': return makeToken(scanner, match(scanner, '=') ? TOKEN_ASTERISK_EQUAL: (match(scanner, '*') ? (match(scanner, '=') ? TOKEN_POW_EQUAL : TOKEN_POW) : TOKEN_ASTERISK));
369  case '%': return makeToken(scanner, match(scanner, '=') ? TOKEN_MODULO_EQUAL : TOKEN_MODULO);
370  case '@': return makeToken(scanner, match(scanner, '=') ? TOKEN_AT_EQUAL : TOKEN_AT);
371 
372  case '"': return string(scanner, '"');
373  case '\'': return string(scanner, '\'');
374  }
375 
376  return errorToken(scanner, "Unexpected character.");
377 }
378 
Top-level header with configuration macros.
Definitions used by the token scanner.
void krk_ungetToken(KrkScanner *, KrkToken token)
Push a token back to the scanner to be reprocessed.
Definition: scanner.c:279
KrkToken krk_scanToken(KrkScanner *)
Read the next token from the scanner.
Definition: scanner.c:295
KrkScanner krk_initScanner(const char *src)
Initialize the compiler to scan tokens from 'src'.
Definition: scanner.c:11
KrkScanner krk_tellScanner(KrkScanner *)
Retrieve a copy of the current scanner state.
Definition: scanner.c:287
void krk_rewindScanner(KrkScanner *, KrkScanner to)
Rewind the scanner to a previous state.
Definition: scanner.c:291
Token scanner state.
Definition: scanner.h:140
A token from the scanner.
Definition: scanner.h:124