16 scanner.linePtr = src;
17 scanner.startOfLine = 1;
22 static int isAtEnd(
const KrkScanner * scanner) {
23 return *scanner->cur ==
'\0';
28 scanner->linePtr = scanner->cur;
34 .start = scanner->start,
35 .length = (type == TOKEN_EOL) ? 0 : (
size_t)(scanner->cur - scanner->start),
36 .line = scanner->line,
37 .linePtr = scanner->linePtr,
38 .literalWidth = (type == TOKEN_EOL) ? 0 : (size_t)(scanner->cur - scanner->start),
39 .col = (scanner->start - scanner->linePtr) + 1,
44 ssize_t column = (scanner->linePtr < scanner->start) ? scanner->start - scanner->linePtr : 0;
45 ssize_t width = (scanner->start < scanner->cur) ? scanner->cur - scanner->start : 0;
49 .length = strlen(errorStr),
50 .line = scanner->line,
51 .linePtr = scanner->linePtr,
52 .literalWidth = (size_t)(width),
58 return (*scanner->cur ==
'\0') ?
'\0' : *(scanner->cur++);
61 static int match(
KrkScanner * scanner,
char expected) {
62 if (isAtEnd(scanner))
return 0;
63 if (*scanner->cur != expected)
return 0;
72 static char peekNext(
const KrkScanner * scanner,
int n) {
73 if (isAtEnd(scanner))
return '\0';
74 for (
int i = 0; i < n; ++i)
if (scanner->cur[i] ==
'\0')
return '\0';
75 return scanner->cur[n];
78 static void skipWhitespace(
KrkScanner * scanner) {
80 char c = peek(scanner);
93 char reject = (peek(scanner) ==
' ') ?
'\t' :
' ';
94 while (!isAtEnd(scanner) && (peek(scanner) ==
' ' || peek(scanner) ==
'\t')) advance(scanner);
95 if (isAtEnd(scanner))
return makeToken(scanner, TOKEN_EOF);
96 for (
const char * start = scanner->start; start < scanner->cur; start++) {
97 if (*start == reject)
return errorToken(scanner,
"Invalid mix of indentation.");
99 KrkToken out = makeToken(scanner, TOKEN_INDENTATION);
100 if (reject ==
' ') out.length *= 8;
101 if (peek(scanner) ==
'#' || peek(scanner) ==
'\n') {
102 while (!isAtEnd(scanner) && peek(scanner) !=
'\n') advance(scanner);
103 scanner->startOfLine = 1;
104 return makeToken(scanner, TOKEN_RETRY);
110 if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark) {
111 advance(scanner); advance(scanner);
113 while (!isAtEnd(scanner)) {
114 if (peek(scanner) == quoteMark && peekNext(scanner, 1) == quoteMark && peekNext(scanner, 2) == quoteMark) {
118 return makeToken(scanner, TOKEN_BIG_STRING);
121 if (peek(scanner) ==
'\\') advance(scanner);
122 if (peek(scanner) ==
'\n') {
126 else advance(scanner);
128 if (isAtEnd(scanner))
return errorToken(scanner,
"Unterminated string.");
130 while (peek(scanner) != quoteMark && !isAtEnd(scanner)) {
131 if (peek(scanner) ==
'\n')
return errorToken(scanner,
"Unterminated string.");
132 if (peek(scanner) ==
'\\') advance(scanner);
133 if (peek(scanner) ==
'\n') {
137 else advance(scanner);
140 if (isAtEnd(scanner))
return errorToken(scanner,
"Unterminated string.");
142 assert(peek(scanner) == quoteMark);
145 return makeToken(scanner, TOKEN_STRING);
148 static int isDigit(
char c) {
149 return c >=
'0' && c <=
'9';
154 if (peek(scanner) ==
'x' || peek(scanner) ==
'X') {
157 while (isDigit(peek(scanner)) || (peek(scanner) >=
'a' && peek(scanner) <=
'f') ||
158 (peek(scanner) >=
'A' && peek(scanner) <=
'F') || (peek(scanner) ==
'_')) advance(scanner);
159 return makeToken(scanner, TOKEN_NUMBER);
160 }
else if (peek(scanner) ==
'b' || peek(scanner) ==
'B') {
163 while (peek(scanner) ==
'0' || peek(scanner) ==
'1' || (peek(scanner) ==
'_')) advance(scanner);
164 return makeToken(scanner, TOKEN_NUMBER);
165 }
if (peek(scanner) ==
'o' || peek(scanner) ==
'O') {
168 while ((peek(scanner) >=
'0' && peek(scanner) <=
'7') || (peek(scanner) ==
'_')) advance(scanner);
169 return makeToken(scanner, TOKEN_NUMBER);
175 while (isDigit(peek(scanner)) || peek(scanner) ==
'_') advance(scanner);
178 if (peek(scanner) ==
'.' && isDigit(peekNext(scanner, 1))) {
180 while (isDigit(peek(scanner))) advance(scanner);
183 if (peek(scanner) ==
'e' || peek(scanner) ==
'E') {
185 if (peek(scanner) ==
'+' || peek(scanner) ==
'-') advance(scanner);
186 while (isDigit(peek(scanner))) advance(scanner);
189 return makeToken(scanner, TOKEN_NUMBER);
192 static int isAlpha(
char c) {
193 return (c >=
'a' && c <=
'z') || (c >=
'A' && c <=
'Z') || (c ==
'_');
196 static int _checkKeyword(
KrkScanner * scanner,
size_t start,
const char * rest, KrkTokenType type) {
197 size_t length = strlen(rest);
198 if ((
size_t)(scanner->cur - scanner->start) == start + length &&
199 memcmp(scanner->start + start, rest, length) == 0)
return type;
200 return TOKEN_IDENTIFIER;
203 #define checkKeyword(a,b,c) _checkKeyword(scanner,a,b,c)
205 static KrkTokenType identifierType(
KrkScanner * scanner) {
206 #define MORE(i) (scanner->cur - scanner->start > i)
207 switch (*scanner->start) {
208 case 'a':
if (MORE(1))
switch(scanner->start[1]) {
209 case 'n':
return checkKeyword(2,
"d", TOKEN_AND);
210 case 'w':
return checkKeyword(2,
"ait", TOKEN_AWAIT);
211 case 's':
if (MORE(2)) {
212 switch (scanner->start[2]) {
213 case 's':
return checkKeyword(3,
"ert", TOKEN_ASSERT);
214 case 'y':
return checkKeyword(3,
"nc", TOKEN_ASYNC);
218 return checkKeyword(2,
"", TOKEN_AS);
221 case 'b':
if (MORE(1))
return checkKeyword(1,
"reak", TOKEN_BREAK);
222 else if (scanner->start[1] ==
'\'' || scanner->start[1] ==
'"')
return TOKEN_PREFIX_B;
224 case 'c':
if (MORE(1))
switch(scanner->start[1]) {
225 case 'l':
return checkKeyword(2,
"ass", TOKEN_CLASS);
226 case 'o':
return checkKeyword(2,
"ntinue", TOKEN_CONTINUE);
228 case 'd':
if (MORE(1))
switch(scanner->start[1]) {
229 case 'e':
if (MORE(2))
switch (scanner->start[2]) {
230 case 'f':
return checkKeyword(3,
"", TOKEN_DEF);
231 case 'l':
return checkKeyword(3,
"", TOKEN_DEL);
234 case 'e':
if (MORE(1))
switch(scanner->start[1]) {
235 case 'l':
if (MORE(2))
switch(scanner->start[2]) {
236 case 's':
return checkKeyword(3,
"e", TOKEN_ELSE);
237 case 'i':
return checkKeyword(3,
"f", TOKEN_ELIF);
239 case 'x':
return checkKeyword(2,
"cept", TOKEN_EXCEPT);
241 case 'f':
if (MORE(1))
switch(scanner->start[1]) {
242 case 'i':
return checkKeyword(2,
"nally", TOKEN_FINALLY);
243 case 'o':
return checkKeyword(2,
"r", TOKEN_FOR);
244 case 'r':
return checkKeyword(2,
"om", TOKEN_FROM);
245 }
else if (scanner->start[1] ==
'\'' || scanner->start[1] ==
'"')
return TOKEN_PREFIX_F;
247 case 'F':
return checkKeyword(1,
"alse", TOKEN_FALSE);
248 case 'i':
if (MORE(1))
switch (scanner->start[1]) {
249 case 'f':
return checkKeyword(2,
"", TOKEN_IF);
250 case 'n':
return checkKeyword(2,
"", TOKEN_IN);
251 case 'm':
return checkKeyword(2,
"port", TOKEN_IMPORT);
252 case 's':
return checkKeyword(2,
"", TOKEN_IS);
254 case 'l':
if (MORE(1))
switch (scanner->start[1]) {
255 case 'a':
return checkKeyword(2,
"mbda", TOKEN_LAMBDA);
256 case 'e':
return checkKeyword(2,
"t", TOKEN_LET);
258 case 'n':
return checkKeyword(1,
"ot", TOKEN_NOT);
259 case 'N':
return checkKeyword(1,
"one", TOKEN_NONE);
260 case 'o':
return checkKeyword(1,
"r", TOKEN_OR);
261 case 'p':
return checkKeyword(1,
"ass", TOKEN_PASS);
262 case 'r':
if (MORE(1))
switch (scanner->start[1]) {
263 case 'e':
return checkKeyword(2,
"turn", TOKEN_RETURN);
264 case 'a':
return checkKeyword(2,
"ise", TOKEN_RAISE);
265 }
else if (scanner->start[1] ==
'\'' || scanner->start[1] ==
'"')
return TOKEN_PREFIX_R;
267 case 's':
return checkKeyword(1,
"uper", TOKEN_SUPER);
268 case 't':
return checkKeyword(1,
"ry", TOKEN_TRY);
269 case 'T':
return checkKeyword(1,
"rue", TOKEN_TRUE);
270 case 'w':
if (MORE(1))
switch(scanner->start[1]) {
271 case 'h':
return checkKeyword(2,
"ile", TOKEN_WHILE);
272 case 'i':
return checkKeyword(2,
"th", TOKEN_WITH);
274 case 'y':
return checkKeyword(1,
"ield", TOKEN_YIELD);
276 return TOKEN_IDENTIFIER;
280 while (isAlpha(peek(scanner)) || isDigit(peek(scanner)) || (
unsigned char)peek(scanner) > 0x7F) advance(scanner);
282 return makeToken(scanner, identifierType(scanner));
286 if (scanner->hasUnget) {
289 scanner->hasUnget = 1;
290 scanner->unget = token;
303 if (scanner->hasUnget) {
304 scanner->hasUnget = 0;
305 return scanner->unget;
309 if (scanner->startOfLine && (peek(scanner) ==
' ' || peek(scanner) ==
'\t')) {
310 scanner->start = scanner->cur;
311 scanner->startOfLine = 0;
312 return makeIndentation(scanner);
316 skipWhitespace(scanner);
319 if (peek(scanner) ==
'#')
while (peek(scanner) !=
'\n' && !isAtEnd(scanner)) advance(scanner);
321 scanner->start = scanner->cur;
322 if (isAtEnd(scanner))
return makeToken(scanner, TOKEN_EOF);
324 char c = advance(scanner);
328 if (scanner->startOfLine) {
330 out = makeToken(scanner, TOKEN_RETRY);
332 scanner->startOfLine = 1;
333 out = makeToken(scanner, TOKEN_EOL);
339 if (c ==
'\\' && peek(scanner) ==
'\n') {
342 return makeToken(scanner, TOKEN_RETRY);
346 scanner->startOfLine = 0;
348 if (isAlpha(c) || (
unsigned char)c > 0x7F)
return identifier(scanner);
349 if (isDigit(c))
return number(scanner, c);
352 case '(':
return makeToken(scanner, TOKEN_LEFT_PAREN);
353 case ')':
return makeToken(scanner, TOKEN_RIGHT_PAREN);
354 case '{':
return makeToken(scanner, TOKEN_LEFT_BRACE);
355 case '}':
return makeToken(scanner, TOKEN_RIGHT_BRACE);
356 case '[':
return makeToken(scanner, TOKEN_LEFT_SQUARE);
357 case ']':
return makeToken(scanner, TOKEN_RIGHT_SQUARE);
358 case ',':
return makeToken(scanner, TOKEN_COMMA);
359 case ';':
return makeToken(scanner, TOKEN_SEMICOLON);
360 case '~':
return makeToken(scanner, TOKEN_TILDE);
361 case '.':
return makeToken(scanner, peek(scanner) ==
'.' ? (peekNext(scanner,1) ==
'.' ? (advance(scanner), advance(scanner), TOKEN_ELLIPSIS) : TOKEN_DOT) : TOKEN_DOT);
363 case ':':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_WALRUS : TOKEN_COLON);
364 case '!':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_BANG_EQUAL : TOKEN_BANG);
365 case '=':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_EQUAL_EQUAL : TOKEN_EQUAL);
366 case '<':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_LESS_EQUAL : (match(scanner,
'<') ? (match(scanner,
'=') ? TOKEN_LSHIFT_EQUAL : TOKEN_LEFT_SHIFT) : TOKEN_LESS));
367 case '>':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_GREATER_EQUAL : (match(scanner,
'>') ? (match(scanner,
'=') ? TOKEN_RSHIFT_EQUAL : TOKEN_RIGHT_SHIFT) : TOKEN_GREATER));
368 case '-':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_MINUS_EQUAL : (match(scanner,
'-') ? TOKEN_MINUS_MINUS : (match(scanner,
'>') ? TOKEN_ARROW : TOKEN_MINUS)));
369 case '+':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_PLUS_EQUAL : (match(scanner,
'+') ? TOKEN_PLUS_PLUS : TOKEN_PLUS));
370 case '^':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_CARET_EQUAL : TOKEN_CARET);
371 case '|':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_PIPE_EQUAL : TOKEN_PIPE);
372 case '&':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_AMP_EQUAL : TOKEN_AMPERSAND);
373 case '/':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_SOLIDUS_EQUAL : (match(scanner,
'/') ? (match(scanner,
'=') ? TOKEN_DSOLIDUS_EQUAL : TOKEN_DOUBLE_SOLIDUS) : TOKEN_SOLIDUS));
374 case '*':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_ASTERISK_EQUAL: (match(scanner,
'*') ? (match(scanner,
'=') ? TOKEN_POW_EQUAL : TOKEN_POW) : TOKEN_ASTERISK));
375 case '%':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_MODULO_EQUAL : TOKEN_MODULO);
376 case '@':
return makeToken(scanner, match(scanner,
'=') ? TOKEN_AT_EQUAL : TOKEN_AT);
378 case '"':
return string(scanner,
'"');
379 case '\'':
return string(scanner,
'\'');
382 return errorToken(scanner,
"Unexpected character.");
Top-level header with configuration macros.
Definitions used by the token scanner.
void krk_ungetToken(KrkScanner *, KrkToken token)
Push a token back to the scanner to be reprocessed.
KrkToken krk_scanToken(KrkScanner *)
Read the next token from the scanner.
KrkScanner krk_initScanner(const char *src)
Initialize the compiler to scan tokens from 'src'.
KrkScanner krk_tellScanner(KrkScanner *)
Retrieve a copy of the current scanner state.
void krk_rewindScanner(KrkScanner *, KrkScanner to)
Rewind the scanner to a previous state.
A token from the scanner.