11 {
"char", TOKEN_CHAR_KW},
15 {
"while", TOKEN_WHILE},
17 {
"return", TOKEN_RETURN},
18 {
"break", TOKEN_BREAK},
19 {
"continue", TOKEN_CONTINUE},
22 {
"interface", TOKEN_INTERFACE},
23 {
"implementation", TOKEN_IMPLEMENTATION},
24 {
"protocol", TOKEN_PROTOCOL},
25 {
"property", TOKEN_PROPERTY},
26 {
"synthesize", TOKEN_SYNTHESIZE},
27 {
"dynamic", TOKEN_DYNAMIC},
28 {
"class", TOKEN_CLASS},
29 {
"selector", TOKEN_SELECTOR},
30 {
"encode", TOKEN_ENCODE},
31 {
"synchronized", TOKEN_SYNCHRONIZED},
33 {
"catch", TOKEN_CATCH},
34 {
"finally", TOKEN_FINALLY},
35 {
"throw", TOKEN_THROW},
36 {
"atomic", TOKEN_ATOMIC},
37 {
"nonatomic", TOKEN_NONATOMIC},
38 {
"retain", TOKEN_RETAIN},
39 {
"assign", TOKEN_ASSIGN_ATTR},
42 {
"strong", TOKEN_STRONG},
43 {
"readonly", TOKEN_READONLY},
44 {
"readwrite", TOKEN_READWRITE},
45 {
"getter", TOKEN_GETTER},
46 {
"setter", TOKEN_SETTER},
47 {
"optional", TOKEN_OPTIONAL},
48 {
"required", TOKEN_REQUIRED},
49 {
"public", TOKEN_PUBLIC},
50 {
"private", TOKEN_PRIVATE},
51 {
"protected", TOKEN_PROTECTED},
52 {
"package", TOKEN_PACKAGE},
55 {
"super", TOKEN_SUPER},
60 {
"instancetype", TOKEN_INSTANCETYPE},
61 {
"Class", TOKEN_CLASS_KW},
64 {
"BOOL", TOKEN_BOOL_KW},
65 {
"NSString", TOKEN_NSSTRING},
66 {
"NSArray", TOKEN_NSARRAY},
67 {
"NSDictionary", TOKEN_NSDICTIONARY},
68 {
"NSObject", TOKEN_NSOBJECT},
/*
 * lexer_create — allocate and initialize a Lexer over a private copy of
 * `input`. `filename` is retained for diagnostics.
 * NOTE(review): this view of the file is missing several source lines
 * (the null-input guard body, the Lexer allocation itself, and the
 * position/line/column initialization are elided) — confirm against the
 * full file before relying on this documentation.
 */
74Lexer *lexer_create(
const char *input,
const char *filename) {
/* Null input is reported to stderr; the elided guard presumably bails out. */
76 fprintf(stderr,
"Error: lexer_create received null input\n");
/* Allocation failure of the Lexer struct itself is treated as fatal. */
82 error_fatal(
"Memory allocation failed for lexer");
/* Zero every field before use. */
87 memset(lexer, 0,
sizeof(
Lexer));
89 lexer->input_length = strlen(input);
/* Own a private copy of the source text (+1 for the terminating NUL). */
90 lexer->input = malloc(lexer->input_length + 1);
93 error_fatal(
"Memory allocation failed for input buffer");
97 strcpy(lexer->input, input);
/* `source` aliases the owned copy; only `input` is freed in lexer_destroy. */
98 lexer->source = lexer->input;
/* Start with a clean error state. */
104 lexer->has_error =
false;
105 lexer->error_message = NULL;
110void lexer_destroy(
Lexer *lexer) {
113 free(lexer->error_message);
118static char lexer_current_char(
Lexer *lexer) {
119 if (lexer->pos >= lexer->input_length) {
122 return lexer->input[lexer->pos];
125static char lexer_peek_char(
Lexer *lexer) {
126 if (lexer->pos + 1 >= lexer->input_length) {
129 return lexer->input[lexer->pos + 1];
/*
 * lexer_advance — consume one character, maintaining the line/column
 * counters. A no-op once the cursor is at the end of the input.
 * NOTE(review): the lines between the '\n' test and the trailing
 * assignments are elided in this view (presumably line++/column reset
 * on newline, column++ otherwise, then pos++) — confirm against the
 * full file.
 */
132static void lexer_advance(
Lexer *lexer) {
133 if (lexer->pos < lexer->input_length) {
134 if (lexer->input[lexer->pos] ==
'\n') {
/* position/current are kept as mirrors of pos after the move. */
142 lexer->position = lexer->pos;
143 lexer->current = lexer->pos;
147static void lexer_skip_whitespace(
Lexer *lexer) {
148 while (isspace(lexer_current_char(lexer))) {
149 lexer_advance(lexer);
/*
 * lexer_skip_comment — skip a single comment starting at the cursor:
 * either a line comment (two slashes, consumed up to but not including
 * the newline) or a block comment (slash-star ... star-slash).
 * Callers are expected to have checked that a comment actually starts
 * here (see the dispatch in lexer_next_token).
 */
153static void lexer_skip_comment(
Lexer *lexer) {
/* Line comment: consume until newline or end of input. */
154 if (lexer_current_char(lexer) ==
'/' && lexer_peek_char(lexer) ==
'/') {
155 while (lexer_current_char(lexer) !=
'\n' && lexer_current_char(lexer) !=
'\0') {
156 lexer_advance(lexer);
158 }
/* Block comment: eat the opener, then scan for the closer or EOF. */
else if (lexer_current_char(lexer) ==
'/' && lexer_peek_char(lexer) ==
'*') {
159 lexer_advance(lexer);
160 lexer_advance(lexer);
161 while (!(lexer_current_char(lexer) ==
'*' && lexer_peek_char(lexer) ==
'/') &&
162 lexer_current_char(lexer) !=
'\0') {
163 lexer_advance(lexer);
/* Consume the closing pair if present; an unterminated comment just
   runs to EOF with no diagnostic. */
165 if (lexer_current_char(lexer) ==
'*') {
166 lexer_advance(lexer);
167 lexer_advance(lexer);
172bool is_keyword(
const char *str,
TokenType *type) {
173 for (
int i = 0; keywords[i].keyword != NULL; i++) {
174 if (strcmp(str, keywords[i].keyword) == 0) {
175 *type = keywords[i].type;
/*
 * lexer_read_identifier — read an identifier of the shape
 * [A-Za-z_][A-Za-z0-9_]* into a freshly allocated token, classifying it
 * as a keyword when is_keyword matches, otherwise TOKEN_IDENTIFIER.
 * NOTE(review): the local declarations (Token token; int i = 0;), the
 * malloc null-check branch, and the final `return token;` are elided in
 * this view — confirm against the full file.
 */
182static Token lexer_read_identifier(
Lexer *lexer) {
184 memset(&token, 0,
sizeof(
Token));
185 token.line = lexer->line;
186 token.column = lexer->column;
/* Identifier text is capped at MAX_IDENTIFIER_LENGTH - 1 characters;
   longer identifiers are silently truncated by the loop bound below. */
189 token.value = malloc(MAX_IDENTIFIER_LENGTH);
191 error_fatal(
"Memory allocation failed for token value");
195 while ((isalnum(lexer_current_char(lexer)) || lexer_current_char(lexer) ==
'_') &&
196 i < MAX_IDENTIFIER_LENGTH - 1) {
197 token.value[i++] = lexer_current_char(lexer);
198 lexer_advance(lexer);
200 token.value[i] =
'\0';
/* Keywords win over plain identifiers; is_keyword fills token.type. */
202 if (!is_keyword(token.value, &token.type)) {
203 token.type = TOKEN_IDENTIFIER;
/*
 * (fragment of lexer_read_number — the signature line is elided in this
 * view of the file)
 * Reads a run of decimal digits into token.value and also stores the
 * parsed integer in token.literal.int_value. No sign, hex, octal or
 * floating-point handling here.
 * NOTE(review): atoi has no error reporting and overflows silently on
 * out-of-range input — consider strtol in the full file.
 */
211 memset(&token, 0,
sizeof(
Token));
212 token.type = TOKEN_NUMBER;
213 token.line = lexer->line;
214 token.column = lexer->column;
217 token.value = malloc(MAX_STRING_LENGTH);
219 error_fatal(
"Memory allocation failed for token value");
/* Digit run is capped at MAX_STRING_LENGTH - 1 characters. */
223 while (isdigit(lexer_current_char(lexer)) && i < MAX_STRING_LENGTH - 1) {
224 token.value[i++] = lexer_current_char(lexer);
225 lexer_advance(lexer);
227 token.value[i] =
'\0';
/* Keep both the raw text and its numeric value on the token. */
230 token.literal.int_value = atoi(token.value);
/*
 * (fragment of lexer_read_string — the signature line is elided in this
 * view of the file)
 * Reads a double-quoted string literal, decoding the escapes
 * \n \t \r \\ \" ; any other backslash-escaped character is kept
 * verbatim. A missing closing quote is reported via error_syntax.
 */
237 memset(&token, 0,
sizeof(
Token));
238 token.type = TOKEN_STRING;
239 token.line = lexer->line;
240 token.column = lexer->column;
243 token.value = malloc(MAX_STRING_LENGTH);
245 error_fatal(
"Memory allocation failed for token value");
/* Consume the opening quote. */
248 lexer_advance(lexer);
/* Collect characters until the closing quote, EOF, or the buffer cap. */
251 while (lexer_current_char(lexer) !=
'"' && lexer_current_char(lexer) !=
'\0' &&
252 i < MAX_STRING_LENGTH - 1) {
253 if (lexer_current_char(lexer) ==
'\\') {
254 lexer_advance(lexer);
255 switch (lexer_current_char(lexer)) {
256 case 'n': token.value[i++] =
'\n';
break;
257 case 't': token.value[i++] =
'\t';
break;
258 case 'r': token.value[i++] =
'\r';
break;
259 case '\\': token.value[i++] =
'\\';
break;
260 case '"': token.value[i++] =
'"';
break;
/* Unknown escape: keep the escaped character as-is. */
261 default: token.value[i++] = lexer_current_char(lexer);
break;
264 token.value[i++] = lexer_current_char(lexer);
266 lexer_advance(lexer);
/* Consume the closing quote, or flag an unterminated literal. */
269 if (lexer_current_char(lexer) ==
'"') {
270 lexer_advance(lexer);
272 error_syntax(lexer->line, lexer->column,
"Unterminated string literal");
275 token.value[i] =
'\0';
/*
 * lexer_read_at_literal — lex the construct that begins with '@':
 *   @"..."   -> TOKEN_NSSTRING_LITERAL (string value prefixed with '@')
 *   @[       -> TOKEN_NSARRAY_LITERAL       (value is just "@")
 *   @{       -> TOKEN_NSDICTIONARY_LITERAL  (value is just "@")
 *   @(       -> TOKEN_BOXED_EXPRESSION      (value is just "@")
 *   @word    -> one of the TOKEN_AT_* directives, else TOKEN_AT_IDENTIFIER
 *   @digits  -> TOKEN_BOXED_NUMBER ("@" + digits)
 *   lone '@' -> TOKEN_AT
 * NOTE(review): the malloc null-check conditions, several closing
 * braces, and a presumed free(old_value) after the NSString-literal
 * strcat are elided in this view — confirm against the full file.
 */
280static Token lexer_read_at_literal(
Lexer *lexer) {
282 memset(&token, 0,
sizeof(
Token));
283 token.line = lexer->line;
284 token.column = lexer->column;
/* Consume the '@' itself, then dispatch on the following character. */
286 lexer_advance(lexer);
288 char next_char = lexer_current_char(lexer);
290 if (next_char ==
'"') {
/* @"...": reuse the plain string reader, then prepend '@' to the value. */
292 token = lexer_read_string(lexer);
293 token.type = TOKEN_NSSTRING_LITERAL;
295 char *old_value = token.value;
296 token.value = malloc(strlen(old_value) + 2);
298 error_fatal(
"Memory allocation failed for token value");
300 strcpy(token.value,
"@");
301 strcat(token.value, old_value);
303 }
else if (next_char ==
'[') {
/* The bracket itself is left for the caller; only '@' is consumed. */
305 token.type = TOKEN_NSARRAY_LITERAL;
306 token.value = malloc(2);
308 error_fatal(
"Memory allocation failed for token value");
310 strcpy(token.value,
"@");
311 }
else if (next_char ==
'{') {
313 token.type = TOKEN_NSDICTIONARY_LITERAL;
314 token.value = malloc(2);
316 error_fatal(
"Memory allocation failed for token value");
318 strcpy(token.value,
"@");
319 }
else if (next_char ==
'(') {
321 token.type = TOKEN_BOXED_EXPRESSION;
322 token.value = malloc(2);
324 error_fatal(
"Memory allocation failed for token value");
326 strcpy(token.value,
"@");
327 }
else if (isalpha(next_char) || next_char ==
'_') {
/* @word: read the identifier, then map known directives to AT_* types. */
329 Token keyword_token = lexer_read_identifier(lexer);
332 if (strcmp(keyword_token.value,
"interface") == 0) {
333 token.type = TOKEN_AT_INTERFACE;
334 }
else if (strcmp(keyword_token.value,
"implementation") == 0) {
335 token.type = TOKEN_AT_IMPLEMENTATION;
336 }
else if (strcmp(keyword_token.value,
"protocol") == 0) {
337 token.type = TOKEN_AT_PROTOCOL;
338 }
else if (strcmp(keyword_token.value,
"property") == 0) {
339 token.type = TOKEN_AT_PROPERTY;
340 }
else if (strcmp(keyword_token.value,
"synthesize") == 0) {
341 token.type = TOKEN_AT_SYNTHESIZE;
342 }
else if (strcmp(keyword_token.value,
"dynamic") == 0) {
343 token.type = TOKEN_AT_DYNAMIC;
344 }
else if (strcmp(keyword_token.value,
"class") == 0) {
345 token.type = TOKEN_AT_CLASS;
346 }
else if (strcmp(keyword_token.value,
"selector") == 0) {
347 token.type = TOKEN_AT_SELECTOR;
348 }
else if (strcmp(keyword_token.value,
"encode") == 0) {
349 token.type = TOKEN_AT_ENCODE;
350 }
else if (strcmp(keyword_token.value,
"synchronized") == 0) {
351 token.type = TOKEN_AT_SYNCHRONIZED;
352 }
else if (strcmp(keyword_token.value,
"try") == 0) {
353 token.type = TOKEN_AT_TRY;
354 }
else if (strcmp(keyword_token.value,
"catch") == 0) {
355 token.type = TOKEN_AT_CATCH;
356 }
else if (strcmp(keyword_token.value,
"finally") == 0) {
357 token.type = TOKEN_AT_FINALLY;
358 }
else if (strcmp(keyword_token.value,
"throw") == 0) {
359 token.type = TOKEN_AT_THROW;
360 }
else if (strcmp(keyword_token.value,
"end") == 0) {
361 token.type = TOKEN_AT_END;
/* Unknown directive: keep it as a generic @identifier token. */
363 token.type = TOKEN_AT_IDENTIFIER;
/* Token text is "@" + the identifier; the intermediate value is freed. */
367 token.value = malloc(strlen(keyword_token.value) + 2);
369 error_fatal(
"Memory allocation failed for token value");
371 strcpy(token.value,
"@");
372 strcat(token.value, keyword_token.value);
374 free(keyword_token.value);
375 }
else if (isdigit(next_char)) {
/* @digits: boxed number literal, "@" + digit run. */
377 Token number_token = lexer_read_number(lexer);
378 token.type = TOKEN_BOXED_NUMBER;
381 token.value = malloc(strlen(number_token.value) + 2);
383 error_fatal(
"Memory allocation failed for token value");
385 strcpy(token.value,
"@");
386 strcat(token.value, number_token.value);
388 free(number_token.value);
/* Fallback: a bare '@' with nothing recognizable after it. */
391 token.type = TOKEN_AT;
392 token.value = malloc(2);
394 error_fatal(
"Memory allocation failed for token value");
396 strcpy(token.value,
"@");
/*
 * (fragment of lexer_next_token — the signature line and the operator
 * switch header are elided in this view of the file)
 * Produces the next token: skips whitespace and comments, dispatches to
 * the specialized readers for '@' literals, identifiers, numbers and
 * strings, then classifies one- and two-character operators and
 * punctuation. Falls through to a TOKEN_EOF at end of input.
 */
404 memset(&token, 0,
sizeof(
Token));
406 while (lexer_current_char(lexer) !=
'\0') {
407 lexer_skip_whitespace(lexer);
/* Comments may be interleaved with whitespace, hence the loop. */
409 if (lexer_current_char(lexer) ==
'/' &&
410 (lexer_peek_char(lexer) ==
'/' || lexer_peek_char(lexer) ==
'*')) {
411 lexer_skip_comment(lexer);
/* Record the token's start position before consuming anything. */
415 token.line = lexer->line;
416 token.column = lexer->column;
418 char c = lexer_current_char(lexer);
/* '@' constructs, identifiers, numbers and strings get dedicated readers. */
421 return lexer_read_at_literal(lexer);
424 if (isalpha(c) || c ==
'_') {
425 return lexer_read_identifier(lexer);
429 return lexer_read_number(lexer);
433 return lexer_read_string(lexer);
/* Operators/punctuation: 3 bytes fits the longest two-char lexeme + NUL. */
437 token.value = malloc(3);
439 error_fatal(
"Memory allocation failed for token value");
442 lexer_advance(lexer);
/* Default to a single-character lexeme; two-char cases overwrite it. */
444 token.value[1] =
'\0';
447 case '+': token.type = TOKEN_PLUS;
break;
448 case '-': token.type = TOKEN_MINUS;
break;
449 case '*': token.type = TOKEN_MULTIPLY;
break;
450 case '/': token.type = TOKEN_DIVIDE;
break;
451 case '%': token.type = TOKEN_MODULO;
break;
/* '=' vs '==' */
453 if (lexer_current_char(lexer) ==
'=') {
454 lexer_advance(lexer);
455 token.type = TOKEN_EQUAL;
456 strcpy(token.value,
"==");
458 token.type = TOKEN_ASSIGN;
/* '!' vs '!=' */
462 if (lexer_current_char(lexer) ==
'=') {
463 lexer_advance(lexer);
464 token.type = TOKEN_NOT_EQUAL;
465 strcpy(token.value,
"!=");
467 token.type = TOKEN_NOT;
/* '<' vs '<=' */
471 if (lexer_current_char(lexer) ==
'=') {
472 lexer_advance(lexer);
473 token.type = TOKEN_LESS_EQUAL;
474 strcpy(token.value,
"<=");
476 token.type = TOKEN_LESS;
/* '>' vs '>=' */
480 if (lexer_current_char(lexer) ==
'=') {
481 lexer_advance(lexer);
482 token.type = TOKEN_GREATER_EQUAL;
483 strcpy(token.value,
">=");
485 token.type = TOKEN_GREATER;
/* '&&' only; a lone '&' is not a recognized operator here. */
489 if (lexer_current_char(lexer) ==
'&') {
490 lexer_advance(lexer);
491 token.type = TOKEN_AND;
492 strcpy(token.value,
"&&");
494 token.type = TOKEN_UNKNOWN;
/* '||' only; a lone '|' is not a recognized operator here. */
498 if (lexer_current_char(lexer) ==
'|') {
499 lexer_advance(lexer);
500 token.type = TOKEN_OR;
501 strcpy(token.value,
"||");
503 token.type = TOKEN_UNKNOWN;
506 case ';': token.type = TOKEN_SEMICOLON;
break;
507 case ',': token.type = TOKEN_COMMA;
break;
508 case '(': token.type = TOKEN_LPAREN;
break;
509 case ')': token.type = TOKEN_RPAREN;
break;
510 case '{': token.type = TOKEN_LBRACE;
break;
511 case '}': token.type = TOKEN_RBRACE;
break;
512 case '[': token.type = TOKEN_LBRACKET;
break;
513 case ']': token.type = TOKEN_RBRACKET;
break;
514 default: token.type = TOKEN_UNKNOWN;
break;
/* End of input: synthesize the EOF token. */
520 token.type = TOKEN_EOF;
521 token.line = lexer->line;
522 token.column = lexer->column;
523 token.value = malloc(4);
525 strcpy(token.value,
"EOF");
/*
 * (fragment of lexer_peek_token — the signature line is elided in this
 * view of the file)
 * Lexes one token ahead, then restores every cursor field so the stream
 * is unchanged for the next lexer_next_token call.
 * NOTE(review): the peeked Token carries a heap-allocated value; confirm
 * in the full file who is responsible for freeing it.
 */
532 size_t saved_pos = lexer->pos;
533 size_t saved_line = lexer->line;
534 size_t saved_column = lexer->column;
536 Token token = lexer_next_token(lexer);
/* Rewind: position/current are mirrors of pos and must track it. */
538 lexer->pos = saved_pos;
539 lexer->line = saved_line;
540 lexer->column = saved_column;
541 lexer->position = saved_pos;
542 lexer->current = saved_pos;
547const char *token_type_to_string(
TokenType type) {
549 case TOKEN_EOF:
return "EOF";
550 case TOKEN_IDENTIFIER:
return "IDENTIFIER";
551 case TOKEN_NUMBER:
return "NUMBER";
552 case TOKEN_STRING:
return "STRING";
553 case TOKEN_CHAR:
return "CHAR";
554 case TOKEN_INT:
return "INT";
555 case TOKEN_CHAR_KW:
return "CHAR_KW";
556 case TOKEN_VOID:
return "VOID";
557 case TOKEN_IF:
return "IF";
558 case TOKEN_ELSE:
return "ELSE";
559 case TOKEN_WHILE:
return "WHILE";
560 case TOKEN_FOR:
return "FOR";
561 case TOKEN_RETURN:
return "RETURN";
562 case TOKEN_BREAK:
return "BREAK";
563 case TOKEN_CONTINUE:
return "CONTINUE";
564 case TOKEN_PLUS:
return "PLUS";
565 case TOKEN_MINUS:
return "MINUS";
566 case TOKEN_MULTIPLY:
return "MULTIPLY";
567 case TOKEN_DIVIDE:
return "DIVIDE";
568 case TOKEN_MODULO:
return "MODULO";
569 case TOKEN_ASSIGN:
return "ASSIGN";
570 case TOKEN_EQUAL:
return "EQUAL";
571 case TOKEN_NOT_EQUAL:
return "NOT_EQUAL";
572 case TOKEN_LESS:
return "LESS";
573 case TOKEN_LESS_EQUAL:
return "LESS_EQUAL";
574 case TOKEN_GREATER:
return "GREATER";
575 case TOKEN_GREATER_EQUAL:
return "GREATER_EQUAL";
576 case TOKEN_AND:
return "AND";
577 case TOKEN_OR:
return "OR";
578 case TOKEN_NOT:
return "NOT";
579 case TOKEN_SEMICOLON:
return "SEMICOLON";
580 case TOKEN_COMMA:
return "COMMA";
581 case TOKEN_LPAREN:
return "LPAREN";
582 case TOKEN_RPAREN:
return "RPAREN";
583 case TOKEN_LBRACE:
return "LBRACE";
584 case TOKEN_RBRACE:
return "RBRACE";
585 case TOKEN_LBRACKET:
return "LBRACKET";
586 case TOKEN_RBRACKET:
return "RBRACKET";
589 case TOKEN_AT:
return "AT";
590 case TOKEN_AT_INTERFACE:
return "AT_INTERFACE";
591 case TOKEN_AT_IMPLEMENTATION:
return "AT_IMPLEMENTATION";
592 case TOKEN_AT_PROTOCOL:
return "AT_PROTOCOL";
593 case TOKEN_AT_PROPERTY:
return "AT_PROPERTY";
594 case TOKEN_AT_SYNTHESIZE:
return "AT_SYNTHESIZE";
595 case TOKEN_AT_DYNAMIC:
return "AT_DYNAMIC";
596 case TOKEN_AT_CLASS:
return "AT_CLASS";
597 case TOKEN_AT_SELECTOR:
return "AT_SELECTOR";
598 case TOKEN_AT_ENCODE:
return "AT_ENCODE";
599 case TOKEN_AT_SYNCHRONIZED:
return "AT_SYNCHRONIZED";
600 case TOKEN_AT_TRY:
return "AT_TRY";
601 case TOKEN_AT_CATCH:
return "AT_CATCH";
602 case TOKEN_AT_FINALLY:
return "AT_FINALLY";
603 case TOKEN_AT_THROW:
return "AT_THROW";
604 case TOKEN_AT_END:
return "AT_END";
605 case TOKEN_AT_IDENTIFIER:
return "AT_IDENTIFIER";
606 case TOKEN_NSSTRING_LITERAL:
return "NSSTRING_LITERAL";
607 case TOKEN_NSARRAY_LITERAL:
return "NSARRAY_LITERAL";
608 case TOKEN_NSDICTIONARY_LITERAL:
return "NSDICTIONARY_LITERAL";
609 case TOKEN_BOXED_EXPRESSION:
return "BOXED_EXPRESSION";
610 case TOKEN_BOXED_NUMBER:
return "BOXED_NUMBER";
613 case TOKEN_INTERFACE:
return "INTERFACE";
614 case TOKEN_IMPLEMENTATION:
return "IMPLEMENTATION";
615 case TOKEN_PROTOCOL:
return "PROTOCOL";
616 case TOKEN_PROPERTY:
return "PROPERTY";
617 case TOKEN_SYNTHESIZE:
return "SYNTHESIZE";
618 case TOKEN_DYNAMIC:
return "DYNAMIC";
619 case TOKEN_CLASS:
return "CLASS";
620 case TOKEN_SELECTOR:
return "SELECTOR";
621 case TOKEN_ENCODE:
return "ENCODE";
622 case TOKEN_SYNCHRONIZED:
return "SYNCHRONIZED";
623 case TOKEN_TRY:
return "TRY";
624 case TOKEN_CATCH:
return "CATCH";
625 case TOKEN_FINALLY:
return "FINALLY";
626 case TOKEN_THROW:
return "THROW";
627 case TOKEN_ATOMIC:
return "ATOMIC";
628 case TOKEN_NONATOMIC:
return "NONATOMIC";
629 case TOKEN_RETAIN:
return "RETAIN";
630 case TOKEN_ASSIGN_ATTR:
return "ASSIGN_ATTR";
631 case TOKEN_COPY:
return "COPY";
632 case TOKEN_WEAK:
return "WEAK";
633 case TOKEN_STRONG:
return "STRONG";
634 case TOKEN_READONLY:
return "READONLY";
635 case TOKEN_READWRITE:
return "READWRITE";
636 case TOKEN_GETTER:
return "GETTER";
637 case TOKEN_SETTER:
return "SETTER";
638 case TOKEN_OPTIONAL:
return "OPTIONAL";
639 case TOKEN_REQUIRED:
return "REQUIRED";
640 case TOKEN_PUBLIC:
return "PUBLIC";
641 case TOKEN_PRIVATE:
return "PRIVATE";
642 case TOKEN_PROTECTED:
return "PROTECTED";
643 case TOKEN_PACKAGE:
return "PACKAGE";
644 case TOKEN_END:
return "END";
645 case TOKEN_SELF:
return "SELF";
646 case TOKEN_SUPER:
return "SUPER";
647 case TOKEN_NIL:
return "NIL";
648 case TOKEN_YES:
return "YES";
649 case TOKEN_NO:
return "NO";
650 case TOKEN_ID:
return "ID";
651 case TOKEN_INSTANCETYPE:
return "INSTANCETYPE";
652 case TOKEN_CLASS_KW:
return "CLASS_KW";
653 case TOKEN_SEL:
return "SEL";
654 case TOKEN_IMP:
return "IMP";
655 case TOKEN_BOOL_KW:
return "BOOL_KW";
656 case TOKEN_NSSTRING:
return "NSSTRING";
657 case TOKEN_NSARRAY:
return "NSARRAY";
658 case TOKEN_NSDICTIONARY:
return "NSDICTIONARY";
659 case TOKEN_NSOBJECT:
return "NSOBJECT";
661 default:
return "UNKNOWN";
/*
 * print_token — debug dump of a token: type name, lexeme text, and
 * source position. A NULL value is printed as the literal text "NULL".
 * NOTE(review): the trailing printf arguments (presumably token->line
 * and token->column, matching the two %d specifiers) and the closing of
 * the call are elided in this view — confirm against the full file.
 */
665void print_token(
const Token *token) {
666 printf(
"Token: %s '%s' at line %d, column %d\n",
667 token_type_to_string(token->type),
668 token->value ? token->value :
"NULL",
TokenType
Token types for lexical analysis.