/* tokenizer.c */ #include #include #include #include #include #include #include #include #include #include "tokenizer.h" #include "token.h" #include "input.h" #include "util.h" token_t *left_stack[8]; int left_stack_pos = 0; hash_table_t *typedef_table = NULL; /* Utility Functions */ void init_tokenizer(const char *filename) { input_init(filename); typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string); } void destroy_tokenizer(void) { input_destroy(); } void reject_token(token_t *token) { left_stack[left_stack_pos++] = token; } token_t *peek_token(void) { if (left_stack_pos > 0) { return left_stack[left_stack_pos - 1]; } token_t *token = next_token(); reject_token(token); return token; } /* Stringify Type */ const char *stringify_type(c_token_types type) { switch (type) { case TOK_IF: return "if"; case TOK_ELSE: return "else"; case TOK_SWITCH: return "switch"; case TOK_CASE: return "case"; case TOK_DEFAULT: return "default"; case TOK_WHILE: return "while"; case TOK_DO: return "do"; case TOK_FOR: return "for"; case TOK_CONTINUE: return "continue"; case TOK_BREAK: return "break"; case TOK_RETURN: return "return"; case TOK_GOTO: return "goto"; case TOK_VOID: return "void"; case TOK_CHAR: return "char"; case TOK_SHORT: return "short"; case TOK_INT: return "int"; case TOK_LONG: return "long"; case TOK_FLOAT: return "float"; case TOK_DOUBLE: return "double"; case TOK_SIGNED: return "signed"; case TOK_UNSIGNED: return "unsigned"; case TOK_STRUCT: return "struct"; case TOK_UNION: return "union"; case TOK_ENUM: return "enum"; case TOK_TYPEDEF: return "typedef"; case TOK_AUTO: return "auto"; case TOK_REGISTER: return "register"; case TOK_STATIC: return "static"; case TOK_EXTERN: return "extern"; case TOK_CONST: return "const"; case TOK_VOLATILE: return "volatile"; case TOK_SIZEOF: return "sizeof"; case TOK_ADD: return "+"; case TOK_SUB: return "-"; case TOK_MUL: return "*"; case TOK_DIV: return "/"; case TOK_MOD: return "%"; case TOK_BIT_AND: return "&"; case TOK_BIT_OR: return "|"; case TOK_BIT_XOR: return "^"; case TOK_BIT_NOT: return "~"; case TOK_LSHIFT: return "<<"; case TOK_RSHIFT: return ">>"; case TOK_NOT: return "!"; case TOK_ASSIGN: return "="; case TOK_LT: return "<"; case TOK_GT: return ">"; case TOK_INC: return "++"; case TOK_DEC: return "--"; case TOK_EQ: return "=="; case TOK_NE: return "!="; case TOK_LE: return "<="; case TOK_GE: return ">="; case TOK_AND: return "&&"; case TOK_OR: return "||"; case TOK_MEMBER_POINTER: return "->"; case TOK_MEMBER: return "."; case TOK_COND_DECISION: return ":"; case TOK_COND: return "?"; case TOK_ASSIGN_ADD: return "+="; case TOK_ASSIGN_SUB: return "-="; case TOK_ASSIGN_MUL: return "*="; case TOK_ASSIGN_DIV: return "/="; case TOK_ASSIGN_MOD: return "%="; case TOK_ASSIGN_BITAND: return "&="; case TOK_ASSIGN_BITOR: return "|="; case TOK_ASSIGN_BITXOR: return "^="; case TOK_ASSIGN_LSHIFT: return "<<="; case TOK_ASSIGN_RSHIFT: return ">>="; case TOK_HASH: return "#"; case TOK_ID: return "identifier"; case TOK_TYPEDEF_NAME: return "typedef name"; case TOK_INTEGER_U32: case TOK_INTEGER_U64: case TOK_INTEGER_S32: case TOK_INTEGER_S64: return "integer constant"; case TOK_FLOAT_32: case TOK_FLOAT_64: return "floating constant"; case TOK_CHAR_CONST: return "character constant"; case TOK_STRING_ASCII: return "string constant"; case TOK_EOF: return "EOF"; case TOK_ERROR: return "error"; case TOK_LEFT_PAREN: return "("; case TOK_RIGHT_PAREN: return ")"; case TOK_LEFT_BRACKET: return "["; case TOK_RIGHT_BRACKET: return "]"; case TOK_LEFT_BRACE: return "{"; case TOK_RIGHT_BRACE: return "}"; case TOK_COMMA: return ","; case TOK_SEMICOLON: return ";"; case TOK_DOT: return "."; case TOK_ELLIPSIS: return "..."; } return "UNKNOWN"; } void consume(c_token_types kind) { token_t *token = next_token(); if (token_type(token) != kind) { fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n", stringify_type(kind), stringify_type(token_type(token))); exit(1); } token_destroy(token); } void consume_alt(c_token_types *kinds, int n) { token_t *token = next_token(); for (int i = 0; i < n; i++) { if (token_type(token) == kinds[i]) { token_destroy(token); return; } } fprintf(stderr, "Error: Expected one of the following tokens: "); for (int i = 0; i < n; i++) { fprintf(stderr, "\"%s\" ", stringify_type(kinds[i])); } fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(token))); exit(1); } /* Tokenization Function */ char file_name[1024]; /* Warning/Error Functions */ void tok_error(const char *fmt, ...) { va_list args; va_start(args, fmt); fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line, column); vfprintf(stderr, fmt, args); va_end(args); } void tok_warn(const char *fmt, ...) { va_list args; va_start(args, fmt); fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name, line, column); vfprintf(stderr, fmt, args); va_end(args); } /* Skip Whitespace */ static token_t *skip_whitespace(void) { int c; while ((c = input_getc()) != EOF) { if (isspace(c)) { // Whitespace if (c == '\n') { line++; column = 1; } else { column++; } } else if (c == '#') // GCC preprocessor line control directive. { char buf[512]; int i = 0; while ((c = input_getc()) != EOF && c != '\n') { buf[i++] = c; column++; } buf[i] = '\0'; if (sscanf(buf, "%d \"%[^\"]\"", &line, file_name) == 2) { column = 1; } else { tok_error("Invalid #line directive\n"); } if (c == EOF) { return NULL; } } else if (c == '/') { // Comment c = input_getc(); if (c == '/') { while ((c = input_getc()) != EOF && c != '\n') { column++; } if (c == EOF) { return NULL; } line++; column = 1; } else if (c == '*') { // Multiline comment while ((c = input_getc()) != EOF) { if (c == '*') { c = input_getc(); if (c == '/') { break; } } else if (c == '\n') { line++; column = 1; } else { column++; } } if (c == EOF) { return NULL; } } else { // Handled here to simplify the code. if (c == '=') return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); return token_create(TOK_DIV, line, column, 1); } } else { input_ungetc(c); return NULL; } } return NULL; } /* Tokenize Identifier */ /* Get Keyword */ c_token_types get_keyword(const char *buf, int len) { switch (buf[0]) { case 'a': if (len == 4 && buf[1] == 'u' && buf[2] == 't' && buf[3] == 'o') return TOK_AUTO; break; case 'b': if (len == 5 && buf[1] == 'r' && buf[2] == 'e' && buf[3] == 'a' && buf[4] == 'k') return TOK_BREAK; break; case 'c': switch (buf[1]) { case 'a': if (len == 4 && buf[2] == 's' && buf[3] == 'e') return TOK_CASE; break; case 'h': if (len == 4 && buf[2] == 'a' && buf[3] == 'r') return TOK_CHAR; break; case 'o': if (len == 5 && buf[2] == 'n' && buf[3] == 's' && buf[4] == 't') return TOK_CONST; if (len == 8 && buf[2] == 'n' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'n' && buf[6] == 'u' && buf[7] == 'e') return TOK_CONTINUE; break; } break; case 'd': switch (buf[1]) { case 'e': if (len == 7 && buf[2] == 'f' && buf[3] == 'a' && buf[4] == 'u' && buf[5] == 'l' && buf[6] == 't') return TOK_DEFAULT; break; case 'o': if (len == 2 && buf[2] == '\0') return TOK_DO; if (len == 6 && buf[2] == 'u' && buf[3] == 'b' && buf[4] == 'l' && buf[5] == 'e') return TOK_DOUBLE; break; } break; case 'e': switch (buf[1]) { case 'l': if (len == 4 && buf[2] == 's' && buf[3] == 'e') return TOK_ELSE; break; case 'n': if (len == 4 && buf[2] == 'u' && buf[3] == 'm') return TOK_ENUM; break; case 'x': if (len == 6 && buf[2] == 't' && buf[3] == 'e' && buf[4] == 'r' && buf[5] == 'n') return TOK_EXTERN; break; } break; case 'f': switch (buf[1]) { case 'l': if (len == 5 && buf[2] == 'o' && buf[3] == 'a' && buf[4] == 't') return TOK_FLOAT; break; case 'o': if (len == 3 && buf[2] == 'r') return TOK_FOR; break; } break; case 'g': if (len == 4 && buf[1] == 'o' && buf[2] == 't' && buf[3] == 'o') return TOK_GOTO; break; case 'i': switch (buf[1]) { case 'f': if (len == 2 && buf[2] == '\0') return TOK_IF; break; case 'n': if (len == 3 && buf[2] == 't') return TOK_INT; break; } break; case 'l': if (len == 4 && buf[1] == 'o' && buf[2] == 'n' && buf[3] == 'g') return TOK_LONG; break; case 'r': switch (buf[1]) { case 'e': if (len == 8 && buf[2] == 'g' && buf[3] == 'i' && buf[4] == 's' && buf[5] == 't' && buf[6] == 'e' && buf[7] == 'r') return TOK_REGISTER; if (len == 6 && buf[2] == 't' && buf[3] == 'u' && buf[4] == 'r' && buf[5] == 'n') return TOK_RETURN; break; } break; case 's': switch (buf[1]) { case 'h': if (len == 5 && buf[2] == 'o' && buf[3] == 'r' && buf[4] == 't') return TOK_SHORT; break; case 't': if (len == 6 && buf[2] == 'a' && buf[3] == 't' && buf[4] == 'i' && buf[5] == 'c') return TOK_STATIC; break; case 'i': if (len == 6 && buf[2] == 'g' && buf[3] == 'n' && buf[4] == 'e' && buf[5] == 'd') return TOK_SIGNED; if (len == 6 && buf[2] == 'z' && buf[3] == 'e' && buf[4] == 'o' && buf[5] == 'f') return TOK_SIZEOF; break; case 'r': if (len == 6 && buf[2] == 'u' && buf[3] == 'c' && buf[4] == 't') return TOK_STRUCT; break; case 'w': if (len == 6 && buf[2] == 'i' && buf[3] == 't' && buf[4] == 'c' && buf[5] == 'h') return TOK_SWITCH; break; } break; case 't': if (len == 7 && buf[1] == 'y' && buf[2] == 'p' && buf[3] == 'e' && buf[4] == 'd' && buf[5] == 'e' && buf[6] == 'f') return TOK_TYPEDEF; break; case 'u': switch (buf[1]) { case 'n': if (len == 5 && buf[2] == 'i' && buf[3] == 'o' && buf[4] == 'n') return TOK_UNION; if (len == 8 && buf[2] == 's' && buf[3] == 'i' && buf[4] == 'g' && buf[5] == 'n' && buf[6] == 'e' && buf[7] == 'd') return TOK_UNSIGNED; break; } break; case 'v': switch (buf[1]) { case 'o': if (len == 4 && buf[2] == 'i' && buf[3] == 'd') return TOK_VOID; if (len == 8 && buf[2] == 'l' && buf[3] == 'a' && buf[4] == 't' && buf[5] == 'i' && buf[6] == 'l' && buf[7] == 'e') return TOK_VOLATILE; break; } break; case 'w': if (len == 5 && buf[1] == 'h' && buf[2] == 'i' && buf[3] == 'l' && buf[4] == 'e') return TOK_WHILE; break; default: return TOK_ID; } return TOK_ID; } static token_t *read_identifier(void) { int c; char buf[1024]; int i = 0; c = input_getc(); if (!isalpha(c) && c != '_') { input_ungetc(c); return NULL; } buf[i++] = c; while ((c = input_getc()) != EOF) { if (!isalnum(c) && c != '_') { input_ungetc(c); break; } buf[i++] = c; if (i >= 1008) { tok_error("Identifier too long\n"); exit(1); } } buf[i] = '\0'; column += i; // Check if it's a keyword c_token_types kind = get_keyword(buf, i); if (kind != TOK_ID) { return token_create(kind, line, column, i); } // Check if it's a typedef if (hash_table_get(typedef_table, buf) != NULL) { return token_create(TOK_TYPEDEF_NAME, line, column, i); } return token_create_string(kind, line, column, buf, i); } /* Tokenize Number */ static token_t *read_number(void) { int c; char buf[1024]; int i = 0; c = input_getc(); /* Check for valid prefix */ // If we don't have a digit or decimal point, it's not a number if (!isdigit(c) && c != '.') { input_ungetc(c); return NULL; } // Decimal point followed by non-digit is a struct member if (c == '.') { char cnext = input_getc(); if (!isdigit(cnext)) { input_ungetc(cnext); return token_create(TOK_MEMBER, line, column, 1); } input_ungetc(cnext); } int radix = 10; /* Process Radix */ // Check for hex and octal. if (c == '0') { char cnext = input_getc(); if (cnext == 'x' || cnext == 'X') { // Hex, discard the 0x radix = 16; } else { // Octal, discard the 0 input_ungetc(cnext); radix = 8; } } else { // Decimal, append the first digit buf[i++] = c; } int is_float = 0; /* Read Number Loop */ while ((c = input_getc()) != EOF) { // Since there can be multiple writes to the buffer, we want to make sure we // don't overflow by giving a 4 byte pad if (i > 1020) { tok_error("Number too long\n"); return NULL; } // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and // a-f/A-F for hex if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) || (radix == 8 && c >= '0' && c <= '7')) { buf[i++] = c; // Decimal point and not a float yet, must be a float } else if (c == '.' && !is_float) { is_float = 1; if (radix != 10) { tok_error("Invalid floating point constant, expected decimal, got %s\n", radix == 16 ? "hexadecimal" : "octal"); return NULL; } buf[i++] = c; } // Exponent on the end of a constant. (By standard this forces it to be a // float) else if (c == 'e' || c == 'E') { buf[i++] = c; c = input_getc(); // Sign on the exponent if (c == '+' || c == '-') { buf[i++] = c; c = input_getc(); } // Exponent must be a digit, I.E no 1e1.2 if (!isdigit(c)) { tok_error("Invalid floating point exponent\n"); return NULL; } buf[i++] = c; is_float = 1; } else { // Reached the end, unget the character so other functions can read it input_ungetc(c); break; } } buf[i] = '\0'; /* Process Suffixes */ int is_unsigned = 0; int is_long = 0; int is_single = 0; while (1) { c = input_getc(); if (c == 'u' || c == 'U') { if (is_unsigned) { tok_warn( "Warning: Duplicate suffix 'u' for integer constant ignored\n"); } is_unsigned = 1; } else if (c == 'l' || c == 'L') { if (is_long) { tok_warn( "Warning: Duplicate suffix 'l' for integer constant ignored\n"); } is_long = 1; } else if (c == 'f' || c == 'F') { if (is_single) { tok_warn("Warning: Duplicate suffix 'f' for floating point constant " "ignored\n"); } is_single = 1; } else { input_ungetc(c); break; } } /* Check for conflicting suffixes */ if (is_single && is_long) { tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point " "constant. Ignoring 'l'\n"); is_long = 0; } if (is_single && is_unsigned) { tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point " "constant. Ignoring 'u'\n"); is_unsigned = 0; } if (is_single && !is_float) { tok_warn( "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n"); is_single = 0; } if (is_float) { /* Convert to float */ errno = 0; // Strtod generates a unix-style error when it's given something out of // range, so we want to get on top of that quickly instead of ignoring it // That way we can avoid some nasty NAN-propagation in the constant folder. double f = strtod(buf, NULL); if (errno == ERANGE) { tok_error("Floating point constant out of range\n"); return NULL; } // Warn if the constant is out of range for a float, I.E it's too big or too // small if (is_single && (f < FLT_MIN || f > FLT_MAX)) { tok_warn( "Warning: Floating point constant %f is out of range for float\n", f); } // Warn if the constant is too precise for a float if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) { tok_warn("Warning: Converting double precision floating point constant " "%f to float loses " "precision\n", f); } return token_create_float(is_single ? TOK_FLOAT_32 : TOK_FLOAT_64, line, column, f, i); } else { /* Convert to integer */ errno = 0; uint64_t int_ = strtoull(buf, NULL, radix); // Same as above, but for integers if (errno == ERANGE) { tok_error("Integer constant out of range\n"); return NULL; } if (is_unsigned) { if (is_long) { return token_create_int(TOK_INTEGER_U64, line, column, int_, i); } else { if (int_ > UINT32_MAX) { tok_warn( "Warning: Integer constant %lld is out of range for unsigned " "int\n", int_); } return token_create_int(TOK_INTEGER_U32, line, column, int_, i); } } else { if (is_long) { // If the highest bit is set, that means this will overflow a signed // long (Due to two's complement) if (int_ & (1UL << 63)) { tok_warn( "Warning: Integer constant %lld is out of range for long long\n", i); } return token_create_int(TOK_INTEGER_S64, line, column, int_, i); } else { if (int_ & (1UL << 31)) { tok_warn("Warning: Integer constant %lld is out of range for int\n", int_); } return token_create_int(TOK_INTEGER_S32, line, column, int_, i); } } } return NULL; } /* Tokenize String */ /* Read Escape Sequence */ static char read_escape_sequence(int *len) { int c = input_getc(); *len += 1; switch (c) { case 'a': return '\a'; case 'b': return '\b'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\v'; case '\'': return '\''; case '"': return '"'; case '?': return '?'; case '\\': return '\\'; case '0': return '\0'; case 'x': { c = input_getc(); if (!isxdigit(c)) { tok_error("Invalid hexadecimal escape sequence\n"); return 0; } int val = 0; while (isxdigit(c)) { *len += 1; val = val * 16 + (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10); c = input_getc(); } input_ungetc(c); return (char)val; } default: if (!isdigit(c)) { tok_error("Invalid escape sequence\n"); return 0; } int val = 0; while (isdigit(c)) { *len += 1; val = val * 8 + c - '0'; c = input_getc(); } input_ungetc(c); return (char)val; } } static token_t *read_string_literal(void) { int c; c = input_getc(); if (c != '"') { input_ungetc(c); return NULL; } int i = 0; char s_buf[512]; char *buf = s_buf; int len = 512; int esc_pad = 0; while ((c = input_getc()) != EOF) { if (c == '"') { // Implicit skip of closing quote break; } if (c == '\\') { c = read_escape_sequence(&esc_pad); if (c == 0) { return NULL; } } if (i >= len) { if (buf == s_buf) { buf = malloc(1024); if (buf == NULL) { fputs("Out of memory. Could not parse string literal.\n", stderr); exit(1); } memcpy(buf, s_buf, 512); len *= 2; } else { len *= 2; buf = realloc(buf, len); } } buf[i++] = c; } buf[i] = '\0'; if (c == EOF) { tok_error("Unterminated string literal\n"); if (buf != s_buf) { free(buf); } return NULL; } token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf, i + esc_pad + 2); if (buf != s_buf) { free(buf); } return tok; } /* Tokenize Character */ static token_t *read_char_constant(void) { int c; int len = 0; c = input_getc(); if (c != '\'') { input_ungetc(c); return NULL; } len++; c = input_getc(); if (c == '\'') { tok_error("Empty character constant\n"); return NULL; } if (c == '\\') { c = read_escape_sequence(&len); } int val = c; c = input_getc(); if (c != '\'') { tok_error("Expected closing quote for character constant\n"); return NULL; } len++; return token_create_char(TOK_CHAR_CONST, line, column, val, len); } /* Tokenize Operator */ token_t *read_operator(void) { int c; c = input_getc(); switch (c) { case '!': { c = input_getc(); if (c == '=') return token_create(TOK_NE, line, column, 2); input_ungetc(c); return token_create(TOK_NOT, line, column, 1); } case '%': { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_MOD, line, column, 2); input_ungetc(c); return token_create(TOK_MOD, line, column, 1); } case '&': { c = input_getc(); if (c == '&') return token_create(TOK_AND, line, column, 2); if (c == '=') return token_create(TOK_ASSIGN_BITAND, line, column, 2); input_ungetc(c); return token_create(TOK_BIT_AND, line, column, 1); } case '(': return token_create(TOK_LEFT_PAREN, line, column, 1); case ')': return token_create(TOK_RIGHT_PAREN, line, column, 1); case '*': { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_MUL, line, column, 2); input_ungetc(c); return token_create(TOK_MUL, line, column, 1); } case '+': { c = input_getc(); if (c == '+') return token_create(TOK_INC, line, column, 2); if (c == '=') return token_create(TOK_ASSIGN_ADD, line, column, 2); input_ungetc(c); return token_create(TOK_ADD, line, column, 2); } case ',': return token_create(TOK_COMMA, line, column, 1); case '-': { c = input_getc(); if (c == '-') return token_create(TOK_DEC, line, column, 2); if (c == '=') return token_create(TOK_ASSIGN_SUB, line, column, 2); if (c == '>') return token_create(TOK_MEMBER_POINTER, line, column, 2); input_ungetc(c); return token_create(TOK_SUB, line, column, 1); } case '.': { c = input_getc(); if (c == '.') { c = input_getc(); if (c == '.') { return token_create(TOK_ELLIPSIS, line, column, 3); } else { // Bail out, can't store more than one unget tok_error("Unexpected character '.' at line %d, column %d\n", line, column); exit(1); } } return token_create('.', line, column, 1); } case '/': { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_DIV, line, column, 2); input_ungetc(c); return token_create(TOK_DIV, line, column, 1); } case ':': return token_create(TOK_COND_DECISION, line, column, 1); case ';': return token_create(TOK_SEMICOLON, line, column, 1); case '<': { c = input_getc(); if (c == '<') { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_LSHIFT, line, column, 3); input_ungetc(c); return token_create(TOK_LSHIFT, line, column, 2); } if (c == '=') return token_create(TOK_LE, line, column, 2); input_ungetc(c); return token_create(TOK_LT, line, column, 1); } case '=': { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN, line, column, 2); input_ungetc(c); return token_create(TOK_ASSIGN, line, column, 1); } case '>': { c = input_getc(); if (c == '>') { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_RSHIFT, line, column, 3); input_ungetc(c); return token_create(TOK_RSHIFT, line, column, 2); } if (c == '=') return token_create(TOK_GE, line, column, 2); input_ungetc(c); return token_create(TOK_GT, line, column, 1); } case '?': return token_create(TOK_COND, line, column, 1); case '[': return token_create(TOK_LEFT_BRACKET, line, column, 1); case ']': return token_create(TOK_RIGHT_BRACKET, line, column, 1); case '^': { c = input_getc(); if (c == '=') return token_create(TOK_ASSIGN_BITXOR, line, column, 2); input_ungetc(c); return token_create(TOK_BIT_XOR, line, column, 1); } case '{': return token_create(TOK_LEFT_BRACE, line, column, 1); case '|': { c = input_getc(); if (c == '|') return token_create(TOK_OR, line, column, 2); if (c == '=') return token_create(TOK_ASSIGN_BITOR, line, column, 2); input_ungetc(c); return token_create(TOK_BIT_OR, line, column, 1); } case '}': return token_create(TOK_RIGHT_BRACE, line, column, 1); case '~': return token_create(TOK_BIT_NOT, line, column, 1); default: input_ungetc(c); return NULL; } return NULL; } token_t *next_token(void) { if (left_stack_pos > 0) { return left_stack[--left_stack_pos]; } token_t *tok = skip_whitespace(); if (tok != NULL) { return tok; } tok = read_identifier(); if (tok != NULL) { return tok; } tok = read_number(); if (tok != NULL) { return tok; } tok = read_char_constant(); if (tok != NULL) { return tok; } tok = read_string_literal(); if (tok != NULL) { return tok; } tok = read_operator(); if (tok != NULL) { return tok; } int c = input_getc(); if (c == EOF) { return NULL; } tok_warn( "Warning: Ignoring unexpected character '%c' at line %d, column %d\n", c, line, column); return next_token(); } #ifdef TEST_TOKENIZER /* Run Test */ char *preprocess(char *in) { char *output_name = malloc(1024); snprintf(output_name, 1024, "%s.preprocessed", in); char *command = malloc(2048); snprintf(command, 2048, "gcc -E -xc %s -o %s", in, output_name); system(command); free(command); return output_name; } // Tokenize the input file int main(int argc, char **argv) { if (argc != 2) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; } char *input_name = argv[1]; char *preprocessed = preprocess(input_name); init_tokenizer(preprocessed); token_t *tok; while ((tok = next_token()) != NULL) { print_token(tok); token_destroy(tok); } destroy_tokenizer(); remove(preprocessed); free(preprocessed); hash_table_destroy(string_table); return 0; } #endif