/* tokenizer.c */
#include <ctype.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tokenizer.h"
#include "token.h"
#include "input.h"
#include "util.h"
/* Stack of tokens pushed back by reject_token(). next_token() pops from it
 * before reading any new input. It has room for 8 entries. */
token_t *left_stack[8];
/* Number of tokens currently held in left_stack (index of the next free
 * slot). */
int left_stack_pos = 0;
/* Table consulted by read_identifier() to decide whether an identifier is a
 * typedef name. Created in init_tokenizer(). */
hash_table_t *typedef_table = NULL;
/* Utility Functions */
/*
 * Prepares the tokenizer to read from `filename`: opens the input through
 * input_init() and creates the typedef-name table used by read_identifier().
 * Exits the process if the table cannot be created.
 */
void init_tokenizer(const char *filename) {
  input_init(filename);
  typedef_table = hash_table_create(16, cmp_string, hash_string, dtor_string);
  if (typedef_table == NULL) {
    fprintf(stderr, "Error: Could not create typedef table\n");
    exit(1);
  }
}

/*
 * Shuts the tokenizer down: closes the input through input_destroy() and
 * releases the typedef-name table created by init_tokenizer(), which was
 * previously never released. Safe to call when init_tokenizer() was never
 * called, and safe to call twice.
 */
void destroy_tokenizer(void) {
  input_destroy();
  if (typedef_table != NULL) {
    hash_table_destroy(typedef_table);
    // Reset so a later lookup cannot touch a destroyed table.
    typedef_table = NULL;
  }
}

/*
 * Pushes `token` back so that the next call to next_token() returns it.
 * Tokens come back in last-in, first-out order.
 *
 * The pushback stack has a fixed capacity; pushing onto a full stack used to
 * write past the end of left_stack. It is now reported as a fatal error.
 */
void reject_token(token_t *token) {
  const int capacity = (int)(sizeof left_stack / sizeof left_stack[0]);
  if (left_stack_pos >= capacity) {
    fprintf(stderr, "Error: Too many rejected tokens (limit is %d)\n",
            capacity);
    exit(1);
  }
  left_stack[left_stack_pos++] = token;
}

/*
 * Returns the token that the next call to next_token() will return, without
 * consuming it. If nothing is pushed back yet, one token is read and pushed
 * back first.
 */
token_t *peek_token(void) {
  if (left_stack_pos <= 0) {
    // Nothing buffered: fetch one token and park it on the pushback stack.
    token_t *fetched = next_token();
    reject_token(fetched);
    return fetched;
  }
  return left_stack[left_stack_pos - 1];
}

/* Stringify Type */
/*
 * Returns a printable name for a token type, for use in diagnostics.
 * Keywords and punctuators map to their spelling; constants, identifiers and
 * the EOF/error markers map to a short description. Any value without a case
 * below yields "UNKNOWN". The returned string is a literal and must not be
 * freed or modified.
 */
const char *stringify_type(c_token_types type) {
  const char *text = "UNKNOWN";
  switch (type) {
  // Statement keywords
  case TOK_IF: text = "if"; break;
  case TOK_ELSE: text = "else"; break;
  case TOK_SWITCH: text = "switch"; break;
  case TOK_CASE: text = "case"; break;
  case TOK_DEFAULT: text = "default"; break;
  case TOK_WHILE: text = "while"; break;
  case TOK_DO: text = "do"; break;
  case TOK_FOR: text = "for"; break;
  case TOK_CONTINUE: text = "continue"; break;
  case TOK_BREAK: text = "break"; break;
  case TOK_RETURN: text = "return"; break;
  case TOK_GOTO: text = "goto"; break;
  // Type keywords
  case TOK_VOID: text = "void"; break;
  case TOK_CHAR: text = "char"; break;
  case TOK_SHORT: text = "short"; break;
  case TOK_INT: text = "int"; break;
  case TOK_LONG: text = "long"; break;
  case TOK_FLOAT: text = "float"; break;
  case TOK_DOUBLE: text = "double"; break;
  case TOK_SIGNED: text = "signed"; break;
  case TOK_UNSIGNED: text = "unsigned"; break;
  case TOK_STRUCT: text = "struct"; break;
  case TOK_UNION: text = "union"; break;
  case TOK_ENUM: text = "enum"; break;
  case TOK_TYPEDEF: text = "typedef"; break;
  // Storage classes, qualifiers and sizeof
  case TOK_AUTO: text = "auto"; break;
  case TOK_REGISTER: text = "register"; break;
  case TOK_STATIC: text = "static"; break;
  case TOK_EXTERN: text = "extern"; break;
  case TOK_CONST: text = "const"; break;
  case TOK_VOLATILE: text = "volatile"; break;
  case TOK_SIZEOF: text = "sizeof"; break;
  // Arithmetic and bitwise operators
  case TOK_ADD: text = "+"; break;
  case TOK_SUB: text = "-"; break;
  case TOK_MUL: text = "*"; break;
  case TOK_DIV: text = "/"; break;
  case TOK_MOD: text = "%"; break;
  case TOK_BIT_AND: text = "&"; break;
  case TOK_BIT_OR: text = "|"; break;
  case TOK_BIT_XOR: text = "^"; break;
  case TOK_BIT_NOT: text = "~"; break;
  case TOK_LSHIFT: text = "<<"; break;
  case TOK_RSHIFT: text = ">>"; break;
  // Logical, relational and miscellaneous operators
  case TOK_NOT: text = "!"; break;
  case TOK_ASSIGN: text = "="; break;
  case TOK_LT: text = "<"; break;
  case TOK_GT: text = ">"; break;
  case TOK_INC: text = "++"; break;
  case TOK_DEC: text = "--"; break;
  case TOK_EQ: text = "=="; break;
  case TOK_NE: text = "!="; break;
  case TOK_LE: text = "<="; break;
  case TOK_GE: text = ">="; break;
  case TOK_AND: text = "&&"; break;
  case TOK_OR: text = "||"; break;
  case TOK_MEMBER_POINTER: text = "->"; break;
  case TOK_MEMBER: text = "."; break;
  case TOK_COND_DECISION: text = ":"; break;
  case TOK_COND: text = "?"; break;
  // Compound assignment operators
  case TOK_ASSIGN_ADD: text = "+="; break;
  case TOK_ASSIGN_SUB: text = "-="; break;
  case TOK_ASSIGN_MUL: text = "*="; break;
  case TOK_ASSIGN_DIV: text = "/="; break;
  case TOK_ASSIGN_MOD: text = "%="; break;
  case TOK_ASSIGN_BITAND: text = "&="; break;
  case TOK_ASSIGN_BITOR: text = "|="; break;
  case TOK_ASSIGN_BITXOR: text = "^="; break;
  case TOK_ASSIGN_LSHIFT: text = "<<="; break;
  case TOK_ASSIGN_RSHIFT: text = ">>="; break;
  case TOK_HASH: text = "#"; break;
  // Identifiers and constants
  case TOK_ID: text = "identifier"; break;
  case TOK_TYPEDEF_NAME: text = "typedef name"; break;
  case TOK_INTEGER_U32:
  case TOK_INTEGER_U64:
  case TOK_INTEGER_S32:
  case TOK_INTEGER_S64: text = "integer constant"; break;
  case TOK_FLOAT_32:
  case TOK_FLOAT_64: text = "floating constant"; break;
  case TOK_CHAR_CONST: text = "character constant"; break;
  case TOK_STRING_ASCII: text = "string constant"; break;
  // Special markers
  case TOK_EOF: text = "EOF"; break;
  case TOK_ERROR: text = "error"; break;
  // Separators
  case TOK_LEFT_PAREN: text = "("; break;
  case TOK_RIGHT_PAREN: text = ")"; break;
  case TOK_LEFT_BRACKET: text = "["; break;
  case TOK_RIGHT_BRACKET: text = "]"; break;
  case TOK_LEFT_BRACE: text = "{"; break;
  case TOK_RIGHT_BRACE: text = "}"; break;
  case TOK_COMMA: text = ","; break;
  case TOK_SEMICOLON: text = ";"; break;
  case TOK_DOT: text = "."; break;
  case TOK_ELLIPSIS: text = "..."; break;
  }
  return text;
}


/*
 * Reads the next token and requires it to be of type `kind`. On a match the
 * token is destroyed and the function returns. On a mismatch an error naming
 * the expected and actual token types is printed to stderr and the process
 * exits with status 1.
 */
void consume(c_token_types kind) {
  token_t *tok = next_token();
  if (token_type(tok) == kind) {
    token_destroy(tok);
    return;
  }
  fprintf(stderr, "Error: Expected token of type \"%s\", got \"%s\"\n",
          stringify_type(kind), stringify_type(token_type(tok)));
  exit(1);
}

/*
 * Reads the next token and requires its type to be one of the `n` entries of
 * `kinds`. On a match the token is destroyed and the function returns.
 * Otherwise the list of accepted types and the actual type are printed to
 * stderr and the process exits with status 1.
 */
void consume_alt(c_token_types *kinds, int n) {
  token_t *tok = next_token();
  int idx = 0;
  while (idx < n) {
    if (kinds[idx] == token_type(tok)) {
      token_destroy(tok);
      return;
    }
    idx++;
  }

  // No alternative matched: report every accepted type, then what we got.
  fputs("Error: Expected one of the following tokens: ", stderr);
  for (idx = 0; idx < n; idx++) {
    fprintf(stderr, "\"%s\" ", stringify_type(kinds[idx]));
  }
  fprintf(stderr, "got \"%s\"\n", stringify_type(token_type(tok)));
  exit(1);
}

/* Tokenization Function */
/* Name of the file reported in diagnostics. Written by skip_whitespace() when
 * it parses a line-control directive; read by tok_error() and tok_warn(). */
char file_name[1024];
/* Warning/Error Functions */
/*
 * Prints an error message to stderr, prefixed with the current file name,
 * line and column. `fmt` and the following arguments are printf-style. The
 * function only reports; it does not stop tokenization or exit.
 */
void tok_error(const char *fmt, ...) {
  fprintf(stderr, "Error in file %s at line %d, column %d: ", file_name, line,
          column);

  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
}

/*
 * Prints a warning message to stderr, prefixed with the current file name,
 * line and column. `fmt` and the following arguments are printf-style.
 */
void tok_warn(const char *fmt, ...) {
  fprintf(stderr, "Warning in file %s at line %d, column %d: ", file_name,
          line, column);

  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);
}

/* Skip Whitespace */
/*
 * Consumes whitespace, comments and '#' line-control directives, updating
 * `line`, `column` and `file_name` along the way.
 *
 * Returns NULL once the next character starts a real token (that character is
 * pushed back) or when the input ends. Because a '/' has to be read to detect
 * a comment, a '/' that is not a comment is tokenized here and returned as
 * TOK_DIV or TOK_ASSIGN_DIV.
 *
 * Fixes over the previous version:
 *  - a directive longer than the local buffer no longer overflows it, and the
 *    file name read by sscanf is width-limited to fit file_name;
 *  - a directive that fails to parse no longer clobbers `line`, and its
 *    terminating newline is still counted;
 *  - inside a block comment the character after a '*' is re-examined instead
 *    of being dropped, so comments ending in a run of stars are closed and a
 *    newline right after '*' is counted;
 *  - an unterminated block comment is reported.
 */
static token_t *skip_whitespace(void) {
  int c;
  while ((c = input_getc()) != EOF) {
    if (isspace(c)) { // Whitespace
      if (c == '\n') {
        line++;
        column = 1;
      } else {
        column++;
      }
    } else if (c == '#') { // GCC preprocessor line control directive.
      char buf[512];
      size_t used = 0;
      // Always consume through end of line, but store only what fits.
      while ((c = input_getc()) != EOF && c != '\n') {
        if (used < sizeof buf - 1) {
          buf[used++] = (char)c;
        }
        column++;
      }
      buf[used] = '\0';

      // Parse into temporaries so a partial match leaves the position alone.
      // The %1023 width matches the 1024-byte file_name buffer.
      int new_line = 0;
      char new_name[sizeof file_name];
      if (sscanf(buf, "%d \"%1023[^\"]\"", &new_line, new_name) == 2) {
        line = new_line;
        strcpy(file_name, new_name);
        column = 1;
      } else {
        tok_error("Invalid #line directive\n");
        if (c == '\n') {
          line++;
          column = 1;
        }
      }
      if (c == EOF) {
        return NULL;
      }
    } else if (c == '/') { // Comment
      c = input_getc();
      if (c == '/') {
        while ((c = input_getc()) != EOF && c != '\n') {
          column++;
        }
        if (c == EOF) {
          return NULL;
        }
        line++;
        column = 1;
      } else if (c == '*') { // Multiline comment
        int closed = 0;
        while (!closed && (c = input_getc()) != EOF) {
          if (c == '*') {
            c = input_getc();
            if (c == '/') {
              closed = 1;
            } else if (c == EOF) {
              break;
            } else {
              // Not the terminator: look at this character again, it may be
              // another '*' or a newline.
              input_ungetc(c);
            }
          } else if (c == '\n') {
            line++;
            column = 1;
          } else {
            column++;
          }
        }
        if (!closed) {
          tok_error("Unterminated comment\n");
          return NULL;
        }
      } else { // Handled here to simplify the code.
        if (c == '=')
          return token_create(TOK_ASSIGN_DIV, line, column, 2);
        input_ungetc(c);
        return token_create(TOK_DIV, line, column, 1);
      }
    } else {
      input_ungetc(c);
      return NULL;
    }
  }
  return NULL;
}

/* Tokenize Identifier */
/* Get Keyword */
/*
 * Classifies the first `len` characters of `buf` as a C keyword.
 *
 * Returns the keyword's token type when the text is exactly one of the
 * keywords listed below, and TOK_ID otherwise (including for an empty or NULL
 * buffer). `buf` does not need to be NUL-terminated.
 *
 * The previous hand-unrolled matcher tested "struct" under the wrong second
 * letter, so "struct" was never recognised (and the non-keyword "sruct?" was).
 * A table lookup removes that class of mistake.
 */
c_token_types get_keyword(const char *buf, int len) {
#define KEYWORD(text, kind) {text, (int)sizeof(text) - 1, kind}
  static const struct {
    const char *text;
    int len;
    c_token_types kind;
  } keywords[] = {
      KEYWORD("auto", TOK_AUTO),         KEYWORD("break", TOK_BREAK),
      KEYWORD("case", TOK_CASE),         KEYWORD("char", TOK_CHAR),
      KEYWORD("const", TOK_CONST),       KEYWORD("continue", TOK_CONTINUE),
      KEYWORD("default", TOK_DEFAULT),   KEYWORD("do", TOK_DO),
      KEYWORD("double", TOK_DOUBLE),     KEYWORD("else", TOK_ELSE),
      KEYWORD("enum", TOK_ENUM),         KEYWORD("extern", TOK_EXTERN),
      KEYWORD("float", TOK_FLOAT),       KEYWORD("for", TOK_FOR),
      KEYWORD("goto", TOK_GOTO),         KEYWORD("if", TOK_IF),
      KEYWORD("int", TOK_INT),           KEYWORD("long", TOK_LONG),
      KEYWORD("register", TOK_REGISTER), KEYWORD("return", TOK_RETURN),
      KEYWORD("short", TOK_SHORT),       KEYWORD("signed", TOK_SIGNED),
      KEYWORD("sizeof", TOK_SIZEOF),     KEYWORD("static", TOK_STATIC),
      KEYWORD("struct", TOK_STRUCT),     KEYWORD("switch", TOK_SWITCH),
      KEYWORD("typedef", TOK_TYPEDEF),   KEYWORD("union", TOK_UNION),
      KEYWORD("unsigned", TOK_UNSIGNED), KEYWORD("void", TOK_VOID),
      KEYWORD("volatile", TOK_VOLATILE), KEYWORD("while", TOK_WHILE),
  };
#undef KEYWORD

  if (buf == NULL || len <= 0) {
    return TOK_ID;
  }
  for (size_t k = 0; k < sizeof keywords / sizeof keywords[0]; k++) {
    // Cheap rejects first: most identifiers differ in first letter or length.
    if (keywords[k].text[0] != buf[0] || keywords[k].len != len) {
      continue;
    }
    if (memcmp(keywords[k].text, buf, (size_t)len) == 0) {
      return keywords[k].kind;
    }
  }
  return TOK_ID;
}

/*
 * Tries to read an identifier, keyword or typedef name.
 *
 * Returns NULL, with the character pushed back, when the next character
 * cannot start an identifier. Otherwise reads the longest run of letters,
 * digits and underscores, advances `column` by its length and returns:
 *  - a keyword token when get_keyword() recognises the text,
 *  - a TOK_TYPEDEF_NAME token when the text is present in typedef_table,
 *  - a TOK_ID token carrying the text otherwise.
 * An over-long identifier is a fatal error.
 */
static token_t *read_identifier(void) {
  int first = input_getc();
  if (!(isalpha(first) || first == '_')) {
    input_ungetc(first);
    return NULL;
  }

  char name[1024];
  int length = 0;
  name[length++] = first;
  for (;;) {
    int ch = input_getc();
    if (ch == EOF) {
      break;
    }
    if (!(isalnum(ch) || ch == '_')) {
      // First character past the identifier belongs to the next token.
      input_ungetc(ch);
      break;
    }
    name[length++] = ch;
    if (length >= 1008) {
      tok_error("Identifier too long\n");
      exit(1);
    }
  }
  name[length] = '\0';
  column += length;

  // Keywords take priority over typedef names and plain identifiers.
  c_token_types kind = get_keyword(name, length);
  if (kind != TOK_ID) {
    return token_create(kind, line, column, length);
  }
  if (hash_table_get(typedef_table, name) != NULL) {
    return token_create(TOK_TYPEDEF_NAME, line, column, length);
  }
  return token_create_string(kind, line, column, name, length);
}

/* Tokenize Number */
static token_t *read_number(void) {
  int c;
  char buf[1024];
  int i = 0;
  c = input_getc();
  /* Check for valid prefix */
   // If we don't have a digit or decimal point, it's not a number
    if (!isdigit(c) && c != '.') {
      input_ungetc(c);
      return NULL;
    }
    // Decimal point followed by non-digit is a struct member
    if (c == '.') {
      char cnext = input_getc();
      if (!isdigit(cnext)) {
        input_ungetc(cnext);
        return token_create(TOK_MEMBER, line, column, 1);
      }
      input_ungetc(cnext);
    }

  int radix = 10;
  /* Process Radix */
    // Check for hex and octal.
    if (c == '0') {
      char cnext = input_getc();
      if (cnext == 'x' || cnext == 'X') {
        // Hex, discard the 0x
        radix = 16;
      } else {
        // Octal, discard the 0
        input_ungetc(cnext);
        radix = 8;
      }
    } else {
      // Decimal, append the first digit
      buf[i++] = c;
    }

  int is_float = 0;
  /* Read Number Loop */
     while ((c = input_getc()) != EOF) {
      // Since there can be multiple writes to the buffer, we want to make sure we
      // don't overflow by giving a 4 byte pad
      if (i > 1020) {
        tok_error("Number too long\n");
        return NULL;
      }
      // Valid digits for the radix: 0-9 for decimal, 0-7 for octal, 0-9 and
      // a-f/A-F for hex
      if ((radix == 10 && isdigit(c)) || (radix == 16 && isxdigit(c)) ||
          (radix == 8 && c >= '0' && c <= '7')) {
        buf[i++] = c;
        // Decimal point and not a float yet, must be a float
      } else if (c == '.' && !is_float) {
        is_float = 1;
        if (radix != 10) {
          tok_error("Invalid floating point constant, expected decimal, got %s\n",
                    radix == 16 ? "hexadecimal" : "octal");
          return NULL;
        }
        buf[i++] = c;
      }
      // Exponent on the end of a constant. (By standard this forces it to be a
      // float)
      else if (c == 'e' || c == 'E') {
        buf[i++] = c;
        c = input_getc();
        // Sign on the exponent
        if (c == '+' || c == '-') {
          buf[i++] = c;
          c = input_getc();
        }
        // Exponent must be a digit, I.E no 1e1.2
        if (!isdigit(c)) {
          tok_error("Invalid floating point exponent\n");
          return NULL;
        }
        buf[i++] = c;
        is_float = 1;
      } else {
        // Reached the end, unget the character so other functions can read it
        input_ungetc(c);
        break;
      }
    }

  buf[i] = '\0';
  /* Process Suffixes */
    int is_unsigned = 0;
    int is_long = 0;
    int is_single = 0;
    while (1) {
      c = input_getc();
      if (c == 'u' || c == 'U') {
        if (is_unsigned) {
          tok_warn(
              "Warning: Duplicate suffix 'u' for integer constant ignored\n");
        }
        is_unsigned = 1;
      } else if (c == 'l' || c == 'L') {
        if (is_long) {
          tok_warn(
              "Warning: Duplicate suffix 'l' for integer constant ignored\n");
        }
        is_long = 1;
      } else if (c == 'f' || c == 'F') {
        if (is_single) {
          tok_warn("Warning: Duplicate suffix 'f' for floating point constant "
                   "ignored\n");
        }
        is_single = 1;
      } else {
        input_ungetc(c);
        break;
      }
    }

  /* Check for conflicting suffixes */
      if (is_single && is_long) {
      tok_warn("Warning: Invalid suffixes 'l' and 'f' for floating point "
               "constant. Ignoring 'l'\n");
      is_long = 0;
    }
    if (is_single && is_unsigned) {
      tok_warn("Warning: Invalid suffixes 'u' and 'f' for floating point "
               "constant. Ignoring 'u'\n");
      is_unsigned = 0;
    }
    if (is_single && !is_float) {
      tok_warn(
          "Warning: Invalid suffix 'f' for integer constant. Ignoring 'f'\n");
      is_single = 0;
    }

  if (is_float) {
    /* Convert to float */
        errno = 0;
        // Strtod generates a unix-style error when it's given something out of
        // range, so we want to get on top of that quickly instead of ignoring it
        // That way we can avoid some nasty NAN-propagation in the constant folder.
        double f = strtod(buf, NULL);
        if (errno == ERANGE) {
          tok_error("Floating point constant out of range\n");
          return NULL;
        }
        // Warn if the constant is out of range for a float, I.E it's too big or too
        // small
        if (is_single && (f < FLT_MIN || f > FLT_MAX)) {
          tok_warn(
              "Warning: Floating point constant %f is out of range for float\n", f);
        }
        // Warn if the constant is too precise for a float
        if (is_single && fabs((double)((float)f) - f) >= FLT_EPSILON) {
          tok_warn("Warning: Converting double precision floating point constant "
                   "%f to float loses "
                   "precision\n",
                   f);
        }
        return token_create_float(is_single ? TOK_FLOAT_32
                                            : TOK_FLOAT_64,
                                  line, column, f, i);

  } else {
    /* Convert to integer */
        errno = 0;
        uint64_t int_ = strtoull(buf, NULL, radix);
        // Same as above, but for integers
        if (errno == ERANGE) {
          tok_error("Integer constant out of range\n");
          return NULL;
        }
        if (is_unsigned) {
          if (is_long) {
            return token_create_int(TOK_INTEGER_U64, line, column, int_, i);
          } else {
            if (int_ > UINT32_MAX) {
              tok_warn(
                  "Warning: Integer constant %lld is out of range for unsigned "
                  "int\n",
                  int_);
            }
            return token_create_int(TOK_INTEGER_U32, line, column, int_, i);
          }
        } else {
          if (is_long) {
            // If the highest bit is set, that means this will overflow a signed
            // long (Due to two's complement)
            if (int_ & (1UL << 63)) {
              tok_warn(
                  "Warning: Integer constant %lld is out of range for long long\n",
                  i);
            }
            return token_create_int(TOK_INTEGER_S64, line, column, int_, i);
          } else {
            if (int_ & (1UL << 31)) {
              tok_warn("Warning: Integer constant %lld is out of range for int\n",
                       int_);
            }
            return token_create_int(TOK_INTEGER_S32, line, column, int_, i);
          }
        }

  }
  return NULL;
}

/* Tokenize String */
/* Read Escape Sequence */
/*
 * Reads the part of an escape sequence that follows the backslash.
 *
 * `*len` is increased by the number of source characters consumed (the
 * backslash itself is not included).
 *
 * Returns the escaped character as a value in 0..255, or -1 after reporting
 * an invalid sequence. The previous version returned 0 for errors, which made
 * the valid escape \0 indistinguishable from a failure.
 *
 * Octal escapes take one to three digits in the range 0-7. Previously 8 and 9
 * were accepted, the digit count was unlimited, "\0" ignored any following
 * octal digits, and the first digit was counted twice in `*len`. Hex escapes
 * keep only the low 8 bits, as before, but no longer overflow a signed int on
 * long digit runs.
 */
static int read_escape_sequence(int *len) {
  int c = input_getc();
  *len += 1;
  switch (c) {
  case 'a':
    return '\a';
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  case '\'':
    return '\'';
  case '"':
    return '"';
  case '?':
    return '?';
  case '\\':
    return '\\';
  case 'x': {
    c = input_getc();
    if (!isxdigit(c)) {
      tok_error("Invalid hexadecimal escape sequence\n");
      return -1;
    }
    unsigned int val = 0;
    while (isxdigit(c)) {
      *len += 1;
      val = val * 16 +
            (unsigned int)(isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
      c = input_getc();
    }
    input_ungetc(c);
    return (int)(val & 0xFFu);
  }
  default: {
    if (c < '0' || c > '7') {
      tok_error("Invalid escape sequence\n");
      return -1;
    }
    unsigned int val = 0;
    int digits = 0;
    while (digits < 3 && c >= '0' && c <= '7') {
      val = val * 8 + (unsigned int)(c - '0');
      digits++;
      c = input_getc();
    }
    input_ungetc(c);
    // The first digit was already counted at the top of the function.
    *len += digits - 1;
    return (int)(val & 0xFFu);
  }
  }
}

/*
 * Tries to read a double-quoted string literal.
 *
 * Returns NULL, with the character pushed back, when the input does not start
 * with '"'. Also returns NULL after an invalid escape sequence or when the
 * input ends before the closing quote. Otherwise returns a TOK_STRING_ASCII
 * token whose length is the literal's length in the source, including quotes
 * and escape characters.
 *
 * Fixes over the previous version:
 *  - the escape \0 is accepted instead of silently aborting the literal;
 *  - the heap buffer is released on the escape-error path;
 *  - room for the terminating NUL is reserved, so a literal exactly as long
 *    as the buffer no longer writes one byte past it;
 *  - a failed realloc is detected instead of being dereferenced.
 */
static token_t *read_string_literal(void) {
  int c = input_getc();
  if (c != '"') {
    input_ungetc(c);
    return NULL;
  }

  char s_buf[512];
  char *buf = s_buf; // Switches to heap storage once s_buf is full
  size_t cap = sizeof s_buf;
  size_t i = 0;
  int esc_pad = 0; // Extra source characters consumed by escape sequences

  // The closing quote is consumed by the loop condition.
  while ((c = input_getc()) != EOF && c != '"') {
    if (c == '\\') {
      c = read_escape_sequence(&esc_pad);
      if (c < 0) {
        if (buf != s_buf) {
          free(buf);
        }
        return NULL;
      }
    }
    // Always keep one byte free for the terminator.
    if (i + 1 >= cap) {
      size_t new_cap = cap * 2;
      char *grown;
      if (buf == s_buf) {
        grown = malloc(new_cap);
        if (grown != NULL) {
          memcpy(grown, s_buf, i);
        }
      } else {
        grown = realloc(buf, new_cap);
      }
      if (grown == NULL) {
        fputs("Out of memory. Could not parse string literal.\n", stderr);
        exit(1);
      }
      buf = grown;
      cap = new_cap;
    }
    buf[i++] = (char)c;
  }
  buf[i] = '\0';

  if (c == EOF) {
    tok_error("Unterminated string literal\n");
    if (buf != s_buf) {
      free(buf);
    }
    return NULL;
  }

  token_t *tok = token_create_string(TOK_STRING_ASCII, line, column, buf,
                                     (int)i + esc_pad + 2);
  if (buf != s_buf) {
    free(buf);
  }
  return tok;
}

/* Tokenize Character */
/*
 * Tries to read a single-quoted character constant.
 *
 * Returns NULL, with the character pushed back, when the input does not start
 * with a single quote. Also returns NULL after reporting an empty constant,
 * an invalid escape sequence or a missing closing quote. Otherwise returns a
 * TOK_CHAR_CONST token holding the character's value.
 *
 * Fixes over the previous version: an invalid escape sequence is no longer
 * turned into a constant with value 0, and the reported length now counts the
 * character (or backslash) between the quotes.
 */
static token_t *read_char_constant(void) {
  int len = 0;
  int c = input_getc();
  if (c != '\'') {
    input_ungetc(c);
    return NULL;
  }
  len++;

  c = input_getc();
  if (c == '\'') {
    tok_error("Empty character constant\n");
    return NULL;
  }
  len++; // The character itself, or the backslash of an escape

  int val = c;
  if (c == '\\') {
    int esc = read_escape_sequence(&len);
    if (esc < 0) {
      return NULL;
    }
    // Go through char so the value has the same signedness as before.
    val = (char)esc;
  }

  c = input_getc();
  if (c != '\'') {
    tok_error("Expected closing quote for character constant\n");
    return NULL;
  }
  len++;
  return token_create_char(TOK_CHAR_CONST, line, column, val, len);
}

/* Tokenize Operator */

token_t *read_operator(void) {
  int c;
  c = input_getc();
  switch (c) {
  case '!': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_NE, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_NOT, line, column, 1);
  }
  case '%': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_ASSIGN_MOD, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_MOD, line, column, 1);
  }
  case '&': {
    c = input_getc();
    if (c == '&')
      return token_create(TOK_AND, line, column, 2);
    if (c == '=')
      return token_create(TOK_ASSIGN_BITAND, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_BIT_AND, line, column, 1);
  }
  case '(':
    return token_create(TOK_LEFT_PAREN, line, column, 1);
  case ')':
    return token_create(TOK_RIGHT_PAREN, line, column, 1);
  case '*': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_ASSIGN_MUL, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_MUL, line, column, 1);
  }
  case '+': {
    c = input_getc();
    if (c == '+')
      return token_create(TOK_INC, line, column, 2);
    if (c == '=')
      return token_create(TOK_ASSIGN_ADD, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_ADD, line, column, 2);
  }
  case ',':
    return token_create(TOK_COMMA, line, column, 1);
  case '-': {
    c = input_getc();
    if (c == '-')
      return token_create(TOK_DEC, line, column, 2);
    if (c == '=')
      return token_create(TOK_ASSIGN_SUB, line, column, 2);
    if (c == '>')
      return token_create(TOK_MEMBER_POINTER, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_SUB, line, column, 1);
  }
  case '.': {
    c = input_getc();
    if (c == '.') {
      c = input_getc();
      if (c == '.') {
        return token_create(TOK_ELLIPSIS, line, column, 3);
      } else {
        // Bail out, can't store more than one unget
        tok_error("Unexpected character '.' at line %d, column %d\n", line,
                  column);
        exit(1);
      }
    }
    return token_create('.', line, column, 1);
  }
  case '/': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_ASSIGN_DIV, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_DIV, line, column, 1);
  }
  case ':':
    return token_create(TOK_COND_DECISION, line, column, 1);
  case ';':
    return token_create(TOK_SEMICOLON, line, column, 1);
  case '<': {
    c = input_getc();
    if (c == '<') {
      c = input_getc();
      if (c == '=')
        return token_create(TOK_ASSIGN_LSHIFT, line, column, 3);
      input_ungetc(c);
      return token_create(TOK_LSHIFT, line, column, 2);
    }
    if (c == '=')
      return token_create(TOK_LE, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_LT, line, column, 1);
  }
  case '=': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_ASSIGN, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_ASSIGN, line, column, 1);
  }
  case '>': {
    c = input_getc();
    if (c == '>') {
      c = input_getc();
      if (c == '=')
        return token_create(TOK_ASSIGN_RSHIFT, line, column, 3);
      input_ungetc(c);
      return token_create(TOK_RSHIFT, line, column, 2);
    }
    if (c == '=')
      return token_create(TOK_GE, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_GT, line, column, 1);
  }
  case '?':
    return token_create(TOK_COND, line, column, 1);
  case '[':
    return token_create(TOK_LEFT_BRACKET, line, column, 1);
  case ']':
    return token_create(TOK_RIGHT_BRACKET, line, column, 1);
  case '^': {
    c = input_getc();
    if (c == '=')
      return token_create(TOK_ASSIGN_BITXOR, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_BIT_XOR, line, column, 1);
  }
  case '{':
    return token_create(TOK_LEFT_BRACE, line, column, 1);
  case '|': {
    c = input_getc();
    if (c == '|')
      return token_create(TOK_OR, line, column, 2);
    if (c == '=')
      return token_create(TOK_ASSIGN_BITOR, line, column, 2);
    input_ungetc(c);
    return token_create(TOK_BIT_OR, line, column, 1);
  }
  case '}':
    return token_create(TOK_RIGHT_BRACE, line, column, 1);
  case '~':
    return token_create(TOK_BIT_NOT, line, column, 1);
  default:
    input_ungetc(c);
    return NULL;
  }

  return NULL;
}

/*
 * Returns the next token of the input, or NULL at end of input.
 *
 * Tokens pushed back with reject_token() are returned first. Otherwise each
 * reader is tried in a fixed order (whitespace/comments, identifier, number,
 * character constant, string literal, operator) and the first token produced
 * is returned. A character no reader accepts is reported with a warning and
 * skipped.
 */
token_t *next_token(void) {
  for (;;) {
    if (left_stack_pos > 0) {
      return left_stack[--left_stack_pos];
    }

    // Short-circuit evaluation keeps the readers in their required order and
    // stops at the first one that yields a token.
    token_t *found;
    if ((found = skip_whitespace()) != NULL ||
        (found = read_identifier()) != NULL ||
        (found = read_number()) != NULL ||
        (found = read_char_constant()) != NULL ||
        (found = read_string_literal()) != NULL ||
        (found = read_operator()) != NULL) {
      return found;
    }

    int stray = input_getc();
    if (stray == EOF) {
      return NULL;
    }
    tok_warn(
        "Warning: Ignoring unexpected character '%c' at line %d, column %d\n",
        stray, line, column);
    // Loop around and try again after the skipped character.
  }
}

#ifdef TEST_TOKENIZER
/* Run Test */
/*
 * Runs the C preprocessor (gcc -E) on `in`, writing "<in>.preprocessed".
 *
 * Returns the output file name in a malloc'd buffer the caller must free.
 * Exits with status 1 if memory cannot be allocated, if a name does not fit
 * its buffer, or if the preprocessor command fails. Previously none of these
 * were checked, so the tokenizer could be started on a missing file or a
 * truncated command.
 *
 * NOTE(review): `in` is placed in the shell command unquoted, so names with
 * spaces or shell metacharacters are not handled. Acceptable for this test
 * driver only.
 */
char *preprocess(char *in) {
  enum { NAME_CAP = 1024, COMMAND_CAP = 2048 };

  char *output_name = malloc(NAME_CAP);
  char *command = malloc(COMMAND_CAP);
  if (output_name == NULL || command == NULL) {
    fputs("Out of memory. Could not preprocess input.\n", stderr);
    exit(1);
  }

  int written = snprintf(output_name, NAME_CAP, "%s.preprocessed", in);
  if (written < 0 || written >= NAME_CAP) {
    fprintf(stderr, "Error: Input file name too long\n");
    exit(1);
  }
  written = snprintf(command, COMMAND_CAP, "gcc -E -xc %s -o %s", in,
                     output_name);
  if (written < 0 || written >= COMMAND_CAP) {
    fprintf(stderr, "Error: Preprocessor command too long\n");
    exit(1);
  }

  if (system(command) != 0) {
    fprintf(stderr, "Error: Preprocessing of %s failed\n", in);
    exit(1);
  }
  free(command);
  return output_name;
}

// Tokenize the input file
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "Usage: %s <input.c>\n", argv[0]);
    return 1;
  }
  char *input_name = argv[1];
  char *preprocessed = preprocess(input_name);
  init_tokenizer(preprocessed);
  token_t *tok;
  while ((tok = next_token()) != NULL) {
    print_token(tok);
    token_destroy(tok);
  }
  destroy_tokenizer();
  remove(preprocessed);
  free(preprocessed);
  hash_table_destroy(string_table);
  return 0;
}

#endif