/* * scan.c */ #include #include /* host stdio since we're dealing with plain text */ #include #include #include "charbuf.h" #include "scan.h" #include "token.h" /* XXX: we should have a pdp10-limits.h */ #define PDP10_UCHAR_MAX PDP10_UINT9_MAX const char *scan_filename = ""; int scan_freopen(const char *filename) { if (filename != NULL) { if (freopen(filename, "r", stdin) == NULL) { fprintf(stderr, "as: Error opening %s: %s\n", filename, strerror(errno)); return -1; } scan_filename = filename; } return 0; } unsigned int scan_linenr; static void scan_ungetc(int ch) { if (ch != EOF && ungetc(ch, stdin) == EOF) perror("ungetc"); } static int scan_getchar(void) { return fgetc(stdin); } static void badchar(int ch, const char *context) { char buf[7]; if (ch == EOF) { buf[0] = '<'; buf[1] = 'E'; buf[2] = 'O'; buf[3] = 'F'; buf[4] = '>'; buf[5] = '\0'; } else if (' ' <= ch && ch <= '~') { buf[0] = '\''; buf[1] = ch; buf[2] = '\''; buf[3] = '\0'; } else { buf[0] = '\''; buf[1] = '\\'; buf[2] = '0' + ((ch >> 6) & 3); buf[3] = '0' + ((ch >> 3) & 7); buf[4] = '0' + (ch & 7); buf[5] = '\''; buf[6] = '\0'; } fprintf(stderr, "as: %s, line %u: invalid character %s %s\n", scan_filename, scan_linenr, buf, context); } static int is_decimal_digit(char ch) { return '0' <= ch && ch <= '9'; } static int is_octal_digit(char ch) { return '0' <= ch && ch <= '7'; } static unsigned int get_chval(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('A' <= ch && ch <= 'F') return ch - ('A' - 10); if ('a' <= ch && ch <= 'f') return ch - ('a' - 10); return -1U; } static int do_escape(void) { int ch; ch = scan_getchar(); switch (ch) { case 'n': return '\n'; case 't': return '\t'; case 'f': return '\f'; case 'r': return '\r'; case 'b': return '\b'; case '\\': return ch; case '\'': return ch; case '"': return ch; case 'x': case 'X': { unsigned int chval; ch = scan_getchar(); chval = get_chval(ch); if (chval <= 15) { unsigned int val = 0; do { val = val * 16 + chval; ch = scan_getchar(); chval = get_chval(ch); } while (chval <= 15); scan_ungetc(ch); if (val > PDP10_UCHAR_MAX) fprintf(stderr, "as: %s, line %u: truncating escaped value from %#x to %#x\n", scan_filename, scan_linenr, val, val & PDP10_UCHAR_MAX); return val & PDP10_UCHAR_MAX; } break; } case EOF: break; default: if (is_octal_digit(ch)) { unsigned int val = ch - '0'; ch = scan_getchar(); if (is_octal_digit(ch)) { val = val * 8 + (ch - '0'); ch = scan_getchar(); if (is_octal_digit(ch)) val = val * 8 + (ch - '0'); else scan_ungetc(ch); } else scan_ungetc(ch); if (val > PDP10_UCHAR_MAX) fprintf(stderr, "as: %s, line %u: truncating escaped value from %#x to %#x\n", scan_filename, scan_linenr, val, val & PDP10_UCHAR_MAX); return val & PDP10_UCHAR_MAX; } break; } badchar(ch, "in \\ character escape sequence"); return ch; } static enum token do_char(union token_attribute *token_attr) { int ch; ch = scan_getchar(); switch (ch) { case '\\': ch = do_escape(); if (ch == EOF) return T_ERROR; break; case '\'': case EOF: badchar(ch, "in character literal"); return T_ERROR; default: break; } token_attr->uint = ch; ch = scan_getchar(); if (ch != '\'') { badchar(ch, "after character literal"); return T_ERROR; } return T_UINTEGER; } /* XXX: strings should be sequences of uint9_t not sequences of unsigned char */ static enum token do_string(union token_attribute *token_attr, struct charbuf *charbuf) { int ch; for (;;) { ch = scan_getchar(); switch (ch) { case '"': token_attr->text = charbuf_string(charbuf); return T_STRING; case '\\': ch = do_escape(); if (ch == EOF) return T_ERROR; break; case EOF: case '\n': badchar(ch, "in string literal"); if (ch == '\n') ++scan_linenr; return T_ERROR; default: break; } charbuf_append(charbuf, ch); } } static const struct { enum token token; const char *name; } directives[] = { { T_DOT_ALIGN, ".align" }, { T_DOT_ASCII, ".ascii" }, { T_DOT_ASCIZ, ".asciz" }, { T_DOT_BALIGN, ".balign" }, { T_DOT_BSS, ".bss" }, { T_DOT_BYTE, ".byte" }, { T_DOT_COMM, ".comm" }, { T_DOT_DATA, ".data" }, { T_DOT_FILE, ".file" }, { T_DOT_GLOBL, ".globl" }, { T_DOT_HIDDEN, ".hidden" }, { T_DOT_IDENT, ".ident" }, { T_DOT_INTERNAL, ".internal" }, { T_DOT_LOCAL, ".local" }, { T_DOT_LONG, ".long" }, { T_DOT_ORG, ".org" }, { T_DOT_P2ALIGN, ".p2align" }, { T_DOT_POPSECTION, ".popsection" }, { T_DOT_PREVIOUS, ".previous" }, { T_DOT_PROTECTED, ".protected" }, { T_DOT_PUSHSECTION, ".pushsection" }, { T_DOT_RODATA, ".rodata" }, { T_DOT_SECTION, ".section" }, { T_DOT_SET, ".set" }, { T_DOT_SHORT, ".short" }, { T_DOT_SIZE, ".size" }, { T_DOT_SUBSECTION, ".subsection" }, { T_DOT_SYMVER, ".symver" }, { T_DOT_TEXT, ".text" }, { T_DOT_TYPE, ".type" }, { T_DOT_WEAK, ".weak" }, { T_DOT_WEAKREF, ".weakref" }, }; static enum token mk_symbol(union token_attribute *token_attr, const struct charbuf *charbuf) { if (charbuf->head.buf[0] == '.') { /* check for <.directive> */ unsigned int low, high; low = 0; high = sizeof directives / sizeof directives[0]; while (low < high) { unsigned int middle; int cmp; middle = (low + high) / 2; cmp = charbuf_strcmp(charbuf, directives[middle].name); if (cmp < 0) high = middle; else if (cmp > 0) low = middle + 1; else return directives[middle].token; } } else if (charbuf->head.buf[0] == '$' && charbuf->head.next == NULL) { /* check for $ */ if (charbuf->pos == 2 && is_decimal_digit(charbuf->head.buf[1])) { token_attr->uint = charbuf->head.buf[1] - '0'; return T_REGISTER; } else if (charbuf->pos == 3) { if (is_decimal_digit(charbuf->head.buf[1]) && is_decimal_digit(charbuf->head.buf[2])) { unsigned int val; val = (charbuf->head.buf[1] - '0') * 10 + (charbuf->head.buf[2] - '0'); if (val < 16) { token_attr->uint = val; return T_REGISTER; } } else if (charbuf->head.buf[1] == 's' && charbuf->head.buf[2] == 'p') { token_attr->uint = 15; return T_REGISTER; } } } token_attr->text = charbuf_string(charbuf); return T_SYMBOL; } static int is_symbol_internal_char(int ch) { return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9') || ch == '_' || ch == '$' || ch == '.'; } static enum token do_symbol(union token_attribute *token_attr, int ch, struct charbuf *charbuf) { do { charbuf_append(charbuf, ch); ch = scan_getchar(); } while (is_symbol_internal_char(ch)); scan_ungetc(ch); return mk_symbol(token_attr, charbuf); } static enum token do_number(union token_attribute *token_attr, int ch) { unsigned int base, chval; pdp10_uint36_t numval; base = (ch == '0') ? 8 : 10; numval = ch - '0'; ch = scan_getchar(); /* handle 0x */ if (ch == 'x' || ch == 'X') { base = 16; /* must have at least one hex digit after 0x */ ch = scan_getchar(); chval = get_chval(ch); if (chval <= 15) numval = chval; else { badchar(ch, "after 0x in hexadecimal literal"); return T_ERROR; } ch = scan_getchar(); } /* the number is non-empty, consume and accumulate trailing characters as long as they are valid in the base */ for (;;) { chval = get_chval(ch); if (chval >= base) break; numval = numval * base + chval; ch = scan_getchar(); } /* check for {b,f} */ if (base <= 10 && (ch == 'b' || ch == 'f')) { /* represent the local label + direction in sign-magnitude with the sign in the least significant bit; using sign-magnitude allows to distinguish 0f from 0b (i.e., +0 from -0); storing the sign in the least significant bit makes us independent of word size */ token_attr->uint = (numval << 1) | (ch == 'f' ? 1 : 0); return T_LOCAL_LABEL; } /* plain integer literal */ scan_ungetc(ch); token_attr->uint = numval; return T_UINTEGER; } static enum token do_eq(void) { int ch; ch = scan_getchar(); switch (ch) { case '=': return T_EQEQ; default: scan_ungetc(ch); return T_EQ; } } static enum token do_ampersand(void) { int ch; ch = scan_getchar(); switch (ch) { case '&': return T_ANDAND; default: scan_ungetc(ch); return T_AND; } } static enum token do_bar(void) { int ch; ch = scan_getchar(); switch (ch) { case '|': return T_OROR; default: scan_ungetc(ch); return T_OR; } } static enum token do_gt(void) { int ch; ch = scan_getchar(); switch (ch) { case '>': return T_RSHIFT; case '=': return T_GE; default: scan_ungetc(ch); return T_GT; } } static enum token do_lt(void) { int ch; ch = scan_getchar(); switch (ch) { case '<': return T_LSHIFT; case '=': return T_LE; case '>': /* <> is the same as != */ return T_NEQ; default: scan_ungetc(ch); return T_LT; } } static enum token do_c_comment(void) { int ch; for (;;) { ch = scan_getchar(); switch (ch) { case EOF: badchar(ch, "in /**/-style comment"); return T_ERROR; case '*': for (;;) { ch = scan_getchar(); switch (ch) { case '*': continue; case '/': return T_EOF; /* fake token for a C comment */ case EOF: badchar(ch, "in /**/-style comment"); return T_ERROR; case '\n': ++scan_linenr; /*FALLTHROUGH*/ default: break; } break; } continue; case '\n': ++scan_linenr; /*FALLTHROUGH*/ default: continue; } } } static enum token do_slash(void) { int ch; ch = scan_getchar(); switch (ch) { case '*': return do_c_comment(); default: scan_ungetc(ch); return T_DIV; } } static enum token do_bang(void) { int ch; ch = scan_getchar(); switch (ch) { case '=': return T_NEQ; default: scan_ungetc(ch); return T_BANG; } } static int do_line_comment(void) { int ch; for (;;) { ch = scan_getchar(); switch (ch) { case '\n': ++scan_linenr; return 0; case EOF: badchar(ch, "in line comment"); return -1; default: continue; } } } static enum token do_scan(union token_attribute *token_attr, struct charbuf *charbuf) { int ch; ch = scan_getchar(); for (;; ch = scan_getchar()) { switch (ch) { case ' ': case '\t': case '\r': case '\f': continue; case '\n': ++scan_linenr; return T_NEWLINE; case '#': if (do_line_comment() != 0) return T_ERROR; return T_NEWLINE; case ';': return T_NEWLINE; case EOF: return T_EOF; case '@': return T_AT; case ':': return T_COLON; case ',': return T_COMMA; case '(': return T_LPAREN; case ')': return T_RPAREN; case '~': return T_TILDE; case '*': return T_MUL; case '/': /* "/""*", "/" */ switch (do_slash()) { case T_DIV: return T_DIV; case T_EOF: /* fake token for a C comment */ continue; default: /* error, eof in comment */ return T_ERROR; } case '%': return T_REM; case '<': /* <<, <=, < */ return do_lt(); case '>': /* >>, >=, > */ return do_gt(); case '|': /* ||, | */ return do_bar(); case '&': /* &&, & */ return do_ampersand(); case '^': return T_CARET; case '!': /* !=, ! */ return do_bang(); case '+': return T_PLUS; case '-': return T_MINUS; case '=': /* ==, = */ return do_eq(); case '"': return do_string(token_attr, charbuf); case '\'': return do_char(token_attr); case '.': /* Dot may start a floating point literal, but tests show that gcc always outputs floating point values as integer literals, so we shouldn't have to support floating point literals at all. */ case '$': case '_': return do_symbol(token_attr, ch, charbuf); default: if ('0' <= ch && ch <= '9') /* number or {b,f} */ return do_number(token_attr, ch); if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z')) return do_symbol(token_attr, ch, charbuf); } badchar(ch, ""); return T_ERROR; } } enum token scan(union token_attribute *token_attr) { struct charbuf charbuf; enum token token; charbuf_init(&charbuf); token = do_scan(token_attr, &charbuf); charbuf_fini(&charbuf); return token; }