637 lines
13 KiB
C

/*
* scan.c
*/
#include <errno.h>
#include <stdio.h> /* host stdio since we're dealing with plain text */
#include <stdlib.h>
#include <string.h>
#include "charbuf.h"
#include "scan.h"
#include "token.h"
/* XXX: we should have a pdp10-limits.h */
/* Upper bound for a target character value.  do_escape() masks numeric
   escape sequences with this constant and warns when a value exceeds it.
   NOTE(review): PDP10_UINT9_MAX comes from a project header not visible
   here; presumably it is the largest 9-bit unsigned value -- confirm. */
#define PDP10_UCHAR_MAX PDP10_UINT9_MAX
/* Name of the input being scanned, as shown in diagnostics.  It stays
   "<stdin>" until scan_freopen() successfully switches to a named file. */
const char *scan_filename = "<stdin>";

/*
 * Make stdin read from FILENAME.
 *
 * A NULL FILENAME means "keep reading the current stdin" and always
 * succeeds.  Otherwise stdin is reopened on the file in read mode and,
 * on success, scan_filename is updated to point at FILENAME (the pointer
 * is stored, not copied).
 *
 * Returns 0 on success, or -1 after printing a message to stderr if the
 * file could not be opened.
 */
int scan_freopen(const char *filename)
{
    if (filename == NULL)
        return 0;

    if (freopen(filename, "r", stdin) != NULL) {
        scan_filename = filename;
        return 0;
    }

    fprintf(stderr, "as: Error opening %s: %s\n", filename, strerror(errno));
    return -1;
}
/* Line number of the input currently being scanned, as printed in
   diagnostics.  It is incremented each time the scanner consumes a
   newline, so it must start at 1 for the first line to be reported as
   "line 1" rather than "line 0". */
unsigned int scan_linenr = 1;
/*
 * Push CH back onto stdin so the next scan_getchar() returns it again.
 * EOF is silently ignored; a failing ungetc() is reported via perror()
 * but otherwise not propagated.
 */
static void scan_ungetc(int ch)
{
    if (ch == EOF)
        return;

    if (ungetc(ch, stdin) == EOF)
        perror("ungetc");
}
/*
 * Read the next input character from stdin.
 * Returns the character as a non-negative int, or EOF.
 */
static int scan_getchar(void)
{
    int ch = fgetc(stdin);

    return ch;
}
/*
 * Report an unexpected input character on stderr.
 *
 * CH is rendered as <EOF>, as 'c' when it is printable ASCII, or as a
 * three-digit octal escape '\ooo' otherwise.  CONTEXT is appended to the
 * message to say where the character was encountered.
 */
static void badchar(int ch, const char *context)
{
    char shown[7];	/* longest rendering is '\ooo' plus the terminator */

    if (ch == EOF)
        snprintf(shown, sizeof shown, "%s", "<EOF>");
    else if (ch >= ' ' && ch <= '~')
        snprintf(shown, sizeof shown, "'%c'", ch);
    else
        snprintf(shown, sizeof shown, "'\\%o%o%o'",
                 (unsigned int)((ch >> 6) & 3),
                 (unsigned int)((ch >> 3) & 7),
                 (unsigned int)(ch & 7));

    fprintf(stderr, "as: %s, line %u: invalid character %s %s\n", scan_filename, scan_linenr, shown, context);
}
/* Return 1 if CH is one of the characters '0'..'9', otherwise 0. */
static int is_decimal_digit(char ch)
{
    if (ch < '0' || ch > '9')
        return 0;
    return 1;
}
/* Return 1 if CH is one of the characters '0'..'7', otherwise 0. */
static int is_octal_digit(char ch)
{
    if (ch < '0' || ch > '7')
        return 0;
    return 1;
}
/*
 * Map a digit character to its numeric value: '0'..'9' give 0..9 and
 * 'A'..'F' / 'a'..'f' give 10..15.  Any other input (including EOF)
 * yields -1U, which is larger than every supported base and therefore
 * fails a "value < base" test.
 */
static unsigned int get_chval(int ch)
{
    unsigned int value = -1U;

    if (ch >= 'a' && ch <= 'f')
        value = 10 + (ch - 'a');
    else if (ch >= 'A' && ch <= 'F')
        value = 10 + (ch - 'A');
    else if (ch >= '0' && ch <= '9')
        value = ch - '0';

    return value;
}
/*
 * Scan the remainder of a backslash escape sequence; the introducing
 * backslash has already been consumed by the caller.
 *
 * Supported forms are the single-letter escapes n t f r b, the quoted
 * characters \\ \' \", a hexadecimal escape \x or \X followed by one or
 * more hex digits, and an octal escape of one to three octal digits.
 * Numeric values are masked with PDP10_UCHAR_MAX, with a warning if that
 * changes the value.
 *
 * Returns the value of the escaped character, or EOF if the sequence is
 * invalid or cut short by end of input.  In the failure case a diagnostic
 * has been printed; callers test for EOF to fail the enclosing token.
 */
static int do_escape(void)
{
    int ch;

    ch = scan_getchar();
    switch (ch) {
    case 'n':
        return '\n';
    case 't':
        return '\t';
    case 'f':
        return '\f';
    case 'r':
        return '\r';
    case 'b':
        return '\b';
    case '\\':
    case '\'':
    case '"':
        return ch;
    case 'x':
    case 'X':
    {
        unsigned int chval;

        ch = scan_getchar();
        chval = get_chval(ch);
        if (chval <= 15) {
            unsigned int val = 0;

            do {
                val = val * 16 + chval;
                ch = scan_getchar();
                chval = get_chval(ch);
            } while (chval <= 15);
            scan_ungetc(ch);	/* first non-hex character belongs to the caller */
            if (val > PDP10_UCHAR_MAX)
                fprintf(stderr, "as: %s, line %u: truncating escaped value from %#x to %#x\n", scan_filename, scan_linenr, val, val & PDP10_UCHAR_MAX);
            return val & PDP10_UCHAR_MAX;
        }
        /* \x without any hex digit: report the character found instead */
        break;
    }
    case EOF:
        break;
    default:
        if (is_octal_digit(ch)) {
            unsigned int val = ch - '0';

            /* at most two more octal digits */
            ch = scan_getchar();
            if (is_octal_digit(ch)) {
                val = val * 8 + (ch - '0');
                ch = scan_getchar();
                if (is_octal_digit(ch))
                    val = val * 8 + (ch - '0');
                else
                    scan_ungetc(ch);
            } else
                scan_ungetc(ch);
            if (val > PDP10_UCHAR_MAX)
                fprintf(stderr, "as: %s, line %u: truncating escaped value from %#x to %#x\n", scan_filename, scan_linenr, val, val & PDP10_UCHAR_MAX);
            return val & PDP10_UCHAR_MAX;
        }
        break;
    }

    badchar(ch, "in \\ character escape sequence");
    /* The offending character has been consumed; keep the line count
       right if it was a newline. */
    if (ch == '\n')
        ++scan_linenr;
    /* Signal failure instead of handing the bad character back as if it
       were a valid escaped value. */
    return EOF;
}
static enum token do_char(union token_attribute *token_attr)
{
int ch;
ch = scan_getchar();
switch (ch) {
case '\\':
ch = do_escape();
if (ch == EOF)
return T_ERROR;
break;
case '\'':
case EOF:
badchar(ch, "in character literal");
return T_ERROR;
default:
break;
}
token_attr->uint = ch;
ch = scan_getchar();
if (ch != '\'') {
badchar(ch, "after character literal");
return T_ERROR;
}
return T_UINTEGER;
}
/* XXX: strings should be sequences of uint9_t not sequences of unsigned char */
/*
 * Scan a string literal; the opening double quote has already been
 * consumed.  Characters are appended to CHARBUF until the closing quote,
 * with backslash escapes translated by do_escape().
 *
 * On success stores charbuf_string(charbuf) in token_attr->text and
 * returns T_STRING.  A raw newline or end of input inside the literal,
 * or an escape for which do_escape() returns EOF, yields T_ERROR.
 */
static enum token do_string(union token_attribute *token_attr, struct charbuf *charbuf)
{
    for (;;) {
        int c = scan_getchar();

        if (c == '"') {
            token_attr->text = charbuf_string(charbuf);
            return T_STRING;
        }

        if (c == EOF || c == '\n') {
            badchar(c, "in string literal");
            if (c == '\n')
                ++scan_linenr;	/* the newline was consumed */
            return T_ERROR;
        }

        if (c == '\\') {
            c = do_escape();
            if (c == EOF)
                return T_ERROR;
        }

        charbuf_append(charbuf, c);
    }
}
/*
 * Table mapping assembler directive names to their tokens.
 *
 * mk_symbol() looks names up here with a binary search driven by
 * charbuf_strcmp(), so the entries MUST be kept sorted by name;
 * insert new directives at their sorted position.
 * NOTE(review): the table is in strcmp() order, which presumably is the
 * ordering charbuf_strcmp() uses -- confirm.
 */
static const struct {
enum token token;
const char *name;	/* directive spelling, including the leading '.' */
} directives[] = {
{ T_DOT_ALIGN, ".align" },
{ T_DOT_ASCII, ".ascii" },
{ T_DOT_ASCIZ, ".asciz" },
{ T_DOT_BALIGN, ".balign" },
{ T_DOT_BSS, ".bss" },
{ T_DOT_BYTE, ".byte" },
{ T_DOT_COMM, ".comm" },
{ T_DOT_DATA, ".data" },
{ T_DOT_FILE, ".file" },
{ T_DOT_GLOBL, ".globl" },
{ T_DOT_HIDDEN, ".hidden" },
{ T_DOT_IDENT, ".ident" },
{ T_DOT_INTERNAL, ".internal" },
{ T_DOT_LOCAL, ".local" },
{ T_DOT_LONG, ".long" },
{ T_DOT_ORG, ".org" },
{ T_DOT_P2ALIGN, ".p2align" },
{ T_DOT_POPSECTION, ".popsection" },
{ T_DOT_PREVIOUS, ".previous" },
{ T_DOT_PROTECTED, ".protected" },
{ T_DOT_PUSHSECTION, ".pushsection" },
{ T_DOT_RODATA, ".rodata" },
{ T_DOT_SECTION, ".section" },
{ T_DOT_SET, ".set" },
{ T_DOT_SHORT, ".short" },
{ T_DOT_SIZE, ".size" },
{ T_DOT_SUBSECTION, ".subsection" },
{ T_DOT_SYMVER, ".symver" },
{ T_DOT_TEXT, ".text" },
{ T_DOT_TYPE, ".type" },
{ T_DOT_WEAK, ".weak" },
{ T_DOT_WEAKREF, ".weakref" },
};
/*
 * Classify the identifier-like text collected in CHARBUF.
 *
 *  - Text starting with '.' is looked up in the directives[] table by
 *    binary search; a hit returns that directive's token.
 *  - Text starting with '$' that fits in the charbuf's first chunk is
 *    checked for a register name: $0..$9, $10..$15, or $sp (register 15).
 *    A hit stores the register number in token_attr->uint and returns
 *    T_REGISTER.
 *  - Everything else is an ordinary symbol: its text is stored in
 *    token_attr->text and T_SYMBOL is returned.
 */
static enum token mk_symbol(union token_attribute *token_attr, const struct charbuf *charbuf)
{
    if (charbuf->head.buf[0] == '.') {
        unsigned int lo = 0;
        unsigned int hi = sizeof directives / sizeof directives[0];

        /* search the half-open range [lo, hi) */
        while (lo < hi) {
            unsigned int mid = lo + (hi - lo) / 2;
            int order = charbuf_strcmp(charbuf, directives[mid].name);

            if (order == 0)
                return directives[mid].token;
            if (order < 0)
                hi = mid;
            else
                lo = mid + 1;
        }
    } else if (charbuf->head.buf[0] == '$' && charbuf->head.next == NULL) {
        switch (charbuf->pos) {
        case 2:		/* '$' plus one character */
            if (is_decimal_digit(charbuf->head.buf[1])) {
                token_attr->uint = charbuf->head.buf[1] - '0';
                return T_REGISTER;
            }
            break;
        case 3:		/* '$' plus two characters */
            if (is_decimal_digit(charbuf->head.buf[1])
                && is_decimal_digit(charbuf->head.buf[2])) {
                unsigned int regno;

                regno = (charbuf->head.buf[1] - '0') * 10 + (charbuf->head.buf[2] - '0');
                if (regno < 16) {
                    token_attr->uint = regno;
                    return T_REGISTER;
                }
            } else if (charbuf->head.buf[1] == 's'
                       && charbuf->head.buf[2] == 'p') {
                token_attr->uint = 15;
                return T_REGISTER;
            }
            break;
        default:
            break;
        }
    }

    /* not a directive or register: a plain symbol */
    token_attr->text = charbuf_string(charbuf);
    return T_SYMBOL;
}
/*
 * Return 1 if CH may appear inside a symbol after its first character:
 * an ASCII letter, a decimal digit, '_', '$' or '.'.  Otherwise 0.
 */
static int is_symbol_internal_char(int ch)
{
    switch (ch) {
    case '_':
    case '$':
    case '.':
        return 1;
    default:
        break;
    }

    if (ch >= '0' && ch <= '9')
        return 1;
    if (ch >= 'a' && ch <= 'z')
        return 1;
    if (ch >= 'A' && ch <= 'Z')
        return 1;
    return 0;
}
/*
 * Scan a symbol, directive or register name.  CH is the already-read
 * first character; further characters are collected into CHARBUF for as
 * long as is_symbol_internal_char() accepts them.  The first character
 * that does not belong is pushed back, and the collected text is
 * classified by mk_symbol().
 */
static enum token do_symbol(union token_attribute *token_attr, int ch, struct charbuf *charbuf)
{
    for (;;) {
        charbuf_append(charbuf, ch);
        ch = scan_getchar();
        if (!is_symbol_internal_char(ch))
            break;
    }

    scan_ungetc(ch);
    return mk_symbol(token_attr, charbuf);
}
static enum token do_number(union token_attribute *token_attr, int ch)
{
unsigned int base, chval;
pdp10_uint36_t numval;
base = (ch == '0') ? 8 : 10;
numval = ch - '0';
ch = scan_getchar();
/* handle 0x<first hexdig> */
if (ch == 'x' || ch == 'X') {
base = 16;
/* must have at least one hex digit after 0x */
ch = scan_getchar();
chval = get_chval(ch);
if (chval <= 15)
numval = chval;
else {
badchar(ch, "after 0x in hexadecimal literal");
return T_ERROR;
}
ch = scan_getchar();
}
/* the number is non-empty, consume and accumulate trailing
characters as long as they are valid in the base */
for (;;) {
chval = get_chval(ch);
if (chval >= base)
break;
numval = numval * base + chval;
ch = scan_getchar();
}
/* check for <local label>{b,f} */
if (base <= 10 && (ch == 'b' || ch == 'f')) {
/* represent the local label + direction in sign-magnitude with
the sign in the least significant bit; using sign-magnitude
allows to distinguish 0f from 0b (i.e., +0 from -0); storing
the sign in the least significant bit makes us independent of
word size */
token_attr->uint = (numval << 1) | (ch == 'f' ? 1 : 0);
return T_LOCAL_LABEL;
}
/* plain integer literal */
scan_ungetc(ch);
token_attr->uint = numval;
return T_UINTEGER;
}
/*
 * Called after '=' has been read: returns T_EQEQ for "==", otherwise
 * pushes the lookahead character back and returns T_EQ.
 */
static enum token do_eq(void)
{
    int next = scan_getchar();

    if (next == '=')
        return T_EQEQ;

    scan_ungetc(next);
    return T_EQ;
}
/*
 * Called after '&' has been read: returns T_ANDAND for "&&", otherwise
 * pushes the lookahead character back and returns T_AND.
 */
static enum token do_ampersand(void)
{
    int next = scan_getchar();

    if (next == '&')
        return T_ANDAND;

    scan_ungetc(next);
    return T_AND;
}
/*
 * Called after '|' has been read: returns T_OROR for "||", otherwise
 * pushes the lookahead character back and returns T_OR.
 */
static enum token do_bar(void)
{
    int next = scan_getchar();

    if (next == '|')
        return T_OROR;

    scan_ungetc(next);
    return T_OR;
}
/*
 * Called after '>' has been read: returns T_RSHIFT for ">>", T_GE for
 * ">=", otherwise pushes the lookahead character back and returns T_GT.
 */
static enum token do_gt(void)
{
    int next = scan_getchar();

    if (next == '>')
        return T_RSHIFT;
    if (next == '=')
        return T_GE;

    scan_ungetc(next);
    return T_GT;
}
/*
 * Called after '<' has been read: returns T_LSHIFT for "<<", T_LE for
 * "<=", T_NEQ for "<>" (a synonym for "!="), otherwise pushes the
 * lookahead character back and returns T_LT.
 */
static enum token do_lt(void)
{
    int next = scan_getchar();

    if (next == '<')
        return T_LSHIFT;
    if (next == '=')
        return T_LE;
    if (next == '>')	/* <> is the same as != */
        return T_NEQ;

    scan_ungetc(next);
    return T_LT;
}
/*
 * Skip the body of a C-style comment; the opening slash-star has already
 * been consumed.  Newlines inside the comment are counted in
 * scan_linenr.
 *
 * Returns T_EOF as a stand-in token once the closing star-slash has been
 * consumed, or T_ERROR (after a diagnostic) if the input ends first.
 */
static enum token do_c_comment(void)
{
    int after_star = 0;	/* non-zero when the previous character was '*' */

    for (;;) {
        int ch = scan_getchar();

        if (ch == EOF) {
            badchar(ch, "in /**/-style comment");
            return T_ERROR;
        }
        if (ch == '/' && after_star)
            return T_EOF;	/* fake token for a C comment */
        if (ch == '\n')
            ++scan_linenr;
        after_star = (ch == '*');
    }
}
/*
 * Called after '/' has been read.  If a '*' follows, the comment is
 * skipped via do_c_comment() and its result returned; otherwise the
 * lookahead character is pushed back and T_DIV is returned.
 */
static enum token do_slash(void)
{
    int next = scan_getchar();

    if (next == '*')
        return do_c_comment();

    scan_ungetc(next);
    return T_DIV;
}
/*
 * Called after '!' has been read: returns T_NEQ for "!=", otherwise
 * pushes the lookahead character back and returns T_BANG.
 */
static enum token do_bang(void)
{
    int next = scan_getchar();

    if (next == '=')
        return T_NEQ;

    scan_ungetc(next);
    return T_BANG;
}
/*
 * Skip the rest of a line comment, up to and including the terminating
 * newline, which is counted in scan_linenr.
 *
 * Returns 0 on success, or -1 (after a diagnostic) if the input ends
 * before a newline is seen.
 */
static int do_line_comment(void)
{
    int ch;

    do {
        ch = scan_getchar();
    } while (ch != '\n' && ch != EOF);

    if (ch == EOF) {
        badchar(ch, "in line comment");
        return -1;
    }

    ++scan_linenr;
    return 0;
}
/*
 * Scan and return the next token from stdin.
 *
 * Blanks and C-style comments are skipped.  A newline, a ';' and a '#'
 * line comment all produce T_NEWLINE.  Multi-character tokens are handed
 * to the do_*() helpers; CHARBUF is scratch space for those that collect
 * text (symbols and strings), and TOKEN_ATTR receives the token's value
 * where it has one.  A character that cannot start any token is reported
 * and yields T_ERROR.
 */
static enum token do_scan(union token_attribute *token_attr, struct charbuf *charbuf)
{
    for (;;) {
        int ch = scan_getchar();

        /* blanks between tokens */
        if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f')
            continue;

        /* number or <decimal>{b,f} */
        if ('0' <= ch && ch <= '9')
            return do_number(token_attr, ch);

        if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z'))
            return do_symbol(token_attr, ch, charbuf);

        switch (ch) {
        case EOF:
            return T_EOF;

        /* statement terminators */
        case '\n':
            ++scan_linenr;
            return T_NEWLINE;
        case ';':
            return T_NEWLINE;
        case '#':
            return do_line_comment() != 0 ? T_ERROR : T_NEWLINE;

        /* symbols that start with punctuation */
        case '.':
            /* Dot may start a floating point literal, but tests show that
               gcc always outputs floating point values as integer literals,
               so we shouldn't have to support floating point literals at all. */
        case '$':
        case '_':
            return do_symbol(token_attr, ch, charbuf);

        /* literals */
        case '"':
            return do_string(token_attr, charbuf);
        case '\'':
            return do_char(token_attr);

        /* single-character tokens */
        case '@':
            return T_AT;
        case ':':
            return T_COLON;
        case ',':
            return T_COMMA;
        case '(':
            return T_LPAREN;
        case ')':
            return T_RPAREN;
        case '~':
            return T_TILDE;
        case '*':
            return T_MUL;
        case '%':
            return T_REM;
        case '^':
            return T_CARET;
        case '+':
            return T_PLUS;
        case '-':
            return T_MINUS;

        /* tokens that need one character of lookahead */
        case '<':	/* <<, <=, <>, < */
            return do_lt();
        case '>':	/* >>, >=, > */
            return do_gt();
        case '|':	/* ||, | */
            return do_bar();
        case '&':	/* &&, & */
            return do_ampersand();
        case '!':	/* !=, ! */
            return do_bang();
        case '=':	/* ==, = */
            return do_eq();

        case '/': {	/* "/""*", "/" */
            enum token slash = do_slash();

            if (slash == T_EOF)	/* fake token for a C comment */
                continue;
            /* anything but T_DIV is an error, e.g. eof in comment */
            return slash == T_DIV ? T_DIV : T_ERROR;
        }

        default:
            badchar(ch, "");
            return T_ERROR;
        }
    }
}
enum token scan(union token_attribute *token_attr)
{
struct charbuf charbuf;
enum token token;
charbuf_init(&charbuf);
token = do_scan(token_attr, &charbuf);
charbuf_fini(&charbuf);
return token;
}