365 lines
8.7 KiB
C

/*
* scan.c
* Copyright (C) 2013-2015 Mikael Pettersson
*
* This file is part of pdp10-tools.
*
* pdp10-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* pdp10-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with pdp10-tools. If not, see <http://www.gnu.org/licenses/>.
*/
#include <errno.h>
#include <limits.h> /* XXX: for UCHAR_MAX, deleteme */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "scan.h"
#include "token.h"
/*
 * Set up the fields of *scan_state that the scanner's diagnostics rely on:
 * the program name is taken from the caller, the input is labelled
 * "<stdin>", and line counting starts at 1.
 */
void scan_init(struct scan_state *scan_state, const char *progname)
{
    scan_state->linenr = 1;
    scan_state->filename = "<stdin>";
    scan_state->progname = progname;
}
int scan_open(struct scan_state *scan_state, const char *filename)
{
if (filename[0] == '-' && filename[1] == '-' && filename[2] == '\0') {
scan_state->filename = "<stdin>";
filename = "/dev/stdin";
} else
scan_state->filename = filename;
if (freopen(filename, "r", stdin) == NULL) {
fprintf(stderr, "%s: Error opening %s: %s\n", scan_state->progname, filename, strerror(errno));
return -1;
}
return 0;
}
/*
 * Push ch back onto stdin so the next scan_getchar() returns it again.
 * EOF is never pushed back.  If ungetc() reports failure, a diagnostic
 * is printed to stderr; there is no other error indication.
 */
static void scan_ungetc(struct scan_state *scan_state, int ch)
{
    if (ch == EOF)
        return;

    if (ungetc(ch, stdin) != EOF)
        return;

    fprintf(stderr, "%s: %s line %u: ungetc %d failed: %s\n",
            scan_state->progname, scan_state->filename, scan_state->linenr, ch, strerror(errno));
}
/*
 * Read the next input character from stdin.
 * Returns the character, or EOF as reported by fgetc().
 */
static int scan_getchar(void)
{
    int next = fgetc(stdin);

    return next;
}
/*
 * Report an invalid input character on stderr.
 *
 * The character is rendered as <EOF>, as 'c' when it lies in the
 * range ' '..'~', or otherwise as a quoted three-digit octal escape.
 * The context string is appended directly after the rendering, with
 * no separator added here.
 */
static void badchar(struct scan_state *scan_state, int ch, const char *context)
{
    char shown[7];      /* longest form: quote, backslash, 3 digits, quote, NUL */
    char *out = shown;

    if (ch == EOF) {
        memcpy(shown, "<EOF>", sizeof "<EOF>");
    } else {
        *out++ = '\'';
        if (ch >= ' ' && ch <= '~') {
            *out++ = ch;
        } else {
            *out++ = '\\';
            *out++ = '0' + ((ch >> 6) & 3);
            *out++ = '0' + ((ch >> 3) & 7);
            *out++ = '0' + (ch & 7);
        }
        *out++ = '\'';
        *out = '\0';
    }

    fprintf(stderr, "%s: %s line %u: invalid character %s%s\n",
            scan_state->progname, scan_state->filename, scan_state->linenr, shown, context);
}
/*
 * Map a digit character to its numeric value: '0'..'9' give 0..9 and
 * 'A'..'F' / 'a'..'f' give 10..15.  Any other input yields -1U, which
 * is larger than every base the callers compare against.
 */
static unsigned int get_chval(int ch)
{
    unsigned int value = -1U;

    if (ch >= '0' && ch <= '9')
        value = ch - '0';
    else if (ch >= 'A' && ch <= 'F')
        value = 10 + (ch - 'A');
    else if (ch >= 'a' && ch <= 'f')
        value = 10 + (ch - 'a');

    return value;
}
/* Return 1 if ch is one of the characters '0'..'7', otherwise 0. */
static int is_octal_digit(int ch)
{
    if (ch < '0')
        return 0;
    return ch <= '7';
}
/*
 * Scan the remainder of a backslash escape; the backslash itself has
 * already been consumed by the caller.
 *
 * Recognised forms are \n \t \f \r \b \\ \' \" and an octal escape of
 * one to three octal digits.  Returns the resulting character value, or
 * EOF after printing a diagnostic when the escape is invalid or its
 * octal value exceeds UCHAR_MAX.
 */
static int do_escape(struct scan_state *scan_state)
{
    unsigned int val;
    int ndigits;
    int ch;

    ch = scan_getchar();
    switch (ch) {
    case 'n':
        return '\n';
    case 't':
        return '\t';
    case 'f':
        return '\f';
    case 'r':
        return '\r';
    case 'b':
        return '\b';
    case '\\':
    case '\'':
    case '"':
        return ch;
    default:
        break;
    }

    if (is_octal_digit(ch)) {
        val = ch - '0';
        /* accept up to two more octal digits; the first non-digit is pushed back */
        for (ndigits = 1; ndigits < 3; ++ndigits) {
            ch = scan_getchar();
            if (!is_octal_digit(ch)) {
                scan_ungetc(scan_state, ch);
                break;
            }
            val = val * 8 + (ch - '0');
        }
        /* XXX: this should be PDP10_UINT9_MAX, but our string elements are still char not pdp10_uint9_t for now */
        if (val > UCHAR_MAX) {
            fprintf(stderr, "%s: %s line %u: out of range character escape value %#x\n",
                    scan_state->progname, scan_state->filename, scan_state->linenr, val);
            return EOF;
        }
        return val & UCHAR_MAX;
    }

    /* badchar() appends the context directly after the quoted character,
       so the context has to supply its own leading space */
    badchar(scan_state, ch, " in \\ character escape");
    if (ch == '\n')
        ++scan_state->linenr;
    return EOF;
}
/* XXX: string literals should be sequences of pdp10_uint9_t, not sequences of char */
/*
 * Scan the body of a string literal; the opening '"' has already been
 * consumed by the caller.
 *
 * Characters are collected up to the closing '"', with backslash escapes
 * translated by do_escape().  On success a malloc()ed, NUL-terminated
 * copy of the contents is stored in token_attr->text and T_STRING is
 * returned.  On a bad escape, an unterminated literal (newline or EOF),
 * a literal longer than the internal buffer, or allocation failure, a
 * diagnostic is printed (by this function or by do_escape()) and T_ERROR
 * is returned.
 */
static enum token do_string(struct scan_state *scan_state, union token_attribute *token_attr)
{
    char charbuf[4096]; /* 4095 char + NUL, XXX: make it dynamic */
    unsigned int len;
    char *text;
    int ch;

    len = 0;
    for (;;) {
        ch = scan_getchar();
        switch (ch) {
        case '"':
            text = malloc(len + 1);
            if (!text) {
                fprintf(stderr, "%s: %s line %u: malloc(%u) failed: %s\n",
                        scan_state->progname, scan_state->filename, scan_state->linenr, len + 1, strerror(errno));
                return T_ERROR;
            }
            /* charbuf holds exactly len collected bytes and no terminator,
               so copy by length and terminate the result explicitly */
            memcpy(text, charbuf, len);
            text[len] = '\0';
            token_attr->text = text;
            return T_STRING;
        case '\\':
            ch = do_escape(scan_state);
            if (ch == EOF)
                return T_ERROR;
            break;
        case EOF:
        case '\n':
            /* badchar() appends the context directly after the quoted
               character, so the context supplies its own leading space */
            badchar(scan_state, ch, " in string literal");
            if (ch == '\n')
                ++scan_state->linenr;
            return T_ERROR;
        default:
            break;
        }
        if (len >= sizeof charbuf - 1) {
            fprintf(stderr, "%s: %s line %u: too long string literal\n",
                    scan_state->progname, scan_state->filename, scan_state->linenr);
            return T_ERROR;
        }
        charbuf[len] = ch;
        ++len;
    }
}
/*
 * Return 1 if ch may appear after the first character of a symbol:
 * an ASCII letter, a decimal digit, '_', '$' or '.'.  Otherwise 0.
 */
static int is_symbol_internal_char(int ch)
{
    switch (ch) {
    case '_':
    case '$':
    case '.':
        return 1;
    default:
        break;
    }
    if (ch >= 'A' && ch <= 'Z')
        return 1;
    if (ch >= 'a' && ch <= 'z')
        return 1;
    if (ch >= '0' && ch <= '9')
        return 1;
    return 0;
}
static enum token do_symbol(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
{
char charbuf[128]; /* 127 chars + NUL, XXX: make it dynamic */
unsigned int len;
char *text;
len = 0;
do {
if (len >= sizeof charbuf - 1) {
fprintf(stderr, "%s: %s line %u: too long symbol\n",
scan_state->progname, scan_state->filename, scan_state->linenr);
return T_ERROR;
}
charbuf[len] = ch;
++len;
ch = scan_getchar();
} while (is_symbol_internal_char(ch));
charbuf[len] = '\0';
scan_ungetc(scan_state, ch);
if (charbuf[0] == '.') {
enum token low, high;
if (charbuf[1] == '\0')
return T_DOT;
/* see token.def, reserved symbols occupy tokens [0,T_SYMBOL[ */
low = 0;
high = T_SYMBOL;
while (low < high) {
enum token middle;
int cmp;
middle = (low + high) / 2;
cmp = strcmp(charbuf, token_info[middle].print_name);
if (cmp < 0)
high = middle;
else if (cmp > 0)
low = middle + 1;
else
return middle;
}
}
text = malloc(len + 1);
if (!text) {
fprintf(stderr, "%s: %s line %u: malloc(%u) failed: %s\n",
scan_state->progname, scan_state->filename, scan_state->linenr, len + 1, strerror(errno));
return T_ERROR;
}
strcpy(text, charbuf);
token_attr->text = text;
return T_SYMBOL;
}
/*
 * Scan an unsigned integer literal whose first digit ch has already
 * been read.
 *
 * A leading '0' selects octal, "0x"/"0X" followed by at least one hex
 * digit selects hexadecimal, and anything else is decimal.  Digits valid
 * in the chosen base are accumulated; the first other character is
 * pushed back.  The value is stored in token_attr->uint and T_UINTEGER
 * is returned.  T_ERROR is returned, after a diagnostic, when "0x" is
 * not followed by a hex digit.
 */
static enum token do_number(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
{
    unsigned int radix, digit;
    pdp10_uint36_t value;

    value = ch - '0';
    if (ch == '0')
        radix = 8;
    else
        radix = 10;

    ch = scan_getchar();

    /* handle 0x<first hexdig> */
    if (radix == 8 && (ch == 'x' || ch == 'X')) {
        radix = 16;
        /* must have at least one hex digit after 0x */
        ch = scan_getchar();
        digit = get_chval(ch);
        if (digit >= 16) {
            badchar(scan_state, ch, " after 0x in hexadecimal literal");
            return T_ERROR;
        }
        value = digit;
        ch = scan_getchar();
    }

    /* the number is non-empty, consume and accumulate trailing
       characters as long as they are valid in the base */
    for (digit = get_chval(ch); digit < radix; digit = get_chval(ch)) {
        value = value * radix + digit;  /* XXX: check for overflow */
        ch = scan_getchar();
    }

    /* XXX: check for <decimal>{b,f} which is a local label reference */

    /* plain integer literal */
    scan_ungetc(scan_state, ch);
    token_attr->uint = value;
    return T_UINTEGER;
}
/*
 * Read and return the next token from stdin.
 *
 * Spaces, tabs, carriage returns and form feeds are skipped.  A newline
 * bumps the line counter and yields T_NEWLINE; end of input yields T_EOF.
 * Strings, symbols and numbers are handed to do_string(), do_symbol()
 * and do_number(), which fill in *token_attr where applicable.  Any
 * other character is reported via badchar() and yields T_ERROR.
 */
enum token scan_token(struct scan_state *scan_state, union token_attribute *token_attr)
{
    int ch;

    /* skip horizontal whitespace */
    do {
        ch = scan_getchar();
    } while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f');

    switch (ch) {
    case '\n':
        ++scan_state->linenr;
        return T_NEWLINE;
    case EOF:
        return T_EOF;
    case '@':
        return T_AT;
    case ':':
        return T_COLON;
    case ',':
        return T_COMMA;
    case '(':
        return T_LPAREN;
    case ')':
        return T_RPAREN;
    case '-':
        return T_MINUS;
    case '"':
        return do_string(scan_state, token_attr);
    case '.':
        /* Dot may start a floating point literal, but tests show that
           gcc always outputs floating point values as integer literals,
           so we shouldn't have to support floating point literals at all. */
        /* fallthrough */
    case '$':
    case '_':
        return do_symbol(scan_state, token_attr, ch);
    default:
        break;
    }

    if (ch >= '0' && ch <= '9')     /* number or <decimal>{b,f} */
        return do_number(scan_state, token_attr, ch);

    if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
        return do_symbol(scan_state, token_attr, ch);

    badchar(scan_state, ch, "");
    return T_ERROR;
}