mirror of
https://github.com/mikpe/pdp10-tools.git
synced 2026-01-11 23:53:19 +00:00
365 lines
8.7 KiB
C
365 lines
8.7 KiB
C
/*
 * scan.c
 * Copyright (C) 2013-2015 Mikael Pettersson
 *
 * This file is part of pdp10-tools.
 *
 * pdp10-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * pdp10-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with pdp10-tools. If not, see <http://www.gnu.org/licenses/>.
 */
|
|
#include <errno.h>
|
|
#include <limits.h> /* XXX: for UCHAR_MAX, deleteme */
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include "scan.h"
|
|
#include "token.h"
|
|
|
|
void scan_init(struct scan_state *scan_state, const char *progname)
|
|
{
|
|
scan_state->progname = progname;
|
|
scan_state->filename = "<stdin>";
|
|
scan_state->linenr = 1;
|
|
}
|
|
|
|
int scan_open(struct scan_state *scan_state, const char *filename)
|
|
{
|
|
if (filename[0] == '-' && filename[1] == '-' && filename[2] == '\0') {
|
|
scan_state->filename = "<stdin>";
|
|
filename = "/dev/stdin";
|
|
} else
|
|
scan_state->filename = filename;
|
|
|
|
if (freopen(filename, "r", stdin) == NULL) {
|
|
fprintf(stderr, "%s: Error opening %s: %s\n", scan_state->progname, filename, strerror(errno));
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void scan_ungetc(struct scan_state *scan_state, int ch)
|
|
{
|
|
if (ch != EOF && ungetc(ch, stdin) == EOF)
|
|
fprintf(stderr, "%s: %s line %u: ungetc %d failed: %s\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr, ch, strerror(errno));
|
|
}
|
|
|
|
/*
 * Read the next input character from stdin.
 * Returns the character as an unsigned char converted to int, or EOF.
 */
static int scan_getchar(void)
{
    int next = fgetc(stdin);

    return next;
}
|
|
|
|
static void badchar(struct scan_state *scan_state, int ch, const char *context)
|
|
{
|
|
char buf[7];
|
|
|
|
if (ch == EOF) {
|
|
buf[0] = '<';
|
|
buf[1] = 'E';
|
|
buf[2] = 'O';
|
|
buf[3] = 'F';
|
|
buf[4] = '>';
|
|
buf[5] = '\0';
|
|
} else if (' ' <= ch && ch <= '~') {
|
|
buf[0] = '\'';
|
|
buf[1] = ch;
|
|
buf[2] = '\'';
|
|
buf[3] = '\0';
|
|
} else {
|
|
buf[0] = '\'';
|
|
buf[1] = '\\';
|
|
buf[2] = '0' + ((ch >> 6) & 3);
|
|
buf[3] = '0' + ((ch >> 3) & 7);
|
|
buf[4] = '0' + (ch & 7);
|
|
buf[5] = '\'';
|
|
buf[6] = '\0';
|
|
}
|
|
|
|
fprintf(stderr, "%s: %s line %u: invalid character %s%s\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr, buf, context);
|
|
}
|
|
|
|
/*
 * Map a digit character to its numeric value: '0'..'9' give 0..9 and
 * 'A'..'F' / 'a'..'f' give 10..15.  Any other input yields -1U, which
 * is larger than every supported base and so always fails a
 * "value < base" test.
 */
static unsigned int get_chval(int ch)
{
    if (ch >= 'a' && ch <= 'f')
        return 10 + (ch - 'a');

    if (ch >= 'A' && ch <= 'F')
        return 10 + (ch - 'A');

    if (ch >= '0' && ch <= '9')
        return ch - '0';

    return -1U;
}
|
|
|
|
/* Return 1 if CH is one of the characters '0' through '7', else 0. */
static int is_octal_digit(int ch)
{
    if (ch < '0')
        return 0;

    return ch <= '7';
}
|
|
|
|
static int do_escape(struct scan_state *scan_state)
|
|
{
|
|
int ch;
|
|
|
|
ch = scan_getchar();
|
|
switch (ch) {
|
|
case 'n':
|
|
return '\n';
|
|
case 't':
|
|
return '\t';
|
|
case 'f':
|
|
return '\f';
|
|
case 'r':
|
|
return '\r';
|
|
case 'b':
|
|
return '\b';
|
|
case '\\':
|
|
case '\'':
|
|
case '"':
|
|
return ch;
|
|
default:
|
|
if (is_octal_digit(ch)) {
|
|
unsigned int val = ch - '0';
|
|
ch = scan_getchar();
|
|
if (is_octal_digit(ch)) {
|
|
val = val * 8 + (ch - '0');
|
|
ch = scan_getchar();
|
|
if (is_octal_digit(ch))
|
|
val = val * 8 + (ch - '0');
|
|
else
|
|
scan_ungetc(scan_state, ch);
|
|
} else
|
|
scan_ungetc(scan_state, ch);
|
|
/* XXX: this should be PDP10_UINT9_MAX, but our string elements are still char not pdp10_uint9_t for now */
|
|
if (val > UCHAR_MAX) {
|
|
fprintf(stderr, "%s: %s line %u: out of range character escape value %#x\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr, val);
|
|
return EOF;
|
|
}
|
|
return val & UCHAR_MAX;
|
|
}
|
|
break;
|
|
}
|
|
badchar(scan_state, ch, "in \\ character escape");
|
|
if (ch == '\n')
|
|
++scan_state->linenr;
|
|
return EOF;
|
|
}
|
|
|
|
/* XXX: string literals should be sequences of pdp10_uint9_t, not sequences of char */
|
|
|
|
static enum token do_string(struct scan_state *scan_state, union token_attribute *token_attr)
|
|
{
|
|
char charbuf[4096]; /* 4095 char + NUL, XXX: make it dynamic */
|
|
unsigned int len;
|
|
char *text;
|
|
int ch;
|
|
|
|
len = 0;
|
|
for (;;) {
|
|
ch = scan_getchar();
|
|
switch (ch) {
|
|
case '"':
|
|
text = malloc(len + 1);
|
|
if (!text) {
|
|
fprintf(stderr, "%s: %s line %u: malloc(%u) failed: %s\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr, len + 1, strerror(errno));
|
|
return T_ERROR;
|
|
}
|
|
strcpy(text, charbuf);
|
|
token_attr->text = text;
|
|
return T_STRING;
|
|
case '\\':
|
|
ch = do_escape(scan_state);
|
|
if (ch == EOF)
|
|
return T_ERROR;
|
|
break;
|
|
case EOF:
|
|
case '\n':
|
|
badchar(scan_state, ch, "in string literal");
|
|
if (ch == '\n')
|
|
++scan_state->linenr;
|
|
return T_ERROR;
|
|
default:
|
|
break;
|
|
}
|
|
if (len >= sizeof charbuf - 1) {
|
|
fprintf(stderr, "%s: %s line %u: too long string literal\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr);
|
|
return T_ERROR;
|
|
}
|
|
charbuf[len] = ch;
|
|
++len;
|
|
}
|
|
}
|
|
|
|
/*
 * Return 1 if CH may appear inside a symbol name: an ASCII letter,
 * a decimal digit, or one of '_', '$' and '.'.  Return 0 otherwise.
 */
static int is_symbol_internal_char(int ch)
{
    switch (ch) {
    case '_':
    case '$':
    case '.':
        return 1;
    default:
        break;
    }

    if (ch >= '0' && ch <= '9')
        return 1;

    if (ch >= 'a' && ch <= 'z')
        return 1;

    return ch >= 'A' && ch <= 'Z';
}
|
|
|
|
static enum token do_symbol(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
|
|
{
|
|
char charbuf[128]; /* 127 chars + NUL, XXX: make it dynamic */
|
|
unsigned int len;
|
|
char *text;
|
|
|
|
len = 0;
|
|
do {
|
|
if (len >= sizeof charbuf - 1) {
|
|
fprintf(stderr, "%s: %s line %u: too long symbol\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr);
|
|
return T_ERROR;
|
|
}
|
|
charbuf[len] = ch;
|
|
++len;
|
|
ch = scan_getchar();
|
|
} while (is_symbol_internal_char(ch));
|
|
charbuf[len] = '\0';
|
|
scan_ungetc(scan_state, ch);
|
|
|
|
if (charbuf[0] == '.') {
|
|
enum token low, high;
|
|
|
|
if (charbuf[1] == '\0')
|
|
return T_DOT;
|
|
|
|
/* see token.def, reserved symbols occupy tokens [0,T_SYMBOL[ */
|
|
low = 0;
|
|
high = T_SYMBOL;
|
|
|
|
while (low < high) {
|
|
enum token middle;
|
|
int cmp;
|
|
|
|
middle = (low + high) / 2;
|
|
cmp = strcmp(charbuf, token_info[middle].print_name);
|
|
|
|
if (cmp < 0)
|
|
high = middle;
|
|
else if (cmp > 0)
|
|
low = middle + 1;
|
|
else
|
|
return middle;
|
|
}
|
|
}
|
|
|
|
text = malloc(len + 1);
|
|
if (!text) {
|
|
fprintf(stderr, "%s: %s line %u: malloc(%u) failed: %s\n",
|
|
scan_state->progname, scan_state->filename, scan_state->linenr, len + 1, strerror(errno));
|
|
return T_ERROR;
|
|
}
|
|
strcpy(text, charbuf);
|
|
token_attr->text = text;
|
|
return T_SYMBOL;
|
|
}
|
|
|
|
static enum token do_number(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
|
|
{
|
|
unsigned int base, chval;
|
|
pdp10_uint36_t numval;
|
|
|
|
base = (ch == '0') ? 8 : 10;
|
|
numval = ch - '0';
|
|
|
|
/* handle 0x<first hexdig> */
|
|
ch = scan_getchar();
|
|
if (base == 8 && (ch == 'x' || ch == 'X')) {
|
|
base = 16;
|
|
/* must have at least one hex digit after 0x */
|
|
ch = scan_getchar();
|
|
chval = get_chval(ch);
|
|
if (chval >= 16) {
|
|
badchar(scan_state, ch, " after 0x in hexadecimal literal");
|
|
return T_ERROR;
|
|
}
|
|
numval = chval;
|
|
ch = scan_getchar();
|
|
}
|
|
|
|
/* the number is non-empty, consume and accumulate trailing
|
|
characters as long as they are valid in the base */
|
|
for (;;) {
|
|
chval = get_chval(ch);
|
|
if (chval >= base)
|
|
break;
|
|
numval = numval * base + chval; /* XXX: check for overflow */
|
|
ch = scan_getchar();
|
|
}
|
|
|
|
/* XXX: check for <decimal>{b,f} which is a local label reference */
|
|
|
|
/* plain integer literal */
|
|
scan_ungetc(scan_state, ch);
|
|
token_attr->uint = numval;
|
|
return T_UINTEGER;
|
|
}
|
|
|
|
enum token scan_token(struct scan_state *scan_state, union token_attribute *token_attr)
|
|
{
|
|
int ch;
|
|
|
|
ch = scan_getchar();
|
|
|
|
for (;; ch = scan_getchar()) {
|
|
switch (ch) {
|
|
case ' ':
|
|
case '\t':
|
|
case '\r':
|
|
case '\f':
|
|
continue;
|
|
case '\n':
|
|
++scan_state->linenr;
|
|
return T_NEWLINE;
|
|
case EOF:
|
|
return T_EOF;
|
|
case '@':
|
|
return T_AT;
|
|
case ':':
|
|
return T_COLON;
|
|
case ',':
|
|
return T_COMMA;
|
|
case '(':
|
|
return T_LPAREN;
|
|
case ')':
|
|
return T_RPAREN;
|
|
case '"':
|
|
return do_string(scan_state, token_attr);
|
|
case '-':
|
|
return T_MINUS;
|
|
case '.':
|
|
/* Dot may start a floating point literal, but tests show that
|
|
gcc always outputs floating point values as integer literals,
|
|
so we shouldn't have to support floating point literals at all. */
|
|
case '$':
|
|
case '_':
|
|
return do_symbol(scan_state, token_attr, ch);
|
|
default:
|
|
if ('0' <= ch && ch <= '9') /* number or <decimal>{b,f} */
|
|
return do_number(scan_state, token_attr, ch);
|
|
if (('A' <= ch && ch <= 'Z') ||
|
|
('a' <= ch && ch <= 'z'))
|
|
return do_symbol(scan_state, token_attr, ch);
|
|
break;
|
|
}
|
|
badchar(scan_state, ch, "");
|
|
return T_ERROR;
|
|
}
|
|
}
|