as: add initial, primitive, but working assembler

2026-04-20 09:26:40 +00:00 · 2013-07-23 20:46:16 +00:00
parent d080f69cbc
commit ab2037008a
20 changed files with 1430 additions and 182 deletions
--- a/as/0LD/main.c
+++ b/as/0LD/main.c
@@ -1,40 +0,0 @@
-/*
- * main.c
- */
-#include <stdio.h>
-#include <unistd.h>
-#include "pass1.h"
-
-int main(int argc, char **argv)
-{
-    int ch;
-    const char *outfile = "a.out";
-    const char *infile = NULL;
-
-    for (;;) {
-	ch = getopt(argc, argv, "o:");
-	switch (ch) {
-	case 'o':
-	    outfile = optarg;
-	    continue;
-	case -1:
-	    break;
-	default:
-	    fprintf(stderr, "as: invalid option '%c'\n", ch);
-	    return 1;
-	}
-    }
-    if (optind + 1 == argc)
-	infile = argv[optind];
-
-    if (pass1(infile) < 0)
-	return 1;
-
-    if (pass2() < 0)
-	return 1;
-
-    if (pass3(outfile) < 0)
-	return 1;
-
-    return 0;
-}
--- a/as/0LD/parse.h
+++ b/as/0LD/parse.h
@@ -1,11 +0,0 @@
-/*
- * parse.h
- */
-#ifndef PARSE_H
-#define PARSE_H
-
-#include "stmt.h"
-
-int parse_stmt(struct stmt *stmt);
-
-#endif /* PARSE_H */
--- a/as/0LD/scan.h
+++ b/as/0LD/scan.h
@@ -1,15 +0,0 @@
-/*
- * scan.h
- */
-#ifndef SCAN_H
-#define SCAN_H
-
-#include "token.h"
-
-const char *scan_filename;
-int scan_freopen(const char *filename);
-
-unsigned int scan_linenr;
-enum token scan(union token_attribute *token_attr);
-
-#endif /* SCAN_H */
--- a/as/0LD/token.def
+++ b/as/0LD/token.def
@@ -1,77 +0,0 @@
-/*
- * token.def
- *
- * TOKEN(T_<name>, <print name>, <attribute fmt>)
- */
-
-/* directives */
-TOKEN(T_DOT_ALIGN, ".align", FMT_NONE)
-TOKEN(T_DOT_ASCII, ".ascii", FMT_NONE)
-TOKEN(T_DOT_ASCIZ, ".asciz", FMT_NONE)
-TOKEN(T_DOT_BALIGN, ".balign", FMT_NONE)
-TOKEN(T_DOT_BSS, ".bss", FMT_NONE)
-TOKEN(T_DOT_BYTE, ".byte", FMT_NONE)
-TOKEN(T_DOT_COMM, ".comm", FMT_NONE)
-TOKEN(T_DOT_DATA, ".data", FMT_NONE)
-TOKEN(T_DOT_FILE, ".file", FMT_NONE)
-TOKEN(T_DOT_GLOBL, ".globl", FMT_NONE)
-TOKEN(T_DOT_HIDDEN, ".hidden", FMT_NONE)
-TOKEN(T_DOT_IDENT, ".ident", FMT_NONE)
-TOKEN(T_DOT_INTERNAL, ".internal", FMT_NONE)
-TOKEN(T_DOT_LOCAL, ".local", FMT_NONE)
-TOKEN(T_DOT_LONG, ".long", FMT_NONE)
-TOKEN(T_DOT_ORG, ".org", FMT_NONE)
-TOKEN(T_DOT_P2ALIGN, ".p2align", FMT_NONE)
-TOKEN(T_DOT_POPSECTION, ".popsection", FMT_NONE)
-TOKEN(T_DOT_PREVIOUS, ".previous", FMT_NONE)
-TOKEN(T_DOT_PROTECTED, ".protected", FMT_NONE)
-TOKEN(T_DOT_PUSHSECTION, ".pushsection", FMT_NONE)
-TOKEN(T_DOT_RODATA, ".rodata", FMT_NONE)
-TOKEN(T_DOT_SECTION, ".section", FMT_NONE)
-TOKEN(T_DOT_SET, ".set", FMT_NONE)
-TOKEN(T_DOT_SHORT, ".short", FMT_NONE)
-TOKEN(T_DOT_SIZE, ".size", FMT_NONE)
-TOKEN(T_DOT_SUBSECTION, ".subsection", FMT_NONE)
-TOKEN(T_DOT_SYMVER, ".symver", FMT_NONE)
-TOKEN(T_DOT_TEXT, ".text", FMT_NONE)
-TOKEN(T_DOT_TYPE, ".type", FMT_NONE)
-TOKEN(T_DOT_WEAK, ".weak", FMT_NONE)
-TOKEN(T_DOT_WEAKREF, ".weakref", FMT_NONE)
-/* other symbols */
-TOKEN(T_REGISTER, "<register>", FMT_UINT)
-TOKEN(T_SYMBOL, "<symbol>", FMT_SYMBOL)
-TOKEN(T_LOCAL_LABEL, "<local label>", FMT_UINT)	/* 1f, 2b */
-TOKEN(T_AT, "@", FMT_NONE)
-TOKEN(T_COLON, ":", FMT_NONE)
-/* literals */
-TOKEN(T_UINTEGER, "<integer>", FMT_UINT)
-TOKEN(T_STRING, "<string>", FMT_STRING)
-/* operators, separators */
-TOKEN(T_COMMA, ",", FMT_NONE)
-TOKEN(T_LPAREN, "(", FMT_NONE)
-TOKEN(T_RPAREN, ")", FMT_NONE)
-TOKEN(T_TILDE, "~", FMT_NONE)
-TOKEN(T_MUL, "*", FMT_NONE)
-TOKEN(T_DIV, "/", FMT_NONE)
-TOKEN(T_REM, "%", FMT_NONE)
-TOKEN(T_LSHIFT, "<<", FMT_NONE)
-TOKEN(T_RSHIFT, ">>", FMT_NONE)
-TOKEN(T_OR, "|", FMT_NONE)
-TOKEN(T_AND, "&", FMT_NONE)
-TOKEN(T_CARET, "^", FMT_NONE)
-TOKEN(T_BANG, "!", FMT_NONE)
-TOKEN(T_PLUS, "+", FMT_NONE)
-TOKEN(T_MINUS, "-", FMT_NONE)
-TOKEN(T_EQ, "=", FMT_NONE)
-TOKEN(T_EQEQ, "==", FMT_NONE)
-TOKEN(T_NEQ, "!=", FMT_NONE)
-TOKEN(T_LT, "<", FMT_NONE)
-TOKEN(T_GT, ">", FMT_NONE)
-TOKEN(T_GE, ">=", FMT_NONE)
-TOKEN(T_LE, "<=", FMT_NONE)
-TOKEN(T_ANDAND, "&&", FMT_NONE)
-TOKEN(T_OROR, "||", FMT_NONE)
-/* misc */
-TOKEN(T_NEWLINE, "<newline>", FMT_NONE)
-TOKEN(T_EOF, "<eof>", FMT_NONE)
-TOKEN(T_ERROR, "<error>", FMT_NONE)
--- a/as/0LD/token.h
+++ b/as/0LD/token.h
@@ -1,22 +0,0 @@
-/*
- * token.h
- */
-#ifndef TOKEN_H
-#define TOKEN_H
-
-#include "pdp10-stdint.h"
-
-enum token {
-#define TOKEN(T,P,F)	T,
-#include "token.def"
-#undef TOKEN
-};
-
-union token_attribute {
-    const char *text;		/* symbol, string */
-    pdp10_uint36_t uint;	/* uinteger */
-};
-
-void token_print(FILE *fp, enum token token, const union token_attribute *token_attr);
-
-#endif /* TOKEN_H */
--- a/as/Makefile
+++ b/as/Makefile
@@ -0,0 +1,19 @@
+CC=gcc
+CFLAGS=-O2 -g -Wall
+CPPFLAGS=-I../include
+ASOBJS=assemble.o input.o main.o output.o parse.o scan.o token.o
+LIBOBJS=../lib/pdp10-elf36.o ../lib/pdp10-extint.o ../lib/pdp10-opcodes.o ../lib/pdp10-stdio.o
+
+as:	$(ASOBJS) $(LIBOBJS)
+	$(LINK.c) -o $@ $^
+
+# Header dependencies; keep in sync with the #include lines (assemble.h itself includes input.h).
+assemble.o:	assemble.h input.h
+input.o:	input.h parse.h scan.h token.def token.h
+main.o:		assemble.h input.h output.h
+output.o:	assemble.h input.h output.h
+parse.o:	input.h scan.h token.def token.h
+scan.o:		scan.h token.def token.h
+token.o:	token.def token.h
+
+clean:
+	rm -f $(ASOBJS) as a.out core.*
--- a/as/assemble.c
+++ b/as/assemble.c
@@ -0,0 +1,106 @@
+/*
+ * assemble.c
+ */
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "assemble.h"
+#include "input.h"
+
+static struct aunit_symbol *symbol(const char *progname, struct aunit *aunit, const char *name)
+{
+    struct aunit_symbol *entry = aunit->symbols;
+    /* Look the name up first; an existing entry is returned unchanged. */
+    while (entry != NULL) {
+	if (strcmp(name, entry->name) == 0)
+	    return entry;
+	entry = entry->next;
+    }
+
+    /* Not found: create a fresh local, undefined symbol; NULL is returned if that fails. */
+    entry = malloc(sizeof *entry);
+    if (entry == NULL) {
+	fprintf(stderr, "%s: failed to allocate %zu bytes for aunit_symbol: %s\n", progname, sizeof *entry, strerror(errno));
+	return NULL;
+    }
+    entry->name = name;	/* the pointer is stored as-is, the string is not copied */
+    entry->text_offset = 0;
+    entry->is_global = 0;
+    entry->is_defined = 0;
+    entry->next = aunit->symbols;	/* push onto the front of the list */
+    aunit->symbols = entry;
+    return entry;
+}
+
+int assemble(const char *progname, struct iunit *iunit, struct aunit *aunit)
+{
+    struct stmt *stmt;
+    struct aunit_symbol *sym;
+    pdp10_uint36_t i, n;
+
+    aunit->text_words = NULL;
+    aunit->text_nr_words = 0;
+    aunit->symbols = NULL;
+
+    /* Pass 1: enter every named symbol and count the instructions; returns -1 on any failure. */
+    n = 0;
+    for (stmt = iunit->text.head; stmt; stmt = stmt->next) {
+	switch (stmt->tag) {
+	case S_DOT_GLOBL:
+	case S_LABEL:
+	    sym = symbol(progname, aunit, stmt->u.symbol.name);
+	    if (!sym)	/* allocation failure, already reported by symbol() */
+		return -1;
+	    if (stmt->tag == S_DOT_GLOBL)
+		sym->is_global = 1;
+	    break;
+	case S_INSN:
+	    ++n;
+	    break;
+	default:
+	    break;
+	}
+    }
+
+    aunit->text_nr_words = n;
+    aunit->text_words = malloc(n * sizeof(pdp10_uint36_t));
+    if (!aunit->text_words && n != 0) {	/* malloc(0) may return NULL without that being an error */
+	fprintf(stderr, "%s: failed to allocate %zu bytes for text image: %s\n", progname, n * sizeof(pdp10_uint36_t), strerror(errno));
+	return -1;
+    }
+
+    /* Pass 2: define the labels at the current position and encode one word per instruction. */
+    i = 0;
+    for (stmt = iunit->text.head; stmt; stmt = stmt->next) {
+	switch (stmt->tag) {
+	case S_LABEL:
+	    sym = symbol(progname, aunit, stmt->u.symbol.name);
+	    if (!sym)
+		return -1;
+	    sym->is_defined = 1;
+	    sym->text_offset = i * 4;	/* word index scaled by 4, the same scale output.c uses for sh_size */
+	    break;
+	case S_INSN:
+	    if (i >= n) {
+		fprintf(stderr, "%s: internal error: text image overflow\n", progname);
+		return -1;
+	    }
+	    aunit->text_words[i] =
+		((pdp10_uint36_t)(stmt->u.insn.opcode & 0x1FF) << (36 - 9)
+		 | ((stmt->u.insn.accumulator & 0xF) << (36 - 13))
+		 | ((stmt->u.insn.at & 1) << (36 - 14))
+		 | ((stmt->u.insn.indexreg & 0xF) << (36 - 18))
+		 | (stmt->u.insn.address & PDP10_UINT18_MAX));
+	    ++i;
+	    break;
+	default:
+	    break;
+	}
+    }
+    if (i != n) {
+	fprintf(stderr, "%s: internal error: text image size mismatch\n", progname);
+	return -1;
+    }
+    return 0;
+}
--- a/as/assemble.h
+++ b/as/assemble.h
@@ -0,0 +1,26 @@
+/*
+ * assemble.h
+ */
+#ifndef ASSEMBLE_H
+#define ASSEMBLE_H
+
+#include "pdp10-stdint.h"
+#include "input.h"
+
+struct aunit_symbol {
+    struct aunit_symbol *next;
+    const char *name;
+    pdp10_uint36_t text_offset;
+    int is_global;
+    int is_defined;
+};
+
+struct aunit {
+    pdp10_uint36_t *text_words;
+    pdp10_uint36_t text_nr_words;
+    struct aunit_symbol *symbols;
+};
+
+int assemble(const char *progname, struct iunit *iunit, struct aunit *aunit);
+
+#endif /* ASSEMBLE_H */
--- a/as/input.c
+++ b/as/input.c
@@ -0,0 +1,85 @@
+/*
+ * input.c
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "input.h"
+#include "parse.h"
+#include "scan.h"
+
+static int interpret(struct scan_state *scan_state, struct iunit *iunit, struct stmt *stmt)
+{
+    struct stmt *stmt2;
+
+    switch (stmt->tag) {
+    case S_DOT_GLOBL:
+	break;
+    case S_DOT_TEXT:
+	return 0;	/* XXX: nothing to do yet */
+    case S_LABEL:
+	break;
+    case S_INSN:
+	break;
+    default:
+	fprintf(stderr, "%s: %s line %u: parser returned unexpected stmt->tag %u\n",
+		scan_state->progname, scan_state->filename, scan_state->linenr, stmt->tag);
+	return -1;
+    }
+
+    stmt2 = malloc(sizeof *stmt2);
+    if (!stmt2) {
+	fprintf(stderr, "%s: %s line %u: malloc(%zu) failed: %s\n",
+		scan_state->progname, scan_state->filename, scan_state->linenr, sizeof *stmt2, strerror(errno));
+	return -1;
+    }
+
+    *stmt2 = *stmt;
+    stmt2->next = NULL;
+
+    *iunit->text.tailptr = stmt2;
+    iunit->text.tailptr = &stmt2->next;
+
+    return 0;
+}
+
+int input(const char *progname, char **files, int nrfiles, struct iunit *iunit)
+{
+    char fake_file[3];
+    char *fake_files[1];
+    struct scan_state scan_state;
+    int i;
+    struct stmt stmt;
+    int status;
+
+    if (nrfiles <= 0) {
+	fake_file[0] = '-';
+	fake_file[1] = '-';
+	fake_file[2] = '\0';
+	fake_files[0] = fake_file;
+	files = fake_files;
+	nrfiles = 1;
+    }
+
+    iunit->text.head = NULL;
+    iunit->text.tailptr = &iunit->text.head;
+
+    scan_init(&scan_state, progname);
+
+    for (i = 0; i < nrfiles; ++i) {
+	if (scan_open(&scan_state, files[i]) < 0)
+	    return -1;
+	for (;;) {
+	    status = parse_stmt(&scan_state, &stmt);
+	    if (status < 0)
+		return -1;
+	    if (status == 0)
+		break;
+	    if (interpret(&scan_state, iunit, &stmt) < 0)
+		return -1;
+	}
+    }
+
+    return 0;
+}
--- a/as/input.h
+++ b/as/input.h
@@ -0,0 +1,53 @@
+/*
+ * input.h
+ */
+#ifndef INPUT_H
+#define INPUT_H
+
+/*
+ * A directive, label, or instruction is parsed to a statement, which is
+ * either interpreted immediately or appended to the representation of the
+ * current section.
+ */
+
+enum stmt_tag {
+    /* directives */
+    S_DOT_GLOBL,
+    S_DOT_TEXT,
+    /* non-directives */
+    S_LABEL,
+    S_INSN,
+};
+
+struct stmt {
+    struct stmt *next;
+    enum stmt_tag tag;
+    union {
+	struct {	/* S_DOT_GLOBL, S_LABEL */
+	    const char *name;
+	} symbol;
+	struct {	/* S_INSN */
+	    unsigned int opcode;
+	    unsigned int accumulator;
+	    int at;
+	    unsigned int address;	/* XXX: relocatable expr */
+	    unsigned int indexreg;
+	} insn;
+    } u;
+};
+
+/*
+ * The input unit object is the top-level container for the representation
+ * of the sections, and all other information collected from the input.
+ */
+
+struct iunit {
+    struct {
+	struct stmt *head;
+	struct stmt **tailptr;
+    } text;
+};
+
+int input(const char *progname, char **files, int nrfiles, struct iunit *iunit);
+
+#endif /* INPUT_H */
--- a/as/main.c
+++ b/as/main.c
@@ -0,0 +1,52 @@
+/*
+ * main.c
+ *
+ * as clone for PDP10 with Elf36 object files.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include "assemble.h"
+#include "input.h"
+#include "output.h"
+
+#define VERSION "pdp10-tools as version 0.1, built " __DATE__ " " __TIME__ "\n"
+
+int main(int argc, char **argv)
+{
+    const char *outfile = "a.out";
+    struct iunit iunit;
+    struct aunit aunit;
+
+    for (;;) {
+	int ch;
+
+	ch = getopt(argc, argv, "vo:");
+	switch (ch) {
+	case 'v':
+	    printf(VERSION);
+	    continue;
+	case 'o':
+	    outfile = optarg;
+	    continue;
+	case -1:
+	    break;
+	default:
+	    fprintf(stderr, "Usage: %s [-v] [-o outfile] [files..]\n", argv[0]);
+	    return 1;
+	}
+	break;
+    }
+
+    if (input(argv[0], &argv[optind], argc - optind, &iunit) < 0)
+	return 1;
+
+    if (assemble(argv[0], &iunit, &aunit) < 0)
+	return 1;
+
+    /* XXX: iunit_fini(&iunit) */
+
+    if (output(argv[0], &aunit, outfile) < 0)
+	return 1;
+
+    return 0;
+}
--- a/as/output.c
+++ b/as/output.c
@@ -0,0 +1,325 @@
+/*
+ * output.c
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pdp10-elf36.h"
+#include "pdp10-stdint.h"
+#include "pdp10-stdio.h"
+#include "assemble.h"
+#include "output.h"
+
+struct strtab_entry {
+    struct strtab_entry *next;
+    const char *string;
+    unsigned int nrbytes;	/* strlen(string) + 1 */
+};
+
+struct strtab {
+    struct strtab_entry *head;
+    unsigned int nrbytes;
+};
+
+static void strtab_init(struct strtab *strtab)
+{
+    strtab->head = NULL;
+    strtab->nrbytes = 0;
+}
+
+static pdp10_uint36_t strtab_enter(const char *progname, struct strtab *strtab, const char *name)
+{
+    struct strtab_entry *prev, *here;
+    pdp10_uint36_t index;
+
+    index = 1;
+    prev = NULL;
+    here = strtab->head;
+    while (here != NULL) {
+	if (strcmp(name, here->string) == 0)
+	    return index;
+	index += here->nrbytes;
+	prev = here;
+	here = here->next;
+    }
+
+    here = malloc(sizeof *here);
+    if (!here) {
+	fprintf(stderr, "%s: failed to allocate %zu bytes for a strtab_entry: %s\n",
+		progname, sizeof *here, strerror(errno));
+	return 0;
+    }
+    here->next = NULL;
+    here->string = name;
+    here->nrbytes = strlen(name) + 1;
+
+    if (prev) {
+	prev->next = here;
+    } else {
+	strtab->head = here;
+	index = 1;
+	strtab->nrbytes = 1;
+    }
+
+    strtab->nrbytes += here->nrbytes;
+
+    return index;
+}
+
+static int strtab_write(PDP10_FILE *pdp10fp, const struct strtab *strtab)
+{
+    struct strtab_entry *here;
+    unsigned int i;
+
+    if (pdp10_elf36_write_uint9(pdp10fp, '\0') < 0)	/* leading NUL; strtab_enter() starts indices at 1 */
+	return -1;
+    /* Emit each string with its NUL; go via unsigned char so bytes >= 0x80 are not sign-extended. */
+    for (here = strtab->head; here; here = here->next)
+	for (i = 0; i < here->nrbytes; ++i)
+	    if (pdp10_elf36_write_uint9(pdp10fp, (unsigned char)here->string[i]) < 0)
+		return -1;
+    /* Pad with NULs up to the next multiple of four bytes; returns 0 on success, -1 on write error. */
+    i = (4 - (strtab->nrbytes & 3)) & 3;
+    while (i != 0) {
+	if (pdp10_elf36_write_uint9(pdp10fp, '\0') < 0)
+	    return -1;
+	--i;
+    }
+
+    return 0;
+}
+
+int output(const char *progname, struct aunit *aunit, const char *outfile)
+{
+    pdp10_uint36_t shnum, text_shndx, symtab_shndx, strtab_shndx, shstrtab_shndx;
+    pdp10_uint36_t text_shstrndx, symtab_shstrndx, strtab_shstrndx, shstrtab_shstrndx;
+    Elf36_Sym *symtab;
+    pdp10_uint36_t symnum;
+    struct strtab strtab, shstrtab;
+    struct aunit_symbol *asym;
+    pdp10_uint36_t i;
+    Elf36_Shdr *shtab;
+    pdp10_uint36_t offset;
+    Elf36_Ehdr ehdr;
+    PDP10_FILE *pdp10fp;
+    int status;
+    /* Write aunit to outfile as an ET_REL Elf36 object; returns 0 on success, -1 on failure. */
+    status = -1;	/* every exit goes through 'out', which releases the file and the tables */
+    pdp10fp = NULL;
+    text_shndx = 0;
+    symtab_shndx = 0;
+    strtab_shndx = 0;
+    symtab = NULL;
+    symnum = 0;
+    strtab_init(&strtab);
+    strtab_init(&shstrtab);
+    shtab = NULL;
+    /* Decide which sections are emitted and assign their section header indices. */
+    shnum = 1;	/* tentative */
+
+    if (aunit->text_nr_words != 0) {
+	text_shstrndx = strtab_enter(progname, &shstrtab, ".text");
+	if (text_shstrndx == 0)
+	    goto out;
+	text_shndx = shnum;
+	++shnum;
+    }
+
+    for (asym = aunit->symbols; asym; asym = asym->next)
+	++symnum;
+    if (symnum != 0) {
+	symtab_shstrndx = strtab_enter(progname, &shstrtab, ".symtab");
+	if (symtab_shstrndx == 0)
+	    goto out;
+	strtab_shstrndx = strtab_enter(progname, &shstrtab, ".strtab");
+	if (strtab_shstrndx == 0)
+	    goto out;
+	symtab_shndx = shnum;
+	strtab_shndx = shnum + 1;
+	shnum += 2;
+    }
+
+    if (shnum == 1) {
+	shstrtab_shndx = 0;
+	shnum = 0;
+    } else {
+	shstrtab_shstrndx = strtab_enter(progname, &shstrtab, ".shstrtab");
+	if (shstrtab_shstrndx == 0)
+	    goto out;
+	shstrtab_shndx = shnum;
+	++shnum;
+    }
+    /* Build the Elf36 symbol table in memory. */
+    if (symnum) {
+	++symnum;	/* for initial stub entry */
+	symtab = malloc(symnum * sizeof(Elf36_Sym));
+	if (!symtab) {
+	    fprintf(stderr, "%s: failed to allocate %zu bytes for Elf36 symbol table: %s\n",
+		    progname, symnum * sizeof(Elf36_Sym), strerror(errno));
+	    goto out;
+	}
+
+	symtab[0].st_name = 0;
+	symtab[0].st_value = 0;
+	symtab[0].st_size = 0;
+	symtab[0].st_info = ELF36_ST_INFO(STB_LOCAL, STT_NOTYPE);
+	symtab[0].st_other = 0;
+	symtab[0].st_shndx = SHN_UNDEF;
+
+	for (i = 1, asym = aunit->symbols; asym; ++i, asym = asym->next) {
+	    symtab[i].st_name = strtab_enter(progname, &strtab, asym->name);
+	    if (symtab[i].st_name == 0)
+		goto out;
+	    symtab[i].st_value = asym->text_offset;
+	    symtab[i].st_size = 0;
+	    if (asym->is_global)
+		symtab[i].st_info = ELF36_ST_INFO(STB_GLOBAL, STT_NOTYPE);
+	    else
+		symtab[i].st_info = ELF36_ST_INFO(STB_LOCAL, STT_NOTYPE);
+	    symtab[i].st_other = STV_DEFAULT;
+	    symtab[i].st_shndx = asym->is_defined ? text_shndx : SHN_UNDEF;	/* no label seen: undefined */
+	}
+    }
+    /* Build the section header table and lay out the file offsets of the sections. */
+    if (shnum) {
+	shtab = malloc(shnum * sizeof(Elf36_Shdr));
+	if (!shtab) {
+	    fprintf(stderr, "%s: failed to allocate %zu bytes for Elf36 section header table: %s\n",
+		    progname, shnum * sizeof(Elf36_Shdr), strerror(errno));
+	    goto out;
+	}
+
+	shtab[0].sh_name = 0;
+	shtab[0].sh_type = SHT_NULL;
+	shtab[0].sh_flags = 0;
+	shtab[0].sh_addr = 0;
+	shtab[0].sh_offset = 0;
+	shtab[0].sh_size = 0;
+	shtab[0].sh_link = 0;
+	shtab[0].sh_info = 0;
+	shtab[0].sh_addralign = 0;
+	shtab[0].sh_entsize = 0;
+
+	offset = ELF36_EHDR_SIZEOF;
+
+	if (text_shndx) {
+	    shtab[text_shndx].sh_name = text_shstrndx;
+	    shtab[text_shndx].sh_type = SHT_PROGBITS;
+	    shtab[text_shndx].sh_flags = SHF_ALLOC | SHF_EXECINSTR;
+	    shtab[text_shndx].sh_addr = 0;
+	    shtab[text_shndx].sh_offset = offset;
+	    shtab[text_shndx].sh_size = aunit->text_nr_words * 4;
+	    shtab[text_shndx].sh_link = 0;
+	    shtab[text_shndx].sh_info = 0;
+	    shtab[text_shndx].sh_addralign = 4;
+	    shtab[text_shndx].sh_entsize = 0;
+	    offset += aunit->text_nr_words * 4;
+	}
+
+	if (symtab_shndx) {
+	    shtab[symtab_shndx].sh_name = symtab_shstrndx;
+	    shtab[symtab_shndx].sh_type = SHT_SYMTAB;
+	    shtab[symtab_shndx].sh_flags = 0;
+	    shtab[symtab_shndx].sh_addr = 0;
+	    shtab[symtab_shndx].sh_offset = offset;
+	    shtab[symtab_shndx].sh_size = symnum * ELF36_SYM_SIZEOF;
+	    shtab[symtab_shndx].sh_link = strtab_shndx;
+	    shtab[symtab_shndx].sh_info = 0 + 1;	/* XXX: LAST_LOCAL + 1 */
+	    shtab[symtab_shndx].sh_addralign = 4;
+	    shtab[symtab_shndx].sh_entsize = ELF36_SYM_SIZEOF;
+	    offset += symnum * ELF36_SYM_SIZEOF;
+	}
+
+	if (strtab_shndx) {
+	    shtab[strtab_shndx].sh_name = strtab_shstrndx;
+	    shtab[strtab_shndx].sh_type = SHT_STRTAB;
+	    shtab[strtab_shndx].sh_flags = 0;
+	    shtab[strtab_shndx].sh_addr = 0;
+	    shtab[strtab_shndx].sh_offset = offset;
+	    shtab[strtab_shndx].sh_size = strtab.nrbytes;
+	    shtab[strtab_shndx].sh_link = 0;
+	    shtab[strtab_shndx].sh_info = 0;
+	    shtab[strtab_shndx].sh_addralign = 1;
+	    shtab[strtab_shndx].sh_entsize = 0;
+	    offset += (strtab.nrbytes + 3) & ~3;
+	}
+
+	if (shstrtab_shndx) {
+	    shtab[shstrtab_shndx].sh_name = shstrtab_shstrndx;
+	    shtab[shstrtab_shndx].sh_type = SHT_STRTAB;
+	    shtab[shstrtab_shndx].sh_flags = 0;
+	    shtab[shstrtab_shndx].sh_addr = 0;
+	    shtab[shstrtab_shndx].sh_offset = offset;
+	    shtab[shstrtab_shndx].sh_size = shstrtab.nrbytes;
+	    shtab[shstrtab_shndx].sh_link = 0;
+	    shtab[shstrtab_shndx].sh_info = 0;
+	    shtab[shstrtab_shndx].sh_addralign = 1;
+	    shtab[shstrtab_shndx].sh_entsize = 0;
+	    offset += (shstrtab.nrbytes + 3) & ~3;
+	}
+
+	/* offset is now the offset of the section header table, which is last in the file */
+    } else
+	offset = 0;
+
+    ehdr.e_wident[0] = (((pdp10_uint36_t)ELFMAG0 << 28)
+			| (ELFMAG1 << 20)
+			| (ELFMAG2 << 12)
+			| (ELFMAG3 << 4)
+			| (ELFCLASS36 >> 4));
+    ehdr.e_wident[1] = (((pdp10_uint36_t)(ELFCLASS36 & 0x0f) << 32)
+			| (ELFDATA2MSB << 24)
+			| (EV_CURRENT << 16)
+			| (ELFOSABI_NONE << 8)
+			| 0);	/* EI_ABIVERSION */
+    ehdr.e_wident[2] = 0;
+    ehdr.e_wident[3] = 0;
+    ehdr.e_type = ET_REL;
+    ehdr.e_machine = EM_PDP10;
+    ehdr.e_version = EV_CURRENT;
+    ehdr.e_entry = 0;
+    ehdr.e_phoff = 0;
+    ehdr.e_shoff = offset;
+    ehdr.e_flags = 0;
+    ehdr.e_ehsize = ELF36_EHDR_SIZEOF;
+    ehdr.e_phentsize = 0;
+    ehdr.e_phnum = 0;
+    ehdr.e_shentsize = ELF36_SHDR_SIZEOF;
+    ehdr.e_shnum = shnum;
+    ehdr.e_shstrndx = shstrtab_shndx;
+
+    pdp10fp = pdp10_fopen(outfile, "wb");
+    if (!pdp10fp) {
+	fprintf(stderr, "%s: failed to open %s: %s\n", progname, outfile, strerror(errno));
+	goto out;
+    }
+
+    if (pdp10_elf36_write_ehdr(pdp10fp, &ehdr) < 0)
+	goto out;
+    if (text_shndx)
+	for (i = 0; i < aunit->text_nr_words; ++i)
+	    if (pdp10_elf36_write_uint36(pdp10fp, aunit->text_words[i]) < 0)
+		goto out;
+    if (symtab_shndx)
+	for (i = 0; i < symnum; ++i)
+	    if (pdp10_elf36_write_sym(pdp10fp, &symtab[i]) < 0)
+		goto out;
+    if (strtab_shndx)
+	if (strtab_write(pdp10fp, &strtab) < 0)
+	    goto out;
+    if (shstrtab_shndx)
+	if (strtab_write(pdp10fp, &shstrtab) < 0)
+	    goto out;
+    for (i = 0; i < shnum; ++i)
+	if (pdp10_elf36_write_shdr(pdp10fp, &shtab[i]) < 0)
+	    goto out;
+
+    status = 0;
+out:
+    if (pdp10fp)
+	pdp10_fclose(pdp10fp);
+    free(shtab);
+    free(symtab);
+    return status;
+}
--- a/as/output.h
+++ b/as/output.h
@@ -0,0 +1,11 @@
+/*
+ * output.h
+ */
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+#include "assemble.h"
+
+int output(const char *progname, struct aunit *aunit, const char *outfile);
+
+#endif /* OUTPUT_H */
--- a/as/parse.c
+++ b/as/parse.c
@@ -0,0 +1,354 @@
+/*
+ * parse.c
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pdp10-opcodes.h"
+#include "input.h"	/* for struct stmt */
+#include "scan.h"
+#include "token.h"
+
+static int error(struct scan_state *scan_state, const char *msg, enum token token, const union token_attribute *token_attr)
+{
+    fprintf(stderr, "%s: %s line %u: syntax error: %s; current token is ",
+	    scan_state->progname, scan_state->filename, scan_state->linenr, msg);
+    token_print(stderr, token, token_attr);
+    fprintf(stderr, "\n");
+    return -1;
+}
+
+static int parse_dot_globl(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);	/* operand: ".globl" <symbol> <newline> */
+    if (token != T_SYMBOL)
+	return error(scan_state, "missing <symbol> in .globl directive", token, &token_attr);
+    stmt->u.symbol.name = token_attr.text;
+
+    token = scan_token(scan_state, &token_attr);	/* nothing else may follow on the line */
+    if (token != T_NEWLINE)
+	return error(scan_state, "junk after .globl directive", token, &token_attr);
+    stmt->tag = S_DOT_GLOBL;
+    return 1;	/* a statement was parsed; errors return error()'s -1 */
+}
+
+static int parse_dot_text(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token == T_NEWLINE) {
+	stmt->tag = S_DOT_TEXT;
+	return 1;
+    }
+    return error(scan_state, "junk after .text directive", token, &token_attr);
+}
+
+/*
+ * Recognize:
+ *
+ * <label> ::= <symbol> ":"
+ *
+ * <insn> ::= <symbol> (<accumulator> ",")? <address> <newline>
+ *
+ * <accumulator> ::= <uinteger> [uint <= 0xF]
+ *
+ * <address> ::= "@"? <displacement>? <index>?
+ *
+ * <displacement> ::= <uinteger> [uint <= 2^18 - 1]
+ * <displacement> ::= "(" <uinteger> ")" [uint <= 2^18 - 1]
+ *
+ * <index> ::= "(" <indexreg> ")"
+ * <indexreg> ::= <uinteger> [uint <= 0xF]
+ *
+ * Examples:
+ * foo:
+ * popj 17,
+ * pushj 17,bar
+ * movei 1,@fum(2)
+ *
+ * Ambiguous examples:
+ *
+ * <symbol> (<uinteger>) <newline>
+ *
+ * This is ambiguous since we have no special notation for <register>, and the same kind of
+ * parentheses are used for expression grouping in the displacement as for the index register.
+ *
+ * This might denote an insn with a parenthesized displacement and no index,
+ * or it might denote an insn with an index but no displacement.
+ *
+ * However, the uinteger in an indexreg cannot be > 0xF, and it rarely makes sense to form
+ * an effective address with a displacement <= 0xF and no index.
+ *
+ * Therefore, if the uinteger is <= 0xF this is an index with no displacement,
+ * otherwise it is a displacement without an index.
+ */
+
+static int parse_insn_index_after_lparen(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token != T_UINTEGER
+	|| token_attr.uint > 0xF)
+	return error(scan_state, "invalid <indexreg>", token, &token_attr);
+
+    stmt->u.insn.indexreg = token_attr.uint;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token != T_RPAREN)
+	return error(scan_state, "junk after '(' <indexreg>", token, &token_attr);
+
+    token = scan_token(scan_state, &token_attr);
+    if (token != T_NEWLINE)
+	return error(scan_state, "junk after '(' <indexreg> ')'", token, &token_attr);
+
+    return 1;
+}
+
+static int parse_insn_address_after_lparen_uinteger_rparen(struct scan_state *scan_state, struct stmt *stmt, union token_attribute *uinteger_attr)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    switch (token) {
+    case T_NEWLINE:	/* might be <displacement> or <index>, inspect the <uinteger>'s value to disambiguate */
+	if (uinteger_attr->uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", T_UINTEGER, uinteger_attr);
+	if (uinteger_attr->uint <= 0xF)	/* it's the <index> */
+	    stmt->u.insn.indexreg = uinteger_attr->uint;
+	else				/* it's the <displacement> */
+	    stmt->u.insn.address = uinteger_attr->uint;
+	return 1;
+    case T_LPAREN:	/* the <uinteger> is the <displacement>, followed by <index> */
+	if (uinteger_attr->uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", T_UINTEGER, uinteger_attr);
+	stmt->u.insn.address = uinteger_attr->uint;
+	return parse_insn_index_after_lparen(scan_state, stmt);
+    default:
+	return error(scan_state, "junk in <address> after '(' <uinteger> ')'", token, &token_attr);
+    }
+}
+
+static int parse_insn_address_after_lparen_uinteger(struct scan_state *scan_state, struct stmt *stmt, union token_attribute *uinteger_attr)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    switch (token) {
+    case T_RPAREN:	/* might be <displacement> or <index> */
+	return parse_insn_address_after_lparen_uinteger_rparen(scan_state, stmt, uinteger_attr);
+    default:
+	return error(scan_state, "junk in <address> after '(' <uinteger>", token, &token_attr);
+    }
+}
+
+static int parse_insn_address_after_lparen(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    switch (token) {
+    case T_UINTEGER:	/* might be <displacement> or <index> */
+	return parse_insn_address_after_lparen_uinteger(scan_state, stmt, &token_attr);
+    default:
+	return error(scan_state, "junk in <address> after '('", token, &token_attr);
+    }
+}
+
+static int parse_insn_after_displacement(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    switch (token) {
+    case T_NEWLINE:	/* no <index> */
+	return 1;
+    case T_LPAREN:	/* need <index> */
+	return parse_insn_index_after_lparen(scan_state, stmt);
+    default:
+	return error(scan_state, "junk in <address> after <displacement>", token, &token_attr);
+    }
+}
+
+static int parse_insn_address_after_at(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    switch (token) {
+    case T_NEWLINE:
+	return 1;
+    case T_LPAREN:	/* might be <displacement> or <index> */
+	return parse_insn_address_after_lparen(scan_state, stmt);
+    case T_UINTEGER:
+	if (token_attr.uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", token, &token_attr);
+	stmt->u.insn.address = token_attr.uint;
+	return parse_insn_after_displacement(scan_state, stmt);
+    default:
+	return error(scan_state, "invalid <address>", token, &token_attr);
+    }
+}
+
+static int parse_insn_address(struct scan_state *scan_state, struct stmt *stmt, const struct pdp10_instruction *insndesc)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token == T_NEWLINE)
+	return 1;
+
+    if (insndesc->type & PDP10_E_UNUSED)
+	return error(scan_state, "<address> not allowed in this instruction", token, &token_attr);
+
+    switch (token) {
+    case T_LPAREN:	/* might be <displacement> or <index> */
+	return parse_insn_address_after_lparen(scan_state, stmt);
+    case T_UINTEGER:
+	if (token_attr.uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", token, &token_attr);
+	stmt->u.insn.address = token_attr.uint;
+	return parse_insn_after_displacement(scan_state, stmt);
+    case T_AT:
+	stmt->u.insn.at = 1;
+	return parse_insn_address_after_at(scan_state, stmt);
+    default:
+	return error(scan_state, "invalid <address>", token, &token_attr);
+    }
+}
+
+static int parse_insn_after_symbol_uinteger(
+    struct scan_state *scan_state, struct stmt *stmt, const struct pdp10_instruction *insndesc, union token_attribute *uinteger_attr)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token == T_COMMA) {	/* the <uinteger> is the <accumulator> */
+	if (uinteger_attr->uint > 0xF)
+	    return error(scan_state, "invalid <accumulator>", T_UINTEGER, uinteger_attr);
+	if (insndesc->type & (PDP10_A_OPCODE | PDP10_A_UNUSED))
+	    return error(scan_state, "<accumulator> not allowed in this instruction", T_UINTEGER, uinteger_attr);
+	stmt->u.insn.accumulator = uinteger_attr->uint;
+	return parse_insn_address(scan_state, stmt, insndesc);
+    }
+
+    if (insndesc->type & PDP10_E_UNUSED)
+	return error(scan_state, "<address> not allowed in this instruction", token, &token_attr);
+
+    switch (token) {
+    case T_LPAREN:	/* the <uinteger> is the <displacement>, followed by <index> */
+	if (uinteger_attr->uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", T_UINTEGER, uinteger_attr);
+	stmt->u.insn.address = uinteger_attr->uint;
+	return parse_insn_index_after_lparen(scan_state, stmt);
+    case T_NEWLINE:	/* the <uinteger> is the <displacement>, there is no <accumulator> or <index> */
+	if (uinteger_attr->uint > PDP10_UINT18_MAX)
+	    return error(scan_state, "invalid <displacement>", T_UINTEGER, uinteger_attr);
+	stmt->u.insn.address = uinteger_attr->uint;
+	return 1;
+    default:
+	return error(scan_state, "junk after <symbol> <uinteger>", token, &token_attr);
+    }
+}
+
+static int parse_after_symbol(struct scan_state *scan_state, struct stmt *stmt, union token_attribute *symbol_attr)
+{
+    enum token token;
+    union token_attribute token_attr;
+    const struct pdp10_instruction *insndesc;
+
+    token = scan_token(scan_state, &token_attr);
+    if (token == T_COLON) {
+	stmt->u.symbol.name = symbol_attr->text;
+	stmt->tag = S_LABEL;
+	return 1;
+    }
+
+    insndesc = pdp10_instruction_from_name(symbol_attr->text);
+    if (!insndesc)
+	return error(scan_state, "invalid instruction name", T_SYMBOL, symbol_attr);
+
+    stmt->tag = S_INSN;
+    stmt->u.insn.at = 0;
+    stmt->u.insn.address = 0;
+    stmt->u.insn.indexreg = 0;
+
+    if (insndesc->type & PDP10_A_OPCODE) {
+	/* XXX: this is too intimate with quirky ->opcode representation */
+	stmt->u.insn.opcode = (insndesc->opcode >> 6) & 0x1FF;
+	stmt->u.insn.accumulator = (insndesc->opcode >> 2) & 0xF;
+    } else {
+	stmt->u.insn.opcode = insndesc->opcode & 0x1FF;
+	stmt->u.insn.accumulator = 0;
+    }
+
+    switch (token) {
+    case T_NEWLINE:
+	return 1;
+    case T_UINTEGER:	/* might be <accumulator> or <displacement> */
+	return parse_insn_after_symbol_uinteger(scan_state, stmt, insndesc, &token_attr);
+    default:
+	break;
+    }
+
+    if (insndesc->type & PDP10_E_UNUSED)
+	return error(scan_state, "<address> not allowed in this instruction", token, &token_attr);
+
+    switch (token) {
+    case T_AT:
+	stmt->u.insn.at = 1;
+	return parse_insn_address_after_at(scan_state, stmt);
+    case T_LPAREN:	/* might be <displacement> or <index> */
+	return parse_insn_address_after_lparen(scan_state, stmt);
+    default:
+	return error(scan_state, "junk after instruction name", token, &token_attr);
+    }
+}
+
+int parse_stmt(struct scan_state *scan_state, struct stmt *stmt)
+{
+    enum token token;
+    union token_attribute token_attr;
+
+    for (;;) {
+	token = scan_token(scan_state, &token_attr);
+	switch (token) {
+	    /*
+	     * directives
+	     */
+	case T_DOT_GLOBL:
+	    return parse_dot_globl(scan_state, stmt);
+	case T_DOT_TEXT:
+	    return parse_dot_text(scan_state, stmt);
+	    /*
+	     * other symbols
+	     */
+	case T_SYMBOL:	/* start of label, insn, or symbol assignment */
+	    return parse_after_symbol(scan_state, stmt, &token_attr);
+	    /*
+	     * synthetic symbols
+	     */
+	case T_ERROR:
+	    return -1;	/* diagnostics already emitted by scan.c */
+	case T_EOF:
+	    return 0;
+	case T_NEWLINE:
+	    continue;
+	default:
+	    return error(scan_state, "expected directive, label, or instruction", token, &token_attr);
+	}
+    }
+}
--- a/as/parse.h
+++ b/as/parse.h
@@ -0,0 +1,12 @@
+/*
+ * parse.h
+ */
+#ifndef PARSE_H
+#define PARSE_H
+
+#include "input.h"	/* for struct stmt */
+#include "scan.h"
+
+/*
+ * Parse the next statement read via SCAN_STATE into *STMT, skipping empty lines.
+ * Returns 0 at end of input and -1 if the scanner reported an error; any other
+ * result comes from the directive/label/instruction sub-parsers in parse.c
+ * (NOTE(review): presumably > 0 for a parsed statement and < 0 for a parse
+ * error -- confirm against parse.c).
+ */
+int parse_stmt(struct scan_state *scan_state, struct stmt *stmt);
+
+#endif /* PARSE_H */
--- a/as/scan.c
+++ b/as/scan.c
@@ -0,0 +1,239 @@
+/*
+ * scan.c
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "scan.h"
+#include "token.h"
+
+/*
+ * Put SCAN_STATE into its initial condition: diagnostics are attributed to
+ * PROGNAME, the input is reported as "<stdin>", and line counting starts at 1.
+ */
+void scan_init(struct scan_state *scan_state, const char *progname)
+{
+    scan_state->linenr = 1;
+    scan_state->filename = "<stdin>";
+    scan_state->progname = progname;
+}
+
+/*
+ * Redirect stdin to read from FILENAME.  The name "--" selects standard input
+ * itself, which is reopened via /dev/stdin and reported as "<stdin>" in later
+ * diagnostics.
+ *
+ * Returns 0 on success.  On failure a message is written to stderr and -1 is
+ * returned.
+ */
+int scan_open(struct scan_state *scan_state, const char *filename)
+{
+    const char *path = filename;
+
+    if (strcmp(filename, "--") == 0) {
+	path = "/dev/stdin";
+	scan_state->filename = "<stdin>";
+    } else {
+	scan_state->filename = filename;
+    }
+
+    if (freopen(path, "r", stdin) != NULL)
+	return 0;
+
+    fprintf(stderr, "%s: Error opening %s: %s\n", scan_state->progname, path, strerror(errno));
+    return -1;
+}
+
+/*
+ * Push CH back onto stdin so the next read sees it again.  EOF is never
+ * pushed back.  A failing ungetc() is reported on stderr but not propagated.
+ */
+static void scan_ungetc(struct scan_state *scan_state, int ch)
+{
+    if (ch == EOF)
+	return;
+
+    if (ungetc(ch, stdin) != EOF)
+	return;
+
+    fprintf(stderr, "%s: %s line %u: ungetc %d failed: %s\n",
+	    scan_state->progname, scan_state->filename, scan_state->linenr, ch, strerror(errno));
+}
+
+static int scan_getchar(void)
+{
+    return fgetc(stdin);
+}
+
+/*
+ * Report an invalid input character on stderr.
+ *
+ * CH is shown as <EOF>, as the quoted character itself when it is printable
+ * ASCII (' ' through '~'), or otherwise as a quoted three-digit octal escape of
+ * its low eight bits.  CONTEXT is appended verbatim after the character and
+ * may be the empty string.
+ */
+static void badchar(struct scan_state *scan_state, int ch, const char *context)
+{
+    char shown[7];	/* longest form is '\ooo' followed by NUL */
+
+    if (ch == EOF)
+	strcpy(shown, "<EOF>");
+    else if (ch < ' ' || ch > '~')
+	snprintf(shown, sizeof shown, "'\\%03o'", (unsigned int)ch & 0377);
+    else
+	snprintf(shown, sizeof shown, "'%c'", ch);
+
+    fprintf(stderr, "%s: %s, line %u: invalid character %s%s\n",
+	    scan_state->progname, scan_state->filename, scan_state->linenr, shown, context);
+}
+
+/*
+ * Map a digit character to its numeric value: '0'-'9' give 0-9 and the letters
+ * 'a'-'f' / 'A'-'F' give 10-15.  Any other character (EOF included) yields
+ * -1U, which is larger than every supported base and so always fails a
+ * "value < base" test.
+ */
+static unsigned int get_chval(int ch)
+{
+    if (ch >= '0' && ch <= '9')
+	return ch - '0';
+    if (ch >= 'a' && ch <= 'f')
+	return 10 + (ch - 'a');
+    if (ch >= 'A' && ch <= 'F')
+	return 10 + (ch - 'A');
+    return -1U;
+}
+
+/*
+ * Return 1 if CH may appear after the first character of a symbol (ASCII
+ * letters, decimal digits, '_', '$' and '.'), otherwise 0.
+ */
+static int is_symbol_internal_char(int ch)
+{
+    switch (ch) {
+    case '_':
+    case '$':
+    case '.':
+	return 1;
+    default:
+	break;
+    }
+
+    if ('0' <= ch && ch <= '9')
+	return 1;
+    if ('a' <= ch && ch <= 'z')
+	return 1;
+    if ('A' <= ch && ch <= 'Z')
+	return 1;
+    return 0;
+}
+
+/*
+ * Scan a symbol whose first character CH has already been read.
+ *
+ * Characters are collected while is_symbol_internal_char() accepts them; the
+ * first rejected character is pushed back.  A name that starts with '.' is
+ * looked up among the reserved symbols (tokens [0,T_SYMBOL[ of token_info[]),
+ * and on a match that token is returned with *TOKEN_ATTR left untouched.
+ * Any other name is copied into freshly malloc()ed storage, stored in
+ * TOKEN_ATTR->text, and T_SYMBOL is returned.
+ *
+ * Returns T_ERROR after printing a diagnostic if the name exceeds 127
+ * characters or if the allocation fails.
+ */
+static enum token do_symbol(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
+{
+    char name[128];	/* 127 chars + NUL, XXX: make it dynamic */
+    unsigned int namelen = 0;
+    char *copy;
+
+    for (;;) {
+	/* always keep one byte free for the terminating NUL */
+	if (namelen >= sizeof name - 1) {
+	    fprintf(stderr, "%s: %s line %u: too long symbol\n",
+		    scan_state->progname, scan_state->filename, scan_state->linenr);
+	    return T_ERROR;
+	}
+	name[namelen++] = ch;
+	ch = scan_getchar();
+	if (!is_symbol_internal_char(ch))
+	    break;
+    }
+    name[namelen] = '\0';
+    scan_ungetc(scan_state, ch);
+
+    if (name[0] == '.') {
+	/* binary search over the sorted reserved symbols, see token.def;
+	   the candidates are the half-open range [lo,hi[ */
+	unsigned int lo = 0;
+	unsigned int hi = T_SYMBOL;
+
+	while (lo < hi) {
+	    unsigned int mid = (lo + hi) / 2;
+	    int cmp = strcmp(name, token_info[mid].print_name);
+
+	    if (cmp == 0)
+		return (enum token)mid;
+	    if (cmp < 0)
+		hi = mid;
+	    else
+		lo = mid + 1;
+	}
+    }
+
+    /* not reserved: hand the caller its own copy of the name */
+    copy = malloc(namelen + 1);
+    if (copy == NULL) {
+	fprintf(stderr, "%s: %s line %u: malloc(%u) failed: %s\n",
+		scan_state->progname, scan_state->filename, scan_state->linenr, namelen + 1, strerror(errno));
+	return T_ERROR;
+    }
+    memcpy(copy, name, namelen + 1);
+    token_attr->text = copy;
+    return T_SYMBOL;
+}
+
+/*
+ * Scan an unsigned integer literal whose first digit CH has already been read.
+ *
+ * A leading '0' selects octal, "0x"/"0X" selects hexadecimal (at least one hex
+ * digit must follow), anything else is decimal.  Digits are consumed for as
+ * long as they are valid in the selected base; the first character that is
+ * not is pushed back.
+ *
+ * On success the value is stored in TOKEN_ATTR->uint and T_UINTEGER is
+ * returned.  T_ERROR is returned, after a diagnostic on stderr, if "0x" is not
+ * followed by a hex digit or if the value does not fit in 36 bits.
+ */
+static enum token do_number(struct scan_state *scan_state, union token_attribute *token_attr, int ch)
+{
+    /* Largest 36-bit value.  Shifting in two steps keeps every shift count
+       below 36, so this is well-defined even if pdp10_uint36_t is exactly
+       36 bits wide (assumes it is an unsigned type of at least 36 bits). */
+    const pdp10_uint36_t maxval = (((pdp10_uint36_t)1 << 35) << 1) - 1;
+    unsigned int base, chval;
+    pdp10_uint36_t numval;
+
+    base = (ch == '0') ? 8 : 10;
+    numval = ch - '0';
+
+    /* handle 0x<first hexdig> */
+    ch = scan_getchar();
+    if (base == 8 && (ch == 'x' || ch == 'X')) {
+	base = 16;
+	/* must have at least one hex digit after 0x */
+	ch = scan_getchar();
+	chval = get_chval(ch);
+	if (chval >= 16) {
+	    badchar(scan_state, ch, " after 0x in hexadecimal literal");
+	    return T_ERROR;
+	}
+	numval = chval;
+	ch = scan_getchar();
+    }
+
+    /* the number is non-empty, consume and accumulate trailing
+       characters as long as they are valid in the base */
+    for (;;) {
+	chval = get_chval(ch);
+	if (chval >= base)
+	    break;
+	/* reject the digit if numval * base + chval would exceed maxval;
+	   testing before the multiplication means nothing can wrap */
+	if (numval > (maxval - chval) / base) {
+	    fprintf(stderr, "%s: %s line %u: integer literal out of range\n",
+		    scan_state->progname, scan_state->filename, scan_state->linenr);
+	    return T_ERROR;
+	}
+	numval = numval * base + chval;
+	ch = scan_getchar();
+    }
+
+    /* XXX: check for <decimal>{b,f} which is a local label reference */
+
+    /* plain integer literal */
+    scan_ungetc(scan_state, ch);
+    token_attr->uint = numval;
+    return T_UINTEGER;
+}
+
+/*
+ * Read and return the next token from stdin.
+ *
+ * Blanks, tabs, carriage returns and form feeds are skipped.  A newline
+ * increments SCAN_STATE->linenr and is returned as T_NEWLINE.  For T_SYMBOL
+ * and T_UINTEGER the token's value is stored in *TOKEN_ATTR by do_symbol() and
+ * do_number() respectively.  A character that cannot start any token is
+ * reported on stderr and yields T_ERROR.
+ */
+enum token scan_token(struct scan_state *scan_state, union token_attribute *token_attr)
+{
+    int ch;
+
+    /* skip whitespace other than newline, which is a token of its own */
+    do {
+	ch = scan_getchar();
+    } while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\f');
+
+    switch (ch) {
+    case EOF:
+	return T_EOF;
+    case '\n':
+	++scan_state->linenr;
+	return T_NEWLINE;
+    case '(':
+	return T_LPAREN;
+    case ')':
+	return T_RPAREN;
+    case ',':
+	return T_COMMA;
+    case ':':
+	return T_COLON;
+    case '@':
+	return T_AT;
+    case '.':
+	/* Dot may start a floating point literal, but tests show that
+	   gcc always outputs floating point values as integer literals,
+	   so we shouldn't have to support floating point literals at all.  */
+    case '$':
+    case '_':
+	return do_symbol(scan_state, token_attr, ch);
+    default:
+	break;
+    }
+
+    if ('0' <= ch && ch <= '9')	/* number or <decimal>{b,f} */
+	return do_number(scan_state, token_attr, ch);
+
+    if (('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z'))
+	return do_symbol(scan_state, token_attr, ch);
+
+    badchar(scan_state, ch, "");
+    return T_ERROR;
+}
--- a/as/scan.h
+++ b/as/scan.h
@@ -0,0 +1,19 @@
+/*
+ * scan.h
+ */
+#ifndef SCAN_H
+#define SCAN_H
+
+#include "token.h"
+
+/*
+ * Scanner context passed to every scan_*() call.  It only carries what the
+ * diagnostics need; the input itself is always read from stdin.
+ */
+struct scan_state {
+    const char *progname;	/* for diagnostics, does not change after scan_init() */
+    const char *filename;	/* for diagnostics, set by scan_open() */
+    unsigned int linenr;	/* for diagnostics, set to 1 by scan_init(), incremented by scan_token() at each newline */
+};
+
+/* Initialise *scan_state: remember progname, report the input as "<stdin>", start at line 1. */
+void scan_init(struct scan_state *scan_state, const char *progname);
+/* Reopen stdin on filename ("--" means standard input itself).
+   Returns 0 on success, or -1 after printing a diagnostic to stderr. */
+int scan_open(struct scan_state *scan_state, const char *filename);
+/* Return the next token from stdin.  For T_SYMBOL, token_attr->text is set to a
+   malloc()ed copy of the name; for T_UINTEGER, token_attr->uint is set to the value.
+   T_ERROR is returned after a diagnostic has been printed to stderr. */
+enum token scan_token(struct scan_state *scan_state, union token_attribute *token_attr);
+
+#endif /* SCAN_H */
--- a/as/0LD/token.c
+++ b/as/0LD/token.c
@@ -5,19 +5,7 @@
 #include "pdp10-inttypes.h"
 #include "token.h"

-enum {
-    FMT_NONE = 0,
-    FMT_UINT = 1,
-    FMT_SYMBOL = 2,
-    FMT_STRING = 3,
-};
-
-struct token_info {
-    char print_name[15];
-    unsigned char attribute_fmt;
-};
-
-static const struct token_info token_info[] = {
+const struct token_info token_info[] = {
 #define TOKEN(T,P,F) { P, F },
 #include "token.def"
 #undef TOKEN
@@ -39,13 +27,13 @@ void token_print(FILE *fp, enum token token, const union token_attribute *token_
 	return;

    switch (ti->attribute_fmt) {
-    case FMT_UINT:
-	fprintf(fp, " [%" PDP10_PRIu36 "u]", token_attr->uint);
+    case TAFMT_UINT:
+	fprintf(fp, " [%" PDP10_PRIu36 "]", token_attr->uint);
 	break;
-    case FMT_SYMBOL:
+    case TAFMT_SYMBOL:
 	fprintf(fp, " [%s]", token_attr->text);
 	break;
-    case FMT_STRING:
+    case TAFMT_STRING:
 	fprintf(fp, " [\"%s\"]", token_attr->text);
 	break;
    default:
--- a/as/token.def
+++ b/as/token.def
@@ -0,0 +1,84 @@
+/*
+ * token.def
+ *
+ * TOKEN(T_<name>, <print name>, <attribute fmt>)
+ */
+
+/* reserved symbols including directives; MUST come first and MUST be listed in increasing alphanumeric order */
+/* (the scanner finds them by binary search with strcmp() over tokens [0,T_SYMBOL[, so a
+   misplaced entry would silently be scanned as an ordinary symbol) */
+TOKEN(T_DOT_GLOBL, ".globl", TAFMT_NONE)
+TOKEN(T_DOT_TEXT, ".text", TAFMT_NONE)
+/* non-reserved symbols; T_SYMBOL MUST be the first token after the list of reserved symbols */
+/* (its value is the exclusive upper bound of the scanner's reserved-symbol search) */
+TOKEN(T_SYMBOL, "<symbol>", TAFMT_SYMBOL)
+/* literals */
+TOKEN(T_UINTEGER, "<integer>", TAFMT_UINT)
+/* special symbols including operators and separators */
+TOKEN(T_AT, "@", TAFMT_NONE)
+TOKEN(T_COLON, ":", TAFMT_NONE)
+TOKEN(T_COMMA, ",", TAFMT_NONE)
+TOKEN(T_LPAREN, "(", TAFMT_NONE)
+TOKEN(T_RPAREN, ")", TAFMT_NONE)
+/* synthetic symbols; these never appear literally in the input */
+TOKEN(T_NEWLINE, "<newline>", TAFMT_NONE)
+TOKEN(T_EOF, "<eof>", TAFMT_NONE)
+TOKEN(T_ERROR, "<error>", TAFMT_NONE)
+
+/* XXX: old tokens not yet resurrected */
+#if 0
+TOKEN(T_DOT_ALIGN, ".align", TAFMT_NONE)
+TOKEN(T_DOT_ASCII, ".ascii", TAFMT_NONE)
+TOKEN(T_DOT_ASCIZ, ".asciz", TAFMT_NONE)
+TOKEN(T_DOT_BALIGN, ".balign", TAFMT_NONE)
+TOKEN(T_DOT_BSS, ".bss", TAFMT_NONE)
+TOKEN(T_DOT_BYTE, ".byte", TAFMT_NONE)
+TOKEN(T_DOT_COMM, ".comm", TAFMT_NONE)
+TOKEN(T_DOT_DATA, ".data", TAFMT_NONE)
+TOKEN(T_DOT_FILE, ".file", TAFMT_NONE)
+TOKEN(T_DOT_HIDDEN, ".hidden", TAFMT_NONE)
+TOKEN(T_DOT_IDENT, ".ident", TAFMT_NONE)
+TOKEN(T_DOT_INTERNAL, ".internal", TAFMT_NONE)
+TOKEN(T_DOT_LOCAL, ".local", TAFMT_NONE)
+TOKEN(T_DOT_LONG, ".long", TAFMT_NONE)
+TOKEN(T_DOT_ORG, ".org", TAFMT_NONE)
+TOKEN(T_DOT_P2ALIGN, ".p2align", TAFMT_NONE)
+TOKEN(T_DOT_POPSECTION, ".popsection", TAFMT_NONE)
+TOKEN(T_DOT_PREVIOUS, ".previous", TAFMT_NONE)
+TOKEN(T_DOT_PROTECTED, ".protected", TAFMT_NONE)
+TOKEN(T_DOT_PUSHSECTION, ".pushsection", TAFMT_NONE)
+TOKEN(T_DOT_RODATA, ".rodata", TAFMT_NONE)
+TOKEN(T_DOT_SECTION, ".section", TAFMT_NONE)
+TOKEN(T_DOT_SET, ".set", TAFMT_NONE)
+TOKEN(T_DOT_SHORT, ".short", TAFMT_NONE)
+TOKEN(T_DOT_SIZE, ".size", TAFMT_NONE)
+TOKEN(T_DOT_SUBSECTION, ".subsection", TAFMT_NONE)
+TOKEN(T_DOT_SYMVER, ".symver", TAFMT_NONE)
+TOKEN(T_DOT_TYPE, ".type", TAFMT_NONE)
+TOKEN(T_DOT_WEAK, ".weak", TAFMT_NONE)
+TOKEN(T_DOT_WEAKREF, ".weakref", TAFMT_NONE)
+/* other symbols */
+TOKEN(T_REGISTER, "<register>", TAFMT_UINT)
+TOKEN(T_LOCAL_LABEL, "<local label>", TAFMT_UINT)	/* 1f, 2b */
+/* literals */
+TOKEN(T_STRING, "<string>", TAFMT_STRING)
+/* operators, separators */
+TOKEN(T_TILDE, "~", TAFMT_NONE)
+TOKEN(T_MUL, "*", TAFMT_NONE)
+TOKEN(T_DIV, "/", TAFMT_NONE)
+TOKEN(T_REM, "%", TAFMT_NONE)
+TOKEN(T_LSHIFT, "<<", TAFMT_NONE)
+TOKEN(T_RSHIFT, ">>", TAFMT_NONE)
+TOKEN(T_OR, "|", TAFMT_NONE)
+TOKEN(T_AND, "&", TAFMT_NONE)
+TOKEN(T_CARET, "^", TAFMT_NONE)
+TOKEN(T_BANG, "!", TAFMT_NONE)
+TOKEN(T_PLUS, "+", TAFMT_NONE)
+TOKEN(T_MINUS, "-", TAFMT_NONE)
+TOKEN(T_EQ, "=", TAFMT_NONE)
+TOKEN(T_EQEQ, "==", TAFMT_NONE)
+TOKEN(T_NEQ, "!=", TAFMT_NONE)
+TOKEN(T_LT, "<", TAFMT_NONE)
+TOKEN(T_GT, ">", TAFMT_NONE)
+TOKEN(T_GE, ">=", TAFMT_NONE)
+TOKEN(T_LE, "<=", TAFMT_NONE)
+TOKEN(T_ANDAND, "&&", TAFMT_NONE)
+TOKEN(T_OROR, "||", TAFMT_NONE)
+#endif
--- a/as/token.h
+++ b/as/token.h
@@ -0,0 +1,40 @@
+/*
+ * token.h
+ */
+#ifndef TOKEN_H
+#define TOKEN_H
+
+#include <stdio.h>
+#include "pdp10-stdint.h"
+
+/* One enumerator per TOKEN() entry in token.def, in the same order, so the
+   enumerator values start at 0 and can be used to index token_info[]. */
+enum token {
+#define TOKEN(T,P,F)	T,
+#include "token.def"
+#undef TOKEN
+};
+
+/* Token attribute formats: the third argument of TOKEN() in token.def, stored in
+   token_info[].attribute_fmt and used by token_print() to pick how to print the attribute. */
+enum {
+    TAFMT_NONE = 0,	/* no attribute */
+    TAFMT_UINT = 1,	/* attribute is token_attribute.uint */
+    TAFMT_SYMBOL = 2,	/* attribute is token_attribute.text, printed as is */
+    TAFMT_STRING = 3,	/* attribute is token_attribute.text, printed inside double quotes */
+};
+
+/* Static description of one token, built from a TOKEN() entry in token.def. */
+struct token_info {
+    char print_name[15];	/* NUL-terminated, so names in token.def are limited to 14 characters */
+    unsigned char attribute_fmt;	/* one of the TAFMT_* values */
+};
+
+/* token_info[] is indexed by token and is used by token_print() to print tokens;
+   it is also public so the scanner can map directive names to tokens without
+   duplicating the names or the name-to-token mapping */
+extern const struct token_info token_info[];
+
+/* Value attached to a token; which member is valid depends on the token's attribute format.
+   For T_SYMBOL the scanner stores a malloc()ed string in text and does not free it itself
+   (NOTE(review): presumably the consumer takes ownership -- confirm against the callers). */
+union token_attribute {
+    const char *text;		/* symbol, string */
+    pdp10_uint36_t uint;	/* uinteger */
+};
+
+/* Print token to fp.  If the token's attribute format is not TAFMT_NONE, the value in
+   *token_attr is appended in square brackets, formatted according to that attribute format. */
+void token_print(FILE *fp, enum token token, const union token_attribute *token_attr);
+
+#endif /* TOKEN_H */