From 0d1527a773e2a9ce589f6449a3c4a0f1f25f0c61 Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sun, 27 Nov 2016 05:52:45 -0800 Subject: [PATCH] refactor Muncher into separate TokenIter class this also fixes lexing the EOL after an include directive. --- TODO | 7 + lips/Collector.lua | 221 ++++++++++++---------------- lips/Lexer.lua | 21 ++- lips/{Muncher.lua => TokenIter.lua} | 136 ++++++++++++----- 4 files changed, 217 insertions(+), 168 deletions(-) rename lips/{Muncher.lua => TokenIter.lua} (58%) diff --git a/TODO b/TODO index e983802..8fcf85f 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,10 @@ +unify/optimize ascii/asciiz/byte/halfword/word into BIN directives + +directive aliases, are these right? + DB = 'BYTE', + DH = 'HALFWORD', + DW = 'WORD', + add basic command-line interface (patch.lua) add macros diff --git a/lips/Collector.lua b/lips/Collector.lua index 4e1f470..ff37f7a 100644 --- a/lips/Collector.lua +++ b/lips/Collector.lua @@ -1,11 +1,17 @@ local insert = table.insert local path = string.gsub(..., "[^.]+$", "") +local Base = require(path.."Base") local Token = require(path.."Token") +local TokenIter = require(path.."TokenIter") local Statement = require(path.."Statement") -local Muncher = require(path.."Muncher") -local arg_types = { -- for instructions +local Collector = Base:extend() +function Collector:init(options) + self.options = options or {} +end + +Collector.arg_types = { -- for instructions NUM = true, REG = true, VARSYM = true, @@ -13,17 +19,14 @@ local arg_types = { -- for instructions RELLABELSYM = true, } -local Collector = Muncher:extend() -function Collector:init(options) - self.options = options or {} -end - function Collector:statement(...) - local s = Statement(self.fn, self.line, ...) + local I = self.iter + local s = Statement(I.fn, I.line, ...) return s end function Collector:push_data(datum, size) + local I = self.iter --[[ pseudo-example: Statement{type='!DATA', {tt='BYTES', tok={0, 1, 2}}, @@ -35,9 +38,10 @@ function Collector:push_data(datum, size) -- FIXME: optimize the hell out of this garbage, preferably in the lexer -- TODO: consider not scrunching data statements, just their tokens + -- TODO: concatenate strings; use !BIN instead of !DATA if type(datum) == 'number' then - datum = self:token(datum) + datum = I:token(datum) end local last_statement = self.statements[#self.statements] @@ -59,13 +63,13 @@ function Collector:push_data(datum, size) insert(s, datum) return else - self:error('labels are too large to be used in this directive') + I:error('labels are too large to be used in this directive') end elseif datum.tt == 'VARSYM' then insert(s, datum:set('size', size)) return elseif datum.tt ~= 'NUM' then - self:error('unsupported data type', datum.tt) + I:error('unsupported data type', datum.tt) end local sizes = size..'S' @@ -75,77 +79,67 @@ function Collector:push_data(datum, size) if last_token and last_token.tt == sizes then t = last_token else - t = self:token(sizes, {}) + t = I:token(sizes, {}) insert(s, t) s:validate() end insert(t.tok, datum.tok) end -function Collector:variable() - local t = self.t - local t2 = self:advance() - - local s = self:statement('!VAR', t, t2) - insert(self.statements, s) - self:advance() -end - -function Collector:directive() - local name = self.tok - self:advance() +function Collector:directive(name) + local I = self.iter local function add(kind, ...) insert(self.statements, self:statement('!'..kind, ...)) end + if name == 'ORG' or name == 'BASE' then - add(name, self:const(nil, 'no labels')) + add(name, I:const(nil, 'no labels')) elseif name == 'PUSH' or name == 'POP' then - add(name, self:const()) - while not self:is_EOL() do - self:optional_comma() - add(name, self:const()) + add(name, I:const()) + while not I:is_EOL() do + I:eat_comma() + add(name, I:const()) end elseif name == 'ALIGN' or name == 'SKIP' then - if self:is_EOL() and name == 'ALIGN' then + if I:is_EOL() and name == 'ALIGN' then add(name) else - local size = self:const(nil, 'no label') - if self:is_EOL() then + local size = I:const(nil, 'no label') + if I:is_EOL() then add(name, size) else - self:optional_comma() - add(name, size, self:const(nil, 'no label')) + I:eat_comma() + add(name, size, I:const(nil, 'no label')) end end elseif name == 'BIN' then -- FIXME: not a real directive, just a workaround - add(name, self:string()) + add(name, I:string()) elseif name == 'BYTE' or name == 'HALFWORD' or name == 'WORD' then - self:push_data(self:const(), name) - while not self:is_EOL() do - self:optional_comma() - self:push_data(self:const(), name) + self:push_data(I:const(), name) + while not I:is_EOL() do + I:eat_comma() + self:push_data(I:const(), name) end elseif name == 'HEX' then - if self.tt ~= 'OPEN' then - self:error('expected opening brace for hex directive', self.tt) + if I.tt ~= 'OPEN' then + I:error('expected opening brace for hex directive', I.tt) end - self:advance() + I:next() - while self.tt ~= 'CLOSE' do - if self.tt == 'EOL' then - self:advance() + while I.tt ~= 'CLOSE' do + if I.tt == 'EOL' then + I:next() else - self:push_data(self:const(), 'BYTE') + self:push_data(I:const(), 'BYTE') end end - self:advance() + I:next() elseif name == 'INC' or name == 'INCBIN' then -- noop, handled by lexer - self:string() - return -- don't expect EOL + I:string() elseif name == 'ASCII' or name == 'ASCIIZ' then - local bytes = self:string() + local bytes = I:string() for i, number in ipairs(bytes.tok) do self:push_data(number, 'BYTE') end @@ -153,85 +147,61 @@ function Collector:directive() self:push_data(0, 'BYTE') end elseif name == 'FLOAT' then - self:error('unimplemented directive', name) + I:error('unimplemented directive', name) else - self:error('unknown directive', name) + I:error('unknown directive', name) end - self:expect_EOL() + + I:expect_EOL() end -function Collector:basic_special() - local name, args = self:special() - - local portion - if name == 'hi' then - portion = 'upperoff' - elseif name == 'up' then - portion = 'upper' - elseif name == 'lo' then - portion = 'lower' - else - self:error('unknown special', name) - end - - if #args ~= 1 then - self:error(name..' expected one argument', #args) - end - - local t = self:token(args[1]):set('portion', portion) - return t -end - -function Collector:instruction() - local s = self:statement(self.tok) +function Collector:instruction(name) + local I = self.iter + local s = self:statement(name) insert(self.statements, s) - self:advance() - while self.tt ~= 'EOL' do - local t = self.t - if self.tt == 'OPEN' then - t = self:deref() - t.tt = 'DEREF' -- TODO: should just be returned by :deref - insert(s, t) - elseif self.tt == 'UNARY' then - local peek = self.tokens[self.i + 1] + while I.tt ~= 'EOL' do + local t = I.t + if I.tt == 'OPEN' then + insert(s, I:deref()) + elseif I.tt == 'UNARY' then + local peek = assert(I:peek()) if peek.tt == 'VARSYM' then local negate = t.tok == -1 - t = self:advance() + t = I:next() t = Token(t):set('negate', negate) insert(s, t) - self:advance() + I:next() elseif peek.tt == 'EOL' or peek.tt == 'SEP' then local tok = t.tok == 1 and '+' or t.tok == -1 and '-' - t = Token(self.fn, self.line, 'RELLABELSYM', tok) + t = Token(I.fn, I.line, 'RELLABELSYM', tok) insert(s, t) - self:advance() + I:next() else - self:error('unexpected token after unary operator', peek.tt) + I:error('unexpected token after unary operator', peek.tt) end - elseif self.tt == 'SPECIAL' then - t = self:basic_special() + elseif I.tt == 'SPECIAL' then + t = I:basic_special() insert(s, t) - self:advance() - elseif self.tt == 'SEP' then - self:error('extraneous comma') - elseif not arg_types[self.tt] then - self:error('unexpected argument type in instruction', self.tt) + I:next() + elseif I.tt == 'SEP' then + I:error('extraneous comma') + elseif not self.arg_types[I.tt] then + I:error('unexpected argument type in instruction', I.tt) else insert(s, t) - self:advance() + I:next() end - self:optional_comma() + I:eat_comma() end - self:expect_EOL() + I:expect_EOL() s:validate() end function Collector:collect(tokens, fn) - self.tokens = tokens - self.fn = fn or '(string)' - self.main_fn = self.fn + self.iter = TokenIter(tokens) + local I = self.iter self.statements = {} @@ -245,29 +215,28 @@ function Collector:collect(tokens, fn) insert(self.statements, s) end - self.i = 0 -- set up Muncher iteration - self:advance() -- load up the first token - while true do - if self.tt == 'EOF' then - -- don't break if this is an included file's EOF - if self.fn == self.main_fn then - break - end - self:advance() - elseif self.tt == 'EOL' then - -- empty line - self:advance() - elseif self.tt == 'VAR' then - self:variable() -- handles advancing - elseif self.tt == 'LABEL' or self.tt == 'RELLABEL' then - insert(self.statements, self:statement('!LABEL', self.t)) - self:advance() - elseif self.tt == 'DIR' then - self:directive() -- handles advancing - elseif self.tt == 'INSTR' then - self:instruction() -- handles advancing + for t in I do + print(t.tt, t.tok) + if t.tt == 'EOF' then + -- noop + elseif t.tt == 'EOL' then + -- noop; empty line + elseif t.tt == 'LABEL' or t.tt == 'RELLABEL' then + insert(self.statements, self:statement('!LABEL', t)) + elseif t.tt == 'VAR' then + local t2 = I:next() + I:next() + local s = self:statement('!VAR', t, t2) + insert(self.statements, s) + I:expect_EOL() + elseif t.tt == 'DIR' then + I:next() + self:directive(t.tok) + elseif t.tt == 'INSTR' then + I:next() + self:instruction(t.tok) else - self:error('expected starting token for statement', self.tt) + I:error('expected starting token for statement', t.tt) end end diff --git a/lips/Lexer.lua b/lips/Lexer.lua index f479fbb..4194ade 100644 --- a/lips/Lexer.lua +++ b/lips/Lexer.lua @@ -289,7 +289,7 @@ function Lexer:lex_string_naive(yield) -- no escape sequences yield('STRING', buff) end -function Lexer:lex_include(_yield) +function Lexer:lex_filename(_yield) self:read_spaces() local fn self:lex_string_naive(function(tt, tok) @@ -297,6 +297,18 @@ function Lexer:lex_include(_yield) end) _yield('STRING', fn, self.fn, self.line) + if self.chr ~= '\n' then + self:error('expected EOL after filename') + end + _yield('EOL', '\n', self.fn, self.line) + self:nextc() + + return fn +end + +function Lexer:lex_include(_yield) + local fn = self:lex_filename(_yield) + if self.options.path then fn = self.options.path..fn end @@ -308,12 +320,7 @@ function Lexer:lex_include(_yield) end function Lexer:lex_include_binary(_yield) - self:read_spaces() - local fn - self:lex_string_naive(function(tt, tok) - fn = tok - end) - _yield('STRING', fn, self.fn, self.line) + local fn = self:lex_filename(_yield) -- TODO: allow optional offset and size arguments if self.options.path then diff --git a/lips/Muncher.lua b/lips/TokenIter.lua similarity index 58% rename from lips/Muncher.lua rename to lips/TokenIter.lua index 984eeef..1532cb8 100644 --- a/lips/Muncher.lua +++ b/lips/TokenIter.lua @@ -2,29 +2,71 @@ local format = string.format local insert = table.insert local path = string.gsub(..., "[^.]+$", "") -local data = require(path.."data") -local Base = require(path.."Base") local Token = require(path.."Token") -local arg_types = { - NUM = true, - REG = true, - VARSYM = true, - LABELSYM = true, - RELLABELSYM = true, -} +local Iter = {} +function Iter:__call() + return self:next(1) +end -local Muncher = Base:extend() --- no base init method +local TokenIter = {} +function TokenIter:init(tokens) + assert(tokens ~= nil) + self.tokens = tokens + self:reset() +end -function Muncher:error(msg, got) +function TokenIter:error(msg, got) if got ~= nil then msg = msg..', got '..tostring(got) end error(format('%s:%d: Error: %s', self.fn, self.line, msg), 2) end -function Muncher:token(t, val) +function TokenIter:reset() + self.i = 0 + self.tt = nil + self.tok = nil + self.fn = nil + self.line = nil + self.ended = false +end + +function TokenIter:advance(n) + n = n or 0 + if self.ended then + error('Internal Error: attempted to advance iterator past end', 2 + n) + end + + self.i = self.i + 1 + self.t = self.tokens[self.i] + if self.t == nil then + self.tt = nil + self.tok = nil + self.fn = nil + self.line = nil + self.ended = true + else + self.tt = self.t.tt + self.tok = self.t.tok + self.fn = self.t.fn + self.line = self.t.line + end +end + +function TokenIter:next(n) + n = n or 0 + self:advance(n + 1) + if self.t then return self.t end +end + +function TokenIter:peek() + return self.tokens[self.i + 1] +end + +-- now begins the parsing stuff + +function TokenIter:token(t, val) -- note: call Token directly if you want to specify fn and line manually if type(t) == 'table' then t.fn = self.fn @@ -37,36 +79,25 @@ function Muncher:token(t, val) end end -function Muncher:advance() - self.i = self.i + 1 - self.t = self.tokens[self.i] - self.tt = self.t.tt - self.tok = self.t.tok - self.fn = self.t.fn - self.line = self.t.line - return self.t -end - -function Muncher:is_EOL() +function TokenIter:is_EOL() return self.tt == 'EOL' or self.tt == 'EOF' end -function Muncher:expect_EOL() +function TokenIter:expect_EOL() if self:is_EOL() then - self:advance() return end self:error('expected end of line', self.tt) end -function Muncher:optional_comma() +function TokenIter:eat_comma() if self.tt == 'SEP' and self.tok == ',' then self:advance() return true end end -function Muncher:number() +function TokenIter:number() if self.tt ~= 'NUM' then self:error('expected number', self.tt) end @@ -75,7 +106,7 @@ function Muncher:number() return self:token(t) end -function Muncher:string() +function TokenIter:string() if self.tt ~= 'STRING' then self:error('expected string', self.tt) end @@ -84,7 +115,7 @@ function Muncher:string() return self:token(t) end -function Muncher:register(registers) +function TokenIter:register(registers) registers = registers or data.registers if self.tt ~= 'REG' then self:error('expected register', self.tt) @@ -97,7 +128,7 @@ function Muncher:register(registers) return self:token(t) end -function Muncher:deref() +function TokenIter:deref() if self.tt ~= 'OPEN' then self:error('expected opening parenthesis for dereferencing', self.tt) end @@ -111,10 +142,10 @@ function Muncher:deref() self:error('expected closing parenthesis for dereferencing', self.tt) end self:advance() - return self:token(t) + return self:token(t):set('tt', 'DEREF') end -function Muncher:const(relative, no_label) +function TokenIter:const(relative, no_label) local good = { NUM = true, EXPR = true, @@ -132,7 +163,7 @@ function Muncher:const(relative, no_label) return t end -function Muncher:special() +function TokenIter:special() if self.tt ~= 'SPECIAL' then self:error('expected special name to call', self.tt) end @@ -163,4 +194,39 @@ function Muncher:special() return name, args end -return Muncher +function TokenIter:basic_special() + local name, args = self:special() + + local portion + if name == 'hi' then + portion = 'upperoff' + elseif name == 'up' then + portion = 'upper' + elseif name == 'lo' then + portion = 'lower' + else + self:error('unknown special', name) + end + + if #args ~= 1 then + self:error(name..' expected one argument', #args) + end + + local t = self:token(args[1]):set('portion', portion) + return t +end + +-- TODO: move this boilerplate elsewhere + +local MetaBlah = { + __index = TokenIter, + __call = TokenIter.next, +} + +local ClassBlah = {} +function ClassBlah:__call(...) + local obj = setmetatable({}, MetaBlah) + return obj, obj:init(...) +end + +return setmetatable(TokenIter, ClassBlah)