1
0
Fork 0
mirror of https://github.com/notwa/lips synced 2024-04-25 15:03:22 -07:00
lips/lips/Lexer.lua
Connor Olding ec76e37014 add \x escapes to strings...
also adds the plain \e escape and allows escapes in filenames.
2016-12-28 04:04:05 -08:00

533 lines
16 KiB
Lua

local byte = string.byte
local char = string.char
local find = string.find
local format = string.format
local insert = table.insert
local unpack = rawget(_G, 'unpack') or table.unpack
local path = string.gsub(..., "[^.]+$", "")
local data = require(path.."data")
local util = require(path.."util")
local Base = require(path.."Base")
-- Single-character escapes recognized after a backslash inside a string,
-- mapped to the byte value each one stands for (listed in byte order).
local simple_escapes = {
    ['0']  = 0x00, -- NUL
    a      = 0x07, -- bell
    b      = 0x08, -- backspace
    t      = 0x09, -- horizontal tab
    n      = 0x0A, -- line feed
    v      = 0x0B, -- vertical tab
    f      = 0x0C, -- form feed
    r      = 0x0D, -- carriage return
    e      = 0x1B, -- escape
    ['"']  = 0x22, -- double quote
    ['\\'] = 0x5C, -- backslash
}
local Lexer = Base:extend()
-- Prepare a lexer for the source text `asm`.
--   asm      the text to tokenize
--   fn       name reported in errors and alongside tokens;
--            defaults to '(string)'
--   options  optional settings table; defaults to an empty table
function Lexer:init(asm, fn, options)
    -- sentinel stored in ord/ord2 once the input has run out
    self.EOF = -1

    self.asm = asm
    self.fn = fn or '(string)'
    self.options = options or {}

    -- cursor state: next byte index to read and the current line number
    self.line = 1
    self.pos = 1
    self.was_EOL = false

    -- load the first character so chr/ord are valid right away
    self:nextc()
end
-- Raise an error whose message is prefixed with the current file name and
-- line number of this lexer.
function Lexer:error(msg)
    local report = format('%s:%d: Error: %s', self.fn, self.line, msg)
    -- level 2: position information refers to whoever called this method
    error(report, 2)
end
-- Advance to the next character of the source, translating newlines.
-- Fields updated:
--   self.chr      the current character as a string
--   self.chr2     the character after it as a string
--   self.chrchr   both characters as one string
--                 (chr values are empty strings at the end of input)
--   self.ord      numeric value of the current character
--   self.ord2     numeric value of the character after it
--                 (ord values are self.EOF at the end of input)
--   self.was_EOL  whether the current character is an EOL; it is left
--                 untouched once the input is exhausted, so it can tell
--                 whether the file lacks a final EOL
function Lexer:nextc()
    local src = self.asm
    local len = #src

    if self.pos > len then
        -- nothing left to read: only the chr/ord fields change
        self.ord, self.ord2 = self.EOF, self.EOF
        self.chr, self.chr2, self.chrchr = '', '', ''
        return
    end

    -- the line counter moves on when we step off a newline
    if self.chr == '\n' then
        self.line = self.line + 1
    end

    local pos = self.pos
    local ord = byte(src, pos)
    pos = pos + 1

    -- a lone CR or a CRLF pair is reported as a single LF
    if ord == 13 then
        if pos <= len and byte(src, pos) == 10 then
            pos = pos + 1
        end
        ord = 10
    end

    self.pos = pos
    self.ord = ord
    self.was_EOL = ord == 10
    self.chr = char(ord)

    -- one character of lookahead (taken from the raw text, untranslated)
    if pos > len then
        self.ord2 = self.EOF
        self.chr2 = ''
        self.chrchr = self.chr
    else
        local ord2 = byte(src, pos)
        self.ord2 = ord2
        self.chr2 = char(ord2)
        self.chrchr = char(ord, ord2)
    end
end
-- Consume characters until the current one is a newline or the input ends.
-- The newline itself is not consumed.
function Lexer:skip_to_EOL()
    while true do
        if self.chr == '\n' or self.ord == self.EOF then
            return
        end
        self:nextc()
    end
end
-- Consume consecutive characters for as long as the current character
-- matches the Lua pattern `pattern`.
-- Returns the consumed characters as one string (possibly empty).
function Lexer:read_chars(pattern)
    local pieces = {}
    local count = 0
    while find(self.chr, pattern) do
        count = count + 1
        pieces[count] = self.chr
        self:nextc()
    end
    return table.concat(pieces)
end
-- Consume a run of spaces and tabs (newlines are not included).
-- Returns the consumed whitespace.
function Lexer:read_spaces()
    local blanks = self:read_chars('[ \t]')
    return blanks
end
-- Consume a run of decimal digits and return its numeric value.
-- Raises a lexer error when no valid number could be read.
function Lexer:read_decimal()
    local digits = self:read_chars('%d')
    local value = tonumber(digits)
    if value == nil then
        self:error('invalid decimal number')
    end
    return value
end
-- Consume a run of hexadecimal digits and return its numeric value.
-- Raises a lexer error when no valid number could be read.
function Lexer:read_hex()
    local digits = self:read_chars('%x')
    local value = tonumber(digits, 16)
    if value == nil then
        self:error('invalid hex number')
    end
    return value
end
-- Consume a run of octal digits (0-7) and return its numeric value.
-- Raises a lexer error when no valid number could be read.
function Lexer:read_octal()
    local digits = self:read_chars('[0-7]')
    local value = tonumber(digits, 8)
    if value == nil then
        self:error('invalid octal number')
    end
    return value
end
-- Consume a run of binary digits (0 and 1) and return its numeric value.
-- Raises a lexer error when no valid number could be read.
function Lexer:read_binary()
    local digits = self:read_chars('[01]')
    local value = tonumber(digits, 2)
    if value == nil then
        self:error('invalid binary number')
    end
    return value
end
-- Read a numeric literal starting at the current character.
-- Accepted forms:
--   %101          binary
--   $FF           hexadecimal
--   #10           decimal
--   0xFF 0o17 0b1 hexadecimal / octal / binary (leading zero required)
--   017           octal (a zero followed by another digit)
--   123           decimal
-- Returns the value, or nothing when the current character cannot start
-- a number; in that case no input is consumed.
-- Raises a lexer error when a prefix is not followed by valid digits.
function Lexer:read_number()
    local c = self.chr
    if c == '%' then
        self:nextc()
        return self:read_binary()
    elseif c == '$' then
        self:nextc()
        return self:read_hex()
    elseif c == '#' then
        self:nextc()
        return self:read_decimal()
    elseif c:find('%d') then
        -- Radix prefixes only count after a leading zero. Previously any
        -- digit was accepted here, so "1x10" was silently read as 0x10
        -- and the leading digit was thrown away.
        if c == '0' then
            local c2 = self.chr2
            if c2 == 'x' then
                self:nextc()
                self:nextc()
                return self:read_hex()
            elseif c2 == 'o' then
                self:nextc()
                self:nextc()
                return self:read_octal()
            elseif c2 == 'b' then
                self:nextc()
                self:nextc()
                return self:read_binary()
            elseif c2:find('%d') then
                -- skip the leading zero; the rest is octal
                self:nextc()
                return self:read_octal()
            end
        end
        return self:read_decimal()
    end
end
-- Lex the body of a HEX directive: a brace-delimited list of bytes, each
-- written as exactly two hex digits and separated by whitespace.
-- Comments and newlines are allowed inside the braces.
-- Yields OPEN, one NUM per byte, EOL for every newline, and CLOSE.
-- Raises a lexer error on malformed input or a missing closing brace.
function Lexer:lex_hex(yield)
    local HEX_DIGIT = '[0-9A-Fa-f]'
    local inside = false
    local done = false
    repeat
        -- snapshot of the current character pair for this iteration
        local c, cc = self.chr, self.chrchr
        if c == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete hex directive')
        elseif c == ';' or cc == '//' then
            -- line comment
            self:skip_to_EOL()
        elseif cc == '/*' then
            self:nextc()
            self:nextc()
            self:lex_block_comment(yield)
        elseif c:find('%s') then
            self:nextc()
        elseif c == '{' then
            if inside then
                self:error('unexpected opening brace')
            end
            self:nextc()
            inside = true
            yield('OPEN', '{')
        elseif c == '}' then
            if not inside then
                self:error('expected opening brace')
            end
            self:nextc()
            yield('CLOSE', '}')
            done = true
        elseif c == ',' then
            self:error('commas are not allowed in HEX directives')
        elseif c:find(HEX_DIGIT) and self.chr2:find(HEX_DIGIT) then
            local value = tonumber(cc, 16)
            self:nextc()
            self:nextc()
            -- a third digit directly after the pair is an error
            if self.chr:find(HEX_DIGIT) then
                self:error('too many hex digits to be a single byte')
            end
            yield('NUM', value)
        elseif c:find(HEX_DIGIT) then
            self:error('expected two hex digits to make a byte')
        elseif inside then
            self:error('expected bytes given in hex or closing brace')
        else
            self:error('expected opening brace')
        end
    until done
end
-- Skip the remainder of a block comment; the opening "/*" has already been
-- consumed by the caller. Newlines inside the comment are still yielded as
-- EOL tokens.
-- Raises a lexer error if the input ends before "*/" is found.
function Lexer:lex_block_comment(yield)
    local closed = false
    repeat
        if self.chr == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete block comment')
        else
            closed = self.chrchr == '*/'
            self:nextc()
            if closed then
                -- also step over the second character of the terminator
                self:nextc()
            end
        end
    until closed
end
-- Lex a double-quoted string starting at the current character.
-- Supports the single-character escapes listed in simple_escapes and
-- two-digit hex escapes of the form \xHH.
-- Yields one STRING token whose value is an array of byte values.
-- Raises a lexer error on a missing opening quote, a newline or EOF inside
-- the string, or an unknown or malformed escape sequence.
function Lexer:lex_string(yield)
    if self.chr ~= '"' then
        self:error('expected opening double quote')
    end
    self:nextc()

    local hex_digit = '[0-9a-fA-F]'
    local bytes = {}
    while true do
        if self.chr == '\n' then
            -- self:error raises, so nothing may follow it in this branch
            self:error('unimplemented: newlines in strings')
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete string')
        elseif self.chr == '"' then
            self:nextc()
            break
        elseif self.chr == '\\' then
            self:nextc()
            local simple = simple_escapes[self.chr]
            if simple then
                insert(bytes, simple)
            elseif self.chr == 'x' then
                self:nextc()
                -- both digits, captured for conversion and error reporting
                local hex = self.chrchr
                if not self.chr:find(hex_digit) then
                    self:error('invalid hex escape sequence: \\x'..hex)
                end
                self:nextc()
                if not self.chr:find(hex_digit) then
                    self:error('invalid hex escape sequence: \\x'..hex)
                end
                -- named `value` so string.byte (the `byte` upvalue) is
                -- not shadowed
                local value = tonumber(hex, 16)
                insert(bytes, value)
            else
                self:error('unknown escape sequence')
            end
            -- step past the last character of the escape
            self:nextc()
        else
            insert(bytes, byte(self.chr))
            self:nextc()
        end
    end
    yield('STRING', bytes)
end
-- Lex a quoted filename that must be the last thing on its line (apart
-- from a trailing line comment).
-- Yields the filename as a STRING token (a Lua string here, not a byte
-- array) followed by an EOL token.
-- Returns the filename.
-- Raises a lexer error if anything else follows the filename.
function Lexer:lex_filename(_yield)
    self:read_spaces()

    local fn = ''
    local function capture(_, bytes)
        -- turn the byte array from lex_string back into a string
        fn = char(unpack(bytes))
    end
    self:lex_string(capture)
    _yield('STRING', fn, self.fn, self.line)

    self:read_spaces()
    if self.chr == ';' or self.chrchr == '//' then
        self:skip_to_EOL()
    end

    local at_newline = self.chr == '\n'
    if not at_newline and self.ord ~= self.EOF then
        self:error('expected EOL after filename')
        return fn
    end

    _yield('EOL', '\n', self.fn, self.line)
    if at_newline then
        self:nextc()
    else
        -- the EOL was synthesized at EOF; remember it so that no second
        -- one is emitted for the missing final newline
        self.was_EOL = true
    end
    return fn
end
-- Handle an include directive: read the filename, load that file, and lex
-- it with a nested Lexer that reports its tokens through the same `_yield`.
-- The filename is resolved against options.path when that is set.
function Lexer:lex_include(_yield)
    local fn = self:lex_filename(_yield)
    local base = self.options.path
    if base then
        fn = base..fn
    end

    -- The nested lexer inherits our options. Its path becomes the directory
    -- part of the included file; when the name has no '/', the field stays
    -- unset and lookups fall through to our own options.
    local new_options = setmetatable(
        {path = fn:match(".*/")},
        {__index = self.options}
    )

    local source = util.readfile(fn)
    local sublexer = Lexer(source, fn, new_options)
    sublexer:lex(_yield)
end
-- Handle a binary include directive: read the filename, load the file via
-- util.readfile(fn, true), and yield its contents as a BIN directive with a
-- STRING operand, followed by an EOF token.
-- The filename is resolved against options.path when that is set.
function Lexer:lex_include_binary(_yield)
    local fn = self:lex_filename(_yield)
    -- TODO: allow optional offset and size arguments
    local base = self.options.path
    if base then
        fn = base..fn
    end

    local contents = util.readfile(fn, true)
    -- tokens belonging to the included file are reported under its name,
    -- with line number 0
    _yield('DIR', 'BIN', fn, 0)
    _yield('STRING', contents, fn, 0)
    _yield('EOF', self.EOF, self.fn, self.line)
end
-- Lex a parenthesized expression starting at the current '('.
-- Nested parentheses are balanced; the outermost pair is dropped.
-- Yields one EXPR token holding the raw text between the outer parentheses.
-- Raises a lexer error on a missing '(' or when a newline or EOF is reached
-- before the expression is closed.
function Lexer:lex_expression(yield)
    if self.chr ~= '(' then
        self:error('expected opening parenthesis for expression')
    end
    self:nextc()

    local parts = {}
    local nesting = 1
    while nesting > 0 do
        local c = self.chr
        if c == '\n' then
            self:error('unexpected newline; incomplete expression')
        elseif self.ord == self.EOF then
            self:nextc()
            self:error('unexpected EOF; incomplete expression')
        else
            if c == '(' then
                nesting = nesting + 1
            elseif c == ')' then
                nesting = nesting - 1
            end
            self:nextc()
            -- everything is kept except the parenthesis that closes the
            -- outermost level
            if nesting > 0 then
                parts[#parts + 1] = c
            end
        end
    end
    yield('EXPR', table.concat(parts))
end
-- Tokenize the whole source, reporting each token by calling
-- `_yield(token_type, token, filename, line)`.
-- Token types produced here or by the helpers this calls:
--   EOL, EOF, SEP, VAR, EXPR, OPEN, CLOSE, DIR, STRING, VARSYM, SPECIAL,
--   NUM, LABEL, REG, INSTR, LABELSYM, RELLABEL, RELLABELSYM, UNARY
-- The stream always ends with EOF, preceded by an EOL even when the
-- source lacks a final newline.
-- Raises a lexer error on malformed input.
function Lexer:lex(_yield)
    -- attach the current file name and line number to every token
    local function yield(tt, tok)
        return _yield(tt, tok, self.fn, self.line)
    end
    while true do
        if self.chr == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            -- make sure the final line is terminated in the token stream
            if not self.was_EOL then
                yield('EOL', '\n')
            end
            yield('EOF', self.EOF)
            break
        elseif self.chr == ';' then
            self:skip_to_EOL()
        elseif self.chrchr == '//' then
            self:skip_to_EOL()
        elseif self.chrchr == '/*' then
            self:nextc()
            self:nextc()
            self:lex_block_comment(yield)
        elseif self.chr:find('%s') then
            self:nextc()
        elseif self.chr == ',' then
            self:nextc()
            yield('SEP', ',')
        elseif self.chr == '[' then
            -- variable definition: [name]: expression
            self:nextc()
            if self.chr:find('%d') then
                self:error('variable names cannot begin with a number')
            end
            local buff = self:read_chars('[%w_]')
            if self.chr ~= ']' then
                self:error('invalid variable name')
            end
            self:nextc()
            if self.chr ~= ':' then
                self:error('expected a colon after closing bracket')
            end
            self:nextc()
            yield('VAR', buff)
            self:read_spaces()
            if self.chr == '@' then
                -- old syntax; nothing to do here
            else
                -- the rest of the line, up to a ';' comment, is the value
                buff = self:read_chars('[^;\n]')
                yield('EXPR', buff)
            end
        elseif self.chr == ']' then
            self:error('unmatched closing bracket')
        elseif self.chr == '(' then
            self:nextc()
            yield('OPEN', '(')
        elseif self.chr == ')' then
            self:nextc()
            yield('CLOSE', ')')
        elseif self.chr == '.' then
            -- directive: resolve aliases, then validate the name
            self:nextc()
            local buff = self:read_chars('[%w]')
            local up = buff:upper()
            if data.directive_aliases[up] then
                up = data.directive_aliases[up]
            end
            if not data.all_directives[up] then
                self:error('unknown directive')
            end
            if up == 'INC' or up == 'INCASM' or up == 'INCLUDE' then
                yield('DIR', 'INC')
                -- the raw callback is passed on: the included file reports
                -- its own file name and line numbers
                self:lex_include(_yield)
            elseif up == 'INCBIN' then
                yield('DIR', 'INCBIN')
                self:lex_include_binary(_yield)
            else
                yield('DIR', up)
            end
        elseif self.chr == '"' then
            self:lex_string(yield)
        elseif self.chr == '@' then
            -- variable reference
            self:nextc()
            if self.chr:find('%d') then
                self:error('variable names cannot begin with a number')
            end
            local buff = self:read_chars('[%w_]')
            yield('VARSYM', buff)
        elseif self.chr == '%' then
            -- %name (special), %0101 (binary number) or %(expression)
            self:nextc()
            if self.chr:find('[%a_]') then
                -- the current character already matches, so the name is
                -- never empty
                local call = self:read_chars('[%w_]')
                yield('SPECIAL', call)
            elseif self.chr:find('[01]') then
                yield('NUM', self:read_binary())
            elseif self.chr == '(' then
                self:lex_expression(yield)
            else
                self:error('unknown % syntax')
            end
        elseif self.chr:find('[%a_]') then
            -- identifier: label definition, HEX block, register,
            -- instruction, or a reference to a label
            local buff = self:read_chars('[%w_.]')
            local up = buff:upper()
            if self.chr == ':' then
                if buff:find('%.') then
                    self:error('labels cannot contain dots')
                end
                self:nextc()
                yield('LABEL', buff)
            elseif up == 'HEX' then
                yield('DIR', 'HEX')
                self:lex_hex(yield)
            elseif data.all_registers[up] then
                yield('REG', up)
            elseif data.all_instructions[up] then
                -- parentheses keep only the string result of gsub, not
                -- its match count
                yield('INSTR', (up:gsub('%.', '_')))
            else
                if buff:find('%.') then
                    self:error('labels cannot contain dots')
                end
                yield('LABELSYM', buff)
            end
        elseif self.chr == '+' or self.chr == '-' then
            -- relative label definition or reference, signed number, or a
            -- unary sign
            local sign_chr = self.chr
            local sign = sign_chr == '+' and 1 or -1
            local signs = self:read_chars('%'..sign_chr)
            local name = ''
            if self.chr:find('[%a_]') then
                name = self:read_chars('[%w_]')
            end
            if #signs == 1 and self.chr == ':' then
                self:nextc()
                yield('RELLABEL', signs..name)
            elseif name ~= '' then
                -- A named relative label reference. Previously a number
                -- following the name was lexed as a signed NUM and the
                -- name was silently dropped; the number is now left for
                -- the next iteration.
                yield('RELLABELSYM', signs..name)
            else
                self:read_spaces()
                local n = self:read_number()
                if n then
                    yield('NUM', sign*n)
                elseif #signs == 1 then
                    -- this could be a RELLABELSYM
                    -- we'll have to let the preproc figure it out
                    yield('UNARY', sign)
                else
                    yield('RELLABELSYM', signs)
                end
            end
        else
            local n = self:read_number()
            if n then
                yield('NUM', n)
            else
                self:error('unknown character or control character')
            end
        end
    end
end
return Lexer