-- Lexer: converts assembly source text into a stream of tokens.
-- Tokens are handed to a caller-supplied callback, one call per token.

local byte = string.byte
local char = string.char
local find = string.find
local format = string.format
local insert = table.insert
-- `unpack` is a global in Lua 5.1/LuaJIT and lives in `table` from 5.2 onward.
local unpack = rawget(_G, 'unpack') or table.unpack

-- Strip the last dot-separated component from this module's name so that
-- sibling modules can be required relative to it.
local path = string.gsub(..., "[^.]+$", "")
local data = require(path.."data")
local util = require(path.."util")
local Base = require(path.."Base")

-- Single-character escapes accepted after a backslash inside a string,
-- mapped to the byte value they produce.
local simple_escapes = {
    ['0'] = 0x00,
    ['\\'] = 0x5C,
    ['"'] = 0x22,
    ['a'] = 0x07,
    ['b'] = 0x08,
    ['e'] = 0x1B,
    ['f'] = 0x0C,
    ['n'] = 0x0A,
    ['r'] = 0x0D,
    ['t'] = 0x09,
    ['v'] = 0x0B,
}

-- NOTE(review): assumes Base:extend() returns a class table that is callable
-- as a constructor and invokes :init(...) — lex_include relies on
-- `Lexer(...)`; confirm against Base.
local Lexer = Base:extend()

--- Prepare a lexer over a chunk of source text.
-- @param asm      the source text to lex (string)
-- @param fn       name reported in errors and tokens; defaults to '(string)'
-- @param options  optional table; `options.path` is prepended to the
--                 filenames given to include directives
function Lexer:init(asm, fn, options)
    self.asm = asm
    self.fn = fn or '(string)'
    self.options = options or {}

    self.pos = 1
    self.line = 1
    self.EOF = -1
    self.was_EOL = false

    -- prime chr/chr2/ord/ord2 with the first character.
    self:nextc()
end

--- Raise an error prefixed with the current filename and line number.
-- Level 2 makes Lua attribute the error to the caller of this method.
-- This function never returns.
function Lexer:error(msg)
    error(format('%s:%d: Error: %s', self.fn, self.line, msg), 2)
end

function Lexer:nextc()
    -- iterate to the next character while translating newlines.
    -- outputs:
    --  self.chr       the character as a string
    --  self.chr2      the character after it as a string
    --  self.chrchr    both characters as a string
    --                 chr values can be empty
    --  self.ord       numeric value of the character
    --  self.ord2      numeric value of the character after it
    --                 ord values can be self.EOF
    --  self.was_EOL   if the character was an EOL
    --                 this EOL state is preserved past the EOF
    --                 so it can be used to determine if the file lacks a final EOL

    if self.pos > #self.asm then
        self.ord = self.EOF
        self.ord2 = self.EOF
        self.chr = ''
        self.chr2 = ''
        self.chrchr = ''
        return
    end

    -- the line counter advances when we step *past* a newline,
    -- so the newline itself is still attributed to the line it ends.
    if self.chr == '\n' then
        self.line = self.line + 1
    end

    self.ord = byte(self.asm, self.pos)
    self.pos = self.pos + 1

    -- handle newlines; translate CRLF to LF
    -- (a CR that is not followed by LF is translated to LF as well)
    if self.ord == 13 then
        if self.pos <= #self.asm and byte(self.asm, self.pos) == 10 then
            self.pos = self.pos + 1
        end
        self.ord = 10
    end

    self.was_EOL = self.ord == 10
    self.chr = char(self.ord)

    if self.pos <= #self.asm then
        self.ord2 = byte(self.asm, self.pos)
        self.chr2 = char(self.ord2)
        self.chrchr = char(self.ord, self.ord2)
    else
        self.ord2 = self.EOF
        self.chr2 = ''
        self.chrchr = self.chr
    end
end

--- Advance until the current character is a newline or the input ends.
-- The newline itself is not consumed.
function Lexer:skip_to_EOL()
    while self.chr ~= '\n' and self.ord ~= self.EOF do
        self:nextc()
    end
end

--- Consume consecutive characters that match a Lua pattern.
-- @param pattern  pattern tested against one character at a time
-- @return the consumed characters as a string (possibly empty)
function Lexer:read_chars(pattern)
    local buff = ''
    -- at EOF self.chr is '', which no single-character pattern matches,
    -- so the loop terminates there too.
    while find(self.chr, pattern) do
        buff = buff..self.chr
        self:nextc()
    end
    return buff
end

--- Consume a run of spaces and tabs; returns the consumed text.
function Lexer:read_spaces()
    return self:read_chars('[ \t]')
end

--- Read a run of decimal digits and return its value; errors if there are none.
function Lexer:read_decimal()
    local buff = self:read_chars('%d')
    local num = tonumber(buff)
    if not num then
        self:error('invalid decimal number')
    end
    return num
end

--- Read a run of hex digits and return its value; errors if there are none.
function Lexer:read_hex()
    local buff = self:read_chars('%x')
    local num = tonumber(buff, 16)
    if not num then
        self:error('invalid hex number')
    end
    return num
end

--- Read a run of octal digits and return its value; errors if there are none.
function Lexer:read_octal()
    local buff = self:read_chars('[0-7]')
    local num = tonumber(buff, 8)
    if not num then
        self:error('invalid octal number')
    end
    return num
end

--- Read a run of binary digits and return its value; errors if there are none.
function Lexer:read_binary()
    local buff = self:read_chars('[01]')
    local num = tonumber(buff, 2)
    if not num then
        self:error('invalid binary number')
    end
    return num
end

--- Read a numeric literal starting at the current character.
-- Accepted forms:
--   %101     binary
--   $FF      hexadecimal
--   0xFF     hexadecimal
--   0o17     octal
--   0b101    binary
--   017      octal (leading zero followed by a digit)
--   123      decimal
--   #123     decimal
-- @return the value, or nil when the current character does not start
--         any of the forms above (nothing is consumed in that case)
function Lexer:read_number()
    if self.chr == '%' then
        self:nextc()
        return self:read_binary()
    elseif self.chr == '$' then
        self:nextc()
        return self:read_hex()
    elseif self.chr:find('%d') then
        -- base prefixes are only meaningful after a leading zero;
        -- something like "5x10" must not be silently read as 0x10.
        if self.chr == '0' then
            if self.chr2 == 'x' then
                self:nextc()
                self:nextc()
                return self:read_hex()
            elseif self.chr2 == 'o' then
                self:nextc()
                self:nextc()
                return self:read_octal()
            elseif self.chr2 == 'b' then
                self:nextc()
                self:nextc()
                return self:read_binary()
            elseif self.chr2:find('%d') then
                self:nextc()
                return self:read_octal()
            end
        end
        return self:read_decimal()
    elseif self.chr == '#' then
        self:nextc()
        return self:read_decimal()
    end
end

--- Lex the body of a HEX directive: `{` followed by two-digit hex bytes
-- and a closing `}`. Comments and whitespace are allowed between bytes.
-- Yields OPEN, one NUM per byte, CLOSE, and EOL for each newline crossed.
-- @param yield  function(token_type, token_value)
function Lexer:lex_hex(yield)
    local hexmatch = '[0-9A-Fa-f]'
    local entered = false
    while true do
        if self.chr == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete hex directive')
        elseif self.chr == ';' then
            self:skip_to_EOL()
        elseif self.chrchr == '//' then
            self:skip_to_EOL()
        elseif self.chrchr == '/*' then
            self:nextc()
            self:nextc()
            self:lex_block_comment(yield)
        elseif self.chr:find('%s') then
            self:nextc()
        elseif self.chr == '{' then
            if entered then
                self:error('unexpected opening brace')
            end
            self:nextc()
            entered = true
            yield('OPEN', '{')
        elseif self.chr == '}' then
            if not entered then
                self:error('expected opening brace')
            end
            self:nextc()
            yield('CLOSE', '}')
            break
        elseif self.chr == ',' then
            self:error('commas are not allowed in HEX directives')
        elseif self.chr:find(hexmatch) and self.chr2:find(hexmatch) then
            local num = tonumber(self.chrchr, 16)
            self:nextc()
            self:nextc()
            -- a third adjacent hex digit means the bytes were not separated.
            if self.chr:find(hexmatch) then
                self:error('too many hex digits to be a single byte')
            end
            yield('NUM', num)
        elseif self.chr:find(hexmatch) then
            self:error('expected two hex digits to make a byte')
        else
            if entered then
                self:error('expected bytes given in hex or closing brace')
            else
                self:error('expected opening brace')
            end
        end
    end
end

--- Skip the remainder of a block comment.
-- Expects the opening `/*` to have been consumed already.
-- An EOL token is still yielded for every newline inside the comment.
-- @param yield  function(token_type, token_value)
function Lexer:lex_block_comment(yield)
    while true do
        if self.chr == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete block comment')
        elseif self.chrchr == '*/' then
            self:nextc()
            self:nextc()
            break
        else
            self:nextc()
        end
    end
end

--- Lex a double-quoted string literal starting at the current character.
-- Supports the escapes in `simple_escapes` plus `\xHH`.
-- Yields a single STRING token whose value is a table of byte values.
-- @param yield  function(token_type, token_value)
function Lexer:lex_string(yield)
    if self.chr ~= '"' then
        self:error('expected opening double quote')
    end
    self:nextc()

    local bytes = {}
    while true do
        if self.chr == '\n' then
            self:error('unimplemented: newlines in strings')
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete string')
        elseif self.chr == '"' then
            self:nextc()
            break
        elseif self.chr == '\\' then
            self:nextc()
            local simple = simple_escapes[self.chr]
            if simple then
                insert(bytes, simple)
            elseif self.chr == 'x' then
                self:nextc()
                -- grab both digits up front, then validate them one by one.
                local hex = self.chrchr
                if not self.chr:find('[0-9a-fA-F]') then
                    self:error('invalid hex escape sequence: \\x'..hex)
                end
                self:nextc()
                if not self.chr:find('[0-9a-fA-F]') then
                    self:error('invalid hex escape sequence: \\x'..hex)
                end
                local value = tonumber(hex, 16)
                insert(bytes, value)
            else
                self:error('unknown escape sequence')
            end
            -- consume the final character of the escape sequence.
            self:nextc()
        else
            insert(bytes, byte(self.chr))
            self:nextc()
        end
    end

    yield('STRING', bytes)
end

--- Lex the quoted filename argument of an include directive and the
-- end of its line. Yields a STRING token (value is a Lua string here,
-- not a byte table) followed by an EOL token.
-- @param _yield  function(token_type, token_value, filename, line)
-- @return the filename as a string
function Lexer:lex_filename(_yield)
    self:read_spaces()

    local fn = ''
    self:lex_string(function(_, tok)
        fn = char(unpack(tok))
    end)
    _yield('STRING', fn, self.fn, self.line)

    self:read_spaces()
    if self.chr == ';' or self.chrchr == '//' then
        self:skip_to_EOL()
    end

    if self.chr == '\n' then
        _yield('EOL', '\n', self.fn, self.line)
        self:nextc()
    elseif self.ord == self.EOF then
        _yield('EOL', '\n', self.fn, self.line)
        -- we just emitted the EOL ourselves;
        -- stop lex() from emitting a second one at EOF.
        self.was_EOL = true
    else
        self:error('expected EOL after filename')
    end

    return fn
end

--- Handle an include directive: read the named file and lex it in place,
-- feeding its tokens to the same callback.
-- @param _yield  function(token_type, token_value, filename, line)
function Lexer:lex_include(_yield)
    local fn = self:lex_filename(_yield)

    if self.options.path then
        fn = self.options.path..fn
    end

    -- the sub-lexer inherits our options, except that its include path
    -- becomes the directory portion of the included file's name.
    local new_options = setmetatable({}, {__index=self.options})
    new_options.path = fn:match(".*/")

    local sublexer = Lexer(util.readfile(fn), fn, new_options)
    sublexer:lex(_yield)
end

--- Handle a binary include directive: read the named file and yield its
-- contents as DIR 'BIN', STRING, EOF.
-- @param _yield  function(token_type, token_value, filename, line)
function Lexer:lex_include_binary(_yield)
    local fn = self:lex_filename(_yield)

    -- TODO: allow optional offset and size arguments
    if self.options.path then
        fn = self.options.path..fn
    end

    -- NOTE(review): the second argument presumably selects binary mode;
    -- confirm against util.readfile.
    local contents = util.readfile(fn, true)

    _yield('DIR', 'BIN', fn, 0)
    _yield('STRING', contents, fn, 0)
    -- NOTE(review): this EOF presumably mirrors the one an included source
    -- file's lexer would emit — confirm with the token consumer.
    _yield('EOF', self.EOF, self.fn, self.line)
end

--- Lex a parenthesized expression starting at the current `(`.
-- Nested parentheses are kept in the text; the outermost pair is dropped.
-- Yields a single EXPR token containing the raw expression text.
-- @param yield  function(token_type, token_value)
function Lexer:lex_expression(yield)
    if self.chr ~= '(' then
        self:error('expected opening parenthesis for expression')
    end
    self:nextc()

    local expr = ""
    local depth = 1
    while true do
        if self.chr == '\n' then
            self:error('unexpected newline; incomplete expression')
        elseif self.ord == self.EOF then
            self:error('unexpected EOF; incomplete expression')
        elseif self.chr == '(' then
            depth = depth + 1
            self:nextc()
            expr = expr..'('
        elseif self.chr == ')' then
            depth = depth - 1
            self:nextc()
            if depth == 0 then
                break
            end
            expr = expr..')'
        else
            expr = expr..self.chr
            self:nextc()
        end
    end

    yield('EXPR', expr)
end

--- Lex the whole input, calling `_yield` once per token.
-- Token types produced here or by the helpers this calls:
--   EOL, EOF, SEP, VAR, EXPR, OPEN, CLOSE, DIR, STRING, VARSYM, SPECIAL,
--   NUM, LABEL, REG, INSTR, LABELSYM, RELLABEL, RELLABELSYM, UNARY
-- An EOL is synthesized before EOF when the input lacks a final newline.
-- @param _yield  function(token_type, token_value, filename, line)
function Lexer:lex(_yield)
    -- attach the current filename and line to every token.
    local function yield(tt, tok)
        return _yield(tt, tok, self.fn, self.line)
    end

    while true do
        if self.chr == '\n' then
            yield('EOL', '\n')
            self:nextc()
        elseif self.ord == self.EOF then
            if not self.was_EOL then
                yield('EOL', '\n')
            end
            yield('EOF', self.EOF)
            break
        elseif self.chr == ';' then
            self:skip_to_EOL()
        elseif self.chrchr == '//' then
            self:skip_to_EOL()
        elseif self.chrchr == '/*' then
            self:nextc()
            self:nextc()
            self:lex_block_comment(yield)
        elseif self.chr:find('%s') then
            self:nextc()
        elseif self.chr == ',' then
            self:nextc()
            yield('SEP', ',')
        elseif self.chr == '[' then
            -- variable definition: [name]: expression
            self:nextc()
            if self.chr:find('%d') then
                self:error('variable names cannot begin with a number')
            end
            local buff = self:read_chars('[%w_]')
            if self.chr ~= ']' then
                self:error('invalid variable name')
            end
            self:nextc()
            if self.chr ~= ':' then
                self:error('expected a colon after closing bracket')
            end
            self:nextc()
            yield('VAR', buff)
            self:read_spaces()
            if self.chr == '@' then
                -- old syntax; nothing to do here
            else
                -- everything up to a ';' or the end of the line
                -- is the expression text.
                buff = self:read_chars('[^;\n]')
                yield('EXPR', buff)
            end
        elseif self.chr == ']' then
            self:error('unmatched closing bracket')
        elseif self.chr == '(' then
            self:nextc()
            yield('OPEN', '(')
        elseif self.chr == ')' then
            self:nextc()
            yield('CLOSE', ')')
        elseif self.chr == '.' then
            -- directive; names are case-insensitive and may be aliased.
            self:nextc()
            local buff = self:read_chars('[%w]')
            local up = buff:upper()
            if data.directive_aliases[up] then
                up = data.directive_aliases[up]
            end
            if not data.all_directives[up] then
                self:error('unknown directive')
            end
            if up == 'INC' or up == 'INCASM' or up == 'INCLUDE' then
                yield('DIR', 'INC')
                self:lex_include(_yield)
            elseif up == 'INCBIN' then
                yield('DIR', 'INCBIN')
                self:lex_include_binary(_yield)
            else
                yield('DIR', up)
            end
        elseif self.chr == '"' then
            self:lex_string(yield)
        elseif self.chr == '@' then
            -- variable reference
            self:nextc()
            if self.chr:find('%d') then
                self:error('variable names cannot begin with a number')
            end
            local buff = self:read_chars('[%w_]')
            yield('VARSYM', buff)
        elseif self.chr == '%' then
            -- %name, %binary-number, or %(expression)
            self:nextc()
            if self.chr:find('[%a_]') then
                local call = self:read_chars('[%w_]')
                if call ~= '' then
                    yield('SPECIAL', call)
                end
            elseif self.chr:find('[01]') then
                yield('NUM', self:read_binary())
            elseif self.chr == '(' then
                self:lex_expression(yield)
            else
                self:error('unknown % syntax')
            end
        elseif self.chr:find('[%a_]') then
            -- identifier: label definition, HEX directive, register,
            -- instruction, or label reference.
            local buff = self:read_chars('[%w_.]')
            local up = buff:upper()
            if self.chr == ':' then
                if buff:find('%.') then
                    self:error('labels cannot contain dots')
                end
                self:nextc()
                yield('LABEL', buff)
            elseif up == 'HEX' then
                yield('DIR', 'HEX')
                self:lex_hex(yield)
            elseif data.all_registers[up] then
                yield('REG', up)
            elseif data.all_instructions[up] then
                -- parentheses drop gsub's second return value (the count).
                yield('INSTR', (up:gsub('%.', '_')))
            else
                if buff:find('%.') then
                    self:error('labels cannot contain dots')
                end
                yield('LABELSYM', buff)
            end
        elseif self.chr == '+' or self.chr == '-' then
            -- signed number, relative label, or unary sign.
            local sign_chr = self.chr
            local sign = sign_chr == '+' and 1 or -1

            local signs = self:read_chars('%'..sign_chr)
            local name = ''
            if self.chr:find('[%a_]') then
                name = self:read_chars('[%w_]')
            end
            if #signs == 1 and self.chr == ':' then
                self:nextc()
                yield('RELLABEL', signs..name)
            else
                self:read_spaces()
                local n = self:read_number()
                if n then
                    yield('NUM', sign*n)
                elseif #signs == 1 and name == '' then
                    -- this could be a RELLABELSYM
                    -- we'll have to let the preproc figure it out
                    yield('UNARY', sign)
                else
                    yield('RELLABELSYM', signs..name)
                end
            end
        else
            local n = self:read_number()
            if n then
                yield('NUM', n)
            else
                self:error('unknown character or control character')
            end
        end
    end
end

return Lexer