Diffstat (limited to 'Data/Libraries/LDoc/ldoc/lexer.lua')
-rw-r--r-- | Data/Libraries/LDoc/ldoc/lexer.lua | 504
1 file changed, 504 insertions, 0 deletions
diff --git a/Data/Libraries/LDoc/ldoc/lexer.lua b/Data/Libraries/LDoc/ldoc/lexer.lua
new file mode 100644
index 0000000..aafb07d
--- /dev/null
+++ b/Data/Libraries/LDoc/ldoc/lexer.lua
@@ -0,0 +1,504 @@
--- Lexical scanner for creating a sequence of tokens from text. <br>
-- <p><code>lexer.scan(s)</code> returns an iterator over all tokens found in the
-- string <code>s</code>. This iterator returns two values, a token type string
-- (such as 'string' for quoted string, 'iden' for identifier) and the value of the
-- token.
-- <p>
-- Versions specialized for Lua and C are available; these also handle block comments
-- and classify keywords as 'keyword' tokens. For example:
-- <pre class=example>
-- > s = 'for i=1,n do'
-- > for t,v in lexer.lua(s) do print(t,v) end
-- keyword for
-- iden i
-- = =
-- number 1
-- , ,
-- iden n
-- keyword do
-- </pre>
--
-- Based on pl.lexer from Penlight

local strfind = string.find
local strsub = string.sub
local append = table.insert

local function assert_arg(idx,val,tp)
    if type(val) ~= tp then
        error("argument "..idx.." must be "..tp, 2)
    end
end

local lexer = {}

local NUMBER1 = '^[%+%-]?%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER2 = '^[%+%-]?%d+%.?%d*'
local NUMBER3 = '^0x[%da-fA-F]+'
local NUMBER4 = '^%d+%.?%d*[eE][%+%-]?%d+'
local NUMBER5 = '^%d+%.?%d*'
local IDEN = '^[%a_][%w_]*'
local WSPACE = '^%s+'
local STRING1 = [[^'.-[^\\]']]
local STRING2 = [[^".-[^\\]"]]
local STRING3 = "^((['\"])%2)" -- empty string
local PREPRO = '^#.-[^\\]\n'

local plain_matches,lua_matches,cpp_matches,cpp_matches_no_string,lua_keyword,cpp_keyword

local function tdump(tok)
    return tok,tok
end

local function ndump(tok,options)
    if options and options.number then
        tok = tonumber(tok)
    end
    return "number",tok
end

-- regular strings, single or double quotes; usually we want them
-- without the quotes
local function sdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return "string",tok
end

-- strings enclosed in back ticks
local function bdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return "backtick",tok
end

-- long Lua strings need extra work to get rid of the quotes
local function sdump_l(tok,options)
    if options and options.string then
        tok = tok:sub(3,-3)
    end
    return "string",tok
end

local function chdump(tok,options)
    if options and options.string then
        tok = tok:sub(2,-2)
    end
    return "char",tok
end

local function cdump(tok)
    return 'comment',tok
end

local function wsdump (tok)
    return "space",tok
end

local function pdump (tok)
    return 'prepro',tok
end

local function plain_vdump(tok)
    return "iden",tok
end

local function lua_vdump(tok)
    if lua_keyword[tok] then
        return "keyword",tok
    else
        return "iden",tok
    end
end

local function cpp_vdump(tok)
    if cpp_keyword[tok] then
        return "keyword",tok
    else
        return "iden",tok
    end
end

local function count_lines(line, text)
    local index, limit = 1, #text
    while index <= limit do
        local start, stop = text:find('\r\n', index, true)
        if not start then
            start, stop = text:find('[\r\n\f]', index)
            if not start then break end
        end
        index = stop + 1
        line = line + 1
    end
    return line
end

local multiline = { comment = true, space = true }
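
-- The following is an illustrative sketch added for this writeup, not part of
-- the original module. It shows lexer.scan (defined below) with its default
-- filter and options: {space=true} skips whitespace tokens and {number=true}
-- converts numeric tokens to Lua numbers. The require path is an assumption;
-- adjust it to wherever this file sits on package.path.
--
-- <pre class=example>
-- > lexer = require 'ldoc.lexer'   -- assumed module path
-- > for t,v in lexer.scan('x = 10') do print(t,v) end
-- iden x
-- = =
-- number 10
-- </pre>
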
--- create a plain token iterator from a string or file-like object.
-- @param s the string
-- @param matches an optional match table (set of pattern-action pairs)
-- @param filter a table of token types to exclude, by default {space=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
function lexer.scan (s,matches,filter,options)
    --assert_arg(1,s,'string')
    local file = type(s) ~= 'string' and s
    filter = filter or {space=true}
    options = options or {number=true,string=true}
    if filter then
        if filter.space then filter[wsdump] = true end
        if filter.comments then
            filter[cdump] = true
        end
    end
    if not matches then
        if not plain_matches then
            plain_matches = {
                {WSPACE,wsdump},
                {NUMBER3,ndump},
                {IDEN,plain_vdump},
                {NUMBER1,ndump},
                {NUMBER2,ndump},
                {STRING3,sdump},
                {STRING1,sdump},
                {STRING2,sdump},
                {'^.',tdump}
            }
        end
        matches = plain_matches
    end
    local i1,i2,tok,pat,fun
    local line = 1
    if file then
        s = file:read()
        if not s then return nil end -- empty file
        if s:match '^\239\187' then -- UTF-8 BOM Abomination
            s = s:sub(4)
        end
        s = s ..'\n'
    end
    local sz = #s
    local idx = 1
    if sz == 0 then return nil end -- empty file

    local res = {}
    local mt = {}
    mt.__index = mt
    setmetatable(res,mt)

    function mt.lineno() return line end

    function mt.getline()
        if idx < sz then
            tok = strsub(s,idx,-2)
            idx = sz + 1
            line = line + 1
            return tok
        else
            idx = sz + 1
            line = line + 1
            return file:read()
        end
    end

    function mt.next (tok)
        local t,v = tok()
        while t == 'space' do
            t,v = tok()
        end
        return t,v
    end

    function mt.__call ()
        if not s then return end
        while true do
            for _,m in ipairs(matches) do
                pat,fun = m[1],m[2]
                if fun == nil then error("no match for "..pat) end
                i1,i2 = strfind(s,pat,idx)
                if i1 then
                    tok = strsub(s,i1,i2)
                    idx = i2 + 1
                    if not (filter and filter[fun]) then
                        lexer.finished = idx > sz
                        local t,v = fun(tok,options)
                        if not file and multiline[t] then
                            line = count_lines(line,v)
                        end
                        return t,v
                    end
                end
            end
            if idx > sz then
                if file then
                    line = line + 1
                    s = file:read()
                    if not s then return end
                    s = s .. '\n'
                    idx,sz = 1,#s
                else
                    return
                end
            end
        end
    end
    return res
end

--- get everything in a stream up to a newline.
-- @param tok a token stream
-- @return a string
function lexer.getline (tok)
    return tok:getline()
end

--- get current line number. <br>
-- Tracked by counting lines read from a file-like source, or newlines
-- within multiline tokens when scanning a string.
-- @param tok a token stream
-- @return the line number
function lexer.lineno (tok)
    return tok:lineno()
end

--- get the Lua keywords as a set-like table.
-- So <code>res["and"]</code> etc would be <code>true</code>.
-- @return a table
function lexer.get_keywords ()
    if not lua_keyword then
        lua_keyword = {
            ["and"] = true, ["break"] = true, ["do"] = true,
            ["else"] = true, ["elseif"] = true, ["end"] = true,
            ["false"] = true, ["for"] = true, ["function"] = true,
            ["if"] = true, ["in"] = true, ["local"] = true, ["nil"] = true,
            ["not"] = true, ["or"] = true, ["repeat"] = true,
            ["return"] = true, ["then"] = true, ["true"] = true,
            ["until"] = true, ["while"] = true
        }
    end
    return lua_keyword
end
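
-- Another illustrative sketch (added commentary, not in the original file):
-- the table returned by get_keywords above is set-like, so membership tests
-- are a plain index:
--
-- <pre class=example>
-- > keywords = lexer.get_keywords()
-- > = keywords["while"], keywords["continue"]
-- true nil
-- </pre>
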
--- create a Lua token iterator from a string or file-like object.
-- Will return the token type and value.
-- @param s the string
-- @param filter a table of token types to exclude, by default {space=true,comments=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
function lexer.lua(s,filter,options)
    filter = filter or {space=true,comments=true}
    lexer.get_keywords()
    if not lua_matches then
        lua_matches = {
            {WSPACE,wsdump},
            {NUMBER3,ndump},
            {IDEN,lua_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING1,sdump},
            {STRING2,sdump},
            {'^`[^`]+`',bdump},
            {'^%-%-%[(=*)%[.-%]%1%]',cdump},
            {'^%-%-.-\n',cdump},
            {'^%-%-.-$',cdump},
            {'^%[(=*)%[.-%]%1%]',sdump_l},
            {'^==',tdump},
            {'^~=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^%.%.%.',tdump},
            {'^%.%.',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,lua_matches,filter,options)
end

--- create a C/C++ token iterator from a string or file-like object.
-- Will return the token type and value.
-- @param s the string
-- @param filter a table of token types to exclude, by default {comments=true}
-- @param options a table of options; by default, {number=true,string=true},
-- which means convert numbers and strip string quotes.
-- @param no_string if true, do not match quoted strings
function lexer.cpp(s,filter,options,no_string)
    filter = filter or {comments=true}
    if not cpp_keyword then
        cpp_keyword = {
            ["class"] = true, ["break"] = true, ["do"] = true, ["sizeof"] = true,
            ["else"] = true, ["continue"] = true, ["struct"] = true,
            ["false"] = true, ["for"] = true, ["public"] = true, ["void"] = true,
            ["private"] = true, ["protected"] = true, ["goto"] = true,
            ["if"] = true, ["static"] = true, ["const"] = true, ["typedef"] = true,
            ["enum"] = true, ["char"] = true, ["int"] = true, ["bool"] = true,
            ["long"] = true, ["float"] = true, ["true"] = true, ["delete"] = true,
            ["double"] = true, ["while"] = true, ["new"] = true,
            ["namespace"] = true, ["try"] = true, ["catch"] = true,
            ["switch"] = true, ["case"] = true, ["extern"] = true,
            ["return"] = true, ["default"] = true, ["unsigned"] = true, ["signed"] = true,
            ["union"] = true, ["volatile"] = true, ["register"] = true, ["short"] = true,
        }
    end
    if not cpp_matches then
        cpp_matches = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {STRING3,sdump},
            {STRING1,chdump},
            {STRING2,sdump},
            {'^//.-\n',cdump},
            {'^//.-$',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^%.%.%.',tdump},
            {'^.',tdump}
        }
    end
    if not cpp_matches_no_string then
        cpp_matches_no_string = {
            {WSPACE,wsdump},
            {PREPRO,pdump},
            {NUMBER3,ndump},
            {IDEN,cpp_vdump},
            {NUMBER4,ndump},
            {NUMBER5,ndump},
            {'^//.-\n',cdump},
            {'^/%*.-%*/',cdump},
            {'^==',tdump},
            {'^!=',tdump},
            {'^<=',tdump},
            {'^>=',tdump},
            {'^->',tdump},
            {'^&&',tdump},
            {'^||',tdump},
            {'^%+%+',tdump},
            {'^%-%-',tdump},
            {'^%+=',tdump},
            {'^%-=',tdump},
            {'^%*=',tdump},
            {'^/=',tdump},
            {'^|=',tdump},
            {'^%^=',tdump},
            {'^::',tdump},
            {'^%.%.%.',tdump},
            {'^.',tdump}
        }
    end
    return lexer.scan(s,
        not no_string and cpp_matches or cpp_matches_no_string,
        filter,options)
end
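
-- An illustrative sketch of the C/C++ scanner above (added commentary, not in
-- the original file). Note that lexer.cpp only filters comments by default,
-- so a filter that also drops spaces is passed explicitly here:
--
-- <pre class=example>
-- > for t,v in lexer.cpp('int n = 0;', {space=true,comments=true}) do print(t,v) end
-- keyword int
-- iden n
-- = =
-- number 0
-- ; ;
-- </pre>
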
--- get a list of parameters separated by a delimiter from a stream.
-- @param tok the token stream
-- @param endtoken end of list (default ')'). Can be '\n'
-- @param delim separator (default ',')
-- @return a list of token lists.
function lexer.get_separated_list(tok,endtoken,delim)
    endtoken = endtoken or ')'
    delim = delim or ','
    local function tappend (tl,t,val)
        val = val or t
        append(tl,{t,val})
    end
    local is_end
    if endtoken == '\n' then
        is_end = function(t,val)
            return t == 'space' and val:find '\n'
        end
    else
        is_end = function (t)
            return t == endtoken
        end
    end
    local is_delim
    if type(delim) == 'function' then
        is_delim = delim
    else
        is_delim = function(t)
            return t == delim
        end
    end
    local parm_values = {}
    local level = 1 -- used to count ( and )
    local tl = {}
    local token,value
    while true do
        token,value = tok()
        if not token then return nil,'EOS' end -- end of stream is an error!
        if is_end(token,value) and level == 1 then
            if next(tl) then
                append(parm_values,tl)
            end
            break
        elseif token == '(' then
            level = level + 1
            tappend(tl,'(')
        elseif token == ')' then
            level = level - 1
            if level == 0 then -- finished with parm list
                append(parm_values,tl)
                break
            else
                tappend(tl,')')
            end
        elseif level == 1 and is_delim(token) then
            append(parm_values,tl) -- a new parm
            tl = {}
        else
            tappend(tl,token,value)
        end
    end
    return parm_values,{token,value}
end

--- get the next non-space token from the stream.
-- @param tok the token stream.
function lexer.skipws (tok)
    return tok:next()
end

local skipws = lexer.skipws

--- get the next token, which must be of the expected type.
-- Throws an error if this type does not match!
-- @param tok the token stream
-- @param expected_type the token type
-- @param no_skip_ws whether we should skip whitespace
function lexer.expecting (tok,expected_type,no_skip_ws)
    -- scan() returns a callable table rather than a plain function,
    -- so accept either form of token stream here
    if type(tok) ~= 'function' and type(tok) ~= 'table' then
        error("argument 1 must be a token stream", 2)
    end
    assert_arg(2,expected_type,'string')
    local t,v
    if no_skip_ws then
        t,v = tok()
    else
        t,v = skipws(tok)
    end
    if t ~= expected_type then error ("expecting "..expected_type,2) end
    return v
end

return lexer
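
-- A closing illustrative sketch (added commentary, not part of the module):
-- get_separated_list collects the tokens between delimiters once the opening
-- parenthesis has been consumed:
--
-- <pre class=example>
-- > tok = lexer.lua('(a, b, c)')
-- > tok()        -- consume the opening '('
-- ( (
-- > parms = lexer.get_separated_list(tok)
-- > = #parms
-- 3
-- </pre>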