--[[--- A Lua lexical scanner using LPeg.
= CREDITS
Written by Peter Odding, 2007/04/04

= THANKS TO
- the Lua authors for a wonderful language;
- Roberto for LPeg;
- caffeine for keeping me awake :)

= LICENSE
Shamelessly ripped from the SQLite[3] project:

   The author disclaims copyright to this source code.  In place of a legal
   notice, here is a blessing:

      May you do good and not evil.
      May you find forgiveness for yourself and forgive others.
      May you share freely, never taking more than you give.

@module macro.lexer
--]]

local lexer = {}
local lpeg = require 'lpeg'
local P, R, S, C, Cb, Cc, Cg, Cmt, Ct =
      lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cb, lpeg.Cc, lpeg.Cg, lpeg.Cmt, lpeg.Ct

-- create a pattern which captures the Lua value [id] and the input matching
-- [patt] in a table
local function token(id, patt) return Ct(Cc(id) * C(patt)) end
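-- For illustration (a sketch, not used by the module): each token is a
-- two-element table of the id followed by the matched text, e.g.
--   lpeg.match(token('number', R'09'^1), '42')  --> { 'number', '42' }
-- (scan_lua_tokenlist later appends the line number as a third element.)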

-- private interface
local table_of_tokens
local extra_tokens

function lexer.add_extra_tokens(extra)
    extra_tokens = extra_tokens or {}
    for _,t in ipairs(extra) do
        table.insert(extra_tokens,t)
    end
    table_of_tokens = nil -- re-initialize
end
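-- A sketch of intended use (the extra tokens shown are hypothetical):
--   local lexer = require 'macro.lexer'
--   lexer.add_extra_tokens { '::', ':>' }
-- Each extra string becomes an operator alternative on the next lexer.init(),
-- since table_of_tokens is rebuilt lazily by scan_lua_tokenlist.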

function lexer.init ()
    local digit = R('09')

    -- characters valid anywhere in an identifier (digits are additionally
    -- allowed after the first one): ASCII letters, underscore, and two-byte
    -- UTF-8 sequences with lead bytes \206-\223 (Greek, Cyrillic, etc.)
    --local idsafe = R('AZ', 'az', '\127\255') + P '_'
    local idsafe = R('AZ', 'az') + P '_' + R '\206\223' * R '\128\255'
    -- operators
    local OT = P '=='
    if extra_tokens then
        for _,ex in ipairs(extra_tokens) do
            OT = OT + P(ex)
        end
    end
    -- longer operators must come before their prefixes in this ordered
    -- choice, so '...' and '..' are tried ahead of the single '.' in the
    -- character set below
    local operator = token('operator', OT + P '~=' + P '<=' + P '>=' + P '...'
                                          + P '..' + S '+-*/%^#=<>;:,.{}[]()')
    -- identifiers
    local ident = token('iden', idsafe * (idsafe + digit) ^ 0)

    -- keywords
    local keyword = token('keyword', (P 'and' + P 'break' + P 'do' + P 'elseif' +
       P 'else' + P 'end' + P 'false' + P 'for' + P 'function' + P 'if' +
       P 'in' + P 'local' + P 'nil' + P 'not' + P 'or' + P 'repeat' + P 'return' +
       P 'then' + P 'true' + P 'until' + P 'while') * -(idsafe + digit))
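    -- e.g. 'iffy' lexes as a single iden token rather than keyword 'if'
    -- followed by 'fy': the trailing -(idsafe + digit) guard rejects a
    -- keyword match that is a prefix of a longer name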

    -- numbers
    local number_sign = S'+-'^-1
    local number_decimal = digit ^ 1
    local number_hexadecimal = P '0' * S 'xX' * R('09', 'AF', 'af') ^ 1
    local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                         (S'eE' * number_sign * digit^1)^-1
    local number = token('number', number_hexadecimal +
                                   number_float +
                                   number_decimal)
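    -- order matters in the choice above - hexadecimal, then float, then
    -- decimal - so (illustrative):
    --   '0x1F'    --> { 'number', '0x1F' }
    --   '3.14e-2' --> { 'number', '3.14e-2' }, one token, not '3' '.' '14e-2'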

    -- callback for [=[ long strings ]=]
    -- ps. LPeg is for Lua what regex is for Perl, which makes me smile :)
    local equals  = P '=' ^ 0
    local open    = P '[' * Cg(equals, "init") * P '[' * P '\n' ^ -1
    local close   = P ']' * C(equals) * P ']'
    local closeeq = Cmt(close * Cb "init", function (s, i, a, b) return a == b end)
    local longstring = open * C((P(1) - closeeq)^0) * close --/ 1
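    -- sketch of the level check: matching '[==[ ... ]==]' stores '==' in the
    -- "init" group capture; closeeq then has Cmt compare each candidate
    -- closer's run of equals signs against that back-capture, so a ']=]'
    -- inside a [==[ ]==] string does not terminate it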

    -- strings
    local singlequoted_string = P "'" * ((1 - S "'\r\n\f\\") + (P '\\' * 1)) ^ 0 * "'"
    local doublequoted_string = P '"' * ((1 - S '"\r\n\f\\') + (P '\\' * 1)) ^ 0 * '"'
    local string = token('string', singlequoted_string +
                                   doublequoted_string +
                                   longstring)
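    -- the (P '\\' * 1) branch consumes any escaped character, so an escaped
    -- quote does not end the string: e.g. '"a\"b"' is one string token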

    -- comments
    local singleline_comment = P '--' * (1 - S '\r\n\f') ^ 0
    local multiline_comment = P '--' * longstring
    local comment = token('comment', multiline_comment + singleline_comment)

    -- whitespace
    local whitespace = token('space', S('\r\n\f\t ')^1)

    -- ordered choice of all tokens and last-resort error which consumes one character
    local any_token = whitespace + number + keyword + ident +
                      string + comment + operator + token('error', 1)
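    -- keyword must be tried before ident: 'while' also matches the more
    -- general identifier pattern, so reversing the order would make every
    -- keyword lex as an iden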


    table_of_tokens = Ct(any_token ^ 0)
end

-- increment [line] by the number of line-ends in [text]
local function sync(line, text)
    local index, limit = 1, #text
    while index <= limit do
        local start, stop = text:find('\r\n', index, true)
        if not start then
            start, stop = text:find('[\r\n\f]', index)
            if not start then break end
        end
        index = stop + 1
        line = line + 1
    end
    return line
end
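-- worked example: sync(1, 'a\r\nb\nc') returns 3 - the '\r\n' pair counts
-- once (found first by the plain find) and the lone '\n' counts once more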
lexer.sync = sync

lexer.line = 0

-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, string = true, space = true }
lexer.multiline_tokens = multiline_tokens

function lexer.scan_lua_tokenlist(input)
    if not table_of_tokens then
        lexer.init()
    end
    assert(type(input) == 'string', 'bad argument #1 (expected string)')
    local line = 1
    local tokens = lpeg.match(table_of_tokens, input)
    for _, token in ipairs(tokens) do
        local t = token[1]
        if t == 'operator' or t == 'error' then
            token[1] = token[2]
        end
        token[3] = line
        if multiline_tokens[t] then
            line = sync(line, token[2])
        end
    end
    return tokens
end
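-- shape of the result (illustrative): scan_lua_tokenlist 'x=1' produces
--   { {'iden','x',1}, {'=','=',1}, {'number','1',1} }
-- note that operator (and error) tokens get their type replaced by the
-- matched text itself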

--- get a token iterator from a source containing Lua code.
-- Note that this token iterator includes spaces and comments, and does not convert
-- string and number tokens - a string token keeps its quotes, and a number token
-- is the raw unconverted text.
-- @param input the source - either a string, or a file-like object with a read
-- method (the whole source is read at once with read('*a'))
-- @param name a name for the source, recorded in lexer.name as tokens are read
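-- @usage
-- -- a minimal sketch: the returned iterator takes an optional relative
-- -- offset for peeking, so drive it with a while loop, not a generic for
-- local get = lexer.scan_lua('local x = 1')
-- local t, v = get()
-- while t do
--     print(t, v)
--     t, v = get()
-- end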
function lexer.scan_lua(input,name)
    if type(input) ~= 'string' and input.read then
        input = input:read('*a')
    end
    local tokens = lexer.scan_lua_tokenlist(input)
    local i, n = 1, #tokens
    return function(k)
        if k ~= nil then
            k = i + k
            if k < 1 or k > n then return nil end
            return tokens[k]
        end
        local tok = tokens[i]
        i = i + 1
        if tok then
            lexer.line = tok[3]
            lexer.name = name
            return tok[1],tok[2]
        end
    end
end

return lexer