summaryrefslogtreecommitdiff
path: root/Tools/LuaMacro/macro/clexer.lua
blob: fd859a8755c02fc662edcebbe57a8c506c4434b6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
--[[--- A C lexical scanner using LPeg.
= CREDITS
= based on the C lexer in Peter Odding's lua-lxsh
@module macro.clexer
--]]

local clexer = {}
local lpeg = require 'lpeg'
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct

-- wrap [patt] so that a successful match produces the table
-- { id, matched_text } as a single capture
local function token(id, patt)
    return Ct(Cc(id) * C(patt))
end

-- private interface
local table_of_tokens
local extra_tokens

--- register additional operator tokens to be recognized by the lexer.
-- Forces the token grammar to be rebuilt on the next scan.
-- @param extra a list of operator strings
function clexer.add_extra_tokens(extra)
    extra_tokens = extra_tokens or {}
    for _, tok in ipairs(extra) do
        extra_tokens[#extra_tokens + 1] = tok
    end
    table_of_tokens = nil -- force re-initialization on next scan
end

--- build the LPeg grammar for C tokens.
-- Constructs `table_of_tokens`, an ordered choice over all token
-- patterns (including any extras registered via add_extra_tokens).
function clexer.init ()
    local digit = R'09'

    local oct      = R'07'
    local hex      = digit + R'AF' + R'af'
    local endline  = S'\r\n\f'
    local newline  = '\r\n' + endline
    -- escape sequences inside string/char literals, including
    -- backslash-newline line continuation
    local escape   = '\\' * ( newline
                        + S'\\"\'?abfnrtv'
                        + (#oct * oct^-3)
                        + ('x' * #hex * hex^-2))

    -- characters valid anywhere in an identifier (digits are added
    -- separately so an identifier cannot *start* with one)
    local idsafe = R('AZ', 'az', '\127\255') + P '_'

    -- operators; user-registered extra tokens get highest priority,
    -- and multi-character operators must precede their prefixes
    local OT = P '=='
    if extra_tokens then
        for _,ex in ipairs(extra_tokens) do
            OT = OT + P(ex)
        end
    end
    local operator = token('operator', OT + P '.' + P'>>=' + '<<=' + '--' + '>>' + '>=' + '/=' + '==' + '<='
    + '+=' + '<<' + '*=' + '++' + '&&' + '|=' + '||' + '!=' + '&=' + '-='
    + '^=' + '%=' + '->' + S',)*%+&(-~/^]{}|.[>!?:=<;')

    -- identifiers
    local ident = token('iden', idsafe * (idsafe + digit) ^ 0)

    -- keywords (C89 set); the trailing guard stops a keyword from
    -- matching as a mere prefix of an identifier (e.g. 'intx')
    local keyword = token('keyword', (P 'auto' + P 'break' + P 'case' + P'char' +
        P 'const' + P 'continue' + P 'default' +
        P 'do' + P 'double' + P 'else' + P 'enum' + P 'extern' + P 'float' +
        P 'for' + P 'goto' + P 'if' + P 'int' + P 'long' + P 'register' +
        P 'return' + P 'short' + P 'signed' + P 'sizeof' + P 'static' +
        P 'struct' + P 'switch' + P 'typedef' + P 'union' + P 'void' +
        P 'volatile' + P 'while') * -(idsafe + digit))

    -- numbers: hex first, then floats, then plain decimals.
    -- Floats accept '1.', '.5', '1.5e3' and exponent-only forms like
    -- '1e5' (previously lexed wrongly as number '1' + identifier 'e5').
    local number_sign = S'+-'^-1
    local number_decimal = digit ^ 1
    local number_hexadecimal = P '0' * S 'xX' * hex ^ 1
    local number_exponent = S'eE' * number_sign * digit^1
    local number_float = (digit^1 * P'.' * digit^0 + P'.' * digit^1) *
                         number_exponent^-1
                         + digit^1 * number_exponent
    local number = token('number', number_hexadecimal +
                                   number_float +
                                   number_decimal)

    -- string and character literals: single line only, escapes allowed
    -- (renamed from 'string'/'char' to avoid shadowing the string library)
    local cstring = token('string', '"' * ((1 - S'\\\r\n\f"') + escape)^0 * '"')
    local cchar = token('char',"'" * ((1 - S"\\\r\n\f'") + escape) * "'")

    -- comments and preprocessor lines
    local singleline_comment = P '//' * (1 - S '\r\n\f') ^ 0
    local multiline_comment = '/*' * (1 - P'*/')^0 * '*/'
    local comment = token('comment', multiline_comment + singleline_comment)
    local prepro = token('prepro', P '#' * (1 - S '\r\n\f') ^ 0)

    -- whitespace
    local whitespace = token('space', S('\r\n\f\t ')^1)

    -- ordered choice of all tokens, with a last-resort error token
    -- that consumes exactly one character so the scan always advances
    local any_token = whitespace + number + keyword + ident +
                      cstring + cchar + comment + prepro + operator + token('error', 1)

    table_of_tokens = Ct(any_token ^ 0)
end

-- increment [line] by the number of line-ends in [text].
-- A '\r\n' pair counts as a single line-end; lone '\r', '\n' and '\f'
-- each count as one. The previous implementation searched for '\r\n'
-- with a plain find first, which could jump past earlier lone '\n' or
-- '\r' characters and under-count lines.
local function sync(line, text)
   local index = 1
   while true do
      -- find the earliest line-end character of any kind
      local start, stop = text:find('[\r\n\f]', index)
      if not start then break end
      -- fold a '\r' immediately followed by '\n' into one line-end
      if text:sub(start, start) == '\r' and
         text:sub(start + 1, start + 1) == '\n' then
         stop = stop + 1
      end
      index = stop + 1
      line = line + 1
   end
   return line
end
clexer.sync = sync -- exported so client code can reuse the line counter

-- line number of the token most recently returned by a scan_c iterator
clexer.line = 0

-- we only need to synchronize the line-counter for these token types
-- NOTE(review): a string containing a backslash-newline continuation
-- could also span lines but is not synced here — verify if that matters
local multiline_tokens = { comment = true, space = true }
clexer.multiline_tokens = multiline_tokens

--- convert a string of C source into a list of tokens.
-- Each token is a table {type, text, line}; for 'operator' and 'error'
-- tokens the type slot is replaced by the matched text itself.
-- @param input C source text (string)
-- @return the list of token tables
function clexer.scan_c_tokenlist(input)
    if not table_of_tokens then
        clexer.init()
    end
    assert(type(input) == 'string', 'bad argument #1 (expected string)')
    local line = 1
    local tokens = lpeg.match(table_of_tokens, input)
    -- iterate with ipairs: line numbering depends on visiting tokens in
    -- sequence order, which pairs does not guarantee. The loop variable
    -- is also renamed so it no longer shadows the token() helper above.
    for _, tok in ipairs(tokens) do
        local t = tok[1]
        if t == 'operator' or t == 'error' then
            tok[1] = tok[2]
        end
        tok[3] = line
        if multiline_tokens[t] then
            line = sync(line, tok[2])
        end
    end
    return tokens
end

--- get a token iterator from a source containing C code.
-- @param input the source - either a string or a file-like object
-- (i.e. an object whose read() method returns the text)
-- @param name an optional name for the source, stored in clexer.name
-- Note that this iterator includes space and comment tokens, and does
-- not convert string or number tokens - a string token keeps its quotes
-- and a number token is the raw matched text.
-- Calling the iterator with a numeric argument k peeks at the token k
-- positions away from the current one without consuming anything.
function clexer.scan_c(input,name)
    if type(input) ~= 'string' and input.read then
        input = input:read('*a')
    end
    local tokens = clexer.scan_c_tokenlist(input)
    local pos, count = 1, #tokens
    return function(k)
        -- peek mode: relative lookahead/lookbehind, no state change
        if k ~= nil then
            local idx = pos + k
            if idx >= 1 and idx <= count then
                return tokens[idx]
            end
            return nil
        end
        -- consume mode: advance and report the next token
        local tok = tokens[pos]
        pos = pos + 1
        if not tok then return end
        clexer.line = tok[3]
        clexer.name = name
        return tok[1], tok[2]
    end

end

return clexer