1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
--[[--- A C lexical scanner using LPeg.
= CREDITS
= based on the C lexer in Peter Odding's lua-lxsh
@module macro.clexer
--]]
local clexer = {}
local lpeg = require 'lpeg'
local P, R, S, C, Cc, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Ct
-- wrap [patt] so a successful match captures the table {id, matched_text},
-- i.e. the token type followed by the exact input it consumed
local function token(id, patt)
    return Ct(Cc(id) * C(patt))
end
-- private interface
local table_of_tokens -- compiled LPeg grammar; built lazily by clexer.init()
local extra_tokens -- optional list of user-registered operator strings
--- register additional operator tokens to be recognized by the lexer.
-- @tparam table extra a list of token strings
function clexer.add_extra_tokens(extra)
    if not extra_tokens then
        extra_tokens = {}
    end
    for i = 1, #extra do
        extra_tokens[#extra_tokens + 1] = extra[i]
    end
    -- drop the compiled grammar so the next scan rebuilds it with the
    -- extra tokens included
    table_of_tokens = nil
end
--- build the compiled token grammar (table_of_tokens).
-- Called lazily from scan_c_tokenlist; add_extra_tokens() clears
-- table_of_tokens so the grammar is rebuilt with the extra operators.
-- (Unused helper classes upp/low/letter/alnum from the original were removed.)
function clexer.init ()
    local digit = R('09')
    local oct, dec = R'07', R'09'
    local hex = dec + R'AF' + R'af'
    local endline = S'\r\n\f'
    local newline = '\r\n' + endline
    -- escape sequences valid inside string/char literals: escaped line-end
    -- (line continuation), simple escapes, 1-3 octal digits, or \x + hex digits
    local escape = '\\' * ( newline
        + S'\\"\'?abfnrtv'
        + (#oct * oct^-3)
        + ('x' * #hex * hex^-2))
    -- range of valid characters after first character of identifier
    local idsafe = R('AZ', 'az', '\127\255') + P '_'
    -- operators; tokens registered via add_extra_tokens are tried first
    local OT = P '=='
    if extra_tokens then
        for _, ex in ipairs(extra_tokens) do
            OT = OT + P(ex)
        end
    end
    local operator = token('operator', OT + P '.' + P'>>=' + '<<=' + '--' + '>>' + '>=' + '/=' + '==' + '<='
        + '+=' + '<<' + '*=' + '++' + '&&' + '|=' + '||' + '!=' + '&=' + '-='
        + '^=' + '%=' + '->' + S',)*%+&(-~/^]{}|.[>!?:=<;')
    -- identifiers
    local ident = token('iden', idsafe * (idsafe + digit) ^ 0)
    -- keywords; the trailing guard stops a keyword matching a mere prefix
    -- of an identifier (e.g. 'int' inside 'interface')
    local keyword = token('keyword', (P 'auto' + P 'break' + P 'case' + P'char' +
        P 'const' + P 'continue' + P 'default' +
        P 'do' + P 'double' + P 'else' + P 'enum' + P 'extern' + P 'float' +
        P 'for' + P 'goto' + P 'if' + P 'int' + P 'long' + P 'register' +
        P 'return' + P 'short' + P 'signed' + P 'sizeof' + P 'static' +
        P 'struct' + P 'switch' + P 'typedef' + P 'union' + P 'void' +
        P 'volatile' + P 'while') * -(idsafe + digit))
    -- numbers. Fixes over the previous grammar:
    --  * exponent-only floats ('1e5') are now a single number token, not
    --    number '1' followed by identifier 'e5'
    --  * C integer suffixes (u/U/l/L) and float suffixes (f/F/l/L) are
    --    absorbed into the number token ('123UL', '0x1fUL', '1.5f')
    --  * the hex pattern reuses the 'hex' class instead of repeating it
    local number_sign = S'+-'^-1
    local exponent = S'eE' * number_sign * digit^1
    local int_suffix = S'uUlL'^0
    local float_suffix = S'fFlL'^-1
    local number_decimal = digit ^ 1 * int_suffix
    local number_hexadecimal = P '0' * S 'xX' * hex ^ 1 * int_suffix
    local number_float = ((digit^1 * P'.' * digit^0 + P'.' * digit^1) * exponent^-1
        + digit^1 * exponent) * float_suffix
    local number = token('number', number_hexadecimal +
        number_float +
        number_decimal)
    local string = token('string', '"' * ((1 - S'\\\r\n\f"') + escape)^0 * '"')
    local char = token('char', "'" * ((1 - S"\\\r\n\f'") + escape) * "'")
    -- comments
    local singleline_comment = P '//' * (1 - S '\r\n\f') ^ 0
    local multiline_comment = '/*' * (1 - P'*/')^0 * '*/'
    local comment = token('comment', multiline_comment + singleline_comment)
    -- preprocessor directive: '#' up to end of line
    local prepro = token('prepro', P '#' * (1 - S '\r\n\f') ^ 0)
    -- whitespace
    local whitespace = token('space', S('\r\n\f\t ')^1)
    -- ordered choice of all tokens and last-resort error which consumes one character
    local any_token = whitespace + number + keyword + ident +
        string + char + comment + prepro + operator + token('error', 1)
    table_of_tokens = Ct(any_token ^ 0)
end
-- increment [line] by the number of line-ends in [text].
-- Recognizes '\n', '\r', '\f', and counts the two-character '\r\n' once.
-- BUGFIX: the previous version searched the whole remainder for a plain
-- '\r\n' first, which skipped any single '\n'/'\r'/'\f' occurring before
-- the first CRLF pair (e.g. "a\nb\r\nc" counted 1 line-end instead of 2).
-- This version scans strictly left-to-right.
local function sync(line, text)
    local index = 1
    while true do
        local start = text:find('[\r\n\f]', index)
        if not start then break end
        line = line + 1
        -- treat CRLF as a single line ending
        if text:sub(start, start + 1) == '\r\n' then
            index = start + 2
        else
            index = start + 1
        end
    end
    return line
end
clexer.sync = sync -- exported so clients can re-sync line counts themselves
clexer.line = 0 -- current line; updated by the iterator returned from scan_c
-- we only need to synchronize the line-counter for these token types
local multiline_tokens = { comment = true, space = true }
clexer.multiline_tokens = multiline_tokens
--- scan a C source string into a list of tokens.
-- Each token is a table {type, text, line}; for 'operator' and 'error'
-- tokens the type field is replaced by the matched text itself.
-- @tparam string input C source text
-- @treturn table array of token tables
function clexer.scan_c_tokenlist(input)
    if not table_of_tokens then
        clexer.init()
    end
    assert(type(input) == 'string', 'bad argument #1 (expected string)')
    local line = 1
    local tokens = lpeg.match(table_of_tokens, input)
    -- BUGFIX: use ipairs, not pairs — line numbering depends on visiting
    -- the tokens in order, and pairs gives no ordering guarantee. Also
    -- renamed the loop variable: 'token' shadowed the local token() factory.
    for _, tok in ipairs(tokens) do
        local t = tok[1]
        if t == 'operator' or t == 'error' then
            tok[1] = tok[2]
        end
        tok[3] = line
        if multiline_tokens[t] then
            line = sync(line, tok[2])
        end
    end
    return tokens
end
--- get a token iterator from a source containing C code.
-- input is the source - either a string or a file-like object (i.e. an object
-- with a read() method).
-- Note that this token iterator includes spaces and comments, and does not convert
-- string and number tokens - so e.g. a string token is quoted and a number token is
-- an unconverted string.
function clexer.scan_c(input, name)
    -- slurp file-like objects into a string
    if type(input) ~= 'string' and input.read then
        input = input:read('*a')
    end
    local toks = clexer.scan_c_tokenlist(input)
    local pos, count = 1, #toks
    -- the iterator: called with no argument it yields type,text and advances;
    -- called with an offset k it peeks at the token k places away (as a raw
    -- token table) without advancing
    return function(k)
        if k ~= nil then
            local target = pos + k
            if target < 1 or target > count then return nil end
            return toks[target]
        end
        local current = toks[pos]
        pos = pos + 1
        if not current then return end
        clexer.line = current[3]
        clexer.name = name
        return current[1], current[2]
    end
end
return clexer
|