diff options
Diffstat (limited to 'ThirdParty/lpeg-1.0.2')
-rw-r--r-- | ThirdParty/lpeg-1.0.2/HISTORY | 100 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpcap.c | 555 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpcap.h | 57 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpcode.c | 1014 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpcode.h | 40 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpeg-128.gif | bin | 0 -> 4923 bytes | |||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpeg.html | 1439 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpprint.c | 244 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpprint.h | 36 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lptree.c | 1305 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lptree.h | 82 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lptypes.h | 145 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpvm.c | 374 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/lpvm.h | 58 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/makefile | 55 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/re.html | 494 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/re.lua | 267 | ||||
-rw-r--r-- | ThirdParty/lpeg-1.0.2/test.lua | 1523 |
18 files changed, 7788 insertions, 0 deletions
diff --git a/ThirdParty/lpeg-1.0.2/HISTORY b/ThirdParty/lpeg-1.0.2/HISTORY new file mode 100644 index 0000000..66a8e14 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/HISTORY @@ -0,0 +1,100 @@ +HISTORY for LPeg 1.0.2 + +* Changes from version 1.0.1 to 1.0.2 + --------------------------------- + + some bugs fixed + +* Changes from version 0.12 to 1.0.1 + --------------------------------- + + group "names" can be any Lua value + + some bugs fixed + + other small improvements + +* Changes from version 0.11 to 0.12 + --------------------------------- + + no "unsigned short" limit for pattern sizes + + mathtime captures considered nullable + + some bugs fixed + +* Changes from version 0.10 to 0.11 + ------------------------------- + + complete reimplementation of the code generator + + new syntax for table captures + + new functions in module 're' + + other small improvements + +* Changes from version 0.9 to 0.10 + ------------------------------- + + backtrack stack has configurable size + + better error messages + + Notation for non-terminals in 're' back to A instead o <A> + + experimental look-behind pattern + + support for external extensions + + works with Lua 5.2 + + consumes less C stack + + - "and" predicates do not keep captures + +* Changes from version 0.8 to 0.9 + ------------------------------- + + The accumulator capture was replaced by a fold capture; + programs that used the old 'lpeg.Ca' will need small changes. + + Some support for character classes from old C locales. + + A new named-group capture. + +* Changes from version 0.7 to 0.8 + ------------------------------- + + New "match-time" capture. + + New "argument capture" that allows passing arguments into the pattern. + + Better documentation for 're'. + + Several small improvements for 're'. + + The 're' module has an incompatibility with previous versions: + now, any use of a non-terminal must be enclosed in angle brackets + (like <B>). + +* Changes from version 0.6 to 0.7 + ------------------------------- + + Several improvements in module 're': + - better documentation; + - support for most captures (all but accumulator); + - limited repetitions p{n,m}. + + Small improvements in efficiency. + + Several small bugs corrected (special thanks to Hans Hagen + and Taco Hoekwater). + +* Changes from version 0.5 to 0.6 + ------------------------------- + + Support for non-numeric indices in grammars. + + Some bug fixes (thanks to the luatex team). + + Some new optimizations; (thanks to Mike Pall). + + A new page layout (thanks to Andre Carregal). + + Minimal documentation for module 're'. + +* Changes from version 0.4 to 0.5 + ------------------------------- + + Several optimizations. + + lpeg.P now accepts booleans. + + Some new examples. + + A proper license. + + Several small improvements. + +* Changes from version 0.3 to 0.4 + ------------------------------- + + Static check for loops in repetitions and grammars. + + Removed label option in captures. + + The implementation of captures uses less memory. + +* Changes from version 0.2 to 0.3 + ------------------------------- + + User-defined patterns in Lua. + + Several new captures. + +* Changes from version 0.1 to 0.2 + ------------------------------- + + Several small corrections. + + Handles embedded zeros like any other character. + + Capture "name" can be any Lua value. + + Unlimited number of captures. + + Match gets an optional initial position. + +(end of HISTORY) diff --git a/ThirdParty/lpeg-1.0.2/lpcap.c b/ThirdParty/lpeg-1.0.2/lpcap.c new file mode 100644 index 0000000..b332fde --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpcap.c @@ -0,0 +1,555 @@ +/* +** $Id: lpcap.c $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include "lua.h" +#include "lauxlib.h" + +#include "lpcap.h" +#include "lptypes.h" + + +#define captype(cap) ((cap)->kind) + +#define isclosecap(cap) (captype(cap) == Cclose) + +#define closeaddr(c) ((c)->s + (c)->siz - 1) + +#define isfullcap(cap) ((cap)->siz != 0) + +#define getfromktable(cs,v) lua_rawgeti((cs)->L, ktableidx((cs)->ptop), v) + +#define pushluaval(cs) getfromktable(cs, (cs)->cap->idx) + + + +/* +** Put at the cache for Lua values the value indexed by 'v' in ktable +** of the running pattern (if it is not there yet); returns its index. +*/ +static int updatecache (CapState *cs, int v) { + int idx = cs->ptop + 1; /* stack index of cache for Lua values */ + if (v != cs->valuecached) { /* not there? */ + getfromktable(cs, v); /* get value from 'ktable' */ + lua_replace(cs->L, idx); /* put it at reserved stack position */ + cs->valuecached = v; /* keep track of what is there */ + } + return idx; +} + + +static int pushcapture (CapState *cs); + + +/* +** Goes back in a list of captures looking for an open capture +** corresponding to a close +*/ +static Capture *findopen (Capture *cap) { + int n = 0; /* number of closes waiting an open */ + for (;;) { + cap--; + if (isclosecap(cap)) n++; /* one more open to skip */ + else if (!isfullcap(cap)) + if (n-- == 0) return cap; + } +} + + +/* +** Go to the next capture +*/ +static void nextcap (CapState *cs) { + Capture *cap = cs->cap; + if (!isfullcap(cap)) { /* not a single capture? */ + int n = 0; /* number of opens waiting a close */ + for (;;) { /* look for corresponding close */ + cap++; + if (isclosecap(cap)) { + if (n-- == 0) break; + } + else if (!isfullcap(cap)) n++; + } + } + cs->cap = cap + 1; /* + 1 to skip last close (or entire single capture) */ +} + + +/* +** Push on the Lua stack all values generated by nested captures inside +** the current capture. Returns number of values pushed. 'addextra' +** makes it push the entire match after all captured values. The +** entire match is pushed also if there are no other nested values, +** so the function never returns zero. +*/ +static int pushnestedvalues (CapState *cs, int addextra) { + Capture *co = cs->cap; + if (isfullcap(cs->cap++)) { /* no nested captures? */ + lua_pushlstring(cs->L, co->s, co->siz - 1); /* push whole match */ + return 1; /* that is it */ + } + else { + int n = 0; + while (!isclosecap(cs->cap)) /* repeat for all nested patterns */ + n += pushcapture(cs); + if (addextra || n == 0) { /* need extra? */ + lua_pushlstring(cs->L, co->s, cs->cap->s - co->s); /* push whole match */ + n++; + } + cs->cap++; /* skip close entry */ + return n; + } +} + + +/* +** Push only the first value generated by nested captures +*/ +static void pushonenestedvalue (CapState *cs) { + int n = pushnestedvalues(cs, 0); + if (n > 1) + lua_pop(cs->L, n - 1); /* pop extra values */ +} + + +/* +** Try to find a named group capture with the name given at the top of +** the stack; goes backward from 'cap'. +*/ +static Capture *findback (CapState *cs, Capture *cap) { + lua_State *L = cs->L; + while (cap-- > cs->ocap) { /* repeat until end of list */ + if (isclosecap(cap)) + cap = findopen(cap); /* skip nested captures */ + else if (!isfullcap(cap)) + continue; /* opening an enclosing capture: skip and get previous */ + if (captype(cap) == Cgroup) { + getfromktable(cs, cap->idx); /* get group name */ + if (lp_equal(L, -2, -1)) { /* right group? */ + lua_pop(L, 2); /* remove reference name and group name */ + return cap; + } + else lua_pop(L, 1); /* remove group name */ + } + } + luaL_error(L, "back reference '%s' not found", lua_tostring(L, -1)); + return NULL; /* to avoid warnings */ +} + + +/* +** Back-reference capture. Return number of values pushed. +*/ +static int backrefcap (CapState *cs) { + int n; + Capture *curr = cs->cap; + pushluaval(cs); /* reference name */ + cs->cap = findback(cs, curr); /* find corresponding group */ + n = pushnestedvalues(cs, 0); /* push group's values */ + cs->cap = curr + 1; + return n; +} + + +/* +** Table capture: creates a new table and populates it with nested +** captures. +*/ +static int tablecap (CapState *cs) { + lua_State *L = cs->L; + int n = 0; + lua_newtable(L); + if (isfullcap(cs->cap++)) + return 1; /* table is empty */ + while (!isclosecap(cs->cap)) { + if (captype(cs->cap) == Cgroup && cs->cap->idx != 0) { /* named group? */ + pushluaval(cs); /* push group name */ + pushonenestedvalue(cs); + lua_settable(L, -3); + } + else { /* not a named group */ + int i; + int k = pushcapture(cs); + for (i = k; i > 0; i--) /* store all values into table */ + lua_rawseti(L, -(i + 1), n + i); + n += k; + } + } + cs->cap++; /* skip close entry */ + return 1; /* number of values pushed (only the table) */ +} + + +/* +** Table-query capture +*/ +static int querycap (CapState *cs) { + int idx = cs->cap->idx; + pushonenestedvalue(cs); /* get nested capture */ + lua_gettable(cs->L, updatecache(cs, idx)); /* query cap. value at table */ + if (!lua_isnil(cs->L, -1)) + return 1; + else { /* no value */ + lua_pop(cs->L, 1); /* remove nil */ + return 0; + } +} + + +/* +** Fold capture +*/ +static int foldcap (CapState *cs) { + int n; + lua_State *L = cs->L; + int idx = cs->cap->idx; + if (isfullcap(cs->cap++) || /* no nested captures? */ + isclosecap(cs->cap) || /* no nested captures (large subject)? */ + (n = pushcapture(cs)) == 0) /* nested captures with no values? */ + return luaL_error(L, "no initial value for fold capture"); + if (n > 1) + lua_pop(L, n - 1); /* leave only one result for accumulator */ + while (!isclosecap(cs->cap)) { + lua_pushvalue(L, updatecache(cs, idx)); /* get folding function */ + lua_insert(L, -2); /* put it before accumulator */ + n = pushcapture(cs); /* get next capture's values */ + lua_call(L, n + 1, 1); /* call folding function */ + } + cs->cap++; /* skip close entry */ + return 1; /* only accumulator left on the stack */ +} + + +/* +** Function capture +*/ +static int functioncap (CapState *cs) { + int n; + int top = lua_gettop(cs->L); + pushluaval(cs); /* push function */ + n = pushnestedvalues(cs, 0); /* push nested captures */ + lua_call(cs->L, n, LUA_MULTRET); /* call function */ + return lua_gettop(cs->L) - top; /* return function's results */ +} + + +/* +** Select capture +*/ +static int numcap (CapState *cs) { + int idx = cs->cap->idx; /* value to select */ + if (idx == 0) { /* no values? */ + nextcap(cs); /* skip entire capture */ + return 0; /* no value produced */ + } + else { + int n = pushnestedvalues(cs, 0); + if (n < idx) /* invalid index? */ + return luaL_error(cs->L, "no capture '%d'", idx); + else { + lua_pushvalue(cs->L, -(n - idx + 1)); /* get selected capture */ + lua_replace(cs->L, -(n + 1)); /* put it in place of 1st capture */ + lua_pop(cs->L, n - 1); /* remove other captures */ + return 1; + } + } +} + + +/* +** Return the stack index of the first runtime capture in the given +** list of captures (or zero if no runtime captures) +*/ +int finddyncap (Capture *cap, Capture *last) { + for (; cap < last; cap++) { + if (cap->kind == Cruntime) + return cap->idx; /* stack position of first capture */ + } + return 0; /* no dynamic captures in this segment */ +} + + +/* +** Calls a runtime capture. Returns number of captures "removed" by the +** call, that is, those inside the group capture. Captures to be added +** are on the Lua stack. +*/ +int runtimecap (CapState *cs, Capture *close, const char *s, int *rem) { + int n, id; + lua_State *L = cs->L; + int otop = lua_gettop(L); + Capture *open = findopen(close); /* get open group capture */ + assert(captype(open) == Cgroup); + id = finddyncap(open, close); /* get first dynamic capture argument */ + close->kind = Cclose; /* closes the group */ + close->s = s; + cs->cap = open; cs->valuecached = 0; /* prepare capture state */ + luaL_checkstack(L, 4, "too many runtime captures"); + pushluaval(cs); /* push function to be called */ + lua_pushvalue(L, SUBJIDX); /* push original subject */ + lua_pushinteger(L, s - cs->s + 1); /* push current position */ + n = pushnestedvalues(cs, 0); /* push nested captures */ + lua_call(L, n + 2, LUA_MULTRET); /* call dynamic function */ + if (id > 0) { /* are there old dynamic captures to be removed? */ + int i; + for (i = id; i <= otop; i++) + lua_remove(L, id); /* remove old dynamic captures */ + *rem = otop - id + 1; /* total number of dynamic captures removed */ + } + else + *rem = 0; /* no dynamic captures removed */ + return close - open - 1; /* number of captures to be removed */ +} + + +/* +** Auxiliary structure for substitution and string captures: keep +** information about nested captures for future use, avoiding to push +** string results into Lua +*/ +typedef struct StrAux { + int isstring; /* whether capture is a string */ + union { + Capture *cp; /* if not a string, respective capture */ + struct { /* if it is a string... */ + const char *s; /* ... starts here */ + const char *e; /* ... ends here */ + } s; + } u; +} StrAux; + +#define MAXSTRCAPS 10 + +/* +** Collect values from current capture into array 'cps'. Current +** capture must be Cstring (first call) or Csimple (recursive calls). +** (In first call, fills %0 with whole match for Cstring.) +** Returns number of elements in the array that were filled. +*/ +static int getstrcaps (CapState *cs, StrAux *cps, int n) { + int k = n++; + cps[k].isstring = 1; /* get string value */ + cps[k].u.s.s = cs->cap->s; /* starts here */ + if (!isfullcap(cs->cap++)) { /* nested captures? */ + while (!isclosecap(cs->cap)) { /* traverse them */ + if (n >= MAXSTRCAPS) /* too many captures? */ + nextcap(cs); /* skip extra captures (will not need them) */ + else if (captype(cs->cap) == Csimple) /* string? */ + n = getstrcaps(cs, cps, n); /* put info. into array */ + else { + cps[n].isstring = 0; /* not a string */ + cps[n].u.cp = cs->cap; /* keep original capture */ + nextcap(cs); + n++; + } + } + cs->cap++; /* skip close */ + } + cps[k].u.s.e = closeaddr(cs->cap - 1); /* ends here */ + return n; +} + + +/* +** add next capture value (which should be a string) to buffer 'b' +*/ +static int addonestring (luaL_Buffer *b, CapState *cs, const char *what); + + +/* +** String capture: add result to buffer 'b' (instead of pushing +** it into the stack) +*/ +static void stringcap (luaL_Buffer *b, CapState *cs) { + StrAux cps[MAXSTRCAPS]; + int n; + size_t len, i; + const char *fmt; /* format string */ + fmt = lua_tolstring(cs->L, updatecache(cs, cs->cap->idx), &len); + n = getstrcaps(cs, cps, 0) - 1; /* collect nested captures */ + for (i = 0; i < len; i++) { /* traverse them */ + if (fmt[i] != '%') /* not an escape? */ + luaL_addchar(b, fmt[i]); /* add it to buffer */ + else if (fmt[++i] < '0' || fmt[i] > '9') /* not followed by a digit? */ + luaL_addchar(b, fmt[i]); /* add to buffer */ + else { + int l = fmt[i] - '0'; /* capture index */ + if (l > n) + luaL_error(cs->L, "invalid capture index (%d)", l); + else if (cps[l].isstring) + luaL_addlstring(b, cps[l].u.s.s, cps[l].u.s.e - cps[l].u.s.s); + else { + Capture *curr = cs->cap; + cs->cap = cps[l].u.cp; /* go back to evaluate that nested capture */ + if (!addonestring(b, cs, "capture")) + luaL_error(cs->L, "no values in capture index %d", l); + cs->cap = curr; /* continue from where it stopped */ + } + } + } +} + + +/* +** Substitution capture: add result to buffer 'b' +*/ +static void substcap (luaL_Buffer *b, CapState *cs) { + const char *curr = cs->cap->s; + if (isfullcap(cs->cap)) /* no nested captures? */ + luaL_addlstring(b, curr, cs->cap->siz - 1); /* keep original text */ + else { + cs->cap++; /* skip open entry */ + while (!isclosecap(cs->cap)) { /* traverse nested captures */ + const char *next = cs->cap->s; + luaL_addlstring(b, curr, next - curr); /* add text up to capture */ + if (addonestring(b, cs, "replacement")) + curr = closeaddr(cs->cap - 1); /* continue after match */ + else /* no capture value */ + curr = next; /* keep original text in final result */ + } + luaL_addlstring(b, curr, cs->cap->s - curr); /* add last piece of text */ + } + cs->cap++; /* go to next capture */ +} + + +/* +** Evaluates a capture and adds its first value to buffer 'b'; returns +** whether there was a value +*/ +static int addonestring (luaL_Buffer *b, CapState *cs, const char *what) { + switch (captype(cs->cap)) { + case Cstring: + stringcap(b, cs); /* add capture directly to buffer */ + return 1; + case Csubst: + substcap(b, cs); /* add capture directly to buffer */ + return 1; + default: { + lua_State *L = cs->L; + int n = pushcapture(cs); + if (n > 0) { + if (n > 1) lua_pop(L, n - 1); /* only one result */ + if (!lua_isstring(L, -1)) + luaL_error(L, "invalid %s value (a %s)", what, luaL_typename(L, -1)); + luaL_addvalue(b); + } + return n; + } + } +} + + +#if !defined(MAXRECLEVEL) +#define MAXRECLEVEL 200 +#endif + + +/* +** Push all values of the current capture into the stack; returns +** number of values pushed +*/ +static int pushcapture (CapState *cs) { + lua_State *L = cs->L; + int res; + luaL_checkstack(L, 4, "too many captures"); + if (cs->reclevel++ > MAXRECLEVEL) + return luaL_error(L, "subcapture nesting too deep"); + switch (captype(cs->cap)) { + case Cposition: { + lua_pushinteger(L, cs->cap->s - cs->s + 1); + cs->cap++; + res = 1; + break; + } + case Cconst: { + pushluaval(cs); + cs->cap++; + res = 1; + break; + } + case Carg: { + int arg = (cs->cap++)->idx; + if (arg + FIXEDARGS > cs->ptop) + return luaL_error(L, "reference to absent extra argument #%d", arg); + lua_pushvalue(L, arg + FIXEDARGS); + res = 1; + break; + } + case Csimple: { + int k = pushnestedvalues(cs, 1); + lua_insert(L, -k); /* make whole match be first result */ + res = k; + break; + } + case Cruntime: { + lua_pushvalue(L, (cs->cap++)->idx); /* value is in the stack */ + res = 1; + break; + } + case Cstring: { + luaL_Buffer b; + luaL_buffinit(L, &b); + stringcap(&b, cs); + luaL_pushresult(&b); + res = 1; + break; + } + case Csubst: { + luaL_Buffer b; + luaL_buffinit(L, &b); + substcap(&b, cs); + luaL_pushresult(&b); + res = 1; + break; + } + case Cgroup: { + if (cs->cap->idx == 0) /* anonymous group? */ + res = pushnestedvalues(cs, 0); /* add all nested values */ + else { /* named group: add no values */ + nextcap(cs); /* skip capture */ + res = 0; + } + break; + } + case Cbackref: res = backrefcap(cs); break; + case Ctable: res = tablecap(cs); break; + case Cfunction: res = functioncap(cs); break; + case Cnum: res = numcap(cs); break; + case Cquery: res = querycap(cs); break; + case Cfold: res = foldcap(cs); break; + default: assert(0); res = 0; + } + cs->reclevel--; + return res; +} + + +/* +** Prepare a CapState structure and traverse the entire list of +** captures in the stack pushing its results. 's' is the subject +** string, 'r' is the final position of the match, and 'ptop' +** the index in the stack where some useful values were pushed. +** Returns the number of results pushed. (If the list produces no +** results, push the final position of the match.) +*/ +int getcaptures (lua_State *L, const char *s, const char *r, int ptop) { + Capture *capture = (Capture *)lua_touserdata(L, caplistidx(ptop)); + int n = 0; + if (!isclosecap(capture)) { /* is there any capture? */ + CapState cs; + cs.ocap = cs.cap = capture; cs.L = L; cs.reclevel = 0; + cs.s = s; cs.valuecached = 0; cs.ptop = ptop; + do { /* collect their values */ + n += pushcapture(&cs); + } while (!isclosecap(cs.cap)); + } + if (n == 0) { /* no capture values? */ + lua_pushinteger(L, r - s + 1); /* return only end position */ + n = 1; + } + return n; +} + + diff --git a/ThirdParty/lpeg-1.0.2/lpcap.h b/ThirdParty/lpeg-1.0.2/lpcap.h new file mode 100644 index 0000000..dc10d69 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpcap.h @@ -0,0 +1,57 @@ +/* +** $Id: lpcap.h $ +*/ + +#if !defined(lpcap_h) +#define lpcap_h + + +#include "lptypes.h" + + +/* kinds of captures */ +typedef enum CapKind { + Cclose, /* not used in trees */ + Cposition, + Cconst, /* ktable[key] is Lua constant */ + Cbackref, /* ktable[key] is "name" of group to get capture */ + Carg, /* 'key' is arg's number */ + Csimple, /* next node is pattern */ + Ctable, /* next node is pattern */ + Cfunction, /* ktable[key] is function; next node is pattern */ + Cquery, /* ktable[key] is table; next node is pattern */ + Cstring, /* ktable[key] is string; next node is pattern */ + Cnum, /* numbered capture; 'key' is number of value to return */ + Csubst, /* substitution capture; next node is pattern */ + Cfold, /* ktable[key] is function; next node is pattern */ + Cruntime, /* not used in trees (is uses another type for tree) */ + Cgroup /* ktable[key] is group's "name" */ +} CapKind; + + +typedef struct Capture { + const char *s; /* subject position */ + unsigned short idx; /* extra info (group name, arg index, etc.) */ + byte kind; /* kind of capture */ + byte siz; /* size of full capture + 1 (0 = not a full capture) */ +} Capture; + + +typedef struct CapState { + Capture *cap; /* current capture */ + Capture *ocap; /* (original) capture list */ + lua_State *L; + int ptop; /* index of last argument to 'match' */ + const char *s; /* original string */ + int valuecached; /* value stored in cache slot */ + int reclevel; /* recursion level */ +} CapState; + + +int runtimecap (CapState *cs, Capture *close, const char *s, int *rem); +int getcaptures (lua_State *L, const char *s, const char *r, int ptop); +int finddyncap (Capture *cap, Capture *last); + +#endif + + diff --git a/ThirdParty/lpeg-1.0.2/lpcode.c b/ThirdParty/lpeg-1.0.2/lpcode.c new file mode 100644 index 0000000..3923459 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpcode.c @@ -0,0 +1,1014 @@ +/* +** $Id: lpcode.c $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include <limits.h> + + +#include "lua.h" +#include "lauxlib.h" + +#include "lptypes.h" +#include "lpcode.h" + + +/* signals a "no-instruction */ +#define NOINST -1 + + + +static const Charset fullset_ = + {{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}}; + +static const Charset *fullset = &fullset_; + +/* +** {====================================================== +** Analysis and some optimizations +** ======================================================= +*/ + +/* +** Check whether a charset is empty (returns IFail), singleton (IChar), +** full (IAny), or none of those (ISet). When singleton, '*c' returns +** which character it is. (When generic set, the set was the input, +** so there is no need to return it.) +*/ +static Opcode charsettype (const byte *cs, int *c) { + int count = 0; /* number of characters in the set */ + int i; + int candidate = -1; /* candidate position for the singleton char */ + for (i = 0; i < CHARSETSIZE; i++) { /* for each byte */ + int b = cs[i]; + if (b == 0) { /* is byte empty? */ + if (count > 1) /* was set neither empty nor singleton? */ + return ISet; /* neither full nor empty nor singleton */ + /* else set is still empty or singleton */ + } + else if (b == 0xFF) { /* is byte full? */ + if (count < (i * BITSPERCHAR)) /* was set not full? */ + return ISet; /* neither full nor empty nor singleton */ + else count += BITSPERCHAR; /* set is still full */ + } + else if ((b & (b - 1)) == 0) { /* has byte only one bit? */ + if (count > 0) /* was set not empty? */ + return ISet; /* neither full nor empty nor singleton */ + else { /* set has only one char till now; track it */ + count++; + candidate = i; + } + } + else return ISet; /* byte is neither empty, full, nor singleton */ + } + switch (count) { + case 0: return IFail; /* empty set */ + case 1: { /* singleton; find character bit inside byte */ + int b = cs[candidate]; + *c = candidate * BITSPERCHAR; + if ((b & 0xF0) != 0) { *c += 4; b >>= 4; } + if ((b & 0x0C) != 0) { *c += 2; b >>= 2; } + if ((b & 0x02) != 0) { *c += 1; } + return IChar; + } + default: { + assert(count == CHARSETSIZE * BITSPERCHAR); /* full set */ + return IAny; + } + } +} + + +/* +** A few basic operations on Charsets +*/ +static void cs_complement (Charset *cs) { + loopset(i, cs->cs[i] = ~cs->cs[i]); +} + +static int cs_equal (const byte *cs1, const byte *cs2) { + loopset(i, if (cs1[i] != cs2[i]) return 0); + return 1; +} + +static int cs_disjoint (const Charset *cs1, const Charset *cs2) { + loopset(i, if ((cs1->cs[i] & cs2->cs[i]) != 0) return 0;) + return 1; +} + + +/* +** If 'tree' is a 'char' pattern (TSet, TChar, TAny), convert it into a +** charset and return 1; else return 0. +*/ +int tocharset (TTree *tree, Charset *cs) { + switch (tree->tag) { + case TSet: { /* copy set */ + loopset(i, cs->cs[i] = treebuffer(tree)[i]); + return 1; + } + case TChar: { /* only one char */ + assert(0 <= tree->u.n && tree->u.n <= UCHAR_MAX); + loopset(i, cs->cs[i] = 0); /* erase all chars */ + setchar(cs->cs, tree->u.n); /* add that one */ + return 1; + } + case TAny: { + loopset(i, cs->cs[i] = 0xFF); /* add all characters to the set */ + return 1; + } + default: return 0; + } +} + + +/* +** Visit a TCall node taking care to stop recursion. If node not yet +** visited, return 'f(sib2(tree))', otherwise return 'def' (default +** value) +*/ +static int callrecursive (TTree *tree, int f (TTree *t), int def) { + int key = tree->key; + assert(tree->tag == TCall); + assert(sib2(tree)->tag == TRule); + if (key == 0) /* node already visited? */ + return def; /* return default value */ + else { /* first visit */ + int result; + tree->key = 0; /* mark call as already visited */ + result = f(sib2(tree)); /* go to called rule */ + tree->key = key; /* restore tree */ + return result; + } +} + + +/* +** Check whether a pattern tree has captures +*/ +int hascaptures (TTree *tree) { + tailcall: + switch (tree->tag) { + case TCapture: case TRunTime: + return 1; + case TCall: + return callrecursive(tree, hascaptures, 0); + case TRule: /* do not follow siblings */ + tree = sib1(tree); goto tailcall; + case TOpenCall: assert(0); + default: { + switch (numsiblings[tree->tag]) { + case 1: /* return hascaptures(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case 2: + if (hascaptures(sib1(tree))) + return 1; + /* else return hascaptures(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(numsiblings[tree->tag] == 0); return 0; + } + } + } +} + + +/* +** Checks how a pattern behaves regarding the empty string, +** in one of two different ways: +** A pattern is *nullable* if it can match without consuming any character; +** A pattern is *nofail* if it never fails for any string +** (including the empty string). +** The difference is only for predicates and run-time captures; +** for other patterns, the two properties are equivalent. +** (With predicates, &'a' is nullable but not nofail. Of course, +** nofail => nullable.) +** These functions are all convervative in the following way: +** p is nullable => nullable(p) +** nofail(p) => p cannot fail +** The function assumes that TOpenCall is not nullable; +** this will be checked again when the grammar is fixed. +** Run-time captures can do whatever they want, so the result +** is conservative. +*/ +int checkaux (TTree *tree, int pred) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: case TOpenCall: + return 0; /* not nullable */ + case TRep: case TTrue: + return 1; /* no fail */ + case TNot: case TBehind: /* can match empty, but can fail */ + if (pred == PEnofail) return 0; + else return 1; /* PEnullable */ + case TAnd: /* can match empty; fail iff body does */ + if (pred == PEnullable) return 1; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TRunTime: /* can fail; match empty iff body does */ + if (pred == PEnofail) return 0; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TSeq: + if (!checkaux(sib1(tree), pred)) return 0; + /* else return checkaux(sib2(tree), pred); */ + tree = sib2(tree); goto tailcall; + case TChoice: + if (checkaux(sib2(tree), pred)) return 1; + /* else return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TCapture: case TGrammar: case TRule: + /* return checkaux(sib1(tree), pred); */ + tree = sib1(tree); goto tailcall; + case TCall: /* return checkaux(sib2(tree), pred); */ + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + + +/* +** number of characters to match a pattern (or -1 if variable) +*/ +int fixedlen (TTree *tree) { + int len = 0; /* to accumulate in tail calls */ + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + return len + 1; + case TFalse: case TTrue: case TNot: case TAnd: case TBehind: + return len; + case TRep: case TRunTime: case TOpenCall: + return -1; + case TCapture: case TRule: case TGrammar: + /* return fixedlen(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case TCall: { + int n1 = callrecursive(tree, fixedlen, -1); + if (n1 < 0) + return -1; + else + return len + n1; + } + case TSeq: { + int n1 = fixedlen(sib1(tree)); + if (n1 < 0) + return -1; + /* else return fixedlen(sib2(tree)) + len; */ + len += n1; tree = sib2(tree); goto tailcall; + } + case TChoice: { + int n1 = fixedlen(sib1(tree)); + int n2 = fixedlen(sib2(tree)); + if (n1 != n2 || n1 < 0) + return -1; + else + return len + n1; + } + default: assert(0); return 0; + }; +} + + +/* +** Computes the 'first set' of a pattern. +** The result is a conservative aproximation: +** match p ax -> x (for some x) ==> a belongs to first(p) +** or +** a not in first(p) ==> match p ax -> fail (for all x) +** +** The set 'follow' is the first set of what follows the +** pattern (full set if nothing follows it). +** +** The function returns 0 when this resulting set can be used for +** test instructions that avoid the pattern altogether. +** A non-zero return can happen for two reasons: +** 1) match p '' -> '' ==> return has bit 1 set +** (tests cannot be used because they would always fail for an empty input); +** 2) there is a match-time capture ==> return has bit 2 set +** (optimizations should not bypass match-time captures). +*/ +static int getfirst (TTree *tree, const Charset *follow, Charset *firstset) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: { + tocharset(tree, firstset); + return 0; + } + case TTrue: { + loopset(i, firstset->cs[i] = follow->cs[i]); + return 1; /* accepts the empty string */ + } + case TFalse: { + loopset(i, firstset->cs[i] = 0); + return 0; + } + case TChoice: { + Charset csaux; + int e1 = getfirst(sib1(tree), follow, firstset); + int e2 = getfirst(sib2(tree), follow, &csaux); + loopset(i, firstset->cs[i] |= csaux.cs[i]); + return e1 | e2; + } + case TSeq: { + if (!nullable(sib1(tree))) { + /* when p1 is not nullable, p2 has nothing to contribute; + return getfirst(sib1(tree), fullset, firstset); */ + tree = sib1(tree); follow = fullset; goto tailcall; + } + else { /* FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) */ + Charset csaux; + int e2 = getfirst(sib2(tree), follow, &csaux); + int e1 = getfirst(sib1(tree), &csaux, firstset); + if (e1 == 0) return 0; /* 'e1' ensures that first can be used */ + else if ((e1 | e2) & 2) /* one of the children has a matchtime? */ + return 2; /* pattern has a matchtime capture */ + else return e2; /* else depends on 'e2' */ + } + } + case TRep: { + getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] |= follow->cs[i]); + return 1; /* accept the empty string */ + } + case TCapture: case TGrammar: case TRule: { + /* return getfirst(sib1(tree), follow, firstset); */ + tree = sib1(tree); goto tailcall; + } + case TRunTime: { /* function invalidates any follow info. */ + int e = getfirst(sib1(tree), fullset, firstset); + if (e) return 2; /* function is not "protected"? */ + else return 0; /* pattern inside capture ensures first can be used */ + } + case TCall: { + /* return getfirst(sib2(tree), follow, firstset); */ + tree = sib2(tree); goto tailcall; + } + case TAnd: { + int e = getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] &= follow->cs[i]); + return e; + } + case TNot: { + if (tocharset(sib1(tree), firstset)) { + cs_complement(firstset); + return 1; + } + /* else go through */ + } + case TBehind: { /* instruction gives no new information */ + /* call 'getfirst' only to check for math-time captures */ + int e = getfirst(sib1(tree), follow, firstset); + loopset(i, firstset->cs[i] = follow->cs[i]); /* uses follow */ + return e | 1; /* always can accept the empty string */ + } + default: assert(0); return 0; + } +} + + +/* +** If 'headfail(tree)' true, then 'tree' can fail only depending on the +** next character of the subject. +*/ +static int headfail (TTree *tree) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: case TFalse: + return 1; + case TTrue: case TRep: case TRunTime: case TNot: + case TBehind: + return 0; + case TCapture: case TGrammar: case TRule: case TAnd: + tree = sib1(tree); goto tailcall; /* return headfail(sib1(tree)); */ + case TCall: + tree = sib2(tree); goto tailcall; /* return headfail(sib2(tree)); */ + case TSeq: + if (!nofail(sib2(tree))) return 0; + /* else return headfail(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case TChoice: + if (!headfail(sib1(tree))) return 0; + /* else return headfail(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + + +/* +** Check whether the code generation for the given tree can benefit +** from a follow set (to avoid computing the follow set when it is +** not needed) +*/ +static int needfollow (TTree *tree) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: case TTrue: case TAnd: case TNot: + case TRunTime: case TGrammar: case TCall: case TBehind: + return 0; + case TChoice: case TRep: + return 1; + case TCapture: + tree = sib1(tree); goto tailcall; + case TSeq: + tree = sib2(tree); goto tailcall; + default: assert(0); return 0; + } +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Code generation +** ======================================================= +*/ + + +/* +** size of an instruction +*/ +int sizei (const Instruction *i) { + switch((Opcode)i->i.code) { + case ISet: case ISpan: return CHARSETINSTSIZE; + case ITestSet: return CHARSETINSTSIZE + 1; + case ITestChar: case ITestAny: case IChoice: case IJmp: case ICall: + case IOpenCall: case ICommit: case IPartialCommit: case IBackCommit: + return 2; + default: return 1; + } +} + + +/* +** state for the compiler +*/ +typedef struct CompileState { + Pattern *p; /* pattern being compiled */ + int ncode; /* next position in p->code to be filled */ + lua_State *L; +} CompileState; + + +/* +** code generation is recursive; 'opt' indicates that the code is being +** generated as the last thing inside an optional pattern (so, if that +** code is optional too, it can reuse the 'IChoice' already in place for +** the outer pattern). 'tt' points to a previous test protecting this +** code (or NOINST). 'fl' is the follow set of the pattern. +*/ +static void codegen (CompileState *compst, TTree *tree, int opt, int tt, + const Charset *fl); + + +void realloccode (lua_State *L, Pattern *p, int nsize) { + void *ud; + lua_Alloc f = lua_getallocf(L, &ud); + void *newblock = f(ud, p->code, p->codesize * sizeof(Instruction), + nsize * sizeof(Instruction)); + if (newblock == NULL && nsize > 0) + luaL_error(L, "not enough memory"); + p->code = (Instruction *)newblock; + p->codesize = nsize; +} + + +static int nextinstruction (CompileState *compst) { + int size = compst->p->codesize; + if (compst->ncode >= size) + realloccode(compst->L, compst->p, size * 2); + return compst->ncode++; +} + + +#define getinstr(cs,i) ((cs)->p->code[i]) + + +static int addinstruction (CompileState *compst, Opcode op, int aux) { + int i = nextinstruction(compst); + getinstr(compst, i).i.code = op; + getinstr(compst, i).i.aux = aux; + return i; +} + + +/* +** Add an instruction followed by space for an offset (to be set later) +*/ +static int addoffsetinst (CompileState *compst, Opcode op) { + int i = addinstruction(compst, op, 0); /* instruction */ + addinstruction(compst, (Opcode)0, 0); /* open space for offset */ + assert(op == ITestSet || sizei(&getinstr(compst, i)) == 2); + return i; +} + + +/* +** Set the offset of an instruction +*/ +static void setoffset (CompileState *compst, int instruction, int offset) { + getinstr(compst, instruction + 1).offset = offset; +} + + +/* +** Add a capture instruction: +** 'op' is the capture instruction; 'cap' the capture kind; +** 'key' the key into ktable; 'aux' is the optional capture offset +** +*/ +static int addinstcap (CompileState *compst, Opcode op, int cap, int key, + int aux) { + int i = addinstruction(compst, op, joinkindoff(cap, aux)); + getinstr(compst, i).i.key = key; + return i; +} + + +#define gethere(compst) ((compst)->ncode) + +#define target(code,i) ((i) + code[i + 1].offset) + + +/* +** Patch 'instruction' to jump to 'target' +*/ +static void jumptothere (CompileState *compst, int instruction, int target) { + if (instruction >= 0) + setoffset(compst, instruction, target - instruction); +} + + +/* +** Patch 'instruction' to jump to current position +*/ +static void jumptohere (CompileState *compst, int instruction) { + jumptothere(compst, instruction, gethere(compst)); +} + + +/* +** Code an IChar instruction, or IAny if there is an equivalent +** test dominating it +*/ +static void codechar (CompileState *compst, int c, int tt) { + if (tt >= 0 && getinstr(compst, tt).i.code == ITestChar && + getinstr(compst, tt).i.aux == c) + addinstruction(compst, IAny, 0); + else + addinstruction(compst, IChar, c); +} + + +/* +** Add a charset posfix to an instruction +*/ +static void addcharset (CompileState *compst, const byte *cs) { + int p = gethere(compst); + int i; + for (i = 0; i < (int)CHARSETINSTSIZE - 1; i++) + nextinstruction(compst); /* space for buffer */ + /* fill buffer with charset */ + loopset(j, getinstr(compst, p).buff[j] = cs[j]); +} + + +/* +** code a char set, optimizing unit sets for IChar, "complete" +** sets for IAny, and empty sets for IFail; also use an IAny +** when instruction is dominated by an equivalent test. +*/ +static void codecharset (CompileState *compst, const byte *cs, int tt) { + int c = 0; /* (=) to avoid warnings */ + Opcode op = charsettype(cs, &c); + switch (op) { + case IChar: codechar(compst, c, tt); break; + case ISet: { /* non-trivial set? */ + if (tt >= 0 && getinstr(compst, tt).i.code == ITestSet && + cs_equal(cs, getinstr(compst, tt + 2).buff)) + addinstruction(compst, IAny, 0); + else { + addinstruction(compst, ISet, 0); + addcharset(compst, cs); + } + break; + } + default: addinstruction(compst, op, c); break; + } +} + + +/* +** code a test set, optimizing unit sets for ITestChar, "complete" +** sets for ITestAny, and empty sets for IJmp (always fails). +** 'e' is true iff test should accept the empty string. (Test +** instructions in the current VM never accept the empty string.) +*/ +static int codetestset (CompileState *compst, Charset *cs, int e) { + if (e) return NOINST; /* no test */ + else { + int c = 0; + Opcode op = charsettype(cs->cs, &c); + switch (op) { + case IFail: return addoffsetinst(compst, IJmp); /* always jump */ + case IAny: return addoffsetinst(compst, ITestAny); + case IChar: { + int i = addoffsetinst(compst, ITestChar); + getinstr(compst, i).i.aux = c; + return i; + } + case ISet: { + int i = addoffsetinst(compst, ITestSet); + addcharset(compst, cs->cs); + return i; + } + default: assert(0); return 0; + } + } +} + + +/* +** Find the final destination of a sequence of jumps +*/ +static int finaltarget (Instruction *code, int i) { + while (code[i].i.code == IJmp) + i = target(code, i); + return i; +} + + +/* +** final label (after traversing any jumps) +*/ +static int finallabel (Instruction *code, int i) { + return finaltarget(code, target(code, i)); +} + + +/* +** <behind(p)> == behind n; <p> (where n = fixedlen(p)) +*/ +static void codebehind (CompileState *compst, TTree *tree) { + if (tree->u.n > 0) + addinstruction(compst, IBehind, tree->u.n); + codegen(compst, sib1(tree), 0, NOINST, fullset); +} + + +/* +** Choice; optimizations: +** - when p1 is headfail or +** when first(p1) and first(p2) are disjoint, than +** a character not in first(p1) cannot go to p1, and a character +** in first(p1) cannot go to p2 (at it is not in first(p2)). +** (The optimization is not valid if p1 accepts the empty string, +** as then there is no character at all...) +** - when p2 is empty and opt is true; a IPartialCommit can reuse +** the Choice already active in the stack. +*/ +static void codechoice (CompileState *compst, TTree *p1, TTree *p2, int opt, + const Charset *fl) { + int emptyp2 = (p2->tag == TTrue); + Charset cs1, cs2; + int e1 = getfirst(p1, fullset, &cs1); + if (headfail(p1) || + (!e1 && (getfirst(p2, fl, &cs2), cs_disjoint(&cs1, &cs2)))) { + /* <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: */ + int test = codetestset(compst, &cs1, 0); + int jmp = NOINST; + codegen(compst, p1, 0, test, fl); + if (!emptyp2) + jmp = addoffsetinst(compst, IJmp); + jumptohere(compst, test); + codegen(compst, p2, opt, NOINST, fl); + jumptohere(compst, jmp); + } + else if (opt && emptyp2) { + /* p1? == IPartialCommit; p1 */ + jumptohere(compst, addoffsetinst(compst, IPartialCommit)); + codegen(compst, p1, 1, NOINST, fullset); + } + else { + /* <p1 / p2> == + test(first(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: */ + int pcommit; + int test = codetestset(compst, &cs1, e1); + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, p1, emptyp2, test, fullset); + pcommit = addoffsetinst(compst, ICommit); + jumptohere(compst, pchoice); + jumptohere(compst, test); + codegen(compst, p2, opt, NOINST, fl); + jumptohere(compst, pcommit); + } +} + + +/* +** And predicate +** optimization: fixedlen(p) = n ==> <&p> == <p>; behind n +** (valid only when 'p' has no captures) +*/ +static void codeand (CompileState *compst, TTree *tree, int tt) { + int n = fixedlen(tree); + if (n >= 0 && n <= MAXBEHIND && !hascaptures(tree)) { + codegen(compst, tree, 0, tt, fullset); + if (n > 0) + addinstruction(compst, IBehind, n); + } + else { /* default: Choice L1; p1; BackCommit L2; L1: Fail; L2: */ + int pcommit; + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, tree, 0, tt, fullset); + pcommit = addoffsetinst(compst, IBackCommit); + jumptohere(compst, pchoice); + addinstruction(compst, IFail, 0); + jumptohere(compst, pcommit); + } +} + + +/* +** Captures: if pattern has fixed (and not too big) length, and it +** has no nested captures, use a single IFullCapture instruction +** after the match; otherwise, enclose the pattern with OpenCapture - +** CloseCapture. +*/ +static void codecapture (CompileState *compst, TTree *tree, int tt, + const Charset *fl) { + int len = fixedlen(sib1(tree)); + if (len >= 0 && len <= MAXOFF && !hascaptures(sib1(tree))) { + codegen(compst, sib1(tree), 0, tt, fl); + addinstcap(compst, IFullCapture, tree->cap, tree->key, len); + } + else { + addinstcap(compst, IOpenCapture, tree->cap, tree->key, 0); + codegen(compst, sib1(tree), 0, tt, fl); + addinstcap(compst, ICloseCapture, Cclose, 0, 0); + } +} + + +static void coderuntime (CompileState *compst, TTree *tree, int tt) { + addinstcap(compst, IOpenCapture, Cgroup, tree->key, 0); + codegen(compst, sib1(tree), 0, tt, fullset); + addinstcap(compst, ICloseRunTime, Cclose, 0, 0); +} + + +/* +** Repetion; optimizations: +** When pattern is a charset, can use special instruction ISpan. +** When pattern is head fail, or if it starts with characters that +** are disjoint from what follows the repetions, a simple test +** is enough (a fail inside the repetition would backtrack to fail +** again in the following pattern, so there is no need for a choice). +** When 'opt' is true, the repetion can reuse the Choice already +** active in the stack. +*/ +static void coderep (CompileState *compst, TTree *tree, int opt, + const Charset *fl) { + Charset st; + if (tocharset(tree, &st)) { + addinstruction(compst, ISpan, 0); + addcharset(compst, st.cs); + } + else { + int e1 = getfirst(tree, fullset, &st); + if (headfail(tree) || (!e1 && cs_disjoint(&st, fl))) { + /* L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: */ + int jmp; + int test = codetestset(compst, &st, 0); + codegen(compst, tree, 0, test, fullset); + jmp = addoffsetinst(compst, IJmp); + jumptohere(compst, test); + jumptothere(compst, jmp, test); + } + else { + /* test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: */ + /* or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; */ + int commit, l2; + int test = codetestset(compst, &st, e1); + int pchoice = NOINST; + if (opt) + jumptohere(compst, addoffsetinst(compst, IPartialCommit)); + else + pchoice = addoffsetinst(compst, IChoice); + l2 = gethere(compst); + codegen(compst, tree, 0, NOINST, fullset); + commit = addoffsetinst(compst, IPartialCommit); + jumptothere(compst, commit, l2); + jumptohere(compst, pchoice); + jumptohere(compst, test); + } + } +} + + +/* +** Not predicate; optimizations: +** In any case, if first test fails, 'not' succeeds, so it can jump to +** the end. If pattern is headfail, that is all (it cannot fail +** in other parts); this case includes 'not' of simple sets. Otherwise, +** use the default code (a choice plus a failtwice). +*/ +static void codenot (CompileState *compst, TTree *tree) { + Charset st; + int e = getfirst(tree, fullset, &st); + int test = codetestset(compst, &st, e); + if (headfail(tree)) /* test (fail(p1)) -> L1; fail; L1: */ + addinstruction(compst, IFail, 0); + else { + /* test(fail(p))-> L1; choice L1; <p>; failtwice; L1: */ + int pchoice = addoffsetinst(compst, IChoice); + codegen(compst, tree, 0, NOINST, fullset); + addinstruction(compst, IFailTwice, 0); + jumptohere(compst, pchoice); + } + jumptohere(compst, test); +} + + +/* +** change open calls to calls, using list 'positions' to find +** correct offsets; also optimize tail calls +*/ +static void correctcalls (CompileState *compst, int *positions, + int from, int to) { + int i; + Instruction *code = compst->p->code; + for (i = from; i < to; i += sizei(&code[i])) { + if (code[i].i.code == IOpenCall) { + int n = code[i].i.key; /* rule number */ + int rule = positions[n]; /* rule position */ + assert(rule == from || code[rule - 1].i.code == IRet); + if (code[finaltarget(code, i + 2)].i.code == IRet) /* call; ret ? */ + code[i].i.code = IJmp; /* tail call */ + else + code[i].i.code = ICall; + jumptothere(compst, i, rule); /* call jumps to respective rule */ + } + } + assert(i == to); +} + + +/* +** Code for a grammar: +** call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: +*/ +static void codegrammar (CompileState *compst, TTree *grammar) { + int positions[MAXRULES]; + int rulenumber = 0; + TTree *rule; + int firstcall = addoffsetinst(compst, ICall); /* call initial rule */ + int jumptoend = addoffsetinst(compst, IJmp); /* jump to the end */ + int start = gethere(compst); /* here starts the initial rule */ + jumptohere(compst, firstcall); + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + positions[rulenumber++] = gethere(compst); /* save rule position */ + codegen(compst, sib1(rule), 0, NOINST, fullset); /* code rule */ + addinstruction(compst, IRet, 0); + } + assert(rule->tag == TTrue); + jumptohere(compst, jumptoend); + correctcalls(compst, positions, start, gethere(compst)); +} + + +static void codecall (CompileState *compst, TTree *call) { + int c = addoffsetinst(compst, IOpenCall); /* to be corrected later */ + getinstr(compst, c).i.key = sib2(call)->cap; /* rule number */ + assert(sib2(call)->tag == TRule); +} + + +/* +** Code first child of a sequence +** (second child is called in-place to allow tail call) +** Return 'tt' for second child +*/ +static int codeseq1 (CompileState *compst, TTree *p1, TTree *p2, + int tt, const Charset *fl) { + if (needfollow(p1)) { + Charset fl1; + getfirst(p2, fl, &fl1); /* p1 follow is p2 first */ + codegen(compst, p1, 0, tt, &fl1); + } + else /* use 'fullset' as follow */ + codegen(compst, p1, 0, tt, fullset); + if (fixedlen(p1) != 0) /* can 'p1' consume anything? */ + return NOINST; /* invalidate test */ + else return tt; /* else 'tt' still protects sib2 */ +} + + +/* +** Main code-generation function: dispatch to auxiliar functions +** according to kind of tree. ('needfollow' should return true +** only for consructions that use 'fl'.) +*/ +static void codegen (CompileState *compst, TTree *tree, int opt, int tt, + const Charset *fl) { + tailcall: + switch (tree->tag) { + case TChar: codechar(compst, tree->u.n, tt); break; + case TAny: addinstruction(compst, IAny, 0); break; + case TSet: codecharset(compst, treebuffer(tree), tt); break; + case TTrue: break; + case TFalse: addinstruction(compst, IFail, 0); break; + case TChoice: codechoice(compst, sib1(tree), sib2(tree), opt, fl); break; + case TRep: coderep(compst, sib1(tree), opt, fl); break; + case TBehind: codebehind(compst, tree); break; + case TNot: codenot(compst, sib1(tree)); break; + case TAnd: codeand(compst, sib1(tree), tt); break; + case TCapture: codecapture(compst, tree, tt, fl); break; + case TRunTime: coderuntime(compst, tree, tt); break; + case TGrammar: codegrammar(compst, tree); break; + case TCall: codecall(compst, tree); break; + case TSeq: { + tt = codeseq1(compst, sib1(tree), sib2(tree), tt, fl); /* code 'p1' */ + /* codegen(compst, p2, opt, tt, fl); */ + tree = sib2(tree); goto tailcall; + } + default: assert(0); + } +} + + +/* +** Optimize jumps and other jump-like instructions. +** * Update labels of instructions with labels to their final +** destinations (e.g., choice L1; ... L1: jmp L2: becomes +** choice L2) +** * Jumps to other instructions that do jumps become those +** instructions (e.g., jump to return becomes a return; jump +** to commit becomes a commit) +*/ +static void peephole (CompileState *compst) { + Instruction *code = compst->p->code; + int i; + for (i = 0; i < compst->ncode; i += sizei(&code[i])) { + redo: + switch (code[i].i.code) { + case IChoice: case ICall: case ICommit: case IPartialCommit: + case IBackCommit: case ITestChar: case ITestSet: + case ITestAny: { /* instructions with labels */ + jumptothere(compst, i, finallabel(code, i)); /* optimize label */ + break; + } + case IJmp: { + int ft = finaltarget(code, i); + switch (code[ft].i.code) { /* jumping to what? */ + case IRet: case IFail: case IFailTwice: + case IEnd: { /* instructions with unconditional implicit jumps */ + code[i] = code[ft]; /* jump becomes that instruction */ + code[i + 1].i.code = IAny; /* 'no-op' for target position */ + break; + } + case ICommit: case IPartialCommit: + case IBackCommit: { /* inst. with unconditional explicit jumps */ + int fft = finallabel(code, ft); + code[i] = code[ft]; /* jump becomes that instruction... */ + jumptothere(compst, i, fft); /* but must correct its offset */ + goto redo; /* reoptimize its label */ + } + default: { + jumptothere(compst, i, ft); /* optimize label */ + break; + } + } + break; + } + default: break; + } + } + assert(code[i - 1].i.code == IEnd); +} + + +/* +** Compile a pattern +*/ +Instruction *compile (lua_State *L, Pattern *p) { + CompileState compst; + compst.p = p; compst.ncode = 0; compst.L = L; + realloccode(L, p, 2); /* minimum initial size */ + codegen(&compst, p->tree, 0, NOINST, fullset); + addinstruction(&compst, IEnd, 0); + realloccode(L, p, compst.ncode); /* set final size */ + peephole(&compst); + return p->code; +} + + +/* }====================================================== */ + diff --git a/ThirdParty/lpeg-1.0.2/lpcode.h b/ThirdParty/lpeg-1.0.2/lpcode.h new file mode 100644 index 0000000..34ee276 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpcode.h @@ -0,0 +1,40 @@ +/* +** $Id: lpcode.h $ +*/ + +#if !defined(lpcode_h) +#define lpcode_h + +#include "lua.h" + +#include "lptypes.h" +#include "lptree.h" +#include "lpvm.h" + +int tocharset (TTree *tree, Charset *cs); +int checkaux (TTree *tree, int pred); +int fixedlen (TTree *tree); +int hascaptures (TTree *tree); +int lp_gc (lua_State *L); +Instruction *compile (lua_State *L, Pattern *p); +void realloccode (lua_State *L, Pattern *p, int nsize); +int sizei (const Instruction *i); + + +#define PEnullable 0 +#define PEnofail 1 + +/* +** nofail(t) implies that 't' cannot fail with any input +*/ +#define nofail(t) checkaux(t, PEnofail) + +/* +** (not nullable(t)) implies 't' cannot match without consuming +** something +*/ +#define nullable(t) checkaux(t, PEnullable) + + + +#endif diff --git a/ThirdParty/lpeg-1.0.2/lpeg-128.gif b/ThirdParty/lpeg-1.0.2/lpeg-128.gif Binary files differnew file mode 100644 index 0000000..bbf5e78 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpeg-128.gif diff --git a/ThirdParty/lpeg-1.0.2/lpeg.html b/ThirdParty/lpeg-1.0.2/lpeg.html new file mode 100644 index 0000000..8b9f59c --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpeg.html @@ -0,0 +1,1439 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> + <title>LPeg - Parsing Expression Grammars For Lua</title> + <link rel="stylesheet" + href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css" + type="text/css"/> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> +</head> +<body> + +<!-- $Id: lpeg.html $ --> + +<div id="container"> + +<div id="product"> + <div id="product_logo"> + <a href="http://www.inf.puc-rio.br/~roberto/lpeg/"> + <img alt="LPeg logo" src="lpeg-128.gif"/></a> + + </div> + <div id="product_name"><big><strong>LPeg</strong></big></div> + <div id="product_description"> + Parsing Expression Grammars For Lua, version 1.0 + </div> +</div> <!-- id="product" --> + +<div id="main"> + +<div id="navigation"> +<h1>LPeg</h1> + +<ul> + <li><strong>Home</strong> + <ul> + <li><a href="#intro">Introduction</a></li> + <li><a href="#func">Functions</a></li> + <li><a href="#basic">Basic Constructions</a></li> + <li><a href="#grammar">Grammars</a></li> + <li><a href="#captures">Captures</a></li> + <li><a href="#ex">Some Examples</a></li> + <li><a href="re.html">The <code>re</code> Module</a></li> + <li><a href="#download">Download</a></li> + <li><a href="#license">License</a></li> + </ul> + </li> +</ul> +</div> <!-- id="navigation" --> + +<div id="content"> + + +<h2><a name="intro">Introduction</a></h2> + +<p> +<em>LPeg</em> is a new pattern-matching library for Lua, +based on +<a href="http://pdos.csail.mit.edu/%7Ebaford/packrat/"> +Parsing Expression Grammars</a> (PEGs). +This text is a reference manual for the library. +For a more formal treatment of LPeg, +as well as some discussion about its implementation, +see +<a href="http://www.inf.puc-rio.br/~roberto/docs/peg.pdf"> +A Text Pattern-Matching Tool based on Parsing Expression Grammars</a>. +(You may also be interested in my +<a href="http://vimeo.com/1485123">talk about LPeg</a> +given at the III Lua Workshop.) +</p> + +<p> +Following the Snobol tradition, +LPeg defines patterns as first-class objects. +That is, patterns are regular Lua values +(represented by userdata). +The library offers several functions to create +and compose patterns. +With the use of metamethods, +several of these functions are provided as infix or prefix +operators. +On the one hand, +the result is usually much more verbose than the typical +encoding of patterns using the so called +<em>regular expressions</em> +(which typically are not regular expressions in the formal sense). +On the other hand, +first-class patterns allow much better documentation +(as it is easy to comment the code, +to break complex definitions in smaller parts, etc.) +and are extensible, +as we can define new functions to create and compose patterns. +</p> + +<p> +For a quick glance of the library, +the following table summarizes its basic operations +for creating patterns: +</p> +<table border="1"> +<tbody><tr><td><b>Operator</b></td><td><b>Description</b></td></tr> +<tr><td><a href="#op-p"><code>lpeg.P(string)</code></a></td> + <td>Matches <code>string</code> literally</td></tr> +<tr><td><a href="#op-p"><code>lpeg.P(n)</code></a></td> + <td>Matches exactly <code>n</code> characters</td></tr> +<tr><td><a href="#op-s"><code>lpeg.S(string)</code></a></td> + <td>Matches any character in <code>string</code> (Set)</td></tr> +<tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td> + <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr> +<tr><td><a href="#op-pow"><code>patt^n</code></a></td> + <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr> +<tr><td><a href="#op-pow"><code>patt^-n</code></a></td> + <td>Matches at most <code>n</code> repetitions of <code>patt</code></td></tr> +<tr><td><a href="#op-mul"><code>patt1 * patt2</code></a></td> + <td>Matches <code>patt1</code> followed by <code>patt2</code></td></tr> +<tr><td><a href="#op-add"><code>patt1 + patt2</code></a></td> + <td>Matches <code>patt1</code> or <code>patt2</code> + (ordered choice)</td></tr> +<tr><td><a href="#op-sub"><code>patt1 - patt2</code></a></td> + <td>Matches <code>patt1</code> if <code>patt2</code> does not match</td></tr> +<tr><td><a href="#op-unm"><code>-patt</code></a></td> + <td>Equivalent to <code>("" - patt)</code></td></tr> +<tr><td><a href="#op-len"><code>#patt</code></a></td> + <td>Matches <code>patt</code> but consumes no input</td></tr> +<tr><td><a href="#op-behind"><code>lpeg.B(patt)</code></a></td> + <td>Matches <code>patt</code> behind the current position, + consuming no input</td></tr> +</tbody></table> + +<p>As a very simple example, +<code>lpeg.R("09")^1</code> creates a pattern that +matches a non-empty sequence of digits. +As a not so simple example, +<code>-lpeg.P(1)</code> +(which can be written as <code>lpeg.P(-1)</code>, +or simply <code>-1</code> for operations expecting a pattern) +matches an empty string only if it cannot match a single character; +so, it succeeds only at the end of the subject. +</p> + +<p> +LPeg also offers the <a href="re.html"><code>re</code> module</a>, +which implements patterns following a regular-expression style +(e.g., <code>[09]+</code>). +(This module is 260 lines of Lua code, +and of course it uses LPeg to parse regular expressions and +translate them to regular LPeg patterns.) +</p> + + +<h2><a name="func">Functions</a></h2> + + +<h3><a name="f-match"></a><code>lpeg.match (pattern, subject [, init])</code></h3> +<p> +The matching function. +It attempts to match the given pattern against the subject string. +If the match succeeds, +returns the index in the subject of the first character after the match, +or the <a href="#captures">captured values</a> +(if the pattern captured any value). +</p> + +<p> +An optional numeric argument <code>init</code> makes the match +start at that position in the subject string. +As usual in Lua libraries, +a negative value counts from the end. +</p> + +<p> +Unlike typical pattern-matching functions, +<code>match</code> works only in <em>anchored</em> mode; +that is, it tries to match the pattern with a prefix of +the given subject string (at position <code>init</code>), +not with an arbitrary substring of the subject. +So, if we want to find a pattern anywhere in a string, +we must either write a loop in Lua or write a pattern that +matches anywhere. +This second approach is easy and quite efficient; +see <a href="#ex">examples</a>. +</p> + +<h3><a name="f-type"></a><code>lpeg.type (value)</code></h3> +<p> +If the given value is a pattern, +returns the string <code>"pattern"</code>. +Otherwise returns nil. +</p> + +<h3><a name="f-version"></a><code>lpeg.version ()</code></h3> +<p> +Returns a string with the running version of LPeg. +</p> + +<h3><a name="f-setstack"></a><code>lpeg.setmaxstack (max)</code></h3> +<p> +Sets a limit for the size of the backtrack stack used by LPeg to +track calls and choices. +(The default limit is 400.) +Most well-written patterns need little backtrack levels and +therefore you seldom need to change this limit; +before changing it you should try to rewrite your +pattern to avoid the need for extra space. +Nevertheless, a few useful patterns may overflow. +Also, with recursive grammars, +subjects with deep recursion may also need larger limits. +</p> + + +<h2><a name="basic">Basic Constructions</a></h2> + +<p> +The following operations build patterns. +All operations that expect a pattern as an argument +may receive also strings, tables, numbers, booleans, or functions, +which are translated to patterns according to +the rules of function <a href="#op-p"><code>lpeg.P</code></a>. +</p> + + + +<h3><a name="op-p"></a><code>lpeg.P (value)</code></h3> +<p> +Converts the given value into a proper pattern, +according to the following rules: +</p> +<ul> + +<li><p> +If the argument is a pattern, +it is returned unmodified. +</p></li> + +<li><p> +If the argument is a string, +it is translated to a pattern that matches the string literally. +</p></li> + +<li><p> +If the argument is a non-negative number <em>n</em>, +the result is a pattern that matches exactly <em>n</em> characters. +</p></li> + +<li><p> +If the argument is a negative number <em>-n</em>, +the result is a pattern that +succeeds only if the input string has less than <em>n</em> characters left: +<code>lpeg.P(-n)</code> +is equivalent to <code>-lpeg.P(n)</code> +(see the <a href="#op-unm">unary minus operation</a>). +</p></li> + +<li><p> +If the argument is a boolean, +the result is a pattern that always succeeds or always fails +(according to the boolean value), +without consuming any input. +</p></li> + +<li><p> +If the argument is a table, +it is interpreted as a grammar +(see <a href="#grammar">Grammars</a>). +</p></li> + +<li><p> +If the argument is a function, +returns a pattern equivalent to a +<a href="#matchtime">match-time capture</a> over the empty string. +</p></li> + +</ul> + + +<h3><a name="op-behind"></a><code>lpeg.B(patt)</code></h3> +<p> +Returns a pattern that +matches only if the input string at the current position +is preceded by <code>patt</code>. +Pattern <code>patt</code> must match only strings +with some fixed length, +and it cannot contain captures. +</p> + +<p> +Like the <a href="#op-len">and predicate</a>, +this pattern never consumes any input, +independently of success or failure. +</p> + + +<h3><a name="op-r"></a><code>lpeg.R ({range})</code></h3> +<p> +Returns a pattern that matches any single character +belonging to one of the given <em>ranges</em>. +Each <code>range</code> is a string <em>xy</em> of length 2, +representing all characters with code +between the codes of <em>x</em> and <em>y</em> +(both inclusive). +</p> + +<p> +As an example, the pattern +<code>lpeg.R("09")</code> matches any digit, +and <code>lpeg.R("az", "AZ")</code> matches any ASCII letter. +</p> + + +<h3><a name="op-s"></a><code>lpeg.S (string)</code></h3> +<p> +Returns a pattern that matches any single character that +appears in the given string. +(The <code>S</code> stands for <em>Set</em>.) +</p> + +<p> +As an example, the pattern +<code>lpeg.S("+-*/")</code> matches any arithmetic operator. +</p> + +<p> +Note that, if <code>s</code> is a character +(that is, a string of length 1), +then <code>lpeg.P(s)</code> is equivalent to <code>lpeg.S(s)</code> +which is equivalent to <code>lpeg.R(s..s)</code>. +Note also that both <code>lpeg.S("")</code> and <code>lpeg.R()</code> +are patterns that always fail. +</p> + + +<h3><a name="op-v"></a><code>lpeg.V (v)</code></h3> +<p> +This operation creates a non-terminal (a <em>variable</em>) +for a grammar. +The created non-terminal refers to the rule indexed by <code>v</code> +in the enclosing grammar. +(See <a href="#grammar">Grammars</a> for details.) +</p> + + +<h3><a name="op-locale"></a><code>lpeg.locale ([table])</code></h3> +<p> +Returns a table with patterns for matching some character classes +according to the current locale. +The table has fields named +<code>alnum</code>, +<code>alpha</code>, +<code>cntrl</code>, +<code>digit</code>, +<code>graph</code>, +<code>lower</code>, +<code>print</code>, +<code>punct</code>, +<code>space</code>, +<code>upper</code>, and +<code>xdigit</code>, +each one containing a correspondent pattern. +Each pattern matches any single character that belongs to its class. +</p> + +<p> +If called with an argument <code>table</code>, +then it creates those fields inside the given table and +returns that table. +</p> + + +<h3><a name="op-len"></a><code>#patt</code></h3> +<p> +Returns a pattern that +matches only if the input string matches <code>patt</code>, +but without consuming any input, +independently of success or failure. +(This pattern is called an <em>and predicate</em> +and it is equivalent to +<em>&patt</em> in the original PEG notation.) +</p> + + +<p> +This pattern never produces any capture. +</p> + + +<h3><a name="op-unm"></a><code>-patt</code></h3> +<p> +Returns a pattern that +matches only if the input string does not match <code>patt</code>. +It does not consume any input, +independently of success or failure. +(This pattern is equivalent to +<em>!patt</em> in the original PEG notation.) +</p> + +<p> +As an example, the pattern +<code>-lpeg.P(1)</code> matches only the end of string. +</p> + +<p> +This pattern never produces any captures, +because either <code>patt</code> fails +or <code>-patt</code> fails. +(A failing pattern never produces captures.) +</p> + + +<h3><a name="op-add"></a><code>patt1 + patt2</code></h3> +<p> +Returns a pattern equivalent to an <em>ordered choice</em> +of <code>patt1</code> and <code>patt2</code>. +(This is denoted by <em>patt1 / patt2</em> in the original PEG notation, +not to be confused with the <code>/</code> operation in LPeg.) +It matches either <code>patt1</code> or <code>patt2</code>, +with no backtracking once one of them succeeds. +The identity element for this operation is the pattern +<code>lpeg.P(false)</code>, +which always fails. +</p> + +<p> +If both <code>patt1</code> and <code>patt2</code> are +character sets, +this operation is equivalent to set union. +</p> +<pre class="example"> +lower = lpeg.R("az") +upper = lpeg.R("AZ") +letter = lower + upper +</pre> + + +<h3><a name="op-sub"></a><code>patt1 - patt2</code></h3> +<p> +Returns a pattern equivalent to <em>!patt2 patt1</em>. +This pattern asserts that the input does not match +<code>patt2</code> and then matches <code>patt1</code>. +</p> + +<p> +When successful, +this pattern produces all captures from <code>patt1</code>. +It never produces any capture from <code>patt2</code> +(as either <code>patt2</code> fails or +<code>patt1 - patt2</code> fails). +</p> + +<p> +If both <code>patt1</code> and <code>patt2</code> are +character sets, +this operation is equivalent to set difference. +Note that <code>-patt</code> is equivalent to <code>"" - patt</code> +(or <code>0 - patt</code>). +If <code>patt</code> is a character set, +<code>1 - patt</code> is its complement. +</p> + + +<h3><a name="op-mul"></a><code>patt1 * patt2</code></h3> +<p> +Returns a pattern that matches <code>patt1</code> +and then matches <code>patt2</code>, +starting where <code>patt1</code> finished. +The identity element for this operation is the +pattern <code>lpeg.P(true)</code>, +which always succeeds. +</p> + +<p> +(LPeg uses the <code>*</code> operator +[instead of the more obvious <code>..</code>] +both because it has +the right priority and because in formal languages it is +common to use a dot for denoting concatenation.) +</p> + + +<h3><a name="op-pow"></a><code>patt^n</code></h3> +<p> +If <code>n</code> is nonnegative, +this pattern is +equivalent to <em>patt<sup>n</sup> patt*</em>: +It matches <code>n</code> or more occurrences of <code>patt</code>. +</p> + +<p> +Otherwise, when <code>n</code> is negative, +this pattern is equivalent to <em>(patt?)<sup>-n</sup></em>: +It matches at most <code>|n|</code> +occurrences of <code>patt</code>. +</p> + +<p> +In particular, <code>patt^0</code> is equivalent to <em>patt*</em>, +<code>patt^1</code> is equivalent to <em>patt+</em>, +and <code>patt^-1</code> is equivalent to <em>patt?</em> +in the original PEG notation. +</p> + +<p> +In all cases, +the resulting pattern is greedy with no backtracking +(also called a <em>possessive</em> repetition). +That is, it matches only the longest possible sequence +of matches for <code>patt</code>. +</p> + + + +<h2><a name="grammar">Grammars</a></h2> + +<p> +With the use of Lua variables, +it is possible to define patterns incrementally, +with each new pattern using previously defined ones. +However, this technique does not allow the definition of +recursive patterns. +For recursive patterns, +we need real grammars. +</p> + +<p> +LPeg represents grammars with tables, +where each entry is a rule. +</p> + +<p> +The call <code>lpeg.V(v)</code> +creates a pattern that represents the nonterminal +(or <em>variable</em>) with index <code>v</code> in a grammar. +Because the grammar still does not exist when +this function is evaluated, +the result is an <em>open reference</em> to the respective rule. +</p> + +<p> +A table is <em>fixed</em> when it is converted to a pattern +(either by calling <code>lpeg.P</code> or by using it wherein a +pattern is expected). +Then every open reference created by <code>lpeg.V(v)</code> +is corrected to refer to the rule indexed by <code>v</code> in the table. +</p> + +<p> +When a table is fixed, +the result is a pattern that matches its <em>initial rule</em>. +The entry with index 1 in the table defines its initial rule. +If that entry is a string, +it is assumed to be the name of the initial rule. +Otherwise, LPeg assumes that the entry 1 itself is the initial rule. +</p> + +<p> +As an example, +the following grammar matches strings of a's and b's that +have the same number of a's and b's: +</p> +<pre class="example"> +equalcount = lpeg.P{ + "S"; -- initial rule name + S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "", + A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A", + B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B", +} * -1 +</pre> +<p> +It is equivalent to the following grammar in standard PEG notation: +</p> +<pre class="example"> + S <- 'a' B / 'b' A / '' + A <- 'a' S / 'b' A A + B <- 'b' S / 'a' B B +</pre> + + +<h2><a name="captures">Captures</a></h2> + +<p> +A <em>capture</em> is a pattern that produces values +(the so called <em>semantic information</em>) +according to what it matches. +LPeg offers several kinds of captures, +which produces values based on matches and combine these values to +produce new values. +Each capture may produce zero or more values. +</p> + +<p> +The following table summarizes the basic captures: +</p> +<table border="1"> +<tbody><tr><td><b>Operation</b></td><td><b>What it Produces</b></td></tr> +<tr><td><a href="#cap-c"><code>lpeg.C(patt)</code></a></td> + <td>the match for <code>patt</code> plus all captures + made by <code>patt</code></td></tr> +<tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td> + <td>the value of the n<sup>th</sup> extra argument to + <code>lpeg.match</code> (matches the empty string)</td></tr> +<tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td> + <td>the values produced by the previous + group capture named <code>name</code> + (matches the empty string)</td></tr> +<tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td> + <td>the given values (matches the empty string)</td></tr> +<tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td> + <td>a <em>folding</em> of the captures from <code>patt</code></td></tr> +<tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td> + <td>the values produced by <code>patt</code>, + optionally tagged with <code>name</code></td></tr> +<tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td> + <td>the current position (matches the empty string)</td></tr> +<tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td> + <td>the match for <code>patt</code> + with the values from nested captures replacing their matches</td></tr> +<tr><td><a href="#cap-t"><code>lpeg.Ct(patt)</code></a></td> + <td>a table with all captures from <code>patt</code></td></tr> +<tr><td><a href="#cap-string"><code>patt / string</code></a></td> + <td><code>string</code>, with some marks replaced by captures + of <code>patt</code></td></tr> +<tr><td><a href="#cap-num"><code>patt / number</code></a></td> + <td>the n-th value captured by <code>patt</code>, +or no value when <code>number</code> is zero.</td></tr> +<tr><td><a href="#cap-query"><code>patt / table</code></a></td> + <td><code>table[c]</code>, where <code>c</code> is the (first) + capture of <code>patt</code></td></tr> +<tr><td><a href="#cap-func"><code>patt / function</code></a></td> + <td>the returns of <code>function</code> applied to the captures + of <code>patt</code></td></tr> +<tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td> + <td>the returns of <code>function</code> applied to the captures + of <code>patt</code>; the application is done at match time</td></tr> +</tbody></table> + +<p> +A capture pattern produces its values only when it succeeds. +For instance, +the pattern <code>lpeg.C(lpeg.P"a"^-1)</code> +produces the empty string when there is no <code>"a"</code> +(because the pattern <code>"a"?</code> succeeds), +while the pattern <code>lpeg.C("a")^-1</code> +does not produce any value when there is no <code>"a"</code> +(because the pattern <code>"a"</code> fails). +A pattern inside a loop or inside a recursive structure +produces values for each match. +</p> + +<p> +Usually, +LPeg does not specify when (and if) it evaluates its captures. +(As an example, +consider the pattern <code>lpeg.P"a" / func / 0</code>. +Because the "division" by 0 instructs LPeg to throw away the +results from the pattern, +LPeg may or may not call <code>func</code>.) +Therefore, captures should avoid side effects. +Moreover, +most captures cannot affect the way a pattern matches a subject. +The only exception to this rule is the +so-called <a href="#matchtime"><em>match-time capture</em></a>. +When a match-time capture matches, +it forces the immediate evaluation of all its nested captures +and then calls its corresponding function, +which defines whether the match succeeds and also +what values are produced. +</p> + +<h3><a name="cap-c"></a><code>lpeg.C (patt)</code></h3> +<p> +Creates a <em>simple capture</em>, +which captures the substring of the subject that matches <code>patt</code>. +The captured value is a string. +If <code>patt</code> has other captures, +their values are returned after this one. +</p> + + +<h3><a name="cap-arg"></a><code>lpeg.Carg (n)</code></h3> +<p> +Creates an <em>argument capture</em>. +This pattern matches the empty string and +produces the value given as the n<sup>th</sup> extra +argument given in the call to <code>lpeg.match</code>. +</p> + + +<h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3> +<p> +Creates a <em>back capture</em>. +This pattern matches the empty string and +produces the values produced by the <em>most recent</em> +<a href="#cap-g">group capture</a> named <code>name</code> +(where <code>name</code> can be any Lua value). +</p> + +<p> +<em>Most recent</em> means the last +<em>complete</em> +<em>outermost</em> +group capture with the given name. +A <em>Complete</em> capture means that the entire pattern +corresponding to the capture has matched. +An <em>Outermost</em> capture means that the capture is not inside +another complete capture. +</p> + +<p> +In the same way that LPeg does not specify when it evaluates captures, +it does not specify whether it reuses +values previously produced by the group +or re-evaluates them. +</p> + +<h3><a name="cap-cc"></a><code>lpeg.Cc ([value, ...])</code></h3> +<p> +Creates a <em>constant capture</em>. +This pattern matches the empty string and +produces all given values as its captured values. +</p> + + +<h3><a name="cap-f"></a><code>lpeg.Cf (patt, func)</code></h3> +<p> +Creates a <em>fold capture</em>. +If <code>patt</code> produces a list of captures +<em>C<sub>1</sub> C<sub>2</sub> ... C<sub>n</sub></em>, +this capture will produce the value +<em>func(...func(func(C<sub>1</sub>, C<sub>2</sub>), C<sub>3</sub>)..., + C<sub>n</sub>)</em>, +that is, it will <em>fold</em> +(or <em>accumulate</em>, or <em>reduce</em>) +the captures from <code>patt</code> using function <code>func</code>. +</p> + +<p> +This capture assumes that <code>patt</code> should produce +at least one capture with at least one value (of any type), +which becomes the initial value of an <em>accumulator</em>. +(If you need a specific initial value, +you may prefix a <a href="#cap-cc">constant capture</a> to <code>patt</code>.) +For each subsequent capture, +LPeg calls <code>func</code> +with this accumulator as the first argument and all values produced +by the capture as extra arguments; +the first result from this call +becomes the new value for the accumulator. +The final value of the accumulator becomes the captured value. +</p> + +<p> +As an example, +the following pattern matches a list of numbers separated +by commas and returns their addition: +</p> +<pre class="example"> +-- matches a numeral and captures its numerical value +number = lpeg.R"09"^1 / tonumber + +-- matches a list of numbers, capturing their values +list = number * ("," * number)^0 + +-- auxiliary function to add two numbers +function add (acc, newvalue) return acc + newvalue end + +-- folds the list of numbers adding them +sum = lpeg.Cf(list, add) + +-- example of use +print(sum:match("10,30,43")) --> 83 +</pre> + + +<h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3> +<p> +Creates a <em>group capture</em>. +It groups all values returned by <code>patt</code> +into a single capture. +The group may be anonymous (if no name is given) +or named with the given name +(which can be any non-nil Lua value). +</p> + +<p> +An anonymous group serves to join values from several captures into +a single capture. +A named group has a different behavior. +In most situations, a named group returns no values at all. +Its values are only relevant for a following +<a href="#cap-b">back capture</a> or when used +inside a <a href="#cap-t">table capture</a>. +</p> + + +<h3><a name="cap-p"></a><code>lpeg.Cp ()</code></h3> +<p> +Creates a <em>position capture</em>. +It matches the empty string and +captures the position in the subject where the match occurs. +The captured value is a number. +</p> + + +<h3><a name="cap-s"></a><code>lpeg.Cs (patt)</code></h3> +<p> +Creates a <em>substitution capture</em>, +which captures the substring of the subject that matches <code>patt</code>, +with <em>substitutions</em>. +For any capture inside <code>patt</code> with a value, +the substring that matched the capture is replaced by the capture value +(which should be a string). +The final captured value is the string resulting from +all replacements. +</p> + + +<h3><a name="cap-t"></a><code>lpeg.Ct (patt)</code></h3> +<p> +Creates a <em>table capture</em>. +This capture returns a table with all values from all anonymous captures +made by <code>patt</code> inside this table in successive integer keys, +starting at 1. +Moreover, +for each named capture group created by <code>patt</code>, +the first value of the group is put into the table +with the group name as its key. +The captured value is only the table. +</p> + + +<h3><a name="cap-string"></a><code>patt / string</code></h3> +<p> +Creates a <em>string capture</em>. +It creates a capture string based on <code>string</code>. +The captured value is a copy of <code>string</code>, +except that the character <code>%</code> works as an escape character: +any sequence in <code>string</code> of the form <code>%<em>n</em></code>, +with <em>n</em> between 1 and 9, +stands for the match of the <em>n</em>-th capture in <code>patt</code>. +The sequence <code>%0</code> stands for the whole match. +The sequence <code>%%</code> stands for a single <code>%</code>. +</p> + + +<h3><a name="cap-num"></a><code>patt / number</code></h3> +<p> +Creates a <em>numbered capture</em>. +For a non-zero number, +the captured value is the n-th value +captured by <code>patt</code>. +When <code>number</code> is zero, +there are no captured values. +</p> + + +<h3><a name="cap-query"></a><code>patt / table</code></h3> +<p> +Creates a <em>query capture</em>. +It indexes the given table using as key the first value captured by +<code>patt</code>, +or the whole match if <code>patt</code> produced no value. +The value at that index is the final value of the capture. +If the table does not have that key, +there is no captured value. +</p> + + +<h3><a name="cap-func"></a><code>patt / function</code></h3> +<p> +Creates a <em>function capture</em>. +It calls the given function passing all captures made by +<code>patt</code> as arguments, +or the whole match if <code>patt</code> made no capture. +The values returned by the function +are the final values of the capture. +In particular, +if <code>function</code> returns no value, +there is no captured value. +</p> + + +<h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3> +<p> +Creates a <em>match-time capture</em>. +Unlike all other captures, +this one is evaluated immediately when a match occurs +(even if it is part of a larger pattern that fails later). +It forces the immediate evaluation of all its nested captures +and then calls <code>function</code>. +</p> + +<p> +The given function gets as arguments the entire subject, +the current position (after the match of <code>patt</code>), +plus any capture values produced by <code>patt</code>. +</p> + +<p> +The first value returned by <code>function</code> +defines how the match happens. +If the call returns a number, +the match succeeds +and the returned number becomes the new current position. +(Assuming a subject <em>s</em> and current position <em>i</em>, +the returned number must be in the range <em>[i, len(s) + 1]</em>.) +If the call returns <b>true</b>, +the match succeeds without consuming any input. +(So, to return <b>true</b> is equivalent to return <em>i</em>.) +If the call returns <b>false</b>, <b>nil</b>, or no value, +the match fails. +</p> + +<p> +Any extra values returned by the function become the +values produced by the capture. +</p> + + + + +<h2><a name="ex">Some Examples</a></h2> + +<h3>Using a Pattern</h3> +<p> +This example shows a very simple but complete program +that builds and uses a pattern: +</p> +<pre class="example"> +local lpeg = require "lpeg" + +-- matches a word followed by end-of-string +p = lpeg.R"az"^1 * -1 + +print(p:match("hello")) --> 6 +print(lpeg.match(p, "hello")) --> 6 +print(p:match("1 hello")) --> nil +</pre> +<p> +The pattern is simply a sequence of one or more lower-case letters +followed by the end of string (-1). +The program calls <code>match</code> both as a method +and as a function. +In both sucessful cases, +the match returns +the index of the first character after the match, +which is the string length plus one. +</p> + + +<h3>Name-value lists</h3> +<p> +This example parses a list of name-value pairs and returns a table +with those pairs: +</p> +<pre class="example"> +lpeg.locale(lpeg) -- adds locale entries into 'lpeg' table + +local space = lpeg.space^0 +local name = lpeg.C(lpeg.alpha^1) * space +local sep = lpeg.S(",;") * space +local pair = lpeg.Cg(name * "=" * space * name) * sep^-1 +local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset) +t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" } +</pre> +<p> +Each pair has the format <code>name = name</code> followed by +an optional separator (a comma or a semicolon). +The <code>pair</code> pattern encloses the pair in a group pattern, +so that the names become the values of a single capture. +The <code>list</code> pattern then folds these captures. +It starts with an empty table, +created by a table capture matching an empty string; +then for each capture (a pair of names) it applies <code>rawset</code> +over the accumulator (the table) and the capture values (the pair of names). +<code>rawset</code> returns the table itself, +so the accumulator is always the table. +</p> + +<h3>Splitting a string</h3> +<p> +The following code builds a pattern that +splits a string using a given pattern +<code>sep</code> as a separator: +</p> +<pre class="example"> +function split (s, sep) + sep = lpeg.P(sep) + local elem = lpeg.C((1 - sep)^0) + local p = elem * (sep * elem)^0 + return lpeg.match(p, s) +end +</pre> +<p> +First the function ensures that <code>sep</code> is a proper pattern. +The pattern <code>elem</code> is a repetition of zero of more +arbitrary characters as long as there is not a match against +the separator. +It also captures its match. +The pattern <code>p</code> matches a list of elements separated +by <code>sep</code>. +</p> + +<p> +If the split results in too many values, +it may overflow the maximum number of values +that can be returned by a Lua function. +In this case, +we can collect these values in a table: +</p> +<pre class="example"> +function split (s, sep) + sep = lpeg.P(sep) + local elem = lpeg.C((1 - sep)^0) + local p = lpeg.Ct(elem * (sep * elem)^0) -- make a table capture + return lpeg.match(p, s) +end +</pre> + + +<h3>Searching for a pattern</h3> +<p> +The primitive <code>match</code> works only in anchored mode. +If we want to find a pattern anywhere in a string, +we must write a pattern that matches anywhere. +</p> + +<p> +Because patterns are composable, +we can write a function that, +given any arbitrary pattern <code>p</code>, +returns a new pattern that searches for <code>p</code> +anywhere in a string. +There are several ways to do the search. +One way is like this: +</p> +<pre class="example"> +function anywhere (p) + return lpeg.P{ p + 1 * lpeg.V(1) } +end +</pre> +<p> +This grammar has a straight reading: +it matches <code>p</code> or skips one character and tries again. +</p> + +<p> +If we want to know where the pattern is in the string +(instead of knowing only that it is there somewhere), +we can add position captures to the pattern: +</p> +<pre class="example"> +local I = lpeg.Cp() +function anywhere (p) + return lpeg.P{ I * p * I + 1 * lpeg.V(1) } +end + +print(anywhere("world"):match("hello world!")) -> 7 12 +</pre> + +<p> +Another option for the search is like this: +</p> +<pre class="example"> +local I = lpeg.Cp() +function anywhere (p) + return (1 - lpeg.P(p))^0 * I * p * I +end +</pre> +<p> +Again the pattern has a straight reading: +it skips as many characters as possible while not matching <code>p</code>, +and then matches <code>p</code> (plus appropriate captures). +</p> + +<p> +If we want to look for a pattern only at word boundaries, +we can use the following transformer: +</p> + +<pre class="example"> +local t = lpeg.locale() + +function atwordboundary (p) + return lpeg.P{ + [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1) + } +end +</pre> + + +<h3><a name="balanced"></a>Balanced parentheses</h3> +<p> +The following pattern matches only strings with balanced parentheses: +</p> +<pre class="example"> +b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" } +</pre> +<p> +Reading the first (and only) rule of the given grammar, +we have that a balanced string is +an open parenthesis, +followed by zero or more repetitions of either +a non-parenthesis character or +a balanced string (<code>lpeg.V(1)</code>), +followed by a closing parenthesis. +</p> + + +<h3>Global substitution</h3> +<p> +The next example does a job somewhat similar to <code>string.gsub</code>. +It receives a pattern and a replacement value, +and substitutes the replacement value for all occurrences of the pattern +in a given string: +</p> +<pre class="example"> +function gsub (s, patt, repl) + patt = lpeg.P(patt) + patt = lpeg.Cs((patt / repl + 1)^0) + return lpeg.match(patt, s) +end +</pre> +<p> +As in <code>string.gsub</code>, +the replacement value can be a string, +a function, or a table. +</p> + + +<h3><a name="CSV"></a>Comma-Separated Values (CSV)</h3> +<p> +This example breaks a string into comma-separated values, +returning all fields: +</p> +<pre class="example"> +local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' + + lpeg.C((1 - lpeg.S',\n"')^0) + +local record = field * (',' * field)^0 * (lpeg.P'\n' + -1) + +function csv (s) + return lpeg.match(record, s) +end +</pre> +<p> +A field is either a quoted field +(which may contain any character except an individual quote, +which may be written as two quotes that are replaced by one) +or an unquoted field +(which cannot contain commas, newlines, or quotes). +A record is a list of fields separated by commas, +ending with a newline or the string end (-1). +</p> + +<p> +As it is, +the previous pattern returns each field as a separated result. +If we add a table capture in the definition of <code>record</code>, +the pattern will return instead a single table +containing all fields: +</p> +<pre> +local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1) +</pre> + + +<h3>UTF-8 and Latin 1</h3> +<p> +It is not difficult to use LPeg to convert a string from +UTF-8 encoding to Latin 1 (ISO 8859-1): +</p> + +<pre class="example"> +-- convert a two-byte UTF-8 sequence to a Latin 1 character +local function f2 (s) + local c1, c2 = string.byte(s, 1, 2) + return string.char(c1 * 64 + c2 - 12416) +end + +local utf8 = lpeg.R("\0\127") + + lpeg.R("\194\195") * lpeg.R("\128\191") / f2 + +local decode_pattern = lpeg.Cs(utf8^0) * -1 +</pre> +<p> +In this code, +the definition of UTF-8 is already restricted to the +Latin 1 range (from 0 to 255). +Any encoding outside this range (as well as any invalid encoding) +will not match that pattern. +</p> + +<p> +As the definition of <code>decode_pattern</code> demands that +the pattern matches the whole input (because of the -1 at its end), +any invalid string will simply fail to match, +without any useful information about the problem. +We can improve this situation redefining <code>decode_pattern</code> +as follows: +</p> +<pre class="example"> +local function er (_, i) error("invalid encoding at position " .. i) end + +local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er)) +</pre> +<p> +Now, if the pattern <code>utf8^0</code> stops +before the end of the string, +an appropriate error function is called. +</p> + + +<h3>UTF-8 and Unicode</h3> +<p> +We can extend the previous patterns to handle all Unicode code points. +Of course, +we cannot translate them to Latin 1 or any other one-byte encoding. +Instead, our translation results in a array with the code points +represented as numbers. +The full code is here: +</p> +<pre class="example"> +-- decode a two-byte UTF-8 sequence +local function f2 (s) + local c1, c2 = string.byte(s, 1, 2) + return c1 * 64 + c2 - 12416 +end + +-- decode a three-byte UTF-8 sequence +local function f3 (s) + local c1, c2, c3 = string.byte(s, 1, 3) + return (c1 * 64 + c2) * 64 + c3 - 925824 +end + +-- decode a four-byte UTF-8 sequence +local function f4 (s) + local c1, c2, c3, c4 = string.byte(s, 1, 4) + return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168 +end + +local cont = lpeg.R("\128\191") -- continuation byte + +local utf8 = lpeg.R("\0\127") / string.byte + + lpeg.R("\194\223") * cont / f2 + + lpeg.R("\224\239") * cont * cont / f3 + + lpeg.R("\240\244") * cont * cont * cont / f4 + +local decode_pattern = lpeg.Ct(utf8^0) * -1 +</pre> + + +<h3>Lua's long strings</h3> +<p> +A long string in Lua starts with the pattern <code>[=*[</code> +and ends at the first occurrence of <code>]=*]</code> with +exactly the same number of equal signs. +If the opening brackets are followed by a newline, +this newline is discarded +(that is, it is not part of the string). +</p> + +<p> +To match a long string in Lua, +the pattern must capture the first repetition of equal signs and then, +whenever it finds a candidate for closing the string, +check whether it has the same number of equal signs. +</p> + +<pre class="example"> +equals = lpeg.P"="^0 +open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1 +close = "]" * lpeg.C(equals) * "]" +closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end) +string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1 +</pre> + +<p> +The <code>open</code> pattern matches <code>[=*[</code>, +capturing the repetitions of equal signs in a group named <code>init</code>; +it also discharges an optional newline, if present. +The <code>close</code> pattern matches <code>]=*]</code>, +also capturing the repetitions of equal signs. +The <code>closeeq</code> pattern first matches <code>close</code>; +then it uses a back capture to recover the capture made +by the previous <code>open</code>, +which is named <code>init</code>; +finally it uses a match-time capture to check +whether both captures are equal. +The <code>string</code> pattern starts with an <code>open</code>, +then it goes as far as possible until matching <code>closeeq</code>, +and then matches the final <code>close</code>. +The final numbered capture simply discards +the capture made by <code>close</code>. +</p> + + +<h3>Arithmetic expressions</h3> +<p> +This example is a complete parser and evaluator for simple +arithmetic expressions. +We write it in two styles. +The first approach first builds a syntax tree and then +traverses this tree to compute the expression value: +</p> +<pre class="example"> +-- Lexical Elements +local Space = lpeg.S(" \n\t")^0 +local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space +local TermOp = lpeg.C(lpeg.S("+-")) * Space +local FactorOp = lpeg.C(lpeg.S("*/")) * Space +local Open = "(" * Space +local Close = ")" * Space + +-- Grammar +local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor" +G = lpeg.P{ Exp, + Exp = lpeg.Ct(Term * (TermOp * Term)^0); + Term = lpeg.Ct(Factor * (FactorOp * Factor)^0); + Factor = Number + Open * Exp * Close; +} + +G = Space * G * -1 + +-- Evaluator +function eval (x) + if type(x) == "string" then + return tonumber(x) + else + local op1 = eval(x[1]) + for i = 2, #x, 2 do + local op = x[i] + local op2 = eval(x[i + 1]) + if (op == "+") then op1 = op1 + op2 + elseif (op == "-") then op1 = op1 - op2 + elseif (op == "*") then op1 = op1 * op2 + elseif (op == "/") then op1 = op1 / op2 + end + end + return op1 + end +end + +-- Parser/Evaluator +function evalExp (s) + local t = lpeg.match(G, s) + if not t then error("syntax error", 2) end + return eval(t) +end + +-- small example +print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5 +</pre> + +<p> +The second style computes the expression value on the fly, +without building the syntax tree. +The following grammar takes this approach. +(It assumes the same lexical elements as before.) +</p> +<pre class="example"> +-- Auxiliary function +function eval (v1, op, v2) + if (op == "+") then return v1 + v2 + elseif (op == "-") then return v1 - v2 + elseif (op == "*") then return v1 * v2 + elseif (op == "/") then return v1 / v2 + end +end + +-- Grammar +local V = lpeg.V +G = lpeg.P{ "Exp", + Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval); + Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval); + Factor = Number / tonumber + Open * V"Exp" * Close; +} + +-- small example +print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5 +</pre> +<p> +Note the use of the fold (accumulator) capture. +To compute the value of an expression, +the accumulator starts with the value of the first term, +and then applies <code>eval</code> over +the accumulator, the operator, +and the new term for each repetition. +</p> + + + +<h2><a name="download"></a>Download</h2> + +<p>LPeg +<a href="http://www.inf.puc-rio.br/~roberto/lpeg/lpeg-1.0.2.tar.gz">source code</a>.</p> + + +<h2><a name="license">License</a></h2> + +<p> +Copyright © 2007-2019 Lua.org, PUC-Rio. +</p> +<p> +Permission is hereby granted, free of charge, +to any person obtaining a copy of this software and +associated documentation files (the "Software"), +to deal in the Software without restriction, +including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, +and to permit persons to whom the Software is +furnished to do so, +subject to the following conditions: +</p> + +<p> +The above copyright notice and this permission notice +shall be included in all copies or substantial portions of the Software. +</p> + +<p> +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +</p> + +</div> <!-- id="content" --> + +</div> <!-- id="main" --> + +</div> <!-- id="container" --> + +</body> +</html> diff --git a/ThirdParty/lpeg-1.0.2/lpprint.c b/ThirdParty/lpeg-1.0.2/lpprint.c new file mode 100644 index 0000000..df62cbe --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpprint.c @@ -0,0 +1,244 @@ +/* +** $Id: lpprint.c $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include <ctype.h> +#include <limits.h> +#include <stdio.h> + + +#include "lptypes.h" +#include "lpprint.h" +#include "lpcode.h" + + +#if defined(LPEG_DEBUG) + +/* +** {====================================================== +** Printing patterns (for debugging) +** ======================================================= +*/ + + +void printcharset (const byte *st) { + int i; + printf("["); + for (i = 0; i <= UCHAR_MAX; i++) { + int first = i; + while (testchar(st, i) && i <= UCHAR_MAX) i++; + if (i - 1 == first) /* unary range? */ + printf("(%02x)", first); + else if (i - 1 > first) /* non-empty range? */ + printf("(%02x-%02x)", first, i - 1); + } + printf("]"); +} + + +static const char *capkind (int kind) { + const char *const modes[] = { + "close", "position", "constant", "backref", + "argument", "simple", "table", "function", + "query", "string", "num", "substitution", "fold", + "runtime", "group"}; + return modes[kind]; +} + + +static void printjmp (const Instruction *op, const Instruction *p) { + printf("-> %d", (int)(p + (p + 1)->offset - op)); +} + + +void printinst (const Instruction *op, const Instruction *p) { + const char *const names[] = { + "any", "char", "set", + "testany", "testchar", "testset", + "span", "behind", + "ret", "end", + "choice", "jmp", "call", "open_call", + "commit", "partial_commit", "back_commit", "failtwice", "fail", "giveup", + "fullcapture", "opencapture", "closecapture", "closeruntime" + }; + printf("%02ld: %s ", (long)(p - op), names[p->i.code]); + switch ((Opcode)p->i.code) { + case IChar: { + printf("'%c'", p->i.aux); + break; + } + case ITestChar: { + printf("'%c'", p->i.aux); printjmp(op, p); + break; + } + case IFullCapture: { + printf("%s (size = %d) (idx = %d)", + capkind(getkind(p)), getoff(p), p->i.key); + break; + } + case IOpenCapture: { + printf("%s (idx = %d)", capkind(getkind(p)), p->i.key); + break; + } + case ISet: { + printcharset((p+1)->buff); + break; + } + case ITestSet: { + printcharset((p+2)->buff); printjmp(op, p); + break; + } + case ISpan: { + printcharset((p+1)->buff); + break; + } + case IOpenCall: { + printf("-> %d", (p + 1)->offset); + break; + } + case IBehind: { + printf("%d", p->i.aux); + break; + } + case IJmp: case ICall: case ICommit: case IChoice: + case IPartialCommit: case IBackCommit: case ITestAny: { + printjmp(op, p); + break; + } + default: break; + } + printf("\n"); +} + + +void printpatt (Instruction *p, int n) { + Instruction *op = p; + while (p < op + n) { + printinst(op, p); + p += sizei(p); + } +} + + +#if defined(LPEG_DEBUG) +static void printcap (Capture *cap) { + printf("%s (idx: %d - size: %d) -> %p\n", + capkind(cap->kind), cap->idx, cap->siz, cap->s); +} + + +void printcaplist (Capture *cap, Capture *limit) { + printf(">======\n"); + for (; cap->s && (limit == NULL || cap < limit); cap++) + printcap(cap); + printf("=======\n"); +} +#endif + +/* }====================================================== */ + + +/* +** {====================================================== +** Printing trees (for debugging) +** ======================================================= +*/ + +static const char *tagnames[] = { + "char", "set", "any", + "true", "false", + "rep", + "seq", "choice", + "not", "and", + "call", "opencall", "rule", "grammar", + "behind", + "capture", "run-time" +}; + + +void printtree (TTree *tree, int ident) { + int i; + for (i = 0; i < ident; i++) printf(" "); + printf("%s", tagnames[tree->tag]); + switch (tree->tag) { + case TChar: { + int c = tree->u.n; + if (isprint(c)) + printf(" '%c'\n", c); + else + printf(" (%02X)\n", c); + break; + } + case TSet: { + printcharset(treebuffer(tree)); + printf("\n"); + break; + } + case TOpenCall: case TCall: { + assert(sib2(tree)->tag == TRule); + printf(" key: %d (rule: %d)\n", tree->key, sib2(tree)->cap); + break; + } + case TBehind: { + printf(" %d\n", tree->u.n); + printtree(sib1(tree), ident + 2); + break; + } + case TCapture: { + printf(" kind: '%s' key: %d\n", capkind(tree->cap), tree->key); + printtree(sib1(tree), ident + 2); + break; + } + case TRule: { + printf(" n: %d key: %d\n", tree->cap, tree->key); + printtree(sib1(tree), ident + 2); + break; /* do not print next rule as a sibling */ + } + case TGrammar: { + TTree *rule = sib1(tree); + printf(" %d\n", tree->u.n); /* number of rules */ + for (i = 0; i < tree->u.n; i++) { + printtree(rule, ident + 2); + rule = sib2(rule); + } + assert(rule->tag == TTrue); /* sentinel */ + break; + } + default: { + int sibs = numsiblings[tree->tag]; + printf("\n"); + if (sibs >= 1) { + printtree(sib1(tree), ident + 2); + if (sibs >= 2) + printtree(sib2(tree), ident + 2); + } + break; + } + } +} + + +void printktable (lua_State *L, int idx) { + int n, i; + lua_getuservalue(L, idx); + if (lua_isnil(L, -1)) /* no ktable? */ + return; + n = lua_rawlen(L, -1); + printf("["); + for (i = 1; i <= n; i++) { + printf("%d = ", i); + lua_rawgeti(L, -1, i); + if (lua_isstring(L, -1)) + printf("%s ", lua_tostring(L, -1)); + else + printf("%s ", lua_typename(L, lua_type(L, -1))); + lua_pop(L, 1); + } + printf("]\n"); + /* leave ktable at the stack */ +} + +/* }====================================================== */ + +#endif diff --git a/ThirdParty/lpeg-1.0.2/lpprint.h b/ThirdParty/lpeg-1.0.2/lpprint.h new file mode 100644 index 0000000..15ef121 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpprint.h @@ -0,0 +1,36 @@ +/* +** $Id: lpprint.h $ +*/ + + +#if !defined(lpprint_h) +#define lpprint_h + + +#include "lptree.h" +#include "lpvm.h" + + +#if defined(LPEG_DEBUG) + +void printpatt (Instruction *p, int n); +void printtree (TTree *tree, int ident); +void printktable (lua_State *L, int idx); +void printcharset (const byte *st); +void printcaplist (Capture *cap, Capture *limit); +void printinst (const Instruction *op, const Instruction *p); + +#else + +#define printktable(L,idx) \ + luaL_error(L, "function only implemented in debug mode") +#define printtree(tree,i) \ + luaL_error(L, "function only implemented in debug mode") +#define printpatt(p,n) \ + luaL_error(L, "function only implemented in debug mode") + +#endif + + +#endif + diff --git a/ThirdParty/lpeg-1.0.2/lptree.c b/ThirdParty/lpeg-1.0.2/lptree.c new file mode 100644 index 0000000..5c8de94 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lptree.c @@ -0,0 +1,1305 @@ +/* +** $Id: lptree.c $ +** Copyright 2013, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include <ctype.h> +#include <limits.h> +#include <string.h> + + +#include "lua.h" +#include "lauxlib.h" + +#include "lptypes.h" +#include "lpcap.h" +#include "lpcode.h" +#include "lpprint.h" +#include "lptree.h" + + +/* number of siblings for each tree */ +const byte numsiblings[] = { + 0, 0, 0, /* char, set, any */ + 0, 0, /* true, false */ + 1, /* rep */ + 2, 2, /* seq, choice */ + 1, 1, /* not, and */ + 0, 0, 2, 1, /* call, opencall, rule, grammar */ + 1, /* behind */ + 1, 1 /* capture, runtime capture */ +}; + + +static TTree *newgrammar (lua_State *L, int arg); + + +/* +** returns a reasonable name for value at index 'idx' on the stack +*/ +static const char *val2str (lua_State *L, int idx) { + const char *k = lua_tostring(L, idx); + if (k != NULL) + return lua_pushfstring(L, "%s", k); + else + return lua_pushfstring(L, "(a %s)", luaL_typename(L, idx)); +} + + +/* +** Fix a TOpenCall into a TCall node, using table 'postable' to +** translate a key to its rule address in the tree. Raises an +** error if key does not exist. +*/ +static void fixonecall (lua_State *L, int postable, TTree *g, TTree *t) { + int n; + lua_rawgeti(L, -1, t->key); /* get rule's name */ + lua_gettable(L, postable); /* query name in position table */ + n = lua_tonumber(L, -1); /* get (absolute) position */ + lua_pop(L, 1); /* remove position */ + if (n == 0) { /* no position? */ + lua_rawgeti(L, -1, t->key); /* get rule's name again */ + luaL_error(L, "rule '%s' undefined in given grammar", val2str(L, -1)); + } + t->tag = TCall; + t->u.ps = n - (t - g); /* position relative to node */ + assert(sib2(t)->tag == TRule); + sib2(t)->key = t->key; /* fix rule's key */ +} + + +/* +** Transform left associative constructions into right +** associative ones, for sequence and choice; that is: +** (t11 + t12) + t2 => t11 + (t12 + t2) +** (t11 * t12) * t2 => t11 * (t12 * t2) +** (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) +*/ +static void correctassociativity (TTree *tree) { + TTree *t1 = sib1(tree); + assert(tree->tag == TChoice || tree->tag == TSeq); + while (t1->tag == tree->tag) { + int n1size = tree->u.ps - 1; /* t1 == Op t11 t12 */ + int n11size = t1->u.ps - 1; + int n12size = n1size - n11size - 1; + memmove(sib1(tree), sib1(t1), n11size * sizeof(TTree)); /* move t11 */ + tree->u.ps = n11size + 1; + sib2(tree)->tag = tree->tag; + sib2(tree)->u.ps = n12size + 1; + } +} + + +/* +** Make final adjustments in a tree. Fix open calls in tree 't', +** making them refer to their respective rules or raising appropriate +** errors (if not inside a grammar). Correct associativity of associative +** constructions (making them right associative). Assume that tree's +** ktable is at the top of the stack (for error messages). +*/ +static void finalfix (lua_State *L, int postable, TTree *g, TTree *t) { + tailcall: + switch (t->tag) { + case TGrammar: /* subgrammars were already fixed */ + return; + case TOpenCall: { + if (g != NULL) /* inside a grammar? */ + fixonecall(L, postable, g, t); + else { /* open call outside grammar */ + lua_rawgeti(L, -1, t->key); + luaL_error(L, "rule '%s' used outside a grammar", val2str(L, -1)); + } + break; + } + case TSeq: case TChoice: + correctassociativity(t); + break; + } + switch (numsiblings[t->tag]) { + case 1: /* finalfix(L, postable, g, sib1(t)); */ + t = sib1(t); goto tailcall; + case 2: + finalfix(L, postable, g, sib1(t)); + t = sib2(t); goto tailcall; /* finalfix(L, postable, g, sib2(t)); */ + default: assert(numsiblings[t->tag] == 0); break; + } +} + + + +/* +** {=================================================================== +** KTable manipulation +** +** - The ktable of a pattern 'p' can be shared by other patterns that +** contain 'p' and no other constants. Because of this sharing, we +** should not add elements to a 'ktable' unless it was freshly created +** for the new pattern. +** +** - The maximum index in a ktable is USHRT_MAX, because trees and +** patterns use unsigned shorts to store those indices. +** ==================================================================== +*/ + +/* +** Create a new 'ktable' to the pattern at the top of the stack. +*/ +static void newktable (lua_State *L, int n) { + lua_createtable(L, n, 0); /* create a fresh table */ + lua_setuservalue(L, -2); /* set it as 'ktable' for pattern */ +} + + +/* +** Add element 'idx' to 'ktable' of pattern at the top of the stack; +** Return index of new element. +** If new element is nil, does not add it to table (as it would be +** useless) and returns 0, as ktable[0] is always nil. +*/ +static int addtoktable (lua_State *L, int idx) { + if (lua_isnil(L, idx)) /* nil value? */ + return 0; + else { + int n; + lua_getuservalue(L, -1); /* get ktable from pattern */ + n = lua_rawlen(L, -1); + if (n >= USHRT_MAX) + luaL_error(L, "too many Lua values in pattern"); + lua_pushvalue(L, idx); /* element to be added */ + lua_rawseti(L, -2, ++n); + lua_pop(L, 1); /* remove 'ktable' */ + return n; + } +} + + +/* +** Return the number of elements in the ktable at 'idx'. +** In Lua 5.2/5.3, default "environment" for patterns is nil, not +** a table. Treat it as an empty table. In Lua 5.1, assumes that +** the environment has no numeric indices (len == 0) +*/ +static int ktablelen (lua_State *L, int idx) { + if (!lua_istable(L, idx)) return 0; + else return lua_rawlen(L, idx); +} + + +/* +** Concatentate the contents of table 'idx1' into table 'idx2'. +** (Assume that both indices are negative.) +** Return the original length of table 'idx2' (or 0, if no +** element was added, as there is no need to correct any index). +*/ +static int concattable (lua_State *L, int idx1, int idx2) { + int i; + int n1 = ktablelen(L, idx1); + int n2 = ktablelen(L, idx2); + if (n1 + n2 > USHRT_MAX) + luaL_error(L, "too many Lua values in pattern"); + if (n1 == 0) return 0; /* nothing to correct */ + for (i = 1; i <= n1; i++) { + lua_rawgeti(L, idx1, i); + lua_rawseti(L, idx2 - 1, n2 + i); /* correct 'idx2' */ + } + return n2; +} + + +/* +** When joining 'ktables', constants from one of the subpatterns must +** be renumbered; 'correctkeys' corrects their indices (adding 'n' +** to each of them) +*/ +static void correctkeys (TTree *tree, int n) { + if (n == 0) return; /* no correction? */ + tailcall: + switch (tree->tag) { + case TOpenCall: case TCall: case TRunTime: case TRule: { + if (tree->key > 0) + tree->key += n; + break; + } + case TCapture: { + if (tree->key > 0 && tree->cap != Carg && tree->cap != Cnum) + tree->key += n; + break; + } + default: break; + } + switch (numsiblings[tree->tag]) { + case 1: /* correctkeys(sib1(tree), n); */ + tree = sib1(tree); goto tailcall; + case 2: + correctkeys(sib1(tree), n); + tree = sib2(tree); goto tailcall; /* correctkeys(sib2(tree), n); */ + default: assert(numsiblings[tree->tag] == 0); break; + } +} + + +/* +** Join the ktables from p1 and p2 the ktable for the new pattern at the +** top of the stack, reusing them when possible. +*/ +static void joinktables (lua_State *L, int p1, TTree *t2, int p2) { + int n1, n2; + lua_getuservalue(L, p1); /* get ktables */ + lua_getuservalue(L, p2); + n1 = ktablelen(L, -2); + n2 = ktablelen(L, -1); + if (n1 == 0 && n2 == 0) /* are both tables empty? */ + lua_pop(L, 2); /* nothing to be done; pop tables */ + else if (n2 == 0 || lp_equal(L, -2, -1)) { /* 2nd table empty or equal? */ + lua_pop(L, 1); /* pop 2nd table */ + lua_setuservalue(L, -2); /* set 1st ktable into new pattern */ + } + else if (n1 == 0) { /* first table is empty? */ + lua_setuservalue(L, -3); /* set 2nd table into new pattern */ + lua_pop(L, 1); /* pop 1st table */ + } + else { + lua_createtable(L, n1 + n2, 0); /* create ktable for new pattern */ + /* stack: new p; ktable p1; ktable p2; new ktable */ + concattable(L, -3, -1); /* from p1 into new ktable */ + concattable(L, -2, -1); /* from p2 into new ktable */ + lua_setuservalue(L, -4); /* new ktable becomes 'p' environment */ + lua_pop(L, 2); /* pop other ktables */ + correctkeys(t2, n1); /* correction for indices from p2 */ + } +} + + +/* +** copy 'ktable' of element 'idx' to new tree (on top of stack) +*/ +static void copyktable (lua_State *L, int idx) { + lua_getuservalue(L, idx); + lua_setuservalue(L, -2); +} + + +/* +** merge 'ktable' from 'stree' at stack index 'idx' into 'ktable' +** from tree at the top of the stack, and correct corresponding +** tree. +*/ +static void mergektable (lua_State *L, int idx, TTree *stree) { + int n; + lua_getuservalue(L, -1); /* get ktables */ + lua_getuservalue(L, idx); + n = concattable(L, -1, -2); + lua_pop(L, 2); /* remove both ktables */ + correctkeys(stree, n); +} + + +/* +** Create a new 'ktable' to the pattern at the top of the stack, adding +** all elements from pattern 'p' (if not 0) plus element 'idx' to it. +** Return index of new element. +*/ +static int addtonewktable (lua_State *L, int p, int idx) { + newktable(L, 1); + if (p) + mergektable(L, p, NULL); + return addtoktable(L, idx); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Tree generation +** ======================================================= +*/ + +/* +** In 5.2, could use 'luaL_testudata'... +*/ +static int testpattern (lua_State *L, int idx) { + if (lua_touserdata(L, idx)) { /* value is a userdata? */ + if (lua_getmetatable(L, idx)) { /* does it have a metatable? */ + luaL_getmetatable(L, PATTERN_T); + if (lua_rawequal(L, -1, -2)) { /* does it have the correct mt? */ + lua_pop(L, 2); /* remove both metatables */ + return 1; + } + } + } + return 0; +} + + +static Pattern *getpattern (lua_State *L, int idx) { + return (Pattern *)luaL_checkudata(L, idx, PATTERN_T); +} + + +static int getsize (lua_State *L, int idx) { + return (lua_rawlen(L, idx) - sizeof(Pattern)) / sizeof(TTree) + 1; +} + + +static TTree *gettree (lua_State *L, int idx, int *len) { + Pattern *p = getpattern(L, idx); + if (len) + *len = getsize(L, idx); + return p->tree; +} + + +/* +** create a pattern. Set its uservalue (the 'ktable') equal to its +** metatable. (It could be any empty sequence; the metatable is at +** hand here, so we use it.) +*/ +static TTree *newtree (lua_State *L, int len) { + size_t size = (len - 1) * sizeof(TTree) + sizeof(Pattern); + Pattern *p = (Pattern *)lua_newuserdata(L, size); + luaL_getmetatable(L, PATTERN_T); + lua_pushvalue(L, -1); + lua_setuservalue(L, -3); + lua_setmetatable(L, -2); + p->code = NULL; p->codesize = 0; + return p->tree; +} + + +static TTree *newleaf (lua_State *L, int tag) { + TTree *tree = newtree(L, 1); + tree->tag = tag; + return tree; +} + + +static TTree *newcharset (lua_State *L) { + TTree *tree = newtree(L, bytes2slots(CHARSETSIZE) + 1); + tree->tag = TSet; + loopset(i, treebuffer(tree)[i] = 0); + return tree; +} + + +/* +** add to tree a sequence where first sibling is 'sib' (with size +** 'sibsize'); returns position for second sibling +*/ +static TTree *seqaux (TTree *tree, TTree *sib, int sibsize) { + tree->tag = TSeq; tree->u.ps = sibsize + 1; + memcpy(sib1(tree), sib, sibsize * sizeof(TTree)); + return sib2(tree); +} + + +/* +** Build a sequence of 'n' nodes, each with tag 'tag' and 'u.n' got +** from the array 's' (or 0 if array is NULL). (TSeq is binary, so it +** must build a sequence of sequence of sequence...) +*/ +static void fillseq (TTree *tree, int tag, int n, const char *s) { + int i; + for (i = 0; i < n - 1; i++) { /* initial n-1 copies of Seq tag; Seq ... */ + tree->tag = TSeq; tree->u.ps = 2; + sib1(tree)->tag = tag; + sib1(tree)->u.n = s ? (byte)s[i] : 0; + tree = sib2(tree); + } + tree->tag = tag; /* last one does not need TSeq */ + tree->u.n = s ? (byte)s[i] : 0; +} + + +/* +** Numbers as patterns: +** 0 == true (always match); n == TAny repeated 'n' times; +** -n == not (TAny repeated 'n' times) +*/ +static TTree *numtree (lua_State *L, int n) { + if (n == 0) + return newleaf(L, TTrue); + else { + TTree *tree, *nd; + if (n > 0) + tree = nd = newtree(L, 2 * n - 1); + else { /* negative: code it as !(-n) */ + n = -n; + tree = newtree(L, 2 * n); + tree->tag = TNot; + nd = sib1(tree); + } + fillseq(nd, TAny, n, NULL); /* sequence of 'n' any's */ + return tree; + } +} + + +/* +** Convert value at index 'idx' to a pattern +*/ +static TTree *getpatt (lua_State *L, int idx, int *len) { + TTree *tree; + switch (lua_type(L, idx)) { + case LUA_TSTRING: { + size_t slen; + const char *s = lua_tolstring(L, idx, &slen); /* get string */ + if (slen == 0) /* empty? */ + tree = newleaf(L, TTrue); /* always match */ + else { + tree = newtree(L, 2 * (slen - 1) + 1); + fillseq(tree, TChar, slen, s); /* sequence of 'slen' chars */ + } + break; + } + case LUA_TNUMBER: { + int n = lua_tointeger(L, idx); + tree = numtree(L, n); + break; + } + case LUA_TBOOLEAN: { + tree = (lua_toboolean(L, idx) ? newleaf(L, TTrue) : newleaf(L, TFalse)); + break; + } + case LUA_TTABLE: { + tree = newgrammar(L, idx); + break; + } + case LUA_TFUNCTION: { + tree = newtree(L, 2); + tree->tag = TRunTime; + tree->key = addtonewktable(L, 0, idx); + sib1(tree)->tag = TTrue; + break; + } + default: { + return gettree(L, idx, len); + } + } + lua_replace(L, idx); /* put new tree into 'idx' slot */ + if (len) + *len = getsize(L, idx); + return tree; +} + + +/* +** create a new tree, whith a new root and one sibling. +** Sibling must be on the Lua stack, at index 1. +*/ +static TTree *newroot1sib (lua_State *L, int tag) { + int s1; + TTree *tree1 = getpatt(L, 1, &s1); + TTree *tree = newtree(L, 1 + s1); /* create new tree */ + tree->tag = tag; + memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); + copyktable(L, 1); + return tree; +} + + +/* +** create a new tree, whith a new root and 2 siblings. +** Siblings must be on the Lua stack, first one at index 1. +*/ +static TTree *newroot2sib (lua_State *L, int tag) { + int s1, s2; + TTree *tree1 = getpatt(L, 1, &s1); + TTree *tree2 = getpatt(L, 2, &s2); + TTree *tree = newtree(L, 1 + s1 + s2); /* create new tree */ + tree->tag = tag; + tree->u.ps = 1 + s1; + memcpy(sib1(tree), tree1, s1 * sizeof(TTree)); + memcpy(sib2(tree), tree2, s2 * sizeof(TTree)); + joinktables(L, 1, sib2(tree), 2); + return tree; +} + + +static int lp_P (lua_State *L) { + luaL_checkany(L, 1); + getpatt(L, 1, NULL); + lua_settop(L, 1); + return 1; +} + + +/* +** sequence operator; optimizations: +** false x => false, x true => x, true x => x +** (cannot do x . false => false because x may have runtime captures) +*/ +static int lp_seq (lua_State *L) { + TTree *tree1 = getpatt(L, 1, NULL); + TTree *tree2 = getpatt(L, 2, NULL); + if (tree1->tag == TFalse || tree2->tag == TTrue) + lua_pushvalue(L, 1); /* false . x == false, x . true = x */ + else if (tree1->tag == TTrue) + lua_pushvalue(L, 2); /* true . x = x */ + else + newroot2sib(L, TSeq); + return 1; +} + + +/* +** choice operator; optimizations: +** charset / charset => charset +** true / x => true, x / false => x, false / x => x +** (x / true is not equivalent to true) +*/ +static int lp_choice (lua_State *L) { + Charset st1, st2; + TTree *t1 = getpatt(L, 1, NULL); + TTree *t2 = getpatt(L, 2, NULL); + if (tocharset(t1, &st1) && tocharset(t2, &st2)) { + TTree *t = newcharset(L); + loopset(i, treebuffer(t)[i] = st1.cs[i] | st2.cs[i]); + } + else if (nofail(t1) || t2->tag == TFalse) + lua_pushvalue(L, 1); /* true / x => true, x / false => x */ + else if (t1->tag == TFalse) + lua_pushvalue(L, 2); /* false / x => x */ + else + newroot2sib(L, TChoice); + return 1; +} + + +/* +** p^n +*/ +static int lp_star (lua_State *L) { + int size1; + int n = (int)luaL_checkinteger(L, 2); + TTree *tree1 = getpatt(L, 1, &size1); + if (n >= 0) { /* seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) */ + TTree *tree = newtree(L, (n + 1) * (size1 + 1)); + if (nullable(tree1)) + luaL_error(L, "loop body may accept empty string"); + while (n--) /* repeat 'n' times */ + tree = seqaux(tree, tree1, size1); + tree->tag = TRep; + memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); + } + else { /* choice (seq tree1 ... choice tree1 true ...) true */ + TTree *tree; + n = -n; + /* size = (choice + seq + tree1 + true) * n, but the last has no seq */ + tree = newtree(L, n * (size1 + 3) - 1); + for (; n > 1; n--) { /* repeat (n - 1) times */ + tree->tag = TChoice; tree->u.ps = n * (size1 + 3) - 2; + sib2(tree)->tag = TTrue; + tree = sib1(tree); + tree = seqaux(tree, tree1, size1); + } + tree->tag = TChoice; tree->u.ps = size1 + 1; + sib2(tree)->tag = TTrue; + memcpy(sib1(tree), tree1, size1 * sizeof(TTree)); + } + copyktable(L, 1); + return 1; +} + + +/* +** #p == &p +*/ +static int lp_and (lua_State *L) { + newroot1sib(L, TAnd); + return 1; +} + + +/* +** -p == !p +*/ +static int lp_not (lua_State *L) { + newroot1sib(L, TNot); + return 1; +} + + +/* +** [t1 - t2] == Seq (Not t2) t1 +** If t1 and t2 are charsets, make their difference. +*/ +static int lp_sub (lua_State *L) { + Charset st1, st2; + int s1, s2; + TTree *t1 = getpatt(L, 1, &s1); + TTree *t2 = getpatt(L, 2, &s2); + if (tocharset(t1, &st1) && tocharset(t2, &st2)) { + TTree *t = newcharset(L); + loopset(i, treebuffer(t)[i] = st1.cs[i] & ~st2.cs[i]); + } + else { + TTree *tree = newtree(L, 2 + s1 + s2); + tree->tag = TSeq; /* sequence of... */ + tree->u.ps = 2 + s2; + sib1(tree)->tag = TNot; /* ...not... */ + memcpy(sib1(sib1(tree)), t2, s2 * sizeof(TTree)); /* ...t2 */ + memcpy(sib2(tree), t1, s1 * sizeof(TTree)); /* ... and t1 */ + joinktables(L, 1, sib1(tree), 2); + } + return 1; +} + + +static int lp_set (lua_State *L) { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + TTree *tree = newcharset(L); + while (l--) { + setchar(treebuffer(tree), (byte)(*s)); + s++; + } + return 1; +} + + +static int lp_range (lua_State *L) { + int arg; + int top = lua_gettop(L); + TTree *tree = newcharset(L); + for (arg = 1; arg <= top; arg++) { + int c; + size_t l; + const char *r = luaL_checklstring(L, arg, &l); + luaL_argcheck(L, l == 2, arg, "range must have two characters"); + for (c = (byte)r[0]; c <= (byte)r[1]; c++) + setchar(treebuffer(tree), c); + } + return 1; +} + + +/* +** Look-behind predicate +*/ +static int lp_behind (lua_State *L) { + TTree *tree; + TTree *tree1 = getpatt(L, 1, NULL); + int n = fixedlen(tree1); + luaL_argcheck(L, n >= 0, 1, "pattern may not have fixed length"); + luaL_argcheck(L, !hascaptures(tree1), 1, "pattern have captures"); + luaL_argcheck(L, n <= MAXBEHIND, 1, "pattern too long to look behind"); + tree = newroot1sib(L, TBehind); + tree->u.n = n; + return 1; +} + + +/* +** Create a non-terminal +*/ +static int lp_V (lua_State *L) { + TTree *tree = newleaf(L, TOpenCall); + luaL_argcheck(L, !lua_isnoneornil(L, 1), 1, "non-nil value expected"); + tree->key = addtonewktable(L, 0, 1); + return 1; +} + + +/* +** Create a tree for a non-empty capture, with a body and +** optionally with an associated Lua value (at index 'labelidx' in the +** stack) +*/ +static int capture_aux (lua_State *L, int cap, int labelidx) { + TTree *tree = newroot1sib(L, TCapture); + tree->cap = cap; + tree->key = (labelidx == 0) ? 0 : addtonewktable(L, 1, labelidx); + return 1; +} + + +/* +** Fill a tree with an empty capture, using an empty (TTrue) sibling. +** (The 'key' field must be filled by the caller to finish the tree.) +*/ +static TTree *auxemptycap (TTree *tree, int cap) { + tree->tag = TCapture; + tree->cap = cap; + sib1(tree)->tag = TTrue; + return tree; +} + + +/* +** Create a tree for an empty capture. +*/ +static TTree *newemptycap (lua_State *L, int cap, int key) { + TTree *tree = auxemptycap(newtree(L, 2), cap); + tree->key = key; + return tree; +} + + +/* +** Create a tree for an empty capture with an associated Lua value. +*/ +static TTree *newemptycapkey (lua_State *L, int cap, int idx) { + TTree *tree = auxemptycap(newtree(L, 2), cap); + tree->key = addtonewktable(L, 0, idx); + return tree; +} + + +/* +** Captures with syntax p / v +** (function capture, query capture, string capture, or number capture) +*/ +static int lp_divcapture (lua_State *L) { + switch (lua_type(L, 2)) { + case LUA_TFUNCTION: return capture_aux(L, Cfunction, 2); + case LUA_TTABLE: return capture_aux(L, Cquery, 2); + case LUA_TSTRING: return capture_aux(L, Cstring, 2); + case LUA_TNUMBER: { + int n = lua_tointeger(L, 2); + TTree *tree = newroot1sib(L, TCapture); + luaL_argcheck(L, 0 <= n && n <= SHRT_MAX, 1, "invalid number"); + tree->cap = Cnum; + tree->key = n; + return 1; + } + default: return luaL_argerror(L, 2, "invalid replacement value"); + } +} + + +static int lp_substcapture (lua_State *L) { + return capture_aux(L, Csubst, 0); +} + + +static int lp_tablecapture (lua_State *L) { + return capture_aux(L, Ctable, 0); +} + + +static int lp_groupcapture (lua_State *L) { + if (lua_isnoneornil(L, 2)) + return capture_aux(L, Cgroup, 0); + else + return capture_aux(L, Cgroup, 2); +} + + +static int lp_foldcapture (lua_State *L) { + luaL_checktype(L, 2, LUA_TFUNCTION); + return capture_aux(L, Cfold, 2); +} + + +static int lp_simplecapture (lua_State *L) { + return capture_aux(L, Csimple, 0); +} + + +static int lp_poscapture (lua_State *L) { + newemptycap(L, Cposition, 0); + return 1; +} + + +static int lp_argcapture (lua_State *L) { + int n = (int)luaL_checkinteger(L, 1); + luaL_argcheck(L, 0 < n && n <= SHRT_MAX, 1, "invalid argument index"); + newemptycap(L, Carg, n); + return 1; +} + + +static int lp_backref (lua_State *L) { + luaL_checkany(L, 1); + newemptycapkey(L, Cbackref, 1); + return 1; +} + + +/* +** Constant capture +*/ +static int lp_constcapture (lua_State *L) { + int i; + int n = lua_gettop(L); /* number of values */ + if (n == 0) /* no values? */ + newleaf(L, TTrue); /* no capture */ + else if (n == 1) + newemptycapkey(L, Cconst, 1); /* single constant capture */ + else { /* create a group capture with all values */ + TTree *tree = newtree(L, 1 + 3 * (n - 1) + 2); + newktable(L, n); /* create a 'ktable' for new tree */ + tree->tag = TCapture; + tree->cap = Cgroup; + tree->key = 0; + tree = sib1(tree); + for (i = 1; i <= n - 1; i++) { + tree->tag = TSeq; + tree->u.ps = 3; /* skip TCapture and its sibling */ + auxemptycap(sib1(tree), Cconst); + sib1(tree)->key = addtoktable(L, i); + tree = sib2(tree); + } + auxemptycap(tree, Cconst); + tree->key = addtoktable(L, i); + } + return 1; +} + + +static int lp_matchtime (lua_State *L) { + TTree *tree; + luaL_checktype(L, 2, LUA_TFUNCTION); + tree = newroot1sib(L, TRunTime); + tree->key = addtonewktable(L, 1, 2); + return 1; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Grammar - Tree generation +** ======================================================= +*/ + +/* +** push on the stack the index and the pattern for the +** initial rule of grammar at index 'arg' in the stack; +** also add that index into position table. +*/ +static void getfirstrule (lua_State *L, int arg, int postab) { + lua_rawgeti(L, arg, 1); /* access first element */ + if (lua_isstring(L, -1)) { /* is it the name of initial rule? */ + lua_pushvalue(L, -1); /* duplicate it to use as key */ + lua_gettable(L, arg); /* get associated rule */ + } + else { + lua_pushinteger(L, 1); /* key for initial rule */ + lua_insert(L, -2); /* put it before rule */ + } + if (!testpattern(L, -1)) { /* initial rule not a pattern? */ + if (lua_isnil(L, -1)) + luaL_error(L, "grammar has no initial rule"); + else + luaL_error(L, "initial rule '%s' is not a pattern", lua_tostring(L, -2)); + } + lua_pushvalue(L, -2); /* push key */ + lua_pushinteger(L, 1); /* push rule position (after TGrammar) */ + lua_settable(L, postab); /* insert pair at position table */ +} + +/* +** traverse grammar at index 'arg', pushing all its keys and patterns +** into the stack. Create a new table (before all pairs key-pattern) to +** collect all keys and their associated positions in the final tree +** (the "position table"). +** Return the number of rules and (in 'totalsize') the total size +** for the new tree. +*/ +static int collectrules (lua_State *L, int arg, int *totalsize) { + int n = 1; /* to count number of rules */ + int postab = lua_gettop(L) + 1; /* index of position table */ + int size; /* accumulator for total size */ + lua_newtable(L); /* create position table */ + getfirstrule(L, arg, postab); + size = 2 + getsize(L, postab + 2); /* TGrammar + TRule + rule */ + lua_pushnil(L); /* prepare to traverse grammar table */ + while (lua_next(L, arg) != 0) { + if (lua_tonumber(L, -2) == 1 || + lp_equal(L, -2, postab + 1)) { /* initial rule? */ + lua_pop(L, 1); /* remove value (keep key for lua_next) */ + continue; + } + if (!testpattern(L, -1)) /* value is not a pattern? */ + luaL_error(L, "rule '%s' is not a pattern", val2str(L, -2)); + luaL_checkstack(L, LUA_MINSTACK, "grammar has too many rules"); + lua_pushvalue(L, -2); /* push key (to insert into position table) */ + lua_pushinteger(L, size); + lua_settable(L, postab); + size += 1 + getsize(L, -1); /* update size */ + lua_pushvalue(L, -2); /* push key (for next lua_next) */ + n++; + } + *totalsize = size + 1; /* TTrue to finish list of rules */ + return n; +} + + +static void buildgrammar (lua_State *L, TTree *grammar, int frule, int n) { + int i; + TTree *nd = sib1(grammar); /* auxiliary pointer to traverse the tree */ + for (i = 0; i < n; i++) { /* add each rule into new tree */ + int ridx = frule + 2*i + 1; /* index of i-th rule */ + int rulesize; + TTree *rn = gettree(L, ridx, &rulesize); + nd->tag = TRule; + nd->key = 0; /* will be fixed when rule is used */ + nd->cap = i; /* rule number */ + nd->u.ps = rulesize + 1; /* point to next rule */ + memcpy(sib1(nd), rn, rulesize * sizeof(TTree)); /* copy rule */ + mergektable(L, ridx, sib1(nd)); /* merge its ktable into new one */ + nd = sib2(nd); /* move to next rule */ + } + nd->tag = TTrue; /* finish list of rules */ +} + + +/* +** Check whether a tree has potential infinite loops +*/ +static int checkloops (TTree *tree) { + tailcall: + if (tree->tag == TRep && nullable(sib1(tree))) + return 1; + else if (tree->tag == TGrammar) + return 0; /* sub-grammars already checked */ + else { + switch (numsiblings[tree->tag]) { + case 1: /* return checkloops(sib1(tree)); */ + tree = sib1(tree); goto tailcall; + case 2: + if (checkloops(sib1(tree))) return 1; + /* else return checkloops(sib2(tree)); */ + tree = sib2(tree); goto tailcall; + default: assert(numsiblings[tree->tag] == 0); return 0; + } + } +} + + +/* +** Give appropriate error message for 'verifyrule'. If a rule appears +** twice in 'passed', there is path from it back to itself without +** advancing the subject. +*/ +static int verifyerror (lua_State *L, int *passed, int npassed) { + int i, j; + for (i = npassed - 1; i >= 0; i--) { /* search for a repetition */ + for (j = i - 1; j >= 0; j--) { + if (passed[i] == passed[j]) { + lua_rawgeti(L, -1, passed[i]); /* get rule's key */ + return luaL_error(L, "rule '%s' may be left recursive", val2str(L, -1)); + } + } + } + return luaL_error(L, "too many left calls in grammar"); +} + + +/* +** Check whether a rule can be left recursive; raise an error in that +** case; otherwise return 1 iff pattern is nullable. +** The return value is used to check sequences, where the second pattern +** is only relevant if the first is nullable. +** Parameter 'nb' works as an accumulator, to allow tail calls in +** choices. ('nb' true makes function returns true.) +** Parameter 'passed' is a list of already visited rules, 'npassed' +** counts the elements in 'passed'. +** Assume ktable at the top of the stack. +*/ +static int verifyrule (lua_State *L, TTree *tree, int *passed, int npassed, + int nb) { + tailcall: + switch (tree->tag) { + case TChar: case TSet: case TAny: + case TFalse: + return nb; /* cannot pass from here */ + case TTrue: + case TBehind: /* look-behind cannot have calls */ + return 1; + case TNot: case TAnd: case TRep: + /* return verifyrule(L, sib1(tree), passed, npassed, 1); */ + tree = sib1(tree); nb = 1; goto tailcall; + case TCapture: case TRunTime: + /* return verifyrule(L, sib1(tree), passed, npassed, nb); */ + tree = sib1(tree); goto tailcall; + case TCall: + /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TSeq: /* only check 2nd child if first is nb */ + if (!verifyrule(L, sib1(tree), passed, npassed, 0)) + return nb; + /* else return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TChoice: /* must check both children */ + nb = verifyrule(L, sib1(tree), passed, npassed, nb); + /* return verifyrule(L, sib2(tree), passed, npassed, nb); */ + tree = sib2(tree); goto tailcall; + case TRule: + if (npassed >= MAXRULES) + return verifyerror(L, passed, npassed); + else { + passed[npassed++] = tree->key; + /* return verifyrule(L, sib1(tree), passed, npassed); */ + tree = sib1(tree); goto tailcall; + } + case TGrammar: + return nullable(tree); /* sub-grammar cannot be left recursive */ + default: assert(0); return 0; + } +} + + +static void verifygrammar (lua_State *L, TTree *grammar) { + int passed[MAXRULES]; + TTree *rule; + /* check left-recursive rules */ + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + if (rule->key == 0) continue; /* unused rule */ + verifyrule(L, sib1(rule), passed, 0, 0); + } + assert(rule->tag == TTrue); + /* check infinite loops inside rules */ + for (rule = sib1(grammar); rule->tag == TRule; rule = sib2(rule)) { + if (rule->key == 0) continue; /* unused rule */ + if (checkloops(sib1(rule))) { + lua_rawgeti(L, -1, rule->key); /* get rule's key */ + luaL_error(L, "empty loop in rule '%s'", val2str(L, -1)); + } + } + assert(rule->tag == TTrue); +} + + +/* +** Give a name for the initial rule if it is not referenced +*/ +static void initialrulename (lua_State *L, TTree *grammar, int frule) { + if (sib1(grammar)->key == 0) { /* initial rule is not referenced? */ + int n = lua_rawlen(L, -1) + 1; /* index for name */ + lua_pushvalue(L, frule); /* rule's name */ + lua_rawseti(L, -2, n); /* ktable was on the top of the stack */ + sib1(grammar)->key = n; + } +} + + +static TTree *newgrammar (lua_State *L, int arg) { + int treesize; + int frule = lua_gettop(L) + 2; /* position of first rule's key */ + int n = collectrules(L, arg, &treesize); + TTree *g = newtree(L, treesize); + luaL_argcheck(L, n <= MAXRULES, arg, "grammar has too many rules"); + g->tag = TGrammar; g->u.n = n; + lua_newtable(L); /* create 'ktable' */ + lua_setuservalue(L, -2); + buildgrammar(L, g, frule, n); + lua_getuservalue(L, -1); /* get 'ktable' for new tree */ + finalfix(L, frule - 1, g, sib1(g)); + initialrulename(L, g, frule); + verifygrammar(L, g); + lua_pop(L, 1); /* remove 'ktable' */ + lua_insert(L, -(n * 2 + 2)); /* move new table to proper position */ + lua_pop(L, n * 2 + 1); /* remove position table + rule pairs */ + return g; /* new table at the top of the stack */ +} + +/* }====================================================== */ + + +static Instruction *prepcompile (lua_State *L, Pattern *p, int idx) { + lua_getuservalue(L, idx); /* push 'ktable' (may be used by 'finalfix') */ + finalfix(L, 0, NULL, p->tree); + lua_pop(L, 1); /* remove 'ktable' */ + return compile(L, p); +} + + +static int lp_printtree (lua_State *L) { + TTree *tree = getpatt(L, 1, NULL); + int c = lua_toboolean(L, 2); + if (c) { + lua_getuservalue(L, 1); /* push 'ktable' (may be used by 'finalfix') */ + finalfix(L, 0, NULL, tree); + lua_pop(L, 1); /* remove 'ktable' */ + } + printktable(L, 1); + printtree(tree, 0); + return 0; +} + + +static int lp_printcode (lua_State *L) { + Pattern *p = getpattern(L, 1); + printktable(L, 1); + if (p->code == NULL) /* not compiled yet? */ + prepcompile(L, p, 1); + printpatt(p->code, p->codesize); + return 0; +} + + +/* +** Get the initial position for the match, interpreting negative +** values from the end of the subject +*/ +static size_t initposition (lua_State *L, size_t len) { + lua_Integer ii = luaL_optinteger(L, 3, 1); + if (ii > 0) { /* positive index? */ + if ((size_t)ii <= len) /* inside the string? */ + return (size_t)ii - 1; /* return it (corrected to 0-base) */ + else return len; /* crop at the end */ + } + else { /* negative index */ + if ((size_t)(-ii) <= len) /* inside the string? */ + return len - ((size_t)(-ii)); /* return position from the end */ + else return 0; /* crop at the beginning */ + } +} + + +/* +** Main match function +*/ +static int lp_match (lua_State *L) { + Capture capture[INITCAPSIZE]; + const char *r; + size_t l; + Pattern *p = (getpatt(L, 1, NULL), getpattern(L, 1)); + Instruction *code = (p->code != NULL) ? p->code : prepcompile(L, p, 1); + const char *s = luaL_checklstring(L, SUBJIDX, &l); + size_t i = initposition(L, l); + int ptop = lua_gettop(L); + lua_pushnil(L); /* initialize subscache */ + lua_pushlightuserdata(L, capture); /* initialize caplistidx */ + lua_getuservalue(L, 1); /* initialize penvidx */ + r = match(L, s, s + i, s + l, code, capture, ptop); + if (r == NULL) { + lua_pushnil(L); + return 1; + } + return getcaptures(L, s, r, ptop); +} + + + +/* +** {====================================================== +** Library creation and functions not related to matching +** ======================================================= +*/ + +/* maximum limit for stack size */ +#define MAXLIM (INT_MAX / 100) + +static int lp_setmax (lua_State *L) { + lua_Integer lim = luaL_checkinteger(L, 1); + luaL_argcheck(L, 0 < lim && lim <= MAXLIM, 1, "out of range"); + lua_settop(L, 1); + lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + return 0; +} + + +static int lp_version (lua_State *L) { + lua_pushstring(L, VERSION); + return 1; +} + + +static int lp_type (lua_State *L) { + if (testpattern(L, 1)) + lua_pushliteral(L, "pattern"); + else + lua_pushnil(L); + return 1; +} + + +int lp_gc (lua_State *L) { + Pattern *p = getpattern(L, 1); + realloccode(L, p, 0); /* delete code block */ + return 0; +} + + +static void createcat (lua_State *L, const char *catname, int (catf) (int)) { + TTree *t = newcharset(L); + int i; + for (i = 0; i <= UCHAR_MAX; i++) + if (catf(i)) setchar(treebuffer(t), i); + lua_setfield(L, -2, catname); +} + + +static int lp_locale (lua_State *L) { + if (lua_isnoneornil(L, 1)) { + lua_settop(L, 0); + lua_createtable(L, 0, 12); + } + else { + luaL_checktype(L, 1, LUA_TTABLE); + lua_settop(L, 1); + } + createcat(L, "alnum", isalnum); + createcat(L, "alpha", isalpha); + createcat(L, "cntrl", iscntrl); + createcat(L, "digit", isdigit); + createcat(L, "graph", isgraph); + createcat(L, "lower", islower); + createcat(L, "print", isprint); + createcat(L, "punct", ispunct); + createcat(L, "space", isspace); + createcat(L, "upper", isupper); + createcat(L, "xdigit", isxdigit); + return 1; +} + + +static struct luaL_Reg pattreg[] = { + {"ptree", lp_printtree}, + {"pcode", lp_printcode}, + {"match", lp_match}, + {"B", lp_behind}, + {"V", lp_V}, + {"C", lp_simplecapture}, + {"Cc", lp_constcapture}, + {"Cmt", lp_matchtime}, + {"Cb", lp_backref}, + {"Carg", lp_argcapture}, + {"Cp", lp_poscapture}, + {"Cs", lp_substcapture}, + {"Ct", lp_tablecapture}, + {"Cf", lp_foldcapture}, + {"Cg", lp_groupcapture}, + {"P", lp_P}, + {"S", lp_set}, + {"R", lp_range}, + {"locale", lp_locale}, + {"version", lp_version}, + {"setmaxstack", lp_setmax}, + {"type", lp_type}, + {NULL, NULL} +}; + + +static struct luaL_Reg metareg[] = { + {"__mul", lp_seq}, + {"__add", lp_choice}, + {"__pow", lp_star}, + {"__gc", lp_gc}, + {"__len", lp_and}, + {"__div", lp_divcapture}, + {"__unm", lp_not}, + {"__sub", lp_sub}, + {NULL, NULL} +}; + + +int luaopen_lpeg (lua_State *L); +int luaopen_lpeg (lua_State *L) { + luaL_newmetatable(L, PATTERN_T); + lua_pushnumber(L, MAXBACK); /* initialize maximum backtracking */ + lua_setfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + luaL_setfuncs(L, metareg, 0); + luaL_newlib(L, pattreg); + lua_pushvalue(L, -1); + lua_setfield(L, -3, "__index"); + return 1; +} + +/* }====================================================== */ diff --git a/ThirdParty/lpeg-1.0.2/lptree.h b/ThirdParty/lpeg-1.0.2/lptree.h new file mode 100644 index 0000000..25906d5 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lptree.h @@ -0,0 +1,82 @@ +/* +** $Id: lptree.h $ +*/ + +#if !defined(lptree_h) +#define lptree_h + + +#include "lptypes.h" + + +/* +** types of trees +*/ +typedef enum TTag { + TChar = 0, /* 'n' = char */ + TSet, /* the set is stored in next CHARSETSIZE bytes */ + TAny, + TTrue, + TFalse, + TRep, /* 'sib1'* */ + TSeq, /* 'sib1' 'sib2' */ + TChoice, /* 'sib1' / 'sib2' */ + TNot, /* !'sib1' */ + TAnd, /* &'sib1' */ + TCall, /* ktable[key] is rule's key; 'sib2' is rule being called */ + TOpenCall, /* ktable[key] is rule's key */ + TRule, /* ktable[key] is rule's key (but key == 0 for unused rules); + 'sib1' is rule's pattern; + 'sib2' is next rule; 'cap' is rule's sequential number */ + TGrammar, /* 'sib1' is initial (and first) rule */ + TBehind, /* 'sib1' is pattern, 'n' is how much to go back */ + TCapture, /* captures: 'cap' is kind of capture (enum 'CapKind'); + ktable[key] is Lua value associated with capture; + 'sib1' is capture body */ + TRunTime /* run-time capture: 'key' is Lua function; + 'sib1' is capture body */ +} TTag; + + +/* +** Tree trees +** The first child of a tree (if there is one) is immediately after +** the tree. A reference to a second child (ps) is its position +** relative to the position of the tree itself. +*/ +typedef struct TTree { + byte tag; + byte cap; /* kind of capture (if it is a capture) */ + unsigned short key; /* key in ktable for Lua data (0 if no key) */ + union { + int ps; /* occasional second child */ + int n; /* occasional counter */ + } u; +} TTree; + + +/* +** A complete pattern has its tree plus, if already compiled, +** its corresponding code +*/ +typedef struct Pattern { + union Instruction *code; + int codesize; + TTree tree[1]; +} Pattern; + + +/* number of children for each tree */ +extern const byte numsiblings[]; + +/* access to children */ +#define sib1(t) ((t) + 1) +#define sib2(t) ((t) + (t)->u.ps) + + + + + + +#endif + diff --git a/ThirdParty/lpeg-1.0.2/lptypes.h b/ThirdParty/lpeg-1.0.2/lptypes.h new file mode 100644 index 0000000..1d9d59f --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lptypes.h @@ -0,0 +1,145 @@ +/* +** $Id: lptypes.h $ +** LPeg - PEG pattern matching for Lua +** Copyright 2007-2019, Lua.org & PUC-Rio (see 'lpeg.html' for license) +** written by Roberto Ierusalimschy +*/ + +#if !defined(lptypes_h) +#define lptypes_h + + +#include <assert.h> +#include <limits.h> + +#include "lua.h" + + +#define VERSION "1.0.2" + + +#define PATTERN_T "lpeg-pattern" +#define MAXSTACKIDX "lpeg-maxstack" + + +/* +** compatibility with Lua 5.1 +*/ +#if (LUA_VERSION_NUM == 501) + +#define lp_equal lua_equal + +#define lua_getuservalue lua_getfenv +#define lua_setuservalue lua_setfenv + +#define lua_rawlen lua_objlen + +#define luaL_setfuncs(L,f,n) luaL_register(L,NULL,f) +#define luaL_newlib(L,f) luaL_register(L,"lpeg",f) + +#endif + + +#if !defined(lp_equal) +#define lp_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) +#endif + + +/* default maximum size for call/backtrack stack */ +#if !defined(MAXBACK) +#define MAXBACK 400 +#endif + + +/* maximum number of rules in a grammar (limited by 'unsigned char') */ +#if !defined(MAXRULES) +#define MAXRULES 250 +#endif + + + +/* initial size for capture's list */ +#define INITCAPSIZE 32 + + +/* index, on Lua stack, for subject */ +#define SUBJIDX 2 + +/* number of fixed arguments to 'match' (before capture arguments) */ +#define FIXEDARGS 3 + +/* index, on Lua stack, for capture list */ +#define caplistidx(ptop) ((ptop) + 2) + +/* index, on Lua stack, for pattern's ktable */ +#define ktableidx(ptop) ((ptop) + 3) + +/* index, on Lua stack, for backtracking stack */ +#define stackidx(ptop) ((ptop) + 4) + + + +typedef unsigned char byte; + + +#define BITSPERCHAR 8 + +#define CHARSETSIZE ((UCHAR_MAX/BITSPERCHAR) + 1) + + + +typedef struct Charset { + byte cs[CHARSETSIZE]; +} Charset; + + + +#define loopset(v,b) { int v; for (v = 0; v < CHARSETSIZE; v++) {b;} } + +/* access to charset */ +#define treebuffer(t) ((byte *)((t) + 1)) + +/* number of slots needed for 'n' bytes */ +#define bytes2slots(n) (((n) - 1) / sizeof(TTree) + 1) + +/* set 'b' bit in charset 'cs' */ +#define setchar(cs,b) ((cs)[(b) >> 3] |= (1 << ((b) & 7))) + + +/* +** in capture instructions, 'kind' of capture and its offset are +** packed in field 'aux', 4 bits for each +*/ +#define getkind(op) ((op)->i.aux & 0xF) +#define getoff(op) (((op)->i.aux >> 4) & 0xF) +#define joinkindoff(k,o) ((k) | ((o) << 4)) + +#define MAXOFF 0xF +#define MAXAUX 0xFF + + +/* maximum number of bytes to look behind */ +#define MAXBEHIND MAXAUX + + +/* maximum size (in elements) for a pattern */ +#define MAXPATTSIZE (SHRT_MAX - 10) + + +/* size (in elements) for an instruction plus extra l bytes */ +#define instsize(l) (((l) + sizeof(Instruction) - 1)/sizeof(Instruction) + 1) + + +/* size (in elements) for a ISet instruction */ +#define CHARSETINSTSIZE instsize(CHARSETSIZE) + +/* size (in elements) for a IFunc instruction */ +#define funcinstsize(p) ((p)->i.aux + 2) + + + +#define testchar(st,c) (((int)(st)[((c) >> 3)] & (1 << ((c) & 7)))) + + +#endif + diff --git a/ThirdParty/lpeg-1.0.2/lpvm.c b/ThirdParty/lpeg-1.0.2/lpvm.c new file mode 100644 index 0000000..737418c --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpvm.c @@ -0,0 +1,374 @@ +/* +** $Id: lpvm.c $ +** Copyright 2007, Lua.org & PUC-Rio (see 'lpeg.html' for license) +*/ + +#include <limits.h> +#include <string.h> + + +#include "lua.h" +#include "lauxlib.h" + +#include "lpcap.h" +#include "lptypes.h" +#include "lpvm.h" +#include "lpprint.h" + + +/* initial size for call/backtrack stack */ +#if !defined(INITBACK) +#define INITBACK MAXBACK +#endif + + +#define getoffset(p) (((p) + 1)->offset) + +static const Instruction giveup = {{IGiveup, 0, 0}}; + + +/* +** {====================================================== +** Virtual Machine +** ======================================================= +*/ + + +typedef struct Stack { + const char *s; /* saved position (or NULL for calls) */ + const Instruction *p; /* next instruction */ + int caplevel; +} Stack; + + +#define getstackbase(L, ptop) ((Stack *)lua_touserdata(L, stackidx(ptop))) + + +/* +** Ensures the size of array 'capture' (with size '*capsize' and +** 'captop' elements being used) is enough to accomodate 'n' extra +** elements plus one. (Because several opcodes add stuff to the capture +** array, it is simpler to ensure the array always has at least one free +** slot upfront and check its size later.) +*/ +static Capture *growcap (lua_State *L, Capture *capture, int *capsize, + int captop, int n, int ptop) { + if (*capsize - captop > n) + return capture; /* no need to grow array */ + else { /* must grow */ + Capture *newc; + int newsize = captop + n + 1; /* minimum size needed */ + if (newsize < INT_MAX/((int)sizeof(Capture) * 2)) + newsize *= 2; /* twice that size, if not too big */ + else if (newsize >= INT_MAX/((int)sizeof(Capture))) + luaL_error(L, "too many captures"); + newc = (Capture *)lua_newuserdata(L, newsize * sizeof(Capture)); + memcpy(newc, capture, captop * sizeof(Capture)); + *capsize = newsize; + lua_replace(L, caplistidx(ptop)); + return newc; + } +} + + +/* +** Double the size of the stack +*/ +static Stack *doublestack (lua_State *L, Stack **stacklimit, int ptop) { + Stack *stack = getstackbase(L, ptop); + Stack *newstack; + int n = *stacklimit - stack; /* current stack size */ + int max, newn; + lua_getfield(L, LUA_REGISTRYINDEX, MAXSTACKIDX); + max = lua_tointeger(L, -1); /* maximum allowed size */ + lua_pop(L, 1); + if (n >= max) /* already at maximum size? */ + luaL_error(L, "backtrack stack overflow (current limit is %d)", max); + newn = 2 * n; /* new size */ + if (newn > max) newn = max; + newstack = (Stack *)lua_newuserdata(L, newn * sizeof(Stack)); + memcpy(newstack, stack, n * sizeof(Stack)); + lua_replace(L, stackidx(ptop)); + *stacklimit = newstack + newn; + return newstack + n; /* return next position */ +} + + +/* +** Interpret the result of a dynamic capture: false -> fail; +** true -> keep current position; number -> next position. +** Return new subject position. 'fr' is stack index where +** is the result; 'curr' is current subject position; 'limit' +** is subject's size. +*/ +static int resdyncaptures (lua_State *L, int fr, int curr, int limit) { + lua_Integer res; + if (!lua_toboolean(L, fr)) { /* false value? */ + lua_settop(L, fr - 1); /* remove results */ + return -1; /* and fail */ + } + else if (lua_isboolean(L, fr)) /* true? */ + res = curr; /* keep current position */ + else { + res = lua_tointeger(L, fr) - 1; /* new position */ + if (res < curr || res > limit) + luaL_error(L, "invalid position returned by match-time capture"); + } + lua_remove(L, fr); /* remove first result (offset) */ + return res; +} + + +/* +** Add capture values returned by a dynamic capture to the list +** 'capture', nested inside a group. 'fd' indexes the first capture +** value, 'n' is the number of values (at least 1). The open group +** capture is already in 'capture', before the place for the new entries. +*/ +static void adddyncaptures (const char *s, Capture *capture, int n, int fd) { + int i; + assert(capture[-1].kind == Cgroup && capture[-1].siz == 0); + capture[-1].idx = 0; /* make group capture an anonymous group */ + for (i = 0; i < n; i++) { /* add runtime captures */ + capture[i].kind = Cruntime; + capture[i].siz = 1; /* mark it as closed */ + capture[i].idx = fd + i; /* stack index of capture value */ + capture[i].s = s; + } + capture[n].kind = Cclose; /* close group */ + capture[n].siz = 1; + capture[n].s = s; +} + + +/* +** Remove dynamic captures from the Lua stack (called in case of failure) +*/ +static int removedyncap (lua_State *L, Capture *capture, + int level, int last) { + int id = finddyncap(capture + level, capture + last); /* index of 1st cap. */ + int top = lua_gettop(L); + if (id == 0) return 0; /* no dynamic captures? */ + lua_settop(L, id - 1); /* remove captures */ + return top - id + 1; /* number of values removed */ +} + + +/* +** Opcode interpreter +*/ +const char *match (lua_State *L, const char *o, const char *s, const char *e, + Instruction *op, Capture *capture, int ptop) { + Stack stackbase[INITBACK]; + Stack *stacklimit = stackbase + INITBACK; + Stack *stack = stackbase; /* point to first empty slot in stack */ + int capsize = INITCAPSIZE; + int captop = 0; /* point to first empty slot in captures */ + int ndyncap = 0; /* number of dynamic captures (in Lua stack) */ + const Instruction *p = op; /* current instruction */ + stack->p = &giveup; stack->s = s; stack->caplevel = 0; stack++; + lua_pushlightuserdata(L, stackbase); + for (;;) { +#if defined(DEBUG) + printf("-------------------------------------\n"); + printcaplist(capture, capture + captop); + printf("s: |%s| stck:%d, dyncaps:%d, caps:%d ", + s, (int)(stack - getstackbase(L, ptop)), ndyncap, captop); + printinst(op, p); +#endif + assert(stackidx(ptop) + ndyncap == lua_gettop(L) && ndyncap <= captop); + switch ((Opcode)p->i.code) { + case IEnd: { + assert(stack == getstackbase(L, ptop) + 1); + capture[captop].kind = Cclose; + capture[captop].s = NULL; + return s; + } + case IGiveup: { + assert(stack == getstackbase(L, ptop)); + return NULL; + } + case IRet: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s == NULL); + p = (--stack)->p; + continue; + } + case IAny: { + if (s < e) { p++; s++; } + else goto fail; + continue; + } + case ITestAny: { + if (s < e) p += 2; + else p += getoffset(p); + continue; + } + case IChar: { + if ((byte)*s == p->i.aux && s < e) { p++; s++; } + else goto fail; + continue; + } + case ITestChar: { + if ((byte)*s == p->i.aux && s < e) p += 2; + else p += getoffset(p); + continue; + } + case ISet: { + int c = (byte)*s; + if (testchar((p+1)->buff, c) && s < e) + { p += CHARSETINSTSIZE; s++; } + else goto fail; + continue; + } + case ITestSet: { + int c = (byte)*s; + if (testchar((p + 2)->buff, c) && s < e) + p += 1 + CHARSETINSTSIZE; + else p += getoffset(p); + continue; + } + case IBehind: { + int n = p->i.aux; + if (n > s - o) goto fail; + s -= n; p++; + continue; + } + case ISpan: { + for (; s < e; s++) { + int c = (byte)*s; + if (!testchar((p+1)->buff, c)) break; + } + p += CHARSETINSTSIZE; + continue; + } + case IJmp: { + p += getoffset(p); + continue; + } + case IChoice: { + if (stack == stacklimit) + stack = doublestack(L, &stacklimit, ptop); + stack->p = p + getoffset(p); + stack->s = s; + stack->caplevel = captop; + stack++; + p += 2; + continue; + } + case ICall: { + if (stack == stacklimit) + stack = doublestack(L, &stacklimit, ptop); + stack->s = NULL; + stack->p = p + 2; /* save return address */ + stack++; + p += getoffset(p); + continue; + } + case ICommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + stack--; + p += getoffset(p); + continue; + } + case IPartialCommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + (stack - 1)->s = s; + (stack - 1)->caplevel = captop; + p += getoffset(p); + continue; + } + case IBackCommit: { + assert(stack > getstackbase(L, ptop) && (stack - 1)->s != NULL); + s = (--stack)->s; + captop = stack->caplevel; + p += getoffset(p); + continue; + } + case IFailTwice: + assert(stack > getstackbase(L, ptop)); + stack--; + /* go through */ + case IFail: + fail: { /* pattern failed: try to backtrack */ + do { /* remove pending calls */ + assert(stack > getstackbase(L, ptop)); + s = (--stack)->s; + } while (s == NULL); + if (ndyncap > 0) /* is there matchtime captures? */ + ndyncap -= removedyncap(L, capture, stack->caplevel, captop); + captop = stack->caplevel; + p = stack->p; +#if defined(DEBUG) + printf("**FAIL**\n"); +#endif + continue; + } + case ICloseRunTime: { + CapState cs; + int rem, res, n; + int fr = lua_gettop(L) + 1; /* stack index of first result */ + cs.reclevel = 0; cs.L = L; + cs.s = o; cs.ocap = capture; cs.ptop = ptop; + n = runtimecap(&cs, capture + captop, s, &rem); /* call function */ + captop -= n; /* remove nested captures */ + ndyncap -= rem; /* update number of dynamic captures */ + fr -= rem; /* 'rem' items were popped from Lua stack */ + res = resdyncaptures(L, fr, s - o, e - o); /* get result */ + if (res == -1) /* fail? */ + goto fail; + s = o + res; /* else update current position */ + n = lua_gettop(L) - fr + 1; /* number of new captures */ + ndyncap += n; /* update number of dynamic captures */ + if (n == 0) /* no new captures? */ + captop--; /* remove open group */ + else { /* new captures; keep original open group */ + if (fr + n >= SHRT_MAX) + luaL_error(L, "too many results in match-time capture"); + /* add new captures + close group to 'capture' list */ + capture = growcap(L, capture, &capsize, captop, n + 1, ptop); + adddyncaptures(s, capture + captop, n, fr); + captop += n + 1; /* new captures + close group */ + } + p++; + continue; + } + case ICloseCapture: { + const char *s1 = s; + assert(captop > 0); + /* if possible, turn capture into a full capture */ + if (capture[captop - 1].siz == 0 && + s1 - capture[captop - 1].s < UCHAR_MAX) { + capture[captop - 1].siz = s1 - capture[captop - 1].s + 1; + p++; + continue; + } + else { + capture[captop].siz = 1; /* mark entry as closed */ + capture[captop].s = s; + goto pushcapture; + } + } + case IOpenCapture: + capture[captop].siz = 0; /* mark entry as open */ + capture[captop].s = s; + goto pushcapture; + case IFullCapture: + capture[captop].siz = getoff(p) + 1; /* save capture size */ + capture[captop].s = s - getoff(p); + /* goto pushcapture; */ + pushcapture: { + capture[captop].idx = p->i.key; + capture[captop].kind = getkind(p); + captop++; + capture = growcap(L, capture, &capsize, captop, 0, ptop); + p++; + continue; + } + default: assert(0); return NULL; + } + } +} + +/* }====================================================== */ + + diff --git a/ThirdParty/lpeg-1.0.2/lpvm.h b/ThirdParty/lpeg-1.0.2/lpvm.h new file mode 100644 index 0000000..69ec33d --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/lpvm.h @@ -0,0 +1,58 @@ +/* +** $Id: lpvm.h $ +*/ + +#if !defined(lpvm_h) +#define lpvm_h + +#include "lpcap.h" + + +/* Virtual Machine's instructions */ +typedef enum Opcode { + IAny, /* if no char, fail */ + IChar, /* if char != aux, fail */ + ISet, /* if char not in buff, fail */ + ITestAny, /* in no char, jump to 'offset' */ + ITestChar, /* if char != aux, jump to 'offset' */ + ITestSet, /* if char not in buff, jump to 'offset' */ + ISpan, /* read a span of chars in buff */ + IBehind, /* walk back 'aux' characters (fail if not possible) */ + IRet, /* return from a rule */ + IEnd, /* end of pattern */ + IChoice, /* stack a choice; next fail will jump to 'offset' */ + IJmp, /* jump to 'offset' */ + ICall, /* call rule at 'offset' */ + IOpenCall, /* call rule number 'key' (must be closed to a ICall) */ + ICommit, /* pop choice and jump to 'offset' */ + IPartialCommit, /* update top choice to current position and jump */ + IBackCommit, /* "fails" but jump to its own 'offset' */ + IFailTwice, /* pop one choice and then fail */ + IFail, /* go back to saved state on choice and jump to saved offset */ + IGiveup, /* internal use */ + IFullCapture, /* complete capture of last 'off' chars */ + IOpenCapture, /* start a capture */ + ICloseCapture, + ICloseRunTime +} Opcode; + + + +typedef union Instruction { + struct Inst { + byte code; + byte aux; + short key; + } i; + int offset; + byte buff[1]; +} Instruction; + + +void printpatt (Instruction *p, int n); +const char *match (lua_State *L, const char *o, const char *s, const char *e, + Instruction *op, Capture *capture, int ptop); + + +#endif + diff --git a/ThirdParty/lpeg-1.0.2/makefile b/ThirdParty/lpeg-1.0.2/makefile new file mode 100644 index 0000000..1e32195 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/makefile @@ -0,0 +1,55 @@ +LIBNAME = lpeg +LUADIR = ../lua/ + +COPT = -O2 -DNDEBUG +# COPT = -g + +CWARNS = -Wall -Wextra -pedantic \ + -Waggregate-return \ + -Wcast-align \ + -Wcast-qual \ + -Wdisabled-optimization \ + -Wpointer-arith \ + -Wshadow \ + -Wsign-compare \ + -Wundef \ + -Wwrite-strings \ + -Wbad-function-cast \ + -Wdeclaration-after-statement \ + -Wmissing-prototypes \ + -Wnested-externs \ + -Wstrict-prototypes \ +# -Wunreachable-code \ + + +CFLAGS = $(CWARNS) $(COPT) -std=c99 -I$(LUADIR) -fPIC +CC = gcc + +FILES = lpvm.o lpcap.o lptree.o lpcode.o lpprint.o + +# For Linux +linux: + $(MAKE) lpeg.so "DLLFLAGS = -shared -fPIC" + +# For Mac OS +macosx: + $(MAKE) lpeg.so "DLLFLAGS = -bundle -undefined dynamic_lookup" + +lpeg.so: $(FILES) + env $(CC) $(DLLFLAGS) $(FILES) -o lpeg.so + +$(FILES): makefile + +test: test.lua re.lua lpeg.so + ./test.lua + +clean: + rm -f $(FILES) lpeg.so + + +lpcap.o: lpcap.c lpcap.h lptypes.h +lpcode.o: lpcode.c lptypes.h lpcode.h lptree.h lpvm.h lpcap.h +lpprint.o: lpprint.c lptypes.h lpprint.h lptree.h lpvm.h lpcap.h +lptree.o: lptree.c lptypes.h lpcap.h lpcode.h lptree.h lpvm.h lpprint.h +lpvm.o: lpvm.c lpcap.h lptypes.h lpvm.h lpprint.h lptree.h + diff --git a/ThirdParty/lpeg-1.0.2/re.html b/ThirdParty/lpeg-1.0.2/re.html new file mode 100644 index 0000000..ad60d50 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/re.html @@ -0,0 +1,494 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html> +<head> + <title>LPeg.re - Regex syntax for LPEG</title> + <link rel="stylesheet" + href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css" + type="text/css"/> + <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> +</head> +<body> + +<!-- $Id: re.html $ --> + +<div id="container"> + +<div id="product"> + <div id="product_logo"> + <a href="http://www.inf.puc-rio.br/~roberto/lpeg/"> + <img alt="LPeg logo" src="lpeg-128.gif"/> + </a> + </div> + <div id="product_name"><big><strong>LPeg.re</strong></big></div> + <div id="product_description"> + Regex syntax for LPEG + </div> +</div> <!-- id="product" --> + +<div id="main"> + +<div id="navigation"> +<h1>re</h1> + +<ul> + <li><a href="#basic">Basic Constructions</a></li> + <li><a href="#func">Functions</a></li> + <li><a href="#ex">Some Examples</a></li> + <li><a href="#license">License</a></li> + </ul> + </li> +</ul> +</div> <!-- id="navigation" --> + +<div id="content"> + +<h2><a name="basic"></a>The <code>re</code> Module</h2> + +<p> +The <code>re</code> module +(provided by file <code>re.lua</code> in the distribution) +supports a somewhat conventional regex syntax +for pattern usage within <a href="lpeg.html">LPeg</a>. +</p> + +<p> +The next table summarizes <code>re</code>'s syntax. +A <code>p</code> represents an arbitrary pattern; +<code>num</code> represents a number (<code>[0-9]+</code>); +<code>name</code> represents an identifier +(<code>[a-zA-Z][a-zA-Z0-9_]*</code>). +Constructions are listed in order of decreasing precedence. +<table border="1"> +<tbody><tr><td><b>Syntax</b></td><td><b>Description</b></td></tr> +<tr><td><code>( p )</code></td> <td>grouping</td></tr> +<tr><td><code>'string'</code></td> <td>literal string</td></tr> +<tr><td><code>"string"</code></td> <td>literal string</td></tr> +<tr><td><code>[class]</code></td> <td>character class</td></tr> +<tr><td><code>.</code></td> <td>any character</td></tr> +<tr><td><code>%name</code></td> + <td>pattern <code>defs[name]</code> or a pre-defined pattern</td></tr> +<tr><td><code>name</code></td><td>non terminal</td></tr> +<tr><td><code><name></code></td><td>non terminal</td></tr> +<tr><td><code>{}</code></td> <td>position capture</td></tr> +<tr><td><code>{ p }</code></td> <td>simple capture</td></tr> +<tr><td><code>{: p :}</code></td> <td>anonymous group capture</td></tr> +<tr><td><code>{:name: p :}</code></td> <td>named group capture</td></tr> +<tr><td><code>{~ p ~}</code></td> <td>substitution capture</td></tr> +<tr><td><code>{| p |}</code></td> <td>table capture</td></tr> +<tr><td><code>=name</code></td> <td>back reference +</td></tr> +<tr><td><code>p ?</code></td> <td>optional match</td></tr> +<tr><td><code>p *</code></td> <td>zero or more repetitions</td></tr> +<tr><td><code>p +</code></td> <td>one or more repetitions</td></tr> +<tr><td><code>p^num</code></td> <td>exactly <code>n</code> repetitions</td></tr> +<tr><td><code>p^+num</code></td> + <td>at least <code>n</code> repetitions</td></tr> +<tr><td><code>p^-num</code></td> + <td>at most <code>n</code> repetitions</td></tr> +<tr><td><code>p -> 'string'</code></td> <td>string capture</td></tr> +<tr><td><code>p -> "string"</code></td> <td>string capture</td></tr> +<tr><td><code>p -> num</code></td> <td>numbered capture</td></tr> +<tr><td><code>p -> name</code></td> <td>function/query/string capture +equivalent to <code>p / defs[name]</code></td></tr> +<tr><td><code>p => name</code></td> <td>match-time capture +equivalent to <code>lpeg.Cmt(p, defs[name])</code></td></tr> +<tr><td><code>p ~> name</code></td> <td>fold capture +equivalent to <code>lpeg.Cf(p, defs[name])</code></td></tr> +<tr><td><code>& p</code></td> <td>and predicate</td></tr> +<tr><td><code>! p</code></td> <td>not predicate</td></tr> +<tr><td><code>p1 p2</code></td> <td>concatenation</td></tr> +<tr><td><code>p1 / p2</code></td> <td>ordered choice</td></tr> +<tr><td>(<code>name <- p</code>)<sup>+</sup></td> <td>grammar</td></tr> +</tbody></table> +<p> +Any space appearing in a syntax description can be +replaced by zero or more space characters and Lua-style comments +(<code>--</code> until end of line). +</p> + +<p> +Character classes define sets of characters. +An initial <code>^</code> complements the resulting set. +A range <em>x</em><code>-</code><em>y</em> includes in the set +all characters with codes between the codes of <em>x</em> and <em>y</em>. +A pre-defined class <code>%</code><em>name</em> includes all +characters of that class. +A simple character includes itself in the set. +The only special characters inside a class are <code>^</code> +(special only if it is the first character); +<code>]</code> +(can be included in the set as the first character, +after the optional <code>^</code>); +<code>%</code> (special only if followed by a letter); +and <code>-</code> +(can be included in the set as the first or the last character). +</p> + +<p> +Currently the pre-defined classes are similar to those from the +Lua's string library +(<code>%a</code> for letters, +<code>%A</code> for non letters, etc.). +There is also a class <code>%nl</code> +containing only the newline character, +which is particularly handy for grammars written inside long strings, +as long strings do not interpret escape sequences like <code>\n</code>. +</p> + + +<h2><a name="func">Functions</a></h2> + +<h3><code>re.compile (string, [, defs])</code></h3> +<p> +Compiles the given string and +returns an equivalent LPeg pattern. +The given string may define either an expression or a grammar. +The optional <code>defs</code> table provides extra Lua values +to be used by the pattern. +</p> + +<h3><code>re.find (subject, pattern [, init])</code></h3> +<p> +Searches the given pattern in the given subject. +If it finds a match, +returns the index where this occurrence starts and +the index where it ends. +Otherwise, returns nil. +</p> + +<p> +An optional numeric argument <code>init</code> makes the search +starts at that position in the subject string. +As usual in Lua libraries, +a negative value counts from the end. +</p> + +<h3><code>re.gsub (subject, pattern, replacement)</code></h3> +<p> +Does a <em>global substitution</em>, +replacing all occurrences of <code>pattern</code> +in the given <code>subject</code> by <code>replacement</code>. + +<h3><code>re.match (subject, pattern)</code></h3> +<p> +Matches the given pattern against the given subject, +returning all captures. +</p> + +<h3><code>re.updatelocale ()</code></h3> +<p> +Updates the pre-defined character classes to the current locale. +</p> + + +<h2><a name="ex">Some Examples</a></h2> + +<h3>A complete simple program</h3> +<p> +The next code shows a simple complete Lua program using +the <code>re</code> module: +</p> +<pre class="example"> +local re = require"re" + +-- find the position of the first numeral in a string +print(re.find("the number 423 is odd", "[0-9]+")) --> 12 14 + +-- returns all words in a string +print(re.match("the number 423 is odd", "({%a+} / .)*")) +--> the number is odd + +-- returns the first numeral in a string +print(re.match("the number 423 is odd", "s <- {%d+} / . s")) +--> 423 + +print(re.gsub("hello World", "[aeiou]", ".")) +--> h.ll. W.rld +</pre> + + +<h3>Balanced parentheses</h3> +<p> +The following call will produce the same pattern produced by the +Lua expression in the +<a href="lpeg.html#balanced">balanced parentheses</a> example: +</p> +<pre class="example"> +b = re.compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] +</pre> + +<h3>String reversal</h3> +<p> +The next example reverses a string: +</p> +<pre class="example"> +rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] +print(rev:match"0123456789") --> 9876543210 +</pre> + +<h3>CSV decoder</h3> +<p> +The next example replicates the <a href="lpeg.html#CSV">CSV decoder</a>: +</p> +<pre class="example"> +record = re.compile[[ + record <- {| field (',' field)* |} (%nl / !.) + field <- escaped / nonescaped + nonescaped <- { [^,"%nl]* } + escaped <- '"' {~ ([^"] / '""' -> '"')* ~} '"' +]] +</pre> + +<h3>Lua's long strings</h3> +<p> +The next example matches Lua long strings: +</p> +<pre class="example"> +c = re.compile([[ + longstring <- ('[' {:eq: '='* :} '[' close) + close <- ']' =eq ']' / . close +]]) + +print(c:match'[==[]]===]]]]==]===[]') --> 17 +</pre> + +<h3>Abstract Syntax Trees</h3> +<p> +This example shows a simple way to build an +abstract syntax tree (AST) for a given grammar. +To keep our example simple, +let us consider the following grammar +for lists of names: +</p> +<pre class="example"> +p = re.compile[[ + listname <- (name s)* + name <- [a-z][a-z]* + s <- %s* +]] +</pre> +<p> +Now, we will add captures to build a corresponding AST. +As a first step, the pattern will build a table to +represent each non terminal; +terminals will be represented by their corresponding strings: +</p> +<pre class="example"> +c = re.compile[[ + listname <- {| (name s)* |} + name <- {| {[a-z][a-z]*} |} + s <- %s* +]] +</pre> +<p> +Now, a match against <code>"hi hello bye"</code> +results in the table +<code>{{"hi"}, {"hello"}, {"bye"}}</code>. +</p> +<p> +For such a simple grammar, +this AST is more than enough; +actually, the tables around each single name +are already overkilling. +More complex grammars, +however, may need some more structure. +Specifically, +it would be useful if each table had +a <code>tag</code> field telling what non terminal +that table represents. +We can add such a tag using +<a href="lpeg.html#cap-g">named group captures</a>: +</p> +<pre class="example"> +x = re.compile[[ + listname <- {| {:tag: '' -> 'list':} (name s)* |} + name <- {| {:tag: '' -> 'id':} {[a-z][a-z]*} |} + s <- ' '* +]] +</pre> +<p> +With these group captures, +a match against <code>"hi hello bye"</code> +results in the following table: +</p> +<pre class="example"> +{tag="list", + {tag="id", "hi"}, + {tag="id", "hello"}, + {tag="id", "bye"} +} +</pre> + + +<h3>Indented blocks</h3> +<p> +This example breaks indented blocks into tables, +respecting the indentation: +</p> +<pre class="example"> +p = re.compile[[ + block <- {| {:ident:' '*:} line + ((=ident !' ' line) / &(=ident ' ') block)* |} + line <- {[^%nl]*} %nl +]] +</pre> +<p> +As an example, +consider the following text: +</p> +<pre class="example"> +t = p:match[[ +first line + subline 1 + subline 2 +second line +third line + subline 3.1 + subline 3.1.1 + subline 3.2 +]] +</pre> +<p> +The resulting table <code>t</code> will be like this: +</p> +<pre class="example"> + {'first line'; {'subline 1'; 'subline 2'; ident = ' '}; + 'second line'; + 'third line'; { 'subline 3.1'; {'subline 3.1.1'; ident = ' '}; + 'subline 3.2'; ident = ' '}; + ident = ''} +</pre> + +<h3>Macro expander</h3> +<p> +This example implements a simple macro expander. +Macros must be defined as part of the pattern, +following some simple rules: +</p> +<pre class="example"> +p = re.compile[[ + text <- {~ item* ~} + item <- macro / [^()] / '(' item* ')' + arg <- ' '* {~ (!',' item)* ~} + args <- '(' arg (',' arg)* ')' + -- now we define some macros + macro <- ('apply' args) -> '%1(%2)' + / ('add' args) -> '%1 + %2' + / ('mul' args) -> '%1 * %2' +]] + +print(p:match"add(mul(a,b), apply(f,x))") --> a * b + f(x) +</pre> +<p> +A <code>text</code> is a sequence of items, +wherein we apply a substitution capture to expand any macros. +An <code>item</code> is either a macro, +any character different from parentheses, +or a parenthesized expression. +A macro argument (<code>arg</code>) is a sequence +of items different from a comma. +(Note that a comma may appear inside an item, +e.g., inside a parenthesized expression.) +Again we do a substitution capture to expand any macro +in the argument before expanding the outer macro. +<code>args</code> is a list of arguments separated by commas. +Finally we define the macros. +Each macro is a string substitution; +it replaces the macro name and its arguments by its corresponding string, +with each <code>%</code><em>n</em> replaced by the <em>n</em>-th argument. +</p> + +<h3>Patterns</h3> +<p> +This example shows the complete syntax +of patterns accepted by <code>re</code>. +</p> +<pre class="example"> +p = [=[ + +pattern <- exp !. +exp <- S (grammar / alternative) + +alternative <- seq ('/' S seq)* +seq <- prefix* +prefix <- '&' S prefix / '!' S prefix / suffix +suffix <- primary S (([+*?] + / '^' [+-]? num + / '->' S (string / '{}' / name) + / '=>' S name) S)* + +primary <- '(' exp ')' / string / class / defined + / '{:' (name ':')? exp ':}' + / '=' name + / '{}' + / '{~' exp '~}' + / '{' exp '}' + / '.' + / name S !arrow + / '<' name '>' -- old-style non terminals + +grammar <- definition+ +definition <- name S arrow exp + +class <- '[' '^'? item (!']' item)* ']' +item <- defined / range / . +range <- . '-' [^]] + +S <- (%s / '--' [^%nl]*)* -- spaces and comments +name <- [A-Za-z][A-Za-z0-9_]* +arrow <- '<-' +num <- [0-9]+ +string <- '"' [^"]* '"' / "'" [^']* "'" +defined <- '%' name + +]=] + +print(re.match(p, p)) -- a self description must match itself +</pre> + + + +<h2><a name="license">License</a></h2> + +<p> +Copyright © 2008-2015 Lua.org, PUC-Rio. +</p> +<p> +Permission is hereby granted, free of charge, +to any person obtaining a copy of this software and +associated documentation files (the "Software"), +to deal in the Software without restriction, +including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, +and to permit persons to whom the Software is +furnished to do so, +subject to the following conditions: +</p> + +<p> +The above copyright notice and this permission notice +shall be included in all copies or substantial portions of the Software. +</p> + +<p> +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +</p> + +</div> <!-- id="content" --> + +</div> <!-- id="main" --> + +</div> <!-- id="container" --> + +</body> +</html> diff --git a/ThirdParty/lpeg-1.0.2/re.lua b/ThirdParty/lpeg-1.0.2/re.lua new file mode 100644 index 0000000..3bb8af7 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/re.lua @@ -0,0 +1,267 @@ +-- $Id: re.lua $ + +-- imported functions and modules +local tonumber, type, print, error = tonumber, type, print, error +local setmetatable = setmetatable +local m = require"lpeg" + +-- 'm' will be used to parse expressions, and 'mm' will be used to +-- create expressions; that is, 're' runs on 'm', creating patterns +-- on 'mm' +local mm = m + +-- pattern's metatable +local mt = getmetatable(mm.P(0)) + + + +-- No more global accesses after this point +local version = _VERSION +if version == "Lua 5.2" then _ENV = nil end + + +local any = m.P(1) + + +-- Pre-defined names +local Predef = { nl = m.P"\n" } + + +local mem +local fmem +local gmem + + +local function updatelocale () + mm.locale(Predef) + Predef.a = Predef.alpha + Predef.c = Predef.cntrl + Predef.d = Predef.digit + Predef.g = Predef.graph + Predef.l = Predef.lower + Predef.p = Predef.punct + Predef.s = Predef.space + Predef.u = Predef.upper + Predef.w = Predef.alnum + Predef.x = Predef.xdigit + Predef.A = any - Predef.a + Predef.C = any - Predef.c + Predef.D = any - Predef.d + Predef.G = any - Predef.g + Predef.L = any - Predef.l + Predef.P = any - Predef.p + Predef.S = any - Predef.s + Predef.U = any - Predef.u + Predef.W = any - Predef.w + Predef.X = any - Predef.x + mem = {} -- restart memoization + fmem = {} + gmem = {} + local mt = {__mode = "v"} + setmetatable(mem, mt) + setmetatable(fmem, mt) + setmetatable(gmem, mt) +end + + +updatelocale() + + + +local I = m.P(function (s,i) print(i, s:sub(1, i-1)); return i end) + + +local function patt_error (s, i) + local msg = (#s < i + 20) and s:sub(i) + or s:sub(i,i+20) .. "..." + msg = ("pattern error near '%s'"):format(msg) + error(msg, 2) +end + +local function mult (p, n) + local np = mm.P(true) + while n >= 1 do + if n%2 >= 1 then np = np * p end + p = p * p + n = n/2 + end + return np +end + +local function equalcap (s, i, c) + if type(c) ~= "string" then return nil end + local e = #c + i + if s:sub(i, e - 1) == c then return e else return nil end +end + + +local S = (Predef.space + "--" * (any - Predef.nl)^0)^0 + +local name = m.R("AZ", "az", "__") * m.R("AZ", "az", "__", "09")^0 + +local arrow = S * "<-" + +local seq_follow = m.P"/" + ")" + "}" + ":}" + "~}" + "|}" + (name * arrow) + -1 + +name = m.C(name) + + +-- a defined name only have meaning in a given environment +local Def = name * m.Carg(1) + + +local function getdef (id, defs) + local c = defs and defs[id] + if not c then error("undefined name: " .. id) end + return c +end + +-- match a name and return a group of its corresponding definition +-- and 'f' (to be folded in 'Suffix') +local function defwithfunc (f) + return m.Cg(Def / getdef * m.Cc(f)) +end + + +local num = m.C(m.R"09"^1) * S / tonumber + +local String = "'" * m.C((any - "'")^0) * "'" + + '"' * m.C((any - '"')^0) * '"' + + +local defined = "%" * Def / function (c,Defs) + local cat = Defs and Defs[c] or Predef[c] + if not cat then error ("name '" .. c .. "' undefined") end + return cat +end + +local Range = m.Cs(any * (m.P"-"/"") * (any - "]")) / mm.R + +local item = (defined + Range + m.C(any)) / m.P + +local Class = + "[" + * (m.C(m.P"^"^-1)) -- optional complement symbol + * m.Cf(item * (item - "]")^0, mt.__add) / + function (c, p) return c == "^" and any - p or p end + * "]" + +local function adddef (t, k, exp) + if t[k] then + error("'"..k.."' already defined as a rule") + else + t[k] = exp + end + return t +end + +local function firstdef (n, r) return adddef({n}, n, r) end + + +local function NT (n, b) + if not b then + error("rule '"..n.."' used outside a grammar") + else return mm.V(n) + end +end + + +local exp = m.P{ "Exp", + Exp = S * ( m.V"Grammar" + + m.Cf(m.V"Seq" * ("/" * S * m.V"Seq")^0, mt.__add) ); + Seq = m.Cf(m.Cc(m.P"") * m.V"Prefix"^0 , mt.__mul) + * (#seq_follow + patt_error); + Prefix = "&" * S * m.V"Prefix" / mt.__len + + "!" * S * m.V"Prefix" / mt.__unm + + m.V"Suffix"; + Suffix = m.Cf(m.V"Primary" * S * + ( ( m.P"+" * m.Cc(1, mt.__pow) + + m.P"*" * m.Cc(0, mt.__pow) + + m.P"?" * m.Cc(-1, mt.__pow) + + "^" * ( m.Cg(num * m.Cc(mult)) + + m.Cg(m.C(m.S"+-" * m.R"09"^1) * m.Cc(mt.__pow)) + ) + + "->" * S * ( m.Cg((String + num) * m.Cc(mt.__div)) + + m.P"{}" * m.Cc(nil, m.Ct) + + defwithfunc(mt.__div) + ) + + "=>" * S * defwithfunc(m.Cmt) + + "~>" * S * defwithfunc(m.Cf) + ) * S + )^0, function (a,b,f) return f(a,b) end ); + Primary = "(" * m.V"Exp" * ")" + + String / mm.P + + Class + + defined + + "{:" * (name * ":" + m.Cc(nil)) * m.V"Exp" * ":}" / + function (n, p) return mm.Cg(p, n) end + + "=" * name / function (n) return mm.Cmt(mm.Cb(n), equalcap) end + + m.P"{}" / mm.Cp + + "{~" * m.V"Exp" * "~}" / mm.Cs + + "{|" * m.V"Exp" * "|}" / mm.Ct + + "{" * m.V"Exp" * "}" / mm.C + + m.P"." * m.Cc(any) + + (name * -arrow + "<" * name * ">") * m.Cb("G") / NT; + Definition = name * arrow * m.V"Exp"; + Grammar = m.Cg(m.Cc(true), "G") * + m.Cf(m.V"Definition" / firstdef * m.Cg(m.V"Definition")^0, + adddef) / mm.P +} + +local pattern = S * m.Cg(m.Cc(false), "G") * exp / mm.P * (-any + patt_error) + + +local function compile (p, defs) + if mm.type(p) == "pattern" then return p end -- already compiled + local cp = pattern:match(p, 1, defs) + if not cp then error("incorrect pattern", 3) end + return cp +end + +local function match (s, p, i) + local cp = mem[p] + if not cp then + cp = compile(p) + mem[p] = cp + end + return cp:match(s, i or 1) +end + +local function find (s, p, i) + local cp = fmem[p] + if not cp then + cp = compile(p) / 0 + cp = mm.P{ mm.Cp() * cp * mm.Cp() + 1 * mm.V(1) } + fmem[p] = cp + end + local i, e = cp:match(s, i or 1) + if i then return i, e - 1 + else return i + end +end + +local function gsub (s, p, rep) + local g = gmem[p] or {} -- ensure gmem[p] is not collected while here + gmem[p] = g + local cp = g[rep] + if not cp then + cp = compile(p) + cp = mm.Cs((cp / rep + 1)^0) + g[rep] = cp + end + return cp:match(s) +end + + +-- exported names +local re = { + compile = compile, + match = match, + find = find, + gsub = gsub, + updatelocale = updatelocale, +} + +if version == "Lua 5.1" then _G.re = re end + +return re diff --git a/ThirdParty/lpeg-1.0.2/test.lua b/ThirdParty/lpeg-1.0.2/test.lua new file mode 100644 index 0000000..8f9f574 --- /dev/null +++ b/ThirdParty/lpeg-1.0.2/test.lua @@ -0,0 +1,1523 @@ +#!/usr/bin/env lua + +-- $Id: test.lua $ + +-- require"strict" -- just to be pedantic + +local m = require"lpeg" + + +-- for general use +local a, b, c, d, e, f, g, p, t + + +-- compatibility with Lua 5.2 +local unpack = rawget(table, "unpack") or unpack +local loadstring = rawget(_G, "loadstring") or load + + +local any = m.P(1) +local space = m.S" \t\n"^0 + +local function checkeq (x, y, p) +if p then print(x,y) end + if type(x) ~= "table" then assert(x == y) + else + for k,v in pairs(x) do checkeq(v, y[k], p) end + for k,v in pairs(y) do checkeq(v, x[k], p) end + end +end + + +local mt = getmetatable(m.P(1)) + + +local allchar = {} +for i=0,255 do allchar[i + 1] = i end +allchar = string.char(unpack(allchar)) +assert(#allchar == 256) + +local function cs2str (c) + return m.match(m.Cs((c + m.P(1)/"")^0), allchar) +end + +local function eqcharset (c1, c2) + assert(cs2str(c1) == cs2str(c2)) +end + + +print"General tests for LPeg library" + +assert(type(m.version()) == "string") +print("version " .. m.version()) +assert(m.type("alo") ~= "pattern") +assert(m.type(io.input) ~= "pattern") +assert(m.type(m.P"alo") == "pattern") + +-- tests for some basic optimizations +assert(m.match(m.P(false) + "a", "a") == 2) +assert(m.match(m.P(true) + "a", "a") == 1) +assert(m.match("a" + m.P(false), "b") == nil) +assert(m.match("a" + m.P(true), "b") == 1) + +assert(m.match(m.P(false) * "a", "a") == nil) +assert(m.match(m.P(true) * "a", "a") == 2) +assert(m.match("a" * m.P(false), "a") == nil) +assert(m.match("a" * m.P(true), "a") == 2) + +assert(m.match(#m.P(false) * "a", "a") == nil) +assert(m.match(#m.P(true) * "a", "a") == 2) +assert(m.match("a" * #m.P(false), "a") == nil) +assert(m.match("a" * #m.P(true), "a") == 2) + + +-- tests for locale +do + assert(m.locale(m) == m) + local t = {} + assert(m.locale(t, m) == t) + local x = m.locale() + for n,v in pairs(x) do + assert(type(n) == "string") + eqcharset(v, m[n]) + end +end + + +assert(m.match(3, "aaaa")) +assert(m.match(4, "aaaa")) +assert(not m.match(5, "aaaa")) +assert(m.match(-3, "aa")) +assert(not m.match(-3, "aaa")) +assert(not m.match(-3, "aaaa")) +assert(not m.match(-4, "aaaa")) +assert(m.P(-5):match"aaaa") + +assert(m.match("a", "alo") == 2) +assert(m.match("al", "alo") == 3) +assert(not m.match("alu", "alo")) +assert(m.match(true, "") == 1) + +local digit = m.S"0123456789" +local upper = m.S"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +local lower = m.S"abcdefghijklmnopqrstuvwxyz" +local letter = m.S"" + upper + lower +local alpha = letter + digit + m.R() + +eqcharset(m.S"", m.P(false)) +eqcharset(upper, m.R("AZ")) +eqcharset(lower, m.R("az")) +eqcharset(upper + lower, m.R("AZ", "az")) +eqcharset(upper + lower, m.R("AZ", "cz", "aa", "bb", "90")) +eqcharset(digit, m.S"01234567" + "8" + "9") +eqcharset(upper, letter - lower) +eqcharset(m.S(""), m.R()) +assert(cs2str(m.S("")) == "") + +eqcharset(m.S"\0", "\0") +eqcharset(m.S"\1\0\2", m.R"\0\2") +eqcharset(m.S"\1\0\2", m.R"\1\2" + "\0") +eqcharset(m.S"\1\0\2" - "\0", m.R"\1\2") + +local word = alpha^1 * (1 - alpha)^0 + +assert((word^0 * -1):match"alo alo") +assert(m.match(word^1 * -1, "alo alo")) +assert(m.match(word^2 * -1, "alo alo")) +assert(not m.match(word^3 * -1, "alo alo")) + +assert(not m.match(word^-1 * -1, "alo alo")) +assert(m.match(word^-2 * -1, "alo alo")) +assert(m.match(word^-3 * -1, "alo alo")) + +local eos = m.P(-1) + +assert(m.match(digit^0 * letter * digit * eos, "1298a1")) +assert(not m.match(digit^0 * letter * eos, "1257a1")) + +b = { + [1] = "(" * (((1 - m.S"()") + #m.P"(" * m.V(1))^0) * ")" +} + +assert(m.match(b, "(al())()")) +assert(not m.match(b * eos, "(al())()")) +assert(m.match(b * eos, "((al())()(é))")) +assert(not m.match(b, "(al()()")) + +assert(not m.match(letter^1 - "for", "foreach")) +assert(m.match(letter^1 - ("for" * eos), "foreach")) +assert(not m.match(letter^1 - ("for" * eos), "for")) + +function basiclookfor (p) + return m.P { + [1] = p + (1 * m.V(1)) + } +end + +function caplookfor (p) + return basiclookfor(p:C()) +end + +assert(m.match(caplookfor(letter^1), " 4achou123...") == "achou") +a = {m.match(caplookfor(letter^1)^0, " two words, one more ")} +checkeq(a, {"two", "words", "one", "more"}) + +assert(m.match( basiclookfor((#m.P(b) * 1) * m.Cp()), " ( (a)") == 7) + +a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "123")} +checkeq(a, {"123", "d"}) + +-- bug in LPeg 0.12 (nil value does not create a 'ktable') +assert(m.match(m.Cc(nil), "") == nil) + +a = {m.match(m.C(digit^1 * m.Cc"d") + m.C(letter^1 * m.Cc"l"), "abcd")} +checkeq(a, {"abcd", "l"}) + +a = {m.match(m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} +checkeq(a, {10,20,30,2}) +a = {m.match(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp(), 'aaa')} +checkeq(a, {1,10,20,30,2}) +a = m.match(m.Ct(m.Cp() * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') +checkeq(a, {1,10,20,30,2}) +a = m.match(m.Ct(m.Cp() * m.Cc(7,8) * m.Cc(10,20,30) * 'a' * m.Cp()), 'aaa') +checkeq(a, {1,7,8,10,20,30,2}) +a = {m.match(m.Cc() * m.Cc() * m.Cc(1) * m.Cc(2,3,4) * m.Cc() * 'a', 'aaa')} +checkeq(a, {1,2,3,4}) + +a = {m.match(m.Cp() * letter^1 * m.Cp(), "abcd")} +checkeq(a, {1, 5}) + + +t = {m.match({[1] = m.C(m.C(1) * m.V(1) + -1)}, "abc")} +checkeq(t, {"abc", "a", "bc", "b", "c", "c", ""}) + +-- bug in 0.12 ('hascapture' did not check for captures inside a rule) +do + local pat = m.P{ + 'S'; + S1 = m.C('abc') + 3, + S = #m.V('S1') -- rule has capture, but '#' must ignore it + } + assert(pat:match'abc' == 1) +end + + +-- bug: loop in 'hascaptures' +do + local p = m.C(-m.P{m.P'x' * m.V(1) + m.P'y'}) + assert(p:match("xxx") == "") +end + + + +-- test for small capture boundary +for i = 250,260 do + assert(#m.match(m.C(i), string.rep('a', i)) == i) + assert(#m.match(m.C(m.C(i)), string.rep('a', i)) == i) +end + +-- tests for any*n and any*-n +for n = 1, 550, 13 do + local x_1 = string.rep('x', n - 1) + local x = x_1 .. 'a' + assert(not m.P(n):match(x_1)) + assert(m.P(n):match(x) == n + 1) + assert(n < 4 or m.match(m.P(n) + "xxx", x_1) == 4) + assert(m.C(n):match(x) == x) + assert(m.C(m.C(n)):match(x) == x) + assert(m.P(-n):match(x_1) == 1) + assert(not m.P(-n):match(x)) + assert(n < 13 or m.match(m.Cc(20) * ((n - 13) * m.P(10)) * 3, x) == 20) + local n3 = math.floor(n/3) + assert(m.match(n3 * m.Cp() * n3 * n3, x) == n3 + 1) +end + +-- true values +assert(m.P(0):match("x") == 1) +assert(m.P(0):match("") == 1) +assert(m.C(0):match("x") == "") + +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxu") == 1) +assert(m.match(m.Cc(0) * m.P(10) + m.Cc(1) * "xuxu", "xuxuxuxuxu") == 0) +assert(m.match(m.C(m.P(2)^1), "abcde") == "abcd") +p = m.Cc(0) * 1 + m.Cc(1) * 2 + m.Cc(2) * 3 + m.Cc(3) * 4 + + +-- test for alternation optimization +assert(m.match(m.P"a"^1 + "ab" + m.P"x"^0, "ab") == 2) +assert(m.match((m.P"a"^1 + "ab" + m.P"x"^0 * 1)^0, "ab") == 3) +assert(m.match(m.P"ab" + "cd" + "" + "cy" + "ak", "98") == 1) +assert(m.match(m.P"ab" + "cd" + "ax" + "cy", "ax") == 3) +assert(m.match("a" * m.P"b"^0 * "c" + "cd" + "ax" + "cy", "ax") == 3) +assert(m.match((m.P"ab" + "cd" + "ax" + "cy")^0, "ax") == 3) +assert(m.match(m.P(1) * "x" + m.S"" * "xu" + "ay", "ay") == 3) +assert(m.match(m.P"abc" + "cde" + "aka", "aka") == 4) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "ax") == 3) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "aka") == 4) +assert(m.match(m.S"abc" * "x" + "cde" + "aka", "cde") == 4) +assert(m.match(m.S"abc" * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "ax") == 3) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "cde" + "aka", "cde") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "aka") == 4) +assert(m.match("ab" + m.S"abc" * m.P"y"^0 * "x" + "ide" + m.S"ab" * "ka", "ax") == 3) +assert(m.match(m.P(1) * "x" + "cde" + m.S"ab" * "ka", "aka") == 4) +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "aka") == 4) +assert(m.match(m.P(1) * "x" + "cde" + m.P(1) * "ka", "cde") == 4) +assert(m.match(m.P"eb" + "cd" + m.P"e"^0 + "x", "ee") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "abcd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "eeex") == 4) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "cd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x", "x") == 1) +assert(m.match(m.P"ab" + "cd" + m.P"e"^0 + "x" + "", "zee") == 1) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "abcd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "eeex") == 4) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "cd") == 3) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x", "x") == 2) +assert(m.match(m.P"ab" + "cd" + m.P"e"^1 + "x" + "", "zee") == 1) +assert(not m.match(("aa" * m.P"bc"^-1 + "aab") * "e", "aabe")) + +assert(m.match("alo" * (m.P"\n" + -1), "alo") == 4) + + +-- bug in 0.12 (rc1) +assert(m.match((m.P"\128\187\191" + m.S"abc")^0, "\128\187\191") == 4) + +assert(m.match(m.S"\0\128\255\127"^0, string.rep("\0\128\255\127", 10)) == + 4*10 + 1) + +-- optimizations with optional parts +assert(m.match(("ab" * -m.P"c")^-1, "abc") == 1) +assert(m.match(("ab" * #m.P"c")^-1, "abd") == 1) +assert(m.match(("ab" * m.B"c")^-1, "ab") == 1) +assert(m.match(("ab" * m.P"cd"^0)^-1, "abcdcdc") == 7) + +assert(m.match(m.P"ab"^-1 - "c", "abcd") == 3) + +p = ('Aa' * ('Bb' * ('Cc' * m.P'Dd'^0)^0)^0)^-1 +assert(p:match("AaBbCcDdBbCcDdDdDdBb") == 21) + + +-- bug in 0.12.2 +-- p = { ('ab' ('c' 'ef'?)*)? } +p = m.C(('ab' * ('c' * m.P'ef'^-1)^0)^-1) +s = "abcefccefc" +assert(s == p:match(s)) + + +pi = "3.14159 26535 89793 23846 26433 83279 50288 41971 69399 37510" +assert(m.match(m.Cs((m.P"1" / "a" + m.P"5" / "b" + m.P"9" / "c" + 1)^0), pi) == + m.match(m.Cs((m.P(1) / {["1"] = "a", ["5"] = "b", ["9"] = "c"})^0), pi)) +print"+" + + +-- tests for capture optimizations +assert(m.match((m.P(3) + 4 * m.Cp()) * "a", "abca") == 5) +t = {m.match(((m.P"a" + m.Cp()) * m.P"x")^0, "axxaxx")} +checkeq(t, {3, 6}) + + +-- tests for numbered captures +p = m.C(1) +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 3, "abcdefgh") == "a") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 1, "abcdefgh") == "abcdef") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 4, "abcdefgh") == "bc") +assert(m.match(m.C(m.C(p * m.C(2)) * m.C(3)) / 0, "abcdefgh") == 7) + +a, b, c = m.match(p * (m.C(p * m.C(2)) * m.C(3) / 4) * p, "abcdefgh") +assert(a == "a" and b == "efg" and c == "h") + +-- test for table captures +t = m.match(m.Ct(letter^1), "alo") +checkeq(t, {}) + +t, n = m.match(m.Ct(m.C(letter)^1) * m.Cc"t", "alo") +assert(n == "t" and table.concat(t) == "alo") + +t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") +assert(table.concat(t, ";") == "alo;a;l;o") + +t = m.match(m.Ct(m.C(m.C(letter)^1)), "alo") +assert(table.concat(t, ";") == "alo;a;l;o") + +t = m.match(m.Ct(m.Ct((m.Cp() * letter * m.Cp())^1)), "alo") +assert(table.concat(t[1], ";") == "1;2;2;3;3;4") + +t = m.match(m.Ct(m.C(m.C(1) * 1 * m.C(1))), "alo") +checkeq(t, {"alo", "a", "o"}) + + +-- tests for groups +p = m.Cg(1) -- no capture +assert(p:match('x') == 'x') +p = m.Cg(m.P(true)/function () end * 1) -- no value +assert(p:match('x') == 'x') +p = m.Cg(m.Cg(m.Cg(m.C(1)))) +assert(p:match('x') == 'x') +p = m.Cg(m.Cg(m.Cg(m.C(1))^0) * m.Cg(m.Cc(1) * m.Cc(2))) +t = {p:match'abc'} +checkeq(t, {'a', 'b', 'c', 1, 2}) + +p = m.Ct(m.Cg(m.Cc(10), "hi") * m.C(1)^0 * m.Cg(m.Cc(20), "ho")) +t = p:match'' +checkeq(t, {hi = 10, ho = 20}) +t = p:match'abc' +checkeq(t, {hi = 10, ho = 20, 'a', 'b', 'c'}) + +-- non-string group names +p = m.Ct(m.Cg(1, print) * m.Cg(1, 23.5) * m.Cg(1, io)) +t = p:match('abcdefghij') +assert(t[print] == 'a' and t[23.5] == 'b' and t[io] == 'c') + + +-- test for error messages +local function checkerr (msg, f, ...) + local st, err = pcall(f, ...) + assert(not st and m.match({ m.P(msg) + 1 * m.V(1) }, err)) +end + +checkerr("rule '1' may be left recursive", m.match, { m.V(1) * 'a' }, "a") +checkerr("rule '1' used outside a grammar", m.match, m.V(1), "") +checkerr("rule 'hiii' used outside a grammar", m.match, m.V('hiii'), "") +checkerr("rule 'hiii' undefined in given grammar", m.match, { m.V('hiii') }, "") +checkerr("undefined in given grammar", m.match, { m.V{} }, "") + +checkerr("rule 'A' is not a pattern", m.P, { m.P(1), A = {} }) +checkerr("grammar has no initial rule", m.P, { [print] = {} }) + +-- grammar with a long call chain before left recursion +p = {'a', + a = m.V'b' * m.V'c' * m.V'd' * m.V'a', + b = m.V'c', + c = m.V'd', + d = m.V'e', + e = m.V'f', + f = m.V'g', + g = m.P'' +} +checkerr("rule 'a' may be left recursive", m.match, p, "a") + +-- Bug in peephole optimization of LPeg 0.12 (IJmp -> ICommit) +-- the next grammar has an original sequence IJmp -> ICommit -> IJmp L1 +-- that is optimized to ICommit L1 + +p = m.P { (m.P {m.P'abc'} + 'ayz') * m.V'y'; y = m.P'x' } +assert(p:match('abcx') == 5 and p:match('ayzx') == 5 and not p:match'abc') + + +do + -- large dynamic Cc + local lim = 2^16 - 1 + local c = 0 + local function seq (n) + if n == 1 then c = c + 1; return m.Cc(c) + else + local m = math.floor(n / 2) + return seq(m) * seq(n - m) + end + end + p = m.Ct(seq(lim)) + t = p:match('') + assert(t[lim] == lim) + checkerr("too many", function () p = p / print end) + checkerr("too many", seq, lim + 1) +end + + +do + -- nesting of captures too deep + local p = m.C(1) + for i = 1, 300 do + p = m.Ct(p) + end + checkerr("too deep", p.match, p, "x") +end + + +-- tests for non-pattern as arguments to pattern functions + +p = { ('a' * m.V(1))^-1 } * m.P'b' * { 'a' * m.V(2); m.V(1)^-1 } +assert(m.match(p, "aaabaac") == 7) + +p = m.P'abc' * 2 * -5 * true * 'de' -- mix of numbers and strings and booleans + +assert(p:match("abc01de") == 8) +assert(p:match("abc01de3456") == nil) + +p = 'abc' * (2 * (-5 * (true * m.P'de'))) + +assert(p:match("abc01de") == 8) +assert(p:match("abc01de3456") == nil) + +p = { m.V(2), m.P"abc" } * + (m.P{ "xx", xx = m.P"xx" } + { "x", x = m.P"a" * m.V"x" + "" }) +assert(p:match("abcaaaxx") == 7) +assert(p:match("abcxx") == 6) + + +-- a large table capture +t = m.match(m.Ct(m.C('a')^0), string.rep("a", 10000)) +assert(#t == 10000 and t[1] == 'a' and t[#t] == 'a') + +print('+') + + +-- bug in 0.10 (rechecking a grammar, after tail-call optimization) +m.P{ m.P { (m.P(3) + "xuxu")^0 * m.V"xuxu", xuxu = m.P(1) } } + +local V = m.V + +local Space = m.S(" \n\t")^0 +local Number = m.C(m.R("09")^1) * Space +local FactorOp = m.C(m.S("+-")) * Space +local TermOp = m.C(m.S("*/")) * Space +local Open = "(" * Space +local Close = ")" * Space + + +local function f_factor (v1, op, v2, d) + assert(d == nil) + if op == "+" then return v1 + v2 + else return v1 - v2 + end +end + + +local function f_term (v1, op, v2, d) + assert(d == nil) + if op == "*" then return v1 * v2 + else return v1 / v2 + end +end + +G = m.P{ "Exp", + Exp = m.Cf(V"Factor" * m.Cg(FactorOp * V"Factor")^0, f_factor); + Factor = m.Cf(V"Term" * m.Cg(TermOp * V"Term")^0, f_term); + Term = Number / tonumber + Open * V"Exp" * Close; +} + +G = Space * G * -1 + +for _, s in ipairs{" 3 + 5*9 / (1+1) ", "3+4/2", "3+3-3- 9*2+3*9/1- 8"} do + assert(m.match(G, s) == loadstring("return "..s)()) +end + + +-- test for grammars (errors deep in calling non-terminals) +g = m.P{ + [1] = m.V(2) + "a", + [2] = "a" * m.V(3) * "x", + [3] = "b" * m.V(3) + "c" +} + +assert(m.match(g, "abbbcx") == 7) +assert(m.match(g, "abbbbx") == 2) + + +-- tests for \0 +assert(m.match(m.R("\0\1")^1, "\0\1\0") == 4) +assert(m.match(m.S("\0\1ab")^1, "\0\1\0a") == 5) +assert(m.match(m.P(1)^3, "\0\1\0a") == 5) +assert(not m.match(-4, "\0\1\0a")) +assert(m.match("\0\1\0a", "\0\1\0a") == 5) +assert(m.match("\0\0\0", "\0\0\0") == 4) +assert(not m.match("\0\0\0", "\0\0")) + + +-- tests for predicates +assert(not m.match(-m.P("a") * 2, "alo")) +assert(m.match(- -m.P("a") * 2, "alo") == 3) +assert(m.match(#m.P("a") * 2, "alo") == 3) +assert(m.match(##m.P("a") * 2, "alo") == 3) +assert(not m.match(##m.P("c") * 2, "alo")) +assert(m.match(m.Cs((##m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((#((#m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((- -m.P("a") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") +assert(m.match(m.Cs((-((-m.P"a")/"") * 1 + m.P(1)/".")^0), "aloal") == "a..a.") + + +-- fixed length +do + -- 'and' predicate using fixed length + local p = m.C(#("a" * (m.P("bd") + "cd")) * 2) + assert(p:match("acd") == "ac") + + p = #m.P{ "a" * m.V(2), m.P"b" } * 2 + assert(p:match("abc") == 3) + + p = #(m.P"abc" * m.B"c") + assert(p:match("abc") == 1 and not p:match("ab")) + + p = m.P{ "a" * m.V(2), m.P"b"^1 } + checkerr("pattern may not have fixed length", m.B, p) + + p = "abc" * (m.P"b"^1 + m.P"a"^0) + checkerr("pattern may not have fixed length", m.B, p) +end + + +p = -m.P'a' * m.Cc(1) + -m.P'b' * m.Cc(2) + -m.P'c' * m.Cc(3) +assert(p:match('a') == 2 and p:match('') == 1 and p:match('b') == 1) + +p = -m.P'a' * m.Cc(10) + #m.P'a' * m.Cc(20) +assert(p:match('a') == 20 and p:match('') == 10 and p:match('b') == 10) + + + +-- look-behind predicate +assert(not m.match(m.B'a', 'a')) +assert(m.match(1 * m.B'a', 'a') == 2) +assert(not m.match(m.B(1), 'a')) +assert(m.match(1 * m.B(1), 'a') == 2) +assert(m.match(-m.B(1), 'a') == 1) +assert(m.match(m.B(250), string.rep('a', 250)) == nil) +assert(m.match(250 * m.B(250), string.rep('a', 250)) == 251) + +-- look-behind with an open call +checkerr("pattern may not have fixed length", m.B, m.V'S1') +checkerr("too long to look behind", m.B, 260) + +B = #letter * -m.B(letter) + -letter * m.B(letter) +x = m.Ct({ (B * m.Cp())^-1 * (1 * m.V(1) + m.P(true)) }) +checkeq(m.match(x, 'ar cal c'), {1,3,4,7,9,10}) +checkeq(m.match(x, ' ar cal '), {2,4,5,8}) +checkeq(m.match(x, ' '), {}) +checkeq(m.match(x, 'aloalo'), {1,7}) + +assert(m.match(B, "a") == 1) +assert(m.match(1 * B, "a") == 2) +assert(not m.B(1 - letter):match("")) +assert((-m.B(letter)):match("") == 1) + +assert((4 * m.B(letter, 4)):match("aaaaaaaa") == 5) +assert(not (4 * m.B(#letter * 5)):match("aaaaaaaa")) +assert((4 * -m.B(#letter * 5)):match("aaaaaaaa") == 5) + +-- look-behind with grammars +assert(m.match('a' * m.B{'x', x = m.P(3)}, 'aaa') == nil) +assert(m.match('aa' * m.B{'x', x = m.P('aaa')}, 'aaaa') == nil) +assert(m.match('aaa' * m.B{'x', x = m.P('aaa')}, 'aaaaa') == 4) + + + +-- bug in 0.9 +assert(m.match(('a' * #m.P'b'), "ab") == 2) +assert(not m.match(('a' * #m.P'b'), "a")) + +assert(not m.match(#m.S'567', "")) +assert(m.match(#m.S'567' * 1, "6") == 2) + + +-- tests for Tail Calls + +p = m.P{ 'a' * m.V(1) + '' } +assert(p:match(string.rep('a', 1000)) == 1001) + +-- create a grammar for a simple DFA for even number of 0s and 1s +-- +-- ->1 <---0---> 2 +-- ^ ^ +-- | | +-- 1 1 +-- | | +-- V V +-- 3 <---0---> 4 +-- +-- this grammar should keep no backtracking information + +p = m.P{ + [1] = '0' * m.V(2) + '1' * m.V(3) + -1, + [2] = '0' * m.V(1) + '1' * m.V(4), + [3] = '0' * m.V(4) + '1' * m.V(1), + [4] = '0' * m.V(3) + '1' * m.V(2), +} + +assert(p:match(string.rep("00", 10000))) +assert(p:match(string.rep("01", 10000))) +assert(p:match(string.rep("011", 10000))) +assert(not p:match(string.rep("011", 10000) .. "1")) +assert(not p:match(string.rep("011", 10001))) + + +-- this grammar does need backtracking info. +local lim = 10000 +p = m.P{ '0' * m.V(1) + '0' } +checkerr("stack overflow", m.match, p, string.rep("0", lim)) +m.setmaxstack(2*lim) +checkerr("stack overflow", m.match, p, string.rep("0", lim)) +m.setmaxstack(2*lim + 4) +assert(m.match(p, string.rep("0", lim)) == lim + 1) + +-- this repetition should not need stack space (only the call does) +p = m.P{ ('a' * m.V(1))^0 * 'b' + 'c' } +m.setmaxstack(200) +assert(p:match(string.rep('a', 180) .. 'c' .. string.rep('b', 180)) == 362) + +m.setmaxstack(100) -- restore low limit + +-- tests for optional start position +assert(m.match("a", "abc", 1)) +assert(m.match("b", "abc", 2)) +assert(m.match("c", "abc", 3)) +assert(not m.match(1, "abc", 4)) +assert(m.match("a", "abc", -3)) +assert(m.match("b", "abc", -2)) +assert(m.match("c", "abc", -1)) +assert(m.match("abc", "abc", -4)) -- truncate to position 1 + +assert(m.match("", "abc", 10)) -- empty string is everywhere! +assert(m.match("", "", 10)) +assert(not m.match(1, "", 1)) +assert(not m.match(1, "", -1)) +assert(not m.match(1, "", 0)) + +print("+") + + +-- tests for argument captures +checkerr("invalid argument", m.Carg, 0) +checkerr("invalid argument", m.Carg, -1) +checkerr("invalid argument", m.Carg, 2^18) +checkerr("absent extra argument #1", m.match, m.Carg(1), 'a', 1) +assert(m.match(m.Carg(1), 'a', 1, print) == print) +x = {m.match(m.Carg(1) * m.Carg(2), '', 1, 10, 20)} +checkeq(x, {10, 20}) + +assert(m.match(m.Cmt(m.Cg(m.Carg(3), "a") * + m.Cmt(m.Cb("a"), function (s,i,x) + assert(s == "a" and i == 1); + return i, x+1 + end) * + m.Carg(2), function (s,i,a,b,c) + assert(s == "a" and i == 1 and c == nil); + return i, 2*a + 3*b + end) * "a", + "a", 1, false, 100, 1000) == 2*1001 + 3*100) + + +-- tests for Lua functions + +t = {} +s = "" +p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; return nil end) * false +s = "hi, this is a test" +assert(m.match(((p - m.P(-1)) + 2)^0, s) == string.len(s) + 1) +assert(#t == string.len(s)/2 and t[1] == 1 and t[2] == 3) + +assert(not m.match(p, s)) + +p = mt.__add(function (s, i) return i end, function (s, i) return nil end) +assert(m.match(p, "alo")) + +p = mt.__mul(function (s, i) return i end, function (s, i) return nil end) +assert(not m.match(p, "alo")) + + +t = {} +p = function (s1, i) assert(s == s1); t[#t + 1] = i; return i end +s = "hi, this is a test" +assert(m.match((m.P(1) * p)^0, s) == string.len(s) + 1) +assert(#t == string.len(s) and t[1] == 2 and t[2] == 3) + +t = {} +p = m.P(function (s1, i) assert(s == s1); t[#t + 1] = i; + return i <= s1:len() and i end) * 1 +s = "hi, this is a test" +assert(m.match(p^0, s) == string.len(s) + 1) +assert(#t == string.len(s) + 1 and t[1] == 1 and t[2] == 2) + +p = function (s1, i) return m.match(m.P"a"^1, s1, i) end +assert(m.match(p, "aaaa") == 5) +assert(m.match(p, "abaa") == 2) +assert(not m.match(p, "baaa")) + +checkerr("invalid position", m.match, function () return 2^20 end, s) +checkerr("invalid position", m.match, function () return 0 end, s) +checkerr("invalid position", m.match, function (s, i) return i - 1 end, s) +checkerr("invalid position", m.match, + m.P(1)^0 * function (_, i) return i - 1 end, s) +assert(m.match(m.P(1)^0 * function (_, i) return i end * -1, s)) +checkerr("invalid position", m.match, + m.P(1)^0 * function (_, i) return i + 1 end, s) +assert(m.match(m.P(function (s, i) return s:len() + 1 end) * -1, s)) +checkerr("invalid position", m.match, m.P(function (s, i) return s:len() + 2 end) * -1, s) +assert(not m.match(m.P(function (s, i) return s:len() end) * -1, s)) +assert(m.match(m.P(1)^0 * function (_, i) return true end, s) == + string.len(s) + 1) +for i = 1, string.len(s) + 1 do + assert(m.match(function (_, _) return i end, s) == i) +end + +p = (m.P(function (s, i) return i%2 == 0 and i end) * 1 + + m.P(function (s, i) return i%2 ~= 0 and i + 2 <= s:len() and i end) * 3)^0 + * -1 +assert(p:match(string.rep('a', 14000))) + +-- tests for Function Replacements +f = function (a, ...) if a ~= "x" then return {a, ...} end end + +t = m.match(m.C(1)^0/f, "abc") +checkeq(t, {"a", "b", "c"}) + +t = m.match(m.C(1)^0/f/f, "abc") +checkeq(t, {{"a", "b", "c"}}) + +t = m.match(m.P(1)^0/f/f, "abc") -- no capture +checkeq(t, {{"abc"}}) + +t = m.match((m.P(1)^0/f * m.Cp())/f, "abc") +checkeq(t, {{"abc"}, 4}) + +t = m.match((m.C(1)^0/f * m.Cp())/f, "abc") +checkeq(t, {{"a", "b", "c"}, 4}) + +t = m.match((m.C(1)^0/f * m.Cp())/f, "xbc") +checkeq(t, {4}) + +t = m.match(m.C(m.C(1)^0)/f, "abc") +checkeq(t, {"abc", "a", "b", "c"}) + +g = function (...) return 1, ... end +t = {m.match(m.C(1)^0/g/g, "abc")} +checkeq(t, {1, 1, "a", "b", "c"}) + +t = {m.match(m.Cc(nil,nil,4) * m.Cc(nil,3) * m.Cc(nil, nil) / g / g, "")} +t1 = {1,1,nil,nil,4,nil,3,nil,nil} +for i=1,10 do assert(t[i] == t1[i]) end + +-- bug in 0.12.2: ktable with only nil could be eliminated when joining +-- with a pattern without ktable +assert((m.P"aaa" * m.Cc(nil)):match"aaa" == nil) + +t = {m.match((m.C(1) / function (x) return x, x.."x" end)^0, "abc")} +checkeq(t, {"a", "ax", "b", "bx", "c", "cx"}) + +t = m.match(m.Ct((m.C(1) / function (x,y) return y, x end * m.Cc(1))^0), "abc") +checkeq(t, {nil, "a", 1, nil, "b", 1, nil, "c", 1}) + +-- tests for Query Replacements + +assert(m.match(m.C(m.C(1)^0)/{abc = 10}, "abc") == 10) +assert(m.match(m.C(1)^0/{a = 10}, "abc") == 10) +assert(m.match(m.S("ba")^0/{ab = 40}, "abc") == 40) +t = m.match(m.Ct((m.S("ba")/{a = 40})^0), "abc") +checkeq(t, {40}) + +assert(m.match(m.Cs((m.C(1)/{a=".", d=".."})^0), "abcdde") == ".bc....e") +assert(m.match(m.Cs((m.C(1)/{f="."})^0), "abcdde") == "abcdde") +assert(m.match(m.Cs((m.C(1)/{d="."})^0), "abcdde") == "abc..e") +assert(m.match(m.Cs((m.C(1)/{e="."})^0), "abcdde") == "abcdd.") +assert(m.match(m.Cs((m.C(1)/{e=".", f="+"})^0), "eefef") == "..+.+") +assert(m.match(m.Cs((m.C(1))^0), "abcdde") == "abcdde") +assert(m.match(m.Cs(m.C(m.C(1)^0)), "abcdde") == "abcdde") +assert(m.match(1 * m.Cs(m.P(1)^0), "abcdde") == "bcdde") +assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "abcdde") == "abcdde") +assert(m.match(m.Cs((m.C('0')/'x' + 1)^0), "0ab0b0") == "xabxbx") +assert(m.match(m.Cs((m.C('0')/'x' + m.P(1)/{b=3})^0), "b0a0b") == "3xax3") +assert(m.match(m.P(1)/'%0%0'/{aa = -3} * 'x', 'ax') == -3) +assert(m.match(m.C(1)/'%0%1'/{aa = 'z'}/{z = -3} * 'x', 'ax') == -3) + +assert(m.match(m.Cs(m.Cc(0) * (m.P(1)/"")), "4321") == "0") + +assert(m.match(m.Cs((m.P(1) / "%0")^0), "abcd") == "abcd") +assert(m.match(m.Cs((m.P(1) / "%0.%0")^0), "abcd") == "a.ab.bc.cd.d") +assert(m.match(m.Cs((m.P("a") / "%0.%0" + 1)^0), "abcad") == "a.abca.ad") +assert(m.match(m.C("a") / "%1%%%0", "a") == "a%a") +assert(m.match(m.Cs((m.P(1) / ".xx")^0), "abcd") == ".xx.xx.xx.xx") +assert(m.match(m.Cp() * m.P(3) * m.Cp()/"%2%1%1 - %0 ", "abcde") == + "411 - abc ") + +assert(m.match(m.P(1)/"%0", "abc") == "a") +checkerr("invalid capture index", m.match, m.P(1)/"%1", "abc") +checkerr("invalid capture index", m.match, m.P(1)/"%9", "abc") + +p = m.C(1) +p = p * p; p = p * p; p = p * p * m.C(1) / "%9 - %1" +assert(p:match("1234567890") == "9 - 1") + +assert(m.match(m.Cc(print), "") == print) + +-- too many captures (just ignore extra ones) +p = m.C(1)^0 / "%2-%9-%0-%9" +assert(p:match"01234567890123456789" == "1-8-01234567890123456789-8") +s = string.rep("12345678901234567890", 20) +assert(m.match(m.C(1)^0 / "%9-%1-%0-%3", s) == "9-1-" .. s .. "-3") + +-- string captures with non-string subcaptures +p = m.Cc('alo') * m.C(1) / "%1 - %2 - %1" +assert(p:match'x' == 'alo - x - alo') + +checkerr("invalid capture value (a boolean)", m.match, m.Cc(true) / "%1", "a") + +-- long strings for string capture +l = 10000 +s = string.rep('a', l) .. string.rep('b', l) .. string.rep('c', l) + +p = (m.C(m.P'a'^1) * m.C(m.P'b'^1) * m.C(m.P'c'^1)) / '%3%2%1' + +assert(p:match(s) == string.rep('c', l) .. + string.rep('b', l) .. + string.rep('a', l)) + +print"+" + +-- accumulator capture +function f (x) return x + 1 end +assert(m.match(m.Cf(m.Cc(0) * m.C(1)^0, f), "alo alo") == 7) + +t = {m.match(m.Cf(m.Cc(1,2,3), error), "")} +checkeq(t, {1}) +p = m.Cf(m.Ct(true) * m.Cg(m.C(m.R"az"^1) * "=" * m.C(m.R"az"^1) * ";")^0, + rawset) +t = p:match("a=b;c=du;xux=yuy;") +checkeq(t, {a="b", c="du", xux="yuy"}) + + +-- errors in accumulator capture + +-- no initial capture +checkerr("no initial value", m.match, m.Cf(m.P(5), print), 'aaaaaa') +-- no initial capture (very long match forces fold to be a pair open-close) +checkerr("no initial value", m.match, m.Cf(m.P(500), print), + string.rep('a', 600)) + +-- nested capture produces no initial value +checkerr("no initial value", m.match, m.Cf(m.P(1) / {}, print), "alo") + + +-- tests for loop checker + +local function isnullable (p) + checkerr("may accept empty string", function (p) return p^0 end, m.P(p)) +end + +isnullable(m.P("x")^-4) +assert(m.match(((m.P(0) + 1) * m.S"al")^0, "alo") == 3) +assert(m.match((("x" + #m.P(1))^-4 * m.S"al")^0, "alo") == 3) +isnullable("") +isnullable(m.P("x")^0) +isnullable(m.P("x")^-1) +isnullable(m.P("x") + 1 + 2 + m.P("a")^-1) +isnullable(-m.P("ab")) +isnullable(- -m.P("ab")) +isnullable(# #(m.P("ab") + "xy")) +isnullable(- #m.P("ab")^0) +isnullable(# -m.P("ab")^1) +isnullable(#m.V(3)) +isnullable(m.V(3) + m.V(1) + m.P('a')^-1) +isnullable({[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(0)}) +assert(m.match(m.P{[1] = m.V(2) * m.V(3), [2] = m.V(3), [3] = m.P(1)}^0, "abc") + == 3) +assert(m.match(m.P""^-3, "a") == 1) + +local function find (p, s) + return m.match(basiclookfor(p), s) +end + + +local function badgrammar (g, expected) + local stat, msg = pcall(m.P, g) + assert(not stat) + if expected then assert(find(expected, msg)) end +end + +badgrammar({[1] = m.V(1)}, "rule '1'") +badgrammar({[1] = m.V(2)}, "rule '2'") -- invalid non-terminal +badgrammar({[1] = m.V"x"}, "rule 'x'") -- invalid non-terminal +badgrammar({[1] = m.V{}}, "rule '(a table)'") -- invalid non-terminal +badgrammar({[1] = #m.P("a") * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -m.P("a") * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -1 * m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = -1 + m.V(1)}, "rule '1'") -- left-recursive +badgrammar({[1] = 1 * m.V(2), [2] = m.V(2)}, "rule '2'") -- left-recursive +badgrammar({[1] = 1 * m.V(2)^0, [2] = m.P(0)}, "rule '1'") -- inf. loop +badgrammar({ m.V(2), m.V(3)^0, m.P"" }, "rule '2'") -- inf. loop +badgrammar({ m.V(2) * m.V(3)^0, m.V(3)^0, m.P"" }, "rule '1'") -- inf. loop +badgrammar({"x", x = #(m.V(1) * 'a') }, "rule '1'") -- inf. loop +badgrammar({ -(m.V(1) * 'a') }, "rule '1'") -- inf. loop +badgrammar({"x", x = m.P'a'^-1 * m.V"x"}, "rule 'x'") -- left recursive +badgrammar({"x", x = m.P'a' * m.V"y"^1, y = #m.P(1)}, "rule 'x'") + +assert(m.match({'a' * -m.V(1)}, "aaa") == 2) +assert(m.match({'a' * -m.V(1)}, "aaaa") == nil) + + +-- good x bad grammars +m.P{ ('a' * m.V(1))^-1 } +m.P{ -('a' * m.V(1)) } +m.P{ ('abc' * m.V(1))^-1 } +m.P{ -('abc' * m.V(1)) } +badgrammar{ #m.P('abc') * m.V(1) } +badgrammar{ -('a' + m.V(1)) } +m.P{ #('a' * m.V(1)) } +badgrammar{ #('a' + m.V(1)) } +m.P{ m.B{ m.P'abc' } * 'a' * m.V(1) } +badgrammar{ m.B{ m.P'abc' } * m.V(1) } +badgrammar{ ('a' + m.P'bcd')^-1 * m.V(1) } + + +-- simple tests for maximum sizes: +local p = m.P"a" +for i=1,14 do p = p * p end + +p = {} +for i=1,100 do p[i] = m.P"a" end +p = m.P(p) + + +-- strange values for rule labels + +p = m.P{ "print", + print = m.V(print), + [print] = m.V(_G), + [_G] = m.P"a", + } + +assert(p:match("a")) + +-- initial rule +g = {} +for i = 1, 10 do g["i"..i] = "a" * m.V("i"..i+1) end +g.i11 = m.P"" +for i = 1, 10 do + g[1] = "i"..i + local p = m.P(g) + assert(p:match("aaaaaaaaaaa") == 11 - i + 1) +end + +print"+" + + +-- tests for back references +checkerr("back reference 'x' not found", m.match, m.Cb('x'), '') +checkerr("back reference 'b' not found", m.match, m.Cg(1, 'a') * m.Cb('b'), 'a') + +p = m.Cg(m.C(1) * m.C(1), "k") * m.Ct(m.Cb("k")) +t = p:match("ab") +checkeq(t, {"a", "b"}) + +p = m.P(true) +for i = 1, 10 do p = p * m.Cg(1, i) end +for i = 1, 10 do + local p = p * m.Cb(i) + assert(p:match('abcdefghij') == string.sub('abcdefghij', i, i)) +end + + +t = {} +function foo (p) t[#t + 1] = p; return p .. "x" end + +p = m.Cg(m.C(2) / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" * + m.Cg(m.Cb('x') / foo, "x") * m.Cb"x" +x = {p:match'ab'} +checkeq(x, {'abx', 'abxx', 'abxxx', 'abxxxx'}) +checkeq(t, {'ab', + 'ab', 'abx', + 'ab', 'abx', 'abxx', + 'ab', 'abx', 'abxx', 'abxxx'}) + + + +-- tests for match-time captures + +p = m.P'a' * (function (s, i) return (s:sub(i, i) == 'b') and i + 1 end) + + 'acd' + +assert(p:match('abc') == 3) +assert(p:match('acd') == 4) + +local function id (s, i, ...) + return true, ... +end + +assert(m.Cmt(m.Cs((m.Cmt(m.S'abc' / { a = 'x', c = 'y' }, id) + + m.R'09'^1 / string.char + + m.P(1))^0), id):match"acb98+68c" == "xyb\98+\68y") + +p = m.P{'S', + S = m.V'atom' * space + + m.Cmt(m.Ct("(" * space * (m.Cmt(m.V'S'^1, id) + m.P(true)) * ")" * space), id), + atom = m.Cmt(m.C(m.R("AZ", "az", "09")^1), id) +} +x = p:match"(a g () ((b) c) (d (e)))" +checkeq(x, {'a', 'g', {}, {{'b'}, 'c'}, {'d', {'e'}}}); + +x = {(m.Cmt(1, id)^0):match(string.rep('a', 500))} +assert(#x == 500) + +local function id(s, i, x) + if x == 'a' then return i, 1, 3, 7 + else return nil, 2, 4, 6, 8 + end +end + +p = ((m.P(id) * 1 + m.Cmt(2, id) * 1 + m.Cmt(1, id) * 1))^0 +assert(table.concat{p:match('abababab')} == string.rep('137', 4)) + +local function ref (s, i, x) + return m.match(x, s, i - x:len()) +end + +assert(m.Cmt(m.P(1)^0, ref):match('alo') == 4) +assert((m.P(1) * m.Cmt(m.P(1)^0, ref)):match('alo') == 4) +assert(not (m.P(1) * m.Cmt(m.C(1)^0, ref)):match('alo')) + +ref = function (s,i,x) return i == tonumber(x) and i, 'xuxu' end + +assert(m.Cmt(1, ref):match'2') +assert(not m.Cmt(1, ref):match'1') +assert(m.Cmt(m.P(1)^0, ref):match'03') + +function ref (s, i, a, b) + if a == b then return i, a:upper() end +end + +p = m.Cmt(m.C(m.R"az"^1) * "-" * m.C(m.R"az"^1), ref) +p = (any - p)^0 * p * any^0 * -1 + +assert(p:match'abbbc-bc ddaa' == 'BC') + +do -- match-time captures cannot be optimized away + local touch = 0 + f = m.P(function () touch = touch + 1; return true end) + + local function check(n) n = n or 1; assert(touch == n); touch = 0 end + + assert(m.match(f * false + 'b', 'a') == nil); check() + assert(m.match(f * false + 'b', '') == nil); check() + assert(m.match( (f * 'a')^0 * 'b', 'b') == 2); check() + assert(m.match( (f * 'a')^0 * 'b', '') == nil); check() + assert(m.match( (f * 'a')^-1 * 'b', 'b') == 2); check() + assert(m.match( (f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( ('b' + f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( (m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); check() + assert(m.match( (-m.P(1) * m.P'b'^-1 * f * 'a')^-1 * 'b', '') == nil); + check() + assert(m.match( (f * 'a' + 'b')^-1 * 'b', '') == nil); check() + assert(m.match(f * 'a' + f * 'b', 'b') == 2); check(2) + assert(m.match(f * 'a' + f * 'b', 'a') == 2); check(1) + assert(m.match(-f * 'a' + 'b', 'b') == 2); check(1) + assert(m.match(-f * 'a' + 'b', '') == nil); check(1) +end + +c = '[' * m.Cg(m.P'='^0, "init") * '[' * + { m.Cmt(']' * m.C(m.P'='^0) * ']' * m.Cb("init"), function (_, _, s1, s2) + return s1 == s2 end) + + 1 * m.V(1) } / 0 + +assert(c:match'[==[]]====]]]]==]===[]' == 18) +assert(c:match'[[]=]====]=]]]==]===[]' == 14) +assert(not c:match'[[]=]====]=]=]==]===[]') + + +-- old bug: optimization of concat with fail removed match-time capture +p = m.Cmt(0, function (s) p = s end) * m.P(false) +assert(not p:match('alo')) +assert(p == 'alo') + + +-- ensure that failed match-time captures are not kept on Lua stack +do + local t = {__mode = "kv"}; setmetatable(t,t) + local c = 0 + + local function foo (s,i) + collectgarbage(); + assert(next(t) == "__mode" and next(t, "__mode") == nil) + local x = {} + t[x] = true + c = c + 1 + return i, x + end + + local p = m.P{ m.Cmt(0, foo) * m.P(false) + m.P(1) * m.V(1) + m.P"" } + p:match(string.rep('1', 10)) + assert(c == 11) +end + + +-- Return a match-time capture that returns 'n' captures +local function manyCmt (n) + return m.Cmt("a", function () + local a = {}; for i = 1, n do a[i] = n - i end + return true, unpack(a) + end) +end + +-- bug in 1.0: failed match-time that used previous match-time results +do + local x + local function aux (...) x = #{...}; return false end + local res = {m.match(m.Cmt(manyCmt(20), aux) + manyCmt(10), "a")} + assert(#res == 10 and res[1] == 9 and res[10] == 0) +end + + +-- bug in 1.0: problems with math-times returning too many captures +do + local lim = 2^11 - 10 + local res = {m.match(manyCmt(lim), "a")} + assert(#res == lim and res[1] == lim - 1 and res[lim] == 0) + checkerr("too many", m.match, manyCmt(2^15), "a") +end + +p = (m.P(function () return true, "a" end) * 'a' + + m.P(function (s, i) return i, "aa", 20 end) * 'b' + + m.P(function (s,i) if i <= #s then return i, "aaa" end end) * 1)^0 + +t = {p:match('abacc')} +checkeq(t, {'a', 'aa', 20, 'a', 'aaa', 'aaa'}) + + +------------------------------------------------------------------- +-- Tests for 're' module +------------------------------------------------------------------- + +local re = require "re" + +local match, compile = re.match, re.compile + + + +assert(match("a", ".") == 2) +assert(match("a", "''") == 1) +assert(match("", " ! . ") == 1) +assert(not match("a", " ! . ")) +assert(match("abcde", " ( . . ) * ") == 5) +assert(match("abbcde", " [a-c] +") == 5) +assert(match("0abbc1de", "'0' [a-c]+ '1'") == 7) +assert(match("0zz1dda", "'0' [^a-c]+ 'a'") == 8) +assert(match("abbc--", " [a-c] + +") == 5) +assert(match("abbc--", " [ac-] +") == 2) +assert(match("abbc--", " [-acb] + ") == 7) +assert(not match("abbcde", " [b-z] + ")) +assert(match("abb\"de", '"abb"["]"de"') == 7) +assert(match("abceeef", "'ac' ? 'ab' * 'c' { 'e' * } / 'abceeef' ") == "eee") +assert(match("abceeef", "'ac'? 'ab'* 'c' { 'f'+ } / 'abceeef' ") == 8) + +assert(re.match("aaand", "[a]^2") == 3) + +local t = {match("abceefe", "( ( & 'e' {} ) ? . ) * ")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "((&&'e' {})? .)*")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "( ( ! ! 'e' {} ) ? . ) *")} +checkeq(t, {4, 5, 7}) +local t = {match("abceefe", "(( & ! & ! 'e' {})? .)*")} +checkeq(t, {4, 5, 7}) + +assert(match("cccx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 5) +assert(match("cdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 4) +assert(match("abcdcdx" , "'ab'? ('ccc' / ('cde' / 'cd'*)? / 'ccc') 'x'+") == 8) + +assert(match("abc", "a <- (. a)?") == 4) +b = "balanced <- '(' ([^()] / balanced)* ')'" +assert(match("(abc)", b)) +assert(match("(a(b)((c) (d)))", b)) +assert(not match("(a(b ((c) (d)))", b)) + +b = compile[[ balanced <- "(" ([^()] / balanced)* ")" ]] +assert(b == m.P(b)) +assert(b:match"((((a))(b)))") + +local g = [[ + S <- "0" B / "1" A / "" -- balanced strings + A <- "0" S / "1" A A -- one more 0 + B <- "1" S / "0" B B -- one more 1 +]] +assert(match("00011011", g) == 9) + +local g = [[ + S <- ("0" B / "1" A)* + A <- "0" / "1" A A + B <- "1" / "0" B B +]] +assert(match("00011011", g) == 9) +assert(match("000110110", g) == 9) +assert(match("011110110", g) == 3) +assert(match("000110010", g) == 1) + +s = "aaaaaaaaaaaaaaaaaaaaaaaa" +assert(match(s, "'a'^3") == 4) +assert(match(s, "'a'^0") == 1) +assert(match(s, "'a'^+3") == s:len() + 1) +assert(not match(s, "'a'^+30")) +assert(match(s, "'a'^-30") == s:len() + 1) +assert(match(s, "'a'^-5") == 6) +for i = 1, s:len() do + assert(match(s, string.format("'a'^+%d", i)) >= i + 1) + assert(match(s, string.format("'a'^-%d", i)) <= i + 1) + assert(match(s, string.format("'a'^%d", i)) == i + 1) +end +assert(match("01234567890123456789", "[0-9]^3+") == 19) + + +assert(match("01234567890123456789", "({....}{...}) -> '%2%1'") == "4560123") +t = match("0123456789", "{| {.}* |}") +checkeq(t, {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"}) +assert(match("012345", "{| (..) -> '%0%0' |}")[1] == "0101") + +assert(match("abcdef", "( {.} {.} {.} {.} {.} ) -> 3") == "c") +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 3") == "d") +assert(match("abcdef", "( {:x: . :} {.} {.} {.} {.} ) -> 0") == 6) + +assert(not match("abcdef", "{:x: ({.} {.} {.}) -> 2 :} =x")) +assert(match("abcbef", "{:x: ({.} {.} {.}) -> 2 :} =x")) + +eqcharset(compile"[]]", "]") +eqcharset(compile"[][]", m.S"[]") +eqcharset(compile"[]-]", m.S"-]") +eqcharset(compile"[-]", m.S"-") +eqcharset(compile"[az-]", m.S"a-z") +eqcharset(compile"[-az]", m.S"a-z") +eqcharset(compile"[a-z]", m.R"az") +eqcharset(compile"[]['\"]", m.S[[]['"]]) + +eqcharset(compile"[^]]", any - "]") +eqcharset(compile"[^][]", any - m.S"[]") +eqcharset(compile"[^]-]", any - m.S"-]") +eqcharset(compile"[^]-]", any - m.S"-]") +eqcharset(compile"[^-]", any - m.S"-") +eqcharset(compile"[^az-]", any - m.S"a-z") +eqcharset(compile"[^-az]", any - m.S"a-z") +eqcharset(compile"[^a-z]", any - m.R"az") +eqcharset(compile"[^]['\"]", any - m.S[[]['"]]) + +-- tests for comments in 're' +e = compile[[ +A <- _B -- \t \n %nl .<> <- -> -- +_B <- 'x' --]] +assert(e:match'xy' == 2) + +-- tests for 're' with pre-definitions +defs = {digits = m.R"09", letters = m.R"az", _=m.P"__"} +e = compile("%letters (%letters / %digits)*", defs) +assert(e:match"x123" == 5) +e = compile("%_", defs) +assert(e:match"__" == 3) + +e = compile([[ + S <- A+ + A <- %letters+ B + B <- %digits+ +]], defs) + +e = compile("{[0-9]+'.'?[0-9]*} -> sin", math) +assert(e:match("2.34") == math.sin(2.34)) + + +function eq (_, _, a, b) return a == b end + +c = re.compile([[ + longstring <- '[' {:init: '='* :} '[' close + close <- ']' =init ']' / . close +]]) + +assert(c:match'[==[]]===]]]]==]===[]' == 17) +assert(c:match'[[]=]====]=]]]==]===[]' == 14) +assert(not c:match'[[]=]====]=]=]==]===[]') + +c = re.compile" '[' {:init: '='* :} '[' (!(']' =init ']') .)* ']' =init ']' !. " + +assert(c:match'[==[]]===]]]]==]') +assert(c:match'[[]=]====]=][]==]===[]]') +assert(not c:match'[[]=]====]=]=]==]===[]') + +assert(re.find("hi alalo", "{:x:..:} =x") == 4) +assert(re.find("hi alalo", "{:x:..:} =x", 4) == 4) +assert(not re.find("hi alalo", "{:x:..:} =x", 5)) +assert(re.find("hi alalo", "{'al'}", 5) == 6) +assert(re.find("hi aloalolo", "{:x:..:} =x") == 8) +assert(re.find("alo alohi x x", "{:word:%w+:}%W*(=word)!%w") == 11) + +-- re.find discards any captures +local a,b,c = re.find("alo", "{.}{'o'}") +assert(a == 2 and b == 3 and c == nil) + +local function match (s,p) + local i,e = re.find(s,p) + if i then return s:sub(i, e) end +end +assert(match("alo alo", '[a-z]+') == "alo") +assert(match("alo alo", '{:x: [a-z]+ :} =x') == nil) +assert(match("alo alo", "{:x: [a-z]+ :} ' ' =x") == "alo alo") + +assert(re.gsub("alo alo", "[abc]", "x") == "xlo xlo") +assert(re.gsub("alo alo", "%w+", ".") == ". .") +assert(re.gsub("hi, how are you", "[aeiou]", string.upper) == + "hI, hOw ArE yOU") + +s = 'hi [[a comment[=]=] ending here]] and [=[another]]=]]' +c = re.compile" '[' {:i: '='* :} '[' (!(']' =i ']') .)* ']' { =i } ']' " +assert(re.gsub(s, c, "%2") == 'hi and =]') +assert(re.gsub(s, c, "%0") == s) +assert(re.gsub('[=[hi]=]', c, "%2") == '=') + +assert(re.find("", "!.") == 1) +assert(re.find("alo", "!.") == 4) + +function addtag (s, i, t, tag) t.tag = tag; return i, t end + +c = re.compile([[ + doc <- block !. + block <- (start {| (block / { [^<]+ })* |} end?) => addtag + start <- '<' {:tag: [a-z]+ :} '>' + end <- '</' { =tag } '>' +]], {addtag = addtag}) + +x = c:match[[ +<x>hi<b>hello</b>but<b>totheend</x>]] +checkeq(x, {tag='x', 'hi', {tag = 'b', 'hello'}, 'but', + {'totheend'}}) + + +-- test for folding captures +c = re.compile([[ + S <- (number (%s+ number)*) ~> add + number <- %d+ -> tonumber +]], {tonumber = tonumber, add = function (a,b) return a + b end}) +assert(c:match("3 401 50") == 3 + 401 + 50) + +-- tests for look-ahead captures +x = {re.match("alo", "&(&{.}) !{'b'} {&(...)} &{..} {...} {!.}")} +checkeq(x, {"", "alo", ""}) + +assert(re.match("aloalo", + "{~ (((&'al' {.}) -> 'A%1' / (&%l {.}) -> '%1%1') / .)* ~}") + == "AallooAalloo") + +-- bug in 0.9 (and older versions), due to captures in look-aheads +x = re.compile[[ {~ (&(. ([a-z]* -> '*')) ([a-z]+ -> '+') ' '*)* ~} ]] +assert(x:match"alo alo" == "+ +") + +-- valid capture in look-ahead (used inside the look-ahead itself) +x = re.compile[[ + S <- &({:two: .. :} . =two) {[a-z]+} / . S +]] +assert(x:match("hello aloaLo aloalo xuxu") == "aloalo") + + +p = re.compile[[ + block <- {| {:ident:space*:} line + ((=ident !space line) / &(=ident space) block)* |} + line <- {[^%nl]*} %nl + space <- '_' -- should be ' ', but '_' is simpler for editors +]] + +t= p:match[[ +1 +__1.1 +__1.2 +____1.2.1 +____ +2 +__2.1 +]] +checkeq(t, {"1", {"1.1", "1.2", {"1.2.1", "", ident = "____"}, ident = "__"}, + "2", {"2.1", ident = "__"}, ident = ""}) + + +-- nested grammars +p = re.compile[[ + s <- a b !. + b <- ( x <- ('b' x)? ) + a <- ( x <- 'a' x? ) +]] + +assert(p:match'aaabbb') +assert(p:match'aaa') +assert(not p:match'bbb') +assert(not p:match'aaabbba') + +-- testing groups +t = {re.match("abc", "{:S <- {:.:} {S} / '':}")} +checkeq(t, {"a", "bc", "b", "c", "c", ""}) + +t = re.match("1234", "{| {:a:.:} {:b:.:} {:c:.{.}:} |}") +checkeq(t, {a="1", b="2", c="4"}) +t = re.match("1234", "{|{:a:.:} {:b:{.}{.}:} {:c:{.}:}|}") +checkeq(t, {a="1", b="2", c="4"}) +t = re.match("12345", "{| {:.:} {:b:{.}{.}:} {:{.}{.}:} |}") +checkeq(t, {"1", b="2", "4", "5"}) +t = re.match("12345", "{| {:.:} {:{:b:{.}{.}:}:} {:{.}{.}:} |}") +checkeq(t, {"1", "23", "4", "5"}) +t = re.match("12345", "{| {:.:} {{:b:{.}{.}:}} {:{.}{.}:} |}") +checkeq(t, {"1", "23", "4", "5"}) + + +-- testing pre-defined names +assert(os.setlocale("C") == "C") + +function eqlpeggsub (p1, p2) + local s1 = cs2str(re.compile(p1)) + local s2 = string.gsub(allchar, "[^" .. p2 .. "]", "") + -- if s1 ~= s2 then print(#s1,#s2) end + assert(s1 == s2) +end + + +eqlpeggsub("%w", "%w") +eqlpeggsub("%a", "%a") +eqlpeggsub("%l", "%l") +eqlpeggsub("%u", "%u") +eqlpeggsub("%p", "%p") +eqlpeggsub("%d", "%d") +eqlpeggsub("%x", "%x") +eqlpeggsub("%s", "%s") +eqlpeggsub("%c", "%c") + +eqlpeggsub("%W", "%W") +eqlpeggsub("%A", "%A") +eqlpeggsub("%L", "%L") +eqlpeggsub("%U", "%U") +eqlpeggsub("%P", "%P") +eqlpeggsub("%D", "%D") +eqlpeggsub("%X", "%X") +eqlpeggsub("%S", "%S") +eqlpeggsub("%C", "%C") + +eqlpeggsub("[%w]", "%w") +eqlpeggsub("[_%w]", "_%w") +eqlpeggsub("[^%w]", "%W") +eqlpeggsub("[%W%S]", "%W%S") + +re.updatelocale() + + +-- testing nested substitutions x string captures + +p = re.compile[[ + text <- {~ item* ~} + item <- macro / [^()] / '(' item* ')' + arg <- ' '* {~ (!',' item)* ~} + args <- '(' arg (',' arg)* ')' + macro <- ('apply' args) -> '%1(%2)' + / ('add' args) -> '%1 + %2' + / ('mul' args) -> '%1 * %2' +]] + +assert(p:match"add(mul(a,b), apply(f,x))" == "a * b + f(x)") + +rev = re.compile[[ R <- (!.) -> '' / ({.} R) -> '%2%1']] + +assert(rev:match"0123456789" == "9876543210") + + +-- testing error messages in re + +local function errmsg (p, err) + checkerr(err, re.compile, p) +end + +errmsg('aaaa', "rule 'aaaa'") +errmsg('a', 'outside') +errmsg('b <- a', 'undefined') +errmsg("x <- 'a' x <- 'b'", 'already defined') +errmsg("'a' -", "near '-'") + + +print"OK" + + |