diff options
Diffstat (limited to 'Data/Libraries/Penlight/lua/pl/data.lua')
-rw-r--r-- | Data/Libraries/Penlight/lua/pl/data.lua | 654 |
1 files changed, 654 insertions, 0 deletions
diff --git a/Data/Libraries/Penlight/lua/pl/data.lua b/Data/Libraries/Penlight/lua/pl/data.lua new file mode 100644 index 0000000..a565ebc --- /dev/null +++ b/Data/Libraries/Penlight/lua/pl/data.lua @@ -0,0 +1,654 @@ +--- Reading and querying simple tabular data. +-- +-- data.read 'test.txt' +-- ==> {{10,20},{2,5},{40,50},fieldnames={'x','y'},delim=','} +-- +-- Provides a way of creating basic SQL-like queries. +-- +-- require 'pl' +-- local d = data.read('xyz.txt') +-- local q = d:select('x,y,z where x > 3 and z < 2 sort by y') +-- for x,y,z in q do +-- print(x,y,z) +-- end +-- +-- See @{06-data.md.Reading_Columnar_Data|the Guide} +-- +-- Dependencies: `pl.utils`, `pl.array2d` (fallback methods) +-- @module pl.data + +local utils = require 'pl.utils' +local _DEBUG = rawget(_G,'_DEBUG') + +local patterns,function_arg,usplit,array_tostring = utils.patterns,utils.function_arg,utils.split,utils.array_tostring +local append,concat = table.insert,table.concat +local gsub = string.gsub +local io = io +local _G,print,type,tonumber,ipairs,setmetatable = _G,print,type,tonumber,ipairs,setmetatable + + +local data = {} + +local parse_select + +local function rstrip(s) + return (s:gsub('%s+$','')) +end + +local function strip (s) + return (rstrip(s):gsub('^%s*','')) +end + +-- This gives `l` the standard List metatable, +-- pulling in the List module. +local function makelist(l) + return setmetatable(l, require('pl.List')) +end + +local function map(fun,t) + local res = {} + for i = 1,#t do + res[i] = fun(t[i]) + end + return res +end + +local function split(line,delim,csv,n) + local massage + -- CSV fields may be double-quoted and may contain commas! + if csv and line:match '"' then + line = line:gsub('"([^"]+)"',function(str) + local s,cnt = str:gsub(',','\001') + if cnt > 0 then massage = true end + return s + end) + if massage then + massage = function(s) return (s:gsub('\001',',')) end + end + end + local res = (usplit(line,delim,false,n)) + if csv then + -- restore CSV commas-in-fields + if massage then res = map(massage,res) end + -- in CSV mode trailiing commas are significant! + if line:match ',$' then append(res,'') end + end + return makelist(res) +end + +local function find(t,v) + for i = 1,#t do + if v == t[i] then return i end + end +end + +local DataMT = { + column_by_name = function(self,name) + if type(name) == 'number' then + name = '$'..name + end + local arr = {} + for res in data.query(self,name) do + append(arr,res) + end + return makelist(arr) + end, + + copy_select = function(self,condn) + condn = parse_select(condn,self) + local iter = data.query(self,condn) + local res = {} + local row = makelist{iter()} + while #row > 0 do + append(res,row) + row = makelist{iter()} + end + res.delim = self.delim + return data.new(res,split(condn.fields,',')) + end, + + column_names = function(self) + return self.fieldnames + end, +} + +local array2d + +DataMT.__index = function(self,name) + local f = DataMT[name] + if f then return f end + if not array2d then + array2d = require 'pl.array2d' + end + return array2d[name] +end + +--- return a particular column as a list of values (method). +-- @param name either name of column, or numerical index. +-- @function Data.column_by_name + +--- return a query iterator on this data (method). +-- @string condn the query expression +-- @function Data.select +-- @see data.query + +--- return a row iterator on this data (method). +-- @string condn the query expression +-- @function Data.select_row + +--- return a new data object based on this query (method). +-- @string condn the query expression +-- @function Data.copy_select + +--- return the field names of this data object (method). +-- @function Data.column_names + +--- write out a row (method). +-- @param f file-like object +-- @function Data.write_row + +--- write data out to file (method). +-- @param f file-like object +-- @function Data.write + + +-- [guessing delimiter] We check for comma, tab and spaces in that order. +-- [issue] any other delimiters to be checked? +local delims = {',', '\t', ' ', ';'} + +local function guess_delim (line) + if line=='' then return ' ' end + for _,delim in ipairs(delims) do + if line:find(delim) then + return delim == ' ' and '%s+' or delim + end + end + return ' ' +end + +-- [file parameter] If it's a string, we try open as a filename. If nil, then +-- either stdin or stdout depending on the mode. Otherwise, check if this is +-- a file-like object (implements read or write depending) +local function open_file (f,mode) + local opened, err + local reading = mode == 'r' + if type(f) == 'string' then + if f == 'stdin' then + f = io.stdin + elseif f == 'stdout' then + f = io.stdout + else + f,err = io.open(f,mode) + if not f then return nil,err end + opened = true + end + end + if f and ((reading and not f.read) or (not reading and not f.write)) then + return nil, "not a file-like object" + end + return f,nil,opened +end + +--- read a delimited file in a Lua table. +-- By default, attempts to treat first line as separated list of fieldnames. +-- @param file a filename or a file-like object +-- @tab cnfg parsing options +-- @string cnfg.delim a string pattern to split fields +-- @array cnfg.fieldnames (i.e. don't read from first line) +-- @bool cnfg.no_convert (default is to try conversion on first data line) +-- @tab cnfg.convert table of custom conversion functions with column keys +-- @int cnfg.numfields indices of columns known to be numbers +-- @bool cnfg.last_field_collect only split as many fields as fieldnames. +-- @int cnfg.thousands_dot thousands separator in Excel CSV is '.' +-- @bool cnfg.csv fields may be double-quoted and contain commas; +-- Also, empty fields are considered to be equivalent to zero. +-- @return `data` object, or `nil` +-- @return error message. May be a file error, 'not a file-like object' +-- or a conversion error +function data.read(file,cnfg) + local count,line + local D = {} + if not cnfg then cnfg = {} end + local f,err,opened = open_file(file,'r') + if not f then return nil, err end + local thousands_dot = cnfg.thousands_dot + local csv = cnfg.csv + if csv then cnfg.delim = ',' end + + -- note that using dot as the thousands separator (@thousands_dot) + -- requires a special conversion function! For CSV, _empty fields_ are + -- considered to default to numerial zeroes. + local tonumber = tonumber + local function try_number(x) + if thousands_dot then x = x:gsub('%.(...)','%1') end + if csv and x == '' then x = '0' end + local v = tonumber(x) + if v == nil then return nil,"not a number" end + return v + end + + count = 1 + line = f:read() + if not line then return nil, "empty file" end + + -- first question: what is the delimiter? + D.delim = cnfg.delim and cnfg.delim or guess_delim(line) + local delim = D.delim + + local conversion + local numfields = {} + local function append_conversion (idx,conv) + conversion = conversion or {} + append(numfields,idx) + append(conversion,conv) + end + if cnfg.numfields then + for _,n in ipairs(cnfg.numfields) do append_conversion(n,try_number) end + end + + -- some space-delimited data starts with a space. This should not be a column, + -- although it certainly would be for comma-separated, etc. + local stripper + if delim == '%s+' and line:find(delim) == 1 then + stripper = function(s) return s:gsub('^%s+','') end + line = stripper(line) + end + -- first line will usually be field names. Unless fieldnames are specified, + -- we check if it contains purely numerical values for the case of reading + -- plain data files. + if not cnfg.fieldnames then + local fields,nums + fields = split(line,delim,csv) + if not cnfg.convert then + nums = map(tonumber,fields) + if #nums == #fields then -- they're ALL numbers! + append(D,nums) -- add the first converted row + -- and specify conversions for subsequent rows + for i = 1,#nums do append_conversion(i,try_number) end + else -- we'll try to check numbers just now.. + nums = nil + end + else -- [explicit column conversions] (any deduced number conversions will be added) + for idx,conv in pairs(cnfg.convert) do append_conversion(idx,conv) end + end + if nums == nil then + cnfg.fieldnames = fields + end + line = f:read() + count = count + 1 + if stripper then line = stripper(line) end + elseif type(cnfg.fieldnames) == 'string' then + cnfg.fieldnames = split(cnfg.fieldnames,delim,csv) + end + local nfields + -- at this point, the column headers have been read in. If the first + -- row consisted of numbers, it has already been added to the dataset. + if cnfg.fieldnames then + D.fieldnames = cnfg.fieldnames + -- [collecting end field] If @last_field_collect then we'll + -- only split as many fields as there are fieldnames + if cnfg.last_field_collect then + nfields = #D.fieldnames + end + -- [implicit column conversion] unless @no_convert, we need the numerical field indices + -- of the first data row. These can also be specified explicitly by @numfields. + if not cnfg.no_convert then + local fields = split(line,D.delim,csv,nfields) + for i = 1,#fields do + if not find(numfields,i) and try_number(fields[i]) then + append_conversion(i,try_number) + end + end + end + end + -- keep going until finished + while line do + if not line:find ('^%s*$') then -- [blank lines] ignore them! + if stripper then line = stripper(line) end + local fields = split(line,delim,csv,nfields) + if conversion then -- there were field conversions... + for k = 1,#numfields do + local i,conv = numfields[k],conversion[k] + local val,err = conv(fields[i]) + if val == nil then + return nil, err..": "..fields[i].." at line "..count + else + fields[i] = val + end + end + end + append(D,fields) + end + line = f:read() + count = count + 1 + end + if opened then f:close() end + if delim == '%s+' then D.delim = ' ' end + if not D.fieldnames then D.fieldnames = {} end + return data.new(D) +end + +local function write_row (data,f,row,delim) + data.temp = array_tostring(row,data.temp) + f:write(concat(data.temp,delim),'\n') +end + +function DataMT:write_row(f,row) + write_row(self,f,row,self.delim) +end + +--- write 2D data to a file. +-- Does not assume that the data has actually been +-- generated with `new` or `read`. +-- @param data 2D array +-- @param file filename or file-like object +-- @tparam[opt] {string} fieldnames list of fields (optional) +-- @string[opt='\t'] delim delimiter (default tab) +-- @return true or nil, error +function data.write (data,file,fieldnames,delim) + local f,err,opened = open_file(file,'w') + if not f then return nil, err end + if not fieldnames then + fieldnames = data.fieldnames + end + delim = delim or '\t' + if fieldnames and #fieldnames > 0 then + f:write(concat(fieldnames,delim),'\n') + end + for i = 1,#data do + write_row(data,f,data[i],delim) + end + if opened then f:close() end + return true +end + + +function DataMT:write(file) + data.write(self,file,self.fieldnames,self.delim) +end + +local function massage_fieldnames (fields,copy) + -- fieldnames must be valid Lua identifiers; ignore any surrounding padding + -- but keep the original fieldnames... + for i = 1,#fields do + local f = strip(fields[i]) + copy[i] = f + fields[i] = f:gsub('%W','_') + end +end + +--- create a new dataset from a table of rows. +-- Can specify the fieldnames, else the table must have a field called +-- 'fieldnames', which is either a string of delimiter-separated names, +-- or a table of names. <br> +-- If the table does not have a field called 'delim', then an attempt will be +-- made to guess it from the fieldnames string, defaults otherwise to tab. +-- @param d the table. +-- @tparam[opt] {string} fieldnames optional fieldnames +-- @return the table. +function data.new (d,fieldnames) + d.fieldnames = d.fieldnames or fieldnames or '' + if not d.delim and type(d.fieldnames) == 'string' then + d.delim = guess_delim(d.fieldnames) + d.fieldnames = split(d.fieldnames,d.delim) + end + d.fieldnames = makelist(d.fieldnames) + d.original_fieldnames = {} + massage_fieldnames(d.fieldnames,d.original_fieldnames) + setmetatable(d,DataMT) + -- a query with just the fieldname will return a sequence + -- of values, which seq.copy turns into a table. + return d +end + +local sorted_query = [[ +return function (t) + local i = 0 + local v + local ls = {} + for i,v in ipairs(t) do + if CONDITION then + ls[#ls+1] = v + end + end + table.sort(ls,function(v1,v2) + return SORT_EXPR + end) + local n = #ls + return function() + i = i + 1 + v = ls[i] + if i > n then return end + return FIELDLIST + end +end +]] + +-- question: is this optimized case actually worth the extra code? +local simple_query = [[ +return function (t) + local n = #t + local i = 0 + local v + return function() + repeat + i = i + 1 + v = t[i] + until i > n or CONDITION + if i > n then return end + return FIELDLIST + end +end +]] + +local function is_string (s) + return type(s) == 'string' +end + +local field_error + +local function fieldnames_as_string (data) + return concat(data.fieldnames,',') +end + +local function massage_fields(data,f) + local idx + if f:find '^%d+$' then + idx = tonumber(f) + else + idx = find(data.fieldnames,f) + end + if idx then + return 'v['..idx..']' + else + field_error = f..' not found in '..fieldnames_as_string(data) + return f + end +end + + +local function process_select (data,parms) + --- preparing fields ---- + field_error = nil + local fields = parms.fields + local numfields = fields:find '%$' or #data.fieldnames == 0 + if fields:find '^%s*%*%s*' then + if not numfields then + fields = fieldnames_as_string(data) + else + local ncol = #data[1] + fields = {} + for i = 1,ncol do append(fields,'$'..i) end + fields = concat(fields,',') + end + end + local idpat = patterns.IDEN + if numfields then + idpat = '%$(%d+)' + else + -- massage field names to replace non-identifier chars + fields = rstrip(fields):gsub('[^,%w]','_') + end + local massage_fields = utils.bind1(massage_fields,data) + local ret = gsub(fields,idpat,massage_fields) + if field_error then return nil,field_error end + parms.fields = fields + parms.proc_fields = ret + parms.where = parms.where or 'true' + if is_string(parms.where) then + parms.where = gsub(parms.where,idpat,massage_fields) + field_error = nil + end + return true +end + + +parse_select = function(s,data) + local endp + local parms = {} + local w1,w2 = s:find('where ') + local s1,s2 = s:find('sort by ') + if w1 then -- where clause! + endp = (s1 or 0)-1 + parms.where = s:sub(w2+1,endp) + end + if s1 then -- sort by clause (must be last!) + parms.sort_by = s:sub(s2+1) + end + endp = (w1 or s1 or 0)-1 + parms.fields = s:sub(1,endp) + local status,err = process_select(data,parms) + if not status then return nil,err + else return parms end +end + +--- create a query iterator from a select string. +-- Select string has this format: <br> +-- FIELDLIST [ where LUA-CONDN [ sort by FIELD] ]<br> +-- FIELDLIST is a comma-separated list of valid fields, or '*'. <br> <br> +-- The condition can also be a table, with fields 'fields' (comma-sep string or +-- table), 'sort_by' (string) and 'where' (Lua expression string or function) +-- @param data table produced by read +-- @param condn select string or table +-- @param context a list of tables to be searched when resolving functions +-- @param return_row if true, wrap the results in a row table +-- @return an iterator over the specified fields, or nil +-- @return an error message +function data.query(data,condn,context,return_row) + local err + if is_string(condn) then + condn,err = parse_select(condn,data) + if not condn then return nil,err end + elseif type(condn) == 'table' then + if type(condn.fields) == 'table' then + condn.fields = concat(condn.fields,',') + end + if not condn.proc_fields then + local status,err = process_select(data,condn) + if not status then return nil,err end + end + else + return nil, "condition must be a string or a table" + end + local query + if condn.sort_by then -- use sorted_query + query = sorted_query + else + query = simple_query + end + local fields = condn.proc_fields or condn.fields + if return_row then + fields = '{'..fields..'}' + end + query = query:gsub('FIELDLIST',fields) + if is_string(condn.where) then + query = query:gsub('CONDITION',condn.where) + condn.where = nil + else + query = query:gsub('CONDITION','_condn(v)') + condn.where = function_arg(0,condn.where,'condition.where must be callable') + end + if condn.sort_by then + local expr,sort_var,sort_dir + local sort_by = condn.sort_by + local i1,i2 = sort_by:find('%s+') + if i1 then + sort_var,sort_dir = sort_by:sub(1,i1-1),sort_by:sub(i2+1) + else + sort_var = sort_by + sort_dir = 'asc' + end + if sort_var:match '^%$' then sort_var = sort_var:sub(2) end + sort_var = massage_fields(data,sort_var) + if field_error then return nil,field_error end + if sort_dir == 'asc' then + sort_dir = '<' + else + sort_dir = '>' + end + expr = ('%s %s %s'):format(sort_var:gsub('v','v1'),sort_dir,sort_var:gsub('v','v2')) + query = query:gsub('SORT_EXPR',expr) + end + if condn.where then + query = 'return function(_condn) '..query..' end' + end + if _DEBUG then print(query) end + + local fn,err = utils.load(query,'tmp') + if not fn then return nil,err end + fn = fn() -- get the function + if condn.where then + fn = fn(condn.where) + end + local qfun = fn(data) + if context then + -- [specifying context for condition] @context is a list of tables which are + -- 'injected'into the condition's custom context + append(context,_G) + local lookup = {} + utils.setfenv(qfun,lookup) + setmetatable(lookup,{ + __index = function(tbl,key) + -- _G.print(tbl,key) + for k,t in ipairs(context) do + if t[key] then return t[key] end + end + end + }) + end + return qfun +end + + +DataMT.select = data.query +DataMT.select_row = function(d,condn,context) + return data.query(d,condn,context,true) +end + +--- Filter input using a query. +-- @string Q a query string +-- @param infile filename or file-like object +-- @param outfile filename or file-like object +-- @bool dont_fail true if you want to return an error, not just fail +function data.filter (Q,infile,outfile,dont_fail) + local d = data.read(infile or 'stdin') + local out = open_file(outfile or 'stdout') + local iter,err = d:select(Q) + local delim = d.delim + if not iter then + err = 'error: '..err + if dont_fail then + return nil,err + else + utils.quit(1,err) + end + end + while true do + local res = {iter()} + if #res == 0 then break end + out:write(concat(res,delim),'\n') + end +end + +return data + |