summaryrefslogtreecommitdiff
path: root/Data/Libraries/Penlight/tests/test-data.lua
diff options
context:
space:
mode:
Diffstat (limited to 'Data/Libraries/Penlight/tests/test-data.lua')
-rw-r--r--Data/Libraries/Penlight/tests/test-data.lua245
1 files changed, 245 insertions, 0 deletions
diff --git a/Data/Libraries/Penlight/tests/test-data.lua b/Data/Libraries/Penlight/tests/test-data.lua
new file mode 100644
index 0000000..95eeba1
--- /dev/null
+++ b/Data/Libraries/Penlight/tests/test-data.lua
@@ -0,0 +1,245 @@
+local data = require 'pl.data'
+local List = require 'pl.List'
+local array = require 'pl.array2d'
+local func = require 'pl.func'
+local seq = require 'pl.seq'
+local stringio = require 'pl.stringio'
+local open = stringio. open
+local asserteq = require 'pl.test' . asserteq
+local T = require 'pl.test'. tuple
+
+--[=[
+dat,err = data.read(open [[
+1.0 0.1
+0.2 1.3
+]])
+
+if err then print(err) end
+
+require 'pl.pretty'.dump(dat)
+os.exit(0)
+--]=]
+
+-- tab-separated data, explicit column names
+local t1f = open [[
+EventID Magnitude LocationX LocationY LocationZ LocationError EventDate DataFile
+981124001 2.0 18988.4 10047.1 4149.7 33.8 24/11/1998 11:18:05 981124DF.AAB
+981125001 0.8 19104.0 9970.4 5088.7 3.0 25/11/1998 05:44:54 981125DF.AAB
+981127003 0.5 19012.5 9946.9 3831.2 46.0 27/11/1998 17:15:17 981127DF.AAD
+981127005 0.6 18676.4 10606.2 3761.9 4.4 27/11/1998 17:46:36 981127DF.AAF
+981127006 0.2 19109.9 9716.5 3612.0 11.8 27/11/1998 19:29:51 981127DF.AAG
+]]
+
+local t1 = data.read (t1f)
+-- column_by_name returns a List
+asserteq(t1:column_by_name 'Magnitude',List{2,0.8,0.5,0.6,0.2})
+-- can use array.column as well
+asserteq(array.column(t1,2),{2,0.8,0.5,0.6,0.2})
+
+-- only numerical columns (deduced from first data row) are converted by default
+-- can look up indices in the list fieldnames.
+local EDI = t1.fieldnames:index 'EventDate'
+assert(type(t1[1][EDI]) == 'string')
+
+-- select method returns a sequence, in this case single-valued.
+-- (Note that seq.copy returns a List)
+asserteq(seq(t1:select 'LocationX where Magnitude > 0.5'):copy(),List{18988.4,19104,18676.4})
+
+--[[
+--a common select usage pattern:
+for event,mag in t1:select 'EventID,Magnitude sort by Magnitude desc' do
+ print(event,mag)
+end
+--]]
+
+-- space-separated, but with last field containing spaces.
+local t2f = open [[
+USER PID %MEM %CPU COMMAND
+sdonovan 2333 0.3 0.1 background --n=2
+root 2332 0.4 0.2 fred --start=yes
+root 2338 0.2 0.1 backyard-process
+]]
+
+local t2,err = data.read(t2f,{last_field_collect=true})
+if not t2 then return print (err) end
+
+-- the last_field_collect option is useful with space-delimited data where the last
+-- field may contain spaces. Otherwise, a record count mismatch should be an error!
+local lt2 = List(t2[2])
+asserteq(lt2:join ',','root,2332,0.4,0.2,fred --start=yes')
+
+-- fieldnames are converted into valid identifiers by substituting _
+-- (we do this to make select queries parseable by Lua)
+asserteq(t2.fieldnames,List{'USER','PID','_MEM','_CPU','COMMAND'})
+
+-- select queries are NOT SQL so remember to use == ! (and no 'between' operator, sorry)
+--s,err = t2:select('_MEM where USER="root"')
+--assert(err == [[[string "tmp"]:9: unexpected symbol near '=']])
+
+local s = t2:select('_MEM where USER=="root"')
+assert(s() == 0.4)
+assert(s() == 0.2)
+assert(s() == nil)
+
+-- CSV, Excel style. Double-quoted fields are allowed, and they may contain commas!
+local t3f = open [[
+"Department Name","Employee ID",Project,"Hours Booked"
+sales,1231,overhead,4
+sales,1255,overhead,3
+engineering,1501,development,5
+engineering,1501,maintenance,3
+engineering,1433,maintenance,10
+]]
+
+local t3 = data.read(t3f,{csv=true})
+
+-- although fieldnames are turned in valid Lua identifiers, there is always `original_fieldnames`
+asserteq(t3.fieldnames,List{'Department_Name','Employee_ID','Project','Hours_Booked'})
+asserteq(t3.original_fieldnames,List{'Department Name','Employee ID','Project','Hours Booked'})
+
+-- a common operation is to select using a given list of columns, and each row
+-- on some explicit condition. The select() method can take a table with these
+-- parameters
+local keepcols = {'Employee_ID','Hours_Booked'}
+
+local q = t3:select { fields = keepcols,
+ where = function(row) return row[1]=='engineering' end
+ }
+
+asserteq(seq.copy2(q),{{1501,5},{1501,3},{1433,10}})
+
+-- another pattern is doing a select to restrict rows & columns, process some
+-- fields and write out the modified rows.
+
+local outf = stringio.create()
+
+local names = {[1501]='don',[1433]='dilbert'}
+
+t3:write_row (outf,{'Employee','Hours_Booked'})
+q = t3:select_row {fields=keepcols,where=func.Eq(func._1[1],'engineering')}
+for row in q do
+ row[1] = names[row[1]]
+ t3:write_row(outf,row)
+end
+
+asserteq(outf:value(),
+[[
+Employee,Hours_Booked
+don,5
+don,3
+dilbert,10
+]])
+
+-- data may not always have column headers. When creating a data object
+-- from a two-dimensional array, may specify the fieldnames, as a list or a string.
+-- The delimiter is deduced from the fieldname string, so a string just containing
+-- the delimiter will set it, and the fieldnames will be empty.
+local dat = List()
+local row = List.range(1,10)
+for i = 1,10 do
+ dat:append(row:map('*',i))
+end
+dat = data.new(dat,',')
+local out = stringio.create()
+dat:write(out,',')
+asserteq(out:value(), [[
+1,2,3,4,5,6,7,8,9,10
+2,4,6,8,10,12,14,16,18,20
+3,6,9,12,15,18,21,24,27,30
+4,8,12,16,20,24,28,32,36,40
+5,10,15,20,25,30,35,40,45,50
+6,12,18,24,30,36,42,48,54,60
+7,14,21,28,35,42,49,56,63,70
+8,16,24,32,40,48,56,64,72,80
+9,18,27,36,45,54,63,72,81,90
+10,20,30,40,50,60,70,80,90,100
+]])
+
+-- you can always use numerical field indices, AWK-style;
+-- note how the copy_select method gives you a data object instead of an
+-- iterator over the fields
+local res = dat:copy_select '$1,$3 where $1 > 5'
+local L = List
+asserteq(L(res),L{
+ L{6, 18},
+ L{7,21},
+ L{8,24},
+ L{9,27},
+ L{10,30},
+})
+
+-- the column_by_name method may take a fieldname or an index
+asserteq(dat:column_by_name(2), L{2,4,6,8,10,12,14,16,18,20})
+
+-- the field list may contain expressions or even constants
+local q = dat:select '$3,2*$4 where $1 == 8'
+asserteq(T(q()),T(24,64))
+
+dat,err = data.read(open [[
+1.0 0.1
+0.2 1.3
+]])
+
+if err then print(err) end
+
+-- if a method cannot be found, then we look up in array2d
+-- array2d.flatten(t) makes a 1D list out of a 2D array,
+-- and then List.minmax() gets the extrema.
+
+asserteq(T(dat:flatten():minmax()),T(0.1,1.3))
+
+local f = open [[
+Time Message
+1266840760 +# EE7C0600006F0D00C00F06010302054000000308010A00002B00407B00
+1266840760 closure data 0.000000 1972 1972 0
+1266840760 ++ 1266840760 EE 1
+1266840760 +# EE7C0600006F0D00C00F06010302054000000408020A00002B00407B00
+1266840764 closure data 0.000000 1972 1972 0
+1266840764 ++ 1266840764 EE 1
+1266840764 +# EE7C0600006F0D00C00F06010302054000000508030A00002B00407B00
+1266840768 duplicate?
+1266840768 +# EE7C0600006F0D00C00F06010302054000000508030A00002B00407B00
+1266840768 closure data 0.000000 1972 1972 0
+]]
+
+-- the `convert` option provides custom converters for each specified column.
+-- Here we convert the timestamps into Date objects and collect everything
+-- else into one field
+local Date = require 'pl.Date'
+
+local function date_convert (ds)
+ return Date(tonumber(ds))
+end
+
+local d = data.read(f,{convert={[1]=date_convert},last_field_collect=true})
+
+asserteq(#d[1],2)
+asserteq(d[2][1]:year(),2010)
+
+d = {{1,2,3},{10,20,30}}
+out = stringio.create()
+data.write(d,out,{'A','B','C'},',')
+asserteq(out:value(),
+[[
+A,B,C
+1,2,3
+10,20,30
+]])
+
+out = stringio.create()
+d.fieldnames = {'A','B','C'}
+data.write(d,out)
+
+asserteq(out:value(),
+[[
+A B C
+1 2 3
+10 20 30
+]])
+
+
+d = data.read(stringio.open 'One,Two\n1,\n,20\n',{csv=true})
+asserteq(d,{
+ {1,0},{0,20},
+ original_fieldnames={"One","Two"},fieldnames={"One","Two"},delim=","
+})