-- data-exp.lua / last modification: 2020-01-30 14:16
if not modules then modules = { } end modules ['data-exp'] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}

local format, find, gmatch, lower, char, sub = string.format, string.find, string.gmatch, string.lower, string.char, string.sub
local concat, sort = table.concat, table.sort
local sortedkeys = table.sortedkeys
local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local Ct, Cs, Cc, Carg, P, C, S = lpeg.Ct, lpeg.Cs, lpeg.Cc, lpeg.Carg, lpeg.P, lpeg.C, lpeg.S
local type, next = type, next
local isdir = lfs.isdir

local collapsepath, joinpath, basename = file.collapsepath, file.join, file.basename

local trace_locating   = false  trackers.register("resolvers.locating",   function(v) trace_locating   = v end)
local trace_expansions = false  trackers.register("resolvers.expansions", function(v) trace_expansions = v end)
local trace_globbing   = true   trackers.register("resolvers.globbing",   function(v) trace_globbing   = v end)

local report_expansions = logs.reporter("resolvers","expansions")
local report_globbing   = logs.reporter("resolvers","globbing")

local resolvers     = resolvers
local resolveprefix = resolvers.resolve

-- As this bit of code is somewhat special it gets its own module. After
-- all, when working on the main resolver code, I don't want to scroll
-- past this every time. See data-obs.lua for the gsub variant.

-- local function f_first(a,b)
--     local t, n = { }, 0
--     for s in gmatch(b,"[^,]+") do
--         n = n + 1 ; t[n] = a .. s
--     end
--     return concat(t,",")
-- end
--
-- local function f_second(a,b)
--     local t, n = { }, 0
--     for s in gmatch(a,"[^,]+") do
--         n = n + 1 ; t[n] = s .. b
--     end
--     return concat(t,",")
-- end

-- kpsewhich --expand-braces '{a,b}{c,d}'
-- ac:bc:ad:bd

-- old  {a,b}{c,d} => ac ad bc bd
--
-- local function f_both(a,b)
--     local t, n = { }, 0
--     for sa in gmatch(a,"[^,]+") do
--         for sb in gmatch(b,"[^,]+") do
--             n = n + 1 ; t[n] = sa .. sb
--         end
--     end
--     return concat(t,",")
-- end
--
-- new  {a,b}{c,d} => ac bc ad bd

local function f_both(a,b)
    local t, n = { }, 0
    for sb in gmatch(b,"[^,]+") do              -- and not sa
        for sa in gmatch(a,"[^,]+") do          --         sb
            n = n + 1 ; t[n] = sa .. sb
        end
    end
    return concat(t,",")
end

local comma   = P(",")
local nocomma = (1-comma)^1
local docomma = comma^1/","
local before  = Cs((nocomma * Carg(1) + docomma)^0)
local after   = Cs((Carg(1) * nocomma + docomma)^0)
local both    = Cs(((C(nocomma) * Carg(1))/function(a,b) return lpegmatch(before,b,1,a) end + docomma)^0)

local function f_first (a,b) return lpegmatch(after, b,1,a) end
local function f_second(a,b) return lpegmatch(before,a,1,b) end
local function f_both  (a,b) return lpegmatch(both,  b,1,a) end

-- print(f_first ("a",    "x,y,z"))
-- print(f_second("a,b,c","x"))
-- print(f_both  ("a,b,c","x,y,z"))

local left  = P("{")
local right = P("}")
local var   = P((1 - S("{}" ))^0)
local set   = P((1 - S("{},"))^0)
local other = P(1)

local l_first  = Cs( ( Cc("{") * (C(set) * left * C(var) * right / f_first) * Cc("}")               + other )^0 )
local l_second = Cs( ( Cc("{") * (left * C(var) * right * C(set) / f_second) * Cc("}")              + other )^0 )
local l_both   = Cs( ( Cc("{") * (left * C(var) * right * left * C(var) * right / f_both) * Cc("}") + other )^0 )
local l_rest   = Cs( ( left * var * (left/"") * var * (right/"") * var * right                      + other )^0 )

local stripper_1 = lpeg.stripper ("{}@")
local replacer_1 = lpeg.replacer { { ",}", ",@}" }, { "{,", "{@," }, }

local function splitpathexpr(str, newlist, validate) -- I couldn't resist lpegging it (nice exercise).
    if trace_expansions then
        report_expansions("expanding variable %a",str)
    end
    local t, ok, done = newlist or { }, false, false
    local n = #t
    str = lpegmatch(replacer_1,str)
    repeat
        local old = str
        repeat
            local old = str
            str = lpegmatch(l_first, str)
        until old == str
        repeat
            local old = str
            str = lpegmatch(l_second,str)
        until old == str
        repeat
            local old = str
            str = lpegmatch(l_both,  str)
        until old == str
        repeat
            local old = str
            str = lpegmatch(l_rest,  str)
        until old == str
    until old == str -- or not find(str,"{",1,true)
    str = lpegmatch(stripper_1,str)
    if validate then
        for s in gmatch(str,"[^,]+") do
            s = validate(s)
            if s then
                n = n + 1
                t[n] = s
            end
        end
    else
        for s in gmatch(str,"[^,]+") do
            n = n + 1
            t[n] = s
        end
    end
    if trace_expansions then
        for k=1,#t do
            report_expansions("% 4i: %s",k,t[k])
        end
    end
    return t
end

-- We could make the previous one public.

local function validate(s)
    s = collapsepath(s) -- already keeps the trailing / and //
    return s ~= "" and not find(s,"^!*unset/*$") and s
end

resolvers.validatedpath = validate -- keeps the trailing //
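
-- A couple of commented sanity checks in the spirit of the other tests in this file
-- (the exact result of the first one depends on file.collapsepath):
--
-- print(resolvers.validatedpath("foo//"))     -- foo// : trailing // is kept
-- print(resolvers.validatedpath("!!unset//")) -- false : unset entries are rejected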

function resolvers.expandedpathfromlist(pathlist)
    local newlist = { }
    for k=1,#pathlist do
        splitpathexpr(pathlist[k],newlist,validate)
    end
    return newlist
end

-- {a,b,c,d}
-- a,b,c/{p,q,r},d
-- a,b,c/{p,q,r}/d/{x,y,z}//
-- a,b,c/{p,q/{x,y,z},r},d/{p,q,r}
-- a,b,c/{p,q/{x,y,z},r},d/{p,q,r}
-- a{b,c}{d,e}f
-- {a,b,c,d}
-- {a,b,c/{p,q,r},d}
-- {a,b,c/{p,q,r}/d/{x,y,z}//}
-- {a,b,c/{p,q/{x,y,z}},d/{p,q,r}}
-- {a,b,c/{p,q/{x,y,z},w}v,d/{p,q,r}}
-- {$SELFAUTODIR,$SELFAUTOPARENT}{,{/share,}/texmf{-local,.local,}/web2c}
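--
-- A usage sketch (hypothetical paths, commented out like the other tests in this file):
--
-- inspect(resolvers.expandedpathfromlist { "a,b,c/{p,q,r},d" })
--
-- which should expand the braces and return the validated list, something like
-- { "a", "b", "c/p", "c/q", "c/r", "d" }.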

local usedhomedir = nil
local donegation  = (P("!") /""     )^0
local doslashes   = (P("\\")/"/" + 1)^0

local function expandedhome()
    if not usedhomedir then
        usedhomedir = lpegmatch(Cs(donegation * doslashes),environment.homedir or "")
        if usedhomedir == "~" or usedhomedir == "" or not isdir(usedhomedir) then
            if trace_expansions then
                report_expansions("no home dir set, ignoring dependent path using current path")
            end
            usedhomedir = "."
        end
    end
    return usedhomedir
end

local dohome  = ((P("~") + P("$HOME") + P("%HOME%")) / expandedhome)^0
local cleanup = Cs(donegation * dohome * doslashes)

resolvers.cleanpath = function(str)
    return str and lpegmatch(cleanup,str) or ""
end

-- print(resolvers.cleanpath(""))
-- print(resolvers.cleanpath("!"))
-- print(resolvers.cleanpath("~"))
-- print(resolvers.cleanpath("~/test"))
-- print(resolvers.cleanpath("!~/test"))
-- print(resolvers.cleanpath("~/test~test"))

-- This one strips quotes and funny tokens.

-- we have several options here:
--
-- expandhome = P("~") / "$HOME"              : relocateble
-- expandhome = P("~") / "home:"              : relocateble
-- expandhome = P("~") / environment.homedir  : frozen but unexpanded
-- expandhome = P("~") = dohome               : frozen and expanded

local expandhome = P("~") / "$HOME"

local dodouble = P('"') / "" * (expandhome + (1 - P('"')))^0 * P('"') / ""
local dosingle = P("'") / "" * (expandhome + (1 - P("'")))^0 * P("'") / ""
local dostring =               (expandhome +  1              )^0

local stripper = Cs(
    lpegpatterns.unspacer * (dosingle + dodouble + dostring) * lpegpatterns.unspacer
)

function resolvers.checkedvariable(str) -- assumes str is a string
    return type(str) == "string" and lpegmatch(stripper,str) or str
end
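
-- For instance (a small sketch, assuming the usual behaviour of lpegpatterns.unspacer):
--
-- print(resolvers.checkedvariable([["~/texmf"]])) -- $HOME/texmf
-- print(resolvers.checkedvariable("~/texmf"))     -- $HOME/texmf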

-- The path splitter:

-- A config (optionally) has the paths split in tables. Internally
-- we join them and split them after the expansion has taken place. This
-- is more convenient.

local cache = { }

local splitter = lpeg.tsplitat(";") -- as we move towards urls, prefixes and tables we no longer split on :

local backslashswapper = lpeg.replacer("\\","/")

local function splitconfigurationpath(str) -- beware, this can be either a path or a { specification }
    if str then
        local found = cache[str]
        if not found then
            if str == "" then
                found = { }
            else
                local split = lpegmatch(splitter,lpegmatch(backslashswapper,str)) -- can be combined
                found = { }
                local noffound = 0
                for i=1,#split do
                    local s = split[i]
                    if not find(s,"^{*unset}*") then
                        noffound = noffound + 1
                        found[noffound] = s
                    end
                end
                if trace_expansions then
                    report_expansions("splitting path specification %a",str)
                    for k=1,noffound do
                        report_expansions("% 4i: %s",k,found[k])
                    end
                end
                cache[str] = found
            end
        end
        return found
    end
end

resolvers.splitconfigurationpath = splitconfigurationpath

function resolvers.splitpath(str)
    if type(str) == 'table' then
        return str
    else
        return splitconfigurationpath(str)
    end
end

function resolvers.joinpath(str)
    if type(str) == 'table' then
        return joinpath(str)
    else
        return str
    end
end
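
-- A usage sketch (hypothetical input, in the style of the commented tests elsewhere
-- in this file):
--
-- inspect(resolvers.splitpath([[c:\foo;{unset};c:\bar]]))
--
-- which should give { "c:/foo", "c:/bar" } : backslashes are swapped and unset
-- entries are dropped.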

-- The next function scans directories and returns a hash where the
-- entries are either strings or tables.
--
-- starting with . or .. etc or funny char
--
-- local l_forbidden = S("~`!#$%^&*()={}[]:;\"\'||\\/<>,?\n\r\t")
-- local l_confusing = P(" ")
-- local l_character = lpegpatterns.utf8
-- local l_dangerous = P(".")
--
-- local l_normal = (l_character - l_forbidden - l_confusing - l_dangerous) * (l_character - l_forbidden - l_confusing^2)^0 * P(-1)
-- ----- l_normal = l_normal * Cc(true) + Cc(false)
--
-- local function test(str)
--     print(str,lpegmatch(l_normal,str))
-- end
-- test("ヒラギノ明朝 Pro W3")
-- test("..ヒラギノ明朝 Pro W3")
-- test(":ヒラギノ明朝 Pro W3;")
-- test("ヒラギノ明朝 /Pro W3;")
-- test("ヒラギノ明朝 Pro  W3")

-- a lot of this caching can be stripped away when we have ssd's everywhere
--
-- we could cache all the (sub)paths here if needed

local attributes, directory = lfs.attributes, lfs.dir

local weird          = P(".")^1 + lpeg.anywhere(S("~`!#$%^&*()={}[]:;\"\'||<>,?\n\r\t"))
local lessweird      = P(".")^1 + lpeg.anywhere(S("~`#$%^&*:;\"\'||<>,?\n\r\t"))
local timer          = { }
local scanned        = { }
local nofscans       = 0
local scancache      = { }
local fullcache      = { }
----- simplecache    = { }
local nofsharedscans = 0
local addcasecraptoo = true -- experiment to let case matter a bit (still fuzzy)

-- So, we assume either a lowercase name or a mixed case one, but only one such case,
-- as having Foo fOo foo FoO FOo etc on the system is braindead in any sane project.

local function scan(files,remap,spec,path,n,m,r,onlyone,tolerant)
    local full     = path == "" and spec or (spec .. path .. '/')
    local dirlist  = { }
    local nofdirs  = 0
    local pattern  = tolerant and lessweird or weird
    local filelist = { }
    local noffiles = 0
    for name, mode in directory(full) do
        if not lpegmatch(pattern,name) then
            if not mode then
                mode = attributes(full..name,"mode")
            end
            if mode == "file" then
                n = n + 1
                noffiles = noffiles + 1
                filelist[noffiles] = name
            elseif mode == "directory" then
                m = m + 1
                nofdirs = nofdirs + 1
                if path ~= "" then
                    dirlist[nofdirs] = path .. "/" .. name
                else
                    dirlist[nofdirs] = name
                end
            end
        end
    end
    if noffiles > 0 then
        sort(filelist)
        for i=1,noffiles do
            local name  = filelist[i]
            local lower = lower(name)
            local paths = files[lower]
            if paths then
                if onlyone then
                    -- forget about it
                else
                    if name ~= lower then
                        local rl = remap[lower]
                        if not rl then
                            remap[lower] = name
                            r = r + 1
                        elseif trace_globbing and rl ~= name then
                            report_globbing("confusing filename, name: %a, lower: %a, already: %a",name,lower,rl)
                        end
                        if addcasecraptoo then
                            local paths = files[name]
                            if not paths then
                                files[name] = path
                            elseif type(paths) == "string" then
                                files[name] = { paths, path }
                            else
                                paths[#paths+1] = path
                            end
                        end
                    end
                    if type(paths) == "string" then
                        files[lower] = { paths, path }
                    else
                        paths[#paths+1] = path
                    end
                end
            else -- probably unique anyway
                files[lower] = path
                if name ~= lower then
                    local rl = remap[lower]
                    if not rl then
                        remap[lower] = name
                        r = r + 1
                    elseif trace_globbing and rl ~= name then
                        report_globbing("confusing filename, name: %a, lower: %a, already: %a",name,lower,rl)
                    end
                end
            end
        end
    end
    if nofdirs > 0 then
        sort(dirlist)
        for i=1,nofdirs do
            files, remap, n, m, r = scan(files,remap,spec,dirlist[i],n,m,r,onlyone,tolerant)
        end
    end
    scancache[sub(full,1,-2)] = files
    return files, remap, n, m, r
end

local function scanfiles(path,branch,usecache,onlyonce,tolerant)
    local realpath = resolveprefix(path)
    if usecache then
        local content = fullcache[realpath]
        if content then
            if trace_locating then
                report_expansions("using cached scan of path %a, branch %a",path,branch or path)
            end
            nofsharedscans = nofsharedscans + 1
            return content
        end
    end
    --
    statistics.starttiming(timer)
    if trace_locating then
        report_expansions("scanning path %a, branch %a",path,branch or path)
    end
    local content
    if isdir(realpath) then
        local files, remap, n, m, r = scan({ },{ },realpath .. '/',"",0,0,0,onlyonce,tolerant)
        content = {
            metadata = {
                path        = path, -- can be selfautoparent:texmf-whatever
                files       = n,
                directories = m,
                remappings  = r,
            },
            files = files,
            remap = remap,
        }
        if trace_locating then
            report_expansions("%s files found on %s directories with %s uppercase remappings",n,m,r)
        end
    else
        content = {
            metadata = {
                path        = path, -- can be selfautoparent:texmf-whatever
                files       = 0,
                directories = 0,
                remappings  = 0,
            },
            files = { },
            remap = { },
        }
        if trace_locating then
            report_expansions("invalid path %a",realpath)
        end
    end
    if usecache then
        scanned[#scanned+1] = realpath
        fullcache[realpath] = content
    end
    nofscans = nofscans + 1
    statistics.stoptiming(timer)
    return content
end

resolvers.scanfiles = scanfiles
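
-- The files table maps lowercased names onto either one (sub)path string or a table
-- of (sub)paths, while remap maps lowercased names back onto their original casing.
-- A hypothetical example of what scanfiles can return:
--
--   {
--       metadata = { path = "somewhere", files = 4, directories = 2, remappings = 1 },
--       files    = { ["foo.tex"] = "doc", ["bar.tex"] = { "doc", "tex" }, ["readme.md"] = "" },
--       remap    = { ["readme.md"] = "README.md" },
--   }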

function resolvers.simplescanfiles(path,branch,usecache)
    return scanfiles(path,branch,usecache,true,true) -- onlyonce, tolerant
end

function resolvers.scandata()
    table.sort(scanned)
    return {
        n      = nofscans,
        shared = nofsharedscans,
        time   = statistics.elapsedtime(timer),
        paths  = scanned,
    }
end
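
-- inspect(resolvers.scandata()) -- number of scans, number shared from the cache,
--                               -- time spent and the sorted list of scanned paths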

function resolvers.get_from_content(content,path,name) -- or (content,name)
    if not content then
        return
    end
    local files = content.files
    if not files then
        return
    end
    local remap = content.remap
    if not remap then
        return
    end
    if name then
        -- this one resolves a remapped name
        local used = lower(name)
        return path, remap[used] or used
    else
        -- this one does a lookup and resolves a remapped name
        local name = path
--         if addcasecraptoo then
--             local path = files[name]
--             if path then
--                 return path, name
--             end
--         end
        local used = lower(name)
        local path = files[used]
        if path then
            return path, remap[used] or used
        end
    end
end
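
-- A usage sketch (hypothetical tree), combining this with the scanner above:
--
-- local content    = resolvers.scanfiles("t:/some/texmf/tree")
-- local path, name = resolvers.get_from_content(content,"MyFile.tex")
--
-- where path is the (sub)path or table of (sub)paths and name the filename with its
-- original casing restored.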

local nothing = function() end

function resolvers.filtered_from_content(content,pattern)
    if content and type(pattern) == "string" then
        local pattern = lower(pattern)
        local files   = content.files -- we could store the sorted list
        local remap   = content.remap
        if files and remap then
            local f = sortedkeys(files)
            local n = #f
            local i = 0
            local function iterator()
                while i < n do
                    i = i + 1
                    local k = f[i]
                    if find(k,pattern) then
                        return files[k], remap and remap[k] or k
                    end
                end
            end
            return iterator
        end
    end
    return nothing
end
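
-- A filtering sketch (again with the hypothetical content from above), using the
-- iterator that this function returns:
--
-- for path, name in resolvers.filtered_from_content(content,"%.tex$") do
--     print(path,name)
-- end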

-- inspect(resolvers.simplescanfiles("e:/temporary/mb-mp"))
-- inspect(resolvers.scanfiles("e:/temporary/mb-mp"))