data-hsh.lmt /size: 12 Kb    last modification: 2021-10-28 13:51
1-- only lmt because the backend code doesn't deal with it and it makes
2-- no sense to waste time on that for mkiv
3
4if not modules then modules = { } end modules ['data-hsh'] = {
5    version   = 0.002,
6    comment   = "companion to luat-lib.mkiv",
7    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
8    copyright = "PRAGMA ADE / ConTeXt Development Team",
9    license   = "see context related readme files"
10}
11
12-- todo: options
13--
14-- lowercase
15-- cleanupnames (normalize)
16-- use database from project tree
17
18local type = type
19local gsub = string.gsub
20local addsuffix, basename, pathpart, filesuffix, filesize = file.addsuffix, file.basename, file.pathpart, file.suffix, file.size
21local loadtable, savetable = table.load, table.save
22local loaddata, savedata, open = io.loaddata, io.savedata, io.open
23
24local trace_hashed  = false
25local report_hashed = logs.reporter("resolvers","hashed")
26
27trackers.register("resolvers.locating", function(v) trace_hashed = v end)
28trackers.register("resolvers.hashed",   function(v) trace_hashed = v end)
29
30-- we can have a virtual file: open at the position, make sure read and seek don't
31-- go beyond the boundaries
32
33local resolvers = resolvers
34local finders   = resolvers.finders
35local openers   = resolvers.openers
36local loaders   = resolvers.loaders
37
38local ordered = { }
39local hashed  = { }
40local version = 0.002
41
42-- local lowercase = characters.lower
43
-- Report the counters kept in a database's metadata table: the number of
-- paths, names, unique blobs and compressed blobs.
--
-- database : name used to identify the database in the message
-- metadata : table with the fields nofpaths, nofnames, nofblobs, nofcompressed
local function showstatus(database,metadata)
    local nofpaths      = metadata.nofpaths
    local nofnames      = metadata.nofnames
    local nofblobs      = metadata.nofblobs
    local nofcompressed = metadata.nofcompressed
    report_hashed(
        "database %a, %i paths, %i names, %i unique blobs, %i compressed blobs",
        database, nofpaths, nofnames, nofblobs, nofcompressed
    )
end
49
-- Return a descriptor for a hashed database, or nothing when it can't be used.
--
-- database : base name of the database; the metadata file name and the data
--            file name are derived from it with the suffixes "lua" and "dat"
--
-- A database that is already present in 'hashed' is returned as-is. Otherwise
-- the metadata table is loaded and checked: it must be a table, it must carry
-- the current version, and the data file must exist. When all checks pass a
-- fresh descriptor with the fields 'database', 'metadata' and 'dataname' is
-- returned; registering it is left to the caller. When a check fails the reason
-- is reported and nothing is returned.
local function validhashed(database)
    local found = hashed[database]
    if found then
        return found
    end
    local metaname = addsuffix(database,"lua")
    local dataname = addsuffix(database,"dat")
    local metadata = loadtable(metaname)
    if type(metadata) ~= "table" then
        report_hashed("invalid database %a",metaname)
    elseif metadata.version ~= version then
        report_hashed("version mismatch in database %a",metaname)
    elseif not lfs.isfile(dataname) then
        -- the message used to read "missing data data file"
        report_hashed("missing data file for %a",metaname)
    else
        return {
            database = database,
            metadata = metadata,
            dataname = dataname,
        }
    end
end
73
-- Register a database so that it takes part in lookups. A database that is
-- already known, or that validhashed rejects, is left alone. A newly accepted
-- database is appended to 'ordered', stored in 'hashed' under its name, and its
-- status is reported.
local function registerhashed(database)
    if hashed[database] then
        return
    end
    local valid = validhashed(database)
    if not valid then
        return
    end
    local n = #ordered + 1
    ordered[n]       = valid
    hashed[database] = valid
    showstatus(database,valid.metadata)
end
84
local registerfilescheme  do

    -- the file finder as it was before we install our own below
    local normalfinder = finders.file

    local schemes    = { } -- scheme names in order of registration
    local registered = { } -- scheme name -> true, prevents duplicates
    local cache      = { } -- original name -> found name, false when not found

    -- Add a scheme to the list of schemes that the file finder below tries
    -- before it falls back on the normal file finder. Registering the same
    -- name twice has no effect.
    registerfilescheme = function(name)
        if registered[name] then
            return
        end
        schemes[#schemes+1] = name
        registered[name]    = true
    end

    -- why does the finder not remember ?

    -- Replacement for the file finder: the result for a given original name is
    -- cached (including the negative result). On the first lookup the finders
    -- of the registered schemes are tried in order, then the normal finder.
    function finders.file(specification,filetype)
        local original = type(specification) == "table" and specification.original
        if not original then
            -- no table or no original name: something is wrong here, maybe we
            -- should trace it (scheme can be "unknown"); the normal finder
            -- has to deal with it
            return normalfinder(specification,filetype)
        end
        local cached = cache[original]
        if cached then
            return cached
        elseif cached == false then
            -- we already know that it can't be found
            return false
        end
        for i=1,#schemes do
            local scheme   = schemes[i]
            local viafound = finders[scheme](specification,filetype)
            if viafound then
                cache[original] = viafound
                if trace_hashed then
                    report_hashed("found by auto scheme %s: %s",scheme,viafound)
                end
                return viafound
            end
        end
        local filefound = normalfinder(specification,filetype)
        if filefound then
            cache[original] = filefound
            if trace_hashed then
                report_hashed("found by normal file scheme: %s",filefound)
            end
            return filefound
        end
        cache[original] = false
        return false
    end

end
142
-- make the registration helpers available via the finders helper table
do
    local helpers = finders.helpers
    helpers.validhashed        = validhashed
    helpers.registerhashed     = registerhashed
    helpers.registerfilescheme = registerfilescheme
end
146
-- Look up a name on a path in one database descriptor.
--
-- found : descriptor with a 'metadata' table holding 'files' (path -> name ->
--         hash) and 'hashes' (hash -> blob description)
-- path  : key into 'files'
-- name  : key into the table stored for that path
--
-- Returns the hash when the name is known on that path and the hash has an
-- entry in 'hashes'; returns nothing otherwise.
local function locate(found,path,name)
    local metadata = found.metadata
    local names    = metadata.files[path]
    if not names then
        return
    end
    local hash = names[name]
    if hash and metadata.hashes[hash] then
        return hash
    end
end
156
-- Find the hash that belongs to a filename.
--
-- filename : name to look up; it is split into a path part and a base name
--            and both are used as keys, unchanged
-- database : optional name of a registered database; when given only that
--            database is consulted, otherwise all registered databases are
--            tried in order of registration and the first hit wins
--
-- Returns a table with the fields 'hash', 'name', 'path' and 'base' (the name
-- of the database that provided the hash), or nothing when there is no
-- filename or no match.
local function locatehash(filename,database)
    if not filename then
        return
    end
    local name = basename(filename)
    local path = pathpart(filename)
    local hash = false
    if database then
        local found = hashed[database]
        if found then
            -- this assignment used to carry three extra expressions that were
            -- evaluated and thrown away
            hash = locate(found,path,name)
        end
    else
        for i=1,#ordered do
            local found = ordered[i]
            hash = locate(found,path,name)
            if hash then
                database = found.database
                break
            end
        end
    end
    if hash then
        return {
            hash = hash,
            name = name,
            path = path,
            base = database,
        }
    end
end
187
188-- no caching yet, we don't always want the file and it's fast enough
189
-- Fetch the content that belongs to a filename from a database's data file.
--
-- filename : name to look up (see locatehash)
-- database : optional name of a registered database (see locatehash)
--
-- The blob description found via the hash tells where the blob starts in the
-- data file ('position'), how many bytes are stored ('datasize'), whether they
-- are compressed ('compress' equals "zip") and what the size after
-- decompression is ('filesize').
--
-- Returns the (decompressed) blob as a string, or nothing when the file is not
-- known, the data file can't be opened or nothing could be read.
local function locateblob(filename,database)
    local found = locatehash(filename,database)
    if not found then
        return
    end
    local data = hashed[found.base]
    if not data then
        return
    end
    local dataname = data.dataname
    local blobdata = data.metadata.hashes[found.hash]
    if not (blobdata and dataname) then
        return
    end
    local f = open(dataname,"rb")
    if not f then
        return
    end
    f:seek("set",blobdata.position)
    local blob = f:read(blobdata.datasize)
    -- the handle was never closed, so every fetched blob kept a file open
    -- until the garbage collector got to it
    f:close()
    -- only decompress when the read actually gave us something
    if blob and blobdata.compress == "zip" then
        blob = zlib.decompresssize(blob,blobdata.filesize)
    end
    return blob
end
215
local finders  = resolvers.finders
local notfound = finders.notfound

-- Finder for the "hashed" scheme: when the path of the specification is known
-- in one of the registered databases the original name is returned, otherwise
-- the result of the finders' notfound function.
function finders.hashed(specification)
    local original = specification.original
    local fullpath = specification.path
    if fullpath and locatehash(fullpath) then
        if trace_hashed then
            report_hashed("finder: file %a found",original)
        end
        return original
    end
    if trace_hashed then
        report_hashed("finder: unknown file %a",original)
    end
    return notfound()
end
236
local notfound   = openers.notfound
local textopener = openers.helpers.textopener

-- Opener for the "hashed" scheme: the blob that belongs to the path of the
-- specification is fetched from a registered database and handed to the text
-- opener (tagged "hashed", with the original name and "utf-8"). When there is
-- no path or no blob the result of the openers' notfound function is returned.
--
-- The trace messages now say "opener" instead of "finder" so that the log
-- tells which stage reported them.
function openers.hashed(specification)
    local original = specification.original
    local fullpath = specification.path
    local blob     = fullpath and locateblob(fullpath)
    if blob then
        if trace_hashed then
            report_hashed("opener: file %a found",original)
        end
        return textopener("hashed",original,blob,"utf-8")
    end
    if trace_hashed then
        report_hashed("opener: unknown file %a",original)
    end
    return notfound()
end
257
local notfound = loaders.notfound

-- Loader for the "hashed" scheme: the blob that belongs to the path of the
-- specification is fetched from a registered database.
--
-- Returns true, the blob and its length in bytes when the blob is available,
-- otherwise the result of the loaders' notfound function.
--
-- The trace messages now say "loader" instead of "finder" so that the log
-- tells which stage reported them.
function loaders.hashed(specification)
    local original = specification.original
    local fullpath = specification.path
    local blob     = fullpath and locateblob(fullpath)
    if blob then
        if trace_hashed then
            report_hashed("loader: file %a found",original)
        end
        -- blob is known to be set here so its length can be taken directly
        return true, blob, #blob
    end
    if trace_hashed then
        report_hashed("loader: unknown file %a",original)
    end
    return notfound()
end
277
278-- this actually could end up in the generate namespace but it is not
279-- really a 'generic' feature, more a module (at least for now)
280
local calculatehash = sha2.HEX256 -- md5.HEX is not unique enough

-- Create or extend a hashed database.
--
-- specification.database : base name; the metadata goes to the file with
--                          suffix "lua", the blobs to the one with suffix "dat"
-- specification.patterns : optional list of tables with the fields 'pattern'
--                          (passed to dir.glob) and 'compress'
-- specification.pattern  : used, together with specification.compress, as the
--                          only entry when no 'patterns' list is given
--
-- Existing metadata is loaded and extended. It is replaced by an empty table
-- when it is not a table, when its version differs from the current one (such
-- a database would be refused by validhashed anyway) or when it lacks the
-- 'files' or 'hashes' tables.
--
-- For each file that matches a pattern and can be loaded, the name is
-- registered under its cleaned up path. A name that is already registered on
-- that path is skipped. Content is identified by its hash, so a blob is only
-- appended to the data file when that hash is not yet known.
--
-- Returns the metadata table after saving it, or nothing when no database
-- name is given.
function resolvers.finders.helpers.createhashed(specification)
    local database = specification.database
    if not database then
        -- without a name there is nothing we can load or save
        report_hashed("no database specified")
        return
    end
    local patterns = specification.patterns
    if not patterns then
        local pattern = specification.pattern
        if pattern then
            patterns = {
                {
                    pattern  = pattern,
                    compress = specification.compress,
                }
            }
        end
    end
    local datname  = addsuffix(database,"dat")
    local luaname  = addsuffix(database,"lua")
    local metadata = loadtable(luaname)
    if type(metadata) ~= "table" then
        metadata = false
    elseif metadata.version ~= version then
        -- this used to be checked only for tables of kind "hashed", so any
        -- other table slipped through and was saved with the wrong version
        report_hashed("version mismatch, starting with new table")
        metadata = false
    elseif type(metadata.files) ~= "table" or type(metadata.hashes) ~= "table" then
        -- not something we can extend: indexing it below would fail
        report_hashed("invalid database %a, starting with new table",luaname)
        metadata = false
    end
    if not metadata then
        metadata = {
            version       = version,
            kind          = "hashed",
            files         = { },
            hashes        = { },
            nofnames      = 0,
            nofpaths      = 0,
            nofblobs      = 0,
            nofcompressed = 0,
        }
    end
    local files         = metadata.files
    local hashes        = metadata.hashes
    -- a loaded table might lack a counter, so we default to zero
    local nofpaths      = metadata.nofpaths      or 0
    local nofnames      = metadata.nofnames      or 0
    local nofblobs      = metadata.nofblobs      or 0
    local nofcompressed = metadata.nofcompressed or 0
    if type(patterns) == "table" then
        for p=1,#patterns do
            local entry   = patterns[p]
            local pattern = entry.pattern
            if pattern then
                local compress = entry.compress
                local list     = dir.glob(pattern)
                report_hashed("database %a, adding pattern %a, compression %l",database,pattern,compress)
                for f=1,#list do
                    local filename = list[f]
                    local data     = loaddata(filename)
                    if data then
                        local name = basename(filename)
                        -- cleanup: drop leading periods and slashes
                        local path = gsub(pathpart(filename),"^[./]*","")
                        --
                        local fp = files[path]
                        if not fp then
                            fp = { }
                            files[path] = fp
                            nofpaths = nofpaths + 1
                        end
                        if not fp[name] then
                            local hash = calculatehash(data)
                            if not hashes[hash] then
                                local size = #data
                                if compress then
                                    data = zlib.compresssize(data,size)
                                    nofcompressed = nofcompressed + 1
                                end
                                -- the blob gets appended, so the current size
                                -- of the data file is where it will start
                                local position = filesize(datname)
                                savedata(datname,data,"",true)
                                hashes[hash] = {
                                    filesize = size,
                                    datasize = #data,
                                    compress = compress and "zip",
                                    position = position,
                                }
                                nofblobs = nofblobs + 1
                            end
                            fp[name] = hash
                            nofnames = nofnames + 1
                        end
                    end
                end
            end
        end
    end
    metadata.nofpaths      = nofpaths
    metadata.nofnames      = nofnames
    metadata.nofblobs      = nofblobs
    metadata.nofcompressed = nofcompressed
    savetable(luaname, metadata)
    showstatus(database,metadata)
    return metadata
end
382
383