-- publ-dat.lua / size: 43 Kb / last modification: 2023-12-21 09:44
if not modules then modules = { } end modules ['publ-dat'] = {
    version   = 1.001,
    comment   = "this module part of publication support",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

-- todo: strip the @ in the lpeg instead of on do_definition and do_shortcut
-- todo: store bibroot and bibrootdt
-- todo: dataset = datasets[dataset] => current = datasets[dataset]
-- todo: maybe split this file

if not characters then
    dofile(resolvers.findfile("char-utf.lua"))
    dofile(resolvers.findfile("char-tex.lua"))
end

if not utilities.sequencers then
    dofile(resolvers.findfile("util-seq.lua"))
end

local lower, find, sub = string.lower, string.find, string.sub
local concat, copy, tohash = table.concat, table.copy, table.tohash
local next, type, rawget, tonumber = next, type, rawget, tonumber
local utfchar = utf.char
local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local textoutf = characters and characters.tex.toutf
local settings_to_hash, settings_to_array = utilities.parsers.settings_to_hash, utilities.parsers.settings_to_array
local formatters = string.formatters
local sortedkeys, sortedhash, keys, sort = table.sortedkeys, table.sortedhash, table.keys, table.sort
local xmlcollected, xmltext, xmlconvert = xml.collected, xml.text, xml.convert
local setmetatableindex = table.setmetatableindex

-- todo: more allocate

local P, R, S, V, C, Cc, Cs, Ct, Carg, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct, lpeg.Carg, lpeg.Cmt, lpeg.Cp

local p_whitespace      = lpegpatterns.whitespace
local p_utf8character   = lpegpatterns.utf8character

local trace             = false  trackers.register("publications",            function(v) trace = v end)
local trace_duplicates  = true   trackers.register("publications.duplicates", function(v) trace_duplicates = v end)
local trace_strings     = false  trackers.register("publications.strings",    function(v) trace_strings = v end)

local report            = logs.reporter("publications")
local report_duplicates = logs.reporter("publications","duplicates")
local report_strings    = logs.reporter("publications","strings")

local allocate          = utilities.storage.allocate

local commands          = commands
local implement         = interfaces and interfaces.implement

publications            = publications or { }
local publications      = publications

local datasets          = publications.datasets or { }
publications.datasets   = datasets

local writers           = publications.writers or { }
publications.writers    = writers

local tables            = publications.tables or { }
publications.tables     = tables

publications.statistics = publications.statistics or { }
local publicationsstats = publications.statistics

local loaders           = publications.loaders or { }
publications.loaders    = loaders

local casters           = { }
publications.casters    = casters

-- local sorters           = { }
-- publications.sorters    = sorters
--
-- local indexers          = { }
-- publications.indexers   = indexers

local components        = { }
publications.components = components -- register components

local enhancers         = publications.enhancers or { }
publications.enhancers  = enhancers

local enhancer          = publications.enhancer or utilities.sequencers.new { arguments = "dataset" }
publications.enhancer   = enhancer

utilities.sequencers.appendgroup(enhancer,"system") -- private

publicationsstats.nofbytes       = 0
publicationsstats.nofdefinitions = 0
publicationsstats.nofshortcuts   = 0
publicationsstats.nofdatasets    = 0

local privates = allocate {
    category      = true,
    tag           = true,
    index         = true,
    suffix        = true,
    specification = true,
}

local specials = allocate {
    key      = true,
    crossref = true,
    keywords = true,
    language = true,
    comment  = true,
}

local implicits = allocate {
    category = "implicit",
    tag      = "implicit",
    key      = "implicit",
    keywords = "implicit",
    language = "implicit",
    crossref = "implicit",
}

local origins = allocate {
    "optional",
    "extra",
    "required",
    "virtual",
}

local virtuals = allocate {
    "authoryear",
    "authoryears",
    "authornum",
    "num",
    "suffix",
}

local defaulttypes = allocate {
    author     = "author",
    editor     = "author",
    translator = "author",
 -- publisher  = "author",
    page       = "pagenumber",
    pages      = "pagenumber",
    keywords   = "keyword",
    doi        = "url",
    url        = "url",
}

local defaultsets = allocate {
    page = { "page", "pages" },
}

tables.implicits = implicits
tables.origins   = origins
tables.virtuals  = virtuals
tables.types     = defaulttypes
tables.sets      = defaultsets
tables.privates  = privates
tables.specials  = specials

local variables  = interfaces and interfaces.variables or setmetatableindex("self")

local v_all      = variables.all
local v_default  = variables.default

if not publications.usedentries then
    function publications.usedentries()
        return { }
    end
end

local xmlplaceholder = "<?xml version='1.0' standalone='yes'?>\n<bibtex></bibtex>"

local defaultshortcuts = allocate {
    jan =  "1",
    feb =  "2",
    mar =  "3",
    apr =  "4",
    may =  "5",
    jun =  "6",
    jul =  "7",
    aug =  "8",
    sep =  "9",
    oct = "10",
    nov = "11",
    dec = "12",
}

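-- These defaults kick in when a field refers to an unquoted string, as in
-- "month = jan"; user @string definitions are consulted first. A minimal
-- sketch (hypothetical entry):
--
--   @article{knuth84,
--      month = jan, % resolves to "1" via defaultshortcuts
--   }
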
local space      = p_whitespace^0
local separator  = space * "+" * space
local p_splitter = lpeg.tsplitat(separator)

local unknownfield = function(t,k)
    local v = "extra"
    t[k] = v
    return v
end

local unknowncategory = function(t,k)
    local v = {
        required = false,
        optional = false,
        virtual  = false,
        fields   = setmetatableindex(unknownfield), -- this will remember them
        types    = unknowntypes,
        sets     = setmetatableindex(defaultsets),  -- new, but rather small
    }
    t[k] = v
    return v
end

local unknowntype = function(t,k)
    local v = "string"
    t[k] = v
    return v
end

local default = {
    name       = name,
    version    = "1.00",
    comment    = "unknown specification.",
    author     = "anonymous",
    copyright  = "no one",
    categories = setmetatableindex(unknowncategory),
    types      = setmetatableindex(defaulttypes,unknowntype),
}

230-- a found, fetch, ... method
231
232local function checkfield(specification,category,data)
233    local list    = setmetatableindex({},implicits)
234    data.fields   = list
235    data.category = category
236    local sets    = data.sets or { }
237    for i=1,#origins do
238        local t = origins[i]
239        local d = data[t]
240        if d then
241            for i=1,#d do
242                local di = d[i]
243                di = sets[di] or di
244                if type(di) == "table" then
245                    for i=1,#di do
246                        list[di[i]] = t
247                    end
248                else
249                    list[di] = t
250                end
251            end
252        else
253            data[t] = { }
254        end
255    end
256    return data
257end
258
259local specifications = setmetatableindex(function(t,name)
260    if not name then
261        return default -- initializer
262    end
263    local filename = formatters["publ-imp-%s.lua"](name)
264    local fullname = resolvers.findfile(filename) or ""
265    if fullname == "" then
266        report("no data definition file %a for %a",filename,name)
267        t[name] = default
268        return default
269    end
270    local specification = table.load(fullname)
271    if not specification then
272        report("invalid data definition file %a for %a",fullname,name)
273        t[name] = default
274        return default
275    end
276    --
277    local categories = specification.categories
278    if not categories then
279        categories = { }
280        specification.categories = categories
281    end
282    setmetatableindex(categories,unknowncategory)
283    --
284    local types = specification.types
285    if not types then
286        types = defaulttypes
287        specification.types = types
288    end
289    setmetatableindex(types,unknowntype)
290    --
291    local fields = setmetatableindex(unknownfield)
292    specification.fields = fields
293    --
294    local virtual = specification.virtual
295    if virtual == nil then -- so false is valid
296        virtual = { }
297    elseif virtual == false then
298        virtual = { }
299    elseif type(virtual) ~= table then
300        virtual = virtuals
301    end
302    specification.virtual = virtual
303    specification.virtualfields = tohash(virtual)
304    --
305    for category, data in next, categories do
306        categories[category] = checkfield(specification,category,copy(data)) -- we make sure we have no clones
307    end
308    --
309    t[name] = specification
310    --
311    return specification
312end)
313
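-- A data definition file is just a Lua table; a minimal sketch of what a
-- hypothetical "publ-imp-test.lua" could return:
--
--   return {
--       name       = "test",
--       version    = "1.00",
--       categories = {
--           article = {
--               required = { "author", "title", "year" },
--               optional = { "pages", "doi" },
--           },
--       },
--   }
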
publications.specifications = specifications

function publications.setcategory(target,category,data)
    local specification = specifications[target]
    specification.categories[category] = checkfield(specification,category,data)
end

function publications.parenttag(dataset,tag)
    if not dataset or not tag then
        report("error in specification, dataset %a, tag %a",dataset,tag)
    elseif find(tag,"+",1,true) then
        local tags    = lpegmatch(p_splitter,tag)
        local parent  = tags[1]
        local current = datasets[dataset]
        local luadata = current.luadata
        local details = current.details
        local first   = luadata[parent]
        if first then
            local detail   = details[parent]
            local children = detail.children
            if not children then
                children = { }
                detail.children = children
            end
            -- add new ones but only once
            for i=2,#tags do
                local tag = tags[i]
                for j=1,#children do
                    if children[j] == tag then
                        tag = false
                    end
                end
                if tag then
                    local entry = luadata[tag]
                    if entry then
                        local detail = details[tag]
                        children[#children+1] = tag
                        if detail.parent then
                            report("error in combination, dataset %a, tag %a, parent %a, ignored %a",dataset,tag,detail.parent,parent)
                        else
                            report("combining, dataset %a, tag %a, parent %a",dataset,tag,parent)
                            detail.parent = parent
                        end
                    end
                end
            end
            return parent
        end
    end
    return tag or ""
end

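-- A combined tag, as passed from a multi-entry cite, makes the first tag the
-- parent and the rest its children; a sketch with hypothetical tags that all
-- exist in the dataset:
--
--   publications.parenttag("default","alpha+beta+gamma") -- returns "alpha"
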
function publications.new(name)
    publicationsstats.nofdatasets = publicationsstats.nofdatasets + 1
    local dataset = {
        name       = name or "dataset " .. publicationsstats.nofdatasets,
        nofentries = 0,
        shortcuts  = { },
        luadata    = { },
        suffixes   = { },
        xmldata    = xmlconvert(xmlplaceholder),
        details    = { },
        missing    = { },
        ordered    = { },
        nofbytes   = 0,
        entries    = nil, -- empty == all
        sources    = { },
        loaded     = { },
        fields     = { },
        userdata   = { },
        used       = { },
        commands   = { }, -- for statistical purposes
        citestate  = { },
        status     = {
            resources = false,
            userdata  = false,
        },
        specifications = {
            -- used specifications
        },
        suffixed   = false,
    }
    -- we delay details till we need it (maybe we just delay the
    -- individual fields but that is tricky as there can be some
    -- dependencies)
    return dataset
end

setmetatableindex(datasets,function(t,k)
    if type(k) == "table" then
        return k -- so we can use this accessor as checker
    else
        local v = publications.new(k)
        datasets[k] = v
        return v
    end
end)

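-- So a dataset springs into existence on first access, and passing an already
-- resolved dataset table is harmless (the accessor doubles as a checker):
--
--   local d = publications.datasets["somebibs"] -- hypothetical name, made on the fly
--   assert(publications.datasets[d] == d)       -- tables pass through
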
local function getindex(dataset,luadata,tag)
    local found = luadata[tag]
    if found then
        local index = found.index or 0
        dataset.ordered[tag] = index
        return index
    else
        local index = dataset.nofentries + 1
        dataset.nofentries = index
        dataset.ordered[index] = tag
        return index
    end
end

publications.getindex = getindex

do

    -- we apply some normalization

    local space     = S(" \t\n\r\f") -- / " "
    local collapsed = space^1/" "
    local csletter  = lpegpatterns.csletter or R("az","AZ")

    ----- command   = P("\\") * Cc("btxcmd{") * (R("az","AZ")^1) * Cc("}")
    ----- command   = P("\\") * (Carg(1) * C(R("az","AZ")^1) / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    ----- command   = P("\\") * (Carg(1) * C(R("az","AZ")^1) * space^0 / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    local command   = P("\\") * (Carg(1) * C(csletter^1) * space^0 / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    local whatever  = P("\\") * P(" ")^1 / " "
    -----           + P("\\") * ( P("hbox") + P("raise") ) -- bah -- no longer
    local somemath  = P("$") * ((1-P("$"))^1) * P("$") -- let's not assume nested math
    ----- character = lpegpatterns.utf8character
    local any       = P(1)
    local done      = P(-1)
 -- local one_l     = P("{")  / ""
 -- local one_r     = P("}")  / ""
 -- local two_l     = P("{{") / ""
 -- local two_r     = P("}}") / ""
    local zero_l_r  = P("{}") / "" * #P(1)
    local special   = P("#")  / "\\letterhash "

    local filter_0  = S('\\{}#')
    local filter_1  = (1-filter_0)^0 * filter_0
    local filter_2  = Cs(
    -- {{...}} ... {{...}}
    --     two_l * (command + special + any - two_r - done)^0 * two_r * done +
    --     one_l * (command + special + any - one_r - done)^0 * one_r * done +
                (
                    somemath +
                    whatever +
                    command +
                    special +
                    collapsed +
                    zero_l_r +
                    any
                )^0
    )

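    -- A sketch of the net effect on a made-up field value (filter_1 is just a
    -- cheap test for characters that matter, filter_2 does the real work):
    --
    --   lpegmatch(filter_2,[[\emph{Word} and $x^2$]],1,list)
    --
    -- gives "btxcmd{emph}{Word} and $x^2$" while counting the seen commands in
    -- the (hypothetical) list table; a # becomes "\letterhash " and a non-final
    -- empty {} group is dropped.
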
    -- Currently we expand shortcuts and for large ones (like the acknowledgements
    -- in tugboat.bib) this is not that efficient. However, eventually strings get
    -- hashed again.

    local function do_shortcut(key,value,dataset)
        publicationsstats.nofshortcuts = publicationsstats.nofshortcuts + 1
        dataset.shortcuts[key] = value
    end

    -- todo: categories : metatable that lowers and also counts
    -- todo: fields     : metatable that lowers

    local tags = table.setmetatableindex("table")

    local indirectcrossrefs = true

    local function do_definition(category,tag,tab,dataset)
        publicationsstats.nofdefinitions = publicationsstats.nofdefinitions + 1
        if tag == "" then
            tag = "no-tag-set"
        end
        local fields  = dataset.fields
        local luadata = dataset.luadata
        local hashtag = tag
        if luadata[tag] then
            local t = tags[tag]
            local d = dataset.name
            local n = (t[d] or 0) + 1
            t[d] = n
            hashtag = tag .. "-" .. n
            if trace_duplicates then
                local p = { }
                for k, v in sortedhash(t) do
                    p[#p+1] = formatters["%s:%s"](k,v)
                end
                report_duplicates("tag %a is present multiple times: % t, assigning hashtag %a",tag,p,hashtag)
            end
        end
        local index  = getindex(dataset,luadata,hashtag)
        local entries = {
            category = lower(category),
            tag      = tag,
            index    = index,
        }
        for i=1,#tab,2 do
            local original   = tab[i]
            local normalized = fields[original]
            if not normalized then
                normalized = lower(original) -- we assume ascii fields
                fields[original] = normalized
            end
         -- if entries[normalized] then
            if rawget(entries,normalized) then
                if trace_duplicates then
                    report_duplicates("redundant field %a is ignored for tag %a in dataset %a",normalized,tag,dataset.name)
                end
            else
                local value = tab[i+1]
                value = textoutf(value)
                if lpegmatch(filter_1,value) then
                    value = lpegmatch(filter_2,value,1,dataset.commands) -- we need to start at 1 for { }
                end
                if normalized == "crossref" then
                    if indirectcrossrefs then
                        setmetatableindex(entries,function(t,k)
                            local parent = rawget(luadata,value)
                            if parent == entries then
                                report_duplicates("bad parent %a for %a in dataset %s",value,hashtag,dataset.name)
                                setmetatableindex(entries,nil)
                                return entries
                            elseif parent then
                                setmetatableindex(entries,parent)
                                return entries[k]
                            else
                                report_duplicates("no valid parent %a for %a in dataset %s",value,hashtag,dataset.name)
                                setmetatableindex(entries,nil)
                            end
                        end)
                    else
                        dataset.nofcrossrefs = dataset.nofcrossrefs + 1
                    end
                end
                entries[normalized] = value
            end
        end
        luadata[hashtag] = entries
    end

    local f_invalid = formatters["<invalid: %s>"]

    local function resolve(s,dataset)
        local e = dataset.shortcuts[s]
        if e then
            if trace_strings then
                report_strings("%a resolves to %a",s,e)
            end
            return e
        end
        e = defaultshortcuts[s]
        if e then
            if trace_strings then
                report_strings("%a resolves to default %a",s,e)
            end
            return e
        end
        if tonumber(s) then
            return s
        end
        report("error in database, invalid value %a",s)
        return f_invalid(s)
    end

    local pattern = p_whitespace^0
                  * C(P("message") + P("warning") + P("error") + P("comment")) * p_whitespace^0 * P(":")
                  * p_whitespace^0
                  * C(P(1)^1)

    local function do_comment(s,dataset)
        local how, what = lpegmatch(pattern,s)
        if how and what then
            local t = string.splitlines(utilities.strings.striplines(what))
            local b = file.basename(dataset.fullname or dataset.name or "unset")
            for i=1,#t do
                report("%s > %s : %s",b,how,t[i])
            end
        end
    end

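    -- The two constructs handled above, sketched on hypothetical input:
    --
    --   @string{tb = "TUGboat"}          % do_shortcut: journal = tb now expands
    --   @comment{message: checked 2023}  % do_comment: reported as "... > message : checked 2023"
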
    local percent    = P("%")
    local start      = P("@")
    local comma      = P(",")
    local hash       = P("#")
    local escape     = P("\\")
    local single     = P("'")
    local double     = P('"')
    local left       = P('{')
    local right      = P('}')
    local both       = left + right
    local lineending = S("\n\r")
    local space      = S(" \t\n\r\f") -- / " "
    local spacing    = space^0
    local equal      = P("=")
    ----- collapsed  = (space^1)/ " "
    local collapsed  = p_whitespace^1/" "
    local nospaces   = p_whitespace^1/""

    local p_left     = (p_whitespace^0 * left) / ""
    local p_right    = (right * p_whitespace^0) / ""

    local keyword    = C((R("az","AZ","09") + S("@_:-"))^1)
    local key        = C((1-space-equal)^1)
    local tag        = C((1-space-comma)^0)
    local category   = C((1-space-left)^1)
    local s_quoted   = ((escape*single) + collapsed + (1-single))^0
    local d_quoted   = ((escape*double) + collapsed + (1-double))^0

    local reference  = P("@{") * C((R("az","AZ","09") + S("_:-"))^1) * P("}")
    local r_value    = reference * Carg(1) / resolve

    local balanced   = P {
        ((escape * (left+right)) + (collapsed + r_value + 1 - (left+right))^1 + V(2))^0,
        left * V(1) * right,
    }

 -- local unbalanced = P {
 --     left * V(2) * right,
 --     ((escape * (left+right)) + (collapsed + 1 - (left+right))^1 + V(1))^0,
 -- }

    local unbalanced = (left/"") * balanced * (right/"") * P(-1)

    local reference  = C((R("az","AZ","09") + S("_:-"))^1)
    local b_value    = p_left * balanced * p_right
    local s_value    = (single/"") * (unbalanced + s_quoted) * (single/"")
    local d_value    = (double/"") * (unbalanced + d_quoted) * (double/"")
    local r_value    = P("@") * reference * Carg(1) / resolve
                     +          reference * Carg(1) / resolve
    local n_value    = C(R("09")^1)

    local e_value    = Cs((left * balanced * right + (1 - S(",}")))^0) * Carg(1) / function(s,dataset)
        return resolve(s,dataset)
    end

    local somevalue  = d_value + b_value + s_value + r_value + n_value + e_value
    local value      = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0))

    local stripper   = lpegpatterns.collapser
    local stripped   = value / function(s) return lpegmatch(stripper,s) end

    local forget     = percent^1 * (1-lineending)^0
    local spacing    = spacing * forget^0 * spacing
    local replacement= spacing * key * spacing * equal * spacing * value    * spacing
    local assignment = spacing * key * spacing * equal * spacing * stripped * spacing
    local definition = category * spacing * left * spacing * tag * spacing * comma * Ct((assignment * comma^0)^0) * spacing * right * Carg(1) / do_definition

    local crapword   = C((1-space-left)^1)
    local shortcut   = Cmt(crapword,function(_,p,s) return lower(s) == "string"  and p end) * spacing * left * ((replacement * Carg(1))/do_shortcut * comma^0)^0  * spacing * right
    local comment    = Cmt(crapword,function(_,p,s) return lower(s) == "comment" and p end) * spacing * lpegpatterns.argument * Carg(1) / do_comment

    local casecrap   = #S("sScC") * (shortcut + comment)

    local bibtotable = (space + forget + P("@") * (casecrap + definition) + 1)^0

    -- todo \%

    -- loadbibdata  -> dataset.luadata
    -- loadtexdata  -> dataset.luadata
    -- loadluadata  -> dataset.luadata

    -- converttoxml -> dataset.xmldata from dataset.luadata

    -- author = "al-" # @AHSAI # "," # @SHAYKH # " " # @AHMAD # " Ibn " # @ZAYNIDDIN
    -- author = {al-@{AHSAI}, @{SHAYKH} @{AHMAD} Ibn @{ZAYNIDDIN}}

    function publications.loadbibdata(dataset,content,source,kind)
        if not source then
            report("invalid source for dataset %a",dataset)
            return
        end
        local current = datasets[dataset]
        local size = #content
        if size == 0 then
            report("empty source %a for dataset %a",source,current.name)
        else
            report("adding bib data to set %a from source %a",current.name,source)
        end
        statistics.starttiming(publications)
        publicationsstats.nofbytes = publicationsstats.nofbytes + size
        current.nofbytes = current.nofbytes + size
        current.nofcrossrefs = 0
        if source then
            table.insert(current.sources, { filename = source, checksum = md5.HEX(content) })
            current.loaded[source] = kind or true
        end
        local luadata = current.luadata
        current.newtags = #luadata > 0 and { } or current.newtags
        lpegmatch(bibtotable,content or "",1,current)
        if current.nofcrossrefs > 0 then
            for tag, entries in next, luadata do
                local value = entries.crossref
                if value then
                    local parent = luadata[value]
                    if parent == entries then
                        report_duplicates("bad parent %a for %a in dataset %s",value,tag,current.name)
                    elseif parent then
                        local t = { }
                        for k, v in next, parent do
                            if not entries[k] then
                                entries[k] = v
                                t[#t+1] = k
                            end
                        end
                        sort(t)
                        entries.inherited = concat(t,",")
                    else
                        report_duplicates("no valid parent %a for %a in dataset %s",value,tag,current.name)
                    end
                end
            end
        end
--         inspect(luadata)
        statistics.stoptiming(publications)
    end

end

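-- Feeding data directly, bypassing the file loaders (hypothetical content):
--
--   publications.loadbibdata("default",
--       [[@book{knuth84, author = {Donald E. Knuth}, year = 1984}]],
--       "inline","bib")
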
do

    -- we could use xmlescape again

    local cleaner_0 = S('<>&')
    local cleaner_1 = (1-cleaner_0)^0 * cleaner_0
    local cleaner_2 = Cs ( (
        P("<") / "&lt;" +
        P(">") / "&gt;" +
        P("&") / "&amp;" +
        P(1)
    )^0)

    local compact = false -- can be a directive but then we also need to deal with newlines ... not now

    function publications.converttoxml(dataset,nice,dontstore,usedonly,subset,noversion,rawtoo) -- we have fields !
        local current = datasets[dataset]
        local luadata = subset or (current and current.luadata)
        if luadata then
            statistics.starttiming(publications)
            --
            local result, r, n = { }, 0, 0
            if usedonly then
                usedonly = publications.usedentries()
                usedonly = usedonly[current.name]
            end
            --
            r = r + 1 ; result[r] = "<?xml version='1.0' standalone='yes'?>"
            r = r + 1 ; result[r] = formatters["<bibtex dataset='%s'>"](current.name)
            --
            if nice then -- will be default
                local f_entry_start = formatters[" <entry tag='%s' category='%s' index='%s'>"]
                local s_entry_stop  = " </entry>"
                local f_field       = formatters["  <field name='%s'>%s</field>"]
                local f_cdata       = formatters["  <field name='rawbibtex'><![CDATA[%s]]></field>"]

                for tag, entry in sortedhash(luadata) do
                    if not usedonly or usedonly[tag] then
                        r = r + 1 ; result[r] = f_entry_start(tag,entry.category,entry.index)
                        for key, value in sortedhash(entry) do
                            if key ~= "tag" and key ~= "category" and key ~= "index" then
                                if lpegmatch(cleaner_1,value) then
                                    value = lpegmatch(cleaner_2,value)
                                end
                                if value ~= "" then
                                    r = r + 1 ; result[r] = f_field(key,value)
                                end
                            end
                        end
                        if rawtoo then
                            local s = publications.savers.bib(current,false,{ [tag] = entry })
                            s = utilities.strings.striplines(s,"prune and collapse")
                            r = r + 1 ; result[r] = f_cdata(s)
                        end
                        r = r + 1 ; result[r] = s_entry_stop
                        n = n + 1
                    end
                end
            else
                local f_entry_start = formatters["<entry tag='%s' category='%s' index='%s'>"]
                local s_entry_stop  = "</entry>"
                local f_field       = formatters["<field name='%s'>%s</field>"]
                for tag, entry in next, luadata do
                    if not usedonly or usedonly[tag] then
                        r = r + 1 ; result[r] = f_entry_start(entry.tag,entry.category,entry.index)
                        for key, value in next, entry do
                            if key ~= "tag" and key ~= "category" and key ~= "index" then
                                if lpegmatch(cleaner_1,value) then
                                    value = lpegmatch(cleaner_2,value)
                                end
                                if value ~= "" then
                                    r = r + 1 ; result[r] = f_field(key,value)
                                end
                            end
                        end
                        r = r + 1 ; result[r] = s_entry_stop
                        n = n + 1
                    end
                end
            end
            --
            r = r + 1 ; result[r] = "</bibtex>"
            --
            result = concat(result,nice and "\n" or nil,noversion and 2 or 1,#result)
            --
            if dontstore then
                -- indeed
            else
                statistics.starttiming(xml)
                current.xmldata = xmlconvert(result, {
                    resolve_entities            = true,
                    resolve_predefined_entities = true, -- in case we have escaped entities
                 -- unify_predefined_entities   = true, -- &#038; -> &amp;
                    utfize_entities             = true,
                } )
                statistics.stoptiming(xml)
                if lxml then
                    lxml.register(formatters["btx:%s"](current.name),current.xmldata)
                end
            end
            statistics.stoptiming(publications)
            return result, n
        end
    end

end

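-- For example, to get the xml rendition as a string without storing it in the
-- dataset (nice and dontstore set, the other arguments left to their defaults):
--
--   local blob, n = publications.converttoxml("default",true,true)
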
do

    local function resolvedname(dataset,filename)
        local current = datasets[dataset]
        if type(filename) ~= "string" then
            report("invalid filename %a",tostring(filename))
        end
        local fullname = resolvers.findfile(filename,"bib")
        if fullname == "" then
            fullname = resolvers.findfile(filename) -- let's not be too picky
        end
        if not fullname or fullname == "" then
            report("no file %a",filename)
            current.fullname = filename
            return current, false
        else
            current.fullname = fullname
            return current, fullname
        end
    end

    publications.resolvedname = resolvedname

    local cleaner = false
    local cleaned = false

    function loaders.registercleaner(what,fullname)
        if not fullname or fullname == "" then
            report("no %s file %a",what,fullname)
            return
        end
        local list = table.load(fullname)
        if not list then
            report("invalid %s file %a",what,fullname)
            return
        end
        list = list.replacements
        if not list then
            report("no replacement table in %a",fullname)
            return
        end
        if cleaned then
            report("adding replacements from %a",fullname)
            for k, v in next, list do
                cleaned[k] = v
            end
        else
            report("using replacements from %a",fullname)
            cleaned = list
        end
        cleaner = true
    end

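    -- Such a file is plain Lua returning a replacements table; a minimal
    -- sketch of a hypothetical "publ-clean.lua":
    --
    --   return {
    --       replacements = {
    --           ["Ã©"] = "é", -- repair a typical mis-encoded character
    --       },
    --   }
    --
    -- registered with: loaders.registercleaner("bib","publ-clean.lua")
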
    function loaders.bib(dataset,filename,kind)
        local dataset, fullname = resolvedname(dataset,filename)
        if not fullname then
            return
        end
        local data = io.loaddata(fullname) or ""
        if data == "" then
            report("empty file %a, nothing loaded",fullname)
            return
        end
        if cleaner == true then
            cleaner = Cs((lpeg.utfchartabletopattern(keys(cleaned)) / cleaned + p_utf8character)^1)
        end
        if cleaner ~= false then
            data = lpegmatch(cleaner,data)
        end
        if trace then
            report("loading file %a",fullname)
        end
        publications.loadbibdata(dataset,data,fullname,kind)
    end

    function loaders.lua(dataset,filename,loader) -- if filename is a table we load that one
        local current, data, fullname
        if type(filename) == "table" then
            current = datasets[dataset]
            data    = filename
        else
            dataset, fullname = resolvedname(dataset,filename)
            if not fullname then
                return
            end
            current = datasets[dataset]
            data    = (loader or table.load)(fullname)
        end
        if data then
            local luadata = current.luadata
            -- we want the same index each run
            for tag, entry in sortedhash(data) do
                if type(entry) == "table" then
                    entry.index  = getindex(current,luadata,tag)
                    entry.tag    = tag
                    luadata[tag] = entry -- no cleaning yet
                end
            end
        end
    end

    function loaders.json(dataset,filename)
        loaders.lua(dataset,filename,utilities.json.load)
    end

    function loaders.buffer(dataset,name) -- if filename is a table we load that one
        local current  = datasets[dataset]
        local barename = file.removesuffix(name)
        local data     = buffers.getcontent(barename) or ""
        if data == "" then
            report("empty buffer %a, nothing loaded",barename)
            return
        end
        if trace then
            report("loading buffer %a",barename)
        end
        publications.loadbibdata(current,data,barename,"bib")
    end

    function loaders.xml(dataset,filename)
        local dataset, fullname = resolvedname(dataset,filename)
        if not fullname then
            return
        end
        local current = datasets[dataset]
        local luadata = current.luadata
        local root    = xml.load(fullname)
        for bibentry in xmlcollected(root,"/bibtex/entry") do
            local attributes = bibentry.at
            local tag        = attributes.tag
            local entry      = {
                category = attributes.category,
                tag      = tag, -- afterwards also set, to prevent overload
                index    = 0,   -- prelocated
            }
            for field in xmlcollected(bibentry,"/field") do
                entry[field.at.name] = field.dt[1] -- no cleaning yet | xmltext(field)
            end
            entry.index  = getindex(current,luadata,tag)
            entry.tag    = tag
            luadata[tag] = entry
        end
    end

    setmetatableindex(loaders,function(t,filetype)
        local v = function(dataset,filename)
            report("no loader for file %a with filetype %a",filename,filetype)
        end
        t[filetype] = v
        return v
    end)

996
997    function publications.load(specification)
998        local name     = specification.dataset or v_default
999        local current  = datasets[name]
1000        local files    = settings_to_array(specification.filename)
1001        local kind     = specification.kind
1002        local dataspec = specification.specification
1003        statistics.starttiming(publications)
1004        local somedone = false
1005        for i=1,#files do
1006            local filetype, filename = string.splitup(files[i],"::")
1007            if not filename then
1008                filename = filetype
1009                filetype = file.suffix(filename)
1010            end
1011            if filename then
1012                if not filetype or filetype == "" then
1013                    filetype = "bib"
1014                end
1015                if file.suffix(filename) == "" then
1016                    file.addsuffix(filename,filetype)
1017                end
1018                if done[current][filename] then
1019                    report("file %a is already loaded in dataset %a",filename,name)
1020                else
1021                    loaders[filetype](current,filename)
1022                    done[current][filename] = true
1023                    somedone = true
1024                end
1025                if kind then
1026                    current.loaded[current.fullname or filename] = kind
1027                end
1028                if dataspec then
1029                    current.specifications[dataspec] = true
1030                end
1031            end
1032        end
1033        if somedone then
1034            local runner = enhancer.runner
1035            if runner then
1036                runner(current)
1037            end
1038        end
1039        statistics.stoptiming(publications)
1040        return current
1041    end
1042
1043end
1044
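-- Loading goes by suffix unless a loader is forced with a "type::" prefix,
-- and several sources can be given at once; for instance (hypothetical files):
--
--   publications.load {
--       dataset  = "default",
--       filename = "refs.bib,lua::extra.lua",
--   }
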
do

    function enhancers.order(dataset)
        local luadata = dataset.luadata
        local ordered = dataset.ordered
        for i=1,#ordered do
            local tag = ordered[i]
            if type(tag) == "string" then
                ordered[i] = luadata[tag]
            end
        end
    end

    function enhancers.details(dataset)
        local luadata = dataset.luadata
        local details = dataset.details
        for tag, entry in next, luadata do
            if not details[tag] then
                details[tag] = { }
            end
        end
    end

    utilities.sequencers.appendaction(enhancer,"system","publications.enhancers.order")
    utilities.sequencers.appendaction(enhancer,"system","publications.enhancers.details")

end

do

    local checked  = function(s,d) d[s] = (d[s] or 0) + 1 end
    local checktex = ( (1-P("\\"))^1 + P("\\") * ((C(R("az","AZ")^1)  * Carg(1))/checked))^0

    function publications.analyze(dataset)
        local current    = datasets[dataset]
        local data       = current.luadata
        local categories = { }
        local fields     = { }
        local commands   = { }
        for k, v in next, data do
            categories[v.category] = (categories[v.category] or 0) + 1
            for k, v in next, v do
                fields[k] = (fields[k] or 0) + 1
                lpegmatch(checktex,v,1,commands)
            end
        end
        current.analysis = {
            categories = categories,
            fields     = fields,
            commands   = commands,
        }
    end

end

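-- A quick way to see what a database contains (counts per category, field and
-- embedded tex command end up in the analysis subtable):
--
--   publications.analyze("default")
--   inspect(publications.datasets["default"].analysis)
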
function publications.tags(dataset)
    return sortedkeys(datasets[dataset].luadata)
end

function publications.sortedentries(dataset)
    return sortedhash(datasets[dataset].luadata)
end

-- a helper:

function publications.concatstate(i,n)
    if i == 0 then
        return 0
    elseif i == 1 then
        return 1
    elseif i == 2 and n == 2 then
        return 4
    elseif i == n then
        return 3
    else
        return 2
    end
end

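-- The returned state presumably drives the separator between list items (as
-- in author lists): 0 none, 1 first, 2 middle, 3 last, 4 second of exactly
-- two; so concatstate(2,3) gives 2 (a normal separator) while concatstate(3,3)
-- gives 3 (the final one).
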
-- savers

do

    local savers = { }

    local s_preamble = [[
% this is an export from context mkiv

@preamble{
    \ifdefined\btxcmd
        % we're probably in context
    \else
        \def\btxcmd#1{\begincsname#1\endcsname}
    \fi
}

]]

    function savers.bib(dataset,filename,tobesaved)
        local f_start = formatters["@%s{%s,\n"]
        local f_field = formatters["  %s = {%s},\n"]
        local s_stop  = "}\n\n"
        local result  = { }
        local n, r = 0, 0
        for tag, data in sortedhash(tobesaved) do
            r = r + 1 ; result[r] = f_start(data.category or "article",tag)
            for key, value in sortedhash(data) do
                if not privates[key] then
                    r = r + 1 ; result[r] = f_field(key,value)
                end
            end
            r = r + 1 ; result[r] = s_stop
            n = n + 1
        end
        result = concat(result)
        if find(result,"\\btxcmd") then
            result = s_preamble .. result
        end
        if filename then
            report("%s entries from dataset %a saved in %a",n,dataset,filename)
            io.savedata(filename,result)
        else
            return result
        end
    end

    function savers.lua(dataset,filename,tobesaved,options)
        local list  = { }
        local n     = 0

        local function totable(data,category)
            local t = { }
            for key, value in next, data do
                if not privates[key] then
                    t[key] = value
                end
            end
            t.category = category
            n = n + 1
            return t
        end

        if options.category then
            setmetatableindex(list,"table")
            for tag, data in next, tobesaved do
                list[data.category or "unknown"][tag] = totable(data)
            end
        else
            for tag, data in next, tobesaved do
                list[tag] = totable(data,data.category)
            end
        end
        report("%s entries from dataset %a saved in %a",n,dataset,filename)
        table.save(filename,list)
    end

    function savers.xml(dataset,filename,tobesaved,rawtoo)
        local result, n = publications.converttoxml(dataset,true,true,false,tobesaved,false,rawtoo)
        report("%s entries from dataset %a saved in %a",n,dataset,filename)
        io.savedata(filename,result)
    end

    function publications.save(specification)
        local dataset   = specification.dataset
        local filename  = specification.filename
        local filetype  = specification.filetype
        local criterium = specification.criterium
        local options   = settings_to_hash(specification.options or "")
        statistics.starttiming(publications)
        if not filename or filename == "" then
            report("no filename for saving given")
            return
        end
        if not filetype or filetype == "" then
            filetype = file.suffix(filename)
        end
        if not criterium or criterium == "" then
            criterium = v_all
        end
        local saver = savers[filetype]
        if saver then
            local current   = datasets[dataset]
            local luadata   = current.luadata or { }
            local tobesaved = { }
            local result    = structures.lists.filter({criterium = criterium, names = "btx"}) or { }
            for i=1,#result do
                local userdata = result[i].userdata
                if userdata then
                    local set = userdata.btxset or v_default
                    if set == dataset then
                        local tag = userdata.btxref
                        if tag then
                            tobesaved[tag] = luadata[tag]
                        end
                    end
                end
            end
            saver(dataset,filename,tobesaved,options)
        else
            report("unknown format %a for saving %a",filetype,dataset)
        end
        statistics.stoptiming(publications)
        return dataset
    end

    publications.savers = savers

    if implement then

        implement {
            name      = "btxsavedataset",
            actions   = publications.save,
            arguments = {
                {
                    { "dataset" },
                    { "filename" },
                    { "filetype" },
                    { "criterium" },
                    { "options" },
                }
            }
        }

    end

end

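-- Saving the cited entries of a dataset in bib format (hypothetical filename):
--
--   publications.save {
--       dataset  = "default",
--       filename = "used.bib",
--       filetype = "bib",
--   }
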
-- casters

do

    publications.detailed = setmetatableindex(function(detailed,kind)
        local values = setmetatableindex(function(values,value)
            local caster = casters[kind]
            local cast   = caster and caster(value) or value
            values[value] = cast
            return cast
        end)
        detailed[kind] = values
        return values
    end)

    local keywordsplitter = utilities.parsers.groupedsplitat(";,")

    casters.keyword = function(str)
        return lpegmatch(keywordsplitter,str)
    end

    writers.keyword = function(k)
        if type(k) == "table" then
            return concat(k,";")
        else
            return k
        end
    end

    local pagessplitter = lpeg.splitat((
        P("-") + -- hyphen
        P("—") + -- U+2014
        P("–") + -- U+2013
        P("‒")   -- U+2012
    )^1)

    casters.range = function(str)
        local first, last = lpegmatch(pagessplitter,str)
        return first and last and { first, last } or str
    end

    writers.range = function(p)
        if type(p) == "table" then
            return concat(p,"-")
        else
            return p
        end
    end

    casters.pagenumber = casters.range
    writers.pagenumber = writers.range

end

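-- Casters normalize raw field strings, writers go the other way; for example:
--
--   casters.keyword("foo;bar,baz")  -- { "foo", "bar", "baz" }
--   casters.range("123–145")        -- { "123", "145" } (en dash accepted)
--   writers.range({ "123", "145" }) -- "123-145"
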
if implement then

    implement {
        name      = "btxshortcut",
        arguments = "2 strings",
        actions   = function(instance,key)
            local d = publications.datasets[instance]
            context(d and d.shortcuts[key] or "?")
        end,
    }

end

-- inspect(publications.load { filename = "e:/tmp/oeps.bib" })