-- publ-dat.lua /size: 43 Kb    last modification: 2021-10-28 13:50
-- ConTeXt module registration: record version and authorship metadata in
-- the global 'modules' table (created here on first use).
if not modules then modules = { } end modules ['publ-dat'] = {
    version   = 1.001,
    comment   = "this module part of publication support",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- todo: strip the @ in the lpeg instead of on do_definition and do_shortcut
10-- todo: store bibroot and bibrootdt
11-- todo: dataset = datasets[dataset] => current = datasets[dataset]
12-- todo: maybe split this file
13
14--[[ldx--
<p>This is a prelude to integrated bibliography support. This file just loads
bibtex files and converts them to xml so that we can access the content
in a convenient way. Actually handling the data takes place elsewhere.</p>
18--ldx]]--
19
-- Make sure the character (utf/tex) helpers and the sequencer utilities
-- are available when this file is loaded outside the usual loading order.
if not characters then
    dofile(resolvers.findfile("char-utf.lua"))
    dofile(resolvers.findfile("char-tex.lua"))
end

if not utilities.sequencers then
    dofile(resolvers.findfile("util-seq.lua"))
end
28
29local lower, find, sub = string.lower, string.find, string.sub
30local concat, copy, tohash = table.concat, table.copy, table.tohash
31local next, type, rawget, tonumber = next, type, rawget, tonumber
32local utfchar = utf.char
33local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
34local textoutf = characters and characters.tex.toutf
35local settings_to_hash, settings_to_array = utilities.parsers.settings_to_hash, utilities.parsers.settings_to_array
36local formatters = string.formatters
37local sortedkeys, sortedhash, keys, sort = table.sortedkeys, table.sortedhash, table.keys, table.sort
38local xmlcollected, xmltext, xmlconvert = xml.collected, xml.text, xml.convert
39local setmetatableindex = table.setmetatableindex
40
41-- todo: more allocate
42
43local P, R, S, V, C, Cc, Cs, Ct, Carg, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.V, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct, lpeg.Carg, lpeg.Cmt, lpeg.Cp
44
45local p_whitespace      = lpegpatterns.whitespace
46local p_utf8character   = lpegpatterns.utf8character
47
-- Tracing flags, toggled at runtime via the trackers mechanism. The
-- duplicates and strings trackers must update their own flags (the
-- original callbacks assigned to 'trace' instead, so enabling
-- "publications.duplicates" or "publications.strings" flipped the wrong
-- variable and the dedicated flags could never be changed).
local trace             = false  trackers.register("publications",            function(v) trace = v end)
local trace_duplicates  = true   trackers.register("publications.duplicates", function(v) trace_duplicates = v end)
local trace_strings     = false  trackers.register("publications.strings",    function(v) trace_strings = v end)
51
-- Reporters for regular, duplicate-related and shortcut-string messages.
local report            = logs.reporter("publications")
local report_duplicates = logs.reporter("publications","duplicates")
local report_strings    = logs.reporter("publications","strings")

local allocate          = utilities.storage.allocate

local commands          = commands
local implement         = interfaces and interfaces.implement

-- The public namespace; existing subtables survive a reload.

publications            = publications or { }
local publications      = publications

local datasets          = publications.datasets or { }
publications.datasets   = datasets

local writers           = publications.writers or { }
publications.writers    = writers

local tables            = publications.tables or { }
publications.tables     = tables

publications.statistics = publications.statistics or { }
local publicationsstats = publications.statistics

local loaders           = publications.loaders or { }
publications.loaders    = loaders

local casters           = { }
publications.casters    = casters

-- local sorters           = { }
-- publications.sorters    = sorters
--
-- local indexers          = { }
-- publications.indexers   = indexers

local components        = { }
publications.components = components -- register components

local enhancers         = publications.enhancers or { }
publications.enhancers  = enhancers

-- A sequencer through which postprocessing steps on a dataset are run.
local enhancer          = publications.enhancer or utilities.sequencers.new { arguments = "dataset" }
publications.enhancer   = enhancer

utilities.sequencers.appendgroup(enhancer,"system") -- private

-- Global counters, reported in the statistics at the end of a run.
publicationsstats.nofbytes       = 0
publicationsstats.nofdefinitions = 0
publicationsstats.nofshortcuts   = 0
publicationsstats.nofdatasets    = 0
103
-- Fields that are administrative and never rendered as bibliography data.
local privates = allocate {
    category      = true,
    tag           = true,
    index         = true,
    suffix        = true,
    specification = true,
}

-- Fields with special meaning that get dedicated treatment elsewhere.
local specials = allocate {
    key      = true,
    crossref = true,
    keywords = true,
    language = true,
    comment  = true,
}

-- Fields that are always present (implicitly) in every category.
local implicits = allocate {
    category = "implicit",
    tag      = "implicit",
    key      = "implicit",
    keywords = "implicit",
    language = "implicit",
    crossref = "implicit",
}

-- The possible origins of a field in a category specification; checkfield
-- iterates over these in this order (later entries win in the field map).
local origins = allocate {
    "optional",
    "extra",
    "required",
    "virtual",
}

-- Default virtual (computed) fields, used when a specification asks for
-- virtuals without listing its own.
local virtuals = allocate {
    "authoryear",
    "authoryears",
    "authornum",
    "num",
    "suffix",
}

-- Default field-name -> datatype mapping (used by the casters).
local defaulttypes = allocate {
    author     = "author",
    editor     = "author",
    translator = "author",
 -- publisher  = "author",
    page       = "pagenumber",
    pages      = "pagenumber",
    keywords   = "keyword",
    doi        = "url",
    url        = "url",
}

-- Default field sets: one logical name covering several concrete fields.
local defaultsets = allocate {
    page = { "page", "pages" },
}

tables.implicits = implicits
tables.origins   = origins
tables.virtuals  = virtuals
tables.types     = defaulttypes
tables.sets      = defaultsets
tables.privates  = privates
tables.specials  = specials
167
-- Interface variables; outside a full ConTeXt run the identity metatable
-- makes variables.foo == "foo".
local variables  = interfaces and interfaces.variables or setmetatableindex("self")

local v_all      = variables.all
local v_default  = variables.default

-- Placeholder so this module works before the citation subsystem plugs in
-- its own tracker of used entries.
if not publications.usedentries then
    function publications.usedentries()
        return { }
    end
end

local xmlplaceholder = "<?xml version='1.0' standalone='yes'?>\n<bibtex></bibtex>"

-- The month shortcuts that traditional bibtex predefines.
local defaultshortcuts = allocate {
    jan =  "1",
    feb =  "2",
    mar =  "3",
    apr =  "4",
    may =  "5",
    jun =  "6",
    jul =  "7",
    aug =  "8",
    sep =  "9",
    oct = "10",
    nov = "11",
    dec = "12",
}

-- Splitter for combined tags like "parent + child1 + child2".
local space      = p_whitespace^0
local separator  = space * "+" * space
local p_splitter = lpeg.tsplitat(separator)
199
-- __index handler: a field we have not seen before is classified as
-- "extra" and the answer is cached on the table so the lookup only
-- happens once per key.
local unknownfield = function(t,k)
    t[k] = "extra"
    return "extra"
end
205
-- __index handler for category tables: an unknown category gets a
-- permissive descriptor that accepts and remembers any field.
local unknowncategory = function(t,k)
    local v = {
        required = false,
        optional = false,
        virtual  = false,
        fields   = setmetatableindex(unknownfield), -- this will remember them
        -- NOTE(review): 'unknowntypes' is not defined anywhere in this file
        -- (the function below is 'unknowntype'), so this is nil -- confirm
        -- whether a setmetatableindex(unknowntype) table was intended.
        types    = unknowntypes,
        sets     = setmetatableindex(defaultsets),  -- new, but rather small
    }
    t[k] = v
    return v
end
218
-- __index handler: a field without a declared datatype defaults to
-- "string"; the default is cached on first access.
local unknowntype = function(t,k)
    local fallback = "string"
    t[k] = fallback
    return fallback
end
224
-- Fallback specification used when no (valid) publ-imp-<name>.lua file
-- can be loaded.
local default = {
    -- NOTE(review): 'name' is an undefined global here, so this field is
    -- nil -- confirm whether a literal string was intended.
    name       = name,
    version    = "1.00",
    comment    = "unknown specification.",
    author     = "anonymous",
    copyright  = "no one",
    categories = setmetatableindex(unknowncategory),
    types      = setmetatableindex(defaulttypes,unknowntype),
}
234
235-- maybe at some point we can have a handlers table with per field
236-- a found, fetch, ... method
237
-- Normalize one category descriptor: build the field -> origin map (on
-- top of the implicit fields), expand set names to their member fields,
-- and guarantee that every origin list exists. The descriptor is
-- modified in place and returned.
local function checkfield(specification,category,data)
    local fields  = setmetatableindex({ },implicits)
    data.fields   = fields
    data.category = category
    local sets    = data.sets or { }
    for o=1,#origins do
        local origin = origins[o]
        local listed = data[origin]
        if not listed then
            data[origin] = { }
        else
            for f=1,#listed do
                local field = listed[f]
                field = sets[field] or field
                if type(field) == "table" then
                    -- a set: every member maps to this origin
                    for m=1,#field do
                        fields[field[m]] = origin
                    end
                else
                    fields[field] = origin
                end
            end
        end
    end
    return data
end
264
-- Lazy loader/cache of dataset specifications: indexing with a name loads
-- "publ-imp-<name>.lua", normalizes it and caches the result; missing or
-- invalid files fall back to the default specification.
local specifications = setmetatableindex(function(t,name)
    if not name then
        return default -- initializer
    end
    local filename = formatters["publ-imp-%s.lua"](name)
    local fullname = resolvers.findfile(filename) or ""
    if fullname == "" then
        report("no data definition file %a for %a",filename,name)
        t[name] = default
        return default
    end
    local specification = table.load(fullname)
    if not specification then
        report("invalid data definition file %a for %a",fullname,name)
        t[name] = default
        return default
    end
    --
    local categories = specification.categories
    if not categories then
        categories = { }
        specification.categories = categories
    end
    setmetatableindex(categories,unknowncategory)
    --
    local types = specification.types
    if not types then
        types = defaulttypes
        specification.types = types
    end
    setmetatableindex(types,unknowntype)
    --
    local fields = setmetatableindex(unknownfield)
    specification.fields = fields
    --
    local virtual = specification.virtual
    if virtual == nil then -- so false is valid
        virtual = { }
    elseif virtual == false then
        virtual = { }
    elseif type(virtual) ~= "table" then
        -- fix: was 'type(virtual) ~= table' which compared the type string
        -- against the table library (always true), so a specification's own
        -- virtual list was silently replaced by the defaults
        virtual = virtuals
    end
    specification.virtual = virtual
    specification.virtualfields = tohash(virtual)
    --
    for category, data in next, categories do
        categories[category] = checkfield(specification,category,copy(data)) -- we make sure we have no clones
    end
    --
    t[name] = specification
    --
    return specification
end)
319
publications.specifications = specifications

-- Install (or replace) one category in a specification; the descriptor
-- is normalized by checkfield first.
function publications.setcategory(target,category,data)
    local spec = specifications[target]
    spec.categories[category] = checkfield(spec,category,data)
end
326
-- Handle a combined tag like "parent+child1+child2": register each child
-- with the parent's detail record (each child at most once, and only if
-- it does not already have another parent) and return the parent tag.
-- A plain tag is returned unchanged; errors fall back to "".
function publications.parenttag(dataset,tag)
    if not dataset or not tag then
        report("error in specification, dataset %a, tag %a",dataset,tag)
    elseif find(tag,"+",1,true) then
        local tags    = lpegmatch(p_splitter,tag)
        local parent  = tags[1]
        local current = datasets[dataset]
        local luadata = current.luadata
        local details = current.details
        local first   = luadata[parent]
        if first then
            local detail   = details[parent]
            local children = detail.children
            if not children then
                children = { }
                detail.children = children
            end
            -- add new ones but only once
            for i=2,#tags do
                local tag = tags[i]
                for j=1,#children do
                    if children[j] == tag then
                        tag = false -- already registered, skip below
                    end
                end
                if tag then
                    local entry = luadata[tag]
                    if entry then
                        local detail = details[tag]
                        children[#children+1] = tag
                        if detail.parent then
                            report("error in combination, dataset %a, tag %a, parent %a, ignored %a",dataset,tag,detail.parent,parent)
                        else
                            report("combining, dataset %a, tag %a, parent %a",dataset,tag,parent)
                            detail.parent = parent
                        end
                    end
                end
            end
            return parent
        end
    end
    return tag or ""
end
371
-- Create a fresh dataset table. The xml view starts out as an empty
-- placeholder document; the lua view of the entries lives in 'luadata'.
function publications.new(name)
    publicationsstats.nofdatasets = publicationsstats.nofdatasets + 1
    local dataset = {
        name       = name or "dataset " .. publicationsstats.nofdatasets,
        nofentries = 0,
        shortcuts  = { },
        luadata    = { },
        suffixes   = { },
        xmldata    = xmlconvert(xmlplaceholder),
        details    = { },
        ordered    = { },
        nofbytes   = 0,
        entries    = nil, -- empty == all
        sources    = { },
        loaded     = { },
        fields     = { },
        userdata   = { },
        used       = { },
        commands   = { }, -- for statistical purposes
        citestate  = { },
        status     = {
            resources = false,
            userdata  = false,
        },
        specifications = {
            -- used specifications
        },
        suffixed   = false,
    }
    -- we delay details till we need it (maybe we just delay the
    -- individual fields but that is tricky as there can be some
    -- dependencies)
    return dataset
end
406
-- Accessing datasets[name] creates the dataset on demand. Indexing with
-- a table hands that table back unchanged, so this accessor doubles as a
-- "resolve name or pass through" checker.
setmetatableindex(datasets,function(t,k)
    if type(k) ~= "table" then
        local created = publications.new(k)
        t[k] = created
        return created
    end
    return k -- so we can use this accessor as checker
end)
416
-- Return the (stable) entry index for a tag: a known tag keeps its stored
-- index, a new tag claims the next free slot.
local function getindex(dataset,luadata,tag)
    local found = luadata[tag]
    if found then
        local index = found.index or 0
        -- NOTE(review): this branch maps tag -> index while the branch
        -- below maps index -> tag in the same 'ordered' table; the
        -- asymmetry may be intentional (re-registration) but is worth
        -- confirming against the consumers of 'ordered'.
        dataset.ordered[tag] = index
        return index
    else
        local index = dataset.nofentries + 1
        dataset.nofentries = index
        dataset.ordered[index] = tag
        return index
    end
end

publications.getindex = getindex
432
433do
434
    -- we apply some normalization

    local space     = S(" \t\n\r\f") -- / " "
    local collapsed = space^1/" "
    local csletter  = lpegpatterns.csletter or R("az","AZ")

    ----- command   = P("\\") * Cc("btxcmd{") * (R("az","AZ")^1) * Cc("}")
    ----- command   = P("\\") * (Carg(1) * C(R("az","AZ")^1) / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    ----- command   = P("\\") * (Carg(1) * C(R("az","AZ")^1) * space^0 / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    -- A TeX control sequence \name becomes btxcmd{name}; its usage is
    -- counted in the list passed as match argument (dataset.commands).
    local command   = P("\\") * (Carg(1) * C(csletter^1) * space^0 / function(list,c) list[c] = (list[c] or 0) + 1 return "btxcmd{" .. c .. "}" end)
    local whatever  = P("\\") * P(" ")^1 / " "
    -----           + P("\\") * ( P("hbox") + P("raise") ) -- bah -- no longer
    local somemath  = P("$") * ((1-P("$"))^1) * P("$") -- let's not assume nested math
    ----- character = lpegpatterns.utf8character
    local any       = P(1)
    local done      = P(-1)
 -- local one_l     = P("{")  / ""
 -- local one_r     = P("}")  / ""
 -- local two_l     = P("{{") / ""
 -- local two_r     = P("}}") / ""
    local zero_l_r  = P("{}") / "" * #P(1)
    local special   = P("#")  / "\\letterhash "

    -- filter_1 is a cheap test (does the value contain anything that needs
    -- work at all); filter_2 does the actual substitution.
    local filter_0  = S('\\{}#')
    local filter_1  = (1-filter_0)^0 * filter_0
    local filter_2  = Cs(
    -- {{...}} ... {{...}}
    --     two_l * (command + special + any - two_r - done)^0 * two_r * done +
    --     one_l * (command + special + any - one_r - done)^0 * one_r * done +
                (
                    somemath +
                    whatever +
                    command +
                    special +
                    collapsed +
                    zero_l_r +
                    any
                )^0
    )
474
475    -- Currently we expand shortcuts and for large ones (like the acknowledgements
476    -- in tugboat.bib) this is not that efficient. However, eventually strings get
477    -- hashed again.
478
479    local function do_shortcut(key,value,dataset)
480        publicationsstats.nofshortcuts = publicationsstats.nofshortcuts + 1
481        dataset.shortcuts[key] = value
482    end
483
484    -- todo: categories : metatable that lowers and also counts
485    -- todo: fields     : metatable that lowers
486
    -- Per-tag counters (tag -> dataset name -> count), used to generate a
    -- unique hashtag when the same tag occurs more than once.
    local tags = table.setmetatableindex("table")

    -- When true, crossref inheritance happens lazily via __index metatables;
    -- when false, crossrefs are counted and expanded after loading.
    local indirectcrossrefs = true
490
    -- Handle one parsed @category{tag, key = value, ...} entry: normalize
    -- field names, filter/normalize values, set up (lazy) crossref
    -- inheritance and store the entry in dataset.luadata. Duplicate tags
    -- get a "-n" suffixed hashtag so nothing is lost.
    local function do_definition(category,tag,tab,dataset)
        publicationsstats.nofdefinitions = publicationsstats.nofdefinitions + 1
        if tag == "" then
            tag = "no-tag-set"
        end
        local fields  = dataset.fields
        local luadata = dataset.luadata
        local hashtag = tag
        if luadata[tag] then
            -- duplicate tag: derive a unique hashtag from the per-dataset count
            local t = tags[tag]
            local d = dataset.name
            local n = (t[d] or 0) + 1
            t[d] = n
            hashtag = tag .. "-" .. n
            if trace_duplicates then
                local p = { }
                for k, v in sortedhash(t) do
                    p[#p+1] = formatters["%s:%s"](k,v)
                end
                report_duplicates("tag %a is present multiple times: % t, assigning hashtag %a",tag,p,hashtag)
            end
        end
        local index  = getindex(dataset,luadata,hashtag)
        local entries = {
            category = lower(category),
            tag      = tag,
            index    = index,
        }
        -- tab holds alternating key/value pairs from the grammar capture
        for i=1,#tab,2 do
            local original   = tab[i]
            local normalized = fields[original]
            if not normalized then
                normalized = lower(original) -- we assume ascii fields
                fields[original] = normalized
            end
         -- if entries[normalized] then
            if rawget(entries,normalized) then
                if trace_duplicates then
                    report_duplicates("redundant field %a is ignored for tag %a in dataset %a",normalized,tag,dataset.name)
                end
            else
                local value = tab[i+1]
                value = textoutf(value)
                if lpegmatch(filter_1,value) then
                    value = lpegmatch(filter_2,value,1,dataset.commands) -- we need to start at 1 for { }
                end
                if normalized == "crossref" then
                    if indirectcrossrefs then
                        -- lazy inheritance: first missing-field access binds
                        -- this entry's metatable to the parent entry
                        setmetatableindex(entries,function(t,k)
                            local parent = rawget(luadata,value)
                            if parent == entries then
                                report_duplicates("bad parent %a for %a in dataset %s",value,hashtag,dataset.name)
                                setmetatableindex(entries,nil)
                                return entries
                            elseif parent then
                                setmetatableindex(entries,parent)
                                return entries[k]
                            else
                                report_duplicates("no valid parent %a for %a in dataset %s",value,hashtag,dataset.name)
                                setmetatableindex(entries,nil)
                            end
                        end)
                    else
                        dataset.nofcrossrefs = dataset.nofcrossrefs +1
                    end
                end
                entries[normalized] = value
            end
        end
        luadata[hashtag] = entries
    end
562
563    local f_invalid = formatters["<invalid: %s>"]
564
565    local function resolve(s,dataset)
566        local e = dataset.shortcuts[s]
567        if e then
568            if trace_strings then
569                report_strings("%a resolves to %a",s,e)
570            end
571            return e
572        end
573        e = defaultshortcuts[s]
574        if e then
575            if trace_strings then
576                report_strings("%a resolves to default %a",s,e)
577            end
578            return e
579        end
580        if tonumber(s) then
581            return s
582        end
583        report("error in database, invalid value %a",s)
584        return f_invalid(s)
585    end
586
    -- A @comment entry can carry a directive like "message : text"; split
    -- the keyword from the payload.
    local pattern = p_whitespace^0
                  * C(P("message") + P("warning") + P("error") + P("comment")) * p_whitespace^0 * P(":")
                  * p_whitespace^0
                  * C(P(1)^1)

    -- Report every line of a recognized @comment payload, prefixed with
    -- the (base) filename of the dataset source.
    local function do_comment(s,dataset)
        local how, what = lpegmatch(pattern,s)
        if how and what then
            local t = string.splitlines(utilities.strings.striplines(what))
            local b = file.basename(dataset.fullname or dataset.name or "unset")
            for i=1,#t do
                report("%s > %s : %s",b,how,t[i])
            end
        end
    end
602
    -- The bibtex grammar proper. Values can be {braced}, "quoted",
    -- 'quoted', numbers, or shortcut references, concatenated with '#'.
    local percent    = P("%")
    local start      = P("@")
    local comma      = P(",")
    local hash       = P("#")
    local escape     = P("\\")
    local single     = P("'")
    local double     = P('"')
    local left       = P('{')
    local right      = P('}')
    local both       = left + right
    local lineending = S("\n\r")
    local space      = S(" \t\n\r\f") -- / " "
    local spacing    = space^0
    local equal      = P("=")
    ----- collapsed  = (space^1)/ " "
    local collapsed  = p_whitespace^1/" "
    local nospaces   = p_whitespace^1/""

    local p_left     = (p_whitespace^0 * left) / ""
    local p_right    = (right * p_whitespace^0) / ""

    local keyword    = C((R("az","AZ","09") + S("@_:-"))^1)
    local key        = C((1-space-equal)^1)
    local tag        = C((1-space-comma)^0)
    local category   = C((1-space-left)^1)
    local s_quoted   = ((escape*single) + collapsed + (1-single))^0
    local d_quoted   = ((escape*double) + collapsed + (1-double))^0

    -- Shortcut reference inside a braced value: @{NAME}. This 'reference'
    -- and 'r_value' are intentionally redefined further down for the
    -- bare-word variant.
    local reference  = P("@{") * C((R("az","AZ","09") + S("_:-"))^1) * P("}")
    local r_value    = reference * Carg(1) / resolve

    -- Balanced {...} content (rule 2 recurses via V(1)); embedded @{...}
    -- references are resolved on the fly.
    local balanced   = P {
        ((escape * (left+right)) + (collapsed + r_value + 1 - (left+right))^1 + V(2))^0,
        left * V(1) * right,
    }

 -- local unbalanced = P {
 --     left * V(2) * right,
 --     ((escape * (left+right)) + (collapsed + 1 - (left+right))^1 + V(1))^0,
 -- }

    local unbalanced = (left/"") * balanced * (right/"") * P(-1)

    local reference  = C((R("az","AZ","09") + S("_:-"))^1)
    local b_value    = p_left * balanced * p_right
    local s_value    = (single/"") * (unbalanced + s_quoted) * (single/"")
    local d_value    = (double/"") * (unbalanced + d_quoted) * (double/"")
    local r_value    = P("@") * reference * Carg(1) / resolve
                     +          reference * Carg(1) / resolve
    local n_value    = C(R("09")^1)

    -- last resort: everything up to the next ',' or '}' goes through resolve
    local e_value    = Cs((left * balanced * right + (1 - S(",}")))^0) * Carg(1) / function(s,dataset)
        return resolve(s,dataset)
    end

    local somevalue  = d_value + b_value + s_value + r_value + n_value + e_value
    local value      = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0))

    local stripper   = lpegpatterns.collapser
    local stripped   = value / function(s) return lpegmatch(stripper,s) end

    -- '%' starts a comment that runs to the end of the line
    local forget     = percent^1 * (1-lineending)^0
    local spacing    = spacing * forget^0 * spacing
    local replacement= spacing * key * spacing * equal * spacing * value    * spacing
    local assignment = spacing * key * spacing * equal * spacing * stripped * spacing
    local definition = category * spacing * left * spacing * tag * spacing * comma * Ct((assignment * comma^0)^0) * spacing * right * Carg(1) / do_definition

    -- @string and @comment are matched case-insensitively via Cmt
    local crapword   = C((1-space-left)^1)
    local shortcut   = Cmt(crapword,function(_,p,s) return lower(s) == "string"  and p end) * spacing * left * ((replacement * Carg(1))/do_shortcut * comma^0)^0  * spacing * right
    local comment    = Cmt(crapword,function(_,p,s) return lower(s) == "comment" and p end) * spacing * lpegpatterns.argument * Carg(1) / do_comment

    local casecrap   = #S("sScC") * (shortcut + comment)

    local bibtotable = (space + forget + P("@") * (casecrap + definition) + 1)^0
677
678    -- todo \%
679
680    -- loadbibdata  -> dataset.luadata
681    -- loadtexdata  -> dataset.luadata
682    -- loadluadata  -> dataset.luadata
683
684    -- converttoxml -> dataset.xmldata from dataset.luadata
685
686    -- author = "al-" # @AHSAI # "," # @SHAYKH # " " # @AHMAD # " Ibn " # @ZAYNIDDIN
687    -- author = {al-@{AHSAI}, @{SHAYKH} @{AHMAD} Ibn @{ZAYNIDDIN}}
688
689    function publications.loadbibdata(dataset,content,source,kind)
690        if not source then
691            report("invalid source for dataset %a",dataset)
692            return
693        end
694        local current = datasets[dataset]
695        local size = #content
696        if size == 0 then
697            report("empty source %a for dataset %a",source,current.name)
698        else
699            report("adding bib data to set %a from source %a",current.name,source)
700        end
701        statistics.starttiming(publications)
702        publicationsstats.nofbytes = publicationsstats.nofbytes + size
703        current.nofbytes = current.nofbytes + size
704        current.nofcrossrefs = 0
705        if source then
706            table.insert(current.sources, { filename = source, checksum = md5.HEX(content) })
707            current.loaded[source] = kind or true
708        end
709        local luadata = current.luadata
710        current.newtags = #luadata > 0 and { } or current.newtags
711        lpegmatch(bibtotable,content or "",1,current)
712        if current.nofcrossrefs > 0 then
713            for tag, entries in next, luadata do
714                local value = entries.crossref
715                if value then
716                    local parent = luadata[value]
717                    if parent == entries then
718                        report_duplicates("bad parent %a for %a in dataset %s",value,hashtag,dataset.name)
719                    elseif parent then
720                        local t = { }
721                        for k, v in next, parent do
722                            if not entries[k] then
723                                entries[k] = v
724                                t[#t+1] = k
725                            end
726                        end
727                        sort(t)
728                        entries.inherited = concat(t,",")
729                    else
730                        report_duplicates("no valid parent %a for %a in dataset %s",value,hashtag,dataset.name)
731                    end
732                end
733            end
734        end
735        statistics.stoptiming(publications)
736    end
737
738end
739
740do
741
742    -- we could use xmlescape again
743
    -- Escape the xml-special characters; cleaner_1 is a cheap "contains
    -- any?" test, cleaner_2 performs the substitution.
    local cleaner_0 = S('<>&')
    local cleaner_1 = (1-cleaner_0)^0 * cleaner_0
    local cleaner_2 = Cs ( (
        P("<") / "&lt;" +
        P(">") / "&gt;" +
        P("&") / "&amp;" +
        P(1)
    )^0)

    local compact = false -- can be a directive but then we also need to deal with newlines ... not now
754
    -- Convert a dataset's luadata (or an explicit 'subset') to an xml
    -- string. With 'nice' the output is indented and sorted; 'usedonly'
    -- restricts the export to cited entries; 'rawtoo' adds the original
    -- bibtex as CDATA; unless 'dontstore', the result also replaces
    -- current.xmldata (and is registered with lxml when available).
    -- Returns the xml string and the number of exported entries.
    function publications.converttoxml(dataset,nice,dontstore,usedonly,subset,noversion,rawtoo) -- we have fields !
        local current = datasets[dataset]
        local luadata = subset or (current and current.luadata)
        if luadata then
            statistics.starttiming(publications)
            --
            local result, r, n = { }, 0, 0
            if usedonly then
                usedonly = publications.usedentries()
                usedonly = usedonly[current.name]
            end
            --
            r = r + 1 ; result[r] = "<?xml version='1.0' standalone='yes'?>"
            r = r + 1 ; result[r] = formatters["<bibtex dataset='%s'>"](current.name)
            --
            if nice then -- will be default
                local f_entry_start = formatters[" <entry tag='%s' category='%s' index='%s'>"]
                local s_entry_stop  = " </entry>"
                local f_field       = formatters["  <field name='%s'>%s</field>"]
                local f_cdata       = formatters["  <field name='rawbibtex'><![CDATA[%s]]></field>"]

                for tag, entry in sortedhash(luadata) do
                    if not usedonly or usedonly[tag] then
                        r = r + 1 ; result[r] = f_entry_start(tag,entry.category,entry.index)
                        for key, value in sortedhash(entry) do
                            if key ~= "tag" and key ~= "category" and key ~= "index" then
                                if lpegmatch(cleaner_1,value) then
                                    value = lpegmatch(cleaner_2,value)
                                end
                                if value ~= "" then
                                    r = r + 1 ; result[r] = f_field(key,value)
                                end
                            end
                        end
                        if rawtoo then
                            -- round-trip through the bib saver for the raw view
                            local s = publications.savers.bib(current,false,{ [tag] = entry })
                            s = utilities.strings.striplines(s,"prune and collapse")
                            r = r + 1 ; result[r] = f_cdata(s)
                        end
                        r = r + 1 ; result[r] = s_entry_stop
                        n = n + 1
                    end
                end
            else
                local f_entry_start = formatters["<entry tag='%s' category='%s' index='%s'>"]
                local s_entry_stop  = "</entry>"
                local f_field       = formatters["<field name='%s'>%s</field>"]
                for tag, entry in next, luadata do
                    if not usedonly or usedonly[tag] then
                        r = r + 1 ; result[r] = f_entry_start(entry.tag,entry.category,entry.index)
                        for key, value in next, entry do
                            if key ~= "tag" and key ~= "category" and key ~= "index" then
                                if lpegmatch(cleaner_1,value) then
                                    value = lpegmatch(cleaner_2,value)
                                end
                                if value ~= "" then
                                    r = r + 1 ; result[r] = f_field(key,value)
                                end
                            end
                        end
                        r = r + 1 ; result[r] = s_entry_stop
                        n = n + 1
                    end
                end
            end
            --
            r = r + 1 ; result[r] = "</bibtex>"
            --
            -- 'noversion' drops the <?xml ...?> declaration line
            result = concat(result,nice and "\n" or nil,noversion and 2 or 1,#result)
            --
            if dontstore then
                -- indeed
            else
                statistics.starttiming(xml)
                current.xmldata = xmlconvert(result, {
                    resolve_entities            = true,
                    resolve_predefined_entities = true, -- in case we have escaped entities
                 -- unify_predefined_entities   = true, -- &#038; -> &amp;
                    utfize_entities             = true,
                } )
                statistics.stoptiming(xml)
                if lxml then
                    lxml.register(formatters["btx:%s"](current.name),current.xmldata)
                end
            end
            statistics.stoptiming(publications)
            return result, n
        end
    end
844
845end
846
847do
848
849    local function resolvedname(dataset,filename)
850        local current = datasets[dataset]
851        if type(filename) ~= "string" then
852            report("invalid filename %a",tostring(filename))
853        end
854        local fullname = resolvers.findfile(filename,"bib")
855        if fullname == "" then
856            fullname = resolvers.findfile(filename) -- let's not be too picky
857        end
858        if not fullname or fullname == "" then
859            report("no file %a",filename)
860            current.fullname = filename
861            return current, false
862        else
863            current.fullname = fullname
864            return current, fullname
865        end
866    end
867
868    publications.resolvedname = resolvedname
869
870    local cleaner = false
871    local cleaned = false
872
873    function loaders.registercleaner(what,fullname)
874        if not fullname or fullname == "" then
875            report("no %s file %a",what,fullname)
876            return
877        end
878        local list = table.load(fullname)
879        if not list then
880            report("invalid %s file %a",what,fullname)
881            return
882        end
883        list = list.replacements
884        if not list then
885            report("no replacement table in %a",fullname)
886            return
887        end
888        if cleaned then
889            report("adding replacements from %a",fullname)
890            for k, v in next, list do
891                cleaned[k] = v
892            end
893        else
894            report("using replacements from %a",fullname)
895            cleaned = list
896        end
897        cleaner = true
898    end
899
900    function loaders.bib(dataset,filename,kind)
901        local dataset, fullname = resolvedname(dataset,filename)
902        if not fullname then
903            return
904        end
905        local data = io.loaddata(fullname) or ""
906        if data == "" then
907            report("empty file %a, nothing loaded",fullname)
908            return
909        end
910        if cleaner == true then
911            cleaner = Cs((lpeg.utfchartabletopattern(keys(cleaned)) / cleaned + p_utf8character)^1)
912        end
913        if cleaner ~= false then
914            data = lpegmatch(cleaner,data)
915        end
916        if trace then
917            report("loading file %a",fullname)
918        end
919        publications.loadbibdata(dataset,data,fullname,kind)
920    end
921
922    function loaders.lua(dataset,filename,loader) -- if filename is a table we load that one
923        local current, data, fullname
924        if type(filename) == "table" then
925            current = datasets[dataset]
926            data    = filename
927        else
928            dataset, fullname = resolvedname(dataset,filename)
929            if not fullname then
930                return
931            end
932            current = datasets[dataset]
933            data    = (loader or table.load)(fullname)
934        end
935        if data then
936            local luadata = current.luadata
937            -- we want the same index each run
938            for tag, entry in sortedhash(data) do
939                if type(entry) == "table" then
940                    entry.index  = getindex(current,luadata,tag)
941                    entry.tag    = tag
942                    luadata[tag] = entry -- no cleaning yet
943                end
944            end
945        end
946    end
947
948    function loaders.json(dataset,filename)
949        loaders.lua(dataset,filename,utilities.json.load)
950    end
951
952    function loaders.buffer(dataset,name) -- if filename is a table we load that one
953        local current  = datasets[dataset]
954        local barename = file.removesuffix(name)
955        local data     = buffers.getcontent(barename) or ""
956        if data == "" then
957            report("empty buffer %a, nothing loaded",barename)
958            return
959        end
960        if trace then
961            report("loading buffer",barename)
962        end
963        publications.loadbibdata(current,data,barename,"bib")
964    end
965
966    function loaders.xml(dataset,filename)
967        local dataset, fullname = resolvedname(dataset,filename)
968        if not fullname then
969            return
970        end
971        local current = datasets[dataset]
972        local luadata = current.luadata
973        local root    = xml.load(fullname)
974        for bibentry in xmlcollected(root,"/bibtex/entry") do
975            local attributes = bibentry.at
976            local tag        = attributes.tag
977            local entry      = {
978                category = attributes.category,
979                tag      = tag, -- afterwards also set, to prevent overload
980                index    = 0,   -- prelocated
981            }
982            for field in xmlcollected(bibentry,"/field") do
983                entry[field.at.name] = field.dt[1] -- no cleaning yet | xmltext(field)
984            end
985            entry.index  = getindex(current,luadata,tag)
986            entry.tag    = tag
987            luadata[tag] = entry
988        end
989    end
990
991    setmetatableindex(loaders,function(t,filetype)
992        local v = function(dataset,filename)
993            report("no loader for file %a with filetype %a",filename,filetype)
994        end
995        t[filetype] = v
996        return v
997    end)
998
999    local done = setmetatableindex("table")
1000
1001    function publications.load(specification)
1002        local name     = specification.dataset or v_default
1003        local current  = datasets[name]
1004        local files    = settings_to_array(specification.filename)
1005        local kind     = specification.kind
1006        local dataspec = specification.specification
1007        statistics.starttiming(publications)
1008        local somedone = false
1009        for i=1,#files do
1010            local filetype, filename = string.splitup(files[i],"::")
1011            if not filename then
1012                filename = filetype
1013                filetype = file.suffix(filename)
1014            end
1015            if filename then
1016                if not filetype or filetype == "" then
1017                    filetype = "bib"
1018                end
1019                if file.suffix(filename) == "" then
1020                    file.addsuffix(filename,filetype)
1021                end
1022                if done[current][filename] then
1023                    report("file %a is already loaded in dataset %a",filename,name)
1024                else
1025                    loaders[filetype](current,filename)
1026                    done[current][filename] = true
1027                    somedone = true
1028                end
1029                if kind then
1030                    current.loaded[current.fullname or filename] = kind
1031                end
1032                if dataspec then
1033                    current.specifications[dataspec] = true
1034                end
1035            end
1036        end
1037        if somedone then
1038            local runner = enhancer.runner
1039            if runner then
1040                runner(current)
1041            end
1042        end
1043        statistics.stoptiming(publications)
1044        return current
1045    end
1046
1047end
1048
1049do
1050
1051    function enhancers.order(dataset)
1052        local luadata = dataset.luadata
1053        local ordered = dataset.ordered
1054        for i=1,#ordered do
1055            local tag = ordered[i]
1056            if type(tag) == "string" then
1057                ordered[i] = luadata[tag]
1058            end
1059        end
1060    end
1061
1062    function enhancers.details(dataset)
1063        local luadata = dataset.luadata
1064        local details = dataset.details
1065        for tag, entry in next, luadata do
1066            if not details[tag] then
1067                details[tag] = { }
1068            end
1069        end
1070    end
1071
    -- hook the above enhancers into the shared enhancer sequence (run after loading)
    utilities.sequencers.appendaction(enhancer,"system","publications.enhancers.order")
    utilities.sequencers.appendaction(enhancer,"system","publications.enhancers.details")
1074
1075end
1076
1077do
1078
1079    local checked  = function(s,d) d[s] = (d[s] or 0) + 1 end
1080    local checktex = ( (1-P("\\"))^1 + P("\\") * ((C(R("az","AZ")^1)  * Carg(1))/checked))^0
1081
1082    function publications.analyze(dataset)
1083        local current    = datasets[dataset]
1084        local data       = current.luadata
1085        local categories = { }
1086        local fields     = { }
1087        local commands   = { }
1088        for k, v in next, data do
1089            categories[v.category] = (categories[v.category] or 0) + 1
1090            for k, v in next, v do
1091                fields[k] = (fields[k] or 0) + 1
1092                lpegmatch(checktex,v,1,commands)
1093            end
1094        end
1095        current.analysis = {
1096            categories = categories,
1097            fields     = fields,
1098            commands   = commands,
1099        }
1100    end
1101
1102end
1103
1104function publications.tags(dataset)
1105    return sortedkeys(datasets[dataset].luadata)
1106end
1107
1108function publications.sortedentries(dataset)
1109    return sortedhash(datasets[dataset].luadata)
1110end
1111
1112-- a helper:
1113
1114function publications.concatstate(i,n)
1115    if i == 0 then
1116        return 0
1117    elseif i == 1 then
1118        return 1
1119    elseif i == 2 and n == 2 then
1120        return 4
1121    elseif i == n then
1122        return 3
1123    else
1124        return 2
1125    end
1126end
1127
1128-- savers
1129
1130do
1131
1132    local savers = { }
1133
1134    local s_preamble = [[
1135% this is an export from context mkiv
1136
1137@preamble{
1138    \ifdefined\btxcmd
1139        % we're probably in context
1140    \else
1141        \def\btxcmd#1{\begincsname#1\endcsname}
1142    \fi
1143}
1144
1145]]
1146
1147    function savers.bib(dataset,filename,tobesaved)
1148        local f_start = formatters["@%s{%s,\n"]
1149        local f_field = formatters["  %s = {%s},\n"]
1150        local s_stop  = "}\n\n"
1151        local result  = { }
1152        local n, r = 0, 0
1153        for tag, data in sortedhash(tobesaved) do
1154            r = r + 1 ; result[r] = f_start(data.category or "article",tag)
1155            for key, value in sortedhash(data) do
1156                if not privates[key] then
1157                    r = r + 1 ; result[r] = f_field(key,value)
1158                end
1159            end
1160            r = r + 1 ; result[r] = s_stop
1161            n = n + 1
1162        end
1163        result = concat(result)
1164        if find(result,"\\btxcmd") then
1165            result = s_preamble .. result
1166        end
1167        if filename then
1168            report("%s entries from dataset %a saved in %a",n,dataset,filename)
1169            io.savedata(filename,result)
1170        else
1171            return result
1172        end
1173    end
1174
1175    function savers.lua(dataset,filename,tobesaved)
1176        local list = { }
1177        local n = 0
1178        for tag, data in next, tobesaved do
1179            local t = { }
1180            for key, value in next, data do
1181                if not privates[key] then
1182                    d[key] = value
1183                end
1184            end
1185            list[tag] = t
1186            n = n + 1
1187        end
1188        report("%s entries from dataset %a saved in %a",n,dataset,filename)
1189        table.save(filename,list)
1190    end
1191
    -- Save the entries as xml, delegating to the converter defined earlier
    -- in this file (converttoxml); rawtoo also includes raw field data.
    function savers.xml(dataset,filename,tobesaved,rawtoo)
        local result, n = publications.converttoxml(dataset,true,true,false,tobesaved,false,rawtoo)
        report("%s entries from dataset %a saved in %a",n,dataset,filename)
        io.savedata(filename,result)
    end
1197
1198    function publications.save(specification)
1199        local dataset   = specification.dataset
1200        local filename  = specification.filename
1201        local filetype  = specification.filetype
1202        local criterium = specification.criterium
1203        statistics.starttiming(publications)
1204        if not filename or filename == "" then
1205            report("no filename for saving given")
1206            return
1207        end
1208        if not filetype or filetype == "" then
1209            filetype = file.suffix(filename)
1210        end
1211        if not criterium or criterium == "" then
1212            criterium = v_all
1213        end
1214        local saver = savers[filetype]
1215        if saver then
1216            local current   = datasets[dataset]
1217            local luadata   = current.luadata or { }
1218            local tobesaved = { }
1219            local result  = structures.lists.filter({criterium = criterium, names = "btx"}) or { }
1220            for i=1,#result do
1221                local userdata = result[i].userdata
1222                if userdata then
1223                    local set = userdata.btxset or v_default
1224                    if set == dataset then
1225                        local tag = userdata.btxref
1226                        if tag then
1227                            tobesaved[tag] = luadata[tag]
1228                        end
1229                    end
1230                end
1231            end
1232            saver(dataset,filename,tobesaved)
1233        else
1234            report("unknown format %a for saving %a",filetype,dataset)
1235        end
1236        statistics.stoptiming(publications)
1237        return dataset
1238    end
1239
    publications.savers = savers -- public access to the saver table

    if implement then

        -- tex end interface for saving a dataset
        implement {
            name      = "btxsavedataset",
            actions   = publications.save,
            arguments = {
                {
                    { "dataset" },
                    { "filename" },
                    { "filetype" },
                    { "criterium" },
                }
            }
        }

    end
1258
1259end
1260
1261-- casters
1262
1263do
1264
1265    publications.detailed = setmetatableindex(function(detailed,kind)
1266        local values = setmetatableindex(function(values,value)
1267            local caster = casters[kind]
1268            local cast   = caster and caster(value) or value
1269            values[value] = cast
1270            return cast
1271        end)
1272        detailed[kind] = values
1273        return values
1274    end)
1275
    -- splits a keyword field at ; or , (grouped variant of the splitter)
    local keywordsplitter = utilities.parsers.groupedsplitat(";,")

    -- cast a keyword string into a list of keywords
    casters.keyword = function(str)
        return lpegmatch(keywordsplitter,str)
    end
1281
1282
1283    writers.keyword = function(k)
1284        if type(k) == "table" then
1285            return concat(p,";")
1286        else
1287            return k
1288        end
1289    end
1290
1291    local pagessplitter = lpeg.splitat((
1292        P("-") + -- hyphen
1293        P("") + -- U+2014
1294        P("") + -- U+2013
1295        P("")   -- U+2012
1296    )^1)
1297
1298    casters.range = function(str)
1299        local first, last = lpegmatch(pagessplitter,str)
1300        return first and last and { first, last } or str
1301    end
1302
1303    writers.range = function(p)
1304        if type(p) == "table" then
1305            return concat(p,"-")
1306        else
1307            return p
1308        end
1309    end
1310
    -- page numbers behave like ranges
    casters.pagenumber = casters.range
    writers.pagenumber = writers.range
1313
1314end
1315
if implement then

    -- expand a shortcut (bibtex @string) from a dataset at the tex end;
    -- unknown instances or keys render as "?"
    implement {
        name      = "btxshortcut",
        arguments = "2 strings",
        actions   = function(instance,key)
            local d = publications.datasets[instance]
            context(d and d.shortcuts[key] or "?")
        end,
    }

end
1328
1329-- inspect(publications.load { filename = "e:/tmp/oeps.bib" })
1330