if not modules then modules = { } end modules ['bibl-bib'] = {
    version   = 1.001,
    comment   = "this module is the basis for the lxml-* ones",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

-- This is a prelude to integrated bibliography support. This file just loads bibtex
-- files and converts them to xml so that we can access the content in a convenient
-- way. Actually handling the data takes place elsewhere.

-- Local aliases for the library functions used throughout this file.

local lower, format, gsub, concat = string.lower, string.format, string.gsub, table.concat
local next = next
local utfchar = utf.char
local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local textoutf = characters and characters.tex.toutf
local variables = interfaces and interfaces.variables
local settings_to_hash = utilities.parsers.settings_to_hash
local finalizers = xml.finalizers.tex
local xmlfilter, xmltext, getid = xml.filter, xml.text, lxml.getid
local formatters = string.formatters

local P, R, S, C, Cc, Cs, Ct = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cc, lpeg.Cs, lpeg.Ct

-- The tracker callback has to assign to the local declared here; assigning to any
-- other name creates a global and leaves the flag that bibtex.load tests untouched.

local trace_bibxml = false  trackers.register("publications.bibxml", function(v) trace_bibxml = v end)

local report_xml = logs.reporter("publications","xml")

-- The bibtex table is a global namespace; an already existing one is reused.

bibtex       = bibtex or { }
local bibtex = bibtex

-- Counters that are reported by the statistics hook further down in this file.

bibtex.statistics = bibtex.statistics or { }
local bibtexstats = bibtex.statistics

bibtexstats.nofbytes       = 0
bibtexstats.nofdefinitions = 0
bibtexstats.nofshortcuts   = 0

-- Fallback values for the month abbreviations, used when a bare keyword is not
-- found in the shortcuts of the current session.

local defaultshortcuts = {
    jan =  "1",
    feb =  "2",
    mar =  "3",
    apr =  "4",
    may =  "5",
    jun =  "6",
    jul =  "7",
    aug =  "8",
    sep =  "9",
    oct = "10",
    nov = "11",
    dec = "12",
}

-- These three are rebound to the tables of the active session by bibtex.convert.

local shortcuts = { }
local data      = { }
local entries

-- Currently we expand shortcuts and for large ones (like the acknowledgements
-- in tugboat.bib) this is not that efficient. However, eventually strings get
-- hashed again.
-- Grammar action for a shortcut block: tag is the leading keyword, key and value
-- come from the first assignment inside the braces. Every call is counted, but only
-- an "@string" tag (compared case insensitive) stores the value.

local function do_shortcut(tag,key,value)
    bibtexstats.nofshortcuts = bibtexstats.nofshortcuts + 1
    -- a block without any assignment yields no key and a nil index would raise an error
    if key and lower(tag) == "@string" then
        shortcuts[key] = value
    end
end

-- Grammar action for an entry: tag is the leading keyword, key the citation key and
-- tab a flat list { field, value, field, value, ... }. The entry is stored in
-- data[tag][key], unless the session has an entries filter that lacks the key.

local function do_definition(tag,key,tab) -- maybe check entries here (saves memory)
    if not entries or entries[key] then
        bibtexstats.nofdefinitions = bibtexstats.nofdefinitions + 1
        local t = { }
        for i=1,#tab,2 do
            t[tab[i]] = tab[i+1]
        end
        local p = data[tag]
        if not p then
            data[tag] = { [key] = t }
        else
            p[key] = t
        end
    end
end

-- Expands a bare keyword: session shortcuts first, then the default month
-- shortcuts, and otherwise the keyword itself.

local function resolve(s)
    return shortcuts[s] or defaultshortcuts[s] or s -- can be number
end

local percent    = P("%")
local start      = P("@")
local comma      = P(",")
local hash       = P("#")
local escape     = P("\\")
local single     = P("'")
local double     = P('"')
local left       = P('{')
local right      = P('}')
local both       = left + right
local lineending = S("\n\r")
local space      = S(" \t\n\r\f")
local spacing    = space^0
local equal      = P("=")
local collapsed  = (space^1)/ " " -- a run of whitespace becomes one space

----- function add(a,b) if b then return a..b else return a end end

local keyword    = C((R("az","AZ","09") + S("@_:-"))^1)  -- C((1-space)^1)
local s_quoted   = ((escape*single) + collapsed + (1-single))^0
local d_quoted   = ((escape*double) + collapsed + (1-double))^0
local balanced   = lpegpatterns.balanced

-- the delimiters themselves are replaced by nothing in the substitution capture

local s_value    = (single/"") * s_quoted * (single/"")
local d_value    = (double/"") * d_quoted * (double/"")
local b_value    = (left  /"") * balanced * (right /"")
local r_value    = keyword/resolve

-- a value is one or more pieces joined by #, the joiner and its spacing are dropped

local somevalue  = s_value + d_value + b_value + r_value
local value      = Cs((somevalue * ((spacing * hash * spacing)/"" * somevalue)^0))

local assignment = spacing * keyword * spacing * equal * spacing * value * spacing
local shortcut   = keyword * spacing * left * spacing * (assignment * comma^0)^0 * spacing * right
local definition = keyword * spacing * left * spacing * keyword * comma * Ct((assignment * comma^0)^0) * spacing * right
local comment    = keyword * spacing * left * (1-right)^0 * spacing * right
local forget     = percent^1 * (1-lineending)^0

-- todo \%

-- The alternatives are tried in this order; the final 1 skips any other character.

local grammar = (space + forget + shortcut/do_shortcut + definition/do_definition + comment + 1)^0

-- Parses the bibtex source in content into the tables of session. The byte counters
-- of the session and of the global statistics are increased by the content size.

function bibtex.convert(session,content)
    statistics.starttiming(bibtex)
    -- the guard has to come first as the length operator fails on nil
    content = content or ""
    data, shortcuts, entries = session.data, session.shortcuts, session.entries
    bibtexstats.nofbytes = bibtexstats.nofbytes + #content
    session.nofbytes = session.nofbytes + #content
    lpegmatch(grammar,content)
    statistics.stoptiming(bibtex)
end

-- Locates filename with the resolver (using "bib" as second argument), loads it and
-- feeds the content to bibtex.convert. A file that is not found is reported and
-- skipped, an empty file is reported too.

function bibtex.load(session,filename)
    statistics.starttiming(bibtex)
    local fullname = resolvers.findfile(filename,"bib")
    if fullname and fullname ~= "" then
        local content = io.loaddata(fullname) or ""
        if content == "" then
            report_xml("empty file %a, no conversion to xml",fullname)
        elseif trace_bibxml then
            report_xml("converting file %a to xml",fullname)
        end
        bibtex.convert(session,content)
    else
        report_xml("unknown file %a, nothing loaded",filename)
    end
    statistics.stoptiming(bibtex)
end

-- Returns a fresh session. The xml field starts out as an empty document and is
-- replaced by bibtex.toxml.

function bibtex.new()
    return {
        data      = { },
        shortcuts = { },
        -- NOTE(review): the markup in this literal was missing from the file at hand
        -- (only the "\n" was left), it is reconstructed to match the root element
        -- that bibtex.toxml writes; confirm against the upstream module
        xml       = xml.convert("<?xml version='1.0' standalone='yes'?>\n<bibtex></bibtex>"),
        nofbytes  = 0,
        entries   = nil,
        loaded    = false,
    }
end

local p_escaped = lpegpatterns.xml.escaped

-- Used as replacement table in gsub: a backslash followed by one of these letters
-- is written back with a doubled backslash in the Lua literal, i.e. as the same two
-- characters; any other backslash pair is kept by gsub as well.

local ihatethis = {
    f = "\\f",
    n = "\\n",
    r = "\\r",
    s = "\\s",
    t = "\\t",
    v = "\\v",
    z = "\\z",
}

-- A control word \foo becomes \bibtexcommand{foo}.

local command = P("\\")/"" * Cc("\\bibtexcommand{") * (R("az","AZ")^1) * Cc("}")
local any     = P(1)
local done    = P(-1)
local one_l   = P("{")  / ""
local one_r   = P("}")  / ""
local two_l   = P("{{") / ""
local two_r   = P("}}") / ""

-- Strips one or two levels of braces that wrap the complete value and rewrites
-- control words on the fly.

local filter = Cs(
    two_l * (command + any - two_r - done)^0 * two_r * done +
    one_l * (command + any - one_r - done)^0 * one_r * done +
            (command + any               )^0
)

-- Serializes the parsed data of session to xml and stores the converted tree in
-- session.xml. This happens only once per session (session.loaded) and afterwards
-- session.data and session.shortcuts are released. The options string is turned
-- into a hash: "convert" passes values through textoutf, "strip" applies the brace
-- filter.
--
-- The element and attribute names (bibtex, entry with tag, field with name) are
-- the ones that the xml lookups elsewhere in this file rely on.
--
-- NOTE(review): the markup inside the string literals was missing from the file at
-- hand and has been reconstructed. The "category" attribute name and the exact xml
-- declaration cannot be derived from this file; confirm against the upstream module.

function bibtex.toxml(session,options)
    if session.loaded then
        return
    else
        session.loaded = true
    end
    -- we can always speed this up if needed
    -- format slows down things a bit but who cares
    statistics.starttiming(bibtex)
    local result, r = { }, 0
    local options = settings_to_hash(options)
    local convert = options.convert -- todo: interface
    local strip   = options.strip   -- todo: interface
    local entries = session.entries
    r = r + 1 ; result[r] = "<?xml version='1.0' standalone='yes'?>"
    r = r + 1 ; result[r] = "<bibtex>"
    for id, categories in next, session.data do
        id = lower(gsub(id,"^@",""))
        for name, entry in next, categories do
            if not entries or entries[name] then
                r = r + 1 ; result[r] = formatters["<entry tag='%s' category='%s'>"](lower(name),id)
                for key, value in next, entry do
                    value = gsub(value,"\\(.)",ihatethis) -- this really needs checking
                    value = lpegmatch(p_escaped,value)
                    if value ~= "" then
                        if convert then
                            value = textoutf(value,true)
                        end
                        if strip then
                            -- as there is no proper namespace in bibtex we need this
                            -- kind of hackery ... bibtex databases are quite unportable
                            value = lpegmatch(filter,value) or value
                        end
                        r = r + 1 ; result[r] = formatters[" <field name='%s'>%s</field>"](key,value)
                    end
                end
                r = r + 1 ; result[r] = "</entry>"
            end
        end
    end
    r = r + 1 ; result[r] = "</bibtex>"
    result = concat(result,"\n")
    -- alternatively we could use lxml.convert
    session.xml = xml.convert(result, {
        resolve_entities            = true,
        resolve_predefined_entities = true, -- in case we have escaped entities
     -- unify_predefined_entities   = true, -- & -> &
        utfize_entities             = true,
    } )
    session.data      = nil
    session.shortcuts = nil
    statistics.stoptiming(bibtex)
end

-- Reports the accumulated counters, but only when something has been loaded.

statistics.register("bibtex load time", function()
    local nofbytes = bibtexstats.nofbytes
    if nofbytes > 0 then
        return format("%s seconds, %s bytes, %s definitions, %s shortcuts",
            statistics.elapsedtime(bibtex),nofbytes,bibtexstats.nofdefinitions,bibtexstats.nofshortcuts)
    else
        return nil
    end
end)

--~ str = [[
--~     @COMMENT { CRAP }
--~     @STRING{ hans = "h a n s" }
--~     @STRING{ taco = "t a c o" }
--~     @SOMETHING{ key1, abc = "t a c o" , def = "h a n s" }
--~     @SOMETHING{ key2, abc = hans # taco }
--~     @SOMETHING{ key3, abc = "hans" # taco }
--~     @SOMETHING{ key4, abc = hans # "taco" }
--~     @SOMETHING{ key5, abc = hans # taco # "hans" # "taco"}
--~     @SOMETHING{ key6, abc =  {oeps {oeps} oeps} }
--~ ]]

--~ local session = bibtex.new()
--~ bibtex.convert(session,str)
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ local session = bibtex.new()
--~ bibtex.load(session,"IEEEabrv.bib")
--~ bibtex.load(session,"IEEEfull.bib")
--~ bibtex.load(session,"IEEEexample.bib")
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ local session = bibtex.new()
--~ bibtex.load(session,"gut.bib")
--~ bibtex.load(session,"komoedie.bib")
--~ bibtex.load(session,"texbook1.bib")
--~ bibtex.load(session,"texbook2.bib")
--~ bibtex.load(session,"texbook3.bib")
--~ bibtex.load(session,"texgraph.bib")
--~ bibtex.load(session,"texjourn.bib")
--~ bibtex.load(session,"texnique.bib")
--~ bibtex.load(session,"tugboat.bib")
--~ bibtex.toxml(session)
--~ print(session.nofbytes,statistics.elapsedtime(bibtex))

--~ print(table.serialize(session.data))
--~ print(table.serialize(session.shortcuts))
--~ print(xml.serialize(session.xml))

-- The character data is needed for the uppercase test below.

if not characters then
    dofile(resolvers.findfile("char-def.lua"))
end

local chardata = characters.data
local concat = table.concat

local lpeg = lpeg

local P, Ct, lpegmatch, lpegpatterns = lpeg.P, lpeg.Ct, lpeg.match, lpeg.patterns

local space, comma = P(" "), P(",")

local andsplitter    = lpeg.tsplitat(space^1 * "and" * space^1)
local commasplitter  = lpeg.tsplitat(space^0 * comma * space^0)
local spacesplitter  = lpeg.tsplitat(space^1)
local firstcharacter = lpegpatterns.utf8byte

-- Returns true when the character data marks the first character of str as an
-- uppercase letter (category "lu"), and a false value otherwise.

local function is_upper(str)
    local first = lpegmatch(firstcharacter,str)
    local okay = chardata[first]
    return okay and okay.category == "lu"
end

-- Splits a bibtex author string at " and " and replaces each name by a table with
-- the fields original, firstnames, vons, surnames, initials and juniors (lists of
-- words, or nil when a part does not apply). The returned list also carries the
-- complete input in its original field. Three forms are recognized, selected by
-- the number of comma separated parts:
--
--   First von Last
--   Last, First von      (lowercase words that trail the first names become vons)
--   von Last, Jr, First

local function splitauthors(str)
    local authors = lpegmatch(andsplitter,str)
    for i=1,#authors do
        local firstnames, vons, surnames, initials, juniors, words
        local author = authors[i]
        local split = lpegmatch(commasplitter,author)
        local n = #split
        if n == 1 then
            -- First von Last
            words = lpegmatch(spacesplitter,author)
            firstnames, vons, surnames = { }, { }, { }
            local i, n = 1, #words
            -- leading uppercase words are first names
            while i <= n do
                local w = words[i]
                if is_upper(w) then
                    firstnames[#firstnames+1], i = w, i + 1
                else
                    break
                end
            end
            -- then come the words up to the next uppercase one
            while i <= n do
                local w = words[i]
                if is_upper(w) then
                    break
                else
                    vons[#vons+1], i = w, i + 1
                end
            end
            -- whatever is left is the surname
            while i <= n do
                surnames[#surnames+1], i = words[i], i + 1
            end
        elseif n == 2 then
            -- von Last, First
            words = lpegmatch(spacesplitter,split[2])
            surnames = lpegmatch(spacesplitter,split[1])
            firstnames, vons = { }, { }
            local i, n = 1, #words
            while i <= n do
                local w = words[i]
                if is_upper(w) then
                    firstnames[#firstnames+1], i = w, i + 1
                else
                    break
                end
            end
            while i <= n do
                vons[#vons+1], i = words[i], i + 1
            end
        else
            -- von Last, Jr, First: the surname comes first and the first names come
            -- last, so the first part must not end up in firstnames; like in the
            -- two part form the first part is taken as surname in its entirety
            surnames   = lpegmatch(spacesplitter,split[1])
            juniors    = lpegmatch(spacesplitter,split[2])
            firstnames = lpegmatch(spacesplitter,split[3])
            if n > 3 then
                -- error
            end
        end
        -- a name of only uppercase words ends up in firstnames, so the last word
        -- is moved to the surnames then
        if #surnames == 0 then
            surnames[1] = firstnames[#firstnames]
            firstnames[#firstnames] = nil
        end
        if firstnames then
            initials = { }
            for i=1,#firstnames do
                -- an empty word has no first character and is skipped
                local first = lpegmatch(firstcharacter,firstnames[i])
                if first then
                    initials[#initials+1] = utfchar(first)
                end
            end
        end
        authors[i] = {
            original   = author,
            firstnames = firstnames,
            vons       = vons,
            surnames   = surnames,
            initials   = initials,
            juniors    = juniors,
        }
    end
    authors.original = str
    return authors
end

-- Returns a new list in which symbol (a period by default) is appended to each
-- initial.

local function the_initials(initials,symbol)
    local t, symbol = { }, symbol or "."
    for i=1,#initials do
        t[i] = initials[i] .. symbol
    end
    return t
end

-- authors

bibtex.authors = bibtex.authors or { }

local authors = bibtex.authors

-- Separators that are used when the settings passed to a formatter lack a key.

local defaultsettings = {
    firstnamesep        = " ",
    vonsep              = " ",
    surnamesep          = " ",
    juniorsep           = " ",
    surnamejuniorsep    = ", ",
    juniorjuniorsep     = ", ",
    surnamefirstnamesep = ", ",
    surnameinitialsep   = ", ",
    namesep             = ", ",
    lastnamesep         = " and ",
    finalnamesep        = " and ",
}

-- Formats one split author as: firstnames, vons, surnames, juniors. The junior
-- part is set apart from the surname by surnamesep, which goes in between the two
-- and not after the junior part.

function authors.normal(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if firstnames and #firstnames > 0 then
        result[#result+1] = concat(firstnames," ")
        result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep
    end
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.surnamesep or defaultsettings.surnamesep
        result[#result+1] = concat(juniors," ")
    end
    return concat(result)
end

-- Same order as authors.normal, but the junior part is set apart from the surname
-- by surnamejuniorsep. The first names are used in full here as well.

function authors.normalshort(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if firstnames and #firstnames > 0 then
        result[#result+1] = concat(firstnames," ")
        result[#result+1] = settings.firstnamesep or defaultsettings.firstnamesep
    end
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.surnamejuniorsep or defaultsettings.surnamejuniorsep
        result[#result+1] = concat(juniors," ")
    end
    return concat(result)
end

-- Formats one split author as: vons, surnames, juniors, firstnames, where the
-- juniors are preceded by juniorjuniorsep and the first names by
-- surnamefirstnamesep.

function authors.inverted(author,settings)
    local firstnames, vons, surnames, juniors = author.firstnames, author.vons, author.surnames, author.juniors
    local result, settings = { }, settings or defaultsettings
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep
        result[#result+1] = concat(juniors," ")
    end
    if firstnames and #firstnames > 0 then
        result[#result+1] = settings.surnamefirstnamesep or defaultsettings.surnamefirstnamesep
        result[#result+1] = concat(firstnames," ")
    end
    return concat(result)
end

-- Like authors.inverted, but with initials (each followed by a period) instead of
-- first names, preceded by surnameinitialsep.

function authors.invertedshort(author,settings)
    local vons, surnames, initials, juniors = author.vons, author.surnames, author.initials, author.juniors
    local result, settings = { }, settings or defaultsettings
    if vons and #vons > 0 then
        result[#result+1] = concat(vons," ")
        result[#result+1] = settings.vonsep or defaultsettings.vonsep
    end
    if surnames then
        result[#result+1] = concat(surnames," ")
    end
    if juniors and #juniors > 0 then
        result[#result+1] = settings.juniorjuniorsep or defaultsettings.juniorjuniorsep
        result[#result+1] = concat(juniors," ")
    end
    if initials and #initials > 0 then
        result[#result+1] = settings.surnameinitialsep or defaultsettings.surnameinitialsep
        result[#result+1] = concat(the_initials(initials)," ")
    end
    return concat(result)
end

-- Number of names in the most recent concatenation; bibtex.singularorplural reads
-- this to choose between its two arguments.

local lastconcatsize = 1

-- Joins a list of formatted names: namesep between names, finalnamesep before the
-- last of three or more and lastnamesep between exactly two.

local function bibtexconcat(t,settings)
    settings = settings or defaultsettings
    local namesep      = settings.namesep      or defaultsettings.namesep      or ", "
    local lastnamesep  = settings.lastnamesep  or defaultsettings.lastnamesep  or namesep
    local finalnamesep = settings.finalnamesep or defaultsettings.finalnamesep or lastnamesep
    -- no local here: the file level variable has to be updated
    lastconcatsize = #t
    if lastconcatsize > 2 then
        local s = { }
        for i=1,lastconcatsize-2 do
            s[i] = t[i] .. namesep
        end
        s[lastconcatsize-1], s[lastconcatsize] = t[lastconcatsize-1] .. finalnamesep, t[lastconcatsize]
        return concat(s)
    elseif lastconcatsize > 1 then
        return concat(t,lastnamesep)
    elseif lastconcatsize > 0 then
        return t[1]
    else
        return ""
    end
end

-- Splits the author string, formats each name with combiner and joins the result.
-- The combiner is a function or the name of one of the formatters in this table;
-- anything else falls back to authors.normal. When settings[what] is set, the
-- list can be cut off: with more than etallimit names only the first etaldisplay
-- are shown and etaltext is appended.
--
-- NOTE(review): the limits are read from settings itself and not from
-- settings[what], as in the code this replaces; confirm that this is intended.

function authors.concat(author,combiner,what,settings)
    if type(combiner) == "string" then
        combiner = authors[combiner] or authors.normal
    elseif type(combiner) ~= "function" then
        combiner = authors.normal
    end
    settings = settings or defaultsettings
    local split = splitauthors(author)
    local setting = settings[what]
    local etallimit, etaldisplay, etaltext = 1000, 1000, ""
    if setting then
        etallimit   = settings.etallimit   or 1000
        etaldisplay = settings.etaldisplay or etallimit
        etaltext    = settings.etaltext    or ""
    end
    local total = #split
    local max = total
    if max > etallimit and etaldisplay < max then
        max = etaldisplay
    end
    -- only the formatted names are joined, the names that are cut off are still
    -- tables and cannot be concatenated
    local shown = { }
    for i=1,max do
        shown[i] = combiner(split[i],settings)
    end
    local result = bibtexconcat(shown,settings)
    -- the singular or plural choice depends on all names, also the hidden ones
    lastconcatsize = total
    if max < total then
        return result .. etaltext
    else
        return result
    end
end

-- Builds a short tag: for each author the initials of the first names followed by
-- the first character of each surname word, and finally the year when given.

function authors.short(author,year)
    local result = { }
    if author then
        local authors = splitauthors(author)
        for a=1,#authors do
            local aa = authors[a]
            local initials = aa.initials
            for i=1,#initials do
                result[#result+1] = initials[i]
            end
            local surnames = aa.surnames
            for s=1,#surnames do
                -- an empty word has no first character and is skipped
                local first = lpegmatch(firstcharacter,surnames[s])
                if first then
                    result[#result+1] = utfchar(first)
                end
            end
        end
    end
    if year then
        result[#result+1] = year
    end
    return concat(result)
end

-- We can consider creating a hashtable key -> entry but I wonder if it
-- pays off.
-- Collects, for the entries of xml tree id whose tag is in list, the years per
-- author string. The result maps each author string to a sorted list of years.
-- Entries without author or year are skipped.

local function collectauthoryears(id,list)
    list = settings_to_hash(list)
    id = getid(id)
    local found = { }
    for e in xml.collected(id,"/bibtex/entry") do
        if list[e.at.tag] then
            local year   = xmlfilter(e,"xml:///field[@name='year']/text()")
            local author = xmlfilter(e,"xml:///field[@name='author']/text()")
            if author and year then
                local a = found[author]
                if not a then
                    a = { }
                    found[author] = a
                end
                local y = a[year]
                if not y then
                    y = { }
                    a[year] = y
                end
                y[#y+1] = e
            end
        end
    end
    -- found = { author = { year_1 = { e1, e2, e3 } } }
    local done = { }
    for author, years in next, found do
        local yrs = { }
        for year, entries in next, years do
            -- NOTE(review): subyears is not defined anywhere in this file, so unless
            -- it is set as a global elsewhere the else branch is always taken
            if subyears then
             -- -- add letters to all entries of an author and if so shouldn't
             -- -- we tag all years of an author as soon as we do this?
             -- if #entries > 1 then
             --     for i=1,#years do
             --         local entry = years[i]
             --         -- years[i] = year .. string.char(i + string.byte("0") - 1)
             --     end
             -- end
            else
                yrs[#yrs+1] = year
            end
        end
        -- the traversal order of a hash is unspecified, so we sort the years in
        -- order to get the same output each run
        table.sort(yrs)
        done[author] = yrs
    end
    return done
end

-- The formatter name and the settings that the reference commands below pass on.

local method, settings = "normal", { }

-- Replaces the settings; without argument the current ones are kept.

function authors.setsettings(s)
    settings = s or settings
end

if commands then

    -- sessions by name, as created by commands.definebibtexsession

    local sessions = { }

    function commands.definebibtexsession(name)
        sessions[name] = bibtex.new()
    end

    -- Converts the named session to xml and registers the tree as xmlname.

    function commands.preparebibtexsession(name,xmlname,options)
        bibtex.toxml(sessions[name],options)
        lxml.register(xmlname,sessions[name].xml)
    end

    function commands.registerbibtexfile(name,filename)
        bibtex.load(sessions[name],filename)
    end

    -- Marks an entry as wanted; the entries table of the session is created on the
    -- first call.

    function commands.registerbibtexentry(name,entry)
        local session = sessions[name]
        local entries = session.entries
        if not entries then
            session.entries = { [entry] = true } -- here we can keep more info
        else
            entries[entry] = true
        end
    end

    -- commands.bibtexconcat = bibtexconcat

    -- finalizers can be rather dumb as we have just text and no embedded xml

    -- Typesets the formatted author list of the first collected element; an empty
    -- collection is ignored.

    function finalizers.bibtexconcat(collected,method,what)
        local first = collected and collected[1]
        if first then
            local author = first.dt[1] or ""
            if author ~= "" then
                context(authors.concat(author,method,what,settings))
            end
        end
    end

    -- Typesets the short tag (initials plus year) of the first collected element;
    -- an empty collection is ignored.

    function finalizers.bibtexshort(collected)
        local c = collected and collected[1]
        if c then
            local year   = xmlfilter(c,"xml://field[@name='year']/text()")
            local author = xmlfilter(c,"xml://field[@name='author']/text()")
            context(authors.short(author,year))
        end
    end

    -- experiment:

    --~ -- alternative approach: keep data at the tex end

    --~ local function xbibtexconcat(t,sep,finalsep,lastsep)
    --~     local n = #t
    --~     if n > 0 then
    --~         context(t[1])
    --~         if n > 1 then
    --~             if n > 2 then
    --~                 for i=2,n-1 do
    --~                     context.bibtexpublicationsparameter("sep")
    --~                     context(t[i])
    --~                 end
    --~                 context.bibtexpublicationsparameter("finalsep")
    --~             else
    --~                 context.bibtexpublicationsparameter("lastsep")
    --~             end
    --~             context(t[n])
    --~         end
    --~     end
    --~ end

    -- todo : sort

    -- todo: choose between bibtex or commands namespace

    -- NOTE(review): what is not defined in this file, so it is read as a global
    -- in the three functions below and is nil unless it is set elsewhere.

    -- Typesets the formatted author list of each author found for list.

    function bibtex.authorref(id,list)
        local result = collectauthoryears(id,list)
        for author, years in next, result do
            context(authors.concat(author,method,what,settings))
        end
    end

    -- Typesets each author found for list as: authors (years).

    function bibtex.authoryearref(id,list)
        local result = collectauthoryears(id,list)
        for author, years in next, result do
            context("%s (%s)",authors.concat(author,method,what,settings),concat(years,", "))
        end
    end

    -- Typesets each author found for list as: (authors, years).

    function bibtex.authoryearsref(id,list)
        local result = collectauthoryears(id,list)
        for author, years in next, result do
            context("(%s, %s)",authors.concat(author,method,what,settings),concat(years,", "))
        end
    end

    -- Typesets plural when the last concatenation had more than one name and
    -- singular otherwise.

    function bibtex.singularorplural(singular,plural)
        if lastconcatsize and lastconcatsize > 1 then
            context(plural)
        else
            context(singular)
        end
    end

end

--~ local function test(sample)
--~     local authors = splitauthors(sample)
--~     print(table.serialize(authors))
--~     for i=1,#authors do
--~         local author = authors[i]
--~         print(normalauthor       (author,settings))
--~         print(normalshortauthor  (author,settings))
--~         print(invertedauthor     (author,settings))
--~         print(invertedshortauthor(author,settings))
--~     end
--~     print(concatauthors(sample,settings,normalauthor))
--~     print(concatauthors(sample,settings,normalshortauthor))
--~     print(concatauthors(sample,settings,invertedauthor))
--~     print(concatauthors(sample,settings,invertedshortauthor))
--~ end

--~ local sample_a = "Hagen, Hans and Hoekwater, Taco Whoever T. Ex. and Henkel Hut, Hartmut Harald von der"
--~ local sample_b = "Hans Hagen and Taco Whoever T. Ex. Hoekwater and Hartmut Harald von der Henkel Hut"

--~ test(sample_a)
--~ test(sample_b)