-- lang-ini.lmt / size: 52 Kb / last modification: 2024-01-16 09:02
-- Make sure the global module registry exists and record the metadata of
-- this file in it.
if not modules then
    modules = { }
end

modules['lang-ini'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- needs a cleanup (share locals)
10-- discard language when redefined
11
12-- 002D : hyphen-minus (ascii)
13-- 002D : hyphen-minus (ascii)
14-- 2010 : hyphen
15-- 2011 : nonbreakable hyphen
16-- 2013 : endash (compound hyphen)
17
18-- todo: no foo:bar but foo(bar,...)
19
20-- https://wortschatz.uni-leipzig.de/de/download/German : lots of lists
21
22local type, tonumber, next = type, tonumber, next
23local utfbyte, utflength = utf.byte, utf.length
24local format, gsub, gmatch, find = string.format, string.gsub, string.gmatch, string.find
25local concat, sortedkeys, sortedhash, keys, insert, tohash = table.concat, table.sortedkeys, table.sortedhash, table.keys, table.insert, table.tohash
26local setmetatableindex = table.setmetatableindex
27local utfvalues, strip, utfcharacters = string.utfvalues, string.strip, utf.characters
28
29local context   = context
30local commands  = commands
31local implement = interfaces.implement
32
33local settings_to_array = utilities.parsers.settings_to_array
34local settings_to_set   = utilities.parsers.settings_to_set
35
-- Tracing flags; each one is switched by the tracker registered for it.
local trace_patterns = false
local trace_goodies  = false
local trace_applied  = false

trackers.register("languages.patterns", function(v) trace_patterns = v end)
trackers.register("languages.goodies",  function(v) trace_goodies  = v end)
trackers.register("languages.applied",  function(v) trace_applied  = v end)

-- Reporters for the two message classes used in this file.
local report_initialization = logs.reporter("languages","initialization")
local report_goodies        = logs.reporter("languages","goodies")
42
43local prehyphenchar    = language.prehyphenchar    -- global per language
44local posthyphenchar   = language.posthyphenchar   -- global per language
45local preexhyphenchar  = language.preexhyphenchar  -- global per language
46local postexhyphenchar = language.postexhyphenchar -- global per language
47----- lefthyphenmin    = language.lefthyphenmin
48----- righthyphenmin   = language.righthyphenmin
49local sethjcode        = language.sethjcode
50local currentlanguage  = language.current -- or function() return tex.normallanguage or tex.language end
51
52local uccodes          = characters.uccodes
53local lccodes          = characters.lccodes
54
55local new_language     = language.new
56
-- The global namespace plus a local alias for fast access.
languages       = languages or {}
local languages = languages

languages.version = 1.010

-- Shared state: existing tables are reused, otherwise fresh ones are made.
local registered = languages.registered or { }
local associated = languages.associated or { }
local numbers    = languages.numbers    or { }
local data       = languages.data       or { }

languages.registered = registered
languages.associated = associated
languages.numbers    = numbers
languages.data       = data

-- These tables are handed to the storage mechanism under their global names.
storage.register("languages/registered",registered,"languages.registered")
storage.register("languages/associated",associated,"languages.associated")
storage.register("languages/numbers",   numbers,   "languages.numbers")
storage.register("languages/data",      data,      "languages.data")

local variables = interfaces.variables

local v_reset   = variables.reset
local v_yes     = variables.yes

-- counts the definition files loaded by loaddefinitions
local nofloaded = 0
85
-- Returns the registration record for a tag together with its language
-- instance; the instance is created (and cached in the record) on first use.
-- Both results are nil when the tag is not registered.
local function resolve(tag)
    local record = registered[tag]
    if not record then
        return record, nil
    end
    local instance = record.instance
    if not instance then
        instance = new_language(record.number)
        record.instance = instance
    end
    return record, instance
end
97
-- Converts a tag, a language number or nothing (meaning the current
-- language) into a language instance. A userdata argument is returned as
-- is. Nothing is returned when the language is unknown.
local function tolang(what) -- returns lang object
    what = what or currentlanguage()
    if type(what) == "userdata" then
        return what
    end
    local tag    = numbers[what]
    local record = tag and registered[tag] or registered[what]
    if not record then
        return
    end
    local instance = record.instance -- .lang -- was this ok ?
    if not instance then
        instance = new_language(record.number)
        record.instance = instance
    end
    return instance
end
116
-- Returns the registration record for a tag or number; without argument the
-- record of the current language is returned.
function languages.getdata(tag) -- or number
    if not tag then
        return registered[numbers[currentlanguage()]]
    end
    return registered[tag] or registered[numbers[tag]]
end

languages.tolang = tolang
126
127-- patterns=en
128-- patterns=en,de
129
-- Returns the data string stored in section 'what' ("patterns" or
-- "exceptions") of a loaded definition, decompressed when the section is
-- flagged as zlib compressed. Nothing is returned when the section is
-- missing or empty.
--
-- loaded : table as read from a lang-*.lua definition file
-- what   : name of the section to fetch
-- tag    : language tag, only used in the error report
local function validdata(loaded,what,tag)
    local dataset = loaded[what]
    if not dataset then
        return
    end
    local data = dataset.data
    if not data or data == "" then
        return
    end
    if dataset.compression == "zlib" then
        data = zlib.decompress(data)
        -- a failed decompression or a length mismatch is reported; the
        -- format has two slots: the section and the language
        if not data or (dataset.length and dataset.length ~= #data) then
            report_initialization("compression error in %a for language %a",what,tag)
        end
    end
    return data
end
147
148-- languages.hjcounts[unicode].count
149
150-- hjcode: 0       not to be hyphenated
151--         1--31   length
152--         32      zero length
153--         > 32    hyphenated with length 1
154
-- Sets the hj codes of a language instance from the 'characters' entry of
-- section 'what' in a loaded definition.
--
-- instance : language instance passed on to sethjcode
-- loaded   : definition table; its 'codehash' field is created when missing
-- what     : section name ("patterns", "exceptions", "extras")
-- factor   : when set, languages.hjcounts is consulted for a per character
--            count that replaces the code (see the hjcode table above)
--
-- The characters can be a list of codepoints and/or strings, a hash keyed by
-- character (or codepoint) with a true value, or a plain string.
local function sethjcodes(instance,loaded,what,factor)
    local l = loaded[what]
    local c = l and l.characters
    if c then
        local hjcounts = factor and languages.hjcounts or false
        --
        local h = loaded.codehash
        if not h then
            h = { }
            loaded.codehash = h
        end
        --
        local function setcode(code)
            local l = lccodes[code] -- just in case we get a mixture
            local u = uccodes[code] -- just in case we get a mixture
            local s = l
            if type(s) ~= "number" then
                -- no usable lowercase mapping: the character maps onto itself
                l = code
                s = code
            end
            if hjcounts then
                local c = hjcounts[s]
                if c then
                    c = c.count
                    if not c then
                        -- error, keep as 1
                    elseif c <= 0 then
                        -- counts as 0 i.e. ignored
                        s = 32
                    elseif c >= 31 then
                        -- counts as 31
                        s = 31
                    else
                        -- count c times
                        s = c
                    end
                end
            end
            sethjcode(instance,l,s)
            -- NOTE(review): only the uppercase variant ends up in codehash;
            -- confirm whether the lowercase code should be recorded as well
            if u ~= l and type(u) == "number" then
                sethjcode(instance,u,s)
                h[u] = s
            end
        end
        --
        -- savinghyphcodes is zeroed while assigning and restored afterwards
        local s = tex.savinghyphcodes
        tex.savinghyphcodes = 0
        if type(c) == "table" then
            if #c > 0 then
                -- list: { U, U, U, "chr", "chr", ... }
                for i=1,#c do
                    local v = c[i]
                    setcode(type(v) == "string" and utfbyte(v) or v)
                end
            else
                -- hash: { ["chr"] = true, ... }, codepoint keys are accepted
                -- too, as in addhjcodestoinstance
                for k, v in sortedhash(c) do
                    if v then
                        setcode(type(k) == "string" and utfbyte(k) or k)
                    end
                end
            end
        elseif type(c) == "string" then
            for l in utfvalues(c) do
                setcode(l)
            end
        end
        tex.savinghyphcodes = s
    end
end
225
-- Gives every listed character an hj code equal to its own codepoint. The
-- characters can be a string, a list of codepoints and/or strings, or a hash
-- keyed by character or codepoint with a true value.
local function addhjcodestoinstance(instance,characters)
    local kind = type(characters)
    if kind == "string" then
        for unicode in utfvalues(characters) do
            sethjcode(instance,unicode,unicode)
        end
    elseif kind == "table" then
        local count = #characters
        if count > 0 then
            -- list: { U, U, U, "chr", "chr", ... }
            for index=1,count do
                local entry   = characters[index]
                local unicode = type(entry) == "string" and utfbyte(entry) or entry
                sethjcode(instance,unicode,unicode)
            end
        else
            -- hash: { ["chr"] = true, ... }
            for key, enabled in next, characters do
                if enabled then
                    local unicode = type(key) == "string" and utfbyte(key) or key
                    sethjcode(instance,unicode,unicode)
                end
            end
        end
    end
end
251
252-- 2'2 conflicts with 4' ... and luatex barks on it
253
local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local P, S, R   = lpeg.P, lpeg.S, lpeg.R
local C, Cs, Ct = lpeg.C, lpeg.Cs, lpeg.Ct

local utfsplit = utf.split

local space       = lpegpatterns.space
local digit       = lpegpatterns.digit
local whitespace  = lpegpatterns.whitespace^1
local nospace     = lpegpatterns.utf8char - whitespace
----- endofstring = #whitespace + P(-1)
local endofstring = #whitespace

-- any run of non whitespace
local anyword     = (1-whitespace)^1
-- a pattern with its digits normalized: leading digits and a digit right
-- before whitespace are dropped, other digits become a space
local word        = (digit/"")^0 * (digit/"" * endofstring + digit/" " + nospace)^1
-- collects the normalized words of a whitespace separated list in a table
local analyze     = Ct((whitespace + Cs(word))^1)
268
-- Merges the loaded pattern strings into one string. When the same pattern
-- shows up more than once (digits not taken into account) the conflict is
-- reported and the matching entries are removed from the merged string.
--
-- tag, requested : only used in the report
-- loaded         : list of pattern strings (a separator gets inserted at the
--                  front when there is more than one)
local function unique(tag,requested,loaded)
    local count = #loaded
    if count == 0 then
        return ""
    end
    if count == 1 then
        return loaded[1]
    end
    insert(loaded,1," ") -- no need then for special first word
 -- insert(loaded,  " ")
    local merged = concat(loaded," ")
    local words  = lpegmatch(analyze,merged) or { }
    local seen   = { }
    local bad    = { }
    for index=1,#words do
        local current = words[index]
        local state   = seen[current]
        if not state then
            seen[current] = 1
        elseif state == 1 then
            -- second occurrence: remember the parts once
            seen[current] = 2
            bad[#bad+1] = utfsplit(current," ")
        end
    end
    -- sort
    local nofbad = #bad
    if nofbad == 0 then
        return merged
    end
    local conflicts
    for index=1,nofbad do
        local parts   = bad[index]
        local pattern = P(parts[1])
        for part=2,#parts do
            pattern = pattern * digit * P(parts[part])
        end
        if conflicts then
            conflicts = conflicts + pattern
        else
            conflicts = pattern
        end
        report_initialization("language %a, patterns %a, discarding conflict (0-9)%{[0-9]}t(0-9)",tag,requested,parts)
    end
    words, seen, bad = nil, nil, nil -- permit gc
    local someword = digit^0 * conflicts * digit^0 * endofstring / ""
 -- local stripper = Cs(someword^-1 * (someword + anyword + whitespace)^1)
    local stripper = Cs((someword + anyword + whitespace)^1)
    return lpegmatch(stripper,merged) or merged
end
319
320local shared = false
321
-- Loads the pattern and exception definitions that are requested for a
-- language and feeds them to its language instance.
--
-- tag           : tag of a registered language
-- specification : table; 'patterns' is a comma separated list of definition
--                 names (the reset keyword clears what has been collected),
--                 'factor' is passed on to sethjcodes
--
-- Returns true when at least one definition file was loaded, false when
-- definitions were requested but none was loaded, and nil when nothing was
-- requested. The timer started here is stopped on every path.
local function loaddefinitions(tag,specification)
    statistics.starttiming(languages)
    local data, instance = resolve(tag)
    local requested = specification.patterns or ""
    local definitions = settings_to_array(requested)
    local ok = nil
    if #definitions > 0 then
        ok = false
        if trace_patterns then
            report_initialization("pattern specification for language %a: %s",tag,specification.patterns)
        end
        -- start out with what the instance already has
        local ploaded = instance:patterns()
        local eloaded = instance:hyphenation()
        if not ploaded or ploaded == ""  then
            ploaded = { }
        else
            ploaded = { ploaded }
        end
        if not eloaded or eloaded == ""  then
            eloaded = { }
        else
            eloaded = { eloaded }
        end
        local dataused  = data.used
        local resources = data.resources or { }
        data.resources  = resources
        if not shared then
            -- the shared exceptions are looked up once; 'true' means that we
            -- tried but have nothing to add
            local found = resolvers.findfile("lang-exc.lua")
            if found and found ~= "" then
                shared = dofile(found)
                if type(shared) == "table" then
                    shared = concat(shared," ")
                else
                    shared = true
                end
            else
                shared = true
            end
        end
        for i=1,#definitions do
            local definition = definitions[i]
            if definition == "" then
                -- error
            elseif definition == v_reset then
                if trace_patterns then
                    report_initialization("clearing patterns for language %a",tag)
                end
                instance:clearpatterns()
                instance:clearhyphenation()
                ploaded = { }
                eloaded = { }
            elseif not dataused[definition] then
                -- flagged before loading, so a failed load is not retried
                dataused[definition] = definition
                local filename = "lang-" .. definition .. ".lua"
                local fullname = resolvers.findfile(filename) or ""
                if fullname == "" then
                    fullname = resolvers.findfile(filename .. ".gz") or ""
                end
                if fullname ~= "" then
                    if trace_patterns then
                        report_initialization("loading definition %a for language %a from %a",definition,tag,fullname)
                    end
                    local suffix, gzipped = gzip.suffix(fullname)
                    local loaded = table.load(fullname,gzipped and gzip.load)
                    if loaded then -- todo: version test
                        ok, nofloaded = true, nofloaded + 1
                        sethjcodes(instance,loaded,"patterns",specification.factor)
                        sethjcodes(instance,loaded,"exceptions",specification.factor)
                        local p = validdata(loaded,"patterns",tag)
                        local e = validdata(loaded,"exceptions",tag)
                        if p and p ~= "" then
                            ploaded[#ploaded+1] = p
                        end
                        if e and e ~= "" then
                            eloaded[#eloaded+1] = e
                        end
                        resources[#resources+1] = loaded -- so we can use them otherwise
                    else
                        report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                    end
                elseif trace_patterns then
                    report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                end
            elseif trace_patterns then
                report_initialization("definition %a for language %a already loaded",definition,tag)
            end
        end
        if #ploaded > 0 then
            -- why not always clear
            instance:clearpatterns()
            instance:patterns(unique(tag,requested,ploaded))
        end
        if #eloaded > 0 then
            -- why not always clear
            instance:clearhyphenation()
            instance:hyphenation(concat(eloaded," "))
        end
        if type(shared) == "string" then
            instance:hyphenation(shared)
        end
    elseif trace_patterns then
        report_initialization("no definitions for language %a",tag)
    end
    statistics.stoptiming(languages)
    return ok
end
427
storage.shared.noflanguages = storage.shared.noflanguages or 0

local noflanguages = storage.shared.noflanguages

-- Registers a language under the next free number. The number and the tag
-- are mapped onto each other and a fresh record is stored for the tag; the
-- language instance itself is created later, on demand.
function languages.define(tag,parent)
    local number = noflanguages + 1
    noflanguages = number
    if trace_patterns then
        report_initialization("assigning number %a to %a",number,tag)
    end
    numbers[number] = tag
    numbers[tag]    = number
    local record = {
        tag      = tag,
        parent   = parent or "",
        patterns = "",
        loaded   = false,
        used     = { },
        dirty    = true,
        number   = number,
        instance = nil, -- luatex data structure
        synonyms = { },
    }
    registered[tag] = record
    storage.shared.noflanguages = number
end
452
-- Flags 'synonym' as a synonym in the record of a registered language;
-- unknown tags are ignored.
function languages.setsynonym(synonym,tag) -- convenience function
    local record = registered[tag]
    if not record then
        return
    end
    record.synonyms[synonym] = true -- maybe some day more info
end
459
-- Returns the sorted keys of the registration table as one string; the
-- separator defaults to a comma.
function languages.installed(separator)
    local tags = sortedkeys(registered)
    return concat(tags,separator or ",")
end
463
-- Returns the tag that belongs to number n; the current language is used
-- when n is absent or not convertible to a number.
function languages.current(n)
    local number = n and tonumber(n) or currentlanguage()
    return numbers[number]
end
467
-- Stores a script/language pair for a tag.
function languages.associate(tag,script,language) -- not yet used
    local pair = { script, language }
    associated[tag] = pair
end
471
-- Returns the script and language associated with a tag or number; the
-- current language is used when no argument is given. Nothing is returned
-- when there is no association.
function languages.association(tag) -- not yet used
    if type(tag) == "number" then
        tag = numbers[tag]
    elseif not tag then
        tag = numbers[currentlanguage()]
    end
    if not tag then
        return
    end
    local pair = associated[tag]
    if pair then
        return pair[1], pair[2]
    end
end
483
-- Tells if the pattern file of a registered language can be located. The
-- file name is built from the 'patterns' field of the record. The second
-- argument is not used.
--
-- Returns true or false.
function languages.loadable(tag,defaultlanguage) -- hack
    local l = registered[tag] -- no synonyms
    if not l then
        return false
    end
    local found = resolvers.findfile("lang-" .. l.patterns .. ".lua")
    -- an empty string means 'not found', as elsewhere in this file
    if found and found ~= "" then
        return true
    else
        return false
    end
end
492
493-- a bit messy, we will do all language setting in lua as we can now assign
494-- and 'patterns' will go away here.
495
-- Marks the record of a registered language as dirty; unknown tags are
-- ignored.
function languages.unload(tag)
    local record = registered[tag]
    if not record then
        return
    end
    record.dirty = true
end
502
-- not that useful, global values
504
-- Accessors that accept a tag, number, instance or nothing (see tolang).

function languages.prehyphenchar(what)
    return prehyphenchar(tolang(what))
end

function languages.posthyphenchar(what)
    return posthyphenchar(tolang(what))
end

function languages.preexhyphenchar(what)
    return preexhyphenchar(tolang(what))
end

function languages.postexhyphenchar(what)
    return postexhyphenchar(tolang(what))
end
509-------- languages.lefthyphenmin   (what) return lefthyphenmin   (tolang(what)) end
510-------- languages.righthyphenmin  (what) return righthyphenmin  (tolang(what)) end
511
512-- e['implementer']= 'imple{m}{-}{-}menter'
513-- e['manual'] = 'man{}{}{}'
514-- e['as'] = 'a-s'
515-- e['user-friendly'] = 'user=friend-ly'
516-- e['exceptionally-friendly'] = 'excep-tionally=friend-ly'
517
518local invalid = { "{", "}", "(", ")", "-", " " }
519
-- Adds the characters of 'str' (a string or a list of characters) to the
-- extra characters of a language record, leaves out the ones listed in
-- 'invalid', and sets the hj codes for the result.
local function collecthjcodes(data,str)
    local extras = data.extras
    local found  = extras and extras.characters or { }
    local kind   = type(str)
    if kind == "string" then
        for character in utfcharacters(str) do
            found[character] = found[character] or true
        end
    elseif kind == "table" then
        for index=1,#str do
            local character = str[index]
            found[character] = found[character] or true
        end
    end
    for index=1,#invalid do -- less checks this way
        local character = invalid[index]
        if found[character] then
            found[character] = nil
        end
    end
    data.extras = { characters = found }
    sethjcodes(data.instance,data,"extras",data.factor)
end
545
-- Reads a file and adds its content as hyphenation exceptions to a
-- registered language; a file that cannot be read counts as empty.
function languages.loadwords(tag,filename)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    statistics.starttiming(languages)
    local words = io.loaddata(filename) or ""
    collecthjcodes(data,words)
    instance:hyphenation(words)
    statistics.stoptiming(languages)
end
556
557
-- Adds hyphenation exceptions to a registered language; unknown tags are
-- ignored.
function languages.setexceptions(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local exceptions = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,exceptions)
    instance:hyphenation(exceptions)
end
566
-- Adds hyphenation patterns to a registered language; unknown tags are
-- ignored.
function languages.setpatterns(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local patterns = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,patterns)
    instance:patterns(patterns)
end
575
-- Installs 'action' as word handler on the instance of a registered
-- language; unknown tags are ignored.
local function setwordhandler(tag,action)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    instance:setwordhandler(action)
end

languages.setwordhandler = setwordhandler
584
-- Registers 'str' as the words of a goodies set named after the tag and
-- then installs the goodies handler for that tag.
function languages.setoptions(tag,str)
    local goodies = { { words = str } }
    languages.addgoodiesdata(tag,goodies)
    -- for now:
    local specification = { tag = tag, goodies = tag }
    languages.setgoodieshandler(specification)
end
590
-- Hands 'str' to the hyphenate method of a registered language; the string
-- is returned untouched when the tag is unknown.
function languages.hyphenate(tag,str)
    -- todo: does this still work?
    local data, instance = resolve(tag)
    if not data then
        return str
    end
    return instance:hyphenate(str)
end
600
-- This code is here for some testing (and discussion) but it might end up in its
-- own module. I wrote it after listening to the end of March 2021 live concert of
-- Mandoki Soulmates: Hungarian Pictures (music is the greatest unifier) with his
-- usual incredible international lineup. After that, and realizing that we needed
-- to deal better with some language issues as follow up on a mailing list thread, I
-- needed only a few loops of relistening to the concert to implement it. In
-- retrospect this was a language feature that should have been there a while ago.
608
609local expand ; do
610
611    local nuts        = nodes.nuts
612    local nextglyph   = nuts.traversers.glyph
613    local setoptions  = nuts.setoptions
614
615    local getnext     = nuts.getnext
616    local getprev     = nuts.getprev
617    local setchar     = nuts.setchar
618    local setnext     = nuts.setnext
619    local setlink     = nuts.setlink
620    local setfield    = nuts.setfield
621    local setdisc     = nuts.setdisc
622    local getprop     = nuts.getprop
623    local setprop     = nuts.setprop
624    local setattrlist = nuts.setattrlist
625
626    local new_disc    = nuts.pool.disc
627    local new_glyph   = nuts.pool.glyph
628    local copy_node   = nuts.copy
629    local flushlist   = nuts.flushlist
630
631    local glyphoptioncodes      = tex.glyphoptioncodes
632
633    local lower                 = characters.lower
634    local replacer              = utf.replacer
635    local utfchartabletopattern = lpeg.utfchartabletopattern
636
637    local report                = logs.reporter("languages","goodies")
638
639    -- can be shared
640
641    local goodiesdata = setmetatableindex(function(t,k)
642        local v = {
643            properties    = { },
644            replacements  = { },
645            characters    = { },
646            exceptions    = { },
647            substitutions = { },
648        }
649        t[k] = v
650        return v
651    end)
652
653    -- can be a helper
654
655    local compound_disc_code = tex.discoptioncodes.preword | tex.discoptioncodes.postword
656
657    local function setcompound(current,id,first,last,lh,rh,hyphen)
658        local prev     = getprev(current)
659     -- local language = tolang(id)
660     -- local prechar  = prehyphenchar(language)
661     -- local postchar = posthyphenchar(language)
662        local prechar  = prehyphenchar(id)
663        local postchar = posthyphenchar(id)
664        local pre      = prechar  and copy_node(current)
665        local post     = postchar and copy_node(current)
666        local replace  = hyphen and prechar and copy_node(current)
667        local disc     = new_disc()
668        if pre then
669            setchar(pre,prechar)
670        end
671        if post then
672            setchar(post,postchar)
673        end
674        if replace then
675            setchar(replace,prechar)
676        end
677        setattrlist(disc,current)
678        setoptions(disc,0x3) -- todo foo_code
679        setdisc(disc,pre,post,replace)
680        setlink(prev,disc,current)
681        if lh then
682            setfield(first,"rhmin",rh)
683        end
684
685        if rh then
686            setfield(current,"lhmin",lh)
687        end
688
689    end
690
691    local setcompounds = setmetatableindex(function(t,l)
692        local v = setmetatableindex(function(t,r)
693            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r) end
694            t[r] = v
695            return v
696        end)
697        t[l] = v
698        return v
699    end)
700
701    local sethyphens = setmetatableindex(function(t,l)
702        local v = setmetatableindex(function(t,r)
703            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r,true) end
704            t[r] = v
705            return v
706        end)
707        t[l] = v
708        return v
709    end)
710
711    local function replaceword(first,last,old,new,oldlen)
712        local oldlen = utflength(old)
713        local newlen = utflength(new)
714        if newlen == 0 then
715            -- forget about it
716        elseif newlen <= oldlen then
717            for s in utfvalues(new) do
718                setchar(first,s)
719                first = getnext(first)
720            end
721            if newlen < oldlen then
722                -- first is one ahead
723                local after  = getnext(last)
724                local before = getprev(first)
725                setnext(last)
726                setlink(before,after)
727                flushlist(first)
728            end
729        else
730            local i = 0
731            local l = getnext(last)
732            for s in utfvalues(new) do
733                i = i + 1
734                if i > oldlen then
735                    local g = copy_node(first)
736                    setlink(first,g,l)
737                    setchar(g,s)
738                    first = g
739                elseif i == oldlen then
740                    setchar(first,s)
741                else
742                    setchar(first,s)
743                    first = getnext(first)
744                end
745            end
746        end
747    end
748
749 -- local optioncodes = table.copy(glyphoptioncodes)
750 --
751 -- optioncodes.nokerns     = optioncodes.noleftkern     | optioncodes.norightkern
752 -- optioncodes.noligatures = optioncodes.noleftligature | optioncodes.norightligature
753
754    local lh, rh = false, false
755
756    local cache = setmetatableindex(function(t,k)
757        local v = 0
758        if k == "compound" then
759            v = setcompounds[lh][rh]
760        elseif k == "hyphen" then
761            v = sethyphens[lh][rh]
762        else
763            v = 0
764            for s in gmatch(k,"%w+") do
765                local o = glyphoptioncodes[s]
766             -- local o = optioncodes[s]
767                if o then
768                    v = v | o
769                end
770            end
771        end
772        t[k] = v
773        return v
774    end)
775
776    local function checkglyphproperties(options)
777        -- we sort, just to be sure
778        for word, list in sortedhash(options) do
779            if type(list) == "string" then
780                options[word] = options[list]
781            else
782                for index, option in sortedhash(list) do
783                    if type(option) == "string" then
784                        list[index] = cache[option]
785                    end
786                end
787            end
788        end
789    end
790
791    -- statistics.starttiming(languages)
792    -- statistics.stoptiming(languages)
793
794    -- 1: restart 2: exceptions+patterns 3: patterns *: next word
795
796    local sequencers    = utilities.sequencers
797    local newsequencer  = sequencers.new
798    local appendgroup   = sequencers.appendgroup
799    local prependaction = sequencers.prependaction
800    local appendaction  = sequencers.appendaction
801    local enableaction  = sequencers.enableaction
802    local disableaction = sequencers.disableaction
803
804    local template = {
805        arguments    = "s",
806        returnvalues = "r,i",
807        results      = "r,i",
808    }
809
810    local registeredactions = setmetatableindex ( function(t,tag)
811        local actions = newsequencer(template)
812        appendgroup(actions,"user")
813        t[tag] = actions
814        return actions
815    end )
816
817    languages.registeredactions = registeredactions
818
819    function languages.installhandler(tag,func)
820        local todo = not rawget(registeredactions,tag)
821        local actions = registeredactions[tag]
822        appendaction(actions,"user",func)
823        enableaction(actions,func)
824        report("installing handler %a for language %a",func,tag)
825        if todo then
826            setwordhandler(tag,function(n,original,remapped,length,first,last)
827                local runner = actions.runner
828                if runner then
829                    if getprop(first,"replaced") then
830                        -- maybe some deadcycles
831                    else
832                        local r, result = runner(original)
833                        if not r or original == r then
834                            return result or 0
835                        else
836                            setprop(first,"replaced",true)
837                            replaceword(first,last,original,r,length)
838                            return 1
839                        end
840                    end
841                end
842                return 2
843            end)
844        end
845    end
846
847    local appliedoptions     = setmetatableindex("table")
848    languages.appliedoptions = appliedoptions
849
850    languages.setgoodieshandler = function(specification) -- will become a table specifier
851        if type(specification) == "table" then
852            local tag           = specification.tag
853            local goodies       = specification.goodies or tag
854            local result        = specification.result or 2
855            local data          = goodiesdata[goodies]
856            local properties    = data.properties
857            local replacements  = data.replacements
858            local substitutions = data.substitutions
859            local characters    = data.characters
860            local exceptions    = data.exceptions
861            local replacer      = nil
862            local substituter   = nil
863            local d, instance   = resolve(tag)
864            local done          = false
865            -- check if something at all
866            if type(characters) == "table" and characters and next(characters) then
867                addhjcodestoinstance(instance,characters)
868                if trace_goodies then
869                    report_goodies("registering %a characters for %a",goodies,tag)
870                end
871                done = true
872            end
873            if type(properties) == "table" and next(properties) then
874                checkglyphproperties(properties) -- checks in place!
875                if trace_goodies then
876                    report_goodies("registering %a properties for %a",goodies,tag)
877                end
878                done = true
879            end
880            if type(replacements) == "table" and next(replacements) then
881                replacer = Cs((utfchartabletopattern(replacements) / replacements + 1)^0)
882                if trace_goodies then
883                    report_goodies("registering %a replacer for %a",goodies,tag)
884                end
885                done = true
886            end
887            if type(substitutions) == "table" and next(substitutions) then
888                substituter = Cs((utfchartabletopattern(substitutions) / substitutions + 1)^0)
889                if trace_goodies then
890                    report_goodies("registering %a substitutor for %a",goodies,tag)
891                end
892                done = true
893            end
894            if type(exceptions) == "table" and next(exceptions) then
895                done = true
896            else
897                exceptions = false
898            end
899            if done then
900                local registered = registeredactions[tag]
901                local applied    = appliedoptions[tag]
902                setwordhandler(tag,function(n,original,remapped,length,first,last)
903                    local runner = registered.runner
904                    if runner then
905                        if getprop(first,"replaced") then
906                            -- maybe some deadcycles
907                        else
908                            local r, result = runner(original)
909                            if not r then
910                                if trace_goodies then
911                                    report_goodies("kept by runner: %s => %s, result %i",original,remapped, result or 0)
912                                end
913                                return result or 0
914                            elseif original == r then
915                                if result then
916                                    if trace_goodies then
917                                        report_goodies("kept by runner: %s => %s, result %i",original,remapped, result)
918                                    end
919                                    return result
920                                else
921                                    if trace_goodies then
922                                        report_goodies("kept by runner: %s => %s, continue",original,remapped)
923                                    end
924                                end
925                            else
926                                if trace_goodies then
927                                    report_goodies("replaced by runner: %s => %s => %s, restart",original,remapped,r)
928                                end
929                                setprop(first,"replaced",true)
930                                replaceword(first,last,original,r,length)
931                                return 1
932                            end
933                        end
934                    end
935                    local result = 2
936                    local o = properties[remapped]
937                  ::again::
938                    if o then
939                        if trace_goodies then
940                            report("properties: %s %s",original,remapped)
941                        end
942                        if trace_applied then
943                            applied[original] = (applied[original] or 0) + 1
944                        end
945                        local index = 0
946                        for g, c in nextglyph, first do
947                            index = index + 1
948                            local oi = o[index]
949                            if oi then
950                                if type(oi) == "function" then
951                                    oi(g,n,first,last) -- maybe return value
952                                    result = 1
953                                else
954                                    setoptions(g,oi)
955                                end
956                            end
957                            if g == last then
958                                break
959                            end
960                        end
961                        return result
962                    end
963                    if replacer then
964                        -- todo: check lengths so that we can avoid a check
965                        if getprop(first,"replaced") then
966                            -- maybe some deadcycles
967                        else
968                            local r = lpegmatch(replacer,original)
969                            if original == r then
970                                if trace_goodies then
971                                    report_goodies("kept: %s => %s",original,remapped)
972                                end
973                            else
974                                if trace_goodies then
975                                    report_goodies("replaced: %s => %s => %s",original,remapped,r)
976                                end
977                                setprop(first,"replaced",true)
978                                replaceword(first,last,original,r,length)
979                                result = 1
980                            end
981                        end
982                        return result
983                    end
984                    if substituter then
985                        if getprop(first,"replaced") then
986                            -- maybe some deadcycles
987                        else
988                            local r = lpegmatch(substituter,original)
989                            if original == r then
990                                if trace_goodies then
991                                    report_goodies("kept: %s => %s",original,remapped)
992                                end
993                            else
994                                if trace_goodies then
995                                    report_goodies("substituted: %s => %s => %s",original,remapped,r)
996                                end
997                                setprop(first,"replaced",true)
998                                if not properties[r] then
999                                    o = expand(r)
1000                                    properties[original] = o
1001                                    goto again
1002                                end
1003                            end
1004                        end
1005                    end
1006                    if exceptions then
1007                        local exception = exceptions[original]
1008                        if exception then
1009                            if trace_goodies then
1010                                report_goodies("exception: %s => %s",original,exception)
1011                            end
1012                            result = exception
1013                        else
1014                            result = 3
1015                        end
1016                        return result
1017                    end
1018                    if trace_goodies then
1019                        report_goodies("ignored: %s => %s",original,remapped)
1020                    end
1021                    return result
1022                end)
1023            elseif trace_goodies then
1024                report_goodies("nothing useable in %a for %a",goodies,tag)
1025            end
1026        else
1027            setwordhandler(tag)
1028        end
1029    end
1030
1031    local norightligature_option = glyphoptioncodes.norightligature
1032    local noleftligature_option  = glyphoptioncodes.noleftligature
1033    local norightkern_option     = glyphoptioncodes.norightkern
1034    local noleftkern_option      = glyphoptioncodes.noleftkern
1035
1036    local function applyaction(oc,v,n)
1037        if oc == "noligature" then
1038            if n > 0 then
1039                local vv = v[n-1]
1040                if vv then
1041                    v[n-1] = vv | norightligature_option
1042                else
1043                    v[n-1] = norightligature_option
1044                end
1045            end
1046            v[n] = noleftligature_option
1047        elseif oc == "compound" then
1048            if n > 1 then
1049             -- v[n] = setcompound
1050                v[n] = setcompounds[lh][rh]
1051                return true
1052            end
1053        elseif oc == "hyphen" then
1054            if n > 1 then
1055                v[n] = sethyphens[lh][rh]
1056                return true
1057            end
1058        elseif oc == "nokern" then
1059            if n > 0 then
1060                local vv = v[n-1]
1061                if vv then
1062                    v[n-1] = vv | norightkern_option
1063                else
1064                    v[n-1] = norightkern_option
1065                end
1066            end
1067            v[n] = noleftkern_option
1068        elseif oc == "noleftkern" then
1069            v[n] = noleftkern_option
1070        elseif oc == "norightkern" then
1071            if n > 0 then
1072                local vv = v[n-1]
1073                if vv then
1074                    v[n-1] = vv | norightkern_option
1075                else
1076                    v[n-1] = norightkern_option
1077                end
1078            end
1079        else
1080            for s in gmatch(oc,"%w+") do
1081                if applyaction(s,v,n) then
1082                    return
1083                end
1084            end
1085        end
1086    end
1087
1088    -- a|b : a:norightligature b:noleftligature
1089    -- a=b : a:norightkern     b:noleftkern
1090    -- a<b :                   b:noleftkern
1091    -- a>b : a:norightkern
1092    -- a-b : hyphen
1093    -- a+b : compound
1094
1095    local actions = {
1096        ["|"] = "noligature",
1097        ["="] = "nokern",
1098        ["<"] = "noleftkern",
1099        [">"] = "norightkern",
1100        ["+"] = "compound",
1101        ["-"] = "hyphen",
1102    }
1103
1104    local function analyzed(m,a,t,k)
1105        local v = { }
1106        local n = 1
1107        if m == true then
1108            for c in gmatch(k,".") do
1109                local ac = a[c]
1110                if not ac then
1111                    n = n + 1
1112                else
1113                    applyaction(ac,v,n)
1114                end
1115            end
1116        elseif type(m) == "number" then
1117            local i = 0
1118            for c in gmatch(k,".") do
1119                local ac = a[c]
1120                if not ac then
1121                    n = n + 1
1122                else
1123                    i = i + 1
1124                    if i == m then
1125                        applyaction(ac,v,n)
1126                        break
1127                    end
1128                end
1129            end
1130        elseif type(m) == "table" then
1131            -- happens here, otherwise no stable caching key, we could hash these too
1132            m = tohash(m)
1133            local i = 0
1134            for c in gmatch(k,".") do
1135                local ac = a[c]
1136                if not ac then
1137                    n = n + 1
1138                else
1139                    i = i + 1
1140                    if m[i] then
1141                        applyaction(ac,v,n)
1142                    end
1143                end
1144            end
1145        else
1146            -- error
1147        end
1148        t[k] = v
1149        return v
1150    end
1151
1152    local cache = setmetatableindex(function(t,m)
1153        local v = setmetatableindex(function(t,a)
1154            local v = setmetatableindex(function(t,k)
1155                return analyzed(m,a,t,k)
1156            end)
1157            t[m] = v
1158            return v
1159        end)
1160        t[m] = v
1161        return v
1162    end)
1163
1164    expand = function(str)
1165        return analyzed(true,actions,{},str)
1166    end
1167
1168    -- maybe also a skip symbol
1169
1170    local replace1 = Cs ( ( S("|=<>+-.0123456789")/"" + lpegpatterns.utf8character    )^0 )
1171    local replace2 = Cs ( ( S("|=<>+-.0123456789")    + lpegpatterns.utf8character/".")^0 )
1172
1173    local function stripped(str)
1174        -- todo : lpeg
1175        str = gsub(str,"%-%-[^\n]*\n","")
1176        str = gsub(str,"%%[^\n]*\n","")
1177        str = gsub(str,"%s+"," ")
1178        str = gsub(str,"^%s+","")
1179        str = gsub(str,"%s+$","")
1180        return str
1181    end
1182
1183    local registerexceptions  do
1184
1185        local lbrace   = P("{")
1186        local rbrace   = P("}")
1187        local lbracket = P("[")
1188        local rbracket = P("]")
1189        local lparent  = P("(")
1190        local rparent  = P(")")
1191        local hyphen   = P("-")
1192
1193        local p = Cs ( (
1194            lbrace *  ((1-rbrace)^0) * rbrace
1195          * lbrace *  ((1-rbrace)^0) * rbrace
1196          * lbrace * C((1-rbrace)^0) * rbrace * (lparent * C((1-rparent)^0) * rparent)^0 / function(a,b) return b or a end
1197          + (lbracket * (1-rbracket)^0 * rbracket) / ""
1198          + hyphen / ""
1199          + lpegpatterns.utf8character
1200        )^0 )
1201
1202        registerexceptions = function(target,str)
1203            local kind = type(str)
1204            if kind == "string" then
1205                for v in gmatch(stripped(str),"%S+") do
1206                    local k = lpegmatch(p,v)
1207                    if k ~= v then
1208                        target[k] = v
1209                    end
1210                end
1211            elseif kind == "table" then
1212                local n = #str
1213                if n > 0 then
1214                    for i=1,n do
1215                        local v = str[i]
1216                        local k = lpegmatch(p,v)
1217                        if k ~= v then
1218                            target[k] = v
1219                        end
1220                    end
1221                else
1222                    -- maybe check for sanity
1223                    for k, v in next, str do
1224                        target[k] = v
1225                    end
1226                end
1227            end
1228        end
1229
1230    end
1231
1232    function languages.strippedgoodiewords(str)
1233        return lpegmatch(replace1,stripped(str))
1234    end
1235
1236    local splitter = lpeg.tsplitat(" ")
1237
1238    local function addgoodies(tag,list,filename)
1239        local np = 0
1240        local nd = 0
1241        local nw = 0
1242        local nl = #list
1243        --
1244        local data          = goodiesdata[tag]
1245        local properties    = data.properties
1246        local replacements  = data.replacements
1247        local substitutions = data.substitutions
1248        local characters    = data.characters
1249        local exceptions    = data.exceptions
1250        if filename then
1251            if not data.goodies then
1252                data.goodies = { }
1253            end
1254            insert(data.goodies,filename)
1255        end
1256        --
1257        lh = false
1258        rh = false
1259        --
1260        for i=1,nl do
1261            local l = list[i]
1262            if type(l) == "table" then
1263                local w = l.words
1264                local p = l.patterns
1265                local s = l.substitutions
1266                local c = l.characters
1267                local e = l.exceptions
1268                lh = l.left  or false -- for practical reasons these are semi-global
1269                rh = l.right or false -- for practical reasons these are semi-global
1270                if c then
1271                    for v in utfvalues(c) do
1272                        characters[v] = true
1273                    end
1274                end
1275                if w then
1276                    local prefixes    = l.prefixes
1277                    local nofprefixes = 0
1278                    local suffixes    = l.suffixes
1279                    local nofsuffixes = 0
1280                    if prefixes then
1281                        prefixes    = lpegmatch(splitter,lower(stripped(prefixes)))
1282                        nofprefixes = #prefixes
1283                    end
1284                    if suffixes then
1285                        suffixes    = lpegmatch(splitter,lower(stripped(suffixes)))
1286                        nofsuffixes = #suffixes
1287                    end
1288                    w = lower(stripped(w))
1289                    if p then
1290                        local pattern = Cs((utfchartabletopattern(p) / p + 1)^0)
1291                        w = lpegmatch(pattern,w)
1292                        np = np + 1
1293                    else
1294                        nd = nd + 1
1295                    end
1296                    local m = l.matches
1297                    if not m then
1298                        m = true
1299                    end
1300                    local a = l.actions
1301                    if a then
1302                        setmetatableindex(a,actions)
1303                    else
1304                        a = actions
1305                    end
1306                    local cach = cache[m][a]
1307                    if nofprefixes > 0 then
1308                        if nofsuffixes > 0 then
1309                            for wrd in gmatch(w,"%S+") do
1310                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1311                                nw = nw + 1
1312                                for i=1,nofprefixes do
1313                                    local tmp = prefixes[i] .. wrd
1314                                    for i=1,nofsuffixes do
1315                                        local str = tmp .. suffixes[i]
1316                                        properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1317                                        nw = nw + 1
1318                                    end
1319                                end
1320                            end
1321                        else
1322                            for wrd in gmatch(w,"%S+") do
1323                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1324                                nw = nw + 1
1325                                for i=1,nofprefixes do
1326                                    local str = prefixes[i] .. wrd
1327                                    properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1328                                    nw = nw + 1
1329                                end
1330                            end
1331                        end
1332                    elseif nofsuffixes > 0 then
1333                        for wrd in gmatch(w,"%S+") do
1334                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1335                            nw = nw + 1
1336                            for i=1,nofsuffixes do
1337                                local str = wrd .. suffixes[i]
1338                                properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1339                                nw = nw + 1
1340                            end
1341                        end
1342                    else
1343                        for wrd in gmatch(w,"%S+") do
1344                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1345                            nw = nw + 1
1346                        end
1347                    end
1348                elseif s then
1349                    for k, v in next, s do
1350                        substitutions[k] = v
1351                    end
1352                elseif p then
1353                    for k, v in next, p do
1354                        replacements[k] = v
1355                    end
1356                elseif e then
1357                    registerexceptions(exceptions,e)
1358                end
1359            end
1360        end
1361
1362        lh = false
1363        rh = false
1364
1365        return { np = np, nd = nd, nw = nw, nl = nl }
1366    end
1367
1368    function languages.goodiefiles(tag)
1369        local d = goodiesdata[tag]
1370        return d and d.goodies
1371    end
1372
1373    function languages.addgoodiesfile(tag,filename)
1374        local fullname = resolvers.findfile(file.addsuffix(filename,"llg")) or ""
1375        if fullname == "" then
1376            report_goodies("file %a is not found",filename)
1377        else
1378            local list = table.load(fullname)
1379            if not list then
1380                report_goodies("file %a is invalid",fullname)
1381            else
1382                list = list.options
1383                if not list then
1384                    report_goodies("file %a has no options",fullname)
1385                else
1386                    local ok = addgoodies(tag,list,filename)
1387                    report_goodies("tag %a, file %a loaded, %i lists, %i via patterns, %i direct, %i words",
1388                        tag,fullname,ok.nl,ok.np,ok.nd,ok.nw)
1389                end
1390            end
1391        end
1392    end
1393
1394    function languages.addgoodiesdata(tag,list)
1395        local ok = addgoodies(tag,list)
1396        report_goodies("tag %a, data loaded, %i lists, %i via patterns, %i direct, %i words",
1397            tag,ok.nl,ok.np,ok.nd,ok.nw)
1398    end
1399
1400end
1401
1402if environment.initex then
1403
1404    function languages.getnumber()
1405        return 0
1406    end
1407
1408else
1409
1410    function languages.getnumber(tag,default,patterns,goodies,factor)
1411        local l = registered[tag]
1412        if l then
1413            if l.dirty then
1414                l.factor = factor == v_yes and true or false
1415                if trace_patterns then
1416                    report_initialization("checking patterns for %a with default %a",tag,default)
1417                end
1418                -- patterns is already resolved to parent patterns if applicable
1419                if patterns and patterns ~= "" then
1420                    if l.patterns ~= patterns then
1421                        l.patterns = patterns
1422                        if trace_patterns then
1423                            report_initialization("loading patterns for %a using specification %a",tag,patterns)
1424                        end
1425                        loaddefinitions(tag,l)
1426                    else
1427                        -- unchanged
1428                    end
1429                elseif l.patterns == "" then
1430                    l.patterns = tag
1431                    if trace_patterns then
1432                        report_initialization("loading patterns for %a using tag",tag)
1433                    end
1434                    local ok = loaddefinitions(tag,l)
1435                    if not ok and tag ~= default then
1436                        l.patterns = default
1437                        if trace_patterns then
1438                            report_initialization("loading patterns for %a using default",tag)
1439                        end
1440                        loaddefinitions(tag,l)
1441                    end
1442                end
1443                if goodies and goodies ~= "" then
1444                    goodies = settings_to_array(goodies)
1445                    for i=1,#goodies do
1446                        local goodie = goodies[i]
1447                        -- we can cache this but it doesn't pay off to do so
1448                        languages.addgoodiesfile(tag,goodie)
1449                    end
1450                    languages.setgoodieshandler {
1451                        tag     = tag,
1452                        goodies = tag,
1453                    }
1454                end
1455                l.loaded = true
1456                l.dirty  = false
1457            end
1458            return l.number
1459        else
1460            return 0
1461        end
1462    end
1463
1464    numbers[0] = "null"
1465
1466    registered.null = {
1467        number   = 0,
1468        instance = new_language(0),
1469    }
1470
1471end
1472
1473-- hyphenation.define        ("zerolanguage")
1474-- hyphenation.loadpatterns  ("zerolanguage") -- else bug
1475-- hyphenation.loadexceptions("zerolanguage") -- else bug
1476
languages.logger = languages.logger or { }

-- Returns a space separated list of tag:parent:number entries for the loaded
-- languages (sorted by tag) or "none" when no language has been loaded.
function languages.logger.report()
    local list = { }
    for tag, l in sortedhash(registered) do
        if l.loaded then
            list[#list+1] = format("%s:%s:%s",tag,l.parent,l.number)
        end
    end
    if #list == 0 then
        return "none"
    end
    return concat(list," ")
end
1489
1490-- must happen at the tex end .. will use lang-def.lua
1491
do

    -- each entry holds the three arguments of one languages.associate call

    local associations = {
        { 'en', 'latn', 'eng' },
        { 'uk', 'latn', 'eng' },
        { 'nl', 'latn', 'nld' },
        { 'de', 'latn', 'deu' },
        { 'fr', 'latn', 'fra' },
    }

    for i=1,#associations do
        local a = associations[i]
        languages.associate(a[1],a[2],a[3])
    end

end
1497
-- the load time is reported along with the list of loaded languages; when
-- nothing was loaded nothing is returned

statistics.register("loaded patterns", function()
    local result = languages.logger.report()
    if result == "none" then
        return
    end
    return format("%s, load time: %s",result,statistics.elapsedtime(languages))
end)
1505
1506-- statistics.register("language load time", function()
1507--     -- often zero so we can merge that in the above
1508--     return statistics.elapsedseconds(languages, format(", nofpatterns: %s",nofloaded))
1509-- end)
1510
1511-- interface
1512
-- the number that results from getnumber is passed on to context

implement {
    name      = "languagenumber",
    arguments = "5 strings",
    actions   = { languages.getnumber, context },
}

implement {
    name      = "installedlanguages",
    actions   = { languages.installed, context },
}

-- definition and housekeeping

implement {
    name      = "definelanguage",
    arguments = "2 strings",
    actions   = languages.define,
}

implement {
    name      = "setlanguagesynonym",
    arguments = "2 strings",
    actions   = languages.setsynonym,
}

implement {
    name      = "unloadlanguage",
    arguments = "string",
    actions   = languages.unload,
}

-- setters

implement {
    name      = "setlanguageexceptions",
    arguments = "2 strings",
    actions   = languages.setexceptions,
}

implement {
    name      = "setlanguagepatterns",
    arguments = "2 strings",
    actions   = languages.setpatterns,
}

implement {
    name      = "setlanguageoptions",
    arguments = "2 strings",
    actions   = languages.setoptions,
}
1558}
1559
-- typesets the pre hyphen character of the language that tolang() returns,
-- but only when there is one with a positive character code

implement {
    name    = "currentprehyphenchar",
    actions = function()
        local c = prehyphenchar(tolang())
        if not (c and c > 0) then
            return
        end
        context.char(c)
    end
}
1569
-- typesets the post hyphen character of the language that tolang() returns,
-- but only when there is one with a positive character code

implement {
    name    = "currentposthyphenchar",
    actions = function()
        local c = posthyphenchar(tolang())
        if not (c and c > 0) then
            return
        end
        context.char(c)
    end
}
1579