lang-ini.lmt /size: 50 Kb    last modification: 2021-10-28 13:51
-- Register this file in the global module registry (created on demand).
if not modules then
    modules = { }
end

modules['lang-ini'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- needs a cleanup (share locals)
10-- discard language when redefined
11
12-- 002D : hyphen-minus (ascii)
13-- 002D : hyphen-minus (ascii)
14-- 2010 : hyphen
15-- 2011 : nonbreakable hyphen
16-- 2013 : endash (compound hyphen)
17
18-- todo: no foo:bar but foo(bar,...)
19
20-- https://wortschatz.uni-leipzig.de/de/download/German : lots of lists
21
22local type, tonumber, next = type, tonumber, next
23local utfbyte, utflength = utf.byte, utf.length
24local format, gsub, gmatch, find = string.format, string.gsub, string.gmatch, string.find
25local concat, sortedkeys, sortedhash, keys, insert, tohash = table.concat, table.sortedkeys, table.sortedhash, table.keys, table.insert, table.tohash
26local setmetatableindex = table.setmetatableindex
27local utfvalues, strip, utfcharacters = string.utfvalues, string.strip, utf.characters
28
29local context   = context
30local commands  = commands
31local implement = interfaces.implement
32
33local settings_to_array = utilities.parsers.settings_to_array
34local settings_to_set   = utilities.parsers.settings_to_set
35
36local trace_patterns = false  trackers.register("languages.patterns", function(v) trace_patterns = v end)
37local trace_goodies  = false  trackers.register("languages.goodies",  function(v) trace_goodies  = v end)
38local trace_applied  = false  trackers.register("languages.applied",  function(v) trace_applied  = v end)
39
40local report_initialization = logs.reporter("languages","initialization")
41local report_goodies        = logs.reporter("languages","goodies")
42
43local prehyphenchar    = language.prehyphenchar    -- global per language
44local posthyphenchar   = language.posthyphenchar   -- global per language
45local preexhyphenchar  = language.preexhyphenchar  -- global per language
46local postexhyphenchar = language.postexhyphenchar -- global per language
47----- lefthyphenmin    = language.lefthyphenmin
48----- righthyphenmin   = language.righthyphenmin
49local sethjcode        = language.sethjcode
50local currentlanguage  = language.current -- or function() return tex.normallanguage or tex.language end
51
52local uccodes          = characters.uccodes
53local lccodes          = characters.lccodes
54
55local new_language     = language.new
56
-- The public namespace: an already existing global table is reused.

languages       = languages or {}
local languages = languages

languages.version = 1.010

-- The registries below are reused when they already exist and are handed to
-- storage.register under the names given there:
--
--   registered : tag    -> language record (see languages.define)
--   associated : tag    -> { script, language }
--   numbers    : number -> tag
--   data       : not filled in the code shown here

local registered = languages.registered or { }
local associated = languages.associated or { }
local numbers    = languages.numbers    or { }
local data       = languages.data       or { }

languages.registered = registered
languages.associated = associated
languages.numbers    = numbers
languages.data       = data

storage.register("languages/registered",registered,"languages.registered")
storage.register("languages/associated",associated,"languages.associated")
storage.register("languages/numbers",   numbers,   "languages.numbers")
storage.register("languages/data",      data,      "languages.data")

-- interface constants

local variables = interfaces.variables

local v_reset   = variables.reset
local v_yes     = variables.yes

-- counts the definition files that were loaded successfully

local nofloaded = 0
85
-- Looks up the record registered for 'tag' and makes sure that it has an
-- engine language instance, creating one from the record's number on first
-- use. Returns the record and the instance, or nil, nil for an unknown tag.

local function resolve(tag)
    local entry = registered[tag]
    if not entry then
        return nil, nil
    end
    local instance = entry.instance
    if not instance then
        instance = new_language(entry.number)
        entry.instance = instance
    end
    return entry, instance
end
97
-- Converts 'what' into a language instance. Accepted are: nothing (the
-- current language is used), an instance (userdata, returned as is), a
-- language number or a tag. Nothing is returned when there is no matching
-- record.

local function tolang(what) -- returns lang object
    what = what or currentlanguage()
    if type(what) == "userdata" then
        return what
    end
    local tag   = numbers[what]
    local entry = tag and registered[tag] or registered[what]
    if not entry then
        return
    end
    local instance = entry.instance
    if not instance then
        -- created lazily, like resolve does
        instance = new_language(entry.number)
        entry.instance = instance
    end
    return instance
end
116
-- Returns the record for a tag or language number; without argument the
-- record of the current language is returned.

function languages.getdata(tag) -- or number
    if not tag then
        return registered[numbers[currentlanguage()]]
    end
    return registered[tag] or registered[numbers[tag]]
end

languages.tolang = tolang
126
127-- patterns=en
128-- patterns=en,de
129
-- Returns the data string stored in loaded[what] (what is "patterns" or
-- "exceptions" in the calls in this file), decompressed when the dataset
-- says it is zlib compressed. Nothing is returned when there is no dataset,
-- when the data is missing or empty, or when decompression fails.
--
-- A length mismatch after decompression is reported but the data is still
-- returned. The report mentions the dataset name and the language tag.

local function validdata(loaded,what,tag)
    local dataset = loaded[what]
    if not dataset then
        return
    end
    local data = dataset.data
    if not data or data == "" then
        return
    end
    if dataset.compression ~= "zlib" then
        return data
    end
    local unpacked = zlib.decompress(data)
    if not unpacked then
        -- avoid taking the length of nil below
        report_initialization("compression error in %a for language %a",what,tag)
        return
    end
    if dataset.length and dataset.length ~= #unpacked then
        report_initialization("compression error in %a for language %a",what,tag)
    end
    return unpacked
end
147
148-- languages.hjcounts[unicode].count
149
150-- hjcode: 0       not to be hyphenated
151--         1--31   length
152--         32      zero length
153--         > 32    hyphenated with length 1
154
-- Assigns hj codes on 'instance' for the characters listed in
-- loaded[what].characters. That field can be a string, a list of code
-- points and/or characters, or a hash keyed by character.
--
-- When 'factor' is set and languages.hjcounts has a count for a character,
-- that count determines the hj code (see the table above: 32 for counts
-- <= 0, at most 31 otherwise). The codes assigned to uppercase variants are
-- also recorded in loaded.codehash.

local function sethjcodes(instance,loaded,what,factor)
    local dataset = loaded[what]
    local list    = dataset and dataset.characters
    if not list then
        return
    end
    local hjcounts = factor and languages.hjcounts or false
    local codehash = loaded.codehash
    if not codehash then
        codehash = { }
        loaded.codehash = codehash
    end
    --
    local function setcode(code)
        local lc = lccodes[code] -- just in case we get a mixture
        local uc = uccodes[code] -- just in case we get a mixture
        local hj = lc
        if type(hj) ~= "number" then
            -- no numeric lowercase mapping: use the code itself
            lc = code
            hj = code
        end
        if hjcounts then
            local entry = hjcounts[hj]
            local count = entry and entry.count
            if count then
                if count <= 0 then
                    -- counts as 0 i.e. ignored
                    hj = 32
                elseif count >= 31 then
                    -- counts as 31
                    hj = 31
                else
                    -- count that many times
                    hj = count
                end
            end
        end
        sethjcode(instance,lc,hj)
        if uc ~= lc and type(uc) == "number" then
            sethjcode(instance,uc,hj)
            codehash[uc] = hj
        end
    end
    --
    -- tex.savinghyphcodes is set to zero while we assign and restored after
    local saved = tex.savinghyphcodes
    tex.savinghyphcodes = 0
    local kind = type(list)
    if kind == "string" then
        for code in utfvalues(list) do
            setcode(code)
        end
    elseif kind == "table" then
        local size = #list
        if size > 0 then
            -- list: { U, U, U, "chr", "chr", ... }
            for i=1,size do
                local entry = list[i]
                setcode(type(entry) == "string" and utfbyte(entry) or entry)
            end
        else
            -- hash: { ["chr"] = true, ... }
            for character, enabled in sortedhash(list) do
                if enabled then
                    setcode(utfbyte(character))
                end
            end
        end
    end
    tex.savinghyphcodes = saved
end
225
-- Gives every character in 'characters' an hj code equal to its own code
-- point. The argument can be a string, a list of code points and/or
-- characters, or a hash with characters or code points as keys. Other
-- types are ignored.

local function addhjcodestoinstance(instance,characters)
    local kind = type(characters)
    if kind == "string" then
        for code in utfvalues(characters) do
            sethjcode(instance,code,code)
        end
    elseif kind == "table" then
        local size = #characters
        if size > 0 then
            -- list: { U, U, U, "chr", "chr", ... }
            for index=1,size do
                local entry = characters[index]
                local code  = type(entry) == "string" and utfbyte(entry) or entry
                sethjcode(instance,code,code)
            end
        else
            -- hash: { ["chr"] = true, ... }
            for key, enabled in next, characters do
                if enabled then
                    local code = type(key) == "string" and utfbyte(key) or key
                    sethjcode(instance,code,code)
                end
            end
        end
    end
end
251
252-- 2'2 conflicts with 4' ... and luatex barks on it
253
local P            = lpeg.P
local S            = lpeg.S
local R            = lpeg.R
local C            = lpeg.C
local Cs           = lpeg.Cs
local Ct           = lpeg.Ct
local lpegmatch    = lpeg.match
local lpegpatterns = lpeg.patterns

local utfsplit = utf.split

-- building blocks for the conflict detection in 'unique'

local space       = lpegpatterns.space
local whitespace  = lpegpatterns.whitespace^1
local nospace     = lpegpatterns.utf8char - whitespace
local digit       = lpegpatterns.digit
----- endofstring = #whitespace + P(-1)
local endofstring = #whitespace

-- word    : a pattern word in which leading digits and a digit directly
--           before whitespace are dropped and other digits become a space
-- anyword : a run of non whitespace
-- analyze : collects the substituted words of a string in a table

local word    = (digit/"")^0 * (digit/"" * endofstring + digit/" " + nospace)^1
local anyword = (1-whitespace)^1
local analyze = Ct((whitespace + Cs(word))^1)
268
-- Merges the pattern strings in the list 'loaded' into one string. When
-- there is more than one string, words that occur more than once after
-- 'analyze' has normalised their digits are treated as conflicts: they are
-- reported and every occurrence is removed from the result.
--
-- Note that the list passed in gets a " " inserted at index 1.

local function unique(tag,requested,loaded)
    local count = #loaded
    if count == 0 then
        return ""
    end
    if count == 1 then
        return loaded[1]
    end
    insert(loaded,1," ") -- no need then for special first word
    loaded = concat(loaded," ")
    local words   = lpegmatch(analyze,loaded) or { }
    local seen    = { }
    local clashes = { }
    for index=1,#words do
        local candidate = words[index]
        local times     = seen[candidate]
        if not times then
            seen[candidate] = 1
        elseif times == 1 then
            -- register a clash only once, at the second occurrence
            seen[candidate] = 2
            clashes[#clashes+1] = utfsplit(candidate," ")
        end
    end
    local nofclashes = #clashes
    if nofclashes == 0 then
        return loaded
    end
    -- one alternative per clash: its parts with a digit in between
    local conflicting = nil
    for index=1,nofclashes do
        local parts   = clashes[index]
        local pattern = P(parts[1])
        for part=2,#parts do
            pattern = pattern * digit * P(parts[part])
        end
        if conflicting then
            conflicting = conflicting + pattern
        else
            conflicting = pattern
        end
        report_initialization("language %a, patterns %a, discarding conflict (0-9)%{[0-9]}t(0-9)",tag,requested,parts)
    end
    words, seen, clashes = nil, nil, nil -- permit gc
    local someword = digit^0 * conflicting * digit^0 * endofstring / ""
    local cleaner  = Cs((someword + anyword + whitespace)^1)
    return lpegmatch(cleaner,loaded) or loaded
end
319
-- The shared exceptions from lang-exc.lua: false as long as no attempt to
-- load them was made, a string when loading succeeded, true otherwise.

local shared = false

-- Loads the pattern and exception definitions listed in
-- specification.patterns (a comma separated list, e.g. "en" or "en,de")
-- into the language instance that belongs to 'tag'.
--
-- For each definition the file lang-<definition>.lua (or its .gz variant)
-- is looked up. Its hj codes are set and its patterns and exceptions are
-- appended to what the instance already has. The special definition
-- 'reset' clears what was collected so far. A definition is processed only
-- once per language (tracked in data.used).
--
-- Returns true when at least one file was loaded, false when none was, and
-- nil when there was nothing to load or the tag is not registered. Timing
-- is stopped on every exit path.

local function loaddefinitions(tag,specification)
    statistics.starttiming(languages)
    local data, instance = resolve(tag)
    local requested   = specification.patterns or ""
    local definitions = settings_to_array(requested)
    local result      = nil
    if #definitions == 0 then
        if trace_patterns then
            report_initialization("no definitions for language %a",tag)
        end
    elseif not data then
        -- without a record there is no instance to load into
        report_initialization("unknown language %a, patterns %a not loaded",tag,requested)
    else
        if trace_patterns then
            report_initialization("pattern specification for language %a: %s",tag,specification.patterns)
        end
        -- start from what the instance already has
        local ploaded = instance:patterns()
        local eloaded = instance:hyphenation()
        if not ploaded or ploaded == ""  then
            ploaded = { }
        else
            ploaded = { ploaded }
        end
        if not eloaded or eloaded == ""  then
            eloaded = { }
        else
            eloaded = { eloaded }
        end
        local dataused  = data.used
        local ok        = false
        local resources = data.resources or { }
        data.resources  = resources
        if not shared then
            -- an empty string means "not found", as in the lookup below
            local found = resolvers.findfile("lang-exc.lua")
            if found and found ~= "" then
                shared = dofile(found)
                if type(shared) == "table" then
                    shared = concat(shared," ")
                else
                    shared = true
                end
            else
                shared = true
            end
        end
        for i=1,#definitions do
            local definition = definitions[i]
            if definition == "" then
                -- error
            elseif definition == v_reset then
                if trace_patterns then
                    report_initialization("clearing patterns for language %a",tag)
                end
                instance:clearpatterns()
                instance:clearhyphenation()
                ploaded = { }
                eloaded = { }
            elseif not dataused[definition] then
                dataused[definition] = definition
                local filename = "lang-" .. definition .. ".lua"
                local fullname = resolvers.findfile(filename) or ""
                if fullname == "" then
                    fullname = resolvers.findfile(filename .. ".gz") or ""
                end
                if fullname ~= "" then
                    if trace_patterns then
                        report_initialization("loading definition %a for language %a from %a",definition,tag,fullname)
                    end
                    local suffix, gzipped = gzip.suffix(fullname)
                    local loaded = table.load(fullname,gzipped and gzip.load)
                    if loaded then -- todo: version test
                        ok, nofloaded = true, nofloaded + 1
                        sethjcodes(instance,loaded,"patterns",specification.factor)
                        sethjcodes(instance,loaded,"exceptions",specification.factor)
                        local p = validdata(loaded,"patterns",tag)
                        local e = validdata(loaded,"exceptions",tag)
                        if p and p ~= "" then
                            ploaded[#ploaded+1] = p
                        end
                        if e and e ~= "" then
                            eloaded[#eloaded+1] = e
                        end
                        resources[#resources+1] = loaded -- so we can use them otherwise
                    else
                        report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                    end
                elseif trace_patterns then
                    report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                end
            elseif trace_patterns then
                report_initialization("definition %a for language %a already loaded",definition,tag)
            end
        end
        if #ploaded > 0 then
            -- why not always clear
            instance:clearpatterns()
            instance:patterns(unique(tag,requested,ploaded))
        end
        if #eloaded > 0 then
            -- why not always clear
            instance:clearhyphenation()
            instance:hyphenation(concat(eloaded," "))
        end
        if type(shared) == "string" then
            instance:hyphenation(shared)
        end
        result = ok
    end
    statistics.stoptiming(languages)
    return result
end
427
-- The number of defined languages is kept in storage.shared and cached in
-- a local.

storage.shared.noflanguages = storage.shared.noflanguages or 0

local noflanguages = storage.shared.noflanguages

-- Registers a new language under 'tag' with the next free number. The
-- engine instance is not created here; that happens on demand in resolve
-- and tolang.

function languages.define(tag,parent)
    local number = noflanguages + 1
    noflanguages = number
    if trace_patterns then
        report_initialization("assigning number %a to %a",number,tag)
    end
    numbers[number] = tag
    registered[tag] = {
        tag      = tag,
        parent   = parent or "",
        patterns = "",
        loaded   = false,
        used     = { },
        dirty    = true,
        number   = number,
        instance = nil, -- luatex data structure
        synonyms = { },
    }
    storage.shared.noflanguages = number
end
451
-- Marks 'synonym' as a synonym in the record of 'tag'; unknown tags are
-- silently ignored.

function languages.setsynonym(synonym,tag) -- convenience function
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.synonyms[synonym] = true -- maybe some day more info
end
458
-- Returns the sorted tags of all registered languages as one string, by
-- default separated by commas.

function languages.installed(separator)
    local tags = sortedkeys(registered)
    return concat(tags,separator or ",")
end
462
-- Returns the tag that belongs to language number 'n'. The current language
-- is used when 'n' is absent or cannot be converted to a number.

function languages.current(n)
    local number = n and tonumber(n)
    if not number then
        number = currentlanguage()
    end
    return numbers[number]
end
466
-- Stores a { script, language } pair for 'tag'.

function languages.associate(tag,script,language) -- not yet used
    local pair = { script, language }
    associated[tag] = pair
end
470
-- Returns the script and language stored by languages.associate. The
-- argument can be a tag or a language number; without argument the current
-- language is used. Nothing is returned when there is no association.

function languages.association(tag) -- not yet used
    if not tag then
        tag = numbers[currentlanguage()]
    elseif type(tag) == "number" then
        tag = numbers[tag]
    end
    local pair = tag and associated[tag]
    if not pair then
        return
    end
    return pair[1], pair[2]
end
482
-- Returns true when 'tag' is registered (synonyms are not checked) and the
-- file lang-<patterns>.lua of its record can be found, false otherwise.
--
-- An empty string from resolvers.findfile counts as "not found", which is
-- how the loader in this file treats it. A plain truth test would accept
-- it, as an empty string is true in Lua. The 'defaultlanguage' argument is
-- not used.

function languages.loadable(tag,defaultlanguage) -- hack
    local l = registered[tag] -- no synonyms
    if not l then
        return false
    end
    local found = resolvers.findfile("lang-"..l.patterns..".lua")
    if found and found ~= "" then
        return true
    else
        return false
    end
end
491
492-- a bit messy, we will do all language setting in lua as we can now assign
493-- and 'patterns' will go away here.
494
-- Flags the record of 'tag' as dirty; unknown tags are ignored.

function languages.unload(tag)
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.dirty = true
end
501
-- not that useful, global values
503
-- Each of these passes the language instance that tolang finds for 'what'
-- (an instance, number, tag or nothing) to the engine function of the same
-- name.

function languages.prehyphenchar(what)
    return prehyphenchar(tolang(what))
end

function languages.posthyphenchar(what)
    return posthyphenchar(tolang(what))
end

function languages.preexhyphenchar(what)
    return preexhyphenchar(tolang(what))
end

function languages.postexhyphenchar(what)
    return postexhyphenchar(tolang(what))
end
508-------- languages.lefthyphenmin   (what) return lefthyphenmin   (tolang(what)) end
509-------- languages.righthyphenmin  (what) return righthyphenmin  (tolang(what)) end
510
511-- e['implementer']= 'imple{m}{-}{-}menter'
512-- e['manual'] = 'man{}{}{}'
513-- e['as'] = 'a-s'
514-- e['user-friendly'] = 'user=friend-ly'
515-- e['exceptionally-friendly'] = 'excep-tionally=friend-ly'
516
-- characters that are part of the exception syntax and get no hj code

local invalid = { "{", "}", "(", ")", "-", " " }

-- Collects the characters used in 'str' (a string, or a list of characters)
-- in data.extras.characters, drops the syntax characters listed above and
-- then sets the hj codes for that collection on data.instance.

local function collecthjcodes(data,str)
    local extras = data.extras
    local found  = extras and extras.characters or { }
    local kind   = type(str)
    if kind == "string" then
        for character in utfcharacters(str) do
            found[character] = found[character] or true
        end
    elseif kind == "table" then
        for index=1,#str do
            local character = str[index]
            found[character] = found[character] or true
        end
    end
    for index=1,#invalid do -- less checks this way
        local character = invalid[index]
        if found[character] then
            found[character] = nil
        end
    end
    data.extras = { characters = found }
    sethjcodes(data.instance,data,"extras",data.factor)
end
544
-- Loads hyphenation exceptions for 'tag' from a file. A file that cannot be
-- read counts as empty. Nothing happens for an unknown tag.

function languages.loadwords(tag,filename)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    statistics.starttiming(languages)
    local words = io.loaddata(filename) or ""
    collecthjcodes(data,words)
    instance:hyphenation(words)
    statistics.stoptiming(languages)
end
555
556
-- Adds the hyphenation exceptions in 'str' to the language 'tag', after
-- registering hj codes for the characters used. Nothing happens for an
-- unknown tag.

function languages.setexceptions(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local exceptions = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,exceptions)
    instance:hyphenation(exceptions)
end
565
-- Adds the patterns in 'str' to the language 'tag', after registering hj
-- codes for the characters used. Nothing happens for an unknown tag.

function languages.setpatterns(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local patterns = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,patterns)
    instance:patterns(patterns)
end
574
-- Installs 'action' as word handler on the instance of language 'tag';
-- unknown tags are ignored.

local function setwordhandler(tag,action)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    instance:setwordhandler(action)
end

languages.setwordhandler = setwordhandler
583
-- Registers 'str' as goodies words for 'tag' and then installs the goodies
-- handler for that language, using the tag as goodies name too.

function languages.setoptions(tag,str)
    local goodies = { { words = str } }
    languages.addgoodiesdata(tag,goodies)
    -- for now:
    languages.setgoodieshandler({ tag = tag, goodies = tag })
end
589
-- Returns what the instance of language 'tag' makes of 'str'; for an
-- unknown tag the string is returned as is.

function languages.hyphenate(tag,str)
    -- todo: does this still work?
    local data, instance = resolve(tag)
    if not data then
        return str
    end
    return instance:hyphenate(str)
end
599
600-- This code is here for some testing (and discussion) but it might end up in its
601-- own module. I wrote it after listening to the end March 2021 live concert of
602-- Mandoki Soulmates: Hungarian Pictures (music is the greatest unifier) with his
603-- usual incredible international lineup. After that, and realizing that we needed
604-- to deal better with some language issues as follow up on a mailing list thread, I
605-- needed only a few loops of relistening the concert to implement it. In
-- retrospect this was a language feature that should have been there a while ago.
607
608do
609
610    local nuts                  = nodes.nuts
611    local nextglyph             = nuts.traversers.glyph
612    local setoptions            = nuts.setoptions
613
614    local getnext        = nuts.getnext
615    local getprev        = nuts.getprev
616    local setchar        = nuts.setchar
617    local setnext        = nuts.setnext
618    local setlink        = nuts.setlink
619    local setfield       = nuts.setfield
620    local setdisc        = nuts.setdisc
621    local getprop        = nuts.getprop
622    local setprop        = nuts.setprop
623    local setattrlist    = nuts.setattrlist
624
625    local new_disc       = nuts.pool.disc
626    local new_glyph      = nuts.pool.glyph
627    local copy_node      = nuts.copy
628    local flushlist      = nuts.flushlist
629
630    local glyphoptioncodes      = tex.glyphoptioncodes
631
632    local lower                 = characters.lower
633    local replacer              = utf.replacer
634    local utfchartabletopattern = lpeg.utfchartabletopattern
635
636    local report                = logs.reporter("languages","goodies")
637
638    -- can be shared
639
640    local goodiesdata = setmetatableindex(function(t,k)
641        local v = {
642            properties   = { },
643            replacements = { },
644            characters   = { },
645            exceptions   = { },
646        }
647        t[k] = v
648        return v
649    end)
650
651    -- can be a helper
652
653    local compound_disc_code = tex.discoptioncodes.preword | tex.discoptioncodes.postword
654
655    local function setcompound(current,id,first,last,lh,rh,hyphen)
656        local prev     = getprev(current)
657     -- local language = tolang(id)
658     -- local prechar  = prehyphenchar(language)
659     -- local postchar = posthyphenchar(language)
660        local prechar  = prehyphenchar(id)
661        local postchar = posthyphenchar(id)
662        local pre      = prechar  and copy_node(current)
663        local post     = postchar and copy_node(current)
664        local replace  = hyphen and prechar and copy_node(current)
665        local disc     = new_disc()
666        if pre then
667            setchar(pre,prechar)
668        end
669        if post then
670            setchar(post,postchar)
671        end
672        if replace then
673            setchar(replace,prechar)
674        end
675        setattrlist(disc,current)
676        setoptions(disc,0x3) -- todo foo_code
677        setdisc(disc,pre,post,replace)
678        setlink(prev,disc,current)
679        if lh then
680            setfield(first,"rhmin",rh)
681        end
682
683        if rh then
684            setfield(current,"lhmin",lh)
685        end
686
687    end
688
689    local setcompounds = setmetatableindex(function(t,l)
690        local v = setmetatableindex(function(t,r)
691            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r) end
692            t[r] = v
693            return v
694        end)
695        t[l] = v
696        return v
697    end)
698
699    local sethyphens = setmetatableindex(function(t,l)
700        local v = setmetatableindex(function(t,r)
701            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r,true) end
702            t[r] = v
703            return v
704        end)
705        t[l] = v
706        return v
707    end)
708
709    local function replaceword(first,last,old,new,oldlen)
710        local oldlen = utflength(old)
711        local newlen = utflength(new)
712        if newlen == 0 then
713            -- forget about it
714        elseif newlen <= oldlen then
715            for s in utfvalues(new) do
716                setchar(first,s)
717                first = getnext(first)
718            end
719            if newlen < oldlen then
720                -- first is one ahead
721                local after  = getnext(last)
722                local before = getprev(first)
723                setnext(last)
724                setlink(before,after)
725                flushlist(first)
726            end
727        else
728            local i = 0
729            local l = getnext(last)
730            for s in utfvalues(new) do
731                i = i + 1
732                if i > oldlen then
733                    local g = copy_node(first)
734                    setlink(first,g,l)
735                    setchar(g,s)
736                    first = g
737                elseif i == oldlen then
738                    setchar(first,s)
739                else
740                    setchar(first,s)
741                    first = getnext(first)
742                end
743            end
744        end
745    end
746
747 -- local optioncodes = table.copy(glyphoptioncodes)
748 --
749 -- optioncodes.nokerns     = optioncodes.noleftkern     | optioncodes.norightkern
750 -- optioncodes.noligatures = optioncodes.noleftligature | optioncodes.norightligature
751
752    local lh, rh = false, false
753
754    local cache = setmetatableindex(function(t,k)
755        local v = 0
756        if k == "compound" then
757            v = setcompounds[lh][rh]
758        elseif k == "hyphen" then
759            v = sethyphens[lh][rh]
760        else
761            v = 0
762            for s in gmatch(k,"%w+") do
763                local o = glyphoptioncodes[s]
764             -- local o = optioncodes[s]
765                if o then
766                    v = v | o
767                end
768            end
769        end
770        t[k] = v
771        return v
772    end)
773
774    local function checkglyphproperties(options)
775        -- we sort, just to be sure
776        for word, list in sortedhash(options) do
777            if type(list) == "string" then
778                options[word] = options[list]
779            else
780                for index, option in sortedhash(list) do
781                    if type(option) == "string" then
782                        list[index] = cache[option]
783                    end
784                end
785            end
786        end
787    end
788
789    -- statistics.starttiming(languages)
790    -- statistics.stoptiming(languages)
791
792    -- 1: restart 2: exceptions+patterns 3: patterns *: next word
793
794    local sequencers    = utilities.sequencers
795    local newsequencer  = sequencers.new
796    local appendgroup   = sequencers.appendgroup
797    local appendaction  = sequencers.appendaction
798    local enableaction  = sequencers.enableaction
799    local disableaction = sequencers.disableaction
800
801    local template = {
802        arguments    = "s",
803        returnvalues = "r,i",
804        results      = "r,i",
805    }
806
807    local registeredactions = setmetatableindex ( function(t,tag)
808        local actions = newsequencer(template)
809        appendgroup(actions,"user")
810        t[tag] = actions
811        return actions
812    end )
813
814    languages.registeredactions = registeredactions
815
816    function languages.installhandler(tag,func)
817        local todo = not rawget(registeredactions,tag)
818        local actions = registeredactions[tag]
819        appendaction(actions,"user",func)
820        enableaction(actions,func)
821        report("installing handler %a for language %a",func,tag)
822        if todo then
823            setwordhandler(tag,function(n,original,remapped,length,first,last)
824                local runner = actions.runner
825                if runner then
826                    if getprop(first,"replaced") then
827                        -- maybe some deadcycles
828                    else
829                        local r, result = runner(original)
830                        if not r or original == r then
831                            return result or 0
832                        else
833                            setprop(first,"replaced",true)
834                            replaceword(first,last,original,r,length)
835                            return 1
836                        end
837                    end
838                end
839                return 2
840            end)
841        end
842    end
843
844    local appliedoptions     = setmetatableindex("table")
845    languages.appliedoptions = appliedoptions
846
847    languages.setgoodieshandler = function(specification) -- will become a table specifier
848        if type(specification) == "table" then
849            local tag          = specification.tag
850            local goodies      = specification.goodies or tag
851            local result       = specification.result or 2
852            local data         = goodiesdata[goodies]
853            local properties   = data.properties
854            local replacements = data.replacements
855            local characters   = data.characters
856            local exceptions   = data.exceptions
857            local replacer     = nil
858            local d, instance  = resolve(tag)
859            local done         = false
860            -- check if something at all
861            if type(characters) == "table" and characters and next(characters) then
862                addhjcodestoinstance(instance,characters)
863                if trace_goodies then
864                    report_goodies("registering %a characters for %a",goodies,tag)
865                end
866                done = true
867            end
868            if type(properties) == "table" and next(properties) then
869                checkglyphproperties(properties) -- checks in place!
870                if trace_goodies then
871                    report_goodies("registering %a properties for %a",goodies,tag)
872                end
873                done = true
874            end
875            if type(replacements) == "table" and next(replacements) then
876                replacer = Cs((utfchartabletopattern(replacements) / replacements + 1)^0)
877                if trace_goodies then
878                    report_goodies("registering %a replacer for %a",goodies,tag)
879                end
880                done = true
881            end
882            if type(exceptions) == "table" and next(exceptions) then
883                done = true
884            else
885                exceptions = false
886            end
887            if done then
888                local registered = registeredactions[tag]
889                local applied    = appliedoptions[tag]
890                setwordhandler(tag,function(n,original,remapped,length,first,last)
891                    local runner = registered.runner
892                    if runner then
893                        if getprop(first,"replaced") then
894                            -- maybe some deadcycles
895                        else
896                            local r, result = runner(original)
897                            if not r then
898                                if trace_goodies then
899                                    report_goodies("kept by runner: %s => %s, result %i",original,remapped, result or 0)
900                                end
901                                return result or 0
902                            elseif original == r then
903                                if result then
904                                    if trace_goodies then
905                                        report_goodies("kept by runner: %s => %s, result %i",original,remapped, result)
906                                    end
907                                    return result
908                                else
909                                    if trace_goodies then
910                                        report_goodies("kept by runner: %s => %s, continue",original,remapped)
911                                    end
912                                end
913                            else
914                                if trace_goodies then
915                                    report_goodies("replaced by runner: %s => %s => %s, restart",original,remapped,r)
916                                end
917                                setprop(first,"replaced",true)
918                                replaceword(first,last,original,r,length)
919                                return 1
920                            end
921                        end
922                    end
923                    local result = 2
924                    local o = properties[remapped]
925                    if o then
926                        if trace_goodies then
927                            report("properties: %s %s",original,remapped)
928                        end
929                        if trace_applied then
930                            applied[original] = (applied[original] or 0) + 1
931                        end
932                        local index = 0
933                        for g, c in nextglyph, first do
934                            index = index + 1
935                            local oi = o[index]
936                            if oi then
937                                if type(oi) == "function" then
938                                    oi(g,n,first,last) -- maybe return value
939                                    result = 1
940                                else
941                                    setoptions(g,oi)
942                                end
943                            end
944                            if g == last then
945                                break
946                            end
947                        end
948                    elseif replacer then
949                        -- todo: check lengths so that we can avoid a check
950                        if getprop(first,"replaced") then
951                            -- maybe some deadcycles
952                        else
953                            local r = lpegmatch(replacer,original)
954                            if original == r then
955                                if trace_goodies then
956                                    report_goodies("kept: %s => %s",original,remapped)
957                                end
958                            else
959                                if trace_goodies then
960                                    report_goodies("replaced: %s => %s => %s",original,remapped,r)
961                                end
962                                setprop(first,"replaced",true)
963                                replaceword(first,last,original,r,length)
964                                result = 1
965                            end
966                        end
967                    elseif exceptions then
968                        local exception = exceptions[original]
969                        if exception then
970                            if trace_goodies then
971                                report_goodies("exception: %s => %s",original,exception)
972                            end
973                            result = exception
974                        else
975                            result = 3
976                        end
977                    else
978                        if trace_goodies then
979                            report_goodies("ignored: %s => %s",original,remapped)
980                        end
981                    end
982                    return result
983                end)
984            elseif trace_goodies then
985                report_goodies("nothing useable in %a for %a",goodies,tag)
986            end
987        else
988            setwordhandler(tag)
989        end
990    end
991
    -- the glyph option codes that applyaction stores per character position

    local norightligature_option = glyphoptioncodes.norightligature
    local noleftligature_option  = glyphoptioncodes.noleftligature
    local norightkern_option     = glyphoptioncodes.norightkern
    local noleftkern_option      = glyphoptioncodes.noleftkern
996
997    local function applyaction(oc,v,n)
998        if oc == "noligature" then
999            if n > 0 then
1000                local vv = v[n-1]
1001                if vv then
1002                    v[n-1] = vv | norightligature_option
1003                else
1004                    v[n-1] = norightligature_option
1005                end
1006            end
1007            v[n] = noleftligature_option
1008        elseif oc == "compound" then
1009            if n > 1 then
1010             -- v[n] = setcompound
1011                v[n] = setcompounds[lh][rh]
1012                return true
1013            end
1014        elseif oc == "hyphen" then
1015            if n > 1 then
1016                v[n] = sethyphens[lh][rh]
1017                return true
1018            end
1019        elseif oc == "nokern" then
1020            if n > 0 then
1021                local vv = v[n-1]
1022                if vv then
1023                    v[n-1] = vv | norightkern_option
1024                else
1025                    v[n-1] = norightkern_option
1026                end
1027            end
1028            v[n] = noleftkern_option
1029        elseif oc == "noleftkern" then
1030            v[n] = noleftkern_option
1031        elseif oc == "norightkern" then
1032            if n > 0 then
1033                local vv = v[n-1]
1034                if vv then
1035                    v[n-1] = vv | norightkern_option
1036                else
1037                    v[n-1] = norightkern_option
1038                end
1039            end
1040        else
1041            for s in gmatch(oc,"%w+") do
1042                if applyaction(s,v,n) then
1043                    return
1044                end
1045            end
1046        end
1047    end
1048
    -- The markers that can be put between the characters of a goodie word and
    -- the action that each of them maps onto by default:
    --
    -- a|b : a:norightligature b:noleftligature
    -- a=b : a:norightkern     b:noleftkern
    -- a<b :                   b:noleftkern
    -- a>b : a:norightkern
    -- a-b : hyphen
    -- a+b : compound

    local actions = {
        ["|"] = "noligature",
        ["="] = "nokern",
        ["<"] = "noleftkern",
        [">"] = "norightkern",
        ["+"] = "compound",
        ["-"] = "hyphen",
    }
1064
1065    local function analyzed(m,a,t,k)
1066        local v = { }
1067        local n = 1
1068        if m == true then
1069            for c in gmatch(k,".") do
1070                local ac = a[c]
1071                if not ac then
1072                    n = n + 1
1073                else
1074                    applyaction(ac,v,n)
1075                end
1076            end
1077        elseif type(m) == "number" then
1078            local i = 0
1079            for c in gmatch(k,".") do
1080                local ac = a[c]
1081                if not ac then
1082                    n = n + 1
1083                else
1084                    i = i + 1
1085                    if i == m then
1086                        applyaction(ac,v,n)
1087                        break
1088                    end
1089                end
1090            end
1091        elseif type(m) == "table" then
1092            -- happens here, otherwise no stable caching key, we could hash these too
1093            m = tohash(m)
1094            local i = 0
1095            for c in gmatch(k,".") do
1096                local ac = a[c]
1097                if not ac then
1098                    n = n + 1
1099                else
1100                    i = i + 1
1101                    if m[i] then
1102                        applyaction(ac,v,n)
1103                    end
1104                end
1105            end
1106        else
1107            -- error
1108        end
1109        t[k] = v
1110        return v
1111    end
1112
1113    local cache = setmetatableindex(function(t,m)
1114        local v = setmetatableindex(function(t,a)
1115            local v = setmetatableindex(function(t,k)
1116                return analyzed(m,a,t,k)
1117            end)
1118            t[m] = v
1119            return v
1120        end)
1121        t[m] = v
1122        return v
1123    end)
1124
    -- maybe also a skip symbol

    -- replace1 : drops the markers (|=<>+-. and digits) and keeps all other utf characters
    -- replace2 : keeps the markers and turns every other utf character into a period

    local replace1 = Cs ( ( S("|=<>+-.0123456789")/"" + lpegpatterns.utf8character    )^0 )
    local replace2 = Cs ( ( S("|=<>+-.0123456789")    + lpegpatterns.utf8character/".")^0 )
1129
1130    local function stripped(str)
1131        -- todo : lpeg
1132        str = gsub(str,"%-%-[^\n]*\n","")
1133        str = gsub(str,"%%[^\n]*\n","")
1134        str = gsub(str,"%s+"," ")
1135        str = gsub(str,"^%s+","")
1136        str = gsub(str,"%s+$","")
1137        return str
1138    end
1139
1140    local registerexceptions  do
1141
1142        local lbrace   = P("{")
1143        local rbrace   = P("}")
1144        local lbracket = P("[")
1145        local rbracket = P("]")
1146        local lparent  = P("(")
1147        local rparent  = P(")")
1148        local hyphen   = P("-")
1149
1150        local p = Cs ( (
1151            lbrace *  ((1-rbrace)^0) * rbrace
1152          * lbrace *  ((1-rbrace)^0) * rbrace
1153          * lbrace * C((1-rbrace)^0) * rbrace * (lparent * C((1-rparent)^0) * rparent)^0 / function(a,b) return b or a end
1154          + (lbracket * (1-rbracket)^0 * rbracket) / ""
1155          + hyphen / ""
1156          + lpegpatterns.utf8character
1157        )^0 )
1158
1159        registerexceptions = function(target,str)
1160            local kind = type(str)
1161            if kind == "string" then
1162                for v in gmatch(stripped(str),"%S+") do
1163                    local k = lpegmatch(p,v)
1164                    if k ~= v then
1165                        target[k] = v
1166                    end
1167                end
1168            elseif kind == "table" then
1169                local n = #str
1170                if n > 0 then
1171                    for i=1,n do
1172                        local v = str[i]
1173                        local k = lpegmatch(p,v)
1174                        if k ~= v then
1175                            target[k] = v
1176                        end
1177                    end
1178                else
1179                    -- maybe check for sanity
1180                    for k, v in next, str do
1181                        target[k] = v
1182                    end
1183                end
1184            end
1185        end
1186
1187    end
1188
1189    function languages.strippedgoodiewords(str)
1190        return lpegmatch(replace1,stripped(str))
1191    end
1192
1193    local splitter = lpeg.tsplitat(" ")
1194
1195    local function addgoodies(tag,list,filename)
1196        local np = 0
1197        local nd = 0
1198        local nw = 0
1199        local nl = #list
1200        --
1201        local data         = goodiesdata[tag]
1202        local properties   = data.properties
1203        local replacements = data.replacements
1204        local characters   = data.characters
1205        local exceptions   = data.exceptions
1206        if filename then
1207            if not data.goodies then
1208                data.goodies = { }
1209            end
1210            insert(data.goodies,filename)
1211        end
1212        --
1213        lh = false
1214        rh = false
1215        --
1216        for i=1,nl do
1217            local l = list[i]
1218            if type(l) == "table" then
1219                local w  = l.words
1220                local p  = l.patterns
1221                local c  = l.characters
1222                local e  = l.exceptions
1223                lh = l.left  or false -- for practical reasons these are semi-global
1224                rh = l.right or false -- for practical reasons these are semi-global
1225                if c then
1226                    for v in utfvalues(c) do
1227                        characters[v] = true
1228                    end
1229                end
1230                if w then
1231                    local prefixes    = l.prefixes
1232                    local nofprefixes = 0
1233                    local suffixes    = l.suffixes
1234                    local nofsuffixes = 0
1235                    if prefixes then
1236                        prefixes    = lpegmatch(splitter,lower(stripped(prefixes)))
1237                        nofprefixes = #prefixes
1238                    end
1239                    if suffixes then
1240                        suffixes    = lpegmatch(splitter,lower(stripped(suffixes)))
1241                        nofsuffixes = #suffixes
1242                    end
1243                    w = lower(stripped(w))
1244                    if p then
1245                        local pattern = Cs((utfchartabletopattern(p) / p + 1)^0)
1246                        w = lpegmatch(pattern,w)
1247                        np = np + 1
1248                    else
1249                        nd = nd + 1
1250                    end
1251                    local m = l.matches
1252                    if not m then
1253                        m = true
1254                    end
1255                    local a = l.actions
1256                    if a then
1257                        setmetatableindex(a,actions)
1258                    else
1259                        a = actions
1260                    end
1261                    local cach = cache[m][a]
1262                    if nofprefixes > 0 then
1263                        if nofsuffixes > 0 then
1264                            for wrd in gmatch(w,"%S+") do
1265                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1266                                nw = nw + 1
1267                                for i=1,nofprefixes do
1268                                    local tmp = prefixes[i] .. wrd
1269                                    for i=1,nofsuffixes do
1270                                        local str = tmp .. suffixes[i]
1271                                        properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1272                                        nw = nw + 1
1273                                    end
1274                                end
1275                            end
1276                        else
1277                            for wrd in gmatch(w,"%S+") do
1278                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1279                                nw = nw + 1
1280                                for i=1,nofprefixes do
1281                                    local str = prefixes[i] .. wrd
1282                                    properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1283                                    nw = nw + 1
1284                                end
1285                            end
1286                        end
1287                    elseif nofsuffixes > 0 then
1288                        for wrd in gmatch(w,"%S+") do
1289                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1290                            nw = nw + 1
1291                            for i=1,nofsuffixes do
1292                                local str = wrd .. suffixes[i]
1293                                properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1294                                nw = nw + 1
1295                            end
1296                        end
1297                    else
1298                        for wrd in gmatch(w,"%S+") do
1299                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1300                            nw = nw + 1
1301                        end
1302                    end
1303                elseif p then
1304                    for k, v in next, p do
1305                        -- todo: warning overload
1306                        replacements[k] = v
1307                    end
1308                elseif e then
1309                    registerexceptions(exceptions,e)
1310                end
1311            end
1312        end
1313
1314        lh = false
1315        rh = false
1316
1317        return { np = np, nd = nd, nw = nw, nl = nl }
1318    end
1319
1320    function languages.goodiefiles(tag)
1321        local d = goodiesdata[tag]
1322        return d and d.goodies
1323    end
1324
1325    function languages.addgoodiesfile(tag,filename)
1326        local fullname = resolvers.findfile(file.addsuffix(filename,"llg")) or ""
1327        if fullname == "" then
1328            report_goodies("file %a is not found",filename)
1329        else
1330            local list = table.load(fullname)
1331            if not list then
1332                report_goodies("file %a is invalid",fullname)
1333            else
1334                list = list.options
1335                if not list then
1336                    report_goodies("file %a has no options",fullname)
1337                else
1338                    local ok = addgoodies(tag,list,filename)
1339                    report_goodies("tag %a, file %a loaded, %i lists, %i via patterns, %i direct, %i words",
1340                        tag,fullname,ok.nl,ok.np,ok.nd,ok.nw)
1341                end
1342            end
1343        end
1344    end
1345
1346    function languages.addgoodiesdata(tag,list)
1347        local ok = addgoodies(tag,list)
1348        report_goodies("tag %a, data loaded, %i lists, %i via patterns, %i direct, %i words",
1349            tag,ok.nl,ok.np,ok.nd,ok.nw)
1350    end
1351
1352end
1353
if environment.initex then

    -- in initex mode every language maps onto number zero

    function languages.getnumber()
        return 0
    end

else

    -- Returns the number of a registered language (zero when the tag is
    -- unknown). As long as the language is flagged dirty this is also the
    -- moment that its patterns and goodies get loaded:
    --
    -- tag      : the language tag
    -- default  : the tag whose patterns are used when the tag itself has none
    -- patterns : an explicit patterns specification (can be empty)
    -- goodies  : a comma separated list of goodie files (can be empty)
    -- factor   : compared with v_yes and stored in the factor field

    function languages.getnumber(tag,default,patterns,goodies,factor)
        local data = registered[tag]
        if not data then
            return 0
        end
        if data.dirty then
            data.factor = factor == v_yes
            if trace_patterns then
                report_initialization("checking patterns for %a with default %a",tag,default)
            end
            -- patterns is already resolved to parent patterns if applicable
            local current = data.patterns
            if patterns and patterns ~= "" then
                if current ~= patterns then
                    data.patterns = patterns
                    if trace_patterns then
                        report_initialization("loading patterns for %a using specification %a",tag,patterns)
                    end
                    loaddefinitions(tag,data)
                else
                    -- unchanged
                end
            elseif current == "" then
                data.patterns = tag
                if trace_patterns then
                    report_initialization("loading patterns for %a using tag",tag)
                end
                if not loaddefinitions(tag,data) and tag ~= default then
                    data.patterns = default
                    if trace_patterns then
                        report_initialization("loading patterns for %a using default",tag)
                    end
                    loaddefinitions(tag,data)
                end
            end
            if goodies and goodies ~= "" then
                local files = settings_to_array(goodies)
                for i=1,#files do
                    -- we can cache this but it doesn't pay off to do so
                    languages.addgoodiesfile(tag,files[i])
                end
                languages.setgoodieshandler {
                    tag     = tag,
                    goodies = tag,
                }
            end
            data.loaded = true
            data.dirty  = false
        end
        return data.number
    end

    -- number zero is the null language

    numbers[0] = "null"

    registered.null = {
        number   = 0,
        instance = new_language(0),
    }

end
1424
1425-- hyphenation.define        ("zerolanguage")
1426-- hyphenation.loadpatterns  ("zerolanguage") -- else bug
1427-- hyphenation.loadexceptions("zerolanguage") -- else bug
1428
languages.logger = languages.logger or { }

-- Returns a space separated list of tag:parent:number entries for the languages
-- that are flagged as loaded (sorted by tag) or "none" when there are none.

function languages.logger.report()
    local loaded = { }
    for tag, data in sortedhash(registered) do
        if data.loaded then
            loaded[#loaded+1] = format("%s:%s:%s",tag,data.parent,data.number)
        end
    end
    if #loaded > 0 then
        return concat(loaded," ")
    end
    return "none"
end
1441
-- must happen at the tex end .. will use lang-def.lua

-- NOTE(review): the arguments look like a language tag followed by a script
-- and a language name as used in fonts; confirm against languages.associate.

languages.associate('en','latn','eng')
languages.associate('uk','latn','eng')
languages.associate('nl','latn','nld')
languages.associate('de','latn','deu')
languages.associate('fr','latn','fra')
1449
-- The statistics entry combines the list of loaded languages with the load
-- time; nothing is returned when no language has been loaded.

statistics.register("loaded patterns", function()
    local result = languages.logger.report()
    if result == "none" then
        return
    end
 -- return result
    return format("%s, load time: %s",result,statistics.elapsedtime(languages))
end)
1457
1458-- statistics.register("language load time", function()
1459--     -- often zero so we can merge that in the above
1460--     return statistics.elapsedseconds(languages, format(", nofpatterns: %s",nofloaded))
1461-- end)
1462
-- interface

-- Each entry binds a name to one of the functions of the languages namespace
-- and tells what arguments are to be picked up. Where context is the second
-- action the result of the first one is presumably piped back to the tex end
-- (confirm against interfaces.implement).

implement {
    name      = "languagenumber",
    actions   = { languages.getnumber, context },
    arguments = "5 strings"
}

implement {
    name      = "installedlanguages",
    actions   = { languages.installed, context },
}

implement {
    name      = "definelanguage",
    actions   = languages.define,
    arguments = "2 strings"
}

implement {
    name      = "setlanguagesynonym",
    actions   = languages.setsynonym,
    arguments = "2 strings"
}

implement {
    name      = "unloadlanguage",
    actions   = languages.unload,
    arguments = "string"
}

implement {
    name      = "setlanguageexceptions",
    actions   = languages.setexceptions,
    arguments = "2 strings"
}

implement {
    name      = "setlanguagepatterns",
    actions   = languages.setpatterns,
    arguments = "2 strings"
}

implement {
    name      = "setlanguageoptions",
    actions   = languages.setoptions,
    arguments = "2 strings"
}
1511
do

    -- The pre and post hyphen characters of the current language are only
    -- typeset when they are set to a positive value.

    local function flush(c)
        if c and c > 0 then
            context.char(c)
        end
    end

    implement {
        name      = "currentprehyphenchar",
        actions   = function()
            flush(prehyphenchar(tolang()))
        end
    }

    implement {
        name      = "currentposthyphenchar",
        actions   = function()
            flush(posthyphenchar(tolang()))
        end
    }

end
1531