lang-ini.lmt /size: 54 Kb    last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['lang-ini'] = {
2    version   = 1.001,
3    comment   = "companion to lang-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- needs a cleanup (share locals)
10-- discard language when redefined
11
12-- 002D : hyphen-minus (ascii)
13-- 002D : hyphen-minus (ascii)
14-- 2010 : hyphen
15-- 2011 : nonbreakable hyphen
16-- 2013 : endash (compound hyphen)
17
18-- todo: no foo:bar but foo(bar,...)
19
20-- https://wortschatz.uni-leipzig.de/de/download/German : lots of lists
21
22local type, tonumber, next = type, tonumber, next
23local utfbyte, utflength = utf.byte, utf.length
24local format, gsub, gmatch, find = string.format, string.gsub, string.gmatch, string.find
25local concat, sortedkeys, sortedhash, keys, insert, tohash = table.concat, table.sortedkeys, table.sortedhash, table.keys, table.insert, table.tohash
26local setmetatableindex = table.setmetatableindex
27local utfvalues, strip, utfcharacters = string.utfvalues, string.strip, utf.characters
28
29local context   = context
30local commands  = commands
31local implement = interfaces.implement
32
33local settings_to_array = utilities.parsers.settings_to_array
34local settings_to_set   = utilities.parsers.settings_to_set
35
36local trace_patterns = false  trackers.register("languages.patterns", function(v) trace_patterns = v end)
37local trace_goodies  = false  trackers.register("languages.goodies",  function(v) trace_goodies  = v end)
38local trace_applied  = false  trackers.register("languages.applied",  function(v) trace_applied  = v end)
39
40local report_initialization = logs.reporter("languages","initialization")
41local report_goodies        = logs.reporter("languages","goodies")
42
43local prehyphenchar    = language.prehyphenchar    -- global per language
44local posthyphenchar   = language.posthyphenchar   -- global per language
45local preexhyphenchar  = language.preexhyphenchar  -- global per language
46local postexhyphenchar = language.postexhyphenchar -- global per language
47----- lefthyphenmin    = language.lefthyphenmin
48----- righthyphenmin   = language.righthyphenmin
49local sethjcode        = language.sethjcode
50local currentlanguage  = language.current -- or function() return tex.normallanguage or tex.language end
51
52local uccodes          = characters.uccodes
53local lccodes          = characters.lccodes
54
55local new_language     = language.new
56
-- The global 'languages' namespace; it is created on first load and reused
-- when it already exists.
languages              = languages or {}
local languages        = languages

languages.version      = 1.010

-- tag -> language record (the records are created by languages.define)
languages.registered   = languages.registered or { }
local registered       = languages.registered

-- tag -> { script, language } (filled by languages.associate)
languages.associated   = languages.associated or { }
local associated       = languages.associated

-- two way mapping: number -> tag and tag -> number (filled by languages.define)
languages.numbers      = languages.numbers    or { }
local numbers          = languages.numbers

languages.data         = languages.data       or { }
local data             = languages.data

-- NOTE(review): presumably this makes the tables survive in the format file;
-- confirm against storage.register.
storage.register("languages/registered",registered,"languages.registered")
storage.register("languages/associated",associated,"languages.associated")
storage.register("languages/numbers",   numbers,   "languages.numbers")
storage.register("languages/data",      data,      "languages.data")

-- interface keywords: v_reset is compared with entries of a patterns
-- specification in loaddefinitions
local v_reset <const> = interfaces.variables.reset
local v_yes   <const> = interfaces.variables.yes

-- incremented by loaddefinitions for each definition file that got loaded
local nofloaded  = 0
83
-- Looks up the record registered for 'tag' and makes sure that it carries a
-- language object (created on demand with language.new and cached in the
-- record). Returns the record and the object, or nil, nil for an unknown tag.
local function resolve(tag)
    local entry = registered[tag]
    if not entry then
        return nil, nil
    end
    local instance = entry.instance
    if not instance then
        instance = new_language(entry.number)
        entry.instance = instance
    end
    return entry, instance
end
95
-- Converts 'what' into a language object. A userdata value is passed through
-- untouched, a missing value is replaced by the current language, and numbers
-- as well as tags are looked up in the registry. The object is created (and
-- cached in the record) when it is not yet there. Nothing is returned when
-- the language is unknown.
local function tolang(what) -- returns lang object
    what = what or currentlanguage()
    if type(what) == "userdata" then
        return what
    end
    -- numbers maps both ways, so first try to go from number to tag
    local tag   = numbers[what]
    local entry = tag and registered[tag]
    if not entry then
        entry = registered[what]
    end
    if not entry then
        return
    end
    local instance = entry.instance -- .lang -- was this ok ?
    if not instance then
        instance = new_language(entry.number)
        entry.instance = instance
    end
    return instance
end
114
-- Returns the record of a language. The argument can be a tag or a number;
-- without argument the record of the current language is returned.
function languages.getdata(tag) -- or number
    if not tag then
        return registered[numbers[currentlanguage()]]
    end
    return registered[tag] or registered[numbers[tag]]
end

-- the converter is also available to other modules
languages.tolang = tolang
124
125-- patterns=en
126-- patterns=en,de
127
-- Returns the data string stored in loaded[what] (what is "patterns" or
-- "exceptions" in the calls made by loaddefinitions). Data that is flagged
-- with compression == "zlib" is decompressed first and, when a length field
-- is present, checked against it. A mismatch is only reported: the data is
-- returned anyway. Nothing is returned when there is no dataset or when the
-- data is missing or empty.
--
-- loaded : table as loaded from a definition file
-- what   : key of the dataset in that table
-- tag    : language tag, only used in the report
local function validdata(loaded,what,tag)
    local dataset = loaded[what]
    if not dataset then
        return
    end
    local data = dataset.data
    if not data or data == "" then
        return
    end
    if dataset.compression == "zlib" then
        data = zlib.decompress(data)
        local expected = dataset.length
        -- the nil check guards against a decompressor that returns nothing on
        -- failure (NOTE(review): confirm how zlib.decompress signals errors)
        if not data or (expected and expected ~= #data) then
            -- report the dataset that failed and the language it belongs to
            report_initialization("compression error in %a for language %a",what,tag)
        end
    end
    return data
end
145
146-- languages.hjcounts[unicode].count
147
148-- hjcode: 0       not to be hyphenated
149--         1--31   length
150--         32      zero length
151--         > 32    hyphenated with length 1
152
-- Assigns hj codes to the characters listed in loaded[what].characters. That
-- field can be a list of codepoints and/or characters, a hash with characters
-- as keys, or a string. Each code is mapped onto its lowercase code (when
-- lccodes has a number for it) and the uppercase variant gets the same hj
-- code. When 'factor' is set and languages.hjcounts has a count for the
-- character, that count (0 or less becomes 32, 31 or more becomes 31) is used
-- as hj code instead. The uppercase assignments are also stored in
-- loaded.codehash. While we assign, tex.savinghyphcodes is set to zero; the
-- old value is put back afterwards.
local function sethjcodes(instance,loaded,what,factor)
    local dataset    = loaded[what]
    local characters = dataset and dataset.characters
    if not characters then
        return
    end
    local hjcounts = factor and languages.hjcounts or false
    --
    local codehash = loaded.codehash
    if not codehash then
        codehash = { }
        loaded.codehash = codehash
    end
    --
    local function setcode(code)
        local lower = lccodes[code] -- just in case we get a mixture
        local upper = uccodes[code] -- just in case we get a mixture
        if type(lower) ~= "number" then
            lower = code
        end
        local hjcode = lower
        if hjcounts then
            local entry = hjcounts[hjcode]
            local count = entry and entry.count
            if not count then
                -- no (valid) count: keep the code itself
            elseif count <= 0 then
                -- counts as 0 i.e. ignored
                hjcode = 32
            elseif count >= 31 then
                -- counts as 31
                hjcode = 31
            else
                -- count that many times
                hjcode = count
            end
        end
        sethjcode(instance,lower,hjcode)
        if upper ~= lower and type(upper) == "number" then
            sethjcode(instance,upper,hjcode)
            codehash[upper] = hjcode
        end
    end
    --
    local saved = tex.savinghyphcodes
    tex.savinghyphcodes = 0
    local kind = type(characters)
    if kind == "string" then
        for code in utfvalues(characters) do
            setcode(code)
        end
    elseif kind == "table" then
        if #characters > 0 then
            -- list: { U, U, U, "chr", "chr", ... }
            for i=1,#characters do
                local v = characters[i]
                setcode(type(v) == "string" and utfbyte(v) or v)
            end
        else
            -- hash: { ["chr"] = true, ... }
            for chr, enabled in sortedhash(characters) do
                if enabled then
                    setcode(utfbyte(chr))
                end
            end
        end
    end
    tex.savinghyphcodes = saved
end
223
-- Gives each of the given characters an hj code equal to its own codepoint.
-- The characters can be passed as a list (codepoints and/or characters), as a
-- hash (characters or codepoints as keys, only truthy entries count) or as a
-- string. Other values are ignored.
local function addhjcodestoinstance(instance,characters)
    local kind = type(characters)
    if kind == "string" then
        for code in utfvalues(characters) do
            sethjcode(instance,code,code)
        end
    elseif kind == "table" then
        local n = #characters
        if n == 0 then
            -- hash: { ["chr"] = true, ... }
            for key, enabled in next, characters do
                if enabled then
                    local code = type(key) == "string" and utfbyte(key) or key
                    sethjcode(instance,code,code)
                end
            end
        else
            -- list: { U, U, U, "chr", "chr", ... }
            for i=1,n do
                local entry = characters[i]
                local code  = type(entry) == "string" and utfbyte(entry) or entry
                sethjcode(instance,code,code)
            end
        end
    end
end
249
250-- 2'2 conflicts with 4' ... and luatex barks on it
251
-- Patterns used by 'unique' to detect patterns that only differ in their
-- digits.

local P, S, R, C, Cs, Ct, lpegmatch, lpegpatterns = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.Cs, lpeg.Ct, lpeg.match, lpeg.patterns

local utfsplit = utf.split

local space       = lpegpatterns.space
local whitespace  = lpegpatterns.whitespace^1                 -- a run of whitespace
local nospace     = lpegpatterns.utf8char - whitespace        -- one non whitespace character
local digit       = lpegpatterns.digit
----- endofstring = #whitespace + P(-1)
local endofstring = #whitespace                                -- lookahead: whitespace follows

-- Under a substitution capture a word loses its leading digits and a digit
-- that is directly followed by whitespace, while any other digit becomes a
-- space; so "a1b" and "a2b" both end up as "a b".
local word        = (digit/"")^0 * (digit/"" * endofstring + digit/" " + nospace)^1
-- any run of non whitespace
local anyword     = (1-whitespace)^1
-- collects the normalized words of a whitespace separated string in a table
local analyze     = Ct((whitespace + Cs(word))^1)
266
-- Combines the pattern strings in the list 'loaded' into one string. When
-- there is more than one string, patterns that show up more than once after
-- normalization (same characters, different digits) are reported and removed
-- from the result. Mind that a space is inserted in front of 'loaded', so
-- the passed list gets modified. The arguments 'tag' and 'requested' are only
-- used for reporting.
local function unique(tag,requested,loaded)
    local count = #loaded
    if count == 0 then
        return ""
    end
    if count == 1 then
        return loaded[1]
    end
    insert(loaded,1," ") -- no need then for special first word
    local joined = concat(loaded," ")
    local words  = lpegmatch(analyze,joined) or { }
    local seen   = { }
    local bad    = { }
    for i=1,#words do
        local w = words[i]
        local n = seen[w]
        if not n then
            seen[w] = 1
        elseif n == 1 then
            -- second time that we see this one: remember its pieces (once)
            seen[w] = 2
            bad[#bad+1] = utfsplit(w," ")
        end
    end
    local nofbad = #bad
    if nofbad == 0 then
        return joined
    end
    -- build a pattern that matches each of the conflicting words, with a
    -- digit between the pieces
    local conflicts
    for i=1,nofbad do
        local pieces = bad[i]
        local p = P(pieces[1])
        for j=2,#pieces do
            p = p * digit * P(pieces[j])
        end
        if conflicts then
            conflicts = conflicts + p
        else
            conflicts = p
        end
        report_initialization("language %a, patterns %a, discarding conflict (0-9)%{[0-9]}t(0-9)",tag,requested,pieces)
    end
    words, seen, bad = nil, nil, nil -- permit gc
    local someword = digit^0 * conflicts * digit^0 * endofstring / ""
    local cleaner  = Cs((someword + anyword + whitespace)^1)
    return lpegmatch(cleaner,joined) or joined
end
317
-- The shared exceptions (lang-exc.lua): false means that we didn't look yet,
-- true means that we looked but that there is nothing usable, and a string
-- holds the exceptions themselves.
local shared = false

-- Loads the definition files that are listed (comma separated) in
-- specification.patterns into the language object that belongs to 'tag'.
--
-- For each entry the file lang-<entry>.lua (or the .gz variant) is located
-- and loaded. The hj codes of its characters are set and its patterns and
-- exceptions are collected, together with what was already present in the
-- language object. The keyword in v_reset wipes what has been collected so
-- far. An entry is loaded only once per language (see data.used). Afterwards
-- the collected patterns (with conflicts removed) and exceptions replace the
-- ones in the language object and the shared exceptions are added.
--
-- Returns true when at least one file got loaded, false when definitions
-- were given but none got loaded, and nil when there were no definitions.
-- The timer is stopped on each of these paths.
local function loaddefinitions(tag,specification)
    statistics.starttiming(languages)
    local data, instance = resolve(tag)
    local requested   = specification.patterns or ""
    local definitions = settings_to_array(requested)
    local result      = nil
    if #definitions > 0 then
        if trace_patterns then
            report_initialization("pattern specification for language %a: %s",tag,specification.patterns)
        end
        -- start out with what the language object already has
        local ploaded = instance:patterns()
        local eloaded = instance:hyphenation()
        if not ploaded or ploaded == ""  then
            ploaded = { }
        else
            ploaded = { ploaded }
        end
        if not eloaded or eloaded == ""  then
            eloaded = { }
        else
            eloaded = { eloaded }
        end
        local dataused  = data.used
        local ok        = false
        local resources = data.resources or { }
        data.resources  = resources
        if not shared then
            local found = resolvers.findfile("lang-exc.lua")
            -- an empty string is also treated as 'not found' here, just as is
            -- done with the definition files below
            if found and found ~= "" then
                shared = dofile(found)
                if type(shared) == "table" then
                    shared = concat(shared," ")
                else
                    shared = true
                end
            else
                shared = true
            end
        end
        for i=1,#definitions do
            local definition = definitions[i]
            if definition == "" then
                -- error
            elseif definition == v_reset then
                if trace_patterns then
                    report_initialization("clearing patterns for language %a",tag)
                end
                instance:clearpatterns()
                instance:clearhyphenation()
                ploaded = { }
                eloaded = { }
            elseif not dataused[definition] then
                dataused[definition] = definition
                local filename = "lang-" .. definition .. ".lua"
                local fullname = resolvers.findfile(filename) or ""
                if fullname == "" then
                    fullname = resolvers.findfile(filename .. ".gz") or ""
                end
                if fullname ~= "" then
                    if trace_patterns then
                        report_initialization("loading definition %a for language %a from %a",definition,tag,fullname)
                    end
                    local suffix, gzipped = gzip.suffix(fullname)
                    local loaded = table.load(fullname,gzipped and gzip.load)
                    if loaded then -- todo: version test
                        ok, nofloaded = true, nofloaded + 1
                        sethjcodes(instance,loaded,"patterns",specification.factor)
                        sethjcodes(instance,loaded,"exceptions",specification.factor)
                        local p = validdata(loaded,"patterns",tag)
                        local e = validdata(loaded,"exceptions",tag)
                        if p and p ~= "" then
                            ploaded[#ploaded+1] = p
                        end
                        if e and e ~= "" then
                            eloaded[#eloaded+1] = e
                        end
                        resources[#resources+1] = loaded -- so we can use them otherwise
                    else
                        report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                    end
                elseif trace_patterns then
                    report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                end
            elseif trace_patterns then
                report_initialization("definition %a for language %a already loaded",definition,tag)
            end
        end
        if #ploaded > 0 then
            -- why not always clear
            instance:clearpatterns()
            instance:patterns(unique(tag,requested,ploaded))
        end
        if #eloaded > 0 then
            -- why not always clear
            instance:clearhyphenation()
            instance:hyphenation(concat(eloaded," "))
        end
        if type(shared) == "string" then
            instance:hyphenation(shared)
        end
        result = ok
    elseif trace_patterns then
        report_initialization("no definitions for language %a",tag)
    end
    -- we always end up here, so that the timer started above gets stopped
    statistics.stoptiming(languages)
    return result
end
425
-- The number of defined languages is kept in storage.shared and mirrored in
-- a local.
storage.shared.noflanguages = storage.shared.noflanguages or 0

local noflanguages = storage.shared.noflanguages

-- Registers a language: the next free number is assigned to 'tag', both
-- directions of the mapping are stored in 'numbers' and a fresh record is put
-- in 'registered'. An existing record with the same tag is replaced.
function languages.define(tag,parent)
    local number = noflanguages + 1
    noflanguages = number
    if trace_patterns then
        report_initialization("assigning number %a to %a",number,tag)
    end
    numbers[number] = tag
    numbers[tag]    = number
    local record = {
        tag      = tag,
        parent   = parent or "",
        patterns = "",
        loaded   = false,
        used     = { },
        dirty    = true,
        number   = number,
        instance = nil, -- luatex data structure
        synonyms = { },
    }
    registered[tag] = record
    storage.shared.noflanguages = number
end
450
-- Flags 'synonym' in the synonyms table of the language registered as 'tag';
-- unknown tags are silently ignored.
function languages.setsynonym(synonym,tag) -- convenience function
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.synonyms[synonym] = true -- maybe some day more info
end
457
-- Returns the sorted keys of the registry as one string, by default
-- separated by commas.
function languages.installed(separator)
    local tags = sortedkeys(registered)
    if not separator then
        separator = ","
    end
    return concat(tags,separator)
end
461
-- Returns what 'numbers' has stored for language number 'n'. Without a
-- usable number the current language is used.
function languages.current(n)
    local number = n and tonumber(n)
    if not number then
        number = currentlanguage()
    end
    return numbers[number]
end
465
-- Stores a script and language pair for 'tag'; an earlier association is
-- replaced.
function languages.associate(tag,script,language) -- not yet used
    local pair = { script, language }
    associated[tag] = pair
end
469
-- Returns the script and language that were associated with 'tag'. The tag
-- can also be a number, and when it is absent the current language is used.
-- Nothing is returned when there is no association.
function languages.association(tag) -- not yet used
    if type(tag) == "number" then
        tag = numbers[tag]
    elseif not tag then
        tag = numbers[currentlanguage()]
    end
    local pair = tag and associated[tag]
    if not pair then
        return
    end
    return pair[1], pair[2]
end
481
-- Tells if the pattern file lang-<patterns>.lua of the language registered as
-- 'tag' can be found. Synonyms are not resolved and 'defaultlanguage' is not
-- used. An empty string coming back from the lookup counts as 'not found',
-- which is in sync with the check in loaddefinitions; a plain truth test would
-- treat such an empty string as success.
function languages.loadable(tag,defaultlanguage) -- hack
    local l = registered[tag] -- no synonyms
    if not l then
        return false
    end
    local found = resolvers.findfile("lang-"..l.patterns..".lua")
    if found and found ~= "" then
        return true
    else
        return false
    end
end
490
491-- a bit messy, we will do all language setting in lua as we can now assign
492-- and 'patterns' will go away here.
493
-- Flags the language registered as 'tag' as dirty; unknown tags are ignored.
function languages.unload(tag)
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.dirty = true
end
500
-- not that useful, global values
502
-- Wrappers around the engine functions: 'what' (a tag, number, language
-- object or nothing, see tolang) is converted first. The result of tolang is
-- passed on as it comes, so for an unknown language the engine function gets
-- called without arguments.

function languages.prehyphenchar(what)
    return prehyphenchar(tolang(what))
end

function languages.posthyphenchar(what)
    return posthyphenchar(tolang(what))
end

function languages.preexhyphenchar(what)
    return preexhyphenchar(tolang(what))
end

function languages.postexhyphenchar(what)
    return postexhyphenchar(tolang(what))
end
507-------- languages.lefthyphenmin   (what) return lefthyphenmin   (tolang(what)) end
508-------- languages.righthyphenmin  (what) return righthyphenmin  (tolang(what)) end
509
510-- e['implementer']= 'imple{m}{-}{-}menter'
511-- e['manual'] = 'man{}{}{}'
512-- e['as'] = 'a-s'
513-- e['user-friendly'] = 'user=friend-ly'
514-- e['exceptionally-friendly'] = 'excep-tionally=friend-ly'
515
-- these characters never get an hj code via collecthjcodes
local invalid = { "{", "}", "(", ")", "-", " " }

-- Adds the characters found in 'str' (a string, or a list of characters) to
-- the set in data.extras.characters, removes the ones in 'invalid' again and
-- then has sethjcodes assign hj codes for the whole set. Mind that
-- data.extras is replaced by a fresh table that holds just this set.
local function collecthjcodes(data,str)
    local extras = data.extras
    local found  = extras and extras.characters or { }
    local kind   = type(str)
    if kind == "string" then
        for chr in utfcharacters(str) do
            found[chr] = found[chr] or true
        end
    elseif kind == "table" then
        for i=1,#str do
            local chr = str[i]
            found[chr] = found[chr] or true
        end
    end
    for i=1,#invalid do -- less checks this way
        local chr = invalid[i]
        if found[chr] then
            found[chr] = nil
        end
    end
    data.extras = { characters = found }
    sethjcodes(data.instance,data,"extras",data.factor)
end
543
-- Loads the content of 'filename' (an empty string when nothing comes back)
-- as hyphenation exceptions for the language registered as 'tag', after the
-- hj codes of the characters used have been set. The work is timed. Unknown
-- tags are ignored.
function languages.loadwords(tag,filename)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    statistics.starttiming(languages)
    local words = io.loaddata(filename) or ""
    collecthjcodes(data,words)
    instance:hyphenation(words)
    statistics.stoptiming(languages)
end
554
555
-- Passes the (stripped) string as hyphenation exceptions to the language
-- registered as 'tag', after the hj codes of the characters used have been
-- set. Unknown tags are ignored.
function languages.setexceptions(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local exceptions = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,exceptions)
    instance:hyphenation(exceptions)
end
564
-- Passes the (stripped) string as patterns to the language registered as
-- 'tag', after the hj codes of the characters used have been set. Unknown
-- tags are ignored.
function languages.setpatterns(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local patterns = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,patterns)
    instance:patterns(patterns)
end
573
-- Hooks 'action' into the language object that belongs to 'tag'; unknown
-- tags are ignored.
local function setwordhandler(tag,action)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    instance:setwordhandler(action)
end

languages.setwordhandler = setwordhandler
582
-- Registers 'str' as words in the goodies data of 'tag' and then installs
-- the goodies handler for that language, with the tag also used as name of
-- the goodies.
function languages.setoptions(tag,str)
    local entries = { { words = str } }
    languages.addgoodiesdata(tag,entries)
    -- for now:
    local specification = { tag = tag, goodies = tag }
    languages.setgoodieshandler(specification)
end
588
-- Returns whatever the hyphenate method of the language object returns for
-- 'str'; for an unknown tag the string itself is returned.
function languages.hyphenate(tag,str)
    -- todo: does this still work?
    local data, instance = resolve(tag)
    if not data then
        return str
    end
    return instance:hyphenate(str)
end
598
599-- This code is here for some testing (and discussion) but it might end up in its
600-- own module. I wrote it after listening to the end March 2021 live concert of
601-- Mandoki Soulmates: Hungarian Pictures (music is the greatest unifier) with his
602-- usual incredible international lineup. After that, and realizing that we needed
603-- to deal better with some language issues as follow up on a mailing list thread, I
604-- needed only a few loops of relistening the concert to implement it. In
-- retrospect this was a language feature that should have been there a while ago.
606
607local expand ; do
608
609    local nuts        = nodes.nuts
610    local nextglyph   = nuts.traversers.glyph
611    local setoptions  = nuts.setoptions
612
613    local getnext     = nuts.getnext
614    local getprev     = nuts.getprev
615    local setchar     = nuts.setchar
616    local setnext     = nuts.setnext
617    local setlink     = nuts.setlink
618    local setfield    = nuts.setfield
619    local setdisc     = nuts.setdisc
620    local getprop     = nuts.getprop
621    local setprop     = nuts.setprop
622    local setattrlist = nuts.setattrlist
623
624    local new_disc    = nuts.pool.disc
625    local new_glyph   = nuts.pool.glyph
626    local copy_node   = nuts.copy
627    local flushlist   = nuts.flushlist
628
629    local glyphoptioncodes      = tex.glyphoptioncodes
630
631    local lower                 = characters.lower
632    local replacer              = utf.replacer
633    local utfchartabletopattern = lpeg.utfchartabletopattern
634
635    local report                = logs.reporter("languages","goodies")
636
637    -- can be shared
638
639    local goodiesdata = setmetatableindex(function(t,k)
640        local v = {
641            properties    = { },
642            replacements  = { },
643            characters    = { },
644            exceptions    = { },
645            substitutions = { },
646            experiments   = { },
647        }
648        t[k] = v
649        return v
650    end)
651
652    -- can be a helper
653
654    local compound_disc_code <const> = tex.discoptioncodes.preword | tex.discoptioncodes.postword
655
656    local function setcompound(current,id,first,last,lh,rh,hyphen)
657        local prev     = getprev(current)
658     -- local language = tolang(id)
659     -- local prechar  = prehyphenchar(language)
660     -- local postchar = posthyphenchar(language)
661        local prechar  = prehyphenchar(id)
662        local postchar = posthyphenchar(id)
663        local pre      = prechar  and copy_node(current)
664        local post     = postchar and copy_node(current)
665        local replace  = hyphen and prechar and copy_node(current)
666        local disc     = new_disc()
667        if pre then
668            setchar(pre,prechar)
669        end
670        if post then
671            setchar(post,postchar)
672        end
673        if replace then
674            setchar(replace,prechar)
675        end
676        setattrlist(disc,current)
677        setoptions(disc,compound_disc_code) -- 0x03
678        setdisc(disc,pre,post,replace)
679        setlink(prev,disc,current)
680        if lh then
681            setfield(first,"rhmin",rh)
682        end
683
684        if rh then
685            setfield(current,"lhmin",lh)
686        end
687
688    end
689
690    local setcompounds = setmetatableindex(function(t,l)
691        local v = setmetatableindex(function(t,r)
692            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r) end
693            t[r] = v
694            return v
695        end)
696        t[l] = v
697        return v
698    end)
699
700    local sethyphens = setmetatableindex(function(t,l)
701        local v = setmetatableindex(function(t,r)
702            local v = function(current,id,first,last) return setcompound(current,id,first,last,l,r,true) end
703            t[r] = v
704            return v
705        end)
706        t[l] = v
707        return v
708    end)
709
710    local function replaceword(first,last,old,new,oldlen)
711        local oldlen = utflength(old)
712        local newlen = utflength(new)
713        if newlen == 0 then
714            -- forget about it
715        elseif newlen <= oldlen then
716            for s in utfvalues(new) do
717                setchar(first,s)
718                first = getnext(first)
719            end
720            if newlen < oldlen then
721                -- first is one ahead
722                local after  = getnext(last)
723                local before = getprev(first)
724                setnext(last)
725                setlink(before,after)
726                flushlist(first)
727            end
728        else
729            local i = 0
730            local l = getnext(last)
731            for s in utfvalues(new) do
732                i = i + 1
733                if i > oldlen then
734                    local g = copy_node(first)
735                    setlink(first,g,l)
736                    setchar(g,s)
737                    first = g
738                elseif i == oldlen then
739                    setchar(first,s)
740                else
741                    setchar(first,s)
742                    first = getnext(first)
743                end
744            end
745        end
746    end
747
748 -- local optioncodes = table.copy(glyphoptioncodes)
749 --
750 -- optioncodes.nokerns     <const> = optioncodes.noleftkern     | optioncodes.norightkern
751 -- optioncodes.noligatures <const> = optioncodes.noleftligature | optioncodes.norightligature
752
753    local lh, rh = false, false
754
755    local cache = setmetatableindex(function(t,k)
756        local v = 0
757        if k == "compound" then
758            v = setcompounds[lh][rh]
759        elseif k == "hyphen" then
760            v = sethyphens[lh][rh]
761        else
762            v = 0
763            for s in gmatch(k,"%w+") do
764                local o = glyphoptioncodes[s]
765             -- local o = optioncodes[s]
766                if o then
767                    v = v | o
768                end
769            end
770        end
771        t[k] = v
772        return v
773    end)
774
775    local function checkglyphproperties(options)
776        -- we sort, just to be sure
777        for word, list in sortedhash(options) do
778            if type(list) == "string" then
779                options[word] = options[list]
780            else
781                for index, option in sortedhash(list) do
782                    if type(option) == "string" then
783                        list[index] = cache[option]
784                    end
785                end
786            end
787        end
788    end
789
790    -- statistics.starttiming(languages)
791    -- statistics.stoptiming(languages)
792
793    -- 1: restart 2: exceptions+patterns 3: patterns *: next word
794
795    local sequencers    = utilities.sequencers
796    local newsequencer  = sequencers.new
797    local appendgroup   = sequencers.appendgroup
798    local prependaction = sequencers.prependaction
799    local appendaction  = sequencers.appendaction
800    local enableaction  = sequencers.enableaction
801    local disableaction = sequencers.disableaction
802
803    local template = {
804        arguments    = "s",
805        returnvalues = "r,i",
806        results      = "r,i",
807    }
808
809    local registeredactions = setmetatableindex ( function(t,tag)
810        local actions = newsequencer(template)
811        appendgroup(actions,"user")
812        t[tag] = actions
813        return actions
814    end )
815
816    languages.registeredactions = registeredactions
817
818    function languages.installhandler(tag,func)
819        local todo = not rawget(registeredactions,tag)
820        local actions = registeredactions[tag]
821        appendaction(actions,"user",func)
822        enableaction(actions,func)
823        report("installing handler %a for language %a",func,tag)
824        if todo then
825            setwordhandler(tag,function(n,original,remapped,length,first,last)
826                local runner = actions.runner
827                if runner then
828                    if getprop(first,"replaced") then
829                        -- maybe some deadcycles
830                    else
831                        local r, result = runner(original)
832                        if not r or original == r then
833                            return result or 0
834                        else
835                            setprop(first,"replaced",true)
836                            replaceword(first,last,original,r,length)
837                            return 1
838                        end
839                    end
840                end
841                return 2
842            end)
843        end
844    end
845
    -- Per language bookkeeping that is also accessible as
    -- languages.appliedoptions.
    -- NOTE(review): presumably setmetatableindex("table") makes a subtable
    -- for each key that is accessed; confirm against table.setmetatableindex.
    local appliedoptions     = setmetatableindex("table")
    languages.appliedoptions = appliedoptions
848
    -- Install the per-word handler for a language, driven by its goodies data.
    --
    -- specification : table with the fields
    --     tag     : language tag the handler is registered for
    --     goodies : key into goodiesdata (defaults to tag)
    --     result  : read below but not used by the handler (see note)
    --
    -- A handler is only installed when at least one of the goodies tables
    -- (characters, properties, replacements, substitutions, exceptions,
    -- experiments) is a non-empty table. The handler returns 1 after it replaced
    -- the word or ran a function-valued property, 2 by default, 3 when an
    -- exceptions/experiments table exists but did not match, or the value found
    -- in the exceptions table / returned by an experiment. What these values mean
    -- is decided by the caller of the word handler, which is not visible here.
    languages.setgoodieshandler = function(specification) -- will become a table specifier
        if type(specification) == "table" then
            local tag           = specification.tag
            local goodies       = specification.goodies or tag
            -- NOTE(review): this 'result' looks unused; the handler below declares
            -- its own locals with the same name.
            local result        = specification.result or 2
            local data          = goodiesdata[goodies]
            local properties    = data.properties
            local replacements  = data.replacements
            local substitutions = data.substitutions
            local characters    = data.characters
            local exceptions    = data.exceptions
            local experiments   = data.experiments
            local replacer      = nil
            local substituter   = nil
            -- only 'instance' is used below; 'd' is not referenced again
            local d, instance   = resolve(tag)
            local done          = false
            -- check if something at all
            if type(characters) == "table" and characters and next(characters) then
                addhjcodestoinstance(instance,characters)
                if trace_goodies then
                    report_goodies("registering %a characters for %a",goodies,tag)
                end
                done = true
            end
            if type(properties) == "table" and next(properties) then
                checkglyphproperties(properties) -- checks in place!
                if trace_goodies then
                    report_goodies("registering %a properties for %a",goodies,tag)
                end
                done = true
            end
            -- replacements and substitutions are both compiled into an lpeg pattern
            -- that rewrites every key found in a word by its value
            if type(replacements) == "table" and next(replacements) then
                replacer = Cs((utfchartabletopattern(replacements) / replacements + 1)^0)
                if trace_goodies then
                    report_goodies("registering %a replacer for %a",goodies,tag)
                end
                done = true
            end
            if type(substitutions) == "table" and next(substitutions) then
                substituter = Cs((utfchartabletopattern(substitutions) / substitutions + 1)^0)
                if trace_goodies then
                    report_goodies("registering %a substitutor for %a",goodies,tag)
                end
                done = true
            end
            -- empty or missing tables become false so that the handler can test
            -- them with a plain 'if'
            if type(exceptions) == "table" and next(exceptions) then
                done = true
            else
                exceptions = false
            end
            if type(experiments) == "table" and next(experiments) then
                done = true
                if trace_goodies then
                    report_goodies("registering %a experiments for %a",goodies,tag)
                end
            else
                experiments = false
            end
            if done then
                local registered = registeredactions[tag]
                local applied    = appliedoptions[tag]
                setwordhandler(tag,function(n,original,remapped,length,first,last)
                    -- stage 1: an optional runner registered for this tag; it is
                    -- skipped when the first node is already marked as replaced
                    local runner = registered.runner
                    if runner then
                        if getprop(first,"replaced") then
                            -- maybe some deadcycles
                        else
                            local r, result = runner(original)
                            if not r then
                                if trace_goodies then
                                    report_goodies("kept by runner: %s => %s, result %i",original,remapped, result or 0)
                                end
                                return result or 0
                            elseif original == r then
                                if result then
                                    if trace_goodies then
                                        report_goodies("kept by runner: %s => %s, result %i",original,remapped, result)
                                    end
                                    return result
                                else
                                    if trace_goodies then
                                        report_goodies("kept by runner: %s => %s, continue",original,remapped)
                                    end
                                end
                            else
                                if trace_goodies then
                                    report_goodies("replaced by runner: %s => %s => %s, restart",original,remapped,r)
                                end
                                setprop(first,"replaced",true)
                                replaceword(first,last,original,r,length)
                                return 1
                            end
                        end
                    end
                    -- stage 2: per-glyph options registered for the remapped word;
                    -- the substituter below can jump back here with a new 'o'
                    local result = 2
                    local o = properties[remapped]
                  ::again::
                    if o then
                        if trace_goodies then
                            -- NOTE(review): every other trace here uses report_goodies;
                            -- confirm that 'report' is defined in this scope
                            report("properties: %s %s",original,remapped)
                        end
                        if trace_applied then
                            applied[original] = (applied[original] or 0) + 1
                        end
                        -- walk the glyphs from first to last; o[index] is either a
                        -- function that gets called or a value passed to setoptions
                        local index = 0
                        for g, c in nextglyph, first do
                            index = index + 1
                            local oi = o[index]
                            if oi then
                                if type(oi) == "function" then
                                    oi(g,n,first,last) -- maybe return value
                                    result = 1
                                else
                                    setoptions(g,oi)
                                end
                            end
                            if g == last then
                                break
                            end
                        end
                        return result
                    end
                    -- stage 3: replacements; this branch always returns, so when a
                    -- replacer exists the stages below are never reached
                    if replacer then
                        -- todo: check lengths so that we can avoid a check
                        if getprop(first,"replaced") then
                            -- maybe some deadcycles
                        else
                            local r = lpegmatch(replacer,original)
                            if original == r then
                                if trace_goodies then
                                    report_goodies("kept: %s => %s",original,remapped)
                                end
                            else
                                if trace_goodies then
                                    report_goodies("replaced: %s => %s => %s",original,remapped,r)
                                end
                                setprop(first,"replaced",true)
                                replaceword(first,last,original,r,length)
                                result = 1
                            end
                        end
                        return result
                    end
                    -- stage 4: substitutions; when the substituted string has no
                    -- properties entry it is expanded, stored under the original
                    -- word and applied via the 'again' label, otherwise we fall
                    -- through to the exceptions
                    if substituter then
                        if getprop(first,"replaced") then
                            -- maybe some deadcycles
                        else
                            local r = lpegmatch(substituter,original)
                            if original == r then
                                if trace_goodies then
                                    report_goodies("kept: %s => %s",original,remapped)
                                end
                            else
                                if trace_goodies then
                                    report_goodies("substituted: %s => %s => %s",original,remapped,r)
                                end
                                setprop(first,"replaced",true)
                                if not properties[r] then
                                    o = expand(r)
                                    properties[original] = o
                                    goto again
                                end
                            end
                        end
                    end
                    -- stage 5: exceptions, returns the stored value or 3
                    if exceptions then
                        local exception = exceptions[original]
                        if exception then
                            if trace_goodies then
                                report_goodies("exception: %s => %s",original,exception)
                            end
                            result = exception
                        else
                            result = 3
                        end
                        return result
                    end
                    -- stage 6: experiments, the first function that returns
                    -- something true decides the result, otherwise 3
                    -- can be optimized ....
                    if experiments then
                        for i=1,#experiments do
                            local result = experiments[i](original)
                            if result then
                                if trace_goodies then
                                    report_goodies("experiment: %s => %s",original,result)
                                end
                                return result
                            end
                        end
                        return 3
                    end
                    if trace_goodies then
                        report_goodies("ignored: %s => %s",original,remapped)
                    end
                    return result
                end)
            elseif trace_goodies then
                report_goodies("nothing useable in %a for %a",goodies,tag)
            end
        else
            -- NOTE(review): 'tag' is only declared as a local inside the branch
            -- above, so this refers to whatever 'tag' is visible outside this
            -- function; maybe 'specification' was meant -- confirm.
            setwordhandler(tag)
        end
    end
1051
    -- Glyph option values taken from glyphoptioncodes; applyaction stores them in
    -- the per-word option lists. The "right" variants are or-ed into the slot
    -- before a marker, the "left" variants are stored in the slot after it.
    local norightligature_option <const> = glyphoptioncodes.norightligature
    local noleftligature_option  <const> = glyphoptioncodes.noleftligature
    local norightkern_option     <const> = glyphoptioncodes.norightkern
    local noleftkern_option      <const> = glyphoptioncodes.noleftkern
1056
1057    local function applyaction(oc,v,n)
1058        if oc == "noligature" then
1059            if n > 0 then
1060                local vv = v[n-1]
1061                if vv then
1062                    v[n-1] = vv | norightligature_option
1063                else
1064                    v[n-1] = norightligature_option
1065                end
1066            end
1067            v[n] = noleftligature_option
1068        elseif oc == "compound" then
1069            if n > 1 then
1070             -- v[n] = setcompound
1071                v[n] = setcompounds[lh][rh]
1072                return true
1073            end
1074        elseif oc == "hyphen" then
1075            if n > 1 then
1076                v[n] = sethyphens[lh][rh]
1077                return true
1078            end
1079        elseif oc == "nokern" then
1080            if n > 0 then
1081                local vv = v[n-1]
1082                if vv then
1083                    v[n-1] = vv | norightkern_option
1084                else
1085                    v[n-1] = norightkern_option
1086                end
1087            end
1088            v[n] = noleftkern_option
1089        elseif oc == "noleftkern" then
1090            v[n] = noleftkern_option
1091        elseif oc == "norightkern" then
1092            if n > 0 then
1093                local vv = v[n-1]
1094                if vv then
1095                    v[n-1] = vv | norightkern_option
1096                else
1097                    v[n-1] = norightkern_option
1098                end
1099            end
1100        else
1101            for s in gmatch(oc,"%w+") do
1102                if applyaction(s,v,n) then
1103                    return
1104                end
1105            end
1106        end
1107    end
1108
    -- Default mapping from the marker characters that can be put inside a goodie
    -- word to the action names understood by applyaction. A list can extend this
    -- with its own "actions" table; addgoodies then uses this table as fallback.
    --
    -- a|b : a:norightligature b:noleftligature
    -- a=b : a:norightkern     b:noleftkern
    -- a<b :                   b:noleftkern
    -- a>b : a:norightkern
    -- a-b : hyphen
    -- a+b : compound

    local actions = {
        ["|"] = "noligature",
        ["="] = "nokern",
        ["<"] = "noleftkern",
        [">"] = "norightkern",
        ["+"] = "compound",
        ["-"] = "hyphen",
    }
1124
1125    local function analyzed(m,a,t,k)
1126        local v = { }
1127        local n = 1
1128        if m == true then
1129            for c in gmatch(k,".") do
1130                local ac = a[c]
1131                if not ac then
1132                    n = n + 1
1133                else
1134                    applyaction(ac,v,n)
1135                end
1136            end
1137        elseif type(m) == "number" then
1138            local i = 0
1139            for c in gmatch(k,".") do
1140                local ac = a[c]
1141                if not ac then
1142                    n = n + 1
1143                else
1144                    i = i + 1
1145                    if i == m then
1146                        applyaction(ac,v,n)
1147                        break
1148                    end
1149                end
1150            end
1151        elseif type(m) == "table" then
1152            -- happens here, otherwise no stable caching key, we could hash these too
1153            m = tohash(m)
1154            local i = 0
1155            for c in gmatch(k,".") do
1156                local ac = a[c]
1157                if not ac then
1158                    n = n + 1
1159                else
1160                    i = i + 1
1161                    if m[i] then
1162                        applyaction(ac,v,n)
1163                    end
1164                end
1165            end
1166        end
1167        t[k] = v
1168        return v
1169    end
1170
    -- Lazily filled lookup that is indexed as cache[m][a][k]:
    --
    --   m : the "matches" selector of a list (true, a number or a table)
    --   a : the marker to action table
    --   k : the positional pattern of a word
    --
    -- A miss at the innermost level calls analyzed(m,a,t,k), which stores and
    -- returns the option list for k.
    --
    -- NOTE(review): the middle level stores its new table under t[m] and not
    -- under t[a], so cache[m][a] is presumably resolved anew (with an empty
    -- innermost table) on every access. Before changing that to t[a]: analyzed
    -- ends up in applyaction, which reads the semi-global lh/rh, so sharing
    -- results between lists with different left/right values may be unwanted.
    local cache = setmetatableindex(function(t,m)
        local v = setmetatableindex(function(t,a)
            local v = setmetatableindex(function(t,k)
                return analyzed(m,a,t,k)
            end)
            t[m] = v
            return v
        end)
        t[m] = v
        return v
    end)
1182
1183    expand = function(str)
1184        return analyzed(true,actions,{},str)
1185    end
1186
1187    -- maybe also a skip symbol
1188
    -- replace1 : drops the marker characters | = < > + - . and the digits, all
    --            other utf8 characters are kept
    -- replace2 : keeps those marker characters and digits and turns every other
    --            utf8 character into a period
    -- addgoodies uses the first result as key into the properties and the second
    -- one as the pattern that gets analyzed.
    local replace1 = Cs ( ( S("|=<>+-.0123456789")/"" + lpegpatterns.utf8character    )^0 )
    local replace2 = Cs ( ( S("|=<>+-.0123456789")    + lpegpatterns.utf8character/".")^0 )
1191
1192    local function stripped(str)
1193        -- todo : lpeg
1194        str = gsub(str,"%-%-[^\n]*\n","")
1195        str = gsub(str,"%%[^\n]*\n","")
1196        str = gsub(str,"%s+"," ")
1197        str = gsub(str,"^%s+","")
1198        str = gsub(str,"%s+$","")
1199        return str
1200    end
1201
1202    local registerexceptions  do
1203
1204        local lbrace   = P("{")
1205        local rbrace   = P("}")
1206        local lbracket = P("[")
1207        local rbracket = P("]")
1208        local lparent  = P("(")
1209        local rparent  = P(")")
1210        local hyphen   = P("-")
1211
1212        local p = Cs ( (
1213            lbrace *  ((1-rbrace)^0) * rbrace
1214          * lbrace *  ((1-rbrace)^0) * rbrace
1215          * lbrace * C((1-rbrace)^0) * rbrace * (lparent * C((1-rparent)^0) * rparent)^0 / function(a,b) return b or a end
1216          + (lbracket * (1-rbracket)^0 * rbracket) / ""
1217          + hyphen / ""
1218          + lpegpatterns.utf8character
1219        )^0 )
1220
1221        registerexceptions = function(target,str)
1222            local kind = type(str)
1223            if kind == "string" then
1224                for v in gmatch(stripped(str),"%S+") do
1225                    local k = lpegmatch(p,v)
1226                    if k ~= v then
1227                        target[k] = v
1228                    end
1229                end
1230            elseif kind == "table" then
1231                local n = #str
1232                if n > 0 then
1233                    for i=1,n do
1234                        local v = str[i]
1235                        local k = lpegmatch(p,v)
1236                        if k ~= v then
1237                            target[k] = v
1238                        end
1239                    end
1240                else
1241                    -- maybe check for sanity
1242                    for k, v in next, str do
1243                        target[k] = v
1244                    end
1245                end
1246            end
1247        end
1248
1249    end
1250
1251    local registerexperiments  do
1252
1253        registerexperiments = function(target,str)
1254            local kind = type(str)
1255            if kind == "function" then
1256                target[#target+1] = str
1257            end
1258        end
1259
1260    end
1261
1262    function languages.strippedgoodiewords(str)
1263        return lpegmatch(replace1,stripped(str))
1264    end
1265
1266    local splitter = lpeg.tsplitat(" ")
1267
1268    local function addgoodies(tag,list,filename)
1269        local np = 0
1270        local nd = 0
1271        local nw = 0
1272        local nl = #list
1273        --
1274        local data          = goodiesdata[tag]
1275        local properties    = data.properties
1276        local replacements  = data.replacements
1277        local substitutions = data.substitutions
1278        local characters    = data.characters
1279        local exceptions    = data.exceptions
1280        local experiments   = data.experiments
1281        if filename then
1282            if not data.goodies then
1283                data.goodies = { }
1284            end
1285            insert(data.goodies,filename)
1286        end
1287        --
1288        lh = false
1289        rh = false
1290        --
1291        for i=1,nl do
1292            local l = list[i]
1293            if type(l) == "table" then
1294                local w = l.words
1295                local p = l.patterns
1296                local s = l.substitutions
1297                local c = l.characters
1298                local e = l.exceptions
1299                local x = l.experiments
1300                lh = l.left  or false -- for practical reasons these are semi-global
1301                rh = l.right or false -- for practical reasons these are semi-global
1302                if c then
1303                    for v in utfvalues(c) do
1304                        characters[v] = true
1305                    end
1306                end
1307                if w then
1308                    local prefixes    = l.prefixes
1309                    local nofprefixes = 0
1310                    local suffixes    = l.suffixes
1311                    local nofsuffixes = 0
1312                    if prefixes then
1313                        prefixes    = lpegmatch(splitter,lower(stripped(prefixes)))
1314                        nofprefixes = #prefixes
1315                    end
1316                    if suffixes then
1317                        suffixes    = lpegmatch(splitter,lower(stripped(suffixes)))
1318                        nofsuffixes = #suffixes
1319                    end
1320                    w = lower(stripped(w))
1321                    if p then
1322                        local pattern = Cs((utfchartabletopattern(p) / p + 1)^0)
1323                        w = lpegmatch(pattern,w)
1324                        np = np + 1
1325                    else
1326                        nd = nd + 1
1327                    end
1328                    local m = l.matches
1329                    if not m then
1330                        m = true
1331                    end
1332                    local a = l.actions
1333                    if a then
1334                        setmetatableindex(a,actions)
1335                    else
1336                        a = actions
1337                    end
1338                    local cach = cache[m][a]
1339                    if nofprefixes > 0 then
1340                        if nofsuffixes > 0 then
1341                            for wrd in gmatch(w,"%S+") do
1342                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1343                                nw = nw + 1
1344                                for i=1,nofprefixes do
1345                                    local tmp = prefixes[i] .. wrd
1346                                    for i=1,nofsuffixes do
1347                                        local str = tmp .. suffixes[i]
1348                                        properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1349                                        nw = nw + 1
1350                                    end
1351                                end
1352                            end
1353                        else
1354                            for wrd in gmatch(w,"%S+") do
1355                                properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1356                                nw = nw + 1
1357                                for i=1,nofprefixes do
1358                                    local str = prefixes[i] .. wrd
1359                                    properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1360                                    nw = nw + 1
1361                                end
1362                            end
1363                        end
1364                    elseif nofsuffixes > 0 then
1365                        for wrd in gmatch(w,"%S+") do
1366                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1367                            nw = nw + 1
1368                            for i=1,nofsuffixes do
1369                                local str = wrd .. suffixes[i]
1370                                properties[lpegmatch(replace1,str)] = cach[lpegmatch(replace2,str)]
1371                                nw = nw + 1
1372                            end
1373                        end
1374                    else
1375                        for wrd in gmatch(w,"%S+") do
1376                            properties[lpegmatch(replace1,wrd)] = cach[lpegmatch(replace2,wrd)]
1377                            nw = nw + 1
1378                        end
1379                    end
1380                elseif s then
1381                    for k, v in next, s do
1382                        substitutions[k] = v
1383                    end
1384                elseif p then
1385                    for k, v in next, p do
1386                        replacements[k] = v
1387                    end
1388                elseif e then
1389                    registerexceptions(exceptions,e)
1390                elseif x then
1391                    registerexperiments(experiments,x)
1392                end
1393            end
1394        end
1395
1396        lh = false
1397        rh = false
1398
1399        return { np = np, nd = nd, nw = nw, nl = nl }
1400    end
1401
1402    function languages.goodiefiles(tag)
1403        local d = goodiesdata[tag]
1404        return d and d.goodies
1405    end
1406
1407    function languages.addgoodiesfile(tag,filename)
1408        local fullname = resolvers.findfile(file.addsuffix(filename,"llg")) or ""
1409        if fullname == "" then
1410            report_goodies("file %a is not found",filename)
1411        else
1412            local list = table.load(fullname)
1413            if not list then
1414                report_goodies("file %a is invalid",fullname)
1415            else
1416                list = list.options
1417                if not list then
1418                    report_goodies("file %a has no options",fullname)
1419                else
1420                    local ok = addgoodies(tag,list,filename)
1421                    report_goodies("tag %a, file %a loaded, %i lists, %i via patterns, %i direct, %i words",
1422                        tag,fullname,ok.nl,ok.np,ok.nd,ok.nw)
1423                end
1424            end
1425        end
1426    end
1427
1428    function languages.addgoodiesdata(tag,list)
1429        local ok = addgoodies(tag,list)
1430        report_goodies("tag %a, data loaded, %i lists, %i via patterns, %i direct, %i words",
1431            tag,ok.nl,ok.np,ok.nd,ok.nw)
1432    end
1433
1434end
1435
1436if environment.initex then
1437
1438    function languages.getnumber()
1439        return 0
1440    end
1441
1442else
1443
1444    function languages.getnumber(tag,default,patterns,goodies,factor)
1445        local l = registered[tag]
1446        if l then
1447            if l.dirty then
1448                l.factor = factor == v_yes and true or false
1449                if trace_patterns then
1450                    report_initialization("checking patterns for %a with default %a",tag,default)
1451                end
1452                -- patterns is already resolved to parent patterns if applicable
1453                if patterns and patterns ~= "" then
1454                    if l.patterns ~= patterns then
1455                        l.patterns = patterns
1456                        if trace_patterns then
1457                            report_initialization("loading patterns for %a using specification %a",tag,patterns)
1458                        end
1459                        loaddefinitions(tag,l)
1460                    else
1461                        -- unchanged
1462                    end
1463                elseif l.patterns == "" then
1464                    l.patterns = tag
1465                    if trace_patterns then
1466                        report_initialization("loading patterns for %a using tag",tag)
1467                    end
1468                    local ok = loaddefinitions(tag,l)
1469                    if not ok and tag ~= default then
1470                        l.patterns = default
1471                        if trace_patterns then
1472                            report_initialization("loading patterns for %a using default",tag)
1473                        end
1474                        loaddefinitions(tag,l)
1475                    end
1476                end
1477                if goodies and goodies ~= "" then
1478                    goodies = settings_to_array(goodies)
1479                    for i=1,#goodies do
1480                        local goodie = goodies[i]
1481                        -- we can cache this but it doesn't pay off to do so
1482                        languages.addgoodiesfile(tag,goodie)
1483                    end
1484                    languages.setgoodieshandler {
1485                        tag     = tag,
1486                        goodies = tag,
1487                    }
1488                end
1489                l.loaded = true
1490                l.dirty  = false
1491            end
1492            return l.number
1493        else
1494            return 0
1495        end
1496    end
1497
    -- Register language number 0 under the tag "null", with its own instance
    -- created by new_language(0). It has no patterns, parent or dirty flag.
    numbers[0] = "null"

    registered.null = {
        number   = 0,
        instance = new_language(0),
    }
1504
1505end
1506
1507-- hyphenation.define        ("zerolanguage")
1508-- hyphenation.loadpatterns  ("zerolanguage") -- else bug
1509-- hyphenation.loadexceptions("zerolanguage") -- else bug
1510
languages.logger = languages.logger or { }

-- Return a space separated list of "tag:parent:number" entries for all
-- registered languages that are flagged as loaded, sorted by tag, or the
-- string "none" when there are no such languages.
function languages.logger.report()
    local entries = { }
    for tag, data in sortedhash(registered) do
        if data.loaded then
            entries[#entries+1] = format("%s:%s:%s",tag,data.parent,data.number)
        end
    end
    if #entries == 0 then
        return "none"
    end
    return concat(entries," ")
end
1523
-- Default associations for a few languages. These are hard coded here for now:
-- they must happen at the tex end .. will use lang-def.lua
--
-- NOTE(review): the arguments look like (language tag, script, font language);
-- confirm against languages.associate, which is defined outside this part.

languages.associate('en','latn','eng')
languages.associate('uk','latn','eng')
languages.associate('nl','latn','nld')
languages.associate('de','latn','deu')
languages.associate('fr','latn','fra')
1531
-- Statistics line with the loaded languages and the time spent on loading;
-- nothing is returned when no language got loaded.
statistics.register("loaded patterns", function()
    local loaded = languages.logger.report()
    if loaded == "none" then
        return
    end
    return format("%s, load time: %s",loaded,statistics.elapsedtime(languages))
end)
1539
1540-- statistics.register("language load time", function()
1541--     -- often zero so we can merge that in the above
1542--     return statistics.elapsedseconds(languages, format(", nofpatterns: %s",nofloaded))
1543-- end)
1544
-- interface
--
-- Each entry registers a named command via interfaces.implement: "actions" is
-- the function to run and "arguments" tells how many string arguments it gets.
-- NOTE(review): where "actions" is a list, the functions are presumably applied
-- in sequence, so that the result ends up at context -- confirm against
-- interfaces.implement.

implement {
    name      = "languagenumber",
    actions   = { languages.getnumber, context },
    arguments = "5 strings"
}

implement {
    name      = "installedlanguages",
    actions   = { languages.installed, context },
}

implement {
    name      = "definelanguage",
    actions   = languages.define,
    arguments = "2 strings"
}

implement {
    name      = "setlanguagesynonym",
    actions   = languages.setsynonym,
    arguments = "2 strings"
}

implement {
    name      = "unloadlanguage",
    actions   = languages.unload,
    arguments = "string"
}

implement {
    name      = "setlanguageexceptions",
    actions   = languages.setexceptions,
    arguments = "2 strings"
}

implement {
    name      = "setlanguagepatterns",
    actions   = languages.setpatterns,
    arguments = "2 strings"
}

implement {
    name      = "setlanguageoptions",
    actions   = languages.setoptions,
    arguments = "2 strings"
}
1593
-- typesets the pre hyphen character of the current language, if it has one
-- with a positive character code
implement {
    name    = "currentprehyphenchar",
    actions = function()
        local char = prehyphenchar(tolang())
        if char then
            if char > 0 then
                context.char(char)
            end
        end
    end
}
1603
-- typesets the post hyphen character of the current language, if it has one
-- with a positive character code
implement {
    name    = "currentposthyphenchar",
    actions = function()
        local char = posthyphenchar(tolang())
        if char then
            if char > 0 then
                context.char(char)
            end
        end
    end
}
1613