-- lang-ini.lua / size: 20 Kb / last modification: 2021-10-28 13:50
-- register this file in the global module list

modules = modules or { }

modules['lang-ini'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9-- needs a cleanup (share locals)
10-- discard language when redefined
11
12-- 002D : hyphen-minus (ascii)
13-- 2010 : hyphen
14-- 2011 : nonbreakable hyphen
15-- 2013 : endash (compound hyphen)
16
17--~ lang:hyphenation(string) string = lang:hyphenation() lang:clear_hyphenation()
18
19-- todo: no foo:bar but foo(bar,...)
20
-- shortcuts to lua core functions

local type     = type
local tonumber = tonumber
local next     = next

-- shortcuts to string, table and utf helpers

local utfbyte       = utf.byte
local format        = string.format
local gsub          = string.gsub
local gmatch        = string.gmatch
local find          = string.find
local concat        = table.concat
local sortedkeys    = table.sortedkeys
local sortedhash    = table.sortedhash
local keys          = table.keys
local insert        = table.insert
local utfvalues     = string.utfvalues
local strip         = string.strip
local utfcharacters = utf.characters

-- interfacing

local context   = context
local commands  = commands
local implement = interfaces.implement

local settings_to_array = utilities.parsers.settings_to_array
local settings_to_set   = utilities.parsers.settings_to_set

-- tracing and reporting

local trace_patterns = false

trackers.register("languages.patterns", function(v)
    trace_patterns = v
end)

local report_initialization = logs.reporter("languages","initialization")

-- the lang library, also made available under the name language

local lang = lang
language   = lang -- we use that in lmtx

local prehyphenchar    = language.prehyphenchar    -- global per language
local posthyphenchar   = language.posthyphenchar   -- global per language
local preexhyphenchar  = language.preexhyphenchar  -- global per language
local postexhyphenchar = language.postexhyphenchar -- global per language
----- lefthyphenmin    = language.lefthyphenmin
----- righthyphenmin   = language.righthyphenmin
local sethjcode        = language.sethjcode

local uccodes = characters.uccodes
local lccodes = characters.lccodes

local new_language = language.new

-- the languages namespace; existing tables are reused when already present

languages       = languages or {}
local languages = languages

languages.version = 1.010

languages.registered = languages.registered or { }
local registered     = languages.registered

languages.associated = languages.associated or { }
local associated     = languages.associated

languages.numbers = languages.numbers or { }
local numbers     = languages.numbers

languages.data = languages.data or { }
local data     = languages.data

storage.register("languages/registered",registered,"languages.registered")
storage.register("languages/associated",associated,"languages.associated")
storage.register("languages/numbers",   numbers,   "languages.numbers")
storage.register("languages/data",      data,      "languages.data")

-- interface constants

local variables = interfaces.variables

local v_reset = variables.reset
local v_yes   = variables.yes

-- counts the definition files that loaddefinitions has read successfully

local nofloaded = 0
82
-- Look up the registered entry for tag and return it together with its
-- language object. The object is created from the entry's number on first
-- use and cached in the entry's instance field. Both results are nil when
-- the tag is not registered.

local function resolve(tag)
    local entry = registered[tag]
    if not entry then
        return nil, nil
    end
    local instance = entry.instance
    if not instance then
        instance = new_language(entry.number)
        entry.instance = instance
    end
    return entry, instance
end
94
-- Return the language object that belongs to what, which can be a language
-- number or a tag; when omitted the current tex.language is used. The
-- object is created on demand and cached in the entry. Nothing is returned
-- when no registered language matches.

local function tolang(what) -- returns lang object
    if not what then
        what = tex.language
    end
    local tag = numbers[what]
    local data = tag and registered[tag] or registered[what]
    if data then
        -- The cache lives in the instance field, the same one that resolve
        -- reads and that is written below. Reading any other field would
        -- miss the cache and create a new language object on every call.
        local instance = data.instance
        if not instance then
            instance = new_language(data.number)
            data.instance = instance
        end
        return instance
    end
end
110
-- Return the registered entry for a tag or a language number; without
-- argument the entry of the current tex.language is returned.

function languages.getdata(tag) -- or number
    if not tag then
        return registered[numbers[tex.language]]
    end
    return registered[tag] or registered[numbers[tag]]
end
118
119-- languages.tolang = tolang
120
121-- patterns=en
122-- patterns=en,de
123
-- Return the data string stored in loaded[what] (what is for instance
-- "patterns" or "exceptions"), decompressing it first when the dataset says
-- it is zlib compressed. Nothing is returned when the dataset is absent,
-- has no data, or cannot be decompressed. The tag is only used in reports.

local function validdata(loaded,what,tag)
    local dataset = loaded[what]
    if not dataset then
        return
    end
    local data = dataset.data
    if not data or data == "" then
        return
    end
    if dataset.compression == "zlib" then
        data = zlib.decompress(data)
        if not data then
            -- NOTE(review): assumes a failed decompress can yield nil instead of
            -- raising an error; the guard is harmless when it always raises
            report_initialization("compression error in %a for language %a",what,tag)
            return
        end
        if dataset.length and dataset.length ~= #data then
            -- the message has two placeholders: the kind of data and the tag
            report_initialization("compression error in %a for language %a",what,tag)
        end
    end
    return data
end
141
142-- languages.hjcounts[unicode].count
143
144-- hjcode: 0       not to be hyphenated
145--         1--31   length
146--         32      zero length
147--         > 32    hyphenated with length 1
148
-- Assign hj codes in the given language instance for all characters listed
-- in loaded[what].characters, which is either a table keyed by characters
-- or a string. Each assigned code is also stored in loaded.codehash. When
-- factor is set, the counts in languages.hjcounts (if present) determine
-- the code; otherwise the lowercase code (or the code itself) is used.

local function sethjcodes(instance,loaded,what,factor)
    local section = loaded[what]
    local chars   = section and section.characters
    if not chars then
        return
    end
    --
    local hjcounts = factor and languages.hjcounts or false
    --
    local codehash = loaded.codehash
    if not codehash then
        codehash = { }
        loaded.codehash = codehash
    end
    --
    local function setcode(code)
        local lower = lccodes[code] -- just in case we get a mixture
        local upper = uccodes[code] -- just in case we get a mixture
        local value = lower
        if type(value) ~= "number" then
            -- no usable lowercase mapping: the character maps onto itself
            lower = code
            value = code
        end
        if hjcounts then
            local entry = hjcounts[value]
            local count = entry and entry.count
            if not count then
                -- no entry or no count: keep the value as it is
            elseif count <= 0 then
                -- counts as 0 i.e. ignored
                value = 32
            elseif count >= 31 then
                -- counts as 31
                value = 31
            else
                -- count that many times
                value = count
            end
        end
        sethjcode(instance,lower,value)
        codehash[lower] = value
        if upper ~= lower and type(upper) == "number" then
            sethjcode(instance,upper,value)
            codehash[upper] = value
        end
    end
    --
    -- savinghyphcodes is zeroed while we assign and restored afterwards
    local saved = tex.savinghyphcodes
    tex.savinghyphcodes = 0
    if type(chars) == "table" then
        for character in sortedhash(chars) do
            setcode(utfbyte(character))
        end
    else
        for unicode in utfvalues(chars) do
            setcode(unicode)
        end
    end
    tex.savinghyphcodes = saved
end
209
210-- 2'2 conflicts with 4' ... and luatex barks on it
211
local P            = lpeg.P
local R            = lpeg.R
local Cs           = lpeg.Cs
local Ct           = lpeg.Ct
local lpegmatch    = lpeg.match
local lpegpatterns = lpeg.patterns

local utfsplit = utf.split

local space       = lpegpatterns.space
local whitespace  = lpegpatterns.whitespace^1
local nospace     = lpegpatterns.utf8char - whitespace
local digit       = lpegpatterns.digit
----- endofstring = #whitespace + P(-1)
local endofstring = #whitespace

-- In a word the leading digits are dropped; of the remaining digits, one
-- that is directly followed by whitespace is dropped as well and any other
-- one is replaced by a space. The substitutions take effect in the Cs below.

local word        = (digit/"")^0 * (digit/"" * endofstring + digit/" " + nospace)^1

-- any run of non whitespace

local anyword     = (1-whitespace)^1

-- collects the rewritten words of a whitespace separated string in a table

local analyze     = Ct((whitespace + Cs(word))^1)
226
-- Merge the pattern strings in the list loaded into one string. With more
-- than one string, patterns that occur more than once after analysis (i.e.
-- that only differ in their digits) are reported and all their occurrences
-- are removed from the merged result. The tag and requested arguments are
-- only used in reports. Note that the list itself gets a " " prepended.

local function unique(tag,requested,loaded)
    local count = #loaded
    if count == 0 then
        return ""
    end
    if count == 1 then
        return loaded[1]
    end
    insert(loaded,1," ") -- no need then for special first word
    local merged = concat(loaded," ")
    local words  = lpegmatch(analyze,merged) or { }
    local seen   = { }
    local bad    = { }
    for i=1,#words do
        local current = words[i]
        local state   = seen[current]
        if not state then
            seen[current] = 1
        elseif state == 1 then
            -- second occurrence: register the conflict once
            seen[current] = 2
            bad[#bad+1] = utfsplit(current," ")
        end
    end
    local nofbad = #bad
    if nofbad == 0 then
        return merged
    end
    -- build one pattern that matches any of the conflicting words, with a
    -- digit between the parts that were separated by a space
    local conflicts
    for i=1,nofbad do
        local parts   = bad[i]
        local pattern = P(parts[1])
        for j=2,#parts do
            pattern = pattern * digit * P(parts[j])
        end
        if conflicts then
            conflicts = conflicts + pattern
        else
            conflicts = pattern
        end
        report_initialization("language %a, patterns %a, discarding conflict (0-9)%{[0-9]}t(0-9)",tag,requested,parts)
    end
    words, seen, bad = nil, nil, nil -- permit gc
    local someword = digit^0 * conflicts * digit^0 * endofstring / ""
    local stripper = Cs((someword + anyword + whitespace)^1)
    return lpegmatch(stripper,merged) or merged
end
277
-- The shared exceptions: false as long as lang-exc.lua has not been looked
-- for, a string when that file returned a table (its entries joined with
-- spaces) and true when there is nothing to share.

local shared = false

-- Load the definition files (lang-<name>.lua, optionally gzipped) that are
-- listed in specification.patterns into the language registered as tag.
-- The keyword stored in v_reset clears what has been collected so far. Per
-- file the hj codes are set and the patterns and exceptions are collected;
-- at the end these are assigned to the language instance, followed by the
-- shared exceptions.
--
-- Returns true when at least one file was loaded, false when none could be
-- loaded and nil when the specification lists no definitions. The timer is
-- stopped on every path.

local function loaddefinitions(tag,specification)
    statistics.starttiming(languages)
    local data, instance = resolve(tag)
    local requested = specification.patterns or ""
    local definitions = settings_to_array(requested)
    local ok -- stays nil when there are no definitions
    if #definitions > 0 then
        if trace_patterns then
            report_initialization("pattern specification for language %a: %s",tag,specification.patterns)
        end
        -- start with what the instance already has
        local ploaded = instance:patterns()
        local eloaded = instance:hyphenation()
        if not ploaded or ploaded == ""  then
            ploaded = { }
        else
            ploaded = { ploaded }
        end
        if not eloaded or eloaded == ""  then
            eloaded = { }
        else
            eloaded = { eloaded }
        end
        local dataused  = data.used
        local resources = data.resources or { }
        data.resources  = resources
        ok = false
        if not shared then
            local found = resolvers.findfile("lang-exc.lua")
            -- an empty string also means: not found (compare the lookup of
            -- the definition files below)
            if found and found ~= "" then
                shared = dofile(found)
                if type(shared) == "table" then
                    shared = concat(shared," ")
                else
                    shared = true
                end
            else
                shared = true
            end
        end
        for i=1,#definitions do
            local definition = definitions[i]
            if definition == "" then
                -- error
            elseif definition == v_reset then
                if trace_patterns then
                    report_initialization("clearing patterns for language %a",tag)
                end
                instance:clear_patterns()
                instance:clear_hyphenation()
                ploaded = { }
                eloaded = { }
            elseif not dataused[definition] then
                dataused[definition] = definition
                local filename = "lang-" .. definition .. ".lua"
                local fullname = resolvers.findfile(filename) or ""
                if fullname == "" then
                    fullname = resolvers.findfile(filename .. ".gz") or ""
                end
                if fullname ~= "" then
                    if trace_patterns then
                        report_initialization("loading definition %a for language %a from %a",definition,tag,fullname)
                    end
                    local suffix, gzipped = gzip.suffix(fullname)
                    local loaded = table.load(fullname,gzipped and gzip.load)
                    if loaded then -- todo: version test
                        ok, nofloaded = true, nofloaded + 1
                        sethjcodes(instance,loaded,"patterns",specification.factor)
                        sethjcodes(instance,loaded,"exceptions",specification.factor)
                        local p = validdata(loaded,"patterns",tag)
                        local e = validdata(loaded,"exceptions",tag)
                        if p and p ~= "" then
                            ploaded[#ploaded+1] = p
                        end
                        if e and e ~= "" then
                            eloaded[#eloaded+1] = e
                        end
                        resources[#resources+1] = loaded -- so we can use them otherwise
                    else
                        report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                    end
                elseif trace_patterns then
                    report_initialization("invalid definition %a for language %a in %a",definition,tag,filename)
                end
            elseif trace_patterns then
                report_initialization("definition %a for language %a already loaded",definition,tag)
            end
        end
        if #ploaded > 0 then
            -- why not always clear
            instance:clear_patterns()
            instance:patterns(unique(tag,requested,ploaded))
        end
        if #eloaded > 0 then
            -- why not always clear
            instance:clear_hyphenation()
            instance:hyphenation(concat(eloaded," "))
        end
        if type(shared) == "string" then
            instance:hyphenation(shared)
        end
    elseif trace_patterns then
        report_initialization("no definitions for language %a",tag)
    end
    -- we always get here, so the timer that was started above is stopped
    -- no matter whether something was loaded
    statistics.stoptiming(languages)
    return ok
end
385
-- the number of defined languages is kept in the shared storage

storage.shared.noflanguages = storage.shared.noflanguages or 0

local noflanguages = storage.shared.noflanguages

-- Register a language under tag: it gets the next free number and a fresh
-- entry that is marked dirty, so that its patterns get checked when the
-- number is asked for. An optional parent tag is stored with the entry.

function languages.define(tag,parent)
    local number = noflanguages + 1
    noflanguages = number
    if trace_patterns then
        report_initialization("assigning number %a to %a",number,tag)
    end
    numbers[number] = tag
    -- the instance field (luatex data structure) is set on demand
    registered[tag] = {
        tag      = tag,
        parent   = parent or "",
        number   = number,
        patterns = "",
        used     = { },
        synonyms = { },
        loaded   = false,
        dirty    = true,
    }
    storage.shared.noflanguages = number
end
409
-- Flag synonym as a synonym in the entry of tag; unknown tags are ignored.

function languages.setsynonym(synonym,tag) -- convenience function
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.synonyms[synonym] = true -- maybe some day more info
end
415end
416
417function languages.installed(separator)
418    return concat(sortedkeys(registered),separator or ",")
419end
420
-- Return the tag that belongs to language number n; tex.language is used
-- when n is absent or cannot be converted to a number.

function languages.current(n)
    local number = n and tonumber(n)
    if not number then
        number = tex.language
    end
    return numbers[number]
end
424
-- Store a script and language pair for tag.

function languages.associate(tag,script,language) -- not yet used
    local pair = { script, language }
    associated[tag] = pair
end
428
-- Return the script and language that were associated with tag, which can
-- also be a language number; without tag the current tex.language is used.
-- Nothing is returned when there is no association.

function languages.association(tag) -- not yet used
    if type(tag) == "number" then
        tag = numbers[tag]
    elseif not tag then
        tag = numbers[tex.language]
    end
    local pair = tag and associated[tag]
    if not pair then
        return
    end
    return pair[1], pair[2]
end
440
-- Return true when tag is registered and the file lang-<patterns>.lua of
-- that language can be found, and false otherwise. The defaultlanguage
-- argument is not used.

function languages.loadable(tag,defaultlanguage) -- hack
    local l = registered[tag] -- no synonyms
    if not l then
        return false
    end
    local found = resolvers.findfile("lang-"..l.patterns..".lua")
    -- an empty string is a true value in lua, so we explicitly test for it:
    -- it means that the file was not found (loaddefinitions does the same)
    if found and found ~= "" then
        return true
    else
        return false
    end
end
449
450-- a bit messy, we will do all language setting in lua as we can now assign
451-- and 'patterns' will go away here.
452
-- Mark the language as dirty so that languages.getnumber checks its
-- patterns again; unknown tags are ignored.

function languages.unload(tag)
    local entry = registered[tag]
    if not entry then
        return
    end
    entry.dirty = true
end
459
if environment.initex then

    -- in initex mode every request yields zero

    function languages.getnumber()
        return 0
    end

else

    -- Return the number of the language registered as tag, or zero when
    -- the tag is unknown. As long as the entry is dirty the patterns are
    -- checked first: an explicit patterns specification is loaded when it
    -- differs from the current one; without one (and with nothing set yet)
    -- the tag itself is tried and after that the default. The factor flag
    -- is stored in the entry as a boolean (true when it equals v_yes).

    function languages.getnumber(tag,default,patterns,factor)
        local l = registered[tag]
        if not l then
            return 0
        end
        if l.dirty then
            l.factor = factor == v_yes
            if trace_patterns then
                report_initialization("checking patterns for %a with default %a",tag,default)
            end
            -- patterns is already resolved to parent patterns if applicable
            local current = l.patterns
            if patterns and patterns ~= "" then
                if current ~= patterns then
                    l.patterns = patterns
                    if trace_patterns then
                        report_initialization("loading patterns for %a using specification %a",tag,patterns)
                    end
                    loaddefinitions(tag,l)
                end
                -- otherwise unchanged
            elseif current == "" then
                l.patterns = tag
                if trace_patterns then
                    report_initialization("loading patterns for %a using tag",tag)
                end
                local ok = loaddefinitions(tag,l)
                if not ok and tag ~= default then
                    l.patterns = default
                    if trace_patterns then
                        report_initialization("loading patterns for %a using default",tag)
                    end
                    loaddefinitions(tag,l)
                end
            end
            l.loaded = true
            l.dirty  = false
        end
        return l.number
    end

end
510
-- not that useful, global values
512
-- accessors that pass the language object found for what (a number, a tag
-- or nothing for the current language) to the matching library function

function languages.prehyphenchar(what)
    return prehyphenchar(tolang(what))
end

function languages.posthyphenchar(what)
    return posthyphenchar(tolang(what))
end

function languages.preexhyphenchar(what)
    return preexhyphenchar(tolang(what))
end

function languages.postexhyphenchar(what)
    return postexhyphenchar(tolang(what))
end
517-------- languages.lefthyphenmin   (what) return lefthyphenmin   (tolang(what)) end
518-------- languages.righthyphenmin  (what) return righthyphenmin  (tolang(what)) end
519
520-- e['implementer']= 'imple{m}{-}{-}menter'
521-- e['manual'] = 'man{}{}{}'
522-- e['as'] = 'a-s'
523-- e['user-friendly'] = 'user=friend-ly'
524-- e['exceptionally-friendly'] = 'excep-tionally=friend-ly'
525
-- characters that are part of the exception syntax and get no hj code

local invalid = { "{", "}", "-" }

-- Add the characters used in str to the set data.extras.characters (which
-- is created when needed), remove the syntax characters again and set the
-- hj codes for what is left.

local function collecthjcodes(data,str)
    local extras = data.extras
    local found  = extras and extras.characters or { }
    for character in utfcharacters(str) do
        if not found[character] then
            found[character] = true
        end
    end
    -- less checks this way
    for i=1,#invalid do
        local character = invalid[i]
        if found[character] then
            found[character] = nil
        end
    end
    data.extras = { characters = found }
    sethjcodes(data.instance,data,"extras",data.factor)
end
544
-- Read the content of filename and pass it as exceptions to the language
-- registered as tag, after setting the hj codes of the characters used. A
-- file that cannot be loaded counts as empty; unknown tags are ignored.

function languages.loadwords(tag,filename)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    statistics.starttiming(languages)
    local words = io.loaddata(filename) or ""
    collecthjcodes(data,words)
    instance:hyphenation(words)
    statistics.stoptiming(languages)
end
555
556
-- Pass the stripped string as exceptions to the language registered as
-- tag, after setting the hj codes of the characters used; unknown tags
-- are ignored.

function languages.setexceptions(tag,str)
    local data, instance = resolve(tag)
    if not data then
        return
    end
    local exceptions = strip(str) -- we need to strip leading spaces
    collecthjcodes(data,exceptions)
    instance:hyphenation(exceptions)
end
565
-- Return what the hyphenate method of the language instance makes of str;
-- for an unknown tag the string is returned as it is.

function languages.hyphenate(tag,str)
    -- todo: does this still work?
    local data, instance = resolve(tag)
    if not data then
        return str
    end
    return instance:hyphenate(str)
end
575
576-- hyphenation.define        ("zerolanguage")
577-- hyphenation.loadpatterns  ("zerolanguage") -- else bug
578-- hyphenation.loadexceptions("zerolanguage") -- else bug
579
languages.logger = languages.logger or { }

-- Return a space separated list of tag:parent:number entries for the
-- languages that are flagged as loaded (sorted by tag), or "none".

function languages.logger.report()
    local entries = { }
    for tag, l in sortedhash(registered) do
        if l.loaded then
            entries[#entries+1] = format("%s:%s:%s",tag,l.parent,l.number)
        end
    end
    if #entries == 0 then
        return "none"
    end
    return concat(entries," ")
end
592
593-- must happen at the tex end .. will use lang-def.lua
594
do

    -- tag, script, language

    local defaults = {
        { 'en', 'latn', 'eng' },
        { 'uk', 'latn', 'eng' },
        { 'nl', 'latn', 'nld' },
        { 'de', 'latn', 'deu' },
        { 'fr', 'latn', 'fra' },
    }

    for i=1,#defaults do
        local default = defaults[i]
        languages.associate(default[1],default[2],default[3])
    end

end
600
-- report the loaded languages and the load time, but only when there is
-- something to report

statistics.register("loaded patterns", function()
    local result = languages.logger.report()
    if result == "none" then
        return
    end
    return format("%s, load time: %s",result,statistics.elapsedtime(languages))
end)
608
609-- statistics.register("language load time", function()
610--     -- often zero so we can merge that in the above
611--     return statistics.elapsedseconds(languages, format(", nofpatterns: %s",nofloaded))
612-- end)
613
614-- interface
615
implement {
    name      = "languagenumber",
    arguments = "4 strings",
    actions   = { languages.getnumber, context },
}

implement {
    name    = "installedlanguages",
    actions = { languages.installed, context },
}

implement {
    name      = "definelanguage",
    arguments = "2 strings",
    actions   = languages.define,
}

implement {
    name      = "setlanguagesynonym",
    arguments = "2 strings",
    actions   = languages.setsynonym,
}

implement {
    name      = "unloadlanguage",
    arguments = "string",
    actions   = languages.unload,
}

implement {
    name      = "setlanguageexceptions",
    arguments = "2 strings",
    actions   = languages.setexceptions,
}

do

    -- Make an action that asks getchar for the hyphen character of the
    -- current language and passes it to context.char when it is positive.

    local function flusher(getchar)
        return function()
            local c = getchar(tolang())
            if c and c > 0 then
                context.char(c)
            end
        end
    end

    implement {
        name    = "currentprehyphenchar",
        actions = flusher(prehyphenchar),
    }

    implement {
        name    = "currentposthyphenchar",
        actions = flusher(posthyphenchar),
    }

end
670