regi-ini.lmt /size: 9 Kb    last modification: 2024-01-16 10:22
1if not modules then modules = { } end modules ['regi-ini'] = {
2    version   = 1.001,
3    comment   = "companion to regi-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- Regimes take care of converting the input characters into UTF sequences. The
10-- conversion tables are loaded at runtime.
11
12local tostring = tostring
13local utfchar = utf.char
14local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match
15local char, gsub, format, gmatch, byte, match, lower = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match, string.lower
16local next = next
17local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy
18local concat = table.concat
19local totable = string.totable
20
21local allocate          = utilities.storage.allocate
22local sequencers        = utilities.sequencers
23local textlineactions   = resolvers.openers.helpers.textlineactions
24local setmetatableindex = table.setmetatableindex
25
26-- We will hook regime handling code into the input methods.
27
28local trace_translating = false  trackers.register("regimes.translating", function(v) trace_translating = v end)
29
30local report_loading     = logs.reporter("regimes","loading")
31local report_translating = logs.reporter("regimes","translating")
32
-- The global regimes namespace is reused when it already exists; the local
-- alias is what the rest of this file works with.
regimes        = regimes or { }
local regimes  = regimes

-- mapping: regime name -> conversion vector (eight bit character -> utf
-- string). The utf entry is false, which makes fromregime pass lines on
-- untouched.
local mapping  = allocate {
    utf = false
}

-- backmapping: regime name -> reversed vector (utf string -> eight bit
-- character), filled by loadreverse.
local backmapping = allocate {
}
42
43-- regimes.mapping  = mapping
44
-- Alternative regime names, mapped onto the name that is used to locate the
-- vector file (regi-<name>.lua). The keys are lowercase: loadregime lowercases
-- a name before it consults this table.
local synonyms = { -- backward compatibility list

    -- windows code pages
    ["windows-1250"] = "cp1250",
    ["windows-1251"] = "cp1251",
    ["windows-1252"] = "cp1252",
    ["windows-1253"] = "cp1253",
    ["windows-1254"] = "cp1254",
    ["windows-1255"] = "cp1255",
    ["windows-1256"] = "cp1256",
    ["windows-1257"] = "cp1257",
    ["windows-1258"] = "cp1258",

    -- short "il" names
    ["il1"]          = "8859-1",
    ["il2"]          = "8859-2",
    ["il3"]          = "8859-3",
    ["il4"]          = "8859-4",
    ["il5"]          = "8859-9",
    ["il6"]          = "8859-10",
    ["il7"]          = "8859-13",
    ["il8"]          = "8859-14",
    ["il9"]          = "8859-15",
    ["il10"]         = "8859-16",

    -- names with an iso prefix
    ["iso-8859-1"]   = "8859-1",
    ["iso-8859-2"]   = "8859-2",
    ["iso-8859-3"]   = "8859-3",
    ["iso-8859-4"]   = "8859-4",
    ["iso-8859-9"]   = "8859-9",
    ["iso-8859-10"]  = "8859-10",
    ["iso-8859-13"]  = "8859-13",
    ["iso-8859-14"]  = "8859-14",
    ["iso-8859-15"]  = "8859-15",
    ["iso-8859-16"]  = "8859-16",

    -- latin names (note that the numbers do not match the 8859 part numbers)
    ["latin1"]       = "8859-1",
    ["latin2"]       = "8859-2",
    ["latin3"]       = "8859-3",
    ["latin4"]       = "8859-4",
    ["latin5"]       = "8859-9",
    ["latin6"]       = "8859-10",
    ["latin7"]       = "8859-13",
    ["latin8"]       = "8859-14",
    ["latin9"]       = "8859-15",
    ["latin10"]      = "8859-16",

    -- utf variants, the empty name included
    ["utf-8"]        = "utf",
    ["utf8"]         = "utf",
    [""]             = "utf",

    ["windows"]      = "cp1252",

    ["pdf"]          = "pdfdoc",

    ["437"]          = "ibm",
}
100
101local currentregime = "utf"
102
-- Loader for the mapping table: resolves a regime name to its conversion
-- vector and stores it under the normalized name.
--
-- mapping : the table that receives the vector
-- regime  : the requested name; it is converted to a lowercase string and
--           resolved via synonyms (a bare number like "1250" is also tried
--           as "windows-1250")
--
-- Returns a table that maps each eight bit character onto its utf string, or
-- false when no regi-<name>.lua file can be found.
local function loadregime(mapping,regime)
    regime = lower(tostring(regime))
    regime = synonyms[regime] or synonyms["windows-"..regime] or regime
    -- The request can be an alias (or a differently cased name) of an entry
    -- that has been resolved before. Reuse that entry instead of locating and
    -- running the file again; rawget avoids re-entering this loader.
    local vector = rawget(mapping,regime)
    if vector ~= nil then
        return vector
    end
    local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
    local data = name ~= "" and dofile(name)
    if data then
        vector = { }
        for eightbit, unicode in next, data do
            vector[char(eightbit)] = utfchar(unicode)
        end
        report_loading("vector %a is loaded",regime)
    else
        -- false (and not nil) so that the failed lookup is remembered
        vector = false
        report_loading("vector %a is unknown",regime)
    end
    mapping[regime] = vector
    return vector
end
121
-- Loader for the backmapping table: builds the reversed vector (utf string ->
-- eight bit character) for regime k and stores it in backmapping. When the
-- regime has no vector the stored table is empty.
local function loadreverse(t,k)
    local reverse = { }
    local forward = mapping[k]
    if forward then
        for eightbit, utf in next, forward do
            reverse[utf] = eightbit
        end
    end
    backmapping[k] = reverse
    return reverse
end
133
-- Hook the loaders into both tables. Presumably setmetatableindex installs the
-- function as index handler, so that a missing regime is loaded on first
-- access (the loaders store their result in the table themselves).
setmetatableindex(mapping,    loadregime)
setmetatableindex(backmapping,loadreverse)

-- Both tables are accessible from outside via the regimes namespace.
regimes.mapping     = mapping
regimes.backmapping = backmapping
139
-- Converts a line from the given regime (or the current one when regime is
-- not given) to utf. The line is returned as-is when it is empty or missing,
-- or when there is no vector for the regime.
local function fromregime(regime,line)
    if not line or #line == 0 then
        return line
    end
    local vector = mapping[regime or currentregime]
    if not vector then
        return line
    end
    -- characters that are not in the vector are kept; the parentheses drop
    -- the substitution count that gsub returns as second value
    return (gsub(line,".",vector))
end
150
-- Per vector cache; each entry gets created on first access and holds the
-- remappers that toregime builds.
local cache = { } -- if really needed we can copy vectors and hash defaults

setmetatableindex(cache, function(entries,vector)
    local entry = { remappers = { } }
    entries[vector] = entry
    return entry
end)
158
-- Converts an utf string to the given regime, for instance:
--
--   toregime('8859-1',"abcde Ä","?")
--
-- vector  : name of the target regime
-- str     : the utf string; nil and "" result in ""
-- default : replacement for anything that cannot be mapped (default "?")
--
-- The converter is built once per regime and default and then kept in cache.
local function toregime(vector,str,default)
    local fallback  = default or "?"
    local remappers = cache[vector].remappers
    local remapper  = remappers[fallback]
    if remapper then
        return remapper(str)
    end
    local reverse = fastcopy(backmapping[vector])
    -- utf.remapper(reverse) is not good for defaults here
    local known   = lpeg.utfchartabletopattern(reverse) / reverse
    local unknown = lpeg.patterns.utf8character / fallback
    local invalid = P(1) / fallback
    local pattern = Cs((known + unknown + invalid)^0)
    remapper = function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        end
        return ""
    end
    remappers[fallback] = remapper
    return remapper(str)
end
178
-- Switches back to utf and disables the line action that does the
-- conversion. Returns the name of the regime that is active afterwards,
-- which is always "utf".
local function disable()
    currentregime = "utf"
    sequencers.disableaction(textlineactions,"regimes.process")
    return "utf"
end
184
-- Makes the given regime (or the one it is a synonym for) the current one and
-- enables the line action that does the conversion. A regime whose mapping
-- entry is false (utf, or an unknown one) disables conversion instead.
-- Returns the name of the regime that is active afterwards.
local function enable(regime)
    local name = synonyms[regime] or regime
    if mapping[name] ~= false then
        currentregime = name
        sequencers.enableaction(textlineactions,"regimes.process")
        return currentregime
    end
    return disable()
end
195
-- public interface

regimes.toregime   = toregime
regimes.fromregime = fromregime
regimes.enable     = enable
regimes.disable    = disable

-- Same as fromregime but with the arguments swapped.
function regimes.translate(str,regime)
    return fromregime(regime,str)
end
201
202-- The following function can be used when we want to make sure that utf gets passed
203-- unharmed. This is needed for modules.
204
-- nesting depth of push calls; lines are only converted at depth zero
local level = 0

-- Line action: converts str from the current regime to utf, unless we are
-- inside a push/pop pair or the line is flagged as "utf-8" by its coding.
-- The filename, currentline and noflines arguments are not used here.
function regimes.process(str,filename,currentline,noflines,coding)
    if level ~= 0 or coding == "utf-8" then
        return str
    end
    local result = fromregime(currentregime,str)
    if trace_translating then
        report_translating("utf: %s",result)
    end
    return result
end
216
-- Enters a nested level; regimes.process leaves lines untouched as long as
-- the level is not zero.
local function push()
    local depth = level + 1
    level = depth
    if trace_translating then
        report_translating("pushing level %s",depth)
    end
end
223
-- Leaves a nested level. A pop without a matching push is ignored, so the
-- level never drops below zero.
local function pop()
    if level <= 0 then
        return
    end
    if trace_translating then
        -- the level that we leave is reported
        report_translating("popping level %s",level)
    end
    level = level - 1
end
232
233regimes.push = push
234regimes.pop  = pop
235
-- Returns a sorted list with the names of the regimes that have a vector
-- file (regi-<name>.lua) in the directory where regi-ini.lua is found. The
-- list is empty when that file cannot be located.
function regimes.list()
    local okay = { }
    local name = resolvers.findfile("regi-ini.lua") or ""
    -- an empty string is true in Lua, so it has to be tested explicitly
    if name ~= "" then
        local list = dir.glob(file.join(file.dirname(name),"regi-*.lua"))
        for i=1,#list do
            -- The glob result can include the path, so we only look at the
            -- last path segment; the "ini" entry is this module and not a
            -- vector.
            local regime = match(list[i],"regi%-([^/\\]-)%.lua$")
            if regime and regime ~= "ini" then
                okay[#okay+1] = regime
            end
        end
        -- sorting once, when the list is complete, is enough
        table.sort(okay)
    end
    return okay
end
251
-- Register regimes.process as a text line action (in the "system" group) and
-- keep it disabled until a regime gets enabled: enable() turns the action on
-- and disable() turns it off again.
sequencers.prependaction(textlineactions,"system","regimes.process")
sequencers.disableaction(textlineactions,"regimes.process")
254
-- Next we provide some hacks. Unfortunately we run into crappy encoded (read:
-- mixed) xml files that have sequences like Ã« Ã¤ Ã¶ Ã¼ instead of ë ä ö ü
-- etc.
258
-- cleanup patterns per (normalized) regime name; false when the regime has
-- no vector
local patterns = { }

-- Repairs utf characters in str that got encoded twice: each byte of their
-- utf sequence ended up as a character of its own. Everything else in the
-- string is kept.
--
-- regime : a regime name or synonym; the current regime is used when omitted
-- str    : the string to repair; nil and "" are returned as-is
--
-- Returns the repaired string, or str itself when the regime has no vector
-- (which is the case for utf).
function regimes.cleanup(regime,str)
    if not str or str == "" then
        return str
    end
    -- Normalize first, so that the cache is consulted with the same key that
    -- is used for storing; otherwise a synonym or an omitted regime results
    -- in a new pattern for each call.
    regime = regime and synonyms[regime] or regime or currentregime
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
        if vector then
            local remap = { }
            for _, v in next, vector do
                -- the utf sequence as it looks when each of its bytes is
                -- taken for a character
                local split = totable(v)
                for i=1,#split do
                    split[i] = utfchar(byte(split[i]))
                end
                split = concat(split)
                if v ~= split then
                    remap[split] = v
                end
            end
            p = Cs((lpeg.utfchartabletopattern(remap)/remap+P(1))^0)
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end
289
-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
291-- local new = regimes.cleanup("cp1252",old)
292-- report_translating("%s -> %s",old,new)
293-- local old = "Pozn" .. char(0xE1) .. "mky"
294-- local new = fromregime("cp1250",old)
295-- report_translating("%s -> %s",old,new)
296
297-- interface (might move to regi-tex.lua)
298
-- The commands below are only defined when the interfaces namespace is
-- available. Whenever the regime changes the currentregime macro is set to
-- the name of the regime that is active afterwards.

if interfaces then

    local implement = interfaces.implement
    local setmacro  = interfaces.setmacro

    -- the regimes that were current when startregime was issued
    local stack = { }

    local function switchto(regime)
        setmacro("currentregime",enable(regime))
    end

    local function switchoff()
        setmacro("currentregime",disable())
    end

    -- remember the current regime and switch to the given one
    local function start(regime)
        insert(stack,currentregime)
        if trace_translating then
            report_translating("start using %a",regime)
        end
        switchto(regime)
    end

    -- go back to the regime that was remembered by the matching start; a
    -- stop without a start is ignored
    local function stop()
        if #stack == 0 then
            return
        end
        local regime = remove(stack)
        if trace_translating then
            report_translating("stop using %a",regime)
        end
        switchto(regime)
    end

    implement {
        name      = "enableregime",
        public    = true,
        protected = true,
        arguments = "optional",
        actions   = switchto
    }

    implement {
        name      = "disableregime",
        public    = true,
        protected = true,
        actions   = switchoff
    }

    implement {
        name      = "pushregime",
        public    = true,
        protected = true,
        actions   = push
    }

    implement {
        name      = "popregime",
        public    = true,
        protected = true,
        actions   = pop
    }

    implement {
        name      = "startregime",
        public    = true,
        protected = true,
        arguments = "optional",
        actions   = start
    }

    implement {
        name      = "stopregime",
        public    = true,
        protected = true,
        actions   = stop
    }

end
365