regi-ini.lua /size: 13 Kb    last modification: 2023-12-21 09:44
-- Register this file in the global module list; the list is created when it
-- does not exist yet.

if not modules then
    modules = { }
end

modules['regi-ini'] = {
    version   = 1.001,
    comment   = "companion to regi-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9-- Regimes take care of converting the input characters into UTF sequences. The
10-- conversion tables are loaded at runtime.
11
12-- Todo: use regi-imp*.lua instead
13
14local commands, context = commands, context
15
16local tostring = tostring
17local utfchar = utf.char
18local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match
19local char, gsub, format, gmatch, byte, match, lower = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match, string.lower
20local next = next
21local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy
22local concat = table.concat
23local totable = string.totable
24
25local allocate          = utilities.storage.allocate
26local sequencers        = utilities.sequencers
27local textlineactions   = resolvers.openers.helpers.textlineactions
28local setmetatableindex = table.setmetatableindex
29
30-- We will hook regime handling code into the input methods.
31
-- Tracing is off by default and follows the "regimes.translating" tracker.

local trace_translating = false

trackers.register("regimes.translating", function(v)
    trace_translating = v
end)

-- Two reporters in the "regimes" category: one for loading, one for translating.

local report_loading     = logs.reporter("regimes","loading")
local report_translating = logs.reporter("regimes","translating")

-- The global namespace; an already existing table is reused.

regimes = regimes or { }

local regimes = regimes

-- Vectors per regime name; utf is preset to false.

local mapping = allocate { utf = false }

-- Reverse vectors per regime name; starts out empty.

local backmapping = allocate { }
46
47-- regimes.mapping  = mapping
48
-- Backward compatibility list: each key is an alternative regime name and each
-- value is the regime name that it stands for. The irregular entries are given
-- literally, the regular families are generated below.

local synonyms = {
    ["utf-8"]   = "utf",
    ["utf8"]    = "utf",
    [""]        = "utf",
    ["windows"] = "cp1252",
    ["pdf"]     = "pdfdoc",
    ["437"]     = "ibm",
}

do

    -- windows-1250 .. windows-1258 become cp1250 .. cp1258

    for codepage=1250,1258 do
        synonyms["windows-" .. codepage] = "cp" .. codepage
    end

    -- il<n> and latin<n> (n = 1 .. 10) stand for the n-th iso 8859 part in this
    -- list; iso-8859-<part> stands for 8859-<part>

    local isoparts = { 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 }

    for n=1,#isoparts do
        local target = "8859-" .. isoparts[n]
        synonyms["il"    .. n]      = target
        synonyms["latin" .. n]      = target
        synonyms["iso-"  .. target] = target
    end

end

-- The regime that is used when a caller does not pass one.

local currentregime = "utf"
106
-- Index handler of the mapping table: resolves a regime name and loads its
-- conversion vector.
--
-- The requested name is lowercased and resolved via the synonyms list (a bare
-- number like "1252" is also tried as "windows-1252"). The vector is built from
-- the table returned by regi-<regime>.lua: every eight bit code becomes a one
-- byte string key with the utf string of the given code point as value. When no
-- such file is found, false is stored so that the lookup is not repeated.
--
-- The result is stored under the resolved name and also under the name that
-- was asked for, so aliases and differently cased names do not load and
-- convert the file again on every access.
--
-- mapping : the table that caches the vectors
-- regime  : the requested regime name (passed through tostring)
--
-- Returns the vector, or false when the regime is unknown.

local function loadregime(mapping,regime)
    local requested = regime
    regime = lower(tostring(regime))
    regime = synonyms[regime] or synonyms["windows-"..regime] or regime
    -- rawget: we only want to know if the resolved regime was handled before
    local vector = rawget(mapping,regime)
    if vector == nil then
        local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
        local data = name ~= "" and dofile(name)
        if data then
            vector = { }
            for eightbit, unicode in next, data do
                vector[char(eightbit)] = utfchar(unicode)
            end
            report_loading("vector %a is loaded",regime)
        else
            vector = false
            report_loading("vector %a is unknown",regime)
        end
        mapping[regime] = vector
    end
    if requested ~= nil and requested ~= regime then
        mapping[requested] = vector
    end
    return vector
end
125
-- Index handler of the backmapping table: builds the reverse of the vector
-- that mapping holds for regime k (values become keys and keys become values).
-- When mapping has no vector for k the reverse table stays empty. The result
-- is stored in backmapping[k] and returned.

local function loadreverse(t,k)
    local reverse = { }
    local vector  = mapping[k]
    if vector then
        for eightbit, utf in next, vector do
            reverse[utf] = eightbit
        end
    end
    backmapping[k] = reverse
    return reverse
end
137
-- Both tables are exported and get their loader installed as index handler.

regimes.mapping     = mapping
regimes.backmapping = backmapping

setmetatableindex(mapping,    loadregime)
setmetatableindex(backmapping,loadreverse)
143
-- Converts a line from the given regime (or the current one when regime is not
-- given) by replacing every character that has an entry in the vector. A nil
-- or empty line, or a regime without vector, leaves the line untouched.

local function fromregime(regime,line)
    if not line or #line == 0 then
        return line
    end
    local map = mapping[regime or currentregime]
    if not map then
        return line
    end
    -- only the string is returned, not the replacement count
    local converted = gsub(line,".",map)
    return converted
end
154
155-- local remappers = { }
156--
157-- local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
158--     local t = backmapping[vector]
159--     local remapper = remappers[vector]
160--     if not remapper then
161--         remapper = utf.remapper(t)
162--         remappers[t] = remapper
163--     end
164--     local m = getmetatable(t)
165--     setmetatableindex(t, function(t,k)
166--         local v = default or "?"
167--         t[k] = v
168--         return v
169--     end)
170--     str = remapper(str)
171--     setmetatable(t,m)
172--     return str
173-- end
174--
175-- -- much faster (but only matters when we have > 10K calls
176
-- Remapper cache: on first access every key gets its own entry with an empty
-- remappers table.

local cache = { } -- if really needed we can copy vectors and hash defaults

local function newcacheentry(t,k)
    local entry = { remappers = { } }
    t[k] = entry
    return entry
end

setmetatableindex(cache,newcacheentry)
184
-- Converts an utf string to the given regime, for instance:
--
--   toregime('8859-1',"abcde Ä","?")
--
-- Characters that are found in the reverse vector are replaced by their entry,
-- anything else becomes the default (or "?" when none is given). A nil or
-- empty string gives an empty string. The remapper is built once per vector
-- and default and kept in the cache.

local function toregime(vector,str,default)
    local fallback  = default or "?"
    local remappers = cache[vector].remappers
    local remapper  = remappers[fallback]
    if remapper then
        return remapper(str)
    end
    -- utf.remapper is not good for defaults here so we build our own pattern
    local reverse = fastcopy(backmapping[vector])
    local known   = lpeg.utfchartabletopattern(reverse)/reverse
    local unknown = lpeg.patterns.utf8character/fallback
    local pattern = Cs((known + unknown + P(1)/fallback)^0)
    remapper = function(s)
        if s and s ~= "" then
            return lpegmatch(pattern,s)
        end
        return ""
    end
    remappers[fallback] = remapper
    return remapper(str)
end
204
-- Falls back to utf: resets the current regime, disables the line processor
-- and returns the now current regime name.

local function disable()
    currentregime = "utf"
    sequencers.disableaction(textlineactions,"regimes.process")
    return "utf"
end
210
-- Makes the given regime (after resolving it via the synonyms list) the
-- current one and enables the line processor. A regime whose vector is false
-- disables regime handling instead. Returns the resulting current regime.

local function enable(regime)
    local resolved = synonyms[regime] or regime
    if mapping[resolved] == false then
        return disable()
    end
    currentregime = resolved
    sequencers.enableaction(textlineactions,"regimes.process")
    return resolved
end
221
-- The public interface; translate is fromregime with swapped arguments.

regimes.enable     = enable
regimes.disable    = disable
regimes.toregime   = toregime
regimes.fromregime = fromregime

function regimes.translate(str,regime)
    return fromregime(regime,str)
end
227
228-- The following function can be used when we want to make sure that
229-- utf gets passed unharmed. This is needed for modules.
230
-- Conversion only happens at level zero; push and pop change the level.

local level = 0

-- The line action: converts str from the current regime unless we are at a
-- nonzero level or the coding is reported as "utf-8". The other arguments are
-- not used here.

function regimes.process(str,filename,currentline,noflines,coding)
    if level ~= 0 or coding == "utf-8" then
        return str
    end
    local converted = fromregime(currentregime,str)
    if trace_translating then
        report_translating("utf: %s",converted)
    end
    return converted
end
242
-- Goes one level deeper; the new level is reported when tracing.

local function push()
    local deeper = level + 1
    level = deeper
    if trace_translating then
        report_translating("pushing level %s",deeper)
    end
end
249
-- Goes one level back, but never below zero; the level that we leave is
-- reported when tracing.

local function pop()
    if level <= 0 then
        return
    end
    if trace_translating then
        report_translating("popping level %s",level)
    end
    level = level - 1
end
258
-- Export the level handlers.

regimes.pop  = pop
regimes.push = push
261
-- Returns a sorted list with the names of the regimes for which a
-- regi-<name>.lua file sits next to regi-ini.lua. The list is empty when
-- regi-ini.lua itself cannot be located.

function regimes.list()
    local okay = { }
    local name = resolvers.findfile("regi-ini.lua") or ""
    -- an empty string is true in Lua so we need an explicit check
    if name ~= "" then
        local list = dir.glob(file.join(file.dirname(name),"regi-*.lua"))
        for i=1,#list do
            -- The glob results can carry a path, so we only look at the last
            -- path segment: the capture cannot contain a path separator and
            -- the match has to end at the end of the name.
            local regime = match(list[i],"regi%-([^/\\]-)%.lua$")
            -- regi-ini.lua is this file and not a regime
            if regime and regime ~= "ini" then
                okay[#okay+1] = regime
            end
        end
        -- sorting once, after collecting, is enough
        table.sort(okay)
    end
    return okay
end
277
-- When sequencers are available, regimes.process is registered as a text line
-- action in the "system" group and immediately disabled; enable() turns it on.

if sequencers then

    local action = "regimes.process"

    sequencers.prependaction(textlineactions,"system",action)
    sequencers.disableaction(textlineactions,action)

end
284
-- Next we provide some hacks. Unfortunately we run into badly encoded (read:
-- mixed encoding) xml files that have sequences like Ã« Ã¤ Ã¶ Ã¼ instead of
-- ë ä ö ü.
288
289local patterns = { }
290
291-- function regimes.cleanup(regime,str)
292--     local p = patterns[regime]
293--     if p == nil then
294--         regime = regime and synonyms[regime] or regime or currentregime
295--         local vector = regime ~= "utf" and mapping[regime]
296--         if vector then
297--             local list = { }
298--             for k, uchar in next, vector do
299--                 local stream = totable(uchar)
300--                 for i=1,#stream do
301--                     stream[i] = vector[stream[i]]
302--                 end
303--                 list[concat(stream)] = uchar
304--             end
305--             p = lpeg.append(list,nil,true)
306--             p = Cs((p+1)^0)
307--          -- lpeg.print(p) -- size 1604
308--         else
309--             p = false
310--         end
311--         patterns[vector] = p
312--     end
313--     return p and lpegmatch(p,str) or str
314-- end
315--
316-- twice as fast and much less lpeg bytecode
317
318-- function regimes.cleanup(regime,str)
319--     if not str or str == "" then
320--         return str
321--     end
322--     local p = patterns[regime]
323--     if p == nil then
324--         regime = regime and synonyms[regime] or regime or currentregime
325--         local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
326--         if vector then
327--             local utfchars = { }
328--             local firsts = { }
329--             for k, uchar in next, vector do
330--                 local stream = { }
331--                 local split = totable(uchar)
332--                 local nofsplits = #split
333--                 if nofsplits > 1 then
334--                     local first
335--                     for i=1,nofsplits do
336--                         local u = vector[split[i]]
337--                         if not first then
338--                             first = firsts[u]
339--                             if not first then
340--                                 first = { }
341--                                 firsts[u] = first
342--                             end
343--                         end
344--                         stream[i] = u
345--                     end
346--                     local nofstream = #stream
347--                     if nofstream > 1 then
348--                         first[#first+1] = concat(stream,2,nofstream)
349--                         utfchars[concat(stream)] = uchar
350--                     end
351--                 end
352--             end
353--             p = P(false)
354--             for k, v in next, firsts do
355--                 local q = P(false)
356--                 for i=1,#v do
357--                     q = q + P(v[i])
358--                 end
359--                 p = p + P(k) * q
360--             end
361--             p = Cs(((p+1)/utfchars)^1)
362--          -- lpeg.print(p) -- size: 1042
363--         else
364--             p = false
365--         end
366--         patterns[regime] = p
367--     end
368--     return p and lpegmatch(p,str) or str
369-- end
370--
371-- 5 times faster:
372
-- Repairs utf characters that were mistakenly encoded to utf twice: sequences
-- in which every byte of a character from the regime's vector was converted to
-- utf on its own are replaced by the intended character.
--
-- regime : regime name; synonyms are resolved and the current regime is used
--          when none is given
-- str    : the string to clean up
--
-- Returns the cleaned up string, or str itself when it is nil or empty or when
-- the regime is utf or has no vector.
--
-- The pattern is built once per regime. The name is resolved before the cache
-- is consulted so that aliases and a missing regime also hit the cache.

function regimes.cleanup(regime,str)
    if not str or str == "" then
        return str
    end
    regime = regime and synonyms[regime] or regime or currentregime
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
        if vector then
            local fixes = { }
            for _, uchar in next, vector do
                -- presumably totable splits the character into its bytes
                local bytes = totable(uchar)
                for i=1,#bytes do
                    bytes[i] = utfchar(byte(bytes[i]))
                end
                local garbled = concat(bytes)
                -- characters that come out unchanged need no fix
                if garbled ~= uchar then
                    fixes[garbled] = uchar
                end
            end
            p = Cs((lpeg.utfchartabletopattern(fixes)/fixes+P(1))^0)
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end
401
-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
403-- local new = regimes.cleanup("cp1252",old)
404-- report_translating("%s -> %s",old,new)
405-- local old = "Pozn" .. char(0xE1) .. "mky"
406-- local new = fromregime("cp1250",old)
407-- report_translating("%s -> %s",old,new)
408
409-- interface (might move to regi-tex.lua)
410
if interfaces then

    local implement = interfaces.implement
    local setmacro  = interfaces.setmacro

    -- The regimes that were current when startregime got called.

    local stack = { }

    -- Every switch reports the resulting regime via the currentregime macro.

    local function setcurrent(regime)
        setmacro("currentregime",regime)
    end

    local function enableregime(regime)
        setcurrent(enable(regime))
    end

    local function disableregime()
        setcurrent(disable())
    end

    -- Remembers the current regime and switches to the given one.

    local function startregime(regime)
        insert(stack,currentregime)
        if trace_translating then
            report_translating("start using %a",regime)
        end
        setcurrent(enable(regime))
    end

    -- Switches back to the most recently remembered regime; nothing happens
    -- when there is none. Note that the trace message shows the regime that
    -- gets restored.

    local function stopregime()
        if #stack == 0 then
            return
        end
        local regime = remove(stack)
        if trace_translating then
            report_translating("stop using %a",regime)
        end
        setcurrent(enable(regime))
    end

    implement { name = "enableregime",  arguments = "string", actions = enableregime  }
    implement { name = "disableregime",                       actions = disableregime }
    implement { name = "pushregime",                          actions = push          }
    implement { name = "popregime",                           actions = pop           }
    implement { name = "startregime",   arguments = "string", actions = startregime   }
    implement { name = "stopregime",                          actions = stopregime    }

end
465
466-- Actually we can have a function returned from the lookup but we don't
467-- really use this code so I'm in no hurry.
468
469-- if os.getcodepage then
470--     local cod, acp, map
471--     function os.tocodepage(name)
472--         if map == nil then
473--             cod, acp = os.getcodepage()
474--             map = cod and cod ~= 65001 and regimes.toregime
475--         end
476--         return map and map(cod,name) or name
477--     end
478-- else
479--     function os.tocodepage(name)
480--         return name
481--     end
482-- end
483