regi-ini.lua /size: 13 Kb    last modification: 2020-07-01 14:35
-- Registration record for this file in the global modules table; the table
-- is created here when nothing has set it up yet.
if not modules then
    modules = { }
end

modules['regi-ini'] = {
    version   = 1.001,
    comment   = "companion to regi-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9--[[ldx--
10<p>Regimes take care of converting the input characters into
11<l n='utf'/> sequences. The conversion tables are loaded at
12runtime.</p>
13--ldx]]--
14
15-- Todo: use regi-imp*.lua instead
16
17local commands, context = commands, context
18
19
20local tostring = tostring
21local utfchar = utf.char
22local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match
23local char, gsub, format, gmatch, byte, match, lower = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match, string.lower
24local next = next
25local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy
26local concat = table.concat
27local totable = string.totable
28
29local allocate          = utilities.storage.allocate
30local sequencers        = utilities.sequencers
31local textlineactions   = resolvers.openers.helpers.textlineactions
32local setmetatableindex = table.setmetatableindex
33
34--[[ldx--
35<p>We will hook regime handling code into the input methods.</p>
36--ldx]]--
37
-- Tracing flag; it follows whatever value the "regimes.translating" tracker
-- passes to the callback.
local trace_translating = false

trackers.register("regimes.translating", function(enabled)
    trace_translating = enabled
end)

local report_loading     = logs.reporter("regimes","loading")
local report_translating = logs.reporter("regimes","translating")

-- An already existing global regimes table is reused.
regimes = regimes or { }

local regimes = regimes

-- Regime name to conversion vector; "utf" is preset to false (no vector).
local mapping = allocate({ utf = false })

-- Regime name to reverse vector; starts out empty.
local backmapping = allocate({ })
52
53-- regimes.mapping  = mapping
54
-- Backward compatibility list: alternative regime names and the name that
-- is used instead. The irregular entries are spelled out, the regular
-- families are generated below.
local synonyms = {
    ["utf-8"]   = "utf",
    ["utf8"]    = "utf",
    [""]        = "utf",
    ["windows"] = "cp1252",
    ["pdf"]     = "pdfdoc",
    ["437"]     = "ibm",
}

do
    -- windows-1250 .. windows-1258 become cp1250 .. cp1258
    for codepage=1250,1258 do
        synonyms["windows-" .. codepage] = "cp" .. codepage
    end

    -- il<n>, latin<n> and iso-8859-<part> all become 8859-<part>, where the
    -- n-th entry of this list is the part that goes with il<n> and latin<n>
    local parts = { 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 }

    for index=1,#parts do
        local part   = parts[index]
        local target = "8859-" .. part
        synonyms["il"        .. index] = target
        synonyms["latin"     .. index] = target
        synonyms["iso-8859-" .. part]  = target
    end
end

-- Name of the regime that is applied when no explicit one is given.
local currentregime = "utf"
112
-- Load the conversion vector of a regime and store it in the given table.
--
-- mapping : table that receives the result (this function is installed as
--           the index handler of the file-level mapping table)
-- regime  : regime name; it is passed through tostring and lowercased, then
--           resolved via the synonyms list (also tried with a "windows-"
--           prefix)
--
-- Returns a table that maps each 8-bit character to its utf string, or
-- false when no regi-<regime>.lua file is found or the file gives no data.
--
-- The result is stored under the resolved name, not under the name that was
-- asked for, so an alias is not cached under the alias itself.
local function loadregime(mapping,regime)
    regime = lower(tostring(regime))
    regime = synonyms[regime] or synonyms["windows-"..regime] or regime
    local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
    local data = name ~= "" and dofile(name)
    -- this has to be a local: without the declaration every load wrote to
    -- (and left behind) a global named "vector"
    local vector = false
    if data then
        vector = { }
        for eightbit, unicode in next, data do
            vector[char(eightbit)] = utfchar(unicode)
        end
        report_loading("vector %a is loaded",regime)
    else
        report_loading("vector %a is unknown",regime)
    end
    mapping[regime] = vector
    return vector
end
131
-- Build the reverse table for regime k: every value of mapping[k] becomes a
-- key that points back to its original key. When mapping[k] is false or nil
-- the result is an empty table.
--
-- The first argument is not used; the result is always stored in the
-- file-level backmapping table under k, and returned.
local function loadreverse(t,k)
    local reverse = { }
    local forward = mapping[k]
    if forward then
        for eightbit, utf in next, forward do
            reverse[utf] = eightbit
        end
    end
    backmapping[k] = reverse
    return reverse
end
143
-- Attach the two loaders to their tables (setmetatableindex presumably
-- installs them as __index handlers, so that missing entries are built on
-- first access) and make both tables available via the regimes namespace.

setmetatableindex(mapping,loadregime)
regimes.mapping = mapping

setmetatableindex(backmapping,loadreverse)
regimes.backmapping = backmapping
149
-- Convert a line from a regime to utf.
--
-- regime : regime name; the current regime is used when it is nil or false
-- line   : string to convert
--
-- Returns the line with every character that has an entry in the regime
-- vector replaced by that entry. A nil or empty line, or a regime without a
-- vector, gives the line back unchanged. No synonym lookup is done here,
-- the name goes straight to the mapping table.
local function fromregime(regime,line)
    if not line or #line == 0 then
        return line
    end
    local vector = mapping[regime or currentregime]
    if not vector then
        return line
    end
    -- only the string is wanted, not the substitution count
    local converted = gsub(line,".",vector)
    return converted
end
160
161-- local remappers = { }
162--
163-- local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
164--     local t = backmapping[vector]
165--     local remapper = remappers[vector]
166--     if not remapper then
167--         remapper = utf.remapper(t)
168--         remappers[t] = remapper
169--     end
170--     local m = getmetatable(t)
171--     setmetatableindex(t, function(t,k)
172--         local v = default or "?"
173--         t[k] = v
174--         return v
175--     end)
176--     str = remapper(str)
177--     setmetatable(t,m)
178--     return str
179-- end
180--
181-- -- much faster (but only matters when we have > 10K calls
182
-- Cache used by toregime: one entry per vector name, each holding a table of
-- remapper functions. Entries are created by the index handler below.
-- If really needed we can copy vectors and hash defaults.
local cache = { }

setmetatableindex(cache, function(entries,vector)
    local entry = { }
    entry.remappers = { }
    entries[vector] = entry
    return entry
end)
190
-- Convert a string with the reverse table of a regime, for instance
-- toregime('8859-1',"abcde Ä","?").
--
-- vector  : regime name, used as key for backmapping and for the cache
-- str     : string to convert; nil and "" both give ""
-- default : replacement for whatever the reverse table pattern does not
--           match ("?" when omitted)
--
-- One remapper function is built per vector and default combination and
-- kept in the cache for later calls.
local function toregime(vector,str,default)
    local fallback  = default or "?"
    local remappers = cache[vector].remappers
    local remapper  = remappers[fallback]
    if not remapper then
        local reverse = fastcopy(backmapping[vector])
     -- utf.remapper(reverse) is not good for defaults here
        local known   = lpeg.utfchartabletopattern(reverse) / reverse
        local unknown = lpeg.patterns.utf8character / fallback
        local other   = P(1) / fallback
        local pattern = Cs((known + unknown + other)^0)
        remapper = function(s)
            if s and s ~= "" then
                return lpegmatch(pattern,s)
            end
            return ""
        end
        remappers[fallback] = remapper
    end
    return remapper(str)
end
210
-- Go back to utf: reset the current regime and switch off the
-- "regimes.process" text line action. Returns the current regime name.
local function disable()
    local fallback = "utf"
    currentregime = fallback
    sequencers.disableaction(textlineactions,"regimes.process")
    return currentregime
end
216
-- Make a regime the current one and switch on the "regimes.process" text
-- line action. The name is resolved via the synonyms list first. When the
-- mapping lookup gives false for that name, regime handling is disabled
-- instead. Returns the name of the regime that is current afterwards.
local function enable(regime)
    local resolved = synonyms[regime] or regime
    if mapping[resolved] ~= false then
        currentregime = resolved
        sequencers.enableaction(textlineactions,"regimes.process")
        return currentregime
    end
    return disable()
end
227
-- Public entry points.

regimes.toregime   = toregime
regimes.fromregime = fromregime
regimes.enable     = enable
regimes.disable    = disable

-- Same as fromregime, but with the string first and the regime second.
function regimes.translate(str,regime)
    return fromregime(regime,str)
end
233
234-- The following function can be used when we want to make sure that
235-- utf gets passed unharmed. This is needed for modules.
236
-- Nesting counter; conversion only takes place while it is zero.
local level = 0

-- Text line action: convert a line from the current regime to utf.
--
-- str    : the line
-- coding : when this is "utf-8" the line is passed on as is
--
-- The filename, currentline and noflines arguments are not used here.
-- Returns the (possibly converted) line.
function regimes.process(str,filename,currentline,noflines,coding)
    if level ~= 0 or coding == "utf-8" then
        return str
    end
    local converted = fromregime(currentregime,str)
    if trace_translating then
        report_translating("utf: %s",converted)
    end
    return converted
end
248
-- Go one nesting level deeper; regimes.process leaves lines untouched as
-- long as the level is not zero.
local function push()
    local depth = level + 1
    level = depth
    if trace_translating then
        report_translating("pushing level %s",depth)
    end
end
255
-- Leave one nesting level; nothing happens when the level is already zero.
local function pop()
    if level <= 0 then
        return
    end
    if trace_translating then
        -- the level that is being left gets reported
        report_translating("popping level %s",level)
    end
    level = level - 1
end

regimes.push = push
regimes.pop  = pop
267
-- Return a sorted list of regime names, taken from the regi-<name>.lua files
-- that sit in the same directory as regi-ini.lua. The list is empty when
-- regi-ini.lua cannot be located.
function regimes.list()
    local okay = { }
    local name = resolvers.findfile("regi-ini.lua") or ""
    -- an unresolved file shows up as "" (which is not false), so test for that
    if name ~= "" then
        local list = dir.glob(file.join(file.dirname(name),"regi-*.lua"))
        for i=1,#list do
            local regime = match(list[i],"regi%-(.-)%.lua")
            -- The glob results presumably carry the directory, so this file
            -- is recognized by its captured name and not by comparing the
            -- whole entry with "regi-ini.lua".
            if regime and regime ~= "ini" then
                okay[#okay+1] = regime
            end
        end
        -- sorting once, after collecting, is enough
        table.sort(okay)
    end
    return okay
end
283
if sequencers then
    -- Put "regimes.process" in front of the "system" group of the text line
    -- actions and switch it off right away; enable() switches it on again.
    sequencers.prependaction(textlineactions,"system","regimes.process")
    sequencers.disableaction(textlineactions,"regimes.process")
end
290
-- Next we provide some hacks. Unfortunately we run into badly encoded
-- (read: mixed encoding) xml files that have sequences like Ã« Ã¤ Ã¶ Ã¼
-- instead of ë ä ö ü.
294
295local patterns = { }
296
297-- function regimes.cleanup(regime,str)
298--     local p = patterns[regime]
299--     if p == nil then
300--         regime = regime and synonyms[regime] or regime or currentregime
301--         local vector = regime ~= "utf" and mapping[regime]
302--         if vector then
303--             local list = { }
304--             for k, uchar in next, vector do
305--                 local stream = totable(uchar)
306--                 for i=1,#stream do
307--                     stream[i] = vector[stream[i]]
308--                 end
309--                 list[concat(stream)] = uchar
310--             end
311--             p = lpeg.append(list,nil,true)
312--             p = Cs((p+1)^0)
313--          -- lpeg.print(p) -- size 1604
314--         else
315--             p = false
316--         end
317--         patterns[vector] = p
318--     end
319--     return p and lpegmatch(p,str) or str
320-- end
321--
322-- twice as fast and much less lpeg bytecode
323
324-- function regimes.cleanup(regime,str)
325--     if not str or str == "" then
326--         return str
327--     end
328--     local p = patterns[regime]
329--     if p == nil then
330--         regime = regime and synonyms[regime] or regime or currentregime
331--         local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
332--         if vector then
333--             local utfchars = { }
334--             local firsts = { }
335--             for k, uchar in next, vector do
336--                 local stream = { }
337--                 local split = totable(uchar)
338--                 local nofsplits = #split
339--                 if nofsplits > 1 then
340--                     local first
341--                     for i=1,nofsplits do
342--                         local u = vector[split[i]]
343--                         if not first then
344--                             first = firsts[u]
345--                             if not first then
346--                                 first = { }
347--                                 firsts[u] = first
348--                             end
349--                         end
350--                         stream[i] = u
351--                     end
352--                     local nofstream = #stream
353--                     if nofstream > 1 then
354--                         first[#first+1] = concat(stream,2,nofstream)
355--                         utfchars[concat(stream)] = uchar
356--                     end
357--                 end
358--             end
359--             p = P(false)
360--             for k, v in next, firsts do
361--                 local q = P(false)
362--                 for i=1,#v do
363--                     q = q + P(v[i])
364--                 end
365--                 p = p + P(k) * q
366--             end
367--             p = Cs(((p+1)/utfchars)^1)
368--          -- lpeg.print(p) -- size: 1042
369--         else
370--             p = false
371--         end
372--         patterns[regime] = p
373--     end
374--     return p and lpegmatch(p,str) or str
375-- end
376--
377-- 5 times faster:
378
-- Repair doubly encoded characters in a string.
--
-- regime : regime name (synonyms are resolved); the current regime is used
--          when it is nil or false
-- str    : string to clean up; nil and "" are returned as they are
--
-- For every utf string v in the regime vector, each piece that totable
-- returns for v (presumably its bytes) is turned into an utf character of
-- its own. Where the result differs from v, that longer sequence is mapped
-- back to v. Returns the string with those sequences replaced, or the
-- string itself for "utf" and for regimes without a vector.
--
-- The compiled pattern (or false) is cached per resolved regime name.
function regimes.cleanup(regime,str)
    if not str or str == "" then
        return str
    end
    -- Resolve the name before consulting the cache: the cache is filled
    -- under the resolved name, so looking it up under the name as given made
    -- aliases and a nil regime rebuild the pattern on every call.
    regime = regime and synonyms[regime] or regime or currentregime
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
        if vector then
            local repairs = { }
            for _, v in next, vector do
                local split = totable(v)
                for i=1,#split do
                    split[i] = utfchar(byte(split[i]))
                end
                local doubled = concat(split)
                if v ~= doubled then
                    repairs[doubled] = v
                end
            end
            p = Cs((lpeg.utfchartabletopattern(repairs)/repairs+P(1))^0)
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end
407
-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
409-- local new = regimes.cleanup("cp1252",old)
410-- report_translating("%s -> %s",old,new)
411-- local old = "Pozn" .. char(0xE1) .. "mky"
412-- local new = fromregime("cp1250",old)
413-- report_translating("%s -> %s",old,new)
414
415-- interface (might move to regi-tex.lua)
416
if interfaces then

    local implement = interfaces.implement
    local setmacro  = interfaces.setmacro

    -- Store a regime name in the "currentregime" macro.
    local function setcurrent(name)
        setmacro("currentregime",name)
    end

    implement {
        name      = "enableregime",
        arguments = "string",
        actions   = function(regime)
            setcurrent(enable(regime))
        end,
    }

    implement {
        name    = "disableregime",
        actions = function()
            setcurrent(disable())
        end,
    }

    implement { name = "pushregime", actions = push }
    implement { name = "popregime",  actions = pop  }

    -- Regimes that were current when a startregime was issued, most recent
    -- one last.
    local stack = { }

    -- Remember the current regime and switch to the given one.
    local function startregime(regime)
        insert(stack,currentregime)
        if trace_translating then
            report_translating("start using %a",regime)
        end
        setcurrent(enable(regime))
    end

    -- Switch back to the most recently remembered regime; nothing happens
    -- when there is none.
    local function stopregime()
        if #stack == 0 then
            return
        end
        local regime = remove(stack)
        if trace_translating then
            report_translating("stop using %a",regime)
        end
        setcurrent(enable(regime))
    end

    implement {
        name      = "startregime",
        arguments = "string",
        actions   = startregime,
    }

    implement {
        name    = "stopregime",
        actions = stopregime,
    }

end
471
472-- Actually we can have a function returned from the lookup but we don't
473-- really use this code so I'm in no hurry.
474
475-- if os.getcodepage then
476--     local cod, acp, map
477--     function os.tocodepage(name)
478--         if map == nil then
479--             cod, acp = os.getcodepage()
480--             map = cod and cod ~= 65001 and regimes.toregime
481--         end
482--         return map and map(cod,name) or name
483--     end
484-- else
485--     function os.tocodepage(name)
486--         return name
487--     end
488-- end
489