if not modules then modules = { } end modules ['regi-ini'] = { version = 1.001, comment = "companion to regi-ini.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } -- Regimes take care of converting the input characters into UTF sequences. The -- conversion tables are loaded at runtime. local tostring = tostring local utfchar = utf.char local P, Cs, Cc, lpegmatch = lpeg.P, lpeg.Cs, lpeg.Cc, lpeg.match local char, gsub, format, gmatch, byte, match, lower = string.char, string.gsub, string.format, string.gmatch, string.byte, string.match, string.lower local next = next local insert, remove, fastcopy = table.insert, table.remove, table.fastcopy local concat = table.concat local totable = string.totable local allocate = utilities.storage.allocate local sequencers = utilities.sequencers local textlineactions = resolvers.openers.helpers.textlineactions local setmetatableindex = table.setmetatableindex -- We will hook regime handling code into the input methods. 
local trace_translating = false

trackers.register("regimes.translating", function(v) trace_translating = v end)

local report_loading     = logs.reporter("regimes","loading")
local report_translating = logs.reporter("regimes","translating")

regimes       = regimes or { }
local regimes = regimes

-- mapping[regime] is a vector from an eight bit character to its utf sequence and
-- backmapping[regime] is the reverse. Both are filled on demand by the index
-- functions installed below. The utf entry is false: nothing needs to be converted.

local mapping     = allocate { utf = false }
local backmapping = allocate { }

-- regimes.mapping = mapping

local synonyms = { -- backward compatibility list

    ["windows-1250"] = "cp1250",
    ["windows-1251"] = "cp1251",
    ["windows-1252"] = "cp1252",
    ["windows-1253"] = "cp1253",
    ["windows-1254"] = "cp1254",
    ["windows-1255"] = "cp1255",
    ["windows-1256"] = "cp1256",
    ["windows-1257"] = "cp1257",
    ["windows-1258"] = "cp1258",

    ["il1"]          = "8859-1",
    ["il2"]          = "8859-2",
    ["il3"]          = "8859-3",
    ["il4"]          = "8859-4",
    ["il5"]          = "8859-9",
    ["il6"]          = "8859-10",
    ["il7"]          = "8859-13",
    ["il8"]          = "8859-14",
    ["il9"]          = "8859-15",
    ["il10"]         = "8859-16",

    ["iso-8859-1"]   = "8859-1",
    ["iso-8859-2"]   = "8859-2",
    ["iso-8859-3"]   = "8859-3",
    ["iso-8859-4"]   = "8859-4",
    ["iso-8859-9"]   = "8859-9",
    ["iso-8859-10"]  = "8859-10",
    ["iso-8859-13"]  = "8859-13",
    ["iso-8859-14"]  = "8859-14",
    ["iso-8859-15"]  = "8859-15",
    ["iso-8859-16"]  = "8859-16",

    ["latin1"]       = "8859-1",
    ["latin2"]       = "8859-2",
    ["latin3"]       = "8859-3",
    ["latin4"]       = "8859-4",
    ["latin5"]       = "8859-9",
    ["latin6"]       = "8859-10",
    ["latin7"]       = "8859-13",
    ["latin8"]       = "8859-14",
    ["latin9"]       = "8859-15",
    ["latin10"]      = "8859-16",

    ["utf-8"]        = "utf",
    ["utf8"]         = "utf",
    [""]             = "utf",

    ["windows"]      = "cp1252",

    ["pdf"]          = "pdfdoc",

    ["437"]          = "ibm",
}

local currentregime = "utf"

-- Index function of the mapping table. The requested name is lowercased and
-- resolved via the synonyms (a bare number like 1250 is tried as windows-1250).
-- The file regi-<name>.lua is expected to return a table that maps eight bit
-- character codes onto unicode code points. The resulting vector (or false when
-- there is no such file) is stored under the resolved name and returned.

local function loadregime(mapping,regime)
    regime = lower(tostring(regime))
    regime = synonyms[regime] or synonyms["windows-"..regime] or regime
    -- We only get here when the requested key is not yet set, but an alias (or an
    -- uppercased name) can resolve to a vector that we already have (or know to
    -- be missing), so we avoid locating and loading the file again.
    local known = rawget(mapping,regime)
    if known ~= nil then
        return known
    end
    local name = resolvers.findfile(format("regi-%s.lua",regime)) or ""
    local data = name ~= "" and dofile(name)
    local vector = false -- a local: this used to leak into the global namespace
    if data then
        vector = { }
        for eightbit, unicode in next, data do
            vector[char(eightbit)] = utfchar(unicode)
        end
        report_loading("vector %a is loaded",regime)
    else
        report_loading("vector %a is unknown",regime)
    end
    mapping[regime] = vector
    return vector
end

-- Index function of the backmapping table: it swaps the keys and values of the
-- vector of regime k. When there is no such vector we end up with an empty table.

local function loadreverse(t,k)
    local reverse = { }
    local vector  = mapping[k]
    if vector then
        for eightbit, utf in next, vector do
            reverse[utf] = eightbit
        end
    end
    backmapping[k] = reverse
    return reverse
end

setmetatableindex(mapping,    loadregime)
setmetatableindex(backmapping,loadreverse)

regimes.mapping     = mapping
regimes.backmapping = backmapping

-- Convert a line in the given regime (default: the current one) to utf. When there
-- is no vector for the regime, or when the line is nil or empty, the line is
-- returned as it came in.

local function fromregime(regime,line)
    if line and #line > 0 then
     -- local map = mapping[regime and synonyms[regime] or regime or currentregime]
        local map = mapping[regime or currentregime]
        if map then
            -- characters that are not in the vector are kept as they are
            line = gsub(line,".",map)
        end
    end
    return line
end

-- The remappers are cached per regime and per default replacement.

local cache = { } -- if really needed we can copy vectors and hash defaults

setmetatableindex(cache, function(t,k)
    local v = { remappers = { } }
    t[k] = v
    return v
end)

-- Convert an utf string to the regime identified by vector. Whatever is not
-- covered by the reverse vector is replaced by default (a question mark when none
-- is given). A nil or empty string results in an empty string.

local function toregime(vector,str,default) -- toregime('8859-1',"abcde Ä","?")
    local d = default or "?"
    local c = cache[vector].remappers
    local r = c[d]
    if not r then
        local t = fastcopy(backmapping[vector])
     -- r = utf.remapper(t) -- not good for defaults here
        local pattern = Cs((lpeg.utfchartabletopattern(t)/t + lpeg.patterns.utf8character/d + P(1)/d)^0)
        r = function(str)
            if not str or str == "" then
                return ""
            else
                return lpegmatch(pattern,str)
            end
        end
        c[d] = r
    end
    return r(str)
end

-- Switch back to utf input and disable the line action. The (new) current regime
-- is returned.

local function disable()
    currentregime = "utf"
    sequencers.disableaction(textlineactions,"regimes.process")
    return currentregime
end

-- Make the given regime the current one and enable the line action. When there is
-- no vector for it (which is also the case for utf) we disable instead. The
-- resulting current regime is returned.

local function enable(regime)
    regime = synonyms[regime] or regime
    if mapping[regime] == false then
        disable()
    else
        currentregime = regime
        sequencers.enableaction(textlineactions,"regimes.process")
    end
    return currentregime
end

regimes.toregime   = toregime
regimes.fromregime = fromregime
regimes.translate  = function(str,regime) return fromregime(regime,str) end
regimes.enable     = enable
regimes.disable    = disable

-- The following function can be used when we want to make sure that utf gets passed
-- unharmed. This is needed for modules.
-- As long as the level is larger than zero, lines are passed on untouched.

local level = 0

-- The line action that is hooked into the text line actions below. A line is
-- converted from the current regime to utf unless we are in a pushed state or
-- unless the coding is reported to be "utf-8". The arguments filename,
-- currentline and noflines are not used here.

function regimes.process(str,filename,currentline,noflines,coding)
    if level == 0 and coding ~= "utf-8" then
        str = fromregime(currentregime,str)
        if trace_translating then
            report_translating("utf: %s",str)
        end
    end
    return str
end

-- Suspend the conversion; calls can be nested.

local function push()
    level = level + 1
    if trace_translating then
        report_translating("pushing level %s",level)
    end
end

-- Undo one push; popping more often than pushing is harmless.

local function pop()
    if level > 0 then
        if trace_translating then
            report_translating("popping level %s",level)
        end
        level = level - 1
    end
end

regimes.push = push
regimes.pop  = pop

-- Return a sorted list with the names of the regimes for which there is a
-- regi-<name>.lua file next to regi-ini.lua (which itself is not a regime and
-- therefore skipped). When regi-ini.lua cannot be located the list is empty.

function regimes.list()
    local name = resolvers.findfile("regi-ini.lua") or ""
    local okay = { }
    if name ~= "" then -- an empty string is true in Lua so we need to compare
        local list = dir.glob(file.join(file.dirname(name),"regi-*.lua"))
        for i=1,#list do
            -- we only look at the file name: the match is anchored at the end
            -- and cannot cross a directory separator
            local regime = match(list[i],"regi%-([^/\\]-)%.lua$")
            if regime and regime ~= "ini" then
                okay[#okay+1] = regime
            end
        end
        table.sort(okay) -- once, and not for each file
    end
    return okay
end

-- The action is registered but only gets enabled when a regime is set.

sequencers.prependaction(textlineactions,"system","regimes.process")
sequencers.disableaction(textlineactions,"regimes.process")

-- Next we provide some hacks. Unfortunately we run into crappy encoded (read:
-- mixed) encoded xml files that have these Ã« Ã¤ Ã¶ Ã¼ sequences instead of ë ä ö ü
-- etc.

local patterns = { } -- one pattern (or false) per resolved regime name

-- Repair utf sequences that got encoded twice. For each character in the vector
-- of the regime we construct the string that results from treating every byte of
-- its utf sequence as a character of its own, and that string is then mapped back
-- onto the original sequence. For utf, or when there is no vector, the string is
-- returned untouched, as is a nil or empty string.

function regimes.cleanup(regime,str)
    if not str or str == "" then
        return str
    end
    -- We resolve the name before we consult the cache, otherwise a synonym or a
    -- nil regime never gets a hit and the pattern is rebuilt for each call.
    regime = regime and synonyms[regime] or regime or currentregime
    local p = patterns[regime]
    if p == nil then
        local vector = regime ~= "utf" and regime ~= "utf-8" and mapping[regime]
        if vector then
            local remap = { }
            for _, v in next, vector do
                local split = totable(v)
                for i=1,#split do
                    split[i] = utfchar(byte(split[i]))
                end
                split = concat(split)
                if v ~= split then -- ascii stays ascii so we skip that
                    remap[split] = v
                end
            end
            p = Cs((lpeg.utfchartabletopattern(remap)/remap+P(1))^0)
        else
            p = false
        end
        patterns[regime] = p
    end
    return p and lpegmatch(p,str) or str
end

-- local old = [[test Ã« Ã¤ Ã¶ Ã¼ crap]]
-- local new = regimes.cleanup("cp1252",old)
-- report_translating("%s -> %s",old,new)
-- local old = "Pozn" .. char(0xE1) .. "mky"
-- local new = fromregime("cp1250",old)
-- report_translating("%s -> %s",old,new)

-- interface (might move to regi-tex.lua)

if interfaces then

    local implement = interfaces.implement
    local setmacro  = interfaces.setmacro

    -- The enable and disable commands report the resulting regime back via the
    -- currentregime macro.

    implement {
        name      = "enableregime",
        public    = true,
        protected = true,
        arguments = "optional",
        actions   = function(regime) setmacro("currentregime",enable(regime)) end
    }

    implement {
        name      = "disableregime",
        public    = true,
        protected = true,
        actions   = function() setmacro("currentregime",disable()) end
    }

    implement {
        name      = "pushregime",
        public    = true,
        protected = true,
        actions   = push
    }

    implement {
        name      = "popregime",
        public    = true,
        protected = true,
        actions   = pop
    }

    -- The start command remembers the regime that was active so that the stop
    -- command can enable it again. A stop without a start does nothing.

    local stack = { }

    implement {
        name      = "startregime",
        public    = true,
        protected = true,
        arguments = "optional",
        actions   = function(regime)
            insert(stack,currentregime)
            if trace_translating then
                report_translating("start using %a",regime)
            end
            setmacro("currentregime",enable(regime))
        end
    }

    implement {
        name      = "stopregime",
        public    = true,
        protected = true,
        actions   = function()
            if #stack > 0 then
                local regime = remove(stack)
                if trace_translating then
                    report_translating("stop using %a",regime)
                end
                setmacro("currentregime",enable(regime))
            end
        end
    }

end