if not modules then modules = { } end modules ['char-utf'] = { version = 1.001, comment = "companion to char-utf.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } -- When a sequence of UTF characters enters the application, it may be -- neccessary to collapse subsequences into their composed variant. -- -- This module implements methods for collapsing and expanding UTF sequences. We -- also provide means to deal with characters that are special to TeX as well as -- 8-bit characters that need to end up in special kinds of output (for instance -- PDF). -- -- We implement these manipulations as filters. One can run multiple filters over a -- string. -- -- The old code has now been moved to char-obs.lua which we keep around for -- educational purposes. local next, type = next, type local gsub, find = string.gsub, string.find local concat, sortedhash, keys, sort = table.concat, table.sortedhash, table.keys, table.sort local utfchar, utfbyte, utfcharacters = utf.char, utf.byte, utf.characters local P, Cs, Cmt, Ct = lpeg.P, lpeg.Cs, lpeg.Cmt, lpeg.Ct if not characters then require("char-def") end if not characters.blocks then require("char-ini") end local lpegmatch = lpeg.match local lpegpatterns = lpeg.patterns local p_utf8character = lpegpatterns.utf8character local p_utf8byte = lpegpatterns.utf8byte local utfchartabletopattern = lpeg.utfchartabletopattern local formatters = string.formatters local allocate = utilities.storage.allocate or function() return { } end local mark = utilities.storage.mark or allocate local charfromnumber = characters.fromnumber characters = characters or { } local characters = characters local filters = allocate() characters.filters = filters local utffilters = { } characters.filters.utf = utffilters local data = characters.data -- It only makes sense to collapse at runtime, since we don't expect source code to -- depend on collapsing. -- for the moment, will be entries in char-def.lua .. this is just a subset that for -- typographic (font) reasons we want to have split ... if we decompose all, we get -- problems with fonts local decomposed = allocate { ["IJ"] = "IJ", ["ij"] = "ij", ["և"] = "եւ", ["ff"] = "ff", ["fi"] = "fi", ["fl"] = "fl", ["ffi"] = "ffi", ["ffl"] = "ffl", ["ſt"] = "ſt", ["st"] = "st", ["ﬓ"] = "մն", ["ﬔ"] = "մե", ["ﬕ"] = "մի", ["ﬖ"] = "վն", ["ﬗ"] = "մխ", } characters.decomposed = decomposed local graphemes = characters.graphemes local collapsed = characters.collapsed local combined = characters.combined local mathlists = characters.mathlists if graphemes then mark(graphemes) mark(collapsed) mark(combined) mark(mathlists) else graphemes = allocate() collapsed = allocate() combined = allocate() mathlists = allocate() characters.graphemes = graphemes characters.collapsed = collapsed characters.combined = combined characters.mathlists = mathlists local function backtrack(v,last,target) local vs = v.specials if vs and #vs == 3 then local kind = vs[1] if kind == "char" or kind == "with" then local one = vs[2] local two = vs[3] local first = utfchar(one) local second = utfchar(two) .. last collapsed[first..second] = target backtrack(data[one],second,target) end end end local function setlist(unicode,list,start,category) if list[start] ~= 0x20 then local t = mathlists for i=start,#list do local l = list[i] local f = t[l] if f then t = f else f = { } t[l] = f t = f end end t[category] = unicode end end local mlists = { } for unicode, v in next, data do local vs = v.specials if vs then local kind = vs[1] local size = #vs if kind == "char" or char == "with" then -- with added if size == 3 then local one = vs[2] local two = vs[3] local first = utfchar(one) local second = utfchar(two) local combination = utfchar(unicode) -- collapsed[first..second] = combination backtrack(data[one],second,combination) -- sort of obsolete: local cgf = graphemes[first] if not cgf then cgf = { [second] = combination } graphemes[first] = cgf else cgf[second] = combination end end if size > 2 and (v.mathclass or v.mathspec) then setlist(unicode,vs,2,"specials") end elseif kind == "with" then if size == 3 then -- combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode) combined[utfchar(vs[2],vs[3])] = utfchar(unicode) end elseif kind == "compat" then if size == 3 then -- combined[utfchar(vs[2])..utfchar(vs[3])] = utfchar(unicode) combined[utfchar(vs[2],vs[3])] = utfchar(unicode) end if size > 2 and (v.mathclass or v.mathspec) then setlist(unicode,vs,2,"specials") end end end local ml = v.mathlist if ml then mlists[unicode] = ml end end -- these win: for unicode, ml in next, mlists do setlist(unicode,ml,1,"mathlist") end mlists = nil if storage then storage.register("characters/graphemes", graphemes, "characters.graphemes") storage.register("characters/collapsed", collapsed, "characters.collapsed") storage.register("characters/combined", combined, "characters.combined") storage.register("characters/mathlists", mathlists, "characters.mathlists") end end function characters.initialize() end -- dummy local skippable = { } local filesuffix = file.suffix function utffilters.setskippable(suffix,value) if value == nil then value = true end if type(suffix) == "table" then for i=1,#suffix do skippable[suffix[i]] = value end else skippable[suffix] = value end end local p_collapse = nil -- so we can reset if needed local function prepare() local tree = utfchartabletopattern(collapsed) -- p_collapse = Cs((tree/collapsed + p_utf8character)^0 * P(-1)) p_collapse = Cs((tree/collapsed + p_utf8character)^0) end function utffilters.collapse(str,filename) if not p_collapse then prepare() end if not str or str == "" or #str == 1 then return str elseif filename and skippable[filesuffix(filename)] then -- we could hash the collapsables or do a quicker test return str else return lpegmatch(p_collapse,str) or str end end local p_combine = nil -- only for tex local function prepare() local tree = utfchartabletopattern(combined) p_combine = Cs((tree/combined + p_utf8character)^0) end function utffilters.combine(str) -- not in files -- we could merge collapse into combine ... maybe if not p_combine then prepare() end if not str or str == "" or #str == 1 then return str else return lpegmatch(p_combine,str) or str end end local p_decompose = nil local function prepare() local tree = utfchartabletopattern(decomposed) p_decompose = Cs((tree/decomposed + p_utf8character)^0 * P(-1)) end function utffilters.decompose(str,filename) -- 3 to 4 times faster than the above if not p_decompose then prepare() end if str and str ~= "" and #str > 1 then return lpegmatch(p_decompose,str) end if not str or str == "" or #str < 2 then return str elseif filename and skippable[filesuffix(filename)] then return str else return lpegmatch(p_decompose,str) or str end return str end -- utffilters.addgrapheme(utfchar(318),'l','\string~') -- utffilters.addgrapheme('c','a','b') function utffilters.addgrapheme(result,first,second) -- can be U+ 0x string or utf or number local result = charfromnumber(result) local first = charfromnumber(first) local second = charfromnumber(second) if not graphemes[first] then graphemes[first] = { [second] = result } else graphemes[first][second] = result end local pair = first .. second if not collapsed[pair] then collapsed[pair] = result p_composed = nil end end if interfaces then -- eventually this goes to char-ctx.lua interfaces.implement { name = "addgrapheme", actions = utffilters.addgrapheme, arguments = "3 strings", } end -- -- local p_reorder = nil -- local sorter = function(a,b) return b[2] < a[2] end -- -- local function swapper(s,p,t) -- local old = { } -- for i=1,#t do -- old[i] = t[i][1] -- end -- old = concat(old) -- sort(t,sorter) -- for i=1,#t do -- t[i] = t[i][1] -- end -- local new = concat(t) -- if old ~= new then -- print("reordered",old,"->",new) -- end -- return p, new -- end -- -- the next one into stable for similar weights local sorter = function(a,b) return b[2] < a[2] end local function swapper(s,p,t) sort(t,sorter) for i=1,#t do t[i] = t[i][1] end return p, concat(t) end -- -- the next one keeps similar weights in the original order -- -- local sorter = function(a,b) -- local b2, a2 = b[2], a[2] -- if a2 == b2 then -- return b[3] > a[3] -- else -- return b2 < a2 -- end -- end -- -- local function swapper(s,p,t) -- for i=1,#t do -- t[i][3] = i -- end -- sort(t,sorter) -- for i=1,#t do -- t[i] = t[i][1] -- end -- return p, concat(t) -- end -- at some point exceptions will become an option, for now it's an experiment -- to overcome bugs (that have become features) in unicode .. or we might decide -- for an extra ordering key in char-def that takes precedence over combining local exceptions = { -- frozen unicode bug ["َّ"] = "َّ", -- U+64E .. U+651 => U+651 .. U+64E } local function prepare() local hash = { } for k, v in sortedhash(characters.data) do local combining = v.combining -- v.ordering or v.combining if combining then local u = utfchar(k) hash[u] = { u, combining, 0 } -- slot 3 can be used in sort end end local e = utfchartabletopattern(exceptions) local p = utfchartabletopattern(hash) p_reorder = Cs((e/exceptions + Cmt(Ct((p/hash)^2),swapper) + p_utf8character)^0) * P(-1) end function utffilters.reorder(str,filename) if not p_reorder then prepare() end if not str or str == "" or #str < 2 then return str elseif filename and skippable[filesuffix(filename)] then return str else return lpegmatch(p_reorder,str) or str end return str end -- local collapse = utffilters.collapse -- local decompose = utffilters.decompose -- local reorder = utffilters.reorder -- -- local c1, c2, c3 = "a", "̂", "̃" -- local r2, r3 = "â", "ẫ" -- local l1 = "ffl" -- -- local str = c1..c2..c3 .. " " .. c1..c2 .. " " .. l1 -- local res = r3 .. " " .. r2 .. " " .. "ffl" -- -- local text = io.loaddata("t:/sources/tufte.tex") -- -- local function test(n) -- local data = text .. string.rep(str,100) .. text -- local okay = text .. string.rep(res,100) .. text -- local t = os.clock() -- for i=1,10000 do -- collapse(data) -- decompose(data) -- -- reorder(data) -- end -- print(os.clock()-t,decompose(collapse(data))==okay,decompose(collapse(str))) -- end -- -- test(050) -- test(150) -- -- local old = "foo" .. string.char(0xE1) .. "bar" -- local new = collapse(old) -- print(old,new) -- local one_old = "فَأَصَّدَّقَ دَّ" local one_new = utffilters.reorder(one_old) -- local two_old = "فَأَصَّدَّقَ دَّ" local two_new = utffilters.reorder(two_old) -- -- print(one_old,two_old,one_old==two_old,false) -- print(one_new,two_new,one_new==two_new,true) -- -- local test = "foo" .. utf.reverse("ؚ" .. "ً" .. "ٌ" .. "ٍ" .. "َ" .. "ُ" .. "ِ" .. "ّ" .. "ْ" ) .. "bar" -- local done = utffilters.reorder(test) -- -- print(test,done,test==done,false) local f_default = formatters["[%U] "] local f_description = formatters["[%s] "] local function convert(n) local d = data[n] d = d and d.description if d then return f_description(d) else return f_default(n) end end local pattern = Cs((p_utf8byte / convert)^1) function utffilters.verbose(data) return data and lpegmatch(pattern,data) or "" end return characters