-- char-utf.lua / last modification: 2008-06-10 21:20
-- Register this file's metadata in the global module registry; the registry
-- table is created here when no earlier module has done so.
modules = modules or { }

modules['char-utf'] = {
    version   = 1.001,
    comment   = "companion to char-ini.tex",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

--[[ldx--
<p>When a sequence of <l n='utf'/> characters enters the application, it may
be necessary to collapse subsequences into their composed variant.</p>

<p>This module implements methods for collapsing and expanding <l n='utf'/>
sequences. We also provide means to deal with characters that are
special to <l n='tex'/> as well as 8-bit characters that need to end up
in special kinds of output (for instance <l n='pdf'/>).</p>

<p>We implement these manipulations as filters. One can run multiple filters
over a string.</p>
--ldx]]--

-- fall back to unicode.utf8 when no global utf table has been defined yet
utf = utf or unicode.utf8

-- create the namespace tables, keeping whatever has been defined before
characters = characters or { }

do
    local filters    = characters.filters or { }
    local utffilters = filters.utf or { }

    characters.graphemes = characters.graphemes or { }
    characters.filters   = filters
    filters.utf          = utffilters

    -- initial state: grapheme table not built yet, both features switched on
    utffilters.initialized = false
    utffilters.collapsing  = true
    utffilters.expanding   = true
end

--[[ldx--
<p>It only makes sense to collapse at runtime, since we don't expect
source code to depend on collapsing:</p>

<typing>
characters.filters.utf.collapsing = true
input.filters.utf_translator      = characters.filters.utf.collapse
</typing>
--ldx]]--

-- Fill characters.graphemes from characters.data. Each entry whose specials
-- field is a three item list starting with 'char' is registered such that
-- characters.graphemes[first][second] holds the utf string of the entry's
-- own key, where first and second are the utf strings of specials[2] and
-- specials[3]. Nothing happens when collapsing is disabled or when the
-- table has been built already; afterwards the initialized flag is set.
function characters.filters.utf.initialize()
    local filter = characters.filters.utf
    if not filter.collapsing or filter.initialized then
        return
    end
    local graphemes, utfchar = characters.graphemes, utf.char
    for slot, entry in pairs(characters.data) do
        -- fetching specials once and testing the length first is the faster order
        local specials = entry.specials
        if specials and #specials == 3 and specials[1] == 'char' then
            local first, second = utfchar(specials[2]), utfchar(specials[3])
            local combinations = graphemes[first]
            if not combinations then
                combinations = { }
                graphemes[first] = combinations
            end
            combinations[second] = utfchar(slot)
        end
    end
    filter.initialized = true
end

-- characters.filters.utf.add_grapheme(utf.char(318),'l','\string~')
-- characters.filters.utf.add_grapheme('c','a','b')

--~ function characters.filters.utf.add_grapheme(result,...)
--~     local cg = characters.graphemes
--~     local t = {...}
--~     local n = table.getn(t)
--~     for i=1,n do
--~         local v = t[i]
--~         if not cg[v] then
--~             cg[v] = { }
--~         end
--~         if i == n then
--~            cg[v] = result
--~         else
--~             cg = cg[v]
--~         end
--~     end
--~ end

-- Register a user defined grapheme: the sequence first, second collapses
-- into result. Each argument may be an utf string or something that
-- tonumber accepts; in the latter case it is converted with utf.char first.
function characters.filters.utf.add_grapheme(result,first,second)
    local utfchar = utf.char
    -- values that convert to a number become utf strings, others pass through
    local function as_utf(value)
        local code = tonumber(value)
        if code then
            return utfchar(code)
        end
        return value
    end
    result, first, second = as_utf(result), as_utf(first), as_utf(second)
    local graphemes = characters.graphemes
    local combinations = graphemes[first]
    if combinations then
        combinations[second] = result
    else
        graphemes[first] = { [second] = result }
    end
end

-- Collapse character pairs that are known in characters.graphemes into their
-- composed character. The input string itself is returned when collapsing is
-- disabled, when str is nil or false, when it has at most one byte, or when
-- no pair was collapsed.
function characters.filters.utf.collapse(str) -- old one
    local filter = characters.filters.utf
    if not (filter.collapsing and str and #str > 1) then
        return str
    end
    if not filter.initialized then -- saves a call
        filter.initialize()
    end
    local graphemes = characters.graphemes
    local collected, size = { }, 0
    local pending, changed = false, false
    for current in str:utfcharacters() do
        local combinations = graphemes[pending]
        local composed = combinations and combinations[current]
        if composed then
            -- the pending character and the current one merge
            pending, changed = composed, true
        else
            if pending then
                size = size + 1
                collected[size] = pending
            end
            pending = current
        end
    end
    if not changed then
        return str
    end
    collected[size+1] = pending
    return table.concat(collected,"")
end

--[[ldx--
<p>In order to deal with 8-bit output, we need to find a way to
go from <l n='utf'/> to 8-bit. This is handled in the
<l n='luatex'/> engine itself.</p>

<p>This leaves us problems with characters that are specific to
<l n='tex'/> like <type>{}</type>, <type>$</type> and alike.</p>

<p>We can remap some characters that tex input files are sensitive to
into a private area (while writing to a utility file) and revert them
to their original slot when we read in such a file. Instead of
reverting, we can (when we resolve characters to glyphs) map them
to their right glyph there.</p>

<p>For this purpose we can use the private planes 0x0F0000 and
0x100000.</p>
--ldx]]--

-- Lookup tables for relocating characters to the private area and back:
--   low     : character -> relocated character (slot 0x0F0000 + code)
--   high    : relocated character -> original character
--   escapes : character -> backslash followed by the character
characters.filters.utf.private = {
    high    = { },
    low     = { },
    escapes = { },
}

do

    local private = characters.filters.utf.private

    local relocated, original, escaped = private.low, private.high, private.escapes

    local special = "~#$%^&_{}\\"

    local utfbyte, utfchar, utfgsub = utf.byte, utf.char, utf.gsub

    -- Register one character, given as a number or as a string; only codes
    -- below 256 are registered.
    local function set(ch)
        local code
        if type(ch) == "number" then
            code, ch = ch, utfchar(ch)
        else
            code = utfbyte(ch)
        end
        if not (code < 256) then
            return
        end
        relocated[ch]                       = utfchar(0x0F0000 + code)
        original [utfchar(0x0F0000 + code)] = ch
        escaped  [ch]                       = "\\" .. ch
    end

    private.set = set

    -- Each filter runs one gsub over all characters, using the matching table
    -- as replacement; the results of gsub are passed on unchanged.

    function private.replace(str)
        return utfgsub(str,"(.)",relocated)
    end

    function private.revert(str)
        return utfgsub(str,"(.)",original)
    end

    function private.escape(str)
        return utfgsub(str,"(.)",escaped)
    end

    -- preload the tables with the special characters listed above
    for ch in special:gmatch(".") do
        set(ch)
    end

end

--[[ldx--
<p>We get a more efficient variant of this when we integrate
replacements in collapser. This more or less renders the previous
private code redundant. The following code is equivalent but the
first snippet uses the relocated dollars.</p>

<typing>
[󰀤x󰀤] [$x$]
</typing>
--ldx]]--

do

    local cg = characters.graphemes
    local cr = characters.filters.utf.private.high -- kan via een lpeg
    local cf = characters.filters.utf

    local concat = table.concat

    --~ keep this  one, it's the baseline
    --~
    --~ function characters.filters.utf.collapse(str)
    --~     if cf.collapsing and str then
    --~         if #str > 1 then
    --~             if not cf.initialized then -- saves a call
    --~                 cf.initialize()
    --~             end
    --~             local tokens, first, done = { }, false, false
    --~             for second in str:utfcharacters() do
    --~                 if cr[second] then
    --~                     if first then
    --~                         tokens[#tokens+1] = first
    --~                     end
    --~                     first, done = cr[second], true
    --~                 else
    --~                     local cgf = cg[first]
    --~                     if cgf and cgf[second] then
    --~                         first, done = cgf[second], true
    --~                     elseif first then
    --~                         tokens[#tokens+1] = first
    --~                         first = second
    --~                     else
    --~                         first = second
    --~                     end
    --~                 end
    --~             end
    --~             if done then
    --~                 tokens[#tokens+1] = first
    --~                 return concat(tokens,"") -- seldom called
    --~             end
    --~         elseif #str > 0 then
    --~             return cr[str] or str
    --~         end
    --~     end
    --~     return str
    --~ end

    --[[ldx--
    <p>The next variant has lazy token collecting, on a 140 page mk.tex this saves
    about .25 seconds, which is understandable because we have no graphemes and
    not collecting tokens is not only faster but also saves garbage collecting.
    </p>
    --ldx]]--

    -- Append the first (count - 1) characters of str to tokens. This is the
    -- lazy part of the collapser: nothing is collected until the first
    -- replacement shows up, and only then is the untouched head of the string
    -- copied. A count of 0 or 1 means that there is no head to copy; the old
    -- inline loop only stopped at count == 1 and therefore copied the whole
    -- string when the very first character had to be replaced (count == 0).
    local function copy_head(tokens,str,count)
        if count > 1 then
            local seen = 0
            for s in str:utfcharacters() do
                seen = seen + 1
                if seen == count then
                    break
                end
                tokens[#tokens+1] = s
            end
        end
    end

    -- Collapse grapheme pairs (looked up in cg) and move relocated private
    -- characters back to their original slot (looked up in cr) in one pass.
    --
    -- str     : the string to process; nil and false are passed through
    -- returns : a new string when at least one replacement took place,
    --           otherwise str itself (for a one byte string: cr[str] or str)
    --
    -- Nothing is done when cf.collapsing is false.
    function characters.filters.utf.collapse(str) -- not really tested (we could preallocate a table)
        if cf.collapsing and str then
            if #str > 1 then
                if not cf.initialized then -- saves a call
                    cf.initialize()
                end
                -- first : pending character, not yet written to tokens
                -- done  : true once a replacement happened and tokens is in use
                -- n     : number of characters seen before the first replacement
                local tokens, first, done, n = { }, false, false, 0
                for second in str:utfcharacters() do
                    local reverted = cr[second]
                    if reverted then
                        if not done then
                            copy_head(tokens,str,n)
                            done = true
                        end
                        if first then
                            tokens[#tokens+1] = first
                        end
                        first = reverted
                    else
                        local cgf = cg[first]
                        local composed = cgf and cgf[second]
                        if composed then
                            if not done then
                                copy_head(tokens,str,n)
                                done = true
                            end
                            first = composed
                        elseif done then
                            if first then
                                tokens[#tokens+1] = first
                            end
                            first = second
                        else
                            -- still untouched: only remember how far we are
                            first, n = second, n + 1
                        end
                    end
                end
                if done then
                    tokens[#tokens+1] = first
                    return concat(tokens,"") -- seldom called
                end
            elseif #str > 0 then
                return cr[str] or str
            end
        end
        return str
    end

    --~ not faster (0.1 seconds on a 500 k collapsable file)
    --~
    --~ local specials, initials = lpeg.P(false), ""
    --~ for k,v in pairs(cr) do
    --~     specials, initials = specials + lpeg.P(k)/v, initials .. k:sub(1,1)
    --~ end
    --~ specials = lpeg.Cs(lpeg.P((1-lpeg.S(initials)) + specials)^0)
    --~ local graphemes = ""
    --~ for _, v in pairs(cg) do
    --~     for kk, _ in pairs(v) do
    --~         graphemes = graphemes .. kk:sub(1,1)
    --~     end
    --~ end
    --~ graphemes = lpeg.P{ lpeg.S(graphemes) + 1 * lpeg.V(1) }
    --~
    --~ function characters.filters.utf.collapse(str)
    --~     if cf.collapsing and str then
    --~         if #str > 1 then
    --~             str = specials:match(str)
    --~             if graphemes:match(str) then
    --~                 if not cf.initialized then -- saves a call
    --~                     cf.initialize()
    --~                 end
    --~                 local tokens, first, done = { }, false, false
    --~                 for second in str:utfcharacters() do
    --~                     local cgf = cg[first]
    --~                     if cgf and cgf[second] then
    --~                         first, done = cgf[second], true
    --~                     elseif first then
    --~                         tokens[#tokens+1] = first
    --~                         first = second
    --~                     else
    --~                         first = second
    --~                     end
    --~                 end
    --~                 if done then
    --~                     tokens[#tokens+1] = first
    --~                     return table.concat(tokens,"")
    --~                 end
    --~             end
    --~         elseif #str > 0 then
    --~             return cr[str] or str
    --~         end
    --~     end
    --~     return str
    --~ end

end

--[[ldx--
<p>In the beginning of <l n='luatex'/> we experimented with a sequence
of filters so that we could manipulate the input stream. However, since
this is a partial solution (not taking macro expansion into account)
and since it may interfere with non-text, we will not use this feature
by default.</p>

<typing>
characters.filters.utf.collapsing = true
characters.filters.append(characters.filters.utf.collapse)
characters.filters.activated = true
callback.register('process_input_buffer', characters.filters.process)
</typing>

<p>The following helper functions may disappear (or become optional)
in the future. Well, they are now.</p>
--ldx]]--

--[[obsolete--

characters.filters.sequences = characters.filters.sequences or { }
characters.filters.activated = false

function characters.filters.append(name)
    table.insert(characters.filters.sequences,name)
end

function characters.filters.prepend(name)
    table.insert(characters.filters.sequences,1,name)
end

function characters.filters.remove(name)
    for k,v in ipairs(characters.filters.sequences) do
        if v == name then
            table.remove(characters.filters.sequences,k)
        end
    end
end

function characters.filters.replace(name_1,name_2)
    for k,v in ipairs(characters.filters.sequences) do
        if v == name_1 then
            characters.filters.sequences[k] = name_2
            break
        end
    end
end

function characters.filters.insert_before(name_1,name_2)
    for k,v in ipairs(characters.filters.sequences) do
        if v == name_1 then
            table.insert(characters.filters.sequences,k,name_2)
            break
        end
    end
end

function characters.filters.insert_after(name_1,name_2)
    for k,v in ipairs(characters.filters.sequences) do
        if v == name_1 then
            table.insert(characters.filters.sequences,k+1,name_2)
            break
        end
    end
end

function characters.filters.list(separator)
    table.concat(characters.filters.sequences,seperator or ' ')
end

function characters.filters.process(str)
    if characters.filters.activated then
        for _,v in ipairs(characters.filters.sequences) do
            str = v(str)
        end
        return str
    else
        return nil -- luatex callback optimalisation
    end
end

--obsolete]]--

--[[ldx--
<p>The following code is no longer needed and has been replaced by token
collectors somewhere else.</p>
--ldx]]--

--[[obsolete--

characters.filters.collector            = { }
characters.filters.collector.data       = { }
characters.filters.collector.collecting = false

function characters.filters.collector.reset()
    characters.filters.collector.data = { }
end

function characters.filters.collector.flush(separator)
    tex.sprint(table.concat(characters.filters.collector.data,separator))
end

function characters.filters.collector.prune(n)
    for i=1,n do
        table.remove(characters.filters.collector.data,-1)
    end
end

function characters.filters.collector.numerate(str)
    if characters.filters.collector.collecting then
        table.insert(characters.filters.collector.data,(unicode.utf8.gsub(str,"(.)", function(c)
            return string.format("0x%04X ",unicode.utf8.byte(c))
        end)))
    end
    return str
end

--obsolete]]--