char-cmp.lua / last modification: 2008-02-12 14:50
if not modules then modules = { } end modules ['char-cmp'] = {
    version   = 1.001,
    comment   = "companion to char-ini.tex",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

characters            = characters            or { }
characters.uncomposed = characters.uncomposed or { }

--[[ldx--
<p>The code defined here may move to the big character table.</p>
--ldx]]--

characters.basedigits = {
    ['zero']  = 48, ['one']   = 49,
    ['two']   = 50, ['three'] = 51,
    ['four']  = 52, ['five']  = 53,
    ['six']   = 54, ['seven'] = 55,
    ['eight'] = 56, ['nine']  = 57
}

--[[ldx--
<p>The next three tables can for instance be be used to enhance
kerning tables that lack kerning pairs for these special characters.
Of course they may come in handy elsewhere too</p>
--ldx]]--

-- we can use shcodes, but then we also need slcode and srcode
--
-- AEligature
--    => slcode == ub('A')
--    => srcode == ub('E')
-- or
--    => shcode == { ub('A'), ub('E') }
--    => reduction = "AE"
--
-- eacute
--    => shcode == ub('A')
-- or
--    => shcode == { ub('a') }
--    => reduction = "a"

characters.uncomposed.left = {
    AEligature = "A",  aeligature = "a",
    OEligature = "O",  oeligature = "o",
    IJligature = "I",  ijligature = "i",
    AE         = "A",  ae         = "a",
    OE         = "O",  oe         = "o",
    IJ         = "I",  ij         = "i",
    Ssharp     = "S",  ssharp     = "s",
}

characters.uncomposed.right = {
    AEligature = "E",  aeligature = "e",
    OEligature = "E",  oeligature = "e",
    IJligature = "J",  ijligature = "j",
    AE         = "E",  ae         = "e",
    OE         = "E",  oe         = "e",
    IJ         = "J",  ij         = "j",
    Ssharp     = "S",  ssharp     = "s",
}

characters.uncomposed.both = {
    Acircumflex = "A",  acircumflex = "a",
    Ccircumflex = "C",  ccircumflex = "c",
    Ecircumflex = "E",  ecircumflex = "e",
    Gcircumflex = "G",  gcircumflex = "g",
    Hcircumflex = "H",  hcircumflex = "h",
    Icircumflex = "I",  icircumflex = "i",
    Jcircumflex = "J",  jcircumflex = "j",
    Ocircumflex = "O",  ocircumflex = "o",
    Scircumflex = "S",  scircumflex = "s",
    Ucircumflex = "U",  ucircumflex = "u",
    Wcircumflex = "W",  wcircumflex = "w",
    Ycircumflex = "Y",  ycircumflex = "y",

    Agrave = "A",  agrave = "a",
    Egrave = "E",  egrave = "e",
    Igrave = "I",  igrave = "i",
    Ograve = "O",  ograve = "o",
    Ugrave = "U",  ugrave = "u",
    Ygrave = "Y",  ygrave = "y",

    Atilde = "A",  atilde = "a",
    Itilde = "I",  itilde = "i",
    Otilde = "O",  otilde = "o",
    Utilde = "U",  utilde = "u",
    Ntilde = "N",  ntilde = "n",

    Adiaeresis = "A",  adiaeresis = "a",  Adieresis = "A",  adieresis = "a",
    Ediaeresis = "E",  ediaeresis = "e",  Edieresis = "E",  edieresis = "e",
    Idiaeresis = "I",  idiaeresis = "i",  Idieresis = "I",  idieresis = "i",
    Odiaeresis = "O",  odiaeresis = "o",  Odieresis = "O",  odieresis = "o",
    Udiaeresis = "U",  udiaeresis = "u",  Udieresis = "U",  udieresis = "u",
    Ydiaeresis = "Y",  ydiaeresis = "y",  Ydieresis = "Y",  ydieresis = "y",

    Aacute = "A",  aacute = "a",
    Cacute = "C",  cacute = "c",
    Eacute = "E",  eacute = "e",
    Iacute = "I",  iacute = "i",
    Lacute = "L",  lacute = "l",
    Nacute = "N",  nacute = "n",
    Oacute = "O",  oacute = "o",
    Racute = "R",  racute = "r",
    Sacute = "S",  sacute = "s",
    Uacute = "U",  uacute = "u",
    Yacute = "Y",  yacute = "y",
    Zacute = "Z",  zacute = "z",

    Dstroke = "D",  dstroke = "d",
    Hstroke = "H",  hstroke = "h",
    Tstroke = "T",  tstroke = "t",

    Cdotaccent = "C",  cdotaccent = "c",
    Edotaccent = "E",  edotaccent = "e",
    Gdotaccent = "G",  gdotaccent = "g",
    Idotaccent = "I",  idotaccent = "i",
    Zdotaccent = "Z",  zdotaccent = "z",

    Amacron = "A",  amacron = "a",
    Emacron = "E",  emacron = "e",
    Imacron = "I",  imacron = "i",
    Omacron = "O",  omacron = "o",
    Umacron = "U",  umacron = "u",

    Ccedilla = "C",  ccedilla = "c",
    Kcedilla = "K",  kcedilla = "k",
    Lcedilla = "L",  lcedilla = "l",
    Ncedilla = "N",  ncedilla = "n",
    Rcedilla = "R",  rcedilla = "r",
    Scedilla = "S",  scedilla = "s",
    Tcedilla = "T",  tcedilla = "t",

    Ohungarumlaut = "O",  ohungarumlaut = "o",
    Uhungarumlaut = "U",  uhungarumlaut = "u",

    Aogonek = "A",  aogonek = "a",
    Eogonek = "E",  eogonek = "e",
    Iogonek = "I",  iogonek = "i",
    Uogonek = "U",  uogonek = "u",

    Aring = "A",  aring = "a",
    Uring = "U",  uring = "u",

    Abreve = "A",  abreve = "a",
    Ebreve = "E",  ebreve = "e",
    Gbreve = "G",  gbreve = "g",
    Ibreve = "I",  ibreve = "i",
    Obreve = "O",  obreve = "o",
    Ubreve = "U",  ubreve = "u",

    Ccaron = "C",  ccaron = "c",
    Dcaron = "D",  dcaron = "d",
    Ecaron = "E",  ecaron = "e",
    Lcaron = "L",  lcaron = "l",
    Ncaron = "N",  ncaron = "n",
    Rcaron = "R",  rcaron = "r",
    Scaron = "S",  scaron = "s",
    Tcaron = "T",  tcaron = "t",
    Zcaron = "Z",  zcaron = "z",

    dotlessI = "I",  dotlessi = "i",
    dotlessJ = "J",  dotlessj = "j",

    AEligature = "AE",  aeligature = "ae",  AE         = "AE",  ae         = "ae",
    OEligature = "OE",  oeligature = "oe",  OE         = "OE",  oe         = "oe",
    IJligature = "IJ",  ijligature = "ij",  IJ         = "IJ",  ij         = "ij",

    Lstroke    = "L",   lstroke    = "l",   Lslash     = "L",   lslash     = "l",
    Ostroke    = "O",   ostroke    = "o",   Oslash     = "O",   oslash     = "o",

    Ssharp     = "SS",  ssharp     = "ss",

    Aumlaut = "A",  aumlaut = "a",
    Eumlaut = "E",  eumlaut = "e",
    Iumlaut = "I",  iumlaut = "i",
    Oumlaut = "O",  oumlaut = "o",
    Uumlaut = "U",  uumlaut = "u",

}

--[[ldx--
<p>The following function is used in the indexing code, where
we need some sort of default fallback mapping.</p>
--ldx]]--

function characters.uncompose(n) -- n == string|number, returns string
    local cdn
    if type(n) == "string" then
        cdn = characters.data[utf.byte(n)]
    else
        cdn = characters.data[n]
    end
    -- return characters.shape(n)
    if cdn then
        local shcode = cdn.shcode
        if not shcode then
            return characters.uncomposed.both[cdn.contextname] or n
        elseif type(shcode) == "table" then
            return utf.char(unpack(cdn.shcode))
        else
            return utf.char(cdn.shcode)
        end
    end
    return n
end

--[[ldx--
<p>Only characters with a code smaller than 128 make sense,
anything larger is encoding dependent. An interesting complication
is that a character can be in an encoding twice but is hashed
once.</p>
--ldx]]--

characters.ligatures = {
    ['f'] = {
        { 'f', 'ff' },
        { 'i', 'fi' },
        { 'l', 'fl' },
    },
    ['ff'] = {
        { 'i', 'ffi' }
    },
    ['fi'] = {
        { 'i', 'fii' }
    },
    ['fl'] = {
        { 'i', 'fli' }
    },
    ['s'] = {
        { 't', 'st' }
    },
    ['i'] = {
        { 'j', 'ij' }
    },
}

characters.texligatures = {
 -- ['space'] = {
 --     { 'L', 'Lslash' },
 --     { 'l', 'lslash' }
 -- },
 -- ['question'] = {
 --     { 'quoteleft', 'questiondown' }
 -- },
 -- ['exclam'] = {
 --     { 'quoteleft', 'exclamdown' }
 -- },
    ['quoteleft'] = {
        { 'quoteleft', 'quotedblleft' }
    },
    ['quoteright'] = {
        { 'quoteright', 'quotedblright' }
    },
    ['hyphen'] = {
        { 'hyphen', 'endash' }
    },
    ['endash'] = {
        { 'hyphen', 'emdash' }
    }
}

--~ U+2019: right single quotation mark / quoteright