font-map.lmt /size: 19 Kb    last modification: 2025-02-21 11:03
-- Module bookkeeping: make sure the global registry exists and record the
-- metadata of this file under its module name.
if not modules then
    modules = { }
end

modules['font-map'] = {
    version   = 1.001,
    optimize  = true,
    comment   = "companion to font-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
9
10local next, type = next, type
11
12local match, format, find, concat, gsub, lower = string.match, string.format, string.find, table.concat, string.gsub, string.lower
13local P, R, S, C, Ct, Cc, lpegmatch = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cc, lpeg.match
14local formatters = string.formatters
15local utfbyte = utf.byte
16local sortedhash, sortedkeys = table.sortedhash, table.sortedkeys
17local hextointeger, dectointeger = string.hextointeger, string.dectointeger
18
-- Tracing flags; the registered callbacks flip them when the corresponding
-- tracker is switched.
local trace_loading = false
local trace_mapping = false

trackers.register("fonts.loading", function(v) trace_loading = v end)
trackers.register("fonts.mapping", function(v) trace_mapping = v end)

local report_fonts = logs.reporter("fonts","loading") -- not otf only

-- force_ligatures was true for a while so that emojis with bad names work too

local force_ligatures = false

directives.register("fonts.mapping.forceligatures", function(v) force_ligatures = v end)

-- Reuse the existing namespaces when they are already there.
local fonts    = fonts or { }
local mappings = fonts.mappings or { }
fonts.mappings = mappings

local allocate = utilities.storage.allocate
33
-- Patterns for glyph names that encode code points or indices.
--
-- hexfour : two up to four hex digits (despite the name), converted with hextointeger
-- hexsix  : two up to six hex digits, converted with hextointeger
-- dec     : one or more decimal digits, converted with dectointeger
--
-- unicode, ucode and index each produce a value followed by a boolean: false
-- for a single number, true when a table with several numbers was collected.
local hex       = R("AF","af","09")
local hexnumber = function(s) return hextointeger(s) end
local hexfour   = (hex * hex * hex^-2) / hexnumber
local hexsix    = (hex * hex * hex^-4) / hexnumber
local dec       = (R("09")^1) / dectointeger
local period    = P(".")
local nomore    = period + P(-1)  -- a suffix separator or the end of the name
local issingle  = Cc(false)
local ismany    = Cc(true)
local unicode   = (P("uni") + P("UNI")) * (hexfour * nomore * issingle + Ct(hexfour^1) * ismany) -- base planes
local ucode     = (P("u")   + P("U")  ) * (hexsix  * nomore * issingle + Ct(hexsix ^1) * ismany) -- extended
local index     = P("index") * dec * issingle

-- the default parser and a cache for parsers made for a specific prefix
local parser    = unicode + ucode + index
local parsers   = { }
45
-- Returns an lpeg pattern for parsing glyph names.
--
-- str : optional prefix; nil or "" selects the shared default parser
--
-- For a prefix the pattern matches "<prefix>.<decimal>" and yields the number
-- followed by false. Such patterns are created once and kept in parsers.
local function makenameparser(str)
    if not str or str == "" then
        return parser
    end
    local cached = parsers[str]
    if cached then
        return cached
    end
    cached = P(str) * period * dec * Cc(false)
    parsers[str] = cached
    return cached
end
58
local f_single  = formatters["%04X"]
local f_double  = formatters["%04X%04X"]
local s_unknown = "FFFD"

-- Converts one code point into uppercase hexadecimal UTF-16 code units.
--
-- unicode : numeric code point
-- returns : four hex digits for values below 0xD7FF and for 0xE000-0xFFFF,
--           "FFFD" for 0xD7FF-0xDFFF, 0xF0000-0xFFFFF and 0x100000-0x10FFFF,
--           and eight hex digits (a surrogate pair) for everything else
--
-- The guard for the surrogate block starts at 0xD7FF, like the other
-- converters in this file do. Before, 0xD7FF failed the first test as well as
-- the 0xD800 guard and ended up in the pair calculation with a negative
-- offset, which gave a meaningless result.
local function tounicode16(unicode)
    if unicode < 0xD7FF or (unicode > 0xDFFF and unicode <= 0xFFFF) then
        return f_single(unicode)
    elseif unicode >= 0x00E000 and unicode <= 0x00F8FF then
        -- NOTE(review): never taken because the first test already accepts
        -- this range as a single code unit; confirm what was intended
        return s_unknown
    elseif unicode >= 0x0F0000 and unicode <= 0x0FFFFF then
        return s_unknown
    elseif unicode >= 0x100000 and unicode <= 0x10FFFF then
        return s_unknown
    elseif unicode >= 0x00D7FF and unicode <= 0x00DFFF then
        return s_unknown
    else
        -- split the offset into the supplementary planes over two code units
        unicode = unicode - 0x10000
        return f_double((unicode//0x400)+0xD800,unicode%0x400+0xDC00)
    end
end
79
-- Converts a list of code points into one string of uppercase hexadecimal
-- UTF-16 code units.
--
-- unicodes : array of numeric code points
-- returns  : the concatenated result, "" for an empty list
--
-- Per entry: four hex digits for values below 0xD7FF and for 0xE000-0xFFFF,
-- "FFFD" for 0xD7FF-0xDFFF, 0xF0000-0xFFFFF and 0x100000-0x10FFFF, and a
-- surrogate pair (eight hex digits) otherwise.
--
-- All range tests look at the current entry. They used to test the file
-- level "unicode", which is an lpeg pattern here, so each entry that was not
-- handled by the first test triggered a comparison error.
local function tounicode16sequence(unicodes)
    local t = { }
    for l=1,#unicodes do
        local u = unicodes[l]
        if u < 0xD7FF or (u > 0xDFFF and u <= 0xFFFF) then
            t[l] = f_single(u)
        elseif u >= 0x00E000 and u <= 0x00F8FF then
            -- NOTE(review): never taken because the first test already
            -- accepts this range; confirm what was intended
            t[l] = s_unknown
        elseif u >= 0x0F0000 and u <= 0x0FFFFF then
            t[l] = s_unknown
        elseif u >= 0x100000 and u <= 0x10FFFF then
            t[l] = s_unknown
     -- elseif u >= 0x00D800 and u <= 0x00DFFF then
        elseif u >= 0x00D7FF and u <= 0x00DFFF then
            t[l] = s_unknown
        else
            u = u - 0x10000
            t[l] = f_double((u//0x400)+0xD800,u%0x400+0xDC00)
        end
    end
    return concat(t)
end
102
103
local hash = { } -- code point -> hex string, filled on demand
local conc = { } -- scratch table that tounicode reuses for sequences

-- Formats a code point as one code unit (four hex digits) or as a surrogate
-- pair (eight hex digits). No filtering of private or surrogate values
-- happens here.
local function utf16hex(code)
    if code < 0xD7FF or (code > 0xDFFF and code <= 0xFFFF) then
        return f_single(code)
    end
    local offset = code - 0x10000
    return f_double((offset//0x400)+0xD800,offset%0x400+0xDC00)
end

-- presumably this installs the function as the __index handler of hash, so
-- that a missing entry is computed once and then stored
table.setmetatableindex(hash,function(cache,code)
    local encoded = utf16hex(code)
    cache[code] = encoded
    return encoded
end)
118
-- Converts a code point, a list of code points or a string into uppercase
-- hexadecimal UTF-16 code units.
--
-- k : number   -> "FFFD" for 0xE000-0xF8FF, 0xF0000-0xFFFFF, 0x100000-0x10FFFF
--                 and 0xD7FF-0xDFFF, otherwise the cached conversion
--     table    -> the cached conversions of all entries, concatenated
--     string   -> the cached conversion of utfbyte(k)
--     other    -> "FFFD"
--
-- Only plain numbers pass the range checks; entries of a table and the
-- string case are looked up in the cache directly.
local function tounicode(k)
    local kind = type(k)
    if kind == "number" then
        local private   = (k >= 0x00E000 and k <= 0x00F8FF)
                       or (k >= 0x0F0000 and k <= 0x0FFFFF)
                       or (k >= 0x100000 and k <= 0x10FFFF)
     -- local surrogate = k >= 0x00D800 and k <= 0x00DFFF
        local surrogate = k >= 0x00D7FF and k <= 0x00DFFF
        if private or surrogate then
            return s_unknown
        end
        return hash[k]
    elseif kind == "table" then
        local n = #k
        for l=1,n do
            conc[l] = hash[k[l]]
        end
        -- conc is shared, so only the first n slots belong to this call
        return concat(conc,"",1,n)
    elseif kind == "string" then
        return hash[utfbyte(k)]
    else
        return s_unknown
    end
end
144
-- Converts hexadecimal UTF-16 code units back into a code point.
--
-- str     : four hex digits (one code unit) or eight (a surrogate pair)
-- returns : the numeric code point
--
-- Every length other than four is handled as a pair, of which the first
-- eight characters are used.
local function fromunicode16(str)
    if #str ~= 4 then
        local high, low = match(str,"(....)(....)")
        local lead  = hextointeger(high) - 0xD800
        local trail = hextointeger(low)  - 0xDC00
        return 0x10000 + lead*0x400 + trail
    end
    return hextointeger(str)
end
153
154-- Slightly slower:
155--
156-- local p = C(4) * (C(4)^-1) / function(l,r)
157--     if r then
158--         return (hextointeger(l))*0x400  + hextointeger(r) - 0xDC00
159--     else
160--         return hextointeger(l)
161--     end
162-- end
163--
164-- local function fromunicode16(str)
165--     return lpegmatch(p,str)
166-- end
167
-- public interface: the converters and the parser factory

mappings.tounicode           = tounicode
mappings.tounicode16         = tounicode16
mappings.tounicode16sequence = tounicode16sequence
mappings.fromunicode16       = fromunicode16
mappings.makenameparser      = makenameparser
173
174-- mozilla emoji has bad lig names: name = gsub(name,"(u[a-f0-9_]+)%-([a-f0-9_]+)","%1_%2")
175
-- Splits a glyph name into its components: parts are separated by "_" and
-- collecting stops at the first "." so a variant suffix is not captured. The
-- match gives a table of strings, or nil when the name starts with "_" or ".".
local ligseparator = P("_")
local varseparator = P(".")
local namepart     = C((1 - ligseparator - varseparator)^1)
local namesplitter = Ct(namepart * (ligseparator * namepart)^0)
179
180-- maybe: ff fi fl ffi ffl => f_f f_i f_l f_f_i f_f_l
181
182-- local function test(name)
183--     local split = lpegmatch(namesplitter,name)
184--     print(string.formatters["%s: [% t]"](name,split))
185-- end
186
187-- test("i.f_")
188-- test("this")
189-- test("this.that")
190-- test("japan1.123")
191-- test("such_so_more")
192-- test("such_so_more.that")
193
-- to be completed .. for fonts that use unicodes for ligatures, which
-- is actually a bad thing and should be avoided in the first place
196
do

    -- Ligature like glyphs and the code points they stand for. An entry can
    -- be found by its short name, by its component name and, when there is
    -- one, by the ligature code point in the mess field.

    local ligatures = {
        IJ  = { name = "I_J",   unicode = { 0x49, 0x4A },       mess = 0x0132 },
        ij  = { name = "i_j",   unicode = { 0x69, 0x6A },       mess = 0x0133 },
        ff  = { name = "f_f",   unicode = { 0x66, 0x66 },       mess = 0xFB00 },
        fi  = { name = "f_i",   unicode = { 0x66, 0x69 },       mess = 0xFB01 },
        fl  = { name = "f_l",   unicode = { 0x66, 0x6C },       mess = 0xFB02 },
        ffi = { name = "f_f_i", unicode = { 0x66, 0x66, 0x69 }, mess = 0xFB03 },
        ffl = { name = "f_f_l", unicode = { 0x66, 0x66, 0x6C }, mess = 0xFB04 },
        fj  = { name = "f_j",   unicode = { 0x66, 0x6A } },
        fk  = { name = "f_k",   unicode = { 0x66, 0x6B } },

     -- endash = { name = "endash", unicode = 0x2013, mess = 0x2013 },
     -- emdash = { name = "emdash", unicode = 0x2014, mess = 0x2014 },
    }

    local registry = allocate { }

    for shortname, entry in next, ligatures do
        registry[shortname] = entry
        if entry.name then
            registry[entry.name] = entry
        end
        if entry.mess then
            registry[entry.mess] = entry
        end
    end

    mappings.overloads = registry

end
231
-- Sets the unicode field of glyph descriptions for which a code point (or a
-- list of them, in case of a ligature) can be found.
--
-- data           : font data; resources.unicodes, properties and descriptions are used
-- filename       : only used in trace messages
-- checklookups   : optional function, called as checklookups(data,missing,nofmissing)
--                  after the name based pass
-- forceligatures : when true (or when the fonts.mapping.forceligatures directive
--                  is set) every private slot with collected components gets
--                  resolved, otherwise only glyphs of class "ligature" that
--                  have no unicode yet
--
-- Nothing is returned; the descriptions and resources.unicodes are updated in
-- place. Without resources.unicodes the function quits directly.
--
-- For named glyphs in a private slot, or without a usable unicode, these
-- sources are tried in order: the overloads, the agl and context vectors, the
-- cid map (when there is one), the components of the name and finally the
-- uniXXXX, uXXXXXX and indexN naming schemes.
function mappings.addtounicode(data,filename,checklookups,forceligatures)
    local resources = data.resources
    local unicodes  = resources.unicodes
    if not unicodes then
        if trace_mapping then
            report_fonts("no unicode list, quitting tounicode for %a",filename)
        end
        return
    end
    local properties    = data.properties
    local descriptions  = data.descriptions
    local overloads     = mappings.overloads
    -- we need to move this code
    unicodes['space']   = unicodes['space']  or 32
    unicodes['hyphen']  = unicodes['hyphen'] or 45
    unicodes['zwj']     = unicodes['zwj']    or 0x200D
    unicodes['zwnj']    = unicodes['zwnj']   or 0x200C
    --
    local private       = fonts.constructors and fonts.constructors.privateoffset or 0xF0000 -- 0x10FFFF
    local unicodevector = fonts.encodings.agl.unicodes or { } -- loaded runtime in context
    local contextvector = fonts.encodings.agl.ctxcodes or { } -- loaded runtime in context
    local missing       = { }
    local nofmissing    = 0
    local oparser       = nil
    local cidnames      = nil
    local cidcodes      = nil
    local cidinfo       = properties.cidinfo
    local usedmap       = cidinfo and fonts.cid.getmap(cidinfo)
    local uparser       = makenameparser() -- hm, every time?
    if usedmap then
        oparser  = makenameparser(cidinfo.ordering)
        cidnames = usedmap.names
        cidcodes = usedmap.unicodes
    end
    local ns = 0 -- glyphs that got a single code point
    local nl = 0 -- glyphs that got a list of code points
    --
    -- in order to avoid differences between runs due to hash randomization we
    -- run over a sorted list
    --
    local dlist = sortedkeys(descriptions)
    --
 -- for du, glyph in next, descriptions do
    for i=1,#dlist do
        local du    = dlist[i]
        local glyph = descriptions[du]
        local name  = glyph.name
        if name then
            local overload = overloads[name] or overloads[du]
            if overload then
                -- get rid of weird ligatures
             -- glyph.name    = overload.name
                glyph.unicode = overload.unicode
            else
                local gu = glyph.unicode -- can already be set (number or table)
                if not gu or gu == -1 or du >= private or (du >= 0xE000 and du <= 0xF8FF) or du == 0xFFFE or du == 0xFFFF then
                    local unicode = unicodevector[name] or contextvector[name]
                    if unicode then
                        glyph.unicode = unicode
                        ns            = ns + 1
                    end
                    -- cidmap heuristics, beware, there is no guarantee for a match unless
                    -- the chain resolves
                    if (not unicode) and usedmap then
                        local foundindex = lpegmatch(oparser,name)
                        if foundindex then
                            unicode = cidcodes[foundindex] -- name to number
                            if unicode then
                                glyph.unicode = unicode
                                ns            = ns + 1
                            else
                                local reference = cidnames[foundindex] -- number to name
                                if reference then
                                    local foundindex = lpegmatch(oparser,reference)
                                    if foundindex then
                                        unicode = cidcodes[foundindex]
                                        if unicode then
                                            glyph.unicode = unicode
                                            ns            = ns + 1
                                        end
                                    end
                                    if not unicode or unicode == "" then
                                        local foundcodes, multiple = lpegmatch(uparser,reference)
                                        if foundcodes then
                                            glyph.unicode = foundcodes
                                            if multiple then
                                                nl      = nl + 1
                                                unicode = true
                                            else
                                                ns      = ns + 1
                                                unicode = foundcodes
                                            end
                                        end
                                    end
                                end
                            end
                        end
                    end
                    -- a.whatever or a_b_c.whatever or a_b_c (no numbers) a.b_
                    --
                    -- It is not trivial to find a solution that suits all fonts. We tried several alternatives
                    -- and this one seems to work reasonable also with fonts that use less standardized naming
                    -- schemes. The extra private test is tested by KE and seems to work okay with non-typical
                    -- fonts as well.
                    --
                    if not unicode or unicode == "" then
                        local split    = lpegmatch(namesplitter,name)
                        local nsplit   = split and #split or 0 -- add if
                        local resolved = nil -- what the name components gave us, if anything
                        if nsplit == 0 then
                            -- skip
                        elseif nsplit == 1 then
                            local base = split[1]
                            -- NOTE(review): the last lookup uses name and not base, while
                            -- contextvector[name] was already tried above; probably base
                            -- was meant here, to be confirmed before changing it
                            local u = unicodes[base] or unicodevector[base] or contextvector[name]
                            if not u then
                                -- skip
                            elseif type(u) == "table" then
                                -- unlikely
                                if u[1] < private then
                                    resolved = u
                                end
                            elseif u < private then
                                resolved = u
                            end
                        else
                            local t = { }
                            local n = 0
                            for l=1,nsplit do
                                local base = split[l]
                                -- NOTE(review): same remark about name versus base as above
                                local u = unicodes[base] or unicodevector[base] or contextvector[name]
                                if not u then
                                    break
                                elseif type(u) == "table" then
                                    if u[1] >= private then
                                        break
                                    end
                                    n = n + 1
                                    t[n] = u[1]
                                else
                                    if u >= private then
                                        break
                                    end
                                    n = n + 1
                                    t[n] = u
                                end
                            end
                            if n == 1 then
                                resolved = t[1]
                            elseif n > 1 then
                                resolved = t
                            end
                        end
                        if resolved then
                            unicode       = resolved
                            glyph.unicode = resolved
                            -- only count what was really added (this used to bump the
                            -- ligature counter for each glyph that came here)
                            if type(resolved) == "table" then
                                nl = nl + 1
                            else
                                ns = ns + 1
                            end
                        end
                    end
                    -- last resort (we might need to catch private here as well)
                    if not unicode or unicode == "" then
                        local foundcodes, multiple = lpegmatch(uparser,name)
                        if foundcodes then
                            glyph.unicode = foundcodes
                            if multiple then
                                nl      = nl + 1
                                unicode = true
                            else
                                ns      = ns + 1
                                unicode = foundcodes
                            end
                        end
                    end
                    -- check using substitutes and alternates
                    local r = overloads[unicode]
                    if r then
                        unicode = r.unicode
                        glyph.unicode = unicode
                    end
                    --
                    if not unicode then
                        missing[du] = true
                        nofmissing  = nofmissing + 1
                    end
                else
                     -- maybe a message or so
                end
            end
        else
            local overload = overloads[du]
            if overload then
                glyph.unicode = overload.unicode
            elseif not glyph.unicode then
                missing[du] = true
                nofmissing  = nofmissing + 1
            end
        end
    end
    if type(checklookups) == "function" then
        checklookups(data,missing,nofmissing)
    end

    local unicoded  = 0
    local collected = fonts.handlers.otf.readers.getcomponents(data) -- neglectable overhead

    -- Use the collected components as unicode of the glyph, unless one of
    -- them is a private slot itself. The test is >= (it was >) so that it
    -- agrees with the other private checks in this function.
    local function resolve(glyph,u)
        local n = #u
        for i=1,n do
            if u[i] >= private then
                n = 0
                break
            end
        end
        if n > 0 then
            if n > 1 then
                glyph.unicode = u
            else
                glyph.unicode = u[1]
            end
            unicoded = unicoded + 1
        end
    end

    if not collected then
        -- move on
    elseif forceligatures or force_ligatures then
        for i=1,#dlist do
            local du = dlist[i]
            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
                local u  = collected[du] -- always tables
                if u then
                    resolve(descriptions[du],u)
                end
            end
        end
    else
        for i=1,#dlist do
            local du = dlist[i]
            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
                local glyph = descriptions[du]
                if glyph.class == "ligature" and not glyph.unicode then
                    local u = collected[du] -- always tables
                    if u then
                         resolve(glyph,u)
                    end
                end
            end
        end
    end

    if trace_mapping and unicoded > 0 then
        report_fonts("%n ligature tounicode mappings deduced from gsub ligature features",unicoded)
    end
    if trace_mapping then
     -- for unic, glyph in sortedhash(descriptions) do
        for i=1,#dlist do
            local du      = dlist[i]
            local glyph   = descriptions[du]
            local name    = glyph.name or "-"
            local index   = glyph.index or 0
            local unicode = glyph.unicode
            if unicode then
                if type(unicode) == "table" then
                    local unicodes = { }
                    for i=1,#unicode do
                        unicodes[i] = formatters("%U",unicode[i])
                    end
                    report_fonts("internal slot %U, name %a, unicode %U, tounicode % t",index,name,du,unicodes)
                else
                    report_fonts("internal slot %U, name %a, unicode %U, tounicode %U",index,name,du,unicode)
                end
            else
                report_fonts("internal slot %U, name %a, unicode %U",index,name,du)
            end
        end
    end
    if trace_loading and (ns > 0 or nl > 0) then
        -- the second number is the ligature count, so nl (this was ns)
        report_fonts("%s tounicode entries added, ligatures %s",nl+ns,nl)
    end
end
511
512-- local parser = makenameparser("Japan1")
513-- local parser = makenameparser()
514-- local function test(str)
515--     local b, a = lpegmatch(parser,str)
516--     print((a and table.serialize(b)) or b)
517-- end
518-- test("a.sc")
519-- test("a")
520-- test("uni1234")
521-- test("uni1234.xx")
522-- test("uni12349876")
523-- test("u123400987600")
524-- test("index1234")
525-- test("Japan1.123")
526