font-map.lua /size: 19 Kb    last modification: 2023-12-21 09:44
1if not modules then modules = { } end modules ['font-map'] = {
2    version   = 1.001,
3    optimize  = true,
4    comment   = "companion to font-ini.mkiv",
5    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
6    copyright = "PRAGMA ADE / ConTeXt Development Team",
7    license   = "see context related readme files"
8}
9
10local tonumber, next, type = tonumber, next, type
11
12local match, format, find, concat, gsub, lower = string.match, string.format, string.find, table.concat, string.gsub, string.lower
13local P, R, S, C, Ct, Cc, lpegmatch = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Ct, lpeg.Cc, lpeg.match
14local formatters = string.formatters
15local sortedhash, sortedkeys = table.sortedhash, table.sortedkeys
16local idiv = number.idiv
17
-- Tracing switches: each registered callback stores the new state in the matching local.

local trace_loading = false
local trace_mapping = false

trackers.register("fonts.loading", function(v)
    trace_loading = v
end)

trackers.register("fonts.mapping", function(v)
    trace_mapping = v
end)

local report_fonts = logs.reporter("fonts","loading") -- not otf only

-- Forcing ligatures was enabled for a while so that emoji with bad glyph names work too. It is
-- off by default and can be switched on with the directive registered below.

local force_ligatures = false

directives.register("fonts.mapping.forceligatures", function(v)
    force_ligatures = v
end)

local fonts    = fonts or { }
local mappings = fonts.mappings or { }

fonts.mappings = mappings

local allocate = utilities.storage.allocate
32
-- Glyph name parsers. Every alternative produces the captured code(s) followed by a boolean:
-- false when a single number was captured, true when a table with several numbers was.

local function fromhex(s)
    return tonumber(s,16)
end

local hex      = R("AF","af","09")
local hexfour  = (hex * hex * hex^-2) / fromhex -- two up to four hex digits
local hexsix   = (hex * hex * hex^-4) / fromhex -- two up to six hex digits
local dec      = R("09")^1 / tonumber
local period   = P(".")
local nameend  = period + P(-1)                 -- a suffix starts or the name ends

local unicode  = (P("uni") + P("UNI")) * (hexfour * nameend * Cc(false) + Ct(hexfour^1) * Cc(true)) -- base planes
local ucode    = (P("u")   + P("U")  ) * (hexsix  * nameend * Cc(false) + Ct(hexsix ^1) * Cc(true)) -- extended
local index    = P("index") * dec * Cc(false)

-- The generic parser tries the uniXXXX form first, then uXXXXXX, then indexNNN; the cache
-- below holds the ordering specific parsers made by makenameparser.

local parser   = unicode + ucode + index
local parsers  = { }
44
-- Returns the lpeg pattern used to interpret glyph names. Without an ordering (nil or an empty
-- string) the generic parser is returned. Otherwise we return a pattern that matches the given
-- string, a period and a decimal number and that yields that number followed by false. These
-- patterns are created once per ordering and kept in the parsers cache.

local function makenameparser(str)
    if str and str ~= "" then
        local cached = parsers[str]
        if cached then
            return cached
        end
        local created = P(str) * period * dec * Cc(false)
        parsers[str] = created
        return created
    end
    return parser
end
57
-- Shared by the converters below: the string returned for code points that we don't want to
-- map (FFFD is the code of the Unicode replacement character) and the formatters for one and
-- two four digit uppercase hexadecimal numbers.

local s_unknown = "FFFD"
local f_double  = formatters["%04X%04X"]
local f_single  = formatters["%04X"]
61
-- Converts one code point (a number) into the hexadecimal string of its UTF-16 code unit(s):
-- four digits for the base plane, eight digits (a surrogate pair) above it.
--
-- The string FFFD is returned for 0xD7FF upto 0xDFFF and for the two private planes 0xF0000
-- upto 0x10FFFF. The lower bound 0xD7FF is the same one that tounicode16sequence and tounicode
-- use; before, 0xD7FF itself slipped through all tests and ended up in the surrogate pair
-- branch with a negative offset, which gave the bogus result D7F5DFFF.
--
-- Private use code points in the base plane (0xE000 upto 0xF8FF) are passed as they are: the
-- first test already accepts them, so the separate branch that used to follow it could never
-- be reached and has been dropped.

local function tounicode16(unicode)
    if unicode < 0xD7FF or (unicode > 0xDFFF and unicode <= 0xFFFF) then
        return f_single(unicode)
    elseif unicode >= 0x00D7FF and unicode <= 0x00DFFF then
        return s_unknown
    elseif unicode >= 0x0F0000 and unicode <= 0x0FFFFF then
        return s_unknown
    elseif unicode >= 0x100000 and unicode <= 0x10FFFF then
        return s_unknown
    else
        -- split the 20 bit offset into a high and a low surrogate
        unicode = unicode - 0x10000
        return f_double(idiv(unicode,0x400)+0xD800,unicode%0x400+0xDC00)
    end
end
78
-- Converts a list of code points into one string: the concatenation of the hexadecimal UTF-16
-- representation of each entry (four digits for the base plane, a surrogate pair of eight
-- digits above it). Entries in the range 0xD7FF upto 0xDFFF and in the private planes 0xF0000
-- upto 0x10FFFF become FFFD. An empty list gives an empty string.
--
-- The range tests used to look at 'unicode', which in this file is the lpeg pattern for glyph
-- names and not the current entry, so every entry that failed the first test triggered a
-- comparison between a userdata and a number (an error). They now test the entry 'u' itself.
--
-- Private use code points in the base plane (0xE000 upto 0xF8FF) are accepted by the first
-- test, so there is no separate branch for them.

local function tounicode16sequence(unicodes)
    local t = { }
    for l=1,#unicodes do
        local u = unicodes[l]
        if u < 0xD7FF or (u > 0xDFFF and u <= 0xFFFF) then
            t[l] = f_single(u)
        elseif u >= 0x00D7FF and u <= 0x00DFFF then
            t[l] = s_unknown
        elseif u >= 0x0F0000 and u <= 0x0FFFFF then
            t[l] = s_unknown
        elseif u >= 0x100000 and u <= 0x10FFFF then
            t[l] = s_unknown
        else
            -- split the 20 bit offset into a high and a low surrogate
            u = u - 0x10000
            t[l] = f_double(idiv(u,0x400)+0xD800,u%0x400+0xDC00)
        end
    end
    return concat(t)
end
101
102
-- The hash table memoizes the hexadecimal UTF-16 string per code point: the function below is
-- installed with table.setmetatableindex, computes the string for a key and stores it in the
-- table before returning it. The conc table is a scratch list that tounicode reuses when it
-- converts a sequence.
--
-- Keys in the range 0xD7FF upto 0xDFFF give FFFD, just like tounicode does for single values.
-- Before, such keys (which can arrive here as members of a sequence) ended up in the surrogate
-- pair branch with a negative offset and a bogus string was cached for them.

local hash = { }
local conc = { }

table.setmetatableindex(hash,function(t,k)
    local v
    if k < 0xD7FF or (k > 0xDFFF and k <= 0xFFFF) then
        v = f_single(k)
    elseif k >= 0xD7FF and k <= 0xDFFF then
        v = s_unknown
    else
        -- split the 20 bit offset into a high and a low surrogate
        local o = k - 0x10000
        v = f_double(idiv(o,0x400)+0xD800,o%0x400+0xDC00)
    end
    t[k] = v
    return v
end)
117
-- Returns the hexadecimal UTF-16 string for a code point or for a list of code points. A list
-- is converted entry by entry via the hash (no range checks are applied to the entries) and
-- the results are concatenated. A single value gives FFFD when it is in one of the private
-- ranges (0xE000 upto 0xF8FF, 0xF0000 upto 0x10FFFF) or in the range 0xD7FF upto 0xDFFF.

local function tounicode(k)
    if type(k) ~= "table" then
        local unwanted =
               (k >= 0x00E000 and k <= 0x00F8FF)
            or (k >= 0x0F0000 and k <= 0x0FFFFF)
            or (k >= 0x100000 and k <= 0x10FFFF)
         -- or (k >= 0x00D800 and k <= 0x00DFFF)
            or (k >= 0x00D7FF and k <= 0x00DFFF)
        if unwanted then
            return s_unknown
        end
        return hash[k]
    end
    -- conc is shared between calls, so we only concatenate the slots that we just filled
    local size = #k
    for slot=1,size do
        conc[slot] = hash[k[slot]]
    end
    return concat(conc,"",1,size)
end
138
-- The reverse of the converters above: a string of four hexadecimal digits is returned as the
-- number it represents, anything else is taken to start with a surrogate pair (two times four
-- digits) that gets combined into one code point.

local function fromunicode16(str)
    if #str ~= 4 then
        local high, low = match(str,"(....)(....)")
        local lead  = tonumber(high,16) - 0xD800
        local trail = tonumber(low,16) - 0xDC00
        return 0x10000 + lead*0x400 + trail
    end
    return tonumber(str,16)
end
147
148-- Slightly slower:
149--
150-- local p = C(4) * (C(4)^-1) / function(l,r)
151--     if r then
152--         return (tonumber(l,16))*0x400  + tonumber(r,16) - 0xDC00
153--     else
154--         return tonumber(l,16)
155--     end
156-- end
157--
158-- local function fromunicode16(str)
159--     return lpegmatch(p,str)
160-- end
161
-- The converters and the parser factory are made available in the fonts.mappings namespace.

mappings.tounicode           = tounicode
mappings.tounicode16         = tounicode16
mappings.tounicode16sequence = tounicode16sequence
mappings.fromunicode16       = fromunicode16
mappings.makenameparser      = makenameparser
167
168-- mozilla emoji has bad lig names: name = gsub(name,"(u[a-f0-9_]+)%-([a-f0-9_]+)","%1_%2")
169
-- Splits a glyph name into its ligature components: parts separated by underscores, where
-- the splitting stops at the first period (or at a trailing underscore).

local ligseparator = P("_")
local varseparator = P(".")
local namepart     = C((1 - ligseparator - varseparator)^1)
local namesplitter = Ct(namepart * (ligseparator * namepart)^0)
173
174-- maybe: ff fi fl ffi ffl => f_f f_i f_l f_f_i f_f_l
175
176-- local function test(name)
177--     local split = lpegmatch(namesplitter,name)
178--     print(string.formatters["%s: [% t]"](name,split))
179-- end
180
181-- test("i.f_")
182-- test("this")
183-- test("this.that")
184-- test("japan1.123")
185-- test("such_so_more")
186-- test("such_so_more.that")
187
-- To be completed: overloads for fonts that use unicodes for ligatures, which
-- is actually a bad thing and should be avoided in the first place.
190
do

    -- Each entry gives the preferred name and the component code points of a ligature; the
    -- optional mess field is the single code point that some fonts use for it.

    local known = {
        IJ  = { name = "I_J",   unicode = { 0x49, 0x4A },       mess = 0x0132 },
        ij  = { name = "i_j",   unicode = { 0x69, 0x6A },       mess = 0x0133 },
        ff  = { name = "f_f",   unicode = { 0x66, 0x66 },       mess = 0xFB00 },
        fi  = { name = "f_i",   unicode = { 0x66, 0x69 },       mess = 0xFB01 },
        fl  = { name = "f_l",   unicode = { 0x66, 0x6C },       mess = 0xFB02 },
        ffi = { name = "f_f_i", unicode = { 0x66, 0x66, 0x69 }, mess = 0xFB03 },
        ffl = { name = "f_f_l", unicode = { 0x66, 0x66, 0x6C }, mess = 0xFB04 },
        fj  = { name = "f_j",   unicode = { 0x66, 0x6A } },
        fk  = { name = "f_k",   unicode = { 0x66, 0x6B } },

     -- endash = { name = "endash", unicode = 0x2013, mess = 0x2013 },
     -- emdash = { name = "emdash", unicode = 0x2014, mess = 0x2014 },
    }

    -- The same entry can be found by its key, by its name and (when given) by its code point.

    local o = allocate { }

    for key, entry in next, known do
        o[key] = entry
        if entry.name then
            o[entry.name] = entry
        end
        if entry.mess then
            o[entry.mess] = entry
        end
    end

    mappings.overloads = o

end
225
-- Adds the unicode field (a number, or a table of numbers for ligatures) to the glyph
-- descriptions of a font, so that later a tounicode mapping can be made.
--
-- data           : the font data; we use data.resources.unicodes (name to code), data.properties
--                  (for cidinfo) and data.descriptions (the glyphs, keyed by code)
-- filename       : only used in the trace message when there is no unicode list
-- checklookups   : when a function it is called as checklookups(data,missing,nofmissing) after
--                  the name based pass
-- forceligatures : when true (or when the forceligatures directive is set) every private slot
--                  with known components gets resolved, otherwise only glyphs of class ligature
--                  that have no unicode yet
--
-- Nothing is returned; when resources.unicodes is missing we quit without changing anything.
--
-- For glyphs without usable unicode (or in a private range) the following sources are tried in
-- order: the overloads, the agl and context name vectors, the cid map, the components of the
-- name (a_b_c.whatever), and finally names like uniXXXX, uXXXXXX and indexNNN.
--
-- Compared to the previous version:
--
-- * name components are looked up in the context vector by component and not by the complete
--   glyph name (which at that point is already known to be absent)
-- * ns counts single mappings and nl counts ligature mappings that were really added; before nl
--   was incremented for every name that entered the component heuristic and the report showed
--   ns as the number of ligatures
-- * the component check in resolve uses >= private, like all other private tests here

function mappings.addtounicode(data,filename,checklookups,forceligatures)
    local resources = data.resources
    local unicodes  = resources.unicodes
    if not unicodes then
        if trace_mapping then
            report_fonts("no unicode list, quitting tounicode for %a",filename)
        end
        return
    end
    local properties    = data.properties
    local descriptions  = data.descriptions
    local overloads     = mappings.overloads
    -- we need to move this code
    unicodes['space']   = unicodes['space']  or 32
    unicodes['hyphen']  = unicodes['hyphen'] or 45
    unicodes['zwj']     = unicodes['zwj']    or 0x200D
    unicodes['zwnj']    = unicodes['zwnj']   or 0x200C
    --
    local private       = fonts.constructors and fonts.constructors.privateoffset or 0xF0000 -- 0x10FFFF
    local unicodevector = fonts.encodings.agl.unicodes or { } -- loaded runtime in context
    local contextvector = fonts.encodings.agl.ctxcodes or { } -- loaded runtime in context
    local missing       = { }
    local nofmissing    = 0
    local oparser       = nil
    local cidnames      = nil
    local cidcodes      = nil
    local cidinfo       = properties.cidinfo
    local usedmap       = cidinfo and fonts.cid.getmap(cidinfo)
    local uparser       = makenameparser() -- hm, every time?
    if usedmap then
        oparser  = makenameparser(cidinfo.ordering)
        cidnames = usedmap.names
        cidcodes = usedmap.unicodes
    end
    local ns = 0 -- number of single mappings added
    local nl = 0 -- number of ligature (multiple code) mappings added
    --
    -- in order to avoid differences between runs due to hash randomization we
    -- run over a sorted list
    --
    local dlist = sortedkeys(descriptions)
    --
 -- for du, glyph in next, descriptions do
    for i=1,#dlist do
        local du    = dlist[i]
        local glyph = descriptions[du]
        local name  = glyph.name
        if name then
            local overload = overloads[name] or overloads[du]
            if overload then
                -- get rid of weird ligatures
             -- glyph.name    = overload.name
                glyph.unicode = overload.unicode
            else
                local gu = glyph.unicode -- can already be set (number or table)
                if not gu or gu == -1 or du >= private or (du >= 0xE000 and du <= 0xF8FF) or du == 0xFFFE or du == 0xFFFF then
                    local unicode = unicodevector[name] or contextvector[name]
                    if unicode then
                        glyph.unicode = unicode
                        ns            = ns + 1
                    end
                    -- cidmap heuristics, beware, there is no guarantee for a match unless
                    -- the chain resolves
                    if (not unicode) and usedmap then
                        local foundindex = lpegmatch(oparser,name)
                        if foundindex then
                            unicode = cidcodes[foundindex] -- name to number
                            if unicode then
                                glyph.unicode = unicode
                                ns            = ns + 1
                            else
                                local reference = cidnames[foundindex] -- number to name
                                if reference then
                                    local foundindex = lpegmatch(oparser,reference)
                                    if foundindex then
                                        unicode = cidcodes[foundindex]
                                        if unicode then
                                            glyph.unicode = unicode
                                            ns            = ns + 1
                                        end
                                    end
                                    if not unicode or unicode == "" then
                                        local foundcodes, multiple = lpegmatch(uparser,reference)
                                        if foundcodes then
                                            glyph.unicode = foundcodes
                                            if multiple then
                                                nl      = nl + 1
                                                unicode = true
                                            else
                                                ns      = ns + 1
                                                unicode = foundcodes
                                            end
                                        end
                                    end
                                end
                            end
                        end
                    end
                    -- a.whatever or a_b_c.whatever or a_b_c (no numbers) a.b_
                    --
                    -- It is not trivial to find a solution that suits all fonts. We tried several alternatives
                    -- and this one seems to work reasonable also with fonts that use less standardized naming
                    -- schemes. The extra private test is tested by KE and seems to work okay with non-typical
                    -- fonts as well.
                    --
                    if not unicode or unicode == "" then
                        local split  = lpegmatch(namesplitter,name)
                        local nsplit = split and #split or 0 -- add if
                        if nsplit == 0 then
                            -- skip
                        elseif nsplit == 1 then
                            local base = split[1]
                            local u = unicodes[base] or unicodevector[base] or contextvector[base]
                            if not u then
                                -- skip
                            elseif type(u) == "table" then
                                -- unlikely
                                if u[1] < private then
                                    unicode = u
                                    glyph.unicode = unicode
                                end
                            elseif u < private then
                                unicode = u
                                glyph.unicode = unicode
                            end
                        else
                            local t = { }
                            local n = 0
                            for l=1,nsplit do
                                local base = split[l]
                                local u = unicodes[base] or unicodevector[base] or contextvector[base]
                                if not u then
                                    break
                                elseif type(u) == "table" then
                                    if u[1] >= private then
                                        break
                                    end
                                    n = n + 1
                                    t[n] = u[1]
                                else
                                    if u >= private then
                                        break
                                    end
                                    n = n + 1
                                    t[n] = u
                                end
                            end
                            if n > 0 then
                                if n == 1 then
                                    unicode = t[1]
                                else
                                    unicode = t
                                end
                                glyph.unicode = unicode
                            end
                        end
                        -- we entered this branch without a usable unicode, so a number or
                        -- table at this point means that the heuristic added a mapping
                        local resolved = type(unicode)
                        if resolved == "table" then
                            nl = nl + 1
                        elseif resolved == "number" then
                            ns = ns + 1
                        end
                    end
                    -- last resort (we might need to catch private here as well)
                    if not unicode or unicode == "" then
                        local foundcodes, multiple = lpegmatch(uparser,name)
                        if foundcodes then
                            glyph.unicode = foundcodes
                            if multiple then
                                nl      = nl + 1
                                unicode = true
                            else
                                ns      = ns + 1
                                unicode = foundcodes
                            end
                        end
                    end
                    -- check using substitutes and alternates
                    local r = overloads[unicode]
                    if r then
                        unicode = r.unicode
                        glyph.unicode = unicode
                    end
                    --
                    if not unicode then
                        missing[du] = true
                        nofmissing  = nofmissing + 1
                    end
                else
                     -- maybe a message or so
                end
            end
        else
            local overload = overloads[du]
            if overload then
                glyph.unicode = overload.unicode
            elseif not glyph.unicode then
                missing[du] = true
                nofmissing  = nofmissing + 1
            end
        end
    end
    if type(checklookups) == "function" then
        checklookups(data,missing,nofmissing)
    end

    local unicoded  = 0
    local collected = fonts.handlers.otf.readers.getcomponents(data) -- neglectable overhead

    -- Assigns the components in u to the glyph, unless one of them is a private slot or
    -- the list is empty; a list with one entry is stored as a number.

    local function resolve(glyph,u)
        local n = #u
        for i=1,n do
            if u[i] >= private then
                n = 0
                break
            end
        end
        if n > 0 then
            if n > 1 then
                glyph.unicode = u
            else
                glyph.unicode = u[1]
            end
            unicoded = unicoded + 1
        end
    end

    if not collected then
        -- move on
    elseif forceligatures or force_ligatures then
        for i=1,#dlist do
            local du = dlist[i]
            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
                local u  = collected[du] -- always tables
                if u then
                    resolve(descriptions[du],u)
                end
            end
        end
    else
        for i=1,#dlist do
            local du = dlist[i]
            if du >= private or (du >= 0xE000 and du <= 0xF8FF) then
                local glyph = descriptions[du]
                if glyph.class == "ligature" and not glyph.unicode then
                    local u = collected[du] -- always tables
                    if u then
                         resolve(glyph,u)
                    end
                end
            end
        end
    end

    if trace_mapping and unicoded > 0 then
        report_fonts("%n ligature tounicode mappings deduced from gsub ligature features",unicoded)
    end
    if trace_mapping then
     -- for unic, glyph in sortedhash(descriptions) do
        for i=1,#dlist do
            local du      = dlist[i]
            local glyph   = descriptions[du]
            local name    = glyph.name or "-"
            local index   = glyph.index or 0
            local unicode = glyph.unicode
            if unicode then
                if type(unicode) == "table" then
                    local unicodes = { }
                    for i=1,#unicode do
                        unicodes[i] = formatters("%U",unicode[i])
                    end
                    report_fonts("internal slot %U, name %a, unicode %U, tounicode % t",index,name,du,unicodes)
                else
                    report_fonts("internal slot %U, name %a, unicode %U, tounicode %U",index,name,du,unicode)
                end
            else
                report_fonts("internal slot %U, name %a, unicode %U",index,name,du)
            end
        end
    end
    if trace_loading and (ns > 0 or nl > 0) then
        report_fonts("%s tounicode entries added, ligatures %s",nl+ns,nl)
    end
end
505
506-- local parser = makenameparser("Japan1")
507-- local parser = makenameparser()
508-- local function test(str)
509--     local b, a = lpegmatch(parser,str)
510--     print((a and table.serialize(b)) or b)
511-- end
512-- test("a.sc")
513-- test("a")
514-- test("uni1234")
515-- test("uni1234.xx")
516-- test("uni12349876")
517-- test("u123400987600")
518-- test("index1234")
519-- test("Japan1.123")
520