lang-hyp.lua /size: 67 Kb    last modification: 2021-10-28 13:50
1if not modules then modules = { } end modules ['lang-hyp'] = {
2    version   = 1.001,
3    comment   = "companion to lang-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
-- In an automated workflow hyphenation of long titles can be somewhat problematic
10-- especially when demands conflict. For that reason I played a bit with a Lua based
11-- variant of the traditional hyphenation machinery. This mechanism has been extended
12-- several times in projects, of which a good description can be found in TUGboat,
13-- Volume 27 (2006), No. 2 — Proceedings of EuroTEX2006: Automatic non-standard
14-- hyphenation in OpenOffice.org by László Németh.
15--
16-- Being the result of two days experimenting the following implementation is probably
17-- not completely okay yet. If there is demand I might add some more features and plugs.
-- The performance is quite okay but can probably be improved a bit, although this is not
-- the most critical code. For instance, on a metafun manual run the overhead is about
20-- 0.3 seconds on 19 seconds which is not that bad.
21--
-- In the process of wrapping up (for the ctx conference proceedings) I cleaned up
23-- and extended the code a bit. It can be used in production.
24--
25-- . a l g o r i t h m .
26--    4l1g4
27--     l g o3
28--      1g o
29--            2i t h
30--                4h1m
31-- ---------------------
32--    4 1 4 3 2 0 4 1
33--   a l-g o-r i t h-m
34
35-- . a s s z o n n y a l .
36--     s1s z/sz=sz,1,3
37--             n1n y/ny=ny,1,3
38-- -----------------------
39--    0 1 0 0 0 1 0 0 0/sz=sz,2,3,ny=ny,6,3
40--   a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3
41--
-- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length)
43--
44-- todo  : support hjcodes (<32 == length) like luatex does now (no need/demand so far)
-- maybe : support hyphenation over range (can already be done using attributes/language)
46-- maybe : reset dictionary.hyphenated when a pattern is added and/or forced reset option
47-- todo  : check subtypes (because they have subtle meanings in the line breaking)
48--
49-- word start (in tex engine):
50--
51-- boundary  : yes when wordboundary
52-- hlist     : when hyphenationbounds 1 or 3
53-- vlist     : when hyphenationbounds 1 or 3
54-- rule      : when hyphenationbounds 1 or 3
55-- dir       : when hyphenationbounds 1 or 3
56-- whatsit   : when hyphenationbounds 1 or 3
57-- glue      : yes
58-- math      : skipped
59-- glyph     : exhyphenchar (one only) : yes (so no -- ---)
60-- otherwise : yes
61--
62-- word end (in tex engine):
63--
64-- boundary  : yes
65-- glyph     : yes when different language
66-- glue      : yes
67-- penalty   : yes
68-- kern      : yes when not italic (for some historic reason)
69-- hlist     : when hyphenationbounds 2 or 3
70-- vlist     : when hyphenationbounds 2 or 3
71-- rule      : when hyphenationbounds 2 or 3
72-- dir       : when hyphenationbounds 2 or 3
73-- whatsit   : when hyphenationbounds 2 or 3
74-- ins       : when hyphenationbounds 2 or 3
75-- adjust    : when hyphenationbounds 2 or 3
76
77local type, rawget, rawset, tonumber, next = type, rawget, rawset, tonumber, next
78
79local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs
80local lpegmatch = lpeg.match
81
82local context    = context
83
84local concat     = table.concat
85local insert     = table.insert
86local remove     = table.remove
87local formatters = string.formatters
88local utfchar    = utf.char
89local utfbyte    = utf.byte
90
91if not characters then
92    require("char-ini")
93end
94
95local setmetatableindex = table.setmetatableindex
96
97-- \enabletrackers[hyphenator.steps=silent] will not write to the terminal
98
-- Tracker flags, changed at runtime by the setters registered below. Tracing code that
-- prints to the terminal compares trace_steps against true, so any other truthy value
-- still collects the steps but skips the report.
local trace_steps       = false  trackers.register("hyphenator.steps",    function(v) trace_steps     = v end)
local trace_visualize   = false  trackers.register("hyphenator.visualize",function(v) trace_visualize = v end)

-- Reporter used for the log messages of this module.
local report            = logs.reporter("hyphenator")

-- Falls back to a function that does nothing when interfaces is not available.
local implement         = interfaces and interfaces.implement or function() end

-- The namespaces are created on demand; tables that already exist are reused.
languages               = languages or { }
local hyphenators       = languages.hyphenators or { }
languages.hyphenators   = hyphenators
local traditional       = hyphenators.traditional or { }
hyphenators.traditional = traditional
111
-- One dictionary per language. A dictionary is created the first time its key is
-- accessed and is then stored in the table so the initializer runs once per key.
local dictionaries = setmetatableindex(function(all,language)
    local fresh = {
        loaded     = false,
        exceptions = { },
        specials   = { },
        hyphenated = { },
        patterns   = { },
    }
    all[language] = fresh
    return fresh
end)

hyphenators.dictionaries = dictionaries
125
-- Building blocks for parsing pattern and exception strings.

local character      = lpeg.patterns.utf8character
local digit          = R("09")
local weight         = digit/tonumber + Cc(0)     -- a digit as number, or 0 when there is none
local fence          = P(".")
local hyphen         = P("-")
local space          = P(" ")
local char           = character - space          -- any character but a space
local validcharacter = (character - S("./"))      -- neither a fence nor the start of a spec
local keycharacter   =  character - S("/")        -- everything up to the start of a spec
----- basepart       = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1)

-- The optional "/before=after" or "/before=after,start,length" suffix of a pattern. The
-- key/value groups are folded with rawset into one table that has the string fields
-- before and after and, when given, start and length (both passed through tonumber).
local specpart       = (P("/") * Cf ( Ct("") *
        Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) *
        Cg ( Cc("after")  * C((1-P(","))^1)  ) *
        (   P(",") *
            Cg ( Cc("start")  * ((1-P(","))^1/tonumber) * P(",") ) *
            Cg ( Cc("length") * ((1-P(-1) )^1/tonumber)          )
        )^-1
    , rawset))^-1

-- Hash key of a pattern: the pattern string up to a "/" with the digits removed.
local make_hashkey_p = Cs((digit/"" + keycharacter)^1)
----- make_pattern_p = basepart * specpart
-- Hash key of an exception: the exception string up to a "/" with the hyphens removed.
local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1)
-- Exception value: the first character produces no capture, every following character
-- produces true when it is preceded by a hyphen and false otherwise.
local make_pattern_e = Ct(P(char) * (hyphen * Cc(true) * P(char) + P(char) * Cc(false))^1) -- catch . and char after -
149
150-- local make_hashkey_c = Cs((digit + keycharacter/"")^1)
151-- local make_pattern_c = Ct((P(1)/tonumber)^1)
152
153-- local cache = setmetatableindex(function(t,k)
154--     local n = lpegmatch(make_hashkey_c,k)
155--     local v = lpegmatch(make_pattern_c,n)
156--     t[k] = v
157--     return v
158-- end)
159--
160-- local weight_n       = digit + Cc("0")
161-- local basepart_n     = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache
162-- local make_pattern_n = basepart_n * specpart
163
-- Converts a string into a table with one tonumber result per character (byte).
local make_pattern_c = Ct((P(1)/tonumber)^1)

-- us + nl: 17664 entries -> 827 unique (saves some 3M)

-- Memoizes the weight tables by their string form so that patterns with the same
-- sequence of weights share a single table.
local cache = setmetatableindex(function(t,k)
    local v = lpegmatch(make_pattern_c,k)
    t[k] = v
    return v
end)

-- The pattern is first rewritten into a string: a present digit is kept, a missing weight
-- becomes "0", a fence becomes "0" and the characters themselves are dropped. That string
-- is then looked up in the cache above, which yields the (shared) table of weights.
local weight_n       = digit + Cc("0")
local fence_n        = fence / "0"
local char_n         = validcharacter / ""
local basepart_n     = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache
-- Returns the weight table and, when the pattern has a "/..." suffix, the spec table.
local make_pattern_n = basepart_n * specpart
179
-- Stores one pattern in a dictionary.
--
-- patterns      : table that maps a hash key (pattern without digits) onto weights
-- specials      : table that maps the same hash key onto a replacement spec
-- str           : the pattern, optionally followed by /before=after[,start,length]
-- specification : optional spec that takes precedence over the one embedded in str
local function register_pattern(patterns,specials,str,specification)
    local hashkey = lpegmatch(make_hashkey_p,str)
    local weights, embedded = lpegmatch(make_pattern_n,str)
    local special = specification or embedded
    patterns[hashkey] = weights -- is this key still ok for complex patterns
    if special then
        specials[hashkey] = special
    end
end
191
-- Removes a pattern and its replacement spec (if any) from a dictionary. The entry is
-- located by the hash key of str, that is: the pattern with its digits removed.
local function unregister_pattern(patterns,specials,str)
    local hashkey = lpegmatch(make_hashkey_p,str)
    specials[hashkey] = nil
    patterns[hashkey] = nil
end
197
local p_lower = lpeg.patterns.utf8lower

-- Stores one exception (a word with explicit hyphens) in the exceptions table. The word
-- is run through p_lower first; the key is the result with the hyphens removed and the
-- value is the table of booleans made by make_pattern_e. The specification argument is
-- accepted but not used.
local function register_exception(exceptions,str,specification)
    local lowered = lpegmatch(p_lower,str)
    local hashkey = lpegmatch(make_hashkey_e,lowered)
    exceptions[hashkey] = lpegmatch(make_pattern_e,lowered)
end
206
-- Loaders for a blob of data: every run of characters other than a space is passed to
-- the register function together with the extra arguments given to lpegmatch (the
-- patterns and specials tables, or the exceptions table); anything else is skipped.
local p_pattern   = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern   + 1)^1
local p_exception = ((Carg(1)           * C(char^1)) / register_exception + 1)^1
-- Splits a string into a table of utf characters.
local p_split     = Ct(C(character)^1)
210
-- Loads the patterns and exceptions of a language from a Lua file and returns the
-- dictionary of that language.
--
-- language : key into the dictionaries table
-- filename : optional file name, defaults to "lang-<language>"; the "lua" suffix is added
--
-- The file is looked up only once per dictionary: the dictionary is flagged as loaded
-- afterwards, also when no file was found or when it returned nothing useful.
function traditional.loadpatterns(language,filename)
    local dictionary = dictionaries[language]
    if dictionary.loaded then
        return dictionary
    end
    if not filename or filename == "" then
        filename = "lang-" .. language
    end
    filename = file.addsuffix(filename,"lua")
    local fullname      = resolvers.findfile(filename)
    local specification = fullname and fullname ~= "" and dofile(fullname)
    if specification then
        local patterns    = specification.patterns
        local patterndata = patterns and patterns.data
        if patterndata and patterndata ~= "" then
            lpegmatch(p_pattern,patterndata,1,dictionary.patterns,dictionary.specials)
        end
        local exceptions    = specification.exceptions
        local exceptiondata = exceptions and exceptions.data
        if exceptiondata and exceptiondata ~= "" then
            lpegmatch(p_exception,exceptiondata,1,dictionary.exceptions)
        end
    end
    dictionary.loaded = true
    return dictionary
end
242
local lcchars    = characters.lcchars
local uccodes    = characters.uccodes
local categories = characters.categories
local nofwords   = 0 -- incremented on every call to hyphenate
local nofhashed  = 0 -- incremented whenever a result is stored in a hyphenated cache

local steps     = nil -- trace lines of the last traced word (see traditional.lasttrace)
local f_show    = formatters["%w%s"]
251
-- Reports the collected steps, but only when trace_steps is exactly true. The second
-- column is aligned using the length of the first column of the first step.
local function show_log()
    if trace_steps ~= true then
        return
    end
    report()
    local width = #steps[1][1]
    for i=1,#steps do
        local step = steps[i]
        local left = step[1]
        report("%s%w%S  %S",left,width - #left + 3,step[2],step[3] or "")
    end
    report()
end
263
-- Starts a new trace: the first step shows the (fenced) word with its characters
-- separated by spaces, in both columns.
local function show_1(wsplit)
    local line = f_show(0,concat(wsplit," "))
    steps = { { line, line } }
end
268
-- Adds a trace step for one matching pattern.
--
-- c      : the matched substring
-- m      : the weights of the pattern
-- wsplit : the fenced word as table of characters
-- done   : the accumulated weights so far
-- i      : the position in wsplit where the match starts
-- spec   : the replacement spec of the pattern, if any
--
-- NOTE(review): the serialized spec is passed as third argument to f_show, but the
-- "%w%s" template has no directive for it; confirm whether it was meant to become a
-- third column of the step.
local function show_2(c,m,wsplit,done,i,spec)
    local chars  = lpegmatch(p_split,c)
    local merged = { }
    local size   = 0
    -- interleave weights and characters, there is one more weight than characters
    for j=1,#m do
        size = size + 1
        merged[size] = m[j]
        local ch = chars[j]
        if ch then
            size = size + 1
            merged[size] = ch
        end
    end
    local indent = 2*i-2
    local detail = spec and table.sequenced(spec) or ""
    local left
    if indent == 0 then
        left = f_show(indent,concat(merged,"",2))
    elseif i+1 == #wsplit then
        left = f_show(indent-1,concat(merged,"",1,#merged-1))
    else
        left = f_show(indent-1,concat(merged))
    end
    steps[#steps+1] = { left, f_show(1,concat(done," ",2,#done),detail) }
end
289
-- Adds the final trace step and reports the log. The first column shows the characters
-- with the resulting weights in between, the second column shows the characters with a
-- hyphen at each odd weight (never before the first real character) and a space elsewhere.
local function show_3(wsplit,done)
    local first    = wsplit[1]
    local weighted = { first }
    local marked   = { first }
    for i=2,#wsplit do
        local d = done[i]
        local w = wsplit[i]
        marked[#marked+1] = i > 2 and d % 2 == 1 and "-" or " "
        weighted[#weighted+1] = d
        marked[#marked+1] = w
        weighted[#weighted+1] = w
    end
    steps[#steps+1] = { f_show(0,concat(weighted)), f_show(0,concat(marked)) }
    show_log()
end
307
-- Trace for a word that is resolved by an exception: only the (fenced) word is shown.
-- The done argument is accepted but not used.
local function show_4(wsplit,done)
    local line = concat(wsplit," ")
    steps = { { line } }
    show_log()
end
312
-- Returns the steps collected by the last traced hyphenation, or nil when nothing has
-- been traced yet.
function traditional.lasttrace()
    local collected = steps
    return collected
end
316
317-- We could reuse the w table but as we cache the resolved words there is not much gain in
318-- that complication.
319--
-- Beware: word can be a table and when n is passed too we can assume reuse so we need to
-- honor that n then.
--
-- todo: a fast variant for tex ... less lookups (we could check if the dictionary has changed)
324-- ... although due to caching the already done words, we don't do much here
325
-- Resolves the hyphenation points of a word.
--
-- dictionary : table with at least hyphenated (cache), exceptions, patterns and specials
-- word       : a string, or a table of characters
-- n          : optional number of entries of word to use (defaults to #word)
--
-- Returns false when no hyphenation point is found. Otherwise a table is returned: for an
-- exception this is the table stored in the exceptions, for patterns it has one entry per
-- position that is false (no hyphen), true (hyphen) or { spec, offset } when a pattern
-- with a replacement spec applies. Results (also false) are cached in
-- dictionary.hyphenated under the word as given.
local function hyphenate(dictionary,word,n) -- odd is okay
    nofwords = nofwords + 1
    local hyphenated = dictionary.hyphenated
    local isstring = type(word) == "string"
    -- first try the cache using the word as given; a cached false is a valid answer
    if isstring then
        local done = hyphenated[word]
        if done ~= nil then
            return done
        end
    elseif n then
        local done = hyphenated[concat(word,"",1,n)]
        if done ~= nil then
            return done
        end
    else
        local done = hyphenated[concat(word)]
        if done ~= nil then
            return done
        end
    end
    -- from here on word is a table of characters and key is its string form
    local key
    if isstring then
        key = word
        word = lpegmatch(p_split,word)
        if not n then
            n = #word
        end
    else
        if not n then
            n = #word
        end
        key = concat(word,"",1,n)
    end
    -- w becomes the lowercased word with a fence (.) at both ends, l is its length
    local l = 1
    local w = { "." }
 -- local d = dictionary.codehash
    for i=1,n do
        local c = word[i]
     -- l = l + (d[c] or 1)
        l = l + 1
        w[l] = lcchars[c] or c
    end
    l = l + 1
    w[l] = "."
    -- the lowercased word without the fences
    local c = concat(w,"",2,l-1)
    --
    -- the lowercased variant might have been resolved already
    local done = hyphenated[c]
    if done ~= nil then
        hyphenated[key] = done
        nofhashed = nofhashed + 1
        return done
    end
    --
    -- exceptions take precedence over patterns
    local exceptions = dictionary.exceptions
    local exception  = exceptions[c]
    if exception then
        if trace_steps then
            show_4(w,exception)
        end
        hyphenated[key] = exception
        nofhashed = nofhashed + 1
        return exception
    end
    --
    if trace_steps then
        show_1(w)
    end
    --
    local specials = dictionary.specials
    local patterns = dictionary.patterns
    --
    -- every substring w[i..j] of the fenced word is looked up in the patterns
    local spec
    for i=1,l do
        for j=i,l do
            local c = concat(w,"",i,j)
            local m = patterns[c]
            if m then
                local s = specials[c]
                -- done is still nil here until the first pattern matches
                if not done then
                    done = { }
                    spec = nil
                    -- the string that we resolve has explicit fences (.) so done starts at
                    -- the first fence and runs upto the last one so we need one slot less
                    for i=1,l do -- this i shadows the outer loop variable
                        done[i] = 0
                    end
                end
                -- we run over the pattern that always has a (zero) value for each character
                -- plus one more as we look at both sides
                for k=1,#m do
                    local new = m[k]
                    if not new then
                        break
                    elseif new == true then
                        report("fatal error")
                        break
                    elseif new > 0 then
                        local pos = i + k - 1
                        local old = done[pos]
                        if not old then
                            -- break ?
                        elseif new > old then
                            -- the highest weight wins
                            done[pos] = new
                            if s then
                                -- b and e delimit the range (in w) that the spec replaces
                                local b = i + (s.start or 1) - 1
                                if b > 0 then
                                    local e = b + (s.length or 2) - 1
                                    if e > 0 then
                                        if pos >= b and pos <= e then
                                            -- remember the spec and the offset in the pattern
                                            if spec then
                                                spec[pos] = { s, k - 1 }
                                            else
                                                spec = { [pos] = { s, k - 1 } }
                                            end
                                        end
                                    end
                                end
                            end
                        end
                    end
                end
                if trace_steps and done then
                    show_2(c,m,w,done,i,s)
                end
            end
        end
    end
    if trace_steps and done then
        show_3(w,done)
    end
    if done then
        -- convert the weights in place: an odd weight at slot i becomes a hyphenation
        -- point at slot i-2, after which the two trailing slots are removed
        local okay = false
        for i=3,#done do
            if done[i] % 2 == 1 then
                done[i-2] = spec and spec[i] or true
                okay = true
            else
                done[i-2] = false
            end
        end
        if okay then
            done[#done] = nil
            done[#done] = nil
        else
            done = false
        end
    else
        done = false
    end
    hyphenated[key] = done
    nofhashed = nofhashed + 1
    return done
end
479
-- Hyphenates word again (its cached result is removed first) and returns the trace
-- steps. Nothing is returned when no word, or an empty one, is given.
function traditional.gettrace(language,word)
    if word and word ~= "" then
        local dictionary = dictionaries[language]
        if dictionary then
            dictionary.hyphenated[word] = nil
            hyphenate(dictionary,word)
            return steps
        end
    end
end
492
-- Hyphenation methods by name; an unknown name resolves to the built-in hyphenate, which
-- is then stored under that name.
local methods = setmetatableindex(function(t,k)
    t[k] = hyphenate
    return hyphenate
end)

-- Installs f as the method called name. A name that is already present in the table
-- (which includes names that were resolved to the default) cannot be overloaded.
function traditional.installmethod(name,f)
    if not rawget(methods,name) then
        methods[name] = f
    else
        report("overloading %a is not permitted",name)
    end
end
502
local s_detail_1 = "-"
local f_detail_2 = formatters["%s-%s"]
local f_detail_3 = formatters["{%s}{%s}{}"]
local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"]

-- Returns word as a string with the hyphenation points made visible.
--
-- dictionary    : the dictionary passed on to hyphenate
-- word          : the string to process; false is returned when it is absent
-- specification : table with the optional fields leftcharmin (default 2), rightcharmin
--                 (defaults to the left minimum), leftchar and rightchar; without a
--                 specification the word is returned untouched
--
-- A plain hyphenation point shows up as "-" or, when both leftchar and rightchar are
-- set, as {rightchar}{leftchar}{}. A point with a replacement spec shows up as
-- before-after or as {before rightchar}{leftchar after}{replaced characters}.
function traditional.injecthyphens(dictionary,word,specification)
    if not word then
        return false
    elseif not specification then
        return word
    end
    local hyphens = hyphenate(dictionary,word)
    if not hyphens then
        return word
    end

    -- the following code is similar to code later on but here we have strings while there
    -- we have hyphen specs

    local chars     = lpegmatch(p_split,word)
    local size      = #chars
    local leftmin   = specification.leftcharmin or 2
    local rightmin  = size - (specification.rightcharmin or leftmin)
    local leftchar  = specification.leftchar
    local rightchar = specification.rightchar
    local explicit  = leftchar and rightchar

    local result    = { }
    local rsize     = 0
    local position  = 1

    while position <= size do
        -- positions outside the left and right minima never get a hyphen
        local hyphen = position >= leftmin and position <= rightmin and hyphens[position]
        if hyphen == true then
            rsize = rsize + 1
            result[rsize] = chars[position]
            rsize = rsize + 1
            if explicit then
                result[rsize] = f_detail_3(rightchar,leftchar)
            else
                result[rsize] = s_detail_1
            end
            position = position + 1
        elseif hyphen then
            -- either { spec, offset } or a bare spec (then the offset is 1)
            local offset  = hyphen[2]
            local special = hyphen
            if offset then
                special = hyphen[1]
            else
                offset = 1
            end
            local b = position - offset + (special.start or 1)
            local e = b + (special.length or 2) - 1
            if b > 0 and e >= b then
                -- copy what comes before the replaced range
                for _=1,b-position do
                    rsize = rsize + 1
                    result[rsize] = chars[position]
                    position = position + 1
                end
                rsize = rsize + 1
                if explicit then
                    result[rsize] = f_detail_4(special.before,rightchar,leftchar,special.after,concat(chars,"",b,e))
                else
                    result[rsize] = f_detail_2(special.before,special.after)
                end
                position = e + 1
            else
                -- error
                rsize = rsize + 1
                result[rsize] = chars[position]
                position = position + 1
            end
        else
            rsize = rsize + 1
            result[rsize] = chars[position]
            position = position + 1
        end
    end
    return concat(result)
end
590
do

    -- The string that gets passed can contain several space separated entries.

    local blanks   = space^1
    local entry    = C((1-space)^1)

    local p_remove = (Carg(1) * Carg(2) * entry           / unregister_pattern + blanks)^1
    local p_add    = (Carg(1) * Carg(2) * entry * Carg(3) /   register_pattern + blanks)^1
    local p_except = (Carg(1)           * entry           / register_exception + blanks)^1

    -- Adds the patterns in str to the dictionary of language. When specification is a
    -- table it is used as replacement spec for these patterns; when it is false the
    -- patterns are removed instead.
    function traditional.registerpattern(language,str,specification)
        local dictionary = dictionaries[language]
        local patterns   = dictionary.patterns
        local specials   = dictionary.specials
        if specification == false then
            lpegmatch(p_remove,str,1,patterns,specials)
        else
            local spec = type(specification) == "table" and specification or false
            lpegmatch(p_add,str,1,patterns,specials,spec)
        end
    end

    -- Adds the exceptions in str to the dictionary of language.
    function traditional.registerexception(language,str)
        local exceptions = dictionaries[language].exceptions
        lpegmatch(p_except,str,1,exceptions)
    end

end
616
617-- todo: unicodes or utfhash ?
618
619if context then
620
621    local nodecodes          = nodes.nodecodes
622    local disccodes          = nodes.disccodes
623
624    local glyph_code         = nodecodes.glyph
625    local disc_code          = nodecodes.disc
626    local math_code          = nodecodes.math
627    local hlist_code         = nodecodes.hlist
628
629    local automaticdisc_code = disccodes.automatic
630    local regulardisc_code   = disccodes.regular
631
632    local nuts               = nodes.nuts
633    local tonode             = nodes.tonode
634    local nodepool           = nuts.pool
635
636    local new_disc           = nodepool.disc
637    local new_penalty        = nodepool.penalty
638
639    local getfield           = nuts.getfield
640    local getfont            = nuts.getfont
641    local getid              = nuts.getid
642    local getattr            = nuts.getattr
643    local getnext            = nuts.getnext
644    local getprev            = nuts.getprev
645    local getsubtype         = nuts.getsubtype
646    local getlist            = nuts.getlist
647    local getlanguage        = nuts.getlanguage
648    local getattrlist        = nuts.getattrlist
649    local setattrlist        = nuts.setattrlist
650    local isglyph            = nuts.isglyph
651    local ischar             = nuts.ischar
652
653    local setchar            = nuts.setchar
654    local setdisc            = nuts.setdisc
655    local setlink            = nuts.setlink
656    local setprev            = nuts.setprev
657    local setnext            = nuts.setnext
658
659    local insertbefore       = nuts.insertbefore
660    local insertafter        = nuts.insertafter
661    local copy_node          = nuts.copy
662    local copylist           = nuts.copylist
663    local remove_node        = nuts.remove
664    local endofmath          = nuts.endofmath
665    local node_tail          = nuts.tail
666
667    local nexthlist          = nuts.traversers.hlist
668    local nextdisc           = nuts.traversers.disc
669
670    local setcolor           = nodes.tracers.colors.set
671
672    local variables          = interfaces.variables
673    local v_reset            = variables.reset
674    local v_yes              = variables.yes
675    local v_word             = variables.word
676    local v_all              = variables.all
677
678    local settings_to_array  = utilities.parsers.settings_to_array
679
680    local unsetvalue         = attributes.unsetvalue
681    local texsetattribute    = tex.setattribute
682
683    local prehyphenchar      = lang.prehyphenchar
684    local posthyphenchar     = lang.posthyphenchar
685    local preexhyphenchar    = lang.preexhyphenchar
686    local postexhyphenchar   = lang.postexhyphenchar
687
688    local a_hyphenation      = attributes.private("hyphenation")
689
690    local interwordpenalty   = 5000
691
692    function traditional.loadpatterns(language)
693        return dictionaries[language]
694    end
695
696    -- for the moment we use an independent data structure
697
698    setmetatableindex(dictionaries,function(t,k)
699        if type(k) == "string" then
700            -- this will force a load if not yet loaded (we need a nicer way) for the moment
701            -- that will do (nneeded for examples that register a pattern specification
702            languages.getnumber(k)
703        end
704        local specification = languages.getdata(k)
705        local dictionary = {
706            patterns   = { },
707            exceptions = { },
708            hyphenated = { },
709            specials   = { },
710            instance   = false,
711            characters = { },
712            unicodes   = { },
713        }
714        if specification then
715            local resources = specification.resources
716            if resources then
717                local characters = dictionary.characters or { }
718                local unicodes   = dictionary.unicodes   or { }
719                for i=1,#resources do
720                    local r = resources[i]
721                    if not r.in_dictionary then
722                        r.in_dictionary = true
723                        local patterns = r.patterns
724                        if patterns then
725                            local data = patterns.data
726                            if data then
727                                -- regular patterns
728                                lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
729                            end
730                            local extra = patterns.extra
731                            if extra then
732                                -- special patterns
733                                lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials)
734                            end
735                        end
736                        local exceptions = r.exceptions
737                        if exceptions then
738                            local data = exceptions.data
739                            if data and data ~= "" then
740                                lpegmatch(p_exception,data,1,dictionary.exceptions)
741                            end
742                        end
743                        local usedchars  = lpegmatch(p_split,patterns.characters)
744                        for i=1,#usedchars do
745                            local char  = usedchars[i]
746                            local code  = utfbyte(char)
747                            local upper = uccodes[code]
748                            characters[char]  = code
749                            unicodes  [code]  = char
750                            if type(upper) == "table" then
751                                for i=1,#upper do
752                                    local u = upper[i]
753                                    unicodes[u] = utfchar(u)
754                                end
755                            else
756                                unicodes[upper] = utfchar(upper)
757                            end
758                        end
759                    end
760                end
761                dictionary.characters = characters
762                dictionary.unicodes   = unicodes
763                setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end)
764            end
765            t[specification.number] = dictionary
766            dictionary.instance = specification.instance -- needed for hyphenchars
767        end
768        t[k] = dictionary
769        return dictionary
770    end)
771
    -- Beware: left and right min doesn't mean that in a 1 mm hsize there can be snippets
773    -- with less characters than either of them! This could be an option but such a narrow
774    -- hsize doesn't make sense anyway.
775
776    -- We assume that featuresets are defined global ... local definitions (also mid paragraph)
777    -- make not much sense anyway. For the moment we assume no predefined sets so we don't need
778    -- to store them. Nor do we need to hash them in order to save space ... no sane user will
779    -- define many of them.
780
    -- The featuresets are kept in the hyphenators namespace; an existing table is reused.
    local featuresets       = hyphenators.featuresets or { }
    hyphenators.featuresets = featuresets

    -- The number of defined featuresets lives in the shared storage and starts at zero
    -- when it is not there yet.
    storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0

    -- Local counter that starts at the stored value.
    local noffeaturesets = storage.shared.noflanguagesfeaturesets

    -- Registers the table under the given storage name and (global) variable path.
    storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets")
789
790    ----- hash = table.sequenced(featureset,",") -- no need now
791
792    local function register(name,featureset)
793        noffeaturesets = noffeaturesets + 1
794        featureset.attribute = noffeaturesets
795        featuresets[noffeaturesets] = featureset  -- access by attribute
796        featuresets[name] = featureset            -- access by name
797        storage.shared.noflanguagesfeaturesets = noffeaturesets
798        return noffeaturesets
799    end
800
801    local function makeset(...)
802        -- a bit overkill, supporting variants but who cares
803        local set = { }
804        for i=1,select("#",...) do
805            local list = select(i,...)
806            local kind = type(list)
807            local used = nil
808            if kind == "string" then
809                if list == v_all then
810                    -- not ok ... now all get ignored
811                    return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end)
812                elseif list ~= "" then
813                    used = lpegmatch(p_split,list)
814                    set  = set or { }
815                    for i=1,#used do
816                        local char = used[i]
817                        set[utfbyte(char)] = char
818                    end
819                end
820            elseif kind == "table" then
821                if next(list) then
822                    set = set or { }
823                    for byte, char in next, list do
824                        set[byte] = char == true and utfchar(byte) or char
825                    end
826                elseif #list > 0 then
827                    set = set or { }
828                    for i=1,#list do
829                        local l = list[i]
830                        if type(l) == "number" then
831                            set[l] = utfchar(l)
832                        else
833                            set[utfbyte(l)] = l
834                        end
835                    end
836                end
837            end
838        end
839        return set
840    end
841
    -- category pd (tex also sees --- and -- as hyphens but do we really want that)
843
    -- Default set of hyphen characters, indexed by character code. The value is true or,
    -- for the soft hyphen, the code of another character (the hyphen-minus). The entries
    -- that are commented out are candidates that are not enabled.
    -- NOTE(review): how the values are interpreted is decided by code outside this
    -- view; presumably a number means "use that character instead" - confirm.
    local defaulthyphens = {
        [0x002D] = true,   -- HYPHEN-MINUS
        [0x00AD] = 0x002D, -- SOFT HYPHEN (active in ConTeXt)
     -- [0x058A] = true,   -- ARMENIAN HYPHEN
     -- [0x1400] = true,   -- CANADIAN SYLLABICS HYPHEN
     -- [0x1806] = true,   -- MONGOLIAN TODO SOFT HYPHEN
        [0x2010] = true,   -- HYPHEN
     -- [0x2011] = true,   -- NON-BREAKING HYPHEN
     -- [0x2012] = true,   -- FIGURE DASH
        [0x2013] = true,   -- EN DASH
        [0x2014] = true,   -- EM DASH
     -- [0x2015] = true,   -- HORIZONTAL BAR
     -- [0x2027] = true,   -- HYPHENATION POINT
     -- [0x2E17] = true,   -- DOUBLE OBLIQUE HYPHEN
     -- [0x2E1A] = true,   -- HYPHEN WITH DIAERESIS
     -- [0x2E3A] = true,   -- TWO-EM DASH
     -- [0x2E3B] = true,   -- THREE-EM DASH
     -- [0x2E40] = true,   -- DOUBLE HYPHEN
     -- [0x301C] = true,   -- WAVE DASH
     -- [0x3030] = true,   -- WAVY DASH
     -- [0x30A0] = true,   -- KATAKANA-HIRAGANA DOUBLE HYPHEN
     -- [0xFE31] = true,   -- PRESENTATION FORM FOR VERTICAL EM DASH
     -- [0xFE32] = true,   -- PRESENTATION FORM FOR VERTICAL EN DASH
     -- [0xFE58] = true,   -- SMALL EM DASH
     -- [0xFE63] = true,   -- SMALL HYPHEN-MINUS
     -- [0xFF0D] = true,   -- FULLWIDTH HYPHEN-MINUS
    }
871
872    local defaultjoiners = {
           -- This table replaces featureset.joiners in definefeatures when that field
           -- equals v_yes; the entries end up in featureset.extrachars (via makeset).
873        [0x200C] = true, -- zwnj (ZERO WIDTH NON-JOINER)
874        [0x200D] = true, -- zwj  (ZERO WIDTH JOINER)
875    }
876
-- Turn a hyphen character specification into a number. Zero and anything that
-- tonumber cannot convert (nil included) give nil, which means "not set".
local function somehyphenchar(c)
    local n = tonumber(c)
    if n == 0 then
        return nil
    end
    return n or nil
end
881
-- Normalize the user level settings in featureset (in place) and register the result
-- under the given name; the value returned by register is passed on.
--
--   characters, joiners -> extrachars  (a set made by makeset; joiners equal to v_yes
--                                       selects defaultjoiners)
--   hyphens             -> hyphenchars (a set made by makeset; hyphens equal to v_yes
--                                       selects defaulthyphens)
--   alternative         -> defaults to "hyphenate"
--   rightwordmin, charmin, leftcharmin, rightcharmin
--                       -> a number larger than zero or nil
--   rightchars          -> true when equal to v_word, otherwise tonumber(rightchars)
--   leftchar, rightchar -> a non zero number or nil (see somehyphenchar)
--   autohyphen, hyphenonly
--                       -> currently always enabled, the values in featureset are
--                          not consulted
local function definefeatures(name,featureset)

    -- a number larger than zero or nil
    local function positive(value)
        local number = tonumber(value)
        if number and number > 0 then
            return number
        end
        return nil
    end

    local autohyphen = v_yes -- featureset.autohyphen -- insert disc
    local hyphenonly = v_yes -- featureset.hyphenonly -- don't hyphenate around

    local joiners = featureset.joiners -- table
    if joiners == v_yes then
        joiners = defaultjoiners
    end

    local hyphens = featureset.hyphens -- table
    if hyphens == v_yes then
        hyphens = defaulthyphens
    end

    local rightchars = featureset.rightchars
    if rightchars == v_word then
        rightchars = true
    else
        rightchars = tonumber(rightchars)
    end

    local rightwordmin = positive(featureset.rightwordmin)
    local charmin      = positive(featureset.charmin) -- luatex now also has hyphenationmin
    local leftcharmin  = positive(featureset.leftcharmin)
    local rightcharmin = positive(featureset.rightcharmin)
    local leftchar     = somehyphenchar(featureset.leftchar)
    local rightchar    = somehyphenchar(featureset.rightchar)

    -- not yet ok: extrachars have to be ignored  so it cannot be all)
    featureset.extrachars   = makeset(joiners or "",featureset.characters or "") -- "[]()"
    featureset.hyphenchars  = makeset(hyphens or "")
    featureset.alternative  = featureset.alternative or "hyphenate"
    featureset.rightwordmin = rightwordmin
    featureset.charmin      = charmin
    featureset.leftcharmin  = leftcharmin
    featureset.rightcharmin = rightcharmin
    featureset.rightchars   = rightchars
    featureset.leftchar     = leftchar
    featureset.rightchar    = rightchar
 -- featureset.strict       = featureset.rightedge == "tex"
    featureset.autohyphen   = autohyphen == v_yes
    featureset.hyphenonly   = hyphenonly == v_yes

    return register(name,featureset)
end
916
-- Set the hyphenation attribute according to n:
--
--   * nothing or v_reset      : the attribute is unset
--   * a known featureset      : its attribute field is used
--   * an unknown string       : the string is split with settings_to_array, the fields
--                               of all known featuresets in that list are merged (later
--                               ones win) and the result is registered under n
--   * anything else not known : the attribute is unset
local function setfeatures(n)
    local attribute = false
    if n and n ~= v_reset then
        local known = featuresets[n]
        if known then
            attribute = known.attribute
        elseif type(n) == "string" then
            local merged = { }
            local names  = settings_to_array(n)
            for i=1,#names do
                local found = featuresets[names[i]]
                if found then
                    for key, value in next, found do
                        merged[key] = value
                    end
                end
            end
            attribute = register(n,merged)
        end
    end
    texsetattribute(a_hyphenation,attribute or unsetvalue)
end
941
942    traditional.definefeatures = definefeatures
943    traditional.setfeatures    = setfeatures
944
945    implement {
           -- NOTE(review): implement is defined elsewhere; presumably it makes the given
           -- action callable from the TeX end under 'name', with 'arguments' describing
           -- what gets scanned -- confirm against the interfaces code.
           --
           -- The keys below are the fields that definefeatures reads from its featureset
           -- argument. There are no autohyphen and hyphenonly keys: definefeatures always
           -- enables these two. The rightedge key is accepted but definefeatures only
           -- refers to it in commented code.
946        name      = "definehyphenationfeatures",
947        actions   = definefeatures,
948        arguments = {
949            "string",
950            {
951                { "characters" },
952                { "hyphens" },
953                { "joiners" },
954                { "rightchars" },
955                { "rightwordmin", "integer" },
956                { "charmin", "integer" },
957                { "leftcharmin", "integer" },
958                { "rightcharmin", "integer" },
959                { "leftchar", "integer" },
960                { "rightchar", "integer" },
961                { "alternative" },
962                { "rightedge" },
963            }
964        }
965    }
966
967    implement {
           -- the single string is what setfeatures gets as n: a featureset name, a list
           -- of names or v_reset
968        name      = "sethyphenationfeatures",
969        actions   = setfeatures,
970        arguments = "string"
971    }
972
973    implement {
           -- traditional.registerpattern is defined outside this part of the file
974        name      = "registerhyphenationpattern",
975        actions   = traditional.registerpattern,
976        arguments = { "string",  "string",  "boolean" }
977    }
978
979    implement {
           -- traditional.registerexception is defined outside this part of the file
980        name      = "registerhyphenationexception",
981        actions   = traditional.registerexception,
982        arguments = "2 strings",
983    }
984
985    -- This is a relatively large function with local variables and local functions. A previous
986    -- implementation had the functions outside but this is cleaner and as efficient. The test
987    -- runs 100 times over tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower
988    -- and uppercase with a 1mm hsize.
989    --
990    --         language=0     language>0     4 | 3 * slower
991    --
992    -- tex     2.34 | 1.30    2.55 | 1.45    0.21 | 0.15
993    -- lua     2.42 | 1.38    3.30 | 1.84    0.88 | 0.46
994    --
995    -- Of course we have extra overhead (virtual Lua machine) but also we check attributes and
996    -- support specific local options. The test puts the typeset text in boxes and discards
997    -- it. If we also flush the runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative
998    -- difference is (somehow) smaller. The test has 536 pages. There is a little bit of extra
999    -- overhead because we store the patterns in a different way.
1000    --
1001    -- As usual I will look for speedups. Some 0.01 seconds could be gained by sharing patterns
1002    -- which is not impressive but it does save some 3M memory on this test. (Some optimizations
1003    -- already brought the 3.30 seconds down to 3.14 but it all depends on aggressive caching.)
1004
1005    -- As we kick in the hyphenator before fonts get handled, we don't look at implicit (font)
1006    -- kerns or ligatures.
1007
1008    local starttiming = statistics.starttiming
1009    local stoptiming  = statistics.stoptiming
1010
1011 -- local strictids = {
1012 --     [nodecodes.hlist]   = true,
1013 --     [nodecodes.vlist]   = true,
1014 --     [nodecodes.rule]    = true,
1015 --     [nodecodes.dir]     = true,
1016 --     [nodecodes.whatsit] = true,
1017 --     [nodecodes.insert]  = true,
1018 --     [nodecodes.adjust]  = true,
1019 --
1020 --     [nodecodes.math]    = true,
1021 --     [nodecodes.disc]    = true,
1022 --
1023 --     [nodecodes.accent]  = true, -- never used in context
1024 -- }
1025
1026    -- a lot of overhead when only one char
1027
1028    function traditional.hyphenate(head)
1029
1030        local first           = head
1031        local tail            = nil
1032        local last            = nil
1033        local current         = first
1034        local dictionary      = nil
1035        local instance        = nil
1036        local characters      = nil
1037        local unicodes        = nil
1038        local exhyphenchar    = tex.exhyphenchar
1039        local extrachars      = nil
1040        local hyphenchars     = nil
1041        local language        = nil
1042        local lastfont        = nil
1043        local start           = nil
1044        local stop            = nil
1045        local word            = { } -- we reuse this table
1046        local size            = 0
1047        local leftchar        = false
1048        local rightchar       = false -- utfbyte("-")
1049        local leftexchar      = false
1050        local rightexchar     = false -- utfbyte("-")
1051        local leftmin         = 0
1052        local rightmin        = 0
1053        local charmin         = 1
1054        local leftcharmin     = nil
1055        local rightcharmin    = nil
1056        ----- leftwordmin     = nil
1057        local rightwordmin    = nil
1058        local rightchars      = nil
1059        local leftchar        = nil
1060        local rightchar       = nil
1061        local attr            = nil
1062        local lastwordlast    = nil
1063        local hyphenated      = hyphenate
1064        ----- strict          = nil
1065        local exhyphenpenalty = tex.exhyphenpenalty
1066        local hyphenpenalty   = tex.hyphenpenalty
1067        local autohyphen      = false
1068        local hyphenonly      = false
1069
1070        -- We cannot use an 'enabled' boolean (false when no characters or extras) because we
1071        -- can have plugins that set a characters metatable and so on ... it doesn't save much
1072        -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also
1073        -- doesn't save much. So there is not that much to gain for languages that don't hyphenate.
1074        --
1075        -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes)))
1076        --        or (extrachars and next(extrachars))
1077        --
1078        -- This can be used to not add characters i.e. keep size 0 but then we need to check for
1079        -- attributes that change it, which costs time too. Not much to gain there.
1080
1081        starttiming(traditional)
1082
-- Put a penalty node (value interwordpenalty, attributes taken from 'last') in the
-- list right in front of the node that 'last' points to. Afterwards 'first' and
-- 'last' hold what insertafter returns (presumably the head and the new penalty).
local function insertpenalty()
    local penalty = new_penalty(interwordpenalty)
    setattrlist(penalty,last)
    if trace_visualize then
        nuts.setvisual(penalty,"penalty")
    end
    -- inserting after the predecessor is the same as inserting before 'last'
    first, last = insertafter(first,getprev(last),penalty)
end
1092
1093        local function synchronizefeatureset(a)
                -- Copy the settings of featureset 'a' (an attribute value used as index
                -- in featuresets) into the local state of the hyphenator, or reset that
                -- state to its defaults when there is no such featureset. The argument
                -- is returned unchanged so that the caller can store it as the current
                -- attribute.
1094            local f = a and featuresets[a]
1095            if f then
1096                hyphenated   = methods[f.alternative or "hyphenate"]
1097                extrachars   = f.extrachars
1098                hyphenchars  = f.hyphenchars
1099                rightwordmin = f.rightwordmin
1100                charmin      = f.charmin
1101                leftcharmin  = f.leftcharmin
1102                rightcharmin = f.rightcharmin
1103                leftchar     = f.leftchar
1104                rightchar    = f.rightchar
1105             -- strict       = f.strict and strictids
1106                rightchars   = f.rightchars
1107                autohyphen   = f.autohyphen
1108                hyphenonly   = f.hyphenonly
1109                if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then
1110                    -- so we can change mid paragraph but it's kind of unpredictable then
1111                    if not tail then
1112                        tail = node_tail(first)
1113                    end
                        -- Walk back from the end of the list. Each time a run of glyphs is
                        -- left behind, one word is counted (rightwordmin is used as the
                        -- countdown). When the loop ends 'last' marks the node where the
                        -- main loop stops, so the words after it are not hyphenated.
1114                    last = tail
1115                    local inword = false
1116                    local count  = 0
1117                    while last and rightwordmin > 0 do
1118                        local id = getid(last)
1119                        if id == glyph_code then
                                -- count is never reset so it is the number of glyphs seen
                                -- in all words so far
1120                            count = count + 1
1121                            inword = true
1122                            if trace_visualize then
1123                                setcolor(last,"darkgreen")
1124                            end
1125                        elseif inword then
1126                            inword = false
1127                            rightwordmin = rightwordmin - 1
                                -- rightchars == true (v_word): a penalty in front of every
                                -- counted word but the last one counted; a number: a
                                -- penalty as long as the glyph count stays within it
1128                            if rightchars == true then
1129                                if rightwordmin > 0 then
1130                                    insertpenalty()
1131                                end
1132                            elseif rightchars and count <= rightchars then
1133                                insertpenalty()
1134                            end
1135                        end
1136                        last = getprev(last)
1137                    end
                        -- NOTE(review): this stores the remaining count (0 when enough
                        -- words were found) and not f.rightwordmin, so the test above
                        -- looks like it succeeds again on a next call with the same
                        -- featureset -- confirm that this rescan is intended.
1138                    lastwordlast = rightwordmin
1139                end
1140                if not charmin or charmin == 0 then
1141                    charmin = 1
1142                end
1143            else
                    -- no (known) featureset: back to the defaults (rightchars is left as
                    -- it was, it is only consulted in the branch above)
1144                hyphenated   = methods.hyphenate
1145                extrachars   = false
1146                hyphenchars  = false
1147                rightwordmin = false
1148                charmin      = 1
1149                leftcharmin  = false
1150                rightcharmin = false
1151                leftchar     = false
1152                rightchar    = false
1153             -- strict       = false
1154                autohyphen   = false
1155                hyphenonly   = false
1156            end
1157
1158            return a
1159        end
1160
1161        local function flush(hyphens) -- todo: no need for result
                -- Apply the outcome of the hyphenator to the nodes of the current word,
                -- i.e. the nodes from 'start' up to 'stop' that belong to word[1..size].
                --
                -- hyphens is indexed by position in the word, an entry can be:
                --
                --   nil / false : no break after this character
                --   true        : a regular break after this character
                --   a table     : a replacement, either the specification itself or a
                --                 { specification, offset } pair; the specification can
                --                 have the fields start, length, before, after, right
                --                 and left
                --
                -- The work is done in two passes: first word and hyphens are merged into
                -- a flat result list, then that list is mapped onto the node list.
1162
                -- this local hides the outer rightmin: it becomes the last position
                -- where a break is permitted
1163            local rightmin = size - rightmin
1164            local result   = { }
1165            local rsize    = 0
1166            local position = 1
1167
1168            -- todo: remember last disc and don't go back to before that (plus message) ...
1169            -- for simplicity we also assume that we don't start with a disc node
1170            --
1171            -- there can be a conflict: if we backtrack then we can end up in another disc
1172            -- and get out of sync (dup chars and so)
1173
                -- pass 1: result gets characters, true for a regular break and a table
                -- { pre, post, replace, right, left } for a replacement
1174            while position <= size do
1175                if position >= leftmin and position <= rightmin then
1176                    local hyphen = hyphens[position]
1177                    if not hyphen then
1178                        rsize = rsize + 1
1179                        result[rsize] = word[position]
1180                        position = position + 1
1181                    elseif hyphen == true then
1182                        rsize = rsize + 1
1183                        result[rsize] = word[position]
1184                        rsize = rsize + 1
1185                        result[rsize] = true
1186                        position = position + 1
1187                    else
1188                        local o, h = hyphen[2]
1189                        if o then
1190                            -- { hyphen, offset }
1191                            h = hyphen[1]
1192                        else
1193                            -- hyphen
1194                            h = hyphen
1195                            o = 1
1196                        end
                            -- b and e are the first and last position in the word that the
                            -- replacement covers
1197                        local b = position - o + (h.start  or 1)
1198                        local e = b + (h.length or 2) - 1
1199                        if b > 0 and e >= b then
                                -- characters in front of the replaced range are kept
1200                            for i=1,b-position do
1201                                rsize = rsize + 1
1202                                result[rsize] = word[position]
1203                                position = position + 1
1204                            end
1205                            rsize = rsize + 1
1206                            result[rsize] = {
1207                                h.before or "",      -- pre
1208                                h.after or "",       -- post
1209                                concat(word,"",b,e), -- replace
1210                                h.right,             -- optional after pre
1211                                h.left,              -- optional before post
1212                            }
1213                            position = e + 1
1214                        else
1215                            -- error
1216                            rsize = rsize + 1
1217                            result[rsize] = word[position]
1218                            position = position + 1
1219                        end
1220                    end
1221                else
                        -- outside the permitted range: just keep the character
1222                    rsize = rsize + 1
1223                    result[rsize] = word[position]
1224                    position = position + 1
1225                end
1226            end
1227
1228            local function serialize(replacement,leftchar,rightchar)
                    -- Make the nodes for one part of a discretionary; every glyph is a
                    -- copy of 'stop'. When replacement is true a single glyph is
                    -- returned with the character passed as leftchar (or rightchar).
                    -- When replacement is a string a list is returned: an optional
                    -- leftchar glyph, a glyph for each character (the string is looked
                    -- up in 'characters', presumably giving the character code) and an
                    -- optional rightchar glyph. Nothing is returned for nil or false.
1229                if not replacement then
1230                    return
1231                elseif replacement == true then
1232                    local glyph = copy_node(stop)
1233                    setchar(glyph,leftchar or rightchar)
1234                    return glyph
1235                end
1236                local head    = nil
1237                local current = nil
1238                if leftchar then
1239                    head    = copy_node(stop)
1240                    current = head
1241                    setchar(head,leftchar)
1242                end
1243                local rsize = #replacement
1244                if rsize == 1 then
                        -- one byte, so one character: no need to split
1245                    local glyph = copy_node(stop)
1246                    setchar(glyph,characters[replacement])
1247                    if head then
1248                        insertafter(current,current,glyph)
1249                    else
1250                        head = glyph
1251                    end
1252                    current = glyph
1253                elseif rsize > 0 then
1254                    local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached)
1255                    for i=1,#list do
1256                        local glyph = copy_node(stop)
1257                        setchar(glyph,characters[list[i]])
1258                        if head then
1259                            insertafter(current,current,glyph)
1260                        else
1261                            head = glyph
1262                        end
1263                        current = glyph
1264                    end
1265                end
1266                if rightchar then
1267                    local glyph = copy_node(stop)
1268                    insertafter(current,current,glyph)
1269                    setchar(glyph,rightchar)
1270                end
1271                return head
1272            end
1273
1274            local current  = start
1275            local attrnode = start -- will be different, just the first char
1276
                -- pass 2: walk over the result; characters are assigned to the existing
                -- glyphs (moving on one node each time) and discretionaries are inserted
                -- in front of the current glyph
1277            for i=1,rsize do
1278                local r = result[i]
1279                if r == true then
1280                    local disc = new_disc()
1281                    local pre  = nil
1282                    local post = nil
1283                    if rightchar then
1284                        pre = serialize(true,rightchar)
1285                    end
1286                    if leftchar then
1287                        post = serialize(true,leftchar)
1288                    end
1289                    setdisc(disc,pre,post,nil,regulardisc_code,hyphenpenalty)
1290                    if attrnode then
1291                        setattrlist(disc,attrnode)
1292                    end
1293                    -- could be a replace as well
1294                    insertbefore(first,current,disc)
1295                elseif type(r) == "table" then
1296                    local disc    = new_disc()
1297                    local pre     = r[1]
1298                    local post    = r[2]
1299                    local replace = r[3]
                        -- an explicit false in the specification suppresses the hyphen
                        -- character at that side
1300                    local right   = r[4] ~= false and rightchar
1301                    local left    = r[5] ~= false and leftchar
1302                    if pre then
1303                        if pre ~= "" then
1304                            pre = serialize(pre,false,right)
1305                        else
1306                            pre = nil
1307                        end
1308                    end
1309                    if post then
1310                        if post ~= "" then
1311                            post = serialize(post,left,false)
1312                        else
1313                            post = nil
1314                        end
1315                    end
1316                    if replace then
1317                        if replace ~= "" then
1318                            replace = serialize(replace)
1319                        else
1320                            replace = nil
1321                        end
1322                    end
1323                    -- maybe regular code
1324                    setdisc(disc,pre,post,replace,regulardisc_code,hyphenpenalty)
1325                    if attrnode then
1326                        setattrlist(disc,attrnode)
1327                    end
1328                    insertbefore(first,current,disc)
1329                else
1330                    setchar(current,characters[r])
1331                    if i < rsize then
1332                        current = getnext(current)
1333                    end
1334                end
1335            end
                -- a replacement takes one slot for several characters, so there can be
                -- glyphs left: everything after current up to and including stop goes
1336            if current and current ~= stop then
1337                local current = getnext(current)
1338                local last    = getnext(stop)
1339                while current ~= last do
1340                    first, current = remove_node(first,current,true)
1341                end
1342            end
1343
1344        end
1345
-- Wrap the glyph that 'current' points to (an explicit hyphen) in an automatic
-- discretionary: the glyph itself becomes the replace part, the pre part is a copy
-- with rightchar (when larger than zero) or else code as character and, when
-- leftchar is larger than zero, the post part is a copy with leftchar as character.
-- The discretionary takes the place of the glyph in the list and gets the
-- attributes of attrnode (when given).
--
-- Nothing happens when the glyph is the first node of the list. The (possibly
-- updated) 'current' is returned; 'first' and 'current' are updated as well.
local function inject(leftchar,rightchar,code,attrnode)
    if first ~= current then
        local disc  = new_disc()
        -- this one has to be local: without the declaration the assignment below
        -- creates (or overwrites) a global variable with the name glyph
        local glyph = nil
        first, current, glyph = remove_node(first,current)
        first, current = insertbefore(first,current,disc)
        if trace_visualize then
            setcolor(glyph,"darkred")  -- these get checked
            setcolor(disc,"darkgreen") -- in the colorizer
        end
        local pre     = nil
        local post    = nil
        local replace = glyph
        if leftchar and leftchar > 0 then
            post = copy_node(glyph)
            setchar(post,leftchar)
        end
        pre = copy_node(glyph)
        setchar(pre,rightchar and rightchar > 0 and rightchar or code)
        setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ?
        if attrnode then
            setattrlist(disc,attrnode)
        end
    end
    return current
end
1371
-- Wrap a run of nodes (from current up to and including last) in one automatic
-- discretionary: the run itself becomes the replace part and a copy of the run the
-- pre part, there is no post part. The node after the run is passed as next (it can
-- be nil). The discretionary gets the attributes of attrnode (when given). What
-- insertbefore returns as second value (presumably the discretionary) is returned.
local function injectseries(current,last,next,attrnode)
    local series = current
    local disc   = new_disc()
    first, current = insertbefore(first,current,disc)
    -- cut the run loose at both ends
    setprev(series)
    setnext(last)
    -- and close the gap that it leaves
    if not next then
        setnext(current)
    else
        setlink(current,next)
    end
    setdisc(disc,copylist(series),nil,series,automaticdisc_code,hyphenpenalty) -- ex ?
    if attrnode then
        setattrlist(disc,attrnode)
    end
    return current
end
1392
1393        local a = getattr(first,a_hyphenation)
1394        if a ~= attr then
1395            attr = synchronizefeatureset(a)
1396        end
1397
1398        -- The first attribute in a word determines the way a word gets hyphenated and if
1399        -- relevant, other properties are also set then. We could optimize for silly one-char
1400        -- cases but it has no priority as the code is still not that much slower than the
1401        -- native hyphenator and this variant also provides room for extensions.
1402
1403        local skipping = false
1404
1405        -- In "word word word." the sequences "word" and "." can be a different font!
1406
1407        while current and current ~= last do -- and current
1408            local code, id = isglyph(current)
1409            if code then
1410                if skipping then
1411                    current = getnext(current)
1412                else
1413                    local lang = getlanguage(current)
1414                    local font = getfont(current)
1415                    if lang ~= language or font ~= lastfont then
1416                        if dictionary and size > charmin and leftmin + rightmin <= size then
1417                            -- only german has many words starting with an uppercase character
1418                            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1419                                -- skip
1420                            else
1421                                local hyphens = hyphenated(dictionary,word,size)
1422                                if hyphens then
1423                                    flush(hyphens)
1424                                end
1425                            end
1426                        end
1427                        lastfont = font
1428                        if language ~= lang and lang > 0 then
1429                            --
1430                            dictionary = dictionaries[lang]
1431                            instance   = dictionary.instance
1432                            characters = dictionary.characters
1433                            unicodes   = dictionary.unicodes
1434                            --
1435                            local a = getattr(current,a_hyphenation)
1436                            attr        = synchronizefeatureset(a)
1437                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1438                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1439                            leftexchar  =                 (instance and preexhyphenchar (instance))
1440                            rightexchar =                 (instance and postexhyphenchar(instance))
1441                            leftmin     = leftcharmin  or getfield(current,"left")
1442                            rightmin    = rightcharmin or getfield(current,"right")
1443                            if not leftchar or leftchar < 0 then
1444                                leftchar = false
1445                            end
1446                            if not rightchar or rightchar < 0 then
1447                                rightchar = false
1448                            end
1449                            --
1450                            local char = unicodes[code] or (extrachars and extrachars[code])
1451                            if char then
1452                                word[1] = char
1453                                size    = 1
1454                                start   = current
1455                            else
1456                                size = 0
1457                            end
1458                        else
1459                            size = 0
1460                        end
1461                        language = lang
1462                    elseif language <= 0 then
1463                        --
1464                    elseif size > 0 then
1465                        local char = unicodes[code] or (extrachars and extrachars[code])
1466                        if char then
1467                            size = size + 1
1468                            word[size] = char
1469                        elseif dictionary then
1470                            if not hyphenonly or code ~= exhyphenchar then
1471                                if size > charmin and leftmin + rightmin <= size then
1472                                    if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1473                                        -- skip
1474                                    else
1475                                        local hyphens = hyphenated(dictionary,word,size)
1476                                        if hyphens then
1477                                            flush(hyphens)
1478                                        end
1479                                    end
1480                                end
1481                            end
1482                            size = 0
1483                            if code == exhyphenchar then -- normally the -
1484                                local next = getnext(current)
1485                                local last = current
1486                                local font = getfont(current)
1487                                while next and ischar(next,font) == code do
1488                                    last = next
1489                                    next = getnext(next)
1490                                end
1491                                if not autohyphen then
1492                                    current = last
1493                                elseif current == last then
1494                                    current = inject(leftexchar,rightexchar,code,current)
1495                                else
1496                                    current = injectseries(current,last,next,current)
1497                                end
1498                                if hyphenonly then
1499                                    skipping = true
1500                                end
1501                            elseif hyphenchars then
1502                                local char = hyphenchars[code]
1503                                if char == true then
1504                                    char = code
1505                                end
1506                                if char then
1507                                    current = inject(leftchar and char or nil,rightchar and char or nil,char,current)
1508                                end
1509                            end
1510                        end
1511                    else
1512                        local a = getattr(current,a_hyphenation)
1513                        if a ~= attr then
1514                            attr        = synchronizefeatureset(a) -- influences extrachars
1515                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1516                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1517                            leftexchar  =                 (instance and preexhyphenchar (instance))
1518                            rightexchar =                 (instance and postexhyphenchar(instance))
1519                            leftmin     = leftcharmin  or getfield(current,"left")
1520                            rightmin    = rightcharmin or getfield(current,"right")
1521                            if not leftchar or leftchar < 0 then
1522                                leftchar = false
1523                            end
1524                            if not rightchar or rightchar < 0 then
1525                                rightchar = false
1526                            end
1527                        end
1528                        --
1529                        local char = unicodes[code] or (extrachars and extrachars[code])
1530                        if char then
1531                            word[1] = char
1532                            size    = 1
1533                            start   = current
1534                        end
1535                    end
1536                    stop    = current
1537                    current = getnext(current)
1538                end
1539            else
1540                if skipping then
1541                    skipping = false
1542                end
1543                if id == disc_code then
1544                    size = 0
1545                    current = getnext(current)
1546                    if hyphenonly then
1547                        skipping = true
1548                    end
1549             -- elseif strict and strict[id] then
1550             --     current = id == math_code and getnext(endofmath(current)) or getnext(current)
1551             --     size = 0
1552                else
1553                    current = id == math_code and getnext(endofmath(current)) or getnext(current)
1554                end
1555                if size > 0 then
1556                    if dictionary and size > charmin and leftmin + rightmin <= size then
1557                        if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1558                            -- skip
1559                        else
1560                            local hyphens = hyphenated(dictionary,word,size)
1561                            if hyphens then
1562                                flush(hyphens)
1563                            end
1564                        end
1565                    end
1566                    size = 0
1567                end
1568            end
1569        end
1570        -- we can have quit due to last so we need to flush the last seen word, we could move
1571        -- this in the loop and test for current but ... messy
1572        if dictionary and size > charmin and leftmin + rightmin <= size then
1573            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1574                -- skip
1575            else
1576                local hyphens = hyphenated(dictionary,word,size)
1577                if hyphens then
1578                    flush(hyphens)
1579                end
1580            end
1581        end
1582
1583        stoptiming(traditional)
1584
1585        return head
1586    end
1587
-- Statistics callback for the "hyphenation" category: reports the word
-- counters and the time accumulated on the `traditional` timer, or
-- nothing at all when no word was counted and no time was recorded.
statistics.register("hyphenation",function()
    local active = nofwords > 0 or statistics.elapsed(traditional) > 0
    if not active then
        return
    end
    return string.format("%s words hyphenated, %s unique, used time %s",
        nofwords,
        nofhashed,
        statistics.elapsedseconds(traditional) or 0
    )
end)
1594
1595    local texmethod = "builders.kernel.hyphenation" -- presumably the task name of the built-in hyphenator (confirm)
1596    local oldmethod = texmethod -- both aliases simply copy texmethod; in the visible part of this file
1597    local newmethod = texmethod -- they are only mentioned by the commented-out task switching code below
1598
1599 -- local newmethod = "languages.hyphenators.traditional.hyphenate"
1600 --
1601 -- nodes.tasks.prependaction("processors","words",newmethod)
1602 -- nodes.tasks.disableaction("processors",oldmethod)
1603 --
1604 -- nodes.tasks.replaceaction("processors","words",oldmethod,newmethod)
1605
1606 -- \enabledirectives[hyphenators.method=traditional]
1607 -- \enabledirectives[hyphenators.method=builtin]
1608
1609    -- push / pop ? check first attribute
1610
1611    -- local replaceaction = nodes.tasks.replaceaction -- no longer overload this way (too many local switches)
1612
1613    local hyphenate    = lang.hyphenate
1614    local hyphenating  = nuts.hyphenating -- may be absent: `original` below tests it before use
1615    local methods      = { }   -- method name -> handler (or false); exported as hyphenators.methods
1616    local usedmethod   = false -- currently selected handler; a false value makes hyphenators.handler return head untouched
1617    local stack        = { }   -- previously selected handlers, saved by pushmethod and restored by popmethod
1618
-- The default hyphenator. When nuts.hyphenating is available it is applied
-- to the list and only its first result is passed on; otherwise
-- lang.hyphenate is run on the node form of the list and the list that was
-- passed in is handed back.
local original
if hyphenating then
    original = function(head)
        local result = hyphenating(head) -- keep exactly one return value
        return result
    end
else
    original = function(head)
        hyphenate(tonode(head))
        return head -- a nut
    end
end
1628
1629 -- local has_language = lang.has_language
1630 --
1631 -- local function original(head) -- kernel.hyphenation(head)
1632 --     local h = tonode(head)
1633 --     if has_language(h) then
1634 --         hyphenate(h)
1635 --     end
1636 --     return head
1637 -- end
1638
1639    local getcount = tex.getcount
1640
1641    hyphenators.methods  = methods
1642    local optimize       = false -- when set, hyphenators.handler consults the "hyphenstate" counter for hbox and adjusted_hbox groups
1643
        -- NOTE(review): this directive is spelled "hyphenator.optimize" (singular) while the
        -- method directive registered further down is "hyphenators.method" -- confirm that
        -- the difference is intended.
1644    directives.register("hyphenator.optimize", function(v) optimize = v end)
1645
-- Entry point that applies the currently selected hyphenation method.
--
-- head      : the list to process
-- groupcode : name of the group the list comes from; only "hbox" and
--             "adjusted_hbox" are looked at, and only when the optimize
--             directive is enabled
--
-- Returns whatever the selected method returns, or head itself when no
-- method is active or when the optimized path finds the "hyphenstate"
-- counter not to be positive.
function hyphenators.handler(head,groupcode)
    if not usedmethod then
        return head
    end
    local boxed = groupcode == "hbox" or groupcode == "adjusted_hbox"
    if not (optimize and boxed) then
        return usedmethod(head)
    end
    if getcount("hyphenstate") > 0 then
        -- NOTE(review): `forced` is not declared in the visible part of this
        -- file; unless an enclosing local exists this assigns a global --
        -- confirm before touching it.
        forced = false
        return usedmethod(head)
    end
    return head
end
1662
        -- Fill the method registry: three names share the default `original` handler,
        -- "traditional" selects the Lua based hyphenator and "none" stores false, which
        -- hyphenators.handler treats as "leave the list alone".
1663    methods.tex         = original
1664    methods.original    = original
1665    methods.expanded    = original -- was expanded before 1.005
1666    methods.traditional = languages.hyphenators.traditional.hyphenate
1667    methods.none        = false -- function(head) return head, false end
1668
1669    usedmethod          = original -- start out with the default handler
1670
-- Selects the active hyphenation method by name.
--
-- A string is looked up in `methods`; an unknown name falls back to
-- methods.tex, while a registered false value (as stored for "none") is
-- kept as is. Anything that is not a string results in false.
local function setmethod(method)
    local chosen = false
    if type(method) == "string" then
        chosen = methods[method]
    end
    if chosen == nil then
        chosen = methods.tex
    end
    usedmethod = chosen
end
-- Saves the active hyphenation method on the stack and then selects
-- `method` following the same rules as setmethod (unknown names fall back
-- to methods.tex, non-strings give false).
local function pushmethod(method)
    insert(stack,usedmethod)
    setmethod(method)
end
-- Restores the hyphenation method that was active before the matching
-- pushmethod call.
--
-- pushmethod may have saved false (the value registered for "none"), so the
-- popped value cannot be tested with `or`: that would silently replace a
-- disabled hyphenator by methods.tex. Only an unbalanced pop (empty stack)
-- falls back to methods.tex.
local function popmethod()
    if #stack == 0 then
        usedmethod = methods.tex
        return
    end
    local previous = remove(stack)
    if previous == nil then
        -- defensive: a hole in the stack is treated like an empty stack
        previous = methods.tex
    end
    usedmethod = previous
end
1687
        -- Export the three method switches on the hyphenators table.
1688    hyphenators.setmethod  = setmethod
1689    hyphenators.pushmethod = pushmethod
1690    hyphenators.popmethod  = popmethod
1691
        -- The directive value is passed straight to setmethod, which only looks up strings.
1692    directives.register("hyphenators.method",setmethod)
1693
-- Applies a setup specification; only its `method` field is consulted and,
-- when that field is set, it is handed to setmethod.
function hyphenators.setup(specification)
    local wanted = specification.method
    if not wanted then
        return
    end
    setmethod(wanted)
end
1700
        -- Register the method switches through `implement` (defined outside this part of
        -- the file; presumably it exposes them to the TeX end -- confirm). The first two
        -- take one string argument, the pop variant takes none.
1701    implement { name = "sethyphenationmethod", actions = setmethod,  arguments = "string" }
1702    implement { name = "pushhyphenation",      actions = pushmethod, arguments = "string" }
1703    implement { name = "pophyphenation",       actions = popmethod }
1704
1705    -- can become a runtime loaded one:
1706
1707    local context      = context
1708    local ctx_NC       = context.NC
1709    local ctx_NR       = context.NR
1710    local ctx_verbatim = context.verbatim
1711
-- Typesets the hyphenation trace of `word` for `language` as a four column
-- tabulate. Nothing is typeset for a missing or empty word, or when
-- traditional.gettrace yields no steps.
--
-- While the trace is collected trace_steps is temporarily set to "silent";
-- the previous value is put back right after the call.
function hyphenators.showhyphenationtrace(language,word)
    if word and word ~= "" then
        local previous = trace_steps
        trace_steps = "silent"
        local trace = traditional.gettrace(language,word)
        trace_steps = previous
        local count = trace and #trace or 0
        if count > 0 then
            context.starttabulate { "|r|l|l|l|" }
            for row=1,count do
                local step = trace[row]
                ctx_NC()
                -- only the rows between the first and the last one get a number
                if row > 1 and row < count then
                    context(row-1)
                end
                ctx_NC()
                ctx_verbatim(step[1])
                ctx_NC()
                ctx_verbatim(step[2])
                ctx_NC()
                ctx_verbatim(step[3])
                ctx_NC()
                ctx_NR()
            end
            context.stoptabulate()
        end
    end
end
1737
1738    implement { -- registers hyphenators.showhyphenationtrace under the same name, taking two string arguments
1739        name      = "showhyphenationtrace",
1740        actions   = hyphenators.showhyphenationtrace,
1741        arguments = "2 strings",
1742    }
1743
1744    function nodes.stripdiscretionaries(head)
        -- Iterates with nexthlist starting at head and, for every item found, iterates
        -- with nextdisc over that item's list, calling remove_node once per visited
        -- node. The head that was passed in is returned as is.
        --
        -- NOTE(review): the call below passes `h`, which is not defined anywhere in this
        -- function, and never uses the loop variable `d`; presumably the list head and
        -- `d` were meant, and the possibly changed list head would have to be stored
        -- back into `l`. Confirm against the signature of remove_node before changing.
1745        for l in nexthlist, head do
1746            for d in nextdisc, getlist(l) do
1747                remove_node(h,false,true)
1748            end
1749        end
1750        return head
1751    end
1752
1753
1754else
1755
1756    -- traditional.loadpatterns("nl","lang-nl")
1757    -- traditional.loadpatterns("de","lang-de")
1758    -- traditional.loadpatterns("us","lang-us")
1759
1760    -- traditional.registerpattern("nl","e1ë",      { start = 1, length = 2, before = "e",  after = "e"  } )
1761    -- traditional.registerpattern("nl","oo7ë",     { start = 2, length = 3, before = "o",  after = "e"  } )
1762    -- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } )
1763
1764    -- local specification = {
1765    --     leftcharmin     = 2,
1766    --     rightcharmin    = 2,
1767    --     leftchar        = "<",
1768    --     rightchar       = ">",
1769    -- }
1770
1771    -- print("reëel",       traditional.injecthyphens(dictionaries.nl,"reëel",       specification),"r{e>}{<e}{eë}el")
1772    -- print("reeëel",      traditional.injecthyphens(dictionaries.nl,"reeëel",      specification),"re{e>}{<e}{eë}el")
1773    -- print("rooëel",      traditional.injecthyphens(dictionaries.nl,"rooëel",      specification),"r{o>}{<e}{ooë}el")
1774
1775    -- print(   "qxcxkq",   traditional.injecthyphens(dictionaries.de,   "qxcxkq",   specification),"")
1776    -- print(  "qqxcxkqq",  traditional.injecthyphens(dictionaries.de,  "qqxcxkqq",  specification),"")
1777    -- print( "qqqxcxkqqq", traditional.injecthyphens(dictionaries.de, "qqqxcxkqqq", specification),"")
1778    -- print("qqqqxcxkqqqq",traditional.injecthyphens(dictionaries.de,"qqqqxcxkqqqq",specification),"")
1779
1780    -- print("kunstmatig",       traditional.injecthyphens(dictionaries.nl,"kunstmatig",       specification),"")
1781    -- print("kunststofmatig",   traditional.injecthyphens(dictionaries.nl,"kunststofmatig",   specification),"")
1782    -- print("kunst[stof]matig", traditional.injecthyphens(dictionaries.nl,"kunst[stof]matig", specification),"")
1783
1784    -- traditional.loadpatterns("us","lang-us")
1785
1786    -- local specification = {
1787    --     leftcharmin     = 2,
1788    --     rightcharmin    = 2,
1789    --     leftchar        = false,
1790    --     rightchar       = false,
1791    -- }
1792
1793    -- trace_steps = true
1794
1795    -- print("components",    traditional.injecthyphens(dictionaries.us,"components", specification),"")
1796    -- print("single",        traditional.injecthyphens(dictionaries.us,"single",     specification),"sin-gle")
1797    -- print("everyday",      traditional.injecthyphens(dictionaries.us,"everyday",   specification),"every-day")
1798    -- print("associate",     traditional.injecthyphens(dictionaries.us,"associate",     specification),"as-so-ciate")
1799    -- print("philanthropic", traditional.injecthyphens(dictionaries.us,"philanthropic", specification),"phil-an-thropic")
1800    -- print("projects",      traditional.injecthyphens(dictionaries.us,"projects",      specification),"projects")
1801    -- print("Associate",     traditional.injecthyphens(dictionaries.us,"Associate",     specification),"As-so-ciate")
1802    -- print("Philanthropic", traditional.injecthyphens(dictionaries.us,"Philanthropic", specification),"Phil-an-thropic")
1803    -- print("Projects",      traditional.injecthyphens(dictionaries.us,"Projects",      specification),"Projects")
1804
1805end
1806
1807