lang-hyp.lmt /size: 68 Kb    last modification: 2025-02-21 11:03
1if not modules then modules = { } end modules ['lang-hyp'] = {
2    version   = 1.001,
3    comment   = "companion to lang-ini.mkiv",
4    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
5    copyright = "PRAGMA ADE / ConTeXt Development Team",
6    license   = "see context related readme files"
7}
8
9-- This one needs checking wrt luametatex additions!
10
-- In an automated workflow hyphenation of long titles can be somewhat problematic
12-- especially when demands conflict. For that reason I played a bit with a Lua based
13-- variant of the traditional hyphenation machinery. This mechanism has been extended
14-- several times in projects, of which a good description can be found in TUGboat,
15-- Volume 27 (2006), No. 2 — Proceedings of EuroTEX2006: Automatic non-standard
16-- hyphenation in OpenOffice.org by László Németh.
17--
18-- Being the result of two days experimenting the following implementation is probably
19-- not completely okay yet. If there is demand I might add some more features and plugs.
-- The performance is quite okay but can probably be improved a bit, although this is not
-- the most critical code. For instance, on a metafun manual run the overhead is about
22-- 0.3 seconds on 19 seconds which is not that bad.
23--
-- In the process of wrapping up (for the ctx conference proceedings) I cleaned up
25-- and extended the code a bit. It can be used in production.
26--
27-- . a l g o r i t h m .
28--    4l1g4
29--     l g o3
30--      1g o
31--            2i t h
32--                4h1m
33-- ---------------------
34--    4 1 4 3 2 0 4 1
35--   a l-g o-r i t h-m
36
37-- . a s s z o n n y a l .
38--     s1s z/sz=sz,1,3
39--             n1n y/ny=ny,1,3
40-- -----------------------
41--    0 1 0 0 0 1 0 0 0/sz=sz,2,3,ny=ny,6,3
42--   a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3
43--
-- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length)
45--
46-- todo  : support hjcodes (<32 == length) like luatex does now (no need/demand so far)
-- maybe : support hyphenation over range (can already be done using attributes/language)
48-- maybe : reset dictionary.hyphenated when a pattern is added and/or forced reset option
49-- todo  : check subtypes (because they have subtle meanings in the line breaking)
50--
51-- word start (in tex engine):
52--
53-- boundary  : yes when wordboundary
54-- hlist     : when hyphenationbounds 1 or 3
55-- vlist     : when hyphenationbounds 1 or 3
56-- rule      : when hyphenationbounds 1 or 3
57-- dir       : when hyphenationbounds 1 or 3
58-- whatsit   : when hyphenationbounds 1 or 3
59-- glue      : yes
60-- math      : skipped
61-- glyph     : exhyphenchar (one only) : yes (so no -- ---)
62-- otherwise : yes
63--
64-- word end (in tex engine):
65--
66-- boundary  : yes
67-- glyph     : yes when different language
68-- glue      : yes
69-- penalty   : yes
70-- kern      : yes when not italic (for some historic reason)
71-- hlist     : when hyphenationbounds 2 or 3
72-- vlist     : when hyphenationbounds 2 or 3
73-- rule      : when hyphenationbounds 2 or 3
74-- dir       : when hyphenationbounds 2 or 3
75-- whatsit   : when hyphenationbounds 2 or 3
76-- ins       : when hyphenationbounds 2 or 3
77-- adjust    : when hyphenationbounds 2 or 3
78
79local type, rawget, rawset, tonumber, next = type, rawget, rawset, tonumber, next
80
81local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs
82local lpegmatch = lpeg.match
83
84local context    = context
85
86local concat     = table.concat
87local insert     = table.insert
88local remove     = table.remove
89local formatters = string.formatters
90local utfchar    = utf.char
91local utfbyte    = utf.byte
92
93if not characters then
94    require("char-ini")
95end
96
97local setmetatableindex = table.setmetatableindex
98
99-- \enabletrackers[hyphenator.steps=silent] will not write to the terminal
100
-- Tracing switches; they are flipped by the tracker callbacks registered below.

local trace_steps     = false
local trace_visualize = false

trackers.register("hyphenator.steps",function(v)
    trace_steps = v
end)

trackers.register("hyphenator.visualize",function(v)
    trace_visualize = v
end)

local report = logs.reporter("hyphenator")

-- Fall back to a function that does nothing when there is no interface layer.

local implement = interfaces and interfaces.implement
if not implement then
    implement = function() end
end

-- Hook into the (possibly already existing) languages namespace.

languages = languages or { }

local hyphenators = languages.hyphenators or { }
local traditional = hyphenators.traditional or { }

languages.hyphenators   = hyphenators
hyphenators.traditional = traditional
113
-- Dictionaries are created on first access: indexing an unknown language stores
-- and returns a fresh, empty and not yet loaded dictionary.

local dictionaries = setmetatableindex(function(store,language)
    local dictionary = {
        patterns   = { },
        hyphenated = { },
        specials   = { },
        exceptions = { },
        loaded     = false,
    }
    store[language] = dictionary
    return dictionary
end)

hyphenators.dictionaries = dictionaries
127
-- Building blocks for parsing pattern and exception strings. A pattern looks like
-- "4l1g4" (digits are weights between characters, "." is a word fence) and can
-- carry a replacement specification after a slash, as in "s1sz/sz=sz,1,3".

local character      = lpeg.patterns.utf8character
local digit          = R("09")
local weight         = digit/tonumber + Cc(0) -- a missing weight counts as zero
local fence          = P(".")
local hyphen         = P("-")
local space          = P(" ")
local char           = character - space
local validcharacter = character - S("./")
local keycharacter   = character - P("/")
----- basepart       = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1)

-- The optional "/before=after,start,length" suffix is folded (using rawset) into
-- a table with the fields before and after and, when given, start and length.

local specpart = (
    P("/") * Cf (
        Ct("") *
        Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) *
        Cg ( Cc("after")  * C((1-P(","))^1) ) *
        (
            P(",") *
            Cg ( Cc("start")  * ((1-P(","))^1/tonumber) * P(",") ) *
            Cg ( Cc("length") * ((1-P(-1) )^1/tonumber) )
        )^-1,
        rawset
    )
)^-1

-- A hash key is the string with the weights (patterns) or hyphens (exceptions)
-- removed; scanning stops at the slash that starts a specification.

local make_hashkey_p = Cs((digit/"" + keycharacter)^1)
----- make_pattern_p = basepart * specpart
local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1)

-- An exception becomes a list of booleans, one for each character after the first:
-- true when that character was preceded by a hyphen (catch . and char after -).

local make_pattern_e = Ct(char * (hyphen * Cc(true) * char + char * Cc(false))^1)
151
152-- local make_hashkey_c = Cs((digit + keycharacter/"")^1)
153-- local make_pattern_c = Ct((P(1)/tonumber)^1)
154
155-- local cache = setmetatableindex(function(t,k)
156--     local n = lpegmatch(make_hashkey_c,k)
157--     local v = lpegmatch(make_pattern_c,n)
158--     t[k] = v
159--     return v
160-- end)
161--
162-- local weight_n       = digit + Cc("0")
163-- local basepart_n     = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache
164-- local make_pattern_n = basepart_n * specpart
165
-- Converts a string of digits into a list of numeric weights.

local make_pattern_c = Ct((P(1)/tonumber)^1)

-- Patterns with the same weights share one table that is created on first use and
-- stored under the digit string. us + nl: 17664 entries -> 827 unique (saves some 3M)

local cache = setmetatableindex(function(shared,digits)
    local weights = lpegmatch(make_pattern_c,digits)
    shared[digits] = weights
    return weights
end)

-- The pattern is rewritten to digits only: characters are dropped while a missing
-- weight and a fence each become "0"; that string is then looked up in the cache.

local weight_n       = digit + Cc("0")
local fence_n        = fence / "0"
local char_n         = validcharacter / ""
local basepart_n     = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache
local make_pattern_n = basepart_n * specpart
181
-- Store one pattern string: the weights go into patterns and a replacement
-- specification (the explicit one wins over one parsed from the string) goes
-- into specials, both under the pattern with its digits stripped.

local function register_pattern(patterns,specials,str,specification)
    local hashkey         = lpegmatch(make_hashkey_p,str)
    local weights, parsed = lpegmatch(make_pattern_n,str)
    patterns[hashkey] = weights -- is this key still ok for complex patterns
    local special = specification or parsed
    if special then
        specials[hashkey] = special
    end
end
193
-- Remove the weights and the specification that were stored for a pattern string.

local function unregister_pattern(patterns,specials,str)
    local hashkey = lpegmatch(make_hashkey_p,str)
    specials[hashkey] = nil
    patterns[hashkey] = nil
end
199
local p_lower = lpeg.patterns.utf8lower

-- Store one exception (a word with explicit hyphens): the string is passed through
-- p_lower first, the key is the result without hyphens and the value is the list of
-- break flags. The specification argument is accepted but not used.

local function register_exception(exceptions,str,specification)
    local lowered = lpegmatch(p_lower,str)
    local hashkey = lpegmatch(make_hashkey_e,lowered)
    local breaks  = lpegmatch(make_pattern_e,lowered)
    exceptions[hashkey] = breaks
end
208
-- p_split cuts a string into a list of utf characters. The other two run over a blob
-- of data: every run of non-space characters is registered (the target tables come
-- in as extra match arguments) and any other character is skipped.

local p_split     = Ct(C(character)^1)
local p_pattern   = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern   + P(1))^1
local p_exception = ((Carg(1)           * C(char^1)) / register_exception + P(1))^1
212
-- Load the patterns and exceptions of a language from a Lua file (once).
--
-- language : key into the dictionaries table
-- filename : optional file name, defaults to "lang-<language>"; the suffix "lua"
--            is added with file.addsuffix
--
-- Returns the dictionary, which is marked as loaded even when no file was found so
-- that the lookup is not repeated. The hyphen minima are only taken over when the
-- specification has a patterns table; reading them unconditionally failed with an
-- index error for files that only provide exceptions.

function traditional.loadpatterns(language,filename)
    local dictionary = dictionaries[language]
    if dictionary.loaded then
        return dictionary
    end
    if not filename or filename == "" then
        filename = "lang-" .. language
    end
    filename = file.addsuffix(filename,"lua")
    local fullname = resolvers.findfile(filename)
    if fullname and fullname ~= "" then
        local specification = dofile(fullname)
        if specification then
            local patterns = specification.patterns
            if patterns then
                local data = patterns.data
                if data and data ~= "" then
                    lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
                end
                dictionary.lefthyphenmin  = patterns.lefthyphenmin
                dictionary.righthyphenmin = patterns.righthyphenmin
            end
            local exceptions = specification.exceptions
            if exceptions then
                local data = exceptions.data
                if data and data ~= "" then
                    lpegmatch(p_exception,data,1,dictionary.exceptions)
                end
            end
        end
    end
    dictionary.loaded = true
    return dictionary
end
246
local lcchars    = characters.lcchars
local uccodes    = characters.uccodes
local categories = characters.categories

-- counters: words passed to the hyphenator and results stored in a cache

local nofwords   = 0
local nofhashed  = 0

-- the steps of the last traced word and the formatter for one (indented) step

local steps
local f_show     = formatters["%w%s"]
255
-- Report the collected steps, but only when the tracker value is exactly true (so
-- not for a value like "silent"). The columns are aligned on the first step.

local function show_log()
    if trace_steps ~= true then
        return
    end
    report()
    local width = #steps[1][1]
    for index=1,#steps do
        local step  = steps[index]
        local first = step[1]
        report("%s%w%S  %S",first,width - #first + 3,step[2] or first,step[3] or "")
    end
    report()
end
267
-- Start a new trace: the first step shows the (fenced) word with its characters
-- separated by spaces, in both columns.

local function show_1(wsplit)
    local spaced = f_show(0,concat(wsplit," "))
    steps = { { spaced, spaced } }
end
272
-- Add a trace step for one matching pattern.
--
-- c      : the matched snippet
-- m      : the weights of the pattern
-- wsplit : the fenced word as a list of characters
-- done   : the accumulated weights so far
-- i      : the position in wsplit where the snippet starts
-- spec   : optional replacement specification that belongs to the pattern
--
-- A step has three columns: the pattern with its weights interleaved (indented so
-- that it lines up with the word), the accumulated weights, and the specification
-- as text. The specification used to be passed as an extra argument to f_show,
-- which only has two directives, so it was dropped and the third column (read by
-- show_log) stayed empty.

local function show_2(c,m,wsplit,done,i,spec)
    local chars = lpegmatch(p_split,c)
    local mixed = { }
    for j=1,#m do
        mixed[#mixed+1] = m[j]
        mixed[#mixed+1] = chars[j] -- there is one weight more than there are characters
    end
    local indent  = 2*i-2
    local detail  = spec and table.sequenced(spec) or ""
    local weights = f_show(1,concat(done," ",2,#done))
    local pattern
    if indent == 0 then
        -- the pattern starts at the left fence: skip the weight before it
        pattern = f_show(indent,concat(mixed,"",2))
    elseif i+1 == #wsplit then
        -- the pattern starts at the last character: drop the last entry
        pattern = f_show(indent-1,concat(mixed,"",1,#mixed-1))
    else
        pattern = f_show(indent-1,concat(mixed))
    end
    steps[#steps+1] = { pattern, weights, detail }
end
293
-- Add the final trace step and report the log: the first column shows the word with
-- the resulting weights in between the characters, the second one shows the word
-- with a hyphen at each odd weight (but never right after the left fence).

local function show_3(wsplit,done)
    local hyphenated = { }
    local weighted   = { }
    for index=1,#wsplit do
        if index > 1 then
            local weight    = done[index]
            local separator = " "
            if index > 2 and weight % 2 == 1 then
                separator = "-"
            end
            hyphenated[#hyphenated+1] = separator
            weighted  [#weighted  +1] = weight
        end
        local character = wsplit[index]
        hyphenated[#hyphenated+1] = character
        weighted  [#weighted  +1] = character
    end
    steps[#steps+1] = { f_show(0,concat(weighted)), f_show(0,concat(hyphenated)) }
    show_log()
end
311
-- Trace a word that was resolved by an exception: the log has just one step, the
-- spaced out (fenced) word. The second argument is not used.

local function show_4(wsplit,done)
    local spaced = concat(wsplit," ")
    steps = { { spaced } }
    show_log()
end
316
-- Gives access to the steps collected for the most recently traced word (nil when
-- nothing has been traced yet).

function traditional.lasttrace()
    local last = steps
    return last
end
320
321-- We could reuse the w table but as we cache the resolved words there is not much gain in
322-- that complication.
323--
-- Beware: word can be a table and when n is passed too we can assume reuse so we need to
-- honor that n then.
326--
-- todo: a fast variant for tex ... less lookups (we could check if dictionary has changed)
328-- ... although due to caching the already done words, we don't do much here
329
-- Determine the break points of a word with the patterns, specials and exceptions
-- of a dictionary.
--
-- dictionary : table with patterns, specials, exceptions and hyphenated (the cache)
-- word       : an utf string or a table with one character per slot
-- n          : optional number of slots of word that make up the word
--
-- The result is false when there is no break point. Otherwise it is a table indexed
-- by character position with false (no break), true (a plain break) or a table
-- { special, offset } when a replacement specification applies. For a word that is
-- listed as exception the registered list of break flags is returned. Results are
-- stored in dictionary.hyphenated under the word as passed; a result that is known
-- for the lowercased word is reused.

local function hyphenate(dictionary,word,n) -- odd is okay
    nofwords = nofwords + 1
    local hyphenated = dictionary.hyphenated
    local isstring   = type(word) == "string"
    -- the cache key is the word as it was passed (for a table: the first n slots)
    local key
    if isstring then
        key = word
    elseif n then
        key = concat(word,"",1,n)
    else
        key = concat(word)
    end
    local known = hyphenated[key]
    if known ~= nil then
        return known
    end
    if isstring then
        word = lpegmatch(p_split,word)
    end
    if not n then
        n = #word
    end
    -- lowercase the characters and put them between fences (a variant that steps by
    -- dictionary.codehash[c] instead of by one is disabled)
    local fenced = { "." }
    for index=1,n do
        local original = word[index]
        fenced[index+1] = lcchars[original] or original
    end
    local size = n + 2
    fenced[size] = "."
    local lowered = concat(fenced,"",2,size-1)
    --
    local done = hyphenated[lowered]
    if done ~= nil then
        hyphenated[key] = done
        nofhashed = nofhashed + 1
        return done
    end
    --
    local exception = dictionary.exceptions[lowered]
    if exception then
        if trace_steps then
            show_4(fenced,exception)
        end
        hyphenated[key] = exception
        nofhashed = nofhashed + 1
        return exception
    end
    --
    if trace_steps then
        show_1(fenced)
    end
    --
    local specials = dictionary.specials
    local patterns = dictionary.patterns
    --
    local spec
    for first=1,size do
        for last=first,size do
            local snippet = concat(fenced,"",first,last)
            local weights = patterns[snippet]
            if weights then
                local special = specials[snippet]
                if not done then
                    -- the string that we resolve has explicit fences (.) so done starts
                    -- at the first fence and runs upto the last one
                    done = { }
                    for slot=1,size do
                        done[slot] = 0
                    end
                end
                -- a pattern always has a (zero) value for each character plus one more
                -- as we look at both sides; the highest weight per position wins
                for k=1,#weights do
                    local new = weights[k]
                    if not new then
                        break
                    elseif new == true then
                        report("fatal error")
                        break
                    elseif new > 0 then
                        local pos = first + k - 1
                        local old = done[pos]
                        if old and new > old then
                            done[pos] = new
                            if special then
                                -- remember the specification when this position falls
                                -- inside the range that it replaces
                                local b = first + (special.start or 1) - 1
                                if b > 0 then
                                    local e = b + (special.length or 2) - 1
                                    if e > 0 and pos >= b and pos <= e then
                                        if not spec then
                                            spec = { }
                                        end
                                        spec[pos] = { special, k - 1 }
                                    end
                                end
                            end
                        end
                    end
                end
                if trace_steps then
                    show_2(snippet,weights,fenced,done,first,special)
                end
            end
        end
    end
    local result = false
    if done then
        if trace_steps then
            show_3(fenced,done)
        end
        -- odd weights are break points; the weights are replaced in place by the
        -- result, shifted by two so that index 1 belongs to the first character
        local count = #done
        local found = false
        for index=3,count do
            if done[index] % 2 == 1 then
                local special = spec and spec[index]
                done[index-2] = special or true
                found = true
            else
                done[index-2] = false
            end
        end
        if found then
            -- wipe the two slots that are left over after shifting
            done[count]   = nil
            done[count-1] = nil
            result = done
        end
    end
    hyphenated[key] = result
    nofhashed = nofhashed + 1
    return result
end
483
-- Hyphenate a word again (its cached result is wiped first) and return the collected
-- trace steps. Nothing is returned for a missing or empty word.

function traditional.gettrace(language,word)
    if word and word ~= "" then
        local dictionary = dictionaries[language]
        if dictionary then
            dictionary.hyphenated[word] = nil
            hyphenate(dictionary,word)
            return steps
        end
    end
end
496
-- Hyphenation methods by name: an unknown name resolves to (and from then on is
-- stored as) the default hyphenate function.

local methods = setmetatableindex(function(t,name)
    t[name] = hyphenate
    return hyphenate
end)

-- Register a method under a name; a name that is already taken is not replaced,
-- which is reported.

function traditional.installmethod(name,f)
    if not rawget(methods,name) then
        methods[name] = f
    else
        report("overloading %a is not permitted",name)
    end
end
506
-- How a break point ends up in the result string: as a plain hyphen or as a
-- {pre}{post}{replace} triplet when explicit hyphen characters are given.

local s_detail_1 = "-"
local f_detail_2 = formatters["%s-%s"]
local f_detail_3 = formatters["{%s}{%s}{}"]
local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"]

-- Return the word with its break points made visible.
--
-- dictionary    : the dictionary that is passed on to hyphenate
-- word          : an utf string; false is returned when there is none
-- specification : table with the optional fields leftcharmin (default 2),
--                 rightcharmin (defaults to the left value), leftchar and rightchar;
--                 without a specification the word is returned as it is
--
-- Breaks are only shown from position leftcharmin upto the position that keeps
-- rightcharmin characters at the end. The word itself is returned when it has no
-- break points.

function traditional.injecthyphens(dictionary,word,specification)
    if not word then
        return false
    end
    if not specification then
        return word
    end
    local hyphens = hyphenate(dictionary,word)
    if not hyphens then
        return word
    end

    -- the following code is similar to code later on but here we have strings while
    -- there we have hyphen specs

    local chars     = lpegmatch(p_split,word)
    local size      = #chars

    local leftmin   = specification.leftcharmin or 2
    local rightmin  = size - (specification.rightcharmin or leftmin)
    local leftchar  = specification.leftchar
    local rightchar = specification.rightchar
    local explicit  = leftchar and rightchar

    local result    = { }
    local rsize     = 0
    local position  = 1

    while position <= size do
        -- outside the window a break is never looked at
        local hyphen = position >= leftmin and position <= rightmin and hyphens[position]
        if hyphen == true then
            -- a plain break after this character
            result[rsize+1] = chars[position]
            result[rsize+2] = explicit and f_detail_3(rightchar,leftchar) or s_detail_1
            rsize    = rsize + 2
            position = position + 1
        elseif hyphen then
            -- a break with a replacement: either { special, offset } or the special
            -- itself, in which case the offset is one
            local offset  = hyphen[2]
            local special = hyphen
            if offset then
                special = hyphen[1]
            else
                offset = 1
            end
            local b = position - offset + (special.start  or 1)
            local e = b + (special.length or 2) - 1
            if b > 0 and e >= b then
                -- copy what comes before the replaced range
                for index=position,b-1 do
                    rsize = rsize + 1
                    result[rsize] = chars[index]
                end
                rsize = rsize + 1
                if explicit then
                    result[rsize] = f_detail_4(special.before,rightchar,leftchar,special.after,concat(chars,"",b,e))
                else
                    result[rsize] = f_detail_2(special.before,special.after)
                end
                position = e + 1
            else
                -- error: an impossible range, so just keep the character
                rsize = rsize + 1
                result[rsize] = chars[position]
                position = position + 1
            end
        else
            rsize = rsize + 1
            result[rsize] = chars[position]
            position = position + 1
        end
    end
    return concat(result)
end
594
do

    -- A registration string can hold several space separated entries; each entry is
    -- handed to the (un)register function, preceded by the tables that are passed as
    -- extra match arguments.

    local spaces    = space^1
    local entry     = C((1-space)^1)

    local u_pattern = ((Carg(1) * Carg(2) * entry)           / unregister_pattern + spaces)^1
    local r_pattern = ((Carg(1) * Carg(2) * entry * Carg(3)) / register_pattern   + spaces)^1
    local e_pattern = ((Carg(1)           * entry)           / register_exception + spaces)^1

    -- Add patterns to the dictionary of a language, or remove them when specification
    -- is false. A table is stored as the replacement specification of each entry, any
    -- other value is ignored.

    function traditional.registerpattern(language,str,specification)
        local dictionary = dictionaries[language]
        local patterns   = dictionary.patterns
        local specials   = dictionary.specials
        if specification == false then
            lpegmatch(u_pattern,str,1,patterns,specials)
        else
            local special = type(specification) == "table" and specification or false
            lpegmatch(r_pattern,str,1,patterns,specials,special)
        end
    end

    -- Add exceptions (words with explicit hyphens) to the dictionary of a language.

    function traditional.registerexception(language,str)
        local exceptions = dictionaries[language].exceptions
        lpegmatch(e_pattern,str,1,exceptions)
    end

end
620
621-- todo: unicodes or utfhash ?
622
623if context then
624
625    local nodecodes          = nodes.nodecodes
626    local disccodes          = nodes.disccodes
627
628    local glyph_code         <const> = nodecodes.glyph
629    local disc_code          <const> = nodecodes.disc
630    local math_code          <const> = nodecodes.math
631    local hlist_code         <const> = nodecodes.hlist
632
633    local automaticdisc_code <const> = disccodes.automatic
634    local regulardisc_code   <const> = disccodes.regular
635
636    local nuts               = nodes.nuts
637    local tonode             = nodes.tonode
638    local nodepool           = nuts.pool
639
640    local new_disc           = nodepool.disc
641    local new_penalty        = nodepool.penalty
642
643    local getfield           = nuts.getfield
644    local getfont            = nuts.getfont
645    local getid              = nuts.getid
646    local getattr            = nuts.getattr
647    local getnext            = nuts.getnext
648    local getprev            = nuts.getprev
649    local getsubtype         = nuts.getsubtype
650    local getlist            = nuts.getlist
651    local getlanguage        = nuts.getlanguage
652    local setattrlist        = nuts.setattrlist
653    local isglyph            = nuts.isglyph
654    local ischar             = nuts.ischar
655
656    local setchar            = nuts.setchar
657    local setdisc            = nuts.setdisc
658    local setlink            = nuts.setlink
659    local setprev            = nuts.setprev
660    local setnext            = nuts.setnext
661
662    local insertbefore       = nuts.insertbefore
663    local insertafter        = nuts.insertafter
664    local copy_node          = nuts.copy
665    local copylist           = nuts.copylist
666    local remove_node        = nuts.remove
667    local endofmath          = nuts.endofmath
668    local node_tail          = nuts.tail
669
670    local nexthlist          = nuts.traversers.hlist
671    local nextdisc           = nuts.traversers.disc
672
673    local setcolor           = nodes.tracers.colors.set
674
675    local variables          = interfaces.variables
676    local v_reset            <const> = variables.reset
677    local v_yes              <const> = variables.yes
678    local v_word             <const> = variables.word
679    local v_all              <const> = variables.all
680
681    local settings_to_array  = utilities.parsers.settings_to_array
682
683    local texsetattribute    = tex.setattribute
684
685    local prehyphenchar      = language.prehyphenchar
686    local posthyphenchar     = language.posthyphenchar
687    local preexhyphenchar    = language.preexhyphenchar
688    local postexhyphenchar   = language.postexhyphenchar
689
690    local a_hyphenation      <const> = attributes.private("hyphenation")
691    local unsetvalue         <const> = attributes.unsetvalue
692
693    local interwordpenalty   = 5000
694
695    function traditional.loadpatterns(language)
696        return dictionaries[language]
697    end
698
699    -- for the moment we use an independent data structure
700
701    setmetatableindex(dictionaries,function(t,k)
702        if type(k) == "string" then
703            -- this will force a load if not yet loaded (we need a nicer way) for the moment
704            -- that will do (nneeded for examples that register a pattern specification
705            languages.getnumber(k)
706        end
707        local specification = languages.getdata(k)
708        local dictionary = {
709            patterns   = { },
710            exceptions = { },
711            hyphenated = { },
712            specials   = { },
713            instance   = false,
714            characters = { },
715            unicodes   = { },
716        }
717        if specification then
718            local resources = specification.resources
719            if resources then
720                local characters = dictionary.characters or { }
721                local unicodes   = dictionary.unicodes   or { }
722                for i=1,#resources do
723                    local r = resources[i]
724                    if not r.in_dictionary then
725                        r.in_dictionary = true
726                        local patterns = r.patterns
727                        if patterns then
728                            local data = patterns.data
729                            if data then
730                                -- regular patterns
731                                lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
732                            end
733                            local extra = patterns.extra
734                            if extra then
735                                -- special patterns
736                                lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials)
737                            end
738                        end
739                        local exceptions = r.exceptions
740                        if exceptions then
741                            local data = exceptions.data
742                            if data and data ~= "" then
743                                lpegmatch(p_exception,data,1,dictionary.exceptions)
744                            end
745                        end
746                        local usedchars  = lpegmatch(p_split,patterns.characters)
747                        for i=1,#usedchars do
748                            local char  = usedchars[i]
749                            local code  = utfbyte(char)
750                            local upper = uccodes[code]
751                            characters[char]  = code
752                            unicodes  [code]  = char
753                            if type(upper) == "table" then
754                                for i=1,#upper do
755                                    local u = upper[i]
756                                    unicodes[u] = utfchar(u)
757                                end
758                            else
759                                unicodes[upper] = utfchar(upper)
760                            end
761                        end
762                    end
763                end
764                dictionary.characters = characters
765                dictionary.unicodes   = unicodes
766                setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end)
767            end
768            t[specification.number] = dictionary
769            dictionary.instance = specification.instance -- needed for hyphenchars
770        end
771        t[k] = dictionary
772        return dictionary
773    end)
774
    -- Beware: left and right min doesn't mean that in a 1 mm hsize there can be no snippets
    -- with less characters than either of them! This could be an option but such a narrow
    -- hsize doesn't make sense anyway.
778
    -- We assume that featuresets are defined global ... local definitions (also mid paragraph)
    -- do not make much sense anyway. For the moment we assume no predefined sets so we don't
    -- need to store them. Nor do we need to hash them in order to save space ... no sane user
    -- will define many of them.
783
784    local featuresets       = hyphenators.featuresets or { }
785    hyphenators.featuresets = featuresets
786
787    storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0
788
789    local noffeaturesets = storage.shared.noflanguagesfeaturesets
790
791    storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets")
792
793    ----- hash = table.sequenced(featureset,",") -- no need now
794
795    local function register(name,featureset)
796        noffeaturesets = noffeaturesets + 1
797        featureset.attribute = noffeaturesets
798        featuresets[noffeaturesets] = featureset  -- access by attribute
799        featuresets[name] = featureset            -- access by name
800        storage.shared.noflanguagesfeaturesets = noffeaturesets
801        return noffeaturesets
802    end
803
804    local function makeset(...)
805        -- a bit overkill, supporting variants but who cares
806        local set = { }
807        for i=1,select("#",...) do
808            local list = select(i,...)
809            local kind = type(list)
810            local used = nil
811            if kind == "string" then
812                if list == v_all then
813                    -- not ok ... now all get ignored
814                    return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end)
815                elseif list ~= "" then
816                    used = lpegmatch(p_split,list)
817                    set  = set or { }
818                    for i=1,#used do
819                        local char = used[i]
820                        set[utfbyte(char)] = char
821                    end
822                end
823            elseif kind == "table" then
824                if next(list) then
825                    set = set or { }
826                    for byte, char in next, list do
827                        set[byte] = char == true and utfchar(byte) or char
828                    end
829                elseif #list > 0 then
830                    set = set or { }
831                    for i=1,#list do
832                        local l = list[i]
833                        if type(l) == "number" then
834                            set[l] = utfchar(l)
835                        else
836                            set[utfbyte(l)] = l
837                        end
838                    end
839                end
840            end
841        end
842        return set
843    end
844
    -- category pd (tex also sees --- and -- as hyphens but do we really want that)
846
847    local defaulthyphens = {
           -- The set that definefeatures uses when the 'hyphens' key of a featureset
           -- equals v_yes. The keys are code points; the value is either true or a
           -- code point. NOTE(review): judging from how makeset and the hyphenator
           -- consume these, a code point value is the character to use instead of
           -- the key itself -- confirm before adding entries. The entries that are
           -- commented out are candidates that are not enabled.
848        [0x002D] = true,   -- HYPHEN-MINUS
849        [0x00AD] = 0x002D, -- SOFT HYPHEN (active in ConTeXt)
850     -- [0x058A] = true,   -- ARMENIAN HYPHEN
851     -- [0x1400] = true,   -- CANADIAN SYLLABICS HYPHEN
852     -- [0x1806] = true,   -- MONGOLIAN TODO SOFT HYPHEN
853        [0x2010] = true,   -- HYPHEN
854     -- [0x2011] = true,   -- NON-BREAKING HYPHEN
855     -- [0x2012] = true,   -- FIGURE DASH
856        [0x2013] = true,   -- EN DASH
857        [0x2014] = true,   -- EM DASH
858     -- [0x2015] = true,   -- HORIZONTAL BAR
859     -- [0x2027] = true,   -- HYPHENATION POINT
860     -- [0x2E17] = true,   -- DOUBLE OBLIQUE HYPHEN
861     -- [0x2E1A] = true,   -- HYPHEN WITH DIAERESIS
862     -- [0x2E3A] = true,   -- TWO-EM DASH
863     -- [0x2E3B] = true,   -- THREE-EM DASH
864     -- [0x2E40] = true,   -- DOUBLE HYPHEN
865     -- [0x301C] = true,   -- WAVE DASH
866     -- [0x3030] = true,   -- WAVY DASH
867     -- [0x30A0] = true,   -- KATAKANA-HIRAGANA DOUBLE HYPHEN
868     -- [0xFE31] = true,   -- PRESENTATION FORM FOR VERTICAL EM DASH
869     -- [0xFE32] = true,   -- PRESENTATION FORM FOR VERTICAL EN DASH
870     -- [0xFE58] = true,   -- SMALL EM DASH
871     -- [0xFE63] = true,   -- SMALL HYPHEN-MINUS
872     -- [0xFF0D] = true,   -- FULLWIDTH HYPHEN-MINUS
873    }
874
875    local defaultjoiners = {
           -- The set that definefeatures uses when the 'joiners' key of a featureset
           -- equals v_yes; it is merged into the extrachars of that featureset.
876        [0x200C] = true, -- ZERO WIDTH NON-JOINER (zwnj)
877        [0x200D] = true, -- ZERO WIDTH JOINER (zwj)
878    }
879
880    local function somehyphenchar(c)
881        c = tonumber(c)
882        return c ~= 0 and c or nil
883    end
884
885    local function definefeatures(name,featureset)
886        local extrachars   = featureset.characters -- "[]()"
887        local hyphenchars  = featureset.hyphens
888        local joinerchars  = featureset.joiners
889        local alternative  = featureset.alternative
890        local rightwordmin = tonumber(featureset.rightwordmin)
891        local charmin      = tonumber(featureset.charmin) -- luatex now also has hyphenationmin
892        local leftcharmin  = tonumber(featureset.leftcharmin)
893        local rightcharmin = tonumber(featureset.rightcharmin)
894        local leftchar     = somehyphenchar(featureset.leftchar)
895        local rightchar    = somehyphenchar(featureset.rightchar)
896        local rightchars   = featureset.rightchars
897local rightedge    = featureset.rightedge
898local autohyphen   = v_yes -- featureset.autohyphen -- insert disc
899local hyphenonly   = v_yes -- featureset.hyphenonly -- don't hyphenate around
900        rightchars  = rightchars  == v_word and true           or tonumber(rightchars)
901        joinerchars = joinerchars == v_yes  and defaultjoiners or joinerchars -- table
902        hyphenchars = hyphenchars == v_yes  and defaulthyphens or hyphenchars -- table
903        -- not yet ok: extrachars have to be ignored  so it cannot be all)
904        featureset.extrachars   = makeset(joinerchars or "",extrachars or "")
905        featureset.hyphenchars  = makeset(hyphenchars or "")
906        featureset.alternative  = alternative or "hyphenate"
907        featureset.rightwordmin = rightwordmin and rightwordmin > 0 and rightwordmin or nil
908        featureset.charmin      = charmin      and charmin      > 0 and charmin      or nil
909        featureset.leftcharmin  = leftcharmin  and leftcharmin  > 0 and leftcharmin  or nil
910        featureset.rightcharmin = rightcharmin and rightcharmin > 0 and rightcharmin or nil
911        featureset.rightchars   = rightchars
912        featureset.leftchar     = leftchar
913        featureset.rightchar    = rightchar
914     -- featureset.strict       = rightedge  == "tex"
915featureset.autohyphen   = autohyphen == v_yes
916featureset.hyphenonly   = hyphenonly == v_yes
917        return register(name,featureset)
918    end
919
920    local function setfeatures(n)
921        if not n or n == v_reset then
922            n = false
923        else
924            local f = featuresets[n]
925            if not f and type(n) == "string" then
926                local t = settings_to_array(n)
927                local s = { }
928                for i=1,#t do
929                    local ti = t[i]
930                    local fs = featuresets[ti]
931                    if fs then
932                        for k, v in next, fs do
933                            s[k] = v
934                        end
935                    end
936                end
937                n = register(n,s)
938            else
939                n = f and f.attribute
940            end
941        end
942        texsetattribute(a_hyphenation,n or unsetvalue)
943    end
944
945    traditional.definefeatures = definefeatures -- also reachable from other Lua code
946    traditional.setfeatures    = setfeatures
947
       -- The implement calls below tie a name to the Lua function given as 'actions'
       -- and describe the arguments that it receives. NOTE(review): presumably these
       -- names become the commands that the TeX end calls, and entries without an
       -- explicit type (like "characters") are passed as strings -- confirm against
       -- the definition of implement.
       --
       -- definehyphenationfeatures: a name plus a key/value specification; the keys
       -- match the fields that definefeatures reads from its second argument.
948    implement {
949        name      = "definehyphenationfeatures",
950        actions   = definefeatures,
951        arguments = {
952            "string",
953            {
954                { "characters" },
955                { "hyphens" },
956                { "joiners" },
957                { "rightchars" },
958                { "rightwordmin", "integer" },
959                { "charmin", "integer" },
960                { "leftcharmin", "integer" },
961                { "rightcharmin", "integer" },
962                { "leftchar", "integer" },
963                { "rightchar", "integer" },
964                { "alternative" },
965                { "rightedge" },
966            }
967        }
968    }
969
       -- sethyphenationfeatures: one string, handled by setfeatures (a name, a list
       -- of names or the reset keyword).
970    implement {
971        name      = "sethyphenationfeatures",
972        actions   = setfeatures,
973        arguments = "string"
974    }
975
       -- registerhyphenationpattern: two strings and a boolean that are handed to
       -- traditional.registerpattern (defined elsewhere in this file).
976    implement {
977        name      = "registerhyphenationpattern",
978        actions   = traditional.registerpattern,
979        arguments = { "string",  "string",  "boolean" }
980    }
981
       -- registerhyphenationexception: two strings that are handed to
       -- traditional.registerexception (defined elsewhere in this file).
982    implement {
983        name      = "registerhyphenationexception",
984        actions   = traditional.registerexception,
985        arguments = "2 strings",
986    }
987
988    -- This is a relatively large function with local variables and local functions. A previous
989    -- implementation had the functions outside but this is cleaner and as efficient. The test
990    -- runs 100 times over tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower
991    -- and uppercase with a 1mm hsize.
992    --
993    --         language=0     language>0     4 | 3 * slower
994    --
995    -- tex     2.34 | 1.30    2.55 | 1.45    0.21 | 0.15
996    -- lua     2.42 | 1.38    3.30 | 1.84    0.88 | 0.46
997    --
998    -- Of course we have extra overhead (virtual Lua machine) but we also check attributes and
999    -- support specific local options. The test puts the typeset text in boxes and discards
1000    -- it. If we also flush the runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative
1001    -- difference is (somehow) smaller. The test has 536 pages. There is a little bit of extra
1002    -- overhead because we store the patterns in a different way.
1003    --
1004    -- As usual I will look for speedups. Some 0.01 seconds could be gained by sharing patterns
1005    -- which is not impressive but it does save some 3M memory on this test. (Some optimizations
1006    -- already brought the 3.30 seconds down to 3.14 but it all depends on aggressive caching.)
1007
1008    -- As we kick in the hyphenator before fonts get handled, we don't look at implicit (font)
1009    -- kerns or ligatures.
1010
1011    local starttiming = statistics.starttiming -- shortcut, traditional.hyphenate calls starttiming(traditional)
1012    local stoptiming  = statistics.stoptiming  -- shortcut for the matching stop call
1013
1014 -- local strictids = {
1015 --     [nodecodes.hlist]   = true,
1016 --     [nodecodes.vlist]   = true,
1017 --     [nodecodes.rule]    = true,
1018 --     [nodecodes.dir]     = true,
1019 --     [nodecodes.whatsit] = true,
1020 --     [nodecodes.insert]  = true,
1021 --     [nodecodes.adjust]  = true,
1022 --
1023 --     [nodecodes.math]    = true,
1024 --     [nodecodes.disc]    = true,
1025 --
1026 --     [nodecodes.accent]  = true, -- never used in context
1027 -- }
1028
1029    -- a lot of overhead when only one char
1030
1031    function traditional.hyphenate(head)
1032
1033        local first           = head
1034        local tail            = nil
1035        local last            = nil
1036        local current         = first
1037        local dictionary      = nil
1038        local instance        = nil
1039        local characters      = nil
1040        local unicodes        = nil
1041        local exhyphenchar    = tex.exhyphenchar
1042        local extrachars      = nil
1043        local hyphenchars     = nil
1044        local language        = nil
1045        local lastfont        = nil
1046        local start           = nil
1047        local stop            = nil
1048        local word            = { } -- we reuse this table
1049        local size            = 0
1050     -- local leftchar        = false
1051     -- local rightchar       = false -- utfbyte("-")
1052        local leftexchar      = false
1053        local rightexchar     = false -- utfbyte("-")
1054        local leftmin         = 0
1055        local rightmin        = 0
1056        local charmin         = 1
1057        local leftcharmin     = nil
1058        local rightcharmin    = nil
1059        ----- leftwordmin     = nil
1060        local rightwordmin    = nil
1061        local rightchars      = nil
1062        local leftchar        = nil
1063        local rightchar       = nil
1064        local attr            = nil
1065        local lastwordlast    = nil
1066        local hyphenated      = hyphenate
1067        ----- strict          = nil
1068        local exhyphenpenalty = tex.exhyphenpenalty
1069        local hyphenpenalty   = tex.hyphenpenalty
1070        local autohyphen      = false
1071        local hyphenonly      = false
1072
1073        -- We cannot use an 'enabled' boolean (false when no characters or extras) because we
1074        -- can have plugins that set a characters metatable and so on ... it doesn't save much
1075        -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also
1076        -- doesn't save much. So there is not that much to gain for languages that don't hyphenate.
1077        --
1078        -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes)))
1079        --        or (extrachars and next(extrachars))
1080        --
1081        -- This can be used to not add characters i.e. keep size 0 but then we need to check for
1082        -- attributes that change it, which costs time too. Not much to gain there.
1083
1084        starttiming(traditional)
1085
1086        local function insertpenalty()
1087            local p = new_penalty(interwordpenalty)
1088            setattrlist(p,last)
1089            if trace_visualize then
1090                nuts.setvisual(p,"penalty")
1091            end
1092            last = getprev(last)
1093            first, last = insertafter(first,last,p)
1094        end
1095
1096        local function synchronizefeatureset(a)
1097            local f = a and featuresets[a]
1098            if f then
1099                hyphenated   = methods[f.alternative or "hyphenate"]
1100                extrachars   = f.extrachars
1101                hyphenchars  = f.hyphenchars
1102                rightwordmin = f.rightwordmin
1103                charmin      = f.charmin
1104                leftcharmin  = f.leftcharmin
1105                rightcharmin = f.rightcharmin
1106                leftchar     = f.leftchar
1107                rightchar    = f.rightchar
1108             -- strict       = f.strict and strictids
1109                rightchars   = f.rightchars
1110                autohyphen   = f.autohyphen
1111                hyphenonly   = f.hyphenonly
1112                if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then
1113                    -- so we can change mid paragraph but it's kind of unpredictable then
1114                    if not tail then
1115                        tail = node_tail(first)
1116                    end
1117                    last = tail
1118                    local inword = false
1119                    local count  = 0
1120                    while last and rightwordmin > 0 do
1121                        local id = getid(last)
1122                        if id == glyph_code then
1123                            count = count + 1
1124                            inword = true
1125                            if trace_visualize then
1126                                setcolor(last,"darkgreen")
1127                            end
1128                        elseif inword then
1129                            inword = false
1130                            rightwordmin = rightwordmin - 1
1131                            if rightchars == true then
1132                                if rightwordmin > 0 then
1133                                    insertpenalty()
1134                                end
1135                            elseif rightchars and count <= rightchars then
1136                                insertpenalty()
1137                            end
1138                        end
1139                        last = getprev(last)
1140                    end
1141                    lastwordlast = rightwordmin
1142                end
1143                if not charmin or charmin == 0 then
1144                    charmin = 1
1145                end
1146            else
1147                hyphenated   = methods.hyphenate
1148                extrachars   = false
1149                hyphenchars  = false
1150                rightwordmin = false
1151                charmin      = 1
1152                leftcharmin  = false
1153                rightcharmin = false
1154                leftchar     = false
1155                rightchar    = false
1156             -- strict       = false
1157                autohyphen   = false
1158                hyphenonly   = false
1159            end
1160
1161            return a
1162        end
1163
1164        local function flush(hyphens) -- todo: no need for result
1165
                -- Apply the hyphenation points in 'hyphens' to the word that was collected
                -- in word[1..size] and that lives in the nodes start..stop. An entry in
                -- hyphens is nil or false (no break after that position), true (a regular
                -- break) or a table that describes a non standard break (a replacement).
                --
                -- Breaks are only honoured for positions from leftmin upto size - rightmin.
                -- Watch out: the local rightmin below shadows the one of the enclosing
                -- function.
                --
                -- The first loop builds 'result': a mix of characters, true (a regular
                -- break) and tables { pre, post, replace, right, left }.
                --
1166            local rightmin = size - rightmin
1167            local result   = { }
1168            local rsize    = 0
1169            local position = 1
1170
1171            -- todo: remember last disc and don't go back to before that (plus message) ...
1172            -- for simplicity we also assume that we don't start with a disc node
1173            --
1174            -- there can be a conflict: if we backtrack then we can end up in another disc
1175            -- and get out of sync (dup chars and so)
1176
1177            while position <= size do
1178                if position >= leftmin and position <= rightmin then
1179                    local hyphen = hyphens[position]
1180                    if not hyphen then
1181                        rsize = rsize + 1
1182                        result[rsize] = word[position]
1183                        position = position + 1
1184                    elseif hyphen == true then
1185                        rsize = rsize + 1
1186                        result[rsize] = word[position]
1187                        rsize = rsize + 1
1188                        result[rsize] = true
1189                        position = position + 1
1190                    else
1191                        local o, h = hyphen[2]
1192                        if o then
1193                            -- { hyphen, offset }
1194                            h = hyphen[1]
1195                        else
1196                            -- hyphen
1197                            h = hyphen
1198                            o = 1
1199                        end
                            -- b and e are the first and last position in word that get replaced
1200                        local b = position - o + (h.start  or 1)
1201                        local e = b + (h.length or 2) - 1
1202                        if b > 0 and e >= b then
                                -- first copy what comes before the replaced span
1203                            for i=1,b-position do
1204                                rsize = rsize + 1
1205                                result[rsize] = word[position]
1206                                position = position + 1
1207                            end
1208                            rsize = rsize + 1
1209                            result[rsize] = {
1210                                h.before or "",      -- pre
1211                                h.after or "",       -- post
1212                                concat(word,"",b,e), -- replace
1213                                h.right,             -- optional after pre
1214                                h.left,              -- optional before post
1215                            }
1216                            position = e + 1
1217                        else
1218                            -- error
1219                            rsize = rsize + 1
1220                            result[rsize] = word[position]
1221                            position = position + 1
1222                        end
1223                    end
1224                else
1225                    rsize = rsize + 1
1226                    result[rsize] = word[position]
1227                    position = position + 1
1228                end
1229            end
1230
                -- Turn a replacement into glyphs; each glyph is a copy of the node 'stop'
                -- with another character. When replacement is true we get one glyph that
                -- has leftchar (or rightchar when leftchar is not set). Otherwise it is
                -- a string that gives one glyph per utf character, with an optional
                -- leftchar glyph in front and an optional rightchar glyph at the end.
                -- The head of the list is returned (nothing when there is no replacement).
                --
1231            local function serialize(replacement,leftchar,rightchar)
1232                if not replacement then
1233                    return
1234                elseif replacement == true then
1235                    local glyph = copy_node(stop)
1236                    setchar(glyph,leftchar or rightchar)
1237                    return glyph
1238                end
1239                local head    = nil
1240                local current = nil
1241                if leftchar then
1242                    head    = copy_node(stop)
1243                    current = head
1244                    setchar(head,leftchar)
1245                end
1246                local rsize = #replacement
1247                if rsize == 1 then
                        -- one byte so also one character: no need to split
1248                    local glyph = copy_node(stop)
1249                    setchar(glyph,characters[replacement])
1250                    if head then
1251                        insertafter(current,current,glyph)
1252                    else
1253                        head = glyph
1254                    end
1255                    current = glyph
1256                elseif rsize > 0 then
1257                    local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached)
1258                    for i=1,#list do
1259                        local glyph = copy_node(stop)
1260                        setchar(glyph,characters[list[i]])
1261                        if head then
1262                            insertafter(current,current,glyph)
1263                        else
1264                            head = glyph
1265                        end
1266                        current = glyph
1267                    end
1268                end
1269                if rightchar then
1270                    local glyph = copy_node(stop)
1271                    insertafter(current,current,glyph)
1272                    setchar(glyph,rightchar)
1273                end
1274                return head
1275            end
1276
                -- The second loop walks over the nodes of the word. A character entry sets
                -- the character of the current node and moves on, the other entries insert
                -- a discretionary in front of the current node without moving on.
                --
1277            local current  = start
1278            local attrnode = start -- will be different, just the first char
1279
1280            for i=1,rsize do
1281                local r = result[i]
1282                if r == true then
1283                    local disc = new_disc()
1284                    local pre  = nil
1285                    local post = nil
1286                    if rightchar then
1287                        pre = serialize(true,rightchar)
1288                    end
1289                    if leftchar then
1290                        post = serialize(true,leftchar)
1291                    end
1292                    setdisc(disc,pre,post,nil,regulardisc_code,hyphenpenalty)
1293                    if attrnode then
1294                        setattrlist(disc,attrnode)
1295                    end
1296                    -- could be a replace as well
1297                    insertbefore(first,current,disc)
1298                elseif type(r) == "table" then
1299                    local disc    = new_disc()
1300                    local pre     = r[1]
1301                    local post    = r[2]
1302                    local replace = r[3]
                        -- an explicit false in the specification suppresses the hyphen character
1303                    local right   = r[4] ~= false and rightchar
1304                    local left    = r[5] ~= false and leftchar
1305                    if pre then
1306                        if pre ~= "" then
1307                            pre = serialize(pre,false,right)
1308                        else
1309                            pre = nil
1310                        end
1311                    end
1312                    if post then
1313                        if post ~= "" then
1314                            post = serialize(post,left,false)
1315                        else
1316                            post = nil
1317                        end
1318                    end
1319                    if replace then
1320                        if replace ~= "" then
1321                            replace = serialize(replace)
1322                        else
1323                            replace = nil
1324                        end
1325                    end
1326                    -- maybe regular code
1327                    setdisc(disc,pre,post,replace,regulardisc_code,hyphenpenalty)
1328                    if attrnode then
1329                        setattrlist(disc,attrnode)
1330                    end
1331                    insertbefore(first,current,disc)
1332                else
1333                    setchar(current,characters[r])
1334                    if i < rsize then
1335                        current = getnext(current)
1336                    end
1337                end
1338            end
                -- A replacement covers several positions of the word but is one entry in
                -- result, so there can be nodes left between current and stop; these are
                -- removed (the locals below shadow the outer current and last).
1339            if current and current ~= stop then
1340                local current = getnext(current)
1341                local last    = getnext(stop)
1342                while current ~= last do
1343                    first, current = remove_node(first,current,true)
1344                end
1345            end
1346        end
1347
1348        local function inject(leftchar,rightchar,code,attrnode)
1349            if first ~= current then
1350                local disc = new_disc()
1351                first, current, glyph = remove_node(first,current)
1352                first, current = insertbefore(first,current,disc)
1353                if trace_visualize then
1354                    setcolor(glyph,"darkred")  -- these get checked
1355                    setcolor(disc,"darkgreen") -- in the colorizer
1356                end
1357                local pre     = nil
1358                local post    = nil
1359                local replace = glyph
1360                if leftchar and leftchar > 0 then
1361                    post = copy_node(glyph)
1362                    setchar(post,leftchar)
1363                end
1364                pre = copy_node(glyph)
1365                setchar(pre,rightchar and rightchar > 0 and rightchar or code)
1366                setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ?
1367                if attrnode then
1368                    setattrlist(disc,attrnode)
1369                end
1370            end
1371            return current
1372        end
1373
1374        local function injectseries(current,last,next,attrnode)
1375            local disc  = new_disc()
1376            local start = current
1377            first, current = insertbefore(first,current,disc)
1378            setprev(start)
1379            setnext(last)
1380            if next then
1381                setlink(current,next)
1382            else
1383                setnext(current)
1384            end
1385            local pre     = copylist(start)
1386            local post    = nil
1387            local replace = start
1388            setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ?
1389            if attrnode then
1390                setattrlist(disc,attrnode)
1391            end
1392            return current
1393        end
1394
1395        local a = getattr(first,a_hyphenation)
1396        if a ~= attr then
1397            attr = synchronizefeatureset(a)
1398        end
1399
1400        -- The first attribute in a word determines the way a word gets hyphenated and if
1401        -- relevant, other properties are also set then. We could optimize for silly one-char
1402        -- cases but it has no priority as the code is still not that much slower than the
1403        -- native hyphenator and this variant also provides room for extensions.
1404
1405        local skipping = false
1406
1407        -- In "word word word." the sequences "word" and "." can be a different font!
1408
1409        while current and current ~= last do -- and current
1410            local code, id = isglyph(current)
1411            if code then
1412                if skipping then
1413                    current = getnext(current)
1414                else
1415                    local lang = getlanguage(current)
1416                    local font = getfont(current)
1417                    if lang ~= language or font ~= lastfont then
1418                        if dictionary and size > charmin and leftmin + rightmin <= size then
1419                            -- only german has many words starting with an uppercase character
1420                            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1421                                -- skip
1422                            else
1423                                local hyphens = hyphenated(dictionary,word,size)
1424                                if hyphens then
1425                                    flush(hyphens)
1426                                end
1427                            end
1428                        end
1429                        lastfont = font
1430                        if language ~= lang and lang > 0 then
1431                            --
1432                            dictionary = dictionaries[lang]
1433                            instance   = dictionary.instance
1434                            characters = dictionary.characters
1435                            unicodes   = dictionary.unicodes
1436                            --
1437                            local a = getattr(current,a_hyphenation)
1438                            attr        = synchronizefeatureset(a)
1439                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1440                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1441                            leftexchar  =                 (instance and preexhyphenchar (instance))
1442                            rightexchar =                 (instance and postexhyphenchar(instance))
1443                            leftmin     = leftcharmin  or getfield(current,"lhmin")
1444                            rightmin    = rightcharmin or getfield(current,"rhmin")
1445                            if not leftchar or leftchar < 0 then
1446                                leftchar = false
1447                            end
1448                            if not rightchar or rightchar < 0 then
1449                                rightchar = false
1450                            end
1451                            --
1452                            local char = unicodes[code] or (extrachars and extrachars[code])
1453                            if char then
1454                                word[1] = char
1455                                size    = 1
1456                                start   = current
1457                            else
1458                                size = 0
1459                            end
1460                        else
1461                            size = 0
1462                        end
1463                        language = lang
1464                    elseif language <= 0 then
1465                        --
1466                    elseif size > 0 then
1467                        local char = unicodes[code] or (extrachars and extrachars[code])
1468                        if char then
1469                            size = size + 1
1470                            word[size] = char
1471                        elseif dictionary then
1472                            if not hyphenonly or code ~= exhyphenchar then
1473                                if size > charmin and leftmin + rightmin <= size then
1474                                    if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1475                                        -- skip
1476                                    else
1477                                        local hyphens = hyphenated(dictionary,word,size)
1478                                        if hyphens then
1479                                            flush(hyphens)
1480                                        end
1481                                    end
1482                                end
1483                            end
1484                            size = 0
1485                            if code == exhyphenchar then -- normally the -
1486                                local next = getnext(current)
1487                                local last = current
1488                                local font = getfont(current)
1489                                while next and ischar(next,font) == code do
1490                                    last = next
1491                                    next = getnext(next)
1492                                end
1493                                if not autohyphen then
1494                                    current = last
1495                                elseif current == last then
1496                                    current = inject(leftexchar,rightexchar,code,current)
1497                                else
1498                                    current = injectseries(current,last,next,current)
1499                                end
1500                                if hyphenonly then
1501                                    skipping = true
1502                                end
1503                            elseif hyphenchars then
1504                                local char = hyphenchars[code]
1505                                if char == true then
1506                                    char = code
1507                                end
1508                                if char then
1509                                    current = inject(leftchar and char or nil,rightchar and char or nil,char,current)
1510                                end
1511                            end
1512                        end
1513                    else
1514                        local a = getattr(current,a_hyphenation)
1515                        if a ~= attr then
1516                            attr        = synchronizefeatureset(a) -- influences extrachars
1517                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1518                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1519                            leftexchar  =                 (instance and preexhyphenchar (instance))
1520                            rightexchar =                 (instance and postexhyphenchar(instance))
1521                            leftmin     = leftcharmin  or getfield(current,"lhmin")
1522                            rightmin    = rightcharmin or getfield(current,"rhmin")
1523                            if not leftchar or leftchar < 0 then
1524                                leftchar = false
1525                            end
1526                            if not rightchar or rightchar < 0 then
1527                                rightchar = false
1528                            end
1529                        end
1530                        --
1531                        local char = unicodes[code] or (extrachars and extrachars[code])
1532                        if char then
1533                            word[1] = char
1534                            size    = 1
1535                            start   = current
1536                        end
1537                    end
1538                    stop    = current
1539                    current = getnext(current)
1540                end
1541            else
1542                if skipping then
1543                    skipping = false
1544                end
1545                if id == disc_code then
1546                    size = 0
1547                    current = getnext(current)
1548                    if hyphenonly then
1549                        skipping = true
1550                    end
1551             -- elseif strict and strict[id] then
1552             --     current = id == math_code and getnext(endofmath(current)) or getnext(current)
1553             --     size = 0
1554                else
1555                    current = id == math_code and getnext(endofmath(current)) or getnext(current)
1556                end
1557                if size > 0 then
1558                    if dictionary and size > charmin and leftmin + rightmin <= size then
1559                        if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1560                            -- skip
1561                        else
1562                            local hyphens = hyphenated(dictionary,word,size)
1563                            if hyphens then
1564                                flush(hyphens)
1565                            end
1566                        end
1567                    end
1568                    size = 0
1569                end
1570            end
1571        end
1572        -- we can have quit due to last so we need to flush the last seen word, we could move
1573        -- this in the loop and test for current but ... messy
1574        if dictionary and size > charmin and leftmin + rightmin <= size then
1575            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1576                -- skip
1577            else
1578                local hyphens = hyphenated(dictionary,word,size)
1579                if hyphens then
1580                    flush(hyphens)
1581                end
1582            end
1583        end
1584
1585        stoptiming(traditional)
1586
1587        return head
1588    end
1589
1590    statistics.register("hyphenation",function()
1591        if nofwords > 0 or statistics.elapsed(traditional) > 0 then
1592            return string.format("%s words hyphenated, %s unique, used time %s",
1593                nofwords,nofhashed,statistics.elapsedseconds(traditional) or 0)
1594        end
1595    end)
1596
1597    local texmethod = "builders.kernel.hyphenation"
1598    local oldmethod = texmethod
1599    local newmethod = texmethod
1600
1601 -- local newmethod = "languages.hyphenators.traditional.hyphenate"
1602 --
1603 -- nodes.tasks.prependaction("processors","words",newmethod)
1604 -- nodes.tasks.disableaction("processors",oldmethod)
1605 --
1606 -- nodes.tasks.replaceaction("processors","words",oldmethod,newmethod)
1607
1608 -- \enabledirectives[hyphenators.method=traditional]
1609 -- \enabledirectives[hyphenators.method=builtin]
1610
1611    -- push / pop ? check first attribute
1612
1613    -- local replaceaction = nodes.tasks.replaceaction -- no longer overload this way (too many local switches)
1614
1615    local hyphenate    = language.hyphenate
1616    local hyphenating  = nuts.hyphenating
1617    local methods      = { }
1618    local usedmethod   = false
1619    local stack        = { }
1620
1621    local original = hyphenating and
1622        function(head)
1623             return (hyphenating(head))
1624        end
1625    or
1626        function(head)
1627            hyphenate(tonode(head))
1628            return head -- a nut
1629        end
1630
1631 -- local has_language = language.has_language
1632 --
1633 -- local function original(head) -- kernel.hyphenation(head)
1634 --     local h = tonode(head)
1635 --     if has_language(h) then
1636 --         hyphenate(h)
1637 --     end
1638 --     return head
1639 -- end
1640
1641    local getcount = tex.getcount
1642
1643    hyphenators.methods  = methods
1644    local optimize       = false
1645
1646    directives.register("hyphenator.optimize", function(v) optimize = v end)
1647
1648    function hyphenators.handler(head,groupcode)
1649        if usedmethod then
1650            if optimize and (groupcode == "hbox" or groupcode == "adjustedhbox") then
1651                if getcount("hyphenstate") > 0 then
1652                    forced = false
1653                    return usedmethod(head)
1654                else
1655                    return head
1656                end
1657            else
1658                return usedmethod(head)
1659            end
1660        else
1661            return head
1662        end
1663    end
1664
1665    methods.tex         = original
1666    methods.original    = original
1667    methods.expanded    = original -- was expanded before 1.005
1668    methods.traditional = languages.hyphenators.traditional.hyphenate
1669    methods.none        = false -- function(head) return head, false end
1670
1671    usedmethod          = original
1672
1673    local function setmethod(method)
1674        usedmethod = type(method) == "string" and methods[method]
1675        if usedmethod == nil then
1676            usedmethod = methods.tex
1677        end
1678    end
1679    local function pushmethod(method)
1680        insert(stack,usedmethod)
1681        usedmethod = type(method) == "string" and methods[method]
1682        if usedmethod == nil then
1683            usedmethod = methods.tex
1684        end
1685    end
1686    local function popmethod()
1687        usedmethod = remove(stack) or methods.tex
1688    end
1689
1690    hyphenators.setmethod  = setmethod
1691    hyphenators.pushmethod = pushmethod
1692    hyphenators.popmethod  = popmethod
1693
1694    directives.register("hyphenators.method",setmethod)
1695
1696    function hyphenators.setup(specification)
1697        local method = specification.method
1698        if method then
1699            setmethod(method)
1700        end
1701    end
1702
1703    implement { name = "sethyphenationmethod", actions = setmethod,  arguments = "string" }
1704    implement { name = "pushhyphenation",      actions = pushmethod, arguments = "string" }
1705    implement { name = "pophyphenation",       actions = popmethod }
1706
1707    -- can become a runtime loaded one:
1708
1709    local context      = context
1710    local ctx_NC       = context.NC
1711    local ctx_NR       = context.NR
1712    local ctx_verbatim = context.verbatim
1713
1714    function hyphenators.showhyphenationtrace(language,word)
1715        if not word or word == "" then
1716            return
1717        end
1718        local saved = trace_steps
1719        trace_steps = "silent"
1720        local steps = traditional.gettrace(language,word)
1721        trace_steps = saved
1722        if steps then
1723            local n = #steps
1724            if n > 0 then
1725                context.starttabulate { "|r|l|l|l|" }
1726                for i=1,n do
1727                    local s = steps[i]
1728                    ctx_NC() if i > 1 and i < n then context(i-1) end
1729                    ctx_NC() ctx_verbatim(s[1])
1730                    ctx_NC() ctx_verbatim(s[2])
1731                    ctx_NC() ctx_verbatim(s[3])
1732                    ctx_NC()
1733                    ctx_NR()
1734                end
1735                context.stoptabulate()
1736            end
1737        end
1738    end
1739
1740    implement {
1741        name      = "showhyphenationtrace",
1742        actions   = hyphenators.showhyphenationtrace,
1743        arguments = "2 strings",
1744    }
1745
1746    function nodes.stripdiscretionaries(head)
1747        for l in nexthlist, head do
1748            for d in nextdisc, getlist(l) do
1749                remove_node(h,false,true)
1750            end
1751        end
1752        return head
1753    end
1754
1755
1756else
1757
1758    -- traditional.loadpatterns("nl","lang-nl")
1759    -- traditional.loadpatterns("de","lang-de")
1760    -- traditional.loadpatterns("us","lang-us")
1761
1762    -- traditional.registerpattern("nl","e1ë",      { start = 1, length = 2, before = "e",  after = "e"  } )
1763    -- traditional.registerpattern("nl","oo7ë",     { start = 2, length = 3, before = "o",  after = "e"  } )
1764    -- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } )
1765
1766    -- local specification = {
1767    --     leftcharmin     = 2,
1768    --     rightcharmin    = 2,
1769    --     leftchar        = "<",
1770    --     rightchar       = ">",
1771    -- }
1772
1773    -- print("reëel",       traditional.injecthyphens(dictionaries.nl,"reëel",       specification),"r{e>}{<e}{eë}el")
1774    -- print("reeëel",      traditional.injecthyphens(dictionaries.nl,"reeëel",      specification),"re{e>}{<e}{eë}el")
1775    -- print("rooëel",      traditional.injecthyphens(dictionaries.nl,"rooëel",      specification),"r{o>}{<e}{ooë}el")
1776
1777    -- print(   "qxcxkq",   traditional.injecthyphens(dictionaries.de,   "qxcxkq",   specification),"")
1778    -- print(  "qqxcxkqq",  traditional.injecthyphens(dictionaries.de,  "qqxcxkqq",  specification),"")
1779    -- print( "qqqxcxkqqq", traditional.injecthyphens(dictionaries.de, "qqqxcxkqqq", specification),"")
1780    -- print("qqqqxcxkqqqq",traditional.injecthyphens(dictionaries.de,"qqqqxcxkqqqq",specification),"")
1781
1782    -- print("kunstmatig",       traditional.injecthyphens(dictionaries.nl,"kunstmatig",       specification),"")
1783    -- print("kunststofmatig",   traditional.injecthyphens(dictionaries.nl,"kunststofmatig",   specification),"")
1784    -- print("kunst[stof]matig", traditional.injecthyphens(dictionaries.nl,"kunst[stof]matig", specification),"")
1785
1786    -- traditional.loadpatterns("us","lang-us")
1787
1788    -- local specification = {
1789    --     leftcharmin     = 2,
1790    --     rightcharmin    = 2,
1791    --     leftchar        = false,
1792    --     rightchar       = false,
1793    -- }
1794
1795    -- trace_steps = true
1796
1797    -- print("components",    traditional.injecthyphens(dictionaries.us,"components", specification),"")
1798    -- print("single",        traditional.injecthyphens(dictionaries.us,"single",     specification),"sin-gle")
1799    -- print("everyday",      traditional.injecthyphens(dictionaries.us,"everyday",   specification),"every-day")
1800    -- print("associate",     traditional.injecthyphens(dictionaries.us,"associate",     specification),"as-so-ciate")
1801    -- print("philanthropic", traditional.injecthyphens(dictionaries.us,"philanthropic", specification),"phil-an-thropic")
1802    -- print("projects",      traditional.injecthyphens(dictionaries.us,"projects",      specification),"projects")
1803    -- print("Associate",     traditional.injecthyphens(dictionaries.us,"Associate",     specification),"As-so-ciate")
1804    -- print("Philanthropic", traditional.injecthyphens(dictionaries.us,"Philanthropic", specification),"Phil-an-thropic")
1805    -- print("Projects",      traditional.injecthyphens(dictionaries.us,"Projects",      specification),"Projects")
1806
1807end
1808
1809