lang-hyp.lua /size: 68 Kb    last modification: 2025-02-21 11:03
-- Record this file's metadata (version, author, license, ...) under the key 'lang-hyp'
-- in the global 'modules' table; the table is created first when it does not exist yet.
if not modules then modules = { } end modules ['lang-hyp'] = {
    version   = 1.001,
    comment   = "companion to lang-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
-- In an automated workflow hyphenation of long titles can be somewhat problematic
10-- especially when demands conflict. For that reason I played a bit with a Lua based
11-- variant of the traditional hyphenation machinery. This mechanism has been extended
12-- several times in projects, of which a good description can be found in TUGboat,
13-- Volume 27 (2006), No. 2 — Proceedings of EuroTEX2006: Automatic non-standard
14-- hyphenation in OpenOffice.org by László Németh.
15--
16-- Being the result of two days experimenting the following implementation is probably
17-- not completely okay yet. If there is demand I might add some more features and plugs.
-- The performance is quite okay but can probably be improved a bit, although this is not
-- the most critical code. For instance, on a metafun manual run the overhead is about
20-- 0.3 seconds on 19 seconds which is not that bad.
21--
-- In the process of wrapping up (for the ctx conference proceedings) I cleaned up
23-- and extended the code a bit. It can be used in production.
24--
25-- . a l g o r i t h m .
26--    4l1g4
27--     l g o3
28--      1g o
29--            2i t h
30--                4h1m
31-- ---------------------
32--    4 1 4 3 2 0 4 1
33--   a l-g o-r i t h-m
34
35-- . a s s z o n n y a l .
36--     s1s z/sz=sz,1,3
37--             n1n y/ny=ny,1,3
38-- -----------------------
39--    0 1 0 0 0 1 0 0 0/sz=sz,2,3,ny=ny,6,3
40--   a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3
41--
-- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length)
43--
44-- todo  : support hjcodes (<32 == length) like luatex does now (no need/demand so far)
-- maybe : support hyphenation over range (can already be done using attributes/language)
46-- maybe : reset dictionary.hyphenated when a pattern is added and/or forced reset option
47-- todo  : check subtypes (because they have subtle meanings in the line breaking)
48--
49-- word start (in tex engine):
50--
51-- boundary  : yes when wordboundary
52-- hlist     : when hyphenationbounds 1 or 3
53-- vlist     : when hyphenationbounds 1 or 3
54-- rule      : when hyphenationbounds 1 or 3
55-- dir       : when hyphenationbounds 1 or 3
56-- whatsit   : when hyphenationbounds 1 or 3
57-- glue      : yes
58-- math      : skipped
59-- glyph     : exhyphenchar (one only) : yes (so no -- ---)
60-- otherwise : yes
61--
62-- word end (in tex engine):
63--
64-- boundary  : yes
65-- glyph     : yes when different language
66-- glue      : yes
67-- penalty   : yes
68-- kern      : yes when not italic (for some historic reason)
69-- hlist     : when hyphenationbounds 2 or 3
70-- vlist     : when hyphenationbounds 2 or 3
71-- rule      : when hyphenationbounds 2 or 3
72-- dir       : when hyphenationbounds 2 or 3
73-- whatsit   : when hyphenationbounds 2 or 3
74-- ins       : when hyphenationbounds 2 or 3
75-- adjust    : when hyphenationbounds 2 or 3
76
77local type, rawget, rawset, tonumber, next = type, rawget, rawset, tonumber, next
78
79local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs
80local lpegmatch = lpeg.match
81
82local context    = context
83
84local concat     = table.concat
85local insert     = table.insert
86local remove     = table.remove
87local formatters = string.formatters
88local utfchar    = utf.char
89local utfbyte    = utf.byte
90
91if not characters then
92    require("char-ini")
93end
94
95local setmetatableindex = table.setmetatableindex
96
97-- \enabletrackers[hyphenator.steps=silent] will not write to the terminal
98
-- Tracker flags: each callback stores the value handed over by the tracker mechanism,
-- so the flag can hold something other than a boolean (show_log tests for 'true').
local trace_steps       = false  trackers.register("hyphenator.steps",    function(v) trace_steps     = v end)
local trace_visualize   = false  trackers.register("hyphenator.visualize",function(v) trace_visualize = v end)

-- Reporter used for all messages issued by this module.
local report            = logs.reporter("hyphenator")

-- Falls back to a function that does nothing when 'interfaces' is not available.
local implement         = interfaces and interfaces.implement or function() end

-- Create or reuse the languages.hyphenators.traditional namespace.
languages               = languages or { }
local hyphenators       = languages.hyphenators or { }
languages.hyphenators   = hyphenators
local traditional       = hyphenators.traditional or { }
hyphenators.traditional = traditional
111
-- Dictionaries are created on first access: a new entry gets empty tables for the
-- patterns, the cache of hyphenated words, the specials and the exceptions, and is
-- flagged as not yet loaded.
local dictionaries = setmetatableindex(function(all,language)
    local dictionary = { loaded = false }
    dictionary.patterns   = { }
    dictionary.hyphenated = { }
    dictionary.specials   = { }
    dictionary.exceptions = { }
    all[language] = dictionary
    return dictionary
end)

hyphenators.dictionaries = dictionaries
125
-- Building blocks for parsing pattern and exception strings.
local character      = lpeg.patterns.utf8character
local digit          = R("09")
local weight         = digit/tonumber + Cc(0)          -- a digit as number, or 0 when there is none
local fence          = P(".")
local hyphen         = P("-")
local space          = P(" ")
local char           = character - space               -- any character but a space
local validcharacter = (character - S("./"))           -- any character but a fence or slash
local keycharacter   =  character - S("/")             -- any character but a slash
----- basepart       = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1)
-- Optional "/before=after" or "/before=after,start,length" suffix of a pattern. The
-- pieces are folded (with rawset) into one table with the keys before, after, start
-- and length; start and length are converted with tonumber.
local specpart       = (P("/") * Cf ( Ct("") *
        Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) *
        Cg ( Cc("after")  * C((1-P(","))^1)  ) *
        (   P(",") *
            Cg ( Cc("start")  * ((1-P(","))^1/tonumber) * P(",") ) *
            Cg ( Cc("length") * ((1-P(-1) )^1/tonumber)          )
        )^-1
    , rawset))^-1

-- Hash key of a pattern: the text up to a slash with all digits removed.
local make_hashkey_p = Cs((digit/"" + keycharacter)^1)
----- make_pattern_p = basepart * specpart
-- Hash key of an exception: the text up to a slash with all hyphens removed.
local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1)
-- Exception as a table of booleans: true for a character that is preceded by a hyphen
-- and false for one that is not (the first character yields no entry).
local make_pattern_e = Ct(P(char) * (hyphen * Cc(true) * P(char) + P(char) * Cc(false))^1) -- catch . and char after -
149
150-- local make_hashkey_c = Cs((digit + keycharacter/"")^1)
151-- local make_pattern_c = Ct((P(1)/tonumber)^1)
152
153-- local cache = setmetatableindex(function(t,k)
154--     local n = lpegmatch(make_hashkey_c,k)
155--     local v = lpegmatch(make_pattern_c,n)
156--     t[k] = v
157--     return v
158-- end)
159--
160-- local weight_n       = digit + Cc("0")
161-- local basepart_n     = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache
162-- local make_pattern_n = basepart_n * specpart
163
-- Converts a string into a table with one number per character (each character is
-- passed to tonumber); it is fed with the digit strings that basepart_n produces.
local make_pattern_c = Ct((P(1)/tonumber)^1)

-- us + nl: 17664 entries -> 827 unique (saves some 3M)

-- Shared weight tables: a digit string is converted once and then reused, so patterns
-- with the same weights refer to the same table.
local cache = setmetatableindex(function(t,k)
    local v = lpegmatch(make_pattern_c,k)
    t[k] = v
    return v
end)

-- Rewrites a pattern into its digit string: an explicit weight is kept, a missing
-- weight and a fence become "0", and the characters themselves are dropped. The
-- resulting string is then looked up in (and thereby resolved by) the cache.
local weight_n       = digit + Cc("0")
local fence_n        = fence / "0"
local char_n         = validcharacter / ""
local basepart_n     = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache
-- A complete pattern: the weight table optionally followed by the specpart table.
local make_pattern_n = basepart_n * specpart
179
-- Adds one pattern to the given tables. The key is the pattern without its digits,
-- the value the table with weights. When a specification is passed it is stored as
-- the special for this key, otherwise the one parsed from the string (if any) is used.
local function register_pattern(patterns,specials,str,specification)
    local key = lpegmatch(make_hashkey_p,str)
    local weights, parsed = lpegmatch(make_pattern_n,str)
    patterns[key] = weights -- is this key still ok for complex patterns
    local special = specification or parsed
    if special then
        specials[key] = special
    end
end
191
-- Removes the pattern (and a related special) that hashes to the same key as str.
local function unregister_pattern(patterns,specials,str)
    local key = lpegmatch(make_hashkey_p,str)
    patterns[key], specials[key] = nil, nil
end
197
198local p_lower = lpeg.patterns.utf8lower
199
-- Stores an exception: the string is lowercased, the key is the word without hyphens
-- and the value the table of booleans made by make_pattern_e. The specification
-- argument is accepted but not used.
local function register_exception(exceptions,str,specification)
    local lowered = lpegmatch(p_lower,str)
    local key     = lpegmatch(make_hashkey_e,lowered)
    exceptions[key] = lpegmatch(make_pattern_e,lowered)
end
206
-- Scanners for space separated data: each run of non-space characters is handed to the
-- register function together with the extra arguments passed to lpegmatch (patterns
-- and specials, or exceptions); any other character is skipped.
local p_pattern   = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern   + 1)^1
local p_exception = ((Carg(1)           * C(char^1)) / register_exception + 1)^1
-- Splits a string into a table of utf characters.
local p_split     = Ct(C(character)^1)
210
-- Loads the patterns and exceptions of a language from a Lua file (once) and returns
-- the dictionary.
--
-- language : key of the dictionary
-- filename : optional file name, defaults to "lang-<language>"; the suffix "lua" is
--            added by file.addsuffix
--
-- The file is expected to return a table that can have a 'patterns' and an
-- 'exceptions' subtable, each with a 'data' string. The hyphen minima are taken from
-- the 'patterns' subtable, so they are only set when that subtable is present (they
-- were read unconditionally before, which failed for files without patterns). The
-- dictionary is flagged as loaded even when no file was found.
function traditional.loadpatterns(language,filename)
    local dictionary = dictionaries[language]
    if not dictionary.loaded then
        if not filename or filename == "" then
            filename = "lang-" .. language
        end
        filename = file.addsuffix(filename,"lua")
        local fullname = resolvers.findfile(filename)
        if fullname and fullname ~= "" then
            local specification = dofile(fullname)
            if type(specification) == "table" then
                local patterns = specification.patterns
                if patterns then
                    local data = patterns.data
                    if data and data ~= "" then
                        lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
                    end
                    dictionary.lefthyphenmin  = patterns.lefthyphenmin
                    dictionary.righthyphenmin = patterns.righthyphenmin
                end
                local exceptions = specification.exceptions
                if exceptions then
                    local data = exceptions.data
                    if data and data ~= "" then
                        lpegmatch(p_exception,data,1,dictionary.exceptions)
                    end
                end
            end
        end
        dictionary.loaded = true
    end
    return dictionary
end
244
245local lcchars    = characters.lcchars
246local uccodes    = characters.uccodes
247local categories = characters.categories
local nofwords   = 0 -- incremented on each call to hyphenate
local nofhashed  = 0 -- incremented each time hyphenate stores a result in the word cache

local steps     = nil -- trace of the last traced word, filled by the show_* functions
local f_show    = formatters["%w%s"]
253
-- Reports the collected steps, but only when the tracker value is exactly 'true'. The
-- second column is aligned using the length of the first entry of the first step.
local function show_log()
    if trace_steps ~= true then
        return
    end
    report()
    local width = #steps[1][1]
    for index=1,#steps do
        local step  = steps[index]
        local first = step[1]
        report("%s%w%S  %S",first,width - #first + 3,step[2] or first,step[3] or "")
    end
    report()
end
265
-- Starts a new trace: the first step shows the split word, with spaces in between,
-- in both columns.
local function show_1(wsplit)
    local spaced = concat(wsplit," ")
    steps = {
        { f_show(0,spaced), f_show(0,spaced) },
    }
end
270
-- Adds a step for a matching pattern: the first column shows the pattern with its
-- weights interleaved, indented according to the position i where it matched, the
-- second column shows the accumulated weights so far.
local function show_2(c,m,wsplit,done,i,spec)
    local chars   = lpegmatch(p_split,c)
    local mixed   = { }
    local size    = 0
    local nofword = #wsplit
    -- weight, character, weight, ... (there can be more weights than characters)
    for j=1,#m do
        size = size + 1
        mixed[size] = m[j]
        local char = chars[j]
        if char then
            size = size + 1
            mixed[size] = char
        end
    end
    local indent  = 2*i - 2
    local details = spec and table.sequenced(spec) or ""
    local first
    if indent == 0 then
        first = f_show(indent,concat(mixed,"",2))
    elseif i + 1 == nofword then
        first = f_show(indent-1,concat(mixed,"",1,size-1))
    else
        first = f_show(indent-1,concat(mixed))
    end
    steps[#steps+1] = { first, f_show(1,concat(done," ",2,#done),details) }
end
291
-- Adds the final step and reports the trace: one column has the word with the
-- resulting weights in between, the other the word with "-" at odd weights (not
-- before the first real character) and a space elsewhere.
local function show_3(wsplit,done)
    local marked   = { }
    local weighted = { }
    for index=1,#wsplit do
        if index > 1 then
            local weight = done[index]
            local odd    = index > 2 and weight % 2 == 1
            marked[#marked+1] = odd and "-" or " "
            weighted[#weighted+1] = weight
        end
        local char = wsplit[index]
        marked[#marked+1] = char
        weighted[#weighted+1] = char
    end
    steps[#steps+1] = { f_show(0,concat(weighted)), f_show(0,concat(marked)) }
    show_log()
end
309
-- Trace for a word that is resolved by an exception: only the split word is shown
-- (the second argument is not used).
local function show_4(wsplit,done)
    local line = concat(wsplit," ")
    steps = { { line } }
    show_log()
end
314
-- Gives access to the steps that were collected for the last traced word (nil when
-- nothing has been traced yet).
traditional.lasttrace = function()
    return steps
end
318
319-- We could reuse the w table but as we cache the resolved words there is not much gain in
320-- that complication.
321--
-- Beware: word can be a table and when n is passed too we can assume reuse so we need to
-- honor that n then.
324--
325-- todo: a fast variant for tex ... less lookups (we could check is dictionary has changed)
326-- ... although due to caching the already done words, we don't do much here
327
-- Determines the hyphenation points of a word and caches the result.
--
-- dictionary : table with (at least) the fields hyphenated, exceptions, patterns and
--              specials
-- word       : a string, or a table with one character per entry
-- n          : optional number of entries of word that make up the word
--
-- Returns false when no break is found. Otherwise a table is returned: either the
-- table stored for an exception, or a table with one entry per character that is
-- false, true, or (when a special applies) a table { special, offset }.
-- traditional.injecthyphens puts the hyphen after the character at a position that
-- has a non false entry. Results are cached in dictionary.hyphenated under the word
-- as given (for a table: the concatenation of its first n entries).
local function hyphenate(dictionary,word,n) -- odd is okay
    nofwords = nofwords + 1
    local hyphenated = dictionary.hyphenated
    local isstring = type(word) == "string"
    -- quick exit when this word was seen before (a cached false is a valid result)
    if isstring then
        local done = hyphenated[word]
        if done ~= nil then
            return done
        end
    elseif n then
        local done = hyphenated[concat(word,"",1,n)]
        if done ~= nil then
            return done
        end
    else
        local done = hyphenated[concat(word)]
        if done ~= nil then
            return done
        end
    end
    -- from here on word is a table of characters and key the string used for caching
    local key
    if isstring then
        key = word
        word = lpegmatch(p_split,word)
        if not n then
            n = #word
        end
    else
        if not n then
            n = #word
        end
        key = concat(word,"",1,n)
    end
    -- w becomes the word (mapped via lcchars when there is a mapping) between two
    -- fences and l its size
    local l = 1
    local w = { "." }
 -- local d = dictionary.codehash
    for i=1,n do
        local c = word[i]
     -- l = l + (d[c] or 1)
        l = l + 1
        w[l] = lcchars[c] or c
    end
    l = l + 1
    w[l] = "."
    local c = concat(w,"",2,l-1)
    -- the mapped word (without fences) might have been resolved already
    local done = hyphenated[c]
    if done ~= nil then
        hyphenated[key] = done
        nofhashed = nofhashed + 1
        return done
    end
    -- exceptions win over patterns and are returned as stored
    local exceptions = dictionary.exceptions
    local exception  = exceptions[c]
    if exception then
        if trace_steps then
            show_4(w,exception)
        end
        hyphenated[key] = exception
        nofhashed = nofhashed + 1
        return exception
    end
    --
    if trace_steps then
        show_1(w)
    end
    --
    local specials = dictionary.specials
    local patterns = dictionary.patterns
    -- try each substring w[i..j] of the fenced word as a pattern key
    local spec
    for i=1,l do
        for j=i,l do
            local c = concat(w,"",i,j)
            local m = patterns[c]
            if m then
                local s = specials[c]
                if not done then
                    -- first match: done is still nil here, so set up the weights
                    done = { }
                    spec = nil
                    -- the string that we resolve has explicit fences (.) so done starts at
                    -- the first fence and runs upto the last one so we need one slot less
                    for i=1,l do
                        done[i] = 0
                    end
                end
                -- we run over the pattern that always has a (zero) value for each character
                -- plus one more as we look at both sides
                for k=1,#m do
                    local new = m[k]
                    if not new then
                        break
                    elseif new == true then
                        report("fatal error")
                        break
                    elseif new > 0 then
                        local pos = i + k - 1
                        local old = done[pos]
                        if not old then
                            -- break ?
                        elseif new > old then
                            -- the highest weight wins
                            done[pos] = new
                            if s then
                                -- remember the special when pos falls inside the range
                                -- (start, length) that the special covers
                                local b = i + (s.start or 1) - 1
                                if b > 0 then
                                    local e = b + (s.length or 2) - 1
                                    if e > 0 then
                                        if pos >= b and pos <= e then
                                            if spec then
                                                spec[pos] = { s, k - 1 }
                                            else
                                                spec = { [pos] = { s, k - 1 } }
                                            end
                                        end
                                    end
                                end
                            end
                        end
                    end
                end
                if trace_steps and done then
                    show_2(c,m,w,done,i,s)
                end
            end
        end
    end
    if trace_steps and done then
        show_3(w,done)
    end
    if done then
        -- convert the weights in place: an odd weight at slot i marks a break that is
        -- stored at i-2 as true or as the special registered for slot i
        local okay = false
        for i=3,#done do
            if done[i] % 2 == 1 then
                done[i-2] = spec and spec[i] or true
                okay = true
            else
                done[i-2] = false
            end
        end
        if okay then
            -- the last two slots are no longer needed
            done[#done] = nil
            done[#done] = nil
        else
            done = false
        end
    else
        done = false
    end
    hyphenated[key] = done
    nofhashed = nofhashed + 1
    return done
end
481
-- Hyphenates a word again (its cache entry is removed first) and returns the steps
-- table. Nothing is returned for an empty or missing word.
function traditional.gettrace(language,word)
    if word and word ~= "" then
        local dictionary = dictionaries[language]
        if dictionary then
            dictionary.hyphenated[word] = nil
            hyphenate(dictionary,word)
            return steps
        end
    end
end
494
-- Hyphenation methods by name; an unknown name resolves to (and is then stored as)
-- the default hyphenate function.
local methods = setmetatableindex(function(t,name)
    t[name] = hyphenate
    return hyphenate
end)
496
-- Registers a hyphenation function under a name. A name that has been set already
-- (checked with rawget, so the default does not count) cannot be overloaded; in that
-- case only a message is reported.
function traditional.installmethod(name,f)
    if not rawget(methods,name) then
        methods[name] = f
    else
        report("overloading %a is not permitted",name)
    end
end
504
-- Output snippets used by traditional.injecthyphens:
local s_detail_1 = "-"                                -- plain break, no left/right chars given
local f_detail_2 = formatters["%s-%s"]                -- special break: before-after
local f_detail_3 = formatters["{%s}{%s}{}"]           -- plain break: {rightchar}{leftchar}{}
local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"]     -- special break: {before rightchar}{leftchar after}{replaced text}
509
-- Returns the word with its hyphenation points made visible.
--
-- dictionary    : dictionary that is passed on to hyphenate
-- word          : string; false is returned when it is missing
-- specification : table with the optional fields leftcharmin (default 2), rightcharmin
--                 (default leftcharmin), leftchar and rightchar; without specification
--                 the word is returned as it is
--
-- A plain break shows up as "-" or, when both leftchar and rightchar are given, as
-- {rightchar}{leftchar}{}. A break that comes with a special (before, after, start,
-- length) replaces the affected characters by before-after or by its three group form.
-- When the word has no hyphenation points it is returned unchanged.
function traditional.injecthyphens(dictionary,word,specification)
    if not word then
        return false
    elseif not specification then
        return word
    end
    local hyphens = hyphenate(dictionary,word)
    if not hyphens then
        return word
    end

    -- the following code is similar to code later on but here we have strings while there
    -- we have hyphen specs

    local chars     = lpegmatch(p_split,word)
    local size      = #chars

    local leftmin   = specification.leftcharmin or 2
    local rightmin  = size - (specification.rightcharmin or leftmin)
    local leftchar  = specification.leftchar
    local rightchar = specification.rightchar
    local grouped   = leftchar and rightchar

    local result    = { }
    local rsize     = 0
    local position  = 1

    while position <= size do
        -- breaks are only looked at inside the window given by the minima
        local hyphen = position >= leftmin and position <= rightmin and hyphens[position]
        if hyphen and hyphen ~= true then
            -- a special: either { special, offset } or the special itself
            local offset  = hyphen[2]
            local special
            if offset then
                special = hyphen[1]
            else
                special = hyphen
                offset  = 1
            end
            local first = position - offset + (special.start or 1)
            local last  = first + (special.length or 2) - 1
            if first > 0 and last >= first then
                -- copy what comes before the replaced range
                for _=1,first-position do
                    rsize = rsize + 1
                    result[rsize] = chars[position]
                    position = position + 1
                end
                rsize = rsize + 1
                if grouped then
                    result[rsize] = f_detail_4(special.before,rightchar,leftchar,special.after,concat(chars,"",first,last))
                else
                    result[rsize] = f_detail_2(special.before,special.after)
                end
                position = last + 1
            else
                -- error
                rsize = rsize + 1
                result[rsize] = chars[position]
                position = position + 1
            end
        else
            -- copy the character and add a plain break when there is one
            rsize = rsize + 1
            result[rsize] = chars[position]
            if hyphen then
                rsize = rsize + 1
                if grouped then
                    result[rsize] = f_detail_3(rightchar,leftchar)
                else
                    result[rsize] = s_detail_1
                end
            end
            position = position + 1
        end
    end
    return concat(result)
end
592
do

    -- Scanners for a space separated list: each word is passed to the handler together
    -- with the extra arguments given to lpegmatch.

    local spaces       = space^1
    local word         = C((1-space)^1)

    local p_unregister = (Carg(1) * Carg(2) * word           / unregister_pattern + spaces)^1
    local p_register   = (Carg(1) * Carg(2) * word * Carg(3) /   register_pattern + spaces)^1
    local p_except     = (Carg(1)           * word           / register_exception + spaces)^1

    -- Registers the patterns in str for a language. With specification being false
    -- the patterns are removed instead; a table is stored as special for each
    -- pattern and any other value is ignored.
    function traditional.registerpattern(language,str,specification)
        local dictionary = dictionaries[language]
        local patterns   = dictionary.patterns
        local specials   = dictionary.specials
        if specification ~= false then
            if type(specification) ~= "table" then
                specification = false
            end
            lpegmatch(p_register,str,1,patterns,specials,specification)
        else
            lpegmatch(p_unregister,str,1,patterns,specials)
        end
    end

    -- Registers the exceptions in str for a language.
    function traditional.registerexception(language,str)
        local exceptions = dictionaries[language].exceptions
        lpegmatch(p_except,str,1,exceptions)
    end

end
618
619-- todo: unicodes or utfhash ?
620
621if context then
622
623    local nodecodes          = nodes.nodecodes
624    local disccodes          = nodes.disccodes
625
626    local glyph_code         = nodecodes.glyph
627    local disc_code          = nodecodes.disc
628    local math_code          = nodecodes.math
629    local hlist_code         = nodecodes.hlist
630
631    local automaticdisc_code = disccodes.automatic
632    local regulardisc_code   = disccodes.regular
633
634    local nuts               = nodes.nuts
635    local tonode             = nodes.tonode
636    local nodepool           = nuts.pool
637
638    local new_disc           = nodepool.disc
639    local new_penalty        = nodepool.penalty
640
641    local getfield           = nuts.getfield
642    local getfont            = nuts.getfont
643    local getid              = nuts.getid
644    local getattr            = nuts.getattr
645    local getnext            = nuts.getnext
646    local getprev            = nuts.getprev
647    local getsubtype         = nuts.getsubtype
648    local getlist            = nuts.getlist
649    local getlanguage        = nuts.getlanguage
650    local getattrlist        = nuts.getattrlist
651    local setattrlist        = nuts.setattrlist
652    local isglyph            = nuts.isglyph
653    local ischar             = nuts.ischar
654
655    local setchar            = nuts.setchar
656    local setdisc            = nuts.setdisc
657    local setlink            = nuts.setlink
658    local setprev            = nuts.setprev
659    local setnext            = nuts.setnext
660
661    local insertbefore       = nuts.insertbefore
662    local insertafter        = nuts.insertafter
663    local copy_node          = nuts.copy
664    local copylist           = nuts.copylist
665    local remove_node        = nuts.remove
666    local endofmath          = nuts.endofmath
667    local node_tail          = nuts.tail
668
669    local nexthlist          = nuts.traversers.hlist
670    local nextdisc           = nuts.traversers.disc
671
672    local setcolor           = nodes.tracers.colors.set
673
674    local variables          = interfaces.variables
675    local v_reset            = variables.reset
676    local v_yes              = variables.yes
677    local v_word             = variables.word
678    local v_all              = variables.all
679
680    local settings_to_array  = utilities.parsers.settings_to_array
681
682    local unsetvalue         = attributes.unsetvalue
683    local texsetattribute    = tex.setattribute
684
685    local prehyphenchar      = lang.prehyphenchar
686    local posthyphenchar     = lang.posthyphenchar
687    local preexhyphenchar    = lang.preexhyphenchar
688    local postexhyphenchar   = lang.postexhyphenchar
689
690    local a_hyphenation      = attributes.private("hyphenation")
691
692    local interwordpenalty   = 5000
693
694    function traditional.loadpatterns(language)
695        return dictionaries[language]
696    end
697
698    -- for the moment we use an independent data structure
699
700    setmetatableindex(dictionaries,function(t,k)
701        if type(k) == "string" then
702            -- this will force a load if not yet loaded (we need a nicer way) for the moment
703            -- that will do (nneeded for examples that register a pattern specification
704            languages.getnumber(k)
705        end
706        local specification = languages.getdata(k)
707        local dictionary = {
708            patterns   = { },
709            exceptions = { },
710            hyphenated = { },
711            specials   = { },
712            instance   = false,
713            characters = { },
714            unicodes   = { },
715        }
716        if specification then
717            local resources = specification.resources
718            if resources then
719                local characters = dictionary.characters or { }
720                local unicodes   = dictionary.unicodes   or { }
721                for i=1,#resources do
722                    local r = resources[i]
723                    if not r.in_dictionary then
724                        r.in_dictionary = true
725                        local patterns = r.patterns
726                        if patterns then
727                            local data = patterns.data
728                            if data then
729                                -- regular patterns
730                                lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials)
731                            end
732                            local extra = patterns.extra
733                            if extra then
734                                -- special patterns
735                                lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials)
736                            end
737                        end
738                        local exceptions = r.exceptions
739                        if exceptions then
740                            local data = exceptions.data
741                            if data and data ~= "" then
742                                lpegmatch(p_exception,data,1,dictionary.exceptions)
743                            end
744                        end
745                        local usedchars  = lpegmatch(p_split,patterns.characters)
746                        for i=1,#usedchars do
747                            local char  = usedchars[i]
748                            local code  = utfbyte(char)
749                            local upper = uccodes[code]
750                            characters[char]  = code
751                            unicodes  [code]  = char
752                            if type(upper) == "table" then
753                                for i=1,#upper do
754                                    local u = upper[i]
755                                    unicodes[u] = utfchar(u)
756                                end
757                            else
758                                unicodes[upper] = utfchar(upper)
759                            end
760                        end
761                    end
762                end
763                dictionary.characters = characters
764                dictionary.unicodes   = unicodes
765                setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end)
766            end
767            t[specification.number] = dictionary
768            dictionary.instance = specification.instance -- needed for hyphenchars
769        end
770        t[k] = dictionary
771        return dictionary
772    end)
773
774    -- Beware: left and right min doesn't mean that in a 1 mmm hsize there can be snippets
775    -- with less characters than either of them! This could be an option but such a narrow
776    -- hsize doesn't make sense anyway.
777
778    -- We assume that featuresets are defined global ... local definitions (also mid paragraph)
779    -- make not much sense anyway. For the moment we assume no predefined sets so we don't need
780    -- to store them. Nor do we need to hash them in order to save space ... no sane user will
781    -- define many of them.
782
    -- The feature sets live in hyphenators.featuresets (an existing table is reused).
    local featuresets       = hyphenators.featuresets or { }
    hyphenators.featuresets = featuresets

    -- The number of defined sets is kept in storage.shared and starts at zero.
    storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0

    local noffeaturesets = storage.shared.noflanguagesfeaturesets

    -- NOTE(review): presumably this makes the sets part of the stored format - confirm
    storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets")
791
792    ----- hash = table.sequenced(featureset,",") -- no need now
793
794    local function register(name,featureset)
795        noffeaturesets = noffeaturesets + 1
796        featureset.attribute = noffeaturesets
797        featuresets[noffeaturesets] = featureset  -- access by attribute
798        featuresets[name] = featureset            -- access by name
799        storage.shared.noflanguagesfeaturesets = noffeaturesets
800        return noffeaturesets
801    end
802
803    local function makeset(...)
804        -- a bit overkill, supporting variants but who cares
805        local set = { }
806        for i=1,select("#",...) do
807            local list = select(i,...)
808            local kind = type(list)
809            local used = nil
810            if kind == "string" then
811                if list == v_all then
812                    -- not ok ... now all get ignored
813                    return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end)
814                elseif list ~= "" then
815                    used = lpegmatch(p_split,list)
816                    set  = set or { }
817                    for i=1,#used do
818                        local char = used[i]
819                        set[utfbyte(char)] = char
820                    end
821                end
822            elseif kind == "table" then
823                if next(list) then
824                    set = set or { }
825                    for byte, char in next, list do
826                        set[byte] = char == true and utfchar(byte) or char
827                    end
828                elseif #list > 0 then
829                    set = set or { }
830                    for i=1,#list do
831                        local l = list[i]
832                        if type(l) == "number" then
833                            set[l] = utfchar(l)
834                        else
835                            set[utfbyte(l)] = l
836                        end
837                    end
838                end
839            end
840        end
841        return set
842    end
843
844    -- category pd (tex also sees --- and -- as hyphens but do we really want that
845
    -- Code points that count as hyphens by default. The value is either true or a code
    -- point: here the soft hyphen is mapped onto the hyphen-minus. The commented
    -- entries are candidates that are not enabled.
    local defaulthyphens = {
        [0x002D] = true,   -- HYPHEN-MINUS
        [0x00AD] = 0x002D, -- SOFT HYPHEN (active in ConTeXt)
     -- [0x058A] = true,   -- ARMENIAN HYPHEN
     -- [0x1400] = true,   -- CANADIAN SYLLABICS HYPHEN
     -- [0x1806] = true,   -- MONGOLIAN TODO SOFT HYPHEN
        [0x2010] = true,   -- HYPHEN
     -- [0x2011] = true,   -- NON-BREAKING HYPHEN
     -- [0x2012] = true,   -- FIGURE DASH
        [0x2013] = true,   -- EN DASH
        [0x2014] = true,   -- EM DASH
     -- [0x2015] = true,   -- HORIZONTAL BAR
     -- [0x2027] = true,   -- HYPHENATION POINT
     -- [0x2E17] = true,   -- DOUBLE OBLIQUE HYPHEN
     -- [0x2E1A] = true,   -- HYPHEN WITH DIAERESIS
     -- [0x2E3A] = true,   -- TWO-EM DASH
     -- [0x2E3B] = true,   -- THREE-EM DASH
     -- [0x2E40] = true,   -- DOUBLE HYPHEN
     -- [0x301C] = true,   -- WAVE DASH
     -- [0x3030] = true,   -- WAVY DASH
     -- [0x30A0] = true,   -- KATAKANA-HIRAGANA DOUBLE HYPHEN
     -- [0xFE31] = true,   -- PRESENTATION FORM FOR VERTICAL EM DASH
     -- [0xFE32] = true,   -- PRESENTATION FORM FOR VERTICAL EN DASH
     -- [0xFE58] = true,   -- SMALL EM DASH
     -- [0xFE63] = true,   -- SMALL HYPHEN-MINUS
     -- [0xFF0D] = true,   -- FULLWIDTH HYPHEN-MINUS
    }
873
874    local defaultjoiners = {
875        [0x200C] = true, -- nzwj
876        [0x200D] = true, -- zwj
877    }
878
879    local function somehyphenchar(c)
880        c = tonumber(c)
881        return c ~= 0 and c or nil
882    end
883
884    local function definefeatures(name,featureset)
885        local extrachars   = featureset.characters -- "[]()"
886        local hyphenchars  = featureset.hyphens
887        local joinerchars  = featureset.joiners
888        local alternative  = featureset.alternative
889        local rightwordmin = tonumber(featureset.rightwordmin)
890        local charmin      = tonumber(featureset.charmin) -- luatex now also has hyphenationmin
891        local leftcharmin  = tonumber(featureset.leftcharmin)
892        local rightcharmin = tonumber(featureset.rightcharmin)
893        local leftchar     = somehyphenchar(featureset.leftchar)
894        local rightchar    = somehyphenchar(featureset.rightchar)
895        local rightchars   = featureset.rightchars
896local rightedge    = featureset.rightedge
897local autohyphen   = v_yes -- featureset.autohyphen -- insert disc
898local hyphenonly   = v_yes -- featureset.hyphenonly -- don't hyphenate around
899        rightchars  = rightchars  == v_word and true           or tonumber(rightchars)
900        joinerchars = joinerchars == v_yes  and defaultjoiners or joinerchars -- table
901        hyphenchars = hyphenchars == v_yes  and defaulthyphens or hyphenchars -- table
902        -- not yet ok: extrachars have to be ignored  so it cannot be all)
903        featureset.extrachars   = makeset(joinerchars or "",extrachars or "")
904        featureset.hyphenchars  = makeset(hyphenchars or "")
905        featureset.alternative  = alternative or "hyphenate"
906        featureset.rightwordmin = rightwordmin and rightwordmin > 0 and rightwordmin or nil
907        featureset.charmin      = charmin      and charmin      > 0 and charmin      or nil
908        featureset.leftcharmin  = leftcharmin  and leftcharmin  > 0 and leftcharmin  or nil
909        featureset.rightcharmin = rightcharmin and rightcharmin > 0 and rightcharmin or nil
910        featureset.rightchars   = rightchars
911        featureset.leftchar     = leftchar
912        featureset.rightchar    = rightchar
913     -- featureset.strict       = rightedge  == "tex"
914featureset.autohyphen   = autohyphen == v_yes
915featureset.hyphenonly   = hyphenonly == v_yes
916        return register(name,featureset)
917    end
918
919    local function setfeatures(n)
920        if not n or n == v_reset then
921            n = false
922        else
923            local f = featuresets[n]
924            if not f and type(n) == "string" then
925                local t = settings_to_array(n)
926                local s = { }
927                for i=1,#t do
928                    local ti = t[i]
929                    local fs = featuresets[ti]
930                    if fs then
931                        for k, v in next, fs do
932                            s[k] = v
933                        end
934                    end
935                end
936                n = register(n,s)
937            else
938                n = f and f.attribute
939            end
940        end
941        texsetattribute(a_hyphenation,n or unsetvalue)
942    end
943
944    traditional.definefeatures = definefeatures
945    traditional.setfeatures    = setfeatures
946
947    implement {
948        name      = "definehyphenationfeatures",
949        actions   = definefeatures,
950        arguments = {
951            "string",
952            {
953                { "characters" },
954                { "hyphens" },
955                { "joiners" },
956                { "rightchars" },
957                { "rightwordmin", "integer" },
958                { "charmin", "integer" },
959                { "leftcharmin", "integer" },
960                { "rightcharmin", "integer" },
961                { "leftchar", "integer" },
962                { "rightchar", "integer" },
963                { "alternative" },
964                { "rightedge" },
965            }
966        }
967    }
968
969    implement {
970        name      = "sethyphenationfeatures",
971        actions   = setfeatures,
972        arguments = "string"
973    }
974
975    implement {
976        name      = "registerhyphenationpattern",
977        actions   = traditional.registerpattern,
978        arguments = { "string",  "string",  "boolean" }
979    }
980
981    implement {
982        name      = "registerhyphenationexception",
983        actions   = traditional.registerexception,
984        arguments = "2 strings",
985    }
986
    -- This is a relatively large function with local variables and local functions. A previous
    -- implementation had the functions outside but this is cleaner and as efficient. The test
    -- runs 100 times over tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower
    -- and uppercase with a 1mm hsize.
991    --
992    --         language=0     language>0     4 | 3 * slower
993    --
994    -- tex     2.34 | 1.30    2.55 | 1.45    0.21 | 0.15
995    -- lua     2.42 | 1.38    3.30 | 1.84    0.88 | 0.46
996    --
    -- Of course we have extra overhead (the virtual Lua machine) but we also check attributes
    -- and support specific local options. The test puts the typeset text in boxes and discards
    -- it. If we also flush the runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative
    -- difference is (somehow) smaller. The test has 536 pages. There is a little bit of extra
    -- overhead because we store the patterns in a different way.
1002    --
1003    -- As usual I will look for speedups. Some 0.01 seconds could be gained by sharing patterns
1004    -- which is not impressive but it does save some 3M memory on this test. (Some optimizations
1005    -- already brought the 3.30 seconds down to 3.14 but it all depends on aggressive caching.)
1006
1007    -- As we kick in the hyphenator before fonts get handled, we don't look at implicit (font)
1008    -- kerns or ligatures.
1009
1010    local starttiming = statistics.starttiming
1011    local stoptiming  = statistics.stoptiming
1012
1013 -- local strictids = {
1014 --     [nodecodes.hlist]   = true,
1015 --     [nodecodes.vlist]   = true,
1016 --     [nodecodes.rule]    = true,
1017 --     [nodecodes.dir]     = true,
1018 --     [nodecodes.whatsit] = true,
1019 --     [nodecodes.insert]  = true,
1020 --     [nodecodes.adjust]  = true,
1021 --
1022 --     [nodecodes.math]    = true,
1023 --     [nodecodes.disc]    = true,
1024 --
1025 --     [nodecodes.accent]  = true, -- never used in context
1026 -- }
1027
1028    -- a lot of overhead when only one char
1029
1030    function traditional.hyphenate(head)
1031
1032        local first           = head
1033        local tail            = nil
1034        local last            = nil
1035        local current         = first
1036        local dictionary      = nil
1037        local instance        = nil
1038        local characters      = nil
1039        local unicodes        = nil
1040        local exhyphenchar    = tex.exhyphenchar
1041        local extrachars      = nil
1042        local hyphenchars     = nil
1043        local language        = nil
1044        local lastfont        = nil
1045        local start           = nil
1046        local stop            = nil
1047        local word            = { } -- we reuse this table
1048        local size            = 0
1049        local leftchar        = false
1050        local rightchar       = false -- utfbyte("-")
1051        local leftexchar      = false
1052        local rightexchar     = false -- utfbyte("-")
1053        local leftmin         = 0
1054        local rightmin        = 0
1055        local charmin         = 1
1056        local leftcharmin     = nil
1057        local rightcharmin    = nil
1058        ----- leftwordmin     = nil
1059        local rightwordmin    = nil
1060        local rightchars      = nil
1061        local leftchar        = nil
1062        local rightchar       = nil
1063        local attr            = nil
1064        local lastwordlast    = nil
1065        local hyphenated      = hyphenate
1066        ----- strict          = nil
1067        local exhyphenpenalty = tex.exhyphenpenalty
1068        local hyphenpenalty   = tex.hyphenpenalty
1069        local autohyphen      = false
1070        local hyphenonly      = false
1071
1072        -- We cannot use an 'enabled' boolean (false when no characters or extras) because we
1073        -- can have plugins that set a characters metatable and so) ... it doesn't save much
1074        -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also
1075        -- doesn't save much. So there not that much to gain for languages that don't hyphenate.
1076        --
1077        -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes)))
1078        --        or (extrachars and next(extrachars))
1079        --
1080        -- This can be used to not add characters i.e. keep size 0 but then we need to check for
1081        -- attributes that change it, which costs time too. Not much to gain there.
1082
1083        starttiming(traditional)
1084
1085        local function insertpenalty()
1086            local p = new_penalty(interwordpenalty)
1087            setattrlist(p,last)
1088            if trace_visualize then
1089                nuts.setvisual(p,"penalty")
1090            end
1091            last = getprev(last)
1092            first, last = insertafter(first,last,p)
1093        end
1094
1095        local function synchronizefeatureset(a)
1096            local f = a and featuresets[a]
1097            if f then
1098                hyphenated   = methods[f.alternative or "hyphenate"]
1099                extrachars   = f.extrachars
1100                hyphenchars  = f.hyphenchars
1101                rightwordmin = f.rightwordmin
1102                charmin      = f.charmin
1103                leftcharmin  = f.leftcharmin
1104                rightcharmin = f.rightcharmin
1105                leftchar     = f.leftchar
1106                rightchar    = f.rightchar
1107             -- strict       = f.strict and strictids
1108                rightchars   = f.rightchars
1109                autohyphen   = f.autohyphen
1110                hyphenonly   = f.hyphenonly
1111                if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then
1112                    -- so we can change mid paragraph but it's kind of unpredictable then
1113                    if not tail then
1114                        tail = node_tail(first)
1115                    end
1116                    last = tail
1117                    local inword = false
1118                    local count  = 0
1119                    while last and rightwordmin > 0 do
1120                        local id = getid(last)
1121                        if id == glyph_code then
1122                            count = count + 1
1123                            inword = true
1124                            if trace_visualize then
1125                                setcolor(last,"darkgreen")
1126                            end
1127                        elseif inword then
1128                            inword = false
1129                            rightwordmin = rightwordmin - 1
1130                            if rightchars == true then
1131                                if rightwordmin > 0 then
1132                                    insertpenalty()
1133                                end
1134                            elseif rightchars and count <= rightchars then
1135                                insertpenalty()
1136                            end
1137                        end
1138                        last = getprev(last)
1139                    end
1140                    lastwordlast = rightwordmin
1141                end
1142                if not charmin or charmin == 0 then
1143                    charmin = 1
1144                end
1145            else
1146                hyphenated   = methods.hyphenate
1147                extrachars   = false
1148                hyphenchars  = false
1149                rightwordmin = false
1150                charmin      = 1
1151                leftcharmin  = false
1152                rightcharmin = false
1153                leftchar     = false
1154                rightchar    = false
1155             -- strict       = false
1156                autohyphen   = false
1157                hyphenonly   = false
1158            end
1159
1160            return a
1161        end
1162
1163        local function flush(hyphens) -- todo: no need for result
1164
1165            local rightmin = size - rightmin
1166            local result   = { }
1167            local rsize    = 0
1168            local position = 1
1169
1170            -- todo: remember last dics and don't go back to before that (plus message) ...
1171            -- for simplicity we also assume that we don't start with a dics node
1172            --
1173            -- there can be a conflict: if we backtrack then we can end up in another disc
1174            -- and get out of sync (dup chars and so)
1175
1176            while position <= size do
1177                if position >= leftmin and position <= rightmin then
1178                    local hyphen = hyphens[position]
1179                    if not hyphen then
1180                        rsize = rsize + 1
1181                        result[rsize] = word[position]
1182                        position = position + 1
1183                    elseif hyphen == true then
1184                        rsize = rsize + 1
1185                        result[rsize] = word[position]
1186                        rsize = rsize + 1
1187                        result[rsize] = true
1188                        position = position + 1
1189                    else
1190                        local o, h = hyphen[2]
1191                        if o then
1192                            -- { hyphen, offset)
1193                            h = hyphen[1]
1194                        else
1195                            -- hyphen
1196                            h = hyphen
1197                            o = 1
1198                        end
1199                        local b = position - o + (h.start  or 1)
1200                        local e = b + (h.length or 2) - 1
1201                        if b > 0 and e >= b then
1202                            for i=1,b-position do
1203                                rsize = rsize + 1
1204                                result[rsize] = word[position]
1205                                position = position + 1
1206                            end
1207                            rsize = rsize + 1
1208                            result[rsize] = {
1209                                h.before or "",      -- pre
1210                                h.after or "",       -- post
1211                                concat(word,"",b,e), -- replace
1212                                h.right,             -- optional after pre
1213                                h.left,              -- optional before post
1214                            }
1215                            position = e + 1
1216                        else
1217                            -- error
1218                            rsize = rsize + 1
1219                            result[rsize] = word[position]
1220                            position = position + 1
1221                        end
1222                    end
1223                else
1224                    rsize = rsize + 1
1225                    result[rsize] = word[position]
1226                    position = position + 1
1227                end
1228            end
1229
1230            local function serialize(replacement,leftchar,rightchar)
1231                if not replacement then
1232                    return
1233                elseif replacement == true then
1234                    local glyph = copy_node(stop)
1235                    setchar(glyph,leftchar or rightchar)
1236                    return glyph
1237                end
1238                local head    = nil
1239                local current = nil
1240                if leftchar then
1241                    head    = copy_node(stop)
1242                    current = head
1243                    setchar(head,leftchar)
1244                end
1245                local rsize = #replacement
1246                if rsize == 1 then
1247                    local glyph = copy_node(stop)
1248                    setchar(glyph,characters[replacement])
1249                    if head then
1250                        insertafter(current,current,glyph)
1251                    else
1252                        head = glyph
1253                    end
1254                    current = glyph
1255                elseif rsize > 0 then
1256                    local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached)
1257                    for i=1,#list do
1258                        local glyph = copy_node(stop)
1259                        setchar(glyph,characters[list[i]])
1260                        if head then
1261                            insertafter(current,current,glyph)
1262                        else
1263                            head = glyph
1264                        end
1265                        current = glyph
1266                    end
1267                end
1268                if rightchar then
1269                    local glyph = copy_node(stop)
1270                    insertafter(current,current,glyph)
1271                    setchar(glyph,rightchar)
1272                end
1273                return head
1274            end
1275
1276            local current  = start
1277            local attrnode = start -- will be different, just the first char
1278
1279            for i=1,rsize do
1280                local r = result[i]
1281                if r == true then
1282                    local disc = new_disc()
1283                    local pre  = nil
1284                    local post = nil
1285                    if rightchar then
1286                        pre = serialize(true,rightchar)
1287                    end
1288                    if leftchar then
1289                        post = serialize(true,leftchar)
1290                    end
1291                    setdisc(disc,pre,post,nil,regulardisc_code,hyphenpenalty)
1292                    if attrnode then
1293                        setattrlist(disc,attrnode)
1294                    end
1295                    -- could be a replace as well
1296                    insertbefore(first,current,disc)
1297                elseif type(r) == "table" then
1298                    local disc    = new_disc()
1299                    local pre     = r[1]
1300                    local post    = r[2]
1301                    local replace = r[3]
1302                    local right   = r[4] ~= false and rightchar
1303                    local left    = r[5] ~= false and leftchar
1304                    if pre then
1305                        if pre ~= "" then
1306                            pre = serialize(pre,false,right)
1307                        else
1308                            pre = nil
1309                        end
1310                    end
1311                    if post then
1312                        if post ~= "" then
1313                            post = serialize(post,left,false)
1314                        else
1315                            post = nil
1316                        end
1317                    end
1318                    if replace then
1319                        if replace ~= "" then
1320                            replace = serialize(replace)
1321                        else
1322                            replace = nil
1323                        end
1324                    end
1325                    -- maybe regular code
1326                    setdisc(disc,pre,post,replace,regulardisc_code,hyphenpenalty)
1327                    if attrnode then
1328                        setattrlist(disc,attrnode)
1329                    end
1330                    insertbefore(first,current,disc)
1331                else
1332                    setchar(current,characters[r])
1333                    if i < rsize then
1334                        current = getnext(current)
1335                    end
1336                end
1337            end
1338            if current and current ~= stop then
1339                local current = getnext(current)
1340                local last    = getnext(stop)
1341                while current ~= last do
1342                    first, current = remove_node(first,current,true)
1343                end
1344            end
1345
1346        end
1347
1348        local function inject(leftchar,rightchar,code,attrnode)
1349            if first ~= current then
1350                local disc = new_disc()
1351                first, current, glyph = remove_node(first,current)
1352                first, current = insertbefore(first,current,disc)
1353                if trace_visualize then
1354                    setcolor(glyph,"darkred")  -- these get checked
1355                    setcolor(disc,"darkgreen") -- in the colorizer
1356                end
1357                local pre     = nil
1358                local post    = nil
1359                local replace = glyph
1360                if leftchar and leftchar > 0 then
1361                    post = copy_node(glyph)
1362                    setchar(post,leftchar)
1363                end
1364                pre = copy_node(glyph)
1365                setchar(pre,rightchar and rightchar > 0 and rightchar or code)
1366                setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ?
1367                if attrnode then
1368                    setattrlist(disc,attrnode)
1369                end
1370            end
1371            return current
1372        end
1373
1374        local function injectseries(current,last,next,attrnode)
1375            local disc  = new_disc()
1376            local start = current
1377            first, current = insertbefore(first,current,disc)
1378            setprev(start)
1379            setnext(last)
1380            if next then
1381                setlink(current,next)
1382            else
1383                setnext(current)
1384            end
1385            local pre     = copylist(start)
1386            local post    = nil
1387            local replace = start
1388            setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ?
1389            if attrnode then
1390                setattrlist(disc,attrnode)
1391            end
1392            return current
1393        end
1394
1395        local a = getattr(first,a_hyphenation)
1396        if a ~= attr then
1397            attr = synchronizefeatureset(a)
1398        end
1399
1400        -- The first attribute in a word determines the way a word gets hyphenated and if
1401        -- relevant, other properties are also set then. We could optimize for silly one-char
1402        -- cases but it has no priority as the code is still not that much slower than the
1403        -- native hyphenator and this variant also provides room for extensions.
1404
1405        local skipping = false
1406
1407        -- In "word word word." the sequences "word" and "." can be a different font!
1408
1409        while current and current ~= last do -- and current
1410            local code, id = isglyph(current)
1411            if code then
1412                if skipping then
1413                    current = getnext(current)
1414                else
1415                    local lang = getlanguage(current)
1416                    local font = getfont(current)
1417                    if lang ~= language or font ~= lastfont then
1418                        if dictionary and size > charmin and leftmin + rightmin <= size then
1419                            -- only german has many words starting with an uppercase character
1420                            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1421                                -- skip
1422                            else
1423                                local hyphens = hyphenated(dictionary,word,size)
1424                                if hyphens then
1425                                    flush(hyphens)
1426                                end
1427                            end
1428                        end
1429                        lastfont = font
1430                        if language ~= lang and lang > 0 then
1431                            --
1432                            dictionary = dictionaries[lang]
1433                            instance   = dictionary.instance
1434                            characters = dictionary.characters
1435                            unicodes   = dictionary.unicodes
1436                            --
1437                            local a = getattr(current,a_hyphenation)
1438                            attr        = synchronizefeatureset(a)
1439                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1440                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1441                            leftexchar  =                 (instance and preexhyphenchar (instance))
1442                            rightexchar =                 (instance and postexhyphenchar(instance))
1443                            leftmin     = leftcharmin  or getfield(current,"left")
1444                            rightmin    = rightcharmin or getfield(current,"right")
1445                            if not leftchar or leftchar < 0 then
1446                                leftchar = false
1447                            end
1448                            if not rightchar or rightchar < 0 then
1449                                rightchar = false
1450                            end
1451                            --
1452                            local char = unicodes[code] or (extrachars and extrachars[code])
1453                            if char then
1454                                word[1] = char
1455                                size    = 1
1456                                start   = current
1457                            else
1458                                size = 0
1459                            end
1460                        else
1461                            size = 0
1462                        end
1463                        language = lang
1464                    elseif language <= 0 then
1465                        --
1466                    elseif size > 0 then
1467                        local char = unicodes[code] or (extrachars and extrachars[code])
1468                        if char then
1469                            size = size + 1
1470                            word[size] = char
1471                        elseif dictionary then
1472                            if not hyphenonly or code ~= exhyphenchar then
1473                                if size > charmin and leftmin + rightmin <= size then
1474                                    if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1475                                        -- skip
1476                                    else
1477                                        local hyphens = hyphenated(dictionary,word,size)
1478                                        if hyphens then
1479                                            flush(hyphens)
1480                                        end
1481                                    end
1482                                end
1483                            end
1484                            size = 0
1485                            if code == exhyphenchar then -- normally the -
1486                                local next = getnext(current)
1487                                local last = current
1488                                local font = getfont(current)
1489                                while next and ischar(next,font) == code do
1490                                    last = next
1491                                    next = getnext(next)
1492                                end
1493                                if not autohyphen then
1494                                    current = last
1495                                elseif current == last then
1496                                    current = inject(leftexchar,rightexchar,code,current)
1497                                else
1498                                    current = injectseries(current,last,next,current)
1499                                end
1500                                if hyphenonly then
1501                                    skipping = true
1502                                end
1503                            elseif hyphenchars then
1504                                local char = hyphenchars[code]
1505                                if char == true then
1506                                    char = code
1507                                end
1508                                if char then
1509                                    current = inject(leftchar and char or nil,rightchar and char or nil,char,current)
1510                                end
1511                            end
1512                        end
1513                    else
1514                        local a = getattr(current,a_hyphenation)
1515                        if a ~= attr then
1516                            attr        = synchronizefeatureset(a) -- influences extrachars
1517                            leftchar    = leftchar     or (instance and posthyphenchar  (instance)) -- we can make this more
1518                            rightchar   = rightchar    or (instance and prehyphenchar   (instance)) -- efficient if needed
1519                            leftexchar  =                 (instance and preexhyphenchar (instance))
1520                            rightexchar =                 (instance and postexhyphenchar(instance))
1521                            leftmin     = leftcharmin  or getfield(current,"left")
1522                            rightmin    = rightcharmin or getfield(current,"right")
1523                            if not leftchar or leftchar < 0 then
1524                                leftchar = false
1525                            end
1526                            if not rightchar or rightchar < 0 then
1527                                rightchar = false
1528                            end
1529                        end
1530                        --
1531                        local char = unicodes[code] or (extrachars and extrachars[code])
1532                        if char then
1533                            word[1] = char
1534                            size    = 1
1535                            start   = current
1536                        end
1537                    end
1538                    stop    = current
1539                    current = getnext(current)
1540                end
1541            else
1542                if skipping then
1543                    skipping = false
1544                end
1545                if id == disc_code then
1546                    size = 0
1547                    current = getnext(current)
1548                    if hyphenonly then
1549                        skipping = true
1550                    end
1551             -- elseif strict and strict[id] then
1552             --     current = id == math_code and getnext(endofmath(current)) or getnext(current)
1553             --     size = 0
1554                else
1555                    current = id == math_code and getnext(endofmath(current)) or getnext(current)
1556                end
1557                if size > 0 then
1558                    if dictionary and size > charmin and leftmin + rightmin <= size then
1559                        if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1560                            -- skip
1561                        else
1562                            local hyphens = hyphenated(dictionary,word,size)
1563                            if hyphens then
1564                                flush(hyphens)
1565                            end
1566                        end
1567                    end
1568                    size = 0
1569                end
1570            end
1571        end
1572        -- we can have quit due to last so we need to flush the last seen word, we could move
1573        -- this in the loop and test for current but ... messy
1574        if dictionary and size > charmin and leftmin + rightmin <= size then
1575            if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then
1576                -- skip
1577            else
1578                local hyphens = hyphenated(dictionary,word,size)
1579                if hyphens then
1580                    flush(hyphens)
1581                end
1582            end
1583        end
1584
1585        stoptiming(traditional)
1586
1587        return head
1588    end
1589
-- Registers the "hyphenation" statistics reporter. The reporter returns nothing
-- when no word was counted and no time was accumulated on the 'traditional'
-- timer, otherwise it returns a one line summary.
statistics.register("hyphenation",function()
    local idle = not (nofwords > 0) and not (statistics.elapsed(traditional) > 0)
    if idle then
        return
    end
    local usedtime = statistics.elapsedseconds(traditional) or 0
    return string.format("%s words hyphenated, %s unique, used time %s",nofwords,nofhashed,usedtime)
end)
1596
-- Name of the engine hyphenation action. All three names hold the same string;
-- in the visible part of this file only the commented-out task manipulation
-- code that follows refers to the old/new pair.
local texmethod = "builders.kernel.hyphenation"
local oldmethod, newmethod = texmethod, texmethod
1600
1601 -- local newmethod = "languages.hyphenators.traditional.hyphenate"
1602 --
1603 -- nodes.tasks.prependaction("processors","words",newmethod)
1604 -- nodes.tasks.disableaction("processors",oldmethod)
1605 --
1606 -- nodes.tasks.replaceaction("processors","words",oldmethod,newmethod)
1607
1608 -- \enabledirectives[hyphenators.method=traditional]
1609 -- \enabledirectives[hyphenators.method=builtin]
1610
1611    -- push / pop ? check first attribute
1612
1613    -- local replaceaction = nodes.tasks.replaceaction -- no longer overload this way (too many local switches)
1614
-- Engine hyphenators ('hyphenating' is optional: 'original' falls back to
-- 'hyphenate' when it is not set) plus the state shared by the method switching
-- code: the registered methods, the stack used by push/pop and the active method
-- (false means that lists pass through the handler untouched).
local hyphenate, hyphenating = lang.hyphenate, nuts.hyphenating
local methods, stack         = { }, { }
local usedmethod             = false
1620
-- Wrapper around the engine hyphenator. When nuts.hyphenating is available it is
-- called with the list head and exactly one result is handed back; otherwise the
-- head is converted with tonode, passed to lang.hyphenate and the unconverted
-- head is returned.
local original
if hyphenating then
    original = function(head)
        -- only the first result is passed on
        local first = hyphenating(head)
        return first
    end
else
    original = function(head)
        hyphenate(tonode(head))
        return head -- a nut
    end
end
1630
1631 -- local has_language = lang.has_language
1632 --
1633 -- local function original(head) -- kernel.hyphenation(head)
1634 --     local h = tonode(head)
1635 --     if has_language(h) then
1636 --         hyphenate(h)
1637 --     end
1638 --     return head
1639 -- end
1640
local getcount = tex.getcount

-- make the table with registered methods reachable as hyphenators.methods
hyphenators.methods = methods

-- set by the directive that follows and consulted by hyphenators.handler for
-- the hbox group codes
local optimize = false

directives.register("hyphenator.optimize",function(value)
    optimize = value
end)
1647
-- Applies the active hyphenation method to a list.
--
-- head      : start of the list, handed to the active method as-is
-- groupcode : group type; only "hbox" and "adjusted_hbox" are looked at
--
-- Without an active method the head is returned untouched. When the optimize
-- directive is set, lists from those two group types are only processed when the
-- "hyphenstate" counter is positive. In all other cases whatever the active
-- method returns is passed on.
function hyphenators.handler(head,groupcode)
    if not usedmethod then
        return head
    end
    if optimize and (groupcode == "hbox" or groupcode == "adjusted_hbox") then
        if not (getcount("hyphenstate") > 0) then
            return head
        end
        -- NOTE(review): 'forced' is not declared in the visible part of this file;
        -- unless there is a file level local this assigns a global -- confirm
        forced = false
    end
    return usedmethod(head)
end
1664
-- "tex", "original" and "expanded" all select the engine wrapper
methods.tex         = original
methods.original    = methods.tex
methods.expanded    = methods.tex -- was expanded before 1.005
methods.traditional = languages.hyphenators.traditional.hyphenate
methods.none        = false -- function(head) return head, false end

-- the engine wrapper is active until another method gets selected
usedmethod          = methods.tex
1672
-- Selects the active hyphenation method by name.
--
-- A registered name selects what is stored under that name (for "none" that is
-- false, which makes the handler return lists untouched), an unregistered name
-- falls back to methods.tex and anything that is not a string results in false.
local function setmethod(method)
    local wanted = false
    if type(method) == "string" then
        wanted = methods[method]
        if wanted == nil then
            wanted = methods.tex
        end
    end
    usedmethod = wanted
end
-- Saves the active method on the stack and then selects another one by name,
-- following the same rules as setmethod (unknown names select methods.tex,
-- non-strings result in false).
local function pushmethod(method)
    insert(stack,usedmethod)
    setmethod(method)
end
-- Restores the method that was active before the matching pushmethod.
--
-- The stack can hold false (no method was active when the push happened), so the
-- popped value must not be tested with 'or': that would turn a saved false into
-- methods.tex and silently enable hyphenation again. Only an empty stack falls
-- back to methods.tex.
local function popmethod()
    if #stack > 0 then
        usedmethod = remove(stack)
    else
        usedmethod = methods.tex
    end
end
1689
-- the method can be chosen with a directive, for instance:
--
--   \enabledirectives[hyphenators.method=traditional]
directives.register("hyphenators.method",setmethod)

-- the switchers are also reachable as fields of the hyphenators table
hyphenators.popmethod  = popmethod
hyphenators.pushmethod = pushmethod
hyphenators.setmethod  = setmethod
1695
-- Applies a setup specification: when it has a 'method' entry that value is
-- handed to setmethod; no other entry is looked at here.
function hyphenators.setup(specification)
    local wanted = specification.method
    if not wanted then
        return
    end
    setmethod(wanted)
end
1702
-- Registration of the three method switchers (presumably for use from the TeX
-- end -- confirm against the definition of 'implement'); set and push receive
-- one string, pop takes no arguments.
implement {
    name      = "sethyphenationmethod",
    actions   = setmethod,
    arguments = "string",
}

implement {
    name      = "pushhyphenation",
    actions   = pushmethod,
    arguments = "string",
}

implement {
    name    = "pophyphenation",
    actions = popmethod,
}
1706
1707    -- can become a runtime loaded one:
1708
1709    local context      = context
1710    local ctx_NC       = context.NC
1711    local ctx_NR       = context.NR
1712    local ctx_verbatim = context.verbatim
1713
-- Typesets a four column table with the trace that traditional.gettrace delivers
-- for a word.
--
-- language : passed on to traditional.gettrace
-- word     : the word to trace; nothing happens when it is missing or empty
--
-- Entries 1 to 3 of each step are typeset verbatim in the last three columns.
-- The first column holds the row number minus one, but only for the rows in
-- between the first and the last one.
function hyphenators.showhyphenationtrace(language,word)
    if word and word ~= "" then
        -- trace_steps is switched to "silent" for the duration of the call only
        -- (presumably to keep gettrace from reporting -- confirm)
        local previous = trace_steps
        trace_steps = "silent"
        local steps = traditional.gettrace(language,word)
        trace_steps = previous
        local count = steps and #steps or 0
        if count > 0 then
            context.starttabulate { "|r|l|l|l|" }
            for index=1,count do
                local step = steps[index]
                ctx_NC()
                if index > 1 and index < count then
                    context(index-1)
                end
                for column=1,3 do
                    ctx_NC()
                    ctx_verbatim(step[column])
                end
                ctx_NC()
                ctx_NR()
            end
            context.stoptabulate()
        end
    end
end

implement {
    name      = "showhyphenationtrace",
    arguments = "2 strings",
    actions   = hyphenators.showhyphenationtrace,
}
1745
-- Removes the discretionary nodes from the lists of the hlist nodes found by
-- traversing from head.
--
-- head : start of the list that is scanned for hlist nodes; it is returned as
--        given because only the content of those hlists is modified
--
-- The previous loop called remove_node(h,false,true): 'h' is not defined in this
-- function and no node was passed, so the found discretionaries were never
-- removed. Now each found node is removed (and freed) and the possibly changed
-- list head is stored back in the hlist.
--
-- NOTE(review): this assumes remove_node takes (listhead,node,free) and returns
-- the new list head as first result, as nuts.remove does -- confirm against the
-- local definition at the top of the file.
function nodes.stripdiscretionaries(head)
    for l in nexthlist, head do
        local list = getlist(l)
        if list then
            -- collect first: freeing the node that drives the traversal would
            -- leave the iterator with a dangling node
            local discs = { }
            local n     = 0
            for d in nextdisc, list do
                n = n + 1
                discs[n] = d
            end
            if n > 0 then
                for i=1,n do
                    list = remove_node(list,discs[i],true)
                end
                -- the first node can have been a discretionary
                nuts.setlist(l,list)
            end
        end
    end
    return head
end
1754
1755
1756else
1757
1758    -- traditional.loadpatterns("nl","lang-nl")
1759    -- traditional.loadpatterns("de","lang-de")
1760    -- traditional.loadpatterns("us","lang-us")
1761
1762    -- traditional.registerpattern("nl","e1ë",      { start = 1, length = 2, before = "e",  after = "e"  } )
1763    -- traditional.registerpattern("nl","oo7ë",     { start = 2, length = 3, before = "o",  after = "e"  } )
1764    -- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } )
1765
1766    -- local specification = {
1767    --     leftcharmin     = 2,
1768    --     rightcharmin    = 2,
1769    --     leftchar        = "<",
1770    --     rightchar       = ">",
1771    -- }
1772
1773    -- print("reëel",       traditional.injecthyphens(dictionaries.nl,"reëel",       specification),"r{e>}{<e}{eë}el")
1774    -- print("reeëel",      traditional.injecthyphens(dictionaries.nl,"reeëel",      specification),"re{e>}{<e}{eë}el")
1775    -- print("rooëel",      traditional.injecthyphens(dictionaries.nl,"rooëel",      specification),"r{o>}{<e}{ooë}el")
1776
1777    -- print(   "qxcxkq",   traditional.injecthyphens(dictionaries.de,   "qxcxkq",   specification),"")
1778    -- print(  "qqxcxkqq",  traditional.injecthyphens(dictionaries.de,  "qqxcxkqq",  specification),"")
1779    -- print( "qqqxcxkqqq", traditional.injecthyphens(dictionaries.de, "qqqxcxkqqq", specification),"")
1780    -- print("qqqqxcxkqqqq",traditional.injecthyphens(dictionaries.de,"qqqqxcxkqqqq",specification),"")
1781
1782    -- print("kunstmatig",       traditional.injecthyphens(dictionaries.nl,"kunstmatig",       specification),"")
1783    -- print("kunststofmatig",   traditional.injecthyphens(dictionaries.nl,"kunststofmatig",   specification),"")
1784    -- print("kunst[stof]matig", traditional.injecthyphens(dictionaries.nl,"kunst[stof]matig", specification),"")
1785
1786    -- traditional.loadpatterns("us","lang-us")
1787
1788    -- local specification = {
1789    --     leftcharmin     = 2,
1790    --     rightcharmin    = 2,
1791    --     leftchar        = false,
1792    --     rightchar       = false,
1793    -- }
1794
1795    -- trace_steps = true
1796
1797    -- print("components",    traditional.injecthyphens(dictionaries.us,"components", specification),"")
1798    -- print("single",        traditional.injecthyphens(dictionaries.us,"single",     specification),"sin-gle")
1799    -- print("everyday",      traditional.injecthyphens(dictionaries.us,"everyday",   specification),"every-day")
1800    -- print("associate",     traditional.injecthyphens(dictionaries.us,"associate",     specification),"as-so-ciate")
1801    -- print("philanthropic", traditional.injecthyphens(dictionaries.us,"philanthropic", specification),"phil-an-thropic")
1802    -- print("projects",      traditional.injecthyphens(dictionaries.us,"projects",      specification),"projects")
1803    -- print("Associate",     traditional.injecthyphens(dictionaries.us,"Associate",     specification),"As-so-ciate")
1804    -- print("Philanthropic", traditional.injecthyphens(dictionaries.us,"Philanthropic", specification),"Phil-an-thropic")
1805    -- print("Projects",      traditional.injecthyphens(dictionaries.us,"Projects",      specification),"Projects")
1806
1807end
1808
1809