if not modules then modules = { } end modules ['lang-hyp'] = { version = 1.001, comment = "companion to lang-ini.mkiv", author = "Hans Hagen, PRAGMA-ADE, Hasselt NL", copyright = "PRAGMA ADE / ConTeXt Development Team", license = "see context related readme files" } -- In an automated workflow hypenation of long titles can be somewhat problematic -- especially when demands conflict. For that reason I played a bit with a Lua based -- variant of the traditional hyphenation machinery. This mechanism has been extended -- several times in projects, of which a good description can be found in TUGboat, -- Volume 27 (2006), No. 2 — Proceedings of EuroTEX2006: Automatic non-standard -- hyphenation in OpenOffice.org by László Németh. -- -- Being the result of two days experimenting the following implementation is probably -- not completely okay yet. If there is demand I might add some more features and plugs. -- The performance is quite okay but can probably improved a bit, although this is not -- the most critital code. For instance, on a metafun manual run the overhead is about -- 0.3 seconds on 19 seconds which is not that bad. -- -- In the procecess of wrapping up (for the ctx conference proceedings) I cleaned up -- and extended the code a bit. It can be used in production. -- -- . a l g o r i t h m . -- 4l1g4 -- l g o3 -- 1g o -- 2i t h -- 4h1m -- --------------------- -- 4 1 4 3 2 0 4 1 -- a l-g o-r i t h-m -- . a s s z o n n y a l . -- s1s z/sz=sz,1,3 -- n1n y/ny=ny,1,3 -- ----------------------- -- 0 1 0 0 0 1 0 0 0/sz=sz,2,3,ny=ny,6,3 -- a s-s z o n-n y a l/sz=sz,2,3,ny=ny,6,3 -- -- ab1cd/ef=gh,2,2 : acd - efd (pattern/replacement,start,length -- -- todo : support hjcodes (<32 == length) like luatex does now (no need/demand so far) -- maybe : support hyphenation over range (can alsready be done using attributes/language) -- maybe : reset dictionary.hyphenated when a pattern is added and/or forced reset option -- todo : check subtypes (because they have subtle meanings in the line breaking) -- -- word start (in tex engine): -- -- boundary : yes when wordboundary -- hlist : when hyphenationbounds 1 or 3 -- vlist : when hyphenationbounds 1 or 3 -- rule : when hyphenationbounds 1 or 3 -- dir : when hyphenationbounds 1 or 3 -- whatsit : when hyphenationbounds 1 or 3 -- glue : yes -- math : skipped -- glyph : exhyphenchar (one only) : yes (so no -- ---) -- otherwise : yes -- -- word end (in tex engine): -- -- boundary : yes -- glyph : yes when different language -- glue : yes -- penalty : yes -- kern : yes when not italic (for some historic reason) -- hlist : when hyphenationbounds 2 or 3 -- vlist : when hyphenationbounds 2 or 3 -- rule : when hyphenationbounds 2 or 3 -- dir : when hyphenationbounds 2 or 3 -- whatsit : when hyphenationbounds 2 or 3 -- ins : when hyphenationbounds 2 or 3 -- adjust : when hyphenationbounds 2 or 3 local type, rawget, rawset, tonumber, next = type, rawget, rawset, tonumber, next local P, R, S, Cg, Cf, Ct, Cc, C, Carg, Cs = lpeg.P, lpeg.R, lpeg.S, lpeg.Cg, lpeg.Cf, lpeg.Ct, lpeg.Cc, lpeg.C, lpeg.Carg, lpeg.Cs local lpegmatch = lpeg.match local context = context local concat = table.concat local insert = table.insert local remove = table.remove local formatters = string.formatters local utfchar = utf.char local utfbyte = utf.byte if not characters then require("char-ini") end local setmetatableindex = table.setmetatableindex -- \enabletrackers[hyphenator.steps=silent] will not write to the terminal local trace_steps = false trackers.register("hyphenator.steps", function(v) trace_steps = v end) local trace_visualize = false trackers.register("hyphenator.visualize",function(v) trace_visualize = v end) local report = logs.reporter("hyphenator") local implement = interfaces and interfaces.implement or function() end languages = languages or { } local hyphenators = languages.hyphenators or { } languages.hyphenators = hyphenators local traditional = hyphenators.traditional or { } hyphenators.traditional = traditional local dictionaries = setmetatableindex(function(t,k) local v = { patterns = { }, hyphenated = { }, specials = { }, exceptions = { }, loaded = false, } t[k] = v return v end) hyphenators.dictionaries = dictionaries local character = lpeg.patterns.utf8character local digit = R("09") local weight = digit/tonumber + Cc(0) local fence = P(".") local hyphen = P("-") local space = P(" ") local char = character - space local validcharacter = (character - S("./")) local keycharacter = character - S("/") ----- basepart = Ct( (Cc(0) * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc(0))^-1) local specpart = (P("/") * Cf ( Ct("") * Cg ( Cc("before") * C((1-P("="))^1) * P("=") ) * Cg ( Cc("after") * C((1-P(","))^1) ) * ( P(",") * Cg ( Cc("start") * ((1-P(","))^1/tonumber) * P(",") ) * Cg ( Cc("length") * ((1-P(-1) )^1/tonumber) ) )^-1 , rawset))^-1 local make_hashkey_p = Cs((digit/"" + keycharacter)^1) ----- make_pattern_p = basepart * specpart local make_hashkey_e = Cs((hyphen/"" + keycharacter)^1) local make_pattern_e = Ct(P(char) * (hyphen * Cc(true) * P(char) + P(char) * Cc(false))^1) -- catch . and char after - -- local make_hashkey_c = Cs((digit + keycharacter/"")^1) -- local make_pattern_c = Ct((P(1)/tonumber)^1) -- local cache = setmetatableindex(function(t,k) -- local n = lpegmatch(make_hashkey_c,k) -- local v = lpegmatch(make_pattern_c,n) -- t[k] = v -- return v -- end) -- -- local weight_n = digit + Cc("0") -- local basepart_n = Cs( (Cc("0") * fence)^-1 * (weight * validcharacter)^1 * weight * (fence * Cc("0"))^-1) / cache -- local make_pattern_n = basepart_n * specpart local make_pattern_c = Ct((P(1)/tonumber)^1) -- us + nl: 17664 entries -> 827 unique (saves some 3M) local cache = setmetatableindex(function(t,k) local v = lpegmatch(make_pattern_c,k) t[k] = v return v end) local weight_n = digit + Cc("0") local fence_n = fence / "0" local char_n = validcharacter / "" local basepart_n = Cs(fence_n^-1 * (weight_n * char_n)^1 * weight_n * fence_n^-1) / cache local make_pattern_n = basepart_n * specpart local function register_pattern(patterns,specials,str,specification) local k = lpegmatch(make_hashkey_p,str) -- local v1, v2 = lpegmatch(make_pattern_p,str) local v1, v2 = lpegmatch(make_pattern_n,str) patterns[k] = v1 -- is this key still ok for complex patterns if specification then specials[k] = specification elseif v2 then specials[k] = v2 end end local function unregister_pattern(patterns,specials,str) local k = lpegmatch(make_hashkey_p,str) patterns[k] = nil specials[k] = nil end local p_lower = lpeg.patterns.utf8lower local function register_exception(exceptions,str,specification) local l = lpegmatch(p_lower,str) local k = lpegmatch(make_hashkey_e,l) local v = lpegmatch(make_pattern_e,l) exceptions[k] = v end local p_pattern = ((Carg(1) * Carg(2) * C(char^1)) / register_pattern + 1)^1 local p_exception = ((Carg(1) * C(char^1)) / register_exception + 1)^1 local p_split = Ct(C(character)^1) function traditional.loadpatterns(language,filename) local dictionary = dictionaries[language] if not dictionary.loaded then if not filename or filename == "" then filename = "lang-" .. language end filename = file.addsuffix(filename,"lua") local fullname = resolvers.findfile(filename) if fullname and fullname ~= "" then local specification = dofile(fullname) if specification then local patterns = specification.patterns if patterns then local data = patterns.data if data and data ~= "" then lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials) end end local exceptions = specification.exceptions if exceptions then local data = exceptions.data if data and data ~= "" then lpegmatch(p_exception,data,1,dictionary.exceptions) end end end end dictionary.loaded = true end return dictionary end local lcchars = characters.lcchars local uccodes = characters.uccodes local categories = characters.categories local nofwords = 0 local nofhashed = 0 local steps = nil local f_show = formatters["%w%s"] local function show_log() if trace_steps == true then report() local w = #steps[1][1] for i=1,#steps do local s = steps[i] report("%s%w%S %S",s[1],w - #s[1] + 3,s[2],s[3] or "") end report() end end local function show_1(wsplit) local u = concat(wsplit," ") steps = { { f_show(0,u), f_show(0,u) } } end local function show_2(c,m,wsplit,done,i,spec) local s = lpegmatch(p_split,c) local t = { } local n = #m local w = #wsplit for j=1,n do t[#t+1] = m[j] t[#t+1] = s[j] end local m = 2*i-2 local l = #t local s = spec and table.sequenced(spec) or "" if m == 0 then steps[#steps+1] = { f_show(m, concat(t,"",2)), f_show(1,concat(done," ",2,#done),s) } elseif i+1 == w then steps[#steps+1] = { f_show(m-1,concat(t,"",1,#t-1)), f_show(1,concat(done," ",2,#done),s) } else steps[#steps+1] = { f_show(m-1,concat(t)), f_show(1,concat(done," ",2,#done),s) } end end local function show_3(wsplit,done) local t = { } local h = { } local n = #wsplit for i=1,n do local w = wsplit[i] if i > 1 then local d = done[i] t[#t+1] = i > 2 and d % 2 == 1 and "-" or " " h[#h+1] = d end t[#t+1] = w h[#h+1] = w end steps[#steps+1] = { f_show(0,concat(h)), f_show(0,concat(t)) } show_log() end local function show_4(wsplit,done) steps = { { concat(wsplit," ") } } show_log() end function traditional.lasttrace() return steps end -- We could reuse the w table but as we cache the resolved words there is not much gain in -- that complication. -- -- Beware: word can be a table and when n is passed to we can assume reuse so we need to -- honor that n then. -- -- todo: a fast variant for tex ... less lookups (we could check is dictionary has changed) -- ... although due to caching the already done words, we don't do much here local function hyphenate(dictionary,word,n) -- odd is okay nofwords = nofwords + 1 local hyphenated = dictionary.hyphenated local isstring = type(word) == "string" if isstring then local done = hyphenated[word] if done ~= nil then return done end elseif n then local done = hyphenated[concat(word,"",1,n)] if done ~= nil then return done end else local done = hyphenated[concat(word)] if done ~= nil then return done end end local key if isstring then key = word word = lpegmatch(p_split,word) if not n then n = #word end else if not n then n = #word end key = concat(word,"",1,n) end local l = 1 local w = { "." } -- local d = dictionary.codehash for i=1,n do local c = word[i] -- l = l + (d[c] or 1) l = l + 1 w[l] = lcchars[c] or c end l = l + 1 w[l] = "." local c = concat(w,"",2,l-1) -- local done = hyphenated[c] if done ~= nil then hyphenated[key] = done nofhashed = nofhashed + 1 return done end -- local exceptions = dictionary.exceptions local exception = exceptions[c] if exception then if trace_steps then show_4(w,exception) end hyphenated[key] = exception nofhashed = nofhashed + 1 return exception end -- if trace_steps then show_1(w) end -- local specials = dictionary.specials local patterns = dictionary.patterns -- local spec for i=1,l do for j=i,l do local c = concat(w,"",i,j) local m = patterns[c] if m then local s = specials[c] if not done then done = { } spec = nil -- the string that we resolve has explicit fences (.) so done starts at -- the first fence and runs upto the last one so we need one slot less for i=1,l do done[i] = 0 end end -- we run over the pattern that always has a (zero) value for each character -- plus one more as we look at both sides for k=1,#m do local new = m[k] if not new then break elseif new == true then report("fatal error") break elseif new > 0 then local pos = i + k - 1 local old = done[pos] if not old then -- break ? elseif new > old then done[pos] = new if s then local b = i + (s.start or 1) - 1 if b > 0 then local e = b + (s.length or 2) - 1 if e > 0 then if pos >= b and pos <= e then if spec then spec[pos] = { s, k - 1 } else spec = { [pos] = { s, k - 1 } } end end end end end end end end if trace_steps and done then show_2(c,m,w,done,i,s) end end end end if trace_steps and done then show_3(w,done) end if done then local okay = false for i=3,#done do if done[i] % 2 == 1 then done[i-2] = spec and spec[i] or true okay = true else done[i-2] = false end end if okay then done[#done] = nil done[#done] = nil else done = false end else done = false end hyphenated[key] = done nofhashed = nofhashed + 1 return done end function traditional.gettrace(language,word) if not word or word == "" then return end local dictionary = dictionaries[language] if dictionary then local hyphenated = dictionary.hyphenated hyphenated[word] = nil hyphenate(dictionary,word) return steps end end local methods = setmetatableindex(function(t,k) local v = hyphenate t[k] = v return v end) function traditional.installmethod(name,f) if rawget(methods,name) then report("overloading %a is not permitted",name) else methods[name] = f end end local s_detail_1 = "-" local f_detail_2 = formatters["%s-%s"] local f_detail_3 = formatters["{%s}{%s}{}"] local f_detail_4 = formatters["{%s%s}{%s%s}{%s}"] function traditional.injecthyphens(dictionary,word,specification) if not word then return false end if not specification then return word end local hyphens = hyphenate(dictionary,word) if not hyphens then return word end -- the following code is similar to code later on but here we have strings while there -- we have hyphen specs local word = lpegmatch(p_split,word) local size = #word local leftmin = specification.leftcharmin or 2 local rightmin = size - (specification.rightcharmin or leftmin) local leftchar = specification.leftchar local rightchar = specification.rightchar local result = { } local rsize = 0 local position = 1 while position <= size do if position >= leftmin and position <= rightmin then local hyphen = hyphens[position] if not hyphen then rsize = rsize + 1 result[rsize] = word[position] position = position + 1 elseif hyphen == true then rsize = rsize + 1 result[rsize] = word[position] rsize = rsize + 1 if leftchar and rightchar then result[rsize] = f_detail_3(rightchar,leftchar) else result[rsize] = s_detail_1 end position = position + 1 else local o, h = hyphen[2] if o then h = hyphen[1] else h = hyphen o = 1 end local b = position - o + (h.start or 1) local e = b + (h.length or 2) - 1 if b > 0 and e >= b then for i=1,b-position do rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end rsize = rsize + 1 if leftchar and rightchar then result[rsize] = f_detail_4(h.before,rightchar,leftchar,h.after,concat(word,"",b,e)) else result[rsize] = f_detail_2(h.before,h.after) end position = e + 1 else -- error rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end end else rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end end return concat(result) end do local word = C((1-space)^1) local spaces = space^1 local u_pattern = (Carg(1) * Carg(2) * word / unregister_pattern + spaces)^1 local r_pattern = (Carg(1) * Carg(2) * word * Carg(3) / register_pattern + spaces)^1 local e_pattern = (Carg(1) * word / register_exception + spaces)^1 function traditional.registerpattern(language,str,specification) local dictionary = dictionaries[language] if specification == false then lpegmatch(u_pattern,str,1,dictionary.patterns,dictionary.specials) -- unregister_pattern(dictionary.patterns,dictionary.specials,str) else lpegmatch(r_pattern,str,1,dictionary.patterns,dictionary.specials,type(specification) == "table" and specification or false) -- register_pattern(dictionary.patterns,dictionary.specials,str,specification) end end function traditional.registerexception(language,str) lpegmatch(e_pattern,str,1,dictionaries[language].exceptions) end end -- todo: unicodes or utfhash ? if context then local nodecodes = nodes.nodecodes local disccodes = nodes.disccodes local glyph_code = nodecodes.glyph local disc_code = nodecodes.disc local math_code = nodecodes.math local hlist_code = nodecodes.hlist local automaticdisc_code = disccodes.automatic local regulardisc_code = disccodes.regular local nuts = nodes.nuts local tonode = nodes.tonode local nodepool = nuts.pool local new_disc = nodepool.disc local new_penalty = nodepool.penalty local getfield = nuts.getfield local getfont = nuts.getfont local getid = nuts.getid local getattr = nuts.getattr local getnext = nuts.getnext local getprev = nuts.getprev local getsubtype = nuts.getsubtype local getlist = nuts.getlist local getlanguage = nuts.getlanguage local getattrlist = nuts.getattrlist local setattrlist = nuts.setattrlist local isglyph = nuts.isglyph local ischar = nuts.ischar local setchar = nuts.setchar local setdisc = nuts.setdisc local setlink = nuts.setlink local setprev = nuts.setprev local setnext = nuts.setnext local insertbefore = nuts.insertbefore local insertafter = nuts.insertafter local copy_node = nuts.copy local copylist = nuts.copylist local remove_node = nuts.remove local endofmath = nuts.endofmath local node_tail = nuts.tail local nexthlist = nuts.traversers.hlist local nextdisc = nuts.traversers.disc local setcolor = nodes.tracers.colors.set local variables = interfaces.variables local v_reset = variables.reset local v_yes = variables.yes local v_word = variables.word local v_all = variables.all local settings_to_array = utilities.parsers.settings_to_array local unsetvalue = attributes.unsetvalue local texsetattribute = tex.setattribute local prehyphenchar = language.prehyphenchar local posthyphenchar = language.posthyphenchar local preexhyphenchar = language.preexhyphenchar local postexhyphenchar = language.postexhyphenchar local a_hyphenation = attributes.private("hyphenation") local interwordpenalty = 5000 function traditional.loadpatterns(language) return dictionaries[language] end -- for the moment we use an independent data structure setmetatableindex(dictionaries,function(t,k) if type(k) == "string" then -- this will force a load if not yet loaded (we need a nicer way) for the moment -- that will do (nneeded for examples that register a pattern specification languages.getnumber(k) end local specification = languages.getdata(k) local dictionary = { patterns = { }, exceptions = { }, hyphenated = { }, specials = { }, instance = false, characters = { }, unicodes = { }, } if specification then local resources = specification.resources if resources then local characters = dictionary.characters or { } local unicodes = dictionary.unicodes or { } for i=1,#resources do local r = resources[i] if not r.in_dictionary then r.in_dictionary = true local patterns = r.patterns if patterns then local data = patterns.data if data then -- regular patterns lpegmatch(p_pattern,data,1,dictionary.patterns,dictionary.specials) end local extra = patterns.extra if extra then -- special patterns lpegmatch(p_pattern,extra,1,dictionary.patterns,dictionary.specials) end end local exceptions = r.exceptions if exceptions then local data = exceptions.data if data and data ~= "" then lpegmatch(p_exception,data,1,dictionary.exceptions) end end local usedchars = lpegmatch(p_split,patterns.characters) for i=1,#usedchars do local char = usedchars[i] local code = utfbyte(char) local upper = uccodes[code] characters[char] = code unicodes [code] = char if type(upper) == "table" then for i=1,#upper do local u = upper[i] unicodes[u] = utfchar(u) end else unicodes[upper] = utfchar(upper) end end end end dictionary.characters = characters dictionary.unicodes = unicodes setmetatableindex(characters,function(t,k) local v = k and utfbyte(k) t[k] = v return v end) end t[specification.number] = dictionary dictionary.instance = specification.instance -- needed for hyphenchars end t[k] = dictionary return dictionary end) -- Beware: left and right min doesn't mean that in a 1 mmm hsize there can be snippets -- with less characters than either of them! This could be an option but such a narrow -- hsize doesn't make sense anyway. -- We assume that featuresets are defined global ... local definitions (also mid paragraph) -- make not much sense anyway. For the moment we assume no predefined sets so we don't need -- to store them. Nor do we need to hash them in order to save space ... no sane user will -- define many of them. local featuresets = hyphenators.featuresets or { } hyphenators.featuresets = featuresets storage.shared.noflanguagesfeaturesets = storage.shared.noflanguagesfeaturesets or 0 local noffeaturesets = storage.shared.noflanguagesfeaturesets storage.register("languages/hyphenators/featuresets",featuresets,"languages.hyphenators.featuresets") ----- hash = table.sequenced(featureset,",") -- no need now local function register(name,featureset) noffeaturesets = noffeaturesets + 1 featureset.attribute = noffeaturesets featuresets[noffeaturesets] = featureset -- access by attribute featuresets[name] = featureset -- access by name storage.shared.noflanguagesfeaturesets = noffeaturesets return noffeaturesets end local function makeset(...) -- a bit overkill, supporting variants but who cares local set = { } for i=1,select("#",...) do local list = select(i,...) local kind = type(list) local used = nil if kind == "string" then if list == v_all then -- not ok ... now all get ignored return setmetatableindex(function(t,k) local v = utfchar(k) t[k] = v return v end) elseif list ~= "" then used = lpegmatch(p_split,list) set = set or { } for i=1,#used do local char = used[i] set[utfbyte(char)] = char end end elseif kind == "table" then if next(list) then set = set or { } for byte, char in next, list do set[byte] = char == true and utfchar(byte) or char end elseif #list > 0 then set = set or { } for i=1,#list do local l = list[i] if type(l) == "number" then set[l] = utfchar(l) else set[utfbyte(l)] = l end end end end end return set end -- category pd (tex also sees --- and -- as hyphens but do we really want that local defaulthyphens = { [0x002D] = true, -- HYPHEN-MINUS [0x00AD] = 0x002D, -- SOFT HYPHEN (active in ConTeXt) -- [0x058A] = true, -- ARMENIAN HYPHEN -- [0x1400] = true, -- CANADIAN SYLLABICS HYPHEN -- [0x1806] = true, -- MONGOLIAN TODO SOFT HYPHEN [0x2010] = true, -- HYPHEN -- [0x2011] = true, -- NON-BREAKING HYPHEN -- [0x2012] = true, -- FIGURE DASH [0x2013] = true, -- EN DASH [0x2014] = true, -- EM DASH -- [0x2015] = true, -- HORIZONTAL BAR -- [0x2027] = true, -- HYPHENATION POINT -- [0x2E17] = true, -- DOUBLE OBLIQUE HYPHEN -- [0x2E1A] = true, -- HYPHEN WITH DIAERESIS -- [0x2E3A] = true, -- TWO-EM DASH -- [0x2E3B] = true, -- THREE-EM DASH -- [0x2E40] = true, -- DOUBLE HYPHEN -- [0x301C] = true, -- WAVE DASH -- [0x3030] = true, -- WAVY DASH -- [0x30A0] = true, -- KATAKANA-HIRAGANA DOUBLE HYPHEN -- [0xFE31] = true, -- PRESENTATION FORM FOR VERTICAL EM DASH -- [0xFE32] = true, -- PRESENTATION FORM FOR VERTICAL EN DASH -- [0xFE58] = true, -- SMALL EM DASH -- [0xFE63] = true, -- SMALL HYPHEN-MINUS -- [0xFF0D] = true, -- FULLWIDTH HYPHEN-MINUS } local defaultjoiners = { [0x200C] = true, -- nzwj [0x200D] = true, -- zwj } local function somehyphenchar(c) c = tonumber(c) return c ~= 0 and c or nil end local function definefeatures(name,featureset) local extrachars = featureset.characters -- "[]()" local hyphenchars = featureset.hyphens local joinerchars = featureset.joiners local alternative = featureset.alternative local rightwordmin = tonumber(featureset.rightwordmin) local charmin = tonumber(featureset.charmin) -- luatex now also has hyphenationmin local leftcharmin = tonumber(featureset.leftcharmin) local rightcharmin = tonumber(featureset.rightcharmin) local leftchar = somehyphenchar(featureset.leftchar) local rightchar = somehyphenchar(featureset.rightchar) local rightchars = featureset.rightchars local rightedge = featureset.rightedge local autohyphen = v_yes -- featureset.autohyphen -- insert disc local hyphenonly = v_yes -- featureset.hyphenonly -- don't hyphenate around rightchars = rightchars == v_word and true or tonumber(rightchars) joinerchars = joinerchars == v_yes and defaultjoiners or joinerchars -- table hyphenchars = hyphenchars == v_yes and defaulthyphens or hyphenchars -- table -- not yet ok: extrachars have to be ignored so it cannot be all) featureset.extrachars = makeset(joinerchars or "",extrachars or "") featureset.hyphenchars = makeset(hyphenchars or "") featureset.alternative = alternative or "hyphenate" featureset.rightwordmin = rightwordmin and rightwordmin > 0 and rightwordmin or nil featureset.charmin = charmin and charmin > 0 and charmin or nil featureset.leftcharmin = leftcharmin and leftcharmin > 0 and leftcharmin or nil featureset.rightcharmin = rightcharmin and rightcharmin > 0 and rightcharmin or nil featureset.rightchars = rightchars featureset.leftchar = leftchar featureset.rightchar = rightchar -- featureset.strict = rightedge == "tex" featureset.autohyphen = autohyphen == v_yes featureset.hyphenonly = hyphenonly == v_yes return register(name,featureset) end local function setfeatures(n) if not n or n == v_reset then n = false else local f = featuresets[n] if not f and type(n) == "string" then local t = settings_to_array(n) local s = { } for i=1,#t do local ti = t[i] local fs = featuresets[ti] if fs then for k, v in next, fs do s[k] = v end end end n = register(n,s) else n = f and f.attribute end end texsetattribute(a_hyphenation,n or unsetvalue) end traditional.definefeatures = definefeatures traditional.setfeatures = setfeatures implement { name = "definehyphenationfeatures", actions = definefeatures, arguments = { "string", { { "characters" }, { "hyphens" }, { "joiners" }, { "rightchars" }, { "rightwordmin", "integer" }, { "charmin", "integer" }, { "leftcharmin", "integer" }, { "rightcharmin", "integer" }, { "leftchar", "integer" }, { "rightchar", "integer" }, { "alternative" }, { "rightedge" }, } } } implement { name = "sethyphenationfeatures", actions = setfeatures, arguments = "string" } implement { name = "registerhyphenationpattern", actions = traditional.registerpattern, arguments = { "string", "string", "boolean" } } implement { name = "registerhyphenationexception", actions = traditional.registerexception, arguments = "2 strings", } -- This is a relative large function with local variables and local functions. A previous -- implementation had the functions outside but this is cleaner and as efficient. The test -- runs 100 times over tufte.tex, knuth.tex, zapf.tex, ward.tex and darwin.tex in lower -- and uppercase with a 1mm hsize. -- -- language=0 language>0 4 | 3 * slower -- -- tex 2.34 | 1.30 2.55 | 1.45 0.21 | 0.15 -- lua 2.42 | 1.38 3.30 | 1.84 0.88 | 0.46 -- -- Of course we have extra overhead (virtual Lua machine) but also we check attributes and -- support specific local options). The test puts the typeset text in boxes and discards -- it. If we also flush the runtime is 4.31|2.56 and 4.99|2.94 seconds so the relative -- difference is (somehow) smaller. The test has 536 pages. There is a little bit of extra -- overhead because we store the patterns in a different way. -- -- As usual I will look for speedups. Some 0.01 seconds could be gained by sharing patterns -- which is not impressive but it does save some 3M memory on this test. (Some optimizations -- already brought the 3.30 seconds down to 3.14 but it all depends on aggressive caching.) -- As we kick in the hyphenator before fonts get handled, we don't look at implicit (font) -- kerns or ligatures. local starttiming = statistics.starttiming local stoptiming = statistics.stoptiming -- local strictids = { -- [nodecodes.hlist] = true, -- [nodecodes.vlist] = true, -- [nodecodes.rule] = true, -- [nodecodes.dir] = true, -- [nodecodes.whatsit] = true, -- [nodecodes.insert] = true, -- [nodecodes.adjust] = true, -- -- [nodecodes.math] = true, -- [nodecodes.disc] = true, -- -- [nodecodes.accent] = true, -- never used in context -- } -- a lot of overhead when only one char function traditional.hyphenate(head) local first = head local tail = nil local last = nil local current = first local dictionary = nil local instance = nil local characters = nil local unicodes = nil local exhyphenchar = tex.exhyphenchar local extrachars = nil local hyphenchars = nil local language = nil local lastfont = nil local start = nil local stop = nil local word = { } -- we reuse this table local size = 0 -- local leftchar = false -- local rightchar = false -- utfbyte("-") local leftexchar = false local rightexchar = false -- utfbyte("-") local leftmin = 0 local rightmin = 0 local charmin = 1 local leftcharmin = nil local rightcharmin = nil ----- leftwordmin = nil local rightwordmin = nil local rightchars = nil local leftchar = nil local rightchar = nil local attr = nil local lastwordlast = nil local hyphenated = hyphenate ----- strict = nil local exhyphenpenalty = tex.exhyphenpenalty local hyphenpenalty = tex.hyphenpenalty local autohyphen = false local hyphenonly = false -- We cannot use an 'enabled' boolean (false when no characters or extras) because we -- can have plugins that set a characters metatable and so) ... it doesn't save much -- anyway. Using (unicodes and unicodes[code]) and a nil table when no characters also -- doesn't save much. So there not that much to gain for languages that don't hyphenate. -- -- enabled = (unicodes and (next(unicodes) or getmetatable(unicodes))) -- or (extrachars and next(extrachars)) -- -- This can be used to not add characters i.e. keep size 0 but then we need to check for -- attributes that change it, which costs time too. Not much to gain there. starttiming(traditional) local function insertpenalty() local p = new_penalty(interwordpenalty) setattrlist(p,last) if trace_visualize then nuts.setvisual(p,"penalty") end last = getprev(last) first, last = insertafter(first,last,p) end local function synchronizefeatureset(a) local f = a and featuresets[a] if f then hyphenated = methods[f.alternative or "hyphenate"] extrachars = f.extrachars hyphenchars = f.hyphenchars rightwordmin = f.rightwordmin charmin = f.charmin leftcharmin = f.leftcharmin rightcharmin = f.rightcharmin leftchar = f.leftchar rightchar = f.rightchar -- strict = f.strict and strictids rightchars = f.rightchars autohyphen = f.autohyphen hyphenonly = f.hyphenonly if rightwordmin and rightwordmin > 0 and lastwordlast ~= rightwordmin then -- so we can change mid paragraph but it's kind of unpredictable then if not tail then tail = node_tail(first) end last = tail local inword = false local count = 0 while last and rightwordmin > 0 do local id = getid(last) if id == glyph_code then count = count + 1 inword = true if trace_visualize then setcolor(last,"darkgreen") end elseif inword then inword = false rightwordmin = rightwordmin - 1 if rightchars == true then if rightwordmin > 0 then insertpenalty() end elseif rightchars and count <= rightchars then insertpenalty() end end last = getprev(last) end lastwordlast = rightwordmin end if not charmin or charmin == 0 then charmin = 1 end else hyphenated = methods.hyphenate extrachars = false hyphenchars = false rightwordmin = false charmin = 1 leftcharmin = false rightcharmin = false leftchar = false rightchar = false -- strict = false autohyphen = false hyphenonly = false end return a end local function flush(hyphens) -- todo: no need for result local rightmin = size - rightmin local result = { } local rsize = 0 local position = 1 -- todo: remember last dics and don't go back to before that (plus message) ... -- for simplicity we also assume that we don't start with a dics node -- -- there can be a conflict: if we backtrack then we can end up in another disc -- and get out of sync (dup chars and so) while position <= size do if position >= leftmin and position <= rightmin then local hyphen = hyphens[position] if not hyphen then rsize = rsize + 1 result[rsize] = word[position] position = position + 1 elseif hyphen == true then rsize = rsize + 1 result[rsize] = word[position] rsize = rsize + 1 result[rsize] = true position = position + 1 else local o, h = hyphen[2] if o then -- { hyphen, offset) h = hyphen[1] else -- hyphen h = hyphen o = 1 end local b = position - o + (h.start or 1) local e = b + (h.length or 2) - 1 if b > 0 and e >= b then for i=1,b-position do rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end rsize = rsize + 1 result[rsize] = { h.before or "", -- pre h.after or "", -- post concat(word,"",b,e), -- replace h.right, -- optional after pre h.left, -- optional before post } position = e + 1 else -- error rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end end else rsize = rsize + 1 result[rsize] = word[position] position = position + 1 end end local function serialize(replacement,leftchar,rightchar) if not replacement then return elseif replacement == true then local glyph = copy_node(stop) setchar(glyph,leftchar or rightchar) return glyph end local head = nil local current = nil if leftchar then head = copy_node(stop) current = head setchar(head,leftchar) end local rsize = #replacement if rsize == 1 then local glyph = copy_node(stop) setchar(glyph,characters[replacement]) if head then insertafter(current,current,glyph) else head = glyph end current = glyph elseif rsize > 0 then local list = lpegmatch(p_split,replacement) -- this is an utf split (could be cached) for i=1,#list do local glyph = copy_node(stop) setchar(glyph,characters[list[i]]) if head then insertafter(current,current,glyph) else head = glyph end current = glyph end end if rightchar then local glyph = copy_node(stop) insertafter(current,current,glyph) setchar(glyph,rightchar) end return head end local current = start local attrnode = start -- will be different, just the first char for i=1,rsize do local r = result[i] if r == true then local disc = new_disc() local pre = nil local post = nil if rightchar then pre = serialize(true,rightchar) end if leftchar then post = serialize(true,leftchar) end setdisc(disc,pre,post,nil,regulardisc_code,hyphenpenalty) if attrnode then setattrlist(disc,attrnode) end -- could be a replace as well insertbefore(first,current,disc) elseif type(r) == "table" then local disc = new_disc() local pre = r[1] local post = r[2] local replace = r[3] local right = r[4] ~= false and rightchar local left = r[5] ~= false and leftchar if pre then if pre ~= "" then pre = serialize(pre,false,right) else pre = nil end end if post then if post ~= "" then post = serialize(post,left,false) else post = nil end end if replace then if replace ~= "" then replace = serialize(replace) else replace = nil end end -- maybe regular code setdisc(disc,pre,post,replace,regulardisc_code,hyphenpenalty) if attrnode then setattrlist(disc,attrnode) end insertbefore(first,current,disc) else setchar(current,characters[r]) if i < rsize then current = getnext(current) end end end if current and current ~= stop then local current = getnext(current) local last = getnext(stop) while current ~= last do first, current = remove_node(first,current,true) end end end local function inject(leftchar,rightchar,code,attrnode) if first ~= current then local disc = new_disc() first, current, glyph = remove_node(first,current) first, current = insertbefore(first,current,disc) if trace_visualize then setcolor(glyph,"darkred") -- these get checked setcolor(disc,"darkgreen") -- in the colorizer end local pre = nil local post = nil local replace = glyph if leftchar and leftchar > 0 then post = copy_node(glyph) setchar(post,leftchar) end pre = copy_node(glyph) setchar(pre,rightchar and rightchar > 0 and rightchar or code) setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ? if attrnode then setattrlist(disc,attrnode) end end return current end local function injectseries(current,last,next,attrnode) local disc = new_disc() local start = current first, current = insertbefore(first,current,disc) setprev(start) setnext(last) if next then setlink(current,next) else setnext(current) end local pre = copylist(start) local post = nil local replace = start setdisc(disc,pre,post,replace,automaticdisc_code,hyphenpenalty) -- ex ? if attrnode then setattrlist(disc,attrnode) end return current end local a = getattr(first,a_hyphenation) if a ~= attr then attr = synchronizefeatureset(a) end -- The first attribute in a word determines the way a word gets hyphenated and if -- relevant, other properties are also set then. We could optimize for silly one-char -- cases but it has no priority as the code is still not that much slower than the -- native hyphenator and this variant also provides room for extensions. local skipping = false -- In "word word word." the sequences "word" and "." can be a different font! while current and current ~= last do -- and current local code, id = isglyph(current) if code then if skipping then current = getnext(current) else local lang = getlanguage(current) local font = getfont(current) if lang ~= language or font ~= lastfont then if dictionary and size > charmin and leftmin + rightmin <= size then -- only german has many words starting with an uppercase character if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then -- skip else local hyphens = hyphenated(dictionary,word,size) if hyphens then flush(hyphens) end end end lastfont = font if language ~= lang and lang > 0 then -- dictionary = dictionaries[lang] instance = dictionary.instance characters = dictionary.characters unicodes = dictionary.unicodes -- local a = getattr(current,a_hyphenation) attr = synchronizefeatureset(a) leftchar = leftchar or (instance and posthyphenchar (instance)) -- we can make this more rightchar = rightchar or (instance and prehyphenchar (instance)) -- efficient if needed leftexchar = (instance and preexhyphenchar (instance)) rightexchar = (instance and postexhyphenchar(instance)) leftmin = leftcharmin or getfield(current,"lhmin") rightmin = rightcharmin or getfield(current,"rhmin") if not leftchar or leftchar < 0 then leftchar = false end if not rightchar or rightchar < 0 then rightchar = false end -- local char = unicodes[code] or (extrachars and extrachars[code]) if char then word[1] = char size = 1 start = current else size = 0 end else size = 0 end language = lang elseif language <= 0 then -- elseif size > 0 then local char = unicodes[code] or (extrachars and extrachars[code]) if char then size = size + 1 word[size] = char elseif dictionary then if not hyphenonly or code ~= exhyphenchar then if size > charmin and leftmin + rightmin <= size then if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then -- skip else local hyphens = hyphenated(dictionary,word,size) if hyphens then flush(hyphens) end end end end size = 0 if code == exhyphenchar then -- normally the - local next = getnext(current) local last = current local font = getfont(current) while next and ischar(next,font) == code do last = next next = getnext(next) end if not autohyphen then current = last elseif current == last then current = inject(leftexchar,rightexchar,code,current) else current = injectseries(current,last,next,current) end if hyphenonly then skipping = true end elseif hyphenchars then local char = hyphenchars[code] if char == true then char = code end if char then current = inject(leftchar and char or nil,rightchar and char or nil,char,current) end end end else local a = getattr(current,a_hyphenation) if a ~= attr then attr = synchronizefeatureset(a) -- influences extrachars leftchar = leftchar or (instance and posthyphenchar (instance)) -- we can make this more rightchar = rightchar or (instance and prehyphenchar (instance)) -- efficient if needed leftexchar = (instance and preexhyphenchar (instance)) rightexchar = (instance and postexhyphenchar(instance)) leftmin = leftcharmin or getfield(current,"lhmin") rightmin = rightcharmin or getfield(current,"rhmin") if not leftchar or leftchar < 0 then leftchar = false end if not rightchar or rightchar < 0 then rightchar = false end end -- local char = unicodes[code] or (extrachars and extrachars[code]) if char then word[1] = char size = 1 start = current end end stop = current current = getnext(current) end else if skipping then skipping = false end if id == disc_code then size = 0 current = getnext(current) if hyphenonly then skipping = true end -- elseif strict and strict[id] then -- current = id == math_code and getnext(endofmath(current)) or getnext(current) -- size = 0 else current = id == math_code and getnext(endofmath(current)) or getnext(current) end if size > 0 then if dictionary and size > charmin and leftmin + rightmin <= size then if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then -- skip else local hyphens = hyphenated(dictionary,word,size) if hyphens then flush(hyphens) end end end size = 0 end end end -- we can have quit due to last so we need to flush the last seen word, we could move -- this in the loop and test for current but ... messy if dictionary and size > charmin and leftmin + rightmin <= size then if categories[word[1]] == "lu" and getfield(start,"uchyph") < 0 then -- skip else local hyphens = hyphenated(dictionary,word,size) if hyphens then flush(hyphens) end end end stoptiming(traditional) return head end statistics.register("hyphenation",function() if nofwords > 0 or statistics.elapsed(traditional) > 0 then return string.format("%s words hyphenated, %s unique, used time %s", nofwords,nofhashed,statistics.elapsedseconds(traditional) or 0) end end) local texmethod = "builders.kernel.hyphenation" local oldmethod = texmethod local newmethod = texmethod -- local newmethod = "languages.hyphenators.traditional.hyphenate" -- -- nodes.tasks.prependaction("processors","words",newmethod) -- nodes.tasks.disableaction("processors",oldmethod) -- -- nodes.tasks.replaceaction("processors","words",oldmethod,newmethod) -- \enabledirectives[hyphenators.method=traditional] -- \enabledirectives[hyphenators.method=builtin] -- push / pop ? check first attribute -- local replaceaction = nodes.tasks.replaceaction -- no longer overload this way (too many local switches) local hyphenate = language.hyphenate local hyphenating = nuts.hyphenating local methods = { } local usedmethod = false local stack = { } local original = hyphenating and function(head) return (hyphenating(head)) end or function(head) hyphenate(tonode(head)) return head -- a nut end -- local has_language = language.has_language -- -- local function original(head) -- kernel.hyphenation(head) -- local h = tonode(head) -- if has_language(h) then -- hyphenate(h) -- end -- return head -- end local getcount = tex.getcount hyphenators.methods = methods local optimize = false directives.register("hyphenator.optimize", function(v) optimize = v end) function hyphenators.handler(head,groupcode) if usedmethod then if optimize and (groupcode == "hbox" or groupcode == "adjustedhbox") then if getcount("hyphenstate") > 0 then forced = false return usedmethod(head) else return head end else return usedmethod(head) end else return head end end methods.tex = original methods.original = original methods.expanded = original -- was expanded before 1.005 methods.traditional = languages.hyphenators.traditional.hyphenate methods.none = false -- function(head) return head, false end usedmethod = original local function setmethod(method) usedmethod = type(method) == "string" and methods[method] if usedmethod == nil then usedmethod = methods.tex end end local function pushmethod(method) insert(stack,usedmethod) usedmethod = type(method) == "string" and methods[method] if usedmethod == nil then usedmethod = methods.tex end end local function popmethod() usedmethod = remove(stack) or methods.tex end hyphenators.setmethod = setmethod hyphenators.pushmethod = pushmethod hyphenators.popmethod = popmethod directives.register("hyphenators.method",setmethod) function hyphenators.setup(specification) local method = specification.method if method then setmethod(method) end end implement { name = "sethyphenationmethod", actions = setmethod, arguments = "string" } implement { name = "pushhyphenation", actions = pushmethod, arguments = "string" } implement { name = "pophyphenation", actions = popmethod } -- can become a runtime loaded one: local context = context local ctx_NC = context.NC local ctx_NR = context.NR local ctx_verbatim = context.verbatim function hyphenators.showhyphenationtrace(language,word) if not word or word == "" then return end local saved = trace_steps trace_steps = "silent" local steps = traditional.gettrace(language,word) trace_steps = saved if steps then local n = #steps if n > 0 then context.starttabulate { "|r|l|l|l|" } for i=1,n do local s = steps[i] ctx_NC() if i > 1 and i < n then context(i-1) end ctx_NC() ctx_verbatim(s[1]) ctx_NC() ctx_verbatim(s[2]) ctx_NC() ctx_verbatim(s[3]) ctx_NC() ctx_NR() end context.stoptabulate() end end end implement { name = "showhyphenationtrace", actions = hyphenators.showhyphenationtrace, arguments = "2 strings", } function nodes.stripdiscretionaries(head) for l in nexthlist, head do for d in nextdisc, getlist(l) do remove_node(h,false,true) end end return head end else -- traditional.loadpatterns("nl","lang-nl") -- traditional.loadpatterns("de","lang-de") -- traditional.loadpatterns("us","lang-us") -- traditional.registerpattern("nl","e1ë", { start = 1, length = 2, before = "e", after = "e" } ) -- traditional.registerpattern("nl","oo7ë", { start = 2, length = 3, before = "o", after = "e" } ) -- traditional.registerpattern("de","qqxc9xkqq",{ start = 3, length = 4, before = "ab", after = "cd" } ) -- local specification = { -- leftcharmin = 2, -- rightcharmin = 2, -- leftchar = "<", -- rightchar = ">", -- } -- print("reëel", traditional.injecthyphens(dictionaries.nl,"reëel", specification),"r{e>}{}{}{