texlanguage.c /size: 72 Kb    last modification: 2024-01-16 10:22
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
    We no longer dump the patterns and exceptions as they are supposed to be loaded at runtime.
    There is no gain in getting them from the format. But we do dump some of the properties.
11
12    There were all kind of checks for simple characters i.e. not ligatures but there is no need for
13    that in \LUAMETATEX. We have separated stages and the hyphenator sees just glyphs. And when a
14    traditional font has glyphs we can assume that the old school font encoding matches the patterns
15    i.e. that ligatures are not in the normal character slots.
16
    Exceptions are stored at the \LUA\ end. We cannot easily go dynamic because fonts are stored
18    in the eqtb so we would have to use some more indirect mechanism (doable as we do it for other
19    items) too.
20
21*/
22
/*tex
    The global language state: the array of language records plus the bookkeeping for that
    array. In this file |top| is used as the highest index that may be addressed in |languages|
    and |ptr| is the last id that was handed out when the caller asked for an automatic id.
*/

language_state_info lmt_language_state = {
    /*tex The array itself is allocated by |tex_initialize_languages| or when undumping. */
    .languages        = NULL,
    .language_data    = {
        .minimum      = min_language_size,
        .maximum      = max_language_size,
        .size         = memory_data_unset,
        .step         = stp_language_size,
        /*tex The number of bytes accounted for so far. */
        .allocated    = 0,
        .itemsize     = 1,
        .top          = 0,
        .ptr          = 0,
        .initial      = memory_data_unset,
        .offset       = 0,
    },
    .handler_table_id = 0,
    .handler_count    = 0,
};
40
41/*tex
42    We can enforce a language id but we want to be sequential so we accept holes! So one
43    has to define bottom-up. As with fonts, we have a zero language but that one normally
44    is not set.
45*/
46
47static void tex_aux_reset_language(halfword id)
48{
49    tex_language *lang = lmt_language_state.languages[id];
50    lang->id = id;
51    lang->exceptions = 0;
52    lang->patterns = NULL;
53    lang->wordhandler = 0;
54    lang->pre_hyphen_char = '-';
55    lang->post_hyphen_char = 0;
56    lang->pre_exhyphen_char = 0;
57    lang->post_exhyphen_char = 0;
58    lang->hyphenation_min = -1;
59    lang->hjcode_head = NULL;
60}
61
62/*tex
63    A value below zero will bump the language id. Because we have a rather limited number of
64    languages there is no configuration, size is just maximum.
65*/
66
67static halfword tex_aux_new_language_id(halfword id)
68{
69    int top;
70    if (id >= 0) {
71        if (id <= lmt_language_state.language_data.top) {
72            if (lmt_language_state.languages[id]) {
73                return tex_formatted_error("languages", "the language with id %d is already created", id);
74            } else {
75                return id;
76            }
77        } else if (id > lmt_language_state.language_data.maximum) {
78            goto OVERFLOWERROR;
79        } else {
80            top = id;
81        }
82    } else if (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
83        ++lmt_language_state.language_data.ptr;
84        return lmt_language_state.language_data.ptr;
85    } else if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
86        goto OVERFLOWERROR;
87    } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
88        top = lmt_language_state.language_data.maximum;
89    } else {
90        top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
91    }
92    /*tex Finally we can bump memory. */
93    {
94        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
95        if (tmp) {
96            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
97                tmp[i] = NULL;
98            }
99            lmt_language_state.languages = tmp;
100            lmt_language_state.language_data.allocated += ((size_t) top - lmt_language_state.language_data.top) * sizeof(tex_language *);
101            lmt_language_state.language_data.top = top;
102            lmt_language_state.language_data.ptr += 1;
103            return lmt_language_state.language_data.ptr;
104        }
105    }
106  OVERFLOWERROR:
107    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
108    return 0;
109}
110
111void tex_initialize_languages(void)
112{
113    tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
114    if (tmp) {
115        for (int i = 0; i < lmt_language_state.language_data.minimum; i++) {
116            tmp[i] = NULL;
117        }
118        lmt_language_state.languages = tmp;
119        lmt_language_state.language_data.allocated += lmt_language_state.language_data.minimum * sizeof(tex_language *);
120        lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
121    } else {
122        tex_overflow_error("languages", lmt_language_state.language_data.minimum);
123    }
124}
125
126/*
127halfword tex_aux_maximum_language_id(void)
128{
129    return language_state.language_data.maximum;
130}
131*/
132
133int tex_is_valid_language(halfword n)
134{
135    if (n == 0) {
136        return 1;
137    } else if (n > 0 && n <= lmt_language_state.language_data.top) {
138        return lmt_language_state.languages[n] ? 1 : 0;
139    } else {
140        return 0;
141    }
142}
143
144tex_language *tex_new_language(halfword n)
145{
146    halfword id = tex_aux_new_language_id(n);
147    if (id >= 0) {
148        tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
149        if (lang) {
150            lmt_language_state.languages[id] = lang;
151            lmt_language_state.language_data.allocated += sizeof(struct tex_language);
152            tex_aux_reset_language(id);
153            if (saving_hyph_codes_par) {
154                /*tex
155                    For now, we might just use specific value for whatever task. This will become
156                    obsolete.
157                */
158                tex_hj_codes_from_lc_codes(id);
159            }
160        } else {
161            tex_overflow_error("language", sizeof(struct tex_language));
162        }
163        return lang;
164    } else {
165        return NULL;
166    }
167}
168
169tex_language *tex_get_language(halfword n)
170{
171    if (n >= 0) {
172        if (n <= lmt_language_state.language_data.top && lmt_language_state.languages[n]) {
173            return lmt_language_state.languages[n];
174        }
175        if (n <= lmt_language_state.language_data.maximum) {
176            return tex_new_language(n);
177        }
178    }
179    return NULL;
180}
181
182/*tex
183    Freeing, dumping, undumping languages:
184*/
185
186/*
187void free_languages(void)
188{
189    for (int i = 0; i < language_state.language_data.top; i++) {
190        if (language_state.languages[i]) {
191            lmt_memory_free(language_state.languages[i]);
192            language_state.languages[i] = NULL;
193        }
194    }
195}
196*/
197
198void tex_dump_language_data(dumpstream f)
199{
200    dump_int(f, lmt_language_state.language_data.top);
201    dump_int(f, lmt_language_state.language_data.ptr);
202    if (lmt_language_state.language_data.top > 0) {
203        for (int i = 0; i < lmt_language_state.language_data.top; i++) {
204            tex_language *lang = lmt_language_state.languages[i];
205            if (lang) {
206                dump_via_int(f, 1);
207                dump_int(f, lang->id);
208                dump_int(f, lang->pre_hyphen_char);
209                dump_int(f, lang->post_hyphen_char);
210                dump_int(f, lang->pre_exhyphen_char);
211                dump_int(f, lang->post_exhyphen_char);
212                dump_int(f, lang->hyphenation_min);
213                tex_dump_language_hj_codes(f, i);
214            } else {
215                dump_via_int(f, 0);
216            }
217        }
218    }
219}
220
221void tex_undump_language_data(dumpstream f)
222{
223    int top, ptr;
224    undump_int(f, top);
225    undump_int(f, ptr);
226    if (top > 0) {
227        tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
228        if (tmp) {
229            lmt_language_state.language_data.top = top;
230            lmt_language_state.language_data.ptr = ptr;
231            lmt_language_state.languages = tmp;
232            for (int i = 0; i < top; i++) {
233                int x;
234                undump_int(f, x);
235                if (x == 1) {
236                    tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
237                    if (lang) {
238                        lmt_language_state.languages[i] = lang;
239                        lmt_language_state.language_data.allocated += sizeof(struct tex_language);
240                        lang->exceptions = 0;
241                        lang->patterns = NULL;
242                        lang->wordhandler = 0;
243                        lang->hjcode_head = NULL;
244                        undump_int(f, lang->id);
245                        undump_int(f, lang->pre_hyphen_char);
246                        undump_int(f, lang->post_hyphen_char);
247                        undump_int(f, lang->pre_exhyphen_char);
248                        undump_int(f, lang->post_exhyphen_char);
249                        undump_int(f, lang->hyphenation_min);
250                        tex_undump_language_hj_codes(f, i);
251                        if (lang->id != i) {
252                            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, i);
253                            lang->id = i;
254                        }
255                    } else {
256                        tex_overflow_error("languages", i);
257                    }
258                    tmp[i] = lang;
259                } else {
260                    tmp[i] = NULL;
261                }
262            }
263            lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
264        } else {
265            tex_overflow_error("languages", top);
266            lmt_language_state.language_data.initial = 0;
267        }
268    } else {
269        /*tex Indeed we can have no languages stored. */
270        tex_initialize_languages();
271    }
272}
273
274/*tex All kind of accessors. */
275
276void tex_set_pre_hyphen_char(halfword n, halfword v)
277{
278    struct tex_language *l = tex_get_language(n);
279    if (l) {
280        l->pre_hyphen_char = v;
281    }
282}
283
284void tex_set_post_hyphen_char(halfword n, halfword v)
285{
286    struct tex_language *l = tex_get_language(n);
287    if (l) {
288        l->post_hyphen_char = v;
289    }
290}
291
292void tex_set_pre_exhyphen_char(halfword n, halfword v)
293{
294    struct tex_language *l = tex_get_language(n);
295    if (l) {
296        l->pre_exhyphen_char = v;
297    }
298}
299
300void tex_set_post_exhyphen_char(halfword n, halfword v)
301{
302    struct tex_language *l = tex_get_language(n);
303    if (l) {
304        l->post_exhyphen_char = v;
305    }
306}
307
308halfword tex_get_pre_hyphen_char(halfword n)
309{
310    struct tex_language *l = tex_get_language(n);
311    return l ? l->pre_hyphen_char : -1;
312}
313
314halfword tex_get_post_hyphen_char(halfword n)
315{
316    struct tex_language *l = tex_get_language(n);
317    return l ? l->post_hyphen_char : -1;
318}
319
320halfword tex_get_pre_exhyphen_char(halfword n)
321{
322    struct tex_language *l = tex_get_language(n);
323    return l ? l->pre_exhyphen_char : -1;
324}
325
326halfword tex_get_post_exhyphen_char(halfword n)
327{
328    struct tex_language *l = tex_get_language(n);
329    return (l) ? (int) l->post_exhyphen_char : -1;
330}
331
332void tex_set_hyphenation_min(halfword n, halfword v)
333{
334    struct tex_language *l = tex_get_language(n);
335    if (l) {
336        l->hyphenation_min = v;
337    }
338}
339
340halfword tex_get_hyphenation_min(halfword n)
341{
342    struct tex_language *l = tex_get_language((int) n);
343    return l ? l->hyphenation_min : -1;
344}
345
346void tex_load_patterns(struct tex_language *lang, const unsigned char *buff)
347{
348    if ((! lang) || (! buff) || strlen((const char *) buff) == 0) {
349        return;
350    } else {
351        if (! lang->patterns) {
352            lang->patterns = hnj_dictionary_new();
353        }
354        hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0);
355    }
356}
357
358void tex_clear_patterns(struct tex_language *lang)
359{
360    if (lang && lang->patterns) {
361        hnj_dictionary_clear(lang->patterns);
362    }
363}
364
365void tex_load_tex_patterns(halfword curlang, halfword head)
366{
367    /*tex We might want single hashes. */
368    char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1); /* single hashes */
369    if (s) {
370        tex_load_patterns(tex_get_language(curlang), (unsigned char *) s);
371    }
372}
373
374/*
375    This cleans one word which is returned in |cleaned|, returns the new offset into |buffer|.
376*/
377
378/* define tex_isspace(c) (c == ' ' || c == '\t') */
/* Only the plain space separates words; the argument is parenthesized so that any expression can be passed. */
# define tex_isspace(c) ((c) == ' ')
380
381const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned)
382{
383    int items = 0;
384    /*tex Work buffer for bytes: */
385    unsigned char word[max_size_of_word + 1];
386    /*tex Work buffer for \UNICODE: */
387    unsigned uword[max_size_of_word + 1] = { 0 };
388    /*tex The \UNICODE\ buffer value: */
389    int i = 0;
390    char *uindex = (char *) word;
391    const char *s = buff;
392    while (*s && ! tex_isspace((unsigned char)*s)) {
393        word[i++] = (unsigned char) *s;
394        s++;
395        if ((s-buff) > max_size_of_word) {
396            /*tex Todo: this is too strict, should count \UNICODE, not bytes. */
397            *cleaned = NULL;
398            tex_handle_error(
399                normal_error_type,
400                "Exception too long",
401                NULL
402            );
403            return s;
404        }
405    }
406    /*tex Now convert the input to \UNICODE. */
407    word[i] = '\0';
408    aux_splitutf2uni(uword, (const char *)word);
409    /*tex
410        Build the new word string. The hjcode values < 32 indicate a length, so that
411        for instance \|hjcode`ܽ2| makes that ligature count okay.
412    */
413    i = 0;
414    while (uword[i] > 0) {
415        int u = uword[i++];
416        if (u == '-') {
417            /*tex Skip. */
418        } else if (u == '=') {
419            unsigned c = tex_get_hj_code(id, '-');
420            uindex = aux_uni2string(uindex, (! c || c <= 32) ? '-' : c);
421        } else if (u == '{') {
422            u = uword[i++];
423            items = 0;
424            while (u && u != '}') {
425                u = uword[i++];
426            }
427            if (u == '}') {
428                items++;
429                u = uword[i++];
430            }
431            while (u && u != '}') {
432                u = uword[i++];
433            }
434            if (u == '}') {
435                items++;
436                u = uword[i++];
437            }
438            if (u == '{') {
439                u = uword[i++];
440            }
441            while (u && u != '}') {
442                unsigned c = tex_get_hj_code(id, u);
443                uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
444                u = uword[i++];
445            }
446            if (u == '}') {
447                items++;
448            }
449            if (items != 3) {
450                /* hm, we intercept that elsewhere in a better way so why here? Best remove the test here or move the other one here. */
451                *cleaned = NULL;
452                tex_handle_error(
453                    normal_error_type,
454                    "Exception syntax error, a discretionary has three components: {}{}{}.",
455                    NULL
456                );
457                return s;
458            } else {
459                /* skip replacement (chars) */
460                if (uword[i] == '(') {
461                    while (uword[++i] && uword[i] != ')') { };
462                    if (uword[i] != ')') {
463                        tex_handle_error(
464                            normal_error_type,
465                            "Exception syntax error, an alternative replacement is defined as (text).",
466                            NULL
467                        );
468                        return s;
469                    } else if (uword[i]) {
470                        i++;
471                   }
472                }
473                /* skip penalty: [digit] but we intercept multiple digits */
474                if (uword[i] == '[') {
475                    if (uword[i+1] && uword[i+1] >= '0' && uword[i+1] <= '9' && uword[i+2] && uword[i+2] == ']') {
476                        i += 3;
477                    } else {
478                        tex_handle_error(
479                            normal_error_type,
480                            "Exception syntax error, a penalty is defined as [digit].",
481                            NULL
482                        );
483                        return s;
484                    }
485                }
486            }
487        } else {
488            unsigned c = tex_get_hj_code(id, u);
489            uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
490        }
491    }
492    *uindex = '\0';
493    *cleaned = lmt_memory_strdup((char *) word);
494    return s;
495}
496
497void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff)
498{
499    if (lang) {
500        lua_State *L = lmt_lua_state.lua_instance;
501        const char *s = (const char *) buff;
502        char *cleaned = NULL;
503        int id = lang->id;
504        if (lang->exceptions == 0) {
505            lua_newtable(L);
506            lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
507        }
508        lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
509        while (*s) {
510            while (tex_isspace((unsigned char) *s)) {
511                s++;
512            }
513            if (*s) {
514                const char *value = s;
515                s = tex_clean_hyphenation(id, s, &cleaned);
516                if (cleaned) {
517                    size_t len = s - value;
518                    if (len > 0) {
519                        lua_pushstring(L, cleaned);
520                        lua_pushlstring(L, value, len);
521                        lua_rawset(L, -3);
522                    }
523                    lmt_memory_free(cleaned);
524                } else {
525                    /* tex_formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value); */
526                }
527            }
528        }
529        lua_pop(L, 1);
530    }
531}
532
533void tex_clear_hyphenation(struct tex_language *lang)
534{
535    if (lang && lang->exceptions != 0) {
536        lua_State *L = lmt_lua_state.lua_instance;
537        luaL_unref(L, LUA_REGISTRYINDEX, lang->exceptions);
538        lang->exceptions = 0;
539    }
540}
541
542void tex_load_tex_hyphenation(halfword curlang, halfword head)
543{
544    char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1); /* single hashes */
545    if (s) {
546        tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) s);
547    }
548}
549
550static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty)
551{
552    /*tex For compound words following explicit hyphens we take the current font. */
553    halfword d = tex_new_disc_node(subtype);
554    halfword a = node_attr(t) ;
555    disc_penalty(d) = penalty;
556    if (t == replace) {
557        /*tex We have |prev disc next-next|. */
558        tex_try_couple_nodes(d, node_next(t));
559        tex_try_couple_nodes(node_prev(t), d);
560        node_prev(t) = null;
561        node_next(t) = null;
562        replace = t;
563    } else {
564        /*tex We have |prev disc next|. */
565        tex_try_couple_nodes(d, node_next(t));
566        tex_couple_nodes(t, d);
567    }
568    if (a) {
569        tex_attach_attribute_list_attribute(d, a);
570    }
571    tex_set_disc_field(d, pre_break_code, pre);
572    tex_set_disc_field(d, post_break_code, post);
573    tex_set_disc_field(d, no_break_code, replace);
574    return d;
575}
576
577static halfword tex_aux_insert_syllable_discretionary(halfword t, lang_variables *lan)
578{
579    halfword n = tex_new_disc_node(syllable_discretionary_code);
580    disc_penalty(n) = hyphen_penalty_par;
581    tex_couple_nodes(n, node_next(t));
582    tex_couple_nodes(t, n);
583    tex_attach_attribute_list_attribute(n, get_attribute_list(t));
584    if (lan->pre_hyphen_char > 0) {
585        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t);
586        tex_set_disc_field(n, pre_break_code, g);
587    }
588    if (lan->post_hyphen_char > 0) {
589        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t);
590        tex_set_disc_field(n, post_break_code, g);
591    }
592    return n;
593}
594
595static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
596{
597    halfword prechar, postchar, pre, post, disc;
598    if (chr == ex_hyphen_char_par) {
599        halfword pre_exhyphen_char = tex_get_pre_exhyphen_char(clang);
600        halfword post_exhyphen_char = tex_get_post_exhyphen_char(clang);
601        prechar  = pre_exhyphen_char  > 0 ? pre_exhyphen_char  : ex_hyphen_char_par;
602        postchar = post_exhyphen_char > 0 ? post_exhyphen_char : null;
603    } else {
604        /* we need a flag : use pre/post cf language spec */
605        prechar  = chr;
606        postchar = null;
607    }
608    pre  = prechar  > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar,  t) : null;
609    post = postchar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t) : null;
610    disc = tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
611    return disc;
612}
613
614static char *tex_aux_hyphenation_exception(int exceptions, char *w)
615{
616    lua_State *L = lmt_lua_state.lua_instance;
617    char *ret = NULL;
618    if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) == LUA_TTABLE) {
619        /*tex Word table: */
620        lua_pushstring(L, w);
621        lua_rawget(L, -2);
622        if (lua_type(L, -1) == LUA_TSTRING) {
623            ret = lmt_memory_strdup(lua_tostring(L, -1));
624        }
625        lua_pop(L, 2);
626    } else {
627        lua_pop(L, 1);
628    }
629    return ret;
630}
631
632/*tex
633
    The sequence from |wordstart| to |r| can contain only normal characters. It could be faster
    to modify a halfword pointer and return an integer.
636
637*/
638
/*tex
    The code points of the zero width characters. In exception parts the joiner and non-joiner
    are not turned into glyphs (unless they come first) but control ligaturing and kerning
    between their neighbours.
*/

# define zws  0x200B /* zero width space makes no sense */
# define zwnj 0x200C /* zero width non-joiner */
# define zwj  0x200D /* zero width joiner */
642
643static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
644{
645    halfword head = null;
646    halfword tail = null;
647    unsigned i = *j;
648    int noligature = 0;
649    int nokerning = 0;
650    /*tex This puts uword[i] on the |{|. */
651    i++;
652    while (i < (unsigned) len && uword[i + 1] != (unsigned int) final) {
653        if (tail) {
654            switch (uword[i + 1]) {
655                case zwj:
656                    noligature = 1;
657                    nokerning = 0;
658                    break;
659                case zwnj:
660                    noligature = 1;
661                    nokerning = 1;
662                    break;
663                default:
664                    {
665                        halfword s = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
666                        tex_couple_nodes(tail, s);
667                        if (noligature) {
668                            tex_add_glyph_option(tail, glyph_option_no_right_ligature);
669                            tex_add_glyph_option(s, glyph_option_no_left_ligature);
670                            noligature = 0;
671                        }
672                        if (nokerning) {
673                            tex_add_glyph_option(tail, glyph_option_no_right_kern);
674                            tex_add_glyph_option(s, glyph_option_no_left_kern);
675                            nokerning = 0;
676                        }
677                        tail = node_next(tail);
678                        break;
679                    }
680            }
681        } else {
682            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
683            tail = head;
684        }
685        i++;
686    }
687    *j = ++i;
688    return head;
689}
690
/*tex
    Count the characters in a braced part of an exception specification. On entry
    |uword[*j + 1]| is the opening brace, on exit |uword[*j + 1]| is the character after the
    closing brace (or we ran into |len|). The number of characters between the braces is
    returned.
*/

static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    /*tex This puts us on the |{|, so that we look ahead at the first character of the part. */
    unsigned start = *j + 1;
    unsigned pos = start;
    while (pos < (unsigned) len && uword[pos + 1] != '}') {
        pos++;
    }
    /*tex Step over the closing brace. */
    *j = pos + 1;
    return (int) (pos - start);
}
704
705static void tex_aux_show_exception_error(const char *part)
706{
707    tex_handle_error(
708        normal_error_type,
709        "Invalid %s part in exception",
710        part,
711        "Exception discretionaries should contain three pairs of braced items.\n"
712        "No intervening spaces are allowed."
713    );
714}
715
716/*tex
717
718    The exceptions are taken as-is: no min values are taken into account. One can add normal
719    patterns on-the-fly if needed.
720
721*/
722
/*tex
    Apply the exception specification |replacement| to the glyphs that start at |wordstart| and
    end before |r|. We walk over the specification and the node list in parallel: |t| is the
    current node and |uword[i + 1]| is what follows the character that belongs to |t|. A |-|
    gives a syllable discretionary after |t|, a |=| is stepped over and a |{| starts a
    |{pre}{post}{replace}| specification that can be followed by an alternative replacement
    |(...)| and a penalty |[digit]|.
*/

static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
{
    halfword t = wordstart;
    lang_variables langdata;
    unsigned uword[max_size_of_word + 1] = { 0 };
    unsigned len = aux_splitutf2uni(uword, replacement);
    /*tex The hyphen characters come from the language of the first glyph. */
    int clang = get_glyph_language(wordstart);
    langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
    for (unsigned i = 0; i < len; i++) {
        if (uword[i + 1] == 0 ) {
            /*tex We ran out of the exception pattern. */
            break;
        } else if (uword[i + 1] == '-') {
            /*tex A hyphen follows. */
            if (node_next(t) == r) {
                /*tex There is nothing after |t| that could move to the next line. */
                break;
            } else {
                tex_aux_insert_syllable_discretionary(t, &langdata);
                /*tex Skip the new disc */
                t = node_next(t);
            }
        } else if (uword[i + 1] == '=') {
            /*tex We skip a disc. */
            t = node_next(t);
        } else if (uword[i + 1] == '{') {
            /*tex We ran into an exception |{}{}{}| or |{}{}{}[]|. */
            halfword pre = null;
            halfword post = null;
            halfword replace = null;
            int count = 0;
            int alternative = null;
            halfword penalty;
            /*tex
                The helpers advance |i| over the part that they handle. Note (review): after
                an error is reported we just carry on with what we have.
            */
            /*tex |pre| */
            pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("pre");
            }
            /*tex |post| */
            post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("post");
            }
            /*tex |replace|: here we only count, the nodes are taken from the list. */
            count = tex_aux_count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_aux_show_exception_error("replace");
            } else if (uword[i] && uword[i + 1] == '(') {
                /*tex An alternative replacement is given explicitly as |(text)|. */
                alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
            }
            /*tex Play safe. */
            if (node_next(t) == r) {
                break;
            } else {
                /*tex Let's deal with an (optional) replacement. */
                if (count > 0) {
                    /*tex Assemble the replace stream. */
                    halfword q = t;
                    replace = node_next(q);
                    while (count > 0 && q) {
                        /*tex Beware: this |t| is a node type that shadows the outer |t|. */
                        halfword t = node_type(q);
                        q = node_next(q);
                        if (t == glyph_node || t == disc_node) {
                            count--;
                        } else {
                            break ;
                        }
                    }
                    /*tex
                        Note (review): |q| can be |null| here when the list ended before
                        |count| reached zero; what follows doesn't check for that.
                    */
                    /*tex Remove it from the main stream */
                    tex_try_couple_nodes(t, node_next(q));
                    /*tex and finish it in the replace. */
                    node_next(q) = null;
                    if (alternative) {
                        /*tex The nodes that we took out are replaced by the alternative. */
                        tex_flush_node_list(replace);
                        replace = alternative;
                    } else {
                        /*tex Sanitize the replace stream (we could use the flattener instead). */
                        q = replace ;
                        while (q) {
                            halfword n = node_next(q);
                            if (node_type(q) == disc_node) {
                                /*tex Beware: the replacement starts after the no_break pointer. */
                                halfword nb = disc_no_break_head(q);
                                disc_no_break_head(q) = null;
                                node_prev(nb) = null ; /* used at all? */
                                /*tex Insert the replacement glyph. */
                                if (q == replace) {
                                    replace = nb;
                                } else {
                                    tex_try_couple_nodes(node_prev(q), nb);
                                }
                                /*tex Append the glyph (one). */
                                tex_try_couple_nodes(nb, n);
                                /*tex Flush the disc. */
                                tex_flush_node(q);
                            }
                            q = n ;
                        }
                    }
                }
                /*tex Let's check if we have a penalty spec. If we have more then we're toast, we just ignore them. */
                if (uword[i] && uword[i + 1] == '[') {
                    i += 2;
                    if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
                        /*tex
                            The digit is a multiplier for the exception penalty, unless that
                            penalty is above the infinite penalty, in which case it is taken
                            as-is. Without a positive exception penalty we use the hyphen penalty.
                        */
                        if (exception_penalty_par > 0) {
                            if (exception_penalty_par > infinite_penalty) {
                                penalty = exception_penalty_par;
                            } else {
                                penalty = (uword[i] - '0') * exception_penalty_par ;
                            }
                        } else {
                            penalty = hyphen_penalty_par;
                        }
                        ++i;
                        /*tex Whatever comes before the closing bracket is skipped. */
                        while (uword[i] && uword[i] != ']') {
                            ++i;
                        }
                    } else {
                        penalty = hyphen_penalty_par;
                    }
                } else {
                    penalty = hyphen_penalty_par;
                }
                /*tex And now we insert a disc node (this was |syllable_discretionary_code|). */
                t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
                /*tex We skip the new disc node. */
                t = node_next(t);
                /*tex
                    We need to check if we have two discretionaries in a row, test case: |\hyphenation
                    {a{>}{<}{b}{>}{<}{c}de} \hsize 1pt abcde \par| which gives |a> <> <de|.
                */
                if (uword[i] && uword[i + 1] == '{') {
                    /*tex The loop increments |i| again, so we end up at the same brace. */
                    i--;
                    t = node_prev(t); /*tex Tricky! */
                }
            }
        } else {
            /*tex A normal character: move on to the next node. */
            t = node_next(t);
        }
        /*tex Again we play safe. */
        if (! t || node_next(t) == r) {
            break;
        }
    }
}
868
869/*tex
870
871    The following description is no longer valid for \LUATEX. Although we use the same algorithm
872    for hyphenation, it is not integrated in the par builder. Instead it is a separate run over
873    the node list, preceding the line-breaking routine, possibly replaced by a callback. We keep
874    the description here because the principles remain.
875
876    \startnarrower
877
878    When the line-breaking routine is unable to find a feasible sequence of breakpoints, it makes
879    a second pass over the paragraph, attempting to hyphenate the hyphenatable words. The goal of
880    hyphenation is to insert discretionary material into the paragraph so that there are more
881    potential places to break.
882
883    The general rules for hyphenation are somewhat complex and technical, because we want to be
884    able to hyphenate words that are preceded or followed by punctuation marks, and because we
885    want the rules to work for languages other than English. We also must contend with the fact
886    that hyphens might radically alter the ligature and kerning structure of a word.
887
888    A sequence of characters will be considered for hyphenation only if it belongs to a \quotation
889    {potentially hyphenatable part} of the current paragraph. This is a sequence of nodes $p_0p_1
890    \ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are either character or ligature
891    or whatsit or implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust or
892    mark or whatsit or explicit kern node. (Therefore hyphenation is disabled by boxes, math
893    formulas, and discretionary nodes already inserted by the user.) The ligature nodes among $p_1
894    \ldots p_{m-1}$ are effectively expanded into the original non-ligature characters; the kern
895    nodes and whatsits are ignored. Each character |c| is now classified as either a nonletter (if
896    |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an uppercase letter (otherwise); an
897    uppercase letter is treated as if it were |lc_code(c)| for purposes of hyphenation. The
898    characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let $c_1$ be the first
899    letter that is not in the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a
900    whatsit found after $c_1$ will be the terminating node $p_m$. All characters that do not have
901    the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for that font must be
902    between 0 and 255, otherwise hyphenation will not be attempted. \TeX\ looks ahead for as many
903    consecutive letters $c_1\ldots c_n$ as possible; however, |n| must be less than 64, so a
904    character that would otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
905    not be in the middle of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$
906    that are generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this
907    string qualifies for hyphenation; however, |uc_hyph| must be positive, if $c_1$ is uppercase.
908
909    The hyphenation process takes place in three stages. First, the candidate sequence $c_1 \ldots
910    c_n$ is found; then potential positions for hyphens are determined by referring to hyphenation
911    tables; and finally, the nodes $p_a\ldots p_b$ are replaced by a new sequence of nodes that
912    includes the discretionary breaks found.
913
914    Fortunately, we do not have to do all this calculation very often, because of the way it has
915    been taken out of \TEX's inner loop. For example, when the second edition of the author's
916    700-page book {\sl Seminumerical Algorithms} was typeset by \TEX, only about 1.2 hyphenations
917    needed to be tried per paragraph, since the line breaking algorithm needed to use two passes on
918    only about 5 per cent of the paragraphs. (This is not true in \LUATEX: we always hyphenate the
919    whole list.)
920
    When a word has been set up to contain a candidate for hyphenation, \TEX\ first looks to see
    if it is in the user's exception dictionary. If not, hyphens are inserted based on patterns
    that appear within the given word, using an algorithm due to Frank~M. Liang.
924
925    \stopnarrower
926
927    This is incompatible with \TEX\ because the first word of a paragraph can be hyphenated, but
928    most European users seem to agree that prohibiting hyphenation there was not the best idea ever.
929
930    To be documented: |\hyphenationmode| (a bit set).
931
932    \startbuffer
933    \parindent0pt \hsize=1.1cm
934    12-34-56 \par
935    12-34-\hbox{56} \par
936    12-34-\vrule width 1em height 1.5ex \par
937    12-\hbox{34}-56 \par
938    12-\vrule width 1em height 1.5ex-56 \par
939    \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
940    12-34-56 \par
941    12-34-\hbox{56} \par
942    12-34-\vrule width 1em height 1.5ex \par
943    12-\hbox{34}-56 \par
944    12-\vrule width 1em height 1.5ex-56 \par
945    \stopbuffer
946
947    \typebuffer
948
949    \startpacked \getbuffer \stopbuffer
950
    We only accept an explicit hyphen when there is a preceding glyph and we skip a sequence of
    explicit hyphens as that normally indicates a \type {--} or \type {---} ligature in which case
    we can in the worst case get bad node lists later on due to messed up ligature building as
    these dashes are ligatures in base fonts. This is a side effect of separating the
    hyphenation, ligaturing and kerning steps. A test is cmr with \type {------}.
956
957    A font handler can collapse successive hyphens but it's not nice to put the burden there. A
958    somewhat messy border case is \type {----} but in \LUATEX\ we don't treat \type {--} and \type
959    {---} special. Also, traditional \TEX\ will break a line at \type {-foo} but this can be
960    disabled by setting the automatic mode to \type {1}.
961
962*/
963
964inline static halfword tex_aux_is_hyphen_char(halfword chr)
965{
966    if (tex_get_hc_code(chr)) {
967        return tex_get_hc_code(chr);
968    } else if (chr == ex_hyphen_char_par) {
969        return chr;
970    } else {
971        return null;
972    }
973}
974
975static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
976{
977    int start_ok = 1;
978    halfword lastglyph = r;
979    while (r) {
980        switch (node_type(r)) {
981            case boundary_node:
982                if (node_subtype(r) == word_boundary) {
983                    start_ok = 1;
984                }
985                break;
986            case disc_node:
987                start_ok = has_disc_option(r, disc_option_post_word);
988                break;
989            case hlist_node:
990            case vlist_node:
991            case rule_node:
992            case dir_node:
993            case whatsit_node:
994                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
995                    start_ok = 0;
996                }
997                break;
998            case glue_node:
999                start_ok = 1;
1000                break;
1001            case math_node:
1002                if (node_subtype(r) == begin_inline_math) {
1003                    int mathlevel = 1;
1004                    while (mathlevel > 0) {
1005                        r = node_next(r);
1006                        if (! r) {
1007                            return r;
1008                        } else if (node_type(r) == math_node) {
1009                            if (node_subtype(r) == begin_inline_math) {
1010                                mathlevel++;
1011                            } else {
1012                                mathlevel--;
1013                            }
1014                        }
1015                    }
1016                }
1017                break;
1018            case glyph_node:
1019                {
1020                    /*tex
1021                        When we have no word yet and meet a hyphen (equivalent) we should just
1022                        keep going. This is not compatible but it does make sense.
1023                    */
1024                    int chr = glyph_character(r);
1025                    int hyp = tex_aux_is_hyphen_char(chr);
1026                    lastglyph = r;
1027                    if (hyp) {
1028                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
1029                            /* maybe some tracing */
1030                        } else {
1031                            /* todo: already check if we have hj chars left/right i.e. no digits and minus mess */
1032                            halfword t = node_next(r) ;
1033                            /*tex Kind of weird that we have the opposite flag test here. */
1034                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
1035                                /*tex We have no word yet and the next character is a non hyphen. */
1036                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp);
1037                                // test case: \automatichyphenmode0 10\high{-6-1-2-4}
1038                                start_ok = 1; // todo: also in luatex
1039                            } else {
1040                                /*tex We jump over the sequence of hyphens. */
1041                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
1042                                    r = t ;
1043                                    t = node_next(r) ;
1044                                }
1045                                if (t) {
1046                                    /*tex We need a restart. */
1047                                    start_ok = 0;
1048                                } else {
1049                                    /*tex We reached the end of the list so we have no word start. */
1050                                    return null;
1051                                }
1052                            }
1053                        }
1054                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
1055                        int l = tex_get_hj_code(get_glyph_language(r), chr);
1056                        if (l > 0) {
1057                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
1058                                return r;
1059                            } else {
1060                                start_ok = 0;
1061                            }
1062                        } else {
1063                            /*tex We go on. */
1064                        }
1065                    } else {
1066                        /*tex We go on. */
1067                    }
1068                }
1069                break;
1070            default:
1071                start_ok = 0;
1072                break;
1073        }
1074        r = node_next(r);
1075    }
1076    return r; /* null */
1077}
1078
1079/*tex
1080
    This is the original test, extended with bounds, but still the complex expression turned into
    a function.  However, it actually is part of the old mechanism where hyphenation was mixed
    with ligature building and kerning, so there was this skipping over a font kern which is no
    longer needed as we have separate steps.
1085
1086    We keep this as reference:
1087
1088    \starttyping
1089    static int valid_wordend(halfword s, halfword strict_bound)
1090    {
1091        if (s) {
1092            halfword r = s;
1093            int clang = get_glyph_language(s);
1094            while ( (r) &&
1095                   (    (type(r) == glyph_node && clang == get_glyph_language(r))
1096                     || (type(r) == kern_node && (subtype(r) == font_kern))
1097                    )
1098                   ) {
1099                r = node_next(r);
1100            }
1101            return (! r || (type(r) == glyph_node && clang != get_glyph_language(r))
1102                        ||  type(r) == glue_node
1103                        ||  type(r) == penalty_node
1104                        || (type(r) == kern_node && (subtype(r) == explicit_kern ||
1105                                                     subtype(r) == italic_kern   ||
1106                                                     subtype(r) == accent_kern   ))
1107                        ||  ((type(r) == hlist_node   ||
1108                              type(r) == vlist_node   ||
1109                              type(r) == rule_node    ||
1110                              type(r) == dir_node     ||
1111                              type(r) == whatsit_node ||
1112                              type(r) == insert_node  ||
1113                              type(r) == adjust_node
1114                             ) && ! (strict_bound == 2 || strict_bound == 3))
1115                        ||  type(r) == boundary_node
1116                );
1117        } else {
1118            return 1;
1119        }
1120    }
    \stoptyping
1122
1123*/
1124
1125static int tex_aux_valid_wordend(halfword end_word, halfword r)
1126{
1127    if (r) {
1128        switch (node_type(r)) {
1129         // case glyph_node:
1130         // case glue_node:
1131         // case penalty_node:
1132         // case kern_node:
1133         //     return 1;
1134            case disc_node:
1135                return has_disc_option(r, disc_option_pre_word);
1136            case hlist_node:
1137            case vlist_node:
1138            case rule_node:
1139            case dir_node:
1140            case whatsit_node:
1141            case insert_node:
1142            case adjust_node:
1143                return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
1144        }
1145    }
1146    return 1;
1147}
1148
1149void tex_handle_hyphenation(halfword head, halfword tail)
1150{
1151    if (head && node_next(head)) {
1152        int callback_id = lmt_callback_defined(hyphenate_callback);
1153        if (callback_id > 0) {
1154            lua_State *L = lmt_lua_state.lua_instance;
1155            int top = 0;
1156            if (lmt_callback_okay(L, callback_id, &top)) {
1157                int i;
1158                lmt_node_list_to_lua(L, head);
1159                lmt_node_list_to_lua(L, tail);
1160                i = lmt_callback_call(L, 2, 0, top);
1161                if (i) {
1162                    lmt_callback_error(L, top, i);
1163                } else {
1164                    lmt_callback_wrapup(L, top);
1165                }
1166            }
1167        } else if (callback_id == 0) {
1168            tex_hyphenate_list(head, tail);
1169        } else {
1170            /* -1 : disabled */
1171        }
1172    }
1173}
1174
1175static int tex_aux_hnj_hyphen_hyphenate(
1176    hjn_dictionary *dict,
1177    halfword        first,
1178    halfword        last,
1179    int             length,
1180    halfword        left,
1181    halfword        right,
1182    lang_variables *lan
1183)
1184{
1185    /*tex +2 for dots at each end, +1 for points outside characters. */
1186    int ext_word_len = length + 2;
1187    int hyphen_len = ext_word_len + 1;
1188    /*tex Because we have a limit of 64 characters we could just use a static array here: */
1189    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
1190    if (hyphens) {
1191        halfword here;
1192        int state = 0;
1193        int char_num = 0;
1194        int done = 0;
1195        /*tex Add a '.' to beginning and end to facilitate matching. */
1196        node_next(begin_period) = first;
1197        node_next(end_period) = node_next(last);
1198        node_next(last) = end_period;
1199
1200     // for (int i = 0; i < hyphen_len; i++) {
1201     //     hyphens[i] = '0';
1202     // }
1203     // hyphens[hyphen_len] = 0;
1204
1205        /*tex Now, run the finite state machine. */
1206        for (char_num = 0, here = begin_period; here != node_next(end_period); here = node_next(here)) {
1207            int ch;
1208            if (here == begin_period || here == end_period) {
1209                ch = '.';
1210            } else {
1211                ch = tex_get_hj_code(get_glyph_language(here), glyph_character(here));
1212                if (ch <= 32) {
1213                    ch = glyph_character(here);
1214                }
1215            }
1216            while (state != -1) {
1217                hjn_state *hstate = &dict->states[state];
1218                for (int k = 0; k < hstate->num_trans; k++) {
1219                    if (hstate->trans[k].uni_ch == ch) {
1220                        char *match;
1221                        state = hstate->trans[k].new_state;
1222                        match = dict->states[state].match;
1223                        if (match) {
1224                            /*tex
1225                                We add +2 because 1 string length is one bigger than offset and 1
1226                                hyphenation starts before first character.
1227
1228                                Why not store the length in states[state] instead of calculating
1229                                it each time? Okay, performance is okay but still ...
1230                            */
1231                            int offset = (int) (char_num + 2 - (int) strlen(match));
1232                            for (int m = 0; match[m]; m++) {
1233                                if (hyphens[offset + m] < match[m]) {
1234                                    hyphens[offset + m] = match[m];
1235                                }
1236                            }
1237                        }
1238                        goto NEXTLETTER;
1239                    }
1240                }
1241                state = hstate->fallback_state;
1242            }
1243            /*tex Nothing worked, let's go to the next character. */
1244            state = 0;
1245        NEXTLETTER:;
1246            char_num++;
1247        }
1248        /*tex Restore the correct pointers. */
1249        node_next(last) = node_next(end_period);
1250        /*tex
1251            Pattern is |.word.| and |word_len| is 4, |ext_word_len| is 6 and |hyphens| is 7; drop first
1252            two and stop after |word_len-1|.
1253         */
1254        for (here = first, char_num = 2; here != left; here = node_next(here)) {
1255            char_num++;
1256        }
1257        for (; here != right; here = node_next(here)) {
1258            if (hyphens[char_num] & 1) {
1259                here = tex_aux_insert_syllable_discretionary(here, lan);
1260                done += 1;
1261            }
1262            char_num++;
1263        }
1264        lmt_memory_free(hyphens);
1265        return done;
1266    } else {
1267        tex_overflow_error("patterns", hyphen_len);
1268        return 0;
1269    }
1270}
1271
1272/* we can also check the original */
1273
1274static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original) {
1275    if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) {
1276        int i = 0;
1277        while (f) {
1278            ++i;
1279            if (node_type(f) != glyph_node) {
1280                tex_normal_warning("language", "the hyphenated word contains non-glyphs, skipping");
1281                return 0;
1282            } else {
1283                int cl; 
1284                halfword c = (halfword) aux_str2uni_len((const unsigned char *) utf8original, &cl);
1285                utf8original += cl;
1286                if (! (c && c == glyph_character(f))) {
1287                    tex_normal_warning("language", "the hyphenated word contains different characters, skipping");
1288                    return 0;
1289                } else if (f != l) {
1290                    f = node_next(f);
1291                } else if (i == n) {
1292                    return 1;
1293                } else {
1294                    tex_normal_warning("language", "the hyphenated word changed length, skipping");
1295                    return 0;
1296                }
1297            }
1298        }
1299    }
1300    tex_normal_warning("language", "the hyphenation list is messed up, skipping");
1301    return 0;
1302}
1303
1304static void tex_aux_hyphenate_show(halfword beg, halfword end)
1305{
1306    if (_valid_node_(beg) && _valid_node_(end)) {
1307        halfword nxt = node_next(end);
1308        node_next(end) = null;
1309        tex_show_node_list(beg, 100, 10000);
1310        node_next(end) = nxt;
1311    }
1312}
1313
1314/* maybe split: first a processing run */
1315
1316inline static int is_traditional_hyphen(halfword n)
1317{
1318    return (
1319        (glyph_character(n) == ex_hyphen_char_par)                             /*tex parameter */
1320     && (has_font_text_control(glyph_font(n),text_control_collapse_hyphens))   /*tex font driven */
1321     && (hyphenation_permitted(glyph_hyphenate(n), collapse_hyphenation_mode)) /*tex language driven */
1322    );
1323}
1324
1325int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3) /* ex_hyphen_char_par 0x2013 0x2014 */
1326{
1327    /*tex Let's play safe: */
1328    halfword found = 0;
1329    if (head && c1 && c2 && c3) {
1330        halfword n1 = head;
1331        while (n1) {
1332            halfword n2 = node_next(n1);
1333            switch (node_type(n1)) {
1334                case glyph_node:
1335                    if (is_traditional_hyphen(n1)) {
1336                        set_glyph_discpart(n1, glyph_discpart_always);
1337                        if (n2 && node_type(n2) == glyph_node && is_traditional_hyphen(n2) && glyph_font(n1) == glyph_font(n2)) {
1338                            halfword n3 = node_next(n2);
1339                            if (n3 && node_type(n3) == glyph_node && is_traditional_hyphen(n3) && glyph_font(n1) == glyph_font(n3)) {
1340                                halfword n4 = node_next(n3);
1341                                glyph_character(n1) = c3;
1342                                tex_try_couple_nodes(n1, n4);
1343                                tex_flush_node(n2);
1344                                tex_flush_node(n3);
1345                                n1 = n4;
1346                            } else {
1347                                glyph_character(n1) = c2;
1348                                tex_try_couple_nodes(n1, n3);
1349                                tex_flush_node(n2);
1350                                n1 = n3;
1351                            }
1352                            found = 1;
1353                            goto AGAIN;
1354                        } else {
1355                            glyph_character(n1) = c1; /* can become language dependent */
1356                        }
1357                    }
1358                    break;
1359                case disc_node:
1360                    {
1361                        halfword done = 0;
1362                        if (disc_pre_break_head(n1) && tex_collapse_list(disc_pre_break_head(n1), c1, c2, c3)) {
1363                            ++done;
1364                        }
1365                        if (disc_post_break_head(n1) && tex_collapse_list(disc_post_break_head(n1), c1, c2, c3)) {
1366                            ++done;
1367                        }
1368                        if (disc_no_break_head(n1) && tex_collapse_list(disc_no_break_head(n1), c1, c2, c3)) {
1369                            ++done;
1370                        }
1371                        if (done) {
1372                            tex_check_disc_field(n1);
1373                        }
1374                        break;
1375                    }
1376                default:
1377                    break;
1378            }
1379            n1 = n2;
1380          AGAIN:;
1381        }
1382    }
1383    return found;
1384}
1385
1386void tex_hyphenate_list(halfword head, halfword tail)
1387{
1388    /*tex Let's play safe: */
1389    if (tail) {
1390        halfword first_language = first_valid_language_par; /* combine with check below */
1391        halfword trace = tracing_hyphenation_par;
1392        halfword r = head;
1393        /*tex
1394            This first movement assures two things:
1395
1396            \startitemize
1397                \startitem
1398                    That we won't waste lots of time on something that has been handled already (in
1399                    that case, none of the glyphs match |simple_character|).
1400                \stopitem
1401                \startitem
1402                    That the first word can be hyphenated. If the movement was not explicit, then
1403                    the indentation at the start of a paragraph list would make |find_next_wordstart()|
1404                    look too far ahead.
1405                \stopitem
1406            \stopitemize
1407        */
1408        while (r && node_type(r) != glyph_node) {
1409            r = node_next(r);
1410        }
1411        if (r) {
1412            r = tex_aux_find_next_wordstart(r, first_language);
1413            if (r) {
1414                lang_variables langdata;
1415                char utf8word[(4 * max_size_of_word) + 1] = { 0 };
1416                char utf8original[(4 * max_size_of_word) + 1] = { 0 };
1417                char *utf8ptr = utf8word;
1418                char *utf8ori = utf8original;
1419                int word_length = 0;
1420                int explicit_hyphen = 0;
1421                int last_char = 0;
1422                int valid = 0;
1423                halfword explicit_start = null;
1424                halfword saved_tail = node_next(tail);
1425                halfword penalty = tex_new_penalty_node(0, word_penalty_subtype);
1426                /* kind of curious hack, this addition that we later remove */
1427                tex_attach_attribute_list_copy(penalty, r);
1428                tex_couple_nodes(tail, penalty); /* todo: attrobute */
1429                while (r) {
1430                    halfword word_start = r;
1431                    int word_language = get_glyph_language(word_start);
1432                    if (tex_is_valid_language(word_language)) {
1433                        halfword word_end = r;
1434                        int lhmin = get_glyph_lhmin(word_start);
1435                        int rhmin = get_glyph_rhmin(word_start);
1436                        int hmin = tex_get_hyphenation_min(word_language);
1437                        halfword word_font = glyph_font(word_start);
1438                        if (! tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) {
1439                            /*tex For backward compatibility we set: */
1440                            word_font = 0;
1441                        }
1442                        langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language);
1443                        langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language);
1444                        while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) {
1445                            halfword chr = glyph_character(r);
1446                            halfword hyp = tex_aux_is_hyphen_char(chr);
1447                            if (word_language >= first_language) {
1448                                last_char = tex_get_hj_code(word_language, chr);
1449                                if (last_char > 0) {
1450                                    goto GOFORWARD;
1451                                }
1452                            }
1453                            if (hyp) {
1454                                last_char = hyp;
1455                             // if (last_char) {
1456                             //     goto GOFORWARD;
1457                             // }
1458                            } else {
1459                                break;
1460                            }
1461                          GOFORWARD:
1462                         // explicit_hyphen = is_hyphen_char(chr);
1463                            explicit_hyphen = hyp;
1464                            if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
1465                                /* maybe some tracing */
1466                                explicit_hyphen = 0;
1467                            }
1468                            if (explicit_hyphen) {
1469                                break;
1470                            } else {
1471                                word_length++;
1472                                if (word_length >= max_size_of_word) {
1473                                    /* tex_normal_warning("language", "ignoring long word"); */
1474                                    while (r && node_type(r) == glyph_node) {
1475                                        r = node_next(r);
1476                                    }
1477                                    goto PICKUP;
1478                                } else {
1479                                    if (last_char <= 32) {
1480                                        if (last_char == 32) {
1481                                            last_char = 0 ;
1482                                        }
1483                                        if (word_length <= lhmin) {
1484                                            lhmin = lhmin - last_char + 1 ;
1485                                            if (lhmin < 0) {
1486                                                lhmin = 1;
1487                                            }
1488                                        }
1489                                        if (word_length >= rhmin) {
1490                                            rhmin = rhmin - last_char + 1 ;
1491                                            if (rhmin < 0) {
1492                                                rhmin = 1;
1493                                            }
1494                                        }
1495                                        hmin = hmin - last_char + 1 ;
1496                                        if (hmin < 0) {
1497                                            rhmin = 1;
1498                                        }
1499                                        last_char = chr ;
1500                                    }
1501                                    utf8ori = aux_uni2string(utf8ori, (unsigned) chr);
1502                                    utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char);
1503                                    word_end = r;
1504                                    r = node_next(r);
1505                                }
1506                            }
1507                        }
1508                        if (explicit_hyphen) {
1509                            /*tex We are not at the start, so we only need to look ahead. */
1510                            if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) {
1511                                /*tex
1512                                    This can be the consequence of inhibition too, see |finish_discretionary|
1513                                    in which case the replace got injected which can have a hyphen. And we want
1514                                    to run the callback if set in order to replace.
1515                                */
1516                                valid = 1;
1517                                goto MESSYCODE;
1518                            } else {
1519                                /*tex Maybe we should get rid of this ----- stuff. */
1520                                halfword t = node_next(r);
1521                                if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) {
1522                                    /*tex we have a word already but the next character may not be a hyphen too */
1523                                    halfword g = r;
1524                                    r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen);
1525                                    if (trace > 1) {
1526                                        *utf8ori = 0;
1527                                        tex_begin_diagnostic();
1528                                        tex_print_format("[language: compound word break after %s]", utf8original);
1529                                        tex_end_diagnostic();
1530                                    }
1531                                    if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) {
1532                                        explicit_hyphen = 0;
1533                                        if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) {
1534                                            set_disc_option(r, disc_option_pre_word | disc_option_post_word);
1535                                            explicit_start = null;
1536                                            valid = 1;
1537                                            goto MESSYCODE;
1538                                        } else {
1539                                            if (! explicit_start) {
1540                                                explicit_start = word_start;
1541                                            }
1542                                            /*tex For exceptions. */
1543                                            utf8ptr = aux_uni2string(utf8ptr, '-');
1544                                            r = t;
1545                                            continue;
1546                                        }
1547                                    }
1548                                } else {
1549                                    /*tex We jump over the sequence of hyphens ... traditional. */
1550                                    while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) {
1551                                        r = t;
1552                                        t = node_next(r);
1553                                    }
1554                                    if (! t) {
1555                                        /*tex we reached the end of the list and will quit the loop later */
1556                                        r = null;
1557                                    }
1558                                }
1559                            }
1560                        } else {
1561                            valid = tex_aux_valid_wordend(word_end, r);
1562                          MESSYCODE:
1563                            /*tex We have a word, r is at the next node. */
1564                            if (word_font && word_language >= first_language) {
1565                                /*tex We have a language, actually we already tested that. */
1566                                struct tex_language *lang = lmt_language_state.languages[word_language];
1567                                if (lang) {
1568                                    char *replacement = NULL;
1569                                    halfword start = explicit_start ? explicit_start : word_start;
1570                                    int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode);
1571                                    *utf8ptr = 0;
1572                                    *utf8ori = 0;
1573                                    if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) {
1574                                        halfword restart = node_prev(start); /*tex before the word. */
1575                                        int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement);
1576                                        if (replacement) {
1577                                            if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1578                                                goto EXCEPTIONS2;
1579                                            } else {
1580                                                goto PICKUP;
1581                                            }
1582                                        } else {
1583                                            /* 1: restart 2: exceptions+patterns 3: patterns *: next word */
1584                                            switch (done) {
1585                                                case 1:
1586                                                    if (_valid_node_(restart)) {
1587                                                        r = restart;
1588                                                    } else if (_valid_node_(start)) {
1589                                                        r = node_prev(start);
1590                                                    }
1591                                                    if (! r) {
1592                                                        if (_valid_node_(head)) {
1593                                                            tex_normal_warning("language", "the hyphenation list is messed up, recovering");
1594                                                            r = head;
1595                                                        } else {
1596                                                            tex_normal_error("language", "the hyphenated head is messed up, aborting");
1597                                                            return;
1598                                                        }
1599                                                    }
1600                                                    goto PICKUP;
1601                                                case 2:
1602                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1603                                                        goto EXCEPTIONS1;
1604                                                    } else {
1605                                                        goto PICKUP;
1606                                                    }
1607                                                case 3:
1608                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1609                                                        goto PATTERNS;
1610                                                    } else {
1611                                                        goto PICKUP;
1612                                                    }
1613                                                default:
1614                                                    if (_valid_node_(r)) { /* or word_end */
1615                                                        goto PICKUP;
1616                                                    } else if (_valid_node_(tail)) {
1617                                                        tex_normal_warning("language", "the hyphenation list is messed up, quitting");
1618                                                        goto ABORT;
1619                                                    } else {
1620                                                        // tex_normal_error("language","the hyphenated tail is messed up, aborting");
1621                                                        return;
1622                                                    }
1623                                            }
1624                                        }
1625                                    }
1626                                    if (! okay || ! valid) {
1627                                        goto PICKUP;
1628                                    }
1629                                    /*tex
1630                                        This is messy and nasty: we can have a word with a - in it which is why
1631                                        we have two branches. Also, every word that suits the length criteria
1632                                        is checked via \LUA. Optimizing this because tests have demonstrated
1633                                        that checking against the min and max lengths of exception strings has
1634                                        no gain.
1635                                    */
1636                                  EXCEPTIONS1:
1637                                    if (lang->exceptions) {
1638                                        replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word);
1639                                    }
1640                                  EXCEPTIONS2:
1641                                    if (replacement) {
1642                                        /*tex handle the exception and go on to the next word */
1643                                        halfword start = explicit_start ? explicit_start : word_start;
1644                                        halfword beg = node_prev(start);
1645                                        tex_aux_do_exception(start, r, replacement); // r == next_node(word_end)
1646                                        if (trace > 1) {
1647                                            tex_begin_diagnostic();
1648                                            tex_print_format("[language: exception %s to %s]", utf8original, replacement);
1649                                            if (trace > 2) {
1650                                                tex_aux_hyphenate_show(node_next(beg), node_prev(r));
1651                                            }
1652                                            tex_end_diagnostic();
1653                                        }
1654                                        lmt_memory_free(replacement);
1655                                        goto PICKUP;
1656                                    }
1657                                    PATTERNS:
1658                                    if (lang->patterns) {
1659                                        if (explicit_start) {
1660                                            /*tex We're done already */
1661                                        } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) {
1662                                            halfword left = word_start;
1663                                            halfword right = r; /*tex We're one after |word_end|. */
1664                                            for (int i = lhmin; i > 1; i--) {
1665                                                left = node_next(left);
1666                                                if (! left || left == right) {
1667                                                    goto PICKUP;
1668                                                }
1669                                            }
1670                                            if (right != left) {
1671                                                int done = 0;
1672                                                for (int i = rhmin; i > 0; i--) {
1673                                                    right = node_prev(right);
1674                                                    if (! right || right == left) {
1675                                                        goto PICKUP;
1676                                                    }
1677                                                }
1678                                                done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata);
1679                                                if (trace > 1) {
1680                                                    tex_begin_diagnostic();
1681                                                    if (done) {
1682                                                        tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done);
1683                                                        if (trace > 2) {
1684                                                            tex_aux_hyphenate_show(node_next(left), node_prev(right));
1685                                                        }
1686                                                    } else {
1687                                                        tex_print_format("[language: not hyphenated %s]", utf8original);
1688                                                    }
1689                                                    tex_end_diagnostic();
1690                                                }
1691                                            }
1692                                        }
1693                                    }
1694                                }
1695                            }
1696                        }
1697                    }
1698                  PICKUP:
1699                    explicit_start = null ;
1700                    explicit_hyphen = 0;
1701                    word_length = 0;
1702                    utf8ptr = utf8word;
1703                    utf8ori = utf8original;
1704                    if (r) {
1705                        r = tex_aux_find_next_wordstart(r, first_language);
1706                    } else {
1707                        break;
1708                    }
1709                }
1710              ABORT:
1711                tex_flush_node(node_next(tail));
1712                node_next(tail) = saved_tail;
1713            }
1714        }
1715    }
1716}
1717
1718halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern)
1719{
1720    halfword prev = node_prev(glyph);
1721    halfword next = node_next(glyph);
1722    halfword disc = tex_new_disc_node(code);
1723    halfword kern = null;
1724    if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) {
1725        kern = node_next(next);
1726        next = node_next(kern);
1727        node_next(kern) = null;
1728    } else { 
1729        node_next(glyph) = null;
1730    }
1731    node_prev(glyph) = null;
1732    tex_attach_attribute_list_copy(disc, glyph);
1733    tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null));
1734    tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null));
1735    tex_set_disc_field(disc, no_break_code, glyph);
1736    tex_try_couple_nodes(prev, disc);
1737    tex_try_couple_nodes(disc, next);
1738    return disc; 
1739}