texlanguage.c /size: 76 Kb    last modification: 2025-02-21 11:03
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
    We no longer dump the patterns and exceptions as they are supposed to be loaded at runtime.
    There is no gain in getting them from the format. But we do dump some of the properties.
11
12    There were all kind of checks for simple characters i.e. not ligatures but there is no need for
13    that in \LUAMETATEX. We have separated stages and the hyphenator sees just glyphs. And when a
14    traditional font has glyphs we can assume that the old school font encoding matches the patterns
15    i.e. that ligatures are not in the normal character slots.
16
    Exceptions are stored at the \LUA\ end. We cannot easily go dynamic because fonts are stored
    in the eqtb so we would have to use some more indirect mechanism (doable as we do it for other
    items) too.
20
21*/
22
/*tex
    The global language state. The |languages| array starts out empty and gets allocated by
    |tex_initialize_languages| or when a format is undumped. In |language_data| the |minimum| is
    the initial array size, |maximum| the upper bound for language ids and |step| the increment
    used when the array has to grow. The |top| field is the highest usable index, |ptr| the last
    id that was handed out automatically, |extra| accumulates the bytes taken by the language
    records and |initial| is set to |ptr| after undumping.
*/
language_state_info lmt_language_state = {
    .languages        = NULL,
    .language_data    = {
        .minimum      = min_language_size,
        .maximum      = max_language_size,
        .size         = memory_data_unset,
        .step         = stp_language_size,
        .allocated    = 0,
        .itemsize     = sizeof(tex_language *),
        .top          = 0,
        .ptr          = 0,
        .initial      = memory_data_unset,
        .offset       = 0,
        .extra        = 0, 
    },
    /*tex Handler and statistics fields; these are only initialized here (all start at zero). */
    .handler_table_id    = 0,
    .handler_count       = 0,
    .list_count          = 0, 
    .checked_count       = 0,
    .exceptions_count    = 0,
    .hyphenated_count    = 0,
    .nothing_count       = 0,
    /*tex The work buffers that |tex_clean_hyphenation| uses (via |word_buffer| and |uword_buffer|). */
    .shared_word_buffer  = { 0 },
    .shared_uword_buffer = { 0 }, 
};
48
49/*tex
50    We can enforce a language id but we want to be sequential so we accept holes! So one
51    has to define bottom-up. As with fonts, we have a zero language but that one normally
52    is not set.
53*/
54
55static void tex_aux_reset_language(halfword id)
56{
57    tex_language *lang = lmt_language_state.languages[id];
58    lang->id = id;
59    lang->exceptions = 0;
60    lang->patterns = NULL;
61    lang->wordhandler = 0;
62    lang->pre_hyphen_char = '-';
63    lang->post_hyphen_char = 0;
64    lang->pre_exhyphen_char = 0;
65    lang->post_exhyphen_char = 0;
66    lang->hyphenation_min = -1;
67    lang->hjcode_head = NULL;
68}
69
70/*tex
71    A value below zero will bump the language id. Because we have a rather limited number of
72    languages there is no configuration, size is just maximum.
73*/
74
75static halfword tex_aux_new_language_id(halfword id)
76{
77    int top;
78    if (id >= 0) {
79        if (id <= lmt_language_state.language_data.top) {
80            if (lmt_language_state.languages[id]) {
81                return tex_formatted_error("languages", "the language with id %d is already created", id);
82            } else {
83                return id;
84            }
85        } else if (id > lmt_language_state.language_data.maximum) {
86            goto OVERFLOWERROR;
87        } else {
88            top = id;
89        }
90    } else if (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
91        ++lmt_language_state.language_data.ptr;
92        return lmt_language_state.language_data.ptr;
93    } else if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
94        goto OVERFLOWERROR;
95    } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
96        top = lmt_language_state.language_data.maximum;
97    } else {
98        top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
99    }
100    /*tex Finally we can bump memory. */
101    {
102        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
103        if (tmp) {
104            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
105                tmp[i] = NULL;
106            }
107            lmt_language_state.languages = tmp;
108            lmt_language_state.language_data.allocated = top;
109            lmt_language_state.language_data.top = top;
110            lmt_language_state.language_data.ptr += 1;
111            return lmt_language_state.language_data.ptr;
112        }
113    }
114  OVERFLOWERROR:
115    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
116    return 0;
117}
118
119void tex_initialize_languages(void)
120{
121    tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
122    if (tmp) {
123        for (int i = 0; i < lmt_language_state.language_data.minimum; i++) {
124            tmp[i] = NULL;
125        }
126        lmt_language_state.languages = tmp;
127        lmt_language_state.language_data.allocated = lmt_language_state.language_data.minimum;
128        lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
129    } else {
130        tex_overflow_error("languages", lmt_language_state.language_data.minimum);
131    }
132}
133
134/*
135halfword tex_aux_maximum_language_id(void)
136{
137    return language_state.language_data.maximum;
138}
139*/
140
141int tex_is_valid_language(halfword n)
142{
143    if (n == 0) {
144        return 1;
145    } else if (n > 0 && n <= lmt_language_state.language_data.top) {
146        return lmt_language_state.languages[n] ? 1 : 0;
147    } else {
148        return 0;
149    }
150}
151
152tex_language *tex_new_language(halfword n)
153{
154    halfword id = tex_aux_new_language_id(n);
155    if (id >= 0) {
156        tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
157        if (lang) {
158            lmt_language_state.languages[id] = lang;
159            lmt_language_state.language_data.extra += sizeof(struct tex_language);
160            tex_aux_reset_language(id);
161            if (saving_hyph_codes_par) {
162                /*tex
163                    For now, we might just use specific value for whatever task. This will become
164                    obsolete.
165                */
166                tex_hj_codes_from_lc_codes(id);
167            }
168        } else {
169            tex_overflow_error("language", sizeof(struct tex_language));
170        }
171        return lang;
172    } else {
173        return NULL;
174    }
175}
176
177tex_language *tex_get_language(halfword n)
178{
179    if (n >= 0) {
180        if (n <= lmt_language_state.language_data.top && lmt_language_state.languages[n]) {
181            return lmt_language_state.languages[n];
182        }
183        if (n <= lmt_language_state.language_data.maximum) {
184            return tex_new_language(n);
185        }
186    }
187    return NULL;
188}
189
190/*tex
191    Freeing, dumping, undumping languages:
192*/
193
194/*
195void free_languages(void)
196{
197    for (int i = 0; i < language_state.language_data.top; i++) {
198        if (language_state.languages[i]) {
199            lmt_memory_free(language_state.languages[i]);
200            language_state.languages[i] = NULL;
201        }
202    }
203}
204*/
205
206void tex_dump_language_data(dumpstream f)
207{
208    dump_int(f, lmt_language_state.language_data.top);
209    dump_int(f, lmt_language_state.language_data.ptr);
210    if (lmt_language_state.language_data.top > 0) {
211        for (int i = 0; i < lmt_language_state.language_data.top; i++) {
212            tex_language *lang = lmt_language_state.languages[i];
213            if (lang) {
214                dump_via_uchar(f, 1);
215                dump_int(f, lang->id);
216                dump_int(f, lang->pre_hyphen_char);
217                dump_int(f, lang->post_hyphen_char);
218                dump_int(f, lang->pre_exhyphen_char);
219                dump_int(f, lang->post_exhyphen_char);
220                dump_int(f, lang->hyphenation_min);
221                tex_dump_language_hj_codes(f, i);
222            } else {
223                dump_via_uchar(f, 0);
224            }
225        }
226    }
227}
228
229void tex_undump_language_data(dumpstream f)
230{
231    int top, ptr;
232    undump_int(f, top);
233    undump_int(f, ptr);
234    if (top > 0) {
235        tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
236        if (tmp) {
237            lmt_language_state.language_data.top = top;
238            lmt_language_state.language_data.ptr = ptr;
239            lmt_language_state.languages = tmp;
240            lmt_language_state.language_data.allocated = top;
241            for (int i = 0; i < top; i++) {
242                unsigned char marker;
243                undump_uchar(f, marker);
244                if (marker == 1) {
245                    tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
246                    if (lang) {
247                        lmt_language_state.languages[i] = lang;
248                        lmt_language_state.language_data.extra += sizeof(struct tex_language);
249                        lang->exceptions = 0;
250                        lang->patterns = NULL;
251                        lang->wordhandler = 0;
252                        lang->hjcode_head = NULL;
253                        undump_int(f, lang->id);
254                        undump_int(f, lang->pre_hyphen_char);
255                        undump_int(f, lang->post_hyphen_char);
256                        undump_int(f, lang->pre_exhyphen_char);
257                        undump_int(f, lang->post_exhyphen_char);
258                        undump_int(f, lang->hyphenation_min);
259                        tex_undump_language_hj_codes(f, i);
260                        if (lang->id != i) {
261                            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, i);
262                            lang->id = i;
263                        }
264                    } else {
265                        tex_overflow_error("languages", i);
266                    }
267                    tmp[i] = lang;
268                } else {
269                    tmp[i] = NULL;
270                }
271            }
272            lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
273        } else {
274            tex_overflow_error("languages", top);
275            lmt_language_state.language_data.initial = 0;
276        }
277    } else {
278        /*tex Indeed we can have no languages stored. */
279        tex_initialize_languages();
280    }
281}
282
283/*tex All kind of accessors. */
284
285void tex_set_pre_hyphen_char(halfword n, halfword v)
286{
287    struct tex_language *l = tex_get_language(n);
288    if (l) {
289        l->pre_hyphen_char = v;
290    }
291}
292
293void tex_set_post_hyphen_char(halfword n, halfword v)
294{
295    struct tex_language *l = tex_get_language(n);
296    if (l) {
297        l->post_hyphen_char = v;
298    }
299}
300
301void tex_set_pre_exhyphen_char(halfword n, halfword v)
302{
303    struct tex_language *l = tex_get_language(n);
304    if (l) {
305        l->pre_exhyphen_char = v;
306    }
307}
308
309void tex_set_post_exhyphen_char(halfword n, halfword v)
310{
311    struct tex_language *l = tex_get_language(n);
312    if (l) {
313        l->post_exhyphen_char = v;
314    }
315}
316
317halfword tex_get_pre_hyphen_char(halfword n)
318{
319    struct tex_language *l = tex_get_language(n);
320    return l ? l->pre_hyphen_char : -1;
321}
322
323halfword tex_get_post_hyphen_char(halfword n)
324{
325    struct tex_language *l = tex_get_language(n);
326    return l ? l->post_hyphen_char : -1;
327}
328
329halfword tex_get_pre_exhyphen_char(halfword n)
330{
331    struct tex_language *l = tex_get_language(n);
332    return l ? l->pre_exhyphen_char : -1;
333}
334
335halfword tex_get_post_exhyphen_char(halfword n)
336{
337    struct tex_language *l = tex_get_language(n);
338    return (l) ? (int) l->post_exhyphen_char : -1;
339}
340
341void tex_set_hyphenation_min(halfword n, halfword v)
342{
343    struct tex_language *l = tex_get_language(n);
344    if (l) {
345        l->hyphenation_min = v;
346    }
347}
348
349halfword tex_get_hyphenation_min(halfword n)
350{
351    struct tex_language *l = tex_get_language((int) n);
352    return l ? l->hyphenation_min : -1;
353}
354
355void tex_load_patterns(struct tex_language *lang, const unsigned char *buff)
356{
357    if ((! lang) || (! buff) || strlen((const char *) buff) == 0) {
358        return;
359    } else {
360        if (! lang->patterns) {
361            lang->patterns = hnj_dictionary_new();
362        }
363        hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0);
364    }
365}
366
367void tex_clear_patterns(struct tex_language *lang)
368{
369    if (lang && lang->patterns) {
370        hnj_dictionary_clear(lang->patterns);
371    }
372}
373
374void tex_load_tex_patterns(halfword curlang, halfword head)
375{
376    /*tex We might want single hashes. */
377    char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1); /* single hashes */
378    if (s) {
379        tex_load_patterns(tex_get_language(curlang), (unsigned char *) s);
380    }
381}
382
383/*
384    This cleans one word which is returned in |cleaned|, returns the new offset into |buffer|.
385*/
386
387/* define tex_isspace(c) (c == ' ' || c == '\t') */
/*tex Only a space separates words. The argument is parenthesized so that any expression can be passed. */
# define tex_isspace(c) ((c) == ' ')

/*tex Shortcuts for the shared work buffers in the language state. */
# define word_buffer  lmt_language_state.shared_word_buffer
# define uword_buffer lmt_language_state.shared_uword_buffer
392
393const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned)
394{
395    int items = 0;
396 // unsigned char word_buffer[max_size_of_word_buffer]; /*tex Work buffer for bytes (can be \UTF8): */
397 // unsigned uword_buffer[max_size_of_word_buffer];     /*tex Work buffer for \UNICODE\ (often too large): */
398    /*tex The \UNICODE\ buffer value: */
399    int i = 0;
400    char *uindex = (char *) word_buffer;
401    const char *s = buff;
402    while (*s && ! tex_isspace((unsigned char)*s)) {
403        word_buffer[i++] = (unsigned char) *s;
404        s++;
405        if ((s - buff) > max_size_of_word) {
406            /*tex We count utf characters so \quote {size of word} is somewhat misleading. */
407            *cleaned = NULL;
408            tex_handle_error(
409                normal_error_type,
410                "Exception too long",
411                NULL
412            );
413            return s;
414        }
415    }
416    /*tex Now convert the input to \UNICODE. */
417    word_buffer[i] = '\0';
418    /*tex We append a zero value as sentinal. */
419    aux_splitutf2uni(uword_buffer, (const char *) word_buffer);
420    /*tex
421        Build the new word string. The hjcode values < 32 indicate a length, so that
422        for instance \|hjcode`ܽ2| makes that ligature count okay.
423    */
424    i = 0;
425    while (uword_buffer[i] > 0) {
426        unsigned u = uword_buffer[i++];
427        if (u == '-') {
428            /*tex Skip. */
429        } else if (u == '=') {
430            unsigned c = tex_get_hj_code(id, '-');
431            uindex = aux_uni2string(uindex, (! c || c <= 32) ? '-' : c);
432        } else if (u == '{') {
433            u = uword_buffer[i++];
434            items = 0;
435            while (u && u != '}') {
436                u = uword_buffer[i++];
437            }
438            if (u == '}') {
439                items++;
440                u = uword_buffer[i++];
441            }
442            while (u && u != '}') {
443                u = uword_buffer[i++];
444            }
445            if (u == '}') {
446                items++;
447                u = uword_buffer[i++];
448            }
449            if (u == '{') {
450                u = uword_buffer[i++];
451            }
452            while (u && u != '}') {
453                unsigned c = tex_get_hj_code(id, u);
454                uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
455                u = uword_buffer[i++];
456            }
457            if (u == '}') {
458                items++;
459            }
460            if (items != 3) {
461                /* hm, we intercept that elsewhere in a better way so why here? Best remove the test here or move the other one here. */
462                *cleaned = NULL;
463                tex_handle_error(
464                    normal_error_type,
465                    "Exception syntax error, a discretionary has three components: {}{}{}.",
466                    NULL
467                );
468                return s;
469            } else {
470                /* skip replacement (chars) */
471                if (uword_buffer[i] == '(') {
472                    while (uword_buffer[++i] && uword_buffer[i] != ')') { };
473                    if (uword_buffer[i] != ')') {
474                        tex_handle_error(
475                            normal_error_type,
476                            "Exception syntax error, an alternative replacement is defined as (text).",
477                            NULL
478                        );
479                        return s;
480                    } else if (uword_buffer[i]) {
481                        i++;
482                   }
483                }
484                /* skip penalty: [digit] but we intercept multiple digits */
485                if (uword_buffer[i] == '[') {
486                    if (uword_buffer[i+1] && uword_buffer[i+1] >= '0' && uword_buffer[i+1] <= '9' && uword_buffer[i+2] && uword_buffer[i+2] == ']') {
487                        i += 3;
488                    } else {
489                        tex_handle_error(
490                            normal_error_type,
491                            "Exception syntax error, a penalty is defined as [digit].",
492                            NULL
493                        );
494                        return s;
495                    }
496                }
497            }
498        } else {
499            unsigned c = tex_get_hj_code(id, u);
500            uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
501        }
502    }
503    *uindex = '\0';
504    *cleaned = lmt_memory_strdup((char *) word_buffer);
505    return s;
506}
507
508void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff)
509{
510    if (lang) {
511        lua_State *L = lmt_lua_state.lua_instance;
512        const char *s = (const char *) buff;
513        char *cleaned = NULL;
514        int id = lang->id;
515        if (lang->exceptions == 0) {
516            lua_newtable(L);
517            lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
518        }
519        lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
520        while (*s) {
521            while (tex_isspace((unsigned char) *s)) {
522                s++;
523            }
524            if (*s) {
525                const char *value = s;
526                s = tex_clean_hyphenation(id, s, &cleaned);
527                if (cleaned) {
528                    size_t len = s - value;
529                    if (len > 0) {
530                        lua_pushstring(L, cleaned);
531                        lua_pushlstring(L, value, len);
532                        lua_rawset(L, -3);
533                    }
534                    lmt_memory_free(cleaned);
535                } else {
536                    /* tex_formatted_warning("hyphenation","skipping invalid hyphenation exception: %s", value); */
537                }
538            }
539        }
540        lua_pop(L, 1);
541    }
542}
543
544void tex_clear_hyphenation(struct tex_language *lang)
545{
546    if (lang && lang->exceptions != 0) {
547        lua_State *L = lmt_lua_state.lua_instance;
548        luaL_unref(L, LUA_REGISTRYINDEX, lang->exceptions);
549        lang->exceptions = 0;
550    }
551}
552
553void tex_load_tex_hyphenation(halfword curlang, halfword head)
554{
555    char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1); /* single hashes */
556    if (s) {
557        tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) s);
558    }
559}
560
561static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty)
562{
563    /*tex For compound words following explicit hyphens we take the current font. */
564    halfword d = tex_new_disc_node(subtype);
565    halfword a = node_attr(t) ;
566    disc_penalty(d) = penalty;
567    if (t == replace) {
568        /*tex We have |prev disc next-next|. */
569        tex_try_couple_nodes(d, node_next(t));
570        tex_try_couple_nodes(node_prev(t), d);
571        node_prev(t) = null;
572        node_next(t) = null;
573        replace = t;
574    } else {
575        /*tex We have |prev disc next|. */
576        tex_try_couple_nodes(d, node_next(t));
577        tex_couple_nodes(t, d);
578    }
579    if (a) {
580        tex_attach_attribute_list_attribute(d, a);
581    }
582    tex_set_disc_field(d, pre_break_code, pre);
583    tex_set_disc_field(d, post_break_code, post);
584    tex_set_disc_field(d, no_break_code, replace);
585    return d;
586}
587
588static halfword tex_aux_insert_syllable_discretionary(halfword t, language_variables *lan)
589{
590    halfword n = tex_new_disc_node(syllable_discretionary_code);
591    disc_penalty(n) = hyphen_penalty_par;
592    tex_couple_nodes(n, node_next(t));
593    tex_couple_nodes(t, n);
594    tex_attach_attribute_list_attribute(n, get_attribute_list(t));
595    if (lan->pre_hyphen_char > 0) {
596        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t);
597        tex_set_disc_field(n, pre_break_code, g);
598        set_glyph_disccode(g, glyph_disc_syllable);
599    }
600    if (lan->post_hyphen_char > 0) {
601        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t);
602        tex_set_disc_field(n, post_break_code, g);
603        set_glyph_disccode(g, glyph_disc_syllable);
604    }
605    return n;
606}
607
608static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
609{
610    halfword prechar, postchar, pre, post, disc;
611    if (chr == ex_hyphen_char_par) {
612        halfword pre_exhyphen_char = tex_get_pre_exhyphen_char(clang);
613        halfword post_exhyphen_char = tex_get_post_exhyphen_char(clang);
614        prechar  = pre_exhyphen_char  > 0 ? pre_exhyphen_char  : ex_hyphen_char_par;
615        postchar = post_exhyphen_char > 0 ? post_exhyphen_char : null;
616    } else {
617        /* we need a flag : use pre/post cf language spec */
618        prechar  = chr;
619        postchar = null;
620    }
621    pre  = prechar  > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar,  t) : null;
622    post = postchar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t) : null;
623    if (pre) { 
624        set_glyph_disccode(pre, glyph_disc_automatic);
625    }
626    if (post) { 
627        set_glyph_disccode(post, glyph_disc_automatic);
628    }
629    disc = tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
630    return disc;
631}
632
633static char *tex_aux_hyphenation_exception(int exceptions, char *w)
634{
635    lua_State *L = lmt_lua_state.lua_instance;
636    char *ret = NULL;
637    if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) == LUA_TTABLE) {
638        /*tex Word table: */
639        lua_pushstring(L, w);
640        lua_rawget(L, -2);
641        if (lua_type(L, -1) == LUA_TSTRING) {
642            ret = lmt_memory_strdup(lua_tostring(L, -1));
643        }
644        lua_pop(L, 2);
645    } else {
646        lua_pop(L, 1);
647    }
648    return ret;
649}
650
651/*tex
652
    The sequence from |wordstart| to |r| can contain only normal characters. It could be faster
    to modify a halfword pointer and return an integer.
655
656*/
657
/*tex
    The zero width characters. In the parts of an exception the joiner and non joiner are not
    turned into glyphs (unless they come first) but control the options set on the glyphs
    around them.
*/
# define zws  0x200B /* zero width space makes no sense */
# define zwnj 0x200C /* no ligatures and no kerns */
# define zwj  0x200D /* no ligatures */
661
662static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
663{
664    halfword head = null;
665    halfword tail = null;
666    unsigned i = *j;
667    int noligature = 0;
668    int nokerning = 0;
669    /*tex This puts uword[i] on the |{|. */
670    i++;
671    while (i < (unsigned) len && uword[i + 1] != (unsigned int) final) {
672        if (tail) {
673            switch (uword[i + 1]) {
674                case zwj:
675                    noligature = 1;
676                    nokerning = 0;
677                    break;
678                case zwnj:
679                    noligature = 1;
680                    nokerning = 1;
681                    break;
682                default:
683                    {
684                        halfword s = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
685                        tex_couple_nodes(tail, s);
686                        if (noligature) {
687                            tex_add_glyph_option(tail, glyph_option_no_right_ligature);
688                            tex_add_glyph_option(s, glyph_option_no_left_ligature);
689                            noligature = 0;
690                        }
691                        if (nokerning) {
692                            tex_add_glyph_option(tail, glyph_option_no_right_kern);
693                            tex_add_glyph_option(s, glyph_option_no_left_kern);
694                            nokerning = 0;
695                        }
696                        set_glyph_disccode(head, glyph_disc_syllable);
697                        tail = node_next(tail);
698                        break;
699                    }
700            }
701        } else {
702            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent); /* todo: data */
703            set_glyph_disccode(head, glyph_disc_syllable);
704            tail = head;
705        }
706        i++;
707    }
708    *j = ++i;
709    return head;
710}
711
/*tex
    Counts the characters in a braced part of an exception without creating nodes. On entry
    |j| is the index such that |uword[j + 1]| is the opening brace; on exit it has moved one
    beyond the last index that was inspected. The word is looked at one slot ahead of the index
    and scanning stops at a closing brace or when |len| is reached.
*/
static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    /*tex Stepping over the opening brace gives the index of the first candidate. */
    unsigned first = *j + 1;
    unsigned last = first;
    while (last < (unsigned) len && uword[last + 1] != '}') {
        last++;
    }
    *j = last + 1;
    return (int) (last - first);
}
725
726static void tex_aux_show_exception_error(const char *part)
727{
728    tex_handle_error(
729        normal_error_type,
730        "Invalid %s part in exception",
731        part,
732        "Exception discretionaries should contain three pairs of braced items.\n"
733        "No intervening spaces are allowed."
734    );
735}
736
737/*tex
738
739    The exceptions are taken as-is: no min values are taken into account. One can add normal
740    patterns on-the-fly if needed.
741
742*/
743
744static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
745{
746    halfword t = wordstart;
747    language_variables langdata;
748    unsigned uword[max_size_of_word_buffer]; //  = { 0 };
749    unsigned len = aux_splitutf2uni(uword, replacement);
750    int clang = get_glyph_language(wordstart);
751    langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
752    langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
753    for (unsigned i = 0; i < len; i++) {
754        if (uword[i + 1] == 0 ) {
755            /*tex We ran out of the exception pattern. */
756            break;
757        } else if (uword[i + 1] == '-') {
758            /*tex A hyphen follows. */
759            if (node_next(t) == r) {
760                break;
761            } else {
762                tex_aux_insert_syllable_discretionary(t, &langdata);
763                /*tex Skip the new disc */
764                t = node_next(t);
765            }
766        } else if (uword[i + 1] == '=') {
767            /*tex We skip a disc. */
768            t = node_next(t);
769        } else if (uword[i + 1] == '{') {
770            /*tex We ran into an exception |{}{}{}| or |{}{}{}[]|. */
771            halfword pre = null;
772            halfword post = null;
773            halfword replace = null;
774            int count = 0;
775            int alternative = null;
776            halfword penalty;
777            /*tex |pre| */
778            pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
779            if (i == len || uword[i + 1] != '{') {
780                tex_aux_show_exception_error("pre");
781            }
782            /*tex |post| */
783            post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
784            if (i == len || uword[i + 1] != '{') {
785                tex_aux_show_exception_error("post");
786            }
787            /*tex |replace| */
788            count = tex_aux_count_exception_part(&i, uword, (int) len);
789            if (i == len) {
790                tex_aux_show_exception_error("replace");
791            } else if (uword[i] && uword[i + 1] == '(') {
792                alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
793            }
794            /*tex Play safe. */
795            if (node_next(t) == r) {
796                break;
797            } else {
798                /*tex Let's deal with an (optional) replacement. */
799                if (count > 0) {
800                    /*tex Assemble the replace stream. */
801                    halfword q = t;
802                    replace = node_next(q);
803                    while (count > 0 && q) {
804                        halfword t = node_type(q);
805                        q = node_next(q);
806                        if (t == glyph_node || t == disc_node) {
807                            count--;
808                        } else {
809                            break ;
810                        }
811                    }
812                    /*tex Remove it from the main stream */
813                    tex_try_couple_nodes(t, node_next(q));
814                    /*tex and finish it in the replace. */
815                    node_next(q) = null;
816                    if (alternative) {
817                        tex_flush_node_list(replace);
818                        replace = alternative;
819                    } else {
820                        /*tex Sanitize the replace stream (we could use the flattener instead). */
821                        q = replace ;
822                        while (q) {
823                            halfword n = node_next(q);
824                            if (node_type(q) == disc_node) {
825                                /*tex Beware: the replacement starts after the no_break pointer. */
826                                halfword nb = disc_no_break_head(q);
827                                disc_no_break_head(q) = null;
828                                node_prev(nb) = null ; /* used at all? */
829                                /*tex Insert the replacement glyph. */
830                                if (q == replace) {
831                                    replace = nb;
832                                } else {
833                                    tex_try_couple_nodes(node_prev(q), nb);
834                                }
835                                /*tex Append the glyph (one). */
836                                tex_try_couple_nodes(nb, n);
837                                /*tex Flush the disc. */
838                                tex_flush_node(q);
839                            }
840                            q = n ;
841                        }
842                    }
843                }
844                /*tex Let's check if we have a penalty spec. If we have more then we're toast, we just ignore them. */
845                if (uword[i] && uword[i + 1] == '[') {
846                    i += 2;
847                    if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
848                        if (exception_penalty_par > 0) {
849                            if (exception_penalty_par > infinite_penalty) {
850                                penalty = exception_penalty_par;
851                            } else {
852                                penalty = (uword[i] - '0') * exception_penalty_par ;
853                            }
854                        } else if (exception_penalty_par < 0) {
855                            penalty = hyphen_penalty_par;
856                        } else { 
857                            penalty = (uword[i] - '0') * hyphen_penalty_par ;
858                        }
859                        ++i;
860                        while (uword[i] && uword[i] != ']') {
861                            ++i;
862                        }
863                    } else {
864                        penalty = hyphen_penalty_par;
865                    }
866                } else {
867                    penalty = hyphen_penalty_par;
868                }
869                /*tex And now we insert a disc node (this was |syllable_discretionary_code|). */
870                t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
871                /*tex We skip the new disc node. */
872                t = node_next(t);
873                /*tex 
874                    We need to check if we have two discretionaries in a row, test case: |\hyphenation 
875                    {a{>}{<}{b}{>}{<}{c}de} \hsize 1pt abcde \par| which gives |a> <> <de|. 
876                */
877                if (uword[i] && uword[i + 1] == '{') {
878                    i--;
879                    t = node_prev(t); /*tex Tricky! */
880                }
881            }
882        } else {
883            t = node_next(t);
884        }
885        /*tex Again we play safe. */
886        if (! t || node_next(t) == r) {
887            break;
888        }
889    }
890}
891
892/*tex
893
894    The following description is no longer valid for \LUATEX. Although we use the same algorithm
895    for hyphenation, it is not integrated in the par builder. Instead it is a separate run over
896    the node list, preceding the line-breaking routine, possibly replaced by a callback. We keep
897    the description here because the principles remain.
898
899    \startnarrower
900
901    When the line-breaking routine is unable to find a feasible sequence of breakpoints, it makes
902    a second pass over the paragraph, attempting to hyphenate the hyphenatable words. The goal of
903    hyphenation is to insert discretionary material into the paragraph so that there are more
904    potential places to break.
905
906    The general rules for hyphenation are somewhat complex and technical, because we want to be
907    able to hyphenate words that are preceded or followed by punctuation marks, and because we
908    want the rules to work for languages other than English. We also must contend with the fact
909    that hyphens might radically alter the ligature and kerning structure of a word.
910
911    A sequence of characters will be considered for hyphenation only if it belongs to a \quotation
912    {potentially hyphenatable part} of the current paragraph. This is a sequence of nodes $p_0p_1
913    \ldots p_m$ where $p_0$ is a glue node, $p_1\ldots p_{m-1}$ are either character or ligature
914    or whatsit or implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust or
915    mark or whatsit or explicit kern node. (Therefore hyphenation is disabled by boxes, math
916    formulas, and discretionary nodes already inserted by the user.) The ligature nodes among $p_1
917    \ldots p_{m-1}$ are effectively expanded into the original non-ligature characters; the kern
918    nodes and whatsits are ignored. Each character |c| is now classified as either a nonletter (if
919    |lc_code(c)=0|), a lowercase letter (if |lc_code(c)=c|), or an uppercase letter (otherwise); an
920    uppercase letter is treated as if it were |lc_code(c)| for purposes of hyphenation. The
921    characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let $c_1$ be the first
922    letter that is not in the middle of a ligature. Whatsit nodes preceding $c_1$ are ignored; a
923    whatsit found after $c_1$ will be the terminating node $p_m$. All characters that do not have
924    the same font as $c_1$ will be treated as nonletters. The |hyphen_char| for that font must be
925    between 0 and 255, otherwise hyphenation will not be attempted. \TeX\ looks ahead for as many
926    consecutive letters $c_1\ldots c_n$ as possible; however, |n| must be less than 64, so a
927    character that would otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
928    not be in the middle of a ligature. In this way we obtain a string of letters $c_1\ldots c_n$
929    that are generated by nodes $p_a\ldots p_b$, where |1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this
930    string qualifies for hyphenation; however, |uc_hyph| must be positive, if $c_1$ is uppercase.
931
932    The hyphenation process takes place in three stages. First, the candidate sequence $c_1 \ldots
933    c_n$ is found; then potential positions for hyphens are determined by referring to hyphenation
934    tables; and finally, the nodes $p_a\ldots p_b$ are replaced by a new sequence of nodes that
935    includes the discretionary breaks found.
936
937    Fortunately, we do not have to do all this calculation very often, because of the way it has
938    been taken out of \TEX's inner loop. For example, when the second edition of the author's
939    700-page book {\sl Seminumerical Algorithms} was typeset by \TEX, only about 1.2 hyphenations
940    needed to be tried per paragraph, since the line breaking algorithm needed to use two passes on
941    only about 5 per cent of the paragraphs. (This is not true in \LUATEX: we always hyphenate the
942    whole list.)
943
    When a word has been set up to contain a candidate for hyphenation, \TEX\ first looks to see if it
945    is in the user's exception dictionary. If not, hyphens are inserted based on patterns that
946    appear within the given word, using an algorithm due to Frank~M. Liang.
947
948    \stopnarrower
949
950    This is incompatible with \TEX\ because the first word of a paragraph can be hyphenated, but
951    most European users seem to agree that prohibiting hyphenation there was not the best idea ever.
952
953    To be documented: |\hyphenationmode| (a bit set).
954
955    \startbuffer
956    \parindent0pt \hsize=1.1cm
957    12-34-56 \par
958    12-34-\hbox{56} \par
959    12-34-\vrule width 1em height 1.5ex \par
960    12-\hbox{34}-56 \par
961    12-\vrule width 1em height 1.5ex-56 \par
962    \hjcode`\1=`\1 \hjcode`\2=`\2 \hjcode`\3=`\3 \hjcode`\4=`\4 \vskip.5cm
963    12-34-56 \par
964    12-34-\hbox{56} \par
965    12-34-\vrule width 1em height 1.5ex \par
966    12-\hbox{34}-56 \par
967    12-\vrule width 1em height 1.5ex-56 \par
968    \stopbuffer
969
970    \typebuffer
971
    \startpacked \getbuffer \stoppacked
973
    We only accept an explicit hyphen when there is a preceding glyph and we skip a sequence of
    explicit hyphens as that normally indicates a \type {--} or \type {---} ligature, in which case
    we can, in the worst case, get bad node lists later on due to messed up ligature building as
    these dashes are ligatures in base fonts. This is a side effect of separating the
    hyphenation, ligaturing and kerning steps. A test is cmr with \type {------}.
979
980    A font handler can collapse successive hyphens but it's not nice to put the burden there. A
981    somewhat messy border case is \type {----} but in \LUATEX\ we don't treat \type {--} and \type
982    {---} special. Also, traditional \TEX\ will break a line at \type {-foo} but this can be
983    disabled by setting the automatic mode to \type {1}.
984
985*/
986
987static inline halfword tex_aux_is_hyphen_char(halfword chr)
988{
989    if (tex_get_hc_code(chr)) {
990        return tex_get_hc_code(chr);
991    } else if (chr == ex_hyphen_char_par) {
992        return chr;
993    } else {
994        return 0;
995    }
996}
997
998static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
999{
1000    int start_ok = 1;
1001    halfword lastglyph = r;
1002    while (r) {
1003        switch (node_type(r)) {
1004            case boundary_node:
1005                if (node_subtype(r) == word_boundary) {
1006                    start_ok = 1;
1007                }
1008                break;
1009            case disc_node:
1010                start_ok = has_disc_option(r, disc_option_post_word);
1011                break;
1012            case hlist_node:
1013            case vlist_node:
1014            case rule_node:
1015            case dir_node:
1016            case whatsit_node:
1017                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
1018                    start_ok = 0;
1019                }
1020                break;
1021            case glue_node:
1022                start_ok = 1;
1023                break;
1024            case math_node:
1025                if (node_subtype(r) == begin_inline_math) {
1026                    int mathlevel = 1;
1027                    while (mathlevel > 0) {
1028                        r = node_next(r);
1029                        if (! r) {
1030                            return r;
1031                        } else if (node_type(r) == math_node) {
1032                            if (node_subtype(r) == begin_inline_math) {
1033                                mathlevel++;
1034                            } else {
1035                                mathlevel--;
1036                            }
1037                        }
1038                    }
1039                }
1040                break;
1041            case glyph_node:
1042                {
1043                    /*tex
1044                        When we have no word yet and meet a hyphen (equivalent) we should just
1045                        keep going. This is not compatible but it does make sense.
1046                    */
1047                    int chr = glyph_character(r);
1048                    int hyp = tex_aux_is_hyphen_char(chr);
1049                    lastglyph = r;
1050                    if (hyp) {
1051                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
1052                            /* maybe some tracing */
1053                        } else {
1054                            /* todo: already check if we have hj chars left/right i.e. no digits and minus mess */
1055                            halfword t = node_next(r) ;
1056                            /*tex Kind of weird that we have the opposite flag test here. */
1057                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
1058                                /*tex We have no word yet and the next character is a non hyphen. */
1059                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp);
1060                                // test case: \automatichyphenmode0 10\high{-6-1-2-4}
1061                                start_ok = 1; // todo: also in luatex
1062                            } else {
1063                                /*tex We jump over the sequence of hyphens. */
1064                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
1065                                    r = t ;
1066                                    t = node_next(r) ;
1067                                }
1068                                if (t) {
1069                                    /*tex We need a restart. */
1070                                    start_ok = 0;
1071                                } else {
1072                                    /*tex We reached the end of the list so we have no word start. */
1073                                    return null;
1074                                }
1075                            }
1076                        }
1077                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
1078                        int l = tex_get_hj_code(get_glyph_language(r), chr);
1079                        if (l > 0) {
1080                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
1081                                return r;
1082                            } else {
1083                                start_ok = 0;
1084                            }
1085                        } else {
1086                            /*tex We go on. */
1087                        }
1088                    } else {
1089                        /*tex We go on. */
1090                    }
1091                }
1092                break;
1093            default:
1094                start_ok = 0;
1095                break;
1096        }
1097        r = node_next(r);
1098    }
1099    return r; /* null */
1100}
1101
1102/*tex
1103
    This is the original test, extended with bounds, with the complex expression turned into
    a function. However, it actually is part of the old mechanism where hyphenation was mixed
    with ligature building and kerning, so there was this skipping over a font kern which is no
    longer needed as we have separate steps.
1108
1109    We keep this as reference:
1110
1111    \starttyping
1112    static int valid_wordend(halfword s, halfword strict_bound)
1113    {
1114        if (s) {
1115            halfword r = s;
1116            int clang = get_glyph_language(s);
1117            while ( (r) &&
1118                   (    (type(r) == glyph_node && clang == get_glyph_language(r))
1119                     || (type(r) == kern_node && (subtype(r) == font_kern))
1120                    )
1121                   ) {
1122                r = node_next(r);
1123            }
1124            return (! r || (type(r) == glyph_node && clang != get_glyph_language(r))
1125                        ||  type(r) == glue_node
1126                        ||  type(r) == penalty_node
1127                        || (type(r) == kern_node && (subtype(r) == explicit_kern ||
1128                                                     subtype(r) == italic_kern   ||
1129                                                     subtype(r) == accent_kern   ))
1130                        ||  ((type(r) == hlist_node   ||
1131                              type(r) == vlist_node   ||
1132                              type(r) == rule_node    ||
1133                              type(r) == dir_node     ||
1134                              type(r) == whatsit_node ||
1135                              type(r) == insert_node  ||
1136                              type(r) == adjust_node
1137                             ) && ! (strict_bound == 2 || strict_bound == 3))
1138                        ||  type(r) == boundary_node
1139                );
1140        } else {
1141            return 1;
1142        }
1143    }
1144    \stopttyping
1145
1146*/
1147
1148static int tex_aux_valid_wordend(halfword end_word, halfword r)
1149{
1150    if (r) {
1151        switch (node_type(r)) {
1152         // case glyph_node:
1153         // case glue_node:
1154         // case penalty_node:
1155         // case kern_node:
1156         //     return 1;
1157            case disc_node:
1158                return has_disc_option(r, disc_option_pre_word);
1159            case hlist_node:
1160            case vlist_node:
1161            case rule_node:
1162            case dir_node:
1163            case whatsit_node:
1164            case insert_node:
1165            case adjust_node:
1166                return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
1167        }
1168    }
1169    return 1;
1170}
1171
1172void tex_handle_hyphenation(halfword head, halfword tail)
1173{
1174    if (head && node_next(head)) {
1175        int callback_id = lmt_callback_defined(hyphenate_callback);
1176        if (callback_id > 0) {
1177            lua_State *L = lmt_lua_state.lua_instance;
1178            int top = 0;
1179            if (lmt_callback_okay(L, callback_id, &top)) {
1180                int i;
1181                lmt_node_list_to_lua(L, head);
1182                lmt_node_list_to_lua(L, tail);
1183                i = lmt_callback_call(L, 2, 0, top);
1184                if (i) {
1185                    lmt_callback_error(L, top, i);
1186                } else {
1187                    lmt_callback_wrapup(L, top);
1188                }
1189            }
1190        } else if (callback_id == 0) {
1191            tex_hyphenate_list(head, tail);
1192        } else {
1193            /* -1 : disabled */
1194        }
1195    }
1196}
1197
1198static int tex_aux_hnj_hyphen_hyphenate(
1199    hjn_dictionary     *dict,
1200    halfword            first,
1201    halfword            last,
1202    int                 length,
1203    halfword            left,
1204    halfword            right,
1205    language_variables *lan
1206)
1207{
1208    /*tex +2 for dots at each end, +1 for points outside characters. */
1209    int ext_word_len = length + 2;
1210    int hyphen_len = ext_word_len + 1;
1211    /*tex 
1212        Because we have a limit we could just use a static array here but then we 
1213        need to either zero the (large) array or do some more testing in order to 
1214        make the compiler happy for this |hyphens[char_num]| later on. 
1215    */
1216    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
1217    if (hyphens) {
1218        halfword here;
1219        int state = 0;
1220        int char_num = 0;
1221        int done = 0;
1222        ++lmt_language_state.word_count;
1223        /*tex Add a '.' to beginning and end to facilitate matching. */
1224        node_next(begin_period) = first;
1225        node_next(end_period) = node_next(last);
1226        node_next(last) = end_period;
1227        /*tex Now, run the finite state machine. */
1228        for (here = begin_period, char_num = 0; here != node_next(end_period); here = node_next(here)) {
1229            int ch;
1230            if (here == begin_period || here == end_period) {
1231                ch = '.';
1232            } else {
1233                ch = tex_get_hj_code(get_glyph_language(here), glyph_character(here));
1234                if (ch <= 32) {
1235                    ch = glyph_character(here);
1236                }
1237            }
1238            while (state != -1) {
1239                hjn_state *hstate = &dict->states[state];
1240                for (int k = 0; k < hstate->num_trans; k++) {
1241                    if (hstate->trans[k].uni_ch == ch) {
1242                        char *match;
1243                        state = hstate->trans[k].new_state;
1244                        match = dict->states[state].match;
1245                        if (match) {
1246                            /*tex
1247                                We add +2 because 1 string length is one bigger than offset and 1
1248                                hyphenation starts before first character.
1249
1250                                Why not store the length in states[state] instead of calculating
1251                                it each time? Okay, performance is okay but still ...
1252                            */
1253                            int offset = (int) (char_num + 2 - (int) strlen(match));
1254                            for (int m = 0; match[m]; m++) {
1255                                if (hyphens[offset + m] < match[m]) {
1256                                    hyphens[offset + m] = match[m];
1257                                }
1258                            }
1259                        }
1260                        goto NEXTLETTER;
1261                    }
1262                }
1263                state = hstate->fallback_state;
1264            }
1265            /*tex Nothing worked, let's go to the next character. */
1266            state = 0;
1267        NEXTLETTER:;
1268            char_num++;
1269        }
1270        /*tex Restore the correct pointers. */
1271        node_next(last) = node_next(end_period);
1272        /*tex
1273            Pattern is |.word.| and |word_len| is 4, |ext_word_len| is 6 and |hyphens| is 7; drop first
1274            two and stop after |word_len-1|.
1275         */
1276        for (here = first, char_num = 2; here != left; here = node_next(here)) {
1277            char_num++;
1278        }
1279        for (; here != right; here = node_next(here)) {
1280            if (hyphens[char_num] & 1) {
1281                here = tex_aux_insert_syllable_discretionary(here, lan);
1282                done += 1;
1283            }
1284            char_num++;
1285        }
1286        lmt_memory_free(hyphens);
1287        return done;
1288    } else {
1289        tex_overflow_error("patterns", hyphen_len);
1290        return 0;
1291    }
1292}
1293
1294/* we can also check the original */
1295
1296static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original) {
1297    if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) {
1298        int i = 0;
1299        while (f) {
1300            ++i;
1301            if (node_type(f) != glyph_node) {
1302                tex_normal_warning("language", "the hyphenated word contains non-glyphs, skipping");
1303                return 0;
1304            } else {
1305                int cl; 
1306                halfword c = (halfword) aux_str2uni_len((const unsigned char *) utf8original, &cl);
1307                utf8original += cl;
1308                if (! (c && c == glyph_character(f))) {
1309                    tex_normal_warning("language", "the hyphenated word contains different characters, skipping");
1310                    return 0;
1311                } else if (f != l) {
1312                    f = node_next(f);
1313                } else if (i == n) {
1314                    return 1;
1315                } else {
1316                    tex_normal_warning("language", "the hyphenated word changed length, skipping");
1317                    return 0;
1318                }
1319            }
1320        }
1321    }
1322    tex_normal_warning("language", "the hyphenation list is messed up, skipping");
1323    return 0;
1324}
1325
1326static void tex_aux_hyphenate_show(halfword beg, halfword end)
1327{
1328    if (_valid_node_(beg) && _valid_node_(end)) {
1329        halfword nxt = node_next(end);
1330        node_next(end) = null;
1331        tex_show_node_list(beg, 100, 10000);
1332        node_next(end) = nxt;
1333    }
1334}
1335
1336/* maybe split: first a processing run */
1337
1338static inline int is_traditional_hyphen(halfword n)
1339{
1340    return (
1341        (glyph_character(n) == ex_hyphen_char_par)                             /*tex parameter */
1342     && (has_font_text_control(glyph_font(n),text_control_collapse_hyphens))   /*tex font driven */
1343     && (hyphenation_permitted(glyph_hyphenate(n),collapse_hyphenation_mode)) /*tex language driven */
1344    );
1345}
1346
1347static inline int is_apostrophe(halfword n)
1348{
1349    return (
1350        (glyph_character(n) == ex_apostrophe_char_par)                                   /*tex parameter */
1351     && (has_font_text_control(glyph_font(n),text_control_replace_apostrophe))           /*tex font driven */
1352     && (hyphenation_permitted(glyph_hyphenate(n),replace_apostrophe_hyphenation_mode)) /*tex language driven */
1353    );
1354}
1355int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3, halfword c4) /* ex_hyphen_char_par 0x2013 0x2014 0x2019 */
1356{
1357    /*tex Let's play safe: */
1358    halfword found = 0;
1359    if (head && c1 && c2 && c3) {
1360        halfword n1 = head;
1361        while (n1) {
1362            halfword n2 = node_next(n1);
1363            switch (node_type(n1)) {
1364                case glyph_node:
1365                    if (is_traditional_hyphen(n1)) {
1366                        set_glyph_discpart(n1, glyph_discpart_always);
1367                        if (n2 && node_type(n2) == glyph_node && is_traditional_hyphen(n2) && glyph_font(n1) == glyph_font(n2)) {
1368                            halfword n3 = node_next(n2);
1369                            if (n3 && node_type(n3) == glyph_node && is_traditional_hyphen(n3) && glyph_font(n1) == glyph_font(n3)) {
1370                                halfword n4 = node_next(n3);
1371                                glyph_character(n1) = c3;
1372                                tex_try_couple_nodes(n1, n4);
1373                                tex_flush_node(n2);
1374                                tex_flush_node(n3);
1375                                n1 = n4;
1376                            } else {
1377                                glyph_character(n1) = c2;
1378                                tex_try_couple_nodes(n1, n3);
1379                                tex_flush_node(n2);
1380                                n1 = n3;
1381                            }
1382                            found = 1;
1383                            goto AGAIN;
1384                        } else {
1385                            glyph_character(n1) = c1; /* can become language dependent */
1386                        }
1387                    } else if (is_apostrophe(n1)) {
1388                        glyph_character(n1) = c4; /* can become language dependent */
1389                        found = 1;
1390                    }
1391                    break;
1392                case disc_node:
1393                    {
1394                        halfword done = 0;
1395                        if (disc_pre_break_head(n1) && tex_collapse_list(disc_pre_break_head(n1), c1, c2, c3, c4)) {
1396                            ++done;
1397                        }
1398                        if (disc_post_break_head(n1) && tex_collapse_list(disc_post_break_head(n1), c1, c2, c3, c4)) {
1399                            ++done;
1400                        }
1401                        if (disc_no_break_head(n1) && tex_collapse_list(disc_no_break_head(n1), c1, c2, c3, c4)) {
1402                            ++done;
1403                        }
1404                        if (done) {
1405                            tex_check_disc_field(n1);
1406                        }
1407                        break;
1408                    }
1409                default:
1410                    break;
1411            }
1412            n1 = n2;
1413          AGAIN:;
1414        }
1415    }
1416    return found;
1417}
1418
1419void tex_hyphenate_list(halfword head, halfword tail)
1420{
1421    /*tex Let's play safe: */
1422    if (tail) {
1423        halfword first_language = first_valid_language_par; /* combine with check below */
1424        halfword trace = tracing_hyphenation_par;
1425        halfword r = head;
1426        halfword nothing = 1;
1427        ++lmt_language_state.list_count;
1428        /*tex
1429            This first movement assures two things:
1430
1431            \startitemize
1432                \startitem
1433                    That we won't waste lots of time on something that has been handled already (in
1434                    that case, none of the glyphs match |simple_character|).
1435                \stopitem
1436                \startitem
1437                    That the first word can be hyphenated. If the movement was not explicit, then
1438                    the indentation at the start of a paragraph list would make |find_next_wordstart()|
1439                    look too far ahead.
1440                \stopitem
1441            \stopitemize
1442        */
1443        while (r && node_type(r) != glyph_node) {
1444            r = node_next(r);
1445        }
1446        if (r) {
1447            /* maybe pass used_hyphen_penalty_par and used_ex_hyphen_penalty_par */
1448            halfword saved_hyphen_penalty_par = hyphen_penalty_par;
1449            halfword saved_ex_hyphen_penalty_par = ex_hyphen_penalty_par;
1450            halfword p = tex_find_par_par(head);
1451            int penalties_pushed = node_type(p) == par_node; /* maybe check for h|v subtype */
1452            ++lmt_language_state.checked_count;
1453            if (penalties_pushed) {
1454                hyphen_penalty_par = tex_get_par_par(p, par_hyphen_penalty_code); 
1455                ex_hyphen_penalty_par = tex_get_par_par(p, par_ex_hyphen_penalty_code); 
1456            }
1457            /* */
1458            r = tex_aux_find_next_wordstart(r, first_language);
1459            if (r) {
1460                language_variables langdata;
1461                char utf8word[max_size_of_word_buffer];
1462                char utf8original[max_size_of_word_buffer];
1463                char *utf8ptr = utf8word;
1464                char *utf8ori = utf8original;
1465                int word_length = 0;
1466                int explicit_hyphen = 0;
1467                int last_char = 0;
1468                int valid = 0;
1469                halfword explicit_start = null;
1470                halfword saved_tail = node_next(tail);
1471                halfword penalty = tex_new_penalty_node(0, word_penalty_subtype);
1472                /* kind of curious hack, this addition that we later remove */
1473                tex_attach_attribute_list_copy(penalty, r);
1474                tex_couple_nodes(tail, penalty); /* todo: attrobute */
1475                while (r) {
1476                    halfword word_start = r;
1477                    int word_language = get_glyph_language(word_start);
1478                    if (tex_is_valid_language(word_language)) {
1479                        halfword word_end = r;
1480                        int lhmin = get_glyph_lhmin(word_start);
1481                        int rhmin = get_glyph_rhmin(word_start);
1482                        int hmin = tex_get_hyphenation_min(word_language);
1483                        halfword word_font = glyph_font(word_start);
1484                        if (! tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) {
1485                            /*tex For backward compatibility we set: */
1486                            word_font = 0;
1487                        }
1488                        langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language);
1489                        langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language);
1490                        while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) {
1491                            halfword chr = glyph_character(r);
1492                            halfword hyp = tex_aux_is_hyphen_char(chr);
1493                            if (word_language >= first_language) {
1494                                last_char = tex_get_hj_code(word_language, chr);
1495                                if (last_char > 0) {
1496                                    goto GOFORWARD;
1497                                }
1498                            }
1499                            if (hyp) {
1500                                last_char = hyp;
1501                             // if (last_char) {
1502                             //     goto GOFORWARD;
1503                             // }
1504                            } else {
1505                                break;
1506                            }
1507                          GOFORWARD:
1508                         // explicit_hyphen = is_hyphen_char(chr);
1509                            explicit_hyphen = hyp;
1510                            if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
1511                                /* maybe some tracing */
1512                                explicit_hyphen = 0;
1513                            }
1514                            if (explicit_hyphen) {
1515                                break;
1516                            } else {
1517                                word_length++;
1518                                if (word_length >= max_size_of_word) {
1519                                    /* tex_normal_warning("language", "ignoring long word"); */
1520                                    while (r && node_type(r) == glyph_node) {
1521                                        r = node_next(r);
1522                                    }
1523                                    goto PICKUP;
1524                                } else {
1525                                    if (last_char <= 32) {
1526                                        if (last_char == 32) {
1527                                            last_char = 0 ;
1528                                        }
1529                                        if (word_length <= lhmin) {
1530                                            lhmin = lhmin - last_char + 1 ;
1531                                            if (lhmin < 0) {
1532                                                lhmin = 1;
1533                                            }
1534                                        }
1535                                        if (word_length >= rhmin) {
1536                                            rhmin = rhmin - last_char + 1 ;
1537                                            if (rhmin < 0) {
1538                                                rhmin = 1;
1539                                            }
1540                                        }
1541                                        hmin = hmin - last_char + 1 ;
1542                                        if (hmin < 0) {
1543                                            rhmin = 1;
1544                                        }
1545                                        last_char = chr ;
1546                                    }
1547                                    utf8ori = aux_uni2string(utf8ori, (unsigned) chr);
1548                                    utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char);
1549                                    word_end = r;
1550                                    r = node_next(r);
1551                                }
1552                            }
1553                        }
1554                        if (explicit_hyphen) {
1555                            /*tex We are not at the start, so we only need to look ahead. */
1556                            if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) {
1557                                /*tex
1558                                    This can be the consequence of inhibition too, see |finish_discretionary|
1559                                    in which case the replace got injected which can have a hyphen. And we want
1560                                    to run the callback if set in order to replace.
1561                                */
1562                                valid = 1;
1563                                goto MESSYCODE;
1564                            } else {
1565                                /*tex Maybe we should get rid of this ----- stuff. */
1566                                halfword t = node_next(r);
1567                                if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) {
1568                                    /*tex we have a word already but the next character may not be a hyphen too */
1569                                    halfword g = r;
1570                                    set_glyph_disccode(g, glyph_disc_automatic);
1571                                    r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen);
1572                                    if (trace > 1) {
1573                                        *utf8ori = 0;
1574                                        tex_begin_diagnostic();
1575                                        tex_print_format("[language: compound word break after %s]", utf8original);
1576                                        tex_end_diagnostic();
1577                                    }
1578                                    if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) {
1579                                        explicit_hyphen = 0;
1580                                        if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) {
1581                                            set_disc_option(r, disc_option_pre_word | disc_option_post_word);
1582                                            explicit_start = null;
1583                                            valid = 1;
1584                                            goto MESSYCODE;
1585                                        } else {
1586                                            if (! explicit_start) {
1587                                                explicit_start = word_start;
1588                                            }
1589                                            /*tex For exceptions. */
1590                                            utf8ptr = aux_uni2string(utf8ptr, '-');
1591                                            r = t;
1592                                            continue;
1593                                        }
1594                                    }
1595                                } else {
1596                                    /*tex We jump over the sequence of hyphens ... traditional. */
1597                                    while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) {
1598                                        set_glyph_disccode(t, glyph_disc_automatic);
1599                                        r = t;
1600                                        t = node_next(r);
1601                                    }
1602                                    if (! t) {
1603                                        /*tex we reached the end of the list and will quit the loop later */
1604                                        r = null;
1605                                    }
1606                                }
1607                            }
1608                        } else {
1609                            valid = tex_aux_valid_wordend(word_end, r);
1610                          MESSYCODE:
1611                            /*tex We have a word, r is at the next node. */
1612                            if (word_font && word_language >= first_language) {
1613                                /*tex We have a language, actually we already tested that. */
1614                                struct tex_language *lang = lmt_language_state.languages[word_language];
1615                                if (lang) {
1616                                    char *replacement = NULL;
1617                                    halfword start = explicit_start ? explicit_start : word_start;
1618                                    int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode);
1619                                    *utf8ptr = '\0';
1620                                    *utf8ori = '\0';
1621                                    if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) {
1622                                        halfword restart = node_prev(start); /*tex before the word. */
1623                                        int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement);
1624                                        if (replacement) {
1625                                            if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1626                                                goto EXCEPTIONS2;
1627                                            } else {
1628                                                goto PICKUP;
1629                                            }
1630                                        } else {
1631                                            /* 1: restart 2: exceptions+patterns 3: patterns *: next word */
1632                                            switch (done) {
1633                                                case 1:
1634                                                    if (_valid_node_(restart)) {
1635                                                        r = restart;
1636                                                    } else if (_valid_node_(start)) {
1637                                                        r = node_prev(start);
1638                                                    }
1639                                                    if (! r) {
1640                                                        if (_valid_node_(head)) {
1641                                                            tex_normal_warning("language", "the hyphenation list is messed up, recovering");
1642                                                            r = head;
1643                                                        } else {
1644                                                            tex_normal_error("language", "the hyphenated head is messed up, aborting");
1645                                                            return;
1646                                                        }
1647                                                    }
1648                                                    goto PICKUP;
1649                                                case 2:
1650                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1651                                                        goto EXCEPTIONS1;
1652                                                    } else {
1653                                                        goto PICKUP;
1654                                                    }
1655                                                case 3:
1656                                                    if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1657                                                        goto PATTERNS;
1658                                                    } else {
1659                                                        goto PICKUP;
1660                                                    }
1661                                                default:
1662                                                    if (_valid_node_(r)) { /* or word_end */
1663                                                        goto PICKUP;
1664                                                    } else if (_valid_node_(tail)) {
1665                                                        tex_normal_warning("language", "the hyphenation list is messed up, quitting");
1666                                                        goto ABORT;
1667                                                    } else {
1668                                                        // tex_normal_error("language","the hyphenated tail is messed up, aborting");
1669                                                        return;
1670                                                    }
1671                                            }
1672                                        }
1673                                    }
1674                                    if (! okay || ! valid) {
1675                                        goto PICKUP;
1676                                    }
1677                                    /*tex
1678                                        This is messy and nasty: we can have a word with a - in it which is why
1679                                        we have two branches. Also, every word that suits the length criteria
1680                                        is checked via \LUA. Optimizing this because tests have demonstrated
1681                                        that checking against the min and max lengths of exception strings has
1682                                        no gain.
1683                                    */
1684                                  EXCEPTIONS1:
1685                                    if (lang->exceptions) {
1686                                        replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word);
1687                                    }
1688                                  EXCEPTIONS2:
1689                                    if (replacement) {
1690                                        /*tex handle the exception and go on to the next word */
1691                                        halfword start = explicit_start ? explicit_start : word_start;
1692                                        halfword beg = node_prev(start);
1693                                        ++lmt_language_state.exceptions_count;
1694                                        nothing = 0;
1695                                        tex_aux_do_exception(start, r, replacement); // r == next_node(word_end)
1696                                        if (trace > 1) {
1697                                            tex_begin_diagnostic();
1698                                            tex_print_format("[language: exception %s to %s]", utf8original, replacement);
1699                                            if (trace > 2) {
1700                                                tex_aux_hyphenate_show(node_next(beg), node_prev(r));
1701                                            }
1702                                            tex_end_diagnostic();
1703                                        }
1704                                        lmt_memory_free(replacement);
1705                                        goto PICKUP;
1706                                    }
1707                                  PATTERNS:
1708                                    if (lang->patterns) {
1709                                        if (explicit_start) {
1710                                            /*tex We're done already */
1711                                        } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) {
1712                                            halfword left = word_start;
1713                                            halfword right = r; /*tex We're one after |word_end|. */
1714                                            for (int i = lhmin; i > 1; i--) {
1715                                                left = node_next(left);
1716                                                if (! left || left == right) {
1717                                                    goto PICKUP;
1718                                                }
1719                                            }
1720                                            if (right != left) {
1721                                                int done = 0;
1722                                                for (int i = rhmin; i > 0; i--) {
1723                                                    right = node_prev(right);
1724                                                    if (! right || right == left) {
1725                                                        goto PICKUP;
1726                                                    }
1727                                                }
1728                                                done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata);
1729                                                if (done) { 
1730                                                    ++lmt_language_state.hyphenated_count;
1731                                                    nothing = 0;
1732                                                }
1733                                                if (trace > 1) {
1734                                                    tex_begin_diagnostic();
1735                                                    if (done) {
1736                                                        tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done);
1737                                                        if (trace > 2) {
1738                                                            tex_aux_hyphenate_show(node_next(left), node_prev(right));
1739                                                        }
1740                                                    } else {
1741                                                        tex_print_format("[language: not hyphenated %s]", utf8original);
1742                                                    }
1743                                                    tex_end_diagnostic();
1744                                                }
1745                                            }
1746                                        }
1747                                    }
1748                                }
1749                            }
1750                        }
1751                    }
1752                  PICKUP:
1753                    explicit_start = null ;
1754                    explicit_hyphen = 0;
1755                    word_length = 0;
1756                    utf8ptr = utf8word;
1757                    utf8ori = utf8original;
1758                    if (r) {
1759                        r = tex_aux_find_next_wordstart(r, first_language);
1760                    } else {
1761                        break;
1762                    }
1763                }
1764              ABORT:
1765                tex_flush_node(node_next(tail));
1766                node_next(tail) = saved_tail;
1767            }
1768            /* */
1769            if (penalties_pushed) {
1770                hyphen_penalty_par = saved_hyphen_penalty_par;
1771                ex_hyphen_penalty_par = saved_ex_hyphen_penalty_par;
1772            }
1773            /* */
1774        }
1775        if (nothing) {
1776            ++lmt_language_state.nothing_count;
1777        }
1778    }
1779}
1780
1781halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern)
1782{
1783    halfword prev = node_prev(glyph);
1784    halfword next = node_next(glyph);
1785    halfword disc = tex_new_disc_node(code);
1786    halfword kern = null;
1787    if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) {
1788        kern = node_next(next);
1789        next = node_next(kern);
1790        node_next(kern) = null;
1791    } else { 
1792        node_next(glyph) = null;
1793    }
1794    node_prev(glyph) = null;
1795    tex_attach_attribute_list_copy(disc, glyph);
1796    tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null));
1797    tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null));
1798    tex_set_disc_field(disc, no_break_code, glyph);
1799    tex_try_couple_nodes(prev, disc);
1800    tex_try_couple_nodes(disc, next);
1801    return disc; 
1802}