1
4
5# include "luametatex.h"
6
7
22
/*tex
    The global language state: a growable array of |tex_language| pointers plus
    the bookkeeping record (size, usage and allocation counters) that tracks it.
*/

language_state_info lmt_language_state = {
    .languages = NULL,
    .language_data = {
        .minimum   = min_language_size,
        .maximum   = max_language_size,
        .size      = memory_data_unset,
        .step      = stp_language_size,
        .allocated = 0,
        .itemsize  = 1,
        .top       = 0,
        .ptr       = 0,
        .initial   = memory_data_unset,
        .offset    = 0,
    },
    .handler_table_id = 0,
    .handler_count    = 0,
};
40
41
46
47static void tex_aux_reset_language(halfword id)
48{
49 tex_language *lang = lmt_language_state.languages[id];
50 lang->id = id;
51 lang->exceptions = 0;
52 lang->patterns = NULL;
53 lang->wordhandler = 0;
54 lang->pre_hyphen_char = '-';
55 lang->post_hyphen_char = 0;
56 lang->pre_exhyphen_char = 0;
57 lang->post_exhyphen_char = 0;
58 lang->hyphenation_min = -1;
59 lang->hjcode_head = NULL;
60}
61
62
66
/*tex
    Hand out a language id. When |id| is non negative the caller requests that
    specific slot: it is an error when the slot is already occupied, and the
    array grows when the id lies beyond the current top. When |id| is negative
    the next free id is returned, growing the array by |step| entries when
    needed (clamped to |maximum|).
*/

static halfword tex_aux_new_language_id(halfword id)
{
    int top;
    if (id >= 0) {
        if (id <= lmt_language_state.language_data.top) {
            if (lmt_language_state.languages[id]) {
                return tex_formatted_error("languages", "the language with id %d is already created", id);
            } else {
                return id;
            }
        } else if (id > lmt_language_state.language_data.maximum) {
            goto OVERFLOWERROR;
        } else {
            /* The requested id lies beyond the array: grow up to it. */
            top = id;
        }
    } else if (lmt_language_state.language_data.ptr < lmt_language_state.language_data.top) {
        /* There still is room: hand out the next id. */
        ++lmt_language_state.language_data.ptr;
        return lmt_language_state.language_data.ptr;
    } else if (lmt_language_state.language_data.top >= lmt_language_state.language_data.maximum) {
        goto OVERFLOWERROR;
    } else if (lmt_language_state.language_data.top + lmt_language_state.language_data.step > lmt_language_state.language_data.maximum) {
        /* A full step would overshoot the maximum, so clamp. */
        top = lmt_language_state.language_data.maximum;
    } else {
        top = lmt_language_state.language_data.top + lmt_language_state.language_data.step;
    }
    /* Grow the pointer array and clear the freshly added slots. */
    {
        tex_language **tmp = aux_reallocate_array(lmt_language_state.languages, sizeof(tex_language *), top, 0);
        if (tmp) {
            /* NOTE(review): slots |top + 1 .. top| (inclusive) are cleared, so
               this assumes |aux_reallocate_array| reserves at least |top + 1|
               entries — confirm against its implementation. */
            for (int i = lmt_language_state.language_data.top + 1; i <= top; i++) {
                tmp[i] = NULL;
            }
            lmt_language_state.languages = tmp;
            lmt_language_state.language_data.allocated += ((size_t) top - lmt_language_state.language_data.top) * sizeof(tex_language *);
            lmt_language_state.language_data.top = top;
            /* NOTE(review): after growing for an explicit |id| this returns the
               bumped |ptr|, not |id| itself; that only coincides when ids are
               created in increasing order — confirm intent. */
            lmt_language_state.language_data.ptr += 1;
            return lmt_language_state.language_data.ptr;
        }
    }
  OVERFLOWERROR:
    tex_overflow_error("languages", lmt_language_state.language_data.maximum);
    return 0;
}
110
/*tex
    Allocate the initial (minimum sized) array of language pointers. Also used
    as fallback when undumping finds no language data.
*/

void tex_initialize_languages(void)
{
    tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), lmt_language_state.language_data.minimum, 0);
    if (tmp) {
        /* The allocator already clears; this loop is belt and braces (the
           undump path relies on the cleared allocation without such a loop). */
        for (int i = 0; i < lmt_language_state.language_data.minimum; i++) {
            tmp[i] = NULL;
        }
        lmt_language_state.languages = tmp;
        lmt_language_state.language_data.allocated += lmt_language_state.language_data.minimum * sizeof(tex_language *);
        lmt_language_state.language_data.top = lmt_language_state.language_data.minimum;
    } else {
        tex_overflow_error("languages", lmt_language_state.language_data.minimum);
    }
}
125
126
132
133int tex_is_valid_language(halfword n)
134{
135 if (n == 0) {
136 return 1;
137 } else if (n > 0 && n <= lmt_language_state.language_data.top) {
138 return lmt_language_state.languages[n] ? 1 : 0;
139 } else {
140 return 0;
141 }
142}
143
144tex_language *tex_new_language(halfword n)
145{
146 halfword id = tex_aux_new_language_id(n);
147 if (id >= 0) {
148 tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
149 if (lang) {
150 lmt_language_state.languages[id] = lang;
151 lmt_language_state.language_data.allocated += sizeof(struct tex_language);
152 tex_aux_reset_language(id);
153 if (saving_hyph_codes_par) {
154
158 tex_hj_codes_from_lc_codes(id);
159 }
160 } else {
161 tex_overflow_error("language", sizeof(struct tex_language));
162 }
163 return lang;
164 } else {
165 return NULL;
166 }
167}
168
169tex_language *tex_get_language(halfword n)
170{
171 if (n >= 0) {
172 if (n <= lmt_language_state.language_data.top && lmt_language_state.languages[n]) {
173 return lmt_language_state.languages[n];
174 }
175 if (n <= lmt_language_state.language_data.maximum) {
176 return tex_new_language(n);
177 }
178 }
179 return NULL;
180}
181
182
185
186
197
/*tex
    Dump the language records into the format file: |top| and |ptr| first, then
    per slot a 1/0 tag telling whether a record (plus its hj codes) follows.
    Patterns and exceptions live on the \LUA\ side and are not dumped here.
*/

void tex_dump_language_data(dumpstream f)
{
    dump_int(f, lmt_language_state.language_data.top);
    dump_int(f, lmt_language_state.language_data.ptr);
    if (lmt_language_state.language_data.top > 0) {
        /* NOTE(review): slots |0 .. top-1| are dumped here, while validity
           checks elsewhere accept |id <= top|; confirm the intended bound. */
        for (int i = 0; i < lmt_language_state.language_data.top; i++) {
            tex_language *lang = lmt_language_state.languages[i];
            if (lang) {
                dump_via_int(f, 1);
                dump_int(f, lang->id);
                dump_int(f, lang->pre_hyphen_char);
                dump_int(f, lang->post_hyphen_char);
                dump_int(f, lang->pre_exhyphen_char);
                dump_int(f, lang->post_exhyphen_char);
                dump_int(f, lang->hyphenation_min);
                tex_dump_language_hj_codes(f, i);
            } else {
                dump_via_int(f, 0);
            }
        }
    }
}
220
/*tex
    The inverse of dumping: rebuild the language array from the format file.
    Exceptions, patterns and word handlers are not part of the dump, so they
    start out empty. When no languages were dumped we fall back to a fresh
    initialization.
*/

void tex_undump_language_data(dumpstream f)
{
    int top, ptr;
    undump_int(f, top);
    undump_int(f, ptr);
    if (top > 0) {
        tex_language **tmp = aux_allocate_clear_array(sizeof(tex_language *), top, 0);
        if (tmp) {
            lmt_language_state.language_data.top = top;
            lmt_language_state.language_data.ptr = ptr;
            lmt_language_state.languages = tmp;
            for (int i = 0; i < top; i++) {
                int x;
                undump_int(f, x);
                if (x == 1) {
                    tex_language *lang = lmt_memory_malloc(sizeof(struct tex_language));
                    if (lang) {
                        lmt_language_state.languages[i] = lang;
                        lmt_language_state.language_data.allocated += sizeof(struct tex_language);
                        /* These are not dumped; start with empty state. */
                        lang->exceptions = 0;
                        lang->patterns = NULL;
                        lang->wordhandler = 0;
                        lang->hjcode_head = NULL;
                        undump_int(f, lang->id);
                        undump_int(f, lang->pre_hyphen_char);
                        undump_int(f, lang->post_hyphen_char);
                        undump_int(f, lang->pre_exhyphen_char);
                        undump_int(f, lang->post_exhyphen_char);
                        undump_int(f, lang->hyphenation_min);
                        tex_undump_language_hj_codes(f, i);
                        /* A mismatch indicates a corrupted format; repair and warn. */
                        if (lang->id != i) {
                            tex_formatted_warning("languages", "undumped language id mismatch: %d <> %d", lang->id, i);
                            lang->id = i;
                        }
                    } else {
                        tex_overflow_error("languages", i);
                    }
                    tmp[i] = lang;
                } else {
                    tmp[i] = NULL;
                }
            }
            lmt_language_state.language_data.initial = lmt_language_state.language_data.ptr;
        } else {
            tex_overflow_error("languages", top);
            lmt_language_state.language_data.initial = 0;
        }
    } else {
        /* Nothing was dumped: set up the default array. */
        tex_initialize_languages();
    }
}
273
274
275
276void tex_set_pre_hyphen_char(halfword n, halfword v)
277{
278 struct tex_language *l = tex_get_language(n);
279 if (l) {
280 l->pre_hyphen_char = v;
281 }
282}
283
284void tex_set_post_hyphen_char(halfword n, halfword v)
285{
286 struct tex_language *l = tex_get_language(n);
287 if (l) {
288 l->post_hyphen_char = v;
289 }
290}
291
292void tex_set_pre_exhyphen_char(halfword n, halfword v)
293{
294 struct tex_language *l = tex_get_language(n);
295 if (l) {
296 l->pre_exhyphen_char = v;
297 }
298}
299
300void tex_set_post_exhyphen_char(halfword n, halfword v)
301{
302 struct tex_language *l = tex_get_language(n);
303 if (l) {
304 l->post_exhyphen_char = v;
305 }
306}
307
308halfword tex_get_pre_hyphen_char(halfword n)
309{
310 struct tex_language *l = tex_get_language(n);
311 return l ? l->pre_hyphen_char : -1;
312}
313
314halfword tex_get_post_hyphen_char(halfword n)
315{
316 struct tex_language *l = tex_get_language(n);
317 return l ? l->post_hyphen_char : -1;
318}
319
320halfword tex_get_pre_exhyphen_char(halfword n)
321{
322 struct tex_language *l = tex_get_language(n);
323 return l ? l->pre_exhyphen_char : -1;
324}
325
326halfword tex_get_post_exhyphen_char(halfword n)
327{
328 struct tex_language *l = tex_get_language(n);
329 return (l) ? (int) l->post_exhyphen_char : -1;
330}
331
332void tex_set_hyphenation_min(halfword n, halfword v)
333{
334 struct tex_language *l = tex_get_language(n);
335 if (l) {
336 l->hyphenation_min = v;
337 }
338}
339
340halfword tex_get_hyphenation_min(halfword n)
341{
342 struct tex_language *l = tex_get_language((int) n);
343 return l ? l->hyphenation_min : -1;
344}
345
346void tex_load_patterns(struct tex_language *lang, const unsigned char *buff)
347{
348 if ((! lang) || (! buff) || strlen((const char *) buff) == 0) {
349 return;
350 } else {
351 if (! lang->patterns) {
352 lang->patterns = hnj_dictionary_new();
353 }
354 hnj_dictionary_load(lang->patterns, buff, tracing_hyphenation_par > 0);
355 }
356}
357
358void tex_clear_patterns(struct tex_language *lang)
359{
360 if (lang && lang->patterns) {
361 hnj_dictionary_clear(lang->patterns);
362 }
363}
364
365void tex_load_tex_patterns(halfword curlang, halfword head)
366{
367
368 char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1);
369 if (s) {
370 tex_load_patterns(tex_get_language(curlang), (unsigned char *) s);
371 }
372}
373
374
377
378
/*tex
    In exception and hyphenation scanning only a real space terminates a word.
    The argument is parenthesized so the macro stays safe for any expression.
*/

# define tex_isspace(c) ((c) == ' ')
380
/*tex
    Normalize one exception word taken from |buff|: read up to the first space,
    map characters through the language's hj codes, and validate the special
    syntax (|-| marks a break, |=| stands for a literal hyphen, |{pre}{post}{replace}|
    a full discretionary, optionally followed by |(alternative)| and |[digit]|
    penalty). On success |*cleaned| receives a freshly allocated normalized
    copy (caller frees); on a syntax error |*cleaned| is |NULL|. The return
    value is the position in |buff| just after the scanned word.
*/

const char *tex_clean_hyphenation(halfword id, const char *buff, char **cleaned)
{
    int items = 0;
    unsigned char word[max_size_of_word + 1];
    unsigned uword[max_size_of_word + 1] = { 0 };
    int i = 0;
    char *uindex = (char *) word;
    const char *s = buff;
    /* Copy the raw bytes of one space-delimited word, guarding the buffer. */
    while (*s && ! tex_isspace((unsigned char)*s)) {
        word[i++] = (unsigned char) *s;
        s++;
        if ((s-buff) > max_size_of_word) {
            *cleaned = NULL;
            tex_handle_error(
                normal_error_type,
                "Exception too long",
                NULL
            );
            return s;
        }
    }
    word[i] = '\0';
    /* Expand the \UTF-8 bytes into an array of unicode values. */
    aux_splitutf2uni(uword, (const char *)word);
    i = 0;
    while (uword[i] > 0) {
        int u = uword[i++];
        if (u == '-') {
            /* A break position: dropped from the cleaned key. */
        } else if (u == '=') {
            /* A literal hyphen, subject to hj code remapping. */
            unsigned c = tex_get_hj_code(id, '-');
            uindex = aux_uni2string(uindex, (! c || c <= 32) ? '-' : c);
        } else if (u == '{') {
            /* Skip the pre part. */
            u = uword[i++];
            items = 0;
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            /* Skip the post part. */
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            /* The replace part contributes to the cleaned key. */
            if (u == '{') {
                u = uword[i++];
            }
            while (u && u != '}') {
                unsigned c = tex_get_hj_code(id, u);
                uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
                u = uword[i++];
            }
            if (u == '}') {
                items++;
            }
            if (items != 3) {
                *cleaned = NULL;
                tex_handle_error(
                    normal_error_type,
                    "Exception syntax error, a discretionary has three components: {}{}{}.",
                    NULL
                );
                return s;
            } else {
                /* Optional |(alternative replacement)|: validated, not kept in the key. */
                if (uword[i] == '(') {
                    while (uword[++i] && uword[i] != ')') { };
                    if (uword[i] != ')') {
                        tex_handle_error(
                            normal_error_type,
                            "Exception syntax error, an alternative replacement is defined as (text).",
                            NULL
                        );
                        return s;
                    } else if (uword[i]) {
                        i++;
                    }
                }
                /* Optional |[digit]| penalty: validated, not kept in the key. */
                if (uword[i] == '[') {
                    if (uword[i+1] && uword[i+1] >= '0' && uword[i+1] <= '9' && uword[i+2] && uword[i+2] == ']') {
                        i += 3;
                    } else {
                        tex_handle_error(
                            normal_error_type,
                            "Exception syntax error, a penalty is defined as [digit].",
                            NULL
                        );
                        return s;
                    }
                }
            }
        } else {
            /* An ordinary character, remapped through the hj codes. */
            unsigned c = tex_get_hj_code(id, u);
            uindex = aux_uni2string(uindex, (! c || c <= 32) ? u : c);
        }
    }
    *uindex = '\0';
    *cleaned = lmt_memory_strdup((char *) word);
    return s;
}
496
/*tex
    Load a whitespace separated list of hyphenation exceptions into the
    language's exception table, which lives in the \LUA\ registry (created on
    first use). For each word the cleaned form is the key and the original
    (with break markers) the value.
*/

void tex_load_hyphenation(struct tex_language *lang, const unsigned char *buff)
{
    if (lang) {
        lua_State *L = lmt_lua_state.lua_instance;
        const char *s = (const char *) buff;
        char *cleaned = NULL;
        int id = lang->id;
        if (lang->exceptions == 0) {
            /* First exception for this language: anchor a fresh table in the registry. */
            lua_newtable(L);
            lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
        }
        lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
        while (*s) {
            while (tex_isspace((unsigned char) *s)) {
                s++;
            }
            if (*s) {
                const char *value = s;
                s = tex_clean_hyphenation(id, s, &cleaned);
                if (cleaned) {
                    size_t len = s - value;
                    if (len > 0) {
                        /* exceptions[cleaned] = original spelling with markers */
                        lua_pushstring(L, cleaned);
                        lua_pushlstring(L, value, len);
                        lua_rawset(L, -3);
                    }
                    lmt_memory_free(cleaned);
                } else {
                    /* A syntax error was already reported; skip this word. */
                }
            }
        }
        lua_pop(L, 1);
    }
}
532
533void tex_clear_hyphenation(struct tex_language *lang)
534{
535 if (lang && lang->exceptions != 0) {
536 lua_State *L = lmt_lua_state.lua_instance;
537 luaL_unref(L, LUA_REGISTRYINDEX, lang->exceptions);
538 lang->exceptions = 0;
539 }
540}
541
542void tex_load_tex_hyphenation(halfword curlang, halfword head)
543{
544 char *s = tex_tokenlist_to_tstring(head, 1, NULL, 0, 0, 0, 0, 1);
545 if (s) {
546 tex_load_hyphenation(tex_get_language(curlang), (unsigned char *) s);
547 }
548}
549
/*tex
    Create a discretionary node with the given break texts and penalty and link
    it into the list at |t|. When |t| itself is the replacement, the disc takes
    |t|'s place in the list and |t| moves into the no-break field; otherwise
    the disc is inserted right after |t|. Returns the new disc node.
*/

static halfword tex_aux_insert_discretionary(halfword t, halfword pre, halfword post, halfword replace, quarterword subtype, int penalty)
{
    halfword d = tex_new_disc_node(subtype);
    halfword a = node_attr(t) ;
    disc_penalty(d) = penalty;
    if (t == replace) {
        /* The disc replaces |t| in the list; |t| is detached first. */
        tex_try_couple_nodes(d, node_next(t));
        tex_try_couple_nodes(node_prev(t), d);
        node_prev(t) = null;
        node_next(t) = null;
        replace = t;
    } else {
        /* Link the disc in right after |t|. */
        tex_try_couple_nodes(d, node_next(t));
        tex_couple_nodes(t, d);
    }
    if (a) {
        tex_attach_attribute_list_attribute(d, a);
    }
    tex_set_disc_field(d, pre_break_code, pre);
    tex_set_disc_field(d, post_break_code, post);
    tex_set_disc_field(d, no_break_code, replace);
    return d;
}
576
/*tex
    Insert a syllable discretionary right after glyph |t|, with the language's
    pre/post hyphen characters (when positive) as break texts and the current
    |\hyphenpenalty|. Returns the new disc node.
*/

static halfword tex_aux_insert_syllable_discretionary(halfword t, lang_variables *lan)
{
    halfword n = tex_new_disc_node(syllable_discretionary_code);
    disc_penalty(n) = hyphen_penalty_par;
    tex_couple_nodes(n, node_next(t));
    tex_couple_nodes(t, n);
    tex_attach_attribute_list_attribute(n, get_attribute_list(t));
    if (lan->pre_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->pre_hyphen_char, t);
        tex_set_disc_field(n, pre_break_code, g);
    }
    if (lan->post_hyphen_char > 0) {
        halfword g = tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), lan->post_hyphen_char, t);
        tex_set_disc_field(n, post_break_code, g);
    }
    return n;
}
594
595static halfword tex_aux_compound_word_break(halfword t, halfword clang, halfword chr)
596{
597 halfword prechar, postchar, pre, post, disc;
598 if (chr == ex_hyphen_char_par) {
599 halfword pre_exhyphen_char = tex_get_pre_exhyphen_char(clang);
600 halfword post_exhyphen_char = tex_get_post_exhyphen_char(clang);
601 prechar = pre_exhyphen_char > 0 ? pre_exhyphen_char : ex_hyphen_char_par;
602 postchar = post_exhyphen_char > 0 ? post_exhyphen_char : null;
603 } else {
604
605 prechar = chr;
606 postchar = null;
607 }
608 pre = prechar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), prechar, t) : null;
609 post = postchar > 0 ? tex_new_glyph_node(glyph_unset_subtype, glyph_font(t), postchar, t) : null;
610 disc = tex_aux_insert_discretionary(t, pre, post, t, automatic_discretionary_code, tex_automatic_disc_penalty(glyph_hyphenate(t)));
611 return disc;
612}
613
614static char *tex_aux_hyphenation_exception(int exceptions, char *w)
615{
616 lua_State *L = lmt_lua_state.lua_instance;
617 char *ret = NULL;
618 if (lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions) == LUA_TTABLE) {
619
620 lua_pushstring(L, w);
621 lua_rawget(L, -2);
622 if (lua_type(L, -1) == LUA_TSTRING) {
623 ret = lmt_memory_strdup(lua_tostring(L, -1));
624 }
625 lua_pop(L, 2);
626 } else {
627 lua_pop(L, 1);
628 }
629 return ret;
630}
631
632
638
/*tex Unicode zero width characters recognized inside exception parts. */

# define zws 0x200B /* zero width space */
# define zwnj 0x200C /* zero width non-joiner */
# define zwj 0x200D /* zero width joiner */
642
/*tex
    Build a glyph list for one braced (or parenthesized) part of an exception
    pattern, starting at |*j| in |uword| and ending before |final|. Zero width
    joiner characters are not materialized; instead they set no-ligature /
    no-kern options on the glyphs around them. On return |*j| points past the
    consumed part. Returns the head of the new list (or |null| when empty).
*/

static halfword tex_aux_find_exception_part(unsigned int *j, unsigned int *uword, int len, halfword parent, char final)
{
    halfword head = null;
    halfword tail = null;
    unsigned i = *j;
    int noligature = 0;
    int nokerning = 0;
    /* Step over the opening delimiter; characters are read at |uword[i + 1]|. */
    i++;
    while (i < (unsigned) len && uword[i + 1] != (unsigned int) final) {
        if (tail) {
            switch (uword[i + 1]) {
                /* NOTE(review): |zwj| suppresses only ligatures while |zwnj|
                   suppresses ligatures and kerns here — confirm that this
                   mapping is the intended one. */
                case zwj:
                    noligature = 1;
                    nokerning = 0;
                    break;
                case zwnj:
                    noligature = 1;
                    nokerning = 1;
                    break;
                default:
                    {
                        halfword s = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent);
                        tex_couple_nodes(tail, s);
                        if (noligature) {
                            tex_add_glyph_option(tail, glyph_option_no_right_ligature);
                            tex_add_glyph_option(s, glyph_option_no_left_ligature);
                            noligature = 0;
                        }
                        if (nokerning) {
                            tex_add_glyph_option(tail, glyph_option_no_right_kern);
                            tex_add_glyph_option(s, glyph_option_no_left_kern);
                            nokerning = 0;
                        }
                        tail = node_next(tail);
                        break;
                    }
            }
        } else {
            /* First character starts the list. */
            head = tex_new_glyph_node(glyph_unset_subtype, glyph_font(parent), (int) uword[i + 1], parent);
            tail = head;
        }
        i++;
    }
    *j = ++i;
    return head;
}
690
/*tex
    Count the characters of a braced exception part without building nodes,
    using the same one-ahead indexing as |tex_aux_find_exception_part|. On
    return |*j| points past the consumed part.
*/

static int tex_aux_count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int count = 0;
    unsigned k = *j + 1; /* step over the opening brace */
    while (k < (unsigned) len && uword[k + 1] != '}') {
        count++;
        k++;
    }
    *j = k + 1;
    return count;
}
704
/*tex
    Report a malformed exception discretionary; |part| names the offending
    component ("pre", "post" or "replace").
*/

static void tex_aux_show_exception_error(const char *part)
{
    tex_handle_error(
        normal_error_type,
        "Invalid %s part in exception",
        part,
        "Exception discretionaries should contain three pairs of braced items.\n"
        "No intervening spaces are allowed."
    );
}
715
716
722
/*tex
    Apply a stored exception |replacement| to the glyph run starting at
    |wordstart| and ending before |r|. The replacement uses the exception
    syntax: |-| inserts a syllable discretionary, |=| skips a glyph,
    |{pre}{post}{replace}| (optionally followed by |(alternative)| and
    |[digit]| for a scaled penalty) splices in a full discretionary. Note that
    characters are consistently read one ahead at |uword[i + 1]|.
*/

static void tex_aux_do_exception(halfword wordstart, halfword r, char *replacement)
{
    halfword t = wordstart;
    lang_variables langdata;
    unsigned uword[max_size_of_word + 1] = { 0 };
    unsigned len = aux_splitutf2uni(uword, replacement);
    int clang = get_glyph_language(wordstart);
    langdata.pre_hyphen_char = tex_get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = tex_get_post_hyphen_char(clang);
    for (unsigned i = 0; i < len; i++) {
        if (uword[i + 1] == 0 ) {
            /* End of the replacement string. */
            break;
        } else if (uword[i + 1] == '-') {
            /* A plain break position. */
            if (node_next(t) == r) {
                break;
            } else {
                tex_aux_insert_syllable_discretionary(t, &langdata);
                /* Skip the inserted disc node. */
                t = node_next(t);
            }
        } else if (uword[i + 1] == '=') {
            /* A literal hyphen: just move on. */
            t = node_next(t);
        } else if (uword[i + 1] == '{') {
            /* A full |{pre}{post}{replace}| discretionary specification. */
            halfword pre = null;
            halfword post = null;
            halfword replace = null;
            int count = 0;
            /* NOTE(review): declared |int| but assigned a |halfword| node below;
               presumably those types coincide — confirm. */
            int alternative = null;
            halfword penalty;
            /* The pre part. */
            pre = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("pre");
            }
            /* The post part. */
            post = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, '}');
            if (i == len || uword[i + 1] != '{') {
                tex_aux_show_exception_error("post");
            }
            /* The replace part only contributes a glyph count. */
            count = tex_aux_count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_aux_show_exception_error("replace");
            } else if (uword[i] && uword[i + 1] == '(') {
                /* An optional alternative replacement list. */
                alternative = tex_aux_find_exception_part(&i, uword, (int) len, wordstart, ')');;
            }
            if (node_next(t) == r) {
                break;
            } else {
                if (count > 0) {
                    /* Detach |count| glyphs/discs after |t|; they become the replace text. */
                    halfword q = t;
                    replace = node_next(q);
                    while (count > 0 && q) {
                        halfword t = node_type(q);
                        q = node_next(q);
                        if (t == glyph_node || t == disc_node) {
                            count--;
                        } else {
                            break ;
                        }
                    }
                    tex_try_couple_nodes(t, node_next(q));
                    node_next(q) = null;
                    if (alternative) {
                        /* The alternative list wins over the detached original. */
                        tex_flush_node_list(replace);
                        replace = alternative;
                    } else {
                        /* Flatten nested discs in the detached list: keep only their no-break text. */
                        q = replace ;
                        while (q) {
                            halfword n = node_next(q);
                            if (node_type(q) == disc_node) {
                                halfword nb = disc_no_break_head(q);
                                disc_no_break_head(q) = null;
                                node_prev(nb) = null ;
                                if (q == replace) {
                                    replace = nb;
                                } else {
                                    tex_try_couple_nodes(node_prev(q), nb);
                                }
                                tex_try_couple_nodes(nb, n);
                                tex_flush_node(q);
                            }
                            q = n ;
                        }
                    }
                }
                /* An optional |[digit]| scales the exception penalty. */
                if (uword[i] && uword[i + 1] == '[') {
                    i += 2;
                    if (uword[i] && uword[i] >= '0' && uword[i] <= '9') {
                        if (exception_penalty_par > 0) {
                            if (exception_penalty_par > infinite_penalty) {
                                penalty = exception_penalty_par;
                            } else {
                                penalty = (uword[i] - '0') * exception_penalty_par ;
                            }
                        } else {
                            penalty = hyphen_penalty_par;
                        }
                        ++i;
                        while (uword[i] && uword[i] != ']') {
                            ++i;
                        }
                    } else {
                        penalty = hyphen_penalty_par;
                    }
                } else {
                    penalty = hyphen_penalty_par;
                }
                t = tex_aux_insert_discretionary(t, pre, post, replace, normal_discretionary_code, penalty);
                /* Skip the new disc node. */
                t = node_next(t);
                /* Two adjacent specs: back up so the next |{| is seen again. */
                if (uword[i] && uword[i + 1] == '{') {
                    i--;
                    t = node_prev(t);
                }
            }
        } else {
            /* An ordinary character: advance along the word. */
            t = node_next(t);
        }
        /* Stop when we ran off the list or reached the word end. */
        if (! t || node_next(t) == r) {
            break;
        }
    }
}
868
869
963
964inline static halfword tex_aux_is_hyphen_char(halfword chr)
965{
966 if (tex_get_hc_code(chr)) {
967 return tex_get_hc_code(chr);
968 } else if (chr == ex_hyphen_char_par) {
969 return chr;
970 } else {
971 return null;
972 }
973}
974
/*tex
    Scan forward from |r| for the first glyph that may start a hyphenatable
    word. The |start_ok| flag tracks whether the nodes seen so far permit a
    word start (boundaries and glue enable it, strict-start boxes and rules can
    forbid it). Inline math is skipped as a whole. Explicit hyphen glyphs may
    be turned into compound word breaks on the way. Returns the word start
    glyph or |null| when none remains.
*/

static halfword tex_aux_find_next_wordstart(halfword r, halfword first_language)
{
    int start_ok = 1;
    halfword lastglyph = r;
    while (r) {
        switch (node_type(r)) {
            case boundary_node:
                if (node_subtype(r) == word_boundary) {
                    start_ok = 1;
                }
                break;
            case disc_node:
                /* A disc permits a following word start only when flagged so. */
                start_ok = has_disc_option(r, disc_option_post_word);
                break;
            case hlist_node:
            case vlist_node:
            case rule_node:
            case dir_node:
            case whatsit_node:
                if (hyphenation_permitted(glyph_hyphenate(lastglyph), strict_start_hyphenation_mode)) {
                    start_ok = 0;
                }
                break;
            case glue_node:
                start_ok = 1;
                break;
            case math_node:
                /* Skip a complete inline math formula, tracking nesting. */
                if (node_subtype(r) == begin_inline_math) {
                    int mathlevel = 1;
                    while (mathlevel > 0) {
                        r = node_next(r);
                        if (! r) {
                            return r;
                        } else if (node_type(r) == math_node) {
                            if (node_subtype(r) == begin_inline_math) {
                                mathlevel++;
                            } else {
                                mathlevel--;
                            }
                        }
                    }
                }
                break;
            case glyph_node:
                {
                    int chr = glyph_character(r);
                    int hyp = tex_aux_is_hyphen_char(chr);
                    lastglyph = r;
                    if (hyp) {
                        if (hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
                            /* Treat the hyphen like any other character. */
                        } else {
                            halfword t = node_next(r) ;
                            /* A single hyphen between characters becomes a compound word break. */
                            if (t && (node_type(t) == glyph_node) && (! tex_aux_is_hyphen_char(glyph_character(t))) && ! hyphenation_permitted(glyph_hyphenate(r), automatic_hyphenation_mode)) {
                                r = tex_aux_compound_word_break(r, get_glyph_language(r), hyp);
                                start_ok = 1;
                            } else {
                                /* Skip a run of consecutive hyphen glyphs. */
                                while (t && (node_type(t) == glyph_node) && tex_aux_is_hyphen_char(glyph_character(t))) {
                                    r = t ;
                                    t = node_next(r) ;
                                }
                                if (t) {
                                    /* No word start directly after a hyphen run. */
                                    start_ok = 0;
                                } else {
                                    /* The list ends in hyphens: nothing to hyphenate. */
                                    return null;
                                }
                            }
                        }
                    } else if (start_ok && (get_glyph_language(r) >= first_language) && get_glyph_dohyph(r)) {
                        int l = tex_get_hj_code(get_glyph_language(r), chr);
                        if (l > 0) {
                            if (l == chr || l <= 32 || get_glyph_uchyph(r)) {
                                return r;
                            } else {
                                start_ok = 0;
                            }
                        } else {
                            /* Not a word character: keep scanning. */
                        }
                    } else {
                        /* Not allowed to start here: keep scanning. */
                    }
                }
                break;
            default:
                start_ok = 0;
                break;
        }
        r = node_next(r);
    }
    return r;
}
1078
1079
1124
1125static int tex_aux_valid_wordend(halfword end_word, halfword r)
1126{
1127 if (r) {
1128 switch (node_type(r)) {
1129
1130
1131
1132
1133
1134 case disc_node:
1135 return has_disc_option(r, disc_option_pre_word);
1136 case hlist_node:
1137 case vlist_node:
1138 case rule_node:
1139 case dir_node:
1140 case whatsit_node:
1141 case insert_node:
1142 case adjust_node:
1143 return ! hyphenation_permitted(glyph_hyphenate(end_word), strict_end_hyphenation_mode);
1144 }
1145 }
1146 return 1;
1147}
1148
1149void tex_handle_hyphenation(halfword head, halfword tail)
1150{
1151 if (head && node_next(head)) {
1152 int callback_id = lmt_callback_defined(hyphenate_callback);
1153 if (callback_id > 0) {
1154 lua_State *L = lmt_lua_state.lua_instance;
1155 int top = 0;
1156 if (lmt_callback_okay(L, callback_id, &top)) {
1157 int i;
1158 lmt_node_list_to_lua(L, head);
1159 lmt_node_list_to_lua(L, tail);
1160 i = lmt_callback_call(L, 2, 0, top);
1161 if (i) {
1162 lmt_callback_error(L, top, i);
1163 } else {
1164 lmt_callback_wrapup(L, top);
1165 }
1166 }
1167 } else if (callback_id == 0) {
1168 tex_hyphenate_list(head, tail);
1169 } else {
1170
1171 }
1172 }
1173}
1174
/*tex
    Run the pattern automaton over the glyph run |first .. last| (of |length|
    characters) and insert syllable discretionaries between |left| and |right|.
    The word is virtually wrapped in |.| markers via the static |begin_period|
    and |end_period| nodes; the |hyphens| array collects the maximum pattern
    values per inter-character position, and odd values mark break points.
    Returns the number of discretionaries inserted.
*/

static int tex_aux_hnj_hyphen_hyphenate(
    hjn_dictionary *dict,
    halfword first,
    halfword last,
    int length,
    halfword left,
    halfword right,
    lang_variables *lan
)
{
    /* Two extra slots for the leading and trailing |.| markers. */
    int ext_word_len = length + 2;
    int hyphen_len = ext_word_len + 1;
    char *hyphens = lmt_memory_calloc(hyphen_len, sizeof(unsigned char));
    if (hyphens) {
        halfword here;
        int state = 0;
        int char_num = 0;
        int done = 0;
        /* Temporarily splice the period markers around the word. */
        node_next(begin_period) = first;
        node_next(end_period) = node_next(last);
        node_next(last) = end_period;
        for (char_num = 0, here = begin_period; here != node_next(end_period); here = node_next(here)) {
            int ch;
            if (here == begin_period || here == end_period) {
                ch = '.';
            } else {
                /* Low hj codes act as classes; fall back to the raw character. */
                ch = tex_get_hj_code(get_glyph_language(here), glyph_character(here));
                if (ch <= 32) {
                    ch = glyph_character(here);
                }
            }
            /* Walk the automaton, following fallback links on mismatch. */
            while (state != -1) {
                hjn_state *hstate = &dict->states[state];
                for (int k = 0; k < hstate->num_trans; k++) {
                    if (hstate->trans[k].uni_ch == ch) {
                        char *match;
                        state = hstate->trans[k].new_state;
                        match = dict->states[state].match;
                        if (match) {
                            /* Merge the matched pattern values, keeping maxima. */
                            int offset = (int) (char_num + 2 - (int) strlen(match));
                            for (int m = 0; match[m]; m++) {
                                if (hyphens[offset + m] < match[m]) {
                                    hyphens[offset + m] = match[m];
                                }
                            }
                        }
                        goto NEXTLETTER;
                    }
                }
                state = hstate->fallback_state;
            }
            /* No transition at all: restart from the root state. */
            state = 0;
          NEXTLETTER:;
            char_num++;
        }
        /* Undo the splice of the trailing period marker. */
        node_next(last) = node_next(end_period);
        /* Skip positions before |left| (protected by lefthyphenmin). */
        for (here = first, char_num = 2; here != left; here = node_next(here)) {
            char_num++;
        }
        for (; here != right; here = node_next(here)) {
            /* Odd pattern values mark valid break points. */
            if (hyphens[char_num] & 1) {
                here = tex_aux_insert_syllable_discretionary(here, lan);
                done += 1;
            }
            char_num++;
        }
        lmt_memory_free(hyphens);
        return done;
    } else {
        tex_overflow_error("patterns", hyphen_len);
        return 0;
    }
}
1271
1272
1273
/*tex
    Sanity check after a word handler ran: verify that |f .. l| is still a run
    of |n| glyph nodes followed by |r|, spelling exactly |utf8original|. Any
    discrepancy is reported and makes the word unsuitable for (further)
    hyphenation. Returns 1 when everything still matches.
*/

static int tex_aux_still_okay(halfword f, halfword l, halfword r, int n, const char *utf8original) {
    if (_valid_node_(f) && _valid_node_(l) && node_next(l) == r) {
        int i = 0;
        while (f) {
            ++i;
            if (node_type(f) != glyph_node) {
                tex_normal_warning("language", "the hyphenated word contains non-glyphs, skipping");
                return 0;
            } else {
                int cl;
                /* Decode the next original character and compare. */
                halfword c = (halfword) aux_str2uni_len((const unsigned char *) utf8original, &cl);
                utf8original += cl;
                if (! (c && c == glyph_character(f))) {
                    tex_normal_warning("language", "the hyphenated word contains different characters, skipping");
                    return 0;
                } else if (f != l) {
                    f = node_next(f);
                } else if (i == n) {
                    /* Reached the last glyph with the expected length. */
                    return 1;
                } else {
                    tex_normal_warning("language", "the hyphenated word changed length, skipping");
                    return 0;
                }
            }
        }
    }
    tex_normal_warning("language", "the hyphenation list is messed up, skipping");
    return 0;
}
1303
1304static void tex_aux_hyphenate_show(halfword beg, halfword end)
1305{
1306 if (_valid_node_(beg) && _valid_node_(end)) {
1307 halfword nxt = node_next(end);
1308 node_next(end) = null;
1309 tex_show_node_list(beg, 100, 10000);
1310 node_next(end) = nxt;
1311 }
1312}
1313
1314
1315
1316inline static int is_traditional_hyphen(halfword n)
1317{
1318 return (
1319 (glyph_character(n) == ex_hyphen_char_par)
1320 && (has_font_text_control(glyph_font(n),text_control_collapse_hyphens))
1321 && (hyphenation_permitted(glyph_hyphenate(n), collapse_hyphenation_mode))
1322 );
1323}
1324
/*tex
    Collapse runs of two or three traditional hyphen glyphs into a single glyph
    showing |c2| (en dash like) or |c3| (em dash like); a lone hyphen gets
    |c1|. Disc nodes are processed recursively in all three fields. Returns
    nonzero when at least one collapse happened at this level.
*/

int tex_collapse_list(halfword head, halfword c1, halfword c2, halfword c3)
{
    halfword found = 0;
    if (head && c1 && c2 && c3) {
        halfword n1 = head;
        while (n1) {
            halfword n2 = node_next(n1);
            switch (node_type(n1)) {
                case glyph_node:
                    if (is_traditional_hyphen(n1)) {
                        set_glyph_discpart(n1, glyph_discpart_always);
                        if (n2 && node_type(n2) == glyph_node && is_traditional_hyphen(n2) && glyph_font(n1) == glyph_font(n2)) {
                            halfword n3 = node_next(n2);
                            if (n3 && node_type(n3) == glyph_node && is_traditional_hyphen(n3) && glyph_font(n1) == glyph_font(n3)) {
                                /* Three hyphens collapse into |c3|. */
                                halfword n4 = node_next(n3);
                                glyph_character(n1) = c3;
                                tex_try_couple_nodes(n1, n4);
                                tex_flush_node(n2);
                                tex_flush_node(n3);
                                n1 = n4;
                            } else {
                                /* Two hyphens collapse into |c2|. */
                                glyph_character(n1) = c2;
                                tex_try_couple_nodes(n1, n3);
                                tex_flush_node(n2);
                                n1 = n3;
                            }
                            found = 1;
                            /* |n1| already points at the next candidate. */
                            goto AGAIN;
                        } else {
                            /* A single hyphen just becomes |c1|. */
                            glyph_character(n1) = c1;
                        }
                    }
                    break;
                case disc_node:
                    {
                        halfword done = 0;
                        if (disc_pre_break_head(n1) && tex_collapse_list(disc_pre_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (disc_post_break_head(n1) && tex_collapse_list(disc_post_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (disc_no_break_head(n1) && tex_collapse_list(disc_no_break_head(n1), c1, c2, c3)) {
                            ++done;
                        }
                        if (done) {
                            /* Dimensions may have changed: revalidate the disc. */
                            tex_check_disc_field(n1);
                        }
                        break;
                    }
                default:
                    break;
            }
            n1 = n2;
          AGAIN:;
        }
    }
    return found;
}
1385
1386void tex_hyphenate_list(halfword head, halfword tail)
1387{
1388
1389 if (tail) {
1390 halfword first_language = first_valid_language_par;
1391 halfword trace = tracing_hyphenation_par;
1392 halfword r = head;
1393
1408 while (r && node_type(r) != glyph_node) {
1409 r = node_next(r);
1410 }
1411 if (r) {
1412 r = tex_aux_find_next_wordstart(r, first_language);
1413 if (r) {
1414 lang_variables langdata;
1415 char utf8word[(4 * max_size_of_word) + 1] = { 0 };
1416 char utf8original[(4 * max_size_of_word) + 1] = { 0 };
1417 char *utf8ptr = utf8word;
1418 char *utf8ori = utf8original;
1419 int word_length = 0;
1420 int explicit_hyphen = 0;
1421 int last_char = 0;
1422 int valid = 0;
1423 halfword explicit_start = null;
1424 halfword saved_tail = node_next(tail);
1425 halfword penalty = tex_new_penalty_node(0, word_penalty_subtype);
1426
1427 tex_attach_attribute_list_copy(penalty, r);
1428 tex_couple_nodes(tail, penalty);
1429 while (r) {
1430 halfword word_start = r;
1431 int word_language = get_glyph_language(word_start);
1432 if (tex_is_valid_language(word_language)) {
1433 halfword word_end = r;
1434 int lhmin = get_glyph_lhmin(word_start);
1435 int rhmin = get_glyph_rhmin(word_start);
1436 int hmin = tex_get_hyphenation_min(word_language);
1437 halfword word_font = glyph_font(word_start);
1438 if (! tex_is_valid_font(word_font) || font_hyphen_char(word_font) < 0) {
1439
1440 word_font = 0;
1441 }
1442 langdata.pre_hyphen_char = tex_get_pre_hyphen_char(word_language);
1443 langdata.post_hyphen_char = tex_get_post_hyphen_char(word_language);
1444 while (r && node_type(r) == glyph_node && word_language == get_glyph_language(r)) {
1445 halfword chr = glyph_character(r);
1446 halfword hyp = tex_aux_is_hyphen_char(chr);
1447 if (word_language >= first_language) {
1448 last_char = tex_get_hj_code(word_language, chr);
1449 if (last_char > 0) {
1450 goto GOFORWARD;
1451 }
1452 }
1453 if (hyp) {
1454 last_char = hyp;
1455
1456
1457
1458 } else {
1459 break;
1460 }
1461 GOFORWARD:
1462
1463 explicit_hyphen = hyp;
1464 if (explicit_hyphen && node_next(r) && node_type(node_next(r)) != glyph_node && hyphenation_permitted(glyph_hyphenate(r), ignore_bounds_hyphenation_mode)) {
1465
1466 explicit_hyphen = 0;
1467 }
1468 if (explicit_hyphen) {
1469 break;
1470 } else {
1471 word_length++;
1472 if (word_length >= max_size_of_word) {
1473
1474 while (r && node_type(r) == glyph_node) {
1475 r = node_next(r);
1476 }
1477 goto PICKUP;
1478 } else {
1479 if (last_char <= 32) {
1480 if (last_char == 32) {
1481 last_char = 0 ;
1482 }
1483 if (word_length <= lhmin) {
1484 lhmin = lhmin - last_char + 1 ;
1485 if (lhmin < 0) {
1486 lhmin = 1;
1487 }
1488 }
1489 if (word_length >= rhmin) {
1490 rhmin = rhmin - last_char + 1 ;
1491 if (rhmin < 0) {
1492 rhmin = 1;
1493 }
1494 }
1495 hmin = hmin - last_char + 1 ;
1496 if (hmin < 0) {
1497 rhmin = 1;
1498 }
1499 last_char = chr ;
1500 }
1501 utf8ori = aux_uni2string(utf8ori, (unsigned) chr);
1502 utf8ptr = aux_uni2string(utf8ptr, (unsigned) last_char);
1503 word_end = r;
1504 r = node_next(r);
1505 }
1506 }
1507 }
1508 if (explicit_hyphen) {
1509
1510 if ((get_glyph_discpart(r) == glyph_discpart_replace && ! hyphenation_permitted(glyph_hyphenate(r), syllable_hyphenation_mode))) {
1511
1516 valid = 1;
1517 goto MESSYCODE;
1518 } else {
1519
1520 halfword t = node_next(r);
1521 if (t && node_type(t) == glyph_node && ! tex_aux_is_hyphen_char(glyph_character(t)) && hyphenation_permitted(glyph_hyphenate(t), automatic_hyphenation_mode)) {
1522
1523 halfword g = r;
1524 r = tex_aux_compound_word_break(r, get_glyph_language(g), explicit_hyphen);
1525 if (trace > 1) {
1526 *utf8ori = 0;
1527 tex_begin_diagnostic();
1528 tex_print_format("[language: compound word break after %s]", utf8original);
1529 tex_end_diagnostic();
1530 }
1531 if (hyphenation_permitted(glyph_hyphenate(g), compound_hyphenation_mode)) {
1532 explicit_hyphen = 0;
1533 if (hyphenation_permitted(glyph_hyphenate(g), force_handler_hyphenation_mode) || hyphenation_permitted(glyph_hyphenate(g), feedback_compound_hyphenation_mode)) {
1534 set_disc_option(r, disc_option_pre_word | disc_option_post_word);
1535 explicit_start = null;
1536 valid = 1;
1537 goto MESSYCODE;
1538 } else {
1539 if (! explicit_start) {
1540 explicit_start = word_start;
1541 }
1542
1543 utf8ptr = aux_uni2string(utf8ptr, '-');
1544 r = t;
1545 continue;
1546 }
1547 }
1548 } else {
1549
1550 while (t && node_type(t) == glyph_node && tex_aux_is_hyphen_char(glyph_character(t))) {
1551 r = t;
1552 t = node_next(r);
1553 }
1554 if (! t) {
1555
1556 r = null;
1557 }
1558 }
1559 }
1560 } else {
1561 valid = tex_aux_valid_wordend(word_end, r);
1562 MESSYCODE:
1563
1564 if (word_font && word_language >= first_language) {
1565
1566 struct tex_language *lang = lmt_language_state.languages[word_language];
1567 if (lang) {
1568 char *replacement = NULL;
1569 halfword start = explicit_start ? explicit_start : word_start;
1570 int okay = word_length >= lhmin + rhmin && (hmin <= 0 || word_length >= hmin) && hyphenation_permitted(glyph_hyphenate(start), syllable_hyphenation_mode);
1571 *utf8ptr = 0;
1572 *utf8ori = 0;
1573 if (lang->wordhandler && hyphenation_permitted(glyph_hyphenate(start), force_handler_hyphenation_mode)) {
1574 halfword restart = node_prev(start);
1575 int done = lmt_handle_word(lang, utf8original, utf8word, word_length, start, word_end, &replacement);
1576 if (replacement) {
1577 if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1578 goto EXCEPTIONS2;
1579 } else {
1580 goto PICKUP;
1581 }
1582 } else {
1583
1584 switch (done) {
1585 case 1:
1586 if (_valid_node_(restart)) {
1587 r = restart;
1588 } else if (_valid_node_(start)) {
1589 r = node_prev(start);
1590 }
1591 if (! r) {
1592 if (_valid_node_(head)) {
1593 tex_normal_warning("language", "the hyphenation list is messed up, recovering");
1594 r = head;
1595 } else {
1596 tex_normal_error("language", "the hyphenated head is messed up, aborting");
1597 return;
1598 }
1599 }
1600 goto PICKUP;
1601 case 2:
1602 if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1603 goto EXCEPTIONS1;
1604 } else {
1605 goto PICKUP;
1606 }
1607 case 3:
1608 if (tex_aux_still_okay(start, word_end, r, word_length, utf8original)) {
1609 goto PATTERNS;
1610 } else {
1611 goto PICKUP;
1612 }
1613 default:
1614 if (_valid_node_(r)) {
1615 goto PICKUP;
1616 } else if (_valid_node_(tail)) {
1617 tex_normal_warning("language", "the hyphenation list is messed up, quitting");
1618 goto ABORT;
1619 } else {
1620
1621 return;
1622 }
1623 }
1624 }
1625 }
1626 if (! okay || ! valid) {
1627 goto PICKUP;
1628 }
1629
1636 EXCEPTIONS1:
1637 if (lang->exceptions) {
1638 replacement = tex_aux_hyphenation_exception(lang->exceptions, utf8word);
1639 }
1640 EXCEPTIONS2:
1641 if (replacement) {
1642
1643 halfword start = explicit_start ? explicit_start : word_start;
1644 halfword beg = node_prev(start);
1645 tex_aux_do_exception(start, r, replacement);
1646 if (trace > 1) {
1647 tex_begin_diagnostic();
1648 tex_print_format("[language: exception %s to %s]", utf8original, replacement);
1649 if (trace > 2) {
1650 tex_aux_hyphenate_show(node_next(beg), node_prev(r));
1651 }
1652 tex_end_diagnostic();
1653 }
1654 lmt_memory_free(replacement);
1655 goto PICKUP;
1656 }
1657 PATTERNS:
1658 if (lang->patterns) {
1659 if (explicit_start) {
1660
1661 } else if (hyphenation_permitted(glyph_hyphenate(word_start), syllable_hyphenation_mode)) {
1662 halfword left = word_start;
1663 halfword right = r;
1664 for (int i = lhmin; i > 1; i--) {
1665 left = node_next(left);
1666 if (! left || left == right) {
1667 goto PICKUP;
1668 }
1669 }
1670 if (right != left) {
1671 int done = 0;
1672 for (int i = rhmin; i > 0; i--) {
1673 right = node_prev(right);
1674 if (! right || right == left) {
1675 goto PICKUP;
1676 }
1677 }
1678 done = tex_aux_hnj_hyphen_hyphenate(lang->patterns, word_start, word_end, word_length, left, right, &langdata);
1679 if (trace > 1) {
1680 tex_begin_diagnostic();
1681 if (done) {
1682 tex_print_format("[language: hyphenated %s at %i positions]", utf8original, done);
1683 if (trace > 2) {
1684 tex_aux_hyphenate_show(node_next(left), node_prev(right));
1685 }
1686 } else {
1687 tex_print_format("[language: not hyphenated %s]", utf8original);
1688 }
1689 tex_end_diagnostic();
1690 }
1691 }
1692 }
1693 }
1694 }
1695 }
1696 }
1697 }
1698 PICKUP:
1699 explicit_start = null ;
1700 explicit_hyphen = 0;
1701 word_length = 0;
1702 utf8ptr = utf8word;
1703 utf8ori = utf8original;
1704 if (r) {
1705 r = tex_aux_find_next_wordstart(r, first_language);
1706 } else {
1707 break;
1708 }
1709 }
1710 ABORT:
1711 tex_flush_node(node_next(tail));
1712 node_next(tail) = saved_tail;
1713 }
1714 }
1715 }
1716}
1717
1718halfword tex_glyph_to_discretionary(halfword glyph, quarterword code, int keepkern)
1719{
1720 halfword prev = node_prev(glyph);
1721 halfword next = node_next(glyph);
1722 halfword disc = tex_new_disc_node(code);
1723 halfword kern = null;
1724 if (keepkern && next && node_type(next) == kern_node && node_subtype(next) == italic_kern_subtype) {
1725 kern = node_next(next);
1726 next = node_next(kern);
1727 node_next(kern) = null;
1728 } else {
1729 node_next(glyph) = null;
1730 }
1731 node_prev(glyph) = null;
1732 tex_attach_attribute_list_copy(disc, glyph);
1733 tex_set_disc_field(disc, pre_break_code, tex_copy_node_list(glyph, null));
1734 tex_set_disc_field(disc, post_break_code, tex_copy_node_list(glyph, null));
1735 tex_set_disc_field(disc, no_break_code, glyph);
1736 tex_try_couple_nodes(prev, disc);
1737 tex_try_couple_nodes(disc, next);
1738 return disc;
1739} |