1
4
5
45
46
47
48# include "luametatex.h"
49
50
51
52static void *hnj_malloc(int size)
53{
54 void *p = lmt_memory_malloc((size_t) size);
55 if (! p) {
56 tex_formatted_error("hyphenation", "allocating %d bytes failed\n", size);
57 }
58 return p;
59}
60
61static void *hnj_realloc(void *p, int size)
62{
63 void *n = lmt_memory_realloc(p, (size_t) size);
64 if (! n) {
65 tex_formatted_error("hyphenation", "reallocating %d bytes failed\n", size);
66 }
67 return n;
68}
69
70static void hnj_free(void *p)
71{
72 lmt_memory_free(p);
73}
74
75static unsigned char *hnj_strdup(const unsigned char *s)
76{
77 size_t l = strlen((const char *) s);
78 unsigned char *n = hnj_malloc((int) l + 1);
79 memcpy(n, s, l);
80 n[l] = 0;
81 return n;
82}
83
84
91
92static char *combine_patterns(char *expr, const char *subexpr)
93{
94 size_t l1 = strlen(expr);
95 size_t l2 = strlen(subexpr);
96 size_t off = l1 - l2;
97 for (unsigned j = 0; j < l2; j++) {
98 if (expr[off + j] < subexpr[j]) {
99 expr[off + j] = subexpr[j];
100 }
101 }
102 return expr;
103}
104
105
106
107static hjn_hashiterator *new_hashiterator(hjn_hashtable *h)
108{
109 hjn_hashiterator *i = hnj_malloc(sizeof(hjn_hashiterator));
110 i->e = h->entries;
111 i->cur = NULL;
112 i->ndx = -1;
113 return i;
114}
115
116static int nexthashstealpattern(hjn_hashiterator *i, unsigned char **word, char **pattern)
117{
118 while (i->cur == NULL) {
119 if (i->ndx >= HNJ_HASH_SIZE - 1) {
120 return 0;
121 } else {
122 i->cur = i->e[++i->ndx];
123 }
124 }
125 *word = i->cur->key;
126 *pattern = i->cur->u.hyppat;
127 i->cur->u.hyppat = NULL;
128 i->cur = i->cur->next;
129 return 1;
130}
131
132static int nexthash(hjn_hashiterator *i, unsigned char **word)
133{
134 while (! i->cur) {
135 if (i->ndx >= HNJ_HASH_SIZE - 1) {
136 return 0;
137 } else {
138 i->cur = i->e[++i->ndx];
139 }
140 }
141 *word = i->cur->key;
142 i->cur = i->cur->next;
143 return 1;
144}
145
146static int eachhash(hjn_hashiterator *i, unsigned char **word, char **pattern)
147{
148 while (! i->cur) {
149 if (i->ndx >= HNJ_HASH_SIZE - 1) {
150 return 0;
151 } else {
152 i->cur = i->e[++i->ndx];
153 }
154 }
155 *word = i->cur->key;
156 *pattern = i->cur->u.hyppat;
157 i->cur = i->cur->next;
158 return 1;
159}
160
161static void delete_hashiterator(hjn_hashiterator *i)
162{
163 hnj_free(i);
164}
165
166
167
168static unsigned int string_hash(const unsigned char *s)
169{
170 const unsigned char *p = s;
171 unsigned int h = 0, g;
172 for (; *p != '\0'; p += 1) {
173 h = (h << 4) + *p;
174 g = h & 0xf0000000;
175 if (g) {
176 h = h ^ (g >> 24);
177 h = h ^ g;
178 }
179 }
180 return h;
181}
182
183
184
185static void state_insert(hjn_hashtable *hashtab, unsigned char *key, int state)
186{
187 int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
188 hjn_hashentry* e = hnj_malloc(sizeof(hjn_hashentry));
189 e->next = hashtab->entries[i];
190 e->key = key;
191 e->u.state = state;
192 hashtab->entries[i] = e;
193}
194
195
196
197static void insert_pattern(hjn_hashtable *hashtab, unsigned char *key, char *hyppat, int trace)
198{
199 hjn_hashentry *e;
200 int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
201 for (e = hashtab->entries[i]; e; e = e->next) {
202 if (strcmp((char *) e->key, (char *) key) == 0) {
203 if (e->u.hyppat) {
204 if (trace && hyppat && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
205 tex_formatted_warning("hyphenation", "a conflicting pattern '%s' has been ignored", hyppat);
206 }
207 hnj_free(e->u.hyppat);
208 }
209 e->u.hyppat = hyppat;
210 hnj_free(key);
211 return;
212 }
213 }
214 e = hnj_malloc(sizeof(hjn_hashentry));
215 e->next = hashtab->entries[i];
216 e->key = key;
217 e->u.hyppat = hyppat;
218 hashtab->entries[i] = e;
219}
220
221
222
223static int state_lookup(hjn_hashtable *hashtab, const unsigned char *key)
224{
225 int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
226 for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
227 if (! strcmp((const char *) key, (const char *) e->key)) {
228 return e->u.state;
229 }
230 }
231 return -1;
232}
233
234
235
236static char *lookup_pattern(hjn_hashtable * hashtab, const unsigned char *chars, int l)
237{
238 int i;
239 unsigned char key[256];
240 strncpy((char *) key, (const char *) chars, (size_t) l);
241 key[l] = 0;
242 i = (int) (string_hash(key) % HNJ_HASH_SIZE);
243 for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
244 if (! strcmp((char *) key, (char *) e->key)) {
245 return e->u.hyppat;
246 }
247 }
248 return NULL;
249}
250
251
252
253static int hnj_get_state(hjn_dictionary *dict, const unsigned char *str, int *state_num)
254{
255 *state_num = state_lookup(dict->state_num, str);
256 if (*state_num >= 0) {
257 return *state_num;
258 } else {
259 state_insert(dict->state_num, hnj_strdup(str), dict->num_states);
260
261 if (! (dict->num_states & (dict->num_states - 1))) {
262 dict->states = hnj_realloc(dict->states, (int) ((dict->num_states << 1) * (int) sizeof(hjn_state)));
263 }
264 dict->states[dict->num_states] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
265 return dict->num_states++;
266 }
267}
268
269
275
276static void hnj_add_trans(hjn_dictionary *dict, int state1, int state2, int chr)
277{
278
285 if (chr) {
286 int num_trans = dict->states[state1].num_trans;
287 if (num_trans == 0) {
288 dict->states[state1].trans = hnj_malloc(sizeof(hjn_transition));
289 } else {
290
305 dict->states[state1].trans = hnj_realloc(dict->states[state1].trans, (int) ((num_trans + 1) * sizeof(hjn_transition)));
306 }
307 dict->states[state1].trans[num_trans].uni_ch = chr;
308 dict->states[state1].trans[num_trans].new_state = state2;
309 dict->states[state1].num_trans++;
310 } else {
311 tex_normal_error("hyphenation","a nul character is not permited");
312 }
313}
314
315
321
322
323# define tex_isspace(c) (c == ' ')
324
325static const unsigned char *next_pattern(size_t* length, const unsigned char** buf)
326{
327 const unsigned char *here, *rover = *buf;
328 while (*rover && tex_isspace(*rover)) {
329 rover++;
330 }
331 here = rover;
332 while (*rover) {
333 if (tex_isspace(*rover)) {
334 *length = (size_t) (rover - here);
335 *buf = rover;
336 return here;
337 } else {
338 rover++;
339 }
340 }
341 *length = (size_t) (rover - here);
342 *buf = rover;
343 return *length ? here : NULL;
344}
345
346static void init_hash(hjn_hashtable **h)
347{
348 if (! *h) {
349 *h = hnj_malloc(sizeof(hjn_hashtable));
350 for (int i = 0; i < HNJ_HASH_SIZE; i++) {
351 (*h)->entries[i] = NULL;
352 }
353 }
354}
355
356static void clear_state_hash(hjn_hashtable **h)
357{
358 if (*h) {
359 for (int i = 0; i < HNJ_HASH_SIZE; i++) {
360 hjn_hashentry *e, *next;
361 for (e = (*h)->entries[i]; e; e = next) {
362 next = e->next;
363 hnj_free(e->key);
364 hnj_free(e);
365 }
366 }
367 hnj_free(*h);
368 *h = NULL;
369 }
370}
371
372static void clear_pattern_hash(hjn_hashtable **h)
373{
374 if (*h) {
375 for (int i = 0; i < HNJ_HASH_SIZE; i++) {
376 hjn_hashentry *e, *next;
377 for (e = (*h)->entries[i]; e; e = next) {
378 next = e->next;
379 hnj_free(e->key);
380 if (e->u.hyppat) {
381 hnj_free(e->u.hyppat);
382 }
383 hnj_free(e);
384 }
385 }
386 hnj_free(*h);
387 *h = NULL;
388 }
389}
390
391static void init_dictionary(hjn_dictionary *dict)
392{
393 dict->num_states = 1;
394 dict->pat_length = 0;
395 dict->states = hnj_malloc(sizeof(hjn_state));
396 dict->states[0] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
397 dict->patterns = NULL;
398 dict->merged = NULL;
399 dict->state_num = NULL;
400 init_hash(&dict->patterns);
401}
402
403static void clear_dictionary(hjn_dictionary *dict)
404{
405 for (int state_num = 0; state_num < dict->num_states; state_num++) {
406 hjn_state *hstate = &dict->states[state_num];
407 if (hstate->match) {
408 hnj_free(hstate->match);
409 }
410 if (hstate->trans) {
411 hnj_free(hstate->trans);
412 }
413 }
414 hnj_free(dict->states);
415 clear_pattern_hash(&dict->patterns);
416 clear_pattern_hash(&dict->merged);
417 clear_state_hash(&dict->state_num);
418}
419
420hjn_dictionary *hnj_dictionary_new(void)
421{
422 hjn_dictionary *dict = hnj_malloc(sizeof(hjn_dictionary));
423 init_dictionary(dict);
424 return dict;
425}
426
427void hnj_dictionary_clear(hjn_dictionary *dict)
428{
429 clear_dictionary(dict);
430 init_dictionary(dict);
431}
432
433void hnj_dictionary_free(hjn_dictionary *dict)
434{
435 clear_dictionary(dict);
436 hnj_free(dict);
437}
438
439unsigned char *hnj_dictionary_tostring(hjn_dictionary *dict)
440{
441 unsigned char *word;
442 char *pattern;
443 unsigned char *buf = hnj_malloc(dict->pat_length);
444 unsigned char *cur = buf;
445 hjn_hashiterator *v = new_hashiterator(dict->patterns);
446 while (eachhash(v, &word, &pattern)) {
447 int i = 0;
448 int e = 0;
449 while (word[e + i]) {
450 if (pattern[i] != '0') {
451 *cur++ = (unsigned char) pattern[i];
452 }
453 *cur++ = word[e + i++];
454 while (is_utf8_follow(word[e + i])) {
455 *cur++ = word[i + e++];
456 }
457 }
458 if (pattern[i] != '0') {
459 *cur++ = (unsigned char) pattern[i];
460 }
461 *cur++ = ' ';
462 }
463 delete_hashiterator(v);
464 *cur = 0;
465 return buf;
466}
467
468
486
487void hnj_dictionary_load(hjn_dictionary *dict, const unsigned char *f, int trace)
488{
489 int state_num, last_state;
490 int ch;
491 int found;
492 hjn_hashiterator *v;
493 unsigned char *word;
494 char *pattern;
495 size_t l = 0;
496 const unsigned char *format;
497 const unsigned char *begin = f;
498 while ((format = next_pattern(&l, &f)) != NULL) {
499 if (l > 0 && l < 255) {
500 int i, j, e1;
501 for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
502 if (format[i] >= '0' && format[i] <= '9') {
503 j++;
504 }
505 if (is_utf8_follow(format[i])) {
506 e1++;
507 }
508 }
509
513 {
514 unsigned char *pat = (unsigned char *) hnj_malloc((1 + (int) l - j));
515 char *org = (char *) hnj_malloc(2 + (int) l - e1 - j);
516
517 org[0] = '0';
518 for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
519 unsigned char c = format[i];
520 if (is_utf8_follow(c)) {
521 pat[j + e1++] = c;
522 } else if (c < '0' || c > '9') {
523 pat[e1 + j++] = c;
524 org[j] = '0';
525 } else {
526 org[j] = (char) c;
527 }
528 }
529 pat[e1 + j] = 0;
530 org[j + 1] = 0;
531 insert_pattern(dict->patterns, pat, org, trace);
532 }
533 } else {
534 tex_normal_warning("hyphenation", "a pattern of more than 254 bytes is ignored");
535 }
536 }
537
538 dict->pat_length += (int) ((f - begin) + 2);
539 init_hash(&dict->merged);
540 v = new_hashiterator(dict->patterns);
541 while (nexthash(v, &word)) {
542 int wordsize = (int) strlen((char *) word);
543 for (int l1 = 1; l1 <= wordsize; l1++) {
544 if (is_utf8_follow(word[l1])) {
545
546 } else {
547 for (int j1 = 1; j1 <= l1; j1++) {
548 int i1 = l1 - j1;
549 if (is_utf8_follow(word[i1])) {
550
551 } else {
552 char *subpat_pat = lookup_pattern(dict->patterns, word + i1, j1);
553 if (subpat_pat) {
554 char *newpat_pat = lookup_pattern(dict->merged, word, l1);
555 if (! newpat_pat) {
556 char *neworg;
557 unsigned char *newword = (unsigned char *) hnj_malloc(l1 + 1);
558 int e1 = 0;
559 strncpy((char *) newword, (char *) word, (size_t) l1);
560 newword[l1] = 0;
561 for (i1 = 0; i1 < l1; i1++) {
562 if (is_utf8_follow(newword[i1])) {
563 e1++;
564 }
565 }
566 neworg = hnj_malloc(l1 + 2 - e1);
567
568 sprintf(neworg, "%0*d", l1 + 1 - e1, 0);
569 insert_pattern(dict->merged, newword, combine_patterns(neworg, subpat_pat), trace);
570 } else {
571 combine_patterns(newpat_pat, subpat_pat);
572 }
573 }
574 }
575 }
576 }
577 }
578 }
579 delete_hashiterator(v);
580 init_hash(&dict->state_num);
581 state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0);
582 v = new_hashiterator(dict->merged);
583 while (nexthashstealpattern(v, &word, &pattern)) {
584 static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 };
585 int j1 = (int) strlen((char *) word);
586 state_num = hnj_get_state(dict, word, &found);
587 dict->states[state_num].match = pattern;
588
589 while (found < 0) {
590 j1--;
591 last_state = state_num;
592 ch = word[j1];
593 if (ch >= 0x80) {
594 int m;
595 int i1 = 1;
596 while (is_utf8_follow(word[j1 - i1])) {
597 i1++;
598 }
599 ch = word[j1 - i1] & mask[i1];
600 m = j1 - i1;
601 while (i1--) {
602 ch = (ch << 6) + (0x3F & word[j1 - i1]);
603 }
604 j1 = m;
605 }
606 word[j1] = '\0';
607 state_num = hnj_get_state(dict, word, &found);
608 hnj_add_trans(dict, state_num, last_state, ch);
609 }
610 }
611 delete_hashiterator(v);
612 clear_pattern_hash(&dict->merged);
613
614 for (int i = 0; i < HNJ_HASH_SIZE; i++) {
615 for (hjn_hashentry *e = dict->state_num->entries[i]; e; e = e->next) {
616
617 if (e->u.state) {
618 for (int j = 1; 1; j++) {
619 state_num = state_lookup(dict->state_num, e->key + j);
620 if (state_num >= 0) {
621 break;
622 }
623 }
624 dict->states[e->u.state].fallback_state = state_num;
625 }
626 }
627 }
628 clear_state_hash(&dict->state_num);
629}
630 |