hnjhyphen.c /size: 19 Kb    last modification: 2025-02-21 11:03
1/*
2    See license.txt in the root of this project.
3*/
4
5/*
6
7    This file is derived from libhnj which is is dual licensed under LGPL and MPL. Boilerplate
8    for both licenses follows.
9
10    LibHnj - a library for high quality hyphenation and justification
11
12    (C) 1998 Raph Levien,
13    (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
14    (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
15
16    This library is free software; you can redistribute it and/or modify it under the terms of the
17    GNU Library General Public License as published by the Free Software Foundation; either version
18    2 of the License, or (at your option) any later version.
19
20    This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
21    without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
22    the GNU Library General Public License for more details.
23
24    You should have received a copy of the GNU Library General Public License along with this
25    library; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
26    Boston, MA 02111-1307 USA.
27
28    The contents of this file are subject to the Mozilla Public License Version 1.0 (the "MPL");
29    you may not use this file except in compliance with the MPL. You may obtain a copy of the MPL
30    at http://www.mozilla.org/MPL/
31
32    Software distributed under the MPL is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
33    KIND, either express or implied. See the MPL for the specific language governing rights and
34    limitations under the MPL.
35
36    Remark: I'm not sure if something fundamental was adapted in the perspective of using this
37    library in LuaTeX. However, for instance error reporting has been hooked into the Lua(Meta)TeX
38    error reporting mechanisms. Also a bit of reformatting was done. This module won't change.
39    Also, the code has been adapted a little in order to fit in the rest (function names etc)
40    because it is more exposed. We use the alternative memory allocator.
41
42    I might do a little cleanup like consistent function names etc. 
43
44*/
45
46/*tex We need the warning subsystem, so: */
47
48# include "luametatex.h"
49
50/*tex A few helpers (from |hnjalloc|): */
51
52static void *hnj_malloc(int size)
53{
54    void *p = lmt_memory_malloc((size_t) size);
55    if (! p) {
56        tex_formatted_error("hyphenation", "allocating %d bytes failed\n", size);
57    }
58    return p;
59}
60
61static void *hnj_realloc(void *p, int size)
62{
63    void *n = lmt_memory_realloc(p, (size_t) size);
64    if (! n) {
65        tex_formatted_error("hyphenation", "reallocating %d bytes failed\n", size);
66    }
67    return n;
68}
69
70static void hnj_free(void *p)
71{
72    lmt_memory_free(p);
73}
74
75static unsigned char *hnj_strdup(const unsigned char *s)
76{
77    size_t l = strlen((const char *) s);
78    unsigned char *n = hnj_malloc((int) l + 1);
79    memcpy(n, s, l);
80    n[l] = 0;
81    return n;
82}
83
84/*tex
85
86    Combine two right-aligned number patterns, 04000 + 020 becomes 04020. This works also for utf8
87    sequences because the substring is identical to the last |substring - length| bytes of expr
88    except for the (single byte) hyphenation encoders
89
90*/
91
92static char *combine_patterns(char *expr, const char *subexpr)
93{
94    size_t l1 = strlen(expr);
95    size_t l2 = strlen(subexpr);
96    size_t off = l1 - l2;
97    for (unsigned j = 0; j < l2; j++) {
98        if (expr[off + j] < subexpr[j]) {
99            expr[off + j] = subexpr[j];
100        }
101    }
102    return expr;
103}
104
105/*tex Some original code: */
106
107static hjn_hashiterator *new_hashiterator(hjn_hashtable *h)
108{
109    hjn_hashiterator *i = hnj_malloc(sizeof(hjn_hashiterator));
110    i->e = h->entries;
111    i->cur = NULL;
112    i->ndx = -1;
113    return i;
114}
115
116static int nexthashstealpattern(hjn_hashiterator *i, unsigned char **word, char **pattern)
117{
118    while (i->cur == NULL) {
119        if (i->ndx >= HNJ_HASH_SIZE - 1) {
120            return 0;
121        } else {
122            i->cur = i->e[++i->ndx];
123        }
124    }
125    *word = i->cur->key;
126    *pattern = i->cur->u.hyppat;
127    i->cur->u.hyppat = NULL;
128    i->cur = i->cur->next;
129    return 1;
130}
131
132static int nexthash(hjn_hashiterator *i, unsigned char **word)
133{
134    while (! i->cur) {
135        if (i->ndx >= HNJ_HASH_SIZE - 1) {
136            return 0;
137        } else {
138            i->cur = i->e[++i->ndx];
139        }
140    }
141    *word = i->cur->key;
142    i->cur = i->cur->next;
143    return 1;
144}
145
146static int eachhash(hjn_hashiterator *i, unsigned char **word, char **pattern)
147{
148    while (! i->cur) {
149        if (i->ndx >= HNJ_HASH_SIZE - 1) {
150            return 0;
151        } else {
152            i->cur = i->e[++i->ndx];
153        }
154    }
155    *word = i->cur->key;
156    *pattern = i->cur->u.hyppat;
157    i->cur = i->cur->next;
158    return 1;
159}
160
161static void delete_hashiterator(hjn_hashiterator *i)
162{
163    hnj_free(i);
164}
165
166/*tex A |char*| hash function from ASU, adapted from |Gtk+|: */
167
168static unsigned int string_hash(const unsigned char *s)
169{
170    const unsigned char *p = s;
171    unsigned int h = 0, g;
172    for (; *p != '\0'; p += 1) {
173        h = (h << 4) + *p;
174        g = h & 0xf0000000;
175        if (g) {
176            h = h ^ (g >> 24);
177            h = h ^ g;
178        }
179    }
180    return h;
181}
182
183/*tex This assumes that key is not already present! */
184
185static void state_insert(hjn_hashtable *hashtab, unsigned char *key, int state)
186{
187    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
188    hjn_hashentry* e = hnj_malloc(sizeof(hjn_hashentry));
189    e->next = hashtab->entries[i];
190    e->key = key;
191    e->u.state = state;
192    hashtab->entries[i] = e;
193}
194
195/*tex This also assumes that key is not already present! */
196
197static void insert_pattern(hjn_hashtable *hashtab, unsigned char *key, char *hyppat, int trace)
198{
199    hjn_hashentry *e;
200    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
201    for (e = hashtab->entries[i]; e; e = e->next) {
202        if (strcmp((char *) e->key, (char *) key) == 0) {
203            if (e->u.hyppat) {
204                if (trace && hyppat && strcmp((char *) e->u.hyppat, (char *) hyppat) != 0) {
205                    tex_formatted_warning("hyphenation", "a conflicting pattern '%s' has been ignored", hyppat);
206                }
207                hnj_free(e->u.hyppat);
208            }
209            e->u.hyppat = hyppat;
210            hnj_free(key);
211            return;
212        }
213    }
214    e = hnj_malloc(sizeof(hjn_hashentry));
215    e->next = hashtab->entries[i];
216    e->key = key;
217    e->u.hyppat = hyppat;
218    hashtab->entries[i] = e;
219}
220
221/*tex We return |state| if found, otherwise |-1|. */
222
223static int state_lookup(hjn_hashtable *hashtab, const unsigned char *key)
224{
225    int i = (int) (string_hash(key) % HNJ_HASH_SIZE);
226    for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
227        if (! strcmp((const char *) key, (const char *) e->key)) {
228            return e->u.state;
229        }
230    }
231    return -1;
232}
233
234/*tex We return |state| if found, otherwise |-1|. The 256 should be enough. */
235
236static char *lookup_pattern(hjn_hashtable * hashtab, const unsigned char *chars, int l)
237{
238    int i;
239    unsigned char key[256];
240    strncpy((char *) key, (const char *) chars, (size_t) l);
241    key[l] = 0;
242    i = (int) (string_hash(key) % HNJ_HASH_SIZE);
243    for (hjn_hashentry *e = hashtab->entries[i]; e; e = e->next) {
244        if (! strcmp((char *) key, (char *) e->key)) {
245            return e->u.hyppat;
246        }
247    }
248    return NULL;
249}
250
251/*tex Get the state number, allocating a new state if necessary. */
252
253static int hnj_get_state(hjn_dictionary *dict, const unsigned char *str, int *state_num)
254{
255    *state_num = state_lookup(dict->state_num, str);
256    if (*state_num >= 0) {
257        return *state_num;
258    } else {
259        state_insert(dict->state_num, hnj_strdup(str), dict->num_states);
260        /*tex The predicate is true if |dict->num_states| is a power of two: */
261        if (! (dict->num_states & (dict->num_states - 1))) {
262            dict->states = hnj_realloc(dict->states, (int) ((dict->num_states << 1) * (int) sizeof(hjn_state)));
263        }
264        dict->states[dict->num_states] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
265        return dict->num_states++;
266    }
267}
268
269/*tex
270
271    Add a transition from state1 to state2 through ch - assumes that the transition does not
272    already exist.
273
274*/
275
276static void hnj_add_trans(hjn_dictionary *dict, int state1, int state2, int chr)
277{
278    /*tex
279
280        This test was a bit too strict, it is quite normal for old patterns to have chars in the
281        range 0-31 or 127-159 (inclusive). To ease the transition, let's only disallow |nul| for
282        now, which probably is a requirement of the code anyway.
283
284    */
285    if (chr) {
286        int num_trans = dict->states[state1].num_trans;
287        if (num_trans == 0) {
288            dict->states[state1].trans = hnj_malloc(sizeof(hjn_transition));
289        } else {
290            /*tex
291
292                The old version did:
293
294                \starttyping
295                } else if (!(num_trans & (num_trans - 1))) {
296                    ... = hnj_realloc(dict->states[state1].trans,
297                            (int) ((num_trans << 1) * sizeof(HyphenTrans)));
298                \stoptyping
299
300                but that is incredibly nasty when adding patters one-at-a-time. Controlled growth
301                would be nicer than the current +1, but if no one complains, and no one did in a
302                decade, this is good enough.
303
304            */
305            dict->states[state1].trans = hnj_realloc(dict->states[state1].trans, (int) ((num_trans + 1) * sizeof(hjn_transition)));
306        }
307        dict->states[state1].trans[num_trans].uni_ch = chr;
308        dict->states[state1].trans[num_trans].new_state = state2;
309        dict->states[state1].num_trans++;
310    } else {
311        tex_normal_error("hyphenation","a nul character is not permited");
312    }
313}
314
315/*tex
316
317    We did change the semantics a bit here: |hnj_hyphen_load| used to operate on a file, but now
318    the argument is a string buffer.
319
320*/
321
322/* define tex_isspace(c) (c == ' ' || c == '\t') */
323#  define tex_isspace(c) (c == ' ')
324
325static const unsigned char *next_pattern(size_t* length, const unsigned char** buf)
326{
327    const unsigned char *here, *rover = *buf;
328    while (*rover && tex_isspace(*rover)) {
329        rover++;
330    }
331    here = rover;
332    while (*rover) {
333        if (tex_isspace(*rover)) {
334            *length = (size_t) (rover - here);
335            *buf = rover;
336            return here;
337        } else {
338            rover++;
339        }
340    }
341    *length = (size_t) (rover - here);
342    *buf = rover;
343    return *length ? here : NULL;
344}
345
346static void init_hash(hjn_hashtable **h)
347{
348    if (! *h) {
349        *h = hnj_malloc(sizeof(hjn_hashtable));
350        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
351            (*h)->entries[i] = NULL;
352        }
353    }
354}
355
356static void clear_state_hash(hjn_hashtable **h)
357{
358    if (*h) {
359        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
360            hjn_hashentry *e, *next;
361            for (e = (*h)->entries[i]; e; e = next) {
362                next = e->next;
363                hnj_free(e->key);
364                hnj_free(e);
365            }
366        }
367        hnj_free(*h);
368        *h = NULL;
369    }
370}
371
372static void clear_pattern_hash(hjn_hashtable **h)
373{
374    if (*h) {
375        for (int i = 0; i < HNJ_HASH_SIZE; i++) {
376            hjn_hashentry *e, *next;
377            for (e = (*h)->entries[i]; e; e = next) {
378                next = e->next;
379                hnj_free(e->key);
380                if (e->u.hyppat) {
381                    hnj_free(e->u.hyppat);
382                }
383                hnj_free(e);
384            }
385        }
386        hnj_free(*h);
387        *h = NULL;
388    }
389}
390
391static void init_dictionary(hjn_dictionary *dict)
392{
393    dict->num_states = 1;
394    dict->pat_length = 0;
395    dict->states = hnj_malloc(sizeof(hjn_state));
396    dict->states[0] = (hjn_state) { .match = NULL, .fallback_state = -1, .num_trans = 0, .trans = NULL };
397    dict->patterns = NULL;
398    dict->merged = NULL;
399    dict->state_num = NULL;
400    init_hash(&dict->patterns);
401}
402
403static void clear_dictionary(hjn_dictionary *dict)
404{
405    for (int state_num = 0; state_num < dict->num_states; state_num++) {
406        hjn_state *hstate = &dict->states[state_num];
407        if (hstate->match) {
408            hnj_free(hstate->match);
409        }
410        if (hstate->trans) {
411            hnj_free(hstate->trans);
412        }
413    }
414    hnj_free(dict->states);
415    clear_pattern_hash(&dict->patterns);
416    clear_pattern_hash(&dict->merged);
417    clear_state_hash(&dict->state_num);
418}
419
420hjn_dictionary *hnj_dictionary_new(void)
421{
422    hjn_dictionary *dict = hnj_malloc(sizeof(hjn_dictionary));
423    init_dictionary(dict);
424    return dict;
425}
426
427void hnj_dictionary_clear(hjn_dictionary *dict)
428{
429    clear_dictionary(dict);
430    init_dictionary(dict);
431}
432
433void hnj_dictionary_free(hjn_dictionary *dict)
434{
435    clear_dictionary(dict);
436    hnj_free(dict);
437}
438
439unsigned char *hnj_dictionary_tostring(hjn_dictionary *dict)
440{
441    unsigned char *word;
442    char *pattern;
443    unsigned char *buf = hnj_malloc(dict->pat_length);
444    unsigned char *cur = buf;
445    hjn_hashiterator *v = new_hashiterator(dict->patterns);
446    while (eachhash(v, &word, &pattern)) {
447        int i = 0;
448        int e = 0;
449        while (word[e + i]) {
450            if (pattern[i] != '0') {
451                *cur++ = (unsigned char) pattern[i];
452            }
453            *cur++ = word[e + i++];
454            while (is_utf8_follow(word[e + i])) {
455                *cur++ = word[i + e++];
456            }
457        }
458        if (pattern[i] != '0') {
459            *cur++ = (unsigned char) pattern[i];
460        }
461        *cur++ = ' ';
462    }
463    delete_hashiterator(v);
464    *cur = 0;
465    return buf;
466}
467
468/*tex
469
470    In hyphenation patterns we use signed bytes where |0|, or actually any negative number,
471    indicates end:
472
473    \starttyping
474    prio(1+),startpos,length,len1,[replace],len2,[replace]
475    \starttyping
476
477    A basic example is:
478
479    \starttyping
480    p n 0 0 0
481    \starttyping
482
483    for a hyphenation point between characters.
484
485*/
486
487void hnj_dictionary_load(hjn_dictionary *dict, const unsigned char *f, int trace)
488{
489    int state_num, last_state;
490    int ch;
491    int found;
492    hjn_hashiterator *v;
493    unsigned char *word;
494    char *pattern;
495    size_t l = 0;
496    const unsigned char *format;
497    const unsigned char *begin = f;
498    while ((format = next_pattern(&l, &f)) != NULL) {
499        if (l > 0 && l < 255) {
500            int i, j, e1;
501            for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
502                if (format[i] >= '0' && format[i] <= '9') {
503                    j++;
504                }
505                if (is_utf8_follow(format[i])) {
506                    e1++;
507                }
508            }
509            /*tex
510                Here |l-e1| is the number of {\em characters} not {\em bytes}, |l-j| the number of
511                pattern bytes and |l-e1-j| the number of pattern characters.
512            */
513            {
514                unsigned char *pat = (unsigned char *) hnj_malloc((1 + (int) l - j));
515                char *org = (char *) hnj_malloc(2 + (int) l - e1 - j);
516                /*tex Remove hyphenation encoders (digits) from pat. */
517                org[0] = '0';
518                for (i = 0, j = 0, e1 = 0; (unsigned) i < l; i++) {
519                    unsigned char c = format[i];
520                    if (is_utf8_follow(c)) {
521                        pat[j + e1++] = c;
522                    } else if (c < '0' || c > '9') {
523                        pat[e1 + j++] = c;
524                        org[j] = '0';
525                    } else {
526                        org[j] = (char) c;
527                    }
528                }
529                pat[e1 + j] = 0;
530                org[j + 1] = 0;
531                insert_pattern(dict->patterns, pat, org, trace);
532            }
533        } else {
534           tex_normal_warning("hyphenation", "a pattern of more than 254 bytes is ignored");
535        }
536    }
537    /*tex We add 2 bytes for spurious spaces. */
538    dict->pat_length += (int) ((f - begin) + 2);
539    init_hash(&dict->merged);
540    v = new_hashiterator(dict->patterns);
541    while (nexthash(v, &word)) {
542        int wordsize = (int) strlen((char *) word);
543        for (int l1 = 1; l1 <= wordsize; l1++) {
544            if (is_utf8_follow(word[l1])) {
545                /*tex Do not clip an utf8 sequence. */
546            } else {
547                for (int j1 = 1; j1 <= l1; j1++) {
548                    int i1 = l1 - j1;
549                    if (is_utf8_follow(word[i1])) {
550                        /*tex Do not start halfway an utf8 sequence. */
551                    } else {
552                        char *subpat_pat = lookup_pattern(dict->patterns, word + i1, j1);
553                        if (subpat_pat) {
554                            char *newpat_pat = lookup_pattern(dict->merged, word, l1);
555                            if (! newpat_pat) {
556                                char *neworg;
557                                unsigned char *newword = (unsigned char *) hnj_malloc(l1 + 1);
558                                int e1 = 0;
559                                strncpy((char *) newword, (char *) word, (size_t) l1);
560                                newword[l1] = 0;
561                                for (i1 = 0; i1 < l1; i1++) {
562                                    if (is_utf8_follow(newword[i1])) {
563                                        e1++;
564                                    }
565                                }
566                                neworg = hnj_malloc(l1 + 2 - e1);
567                                /*tex Fill with right amount of zeros: */
568                                sprintf(neworg, "%0*d", l1 + 1 - e1, 0);
569                                insert_pattern(dict->merged, newword, combine_patterns(neworg, subpat_pat), trace);
570                            } else {
571                                combine_patterns(newpat_pat, subpat_pat);
572                            }
573                        }
574                    }
575                }
576            }
577        }
578    }
579    delete_hashiterator(v);
580    init_hash(&dict->state_num);
581    state_insert(dict->state_num, hnj_strdup((const unsigned char *) ""), 0);
582    v = new_hashiterator(dict->merged);
583    while (nexthashstealpattern(v, &word, &pattern)) {
584        static unsigned char mask[] = { 0x3F, 0x1F, 0xF, 0x7 };
585        int j1 = (int) strlen((char *) word);
586        state_num = hnj_get_state(dict, word, &found);
587        dict->states[state_num].match = pattern;
588        /*tex Now, put in the prefix transitions. */
589        while (found < 0) {
590            j1--;
591            last_state = state_num;
592            ch = word[j1];
593            if (ch >= 0x80) { /* why not is_utf8_follow(ch) here */
594                int m;
595                int i1 = 1;
596                while (is_utf8_follow(word[j1 - i1])) {
597                    i1++;
598                }
599                ch = word[j1 - i1] & mask[i1];
600                m = j1 - i1;
601                while (i1--) {
602                    ch = (ch << 6) + (0x3F & word[j1 - i1]);
603                }
604                j1 = m;
605            }
606            word[j1] = '\0';
607            state_num = hnj_get_state(dict, word, &found);
608            hnj_add_trans(dict, state_num, last_state, ch);
609        }
610    }
611    delete_hashiterator(v);
612    clear_pattern_hash(&dict->merged);
613    /*tex Put in the fallback states. */
614    for (int i = 0; i < HNJ_HASH_SIZE; i++) {
615        for (hjn_hashentry *e = dict->state_num->entries[i]; e; e = e->next) {
616            /*tex Do not do |state == 0| otherwise things get confused. */
617            if (e->u.state) {
618                for (int j = 1; 1; j++) {
619                    state_num = state_lookup(dict->state_num, e->key + j);
620                    if (state_num >= 0) {
621                        break;
622                    }
623                }
624                dict->states[e->u.state].fallback_state = state_num;
625            }
626        }
627    }
628    clear_state_hash(&dict->state_num);
629}
630