texstringpool.c /size: 21 Kb    last modification: 2025-02-21 11:03
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
9    Control sequence names and diagnostic messages are variable length strings of eight bit
10    characters. Since \PASCAL\ did not have a well-developed string mechanism, \TEX\ did all of its
11    string processing by homegrown methods.
12
13    Elaborate facilities for dynamic strings are not needed, so all of the necessary operations can
14    be handled with a simple data structure. The array |str_pool| contains all of the (eight-bit)
    bytes of all of the strings, and the array |str_start| contains indices of the starting points
16    of each string. Strings are referred to by integer numbers, so that string number |s| comprises
17    the characters |str_pool[j]| for |str_start_macro(s) <= j < str_start_macro (s + 1)|. Additional
18    integer variables |pool_ptr| and |str_ptr| indicate the number of entries used so far in
19    |str_pool| and |str_start|, respectively; locations |str_pool[pool_ptr]| and |str_start_macro
20    (str_ptr)| are ready for the next string to be allocated.
21
22    String numbers 0 to |biggest_char| are reserved for strings that correspond to single \UNICODE\
23    characters. This is in accordance with the conventions of \WEB\ which converts single-character
24    strings into the ASCII code number of the single character involved.
25
26    The stringpool variables are collected in:
27
28*/
29
30string_pool_info lmt_string_pool_state = {
31    .string_pool           = NULL,
32    .string_pool_data      = {
33        .minimum   = min_pool_size,
34        .maximum   = max_pool_size,
35        .size      = siz_pool_size,
36        .step      = stp_pool_size,
37        .allocated = 0,
38        .itemsize  = sizeof(lstring),
39        .top       = 0,
40        .ptr       = 0,
41        .initial   = 0,
42        .offset    = cs_offset_value,
43        .extra     = 0, 
44    },
45    .string_body_data      = {
46        .minimum   = min_body_size,
47        .maximum   = max_body_size,
48        .size      = siz_body_size,
49        .step      = stp_body_size,
50        .allocated = 0,
51        .itemsize  = sizeof(unsigned char),
52        .top       = memory_data_unset,
53        .ptr       = memory_data_unset,
54        .initial   = 0,
55        .offset    = 0,
56        .extra     = 0, 
57    },
58    .reserved              = 0,
59    .string_max_length     = 0,
60    .string_temp           = NULL,
61    .string_temp_allocated = 0,
62    .string_temp_top       = 0,
63};
64
65/*tex
66
67    The array of strings is |string_pool|, the number of the current string being created is
68    |str_ptr|, the starting value of |str_ptr| is |init_str_ptr|, and the current string buffer,
    the current index in that buffer, the malloced size of |cur_string| and the occupied byte count
70    are kept in |cur_string|, |cur_length|, |cur_string_size| and |pool_size|.
71
72    Once a sequence of characters has been appended to |cur_string|, it officially becomes a string
73    when the function |make_string| is called. This function returns the identification number of
74    the new string as its value.
75
76    Strings end with a zero character which makes \TEX\ string also valid \CCODE\ strings. The
77    |string_temp*| fields deal with a temporary string (building).
78
79    The |ptr| is always one ahead. This is kind of a safeguard: an overflow happens already when we
80    still assemble a new string.
81
82*/
83
/*tex
    A fresh temporary string is allocated with |initial_temp_string_slots| bytes. The value
    |reserved_temp_string_slots| is handed to the array (re)allocators as their last argument;
    presumably these are extra slots on top of the requested count (to be confirmed in the
    allocator).
*/
# define initial_temp_string_slots  256
# define reserved_temp_string_slots   2
86
87static inline void tex_aux_increment_pool_string(int n)
88{
89    lmt_string_pool_state.string_body_data.allocated += n;
90    if (lmt_string_pool_state.string_body_data.allocated > lmt_string_pool_state.string_body_data.size) {
91        tex_overflow_error("poolbody", lmt_string_pool_state.string_body_data.allocated);
92    }
93}
94
95static inline void tex_aux_decrement_pool_string(int n)
96{
97    lmt_string_pool_state.string_body_data.allocated -= n;
98}
99
100static void tex_aux_flush_cur_string(void)
101{
102    if (lmt_string_pool_state.string_temp) {
103        aux_deallocate_array(lmt_string_pool_state.string_temp);
104    }
105    lmt_string_pool_state.string_temp = NULL;
106    lmt_string_pool_state.string_temp_top = 0;
107    lmt_string_pool_state.string_temp_allocated = 0;
108}
109
110void tex_reset_cur_string(void)
111{
112    unsigned char *tmp = aux_allocate_clear_array(sizeof(unsigned char), initial_temp_string_slots, reserved_temp_string_slots);
113    if (tmp) {
114        lmt_string_pool_state.string_temp = tmp;
115        lmt_string_pool_state.string_temp_top = 0;
116        lmt_string_pool_state.string_temp_allocated = initial_temp_string_slots;
117    } else {
118        tex_overflow_error("pool", initial_temp_string_slots);
119    }
120}
121
122static int tex_aux_room_in_string(int wsize)
123{
124    /* no callback here */
125    if (! lmt_string_pool_state.string_temp) {
126        tex_reset_cur_string();
127    }
128    if ((lmt_string_pool_state.string_temp_top + wsize) > lmt_string_pool_state.string_temp_allocated) {
129        unsigned char *tmp = NULL;
130        int size = lmt_string_pool_state.string_temp_allocated + lmt_string_pool_state.string_temp_allocated / 5 + STRING_EXTRA_AMOUNT;
131        if (size < wsize) {
132            size = wsize + STRING_EXTRA_AMOUNT;
133        }
134        tmp = aux_reallocate_array(lmt_string_pool_state.string_temp, sizeof(unsigned char), size, reserved_temp_string_slots);
135        if (tmp) {
136            lmt_string_pool_state.string_temp = tmp;
137            memset(tmp + lmt_string_pool_state.string_temp_top, 0, (size_t) size - lmt_string_pool_state.string_temp_top);
138        } else {
139            tex_overflow_error("pool", size);
140        }
141        lmt_string_pool_state.string_temp_allocated = size;
142    }
143    return 1;
144}
145
/*tex
    Passed as the last argument of the array allocators for the |string_pool| array and for the
    string bodies that get allocated when undumping.
*/
# define reserved_string_slots 1
147
148/*tex Messy: ptr and top have cs_offset_value included */
149
150void tex_initialize_string_mem(void)
151{
152    int size = lmt_string_pool_state.string_pool_data.minimum;
153    if (lmt_main_state.run_state == initializing_state) {
154        size = lmt_string_pool_state.string_pool_data.minimum;
155        lmt_string_pool_state.string_pool_data.ptr = cs_offset_value;
156    } else {
157        size = lmt_string_pool_state.string_pool_data.allocated;
158        lmt_string_pool_state.string_pool_data.initial = lmt_string_pool_state.string_pool_data.ptr;
159    }
160    if (size > 0) {
161        lstring *pool = aux_allocate_clear_array(sizeof(lstring), size, reserved_string_slots);
162        if (pool) {
163            lmt_string_pool_state.string_pool = pool;
164            lmt_string_pool_state.string_pool_data.allocated = size;
165        } else {
166            tex_overflow_error("pool", size);
167        }
168    }
169}
170
171void tex_initialize_string_pool(void)
172{
173    unsigned char *nullstring = lmt_memory_malloc(1);
174    int size = lmt_string_pool_state.string_pool_data.allocated;
175    if (size && nullstring) {
176        lmt_string_pool_state.string_pool[0].s = nullstring;
177        nullstring[0] = '\0';
178        lmt_string_pool_state.string_pool_data.ptr++;
179        tex_reset_cur_string();
180    } else {
181        tex_overflow_error("pool", size);
182    }
183}
184
185static int tex_aux_room_in_string_pool(int n)
186{
187    int top = lmt_string_pool_state.string_pool_data.ptr + n;
188    if (top > lmt_string_pool_state.string_pool_data.top) {
189        lmt_string_pool_state.string_pool_data.top = top;
190        top -=  cs_offset_value;
191        if (top > lmt_string_pool_state.string_pool_data.allocated) {
192            lstring *tmp = NULL;
193            top = lmt_string_pool_state.string_pool_data.allocated;
194            do {
195                top += lmt_string_pool_state.string_pool_data.step;
196                n -= lmt_string_pool_state.string_pool_data.step;
197            } while (n > 0);
198            if (top > lmt_string_pool_state.string_pool_data.size) {
199                top = lmt_string_pool_state.string_pool_data.size;
200            }
201            if (top > lmt_string_pool_state.string_pool_data.allocated) {
202                lmt_string_pool_state.string_pool_data.allocated = top;
203                tmp = aux_reallocate_array(lmt_string_pool_state.string_pool, sizeof(lstring), top, reserved_string_slots);
204                lmt_string_pool_state.string_pool = tmp;
205            }
206            lmt_run_memory_callback("pool", tmp ? 1 : 0);
207            if (! tmp) {
208                tex_overflow_error("pool", top);
209                return 0;
210            }
211        }
212    }
213    return 1;
214}
215
216/*tex
217
    Checking for the last one to be the same as the previous one doesn't save much: some 10K on a
    \CONTEXT\ format.
220
221*/
222
223strnumber tex_make_string(void)
224{
225    if (tex_aux_room_in_string(1)) {
226        int ptr = lmt_string_pool_state.string_pool_data.ptr;
227        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
228        str_string(ptr) = lmt_string_pool_state.string_temp;
229        str_length(ptr) = lmt_string_pool_state.string_temp_top;
230        tex_aux_increment_pool_string(lmt_string_pool_state.string_temp_top);
231        tex_reset_cur_string();
232        if (tex_aux_room_in_string_pool(1)) {
233            lmt_string_pool_state.string_pool_data.ptr++;
234        }
235        return ptr;
236    } else {
237        return get_nullstr();
238    }
239}
240
241strnumber tex_push_string(const unsigned char *s, int l)
242{
243    if (tex_aux_room_in_string_pool(1)) {
244        unsigned char *t = lmt_memory_malloc(sizeof(char) * ((size_t) l + 1));
245        if (t) {
246            int ptr = lmt_string_pool_state.string_pool_data.ptr;
247            memcpy(t, s, l);
248            t[l] = '\0';
249            str_string(ptr) = t;
250            str_length(ptr) = l;
251            lmt_string_pool_state.string_pool_data.ptr++;
252            tex_aux_increment_pool_string(l);
253            return ptr;
254        }
255    }
256    return get_nullstr();
257}
258
259char *tex_take_string(int *len)
260{
261    char* ptr = NULL;
262    if (tex_aux_room_in_string(1)) {
263        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
264        if (len) {
265            *len = lmt_string_pool_state.string_temp_top;
266        }
267        ptr = (char *) lmt_string_pool_state.string_temp;
268        tex_reset_cur_string();
269    }
270    return ptr;
271}
272
273/*tex
274
275    The following subroutine compares string |s| with another string of the same length that appears
276    in |buffer| starting at position |k|; the result is |true| if and only if the strings are equal.
277    Empirical tests indicate that |str_eq_buf| is used in such a way that it tends to return |true|
278    about 80 percent of the time.
279
    \starttyping
281    unsigned char *j = str_string(s);
282    unsigned char *l = j + str_length(s);
283    while (j < l) {
284        if (*j++ != buffer[k++])
285            return 0;
286    }
287    \stoptyping
288
289*/
290
291// int tex_str_eq_buf(strnumber s, int k, int n)
292// {
293//     if (s < cs_offset_value) {
294//         /* very unlikely */
295//         return buffer_to_unichar(k) == (unsigned int) s;
296//     } else {
297//         return memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0;
298//     }
299// }
300
301// int tex_str_eq_buf(strnumber s, int k, int n)
302// {
303//     if (s >= cs_offset_value) {
304//         return memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0;
305//     } else {
306//         /* very unlikely */
307//         return buffer_to_unichar(k) == (unsigned int) s;
308//     }
309// }
310
311int tex_str_eq_buf(strnumber s, int k, int n)
312{
313    return (s >= cs_offset_value) ? (memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0) : (buffer_to_unichar(k) == (unsigned int) s);
314}
315
316/*tex
317
318    Here is a similar routine, but it compares two strings in the string pool, and it does not
319    assume that they have the same length.
320
321    \starttyping
322    k = str_string(t);
323    j = str_string(s);
324    l = j + str_length(s);
325    while (j < l) {
326        if (*j++ != *k++)
327            return 0;
328    }
329    \stoptyping
330*/
331
332int tex_str_eq_str(strnumber s, strnumber t)
333{
334    if (s >= cs_offset_value) {
335        if (t >= cs_offset_value) {
336            /* s and t are strings, this is the most likely test */
337            return (str_length(s) == str_length(t)) && ! memcmp(str_string(s), str_string(t), str_length(s));
338        } else {
339            /* s is a string and t an unicode character, happens seldom */
340            return (strnumber) aux_str2uni(str_string(s)) == t;
341        }
342    } else if (t >= cs_offset_value) {
343        /* s is an unicode character and t is a string, happens seldom */
344        return (strnumber) aux_str2uni(str_string(t)) == s;
345    } else {
346        /* s and t are unicode characters */
347        return s == t;
348    }
349}
350
351/*tex A string compare helper: */
352
353int tex_str_eq_cstr(strnumber r, const char *s, size_t l)
354{
355    return (l == str_length(r)) && ! strncmp((const char *) (str_string(r)), s, l);
356}
357
358/*tex
359
    The initial values of |str_pool|, |str_start|, |pool_ptr|, and |str_ptr| are set in
    \INITEX\ mode. The first |string_offset| strings are single character strings matching Unicode.
    There is no point in generating all of these. But |str_ptr| has to be initialized properly,
    otherwise |print_char| cannot see the difference between characters and strings.

364
365*/
366
/*tex
    There are no predefined strings to set up: all that is done here is starting a fresh
    temporary string. The result is always 1.
*/
int tex_get_strings_started(void)
{
    tex_reset_cur_string();
    return 1;
}
372
373/*tex
374
375    The string recycling routines. \TEX\ uses 2 upto 4 {\em new} strings when scanning a filename
376    in an |\input|, |\openin|, or |\openout| operation. These strings are normally lost because the
377    reference to them are not saved after finishing the operation. |search_string| searches through
378    the string pool for the given string and returns either 0 or the found string number. However,
379    in \LUAMETATEX\ filenames (and fontnames) are implemented more efficiently so that code is gone.
380
381*/
382
383strnumber tex_maketexstring(const char *s)
384{
385    if (s && *s) {
386        return tex_maketexlstring(s, strlen(s));
387    } else {
388        return get_nullstr();
389    }
390}
391
392strnumber tex_maketexlstring(const char *s, size_t l)
393{
394    if (s && l > 0) {
395        int ptr = lmt_string_pool_state.string_pool_data.ptr;
396        size_t len = l + 1;
397        unsigned char *tmp = lmt_memory_malloc(len);
398        if (tmp) {
399            str_length(ptr) = l;
400            str_string(ptr) = tmp;
401            tex_aux_increment_pool_string((int) l);
402            memcpy(tmp, s, len);
403            if (tex_aux_room_in_string_pool(1)) {
404                lmt_string_pool_state.string_pool_data.ptr += 1;
405            }
406            return ptr;
407        } else {
408            tex_overflow_error("string pool", (int) len);
409        }
410    }
411    return get_nullstr();
412}
413
/*tex
    These two functions append bytes to the current \TEX\ string. There is no checking on what
    gets appended and as in \LUA\ zero bytes are okay. Unlike the other engines we don't provide
    |^^| escaping, which is already optional in \LUATEX.
*/
419
420void tex_append_string(const unsigned char *s, unsigned l)
421{
422    if (s && l > 0 && tex_aux_room_in_string(l)) {
423        memcpy(lmt_string_pool_state.string_temp + lmt_string_pool_state.string_temp_top, s, l);
424        lmt_string_pool_state.string_temp_top += l;
425    }
426}
427
428void tex_append_char(unsigned char c)
429{
430    if (tex_aux_room_in_string(1)) {
431        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top++] = (unsigned char) c;
432    }
433}
434
435char *tex_makeclstring(int s, size_t *len)
436{
437    if (s < cs_offset_value) {
438        *len = (size_t) utf8_size(s);
439        return (char *) aux_uni2str((unsigned) s);
440    } else {
441        size_t l = (size_t) str_length(s);
442        char *tmp = lmt_memory_malloc(l + 1);
443        if (tmp) {
444            memcpy(tmp, str_string(s), l);
445            tmp[l] = '\0';
446            *len = l;
447            return tmp;
448        } else {
449            tex_overflow_error("string pool", (int) l);
450            *len = 0;
451            return NULL;
452        }
453    }
454}
455
456/* 
457char *tex_makecstring(int s)
458{
459    if (s < cs_offset_value) {
460        return (char *) aux_uni2str((unsigned) s);
461    } else {
462        return lmt_memory_strdup((str_length(s) > 0) ? (const char *) str_string(s) : "");
463    }
464}
465*/
466
/*tex 
    I might eventually replace this because in quite some calls we know that we have a pointer
    in string space. We can kind of predict in what cases we are below |cs_offset_value| anyway. 
*/
472
473char *tex_makecstring(int s, int *allocated)
474{
475    *allocated = s < cs_offset_value;
476    if (*allocated) {
477        return (char *) aux_uni2str((unsigned) s);
478    } else {
479        return str_length(s) > 0 ? (char *) str_string(s) : "";
480    }
481}
482
483/*tex
484
485    We can save some 150 K on the format file size by using a signed char as length (after checking)
486    because the max size of a string in \CONTEXT\ is around 70. A flag could indicate if we use 1 or
487    4 bytes for the length. But not yet (preroll needed). Dumping and undumping all strings in a
488    block (where we need to zero terminate them) doesn't really work out any better. Okay, in the end
489    it was done.
490
491*/
492
493/*tex We use the real accessors here, not the macros that use |cs_offset_value|. */
494
495void tex_compact_string_pool(void)
496{
497    int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
498    int max_length = 0;
499    for (int j = 1; j < n_of_strings; j++) {
500        if (lmt_string_pool_state.string_pool[j].l > (unsigned int) max_length) {
501            max_length = (int) lmt_string_pool_state.string_pool[j].l;
502        }
503    }
504    lmt_string_pool_state.string_max_length = max_length;
505    tex_print_format("max string length %i, ", max_length);
506}
507
508/*tex 
509    For short strings we use 0xFF as signal for -1. When we reload a zero length string is 
510    also allocated. 
511*/
512
/*tex
    One byte lengths are used when the longest string is shorter than |max_short_string|. The
    value |no_short_string| is written instead of a length for a slot that has no string and
    is read back as $-1$.
*/
# define max_short_string 250
# define no_short_string  255
515
516void tex_dump_string_pool(dumpstream f)
517{
518    int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
519    int total_length = lmt_string_pool_state.string_body_data.allocated;
520    int max_length = lmt_string_pool_state.string_max_length;
521    dump_via_int(f, lmt_string_pool_state.string_pool_data.allocated);
522    dump_via_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
523    dump_via_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
524    dump_via_int(f, n_of_strings);
525    dump_via_int(f, max_length);
526    dump_via_int(f, total_length);
527    if (max_length > 0 && max_length < max_short_string) {
528        /*tex We only have short strings. */
529        for (int j = 0; j < n_of_strings; j++) {
530            unsigned char l = lmt_string_pool_state.string_pool[j].s ? (unsigned char) lmt_string_pool_state.string_pool[j].l : no_short_string;
531            dump_uchar(f, l);
532            if (l > 0) {
533                dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
534            }
535        }
536    } else {
537        /*tex We also have long strings. */
538        for (int j = 0; j < n_of_strings; j++) {
539            int l = lmt_string_pool_state.string_pool[j].s ? (int) lmt_string_pool_state.string_pool[j].l : -1;
540            dump_int(f, l);
541            if (l > 0) {
542                dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
543            }
544        }
545    }
546}
547
548void tex_undump_string_pool(dumpstream f)
549{
550    int n_of_strings;
551    int max_length;
552    int total_length;
553    undump_int(f, lmt_string_pool_state.string_pool_data.allocated);
554    undump_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
555    undump_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
556    undump_int(f, n_of_strings);
557    undump_int(f, max_length);
558    undump_int(f, total_length);
559    lmt_string_pool_state.string_max_length = max_length;
560    tex_initialize_string_mem();
561    {
562        int allocated = 0;
563        int compact = max_length > 0 && max_length < max_short_string;
564        for (int j = 0; j < n_of_strings; j++) {
565            int l;
566            if (compact) {
567                /*tex We only have short strings. */
568                unsigned char sl;
569                undump_uchar(f, sl);
570                l = (sl == no_short_string) ? -1 : sl;
571            } else {
572                /*tex We also have long strings. */
573                undump_int(f, l);
574            }
575            if (l >= 0) {
576                /* we can overflow reserved_string_slots */
577                int n = l + 1;
578                unsigned char *s = aux_allocate_clear_array(sizeof(unsigned char), n, reserved_string_slots);
579                if (s) {
580                    lmt_string_pool_state.string_pool[j].s = s;
581                    undump_things(f, s[0], l);
582                    s[l] = '\0';
583                    allocated += n;
584                } else {
585                    tex_overflow_error("string pool", n);
586                    l = 0;
587                }
588            } else {
589                l = 0;
590            }
591            lmt_string_pool_state.string_pool[j].l = l;
592        }
593        lmt_string_pool_state.string_body_data.allocated = allocated;
594        lmt_string_pool_state.string_body_data.initial = allocated;
595    }
596}
597
598/*tex To destroy an already made string, we say |flush_str|. */
599
600void tex_flush_str(strnumber s)
601{
602    if (s > cs_offset_value) {
603        /*tex Don't ever delete the null string! */
604        tex_aux_decrement_pool_string((int) str_length(s));
605        str_length(s) = 0;
606        lmt_memory_free(str_string(s));
607        str_string(s) = NULL;
608     // string_pool_state.string_pool_data.ptr--;
609    }
610    /* why a loop and not in previous branch */
611    while (! str_string((lmt_string_pool_state.string_pool_data.ptr - 1))) {
612        lmt_string_pool_state.string_pool_data.ptr--;
613    }
614}
615
616/*
617    In the old filename code we had the following, but I suspect some mem issue there (as we ran
618    into GB leaks for thousands of names):
619
620    u = save_cur_string();
621    get_x_token();
622    restore_cur_string(u);
623*/
624
625strnumber tex_save_cur_string(void)
626{
627    return (lmt_string_pool_state.string_temp_top > 0 ? tex_make_string() : 0);
628}
629
630void tex_restore_cur_string(strnumber u)
631{
632    if (u) {
633        /*tex Beware, we have no 0 termination here! */
634        int ul = (int) str_length(u);
635        tex_aux_flush_cur_string();
636        if (tex_aux_room_in_string(u)) {
637            memcpy(lmt_string_pool_state.string_temp, str_string(u), ul);
638            lmt_string_pool_state.string_temp_allocated = ul;
639            lmt_string_pool_state.string_temp_top = ul;
640            tex_flush_str(u);
641        }
642    }
643}
644