texstringpool.c /size: 21 Kb    last modification: 2024-01-16 10:22
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
9    Control sequence names and diagnostic messages are variable length strings of eight bit
10    characters. Since \PASCAL\ did not have a well-developed string mechanism, \TEX\ did all of its
11    string processing by homegrown methods.
12
13    Elaborate facilities for dynamic strings are not needed, so all of the necessary operations can
14    be handled with a simple data structure. The array |str_pool| contains all of the (eight-bit)
    bytes of all of the strings, and the array |str_start| contains indices of the starting points
16    of each string. Strings are referred to by integer numbers, so that string number |s| comprises
17    the characters |str_pool[j]| for |str_start_macro(s) <= j < str_start_macro (s + 1)|. Additional
18    integer variables |pool_ptr| and |str_ptr| indicate the number of entries used so far in
19    |str_pool| and |str_start|, respectively; locations |str_pool[pool_ptr]| and |str_start_macro
20    (str_ptr)| are ready for the next string to be allocated.
21
22    String numbers 0 to |biggest_char| are reserved for strings that correspond to single \UNICODE\
23    characters. This is in accordance with the conventions of \WEB\ which converts single-character
24    strings into the ASCII code number of the single character involved.
25
26    The stringpool variables are collected in:
27
28*/
29
string_pool_info lmt_string_pool_state = {
    /*tex The array of string entries; it gets allocated in |tex_initialize_string_mem|. */
    .string_pool           = NULL,
    /*tex Bookkeeping for the array of entries; here |ptr| and |top| include |cs_offset_value|. */
    .string_pool_data      = {
        .minimum   = min_pool_size,
        .maximum   = max_pool_size,
        .size      = siz_pool_size,
        .step      = stp_pool_size,
        .allocated = 0,
        .itemsize  = sizeof(lstring),
        .top       = 0,
        .ptr       = 0,
        .initial   = 0,
        .offset    = cs_offset_value,
    },
    /*tex Bookkeeping for the string bodies; |allocated| is a running byte total checked against |size|. */
    .string_body_data      = {
        .minimum   = min_body_size,
        .maximum   = max_body_size,
        .size      = siz_body_size,
        .step      = stp_body_size,
        .allocated = 0,
        .itemsize  = sizeof(unsigned char),
        .top       = memory_data_unset,
        .ptr       = memory_data_unset,
        .initial   = 0,
        .offset    = 0,
    },
    .reserved              = 0,
    /*tex The longest string length, as determined by |tex_compact_string_pool|. */
    .string_max_length     = 0,
    /*tex The temporary string under construction: its buffer, its capacity and its fill level. */
    .string_temp           = NULL,
    .string_temp_allocated = 0,
    .string_temp_top       = 0,
};
62
63/*tex
64
65    The array of strings is |string_pool|, the number of the current string being created is
66    |str_ptr|, the starting value of |str_ptr| is |init_str_ptr|, and the current string buffer,
    the current index in that buffer, the malloced size of |cur_string| and the occupied byte count
68    are kept in |cur_string|, |cur_length|, |cur_string_size| and |pool_size|.
69
70    Once a sequence of characters has been appended to |cur_string|, it officially becomes a string
71    when the function |make_string| is called. This function returns the identification number of
72    the new string as its value.
73
74    Strings end with a zero character which makes \TEX\ string also valid \CCODE\ strings. The
75    |string_temp*| fields deal with a temporary string (building).
76
77    The |ptr| is always one ahead. This is kind of a safeguard: an overflow happens already when we
78    still assemble a new string.
79
80*/
81
/*tex
    A fresh temporary string buffer gets |initial_temp_string_slots| bytes. Every (re)allocation of
    that buffer passes |reserved_temp_string_slots| as the extra (reserved) count to the allocator.
*/

# define initial_temp_string_slots  256
# define reserved_temp_string_slots   2
84
85inline static void tex_aux_increment_pool_string(int n)
86{
87    lmt_string_pool_state.string_body_data.allocated += n;
88    if (lmt_string_pool_state.string_body_data.allocated > lmt_string_pool_state.string_body_data.size) {
89        tex_overflow_error("poolbody", lmt_string_pool_state.string_body_data.allocated);
90    }
91}
92
93inline static void tex_aux_decrement_pool_string(int n)
94{
95    lmt_string_pool_state.string_body_data.allocated -= n;
96}
97
98static void tex_aux_flush_cur_string(void)
99{
100    if (lmt_string_pool_state.string_temp) {
101        aux_deallocate_array(lmt_string_pool_state.string_temp);
102    }
103    lmt_string_pool_state.string_temp = NULL;
104    lmt_string_pool_state.string_temp_top = 0;
105    lmt_string_pool_state.string_temp_allocated = 0;
106}
107
108void tex_reset_cur_string(void)
109{
110    unsigned char *tmp = aux_allocate_clear_array(sizeof(unsigned char), initial_temp_string_slots, reserved_temp_string_slots);
111    if (tmp) {
112        lmt_string_pool_state.string_temp = tmp;
113        lmt_string_pool_state.string_temp_top = 0;
114        lmt_string_pool_state.string_temp_allocated = initial_temp_string_slots;
115    } else {
116        tex_overflow_error("pool", initial_temp_string_slots);
117    }
118}
119
120static int tex_aux_room_in_string(int wsize)
121{
122    /* no callback here */
123    if (! lmt_string_pool_state.string_temp) {
124        tex_reset_cur_string();
125    }
126    if ((lmt_string_pool_state.string_temp_top + wsize) > lmt_string_pool_state.string_temp_allocated) {
127        unsigned char *tmp = NULL;
128        int size = lmt_string_pool_state.string_temp_allocated + lmt_string_pool_state.string_temp_allocated / 5 + STRING_EXTRA_AMOUNT;
129        if (size < wsize) {
130            size = wsize + STRING_EXTRA_AMOUNT;
131        }
132        tmp = aux_reallocate_array(lmt_string_pool_state.string_temp, sizeof(unsigned char), size, reserved_temp_string_slots);
133        if (tmp) {
134            lmt_string_pool_state.string_temp = tmp;
135            memset(tmp + lmt_string_pool_state.string_temp_top, 0, (size_t) size - lmt_string_pool_state.string_temp_top);
136        } else {
137            tex_overflow_error("pool", size);
138        }
139        lmt_string_pool_state.string_temp_allocated = size;
140    }
141    return 1;
142}
143
144# define reserved_string_slots 1
145
146/*tex Messy: ptr and top have cs_offset_value included */
147
148void tex_initialize_string_mem(void)
149{
150    int size = lmt_string_pool_state.string_pool_data.minimum;
151    if (lmt_main_state.run_state == initializing_state) {
152        size = lmt_string_pool_state.string_pool_data.minimum;
153        lmt_string_pool_state.string_pool_data.ptr = cs_offset_value;
154    } else {
155        size = lmt_string_pool_state.string_pool_data.allocated;
156        lmt_string_pool_state.string_pool_data.initial = lmt_string_pool_state.string_pool_data.ptr;
157    }
158    if (size > 0) {
159        lstring *pool = aux_allocate_clear_array(sizeof(lstring), size, reserved_string_slots);
160        if (pool) {
161            lmt_string_pool_state.string_pool = pool;
162            lmt_string_pool_state.string_pool_data.allocated = size;
163        } else {
164            tex_overflow_error("pool", size);
165        }
166    }
167}
168
169void tex_initialize_string_pool(void)
170{
171    unsigned char *nullstring = lmt_memory_malloc(1);
172    int size = lmt_string_pool_state.string_pool_data.allocated;
173    if (size && nullstring) {
174        lmt_string_pool_state.string_pool[0].s = nullstring;
175        nullstring[0] = '\0';
176        lmt_string_pool_state.string_pool_data.ptr += 1;
177        tex_reset_cur_string();
178    } else {
179        tex_overflow_error("pool", size);
180    }
181}
182
183static int tex_aux_room_in_string_pool(int n)
184{
185    int top = lmt_string_pool_state.string_pool_data.ptr + n;
186    if (top > lmt_string_pool_state.string_pool_data.top) {
187        lmt_string_pool_state.string_pool_data.top = top;
188        top -=  cs_offset_value;
189        if (top > lmt_string_pool_state.string_pool_data.allocated) {
190            lstring *tmp = NULL;
191            top = lmt_string_pool_state.string_pool_data.allocated;
192            do {
193                top += lmt_string_pool_state.string_pool_data.step;
194                n -= lmt_string_pool_state.string_pool_data.step;
195            } while (n > 0);
196            if (top > lmt_string_pool_state.string_pool_data.size) {
197                top = lmt_string_pool_state.string_pool_data.size;
198            }
199            if (top > lmt_string_pool_state.string_pool_data.allocated) {
200                lmt_string_pool_state.string_pool_data.allocated = top;
201                tmp = aux_reallocate_array(lmt_string_pool_state.string_pool, sizeof(lstring), top, reserved_string_slots);
202                lmt_string_pool_state.string_pool = tmp;
203            }
204            lmt_run_memory_callback("pool", tmp ? 1 : 0);
205            if (! tmp) {
206                tex_overflow_error("pool", top);
207                return 0;
208            }
209        }
210    }
211    return 1;
212}
213
214/*tex
215
216    Checking for the last one to be the same as the previous one doesn't save much some 10K on a
217    \CONTEXT\ format.
218
219*/
220
221strnumber tex_make_string(void)
222{
223    if (tex_aux_room_in_string(1)) {
224        int ptr = lmt_string_pool_state.string_pool_data.ptr;
225        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
226        str_string(ptr) = lmt_string_pool_state.string_temp;
227        str_length(ptr) = lmt_string_pool_state.string_temp_top;
228        tex_aux_increment_pool_string(lmt_string_pool_state.string_temp_top);
229        tex_reset_cur_string();
230        if (tex_aux_room_in_string_pool(1)) {
231            lmt_string_pool_state.string_pool_data.ptr++;
232        }
233        return ptr;
234    } else {
235        return get_nullstr();
236    }
237}
238
239strnumber tex_push_string(const unsigned char *s, int l)
240{
241    if (tex_aux_room_in_string_pool(1)) {
242        unsigned char *t = lmt_memory_malloc(sizeof(char) * ((size_t) l + 1));
243        if (t) {
244            int ptr = lmt_string_pool_state.string_pool_data.ptr;
245            memcpy(t, s, l);
246            t[l] = '\0';
247            str_string(ptr) = t;
248            str_length(ptr) = l;
249            lmt_string_pool_state.string_pool_data.ptr++;
250            tex_aux_increment_pool_string(l);
251            return ptr;
252        }
253    }
254    return get_nullstr();
255}
256
257char *tex_take_string(int *len)
258{
259    char* ptr = NULL;
260    if (tex_aux_room_in_string(1)) {
261        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top] = '\0';
262        if (len) {
263            *len = lmt_string_pool_state.string_temp_top;
264        }
265        ptr = (char *) lmt_string_pool_state.string_temp;
266        tex_reset_cur_string();
267    }
268    return ptr;
269}
270
271/*tex
272
273    The following subroutine compares string |s| with another string of the same length that appears
274    in |buffer| starting at position |k|; the result is |true| if and only if the strings are equal.
275    Empirical tests indicate that |str_eq_buf| is used in such a way that it tends to return |true|
276    about 80 percent of the time.
277
    \starttyping
279    unsigned char *j = str_string(s);
280    unsigned char *l = j + str_length(s);
281    while (j < l) {
282        if (*j++ != buffer[k++])
283            return 0;
284    }
285    \stoptyping
286
287*/
288
289// int tex_str_eq_buf(strnumber s, int k, int n)
290// {
291//     if (s < cs_offset_value) {
292//         /* very unlikely */
293//         return buffer_to_unichar(k) == (unsigned int) s;
294//     } else {
295//         return memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0;
296//     }
297// }
298
299// int tex_str_eq_buf(strnumber s, int k, int n)
300// {
301//     if (s >= cs_offset_value) {
302//         return memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0;
303//     } else {
304//         /* very unlikely */
305//         return buffer_to_unichar(k) == (unsigned int) s;
306//     }
307// }
308
309int tex_str_eq_buf(strnumber s, int k, int n)
310{
311    return (s >= cs_offset_value) ? (memcmp(str_string(s), &lmt_fileio_state.io_buffer[k], n) == 0) : (buffer_to_unichar(k) == (unsigned int) s);
312}
313
314/*tex
315
316    Here is a similar routine, but it compares two strings in the string pool, and it does not
317    assume that they have the same length.
318
319    \starttyping
320    k = str_string(t);
321    j = str_string(s);
322    l = j + str_length(s);
323    while (j < l) {
324        if (*j++ != *k++)
325            return 0;
326    }
327    \stoptyping
328*/
329
330int tex_str_eq_str(strnumber s, strnumber t)
331{
332    if (s >= cs_offset_value) {
333        if (t >= cs_offset_value) {
334            /* s and t are strings, this is the most likely test */
335            return (str_length(s) == str_length(t)) && ! memcmp(str_string(s), str_string(t), str_length(s));
336        } else {
337            /* s is a string and t an unicode character, happens seldom */
338            return (strnumber) aux_str2uni(str_string(s)) == t;
339        }
340    } else if (t >= cs_offset_value) {
341        /* s is an unicode character and t is a string, happens seldom */
342        return (strnumber) aux_str2uni(str_string(t)) == s;
343    } else {
344        /* s and t are unicode characters */
345        return s == t;
346    }
347}
348
349/*tex A string compare helper: */
350
351int tex_str_eq_cstr(strnumber r, const char *s, size_t l)
352{
353    return (l == str_length(r)) && ! strncmp((const char *) (str_string(r)), s, l);
354}
355
356/*tex
357
    The initial values of |str_pool|, |str_start|, |pool_ptr|, and |str_ptr| are set in \INITEX\
    mode. The first |string_offset| strings are single character strings matching Unicode. There
    is no point in generating all of these. But |str_ptr| has to be initialized properly, otherwise
    |print_char| cannot see the difference between characters and strings.
362
363*/
364
/*tex Install a fresh temporary string buffer; the result is always 1. */

int tex_get_strings_started(void)
{
    const int started = 1;
    tex_reset_cur_string();
    return started;
}
370
371/*tex
372
373    The string recycling routines. \TEX\ uses 2 upto 4 {\em new} strings when scanning a filename
374    in an |\input|, |\openin|, or |\openout| operation. These strings are normally lost because the
375    reference to them are not saved after finishing the operation. |search_string| searches through
376    the string pool for the given string and returns either 0 or the found string number. However,
377    in \LUAMETATEX\ filenames (and fontnames) are implemented more efficiently so that code is gone.
378
379*/
380
381strnumber tex_maketexstring(const char *s)
382{
383    if (s && *s) {
384        return tex_maketexlstring(s, strlen(s));
385    } else {
386        return get_nullstr();
387    }
388}
389
390strnumber tex_maketexlstring(const char *s, size_t l)
391{
392    if (s && l > 0) {
393        int ptr = lmt_string_pool_state.string_pool_data.ptr;
394        size_t len = l + 1;
395        unsigned char *tmp = lmt_memory_malloc(len);
396        if (tmp) {
397            str_length(ptr) = l;
398            str_string(ptr) = tmp;
399            tex_aux_increment_pool_string((int) l);
400            memcpy(tmp, s, len);
401            if (tex_aux_room_in_string_pool(1)) {
402                lmt_string_pool_state.string_pool_data.ptr += 1;
403            }
404            return ptr;
405        } else {
406            tex_overflow_error("string pool", (int) len);
407        }
408    }
409    return get_nullstr();
410}
411
412/*tex
    These two functions append bytes to the current \TEX\ string. There is no checking on what
    gets appended and as in \LUA\ zero bytes are okay. Unlike the other engines we don't provide
    |^^| escaping, which is already optional in \LUATEX.
416*/
417
418void tex_append_string(const unsigned char *s, unsigned l)
419{
420    if (s && l > 0 && tex_aux_room_in_string(l)) {
421        memcpy(lmt_string_pool_state.string_temp + lmt_string_pool_state.string_temp_top, s, l);
422        lmt_string_pool_state.string_temp_top += l;
423    }
424}
425
426void tex_append_char(unsigned char c)
427{
428    if (tex_aux_room_in_string(1)) {
429        lmt_string_pool_state.string_temp[lmt_string_pool_state.string_temp_top++] = (unsigned char) c;
430    }
431}
432
433char *tex_makeclstring(int s, size_t *len)
434{
435    if (s < cs_offset_value) {
436        *len = (size_t) utf8_size(s);
437        return (char *) aux_uni2str((unsigned) s);
438    } else {
439        size_t l = (size_t) str_length(s);
440        char *tmp = lmt_memory_malloc(l + 1);
441        if (tmp) {
442            memcpy(tmp, str_string(s), l);
443            tmp[l] = '\0';
444            *len = l;
445            return tmp;
446        } else {
447            tex_overflow_error("string pool", (int) l);
448            *len = 0;
449            return NULL;
450        }
451    }
452}
453
454/* 
455char *tex_makecstring(int s)
456{
457    if (s < cs_offset_value) {
458        return (char *) aux_uni2str((unsigned) s);
459    } else {
460        return lmt_memory_strdup((str_length(s) > 0) ? (const char *) str_string(s) : "");
461    }
462}
463*/
464
465/*tex 
    I might eventually replace this because in quite some calls we know that we have a pointer
    in string space. We can kind of predict in what cases we are below |cs_offset_value|
    anyway. 
469*/
470
471char *tex_makecstring(int s, int *allocated)
472{
473    *allocated = s < cs_offset_value;
474    if (*allocated) {
475        return (char *) aux_uni2str((unsigned) s);
476    } else {
477        return str_length(s) > 0 ? (char *) str_string(s) : "";
478    }
479}
480
481/*tex
482
483    We can save some 150 K on the format file size by using a signed char as length (after checking)
484    because the max size of a string in \CONTEXT\ is around 70. A flag could indicate if we use 1 or
485    4 bytes for the length. But not yet (preroll needed). Dumping and undumping all strings in a
486    block (where we need to zero terminate them) doesn't really work out any better. Okay, in the end
487    it was done.
488
489*/
490
491/*tex We use the real accessors here, not the macros that use |cs_offset_value|. */
492
493void tex_compact_string_pool(void)
494{
495    int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
496    int max_length = 0;
497    for (int j = 1; j < n_of_strings; j++) {
498        if (lmt_string_pool_state.string_pool[j].l > (unsigned int) max_length) {
499            max_length = (int) lmt_string_pool_state.string_pool[j].l;
500        }
501    }
502    lmt_string_pool_state.string_max_length = max_length;
503    tex_print_format("max string length %i, ", max_length);
504}
505
506void tex_dump_string_pool(dumpstream f)
507{
508    int n_of_strings = lmt_string_pool_state.string_pool_data.ptr - cs_offset_value;
509    int total_length = lmt_string_pool_state.string_body_data.allocated;
510    int max_length = lmt_string_pool_state.string_max_length;
511    dump_via_int(f, lmt_string_pool_state.string_pool_data.allocated);
512    dump_via_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
513    dump_via_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
514    dump_via_int(f, n_of_strings);
515    dump_via_int(f, max_length);
516    dump_via_int(f, total_length);
517    if (max_length > 0 && max_length < 126) {
518        /*tex We only have short strings. */
519        for (int j = 0; j < n_of_strings; j++) {
520            int l = (int) lmt_string_pool_state.string_pool[j].l;
521            char c;
522            if (! lmt_string_pool_state.string_pool[j].s) {
523                l = -1;
524            }
525            c = (char) l;
526            dump_things(f, c, 1);
527            if (l > 0) {
528                dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
529            }
530        }
531    } else {
532        /*tex We also have long strings. */
533        for (int j = 0; j < n_of_strings; j++) {
534            int l = (int) lmt_string_pool_state.string_pool[j].l;
535            if (! lmt_string_pool_state.string_pool[j].s) {
536                l = -1;
537            }
538            dump_int(f, l);
539            if (l > 0) {
540                dump_things(f, *lmt_string_pool_state.string_pool[j].s, l);
541            }
542        }
543    }
544}
545
546void tex_undump_string_pool(dumpstream f)
547{
548    int n_of_strings;
549    int max_length;
550    int total_length;
551    undump_int(f, lmt_string_pool_state.string_pool_data.allocated);
552    undump_int(f, lmt_string_pool_state.string_pool_data.top); /* includes cs_offset_value */
553    undump_int(f, lmt_string_pool_state.string_pool_data.ptr); /* includes cs_offset_value */
554    undump_int(f, n_of_strings);
555    undump_int(f, max_length);
556    undump_int(f, total_length);
557    lmt_string_pool_state.string_max_length = max_length;
558    tex_initialize_string_mem();
559    {
560        int a = 0;
561        int compact = max_length > 0 && max_length < 126;
562        for (int j = 0; j < n_of_strings; j++) {
563            int x;
564            if (compact) {
565                /*tex We only have short strings. */
566                char c;
567                undump_things(f, c, 1);
568                x = c;
569            } else {
570                /*tex We also have long strings. */
571                undump_int(f, x);
572            }
573            if (x >= 0) {
574                /* we can overflow reserved_string_slots */
575                int n = x + 1;
576                unsigned char *s = aux_allocate_clear_array(sizeof(unsigned char), n, reserved_string_slots);
577                if (s) {
578                    lmt_string_pool_state.string_pool[j].s = s;
579                    undump_things(f, s[0], x);
580                    s[x] = '\0';
581                    a += n;
582                } else {
583                    tex_overflow_error("string pool", n);
584                    x = 0;
585                }
586            } else {
587                x = 0;
588            }
589            lmt_string_pool_state.string_pool[j].l = x;
590        }
591        lmt_string_pool_state.string_body_data.allocated = a;
592        lmt_string_pool_state.string_body_data.initial = a;
593    }
594}
595
596/*tex To destroy an already made string, we say |flush_str|. */
597
598void tex_flush_str(strnumber s)
599{
600    if (s > cs_offset_value) {
601        /*tex Don't ever delete the null string! */
602        tex_aux_decrement_pool_string((int) str_length(s));
603        str_length(s) = 0;
604        lmt_memory_free(str_string(s));
605        str_string(s) = NULL;
606     // string_pool_state.string_pool_data.ptr--;
607    }
608    /* why a loop and not in previous branch */
609    while (! str_string((lmt_string_pool_state.string_pool_data.ptr - 1))) {
610        lmt_string_pool_state.string_pool_data.ptr--;
611    }
612}
613
614/*
615    In the old filename code we had the following, but I suspect some mem issue there (as we ran
616    into GB leaks for thousands of names):
617
618    u = save_cur_string();
619    get_x_token();
620    restore_cur_string(u);
621*/
622
623strnumber tex_save_cur_string(void)
624{
625    return (lmt_string_pool_state.string_temp_top > 0 ? tex_make_string() : 0);
626}
627
628void tex_restore_cur_string(strnumber u)
629{
630    if (u) {
631        /*tex Beware, we have no 0 termination here! */
632        int ul = (int) str_length(u);
633        tex_aux_flush_cur_string();
634        if (tex_aux_room_in_string(u)) {
635            memcpy(lmt_string_pool_state.string_temp, str_string(u), ul);
636            lmt_string_pool_state.string_temp_allocated = ul;
637            lmt_string_pool_state.string_temp_top = ul;
638            tex_flush_str(u);
639        }
640    }
641}
642