texprimitive.c /size: 39 Kb    last modification: 2025-02-21 11:03
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
9    Control sequences are stored and retrieved by means of a fairly standard hash table algorithm
10    called the method of \quote {coalescing lists} (cf.\ Algorithm 6.4C in {\em The Art of
11    Computer Programming}). Once a control sequence enters the table, it is never removed, because
12    there are complicated situations involving |\gdef| where the removal of a control sequence at
13    the end of a group would be a mistake preventable only by the introduction of a complicated
14    reference-count mechanism.
15
16    The actual sequence of letters forming a control sequence identifier is stored in the |str_pool|
17    array together with all the other strings. An auxiliary array |hash| consists of items with two
18    halfword fields per word. The first of these, called |next(p)|, points to the next identifier
19    belonging to the same coalesced list as the identifier corresponding to~|p|; and the other,
20    called |text(p)|, points to the |str_start| entry for |p|'s identifier. If position~|p| of the
21    hash table is empty, we have |text(p)=0|; if position |p| is either empty or the end of a
22    coalesced hash list, we have |next(p) = 0|. An auxiliary pointer variable called |hash_used| is
23    maintained in such a way that all locations |p >= hash_used| are nonempty. The global variable
24    |cs_count| tells how many multiletter control sequences have been defined, if statistics are
25    being kept.
26
27    A boolean variable called |no_new_control_sequence| is set to |true| during the time that new
28    hash table entries are forbidden.
29
30    The other variables in the following state structure are: the hash table: |hash|, the allocation
31    pointer |hash_used| for |hash|, |hash_extra| above |eqtb_size|, the maximum of the hash array
32    |hash_top|, the pointer to the next high hash location |hash_high|, the mentioned flag that says
33    if new identifiers are legal |no_new_control_sequence| and the total number of known identifiers:
34    |cs_count|.
35
36*/
37
38hash_state_info lmt_hash_state = {
39    .hash      = NULL,
40    .hash_data = {
41        .minimum   = min_hash_size,
42        .maximum   = max_hash_size,
43        .size      = siz_hash_size,
44        .step      = stp_hash_size,
45        .allocated = 0,
46        .itemsize  = sizeof(memoryword) + sizeof(memoryword),
47        .top       = 0,
48        .ptr       = 0,
49        .initial   = 0,
50        .offset    = 0, // eqtb_size,
51        .extra     = 0, 
52    },
53    .eqtb_data = {
54        .minimum   = min_hash_size,
55        .maximum   = max_hash_size,
56        .size      = siz_hash_size,
57        .step      = stp_hash_size,
58        .allocated = memory_data_unset,
59        .itemsize  = memory_data_unset,
60        .top       = frozen_control_sequence,
61        .ptr       = 0,
62        .initial   = 0,
63        .offset    = 0,
64        .extra     = 0, 
65    },
66    .eqtb        = NULL,
67    .no_new_cs   = 1,
68    .padding     = 0,
69    .destructors = { eq_none}, 
70};
71
72/*tex
73
74    The arrays |prim| and |prim_eqtb| are used for |name -> cmd, chr| lookups. The are modelled
75    after |hash| and |eqtb|, except that primitives do not have an |eq_level|, that field is
76    replaced by |origin|. Furthermore we have a link for coalesced lists: |prim_next (a)|; the
77    string number for control sequence name: |prim_text (a)|; test if all positions are occupied:
78    |prim_is_full|; some fields: |prim_origin_field (a)|, |prim_eq_type_field (a)| and
79    |prim_equiv_field(a)|; the level of definition: |prim_origin (a)|; the command code for
80    equivalent: |prim_eq_type(a)|; the equivalent value: |prim_equiv(a)|; the allocation pointer
81    for |prim|: |prim_used|; the primitives tables: |two_halves prim [(prim_size + 1)]| and
82    |memoryword prim_eqtb [(prim_size + 1)]|. The array |prim_data| works the other way around, it
83    is used for |cmd, chr| to name lookups.
84
85*/
86
87primitive_state_info lmt_primitive_state;
88
89/*tex Test if all positions are occupied: */
90
91# define prim_base           1
92# define reserved_hash_slots 1
93
94/*tex Initialize the memory arrays: */
95
96void tex_initialize_primitives(void)
97{
98    memset(lmt_primitive_state.prim_data, 0, sizeof(prim_info)  * (last_cmd  + 1));
99    memset(lmt_primitive_state.prim,      0, sizeof(memoryword) * (prim_size + 1));
100    memset(lmt_primitive_state.prim_eqtb, 0, sizeof(memoryword) * (prim_size + 1));
101    for (int k = 0; k <= prim_size; k++) {
102        prim_eq_type(k) = undefined_cs_cmd;
103    }
104    lmt_primitive_state.prim_used = prim_size;
105}
106
107void tex_initialize_hash_mem(void)
108{
109    if (lmt_main_state.run_state == initializing_state) {
110        if (lmt_hash_state.hash_data.minimum == 0) {
111            tex_emergency_message("startup error", "you need at least some hash size");
112        } else {
113            lmt_hash_state.hash_data.allocated = lmt_hash_state.hash_data.minimum;
114            lmt_hash_state.hash_data.top = eqtb_size + lmt_hash_state.hash_data.minimum;
115        }
116    }
117    {
118        int size = lmt_hash_state.hash_data.top + 1;
119        memoryword *hash = aux_allocate_clear_array(sizeof(memoryword), size, reserved_hash_slots);
120        memoryword *eqtb = aux_allocate_clear_array(sizeof(memoryword), size, reserved_hash_slots);
121        if (hash && eqtb) {
122            lmt_hash_state.hash = hash;
123            lmt_hash_state.eqtb = eqtb;
124            if (lmt_main_state.run_state == initializing_state) {
125                /*tex Initialization happens elsewhere. */
126            } else {
127                tex_initialize_undefined_cs();
128                for (int i = eqtb_size + 1; i <= lmt_hash_state.hash_data.top; i++) {
129                    copy_eqtb_entry(i, undefined_control_sequence);
130                }
131            }
132        } else {
133            tex_overflow_error("hash", size);
134        }
135    }
136}
137
138static int tex_aux_room_in_hash(void)
139{
140    if (lmt_hash_state.hash_data.allocated + lmt_hash_state.hash_data.step <= lmt_hash_state.hash_data.size) {
141        int size = lmt_hash_state.hash_data.top + lmt_hash_state.hash_data.step + 1;
142        memoryword *hash = aux_reallocate_array(lmt_hash_state.hash, sizeof(memoryword), size, reserved_hash_slots);
143        memoryword *eqtb = aux_reallocate_array(lmt_hash_state.eqtb, sizeof(memoryword), size, reserved_hash_slots);
144        if (hash && eqtb) {
145            memset(hash + lmt_hash_state.hash_data.top + 1, 0, sizeof(memoryword) * (size_t) lmt_hash_state.hash_data.step);
146            memset(eqtb + lmt_hash_state.hash_data.top + 1, 0, sizeof(memoryword) * (size_t) lmt_hash_state.hash_data.step);
147            lmt_hash_state.hash = hash;
148            lmt_hash_state.eqtb = eqtb;
149            /*tex
150                This is not really needed because we now do this when a new id is created which
151                is a better place anyway. But we play safe and still do it:
152            */
153            for (int i = lmt_hash_state.hash_data.top + 1; i <= size; i++) {
154                copy_eqtb_entry(i, undefined_control_sequence);
155            }
156            lmt_hash_state.hash_data.allocated += lmt_hash_state.hash_data.step;
157            lmt_hash_state.hash_data.top += lmt_hash_state.hash_data.step;
158            lmt_run_memory_callback("hash", 1);
159            return 1;
160        } else {
161            lmt_run_memory_callback("hash", 0);
162            tex_overflow_error("hash", size);
163        }
164    }
165    return 0;
166}
167
168/*tex
169
170    The value of |hash_prime| should be roughly 85\%! of |hash_size|, and it should be a prime
171    number. The theory of hashing tells us to expect fewer than two table probes, on the average,
172    when the search is successful. [See J.~S. Vitter, {\sl Journal of the ACM\/ \bf30} (1983),
173    231--258.]
174
175    https://en.wikipedia.org/wiki/Coalesced_hashing
176    https://programming.guide/coalesced-hashing.html
177
178    Because we seldom use uppercase we get many misses, multiplying a chr j[k] by k actually gives
179    a better spread.
180
181    Making a \CONTEXT\ format takes some 250.000 hash calculations while the \LUAMETATEX\ manual 
182    needs some 1.7 million for just over 250 pages (with an average string length of 15).
183
184    The primitive hash lookups are needed when we initialize and when we lookup an internal
185    variable.
186
187*/
188
189static inline halfword tex_aux_compute_hash(const char *j, unsigned l)
190{
191    halfword h = (unsigned const char) j[0];
192    for (unsigned k = 1; k < l; k++) {
193        h = (h + h + (unsigned const char) j[k]) % hash_prime;
194    }
195    return h;
196}
197
198static inline halfword tex_aux_compute_prim(const char *j, unsigned l)
199{
200    halfword h = (unsigned const char) j[0];
201    for (unsigned k = 1; k < l; k++) {
202        h = (h + h + (unsigned const char) j[k]) % prim_prime;
203    }
204    return h;
205}
206
207halfword tex_prim_lookup(strnumber s)
208{
209    /*tex The index in the |hash| array: */
210    if (s >= cs_offset_value) {
211        unsigned char *j = str_string(s);
212     // unsigned l = (unsigned) str_length(s);
213        halfword l = (halfword) str_length(s);
214        halfword h = tex_aux_compute_prim((char *) j, l);
215        /*tex We start searching here; note that |0 <= h < hash_prime|. */
216        halfword p = h + 1;
217        while (1) {
218         /* When using |halfword text = prim_text(p)| no intellisense warning for first test in: */
219            if ((prim_text(p) > 0) && (str_length(prim_text(p)) == (size_t) l) && tex_str_eq_str(prim_text(p), s)) {
220                return p;
221            } else if (prim_next(p)) {
222                p = prim_next(p);
223            } else if (lmt_hash_state.no_new_cs) {
224                return undefined_primitive;
225            } else {
226                /*tex Insert a new primitive after |p|, then make |p| point to it. */
227                if (prim_text(p) > 0) {
228                    /*tex Search for an empty location in |prim| */
229                    do {
230                        if (lmt_primitive_state.prim_used > prim_base) {
231                            --lmt_primitive_state.prim_used;
232                        } else {
233                            tex_overflow_error("primitive size", prim_size);
234                        }
235                    } while (prim_text(lmt_primitive_state.prim_used));
236                    prim_next(p) = lmt_primitive_state.prim_used;
237                    p = lmt_primitive_state.prim_used;
238                }
239                prim_text(p) = s;
240                break;
241            }
242        }
243        return p;
244    } else if ((s < 0) || (s == undefined_control_sequence)) {
245        return undefined_primitive;
246    } else {
247        return s;
248    }
249}
250
251/*tex How to test a csname for primitive-ness? */
252
253/*
254int tex_cs_is_primitive(strnumber csname)
255{
256    int m = prim_lookup(csname);
257    if (m != undefined_primitive) {
258        char *ss = makecstring(csname);
259        int n = string_locate(ss, str_length(csname), 0);
260        lmt_memory_free(ss);
261        return ((n != undefined_cs_cmd) && (eq_type(n) == prim_eq_type(m)) && (eq_value(n) == prim_equiv(m)));
262    } else {
263        return 0;
264    }
265}
266*/
267
268/*tex Dumping and undumping. */
269
270/* We cheat! It should be dump_things(f, prim_state.prim[p], 1); */
271
272void tex_dump_primitives(dumpstream f)
273{
274    /*
275    for (int p = 0; p <= prim_size; p++) {
276        dump_mem(f, prim_state.prim[p]);
277    }
278    for (int p = 0; p <= prim_size; p++) {
279        dump_mem(f, prim_state.prim_eqtb[p]);
280    }
281    */
282    dump_things(f, lmt_primitive_state.prim[0], prim_size + 1);
283    dump_things(f, lmt_primitive_state.prim_eqtb[0], prim_size + 1);
284    for (int p = 0; p <= last_cmd; p++) {
285        dump_int(f, lmt_primitive_state.prim_data[p].offset);
286        dump_int(f, lmt_primitive_state.prim_data[p].subids);
287        for (int q = 0; q < lmt_primitive_state.prim_data[p].subids; q++) {
288            dump_int(f, lmt_primitive_state.prim_data[p].names[q]);
289        }
290    }
291}
292
293void tex_undump_primitives(dumpstream f)
294{
295    undump_things(f, lmt_primitive_state.prim[0], prim_size + 1);
296    undump_things(f, lmt_primitive_state.prim_eqtb[0], prim_size + 1);
297    for (int p = 0; p <= last_cmd; p++) {
298        undump_int(f, lmt_primitive_state.prim_data[p].offset);
299        undump_int(f, lmt_primitive_state.prim_data[p].subids);
300        if (lmt_primitive_state.prim_data[p].subids > 0) {
301            int size = lmt_primitive_state.prim_data[p].subids;
302            strnumber *names = aux_allocate_clear_array(sizeof(strnumber *), size, 1);
303            if (names) {
304                lmt_primitive_state.prim_data[p].names = names;
305                for (int q = 0; q < lmt_primitive_state.prim_data[p].subids; q++) {
306                    undump_int(f, names[q]);
307                }
308            } else {
309                tex_overflow_error("primitives", size * sizeof(strnumber *));
310            }
311        }
312    }
313}
314
315/*tex
316
317    Dump the hash table, A different scheme is used to compress the hash table, since its lower
318    region is usually sparse. When |text (p) <> 0| for |p <= hash_used|, we output two words,
319    |p| and |hash[p]|. The hash table is, of course, densely packed for |p >= hash_used|, so the
320    remaining entries are output in a~block.
321
322*/
323
324void tex_dump_hashtable(dumpstream f)
325{
326    dump_int(f, lmt_hash_state.eqtb_data.top);
327    lmt_hash_state.eqtb_data.ptr = frozen_control_sequence - 1 - lmt_hash_state.eqtb_data.top + lmt_hash_state.hash_data.ptr;
328    /*tex The root entries, i.e. the direct hash slots, these are sparse. */
329    for (halfword p = hash_base; p <= lmt_hash_state.eqtb_data.top; p++) {
330        if (cs_text(p)) {
331            dump_int(f, p);
332            dump_mem(f, lmt_hash_state.hash[p]);
333            ++lmt_hash_state.eqtb_data.ptr;
334        }
335    }
336    /*tex The chain entries, these are not sparse. */
337    dump_things(f, lmt_hash_state.hash[lmt_hash_state.eqtb_data.top + 1], special_sequence_base - lmt_hash_state.eqtb_data.top);
338    if (lmt_hash_state.hash_data.ptr > 0) {
339        dump_things(f, lmt_hash_state.hash[eqtb_size + 1], lmt_hash_state.hash_data.ptr);
340    }
341    dump_int(f, lmt_hash_state.eqtb_data.ptr);
342}
343
344void tex_undump_hashtable(dumpstream f)
345{
346    undump_int(f, lmt_hash_state.eqtb_data.top);
347    if (lmt_hash_state.eqtb_data.top >= hash_base && lmt_hash_state.eqtb_data.top <= frozen_control_sequence) {
348        halfword p = hash_base - 1;
349        do {
350            halfword q;
351            undump_int(f, q);
352            if (q >= (p + 1) && q <= lmt_hash_state.eqtb_data.top) {
353                undump_mem(f, lmt_hash_state.hash[q]);
354                p = q;
355            } else {
356                goto BAD;
357            }
358        } while (p != lmt_hash_state.eqtb_data.top);
359        undump_things(f, lmt_hash_state.hash[lmt_hash_state.eqtb_data.top + 1], special_sequence_base - lmt_hash_state.eqtb_data.top);
360        if (lmt_hash_state.hash_data.ptr > 0) {
361            /* we get a warning on possible overrun here */
362            undump_things(f, lmt_hash_state.hash[eqtb_size + 1], lmt_hash_state.hash_data.ptr);
363        }
364        undump_int(f, lmt_hash_state.eqtb_data.ptr);
365        lmt_hash_state.eqtb_data.initial = lmt_hash_state.eqtb_data.ptr;
366        return;
367    }
368  BAD:
369    tex_fatal_undump_error("hash");
370}
371
372/*tex
373
374    We need to put \TEX's \quote {primitive} control sequences into the hash table, together with
375    their command code (which will be the |eq_type|) and an operand (which will be the |equiv|).
376    The |primitive| procedure does this, in a way that no \TEX\ user can. The global value |cur_val|
377    contains the new |eqtb| pointer after |primitive| has acted.
378
379    Because the definitions of the actual user-accessible name of a primitive can be postponed until
380    runtime, the function |primitive_def| is needed that does nothing except creating the control
381    sequence name.
382
383*/
384
385void tex_primitive_def(const char *str, size_t length, singleword cmd, halfword chr)
386{
387    /*tex This creates the |text()| string: */
388    cur_val = tex_string_locate(str, length, 1);
389    set_eq_level(cur_val, level_one);
390    set_eq_type(cur_val, cmd);
391    set_eq_flag(cur_val, primitive_flag_bit);
392    set_eq_value(cur_val, chr);
393}
394
395/*tex
396
397    The function |store_primitive_name| sets up the bookkeeping for the reverse lookup. It is
398    quite paranoid, because it is easy to mess this up accidentally.
399
400    The |offset| is needed because sometimes character codes (in |o|) are indices into |eqtb|
401    or are offset by a magical value to make sure they do not conflict with something else. We
402    don't want the |prim_data[c].names| to have too many entries as it will just be wasted room,
403    so |offset| is substracted from |o| before creating or accessing the array.
404
405*/
406
407static void tex_aux_store_primitive_name(strnumber s, singleword cmd, halfword chr, halfword offset)
408{
409    lmt_primitive_state.prim_data[cmd].offset = offset;
410    if (lmt_primitive_state.prim_data[cmd].subids < (chr + 1)) {
411        /*tex Not that efficient as each primitive triggers this now but only at ini time so ... */
412        strnumber *newstr = aux_allocate_clear_array(sizeof(strnumber *), chr + 1, 1);
413        if (lmt_primitive_state.prim_data[cmd].names) {
414            memcpy(newstr, lmt_primitive_state.prim_data[cmd].names, (unsigned) (lmt_primitive_state.prim_data[cmd].subids) * sizeof(strnumber));
415            aux_deallocate_array(lmt_primitive_state.prim_data[cmd].names);
416        }
417        lmt_primitive_state.prim_data[cmd].names = newstr;
418        lmt_primitive_state.prim_data[cmd].subids = chr + 1;
419    }
420    lmt_primitive_state.prim_data[cmd].names[chr] = s;
421}
422
423/*tex
424
425    Compared to \TEX82, |primitive| has two extra parameters. The |off| is an offset that will be
426    passed on to |store_primitive_name|, the |cmd_origin| is the bit that is used to group
427    primitives by originator. So the next function is called for each primitive and fills |prim_eqtb|.
428
429    Contrary to \LUATEX\ we define (using |primitive_def|) all primitives beforehand, so not only
430    those with |cmd_origin| values |core| and |tex|. As side effect, we don't get redundant string
431    entries as in \LUATEX.
432
433*/
434
435void tex_primitive(int cmd_origin, const char *str, singleword cmd, halfword chr, halfword offset)
436{
437    int prim_val;
438    strnumber ss;
439    if (cmd_origin != no_command) {
440        tex_primitive_def(str, strlen(str), cmd, offset + chr);
441        /*tex Indeed, |cur_val| has the latest primitive. */
442        ss = cs_text(cur_val);
443    } else {
444        ss = tex_maketexstring(str);
445    }
446    prim_val = tex_prim_lookup(ss);
447    prim_origin(prim_val) = (quarterword) cmd_origin;
448    prim_eq_type(prim_val) = cmd;
449    prim_equiv(prim_val) = offset + chr;
450    tex_aux_store_primitive_name(ss, cmd, chr, offset);
451}
452
453/*tex
454
455    Here is a helper that does the actual hash insertion. This code far from ideal: the existence
456    of |hash_extra| changes all the potential (short) coalesced lists into a single (long) one.
457    This will create a slowdown.
458
459    Here |hash_state.hash_used| starts out as the maximum \quote {normal} hash, not extra.
460
461*/
462
463static halfword tex_aux_insert_id(halfword p, const unsigned char *str, unsigned int l)
464{
465    if (cs_text(p) > 0) {
466      RESTART:
467        if (lmt_hash_state.hash_data.ptr < lmt_hash_state.hash_data.allocated) {
468            ++lmt_hash_state.hash_data.ptr;
469            cs_next(p) = lmt_hash_state.hash_data.ptr + eqtb_size;
470            p = cs_next(p);
471        } else if (tex_aux_room_in_hash()) {
472            goto RESTART;
473        } else {
474            /*tex
475                Search for an empty location in |hash|. This actually makes the direct first hit
476                in such a hash slot invalid but we check for the string anyway. As we now use a
477                hash size that is rather minimal, we don't really need this branch. It is a last
478                resort anyway.
479            */
480            do {
481                if (lmt_hash_state.eqtb_data.top == hash_base) {
482                    /*tex We cannot go lower than this. */
483                    tex_overflow_error("hash size", hash_size + lmt_hash_state.hash_data.allocated);
484                }
485                --lmt_hash_state.eqtb_data.top;
486            } while (cs_text(lmt_hash_state.eqtb_data.top) != 0);
487            cs_next(p) = lmt_hash_state.eqtb_data.top;
488            p = lmt_hash_state.eqtb_data.top;
489        }
490    }
491    cs_text(p) = tex_push_string(str, l);
492    copy_eqtb_entry(p, undefined_control_sequence);
493    ++lmt_hash_state.eqtb_data.ptr;
494    return p;
495}
496
497/*tex
498
499    Here is the subroutine that searches the hash table for an identifier that matches a given
500    string of length |l > 1| appearing in |buffer[j .. (j + l - 1)]|. If the identifier is found,
501    the corresponding hash table address is returned. Otherwise, if the global variable
502    |no_new_control_sequence| is |true|, the dummy address |undefined_control_sequence| is returned.
503    Otherwise the identifier is inserted into the hash table and its location is returned.
504
505    On the \LUAMETATEX\ manual we have 250K hits and 400K misses. Adapting the max and prime does
506    bring down the misses but also no gain in performance. In practice we seldom follow the chain.
507
508*/
509
510halfword tex_id_locate(int j, int l, int create)
511{
512    /*tex The index in |hash| array: */
513    halfword p = tex_aux_compute_hash((char *) (lmt_fileio_state.io_buffer + j), (unsigned) l) + hash_base;
514    /*tex We start searching here. Note that |0 <= h < hash_prime|: */
515    while (1) {
516        strnumber s = cs_text(p);
517        if ((s > 0) && (str_length(s) == (unsigned) l) && tex_str_eq_buf(s, j, l)) {
518            return p;
519        } else {
520            /*tex The next one in a chain: */
521            halfword n = cs_next(p);
522            if (n) {
523                p = n;
524            } else if (create) {
525                return tex_aux_insert_id(p, (lmt_fileio_state.io_buffer + j), (unsigned) l);
526            } else {
527                break;
528            }
529        }
530    }
531    return undefined_control_sequence;
532}
533
534halfword tex_id_locate_only(int j, int l)
535{
536    halfword p = tex_aux_compute_hash((char *) (lmt_fileio_state.io_buffer + j), (unsigned) l) + hash_base;
537    while (p) {
538        strnumber s = cs_text(p);
539        if ((s > 0) && (str_length(s) == (unsigned) l) && tex_str_eq_buf(s, j, l)) {
540     // if ((s > 0) && (str_length(s) == (unsigned) l) && memcmp(str_string(s), &lmt_fileio_state.io_buffer[j], l) == 0) {
541            return p;
542        } else {
543            p = cs_next(p);
544        }
545    }
546    return undefined_control_sequence;
547}
548
549int tex_id_locate_steps(const char *cs)
550{
551    if (cs) { 
552        unsigned int l = (unsigned) strlen(cs);
553        halfword p = tex_aux_compute_hash(cs, l) + hash_base;
554        int steps = 0;
555        while (p) {
556            strnumber s = cs_text(p);
557            ++steps;
558            if ((s > 0) && (str_length(s) == l) && memcmp(str_string(s), cs, l) == 0) {            
559                return steps;
560            } else {
561                p = cs_next(p);
562            }
563        }
564        return -steps;
565    } else { 
566        return 0; 
567    }
568}
569
570/*tex
571
572    Here is a similar subroutine for finding a primitive in the hash. This one is based on a \CCODE\
573    string.
574
575*/
576
577halfword tex_string_locate(const char *s, size_t l, int create)
578{
579    /*tex The hash code: */
580    halfword h = tex_aux_compute_hash(s, (unsigned) l);
581    /*tex The index in |hash| array. We start searching here. Note that |0 <= h < hash_prime|: */
582    halfword p = h + hash_base;
583    while (1) {
584        if (cs_text(p) > 0 && tex_str_eq_cstr(cs_text(p), s, (int) l)) {
585            return p;
586        } else {
587            halfword n = cs_next(p);
588            if (n) {
589                p = n;
590            } else if (create) {
591                return tex_aux_insert_id(p, (const unsigned char *) s, (unsigned) l);
592            } else {
593                break;
594            }
595        }
596    }
597    return undefined_control_sequence;
598}
599
600halfword tex_string_locate_only(const char *s, size_t l)
601{
602    halfword p = tex_aux_compute_hash(s, (unsigned) l) + hash_base;
603    while (p) {
604        if (cs_text(p) > 0 && tex_str_eq_cstr(cs_text(p), s, (int) l)) {
605            return p;
606        } else {
607            p = cs_next(p);
608        }
609    }
610    return undefined_control_sequence;
611}
612
613halfword tex_located_string(const char *s)
614{
615    return tex_string_locate_only(s, strlen(s));
616}
617
618/*tex
619
620    The |print_cmd_chr| routine prints a symbolic interpretation of a command code and its modifier.
621    This is used in certain \quotation {You can\'t} error messages, and in the implementation of
622    diagnostic routines like |\show|.
623
624    The body of |print_cmd_chr| use to be a rather tedious listing of print commands, and most of it
625    was essentially an inverse to the |primitive| routine that enters a \TEX\ primitive into |eqtb|.
626
627    Thanks to |prim_data|, there is no need for all that tediousness. What is left of |primt_cnd_chr|
628    are just the exceptions to the general rule that the |cmd,chr_code| pair represents in a single
629    primitive command.
630
631*/
632
633static void tex_aux_print_chr_cmd(const char *s, halfword cmd, halfword chr)
634{
635    tex_print_str(s);
636    if (chr) {
637        tex_print_str(cmd == letter_cmd ? " letter " : " character ");
638        tex_print_uhex(chr);
639        tex_print_char(' ');
640        /*
641            By using the the unicode (ascii) names for some we can better support syntax
642            highlighting (which often involves parsing). The names are enclused in single
643            quotes. For the chr codes above 128 we assume \UNICODE\ support.
644        */
645        /*tex
646            We already intercepted the line feed here so that it doesn't give a side effect here
647            in the original |tex_print_tex_str(chr)| call but we have now inlined similar code
648            but without side effects.
649        */
650        if (chr < 32 || chr == 127) {
651            return;
652        } else if (chr <= 0x7F) {
653            switch (chr) {
654                case '\n' : tex_print_str("'line feed'");            return;
655                case '\r' : tex_print_str("'carriage return'");      return;
656                case ' '  : tex_print_str("'space'");                return;
657                case '!'  : tex_print_str("'exclamation mark'");     return;
658                case '\"' : tex_print_str("'quotation mark'");       return;
659                case '#'  : tex_print_str("'hash tag'");             return;
660                case '$'  : tex_print_str("'dollar sign'");          return;
661                case '%'  : tex_print_str("'percent sign'");         return;
662                case '&'  : tex_print_str("'ampersand'");            return;
663                case '\'' : tex_print_str("'apostrophe'");           return;
664                case '('  : tex_print_str("'left parenthesis'");     return;
665                case ')'  : tex_print_str("'right parenthesis'");    return;
666                case '*'  : tex_print_str("'asterisk'");             return;
667                case '+'  : tex_print_str("'plus sign'");            return;
668                case ','  : tex_print_str("'comma'");                return;
669                case '-'  : tex_print_str("'hyphen minus'");         return;
670                case '.'  : tex_print_str("'full stop'");            return;
671                case '/'  : tex_print_str("'slash'");                return;
672                case ':'  : tex_print_str("'colon'");                return;
673                case ';'  : tex_print_str("'semicolon'");            return;
674                case '<'  : tex_print_str("'less than sign'");       return;
675                case '='  : tex_print_str("'equal sign'");           return;
676                case '>'  : tex_print_str("'more than sign'");       return;
677                case '?'  : tex_print_str("'question mark'");        return;
678                case '@'  : tex_print_str("'at sign'");              return;
679                case '['  : tex_print_str("'left square bracket'");  return;
680                case '\\' : tex_print_str("'backslash'");            return;
681                case ']'  : tex_print_str("'right square bracket'"); return;
682                case '^'  : tex_print_str("'circumflex accent'");    return;
683                case '_'  : tex_print_str("'low line'");             return;
684                case '`'  : tex_print_str("'grave accent'");         return;
685                case '{'  : tex_print_str("'left curly bracket'");   return;
686                case '|'  : tex_print_str("'vertical bar'");         return;
687                case '}'  : tex_print_str("'right curly bracket'");  return;
688                case '~'  : tex_print_str("'tilde'");                return;
689            }
690            tex_print_char(chr);
691        } else if (chr <= 0x7FF) {
692            tex_print_char(0xC0 + (chr / 0x40));
693            tex_print_char(0x80 + (chr % 0x40));
694        } else if (chr <= 0xFFFF) {
695            tex_print_char(0xE0 +  (chr / 0x1000));
696            tex_print_char(0x80 + ((chr % 0x1000) / 0x40));
697            tex_print_char(0x80 + ((chr % 0x1000) % 0x40));
698        } else if (chr <= 0x10FFFF) {
699            tex_print_char(0xF0 +   (chr / 0x40000));
700            tex_print_char(0x80 +  ((chr % 0x40000) / 0x1000));
701            tex_print_char(0x80 + (((chr % 0x40000) % 0x1000) / 0x40));
702            tex_print_char(0x80 + (((chr % 0x40000) % 0x1000) % 0x40));
703        }
704    }
705}
706
707/*tex |\TEX82| Didn't print the |cmd,idx| information, but it may be useful. */
708
709static void tex_aux_prim_cmd_chr(quarterword cmd, halfword idx, int is_chr)
710{
711    if (cmd <= last_visible_cmd) {
712        if (is_chr) {
713            idx -= lmt_primitive_state.prim_data[cmd].offset;
714        }
715        if (idx >= 0 && idx < lmt_primitive_state.prim_data[cmd].subids) {
716            if (lmt_primitive_state.prim_data[cmd].names && lmt_primitive_state.prim_data[cmd].names[idx]) {
717                tex_print_tex_str_esc(lmt_primitive_state.prim_data[cmd].names[idx]);
718            } else {
719                tex_print_format("[warning: cmd %i, chr %i, no name]", cmd, idx);
720            }
721        } else if (cmd == internal_integer_cmd && idx < number_integer_pars) {
722            /* a special case */
723            tex_print_format("[integer: chr %i, class specific]", cmd);
724        } else {
725            tex_print_format("[warning: cmd %i, chr %i, out of range]", cmd, idx);
726        }
727    } else {
728        tex_print_format("[warning: cmd %i, invalid]", cmd);
729    }
730}
731
732static void tex_aux_show_lua_call(const char *what, int slot)
733{
734    int callback_id = lmt_callback_defined(show_lua_call_callback);
735    if (callback_id) {
736        char *ss = NULL;
737        int lua_retval = lmt_run_callback(lmt_lua_state.lua_instance, callback_id, "Sd->S", what, slot, &ss);
738        if (lua_retval && ss && strlen(ss) > 0) {
739            tex_print_str(ss);
740            lmt_memory_free(ss);
741            return;
742        }
743    }
744    tex_print_format("%s %i", what, slot);
745}
746
747void tex_print_cmd_flags(halfword cs, halfword cmd, int flags, int escaped)
748{
749    if (flags) {
750        flags = eq_flag(cs);
751        if (eq_level(cs) == level_one) { 
752            (escaped ? tex_print_str_esc : tex_print_str)("global "); 
753        }
754        if (is_frozen   (flags)) { (escaped ? tex_print_str_esc : tex_print_str)("frozen "   ); }
755        if (is_permanent(flags)) { (escaped ? tex_print_str_esc : tex_print_str)("permanent "); }
756        if (is_immutable(flags)) { (escaped ? tex_print_str_esc : tex_print_str)("immutable "); }
757        if (is_primitive(flags)) { (escaped ? tex_print_str_esc : tex_print_str)("primitive "); }
758        if (is_mutable  (flags)) { (escaped ? tex_print_str_esc : tex_print_str)("mutable "  ); }
759        if (is_noaligned(flags)) { (escaped ? tex_print_str_esc : tex_print_str)("noaligned "); }
760        if (is_instance (flags)) { (escaped ? tex_print_str_esc : tex_print_str)("instance " ); }
761        if (is_untraced (flags)) { (escaped ? tex_print_str_esc : tex_print_str)("untraced " ); }
762    }
763    if (is_constant_cmd(cmd)) {
764        (escaped ? tex_print_str_esc : tex_print_str)("constant " );
765    }
766    if (is_tolerant_cmd(cmd)) {
767        (escaped ? tex_print_str_esc : tex_print_str)("tolerant " );
768    }
769    if (is_protected_cmd(cmd)) {
770        (escaped ? tex_print_str_esc : tex_print_str)("protected ");
771    } else if (is_semi_protected_cmd(cmd)) {
772        (escaped ? tex_print_str_esc : tex_print_str)("semiprotected ");
773    }
774}
775
776void tex_print_cmd_chr(singleword cmd, halfword chr)
777{
778    switch (cmd) {
779        case left_brace_cmd:
780            tex_aux_print_chr_cmd("begin group", cmd, chr);
781            break;
782        case right_brace_cmd:
783            tex_aux_print_chr_cmd("end group", cmd, chr);
784            break;
785        case math_shift_cmd:
786            tex_aux_print_chr_cmd("math shift", cmd, chr);
787            break;
788        case alignment_tab_cmd:
789            tex_aux_print_chr_cmd("alignment tab", cmd, chr);
790            break;
791        case parameter_cmd:
792            tex_aux_print_chr_cmd("parameter", cmd, chr);
793            break;
794        case superscript_cmd:
795            tex_aux_print_chr_cmd("superscript", cmd, chr);
796            break;
797        case subscript_cmd:
798            tex_aux_print_chr_cmd("subscript", cmd, chr);
799            break;
800        case spacer_cmd:
801            tex_aux_print_chr_cmd("blank space", cmd, chr);
802            break;
803        case letter_cmd:
804        case other_char_cmd:
805            tex_aux_print_chr_cmd("the", cmd, chr);
806            break;
807        case active_char_cmd:
808            tex_aux_print_chr_cmd("active", cmd, chr);
809            break;
810        /*
811        case comment_cmd:
812        case invalid_char_cmd:
813            break;
814        */
815        case end_template_cmd:
816            /*tex Kind of special: |chr| points to |null_list). */
817            tex_print_str_esc("endtemplate");
818         // tex_print_str("end of alignment template");
819            break;
820        case if_test_cmd:
821            if (chr <= last_if_test_code) {
822                tex_aux_prim_cmd_chr(cmd, chr, 1);
823            } else {
824                tex_aux_show_lua_call("luacondition", chr - last_if_test_code);
825            }
826            break;
827        case char_given_cmd:
828            tex_print_str_esc("char");
829            tex_print_qhex(chr);
830            break;
831        case lua_call_cmd:
832            tex_aux_show_lua_call("luacall", chr);
833            break;
834        case lua_local_call_cmd:
835            tex_aux_show_lua_call("local luacall", chr);
836            break;
837        case lua_protected_call_cmd:
838            tex_aux_show_lua_call("protected luacall", chr);
839            break;
840        case lua_semi_protected_call_cmd:
841            tex_aux_show_lua_call("semiprotected luacall", chr);
842            break;
843        case lua_value_cmd:
844            tex_aux_show_lua_call("luavalue", chr);
845            break;
846        case set_font_cmd:
847            tex_print_str("select font ");
848            tex_print_font(chr);
849            break;
850        case undefined_cs_cmd:
851            tex_print_str("undefined");
852            break;
853        case call_cmd:
854        case protected_call_cmd:
855        case semi_protected_call_cmd:
856        case constant_call_cmd:
857        case tolerant_call_cmd:
858        case tolerant_protected_call_cmd:
859        case tolerant_semi_protected_call_cmd:
860            tex_print_cmd_flags(cur_cs, cur_cmd, 1, 0);
861            tex_print_str("macro");
862            break;
863        case internal_toks_cmd:
864            tex_aux_prim_cmd_chr(cmd, chr, 1);
865            break;
866        case register_toks_cmd:
867            tex_print_str_esc("toks");
868            tex_print_int(register_toks_number(chr));
869            break;
870        case internal_integer_cmd:
871            tex_aux_prim_cmd_chr(cmd, chr, 1);
872            break;
873        case register_integer_cmd:
874            tex_print_str_esc("count");
875            tex_print_int(register_integer_number(chr));
876            break;
877        case internal_attribute_cmd:
878            tex_aux_prim_cmd_chr(cmd, chr, 1);
879            break;
880        case register_attribute_cmd:
881            tex_print_str_esc("attribute");
882            tex_print_int(register_attribute_number(chr));
883            break;
884        case register_posit_cmd:
885            tex_print_str_esc("posit");
886            tex_print_int(register_posit_number(chr));
887            break;
888        case internal_posit_cmd:
889            tex_aux_prim_cmd_chr(cmd, chr, 1);
890            break;
891        case internal_dimension_cmd:
892            tex_aux_prim_cmd_chr(cmd, chr, 1);
893            break;
894        case register_dimension_cmd:
895            tex_print_str_esc("dimen");
896            tex_print_int(register_dimension_number(chr));
897            break;
898        case internal_glue_cmd:
899            tex_aux_prim_cmd_chr(cmd, chr, 1);
900            break;
901        case register_glue_cmd:
902            tex_print_str_esc("skip");
903            tex_print_int(register_glue_number(chr));
904            break;
905        case internal_muglue_cmd:
906            tex_aux_prim_cmd_chr(cmd, chr, 1);
907            break;
908        case register_muglue_cmd:
909            tex_print_str_esc("muskip");
910            tex_print_int(register_muglue_number(chr));
911            break;
912        case node_cmd:
913            tex_print_str(node_token_flagged(chr) ? "large" : "small");
914            tex_print_str(" node reference");
915            break;
916        case integer_cmd:
917            tex_print_str("integer ");
918            tex_print_int(chr);
919            break;
920        case dimension_cmd:
921            tex_print_str("dimension ");
922            tex_print_dimension(chr, pt_unit);
923            break;
924        case posit_cmd:
925            tex_print_str("posit ");
926            tex_print_posit(chr);
927            break;
928        case gluespec_cmd:
929            tex_print_str("gluespec ");
930            tex_print_spec(chr, pt_unit);
931            break;
932        case mugluespec_cmd:
933            tex_print_str("mugluespec ");
934            tex_print_spec(chr, mu_unit);
935            break;
936        case index_cmd:
937            tex_print_str("parameter ");
938            tex_print_int(chr);
939            break;
940# if (match_experiment)
941case integer_reference_cmd:
942    tex_print_str(node_token_flagged(chr) ? "large" : "small");
943    tex_print_str(" integer reference");
944    break;
945case dimension_reference_cmd:
946    tex_print_str(node_token_flagged(chr) ? "large" : "small");
947    tex_print_str(" dimension reference");
948    break;
949# endif 
950        case mathspec_cmd:
951            switch (node_subtype(chr)) {
952                case tex_mathcode:
953                    tex_print_str_esc("mathchar");
954                    break;
955                case umath_mathcode:
956             /* case umathnum_mathcode: */
957                    tex_print_str_esc("Umathchar");
958                    break;
959                case mathspec_mathcode:
960                    tex_print_str("mathspec ");
961            }
962            tex_print_mathspec(chr);
963            break;
964        case fontspec_cmd:
965            /* We don't check for validity here. */
966            tex_print_str("fontspec ");
967            tex_print_fontspec(chr);
968            break;
969        case specificationspec_cmd:
970            /* Mo need now for more details. */
971            tex_print_str("specification ");
972            if (chr) {
973                switch (node_subtype(chr)) {
974                    case integer_list_code:
975                        tex_print_str_esc("count");
976                        break;
977                    case dimension_list_code:
978                        tex_print_str_esc("dimen");
979                        break;
980                    case posit_list_code:
981                        tex_print_str_esc("posit");
982                        break;
983                    default:
984                        tex_aux_prim_cmd_chr(specification_cmd, node_subtype(chr), 0);
985                        break;
986                }
987            } else { 
988                tex_print_str("<unset>");
989            }
990            break;
991        case deep_frozen_end_template_cmd:
992            /*tex Kind of special: |chr| points to |null_list|. */
993            tex_print_str_esc("endtemplate");
994            break;
995        case deep_frozen_dont_expand_cmd:
996            /*tex Kind of special. */
997            tex_print_str_esc("notexpanded");
998            break;
999        case deep_frozen_keep_constant_cmd:
1000            /*tex Kind of special. */
1001            tex_print_str_esc("keepconstant");
1002            break;
1003        case internal_box_reference_cmd:
1004            tex_print_str_esc("hiddenlocalbox");
1005            break;
1006        default:
1007            /*tex These are most commands, actually. Todo: local boxes*/
1008            tex_aux_prim_cmd_chr(cmd, chr, 1);
1009            break;
1010    }
1011}
1012