auxunistring.c /size: 6666 b    last modification: 2025-02-21 11:03
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
9    The 5- and 6-byte UTF-8 sequences generate integers that are outside of the valid UCS range,
10    and therefore unsupported. We recover from an error with |0xFFFD|.
11
12*/
13
/*tex

    Decode the single \UTF-8 sequence that starts at |text| and return its code point. A lead
    byte selects a length of one to four bytes and every trailing byte has to be in the range
    |0x80| upto |0xBF|. Anything else (a stray continuation byte, a bad trailing byte or a lead
    byte above |0xF7|) gives |0xFFFD|.

    The trailing bytes are tested from left to right and testing stops at the first bad one, so
    with a zero terminated string we never look beyond the terminator. The value itself is not
    checked: overlong forms and values above |0x10FFFF| are returned as they decode.

    The four byte case is assembled with plain unsigned shifts. The previous surrogate like
    calculation subtracted one from the upper bits and then shifted that left, which is a shift
    of a negative integer (undefined) for overlong input. The results are the same.

*/

unsigned aux_str2uni(const unsigned char *text)
{
    const unsigned lead = text[0];
    if (lead < 0x80) {
        return lead;
    } else if (lead <= 0xBF) {
        /*tex A continuation byte can't start a sequence. */
        return 0xFFFD;
    } else if (lead <= 0xDF) {
        if (text[1] >= 0x80 && text[1] < 0xC0) {
            return ((lead & 0x1F) << 6) | (unsigned) (text[1] & 0x3F);
        }
    } else if (lead <= 0xEF) {
        if (text[1] >= 0x80 && text[1] < 0xC0 && text[2] >= 0x80 && text[2] < 0xC0) {
            return ((lead & 0x0F) << 12) | ((unsigned) (text[1] & 0x3F) << 6) | (unsigned) (text[2] & 0x3F);
        }
    } else if (lead <= 0xF7) {
        if (text[1] >= 0x80 && text[1] < 0xC0 && text[2] >= 0x80 && text[2] < 0xC0 && text[3] >= 0x80 && text[3] < 0xC0) {
            return ((lead & 0x07) << 18) | ((unsigned) (text[1] & 0x3F) << 12) | ((unsigned) (text[2] & 0x3F) << 6) | (unsigned) (text[3] & 0x3F);
        }
    }
    return 0xFFFD;
}
41
/*tex

    This is the variant of the decoder that also reports, via |len|, how many bytes were taken.
    A valid sequence gives its code point and a length of one to four. Every failure (a stray
    continuation byte, a bad trailing byte, a lead byte above |0xF7|) gives |0xFFFD| and a
    length of one.

    A failing four byte sequence used to report four, unlike the two and three byte cases. With
    a truncated sequence right before the terminator a caller that advances by |len| would then
    step over the terminating zero, so now all failures report one byte.

    The four byte value is assembled with unsigned shifts, which avoids the left shift of a
    negative intermediate that the surrogate like calculation had for overlong input. Overlong
    forms and values above |0x10FFFF| are not rejected.

*/

unsigned aux_str2uni_len(const unsigned char *text, int *len)
{
    const unsigned lead = text[0];
    if (lead < 0x80) {
        *len = 1;
        return lead;
    } else if (lead <= 0xBF) {
        /*tex A continuation byte can't start a sequence. */
        *len = 1;
        return 0xFFFD;
    } else if (lead <= 0xDF) {
        if (text[1] >= 0x80 && text[1] < 0xC0) {
            *len = 2;
            return ((lead & 0x1F) << 6) | (unsigned) (text[1] & 0x3F);
        }
    } else if (lead <= 0xEF) {
        if (text[1] >= 0x80 && text[1] < 0xC0 && text[2] >= 0x80 && text[2] < 0xC0) {
            *len = 3;
            return ((lead & 0x0F) << 12) | ((unsigned) (text[1] & 0x3F) << 6) | (unsigned) (text[2] & 0x3F);
        }
    } else if (lead <= 0xF7) {
        if (text[1] >= 0x80 && text[1] < 0xC0 && text[2] >= 0x80 && text[2] < 0xC0 && text[3] >= 0x80 && text[3] < 0xC0) {
            *len = 4;
            return ((lead & 0x07) << 18) | ((unsigned) (text[1] & 0x3F) << 12) | ((unsigned) (text[2] & 0x3F) << 6) | (unsigned) (text[3] & 0x3F);
        }
    }
    /*tex Malformed: we only skip the offending lead byte. */
    *len = 1;
    return 0xFFFD;
}
76
/*tex

    Encode the code point |unic| as a zero terminated \UTF-8 string in a fresh five byte buffer
    obtained from |lmt_memory_malloc|. The result of that allocation is returned as-is, so it
    can be |NULL|; releasing the buffer is left to the caller (presumably with the matching
    memory helper, which is not visible here).

    Values from |0x110000| on have no encoding here. Before, the buffer was then returned
    without anything stored in it, not even a terminator. Now the terminator is always written
    so such values give an empty string.

*/

unsigned char *aux_uni2str(unsigned unic)
{
    unsigned char *buf = lmt_memory_malloc(5);
    if (buf) {
        unsigned char *p = buf;
        if (unic < 0x80) {
            *p++ = (unsigned char) unic;
        } else if (unic < 0x800) {
            *p++ = (unsigned char) (0xC0 | (unic >> 6));
            *p++ = (unsigned char) (0x80 | (unic & 0x3F));
        } else if (unic < 0x10000) {
            *p++ = (unsigned char) (0xE0 | (unic >> 12));
            *p++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
            *p++ = (unsigned char) (0x80 | (unic & 0x3F));
        } else if (unic < 0x110000) {
            *p++ = (unsigned char) (0xF0 | (unic >> 18));
            *p++ = (unsigned char) (0x80 | ((unic >> 12) & 0x3F));
            *p++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
            *p++ = (unsigned char) (0x80 | (unic & 0x3F));
        }
        /*tex At most four bytes were stored, so the fifth is always available. */
        *p = '\0';
    }
    return buf;
}
106
107/*tex
108
109    Function |buffer_to_unichar| converts a sequence of bytes in the |buffer| into a \UNICODE\
110    character value. It does not check for overflow of the |buffer|, but it is careful to check
111    the validity of the \UTF-8 encoding. For historical reasons all these small helpers look a bit
112    different but that has a certain charm so we keep it.
113
114*/
115
/*tex

    Store the \UTF-8 encoding of |unic| at |utf8_text| and return the position right after the
    last byte written. One to four bytes are stored, no terminator is added, and for values
    from |0x110000| on nothing is stored so then the incoming pointer is returned. The caller
    has to provide room for up to four bytes.

*/

char *aux_uni2string(char *utf8_text, unsigned unic)
{
    char *out = utf8_text;
    if (unic < 0x80) {
        out[0] = (char) unic;
        out += 1;
    } else if (unic < 0x800) {
        out[0] = (char) (0xC0 | (unic >> 6));
        out[1] = (char) (0x80 | (unic & 0x3F));
        out += 2;
    } else if (unic < 0x10000) {
        out[0] = (char) (0xE0 | (unic >> 12));
        out[1] = (char) (0x80 | ((unic >> 6) & 0x3F));
        out[2] = (char) (0x80 | (unic & 0x3F));
        out += 3;
    } else if (unic < 0x110000) {
        /*tex Three lead bits and then three groups of six bits. */
        out[0] = (char) (0xF0 | (unic >> 18));
        out[1] = (char) (0x80 | ((unic >> 12) & 0x3F));
        out[2] = (char) (0x80 | ((unic >> 6) & 0x3F));
        out[3] = (char) (0x80 | (unic & 0x3F));
        out += 4;
    }
    return out;
}
139
140/*tex This one could be more efficient because we know a bit more. */
141
/*tex

    Split the zero terminated \UTF-8 string |utf8buf| into code points that are stored in
    |ubuf|, followed by a zero entry. The number of code points (the zero not included) is
    returned. Each code point takes at least one input byte, so |ubuf| needs room for
    |strlen(utf8buf) + 1| entries.

    The length of a sequence is derived from its first byte only and trailing bytes are not
    validated. Bytes in the range |0x80| upto |0xBF| are handled like a two byte lead, as
    before.

    When the string ends in the middle of a sequence we store |0xFFFD| and quit. Before, the
    missing bytes were fetched anyway, which means reading beyond the terminator. The four byte
    case now uses unsigned shifts because the surrogate like calculation could shift a negative
    value (for overlong input); for other input the outcome is the same.

*/

unsigned aux_splitutf2uni(unsigned int *ubuf, const char *utf8buf)
{
    const unsigned char *pt = (const unsigned char *) utf8buf;
    const unsigned char *end = pt + strlen(utf8buf);
    unsigned int *upt = ubuf;
    while (pt < end) {
        const unsigned lead = *pt;
        const size_t need = lead <= 0x7F ? 1 : lead <= 0xDF ? 2 : lead <= 0xEF ? 3 : 4;
        if ((size_t) (end - pt) < need) {
            /*tex A truncated tail: flag it and stay within the string. */
            *upt++ = 0xFFFD;
            break;
        }
        switch (need) {
            case 1:
                *upt = lead;
                break;
            case 2:
                *upt = ((lead & 0x1F) << 6) | (unsigned) (pt[1] & 0x3F);
                break;
            case 3:
                *upt = ((lead & 0x0F) << 12) | ((unsigned) (pt[1] & 0x3F) << 6) | (unsigned) (pt[2] & 0x3F);
                break;
            default:
                *upt = ((lead & 0x07) << 18) | ((unsigned) (pt[1] & 0x3F) << 12) | ((unsigned) (pt[2] & 0x3F) << 6) | (unsigned) (pt[3] & 0x3F);
                break;
        }
        pt += need;
        ++upt;
    }
    *upt = 0; /*tex We have integers here, so this is a zero entry and not a |\0| character. */
    return (unsigned int) (upt - ubuf);
}
170
/*tex

    Count the characters in the first |size| bytes of |text|. Only lead bytes are looked at: a
    byte from |0xC0| on skips two, three or four bytes and any other byte (\ASCII\ or a stray
    continuation byte) counts for one. Trailing bytes are never inspected, so a sequence that
    runs past |size| is still counted once.

*/

size_t aux_utf8len(const char *text, size_t size)
{
    size_t count = 0;
    for (size_t pos = 0; pos < size; ++count) {
        const unsigned char lead = (unsigned char) text[pos];
        size_t step = 1;
        if (lead >= 0xC0) {
            step = lead < 0xE0 ? 2 : (lead < 0xF0 ? 3 : 4);
        }
        pos += step;
    }
    return count;
}
192