auxunistring.c /size: 6530 b    last modification: 2024-01-16 10:22
1/*
2    See license.txt in the root of this project.
3*/
4
5# include "luametatex.h"
6
7/*tex
8
9    The 5- and 6-byte UTF-8 sequences generate integers that are outside of the valid UCS range,
10    and therefore unsupported. We recover from an error with |0xFFFD|.
11
12*/
13
/*tex

    Decode the \UTF-8 sequence that starts at |text| and return its code point. A continuation
    byte is only read after the byte before it has been accepted, so with a zero terminated
    string we never look past the terminator. A stray continuation byte, a missing or invalid
    continuation byte, or a lead byte above |0xF7| gives |0xFFFD|. Overlong forms and values
    above |0x10FFFF| are not rejected: they decode to their plain bit pattern.

*/

unsigned aux_str2uni(const unsigned char *text)
{
    const unsigned lead = text[0];
    if (lead < 0x80) {
        return lead;
    } else if (lead < 0xC0) {
        /*tex A continuation byte can't start a sequence. */
        return 0xFFFD;
    } else if (lead <= 0xDF) {
        if ((text[1] & 0xC0) == 0x80) {
            return ((lead & 0x1F) << 6) | (unsigned) (text[1] & 0x3F);
        }
    } else if (lead <= 0xEF) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80) {
            return ((lead & 0x0F) << 12) | ((unsigned) (text[1] & 0x3F) << 6) | (unsigned) (text[2] & 0x3F);
        }
    } else if (lead <= 0xF7) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80 && (text[3] & 0xC0) == 0x80) {
            /*tex
                Plain unsigned shifts. The surrogate pair style arithmetic that was used here
                left-shifted a negative intermediate for overlong sequences (|F0 8x ..|), which
                is undefined in \CCODE. The result is the same for all other input.
            */
            return ((lead & 0x07) << 18) | ((unsigned) (text[1] & 0x3F) << 12) | ((unsigned) (text[2] & 0x3F) << 6) | (unsigned) (text[3] & 0x3F);
        }
    }
    return 0xFFFD;
}
41
/*tex

    Decode the \UTF-8 sequence that starts at |text|, return its code point and store the number
    of bytes that were consumed in |len|. A continuation byte is only read after the byte before
    it has been accepted, so with a zero terminated string we never look past the terminator.

    Malformed input (a stray continuation byte, a missing or invalid continuation byte, a lead
    byte above |0xF7|) gives |0xFFFD| and always a length of one. A broken four byte sequence
    used to report four, which let a caller that advances by |len| step over the terminator or
    swallow valid characters that follow a bad lead byte. Overlong forms and values above
    |0x10FFFF| are not rejected: they decode to their plain bit pattern.

*/

unsigned aux_str2uni_len(const unsigned char *text, int *len)
{
    const unsigned lead = text[0];
    /*tex The default covers plain \ASCII\ as well as every error exit. */
    *len = 1;
    if (lead < 0x80) {
        return lead;
    } else if (lead < 0xC0) {
        /*tex A continuation byte can't start a sequence. */
        return 0xFFFD;
    } else if (lead <= 0xDF) {
        if ((text[1] & 0xC0) == 0x80) {
            *len = 2;
            return ((lead & 0x1F) << 6) | (unsigned) (text[1] & 0x3F);
        }
    } else if (lead <= 0xEF) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80) {
            *len = 3;
            return ((lead & 0x0F) << 12) | ((unsigned) (text[1] & 0x3F) << 6) | (unsigned) (text[2] & 0x3F);
        }
    } else if (lead <= 0xF7) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80 && (text[3] & 0xC0) == 0x80) {
            /*tex
                Plain unsigned shifts: the surrogate pair style arithmetic that was used here
                left-shifted a negative intermediate for overlong sequences (|F0 8x ..|).
            */
            *len = 4;
            return ((lead & 0x07) << 18) | ((unsigned) (text[1] & 0x3F) << 12) | ((unsigned) (text[2] & 0x3F) << 6) | (unsigned) (text[3] & 0x3F);
        }
    }
    return 0xFFFD;
}
76
77
/*tex

    Encode |unic| as a zero terminated \UTF-8 string in a fresh five byte buffer obtained from
    |lmt_memory_malloc|. The pointer that the allocator hands back is returned as it is, so the
    result is null when the allocation gave null. A value of |0x110000| or more can't be encoded
    in four bytes; in that case an empty string is returned instead of a buffer with undefined
    content. Releasing the buffer is up to the caller.

*/

unsigned char *aux_uni2str(unsigned unic)
{
    unsigned char *buf = lmt_memory_malloc(5);
    if (buf) {
        if (unic < 0x80) {
            buf[0] = (unsigned char) unic;
            buf[1] = '\0';
        } else if (unic < 0x800) {
            buf[0] = (unsigned char) (0xC0 | (unic >> 6));
            buf[1] = (unsigned char) (0x80 | (unic & 0x3F));
            buf[2] = '\0';
        } else if (unic < 0x10000) {
            buf[0] = (unsigned char) (0xE0 | (unic >> 12));
            buf[1] = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
            buf[2] = (unsigned char) (0x80 | (unic & 0x3F));
            buf[3] = '\0';
        } else if (unic < 0x110000) {
            buf[0] = (unsigned char) (0xF0 | (unic >> 18));
            buf[1] = (unsigned char) (0x80 | ((unic >> 12) & 0x3F));
            buf[2] = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
            buf[3] = (unsigned char) (0x80 | (unic & 0x3F));
            buf[4] = '\0';
        } else {
            /*tex Out of range: keep the result a valid (empty) string. */
            buf[0] = '\0';
        }
    }
    return buf;
}
107
108/*tex
109
110    Function |buffer_to_unichar| converts a sequence of bytes in the |buffer| into a \UNICODE\
111    character value. It does not check for overflow of the |buffer|, but it is careful to check
112    the validity of the \UTF-8 encoding. For historical reasons all these small helpers look a bit
113    different but that has a certain charm so we keep it.
114
115*/
116
/*tex

    Store the \UTF-8 encoding of |unic| at |utf8_text| and return the position just after the
    last byte written. No terminating zero is added. A value of |0x110000| or more stores
    nothing, so then the pointer comes back unchanged.

*/

char *aux_uni2string(char *utf8_text, unsigned unic)
{
    if (unic < 0x80) {
        utf8_text[0] = (char) unic;
        return utf8_text + 1;
    }
    if (unic < 0x800) {
        utf8_text[0] = (char) (0xC0 | (unic >> 6));
        utf8_text[1] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 2;
    }
    if (unic < 0x10000) {
        utf8_text[0] = (char) (0xE0 | (unic >> 12));
        utf8_text[1] = (char) (0x80 | ((unic >> 6) & 0x3F));
        utf8_text[2] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 3;
    }
    if (unic < 0x110000) {
        utf8_text[0] = (char) (0xF0 | (unic >> 18));
        utf8_text[1] = (char) (0x80 | ((unic >> 12) & 0x3F));
        utf8_text[2] = (char) (0x80 | ((unic >> 6) & 0x3F));
        utf8_text[3] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 4;
    }
    return utf8_text;
}
140
/*tex

    Split the zero terminated \UTF-8 string |utf8buf| into code points that are stored in |ubuf|,
    followed by a terminating zero. The number of code points (the terminator not included) is
    returned. Every code point takes at least one input byte, so |ubuf| needs room for at most
    |strlen(utf8buf) + 1| entries.

    Malformed input (a stray continuation byte, a missing or invalid continuation byte, a lead
    byte above |0xF7|) gives one |0xFFFD| and we move on by a single byte. A continuation byte
    is only read after the byte before it has been accepted, so a sequence that is cut off by
    the end of the string never makes us read or step past the terminator. Overlong forms and
    values above |0x10FFFF| are not rejected: they decode to their plain bit pattern.

*/

unsigned aux_splitutf2uni(unsigned int *ubuf, const char *utf8buf)
{
    const unsigned char *pt = (const unsigned char *) utf8buf;
    const unsigned char *end = pt + strlen(utf8buf);
    unsigned int *upt = ubuf;
    while (pt < end) {
        const unsigned lead = pt[0];
        /*tex The defaults describe a malformed byte; a valid sequence overrides both. */
        unsigned int value = 0xFFFD;
        int used = 1;
        if (lead < 0x80) {
            value = lead;
        } else if (lead < 0xC0) {
            /*tex A continuation byte can't start a sequence. */
        } else if (lead <= 0xDF) {
            if ((pt[1] & 0xC0) == 0x80) {
                value = ((lead & 0x1F) << 6) | (unsigned) (pt[1] & 0x3F);
                used = 2;
            }
        } else if (lead <= 0xEF) {
            if ((pt[1] & 0xC0) == 0x80 && (pt[2] & 0xC0) == 0x80) {
                value = ((lead & 0x0F) << 12) | ((unsigned) (pt[1] & 0x3F) << 6) | (unsigned) (pt[2] & 0x3F);
                used = 3;
            }
        } else if (lead <= 0xF7) {
            if ((pt[1] & 0xC0) == 0x80 && (pt[2] & 0xC0) == 0x80 && (pt[3] & 0xC0) == 0x80) {
                /*tex Plain unsigned shifts, so no negative intermediate gets shifted. */
                value = ((lead & 0x07) << 18) | ((unsigned) (pt[1] & 0x3F) << 12) | ((unsigned) (pt[2] & 0x3F) << 6) | (unsigned) (pt[3] & 0x3F);
                used = 4;
            }
        }
        *upt++ = value;
        pt += used;
    }
    *upt = 0;
    return (unsigned int) (upt - ubuf);
}
169
/*tex

    Count the characters in the first |size| bytes of |text|. Only lead bytes are looked at: the
    high nibble decides how many bytes get skipped (four for |0xF|, three for |0xE|, two for
    |0xC| and |0xD|, one otherwise) and no continuation byte is validated. The skip may run
    past |size| on a sequence that has been cut off, but nothing is read from there.

*/

size_t aux_utf8len(const char *text, size_t size)
{
    size_t pos = 0;
    size_t count = 0;
    while (pos < size) {
        switch (((unsigned char) text[pos]) >> 4) {
            case 0xF:
                pos += 4;
                break;
            case 0xE:
                pos += 3;
                break;
            case 0xD:
            case 0xC:
                pos += 2;
                break;
            default:
                /*tex Plain \ASCII\ or a stray continuation byte. */
                pos += 1;
                break;
        }
        ++count;
    }
    return count;
}
191