-- scite-context-lexer-bidi.lua / size: 18 Kb / last modification: 2020-07-01 14:35
-- Descriptive metadata for this lexer file (version, description, author,
-- copyright and license). Nothing else in this file reads the table.
-- NOTE(review): the comment string below describes a plain text lexer with
-- spell checking, while this file registers a lexer named "bidi"; it looks
-- like a leftover from the file this one was derived from -- confirm.
local info = {
    version   = 1.002,
    comment   = "scintilla lpeg lexer for plain text (with spell checking)",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9local P, S, Cmt, Cp = lpeg.P, lpeg.S, lpeg.Cmt, lpeg.Cp
10local find, match = string.find, string.match
11
12local lexer        = require("scite-context-lexer")
13local context      = lexer.context
14local patterns     = context.patterns
15
16local token        = lexer.token
17
18local bidilexer    = lexer.new("bidi","scite-context-lexer-bidi")
19local whitespace   = bidilexer.whitespace
20
21local space        = patterns.space
22local any          = patterns.any
23
24-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
25-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
26
require("char-def")

-- The per-character records that the lazy tables below consult. The original
-- code indexed a bare "data" name that is never defined in this file, which
-- raises an error on the first lookup. We assume that char-def stores its
-- records in characters.data (TODO confirm); a global "data" is still
-- honoured as fallback so that a setup which does provide one keeps working.
local chardata = characters.data or rawget(_G,"data") or { }

-- Create a table that resolves chardata[k][name] on first access and caches
-- the outcome in the table itself. Characters without a record, or without
-- the requested field, are cached as false so they are looked up only once.
local function lazy_property(name)
    return setmetatable({ }, { __index = function(t,k)
        local d = chardata[k]
        local v = d and d[name] or false -- for directions: maybe 'l'
        t[k] = v
        return v
    end })
end

characters.directions  = lazy_property("direction")
characters.mirrors     = lazy_property("mirror")
characters.textclasses = lazy_property("textclass")

local directiondata  = characters.directions
local mirrordata     = characters.mirrors
local textclassdata  = characters.textclasses
77
-- Upper bound on the nesting of explicit embeddings / overrides (unicode: 60,
-- will be jumped to 125, we don't care too much).
local maximum_stack  = 255
-- When true, resolve_levels pairs up fences (N0) before resolving neutrals.
local analyze_fences = false

-- Original directions that count as whitespace in the L1 pass.
local whitespace = { }
for _, name in ipairs { "lre", "rle", "lro", "rlo", "pdf", "bn", "ws" } do
    whitespace[name] = true
end

-- Directions that form runs of neutrals in the N1 / N2 pass.
local b_s_ws_on = { }
for _, name in ipairs { "b", "s", "ws", "on" } do
    b_s_ws_on[name] = true
end

-- Metatables that provide default fields for a few kinds of list entries.
local mt_space = {
    __index = { char = 0x0020, direction = "ws",  original = "ws",  level = 0 },
}
local mt_lre = {
    __index = { char = 0x202A, direction = "lre", original = "lre", level = 0 },
}
local mt_rle = {
    __index = { char = 0x202B, direction = "rle", original = "rle", level = 0 },
}
local mt_pdf = {
    __index = { char = 0x202C, direction = "pdf", original = "pdf", level = 0 },
}
local mt_object = {
    __index = { char = 0xFFFC, direction = "on",  original = "on",  level = 0 },
}
103
-- The entries of the paragraph that is being analyzed; shared by all passes.
local list  = { }

-- Scratch stack: reading a slot that does not exist yet creates an empty
-- table, stores it in that slot and returns it.
local stack = setmetatable({ }, {
    __index = function(self,slot)
        local fresh = { }
        self[slot] = fresh
        return fresh
    end,
})
108
-- Fill the shared list with one entry per character of the string head and
-- return the list plus the number of entries.
--
-- The original body called the undefined names lpegmatch and pattern and
-- never stored anything, so it could not run. This version decodes the utf-8
-- sequences itself (plain string.byte arithmetic, so no extra dependencies).
-- A byte that does not start a well formed sequence becomes an entry of its
-- own, which keeps the loop progressing on malformed input.
--
-- Each entry gets the fields that the later passes read: char, direction,
-- original and level. The direction is whatever directiondata yields for the
-- code point, which is false when the character is unknown.
--
-- P1: the subject is treated as one paragraph here; no splitting is done.
local function build_list(head)
    local byte   = string.byte
    local length = #head
    local size   = 0
    local pos    = 1
    while pos <= length do
        local b     = byte(head,pos)
        local chr   = b
        local width = 1
        if b >= 0xC0 and b < 0xF8 then
            local w, c
            if b >= 0xF0 then
                w, c = 4, b - 0xF0
            elseif b >= 0xE0 then
                w, c = 3, b - 0xE0
            else
                w, c = 2, b - 0xC0
            end
            if pos + w - 1 <= length then
                local okay = true
                for k=pos+1,pos+w-1 do
                    local t = byte(head,k)
                    if t < 0x80 or t > 0xBF then
                        okay = false
                        break
                    end
                    c = c * 0x40 + (t - 0x80)
                end
                if okay then
                    chr, width = c, w
                end
            end
        end
        local direction = directiondata[chr]
        size = size + 1
        list[size] = { char = chr, direction = direction, original = direction, level = 0 }
        pos = pos + width
    end
    -- the list is shared between runs: drop what a longer previous run left
    for i=#list,size+1,-1 do
        list[i] = nil
    end
    return list, size
end
115
-- N0: funny effects, not always better, so it's an option
--
-- Walks list[start..limit] and, for every "on" entry that has a mirror,
-- stores that mirror and its text class in the entry. Entries of class "open"
-- are pushed on the shared stack; an entry of class "close" unwinds the stack
-- until it meets an opener whose mirror is this character and then links both
-- entries through their paired fields. The size argument is not used.
local function resolve_fences(list,size,start,limit)
    local depth = 0
    for index=start,limit do
        local item   = list[index]
        local char   = item.char
        local mirror = item.direction == "on" and mirrordata[char]
        if mirror then
            local class = textclassdata[char]
            item.mirror = mirror
            item.class  = class
            if class == "open" then
                depth = depth + 1
                local top = stack[depth]
                top[1] = mirror
                top[2] = index
                top[3] = false -- not used
            elseif class == "close" and depth > 0 then
                while depth > 0 do
                    local top = stack[depth]
                    if top[1] == char then
                        -- the matching opener stays on the stack
                        local open = top[2]
                        list[open ].paired = index
                        list[index].paired = open
                        break
                    end
                    -- no match: do we mirror or not
                    depth = depth - 1
                end
            end
        end
    end
end
155
-- Determine the base level of the paragraph.
--
-- Returns three values: the level (0 or 1), the matching direction name
-- ("TLT" or "TRT") and a boolean that is false only when neither an explicit
-- direction was passed nor a strong character was found in list[1..size].
local function get_baselevel(list,size,direction)
    -- an explicitly requested direction wins
    if direction == "TRT" then
        return 1, "TRT", true
    end
    if direction == "TLT" then
        return 0, "TLT", true
    end
    -- P2, P3: otherwise the first strong character decides
    local n = 1
    while n <= size do
        local d = list[n].direction
        if d == "l" then
            return 0, "TLT", true
        elseif d == "r" or d == "al" then -- and an ?
            return 1, "TRT", true
        end
        n = n + 1
    end
    return 0, "TLT", false
end
174
-- The explicit embedding / override codes: whether the new level has to be
-- odd (right to left) and which override becomes active.
local explicit_codes = {
    rle = { odd = true,  override = "on" }, -- X2
    lre = { odd = false, override = "on" }, -- X3
    rlo = { odd = true,  override = "r"  }, -- X4
    lro = { odd = false, override = "l"  }, -- X5
}

-- X1 .. X7: assign an embedding level to every entry of list[1..size].
--
-- An embedding or override code pushes the current level and override on the
-- shared stack and raises the level to the next odd or even value; a pdf pops
-- that state again. The codes themselves get direction "bn" and remove = true.
-- All other entries get the current level and, when an override is active,
-- the direction of that override.
--
-- Changes compared to the original:
--   * a pdf was popped whenever the stack was not full, also when it was
--     empty; that read the auto created stack[0], made the level nil (so the
--     next level % 2 failed) and drove the stack counter negative. Now an
--     unmatched pdf only becomes a "bn" at the current level.
--   * codes that do not fit on the stack are counted, and the pdfs that close
--     them are skipped instead of popping a valid embedding. As before, such
--     codes (and now their pdfs) are left untouched.
--   * the four identical push branches are driven by the explicit_codes table.
local function resolve_explicit(list,size,baselevel)
    -- X1
    local level    = baselevel
    local override = "on"
    local nofstack = 0
    local overflow = 0
    for i=1,size do
        local entry     = list[i]
        local direction = entry.direction
        local code      = explicit_codes[direction]
        if code then
            -- X2 .. X5
            if nofstack < maximum_stack then
                nofstack        = nofstack + 1
                local stacktop  = stack[nofstack]
                stacktop[1]     = level
                stacktop[2]     = override
                if code.odd then
                    level = level + (level % 2 == 1 and 2 or 1) -- least_greater_odd(level)
                else
                    level = level + (level % 2 == 1 and 1 or 2) -- least_greater_even(level)
                end
                override        = code.override
                entry.level     = level
                entry.direction = "bn"
                entry.remove    = true
            else
                overflow = overflow + 1
            end
        elseif direction == "pdf" then
            -- X7
            if overflow > 0 then
                -- closes a code that was never pushed
                overflow = overflow - 1
            else
                if nofstack > 0 then
                    local stacktop = stack[nofstack]
                    level          = stacktop[1]
                    override       = stacktop[2]
                    nofstack       = nofstack - 1
                end
                entry.level     = level
                entry.direction = "bn"
                entry.remove    = true
            end
        else
            -- X6
            entry.level = level
            if override ~= "on" then
                entry.direction = override
            end
        end
    end
    -- X8 (reset states and overrides after paragraph)
end
262
-- W1 .. W7: resolve the weak types in list[start..limit], which is expected
-- to be one run of entries at the same level. The orderbefore and orderafter
-- arguments are the directions assumed before and after the run; the size
-- argument is not used. Only the direction field of the entries is changed.
--
-- Changes compared to the original:
--   * W4 used a hand rolled sliding window that assigned to the undefined
--     names current / before, never advanced the entry under inspection and
--     kept running beyond limit. It is replaced by the plain indexed loop
--     that was already present as a disabled alternative.
--   * W5 used the undefined name sor for a run of "et" at the start of the
--     range, which made it index list[start-1] instead; it now uses
--     orderbefore, like W1 and W7 do.
local function resolve_weak(list,size,start,limit,orderbefore,orderafter)
    -- W1: non spacing marks get the direction of the previous character
    for i=start,limit do
        local entry = list[i]
        if entry.direction == "nsm" then
            if i == start then
                entry.direction = orderbefore
            else
                entry.direction = list[i-1].direction
            end
        end
    end
    -- W2: mess with numbers and arabic
    for i=start,limit do
        local entry = list[i]
        if entry.direction == "en" then
            -- look back for the nearest strong character in this run
            for j=i-1,start,-1 do
                local direction = list[j].direction
                if direction == "al" then
                    entry.direction = "an"
                    break
                elseif direction == "r" or direction == "l" then
                    break
                end
            end
        end
    end
    -- W3
    for i=start,limit do
        local entry = list[i]
        if entry.direction == "al" then
            entry.direction = "r"
        end
    end
    -- W4: make separators number (a single separator between two numbers)
    for i=start+1,limit-1 do
        local entry     = list[i]
        local direction = entry.direction
        if direction == "es" then
            if list[i-1].direction == "en" and list[i+1].direction == "en" then
                entry.direction = "en"
            end
        elseif direction == "cs" then
            local prevdirection = list[i-1].direction
            if prevdirection == "en" then
                if list[i+1].direction == "en" then
                    entry.direction = "en"
                end
            elseif prevdirection == "an" and list[i+1].direction == "an" then
                entry.direction = "an"
            end
        end
    end
    -- W5: a run of terminators next to a european number joins that number
    local i = start
    while i <= limit do
        if list[i].direction == "et" then
            local runstart = i
            local runlimit = runstart
            for j=runstart,limit do
                if list[j].direction == "et" then
                    runlimit = j
                else
                    break
                end
            end
            local rundirection
            if runstart == start then
                rundirection = orderbefore
            else
                rundirection = list[runstart-1].direction
            end
            if rundirection ~= "en" then
                if runlimit == limit then
                    rundirection = orderafter
                else
                    rundirection = list[runlimit+1].direction
                end
            end
            if rundirection == "en" then
                for j=runstart,runlimit do
                    list[j].direction = "en"
                end
            end
            i = runlimit
        end
        i = i + 1
    end
    -- W6: what is left of the separators and terminators becomes neutral
    for i=start,limit do
        local entry     = list[i]
        local direction = entry.direction
        if direction == "es" or direction == "et" or direction == "cs" then
            entry.direction = "on"
        end
    end
    -- W7: european numbers that follow a left to right character become "l"
    for i=start,limit do
        local entry = list[i]
        if entry.direction == "en" then
            local prev_strong = orderbefore
            for j=i-1,start,-1 do
                local direction = list[j].direction
                if direction == "l" or direction == "r" then
                    prev_strong = direction
                    break
                end
            end
            if prev_strong == "l" then
                entry.direction = "l"
            end
        end
    end
end
411
-- N1, N2: give every run of neutral entries (b, s, ws, on) in
-- list[start..limit] a strong direction. The entry before and after the run
-- decide (numbers count as "r"; at the edges of the range orderbefore and
-- orderafter are used). When both sides agree the run gets that direction
-- (N1), otherwise it gets the direction that matches the level of the first
-- entry of the run (N2). The size argument is not used.
--
-- The original used a numeric for loop and assigned to its control variable
-- in order to skip the run that was just handled. Such an assignment does not
-- influence the iteration, so the skip never happened; a while loop makes the
-- skip real. The results are the same because a resolved run no longer holds
-- neutral entries.
local function resolve_neutral(list,size,start,limit,orderbefore,orderafter)
    local i = start
    while i <= limit do
        local entry = list[i]
        if b_s_ws_on[entry.direction] then
            -- this needs checking
            local leading_direction, trailing_direction, resolved_direction
            local runstart = i
            local runlimit = runstart
            for j=runstart+1,limit do
                if b_s_ws_on[list[j].direction] then
                    runlimit = j
                else
                    break
                end
            end
            if runstart == start then
                leading_direction = orderbefore
            else
                leading_direction = list[runstart-1].direction
                if leading_direction == "en" or leading_direction == "an" then
                    leading_direction = "r"
                end
            end
            if runlimit == limit then
                trailing_direction = orderafter
            else
                trailing_direction = list[runlimit+1].direction
                if trailing_direction == "en" or trailing_direction == "an" then
                    trailing_direction = "r"
                end
            end
            if leading_direction == trailing_direction then
                -- N1
                resolved_direction = leading_direction
            else
                -- N2 / does the weird period
                resolved_direction = entry.level % 2 == 1 and "r" or "l"
            end
            for j=runstart,runlimit do
                list[j].direction = resolved_direction
            end
            -- continue after the run
            i = runlimit
        end
        i = i + 1
    end
end
459
-- I1, I2: raise the level of the entries in list[start..limit] according to
-- their resolved direction. On an even level "r" goes up by one and numbers
-- ("an", "en") by two; on an odd level "l" and numbers go up by one. Other
-- entries keep their level. Only start and limit are used from the remaining
-- arguments.
local function resolve_implicit(list,size,start,limit,orderbefore,orderafter,baselevel)
    for n=start,limit do
        local item = list[n]
        local lvl  = item.level
        local dir  = item.direction
        local odd  = lvl % 2 == 1
        local bump = 0
        if dir == "an" or dir == "en" then
            bump = odd and 1 or 2 -- I2 / I1
        elseif dir == "r" then
            bump = odd and 0 or 1 -- I1
        elseif dir == "l" then
            bump = odd and 1 or 0 -- I2
        end
        if bump > 0 then
            item.level = lvl + bump
        end
    end
end
480
-- X10, L1, L4: split list[1..size] in runs of entries that share a level,
-- resolve the weak and neutral types and the implicit levels per run, reset
-- the level of separators and surrounding whitespace to baselevel and finally
-- settle the mirror field of the entries.
--
-- Change compared to the original: a run stopped at the first entry that had
-- a different level, but that entry was still treated as part of the run and
-- also used as the start of the next one, so it was resolved twice and with
-- the wrong neighbours; a final run of one entry (including a paragraph of
-- one entry) was never resolved at all. Now limit is the last entry that has
-- the level of the run and the next run starts right after it.
local function resolve_levels(list,size,baselevel,analyze_fences)
    -- X10
    local start = 1
    while start <= size do
        local level = list[start].level
        local limit = start
        while limit < size and list[limit+1].level == level do
            limit = limit + 1
        end
        local prev_level  = start == 1    and baselevel or list[start-1].level
        local next_level  = limit == size and baselevel or list[limit+1].level
        -- the higher of the two levels at a boundary decides the direction
        local orderbefore = (level > prev_level and level or prev_level) % 2 == 1 and "r" or "l"
        local orderafter  = (level > next_level and level or next_level) % 2 == 1 and "r" or "l"
        -- W1 .. W7
        resolve_weak(list,size,start,limit,orderbefore,orderafter)
        -- N0
        if analyze_fences then
            resolve_fences(list,size,start,limit)
        end
        -- N1 .. N2
        resolve_neutral(list,size,start,limit,orderbefore,orderafter)
        -- I1 .. I2
        resolve_implicit(list,size,start,limit,orderbefore,orderafter,baselevel)
        start = limit + 1
    end
    -- L1
    for i=1,size do
        local entry     = list[i]
        local direction = entry.original
        -- (1) separators go back to the base level
        if direction == "s" or direction == "b" then
            entry.level = baselevel
            -- (2) and so does the whitespace right before them
            for j=i-1,1,-1 do
                local entry = list[j]
                if whitespace[entry.original] then
                    entry.level = baselevel
                else
                    break
                end
            end
        end
    end
    -- (3) as well as the whitespace at the end
    for i=size,1,-1 do
        local entry = list[i]
        if whitespace[entry.original] then
            entry.level = baselevel
        else
            break
        end
    end
    -- L4
    if analyze_fences then
        -- mirrors were set by resolve_fences: keep them only for paired
        -- entries that ended up on an odd level
        for i=1,size do
            local entry = list[i]
            if entry.level % 2 == 1 then -- odd(entry.level)
                if entry.mirror and not entry.paired then
                    entry.mirror = false
                end
                -- okay
            elseif entry.mirror then
                entry.mirror = false
            end
        end
    else
        -- no fence analysis: every entry on an odd level gets its mirror
        for i=1,size do
            local entry = list[i]
            if entry.level % 2 == 1 then -- odd(entry.level)
                local mirror = mirrordata[entry.char]
                if mirror then
                    entry.mirror = mirror
                end
            end
        end
    end
end
558
-- Cursor into the analyzed list; process resets it to 1 after each analysis.
local index = 1

-- Run the complete analysis on head: build the list, determine the base
-- level, resolve the explicit codes and then the levels. Returns the list
-- and its size. The optional direction ("TRT" or "TLT") forces the base
-- level (we always have an inline dir node in context).
local function process(head,direction)
    local entries, count = build_list(head)
    -- only the first result (the level) of get_baselevel is needed here
    local base = (get_baselevel(entries,count,direction))
    resolve_explicit(entries,count,base)
    resolve_levels(entries,count,base,analyze_fences)
    index = 1
    return entries, count
end
569
570-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
571-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --
572
573local utf     = lexer.helpers.utfbytepattern
574
575-- local t_start = token("default", utf, function(s,i) if i == 1 then index = 1 process(s) end end))
576-- local t_bidi  = token("error",   utf / function() index = index + 1 return list[index].direction == "r" end)
577-- local t_rest  = token("default", any)
578
579-- bidilexer._rules = {
580--     { "start", t_start },
581--     { "bidi",  t_bidi  },
582--     { "rest",  t_rest  },
583-- }
584
-- The grammar is one match time function that runs when the subject starts
-- with a utf character: it analyzes the whole subject and produces a table
-- with, for every analyzed entry, its index followed by the style "error".
--
-- Changes compared to the original:
--   * the size returned by process is used; the original looped up to an
--     undefined size, which is an error.
--   * a match time function has to return the new position (or true) first
--     and its captures after that; returning just the table is rejected by
--     lpeg as an invalid position. The whole subject is reported as consumed.
--
-- NOTE(review): the table holds index / style pairs in the order of the
-- original code; whether the lexer framework expects this order, and subject
-- positions instead of entry indices, could not be verified here.
bidilexer._grammar = #utf * function(s,i)
    local _, size = process(s)
    local t = { }
    local n = 0
    for k=1,size do
        n = n + 1 t[n] = k
        n = n + 1 t[n] = "error"
    end
    return #s + 1, t
end
595
596bidilexer._tokenstyles = context.styleset
597
598return bidilexer
599