scite-context-lexer-lua.lua /size: 14 Kb    last modification: 2020-07-01 14:35
local info = {
    version   = 1.002,
    comment   = "scintilla lpeg lexer for lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}

-- beware: all multiline is messy, so even if it's not really a lexer, it should be an embedded lexer
-- we could probably use a local whitespace variant but this is cleaner

local P, R, S, C, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cmt, lpeg.Cp
local match, find = string.match, string.find
local setmetatable = setmetatable

local lexer       = require("scite-context-lexer")
local context     = lexer.context
local patterns    = context.patterns

local token       = lexer.token
local exact_match = lexer.exact_match
local just_match  = lexer.just_match

local lualexer    = lexer.new("lua","scite-context-lexer-lua")
local whitespace  = lualexer.whitespace

local stringlexer = lexer.load("scite-context-lexer-lua-longstring")
----- labellexer  = lexer.load("scite-context-lexer-lua-labelstring")

local directives  = { } -- communication channel

-- this will be extended

-- we could combine some in a hash that returns the class that then makes the token
-- this can save time on large files

local keywords = {
    "and", "break", "do", "else", "elseif", "end", "false", "for", "function", -- "goto",
    "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true",
    "until", "while",
}

local functions = {
    "assert", "collectgarbage", "dofile", "error", "getmetatable",
    "ipairs", "load", "loadfile", "module", "next", "pairs",
    "pcall", "print", "rawequal", "rawget", "rawset", "require",
    "setmetatable", "tonumber", "tostring", "type", "unpack", "xpcall", "select",

    "string", "table", "coroutine", "debug", "file", "io", "lpeg", "math", "os", "package", "bit32", "utf8",
}

local constants = {
    "_G", "_VERSION", "_M", "...", "_ENV",
    -- here too
    "__add", "__call", "__concat", "__div", "__idiv", "__eq", "__gc", "__index",
    "__le", "__lt", "__metatable", "__mode", "__mul", "__newindex",
    "__pow", "__sub", "__tostring", "__unm", "__len",
    "__pairs", "__ipairs",
    "__close",
    "NaN",
    "<const>", "<toclose>",
}

-- local tokenmappings = { }
--
-- for i=1,#keywords  do tokenmappings[keywords [i]] = "keyword"  end
-- for i=1,#functions do tokenmappings[functions[i]] = "function" end
-- for i=1,#constants do tokenmappings[constants[i]] = "constant" end

local internals = { -- __
    "add", "call", "concat", "div", "idiv", "eq", "gc", "index",
    "le", "lt", "metatable", "mode", "mul", "newindex",
    "pow", "sub", "tostring", "unm", "len",
    "pairs", "ipairs",
    "close",
}

local depricated = {
    "arg", "arg.n",
    "loadstring", "setfenv", "getfenv",
    "pack",
}

local csnames = { -- todo: option
    "commands",
    "context",
 -- "ctxcmd",
 -- "ctx",
    "metafun",
    "metapost",
}

local level         = nil
local setlevel      = function(_,i,s) level = s return i end

local equals        = P("=")^0

local longonestart  = P("[[")
local longonestop   = P("]]")
local longonestring = (1-longonestop)^0

local longtwostart  = P("[") * Cmt(equals,setlevel) * P("[")
local longtwostop   = P("]") *     equals           * P("]")

local sentinels = { } setmetatable(sentinels, { __index = function(t,k) local v = "]" .. k .. "]" t[k] = v return v end })
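-- the __index metamethod memoizes the closing long bracket for a given level of
-- equal signs, e.g. sentinels["=="] builds (and caches) "]==]"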

local longtwostring = P(function(input,index)
    if level then
     -- local sentinel = "]" .. level .. "]"
        local sentinel = sentinels[level]
        local _, stop = find(input,sentinel,index,true)
        return stop and stop + 1 - #sentinel or #input + 1
    end
end)
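-- once the start pattern has set level (e.g. "==" for "[==["), this matches the
-- string body up to, but not including, the matching "]==]"; an unterminated
-- long string simply runs to the end of the input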

    local longtwostring_body = longtwostring

    local longtwostring_end = P(function(input,index)
        if level then
         -- local sentinel = "]" .. level .. "]"
            local sentinel = sentinels[level]
            local _, stop = find(input,sentinel,index,true)
            return stop and stop + 1 or #input + 1
        end
    end)

local longcomment = Cmt(#("[[" + ("[" * C(equals) * "[")), function(input,index,level)
 -- local sentinel = "]" .. level .. "]"
    local sentinel = sentinels[level]
    local _, stop = find(input,sentinel,index,true)
    return stop and stop + 1 or #input + 1
end)
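-- used below after the '--' dashes: it looks ahead at the long bracket, locates
-- the closing bracket for that level of equal signs, and consumes the whole
-- comment (e.g. --[=[ ... ]=]) including the closing bracket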

local space         = patterns.space -- S(" \n\r\t\f\v")
local any           = patterns.any
local eol           = patterns.eol

local squote        = P("'")
local dquote        = P('"')
local escaped       = P("\\") * P(1)
local dashes        = P("--")

local spacing       = token(whitespace, space^1)
local rest          = token("default",  any)

local shortcomment  = token("comment", dashes * (1-eol)^0)
local longcomment   = token("comment", dashes * longcomment)

-- fails on very long strings with \ at the end of lines (needs an embedded lexer)
-- and also on a newline before the quote, but it makes no sense to waste time on that

local shortstring   = token("quote",  dquote)
                    * token("string", (escaped + (1-dquote))^0)
                    * token("quote",  dquote)
                    + token("quote",  squote)
                    * token("string", (escaped + (1-squote))^0)
                    * token("quote",  squote)

----- longstring    = token("quote",  longonestart)
-----               * token("string", longonestring)
-----               * token("quote",  longonestop)
-----               + token("quote",  longtwostart)
-----               * token("string", longtwostring)
-----               * token("quote",  longtwostop)

local string        = shortstring
-----               + longstring

lexer.embed_lexer(lualexer, stringlexer, token("quote",longtwostart), token("string",longtwostring_body) * token("quote",longtwostring_end))
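-- long bracket strings ([[ .. ]], [==[ .. ]==], ...) are delegated to the
-- embedded longstring lexer: its start pattern is the opening bracket (styled
-- as quote), its end pattern the string body followed by the closing bracket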

local integer       = P("-")^-1 * (patterns.hexadecimal + patterns.decimal)
local number        = token("number", patterns.float + integer)
                    * (token("error",R("AZ","az","__")^1))^0

-- officially 127-255 are ok but not utf so useless

----- validword     = R("AZ","az","__") * R("AZ","az","__","09")^0

local utf8character = P(1) * R("\128\191")^1
local validword     = (R("AZ","az","__") + utf8character) * (R("AZ","az","__","09") + utf8character)^0
local validsuffix   = (R("AZ","az")      + utf8character) * (R("AZ","az","__","09") + utf8character)^0
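-- so identifiers that contain multibyte utf sequences, say "größe", are picked
-- up as one word: the lead byte matches P(1) and the continuation bytes match
-- R("\128\191")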

local identifier    = token("default",validword)

----- operator      = token("special", P('..') + P('~=') + S('+-*/%^#=<>;:,.{}[]()')) -- maybe split off {}[]()
----- operator      = token("special", S('+-*/%^#=<>;:,{}[]()') + P('..') + P('.') + P('~=') ) -- maybe split off {}[]()
----- operator      = token("special", S('+-*/%^#=<>;:,{}[]().') + P('~=') ) -- no ^1 because of nested lexers
local operator      = token("special", S('+-*/%^#=<>;:,{}[]().|~')) -- no ^1 because of nested lexers

local structure     = token("special", S('{}[]()'))

local optionalspace = spacing^0
local hasargument   = #S("{([")

-- ideally this should be an embedded lexer ..

local gotokeyword   = token("keyword", P("goto"))
                    * spacing
                    * token("grouping",validword)
local gotolabel     = token("keyword", P("::"))
                    * (spacing + shortcomment)^0
                    * token("grouping",validword)
                    * (spacing + shortcomment)^0
                    * token("keyword", P("::"))
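-- gotokeyword matches e.g. "goto continue", gotolabel the corresponding
-- "::continue::" (comments between the colons and the name are tolerated)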

----- p_keywords    = exact_match(keywords)
----- p_functions   = exact_match(functions)
----- p_constants   = exact_match(constants)
----- p_internals   = P("__")
-----               * exact_match(internals)

local p_finish      = #(1-R("az","AZ","__"))
local p_keywords    = lexer.helpers.utfchartabletopattern(keywords)  * p_finish -- exact_match(keywords)
local p_functions   = lexer.helpers.utfchartabletopattern(functions) * p_finish -- exact_match(functions)
local p_constants   = lexer.helpers.utfchartabletopattern(constants) * p_finish -- exact_match(constants)
local p_internals   = P("__")
                    * lexer.helpers.utfchartabletopattern(internals) * p_finish -- exact_match(internals)

local p_csnames     = lexer.helpers.utfchartabletopattern(csnames) -- * p_finish -- just_match(csnames)
local p_ctnames     = P("ctx") * R("AZ","az","__")^0
local keyword       = token("keyword", p_keywords)
local builtin       = token("plain",   p_functions)
local constant      = token("data",    p_constants)
local internal      = token("data",    p_internals)
local csname        = token("user",    p_csnames + p_ctnames)
                    * p_finish * optionalspace * (
                        hasargument
                      + ( token("special", S(".:")) * optionalspace * token("user", validword) )^1
                      )^-1
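-- so "context" and friends get the "user" style, also in calls and chains like
-- context("...") or context.bold("...")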

-- we could also check S(".:") * p_keyword etc, could be faster

local identifier    = token("default", validword)
                    * ( optionalspace * token("special", S(".:")) * optionalspace * (
                            token("warning", p_keywords) +
                            token("data", p_internals) + -- needs checking
                            token("default", validword )
                    ) )^0
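-- so in a chain like "x.field" both parts keep the default style, while a
-- keyword used as a field name, say "x.end", gets the warning style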

-- local t = { } for k, v in next, tokenmappings do t[#t+1] = k end t = table.concat(t)
-- -- local experimental =  (S(t)^1) / function(s) return tokenmappings[s] end * Cp()
--
-- local experimental =  Cmt(S(t)^1, function(_,i,s)
--     local t = tokenmappings[s]
--     if t then
--         return true, t, i
--     end
-- end)

lualexer._rules = {
    { "whitespace",   spacing      },
    { "keyword",      keyword      }, -- can be combined
 -- { "structure",    structure    },
    { "function",     builtin      }, -- can be combined
    { "constant",     constant     }, -- can be combined
 -- { "experimental", experimental }, -- works but better split
    { "csname",       csname       },
    { "goto",         gotokeyword  },
    { "identifier",   identifier   },
    { "string",       string       },
    { "number",       number       },
    { "longcomment",  longcomment  },
    { "shortcomment", shortcomment },
    { "label",        gotolabel    },
    { "operator",     operator     },
    { "rest",         rest         },
}

-- -- experiment
--
-- local idtoken = R("az","AZ","__")
--
-- function context.one_of_match(specification)
--     local pattern = idtoken -- the concat catches _ etc
--     local list = { }
--     for i=1,#specification do
--        local style = specification[i][1]
--        local words = specification[i][2]
--        pattern = pattern + S(table.concat(words))
--        for i=1,#words do
--            list[words[i]] = style
--        end
--    end
--    return Cmt(pattern^1, function(_,i,s)
--         local style = list[s]
--         if style then
--             return true, { style, i } -- and i or nil
--         else
--             -- fail
--         end
--    end)
-- end
--
-- local whatever = context.one_of_match {
--     { "keyword", keywords  }, -- keyword
--     { "plain",   functions }, -- builtin
--     { "data",    constants }, -- constant
-- }
--
-- lualexer._rules = {
--     { "whitespace",   spacing      },
--     { "whatever",     whatever     },
--     { "csname",       csname       },
--     { "goto",         gotokeyword  },
--     { "identifier",   identifier   },
--     { "string",       string       },
--     { "number",       number       },
--     { "longcomment",  longcomment  },
--     { "shortcomment", shortcomment },
--     { "label",        gotolabel    },
--     { "operator",     operator     },
--     { "rest",         rest         },
-- }

lualexer._tokenstyles = context.styleset

-- lualexer._foldpattern = R("az")^2 + S("{}[]") -- separate entry else interference

lualexer._foldpattern = (P("end") + P("if") + P("do") + P("function") + P("repeat") + P("until")) * P(#(1 - R("az")))
                      + S("{}[]")

lualexer._foldsymbols = {
    _patterns = {
        "[a-z][a-z]+",
        "[{}%[%]]",
    },
    ["keyword"] = { -- challenge:  if=0  then=1  else=-1  elseif=-1
        ["if"]       =  1, -- if .. [then|else] .. end
        ["do"]       =  1, -- [while] do .. end
        ["function"] =  1, -- function .. end
        ["repeat"]   =  1, -- repeat .. until
        ["until"]    = -1,
        ["end"]      = -1,
    },
    ["comment"] = {
        ["["] = 1, ["]"] = -1,
    },
 -- ["quote"] = { -- confusing
 --     ["["] = 1, ["]"] = -1,
 -- },
    ["special"] = {
     -- ["("] = 1, [")"] = -1,
        ["{"] = 1, ["}"] = -1,
    },
}
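-- the numbers are fold level increments: "function", "if", "do" and "repeat"
-- open a fold that a matching "end" or "until" closes, and braces do the same
-- in code as the long brackets do in comments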

-- embedded in tex:

local cstoken         = R("az","AZ","\127\255") + S("@!?_")
local texcsname       = P("\\") * cstoken^1
local commentline     = P("%") * (1-S("\n\r"))^0

local texcomment      = token("comment", Cmt(commentline, function() return directives.cld_inline end))
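-- a '%' line only counts as a comment when directives.cld_inline is set (via
-- the directives communication channel above, presumably by the host tex
-- lexer); in plain lua '%' is just the modulo operator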

local longthreestart  = P("\\!!bs")
local longthreestop   = P("\\!!es")
local longthreestring = (1-longthreestop)^0

local texstring       = token("quote",  longthreestart)
                      * token("string", longthreestring)
                      * token("quote",  longthreestop)

----- texcommand      = token("user", texcsname)
local texcommand      = token("warning", texcsname)

-- local texstring    = token("quote", longthreestart)
--                    * (texcommand + token("string",P(1-texcommand-longthreestop)^1) - longthreestop)^0 -- we match long non-\cs sequences
--                    * token("quote", longthreestop)

-- local whitespace    = "whitespace"
-- local spacing       = token(whitespace, space^1)

lualexer._directives = directives

lualexer._rules_cld = {
    { "whitespace",   spacing      },
    { "texstring",    texstring    },
    { "texcomment",   texcomment   },
    { "texcommand",   texcommand   },
 -- { "structure",    structure    },
    { "keyword",      keyword      },
    { "function",     builtin      },
    { "csname",       csname       },
    { "goto",         gotokeyword  },
    { "constant",     constant     },
    { "identifier",   identifier   },
    { "string",       string       },
    { "longcomment",  longcomment  },
    { "shortcomment", shortcomment }, -- should not be used inline so best signal it as comment (otherwise complex state till end of inline)
    { "number",       number       },
    { "label",        gotolabel    },
    { "operator",     operator     },
    { "rest",         rest         },
}

return lualexer