-- scite-context-lexer-xml.lua /size: 10 Kb    last modification: 2020-07-01 14:35
-- Metadata describing this lexer module (hash table, key order is irrelevant).
local info = {
    version   = 1.002,
    license   = "see context related readme files",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    comment   = "scintilla lpeg lexer for xml",
}
8
9-- adapted from the regular context pretty printer code (after all, lexing
10-- boils down to much of the same and there are only so many ways to do
11-- things). Simplified a bit as we have a different nesting model.
12
13-- todo: parse entities in attributes
14
15local global, string, table, lpeg = _G, string, table, lpeg
16local P, R, S, C, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cmt, lpeg.Cp
17local type = type
18local match, find = string.match, string.find
19
20local lexer            = require("scite-context-lexer")
21local context          = lexer.context
22local patterns         = context.patterns
23
24local token            = lexer.token
25local exact_match      = lexer.exact_match
26
27local xmllexer         = lexer.new("xml","scite-context-lexer-xml")
28local whitespace       = xmllexer.whitespace
29
30local xmlcommentlexer  = lexer.load("scite-context-lexer-xml-comment")
31local xmlcdatalexer    = lexer.load("scite-context-lexer-xml-cdata")
32local xmlscriptlexer   = lexer.load("scite-context-lexer-xml-script")
33local lualexer         = lexer.load("scite-context-lexer-lua")
34
-- Shared atoms from the lexer framework.

local space            = patterns.space
local any              = patterns.any

-- Single-character helpers.

local dquote           = P('"')
local squote           = P("'")
local colon            = P(":")
local semicolon        = P(";")
local equal            = P("=")
local ampersand        = P("&")

-- Markup delimiters. The name pattern is deliberately lenient: any run of
-- letters, digits, "_", "-" and "." (stricter xml name rules are not needed
-- for highlighting).

local name             = (R("az","AZ","09") + S("_-."))^1
local openbegin        = P("<")
local openend          = P("</")
local closebegin       = P("/>") + P(">")
local closeend         = P(">")
local opencomment      = P("<!--")
local closecomment     = P("-->")
local openinstruction  = P("<?")
local closeinstruction = P("?>")
local opencdata        = P("<![CDATA[")
local closecdata       = P("]]>")
local opendoctype      = P("<!DOCTYPE") -- could grab the whole doctype
local closedoctype     = P("]>") + P(">")
local openscript       = openbegin * (P("script") + P("SCRIPT")) * (1-closeend)^0 * closeend -- begin
local closescript      = openend   * (P("script") + P("SCRIPT"))                  * closeend

-- Delimiters for embedded lua; plain strings here, they get wrapped in token()
-- when passed to the embedder below.

local openlua          = "<?lua"
local closelua         = "?>"
63
64-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
65-- <!DOCTYPE Something PUBLIC "... ..." "..." >
66-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
67-- <!DOCTYPE Something SYSTEM "... ..." >
68-- <!DOCTYPE Something [ ... ] >
69-- <!DOCTYPE Something >
70
-- An entity reference: "&" up to the next ";" (e.g. &amp; or &#169;). The body
-- is not validated, just grabbed.

local entity           = ampersand * (1-semicolon)^1 * semicolon

-- Helpers from the shared context lexer framework, used for spell checking
-- words in the document content.

local utfchar          = context.utfchar
local wordtoken        = context.patterns.wordtoken
local iwordtoken       = context.patterns.iwordtoken
local wordpattern      = context.patterns.wordpattern
local iwordpattern     = context.patterns.iwordpattern
local invisibles       = context.patterns.invisibles
local checkedword      = context.checkedword
local styleofword      = context.styleofword
local setwordlist      = context.setwordlist
local validwords       = false -- current word list, reset/set by the preamble scanner
local validminimum     = 3     -- minimum word length considered for checking
84
85-- <?xml version="1.0" encoding="UTF-8" language="uk" ?>
86--
87-- <?context-directive editor language us ?>
88
-- Scan the start of the document for a context directive that selects the
-- editor (spell checking) language. The Cmt fires on "<?xml " and always
-- returns false, so it never consumes input: it is used purely for its side
-- effect of (re)setting validwords/validminimum.

local t_preamble = Cmt(P("<?xml "), function(input,i,_) -- todo: utf bomb, no longer #
    if i < 200 then -- only react near the top of the document
        validwords, validminimum = false, 3
        local language = match(input,"^<%?xml[^>]*%?>%s*<%?context%-directive%s+editor%s+language%s+(..)%s+%?>")
     -- if not language then
     --     language = match(input,"^<%?xml[^>]*language=[\"\'](..)[\"\'][^>]*%?>",i)
     -- end
        if language then
            validwords, validminimum = setwordlist(language)
        end
    end
    return false -- never match: regular lexing continues with the other rules
end)
102
-- A word in the document content: styled via the current word list so that
-- spell checking can kick in (validwords is set by t_preamble). Cp() reports
-- the position after the word back to the lexer.

local t_word =
--     Ct( iwordpattern / function(s) return styleofword(validwords,validminimum,s) end * Cp() ) -- the function can be inlined
    iwordpattern / function(s) return styleofword(validwords,validminimum,s) end * Cp() -- the function can be inlined

-- Catch-all: any single character not matched by the other rules.

local t_rest =
    token("default", any)

-- A run of plain text: anything up to markup, an entity or whitespace.

local t_text =
    token("default", (1-S("<>&")-space)^1)

-- Top-level whitespace rule (uses the lexer's whitespace style name, which
-- also drives embedded-lexer switching).

local t_spacing =
    token(whitespace, space^1)

-- Optional whitespace inside markup, rendered in the default style.

local t_optionalwhitespace =
    token("default", space^1)^0

-- Mandatory whitespace inside markup constructs.

local t_localspacing =
    token("default", space^1)
121
122-- Because we want a differently colored open and close we need an embedded lexer (whitespace
123-- trigger). What is actually needed is that scintilla applies the current whitespace style.
124-- Even using different style keys is not robust as they can be shared. I'll fix the main
125-- lexer code.
126
-- Single- and double-quoted attribute-value strings: the quote characters get
-- their own "quote" style so they stand out from the string body.
--
-- NOTE(review): the names used to be swapped (t_sstring matched double quotes,
-- t_dstring single quotes). Since both are only ever used in the sum
-- t_dstring + t_sstring the swap was harmless, but the names now match what
-- they actually lex.

local t_sstring =
    token("quote",squote)
  * token("string",(1-squote)^0)        -- different from context
  * token("quote",squote)

local t_dstring =
    token("quote",dquote)
  * token("string",(1-dquote)^0)        -- different from context
  * token("quote",dquote)
136
137-- local t_comment =
138--     token("command",opencomment)
139--   * token("comment",(1-closecomment)^0) -- different from context
140--   * token("command",closecomment)
141
142-- local t_cdata =
143--     token("command",opencdata)
144--   * token("comment",(1-closecdata)^0)   -- different from context
145--   * token("command",closecdata)
146
147-- maybe cdata just text (then we don't need the extra lexer as we only have one comment then)
148
149-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
150-- <!DOCTYPE Something PUBLIC "... ..." "..." >
151-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
152-- <!DOCTYPE Something SYSTEM "... ..." >
153-- <!DOCTYPE Something [ ... ] >
154-- <!DOCTYPE Something >
155
156-- <!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
157-- <!ENTITY xxxx PUBLIC "yyyy" >
158-- <!ENTITY xxxx "yyyy" >
159
-- A doctype/entity string: either quote flavor.

local t_docstr  = t_dstring + t_sstring

-- <!ENTITY ...> declarations, covering the SYSTEM ... NDATA, PUBLIC and plain
-- string variants shown in the examples above.

local t_docent  = token("command",P("<!ENTITY"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * token("constant",P("NDATA"))
                      * t_optionalwhitespace
                      * token("keyword",name)
                    ) + (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                    ) + (
                        t_docstr
                    )
                  )
                * t_optionalwhitespace
                * token("command",P(">"))
185
-- <!ELEMENT ...> declarations: the content model between the parentheses is
-- only roughly lexed (a few keywords and commas, everything else styled as
-- comment).

local t_docele  = token("command",P("<!ELEMENT"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * token("command",P("("))
                * (
                    t_localspacing
                  + token("constant",P("#CDATA") + P("#PCDATA") + P("ANY"))
                  + token("text",P(","))
                  + token("comment",(1-S(",)"))^1)
                  )^1
                * token("command",P(")"))
                * t_optionalwhitespace
                * token("command",P(">"))
200
-- The internal subset: "[" followed by entity/element declarations or, as a
-- fallback, everything up to "]" styled as comment.

local t_docset  = token("command",P("["))
                * t_optionalwhitespace
                * ((t_optionalwhitespace * (t_docent + t_docele))^1 + token("comment",(1-P("]"))^0))
                * t_optionalwhitespace
                * token("command",P("]"))
206
-- <!DOCTYPE ...>: optional PUBLIC (two strings) or SYSTEM (one string)
-- external identifier, an optional internal subset, then the closing ">"
-- (see the example list above).

local t_doctype = token("command",P("<!DOCTYPE"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      ) + (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      )
                  )^-1
                * t_docset^-1
                * t_optionalwhitespace
                * token("command",P(">"))
229
-- Hook in the child lexers: each entry couples an embedded lexer with the
-- open and close patterns that delimit its region; the delimiters themselves
-- are rendered in the "command" style.

local embedded = {
    { lualexer,        openlua,     closelua     },
    { xmlcommentlexer, opencomment, closecomment },
    { xmlcdatalexer,   opencdata,   closecdata   },
    { xmlscriptlexer,  openscript,  closescript  },
}

for i=1,#embedded do
    local e = embedded[i]
    lexer.embed_lexer(xmllexer, e[1], token("command", e[2]), token("command", e[3]))
end
234
235-- local t_name =
236--     token("plain",name)
237--   * (
238--         token("default",colon)
239--       * token("keyword",name)
240--     )
241--   + token("keyword",name)
242
-- An element name with an optional namespace prefix: the prefix (including the
-- colon) is styled plain, the local part as keyword.

local t_name = -- more robust
    token("plain",name * colon)^-1 -- optional namespace prefix
  * token("keyword",name)

-- local t_key =
--     token("plain",name)
--   * (
--         token("default",colon)
--       * token("constant",name)
--     )
--   + token("constant",name)

-- An attribute key: like t_name but the local part gets the constant style.

local t_key =
    token("plain",name * colon)^-1 -- optional namespace prefix
  * token("constant",name)
258
-- Zero or more key="value" pairs, as found inside element tags and the xml
-- declaration.

local t_attributes = (
    t_optionalwhitespace
  * t_key
  * t_optionalwhitespace
  * token("plain",equal)
  * t_optionalwhitespace
  * (t_dstring + t_sstring)
  * t_optionalwhitespace
)^0
268
-- Opening tag: "<" name attributes then ">" or "/>". Anything that does not
-- parse as such is flagged as error up to the closing delimiter.

local t_open =
    token("keyword",openbegin)
  * (
        t_name
      * t_optionalwhitespace
      * t_attributes
      * token("keyword",closebegin)
      +
      token("error",(1-closebegin)^1)
    )

-- Closing tag: "</" name ">" with the same error fallback.

local t_close =
    token("keyword",openend)
  * (
        t_name
      * t_optionalwhitespace
      * token("keyword",closeend)
      +
      token("error",(1-closeend)^1)
    )
289
-- Entity references get the constant style.

local t_entity =
    token("constant",entity)

-- Processing instructions: the xml declaration gets its attributes lexed
-- properly; the body of any other instruction is shown in the default style.

local t_instruction =
    token("command",openinstruction * P("xml"))
  * t_optionalwhitespace
  * t_attributes
  * t_optionalwhitespace
  * token("command",closeinstruction)
  + token("command",openinstruction * name)
  * token("default",(1-closeinstruction)^1)
  * token("command",closeinstruction)

-- Runs of invisible characters (as defined by the framework) are made visible.

local t_invisible =
    token("invisible",invisibles^1)
305
306-- local t_preamble =
307--     token("preamble",  t_preamble   )
308
-- The rule set, tried in listed order: whitespace first (it also triggers the
-- embedded lexers), then the specific constructs, with "rest" as catch-all.

xmllexer._rules = {
    { "whitespace",  t_spacing     },
    { "preamble",    t_preamble    },
    { "word",        t_word        },
 -- { "text",        t_text        },
 -- { "comment",     t_comment     },
 -- { "cdata",       t_cdata       },
    { "doctype",     t_doctype     },
    { "instruction", t_instruction },
    { "close",       t_close       },
    { "open",        t_open        },
    { "entity",      t_entity      },
    { "invisible",   t_invisible   },
    { "rest",        t_rest        },
}
324
-- Map token names to styles via the shared style set.

xmllexer._tokenstyles = context.styleset

-- Candidate fold points, kept as a separate pattern.

xmllexer._foldpattern = P("</") + P("<") + P("/>") -- separate entry else interference
+ P("<!--") + P("-->")

-- Fold levels per token type: +1 opens a fold, -1 closes one.
-- NOTE(review): _patterns lacks "<!--"/"-->" although the command table
-- assigns levels to them — presumably _foldpattern covers that path; confirm
-- against the framework's fold code.

xmllexer._foldsymbols = {
    _patterns = {
        "</",
        "/>",
        "<",
    },
    ["keyword"] = {
        ["</"] = -1,
        ["/>"] = -1,
        ["<"]  =  1,
    },
    ["command"] = {
        ["</"]   = -1,
        ["/>"]   = -1,
        ["<!--"] =  1,
        ["-->"]  = -1,
        ["<"]    =  1,
    },
}

return xmllexer
351