-- scite-context-lexer-xml.lua /size: 10 Kb    last modification: 2021-10-28 13:49
-- Metadata record describing this lexer module (consumed by the context
-- lexer framework; not otherwise used in this file).
local info = {
    version   = 1.002,
    comment   = "scintilla lpeg lexer for xml",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9-- adapted from the regular context pretty printer code (after all, lexing
10-- boils down to much of the same and there are only so many ways to do
11-- things). Simplified a bit as we have a different nesting model.
12
13-- todo: parse entities in attributes
14
-- Localize globals and the lpeg constructors (locals are faster than
-- global table lookups in hot lexing code).
local global, string, table, lpeg = _G, string, table, lpeg
local P, R, S, C, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cmt, lpeg.Cp
local type = type
local match, find = string.match, string.find

-- The shared context lexer framework (provides patterns, token helpers,
-- lexer construction and embedding).
local lexers           = require("scite-context-lexer")

local patterns         = lexers.patterns
local token            = lexers.token

-- The xml lexer object itself plus its whitespace style name.
local xmllexer         = lexers.new("xml","scite-context-lexer-xml")
local xmlwhitespace    = xmllexer.whitespace

-- Child lexers embedded further down for comments, cdata, script content
-- and <?lua ?> islands.
local xmlcommentlexer  = lexers.load("scite-context-lexer-xml-comment")
local xmlcdatalexer    = lexers.load("scite-context-lexer-xml-cdata")
local xmlscriptlexer   = lexers.load("scite-context-lexer-xml-script")
local lualexer         = lexers.load("scite-context-lexer-lua")
32
33
local space            = patterns.space
local any              = patterns.any

-- Single-character building blocks.
local dquote           = P('"')
local squote           = P("'")
local colon            = P(":")
local semicolon        = P(";")
local equal            = P("=")
local ampersand        = P("&")

-- A simplified xml name: ascii letters, digits and "_-." only (no unicode
-- name characters; good enough for highlighting).
local name             = (R("az","AZ","09") + S("_-."))^1
local openbegin        = P("<")
local openend          = P("</")
local closebegin       = P("/>") + P(">") -- "/>" must be tried before ">"
local closeend         = P(">")
local opencomment      = P("<!--")
local closecomment     = P("-->")
local openinstruction  = P("<?")
local closeinstruction = P("?>")
local opencdata        = P("<![CDATA[")
local closecdata       = P("]]>")
local opendoctype      = P("<!DOCTYPE") -- could grab the whole doctype
local closedoctype     = P("]>") + P(">")
-- Whole <script ...> open tag resp. </script> close tag (case variants);
-- these delimit the embedded script lexer below.
local openscript       = openbegin * (P("script") + P("SCRIPT")) * (1-closeend)^0 * closeend -- begin
local closescript      = openend   * (P("script") + P("SCRIPT"))                  * closeend

-- Delimiters for embedded lua processing instructions: <?lua ... ?>.
local openlua          = "<?lua"
local closelua         = "?>"
62
63-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
64-- <!DOCTYPE Something PUBLIC "... ..." "..." >
65-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
66-- <!DOCTYPE Something SYSTEM "... ..." >
67-- <!DOCTYPE Something [ ... ] >
68-- <!DOCTYPE Something >
69
-- An entity reference: "&" up to the next ";" (content is not validated).
local entity           = ampersand * (1-semicolon)^1 * semicolon

-- Helpers shared with the other context lexers; the word patterns drive
-- spell checking, invisibles get their own style.
local utfchar          = lexers.helpers.utfchar
local wordtoken        = patterns.wordtoken
local iwordtoken       = patterns.iwordtoken
local wordpattern      = patterns.wordpattern
local iwordpattern     = patterns.iwordpattern
local invisibles       = patterns.invisibles
local styleofword      = lexers.styleofword
local setwordlist      = lexers.setwordlist
local validwords       = false -- current spell checking word list (false = disabled)
local validminimum     = 3     -- minimum word length considered for checking
82
83-- <?xml version="1.0" encoding="UTF-8" language="uk" ?>
84--
85-- <?context-directive editor language us ?>
86
-- The preamble is matched once at the start of a document. When the xml
-- declaration is followed by a context editor directive, e.g.
--
--   <?xml version="1.0" ?> <?context-directive editor language us ?>
--
-- the two-character language code selects the word list used for spell
-- checking. The Cmt deliberately returns false so the match fails and the
-- same text is then lexed again as a regular processing instruction.
xmllexer.preamble = Cmt(P("<?xml " + P(true)), function(input,i) -- todo: utf bomb, no longer #
    validwords   = false
    validminimum = 3
    local language = match(input,"^<%?xml[^>]*%?>%s*<%?context%-directive%s+editor%s+language%s+(..)%s+%?>")
    if language then
        validwords, validminimum = setwordlist(language)
    end
    return false -- so we go back and now handle the line as processing instruction
end)
96
-- A word run through the spell checker; styleofword picks a style based on
-- the current word list and minimum length.
local t_word =
    C(iwordpattern) * Cp() / function(s,p) return styleofword(validwords,validminimum,s,p) end  -- a bit of a hack

-- Catch-all: any single character not matched by another rule.
local t_rest =
    token("default", any)

-- Plain text up to markup, entity or whitespace (currently disabled in the
-- rules table below).
local t_text =
    token("default", (1-S("<>&")-space)^1)

-- Whitespace in the lexer's own whitespace style.
local t_spacing =
    token(xmlwhitespace, space^1)

-- Optional resp. mandatory whitespace inside markup, styled as default
-- (not as xmlwhitespace — see the note below about style switching).
local t_optionalwhitespace =
    token("default", space^1)^0

local t_localspacing =
    token("default", space^1)
114
115-- Because we want a differently colored open and close we need an embedded lexer (whitespace
116-- trigger). What is actually needed is that scintilla applies the current whitespace style.
117-- Even using different style keys is not robust as they can be shared. I'll fix the main
118-- lexer code.
119
-- A quoted value: the quotes themselves get the "quote" style, the content
-- the "string" style (different from context). NOTE(review): despite the
-- names, t_sstring matches double-quoted and t_dstring single-quoted text;
-- the names are kept as-is because later patterns refer to them.
local function t_qstring(quote)
    return token("quote", quote)
         * token("string", (1 - quote)^0)
         * token("quote", quote)
end

local t_sstring = t_qstring(dquote)
local t_dstring = t_qstring(squote)
129
130-- local t_comment =
131--     token("command",opencomment)
132--   * token("comment",(1-closecomment)^0) -- different from context
133--   * token("command",closecomment)
134
135-- local t_cdata =
136--     token("command",opencdata)
137--   * token("comment",(1-closecdata)^0)   -- different from context
138--   * token("command",closecdata)
139
140-- maybe cdata just text (then we don't need the extra lexer as we only have one comment then)
141
142-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
143-- <!DOCTYPE Something PUBLIC "... ..." "..." >
144-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
145-- <!DOCTYPE Something SYSTEM "... ..." >
146-- <!DOCTYPE Something [ ... ] >
147-- <!DOCTYPE Something >
148
149-- <!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
150-- <!ENTITY xxxx PUBLIC "yyyy" >
151-- <!ENTITY xxxx "yyyy" >
152
-- A quoted literal inside a doctype (either quote flavour).
local t_docstr  = t_dstring + t_sstring

-- <!ENTITY name SYSTEM "..." NDATA name>
-- <!ENTITY name PUBLIC "...">   (note: only one literal is accepted here)
-- <!ENTITY name "...">
local t_docent  = token("command",P("<!ENTITY"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * token("constant",P("NDATA"))
                      * t_optionalwhitespace
                      * token("keyword",name)
                    ) + (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                    ) + (
                        t_docstr
                    )
                  )
                * t_optionalwhitespace
                * token("command",P(">"))
178
-- <!ELEMENT name ( ... ) > : the content model between the parentheses is
-- only loosely lexed — known constants and commas get their own styles,
-- anything else up to "," or ")" is styled as comment.
local t_docele  = token("command",P("<!ELEMENT"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * token("command",P("("))
                * (
                    t_localspacing
                  + token("constant",P("#CDATA") + P("#PCDATA") + P("ANY"))
                  + token("text",P(","))
                  + token("comment",(1-S(",)"))^1)
                  )^1
                * token("command",P(")"))
                * t_optionalwhitespace
                * token("command",P(">"))
193
-- The doctype internal subset: "[ ... ]" containing entity and element
-- declarations; anything unrecognized inside is lumped into one comment.
local t_docset  = token("command",P("["))
                * t_optionalwhitespace
                * ((t_optionalwhitespace * (t_docent + t_docele))^1 + token("comment",(1-P("]"))^0))
                * t_optionalwhitespace
                * token("command",P("]"))
199
-- <!DOCTYPE name [PUBLIC "..." "..." | SYSTEM "..."] [ ... ] > — the
-- external identifier and the internal subset are both optional (see the
-- example forms in the comment above).
local t_doctype = token("command",P("<!DOCTYPE"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      ) + (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      )
                  )^-1
                * t_docset^-1
                * t_optionalwhitespace
                * token("command",P(">"))
222
-- Embed the child lexers: between the open pattern and the close pattern
-- (both styled "command") the child lexer takes over — lua islands,
-- comments, cdata sections and script element content respectively.
lexers.embed(xmllexer, lualexer,        token("command", openlua),     token("command", closelua))
lexers.embed(xmllexer, xmlcommentlexer, token("command", opencomment), token("command", closecomment))
lexers.embed(xmllexer, xmlcdatalexer,   token("command", opencdata),   token("command", closecdata))
lexers.embed(xmllexer, xmlscriptlexer,  token("command", openscript),  token("command", closescript))
227
228-- local t_name =
229--     token("plain",name)
230--   * (
231--         token("default",colon)
232--       * token("keyword",name)
233--     )
234--   + token("keyword",name)
235
-- A possibly namespaced tag name: the optional "prefix:" part is styled
-- plain, the local name as keyword.
local t_name = -- more robust
    token("plain",name * colon)^-1
  * token("keyword",name)
239
240-- local t_key =
241--     token("plain",name)
242--   * (
243--         token("default",colon)
244--       * token("constant",name)
245--     )
246--   + token("constant",name)
247
-- A possibly namespaced attribute key: optional "prefix:" styled plain,
-- the local name as constant.
local t_key =
    token("plain",name * colon)^-1
  * token("constant",name)
251
-- One attribute: key, "=", then a quoted value, with whitespace tolerated
-- everywhere around the parts.
local t_attribute =
    t_optionalwhitespace
  * t_key
  * t_optionalwhitespace
  * token("plain", equal)
  * t_optionalwhitespace
  * (t_dstring + t_sstring)
  * t_optionalwhitespace

-- Zero or more attributes, as found inside an open tag.
local t_attributes = t_attribute^0
261
-- Shared shape of a tag after its opening "<" or "</": either well-formed
-- content followed by the closing token (styled keyword), or everything up
-- to that closing token flagged as an error.
local function t_tagged(content, closer)
    return content * token("keyword", closer)
         + token("error", (1 - closer)^1)
end

-- An open (or self-closing) tag: name plus attributes up to ">" or "/>".
local t_open =
    token("keyword", openbegin)
  * t_tagged(t_name * t_optionalwhitespace * t_attributes, closebegin)

-- A close tag: just the name up to ">".
local t_close =
    token("keyword", openend)
  * t_tagged(t_name * t_optionalwhitespace, closeend)
282
-- An entity reference like &amp; styled as a constant.
local t_entity =
    token("constant",entity)

-- Processing instructions: the xml declaration gets full attribute
-- treatment; any other instruction's body is lexed as default text.
local t_instruction =
    token("command",openinstruction * P("xml"))
  * t_optionalwhitespace
  * t_attributes
  * t_optionalwhitespace
  * token("command",closeinstruction)
  + token("command",openinstruction * name)
  * token("default",(1-closeinstruction)^1)
  * token("command",closeinstruction)

-- Runs of invisible (non-printable) characters get their own style.
local t_invisible =
    token("invisible",invisibles^1)
298
-- The rule list, tried in order: whitespace first (the framework uses it
-- for style/lexer switching), the single-character catch-all last.
xmllexer.rules = {
    { "whitespace",  t_spacing     },
    { "word",        t_word        },
 -- { "text",        t_text        },
 -- { "comment",     t_comment     },
 -- { "cdata",       t_cdata       },
    { "doctype",     t_doctype     },
    { "instruction", t_instruction },
    { "close",       t_close       },
    { "open",        t_open        },
    { "entity",      t_entity      },
    { "invisible",   t_invisible   },
    { "rest",        t_rest        },
}

-- Fold points, keyed by matched text then by style name, mapping to fold
-- level deltas (presumably +1 opens and -1 closes a fold — handled by the
-- shared lexer framework).
xmllexer.folding = {
    ["</"]   = { ["keyword"] = -1 },
    ["/>"]   = { ["keyword"] = -1 },
    ["<"]    = { ["keyword"] =  1 },
    ["<?"]   = { ["command"] =  1 },
    ["<!--"] = { ["command"] =  1 },
    ["?>"]   = { ["command"] = -1 },
    ["-->"]  = { ["command"] = -1 },
    [">"]    = { ["command"] = -1 },
}

return xmllexer
326