-- scite-context-lexer-xml.lua -- size: 11 Kb, last modification: 2025-02-21 11:03
-- Module metadata: provenance, licensing and version of this lexer. Kept as
-- a local table; nothing else in this file reads it.
local info = {
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    comment   = "scintilla lpeg lexer for xml",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
    version   = 1.002,
}

-- adapted from the regular context pretty printer code (after all, lexing
-- boils down to much of the same and there are only so many ways to do
-- things). Simplified a bit as we have a different nesting model.

-- todo: parse entities in attributes

-- Localize globals and the lpeg constructors that are used heavily below.
local global, string, table, lpeg = _G, string, table, lpeg
local P, R, S, C, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cmt, lpeg.Cp
local type = type
local match, find = string.match, string.find

-- The shared lexer framework (provides patterns, token helpers, embedding).
local lexers           = require("scite-context-lexer")

local patterns         = lexers.patterns
local token            = lexers.token

-- The xml lexer instance plus the helper lexers that get embedded further
-- down (lua processing instructions, comments, cdata and script bodies).
local xmllexer         = lexers.new("xml","scite-context-lexer-xml")
local xmlwhitespace    = xmllexer.whitespace

local xmlcommentlexer  = lexers.load("scite-context-lexer-xml-comment")
local xmlcdatalexer    = lexers.load("scite-context-lexer-xml-cdata")
local xmlscriptlexer   = lexers.load("scite-context-lexer-xml-script")
local lualexer         = lexers.load("scite-context-lexer-lua")

-- Frequently used single-character building blocks.
local space            = patterns.space
local any              = patterns.any

local dquote           = P('"')
local squote           = P("'")
local colon            = P(":")
local semicolon        = P(";")
local equal            = P("=")
local ampersand        = P("&")

-- NameStartChar ::= ":" | [A-Z] | "_" | [a-z]
--                 | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
--                 | [#x370-#x37D] | [#x37F-#x1FFF]
--                 | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
--                 | [#x3001-#xD7FF]
--                 | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
--
-- NameChar	  ::= NameStartChar
--                 | "-" | "." | [0-9] | #xB7
--                 | [#x203F-#x2040]
--                 | [#x0300-#x036F]

-- A relaxed take on the Name production above: any run of ascii letters,
-- digits, "_-." or multibyte utf8 sequences (no distinction between start
-- and follow characters).
local name             = ( -- We are a bit more tolerant here.
                            R("az","AZ","09")
                          + S("_-.")
                          + patterns.utf8two + patterns.utf8three + patterns.utf8four
                         )^1
-- Markup delimiters; in the ordered choices below the longer alternative
-- comes first ("/>" before ">", "]>" before ">") so it is not shadowed.
local openbegin        = P("<")
local openend          = P("</")
local closebegin       = P("/>") + P(">")
local closeend         = P(">")
local opencomment      = P("<!--")
local closecomment     = P("-->")
local openinstruction  = P("<?")
local closeinstruction = P("?>")
local opencdata        = P("<![CDATA[")
local closecdata       = P("]]>")
local opendoctype      = P("<!DOCTYPE") -- could grab the whole doctype
local closedoctype     = P("]>") + P(">")
-- Whole <script ...> open tag and </script> close tag (both cases).
local openscript       = openbegin * (P("script") + P("SCRIPT")) * (1-closeend)^0 * closeend -- begin
local closescript      = openend   * (P("script") + P("SCRIPT"))                  * closeend
local charpattern      = lexers.helpers.charpattern

-- Plain strings: the embed call below wraps them in token(...) itself.
local openlua          = "<?lua"
local closelua         = "?>"

-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
-- <!DOCTYPE Something PUBLIC "... ..." "..." >
-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
-- <!DOCTYPE Something SYSTEM "... ..." >
-- <!DOCTYPE Something [ ... ] >
-- <!DOCTYPE Something >

-- An entity reference: "&" up to (and including) the next ";".
local entity           = ampersand * (1-semicolon)^1 * semicolon

-- Word patterns plus the spell-checking state. validwords/validminimum are
-- mutable: the preamble below resets them per document and fills them via
-- setwordlist when a language directive is seen.
local wordtoken        = patterns.wordtoken
local iwordtoken       = patterns.iwordtoken
local wordpattern      = patterns.wordpattern
local iwordpattern     = patterns.iwordpattern
local invisibles       = patterns.invisibles
local styleofword      = lexers.styleofword
local setwordlist      = lexers.setwordlist
local validwords       = false
local validminimum     = 3

-- <?xml version="1.0" encoding="UTF-8" language="uk" ?>
--
-- <?context-directive editor language us ?>

-- Document preamble hook. The pattern always succeeds (the P(true) branch)
-- but the Cmt handler returns false, so no input is consumed and the first
-- line is subsequently lexed by the normal rules (as a processing
-- instruction). Its purpose is purely the side effect: reset the spell
-- checking state and, when an "<?xml ...?><?context-directive editor
-- language xx ?>" prefix is present, load the word list for that two
-- letter language code.
--
-- Note: the original wrote P("<?xml " + P(true)), which only worked via
-- lpeg's string/pattern __add coercion; this is the equivalent explicit
-- ordered choice.

xmllexer.preamble = Cmt(P("<?xml ") + P(true), function(input,i) -- todo: utf bomb, no longer #
    validwords   = false
    validminimum = 3
    -- (..) captures exactly the two letter language code of the directive
    local language = match(input,"^<%?xml[^>]*%?>%s*<%?context%-directive%s+editor%s+language%s+(..)%s+%?>")
    if language then
        validwords, validminimum = setwordlist(language)
    end
    return false -- so we go back and now handle the line as processing instruction
end)

-- A word run through the spell checker: styleofword picks the style based
-- on the current validwords list (capture + position instead of token()).
local t_word =
    C(iwordpattern) * Cp() / function(s,p) return styleofword(validwords,validminimum,s,p) end  -- a bit of a hack

-- Fallback: any single utf8 character, default styled.
local t_rest =
    token("default", charpattern)

-- A run of plain text: anything that is not markup ("<>&") or space.
local t_text =
    token("default", (charpattern-S("<>&")-space)^1)

-- Whitespace styled with the lexer's whitespace style (rule trigger).
local t_spacing =
    token(xmlwhitespace, space^1)

-- Optional whitespace inside markup, default styled (not the whitespace
-- style, see the note below).
local t_optionalwhitespace =
    token("default", space^1)^0

-- Mandatory whitespace inside markup, default styled.
local t_localspacing =
    token("default", space^1)

-- Because we want a differently colored open and close we need an embedded lexer (whitespace
-- trigger). What is actually needed is that scintilla applies the current whitespace style.
-- Even using different style keys is not robust as they can be shared. I'll fix the main
-- lexer code.

-- Quoted (attribute) values: the quotes get the "quote" style, the content
-- the "string" style. The two variants are only ever used together as an
-- alternation (t_dstring + t_sstring) whose branches are disjoint on the
-- first character, so their relative order is irrelevant. The original had
-- the names swapped (t_sstring matched double quotes); fixed so that
-- s = single quote, d = double quote, without any change in the combined
-- grammar.
local t_sstring =
    token("quote",squote)
  * token("string",(1-squote)^0)        -- different from context
  * token("quote",squote)

local t_dstring =
    token("quote",dquote)
  * token("string",(1-dquote)^0)        -- different from context
  * token("quote",dquote)

-- local t_comment =
--     token("command",opencomment)
--   * token("comment",(1-closecomment)^0) -- different from context
--   * token("command",closecomment)

-- local t_cdata =
--     token("command",opencdata)
--   * token("comment",(1-closecdata)^0)   -- different from context
--   * token("command",closecdata)

-- maybe cdata just text (then we don't need the extra lexer as we only have one comment then)

-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
-- <!DOCTYPE Something PUBLIC "... ..." "..." >
-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
-- <!DOCTYPE Something SYSTEM "... ..." >
-- <!DOCTYPE Something [ ... ] >
-- <!DOCTYPE Something >

-- <!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
-- <!ENTITY xxxx PUBLIC "yyyy" >
-- <!ENTITY xxxx "yyyy" >

-- Either kind of quoted literal inside the doctype.
local t_docstr  = t_dstring + t_sstring

-- <!ENTITY ...> declarations; the three alternatives cover the SYSTEM
-- (with NDATA), PUBLIC and plain internal-entity forms listed above.
local t_docent  = token("command",P("<!ENTITY"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * token("constant",P("NDATA"))
                      * t_optionalwhitespace
                      * token("keyword",name)
                    ) + (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                    ) + (
                        t_docstr
                    )
                  )
                * t_optionalwhitespace
                * token("command",P(">"))

-- <!ELEMENT name ( ... )> declarations: the parenthesized content model is
-- lexed loosely (keywords, commas, and anything up to "," or ")" styled as
-- comment).
local t_docele  = token("command",P("<!ELEMENT"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * token("command",P("("))
                * (
                    t_localspacing
                  + token("constant",P("#CDATA") + P("#PCDATA") + P("ANY"))
                  + token("text",P(","))
                  + token("comment",(1-S(",)"))^1)
                  )^1
                * token("command",P(")"))
                * t_optionalwhitespace
                * token("command",P(">"))

-- The internal subset "[ ... ]": one or more entity/element declarations,
-- or (fallback) everything up to the closing "]" styled as comment.
local t_docset  = token("command",P("["))
                * t_optionalwhitespace
                * ((t_optionalwhitespace * (t_docent + t_docele))^1 + token("comment",(1-P("]"))^0))
                * t_optionalwhitespace
                * token("command",P("]"))

-- The full <!DOCTYPE ...> declaration: optional PUBLIC (two literals) or
-- SYSTEM (one literal) external id, then an optional internal subset.
local t_doctype = token("command",P("<!DOCTYPE"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      ) + (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      )
                  )^-1
                * t_docset^-1
                * t_optionalwhitespace
                * token("command",P(">"))

-- Hand over lexing to the sub lexers between the given open/close tokens:
-- lua in <?lua ... ?>, comments, cdata sections and script element bodies.
lexers.embed(xmllexer, lualexer,        token("command", openlua),     token("command", closelua))
lexers.embed(xmllexer, xmlcommentlexer, token("command", opencomment), token("command", closecomment))
lexers.embed(xmllexer, xmlcdatalexer,   token("command", opencdata),   token("command", closecdata))
lexers.embed(xmllexer, xmlscriptlexer,  token("command", openscript),  token("command", closescript))

-- local t_name =
--     token("plain",name)
--   * (
--         token("default",colon)
--       * token("keyword",name)
--     )
--   + token("keyword",name)

-- Element name with an optional namespace prefix: "prefix:" (including the
-- colon) is styled plain, the local name as keyword.
local t_name = -- more robust
    token("plain",name * colon)^-1
  * token("keyword",name)

-- local t_key =
--     token("plain",name)
--   * (
--         token("default",colon)
--       * token("constant",name)
--     )
--   + token("constant",name)

-- Attribute name, same prefix handling as t_name but the local part is
-- styled as constant.
local t_key =
    token("plain",name * colon)^-1
  * token("constant",name)

-- Zero or more key="value" / key='value' attribute pairs with optional
-- whitespace around the "=" and between pairs.
local t_attributes = (
    t_optionalwhitespace
  * t_key
  * t_optionalwhitespace
  * token("plain",equal)
  * t_optionalwhitespace
  * (t_dstring + t_sstring)
  * t_optionalwhitespace
)^0

-- An open (or self-closing) tag: "<" name attributes ">" or "/>"; when the
-- inside does not parse, everything up to the closer is styled as error.
local t_open =
    token("keyword",openbegin)
  * (
        t_name
      * t_optionalwhitespace
      * t_attributes
      * token("keyword",closebegin)
      +
      token("error",(1-closebegin)^1)
    )

-- A close tag: "</" name ">"; same error fallback as t_open.
local t_close =
    token("keyword",openend)
  * (
        t_name
      * t_optionalwhitespace
      * token("keyword",closeend)
      +
      token("error",(1-closeend)^1)
    )

-- Entity references ("&...;"), styled as constant.
local t_entity =
    token("constant",entity)

-- Processing instructions: the "<?xml ...?>" declaration gets full
-- attribute coloring (first alternative); any other "<?name ...?>" gets a
-- default styled body.
local t_instruction =
    token("command",openinstruction * P("xml"))
  * t_optionalwhitespace
  * t_attributes
  * t_optionalwhitespace
  * token("command",closeinstruction)
  + token("command",openinstruction * name)
  * token("default",(1-closeinstruction)^1)
  * token("command",closeinstruction)

-- Runs of invisible characters (see patterns.invisibles).
local t_invisible =
    token("invisible",invisibles^1)

-- The rule list; presumably tried in order by the framework, so "close"
-- ("</") precedes "open" ("<") and "rest" is the catch-all at the end.
xmllexer.rules = {
    { "whitespace",  t_spacing     },
    { "word",        t_word        },
 -- { "text",        t_text        },
 -- { "comment",     t_comment     },
 -- { "cdata",       t_cdata       },
    { "doctype",     t_doctype     },
    { "instruction", t_instruction },
    { "close",       t_close       },
    { "open",        t_open        },
    { "entity",      t_entity      },
    { "invisible",   t_invisible   },
    { "rest",        t_rest        },
}

-- Folding: maps matched text (per token class) to a fold level delta;
-- presumably +1 opens and -1 closes a fold level (handled by the
-- framework).
xmllexer.folding = {
    ["</"]   = { ["keyword"] = -1 },
    ["/>"]   = { ["keyword"] = -1 },
    ["<"]    = { ["keyword"] =  1 },
    ["<?"]   = { ["command"] =  1 },
    ["<!--"] = { ["command"] =  1 },
    ["?>"]   = { ["command"] = -1 },
    ["-->"]  = { ["command"] = -1 },
    [">"]    = { ["command"] = -1 },
}

return xmllexer
