local info = {
    version   = 1.002,
    comment   = "scintilla lpeg lexer for xml",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}

-- adapted from the regular context pretty printer code (after all, lexing
-- boils down to much of the same and there are only so many ways to do
-- things). Simplified a bit as we have a different nesting model.

-- todo: parse entities in attributes

local global, string, table, lpeg = _G, string, table, lpeg
local P, R, S, C, Cmt, Cp = lpeg.P, lpeg.R, lpeg.S, lpeg.C, lpeg.Cmt, lpeg.Cp
local type = type
local match, find = string.match, string.find

local lexers           = require("scite-context-lexer")

local patterns         = lexers.patterns
local token            = lexers.token

local xmllexer         = lexers.new("xml","scite-context-lexer-xml")
local xmlwhitespace    = xmllexer.whitespace

local xmlcommentlexer  = lexers.load("scite-context-lexer-xml-comment")
local xmlcdatalexer    = lexers.load("scite-context-lexer-xml-cdata")
local xmlscriptlexer   = lexers.load("scite-context-lexer-xml-script")
local lualexer         = lexers.load("scite-context-lexer-lua")


local space            = patterns.space
local any              = patterns.any

local dquote           = P('"')
local squote           = P("'")
local colon            = P(":")
local semicolon        = P(";")
local equal            = P("=")
local ampersand        = P("&")

-- NameStartChar ::= ":" | [A-Z] | "_" | [a-z]
--                 | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
--                 | [#x370-#x37D] | [#x37F-#x1FFF]
--                 | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
--                 | [#x3001-#xD7FF]
--                 | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
--
-- NameChar      ::= NameStartChar
--                 | "-" | "." | [0-9] | #xB7
--                 | [#x203F-#x2040]
--                 | [#x0300-#x036F]

local name             = ( -- We are a bit more tolerant here.
                            R("az","AZ","09")
                          + S("_-.")
                          + patterns.utf8two + patterns.utf8three + patterns.utf8four
                         )^1
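
-- Note that the colon is not part of this (tolerant) name: prefixes, as in
-- "foo:bar", are handled separately in t_name and t_key below so that they
-- can get their own style.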
local openbegin        = P("<")
local openend          = P("</")
local closebegin       = P("/>") + P(">")
local closeend         = P(">")
local opencomment      = P("<!--")
local closecomment     = P("-->")
local openinstruction  = P("<?")
local closeinstruction = P("?>")
local opencdata        = P("<![CDATA[")
local closecdata       = P("]]>")
local opendoctype      = P("<!DOCTYPE") -- could grab the whole doctype
local closedoctype     = P("]>") + P(">")
local openscript       = openbegin * (P("script") + P("SCRIPT")) * (1-closeend)^0 * closeend -- begin
local closescript      = openend   * (P("script") + P("SCRIPT"))                  * closeend

local openlua          = "<?lua"
local closelua         = "?>"

-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
-- <!DOCTYPE Something PUBLIC "... ..." "..." >
-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
-- <!DOCTYPE Something SYSTEM "... ..." >
-- <!DOCTYPE Something [ ... ] >
-- <!DOCTYPE Something >

local entity           = ampersand * (1-semicolon)^1 * semicolon

local utfchar          = lexers.helpers.utfchar
local wordtoken        = patterns.wordtoken
local iwordtoken       = patterns.iwordtoken
local wordpattern      = patterns.wordpattern
local iwordpattern     = patterns.iwordpattern
local invisibles       = patterns.invisibles
local styleofword      = lexers.styleofword
local setwordlist      = lexers.setwordlist
local validwords       = false
local validminimum     = 3

-- <?xml version="1.0" encoding="UTF-8" language="uk" ?>
--
-- <?context-directive editor language us ?>

xmllexer.preamble = Cmt(P("<?xml " + P(true)), function(input,i) -- todo: utf bom, no longer #
    validwords   = false
    validminimum = 3
    local language = match(input,"^<%?xml[^>]*%?>%s*<%?context%-directive%s+editor%s+language%s+(..)%s+%?>")
    if language then
        validwords, validminimum = setwordlist(language)
    end
    return false -- so we go back and now handle the line as a processing instruction
end)
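
-- A note on the above (assuming the usual behaviour of the shared lexer code):
-- the preamble is only inspected at the start of a document; when a
-- context-directive sets an editor language, setwordlist provides the word
-- list that t_word below uses for spell checking styling.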

local t_word =
    C(iwordpattern) * Cp() / function(s,p) return styleofword(validwords,validminimum,s,p) end  -- a bit of a hack

local t_rest =
    token("default", any)

local t_text =
    token("default", (1-S("<>&")-space)^1)
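
-- (t_text is currently not used in the rules below: running text is picked up
-- by t_word and whatever remains by t_rest.)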

local t_spacing =
    token(xmlwhitespace, space^1)

local t_optionalwhitespace =
    token("default", space^1)^0

local t_localspacing =
    token("default", space^1)

-- Because we want a differently colored open and close we need an embedded lexer (whitespace
-- trigger). What is actually needed is that scintilla applies the current whitespace style.
-- Even using different style keys is not robust as they can be shared. I'll fix the main
-- lexer code.

local t_sstring =
    token("quote",dquote)
  * token("string",(1-dquote)^0)        -- different from context
  * token("quote",dquote)

local t_dstring =
    token("quote",squote)
  * token("string",(1-squote)^0)        -- different from context
  * token("quote",squote)
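
-- A minimal sketch (not hooked into t_attributes) of how the "parse entities
-- in attributes" todo at the top of this file could be approached; the name
-- t_estring is just for illustration and only covers the double quoted case:

local t_estring =
    token("quote",dquote)
  * (
        token("constant",entity)                  -- a proper &xxx; entity
      + token("error",ampersand)                  -- a stray ampersand
      + token("string",(1-dquote-ampersand)^1)    -- everything else up to the closing quote
    )^0
  * token("quote",dquote)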

-- local t_comment =
--     token("command",opencomment)
--   * token("comment",(1-closecomment)^0) -- different from context
--   * token("command",closecomment)

-- local t_cdata =
--     token("command",opencdata)
--   * token("comment",(1-closecdata)^0)   -- different from context
--   * token("command",closecdata)

-- maybe treat cdata as just text (then we don't need the extra cdata lexer and only the comment one remains)

-- <!DOCTYPE Something PUBLIC "... ..." "..." [ ... ] >
-- <!DOCTYPE Something PUBLIC "... ..." "..." >
-- <!DOCTYPE Something SYSTEM "... ..." [ ... ] >
-- <!DOCTYPE Something SYSTEM "... ..." >
-- <!DOCTYPE Something [ ... ] >
-- <!DOCTYPE Something >

-- <!ENTITY xxxx SYSTEM "yyyy" NDATA zzzz>
-- <!ENTITY xxxx PUBLIC "yyyy" >
-- <!ENTITY xxxx "yyyy" >

local t_docstr  = t_dstring + t_sstring

local t_docent  = token("command",P("<!ENTITY"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * token("constant",P("NDATA"))
                      * t_optionalwhitespace
                      * token("keyword",name)
                    ) + (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                    ) + (
                        t_docstr
                    )
                  )
                * t_optionalwhitespace
                * token("command",P(">"))

local t_docele  = token("command",P("<!ELEMENT"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * token("command",P("("))
                * (
                    t_localspacing
                  + token("constant",P("#CDATA") + P("#PCDATA") + P("ANY"))
                  + token("text",P(","))
                  + token("comment",(1-S(",)"))^1)
                  )^1
                * token("command",P(")"))
                * t_optionalwhitespace
                * token("command",P(">"))

local t_docset  = token("command",P("["))
                * t_optionalwhitespace
                * ((t_optionalwhitespace * (t_docent + t_docele))^1 + token("comment",(1-P("]"))^0))
                * t_optionalwhitespace
                * token("command",P("]"))

local t_doctype = token("command",P("<!DOCTYPE"))
                * t_optionalwhitespace
                * token("keyword",name)
                * t_optionalwhitespace
                * (
                    (
                        token("constant",P("PUBLIC"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      ) + (
                        token("constant",P("SYSTEM"))
                      * t_optionalwhitespace
                      * t_docstr
                      * t_optionalwhitespace
                      )
                  )^-1
                * t_docset^-1
                * t_optionalwhitespace
                * token("command",P(">"))
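
-- This handles for instance: <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">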

lexers.embed(xmllexer, lualexer,        token("command", openlua),     token("command", closelua))
lexers.embed(xmllexer, xmlcommentlexer, token("command", opencomment), token("command", closecomment))
lexers.embed(xmllexer, xmlcdatalexer,   token("command", opencdata),   token("command", closecdata))
lexers.embed(xmllexer, xmlscriptlexer,  token("command", openscript),  token("command", closescript))
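
-- Between each of these open and close patterns the embedded lexer takes
-- over, so lua code in <?lua ... ?>, comments, cdata sections and script
-- elements get their own colorization.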

-- local t_name =
--     token("plain",name)
--   * (
--         token("default",colon)
--       * token("keyword",name)
--     )
--   + token("keyword",name)

local t_name = -- more robust
    token("plain",name * colon)^-1
  * token("keyword",name)

-- local t_key =
--     token("plain",name)
--   * (
--         token("default",colon)
--       * token("constant",name)
--     )
--   + token("constant",name)

local t_key =
    token("plain",name * colon)^-1
  * token("constant",name)

local t_attributes = (
    t_optionalwhitespace
  * t_key
  * t_optionalwhitespace
  * token("plain",equal)
  * t_optionalwhitespace
  * (t_dstring + t_sstring)
  * t_optionalwhitespace
)^0
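
-- So in for instance <tag foo:bar="value"> the prefix "foo:" becomes plain,
-- "bar" constant, the equal sign plain, the quotes quote and "value" string.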

local t_open =
    token("keyword",openbegin)
  * (
        t_name
      * t_optionalwhitespace
      * t_attributes
      * token("keyword",closebegin)
      +
      token("error",(1-closebegin)^1)
    )

local t_close =
    token("keyword",openend)
  * (
        t_name
      * t_optionalwhitespace
      * token("keyword",closeend)
      +
      token("error",(1-closeend)^1)
    )

local t_entity =
    token("constant",entity)

local t_instruction =
    token("command",openinstruction * P("xml"))
  * t_optionalwhitespace
  * t_attributes
  * t_optionalwhitespace
  * token("command",closeinstruction)
  + token("command",openinstruction * name)
  * token("default",(1-closeinstruction)^1)
  * token("command",closeinstruction)
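
-- The xml declaration gets full attribute coloring; the body of any other
-- processing instruction is shown in the default style.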

local t_invisible =
    token("invisible",invisibles^1)

xmllexer.rules = {
    { "whitespace",  t_spacing     },
    { "word",        t_word        },
 -- { "text",        t_text        },
 -- { "comment",     t_comment     },
 -- { "cdata",       t_cdata       },
    { "doctype",     t_doctype     },
    { "instruction", t_instruction },
    { "close",       t_close       },
    { "open",        t_open        },
    { "entity",      t_entity      },
    { "invisible",   t_invisible   },
    { "rest",        t_rest        },
}
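
-- The rules are tried in this order, so the more specific "doctype",
-- "instruction" and "close" entries have to come before the generic "open"
-- one (they all start out with a "<").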

xmllexer.folding = {
    ["</"]   = { ["keyword"] = -1 },
    ["/>"]   = { ["keyword"] = -1 },
    ["<"]    = { ["keyword"] =  1 },
    ["<?"]   = { ["command"] =  1 },
    ["<!--"] = { ["command"] =  1 },
    ["?>"]   = { ["command"] = -1 },
    ["-->"]  = { ["command"] = -1 },
    [">"]    = { ["command"] = -1 },
}

return xmllexer