l-url.lua /size: 14 Kb    last modification: 2021-10-28 13:50
-- Register this file's metadata in the global `modules` table; the table is
-- created first when it does not exist yet.
if not modules then
    modules = { }
end

modules["l-url"] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
9local char, format, byte = string.char, string.format, string.byte
10local concat = table.concat
11local tonumber, type, next = tonumber, type, next
12local P, C, R, S, Cs, Cc, Ct, Cf, Cg, V = lpeg.P, lpeg.C, lpeg.R, lpeg.S, lpeg.Cs, lpeg.Cc, lpeg.Ct, lpeg.Cf, lpeg.Cg, lpeg.V
13local lpegmatch, lpegpatterns, replacer = lpeg.match, lpeg.patterns, lpeg.replacer
14local sortedhash = table.sortedhash
15
16-- from wikipedia:
17--
18--   foo://username:password@example.com:8042/over/there/index.dtb?type=animal;name=narwhal#nose
19--   \_/   \_______________/ \_________/ \__/            \___/ \_/ \______________________/ \__/
20--    |           |               |       |                |    |            |                |
21--    |       userinfo         hostname  port              |    |          query          fragment
22--    |    \________________________________/\_____________|____|/
23-- scheme                  |                          |    |    |
24--    |                authority                    path   |    |
25--    |                                                    |    |
26--    |            path                       interpretable as filename
27--    |   ___________|____________                              |
28--   / \ /                        \                             |
29--   urn:example:animal:ferret:nose               interpretable as extension
30--
31-- also nice: http://url.spec.whatwg.org/ (maybe some day ...)
32
-- The url namespace is a global table: an existing one is reused, and a
-- local alias is kept for the rest of this file.
url       = url or { }
local url = url

-- Two lookup tables that fill themselves on first access: a missing key is
-- computed by the __index handler, stored in the table, and returned.

-- hex digits (as a string) -> the character with that byte value
local unescapes = setmetatable({ }, {
    __index = function(cache,hex)
        local character = char(tonumber(hex,16))
        cache[hex] = character
        return character
    end,
})

-- string -> "%XX", where XX is the uppercase hex value of its first byte
local escapes = setmetatable({ }, {
    __index = function(cache,character)
        local code = format("%%%02X",byte(character))
        cache[character] = code
        return code
    end,
})
50
51-- okay:
52
-- Single-character matchers and small helpers shared by the patterns that
-- follow.
local colon, qmark, hash = P(":"), P("?"), P("#")
local slash, atsign      = P("/"), P("@")
local percent, plus      = P("%"), P("+")
local endofstring        = P(-1)  -- matches only when no input is left
local hexdigit           = R("09","AF","af")
local nothing            = Cc("") -- consumes nothing, produces ""
local okay               = R("09","AZ","az") + S("-_.,:=+*~!'()@&$")

-- "%XY": the two hex digits are captured and looked up in unescapes
local escapedchar   = (percent * C(hexdigit * hexdigit)) / unescapes
-- any single character, looked up in escapes (gives its "%XX" form)
local unescapedchar = P(1) / escapes
-- a plus becomes a space, otherwise a %XY sequence is resolved
local escaped       = (plus / " ") + escapedchar -- so no loc://foo++.tex
local noslash       = P("/") / ""  -- a slash that is replaced by nothing
local plustospace   = P("+") / " " -- a plus that is replaced by a space
70
-- decoder: "+" becomes a space, "%XY" becomes the matching character, CRLF
--          becomes LF, and everything else is copied
-- encoder: runs of alphanumerics and of - . / _ are copied, a space becomes
--          "+", LF becomes CRLF, and any other character is replaced through
--          the escapes table
local decoder, encoder

do
    local crlftolf    = P("\r\n") / "\n"
    local lftocrlf    = P("\n") / "\r\n"
    local spacetoplus = P(" ") / "+"
    local alnumrun    = R("09","AZ","az")^1
    local saferun     = S("-./_")^1

    decoder = Cs((plustospace + escapedchar + crlftolf + P(1))^0)
    encoder = Cs((alnumrun + saferun + spacetoplus + lftocrlf + unescapedchar)^0)
end

-- make both available to other code via the shared pattern table
lpegpatterns.urldecoder = decoder
lpegpatterns.urlencoder = encoder
87
-- Decode a string with the decoder pattern ("+" to space, "%XY" to
-- character, CRLF to LF). A nil or false argument is returned unchanged.
function url.decode(str)
    return str and lpegmatch(decoder,str) or str
end

-- Encode a string with the encoder pattern (space to "+", LF to CRLF, other
-- unsafe characters to "%XX"). A nil or false argument is returned unchanged.
function url.encode(str)
    return str and lpegmatch(encoder,str) or str
end

do
    -- The file-level local named unescaper is declared only further down in
    -- this file, so it is not in scope at this point: referring to it here
    -- would resolve to a (nil) global and make every call fail. The function
    -- therefore gets its own pattern, built from the same escapedchar rule.
    local unescapepattern = Cs((escapedchar + P(1))^0)

    -- Resolve "%XY" sequences only; a "+" is left alone (unlike url.decode).
    -- A nil or false argument is returned unchanged.
    function url.unescape(str)
        return str and lpegmatch(unescapepattern,str) or str
    end
end
91
92-- we assume schemes with more than 1 character (in order to avoid problems with windows disks)
93-- we also assume that when we have a scheme, we also have an authority
94--
-- maybe we should already split the query (better for unescaping, as = and & can be part of a value)
96
-- The text of each url component. Every Cs resolves "+" and "%XY" through
-- `escaped`, except the query, which is captured as-is.
local schemestr    = Cs((escaped + (1 - colon - slash - qmark - hash))^2) -- at least two characters
local authoritystr = Cs((escaped + (1 - slash - qmark - hash))^0)
local pathstr      = Cs((escaped + (1 - qmark - hash))^0)
----- querystr     = Cs((escaped + (1 - hash))^0)
local querystr     = Cs((1 - hash)^0)
local fragmentstr  = Cs((escaped + (1 - endofstring))^0)

-- A component is either present (introduced by its delimiter) or yields "".
local function optional(p)
    return p + nothing
end

local scheme    = optional(schemestr * colon)
local authority = optional(slash * slash * authoritystr)
local path      = optional(slash * pathstr)  -- the leading slash is not part of the capture
local query     = optional(qmark * querystr)
local fragment  = optional(hash * fragmentstr)

-- validurl produces five captures (scheme, authority, path, query, fragment);
-- parser collects them in one table.
local validurl  = scheme * authority * path * query * fragment
local parser    = Ct(validurl)

lpegpatterns.url         = validurl
lpegpatterns.urlsplitter = parser
115
-- escaper: runs of alphanumerics and of - . / _ : are copied, a space becomes
-- "%20" (in an lpeg replacement string "%%" stands for one "%"), and any
-- other character is replaced through the escapes table.
local escaper = Cs((
    R("09","AZ","az")^1
  + P(" ") / "%%20"        -- space happens most
  + S("-./_:")^1
  + P(1) / escapes
)^0)

-- unescaper: resolves "%XY" sequences and copies everything else.
local unescaper = Cs((escapedchar + 1)^0)

-- getcleaner: "+++" becomes "%2B" and a remaining "+" becomes "%20"; it needs
-- at least one character, so it does not match an empty string.
local getcleaner = Cs((
    P("+++") / "%%2B"
  + P("+")   / "%%20"
  + P(1)
)^1)

lpegpatterns.urlunescaped  = escapedchar
lpegpatterns.urlescaper    = escaper
lpegpatterns.urlunescaper  = unescaper
lpegpatterns.urlgetcleaner = getcleaner
124
-- Rewrite the plus signs of a string with the getcleaner pattern: "+++"
-- becomes "%2B" and any other "+" becomes "%20".
--
-- str : the string to clean
--
-- Returns the cleaned string, or nil for an empty string (the pattern needs
-- at least one character). A nil or false argument is handed back unchanged
-- instead of raising an error, in line with url.decode, url.encode and
-- url.unescape.
function url.unescapeget(str)
    if not str then
        return str
    end
    return lpegmatch(getcleaner,str)
end
128
129-- todo: reconsider Ct as we can as well have five return values (saves a table)
130-- so we can have two parsers, one with and one without
131
-- Split a url string into { scheme, authority, path, query, fragment } using
-- the parser pattern. Anything that is not a string is returned untouched,
-- and so is a string that the parser does not match.
local function split(str)
    if type(str) ~= "string" then
        return str
    end
    return lpegmatch(parser,str) or str
end
135
local isscheme = schemestr * colon * slash * slash -- this test also assumes authority

-- Check whether a string starts with "<scheme>://".
--
-- Returns the scheme (the capture of schemestr) when it does, and false
-- otherwise, including for a nil or false argument.
local function hasscheme(str)
    if not str then
        return false
    end
    local found = lpegmatch(isscheme,str)
    if found == "" then
        return false
    end
    return found or false
end
146
147--~ print(hasscheme("home:"))
148--~ print(hasscheme("home://"))
149
150-- todo: cache them
151
-- Patterns that recognize qualified and root based names, plus two
-- character swappers. None of them is referenced in the rest of the
-- visible file.
local rootletter = R("az","AZ") + S("_-+")
local separator  = P("://")

local qualified  = P(".")^0     * P("/")     -- optional dots followed by a slash
                 + rootletter   * P(":")     -- one root letter and a colon
                 + rootletter^1 * separator  -- root letters followed by ://
                 + rootletter^1 * P("/")     -- root letters followed by a slash

local rootbased  = P("/") + rootletter * P(":")

-- lpeg.replacer is not defined in this file; presumably it builds a pattern
-- that replaces the first string by the second -- confirm against its source
local barswapper       = replacer("|",":")
local backslashswapper = replacer("\\","/")
164
165-- queries:
166
local equal = P("=")
local amp   = P("&")

-- a key runs up to the next "=" and a value up to the next "&" (or the end);
-- in both "+" becomes a space and "%XY" sequences are resolved
local key   = Cs(((plustospace + escapedchar + 1) - equal)^0)
local value = Cs(((plustospace + escapedchar + 1) - amp - endofstring)^0)

-- splitquery folds "key=value" pairs separated by "&" into one table: the
-- fold starts with an empty table and rawset stores each key/value group
local querypairs = P { "sequence",
    sequence = V("pair") * (amp * V("pair"))^0,
    pair     = Cg(key * equal * value),
}

local splitquery = Cf(Ct("") * querypairs, rawset)
176
177-- hasher
178
local userpart   = (1 - atsign - colon)^1
local serverpart = (1 - colon)^1

-- splitauthority produces four values: username, password, host and port.
-- Parts that are absent come out as nil; the port is passed through tonumber.
local splitauthority

do
    -- "user:password" or just "user" (password nil)
    local credentials = Cs(userpart) * colon * Cs(userpart)
                      + Cs(userpart) * Cc(nil)
    -- credentials must be followed by "@", otherwise both values are nil
    local userinfo    = credentials * atsign
                      + Cc(nil) * Cc(nil)
    local hostname    = Cs(serverpart)
    local portnumber  = colon * (serverpart / tonumber)
                      + Cc(nil)

    splitauthority = userinfo * hostname * portnumber
end
183
-- Analyze a url (or plain filename) and return its components in a table.
--
-- str : the string to analyze
--
-- Returns:
--   * for nil, false or "": { scheme = "invalid", original = str }
--   * when neither a scheme nor a query is found: a "file" record with
--     noscheme = true and both path and filename set to str
--   * otherwise: scheme, authority, path, query (unescaped), queries (the
--     query split into a key/value table, nil when it cannot be split),
--     fragment, original, noscheme = false, filename, host and port
--
-- not yet ok (/test?test)
local function hashed(str)
    if not str or str == "" then
        return { scheme = "invalid", original = str }
    end

    local parts = split(str)
    local scheme, rawquery = "", ""
    if parts then
        scheme, rawquery = parts[1], parts[4]
    end

    if scheme == "" and rawquery == "" then
        -- no scheme and no query: treat the input as a plain filename
        return {
            scheme    = "file",
            authority = "",
            path      = str,
            query     = "",
            fragment  = "",
            original  = str,
            noscheme  = true,
            filename  = str,
        }
    end

    local authority, path = parts[2], parts[3]
    local filename, host, port

    -- filename: not always a filename but handy anyway
    if authority == "" then
        filename = path
    else
        -- these can be invalid; username and password are parsed as well
        -- but are not put in the result
        local _, _, h, p = lpegmatch(splitauthority,authority)
        host, port = h, p
        if path == "" then
            filename = ""
        else
            -- this one can be invalid
            filename = authority .. "/" .. path
        end
    end

    return {
        scheme    = scheme,
        authority = authority,
        path      = path,
        query     = lpegmatch(unescaper,rawquery),  -- unescaped, but possible conflict with & and =
        queries   = lpegmatch(splitquery,rawquery), -- split first and then unescaped
        fragment  = parts[5],
        original  = str,
        noscheme  = false,
        filename  = filename,
        --
        host      = host,
        port      = port,
    }
end
251
252-- inspect(hashed())
253-- inspect(hashed(""))
254-- inspect(hashed("template:///test"))
255-- inspect(hashed("template:///test++.whatever"))
256-- inspect(hashed("template:///test%2B%2B.whatever"))
257-- inspect(hashed("template:///test%x.whatever"))
258-- inspect(hashed("tem%2Bplate:///test%x.whatever"))
259
260-- Here we assume:
261--
262-- files: ///  = relative
263-- files: //// = absolute (!)
264
265--~ table.print(hashed("file://c:/opt/tex/texmf-local")) -- c:/opt/tex/texmf-local
266--~ table.print(hashed("file://opt/tex/texmf-local"   )) -- opt/tex/texmf-local
267--~ table.print(hashed("file:///opt/tex/texmf-local"  )) -- opt/tex/texmf-local
268--~ table.print(hashed("file:////opt/tex/texmf-local" )) -- /opt/tex/texmf-local
269--~ table.print(hashed("file:///./opt/tex/texmf-local" )) -- ./opt/tex/texmf-local
270
271--~ table.print(hashed("c:/opt/tex/texmf-local"       )) -- c:/opt/tex/texmf-local
272--~ table.print(hashed("opt/tex/texmf-local"          )) -- opt/tex/texmf-local
273--~ table.print(hashed("/opt/tex/texmf-local"         )) -- /opt/tex/texmf-local
274
-- export the local helpers through the url namespace
url.split, url.hasscheme, url.hashed = split, hasscheme, hashed
278
-- Prefix a string with "<scheme>:///" (no authority) unless it already
-- starts with a scheme according to hasscheme.
--
-- str    : the string to qualify
-- scheme : the scheme to use; "file" is used when it is nil or false
function url.addscheme(str,scheme) -- no authority
    if hasscheme(str) then
        return str
    end
    return (scheme or "file") .. ":///" .. str
end
288
-- Build a url string from a table of components (as produced by hashed).
--
-- hash : table with optional fields scheme, authority, path, queries
--        (a key/value table) and fragment
--
-- Components that are absent or empty are skipped. Each component is passed
-- through the escaper pattern, and the queries are added in the order given
-- by sortedhash. NOTE(review): whether keys and values arrive already
-- escaped is an open question in the original comments.
function url.construct(hash) -- todo: we need to escape !
    local scheme, authority, path = hash.scheme, hash.authority, hash.path
    local queries, fragment       = hash.queries, hash.fragment

    local pieces, n = { }, 0
    local function add(piece)
        n = n + 1
        pieces[n] = piece
    end

    if scheme and scheme ~= "" then
        add(lpegmatch(escaper,scheme))
        add("://")
    end
    if authority and authority ~= "" then
        add(lpegmatch(escaper,authority))
    end
    if path and path ~= "" then
        add("/")
        add(lpegmatch(escaper,path))
    end
    if queries then
        local prefix = "?" -- the first pair is introduced by "?", later ones by "&"
        for k, v in sortedhash(queries) do
            add(prefix)
            add(lpegmatch(escaper,k)) -- is this escaped
            add("=")
            add(lpegmatch(escaper,v)) -- is this escaped
            prefix = "&"
        end
    end
    if fragment and fragment ~= "" then
        add("#")
        add(lpegmatch(escaper,fragment))
    end

    return concat(pieces)
end
323
-- Matches a path that starts with a drive: an optional leading slash (which
-- is dropped), one letter, ":" or "|" (the bar becomes a colon), a slash and
-- then the rest of the string.
local drivepattern = Cs(slash^-1/"" * R("az","AZ") * ((S(":|")/":") + P(":")) * slash * P(1)^0)

-- Turn a file url into a filename.
--
-- When hashed reports the scheme "file" and the path matches the drive
-- pattern, the cleaned up path is returned; in all other cases the argument
-- itself is returned.
function url.filename(filename)
    local spec = hashed(filename)
    if spec.scheme == "file" then
        local path = spec.path
        if path then
            local cleaned = lpegmatch(drivepattern,path)
            if cleaned then
                return cleaned
            end
        end
    end
    return filename
end
331
332-- print(url.filename("/c|/test"))
333-- print(url.filename("/c/test"))
334-- print(url.filename("file:///t:/sources/cow.svg"))
335
-- Escape a string with the escaper pattern: a space becomes "%20",
-- alphanumerics and - . / _ : are kept, and any other character is replaced
-- through the escapes table.
local function escapestring(str)
    local result = lpegmatch(escaper,str)
    return result
end

-- the same function is available as url.escape
url.escape = escapestring
341
-- Split a query string ("a=b&c=d") into a key/value table.
--
-- Returns "" when the string cannot be split by splitquery; an argument
-- that is not a string is returned as it is.
function url.query(str)
    if type(str) ~= "string" then
        return str
    end
    return lpegmatch(splitquery,str) or ""
end
349
-- Turn data into a query string.
--
-- data : a string or a key/value table
--
-- Returns:
--   * for a non-empty string: the escaped string (beware of double escaping)
--   * for a non-empty table: "key=value" pairs joined by "&", with the
--     values escaped and the keys used as they are (the order follows next
--     and is therefore not defined)
--   * nil (or nothing) in all other cases; nil is a signal that there is no
--     query
--
-- The string branch used to refer to the undefined names `str` and `escape`,
-- which made every call with a string fail; it now tests `data` itself and
-- calls escapestring, as the table branch does. The `#str` test is read as
-- "the string is not empty".
function url.toquery(data)
    local td = type(data)
    if td == "string" then
        if data ~= "" then
            return escapestring(data)
        end
        return nil
    elseif td == "table" then
        if next(data) then
            local t, n = { }, 0
            for k, v in next, data do
                n = n + 1
                t[n] = format("%s=%s",k,escapestring(v))
            end
            return concat(t,"&")
        end
    end
    -- anything else (or empty input) falls through: no query
end
366
367-- /test/ | /test | test/ | test => test
368
-- Drops all leading slashes and stops copying right before a slash that is
-- the very last character of the string.
local pattern = Cs(noslash^0 * (1 - noslash * P(-1))^0)

-- Strip leading slashes and a final slash from a path:
--
--   /test/ | /test | test/ | test => test
--
-- A nil, false or empty path gives "".
function url.barepath(path)
    if path and path ~= "" then
        return lpegmatch(pattern,path)
    end
    return ""
end
378
379-- print(url.barepath("/test"),url.barepath("test/"),url.barepath("/test/"),url.barepath("test"))
380-- print(url.barepath("/x/yz"),url.barepath("x/yz/"),url.barepath("/x/yz/"),url.barepath("x/yz"))
381
382-- print(url.filename("file:///c:/oeps.txt"))
383-- print(url.filename("c:/oeps.txt"))
384-- print(url.filename("file:///oeps.txt"))
385-- print(url.filename("file:///etc/test.txt"))
386-- print(url.filename("/oeps.txt"))
387
388-- from the spec on the web (sort of):
389
390-- local function test(str)
391--     local t = url.hashed(str)
392--     t.constructed = url.construct(t)
393--     print(table.serialize(t))
394-- end
395
396-- inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45"))
397-- inspect(url.hashed("http://www.pragma-ade.com/test%20test?test=test%20test&x=123%3d45"))
398
399-- test("sys:///./colo-rgb")
400
401-- test("/data/site/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733/figuur-cow.jpg")
402-- test("file:///M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
403-- test("M:/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
404-- test("file:///q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
405-- test("/q2p/develop/output/q2p-develop/resources/ecaboperception4_res/topicresources/58313733")
406
407-- test("file:///cow%20with%20spaces")
408-- test("file:///cow%20with%20spaces.pdf")
409-- test("cow%20with%20spaces.pdf")
410-- test("some%20file")
411-- test("/etc/passwords")
412-- test("http://www.myself.com/some%20words.html")
413-- test("file:///c:/oeps.txt")
414-- test("file:///c|/oeps.txt")
415-- test("file:///etc/oeps.txt")
416-- test("file://./etc/oeps.txt")
417-- test("file:////etc/oeps.txt")
418-- test("ftp://ftp.is.co.za/rfc/rfc1808.txt")
419-- test("http://www.ietf.org/rfc/rfc2396.txt")
420-- test("ldap://[2001:db8::7]/c=GB?objectClass?one#what")
421-- test("mailto:John.Doe@example.com")
422-- test("news:comp.infosystems.www.servers.unix")
423-- test("tel:+1-816-555-1212")
424-- test("telnet://192.0.2.16:80/")
425-- test("urn:oasis:names:specification:docbook:dtd:xml:4.1.2")
426-- test("http://www.pragma-ade.com/spaced%20name")
427
428-- test("zip:///oeps/oeps.zip#bla/bla.tex")
429-- test("zip:///oeps/oeps.zip?bla/bla.tex")
430
431-- table.print(url.hashed("/test?test"))
432