util-str.lua /size: 45 Kb    last modification: 2023-12-21 09:44
-- Register this file in the global module table, creating that table on demand.
if not modules then
    modules = { }
end

modules['util-str'] = {
    version   = 1.001,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files",
}
8
-- Make sure the shared namespaces exist before anything gets added to them.
if not utilities then
    utilities = { }
end
if not utilities.strings then
    utilities.strings = { }
end

local strings = utilities.strings
12
13local format, gsub, rep, sub, find, char = string.format, string.gsub, string.rep, string.sub, string.find, string.char
14local load, dump = load, string.dump
15local tonumber, type, tostring, next, setmetatable = tonumber, type, tostring, next, setmetatable
16local unpack, concat = table.unpack, table.concat
17local P, V, C, S, R, Ct, Cs, Cp, Carg, Cc = lpeg.P, lpeg.V, lpeg.C, lpeg.S, lpeg.R, lpeg.Ct, lpeg.Cs, lpeg.Cp, lpeg.Carg, lpeg.Cc
18local patterns, lpegmatch = lpeg.patterns, lpeg.match
19local tsplitat = lpeg.tsplitat
20local utfchar, utfbyte, utflen = utf.char, utf.byte, utf.len
21
22----- loadstripped = utilities.lua.loadstripped
23----- setmetatableindex = table.setmetatableindex
24
-- Compile str, dump it with the strip flag set and load the dumped chunk again.
-- When shortcuts is given it is passed as fourth argument (the environment) to load.
local function loadstripped(str,shortcuts)
    local bytecode = dump(load(str),true)
    if not shortcuts then
        return load(bytecode)
    end
    return load(bytecode,nil,nil,shortcuts)
end
32
33-- todo: make a special namespace for the formatter
34
number = number or { } -- temp hack for luatex-fonts
36
37local stripzero   = patterns.stripzero
38local stripzeros  = patterns.stripzeros
39local newline     = patterns.newline
40local endofstring = patterns.endofstring
41local anything    = patterns.anything
42local whitespace  = patterns.whitespace
43local space       = patterns.space
44local spacer      = patterns.spacer
45local spaceortab  = patterns.spaceortab
46local digit       = patterns.digit
47local sign        = patterns.sign
48local period      = patterns.period
49
50-- local function points(n)
51--     n = tonumber(n)
52--     return (not n or n == 0) and "0pt" or lpegmatch(stripzeros,format("%.5fpt",n/65536))
53-- end
54
55-- local function basepoints(n)
56--     n = tonumber(n)
57--     return (not n or n == 0) and "0bp" or lpegmatch(stripzeros,format("%.5fbp", n*(7200/7227)/65536))
58-- end
59
-- factors that turn scaled points into points (pt) and base points (bp)
local ptf = 1 / 65536
local bpf = (7200/7227) / 65536

-- Format a value in scaled points as a string with a "pt" unit. Zero and anything
-- that is not a number give "0pt", whole results are shown as integers and other
-- results get five decimals that are then passed through the stripzeros pattern.
local function points(n)
    if n ~= 0 then
        n = tonumber(n)
        if n and n ~= 0 then
            local pt = n * ptf
            if pt % 1 ~= 0 then
                -- plural (stripzeros) as we need to keep the pt
                return lpegmatch(stripzeros,format("%.5fpt",pt))
            end
            return format("%ipt",pt)
        end
    end
    return "0pt"
end
78
-- Same conversion as points but without unit: zero or non-numeric input gives "0",
-- whole results are shown as integers and all other results keep five decimals.
local function nupoints(n)
    if n ~= 0 then
        n = tonumber(n)
        if n and n ~= 0 then
            local pt = n * ptf
            local template = (pt % 1 == 0) and "%i" or "%.5f" -- no strip
            return format(template,pt)
        end
    end
    return "0"
end
94
-- Format a value in scaled points as a string with a "bp" unit. Zero and anything
-- that is not a number give "0bp", whole results are shown as integers and other
-- results get five decimals that are then passed through the stripzeros pattern.
local function basepoints(n)
    if n ~= 0 then
        n = tonumber(n)
        if n and n ~= 0 then
            local bp = n * bpf
            if bp % 1 ~= 0 then
                -- plural (stripzeros) as we need to keep the bp
                return lpegmatch(stripzeros,format("%.5fbp",bp))
            end
            return format("%ibp",bp)
        end
    end
    return "0bp"
end
110
-- Same conversion as basepoints but without unit: zero or non-numeric input gives
-- "0", whole results are shown as integers and other results keep five decimals.
local function nubasepoints(n)
    if n ~= 0 then
        n = tonumber(n)
        if n and n ~= 0 then
            local bp = n * bpf
            local template = (bp % 1 == 0) and "%i" or "%.5f" -- no strip
            return format(template,bp)
        end
    end
    return "0"
end
126
-- publish the four dimension formatters in the number namespace
number.nubasepoints = nubasepoints
number.basepoints   = basepoints
number.nupoints     = nupoints
number.points       = points
131
132-- str = " \n \ntest  \n test\ntest "
133-- print("["..string.gsub(string.collapsecrlf(str),"\n","+").."]")
134
-- building blocks: a "rubish" line is optional spaces/tabs followed by a newline

local rubish     = spaceortab^0 * newline
local anyrubish  = spaceortab + newline

-- spaces/tabs right before a newline are dropped, the newline itself is kept
local stripped   = (spaceortab^1 / "") * newline

-- any number of rubish lines at the start is dropped
local leading    = rubish^0 / ""

-- a run of spaces, tabs and newlines that reaches the end of the string is dropped
local trailing   = (anyrubish^1 * endofstring) / ""

-- three or more rubish lines in a row are replaced by a single newline
local redundant  = rubish^3 / "\n"

local pattern = Cs(leading * (trailing + redundant + stripped + anything)^0)

-- Apply the substitution pattern defined above to str and return the result.
strings.collapsecrlf = function(str)
    return lpegmatch(pattern,str)
end
147
148-- The following functions might end up in another namespace.
149
-- cache: str -> offset -> repeater table
-- (watch how we also moved the -1 in depth-1 to the creator)
local repeaters = { }

-- Return a table that, when indexed with a number k, gives str repeated k+offset
-- times (an empty string when that count is not positive). Results are stored in
-- the table, and the table itself is shared per (str,offset) combination.
function strings.newrepeater(str,offset)
    offset = offset or 0
    local byoffset = repeaters[str]
    if not byoffset then
        byoffset = { }
        repeaters[str] = byoffset
    end
    local repeater = byoffset[offset]
    if not repeater then
        repeater = setmetatable({ }, { __index = function(t,k)
            if not k then
                return ""
            end
            local count = k + offset
            local s = ""
            if count > 0 then
                s = rep(str,count)
            end
            t[k] = s
            return s
        end })
        byoffset[offset] = repeater
    end
    return repeater
end
176
177-- local dashes = strings.newrepeater("--",-1)
178-- print(dashes[2],dashes[3],dashes[1])
179
-- State shared by the callbacks in the pattern below. The Carg callback resets all
-- three at the start of every match, so these initial values are just placeholders
-- (the original line had a fourth, silently discarded, initializer).
local extra, tab, start = 0, 0, 4

local nspaces = strings.newrepeater(" ")

string.nspaces = nspaces

-- The first extra match argument is the tab width (7 when absent). For each tab we
-- derive the current column from the position in the line plus the columns already
-- added by earlier expansions (extra) and replace the tab by the number of spaces
-- needed to reach the next multiple of the width. A newline starts a new line: the
-- line start is moved and the extra count is reset.
local pattern =
    Carg(1) / function(t)
        extra, tab, start = 0, t or 7, 1
    end
  * Cs((
      Cp() * patterns.tab / function(position)
          local current = (position - start + 1) + extra
          local spaces = tab-(current-1) % tab
          if spaces > 0 then
              extra = extra + spaces - 1
              return nspaces[spaces] -- rep(" ",spaces)
          else
              return ""
          end
      end
    + newline * Cp() / function(position)
          extra, start = 0, position
      end
    + anything
  )^1)

-- Expand tabs in str into spaces using tab as width (default 7). The pattern needs
-- at least one character, so for an empty string we hand back the input instead of
-- the nil that the failed match gives.
function strings.tabtospace(str,tab)
    -- no real gain in first checking if a \t is there
    return lpegmatch(pattern,str,1,tab or 7) or str
end
211
-- Return the spaces needed to pad s to a width of |n| characters, where the length
-- of s is measured with utflen. A missing or zero n gives an empty string.
function string.utfpadding(s,n)
    if not n or n == 0 then
        return ""
    end
    local length = utflen(s)
    local width  = n > 0 and n or -n
    return nspaces[width-length]
end
223
224-- local t = {
225--     "1234567123456712345671234567",
226--     "\tb\tc",
227--     "a\tb\tc",
228--     "aa\tbb\tcc",
229--     "aaa\tbbb\tccc",
230--     "aaaa\tbbbb\tcccc",
231--     "aaaaa\tbbbbb\tccccc",
232--     "aaaaaa\tbbbbbb\tcccccc\n       aaaaaa\tbbbbbb\tcccccc",
233--     "one\n	two\nxxx	three\nxx	four\nx	five\nsix",
234-- }
235-- for k=1,#t do
236--     print(strings.tabtospace(t[k]))
237-- end
238
239-- todo: lpeg
240
241-- function strings.striplong(str) -- strips all leading spaces
242--     str = gsub(str,"^%s*","")
243--     str = gsub(str,"[\n\r]+ *","\n")
244--     return str
245-- end
246
-- building blocks for the line strippers

local optionalspace = spacer^0
local nospace       = optionalspace/""          -- drops optional spacing
local endofline     = nospace * newline         -- a newline with preceding spacing dropped

local stripend      = (whitespace^1 * endofstring)/"" -- whitespace up to the end of the string is dropped

-- a run of characters, up to (optional spacing followed by) a newline or the end
-- of the string, with the spacing at both sides dropped
local normalline    = (nospace * ((1-optionalspace*(newline+endofstring))^1) * nospace)

local stripempty    = endofline^1/""                                  -- all line ends dropped
local normalempty   = endofline^1                                     -- all line ends kept
local singleempty   = endofline * (endofline^0/"")                    -- at most one line end kept
local doubleempty   = endofline * endofline^-1 * (endofline^0/"")     -- at most two line ends kept
local stripstart    = stripempty^0

local intospace     = whitespace^1/" "
local noleading     = whitespace^1/""
local notrailing    = noleading * endofstring

-- The "into space" variant used to start with a mandatory noleading, which means
-- that it only matched strings that begin with whitespace; other strings were
-- returned untouched by the callers. Leading whitespace is optional now.

local p_prune_normal    = Cs ( stripstart * ( stripend   + normalline + normalempty )^0 )
local p_prune_collapse  = Cs ( stripstart * ( stripend   + normalline + doubleempty )^0 )
local p_prune_noempty   = Cs ( stripstart * ( stripend   + normalline + singleempty )^0 )
local p_prune_intospace = Cs ( noleading^-1 * ( notrailing + intospace  + 1         )^0 )
local p_retain_normal   = Cs (              (              normalline + normalempty )^0 )
local p_retain_collapse = Cs (              (              normalline + doubleempty )^0 )
local p_retain_noempty  = Cs (              (              normalline + singleempty )^0 )
local p_collapse_all    = Cs ( stripstart * ( stripend   + ((whitespace+newline)^1/" ") + 1)^0 )
273
274-- function striplines(str,prune,collapse,noempty)
275--     if prune then
276--         if noempty then
277--             return lpegmatch(p_prune_noempty,str) or str
278--         elseif collapse then
279--             return lpegmatch(p_prune_collapse,str) or str
280--         else
281--             return lpegmatch(p_prune_normal,str) or str
282--         end
283--     else
284--         if noempty then
285--             return lpegmatch(p_retain_noempty,str) or str
286--         elseif collapse then
287--             return lpegmatch(p_retain_collapse,str) or str
288--         else
289--             return lpegmatch(p_retain_normal,str) or str
290--         end
291--     end
292-- end
293
-- Named strip methods; looking up an unknown (or missing) name gives the
-- "prune and collapse" pattern.
local striplinepatterns = setmetatable( {
    ["prune"]               = p_prune_normal,
    ["prune and collapse"]  = p_prune_collapse, -- default
    ["prune and no empty"]  = p_prune_noempty,
    ["prune and to space"]  = p_prune_intospace,
    ["retain"]              = p_retain_normal,
    ["retain and collapse"] = p_retain_collapse,
    ["retain and no empty"] = p_retain_noempty,
    ["collapse all"]        = p_collapse_all,
    ["collapse"]            = patterns.collapser,
}, {
    __index = function(t,k)
        return p_prune_collapse
    end
} )

strings.striplinepatterns = striplinepatterns
309
-- Apply the strip method named how (see striplinepatterns) to str. A nil or false
-- str is returned as is, and str is also returned when the pattern does not match.
function strings.striplines(str,how)
    if not str then
        return str
    end
    return lpegmatch(striplinepatterns[how],str) or str
end
313
-- Apply the "prune and to space" pattern to str; a nil or false str is returned
-- as is, and str is also returned when the pattern does not match.
function strings.collapse(str) -- maybe also in strings
    if not str then
        return str
    end
    return lpegmatch(p_prune_intospace,str) or str
end
317
318-- local s = " \naa\n\naa\na   a\n\n "
319-- local s = [[ \naa\n\naa\na   a\n\n]]
320-- print("["..strings.striplines(s,"collapse all").."]")
321
322-- also see: string.collapsespaces
323
324strings.striplong = strings.striplines -- for old times sake
325
326-- local str = table.concat( {
327-- "  ",
328-- "    aap",
329-- "  noot mies",
330-- "  ",
331-- "    ",
332-- " zus    wim jet",
333-- "zus    wim jet",
334-- "       zus    wim jet",
335-- "    ",
336-- }, "\n")
337--
338-- local str = table.concat( {
339-- "  aaaa",
340-- "  bb",
341-- "  cccccc",
342-- " ",
343-- }, "\n")
344--
345-- for k, v in table.sortedhash(utilities.strings.striplinepatterns) do
346--     logs.report("stripper","method: %s, result: [[%s]]",k,utilities.strings.striplines(str,k))
347-- end
348
349-- inspect(strings.striplong([[
350--   aaaa
351--   bb
352--   cccccc
353-- ]]))
354
-- Replace each run of colons, dashes, plus signs and underscores by one space.
function strings.nice(str)
    -- the parentheses drop the replacement count that gsub also returns
    return (gsub(str,"[:%-+_]+"," ")) -- maybe more
end
359
360-- Work in progress. Interesting is that compared to the built-in this is faster in
361-- luatex than in luajittex where we have a comparable speed. It only makes sense
362-- to use the formatter when a (somewhat) complex format is used a lot. Each formatter
363-- is a function so there is some overhead and not all formatted output is worth that
364-- overhead. Keep in mind that there is an extra function call involved. In principle
-- we end up with a string concatenation so one could inline such a sequence but often
-- at the cost of less readability. So, it's a sort of (visual) compromise. Of course
367-- there is the benefit of more variants. (Concerning the speed: a simple format like
368-- %05fpt is better off with format than with a formatter, but as soon as you put
369-- something in front formatters become faster. Passing the pt as extra argument makes
370-- formatters behave better. Of course this is rather implementation dependent. Also,
371-- when a specific format is only used a few times the overhead in creating it is not
372-- compensated by speed.)
373--
374-- More info can be found in cld-mkiv.pdf so here I stick to a simple list.
375--
376-- integer            %...i   number
377-- integer            %...d   number
378-- unsigned           %...u   number -- not used
379-- character          %...c   number
380-- hexadecimal        %...x   number
381-- HEXADECIMAL        %...X   number
382-- octal              %...o   number
383-- string             %...s   string number
384-- float              %...f   number
385-- checked float      %...F   number
386-- exponential        %...e   number
387-- exponential        %...E   number
388-- stripped e         %...j   number
389-- stripped E         %...J   number
390-- autofloat          %...g   number
391-- autofloat          %...G   number
392-- utf character      %...c   number
393-- force tostring     %...S   any
394-- force tostring     %Q      any
395-- force tonumber     %N      number (strip leading zeros)
396-- signed number      %I      number
397-- rounded number     %r      number
398-- 0xhexadecimal      %...h   character number
399-- 0xHEXADECIMAL      %...H   character number
400-- U+hexadecimal      %...u   character number
401-- U+HEXADECIMAL      %...U   character number
402-- points             %p      number (scaled points)
403-- nupoints           %P      number (scaled points) / without unit / always 5 decimals
404-- basepoints         %b      number (scaled points)
405-- nubasepoints       %B      number (scaled points) / without unit / always 5 decimals
406-- table concat       %...t   table
407-- table concat       %{.}t   table
408-- serialize          %...T   sequenced (no nested tables)
409-- serialize          %{.}T   sequenced (no nested tables)
410-- boolean (logic)    %l      boolean
411-- BOOLEAN            %L      boolean
412-- whitespace         %...w   number
413-- whitespace         %...W   (fixed)
414-- automatic          %...a   'whatever' (string, table, ...)
415-- automatic          %...A   "whatever" (string, table, ...)
416-- zap                %...z   skip
417-- stripped  %...N    %...N
418-- comma/period real  %...m
419-- period/comma real  %...M
420-- formatted float    %...k   n.m
421
422local n = 0
423
424-- we are somewhat sloppy in parsing prefixes as it's not that critical
425
426-- hard to avoid but we can collect them in a private namespace if needed
427
428-- inline the next two makes no sense as we only use this in logging
429
local sequenced = table.sequenced

-- Return a double quoted representation of s: nil becomes an empty quoted string,
-- numbers are returned unquoted, tables go through sequenced (with sep, default
-- a comma) and everything else through tostring.
function string.autodouble(s,sep)
    local kind = type(s)
    if kind == "nil" then
        return '""'
    elseif kind == "number" then
        return tostring(s) -- tostring not really needed
    elseif kind == "table" then
        return '"' .. sequenced(s,sep or ",") .. '"'
    else
        return '"' .. tostring(s) .. '"'
    end
end
445
-- Return a single quoted representation of s: nil becomes an empty quoted string,
-- numbers are returned unquoted, tables go through sequenced (with sep, default
-- a comma) and everything else through tostring.
function string.autosingle(s,sep)
    local kind = type(s)
    if kind == "nil" then
        return "''"
    elseif kind == "number" then
        return tostring(s) -- tostring not really needed
    elseif kind == "table" then
        return "'" .. sequenced(s,sep or ",") .. "'"
    else
        return "'" .. tostring(s) .. "'"
    end
end
459
-- Bracketed names for the character codes 0 up to and including 0x20, indexed by code.
local tracedchars = { }

do
    local names = {
        -- the regular bunch
        "null", "soh", "stx", "etx", "eot", "enq", "ack", "bel",
        "bs",   "ht",  "lf",  "vt",  "ff",  "cr",  "so",  "si",
        "dle",  "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb",
        "can",  "em",  "sub", "esc", "fs",  "gs",  "rs",  "us",
        -- plus space
        "space", -- 0x20
    }
    for i=1,#names do
        tracedchars[i-1] = "[" .. names[i] .. "]"
    end
end
469
string.tracedchars = tracedchars
strings.tracers    = tracedchars

-- Describe a character for tracing. The argument is either a character code or a
-- string; codes listed in tracedchars give their bracketed name, anything else
-- gives the character followed by its code as five (or more) uppercase hex digits.
function string.tracedchar(b)
    -- todo: table
    if type(b) == "number" then
        local name = tracedchars[b]
        if name then
            return name
        end
        return utfchar(b) .. " (U+" .. format("%05X",b) .. ")"
    end
    local c = utfbyte(b)
    local name = tracedchars[c]
    if name then
        return name
    end
    -- utfbyte can come back empty handed, in which case we show question marks
    local hex = c and format("%05X",c) or "?????"
    return b .. " (U+" .. hex .. ")"
end
482
-- Split a number into a sign string and its magnitude, as used by the %I formatter
-- (which feeds both values to '%s%i'). Zero used to end up in the negative branch
-- and was therefore rendered as "-0"; it now gets a plus sign, like C's %+d does.
function number.signed(i)
    if i >= 0 then
        return "+",  i
    else
        return "-", -i
    end
end
490
491-- maybe to util-num
492
local two    = digit * digit
local three  = two * digit

-- the first extra match argument is inserted in front of each group of three digits
local prefix = (Carg(1) * three)^1

-- the second extra match argument replaces the character that precedes the last
-- two characters of the string
local splitter = Cs (
    (((1 - (three^1 * period))^1 + C(three)) * prefix + C((1-period)^1))
  * (anything/"" * Carg(2)) * C(2)
)

-- variant for integers: only groups of three, running up to the end of the string
local splitter3 = Cs (
    three * prefix * endofstring +
    two   * prefix * endofstring +
    digit * prefix * endofstring +
    three +
    two   +
    digit
)

patterns.formattednumber = splitter

-- Format n with separators. With sep1 == false the number is handled as an integer
-- and sep2 (default a period) goes between the groups. Otherwise a number is first
-- formatted with two decimals; sep1 then goes between the groups and sep2 before
-- the decimals, where sep1 == true stands for a period and a comma.
function number.formatted(n,sep1,sep2)
    local isnumber = type(n) == "number"
    if sep1 == false then
        if isnumber then
            n = tostring(n)
        end
        return lpegmatch(splitter3,n,1,sep2 or ".")
    end
    if isnumber then
        n = format("%0.2f",n)
    end
    local first, second
    if sep1 == true then
        first, second = ".", ","
    elseif sep1 == "." then
        first, second = sep1, sep2 or ","
    else
        -- this covers a comma as well as any other (or no) first separator
        first, second = sep1 or ",", sep2 or "."
    end
    return lpegmatch(splitter,n,1,first,second)
end
534
535-- print(number.formatted(1))
536-- print(number.formatted(12))
537-- print(number.formatted(123))
538-- print(number.formatted(1234))
539-- print(number.formatted(12345))
540-- print(number.formatted(123456))
541-- print(number.formatted(1234567))
542-- print(number.formatted(12345678))
543-- print(number.formatted(12345678,true))
544-- print(number.formatted(1,false))
545-- print(number.formatted(12,false))
546-- print(number.formatted(123,false))
547-- print(number.formatted(1234,false))
548-- print(number.formatted(12345,false))
549-- print(number.formatted(123456,false))
550-- print(number.formatted(1234567,false))
551-- print(number.formatted(12345678,false))
552-- print(number.formatted(1234.56,"!","?"))
553
-- drops leading zeros and a fractional part that consists of zeros only
local p = Cs(
        P("-")^0
      * (P("0")^1/"")^0
      * (1-period)^0
      * (period * P("0")^1 * endofstring/"" + period^0)
      * P(1-P("0")^1*endofstring)^0
    )

-- Format n with fmt (default "%0.3f") and run the result through the pattern
-- above. Zero and one are short-circuited, and a result that is reduced to
-- nothing, a lone period or a lone minus sign becomes "0".
function number.compactfloat(n,fmt)
    if n == 0 then
        return "0"
    end
    if n == 1 then
        return "1"
    end
    local compact = lpegmatch(p,format(fmt or "%0.3f",n))
    if compact == "." or compact == "" or compact == "-" then
        return "0"
    end
    return compact
end
574
local zero      = P("0")^1 / ""   -- a run of zeros that gets dropped
local plus      = P("+")   / ""   -- a plus sign that gets dropped
local minus     = P("-")
local separator = period
local trailing  = zero^1 * #S("eE") -- zeros right before the exponent marker
local exponent  = (S("eE") * (plus + Cs((minus * zero^0 * endofstring)/"") + minus) * zero^0 * (endofstring * Cc("0") + anything^1))
local pattern_a = Cs(minus^0 * digit^1 * (separator/"" * trailing + separator * (trailing + digit)^0) * exponent)
local pattern_b = Cs((exponent + anything)^0)

-- Format n with f (default "%e") and compact the result: pattern_a is used for a
-- plain "%e" or "%E", pattern_b for any other format. The number can be passed as
-- a string; when called with one argument that argument is taken to be the number.
-- Anything that is not a number (or convertible string) comes back via tostring.
function number.sparseexponent(f,n)
    if not n then
        n, f = f, "%e"
    end
    local value
    local tn = type(n)
    if tn == "number" then
        value = n
    elseif tn == "string" then
        value = tonumber(n) -- cast to number
    end
    if value then
        local compactor = (f == "%e" or f == "%E") and pattern_a or pattern_b
        return lpegmatch(compactor,format(f,value))
    end
    return tostring(n)
end
600
-- caches for format strings: hf[k] is "%.<k>f" and hs[k] is "%<k>s"

local hf = { }
local hs = { }

setmetatable(hf, { __index = function(t,k)
    local v = "%." .. k .. "f"
    t[k] = v
    return v
end } )

setmetatable(hs, { __index = function(t,k)
    local v = "%" .. k .. "s"
    t[k] = v
    return v
end } )

-- Format n with a digits after the period and right align the result in a field
-- of b + a + 1 characters. Both b and a default to zero; a used to be defaulted
-- only after it was already used as cache key, so a missing a raised an error.
function number.formattedfloat(n,b,a)
    a = a or 0
    b = b or 0
    local s = format(hf[a],n)
    local l = b + a + 1
    if #s < l then
        return format(hs[l],s)
    else
        return s
    end
end
625
-- Skeleton of a generated formatter chunk: two leading lines of code followed by a
-- function with an argument list and a return expression (four %s slots in total).
local template = "%s\n%s\nreturn function(%s) return %s end\n"
631
632-- We only use fast serialize in controlled cases.
633
-- Wraps the subject in double quotes and escapes double quotes, backslashes,
-- newlines and carriage returns.
local pattern  do

    local plain   = (1-S('"\\\n\r'))^1
    local special = P('"')  / '\\"'
                  + P('\\') / '\\\\'
                  + P('\n') / '\\n'
                  + P('\r') / '\\r'

    pattern = Cs(Cc('"') * (plain + special)^0 * Cc('"'))

end
641
642-- -- I need to do more experiments with this:
643--
644-- local pattern = Cs(Cc('"') * (
645--     (1-S('"\\\n\r'))^1
646--   + P('"')  / '\\034'
647--   + P('\\') / '\\092'
648--   + P('\n') / '\\013'
649--   + P('\r') / '\\010'
650-- )^0 * Cc('"'))
651
patterns.escapedquotes = pattern

-- Return s as processed by the escapedquotes pattern.
string.escapedquotes = function(s)
    return lpegmatch(pattern,s)
end
657
-- Replaces a backslash followed by up to three digits by the character with that
-- code. Compared to the previous version:
--
-- * a backslash that is not followed by digits no longer raises an error (the
--   empty digit string made tonumber return nil), it is simply kept;
-- * a code above 255 no longer raises an error in char, the sequence is kept;
-- * a string may start with an escape (a leading run of other characters used
--   to be mandatory so such strings were never processed).
--
-- The pattern still fails when there is no backslash at all, which permits the
-- function below to return the original string untouched.

local pattern  do

    local plain   = (1 - P("\\"))^1
    local escaped = (P("\\") * C(digit^-3)) / function(s)
        local b = tonumber(s)
        if b and b <= 255 then
            return char(b)
        end
        return "\\" .. s
    end

    pattern = Cs((1 - P("\\"))^0 * (escaped + plain)^1)

end

patterns.unescapedquotes = pattern

-- Return s with the decimal escapes resolved, or s itself when there are none.
function string.unescapedquotes(s)
    return lpegmatch(pattern,s) or s
end
668
669-- function string.longifneeded(s)
670--     if find(s,'["\\\n\r]') then
671--         return "[===[" .. s .. "]===]"
672--     else
673--         return '"' .. s ..'"'
674--     end
675-- end
676
-- built with lpeg.replacer from the newline pattern and "\r" as replacement
string.texnewlines = lpeg.replacer(newline,"\r",true)
678
679-- print(string.escapedquotes('1\\23\n"'))
680
681-- but for now here
682
local preamble = ""

-- Names that generated formatter code can refer to. Where this file already holds
-- a local alias for a helper, that alias is used instead of the namespaced field.
local environment = {
    global          = global or _G,
    lpeg            = lpeg,
    type            = type,
    tostring        = tostring,
    tonumber        = tonumber,
    format          = format,
    concat          = concat,
    signed          = number.signed,
    points          = points,
    nupoints        = nupoints,
    basepoints      = basepoints,
    nubasepoints    = nubasepoints,
    utfchar         = utfchar,
    utfbyte         = utfbyte,
    lpegmatch       = lpegmatch,
    nspaces         = nspaces,
    utfpadding      = string.utfpadding,
    tracedchar      = string.tracedchar,
    autosingle      = string.autosingle,
    autodouble      = string.autodouble,
    sequenced       = sequenced,
    formattednumber = number.formatted,
    sparseexponent  = number.sparseexponent,
    formattedfloat  = number.formattedfloat,
    stripzero       = stripzero,
    stripzeros      = stripzeros,
    escapedquotes   = string.escapedquotes,

    FORMAT          = string.f6,
}
716
717-- -- --
718
-- arguments[k] is the parameter list "a1,a2,...,ak"; entries are derived from their
-- predecessor on first access and then stored (faster than the previously used
-- select(n,...))
local arguments = setmetatable({ "a1" }, {
    __index = function(list,index)
        local signature = list[index-1] .. ",a" .. index
        list[index] = signature
        return signature
    end
})
728
-- The three ways a prefix (the part between % and the directive letter) is parsed:
-- as one string, as two parts around a period, or as a separator that is either
-- given between braces or is a run of non alphanumeric (and non percent) characters.
local prefix_any, prefix_sub, prefix_tab

do

    local part = C((sign + digit)^0) + Cc(0)

    prefix_any = C((sign + space + period + digit)^0)
    prefix_sub = part * period * part
    prefix_tab = P("{") * C((1-P("}"))^0) * P("}") + C((1-R("az","AZ","09","%%"))^0)

end
734
735-- we've split all cases as then we can optimize them (let's omit the fuzzy u)
736
737-- todo: replace outer formats in next by ..
738
-- Each generator claims the next argument (a<n>) and returns the code for it.

-- %s: without prefix the argument is used as is, with a nil turned into an empty
-- string (best no tostring in order to stay compatible, .. does a selective
-- tostring too); with a prefix we go through format.
local format_s = function(f)
    n = n + 1
    if not f or f == "" then
        return format("(a%s or '')",n) -- goodie: nil check
    end
    return format("format('%%%ss',a%s)",f,n)
end

-- %S: as %s but with an explicit tostring (can be optimized)
local format_S = function(f)
    n = n + 1
    if not f or f == "" then
        return format("tostring(a%s)",n)
    end
    return format("format('%%%ss',tostring(a%s))",f,n)
end
756
-- Padding to a given width: a positive width puts the padding in front of the
-- argument, a negative one puts it after the argument. The left variant is the
-- right variant with the sign of the width swapped.
local format_right, format_left

do

    local function padded(width)
        n = n + 1
        if not width or width == 0 then
            return format("(a%s or '')",n)
        elseif width > 0 then
            return format("utfpadding(a%s,%i)..a%s",n,width,n)
        else
            return format("a%s..utfpadding(a%s,%i)",n,n,width)
        end
    end

    format_right = function(f)
        return padded(tonumber(f))
    end

    format_left = function(f)
        local width = tonumber(f)
        return padded(width and -width)
    end

end
781
-- %q: a nil argument gives an empty string. When JITSUPPORTED is set the argument
-- is passed through tostring first (lua 5.3 has a different q than lua 5.2, which
-- does a tostring on numbers).
local format_q

if JITSUPPORTED then
    format_q = function()
        n = n + 1
     -- return format("(a%s ~= nil and format('%%q',a%s) or '')",n,n)
        return format("(a%s ~= nil and format('%%q',tostring(a%s)) or '')",n,n)
     -- return format("(a%s ~= nil and escapedquotes(tostring(a%s)) or '')",n,n)
    end
else
    format_q = function()
        n = n + 1
        return format("(a%s ~= nil and format('%%q',a%s) or '')",n,n)
    end
end

-- %Q: fast escaping by means of escapedquotes
local function format_Q()
    n = n + 1
 -- return format("format('%%q',tostring(a%s))",n)
    return format("escapedquotes(tostring(a%s))",n)
end
799
-- %i and %d: an absent prefix is the same as an empty one (why not just tostring())
local function format_i(f)
    n = n + 1
    return format("format('%%%si',a%s)",f or "",n)
end

local format_d = format_i

-- %I: sign and magnitude as returned by signed
local function format_I(f)
    n = n + 1
    return format("format('%%s%%%si',signed(a%s))",f,n)
end

-- %f
local function format_f(f)
    n = n + 1
    return format("format('%%%sf',a%s)",f,n)
end
820
821-- The next one formats an integer as integer and very small values as zero. This is needed
822-- for pdf backend code.
823--
824--   1.23 % 1 : 0.23
825-- - 1.23 % 1 : 0.77
826--
827-- We could probably use just %s with integers but who knows what Lua 5.3 will do? So let's
828-- for the moment use %i.
829
-- %F: whole values are formatted with %i, other values with the given prefix or,
-- without prefix, with nine decimals, in which case very small values become '0'.
local function format_F(f) -- beware, no cast to number
    n = n + 1
    if f and f ~= "" then
        return format("format((a%s %% 1 == 0) and '%%i' or '%%%sf',a%s)",n,f,n)
    end
    return format("(((a%s > -0.0000000005 and a%s < 0.0000000005) and '0') or format((a%s %% 1 == 0) and '%%i' or '%%.9f',a%s))",n,n,n,n)
end
838
839-- if string.f9 then
840--     format_F = function(f) -- beware, no cast to number
841--         n = n + 1
842--         if not f or f == "" then
843--             return format("(((a%s > -0.0000000005 and a%s < 0.0000000005) and '0') or FORMAT(a%s))",n,n,n,n,n)
844--         else
845--             return format("((a%s %% 1 == 0) and format('%%i',a%s) or FORMAT(a%s,'%%%sf'))",n,n,n,f)
846--         end
847--     end
848-- end
849
-- %k: delegates to formattedfloat, both parts of the prefix default to zero (slow)
local function format_k(b,a)
    n = n + 1
    b = b or 0
    a = a or 0
    return format("formattedfloat(a%s,%s,%s)",n,b,a)
end
854
-- The generators for g, G, e, E, j, J, x, X and o only differ in the code template
-- that gets the prefix and the argument index filled in.
local format_g, format_G, format_e, format_E, format_j, format_J, format_x, format_X, format_o

do

    local function prefixed(template)
        return function(f)
            n = n + 1
            return format(template,f,n)
        end
    end

    format_g = prefixed("format('%%%sg',a%s)")
    format_G = prefixed("format('%%%sG',a%s)")
    format_e = prefixed("format('%%%se',a%s)")
    format_E = prefixed("format('%%%sE',a%s)")
    format_j = prefixed("sparseexponent('%%%se',a%s)")
    format_J = prefixed("sparseexponent('%%%sE',a%s)")
    format_x = prefixed("format('%%%sx',a%s)")
    format_X = prefixed("format('%%%sX',a%s)")
    format_o = prefixed("format('%%%so',a%s)")

end
899
-- %c: the argument is passed to utfchar
local function format_c()
    n = n + 1
    return format("utfchar(a%s)",n)
end

-- %C: the argument is passed to tracedchar
local function format_C()
    n = n + 1
    return format("tracedchar(a%s)",n)
end

-- %r: a float format with zero decimals
local function format_r(f)
    n = n + 1
    return format("format('%%%s.0f',a%s)",f,n)
end
914
-- Hexadecimal output (h, H, u, U) of a number, or of the utfbyte of a non-number.
-- The result gets a label (0x, 0x, u+, U+) in front unless the prefix is a lone
-- "-"; an empty prefix means five digits with leading zeros.
--
-- NOTE(review): only a prefix that is exactly "-" suppresses the label (and then
-- always implies "05"); a prefix like "-4" ends up in the labeled branch. The
-- original code applied sub(f,2) after testing f == "-", which hints at an
-- intended check on the first character only -- to be confirmed.
local format_h, format_H, format_u, format_U

do

    local checked = "type(a%s) == 'number' and a%s or utfbyte(a%s)"

    local function hexadecimal(label,letter)
        local bare    = "format('%%%s" .. letter .. "'," .. checked .. ")"
        local labeled = "format('" .. label .. "%%%s" .. letter .. "'," .. checked .. ")"
        return function(f)
            n = n + 1
            if f == "-" then
                return format(bare,"05",n,n,n)
            end
            return format(labeled,f == "" and "05" or f,n,n,n)
        end
    end

    format_h = hexadecimal("0x","x")
    format_H = hexadecimal("0x","X")
    format_u = hexadecimal("u+","x")
    format_U = hexadecimal("U+","X")

end
954
-- p, P, b and B pass the argument to one of the dimension formatters.
local format_p, format_P, format_b, format_B

do

    local function converted(template)
        return function()
            n = n + 1
            return format(template,n)
        end
    end

    format_p = converted("points(a%s)")
    format_P = converted("nupoints(a%s)")
    format_b = converted("basepoints(a%s)")
    format_B = converted("nubasepoints(a%s)")

end
974
-- t and T hand a table to concat or sequenced; a non-empty prefix is passed along
-- as (quoted) second argument.
local format_t, format_T

do

    local function separated(name)
        local with    = name .. "(a%s,%q)"
        local without = name .. "(a%s)"
        return function(f)
            n = n + 1
            if f and f ~= "" then
                return format(with,n,f)
            end
            return format(without,n)
        end
    end

    format_t = separated("concat")
    format_T = separated("sequenced")

end
992
-- l and L: the argument as lowercase or uppercase boolean word
local format_l, format_L

do

    local function logical(yes,no)
        local template = "(a%s and '" .. yes .. "' or '" .. no .. "')"
        return function()
            n = n + 1
            return format(template,n)
        end
    end

    format_l = logical("true","false")
    format_L = logical("TRUE","FALSE")

end

-- n: whole values go through %i, other values through tostring (strips leading
-- and trailing zeros and removes .0, beware: can produce e notation)
local function format_n()
    n = n + 1
    return format("((a%s %% 1 == 0) and format('%%i',a%s) or tostring(a%s))",n,n,n)
end
1007
1008-- local format_N = function() -- strips leading and trailing zeros (also accepts string)
1009--     n = n + 1
1010--     return format("tostring(tonumber(a%s) or a%s)",n,n)
1011-- end
1012
1013-- local format_N = function(f) -- strips leading and trailing zeros
1014--     n = n + 1
1015--     -- stripzero (singular) as we only have a number
1016--     if not f or f == "" then
1017--         return format("(((a%s > -0.0000000005 and a%s < 0.0000000005) and '0') or ((a%s %% 1 == 0) and format('%%i',a%s)) or lpegmatch(stripzero,format('%%.9f',a%s)))",n,n,n,n,n)
1018--     else
1019--         return format("(((a%s %% 1 == 0) and format('%%i',a%s)) or lpegmatch(stripzero,format('%%%sf',a%s)))",n,n,f,n)
1020--     end
1021-- end
1022
1023-- local format_N = function(f) -- strips leading and trailing zeros
1024--     n = n + 1
1025--     -- stripzero (singular) as we only have a number
1026--     if not f or f == "" then
1027--         return format("(((a%s %% 1 == 0) and format('%%i',a%s)) or ((a%s > -0.0000000005 and a%s < 0.0000000005) and '0') or lpegmatch(stripzero,format('%%.9f',a%s)))",n,n,n,n,n)
1028--     else
1029--         return format("(((a%s %% 1 == 0) and format('%%i',a%s)) or lpegmatch(stripzero,format('%%%sf',a%s)))",n,n,f,n)
1030--     end
1031-- end
1032
-- %N: number with zeros stripped. Two implementations, chosen once at load
-- time: when the environment provides FORMAT we delegate to it, otherwise we
-- emit an inline expression that uses the stripzero pattern.

local format_N

if environment.FORMAT then

    format_N = function(f)
        n = n + 1
        local a = "a" .. n
        if not f or f == "" then
            -- no precision given: nine decimals
            return format("FORMAT(%s,'%%.9f')",a)
        end
        if f == ".6" or f == "0.6" then
            -- handled by calling FORMAT without an explicit format
            return format("FORMAT(%s)",a)
        end
        return format("FORMAT(%s,'%%%sf')",a,f)
    end

else

    format_N = function(f)
        n = n + 1
        local a = "a" .. n
        -- stripzero (singular) as we only have a number
        local precision = f
        if not precision or precision == "" then
            precision = ".9" -- always a leading number !
        end
        return format("(((%s %% 1 == 0) and format('%%i',%s)) or lpegmatch(stripzero,format('%%%sf',%s)))",a,a,precision,a)
    end

end
1058
-- %a: emits an autosingle() call on the next argument (documented in the
-- grammar as '...' with forced tostring); a non-empty prefix is passed on
-- as a quoted second argument.
local function format_a(f)
    n = n + 1
    if not f or f == "" then
        return format("autosingle(a%s)",n)
    end
    return format("autosingle(a%s,%q)",n,f)
end
1067
-- %A: emits an autodouble() call on the next argument (documented in the
-- grammar as "..." with forced tostring); a non-empty prefix is passed on
-- as a quoted second argument.
local function format_A(f)
    n = n + 1
    if not f or f == "" then
        return format("autodouble(a%s)",n)
    end
    return format("autodouble(a%s,%q)",n,f)
end
1076
-- %w: indexes nspaces with the next argument, handy for depth related
-- indentation; a numeric prefix is added to that argument as fixed offset.
local function format_w(f)
    n = n + 1
    local offset = tonumber(f)
    if not offset then
        return format("nspaces[a%s]",n)
    end
    return format("nspaces[%s+a%s]",offset,n)
end
1086
-- %W: indexes nspaces with the numeric prefix only (0 when absent); this
-- one does not consume an argument, so n is left untouched.
local function format_W(f)
    local width = tonumber(f) or 0
    return format("nspaces[%s]",width)
end
1090
-- %m: formatted number with "." passed as third argument; the prefix is
-- passed as second argument and defaults to ",". A prefix of "0" emits
-- formattednumber(..,false) instead.
local function format_m(f)
    n = n + 1
    if f == "0" then
        return format('formattednumber(a%s,false)',n)
    end
    if not f or f == "" then
        f = ","
    end
    return format('formattednumber(a%s,%q,".")',n,f)
end
1102
-- %M: formatted number with "," passed as third argument; the prefix is
-- passed as second argument and defaults to ".". A prefix of "0" emits
-- formattednumber(..,false) instead.
local function format_M(f)
    n = n + 1
    if f == "0" then
        return format('formattednumber(a%s,false)',n)
    end
    if not f or f == "" then
        f = "."
    end
    return format('formattednumber(a%s,%q,",")',n,f)
end
1114
1115--
1116
-- %z: skips arguments: the numeric prefix (default 1) is added to the
-- argument counter and an empty string literal is emitted in its place.
local function format_z(f)
    local skip = tonumber(f) or 1
    n = n + skip
    return "''" -- appending '' is not that efficient but this is a special case anyway
end
1121
1122--
1123
1124-- local strip
1125--
1126-- local format_Z = function(f)
1127--     n = n + 1
1128--     if not f or f == "" then
1129--         f = ".9"
1130--     end
1131--     return format("(((a%s %% 1 == 0) and format('%%i',a%s)) or (strip and lpegmatch(stripzero,format('%%%sf',a%s))) or format('%%%sf',a%s))",n,n,f,n,f,n)
1132-- end
1133--
1134-- function strings.stripformatterzeros()
1135--     strip = true
1136-- end
1137
1138-- add(formatters,"texexp", [[texexp(...)]], "local texexp = metapost.texexp")
1139--
1140-- add(formatters,"foo:bar",[[foo(...)]], { foo = function(...) print(...) return "!" end })
1141-- print(string.formatters["foo %3!foo:bar! bar"](1,2,3))
1142
1143
-- Literal text between directives: turned into a quoted Lua string (%q
-- takes care of " and \n and such).
local function format_rest(s)
    local quoted = format("%q",s)
    return quoted
end
1147
-- Expands an extension directive (%!name!). The template registered under
-- name (fallback: "tostring(%s)") is filled in with argument names. The
-- numeric prefix f (default 1) tells how arguments are consumed:
--
--   0   : no argument is consumed, a "..." in the template is removed
--   1   : the next argument is consumed
--   < 0 : an already counted argument is referenced (n + f + 1), the
--         counter is not advanced; not supported for "..." templates
--   > 1 : that many arguments are consumed, "..." becomes a list of them
--
-- Each branch returns exactly one value, which matters because the result
-- is used as an lpeg function capture.
local format_extension = function(extensions,f,name)
    local template = extensions[name] or "tostring(%s)"
    local count    = tonumber(f) or 1
    local variadic = find(template,"%.%.%.")
    if count == 0 then
        if variadic then
            template = gsub(template,"%.%.%.","")
        end
        return template
    end
    if count == 1 then
        if variadic then
            template = gsub(template,"%.%.%.","%%s")
        end
        n = n + 1
        local a = "a" .. n
        return format(template,a,a) -- maybe more times?
    end
    if count < 0 then
        if variadic then
            -- not supported
            template = gsub(template,"%.%.%.","")
            return template
        end
        local a = "a" .. (n + count + 1)
        return format(template,a,a)
    end
    if variadic then
        template = gsub(template,"%.%.%.",rep("%%s,",count-1).."%%s")
    end
    -- we could fill an array and unpack a range but as the result is cached
    -- we don't save much and there are hardly any extensions anyway
    local args = { }
    for i=1,count do
        n = n + 1
        args[i] = "a" .. n
    end
    return format(template,unpack(args))
end
1188
1189-- aA b cC d eE f gG hH iI jJ lL mM N o p qQ r sS tT uU wW xX z
1190
1191-- extensions : %!tag!
1192
1193-- can be made faster but not called that often
1194
-- The grammar that turns a format specification into Lua source. Each
-- directive rule matches an (optional) prefix plus its letter and hands the
-- prefix to the matching format_* function, which returns a code snippet.
-- After every item, unless we are at the end of the string, the first extra
-- match argument (the connector) is inserted.

local builder = Cs { "start",
    start = (
        (
            P("%") / ""
          * (
                V("!")                                                -- extension
              + V("s") + V("q")
              + V("i") + V("d")
              + V("f") + V("F") + V("g") + V("G") + V("e") + V("E")
              + V("x") + V("X") + V("o")
              --
              + V("c")
              + V("C")
              + V("S")
              + V("Q")
              + V("n")
              + V("N")
              + V("k")
              --
              + V("r")
              + V("h") + V("H") + V("u") + V("U")
              + V("p") + V("P") + V("b") + V("B")
              + V("t") + V("T")
              + V("l") + V("L")
              + V("I")
              + V("w")
              + V("W")
              + V("a")
              + V("A")
              + V("j") + V("J")                                       -- stripped e E
              + V("m") + V("M")                                       -- formatted number
              + V("z")
              --
              + V(">")                                                -- left padding
              + V("<")                                                -- right padding
              --
           -- + V("?") -- ignored, probably messed up %
            )
          + V("*")
        )
     * (endofstring + Carg(1))
    )^0,
    -- the regular ones
    s = (prefix_any * P("s")) / format_s, -- %s : regular %s (string)
    q = (prefix_any * P("q")) / format_q, -- %q : regular %q (quoted string)
    i = (prefix_any * P("i")) / format_i, -- %i : regular %i (integer)
    d = (prefix_any * P("d")) / format_d, -- %d : regular %d (integer)
    f = (prefix_any * P("f")) / format_f, -- %f : regular %f (float)
    F = (prefix_any * P("F")) / format_F, -- %F : regular %f (float) but 0/1 check
    g = (prefix_any * P("g")) / format_g, -- %g : regular %g (float)
    G = (prefix_any * P("G")) / format_G, -- %G : regular %G (float)
    e = (prefix_any * P("e")) / format_e, -- %e : regular %e (float)
    E = (prefix_any * P("E")) / format_E, -- %E : regular %E (float)
    x = (prefix_any * P("x")) / format_x, -- %x : regular %x (hexadecimal)
    X = (prefix_any * P("X")) / format_X, -- %X : regular %X (HEXADECIMAL)
    o = (prefix_any * P("o")) / format_o, -- %o : regular %o (octal)
    -- strings, numbers and characters
    S = (prefix_any * P("S")) / format_S, -- %S : %s (tostring)
    Q = (prefix_any * P("Q")) / format_Q, -- %Q : %q (tostring)
    n = (prefix_any * P("n")) / format_n, -- %n : tonumber (strips leading and trailing zeros, as well as .0, expects number)
    N = (prefix_any * P("N")) / format_N, -- %N : tonumber (strips leading and trailing zeros, also takes string)
    k = (prefix_sub * P("k")) / format_k, -- %k : like f but with n.m
    c = (prefix_any * P("c")) / format_c, -- %c : utf character (extension to regular)
    C = (prefix_any * P("C")) / format_C, -- %C : U+.... utf character
    -- rounding, hex and unicode notation, dimensions, tables, booleans
    r = (prefix_any * P("r")) / format_r, -- %r : round
    h = (prefix_any * P("h")) / format_h, -- %h : 0x0a1b2 (when - no 0x) was v
    H = (prefix_any * P("H")) / format_H, -- %H : 0x0A1B2 (when - no 0x) was V
    u = (prefix_any * P("u")) / format_u, -- %u : u+0a1b2 (when - no u+)
    U = (prefix_any * P("U")) / format_U, -- %U : U+0A1B2 (when - no U+)
    p = (prefix_any * P("p")) / format_p, -- %p : 12.345pt
    P = (prefix_any * P("P")) / format_P, -- %P : 12.345
    b = (prefix_any * P("b")) / format_b, -- %b : 12.342bp
    B = (prefix_any * P("B")) / format_B, -- %B : 12.342
    t = (prefix_tab * P("t")) / format_t, -- %t : concat
    T = (prefix_tab * P("T")) / format_T, -- %T : sequenced
    l = (prefix_any * P("l")) / format_l, -- %l : boolean
    L = (prefix_any * P("L")) / format_L, -- %L : BOOLEAN
    I = (prefix_any * P("I")) / format_I, -- %I : signed integer
    -- spaces
    w = (prefix_any * P("w")) / format_w, -- %w : n spaces (optional prefix is added)
    W = (prefix_any * P("W")) / format_W, -- %W : mandate prefix, no specifier
    -- exponents
    j = (prefix_any * P("j")) / format_j, -- %j : %e (float) stripped exponent (irrational)
    J = (prefix_any * P("J")) / format_J, -- %J : %E (float) stripped exponent (irrational)
    -- formatted numbers
    m = (prefix_any * P("m")) / format_m, -- %m : xxx.xxx.xxx,xx (optional prefix instead of .)
    M = (prefix_any * P("M")) / format_M, -- %M : xxx,xxx,xxx.xx (optional prefix instead of ,)
    -- skipping
    z = (prefix_any * P("z")) / format_z, -- %z : skip n arguments
 -- Z = (prefix_any * P("Z")) / format_Z, -- %Z : optionally strip zeros
    -- quoting
    a = (prefix_any * P("a")) / format_a, -- %a : '...' (forces tostring)
    A = (prefix_any * P("A")) / format_A, -- %A : "..." (forces tostring)
    -- padding
    ["<"] = (prefix_any * P("<")) / format_left,
    [">"] = (prefix_any * P(">")) / format_right,
    -- literal text
    ["*"] = Cs(((1-P("%"))^1 + P("%%")/"%%")^1) / format_rest, -- rest (including %%)
    ["?"] = Cs(((1-P("%"))^1               )^1) / format_rest, -- rest (without %%), not referenced from start
    -- extensions: %!tag!, the extensions table comes in as second extra argument
    ["!"] = Carg(2) * prefix_any * P("!") * C((1-P("!"))^1) * P("!") / format_extension,
}
1298
1299-- We can be clever and only alias what is needed:
1300
-- Two self-filling caches for the two-digit hex formats: a value is
-- formatted on first access and stored for later use.

local xx = setmetatable({ }, {
    __index = function(t,k)
        local v = format("%02x",k)
        t[k] = v
        return v
    end
})

local XX = setmetatable({ }, {
    __index = function(t,k)
        local v = format("%02X",k)
        t[k] = v
        return v
    end
})

-- Formats that bypass the builder completely.

local preset = {
    ["%02x"] = function(b) return xx[b] end,
    ["%02X"] = function(b) return XX[b] end,
}
1308
-- A specification that is just one regular directive (a "%", optional
-- sign/space/period/digit characters and one of the standard letters, then
-- end of string) needs no builder: it is turned into the source of a thin
-- wrapper around string.format, with %0 being the whole matched format.

local direct = (
    P("%")
  * (sign + space + period + digit)^0
  * S("sqidfgGeExXo")
  * endofstring
) / [[local format = string.format return function(str) return format("%0",str) end]]
1312
-- The __index handler of a formatter instance: compiles the specification
-- str into a function, stores it in t and returns it. Preset formats are
-- returned as-is (and not stored). A specification that consumes no
-- arguments results in a function that just returns the specification.
local function make(t,str)
    local known = preset[str]
    if known then
        return known
    end
    local compiled
    local code = lpegmatch(direct,str)
    if code then
     -- print("builder 1 >",code)
        compiled = loadstripped(code)()
    else
        n = 0 -- used in patterns
        code = lpegmatch(builder,str,1,t._connector_,t._extensions_) -- after this we know n
        if n > 0 then
            code = format(template,preamble,t._preamble_,arguments[n],code)
         -- print("builder 2 >",code)
            compiled = loadstripped(code,t._environment_)() -- t._environment is not populated (was experiment)
        else
            compiled = function() return str end
        end
    end
    t[str] = compiled
    return compiled
end
1337
1338-- -- collect periodically
1339--
1340-- local threshold = 1000 -- max nof cached formats
1341--
1342-- local function make(t,str)
1343--     local f = rawget(t,str)
1344--     if f then
1345--         return f
1346--     end
1347--     local parent = t._t_
1348--     if parent._n_ > threshold then
1349--         local m = { _t_ = parent }
1350--         getmetatable(parent).__index = m
1351--         setmetatable(m, { __index = make })
1352--     else
1353--         parent._n_ = parent._n_ + 1
1354--     end
1355--     local f
1356--     local p = lpegmatch(direct,str)
1357--     if p then
1358--         f = loadstripped(p)()
1359--     else
1360--         n = 0
1361--         p = lpegmatch(builder,str,1,"..",parent._extensions_) -- after this we know n
1362--         if n > 0 then
1363--             p = format(template,preamble,parent._preamble_,arguments[n],p)
1364--          -- print("builder>",p)
1365--             f = loadstripped(p)()
1366--         else
1367--             f = function() return str end
1368--         end
1369--     end
1370--     t[str] = f
1371--     return f
1372-- end
1373
-- The __call handler of a formatter instance: looks up (or builds) the
-- formatter for fmt and applies it to the remaining arguments.
local function use(t,fmt,...)
    local formatter = t[fmt]
    return formatter(...)
end
1377
strings.formatters = { }

-- We cannot make these tables weak, unless we start using an indirect table
-- (metatable) in which case we could better keep a count and clear that
-- table when a threshold is reached.

-- Creates a new formatter instance. Indexing the instance with a format
-- specification compiles and caches it (make), calling the instance formats
-- directly (use). The _connector_ is an experiment: with noconcat set the
-- pieces are joined by "," instead of "..".

function strings.formatters.new(noconcat)
    -- better make a copy as we can overload
    local env = { }
    for key, value in next, environment do
        env[key] = value
    end
    local connector = ".."
    if noconcat then
        connector = ","
    end
    return setmetatable({
        _type_        = "formatter",
        _connector_   = connector,
        _extensions_  = { },
        _preamble_    = "",
        _environment_ = env,
    }, {
        __index = make,
        __call  = use,
    })
end
1401
-- The default instance, also made available in the main string namespace.

local formatters  = strings.formatters.new()

string.formatters = formatters

-- Sometimes a nicer name: format str with the default instance.

function string.formatter(str,...)
    return formatters[str](...)
end
1406
-- Registers the extension name in formatter instance t. The template
-- (default "%s") is what %!name! expands to. A string preamble is put in
-- front of the instance preamble (so no overload), a table preamble is
-- merged into the instance environment. Anything that is not a formatter
-- instance is silently ignored.
local function add(t,name,template,preamble)
    if type(t) ~= "table" or t._type_ ~= "formatter" then
        return
    end
    t._extensions_[name] = template or "%s"
    local kind = type(preamble)
    if kind == "string" then
        t._preamble_ = preamble .. "\n" .. t._preamble_ -- so no overload !
    elseif kind == "table" then
        local env = t._environment_
        for key, value in next, preamble do
            env[key] = value
        end
    end
end

strings.formatters.add = add
1421
1422-- registered in the default instance (should we fall back on this one?)
1423
-- Escape patterns, each a substitution capture over the whole string:
--
--   xmlescape : < > & " become entities
--   texescape : # $ % \ { } get a backslash in front
--   luaescape : " becomes \" and a newline becomes \n (backslash plus n)
--   luaquoted : like luaescape but with the result wrapped in double quotes
--
-- The newline replacement used to be \n" (with a stray double quote) which
-- gave an unbalanced, invalid Lua string as soon as a newline showed up.

patterns.xmlescape = Cs((P("<")/"&lt;" + P(">")/"&gt;" + P("&")/"&amp;" + P('"')/"&quot;" + anything)^0)
patterns.texescape = Cs((C(S("#$%\\{}"))/"\\%1" + anything)^0)
patterns.luaescape = Cs(((1-S('"\n'))^1 + P('"')/'\\"' + P('\n')/'\\n')^0) -- maybe also \0
patterns.luaquoted = Cs(Cc('"') * ((1-S('"\n'))^1 + P('"')/'\\"' + P('\n')/'\\n')^0 * Cc('"'))

-- escaping by lpeg is faster for strings without quotes, slower on a string with quotes, but
-- faster again when other q-escapables are found (the ones we don't need to escape)

-- The escapers are registered as extensions in the default instance: %!xml!,
-- %!tex! and %!lua!; the patterns are passed via the instance environment.

add(formatters,"xml",[[lpegmatch(xmlescape,%s)]],{ xmlescape = patterns.xmlescape })
add(formatters,"tex",[[lpegmatch(texescape,%s)]],{ texescape = patterns.texescape })
add(formatters,"lua",[[lpegmatch(luaescape,%s)]],{ luaescape = patterns.luaescape })
1435
1436-- -- yes or no:
1437--
1438-- local function make(t,str)
1439--     local f
1440--     local p = lpegmatch(direct,str)
1441--     if p then
1442--         f = loadstripped(p)()
1443--     else
1444--         n = 0
1445--         p = lpegmatch(builder,str,1,",") -- after this we know n
1446--         if n > 0 then
1447--             p = format(template,template_shortcuts,arguments[n],p)
1448--             f = loadstripped(p)()
1449--         else
1450--             f = function() return str end
1451--         end
1452--     end
1453--     t[str] = f
1454--     return f
1455-- end
1456--
1457-- local formatteds  = string.formatteds or { }
1458-- string.formatteds = formatteds
1459--
1460-- setmetatable(formatteds, { __index = make, __call = use })
1461
-- This is a somewhat silly one used in commandline reconstruction but the
-- older method, using a combination of find, gsub, quoted and unquoted was
-- not that reliable.
--
-- '"foo"bar \"and " whatever"' => "foo\"bar \"and \" whatever"
-- 'foo"bar \"and " whatever'   => "foo\"bar \"and \" whatever"

local dquote = patterns.dquote -- P('"')
local equote = patterns.escaped + dquote / '\\"' + 1
local cquote = Cc('"')

-- first alternative : already quoted, we keep the outer quotes but escape unescaped inner ones
-- second alternative: contains a space, we add outer quotes and escape unescaped ones

local p_optionalquoted =
    Cs(dquote * (equote - P(-2))^0 * dquote)
  + Cs(cquote * (equote - space)^0 * space * equote^0 * cquote)

-- Returns the quoted variant, or str itself when neither alternative matches.

function string.optionalquoted(str)
    local quoted = lpegmatch(p_optionalquoted,str)
    if quoted then
        return quoted
    end
    return str
end
1480
-- Replaces everything that matches the newline pattern by os.newline (or by
-- "\r" when that one is not set); the rest of the string is kept as it is.

local p_replacenewlines = Cs((newline / (os.newline or "\r") + 1)^0)

function string.replacenewlines(str)
    return lpegmatch(p_replacenewlines,str)
end
1486
1487--
1488
-- Returns two closures that share a buffer:
--
--   write(fmt,str,...) : appends fmt as-is when str is nil, otherwise the
--                        result of formatting str,... with fmt
--   flush(connector)   : returns the collected pieces concatenated with
--                        connector and starts with a fresh buffer

function strings.newcollector()
    local collected, size = { }, 0
    local function write(fmt,str,...)
        size = size + 1
        if str == nil and fmt then
            collected[size] = fmt
        else
            collected[size] = formatters[fmt](str,...)
        end
    end
    local function flush(connector)
        if collected then
            local str = concat(collected,connector)
            collected, size = { }, 0
            return str
        end
    end
    return write, flush
end
1504
1505--
1506
-- Formats a value after dividing it by 65536, using %0.5N (so with zeros
-- stripped).

local f_16_16 = formatters["%0.5N"]

function number.to16dot16(n)
    local scaled = n / 65536.0
    return f_16_16(scaled)
end
1512
1513--
1514
-- A fallback for string.explode, only defined when it is not there yet:
--
--   symbol == ""     : split str into utf characters
--   symbol == "c"    : split at each c (only the first utf character of
--                      symbol is used)
--   symbol == "c+"   : split at runs of c
--   symbol == nil    : split at runs of spaces, no empty entries
--
-- The result is a table with the pieces.

if not string.explode then

 -- local tsplitat = lpeg.tsplitat

    local p_utf   = patterns.utf8character
    local p_check = C(p_utf) * (P("+") * Cc(true))^0
    local p_split = Ct(C(p_utf)^0)
    -- the parentheses matter: ^ binds stronger than - so 1-P(" ")^1 would
    -- capture single characters instead of words
    local p_space = Ct((C((1-P(" "))^1) + P(" ")^1)^0)

    function string.explode(str,symbol)
        if symbol == "" then
            return lpegmatch(p_split,str)
        elseif symbol then
            local a, b = lpegmatch(p_check,symbol)
            if b then
                return lpegmatch(tsplitat(P(a)^1),str)
            else
                return lpegmatch(tsplitat(a),str)
            end
        else
            return lpegmatch(p_space,str)
        end
    end

end
1540
1541
do

    -- string.wordsplitter(s) returns a function that splits its argument at
    -- each s that has whitespace at both sides; splitters are created on
    -- demand and cached per separator.

    local p_whitespace = patterns.whitespace^1

    local function newsplitter(t,k)
        local p = tsplitat(p_whitespace * P(k) * p_whitespace)
        local function splitter(s)
            return lpegmatch(p,s)
        end
        t[k] = splitter
        return splitter
    end

    local cache = setmetatable({ }, { __index = newsplitter })

    function string.wordsplitter(s)
        return cache[s]
    end

end
1560
if CONTEXTLMTXMODE and CONTEXTLMTXMODE > 0 then

    -- string.texhashed(s) replaces each of the characters below by a two
    -- character #-code; all other characters are kept.

    local hashed = {
        ["#"]  = "#H",
        ["\n"] = "#L",
        ['"']  = "#Q",
        ["\r"] = "#R",
        [" "]  = "#S",
        ["\t"] = "#T",
        ["\\"] = "#X",
    }

    function string.texhashed(s)
        local result = gsub(s,".",hashed) -- we only want the string, not the count
        return result
    end

end
1578