-- Make sure the global module registry exists and file this module's
-- metadata under its name.
modules = modules or { }

modules['sort-ini'] = {
    version   = 1.001,
    comment   = "companion to sort-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49local gsub, find, rep, sub, sort, concat, tohash, format = string.gsub, string.find, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format
50local utfbyte, utfchar, utfcharacters = utf.byte, utf.char, utf.characters
51local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
52local P, Cs, R, S, lpegmatch, lpegpatterns = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match, lpeg.patterns
53
54local allocate = utilities.storage.allocate
55local setmetatableindex = table.setmetatableindex
56
-- Tracing switches. Each one starts out disabled and is set by the callback
-- that is registered for the tracker of the same name.
local trace_tests        = false
local trace_methods      = false
local trace_orders       = false
local trace_replacements = false

trackers.register("sorters.tests",        function(v) trace_tests        = v end)
trackers.register("sorters.methods",      function(v) trace_methods      = v end)
trackers.register("sorters.orders",       function(v) trace_orders       = v end)
trackers.register("sorters.replacements", function(v) trace_replacements = v end)

-- Reporter used for every message that this module writes.
local report_sorters = logs.reporter("languages","sorters")
63
-- Registries that are exported through the sorters table.
local comparers   = { }
local splitters   = { }
local definitions = allocate()
local tracers     = allocate()

-- Code point thresholds: packch shows characters above ignoredoffset as "[]",
-- sorters.replacementlist allocates placeholders from replacementoffset on,
-- and numify maps digit runs into digitsoffset .. digitsmaximum.
local ignoredoffset, replacementoffset = 0x10000, 0x10000
local digitsoffset,  digitsmaximum     = 0x20000, 0xFFFFF

-- Character property tables provided by the characters namespace.
local lccodes, uccodes = characters.lccodes, characters.uccodes
local lcchars, ucchars = characters.lcchars, characters.ucchars
local shchars          = characters.shchars
local fscodes, fschars = characters.fscodes, characters.fschars

local decomposed = characters.decomposed

-- Interface variables and the values of them that are used in this file.
local variables = interfaces.variables

local v_numbers, v_default = variables.numbers, variables.default
local v_before,  v_after   = variables.before,  variables.after
local v_first,   v_last    = variables.first,   variables.last
91
-- The tags that may occur in a sort sequence. Each tag is also the name of a
-- field in the tables that splitters.utf returns.
local validmethods = tohash { "ch", "mm", "zm", "pm", "mc", "zc", "pc", "uc" }

-- Named presets; setlanguage replaces such a name by the sequence listed here.
local predefinedmethods = {
    [v_default] = "zc,pc,zm,pm,uc",
    [v_before]  = "mm,mc,uc",
    [v_after]   = "pm,mc,uc",
    [v_first]   = "pc,mm,uc",
    [v_last]    = "mc,mm,uc",
}
110
-- The public namespace (a global): the registries plus the constants that
-- describe the offsets and defaults used by this module.
sorters = {
    comparers   = comparers,
    splitters   = splitters,
    definitions = definitions,
    tracers     = tracers,
    constants   = {
        ignoredoffset     = ignoredoffset,
        replacementoffset = replacementoffset,
        digitsoffset      = digitsoffset,
        digitsmaximum     = digitsmaximum,
        defaultlanguage   = v_default,
        defaultmethod     = v_default,
        defaultdigits     = v_numbers,
        validmethods      = validmethods,
    }
}

local sorters   = sorters
local constants = sorters.constants

-- State of the currently selected language; all of these are (re)assigned by
-- setlanguage. The variable 'method' is declared only once here: a second
-- declaration would merely shadow the first one and occupy an extra local slot.
local data, language, method, digits
local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, sequence, usedinsequence

-- Optional hook; preparetables copies it from data.firstofsplit.
local thefirstofsplit
134
-- Metatable that preparetables installs on the 'entries' table of a language.
-- When a character has no entry of its own the lookup tries, in this order:
--
--   1. the lowercase form of the character,
--   2. the first character of its shape (shchars),
--   3. the lowercase form related to that shape character,
--
-- and finally falls back on the character itself. Nothing is cached. For a
-- nil or empty key, and for characters at or above digitsoffset, nil is
-- returned.
local mte = {
    __index = function(t,k)
        if not k or k == "" or utfbyte(k) >= digitsoffset then
            return nil
        end
        local el = rawget(t,lower[k] or lcchars[k])
        if el then
            return el
        end
        local s = shchars[k]
        if s and s ~= k then
            if #s > 1 then
                -- Keep the first character only. The length operator counts
                -- bytes, so we must not cut with sub(s,1,1): that would leave
                -- a lone lead byte when the shape is a multi-byte character.
                for c in utfcharacters(s) do
                    s = c
                    break
                end
            end
            el = rawget(t,s)
            if not el then
                -- NOTE(review): lower[k] was already tried above, so only the
                -- lcchars[s] alternative can yield something new here; maybe
                -- lower[s] was meant (kept as is).
                local l = lower[k] or lcchars[s]
                if l then
                    el = rawget(t,l)
                end
            end
        end
        return el or k
    end
}
166
local noorder = false -- order of a key that takes no part in sorting
local nothing = { 0 } -- order shared by every "\000" (what strip puts in place of whitespace)

-- Builds the three mapping tables (m_, z_ and p_mappings) for a language
-- definition and stores them in 'data'. It also installs the entries
-- metatable and picks up the optional data.firstofsplit hook.
--
-- Every character in data.orders gets the order { 2 * position }. Orders of
-- other characters are resolved on demand by the __index function below and
-- cached with rawset. The tables differ only in their __delta field (-1, 0
-- and 1), which is added whenever an order is borrowed from a lowercase form.
local function preparetables(data)
    local orders, lower = data.orders, data.lower
    local m_mappings, z_mappings, p_mappings = { }, { }, { }
    for i=1,#orders do
        local oi = orders[i]
        local n = { 2 * i }
        m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n
    end
    local mtm = {
        __index = function(t,k)
            local n, nn
            if k then
                if trace_orders then
                    report_sorters("simplifing character %C",k)
                end
                -- first try the lowercase form
                local l = lower[k] or lcchars[k]
                if l then
                    if trace_orders then
                        report_sorters(" 1 lower: %C",l)
                    end
                    local ml = rawget(t,l)
                    if ml then
                        n = { }
                        nn = 0
                        for i=1,#ml do
                            nn = nn + 1
                            n[nn] = ml[i] + (t.__delta or 0)
                        end
                        if trace_orders then
                            report_sorters(" 2 order: % t",n)
                        end
                    end
                end
                if not n then
                    local s = shchars[k]
                    if s and s ~= k then
                        -- compose the order from the characters of the shape
                        if trace_orders then
                            report_sorters(" 3 shape: %C",s)
                        end
                        n = { }
                        nn = 0
                        for l in utfcharacters(s) do
                            local ml = rawget(t,l)
                            if ml then
                                if trace_orders then
                                    report_sorters(" 4 keep: %C",l)
                                end
                                for i=1,#ml do
                                    nn = nn + 1
                                    n[nn] = ml[i]
                                end
                            else
                                l = lower[l] or lcchars[l]
                                if l then
                                    if trace_orders then
                                        report_sorters(" 5 lower: %C",l)
                                    end
                                    local ml = rawget(t,l)
                                    if ml then
                                        for i=1,#ml do
                                            nn = nn + 1
                                            n[nn] = ml[i] + (t.__delta or 0)
                                        end
                                    end
                                end
                            end
                        end
                    else
                        if k == "\000" then
                            n = nothing
                            if trace_orders then
                                report_sorters(" 6 split: space")
                            end
                        else
                            -- unknown characters end up after the explicitly ordered ones
                            local b = 2 * #orders + utfbyte(k)
                            n = decomposed[b] or { b }
                            if trace_orders then
                                report_sorters(" 6 split: %s",utf.tostring(b))
                            end
                        end
                    end
                    if n then
                        if trace_orders then
                            report_sorters(" 7 order: % t",n)
                        end
                    else
                        n = noorder
                        if trace_orders then
                            report_sorters(" 8 order: 0")
                        end
                    end
                end
            else
                n = noorder
                if trace_orders then
                    report_sorters(" 9 order: 0")
                end
            end
            -- a nil key cannot be stored: rawset would raise "table index is nil"
            if k ~= nil then
                rawset(t,k,n)
            end
            return n
        end
    }
    data.m_mappings = m_mappings
    data.z_mappings = z_mappings
    data.p_mappings = p_mappings
    m_mappings.__delta = -1
    z_mappings.__delta = 0
    p_mappings.__delta = 1
    setmetatable(data.entries,mte)
    setmetatable(data.m_mappings,mtm)
    setmetatable(data.z_mappings,mtm)
    setmetatable(data.p_mappings,mtm)
    thefirstofsplit = data.firstofsplit
end
297
-- Walks over all language definitions: every definition except "default"
-- inherits from its parent (or from the default definition when the parent
-- is unknown), gets its language and parent fields filled in, and starts
-- with fresh, empty mapping tables.
local function update()
    for name, definition in next, definitions do
        local parent = definition.parent or "default"
        if name ~= "default" then
            local ancestor = definitions[parent] or definitions.default
            setmetatableindex(definition,ancestor)
        end
        definition.language   = name
        definition.parent     = parent
        definition.m_mappings = { }
        definition.z_mappings = { }
        definition.p_mappings = { }
    end
end
311
-- Selects the language definition to sort with and refreshes all file level
-- state that depends on it.
--
--   l : language tag; empty or nil selects the default language
--   m : method (preset name or comma separated sequence); empty or nil falls
--       back on the method of the definition, then on the default method
--   d : digits mode, resolved the same way as the method
--   u : not used
--
-- The resolved method and digits are stored in the definition, as are the
-- validated sequence and its hash. The selected definition is returned.
local function setlanguage(l,m,d,u)
    local fallback = constants.defaultlanguage
    if l and l ~= "" then
        language = l
    else
        language = fallback
    end
    data = definitions[language or fallback] or definitions[fallback]
    if m and m ~= "" then
        method = m
    else
        local stored = data.method
        if stored and stored ~= "" then
            method = stored
        else
            method = constants.defaultmethod
        end
    end
    if d and d ~= "" then
        digits = d
    else
        local stored = data.digits
        if stored and stored ~= "" then
            digits = stored
        else
            digits = constants.defaultdigits
        end
    end
    if trace_tests then
        report_sorters("setting language %a, method %a, digits %a",language,method,digits)
    end
    replacements = data.replacements
    entries      = data.entries
    orders       = data.orders
    lower        = data.lower
    upper        = data.upper
    -- the mapping tables are rebuilt by preparetables, so fetch them afterwards
    preparetables(data)
    m_mappings   = data.m_mappings
    z_mappings   = data.z_mappings
    p_mappings   = data.p_mappings
    -- a preset name is replaced by the sequence that it stands for
    method       = predefinedmethods[variables[method]] or method
    data.method  = method
    data.digits  = digits
    -- only valid method tags make it into the sequence
    local wanted = utilities.parsers.settings_to_array(method or "")
    sequence = { }
    for i=1,#wanted do
        local tag = wanted[i]
        if validmethods[tag] then
            sequence[#sequence+1] = tag
        else
            report_sorters("invalid sorter method %a in %a",tag,method)
        end
    end
    usedinsequence      = tohash(sequence)
    data.sequence       = sequence
    data.usedinsequence = usedinsequence
    if trace_tests then
        report_sorters("using sort sequence: % t",sequence)
    end
    return data
end
357
-- Refreshes the parent chains of all definitions and then selects the current
-- language, method and digits mode again. The current digits mode is passed
-- explicitly; passing an undefined (nil) value instead would let setlanguage
-- fall back on whatever the definition happens to have stored.
function sorters.update()
    update()
    setlanguage(language,method,digits)
end
362
-- Public entry point for selecting a language: first refreshes the parent
-- chains of all definitions (update), then selects the given language,
-- method and digits mode (passed here as numberorder). Returns nothing.
function sorters.setlanguage(language,method,numberorder)
    update()
    setlanguage(language,method,numberorder)
end
367
368
369
370
371
-- Compares two arrays element by element over the length of the shorter one.
-- Returns 1 or -1 at the first position where the elements differ and 0
-- when no difference is found or when one of the arrays is missing.
local function basicsort(sort_a,sort_b)
    if not sort_a or not sort_b then
        return 0
    end
    local shared = #sort_a
    local other  = #sort_b
    if other < shared then
        shared = other
    end
    local i = 1
    while i <= shared do
        local ai, bi = sort_a[i], sort_b[i]
        if ai > bi then
            return 1
        end
        if ai < bi then
            return -1
        end
        i = i + 1
    end
    return 0
end
392
393
394
-- Compares two split results: the arrays named in the current sequence are
-- compared in order with basicsort and the first difference decides. When
-- they all compare equal the result with the longer uc array sorts last. The
-- uc arrays can be absent (splitters.utf leaves them out in checked mode when
-- "uc" is not part of the sequence); the splits are then considered equal.
local function comparesplit(sa,sb)
    for j=1,#sequence do
        local m = sequence[j]
        local result = basicsort(sa[m],sb[m])
        if result ~= 0 then
            return result
        end
    end
    local ua, ub = sa.uc, sb.uc
    if ua and ub then
        local la, lb = #ua, #ub
        if la > lb then
            return 1
        elseif lb > la then
            return -1
        end
    end
    return 0
end

-- The default comparer. Both arguments are entries with a 'split' field that
-- is either one split result (flat) or an array of split results (nested).
-- Returns -1, 0 or 1.
local function basic(a,b)
    if a == b then
        -- the very same entry
        return 0
    end
    local ea, eb = a.split, b.split
    local na, nb = #ea, #eb
    if na == 0 and nb == 0 then
        -- flat: there is just one split on each side
        return comparesplit(ea,eb)
    end
    -- nested: compare pairwise as far as both sides have parts
    for i=1,nb < na and nb or na do
        local result = comparesplit(ea[i],eb[i])
        if result ~= 0 then
            return result
        end
    end
    -- all shared parts are equal, so the entry with more parts sorts last
    if na > nb then
        return 1
    elseif nb > na then
        return -1
    else
        return 0
    end
end
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
-- Register the default comparer under the name "basic".
comparers.basic = basic

-- Boolean variant of the basic comparer: true only when a sorts before b
-- (that is, when basic returns -1).
function sorters.basicsorter(a,b)
    return basic(a,b) == -1
end
499
-- Turns a string of digits into one character: the numeric value is added to
-- digitsoffset and capped at digitsmaximum. This only happens when the
-- current digits mode equals v_numbers; otherwise the string is returned
-- untouched.
local function numify(old)
    if digits ~= v_numbers then
        return old
    end
    local new = digitsoffset + tonumber(old)
    return utfchar(new > digitsmaximum and digitsmaximum or new)
end
511
-- The substitution pattern used by strip; it is built on first use.
local pattern = nil

-- Builds (and remembers) the pattern. The alternatives are tried in the order
-- in which they are combined below; whatever none of them catches is copied
-- as is.
local function prepare()
    -- whatever characters.tex.toutfpattern() handles
    local texinput   = characters.tex.toutfpattern()
    -- whitespace becomes the "\000" placeholder
    local space      = lpeg.patterns.whitespace / "\000"
    -- \Ux{hex} becomes the character with that hexadecimal code
    local hexchar    = (P("\\Ux{") / "") * ((1-P("}"))^1 / function(s) return utfchar(tonumber(s,16)) end) * (P("}") / "")
    -- backslash plus uppercase letters only: drop the backslash, keep the letters
    local uppercase  = (P("\\") / "") * R("AZ")^0 * (P(-1) + #(1-R("AZ")))
    -- any other backslash sequence (one character plus following letters) is dropped
    local command    = (P("\\") * P(1) * R("az","AZ")^0) / ""
    -- brackets, parentheses, braces, dollars and quotes are dropped
    local delimiters = S("[](){}$\"'") / ""
    -- runs of digits are handed over to numify
    local number     = R("09")^1 / numify
    pattern = Cs((texinput + space + hexchar + uppercase + command + delimiters + number + P(1))^0)
    return pattern
end
527
-- Runs a string through the substitution pattern (which is built on demand
-- by prepare). A nil or empty argument gives the empty string.
local function strip(str)
    if not str or str == "" then
        return ""
    end
    return lpegmatch(pattern or prepare(),str)
end
535
536sorters.strip = strip
537
-- Returns the first character of an entry (taken from the ch array of its
-- first, or only, split) followed by the matching value in the current
-- entries table, or "\000" when there is none. When the language provides a
-- firstofsplit hook its results are returned instead.
local function firstofsplit(entry)
    local split = entry.split
    local chars
    if #split > 0 then
        chars = split[1].ch -- nested
    else
        chars = split.ch    -- flat
    end
    local first = chars and chars[1] or ""
    if not thefirstofsplit then
        return first, entries[first] or "\000"
    end
    return thefirstofsplit(first,data,entry)
end
553
554sorters.firstofsplit = firstofsplit
555
556
557
558
-- Looks up the order of character sc in 'mappings' and appends it to
-- 'target', which holds 'count' values so far. Nothing is appended when the
-- order is noorder. Returns the new count.
local function appendorder(target,count,mappings,sc,fs)
    local order = mappings[sc]
    if order ~= noorder then
        if not order then
            order = mappings[fs]
        end
        for i=1,#order do
            count = count + 1
            target[count] = order[i]
        end
    end
    return count
end

-- Splits a string into the arrays that the comparers work on. The result is
-- a table with these fields:
--
--   ch : the characters                uc : their code points
--   mc, zc, pc : case arrays (lowercase code point, with -1 / 0 / +1 applied
--                when the character differs from its lowercase form)
--   mm, zm, pm : the orders taken from the m_, z_ and p_mappings tables
--
-- When 'checked' is set only the arrays used in the current sequence are
-- returned, and ch only when sorters.tests is being traced.
--
-- When the language has replacements these are applied first. Characters at
-- or above digitsoffset (what numify produces) keep their code point in all
-- arrays.
function splitters.utf(str,checked)
    local nofreplacements = #replacements
    if nofreplacements > 0 then
        -- the replacer is built once and kept in the replacements table
        local replacer = replacements.replacer
        if not replacer then
            local rep = { }
            for i=1,nofreplacements do
                local r = replacements[i]
                rep[strip(r[1])] = strip(r[2])
            end
            replacer = lpeg.utfchartabletopattern(rep)
            replacer = Cs((replacer/rep + lpegpatterns.utf8character)^0)
            replacements.replacer = replacer
        end
        local rep = lpegmatch(replacer,str)
        if rep and rep ~= str then
            if trace_replacements then
                report_sorters("original   : %s",str)
                report_sorters("replacement: %s",rep)
            end
            str = rep
        end
    end
    local m_case, z_case, p_case = { }, { }, { }
    local m_mapping, z_mapping, p_mapping = { }, { }, { }
    local char, byte = { }, { }
    local n, nm, nz, np = 0, 0, 0, 0
    for sc in utfcharacters(str) do
        local b = utfbyte(sc)
        if b >= digitsoffset then
            if n == 0 then
                -- A number at the start is preceded by a zero so that it
                -- sorts first. The mapping counters have to advance as well,
                -- otherwise the zero in the mapping arrays is overwritten
                -- right away by the code point stored below.
                z_case[1], m_case[1], p_case[1] = 0, 0, 0
                char[1] = sc
                byte[1] = 0
                m_mapping[1], z_mapping[1], p_mapping[1] = 0, 0, 0
                nm, nz, np = 1, 1, 1
                n = 2
            else
                n = n + 1
            end
            z_case[n] = b
            m_case[n] = b
            p_case[n] = b
            char[n] = sc
            byte[n] = b
            nm = nm + 1
            nz = nz + 1
            np = np + 1
            m_mapping[nm] = b
            z_mapping[nz] = b
            p_mapping[np] = b
        else
            n = n + 1
            local l = lower[sc]
            l = l and utfbyte(l) or lccodes[b] or b
            -- lccodes can hold a table; the first value is used then
            if type(l) == "table" then
                l = l[1]
            end
            z_case[n] = l
            if l ~= b then
                m_case[n] = l - 1
                p_case[n] = l + 1
            else
                m_case[n] = l
                p_case[n] = l
            end
            char[n], byte[n] = sc, b
            local fs = fscodes[b] or b
            nm = appendorder(m_mapping,nm,m_mappings,sc,fs)
            nz = appendorder(z_mapping,nz,z_mappings,sc,fs)
            np = appendorder(p_mapping,np,p_mappings,sc,fs)
        end
    end
    local result
    if checked then
        result = {
            ch = trace_tests       and char      or nil,
            uc = usedinsequence.uc and byte      or nil,
            mc = usedinsequence.mc and m_case    or nil,
            zc = usedinsequence.zc and z_case    or nil,
            pc = usedinsequence.pc and p_case    or nil,
            mm = usedinsequence.mm and m_mapping or nil,
            zm = usedinsequence.zm and z_mapping or nil,
            pm = usedinsequence.pm and p_mapping or nil,
        }
    else
        result = {
            ch = char,
            uc = byte,
            mc = m_case,
            zc = z_case,
            pc = p_case,
            mm = m_mapping,
            zm = z_mapping,
            pm = p_mapping,
        }
    end
    return result
end
734
-- Joins an array of characters into one string for display: characters
-- above ignoredoffset are shown as "[]" and the character with code 0 as a
-- space.
local function visiblechars(ch)
    local shown = { }
    for i=1,#ch do
        local chr = ch[i]
        local byt = utfbyte(chr)
        if byt > ignoredoffset then
            shown[i] = "[]"
        elseif byt == 0 then
            shown[i] = " "
        else
            shown[i] = chr
        end
    end
    return concat(shown)
end

-- Returns a readable version of the characters of an entry. The parts of a
-- nested split are separated by " + ". Without a nested split the characters
-- come from split.ch, entry.ch or the entry itself, whichever is found first.
local function packch(entry)
    local split = entry.split
    if split and #split > 0 then
        local parts = { }
        for i=1,#split do
            parts[i] = visiblechars(split[i].ch)
        end
        return concat(parts," + ")
    end
    local ch = (split and split.ch) or entry.ch or entry
    if ch then
        return visiblechars(ch)
    end
    return ""
end
777
-- Returns the code points of an entry as a space separated string. The
-- parts of a nested split are separated by " + ". Without a nested split
-- the values come from split.uc, entry.uc or the entry itself, whichever is
-- found first.
local function packuc(entry)
    local split = entry.split
    if not split or #split == 0 then
        local uc = (split and split.uc) or entry.uc or entry
        if not uc then
            return ""
        end
        return concat(uc," ")
    end
    local parts = { }
    for i=1,#split do
        parts[i] = concat(split[i].uc, " ")
    end
    return concat(parts," + ")
end
795
796sorters.packch = packch
797sorters.packuc = packuc
798
-- Sorts the entries in place; 'cmp' is a comparer that returns -1, 0 or 1
-- and a sorts before b when it returns -1.
--
-- With the sorters.methods tracker enabled the settings and the split of
-- every entry are reported first. With the sorters.tests tracker enabled
-- every comparison is reported, followed by the sorted list in which each
-- change of first letter is marked.
function sorters.sort(entries,cmp)
    if trace_methods then
        local nofentries = #entries
        report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits))
        for i=1,nofentries do
            report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true))
        end
    end
    if not trace_tests then
        sort(entries,function(a,b)
            return cmp(a,b) == -1
        end)
        return
    end
    local function traced(a,b)
        local r = cmp(a,b)
        local e
        if not r then
            e = "?"
        elseif r < 0 then
            e = "<"
        elseif r > 0 then
            e = ">"
        else
            e = "="
        end
        report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b))
        return r == -1
    end
    sort(entries,traced)
    local previous
    for i=1,#entries do
        local entry = entries[i]
        local letter, first = firstofsplit(entry)
        if first ~= previous then
            -- a new group starts here
            previous = first
            if first and letter then
                report_sorters(">> %C (%C)",first,letter)
            end
        end
        report_sorters("   %s | %s",packch(entry),packuc(entry))
    end
end
834
835
836
-- Turns a list of strings into a list of replacement pairs: the i-th string
-- is paired with the character at code point replacementoffset + i.
function sorters.replacementlist(list)
    local pairs_ = { }
    for index=1,#list do
        local original    = list[index]
        local placeholder = utfchar(replacementoffset+index)
        pairs_[index] = { original, placeholder }
    end
    return pairs_
end
847 |