-- Register this file in the global module registry; the registry itself is
-- created on demand when this happens to be the first file that needs it.
if not modules then
    modules = { }
end

modules['sort-ini'] = {
    version   = 1.001,
    comment   = "companion to sort-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9
10
11
12
52
53local gsub, find, rep, sub, sort, concat, tohash, format = string.gsub, string.find, string.rep, string.sub, table.sort, table.concat, table.tohash, string.format
54local utfbyte, utfchar, utfcharacters = utf.byte, utf.char, utf.characters
55local next, type, tonumber, rawget, rawset = next, type, tonumber, rawget, rawset
56local P, Cs, R, S, lpegmatch, lpegpatterns = lpeg.P, lpeg.Cs, lpeg.R, lpeg.S, lpeg.match, lpeg.patterns
57
58local allocate = utilities.storage.allocate
59local setmetatableindex = table.setmetatableindex
60
-- Tracing switches. Each one starts out disabled and is updated by the
-- callback that is registered under the corresponding tracker name.
local trace_tests        = false
local trace_methods      = false
local trace_orders       = false
local trace_replacements = false

trackers.register("sorters.tests",        function(v) trace_tests        = v end)
trackers.register("sorters.methods",      function(v) trace_methods      = v end)
trackers.register("sorters.orders",       function(v) trace_orders       = v end)
trackers.register("sorters.replacements", function(v) trace_replacements = v end)

-- All messages of this module go through this reporter.
local report_sorters = logs.reporter("languages","sorters")
67
-- Containers that are exported through the sorters namespace further on.
local comparers, splitters = { }, { }
local definitions          = allocate()
local tracers              = allocate()

-- Code point offsets used by this module:
--   replacementoffset : base for the characters made by sorters.replacementlist
--   ignoredoffset     : characters above this value are shown as "[]" by packch
--   digitsoffset      : base for the characters that numify produces for numbers
--   digitsmaximum     : upper limit that numify clamps to
local ignoredoffset, replacementoffset = 0x10000, 0x10000
local digitsoffset,  digitsmaximum     = 0x20000, 0xFFFFF

-- Character property tables provided by the characters namespace.
local lccodes = characters.lccodes
local uccodes = characters.uccodes
local lcchars = characters.lcchars
local ucchars = characters.ucchars
local shchars = characters.shchars
local fscodes = characters.fscodes
local fschars = characters.fschars

local decomposed = characters.decomposed

-- Interface variables that are used as keys and defaults below.
local variables = interfaces.variables

local v_numbers = variables.numbers
local v_default = variables.default
local v_before  = variables.before
local v_after   = variables.after
local v_first   = variables.first
local v_last    = variables.last
95
-- The keys that are accepted in a sort sequence; setlanguage reports every
-- other key as an invalid sorter method.
local validmethods = tohash { "ch", "mm", "zm", "pm", "mc", "zc", "pc", "uc" }

-- Named presets: setlanguage expands a method that matches one of these
-- interface variables into the comma separated sequence given here.
local predefinedmethods = {
    [v_default] = "zc,pc,zm,pm,uc",
    [v_before]  = "mm,mc,uc",
    [v_after]   = "pm,mc,uc",
    [v_first]   = "pc,mm,uc",
    [v_last]    = "mc,mm,uc",
}
114
-- Frozen values and defaults, exported as sorters.constants.
local constants = {
    ignoredoffset     = ignoredoffset,
    replacementoffset = replacementoffset,
    digitsoffset      = digitsoffset,
    digitsmaximum     = digitsmaximum,
    defaultlanguage   = v_default,
    defaultmethod     = v_default,
    defaultdigits     = v_numbers,
    validmethods      = validmethods,
}

-- The public (global) namespace of this module.
sorters = {
    comparers   = comparers,
    splitters   = splitters,
    definitions = definitions,
    tracers     = tracers,
    constants   = constants,
}

-- Local alias so that the rest of the file does not go through the global.
local sorters = sorters
134
-- State of the currently selected sorter. These upvalues are (re)assigned by
-- setlanguage; thefirstofsplit is assigned by preparetables. Each name is
-- declared exactly once: 'method' used to be declared twice, which made the
-- second declaration silently shadow the first one.
local data, language, method, digits
local replacements, m_mappings, z_mappings, p_mappings, entries, orders, lower, upper, sequence, usedinsequence
local thefirstofsplit
138
-- Metatable for the 'entries' table of a language (installed by
-- preparetables). A key that is not present is resolved by trying, in this
-- order:
--
--   1. the lowercase form of the key (language specific 'lower' table first,
--      then lcchars)
--   2. the first character of the shape variant (shchars) of the key
--   3. the lowercase form of that shape character
--
-- When nothing matches the key itself is returned. Nothing is stored in the
-- table, so each miss is resolved again. Nil is returned for a nil or empty
-- key and for characters at or above digitsoffset (the range that numify
-- produces).
local mte = {
    __index = function(t,k)
        if not k or k == "" or utfbyte(k) >= digitsoffset then
            return nil
        end
        local el = rawget(t,lower[k] or lcchars[k])
        if el then
            return el
        end
        local l = shchars[k]
        if l and l ~= k then
            if #l > 1 then
                -- Keep only the first character. This has to be done per utf
                -- character: a byte based sub(l,1,1) cuts a multi-byte
                -- character in half and then never matches an entry.
                for c in utfcharacters(l) do
                    l = c
                    break
                end
            end
            el = rawget(t,l)
            if not el then
                -- Lowercase the shape character itself (the lowercase form
                -- of k was already tried above), as preparetables does.
                l = lower[l] or lcchars[l]
                if l then
                    el = rawget(t,l)
                end
            end
        end
        return el or k
    end
}
170
171local noorder = false
172local nothing = { 0 }
173
-- Append the values of 'ml' to 'n' (that already holds 'nn' values), each one
-- shifted by the __delta of the mapping table 't'. Returns the new count.
local function appendshifted(t,n,nn,ml)
    for i=1,#ml do
        nn = nn + 1
        n[nn] = ml[i] + (t.__delta or 0)
    end
    return nn
end

-- Prepare the lookup tables of one language definition.
--
-- Reads data.orders, data.lower, data.entries and data.firstofsplit. Stores
-- three fresh mapping tables in data.m_mappings, data.z_mappings and
-- data.p_mappings: the character at position i in data.orders maps to
-- { 2 * i } in each of them. The tables get a __delta of -1, 0 and 1 and
-- share a metatable that resolves (and caches) unknown characters on demand.
-- The entries table gets the mte metatable and the module level
-- thefirstofsplit is set to data.firstofsplit.
local function preparetables(data)
    local orders, lower = data.orders, data.lower
    local m_mappings, z_mappings, p_mappings = { }, { }, { }
    for i=1,#orders do
        local oi = orders[i]
        local n = { 2 * i }
        m_mappings[oi], z_mappings[oi], p_mappings[oi] = n, n, n
    end
    local mtm = {
        __index = function(t,k)
            local n
            if k then
                if trace_orders then
                    report_sorters("simplifing character %C",k)
                end
                -- first try the lowercase form, shifted by the delta
                local l = lower[k] or lcchars[k]
                if l then
                    if trace_orders then
                        report_sorters(" 1 lower: %C",l)
                    end
                    local ml = rawget(t,l)
                    if ml then
                        n = { }
                        appendshifted(t,n,0,ml)
                        if trace_orders then
                            report_sorters(" 2 order: % t",n)
                        end
                    end
                end
                if not n then
                    local s = shchars[k]
                    if s and s ~= k then
                        -- next try the characters of the shape variant: a
                        -- known character is taken as is, otherwise its
                        -- lowercase form is used, shifted by the delta
                        if trace_orders then
                            report_sorters(" 3 shape: %C",s)
                        end
                        n = { }
                        local nn = 0
                        for c in utfcharacters(s) do
                            local ml = rawget(t,c)
                            if ml then
                                if trace_orders then
                                    report_sorters(" 4 keep: %C",c)
                                end
                                for i=1,#ml do
                                    nn = nn + 1
                                    n[nn] = ml[i]
                                end
                            else
                                local lc = lower[c] or lcchars[c]
                                if lc then
                                    if trace_orders then
                                        report_sorters(" 5 lower: %C",lc)
                                    end
                                    ml = rawget(t,lc)
                                    if ml then
                                        nn = appendshifted(t,n,nn,ml)
                                    end
                                end
                            end
                        end
                    elseif k == "\000" then
                        -- strip() turns whitespace into "\000"
                        n = nothing
                        if trace_orders then
                            report_sorters(" 6 split: space")
                        end
                    else
                        -- unknown character: place it after all the orders
                        local b = 2 * #orders + utfbyte(k)
                        n = decomposed[b] or { b }
                        if trace_orders then
                            report_sorters(" 6 split: %s",utf.tostring(b))
                        end
                    end
                    if n then
                        if trace_orders then
                            report_sorters(" 7 order: % t",n)
                        end
                    else
                        n = noorder
                        if trace_orders then
                            report_sorters(" 8 order: 0")
                        end
                    end
                end
            else
                n = noorder
                if trace_orders then
                    report_sorters(" 9 order: 0")
                end
            end
            -- cache the result; a nil key cannot be stored (rawset raises
            -- "table index is nil"), so in that case we only return
            if k ~= nil then
                rawset(t,k,n)
            end
            return n
        end
    }
    -- the deltas are set before the metatable is installed, so reading
    -- t.__delta never ends up in the resolver above
    m_mappings.__delta = -1
    z_mappings.__delta =  0
    p_mappings.__delta =  1
    data.m_mappings = m_mappings
    data.z_mappings = z_mappings
    data.p_mappings = p_mappings
    setmetatable(data.entries,mte)
    setmetatable(m_mappings,mtm)
    setmetatable(z_mappings,mtm)
    setmetatable(p_mappings,mtm)
    thefirstofsplit = data.firstofsplit
end
301
-- Walk over all language definitions and (re)establish the parent chain:
-- every definition except "default" falls back on its parent, or on the
-- default definition when that parent is not defined. Each definition also
-- gets its own name, its parent name and three empty mapping tables.
local function update()
    for name, definition in next, definitions do
        local parentname = definition.parent or "default"
        if name ~= "default" then
            local fallback = definitions[parentname] or definitions.default
            setmetatableindex(definition,fallback)
        end
        definition.language   = name
        definition.parent     = parentname
        definition.m_mappings = { }
        definition.z_mappings = { }
        definition.p_mappings = { }
    end
end
315
-- Return the first of 'given' and 'stored' that is neither false, nil nor
-- the empty string, and 'fallback' when neither of them qualifies.
local function firstset(given,stored,fallback)
    if given and given ~= "" then
        return given
    elseif stored and stored ~= "" then
        return stored
    else
        return fallback
    end
end

-- Select the language definition to sort with and set up the module state.
--
--   l : language tag; empty or nil selects the default language
--   m : method, either a preset name or a comma separated sequence; when
--       empty the method of the definition or the default method is used
--   d : digits mode; when empty the setting of the definition or the
--       default is used
--   u : not used
--
-- The definition that is returned has its mapping tables prepared and its
-- method, digits, sequence and usedinsequence fields updated.
local function setlanguage(l,m,d,u)
    if l and l ~= "" then
        language = l
    else
        language = constants.defaultlanguage
    end
    data   = definitions[language or constants.defaultlanguage] or definitions[constants.defaultlanguage]
    method = firstset(m,data.method,constants.defaultmethod)
    digits = firstset(d,data.digits,constants.defaultdigits)
    if trace_tests then
        report_sorters("setting language %a, method %a, digits %a",language,method,digits)
    end
    replacements = data.replacements
    entries      = data.entries
    orders       = data.orders
    lower        = data.lower
    upper        = data.upper
    -- this creates fresh mapping tables in the definition
    preparetables(data)
    m_mappings = data.m_mappings
    z_mappings = data.z_mappings
    p_mappings = data.p_mappings
    -- a preset name is expanded into its sequence
    method      = predefinedmethods[variables[method]] or method
    data.method = method
    data.digits = digits
    -- only valid keys end up in the sequence, the others are reported
    local wanted   = utilities.parsers.settings_to_array(method or "")
    local accepted = { }
    for i=1,#wanted do
        local key = wanted[i]
        if validmethods[key] then
            accepted[#accepted+1] = key
        else
            report_sorters("invalid sorter method %a in %a",key,method)
        end
    end
    sequence            = accepted
    usedinsequence      = tohash(sequence)
    data.sequence       = sequence
    data.usedinsequence = usedinsequence
    if trace_tests then
        report_sorters("using sort sequence: % t",sequence)
    end
    return data
end
361
-- Rebuild the parent chains of all definitions and reselect the current
-- language, method and digits mode. The current digits mode is passed
-- explicitly; before, an undefined global 'numberorder' was passed here.
function sorters.update()
    update()
    setlanguage(language,method,digits)
end
366
-- Public entry point for selecting a sorter: the parent chains of all
-- definitions are rebuilt first, then the given language, method and digits
-- mode (numberorder) are activated. Nothing is returned.
function sorters.setlanguage(language,method,numberorder)
    update()
    setlanguage(language,method,numberorder)
end
371
372
373
374
375
-- Compare two lists of numbers position by position, looking only at the
-- part that both have in common. Returns 1 when the first difference is in
-- favour of sort_a, -1 when it is in favour of sort_b and 0 when the common
-- part is equal or when one of the lists is missing.
local function basicsort(sort_a,sort_b)
    if not sort_a or not sort_b then
        return 0
    end
    local common = #sort_a
    local other  = #sort_b
    if other < common then
        common = other
    end
    for i=1,common do
        local ai, bi = sort_a[i], sort_b[i]
        if ai > bi then
            return 1
        elseif ai < bi then
            return -1
        end
    end
    return 0
end
396
397
398
-- Compare two split results (tables as made by a splitter): the lists named
-- in the current sequence are compared in order and the first difference
-- decides. When they are all equal the one with the longer 'uc' list comes
-- last. A split made in checked mode has no 'uc' list when "uc" is not part
-- of the sequence; a missing list counts as empty here.
local function comparesplit(sa,sb)
    for j=1,#sequence do
        local m = sequence[j]
        local result = basicsort(sa[m],sb[m])
        if result ~= 0 then
            return result
        end
    end
    local ua, ub = sa.uc, sb.uc
    local la = ua and #ua or 0
    local lb = ub and #ub or 0
    if la > lb then
        return 1
    elseif lb > la then
        return -1
    else
        return 0
    end
end

-- The basic comparer. Both a and b need a 'split' field that is either one
-- split result (single word) or a list of them (multiple words).
-- Returns -1, 0 or 1.
local function basic(a,b)
    if a == b then
        -- the very same entry
        return 0
    end
    local ea, eb = a.split, b.split
    local na, nb = #ea, #eb
    if na == 0 and nb == 0 then
        -- single word variant
        return comparesplit(ea,eb)
    end
    -- multiple words: compare word by word over the common part
    for i=1,(nb < na and nb or na) do
        local result = comparesplit(ea[i],eb[i])
        if result ~= 0 then
            return result
        end
    end
    -- all common words are equal: the entry with more words comes last
    if na > nb then
        return 1
    elseif nb > na then
        return -1
    else
        return 0
    end
end
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
-- Make the basic comparer available by name.
comparers.basic = basic

-- A 'less than' predicate on top of the basic comparer, in the form that
-- table.sort expects: true only when a sorts before b.
function sorters.basicsorter(a,b)
    local order = basic(a,b)
    return order == -1
end
503
-- Turn a string of digits into one character: the numeric value is added to
-- digitsoffset and the result is clamped to digitsmaximum. This only happens
-- when the current digits mode is v_numbers, otherwise the string is
-- returned unchanged.
local function numify(old)
    if digits ~= v_numbers then
        return old
    end
    local code = digitsoffset + tonumber(old)
    if code > digitsmaximum then
        code = digitsmaximum
    end
    return utfchar(code)
end
515
-- The pattern used by strip(); it is built by prepare() on first use.
local pattern = nil

-- Build (and remember) the substitution pattern. The alternatives are tried
-- in the order in which they are added up below.
local function prepare()
    local upper = R("AZ")
    -- NOTE(review): presumably this maps tex input sequences onto utf
    -- characters; the pattern is defined elsewhere
    local texinput  = characters.tex.toutfpattern()
    -- whitespace becomes a zero character
    local space     = lpeg.patterns.whitespace / "\000"
    -- \Ux{hex} becomes the character with that (hexadecimal) code
    local hexchar   = (P("\\Ux{") / "")
                    * ((1-P("}"))^1 / function(s) return utfchar(tonumber(s,16)) end)
                    * (P("}") / "")
    -- a backslash followed by uppercase letters only: drop the backslash
    local uppername = (P("\\") / "") * upper^0 * (P(-1) + #(1-upper))
    -- a backslash, one character and any letters following it: drop it all
    local command   = (P("\\") * P(1) * R("az","AZ")^0) / ""
    -- brackets, parentheses, braces, dollars and quotes are dropped
    local delimiter = S("[](){}$\"'") / ""
    -- a run of digits is handed to numify
    local number    = R("09")^1 / numify
    pattern = Cs( (
        texinput
      + space
      + hexchar
      + uppername
      + command
      + delimiter
      + number
      + P(1)
    )^0 )
    return pattern
end
531
-- Clean up a string before it gets split, using the pattern that prepare()
-- builds on first use. A nil or empty argument gives an empty string.
local function strip(str)
    if not str or str == "" then
        return ""
    end
    return lpegmatch(pattern or prepare(),str)
end

sorters.strip = strip
541
-- Determine the first character of an entry and what it is filed under.
-- The character comes from the 'ch' list of the first split result (or of
-- the split itself when that is a single result) and is "" when there is
-- none. When the current language has its own firstofsplit function that
-- one decides what is returned, otherwise we return the character and its
-- value in the entries table ("\000" when that has none).
local function firstofsplit(entry)
    local split = entry.split
    local chars
    if #split > 0 then
        chars = split[1].ch
    else
        chars = split.ch
    end
    local first = chars and chars[1] or ""
    if not thefirstofsplit then
        return first, entries[first] or "\000"
    end
    return thefirstofsplit(first,data,entry)
end

sorters.firstofsplit = firstofsplit
559
560
561
562
-- Apply the replacements of the current language to a string. The lpeg
-- replacer is built from the (stripped) replacement pairs on first use and
-- kept in replacements.replacer.
local function applyreplacements(str)
    local replacer = replacements.replacer
    if not replacer then
        local map = { }
        for i=1,#replacements do
            local pair = replacements[i]
            map[strip(pair[1])] = strip(pair[2])
        end
        replacer = lpeg.utfchartabletopattern(map)
        replacer = Cs((replacer/map + lpegpatterns.utf8character)^0)
        replacements.replacer = replacer
    end
    local new = lpegmatch(replacer,str)
    if new and new ~= str then
        if trace_replacements then
            report_sorters("original : %s",str)
            report_sorters("replacement: %s",new)
        end
        return new
    end
    return str
end

-- Append the order values that 'mappings' has for character 'sc' to
-- 'target' (that already holds 'count' values) and return the new count.
-- Nothing is added when the mapping is 'noorder'; when there is no mapping
-- at all the one for 'fs' is used instead.
local function appendorder(target,count,mappings,sc,fs)
    local order = mappings[sc]
    if order ~= noorder then
        if not order then
            order = mappings[fs]
        end
        for i=1,#order do
            count = count + 1
            target[count] = order[i]
        end
    end
    return count
end

-- Split a (stripped) string into the lists that the comparers work on.
--
-- The result is a table with these fields, all lists:
--
--   ch         : the characters
--   uc         : their code points
--   zc, mc, pc : lowercased code points; in mc and pc a character that
--                differs from its lowercase form gets that value minus
--                and plus one
--   zm, mm, pm : the order values from the z, m and p mapping tables
--
-- When 'checked' is set only the lists that are used in the current
-- sequence are kept, and 'ch' only when sorters.tests is being traced.
function splitters.utf(str,checked)
    if #replacements > 0 then
        str = applyreplacements(str)
    end
    local m_case,    z_case,    p_case    = { }, { }, { }
    local m_mapping, z_mapping, p_mapping = { }, { }, { }
    local char, byte = { }, { }
    local n, nm, nz, np = 0, 0, 0, 0
    for sc in utfcharacters(str) do
        local b = utfbyte(sc)
        if b >= digitsoffset then
            -- a number as made by numify
            if n == 0 then
                -- a leading number gets a zero in front of it
                z_case[1], m_case[1], p_case[1] = 0, 0, 0
                char[1] = sc
                byte[1] = 0
                -- NOTE(review): nm, nz and np are still 0 at this point, so
                -- these three zeros are overwritten a few lines down;
                -- confirm that this is intended
                m_mapping[1], z_mapping[1], p_mapping[1] = 0, 0, 0
                n = 2
            else
                n = n + 1
            end
            z_case[n], m_case[n], p_case[n] = b, b, b
            char[n] = sc
            byte[n] = b
            nm, nz, np = nm + 1, nz + 1, np + 1
            m_mapping[nm] = b
            z_mapping[nz] = b
            p_mapping[np] = b
        else
            n = n + 1
            local l = lower[sc]
            l = l and utfbyte(l) or lccodes[b] or b
            if type(l) == "table" then
                -- only the first code of a multi code lowercase is used
                l = l[1]
            end
            z_case[n] = l
            if l == b then
                m_case[n] = l
                p_case[n] = l
            else
                m_case[n] = l - 1
                p_case[n] = l + 1
            end
            char[n], byte[n] = sc, b
            local fs = fscodes[b] or b
            nm = appendorder(m_mapping,nm,m_mappings,sc,fs)
            nz = appendorder(z_mapping,nz,z_mappings,sc,fs)
            np = appendorder(p_mapping,np,p_mappings,sc,fs)
        end
    end
    if not checked then
        return {
            ch = char,
            uc = byte,
            mc = m_case,
            zc = z_case,
            pc = p_case,
            mm = m_mapping,
            zm = z_mapping,
            pm = p_mapping,
        }
    end
    local used = usedinsequence
    return {
        ch = trace_tests and char or nil,
        uc = used.uc and byte or nil,
        mc = used.mc and m_case or nil,
        zc = used.zc and z_case or nil,
        pc = used.pc and p_case or nil,
        mm = used.mm and m_mapping or nil,
        zm = used.zm and z_mapping or nil,
        pm = used.pm and p_mapping or nil,
    }
end
738
-- Turn a list of characters into one string for tracing: a character above
-- ignoredoffset is shown as "[]" and a zero character as a space.
local function packchars(ch)
    local shown = { }
    for i=1,#ch do
        local chr = ch[i]
        local byt = utfbyte(chr)
        if byt > ignoredoffset then
            shown[i] = "[]"
        elseif byt == 0 then
            shown[i] = " "
        else
            shown[i] = chr
        end
    end
    return concat(shown)
end

-- Give a readable version of the characters of an entry (used in traces).
-- With a multi word split the words are separated by " + ". Otherwise the
-- 'ch' list of the split or of the entry is used and, when there is neither,
-- the entry itself is taken as the list.
local function packch(entry)
    local split = entry.split
    if not split or #split == 0 then
        local ch = (split and split.ch) or entry.ch or entry
        if not ch then
            return ""
        end
        return packchars(ch)
    end
    local words = { }
    for i=1,#split do
        words[i] = packchars(split[i].ch)
    end
    return concat(words," + ")
end
781
-- Give a readable version of the code points of an entry (used in traces):
-- the numbers of a word are separated by spaces and, with a multi word
-- split, the words by " + ". Without a multi word split the 'uc' list of the
-- split or of the entry is used and, when there is neither, the entry
-- itself is taken as the list.
local function packuc(entry)
    local split = entry.split
    if not split or #split == 0 then
        local uc = (split and split.uc) or entry.uc or entry
        if not uc then
            return ""
        end
        return concat(uc," ")
    end
    local words = { }
    for i=1,#split do
        words[i] = concat(split[i].uc," ")
    end
    return concat(words," + ")
end
799
sorters.packch = packch
sorters.packuc = packuc

-- Sort a list of entries in place. The comparer 'cmp' gets two entries and
-- an entry sorts before another one when it returns -1. With the
-- sorters.methods tracker enabled the entries are reported beforehand; with
-- sorters.tests enabled every comparison is reported, as is the sorted
-- result.
function sorters.sort(entries,cmp)
    if trace_methods then
        local nofentries = #entries
        report_sorters("entries: %s, language: %s, method: %s, digits: %s",nofentries,language,method,tostring(digits))
        for i=1,nofentries do
            report_sorters("entry %s",table.serialize(entries[i].split,i,true,true,true))
        end
    end
    if not trace_tests then
        sort(entries,function(a,b)
            return cmp(a,b) == -1
        end)
        return
    end
    sort(entries,function(a,b)
        local r = cmp(a,b)
        -- the relation as a symbol, "?" when the comparer gave no result
        local e
        if not r then
            e = "?"
        elseif r < 0 then
            e = "<"
        elseif r > 0 then
            e = ">"
        else
            e = "="
        end
        report_sorters("%s %s %s | %s %s %s",packch(a),e,packch(b),packuc(a),e,packuc(b))
        return r == -1
    end)
    -- report the result, with a header line whenever the value that
    -- firstofsplit gives for an entry changes
    local current
    for i=1,#entries do
        local entry = entries[i]
        local letter, first = firstofsplit(entry)
        if first ~= current then
            current = first
            if first and letter then
                report_sorters(">> %C (%C)",first,letter)
            end
        end
        report_sorters(" %s | %s",packch(entry),packuc(entry))
    end
end
838
839
840
-- Make a list of replacement pairs out of a list of strings: the string at
-- position i is paired with the character that has code point
-- replacementoffset + i.
function sorters.replacementlist(list)
    local pairs_ = { }
    for i=1,#list do
        local original  = list[i]
        local character = utfchar(replacementoffset+i)
        pairs_[i] = { original, character }
    end
    return pairs_
end
851 |