-- Register this file in the global module list; the list itself is created
-- on demand when no earlier file has done so.
modules = modules or { }

modules['char-tex'] = {
    version   = 1.001,
    comment   = "companion to char-ini.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9local lpeg = lpeg
10local tonumber , next , type = tonumber , next , type
11local format , find , gmatch , match = string . format , string . find , string . gmatch , string . match
12local utfchar , utfbyte = utf . char , utf . byte
13local concat , tohash = table . concat , table . tohash
14local P , C , R , S , V , Cs , Cc = lpeg . P , lpeg . C , lpeg . R , lpeg . S , lpeg . V , lpeg . Cs , lpeg . Cc
15
16local lpegpatterns = lpeg . patterns
17local lpegmatch = lpeg . match
18local utfchartabletopattern = lpeg . utfchartabletopattern
19
20local allocate = utilities . storage . allocate
21local mark = utilities . storage . mark
22
23local context = context
24local commands = commands
25
26local characters = characters
27local texcharacters = { }
28characters . tex = texcharacters
29local utffilters = characters . filters . utf
30
31local is_character = characters . is_character
32local is_letter = characters . is_letter
33local is_command = characters . is_command
34local is_spacing = characters . is_spacing
35local is_mark = characters . is_mark
36local is_punctuation = characters . is_punctuation
37
38local data = characters . data if not data then return end
39local blocks = characters . blocks
40
-- Tracing flag consulted by the catcode setters further down. The tracker
-- callback has to assign this local: writing to any other name leaves the
-- flag permanently false and leaks a global instead.
local trace_defining = false

trackers.register("characters.defining", function(v) trace_defining = v end)
42
43local report_defining = logs . reporter ( " characters " )
44
45
56
-- Three lookup tables for the characters listed in `special`:
--
--   escapes : character -> backslash followed by that character
--   low     : character -> the character shifted up by 0x0F0000
--   high    : shifted character -> original character
--
-- They are published as utffilters.private and feed the replacers and
-- remappers that are created right after this block.
local low     = allocate()
local high    = allocate()
local escapes = allocate()
local special = "~#$%^&_{}\\|"

local private = {
    low     = low,
    high    = high,
    escapes = escapes,
}

utffilters.private = private

-- gmatch only ever yields strings, so every item is converted with utfbyte;
-- a separate code path for numeric items could never be taken.
for ch in gmatch(special, ".") do
    local cb = utfbyte(ch)
    if cb < 256 then
        local shifted = utfchar(0x0F0000 + cb)
        escapes[ch] = "\\" .. ch
        low[ch]     = shifted
        -- the percent sign is stored doubled in the reverse table
        high[shifted] = ch == "%" and "%%" or ch
    end
end
86
-- Replacer patterns built from the two private tables; both are also made
-- available through lpeg.patterns.
local tohigh = lpeg.replacer(low)
local tolow  = lpeg.replacer(high)

lpegpatterns.utftohigh = tohigh
lpegpatterns.utftolow  = tolow

-- Applies the `low` replacements to str and returns the result.
utffilters.harden = function(str)
    return lpegmatch(tohigh, str)
end

-- Applies the `high` replacements to str and returns the result.
utffilters.soften = function(str)
    return lpegmatch(tolow, str)
end

-- Remappers derived from the same three tables.
private.escape  = utf.remapper(escapes)
private.replace = utf.remapper(low)
private.revert  = utf.remapper(high)
104
105
115
116
117
118
119
-- Accent table. The outer key is the character that names an accent command,
-- the inner table maps a base character to the accented character. Besides
-- the plain letters some groups also accept the dotless i and the "\\i"
-- command form as base. The first entry of every group holds the accent
-- character on its own (presumably keyed by the empty string in the unspaced
-- file -- TODO confirm).
--
-- The table is exported as texcharacters.accentmapping and is read by
-- remap_accent, toutfpattern and texcharacters.defineaccents.
120local accentmapping = allocate {
121 [ ' " ' ] = { [ " " ] = " ¨ " ,
122 A = " Ä " , a = " ä " ,
123 E = " Ë " , e = " ë " ,
124 I = " Ï " , i = " ï " , [ " ı " ] = " ï " , [ " \\i " ] = " ï " ,
125 O = " Ö " , o = " ö " ,
126 U = " Ü " , u = " ü " ,
127 Y = " Ÿ " , y = " ÿ " ,
128 } ,
129 [ " ' " ] = { [ " " ] = " ´ " ,
130 A = " Á " , a = " á " ,
131 C = " Ć " , c = " ć " ,
132 E = " É " , e = " é " ,
133 I = " Í " , i = " í " , [ " ı " ] = " í " , [ " \\i " ] = " í " ,
134 L = " Ĺ " , l = " ĺ " ,
135 N = " Ń " , n = " ń " ,
136 O = " Ó " , o = " ó " ,
137 R = " Ŕ " , r = " ŕ " ,
138 S = " Ś " , s = " ś " ,
139 U = " Ú " , u = " ú " ,
140 Y = " Ý " , y = " ý " ,
141 Z = " Ź " , z = " ź " ,
142 } ,
143 [ " . " ] = { [ " " ] = " ˙ " ,
144 C = " Ċ " , c = " ċ " ,
145 E = " Ė " , e = " ė " ,
146 G = " Ġ " , g = " ġ " ,
147 I = " İ " , i = " i " , [ " ı " ] = " i " , [ " \\i " ] = " i " ,
148 Z = " Ż " , z = " ż " ,
149 } ,
150 [ " = " ] = { [ " " ] = " ¯ " ,
151 A = " Ā " , a = " ā " ,
152 E = " Ē " , e = " ē " ,
153 I = " Ī " , i = " ī " , [ " ı " ] = " ī " , [ " \\i " ] = " ī " ,
154 O = " Ō " , o = " ō " ,
155 U = " Ū " , u = " ū " ,
156 } ,
157 [ " H " ] = { [ " " ] = " ˝ " ,
158 O = " Ő " , o = " ő " ,
159 U = " Ű " , u = " ű " ,
160 } ,
161 [ " ^ " ] = { [ " " ] = " ˆ " ,
162 A = " Â " , a = " â " ,
163 C = " Ĉ " , c = " ĉ " ,
164 E = " Ê " , e = " ê " ,
165 G = " Ĝ " , g = " ĝ " ,
166 H = " Ĥ " , h = " ĥ " ,
167 I = " Î " , i = " î " , [ " ı " ] = " î " , [ " \\i " ] = " î " ,
168 J = " Ĵ " , j = " ĵ " ,
169 O = " Ô " , o = " ô " ,
170 S = " Ŝ " , s = " ŝ " ,
171 U = " Û " , u = " û " ,
172 W = " Ŵ " , w = " ŵ " ,
173 Y = " Ŷ " , y = " ŷ " ,
174 } ,
175 [ " ` " ] = { [ " " ] = " ` " ,
176 A = " À " , a = " à " ,
177 E = " È " , e = " è " ,
178 I = " Ì " , i = " ì " , [ " ı " ] = " ì " , [ " \\i " ] = " ì " ,
179 O = " Ò " , o = " ò " ,
180 U = " Ù " , u = " ù " ,
181 Y = " Ỳ " , y = " ỳ " ,
182 } ,
183 [ " c " ] = { [ " " ] = " ¸ " ,
184 C = " Ç " , c = " ç " ,
185 K = " Ķ " , k = " ķ " ,
186 L = " Ļ " , l = " ļ " ,
187 N = " Ņ " , n = " ņ " ,
188 R = " Ŗ " , r = " ŗ " ,
189 S = " Ş " , s = " ş " ,
190 T = " Ţ " , t = " ţ " ,
191 } ,
192 [ " k " ] = { [ " " ] = " ˛ " ,
193 A = " Ą " , a = " ą " ,
194 E = " Ę " , e = " ę " ,
195 I = " Į " , i = " į " ,
196 U = " Ų " , u = " ų " ,
197 } ,
198 [ " r " ] = { [ " " ] = " ˚ " ,
199 A = " Å " , a = " å " ,
200 U = " Ů " , u = " ů " ,
201 } ,
202 [ " u " ] = { [ " " ] = " ˘ " ,
203 A = " Ă " , a = " ă " ,
204 E = " Ĕ " , e = " ĕ " ,
205 G = " Ğ " , g = " ğ " ,
206 I = " Ĭ " , i = " ĭ " , [ " ı " ] = " ĭ " , [ " \\i " ] = " ĭ " ,
207 O = " Ŏ " , o = " ŏ " ,
208 U = " Ŭ " , u = " ŭ " ,
209 } ,
210 [ " v " ] = { [ " " ] = " ˇ " ,
211 C = " Č " , c = " č " ,
212 D = " Ď " , d = " ď " ,
213 E = " Ě " , e = " ě " ,
214 L = " Ľ " , l = " ľ " ,
215 N = " Ň " , n = " ň " ,
216 R = " Ř " , r = " ř " ,
217 S = " Š " , s = " š " ,
218 T = " Ť " , t = " ť " ,
219 Z = " Ž " , z = " ž " ,
220 } ,
221 [ " ~ " ] = { [ " " ] = " ˜ " ,
222 A = " Ã " , a = " ã " ,
223 I = " Ĩ " , i = " ĩ " , [ " ı " ] = " ĩ " , [ " \\i " ] = " ĩ " ,
224 N = " Ñ " , n = " ñ " ,
225 O = " Õ " , o = " õ " ,
226 U = " Ũ " , u = " ũ " ,
227 } ,
228}
229
230texcharacters . accentmapping = accentmapping
231
-- Maps five accent characters (tilde, double quote, backtick, single quote
-- and circumflex) to a string holding a combining mark.
-- NOTE(review): this table is not referenced anywhere else in the visible
-- part of the file -- confirm whether it is still needed.
232local accent_map = allocate {
233 [ ' ~ ' ] = " ̃ " ,
234 [ ' " ' ] = " ̈ " ,
235 [ " ` " ] = " ̀ " ,
236 [ " ' " ] = " ́ " ,
237 [ " ^ " ] = " ̂ " ,
238
239
240
241
242
243
244
245
246
247
248
249
250}
251
252
253
-- Resolves an accent command applied to a base character.
--
--   a      : accent character, the outer key of accentmapping
--   c      : base character, the inner key
--   braced : selects the shape of the fallback string
--
-- Returns the mapped character when accentmapping has an entry for the pair.
-- Otherwise the command is rebuilt as a string, with the base wrapped in
-- braces when `braced` is true.
-- NOTE(review): no caller of this function is visible in this file.
254local function remap_accent ( a , c , braced )
255 local m = accentmapping [ a ]
256 if m then
257 local n = m [ c ]
258 if n then
259 return n
260 end
261 end
262
263
264
265
-- no mapping known: hand back the input in command form
266 if braced then
267 return " \\ " . . a . . " { " . . c . . " } "
268 else
269 return " \\ " . . a . . " " . . c
270 end
271end
272
-- Command table: maps a command name to the character (or characters) it
-- stands for. It is exported as texcharacters.commandmapping and is read by
-- toutfpattern and texcharacters.defineaccents.
-- NOTE(review): "SS" maps to the same lowercase character as "ss" and "sz",
-- while "SZ" maps to two capitals -- confirm that this asymmetry is wanted.
273local commandmapping = allocate {
274 [ " aa " ] = " å " , [ " AA " ] = " Å " ,
275 [ " ae " ] = " æ " , [ " AE " ] = " Æ " ,
276 [ " cc " ] = " ç " , [ " CC " ] = " Ç " ,
277 [ " i " ] = " ı " , [ " j " ] = " ȷ " ,
278 [ " ij " ] = " ij " , [ " IJ " ] = " IJ " ,
279 [ " l " ] = " ł " , [ " L " ] = " Ł " ,
280 [ " o " ] = " ø " , [ " O " ] = " Ø " ,
281 [ " oe " ] = " œ " , [ " OE " ] = " Œ " ,
282 [ " sz " ] = " ß " , [ " SZ " ] = " SZ " , [ " ss " ] = " ß " , [ " SS " ] = " ß " ,
283}
284
285texcharacters . commandmapping = commandmapping
286
-- Input ligatures: maps runs of quotes and hyphens to curly double quotes,
-- an en dash and an em dash. The entries are copied verbatim into the hash
-- that toutfpattern builds.
287local ligaturemapping = allocate {
288 [ " '' " ] = " ” " ,
289 [ " `` " ] = " “ " ,
290 [ " -- " ] = " – " ,
291 [ " --- " ] = " — " ,
292}
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
-- Cache for the pattern created by toutfpattern; it stays nil until the
-- first call.
340local untex
341
-- Returns an lpeg pattern that recognizes the TeX input forms of the known
-- accents, commands and ligatures and replaces each match by its mapped
-- value. The pattern is built on the first call and reused afterwards.
-- It is exported as texcharacters.toutfpattern.
342local function toutfpattern ( )
343 if not untex then
-- hash: TeX input string -> replacement string
344 local hash = { }
345 for k , v in next , accentmapping do
346 for kk , vv in next , v do
-- accents named by a letter and accents named by a symbol get different
-- unbraced forms: the letter case puts a separator string between the
-- accent and the base, the symbol case joins them directly
347 if ( k > = " a " and k < = " z " ) or ( k > = " A " and k < = " Z " ) then
348 hash [ " \\ " . . k . . " " . . kk ] = vv
349 hash [ " {\\ " . . k . . " " . . kk . . " } " ] = vv
350 else
351 hash [ " \\ " . . k . . kk ] = vv
352 hash [ " {\\ " . . k . . kk . . " } " ] = vv
353 end
-- forms with the base in braces, registered for every accent
354 hash [ " \\ " . . k . . " { " . . kk . . " } " ] = vv
355 hash [ " {\\ " . . k . . " { " . . kk . . " }} " ] = vv
356 end
357 end
358 for k , v in next , commandmapping do
-- NOTE(review): the next two braced assignments read identically here;
-- presumably they differ in spacing in the unspaced file -- confirm
359 hash [ " \\ " . . k . . " " ] = v
360 hash [ " {\\ " . . k . . " } " ] = v
361 hash [ " {\\ " . . k . . " } " ] = v
362 end
363 for k , v in next , ligaturemapping do
364 hash [ k ] = v
365 end
-- the keys become the matcher, the table itself supplies the replacement
366 untex = utfchartabletopattern ( hash ) / hash
367 end
368 return untex
369end
370
371texcharacters . toutfpattern = toutfpattern
372
-- Substitution pattern used by texcharacters.toutf, created on first use.
local pattern = nil

-- Builds the substitution pattern: at each position the replacement pattern
-- from toutfpattern is tried first, otherwise one character is passed through.
local function prepare()
    local replacement = toutfpattern()
    pattern = Cs((replacement + P(1))^0)
    return pattern
end

-- Converts the TeX accent and command forms in str to their mapped characters.
--
--   str   : input string
--   strip : accepted but not used by this implementation
--
-- Strings that are empty or contain no backslash are returned unchanged
-- without running the pattern.
function texcharacters.toutf(str, strip)
    if str == "" or not find(str, "\\", 1, true) then
        return str
    end
    return lpegmatch(pattern or prepare(), str)
end
390
391
392
393
394
395
396
397
398
399
400
401
402
403
-- Returns a representation of code point n: a backslash followed by the
-- context name when the character data has one, else the character itself.
function texcharacters.safechar(n)
    local entry = data[n]
    local name  = entry and entry.contextname
    if name then
        return "\\" .. name
    end
    return utfchar(n)
end
412
-- The remainder of the file talks to the TeX side, so stop loading here when
-- any of the required interfaces is missing.
if not (context and commands and interfaces) then
    return
end
421
422local implement = interfaces . implement
423
424local tex = tex
425local texsetlccode = tex . setlccode
426local texsetsfcode = tex . setsfcode
427local texsetcatcode = tex . setcatcode
428
429local contextsprint = context . sprint
430local ctxcatcodes = catcodes . numbers . ctxcatcodes
431
432local texsetmacro = tokens . setters . macro
433local texsetchar = tokens . setters . char
434
-- Passes the accent and command tables to the TeX end: one call per accent
-- command, one per accent/base pair and one per plain command.
function texcharacters.defineaccents()
    local defineaccentcommand = context.dodefineaccentcommand
    local defineaccent        = context.dodefineaccent
    local definecommand       = context.dodefinecommand
    for accent, group in next, accentmapping do
        defineaccentcommand(accent)
        for base, composed in next, group do
            defineaccent(accent, base, composed)
        end
    end
    for name, character in next, commandmapping do
        definecommand(name, character)
    end
end

implement {
    name    = "defineaccents",
    actions = texcharacters.defineaccents,
}
454
455
459
-- Prints TeX code (under ctxcatcodes) that gives code point n catcode 13 and
-- defines that character as an unexpanded macro expanding to \name.
function commands.makeactive(n, name)
    local template = "\\catcode%s=13\\unexpanded\\def %s{\\%s}"
    local texcode  = format(template, n, utfchar(n), name)
    contextsprint(ctxcatcodes, texcode)
end
464
-- Converts a string to a number.
--
-- Accepts anything tonumber understands, plus the TeX notation of a double
-- quote followed by hexadecimal digits. Every other input yields 0.
--
-- The hexadecimal branch only calls tonumber when the quote prefix matched:
-- tonumber(nil, 16) raises an error instead of returning nil, which would
-- make the zero fallback unreachable.
local function to_number(s)
    local n = tonumber(s)
    if n then
        return n
    end
    if type(s) ~= "string" then
        return 0
    end
    local digits = match(s, '^"(.*)$')
    return digits and tonumber(digits, 16) or 0
end
472
-- "utfchar": convert the string argument to a number, turn that into a
-- character and print it.
implement {
    name      = "utfchar",
    arguments = "string",
    actions   = { to_number, utfchar, contextsprint },
}

-- "safechar": as above, but the number goes through texcharacters.safechar.
implement {
    name      = "safechar",
    arguments = "string",
    actions   = { to_number, texcharacters.safechar, contextsprint },
}

-- "uchar": combine a high and a low byte into one code point and print it.
implement {
    name      = "uchar",
    arguments = { "integer", "integer" },
    actions   = function(high, low)
        local slot = high * 256 + low
        context(utfchar(slot))
    end,
}

tex.uprint = commands.utfchar
494
495
496
497
498
499
-- Code points that are skipped when command macros are defined during the
-- character setup below (tested there as forbidden[u]).
local forbidden = tohash {
    -- U+00A0 and U+00AD
    0x000A0, 0x000AD,
    -- U+2000 .. U+200D
    0x02000, 0x02001, 0x02002, 0x02003, 0x02004, 0x02005, 0x02006,
    0x02007, 0x02008, 0x02009, 0x0200A, 0x0200B, 0x0200C, 0x0200D,
    -- U+202F and U+205F
    0x0202F, 0x0205F,
}
531
-- State shared by the setup code below. All four are locals on purpose: the
-- functions and the setup block refer to exactly these names, so a spelling
-- difference between declaration and use would silently create globals.
local csletters  = characters.csletters -- already present when the codes were set up before
local activated  = { }                  -- code points that were made active (fallback entries)
local sfstate    = "unset"              -- "unset", "traditional" or "normal"
local blocks_too = false                -- also treat whole blocks as letters

directives.register("characters.blockstoo", function(v) blocks_too = v end)

-- Records the requested space factor mode v. Once a mode has been set
-- before (the state is no longer "unset"), the space factor code n is also
-- applied to every character of category "lu".
local function setuppersfcodes(v, n)
    if sfstate ~= "unset" then
        report_defining("setting uppercase sf codes to %a", n)
        for u, chr in next, data do
            if chr.category == "lu" then
                texsetsfcode(u, n)
            end
        end
    end
    sfstate = v
end

directives.register("characters.spaceafteruppercase", function(v)
    if v == "traditional" then
        setuppersfcodes(v, 999)
    elseif v == "normal" then
        setuppersfcodes(v, 1000)
    end
end)

-- Sets the lowercase and uppercase code of letter u. A missing code is
-- stored in the character data as the slot itself; a table valued code is
-- left in the data but the slot is used for TeX. In traditional mode the
-- characters of category "lu" also get space factor code 999.
local function setcasecodes(u, chr, category, traditional)
    local lc = chr.lccode
    local uc = chr.uccode
    if not lc then
        chr.lccode = u
        lc = u
    elseif type(lc) == "table" then
        lc = u
    end
    if not uc then
        chr.uccode = u
        uc = u
    elseif type(uc) == "table" then
        uc = u
    end
    texsetlccode(u, lc, uc)
    if traditional and category == "lu" then
        texsetsfcode(u, 999)
    end
end

if not csletters then

    csletters = allocate()
    characters.csletters = csletters

    report_defining("setting up character related codes and commands")

    if sfstate == "unset" then
        sfstate = "traditional"
    end

    local traditional = sfstate == "traditional"

    for u, chr in next, data do
        local fallback = chr.fallback
        if fallback then
            -- characters with a fallback become active and expand to a checked
            -- character; the space after \gdef only ends the control word
            contextsprint("{\\catcode", u, "=13\\unexpanded\\gdef ", utfchar(u), "{\\checkedchar{", u, "}{", fallback, "}}}")
            activated[#activated + 1] = u
        else
            local contextname = chr.contextname
            local category    = chr.category
            local isletter    = is_letter[category]
            if contextname then
                if is_character[category] then
                    if chr.unicodeslot < 128 then
                        if isletter then
                            local c = utfchar(u)
                            texsetmacro(contextname, c)
                            csletters[c] = u
                        else
                            texsetchar(contextname, u)
                        end
                    else
                        local c = utfchar(u)
                        texsetmacro(contextname, c)
                        if isletter and u >= 32 and u <= 65536 then
                            csletters[c] = u
                        end
                    end
                    if isletter then
                        setcasecodes(u, chr, category, traditional)
                    end
                elseif is_command[category] and not forbidden[u] then
                    texsetmacro(contextname, utfchar(u))
                elseif is_mark[category] then
                    texsetlccode(u, u, u)
                end
            elseif isletter then
                csletters[utfchar(u)] = u
                setcasecodes(u, chr, category, traditional)
            elseif is_mark[category] then
                texsetlccode(u, u, u)
            end
        end
    end

    if blocks_too then
        -- every slot of a block flagged as "letter" counts as a letter, and
        -- so do the slots listed in its gaps
        for _, block in next, blocks do
            if block.catcode == "letter" then
                local first = block.first
                local last  = block.last
                local gaps  = block.gaps
                if first and last then
                    for u = first, last do
                        csletters[utfchar(u)] = u
                    end
                end
                if gaps then
                    for i = 1, #gaps do
                        local u = gaps[i]
                        csletters[utfchar(u)] = u
                    end
                end
            end
        end
    end

    if storage then
        storage.register("characters/csletters", csletters, "characters.csletters")
    end

else
    mark(csletters)
end

lpegpatterns.csletter = utfchartabletopattern(csletters)
698
699
700
701
-- Gives catcode 11 to U+200C, U+200D and every slot stored in csletters, in
-- catcode table cct. The catcode table that was current on entry is restored
-- before returning.
function characters.setlettercatcodes(cct)
    if trace_defining then
        report_defining("assigning letter catcodes to catcode table %a", cct)
    end
    local letter   = 11
    local previous = tex.catcodetable
    tex.catcodetable = cct
    texsetcatcode(0x200C, letter)
    texsetcatcode(0x200D, letter)
    for _, slot in next, csletters do
        texsetcatcode(slot, letter)
    end
    tex.catcodetable = previous
end
733
-- Gives catcode 13 to every slot collected in `activated`, in catcode table
-- cct, reporting each one when tracing is on. The catcode table that was
-- current on entry is restored before returning.
function characters.setactivecatcodes(cct)
    local active   = 13
    local previous = tex.catcodetable
    tex.catcodetable = cct
    for index = 1, #activated do
        local slot = activated[index]
        texsetcatcode(slot, active)
        if trace_defining then
            report_defining("character %U (%s) is active in set %a", slot, data[slot].description, cct)
        end
    end
    tex.catcodetable = previous
end
746
747
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
-- "chardescription": print the description of the given slot; unknown slots
-- print nothing.
implement {
    name      = "chardescription",
    arguments = "integer",
    actions   = function(slot)
        local entry = data[slot]
        if not entry then
            return
        end
        context(entry.description)
    end,
}
802
803
804
characters.activeoffset = 0x10000

-- Prints a TeX group that gives `slot` catcode 13 and globally defines the
-- character at that slot to expand to the stringified `chr`.
function commands.remapentity(chr, slot)
    local template = "{\\catcode%s=13\\xdef%s{\\string%s}}"
    local texcode  = format(template, slot, utfchar(slot), chr)
    contextsprint(texcode)
end
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
-- Private interface entries for the two catcode setters; both take the
-- catcode table number as an integer.
implement {
    name      = "setlettercatcodes",
    scope     = "private",
    arguments = "integer",
    actions   = characters.setlettercatcodes,
}

implement {
    name      = "setactivecatcodes",
    scope     = "private",
    arguments = "integer",
    actions   = characters.setactivecatcodes,
}
831
832
833
834
-- Replaces a case code of one character.
--
--   c     : slot of the character, as a number or numeric string
--   u     : list of replacement slots in settings_to_array syntax
--   code  : field of the character data to overwrite ("uccode" or "lccode")
--   codes : name of the table in `characters` whose entry for the slot is
--           cleared afterwards
--
-- One valid entry is stored as a number, several as a table; entries that
-- are not numbers are dropped. Nothing happens when the slot is not a
-- number, the character has no data entry, or no valid replacement remains,
-- so a bad request is ignored instead of raising an error or storing an
-- empty table.
local function overload(c, u, code, codes)
    local slot  = tonumber(c)
    local entry = slot and data[slot]
    if not entry then
        return
    end
    local list = utilities.parsers.settings_to_array(u)
    local n    = #list
    if n == 0 then
        return
    end
    local replacement
    if n == 1 then
        replacement = tonumber(list[1])
    else
        replacement = { }
        for i = 1, n do
            local value = tonumber(list[i])
            if value then
                replacement[#replacement + 1] = value
            end
        end
        if #replacement == 0 then
            replacement = nil
        end
    end
    if replacement then
        entry[code] = replacement
        -- drop the stored value for this slot in the derived table
        local derived = characters[codes]
        if derived then
            derived[slot] = nil
        end
    end
end
859
-- "overloaduppercase" and "overloadlowercase": both take the slot and the
-- replacement list as strings and differ only in the field they overwrite.
implement {
    name      = "overloaduppercase",
    arguments = "2 strings",
    actions   = function(slot, list)
        overload(slot, list, "uccode", "uccodes")
    end,
}

implement {
    name      = "overloadlowercase",
    arguments = "2 strings",
    actions   = function(slot, list)
        overload(slot, list, "lccode", "lccodes")
    end,
}
875 |