-- Register this script's metadata in the global module registry; the
-- registry itself is created here when nothing has set it up yet.
modules = modules or { }

modules['mtx-patterns'] = {
    version   = 1.001,
    comment   = "companion to mtxrun.lua",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}
8
9local format, find, concat, gsub, match, gmatch = string.format, string.find, table.concat, string.gsub, string.match, string.gmatch
10local byte, char = utf.byte, utf.char
11local addsuffix = file.addsuffix
12local lpegmatch, lpegsplit, lpegpatterns, validutf8 = lpeg.match, lpeg.split, lpeg.patterns, lpeg.patterns.validutf8
13local P, V, Cs = lpeg.P, lpeg.V, lpeg.Cs
14
-- XML description of the script (name, version, flags and usage examples).
-- It is passed as the helpinfo field when the application object is created.
local helpinfo = [[
<?xml version="1.0"?>
<application>
 <metadata>
 <entry name="name">mtx-patterns</entry>
 <entry name="detail">ConTeXt Pattern File Management</entry>
 <entry name="version">0.20</entry>
 </metadata>
 <flags>
 <category name="basic">
 <subcategory>
 <flag name="convert"><short>generate context language files (mnemonic driven, if not given then all)</short></flag>
 <flag name="check"><short>check pattern file (or those used by context when no file given)</short></flag>
 <flag name="path"><short>source path where hyph-foo.tex files are stored</short></flag>
 <flag name="destination"><short>destination path</short></flag>
 <flag name="specification"><short>additional patterns: e.g.: =cy,hyph-cy,welsh</short></flag>
 <flag name="compress"><short>compress data</short></flag>
 <flag name="words"><short>update words in given file</short></flag>
 <flag name="hyphenate"><short>show hyphenated words</short></flag>
 </subcategory>
 </category>
 </flags>
 <examples>
 <category>
 <title>Examples</title>
 <subcategory>
 <example><command>mtxrun --script pattern --check hyph-*.tex</command></example>
 <example><command>mtxrun --script pattern --check --path=c:/data/develop/svn-hyphen/trunk/hyph-utf8/tex/generic/hyph-utf8/patterns</command></example>
 <example><command>mtxrun --script pattern --convert --path=c:/data/develop/svn-hyphen/trunk/hyph-utf8/tex/generic/hyph-utf8/patterns/tex --destination=e:/tmp/patterns</command></example>
 <example><command>mtxrun --script pattern --convert --path=c:/data/develop/svn-hyphen/trunk/hyph-utf8/tex/generic/hyph-utf8/patterns/txt --destination=e:/tmp/patterns</command></example>
 <example><command>mtxrun --script pattern --hyphenate --language=nl --left=3 nogalwiedes inderdaad</command></example>
 </subcategory>
 </category>
 </examples>
</application>
]]
51
-- Application object for this script; its report function is the logger
-- used by everything below.
local application = logs.application({
    name     = "mtx-patterns",
    banner   = "ConTeXt Pattern File Management 0.20",
    helpinfo = helpinfo,
})

local report = application.report

-- Make sure the (global) script namespace and our own slot in it exist.
if not scripts then
    scripts = { }
end
if not scripts.patterns then
    scripts.patterns = { }
end
62
-- Code points that are accepted in patterns and exceptions without
-- consulting the category in the character database.
local permitted_characters = table.tohash({
    0x0009, -- character tabulation
    0x0027, -- apostrophe
    0x002D, -- hyphen-minus
    0x02BC, -- modifier letter apostrophe
    0x1FBD, -- greek koronis
    0x1FBF, -- greek psili
    0x200C, -- zero width non-joiner
    0x2019, -- right single quotation mark
})

-- Code points that make a line unusable for the "old" (sanitized) ancient
-- greek pattern set; lines containing one of them are discarded there.
local ignored_ancient_greek = table.tohash({
    0x02BC, -- modifier letter apostrophe
    0x03F2, -- greek lunate sigma symbol
    0x1FBD, -- greek koronis
    0x1FBF, -- greek psili
    0x1FD3, -- greek small letter iota with dialytika and oxia
    0x1FE3, -- greek small letter upsilon with dialytika and oxia
})

-- Same idea for french.
local ignored_french = table.tohash({
    0x02BC, -- modifier letter apostrophe
})

-- Character substitutions applied when the "old" pattern set is produced:
-- the right single quotation mark becomes a plain apostrophe.
local replaced_whatever = {
    [char(0x2019)] = char(0x0027)
}
90
-- The languages that get checked and converted, in processing order. Each
-- entry is a sequence:
--
--   [1] mnemonic used for selecting a language and naming the lang-* files
--   [2] base name of the source file(s)
--   [3] descriptive language name
--   [4] optional hash of code points to be ignored for the "old" patterns
--   [5] optional list of base names whose txt files are merged
scripts.patterns.list = {
    { "af",  "hyph-af",             "afrikaans"                  },
    { "bg",  "hyph-bg",             "bulgarian"                  },
    { "bn",  "hyph-bn",             "bengali"                    },
    { "ca",  "hyph-ca",             "catalan"                    },
    { "cs",  "hyph-cs",             "czech"                      },
    { "cy",  "hyph-cy",             "welsh"                      },
    { "da",  "hyph-da",             "danish"                     },
    { "deo", "hyph-de-1901",        "german, old spelling"       },
    { "de",  "hyph-de-1996",        "german, new spelling"       },
    { "gr",  "hyph-el-monoton",     "greek"                      },
    { "agr", "hyph-grc",            "ancient greek",             ignored_ancient_greek },
    { "gb",  "hyph-en-gb",          "british english"            },
    { "us",  "hyph-en-us",          "american english"           },
    { "eo",  "hyph-eo",             "esperanto"                  },
    { "es",  "hyph-es",             "spanish"                    },
    { "et",  "hyph-et",             "estonian"                   },
    { "eu",  "hyph-eu",             "basque"                     },
    { "fi",  "hyph-fi",             "finnish"                    },
    { "fr",  "hyph-fr",             "french",                    ignored_french },
    { "gu",  "hyph-gu",             "gujarati"                   },
    { "hi",  "hyph-hi",             "hindi"                      },
    { "hr",  "hyph-hr",             "croatian"                   },
    { "hu",  "hyph-hu",             "hungarian"                  },
    { "hy",  "hyph-hy",             "armenian"                   },
    { "id",  "hyph-id",             "indonesian"                 },
    { "is",  "hyph-is",             "icelandic"                  },
    { "it",  "hyph-it",             "italian"                    },
    { "kn",  "hyph-kn",             "kannada"                    },
    { "la",  "hyph-la",             "latin"                      },
    { "ala", "hyph-la-x-classic",   "ancient latin"              },
    { "lt",  "hyph-lt",             "lithuanian"                 },
    { "lv",  "hyph-lv",             "latvian"                    },
    { "mk",  "hyph-mk",             "macedonian"                 },
    { "ml",  "hyph-ml",             "malayalam"                  },
    { "mn",  "hyph-mn-cyrl",        "mongolian, cyrillic script" },
    { "nb",  "hyph-nb",             "norwegian bokmål"           },
    { "nl",  "hyph-nl",             "dutch"                      },
    { "nn",  "hyph-nn",             "norwegian nynorsk"          },
    { "pl",  "hyph-pl",             "polish"                     },
    { "pt",  "hyph-pt",             "portuguese"                 },
    { "ro",  "hyph-ro",             "romanian"                   },
    { "ru",  "hyph-ru",             "russian"                    },
    { "sa",  "hyph-sa",             "sanskrit"                   },
    { "sk",  "hyph-sk",             "slovak"                     },
    { "sl",  "hyph-sl",             "slovenian"                  },
    { "sq",  "hyph-sq",             "albanian"                   },
    { "sr",  "hyph-sr",             "serbian",                   false, { "hyph-sh-cyrl", "hyph-sh-latn" } },
    { "sv",  "hyph-sv",             "swedish"                    },
    { "ta",  "hyph-ta",             "tamil"                      },
    { "te",  "hyph-te",             "telugu"                     },
    { "th",  "hyph-th",             "thai"                       },
    { "tk",  "hyph-tk",             "turkmen"                    },
    { "tr",  "hyph-tr",             "turkish"                    },
    { "uk",  "hyph-uk",             "ukrainian"                  },
    { "zh",  "hyph-zh-latn-pinyin", "zh-latn, chinese pinyin"    },
}
166
167
168
-- Run the valid-utf8 lpeg pattern over str and return whatever the match
-- returns.
function utf.check(str)
    local pattern = lpegpatterns.validutf8
    return lpegmatch(pattern,str)
end
172
173
174
175
-- Load the patterns and exceptions of one language and clean them up.
--
-- Arguments:
--   path     : directory that holds the source files
--   name     : base name of the source (also used as prefix in reports)
--   mnemonic : language mnemonic (not used in the body, kept for callers)
--   ignored  : optional hash keyed by code point; lines containing one of
--              these characters are left out of the "old" sets
--   merged   : optional list of base names; when given, their lic/pat/hyp
--              txt files are concatenated instead of loading name itself
--
-- The data comes from (in order of preference) the merged txt files, the
-- <name>.pat.txt/.hyp.txt/.lic.txt triplet, or <name>.tex.
--
-- Returns:
--   okay (false when no usable source was found),
--   new patterns, new exceptions, old patterns, old exceptions (lists),
--   comment (string), stripset (always ""),
--   used characters in new patterns, in new exceptions,
--   used characters in old patterns, in old exceptions (hashes keyed by
--   utf character).
function scripts.patterns.load(path,name,mnemonic,ignored, merged)
    local fullname = file.join(path,name)
    local basename = name
    local texfile  = addsuffix(fullname,"tex")
    local hypfile  = addsuffix(fullname,"hyp.txt")
    local patfile  = addsuffix(fullname,"pat.txt")
    local licfile  = addsuffix(fullname,"lic.txt")

    local okay = true
    local hyphenations, patterns, comment, stripset = "", "", "", ""
    local splitpatternsnew, splithyphenationsnew = { }, { }
    local splitpatternsold, splithyphenationsold = { }, { }
    local usedpatterncharactersnew, usedhyphenationcharactersnew = { }, { }

    -- step 1: get hold of the raw data

    if merged then
        report("using merged txt files %s.[hyp|pat|lic].txt",name)
        for i=1,#merged do
            local mergedname = file.join(path,merged[i])
            comment      = comment      .. (io.loaddata(addsuffix(mergedname,"lic.txt")) or "") .. "\n\n"
            patterns     = patterns     .. (io.loaddata(addsuffix(mergedname,"pat.txt")) or "") .. "\n\n"
            hyphenations = hyphenations .. (io.loaddata(addsuffix(mergedname,"hyp.txt")) or "") .. "\n\n"
        end
    elseif lfs.isfile(patfile) then
        report("using txt files %s.[hyp|pat|lic].txt",name)
        comment      = io.loaddata(licfile) or ""
        patterns     = io.loaddata(patfile) or ""
        hyphenations = io.loaddata(hypfile) or ""
    elseif lfs.isfile(texfile) then
        -- the file that is read here is <name>.tex, so report that name
        report("using tex file %s.tex",name)
        local data = io.loaddata(texfile) or ""
        if data ~= "" then
            -- inline files that are pulled in with \input
            data = gsub(data,"([\n\r])\\input ([^ \n\r]+)", function(previous,subname)
                local subfile = addsuffix(subname,"tex")
                local subfull = file.join(file.dirname(texfile),subfile)
                local subdata = io.loaddata(subfull) or ""
                if subdata == "" then
                    report("%s: no subfile %s",basename,subfile)
                end
                return previous .. subdata
            end)
            -- drop comments and collapse line endings
            data = gsub(data,"%%.-[\n\r]","")
            data = gsub(data," *[\n\r]+","\n")
            patterns     = match(data,"\\patterns[%s]*{[%s]*(.-)[%s]*}") or ""
            hyphenations = match(data,"\\hyphenation[%s]*{[%s]*(.-)[%s]*}") or ""
            comment      = match(data,"^(.-)[\n\r]\\patterns") or ""
        else
            okay = false
        end
    else
        okay = false
    end

    if okay then

        -- step 2: split into individual entries

        local how = lpegpatterns.whitespace^1
        splitpatternsnew     = lpegsplit(how,patterns)
        splithyphenationsnew = lpegsplit(how,hyphenations)

        -- step 3: strip what follows a percent sign

        local function removecomments(data,splitdata)
            if find(data,"%%") then
                for i=1,#splitdata do
                    local line = splitdata[i]
                    if find(line,"%%") then
                        splitdata[i] = (gsub(line,"%%.*$",""))
                        report("%s: removing comment: %s",basename,line)
                    end
                end
            end
        end
        removecomments(patterns,splitpatternsnew)
        removecomments(hyphenations,splithyphenationsnew)

        -- step 4: blank entries that contain a backslash

        local function removecommands(data,splitdata)
            if find(data,"\\") then
                for i=1,#splitdata do
                    local line = splitdata[i]
                    if find(line,"\\") then
                        splitdata[i] = ""
                        report("%s: removing line with command: %s",basename,line)
                    end
                end
            end
        end
        removecommands(patterns,splitpatternsnew)
        removecommands(hyphenations,splithyphenationsnew)

        -- step 5: blank entries that are not valid utf

        local function removeinvalid(splitdata)
            for i=1,#splitdata do
                local line = splitdata[i]
                if not lpegmatch(validutf8,line) then
                    splitdata[i] = ""
                    report("%s: removing line with invalid utf: %s",basename,line)
                end
            end
        end
        removeinvalid(splitpatternsnew)
        removeinvalid(splithyphenationsnew)

        -- step 6: check each character against the character database and
        -- collect the characters that are in use; the special character
        -- ("." in patterns, "-" in exceptions) is skipped

        local cd = characters.data
        local function collectused(splitdata,special)
            local used = { }
            for i=1,#splitdata do
                local line = splitdata[i]
                for b in line:utfvalues() do
                    if b == special then
                        -- not a character of the language
                    elseif permitted_characters[b] then
                        used[char(b)] = true
                    else
                        local cdb = cd[b]
                        if not cdb then
                            report("%s: no entry in chardata for character %C",basename,b)
                        else
                            local ct = cdb.category
                            if ct == "lu" or ct == "ll" or ct == "lo" or ct == "mn" or ct == "mc" then
                                used[char(b)] = true
                            elseif ct == "nd" then
                                -- category nd is accepted but not recorded as used
                            elseif ct == "cf" then
                                report("%s: %s line with suspected utf character %C, category %s: %s",basename,"keeping",b,ct,line)
                                used[char(b)] = true
                            else
                                report("%s: %s line with suspected utf character %C, category %s: %s",basename,"removing",b,ct,line)
                                splitdata[i] = ""
                                break
                            end
                        end
                    end
                end
            end
            return used
        end
        usedpatterncharactersnew     = collectused(splitpatternsnew,byte("."))
        usedhyphenationcharactersnew = collectused(splithyphenationsnew,byte("-"))

        -- step 7: derive the "old" sets: entries with an ignored character
        -- are dropped and the replacements are applied to the rest

        local discard = nil
        if ignored then
            for k in next, ignored do
                local c = P(char(k))
                discard = discard and (discard + c) or c
            end
            if discard then
                -- match when an ignored character occurs anywhere in the line;
                -- an empty ignored table leaves discard at nil (nothing to drop)
                discard = P{ discard + 1 * V(1) }
            end
        end
        local replace = nil
        for k, v in next, replaced_whatever do
            local s = P(k)/v
            replace = replace and (replace + s) or s
        end
        replace = Cs((replace + 1)^0)

        local function sanitized(what)
            local result = { }
            for i=1,#what do
                local line = what[i]
                if discard and lpegmatch(discard,line) then
                    report("%s: discarding conflicting pattern: %s",basename,line)
                else
                    local l = lpegmatch(replace,line)
                    if l ~= line then
                        report("%s: sanitizing pattern: %s -> %s (for old patterns)",basename,line,l)
                    end
                    result[#result+1] = l
                end
            end
            return result
        end
        splitpatternsold     = sanitized(splitpatternsnew)
        splithyphenationsold = sanitized(splithyphenationsnew)

        -- step 8: get rid of blanked entries and duplicates

        local function unique(splitdata)
            local used, collected = { }, { }
            for i=1,#splitdata do
                local line = splitdata[i]
                if line == "" then
                    -- blanked in one of the earlier steps
                elseif used[line] then
                    report("%s: discarding duplicate pattern: %s",basename,line)
                else
                    used[line] = true
                    collected[#collected+1] = line
                end
            end
            return collected
        end
        splitpatternsnew     = unique(splitpatternsnew)
        splithyphenationsnew = unique(splithyphenationsnew)
        splitpatternsold     = unique(splitpatternsold)
        splithyphenationsold = unique(splithyphenationsold)

    else
        report("no valid file %s.*",name)
    end

    -- The used sets are keyed by utf character while ignored is keyed by
    -- code point, so the key has to be converted before the lookup.
    local function getused(t)
        local u = { }
        for k, v in next, t do
            if ignored and ignored[byte(k)] then
                -- not present in the old sets
            elseif replaced_whatever[k] then
                -- replaced in the old sets
            else
                u[k] = v
            end
        end
        return u
    end
    local usedpatterncharactersold     = getused(usedpatterncharactersnew)
    local usedhyphenationcharactersold = getused(usedhyphenationcharactersnew)

    return okay,
        splitpatternsnew, splithyphenationsnew, splitpatternsold, splithyphenationsold, comment, stripset,
        usedpatterncharactersnew, usedhyphenationcharactersnew, usedpatterncharactersold, usedhyphenationcharactersold
end
415
-- Write the language files for one language into destination:
--
--   lang-<mnemonic>.rme : the comment (license/readme) text
--   lang-<mnemonic>.pat : the old patterns wrapped in \patterns{...}
--   lang-<mnemonic>.hyp : the old exceptions wrapped in \hyphenation{...}
--   lang-<mnemonic>.lua : a serialized table with the new patterns and
--                         exceptions (zlib compressed when --compress is set)
--
-- Nothing is written for the mnemonic "??". A destination that is not a
-- string falls back to the current directory. The stripped and ignored
-- arguments are accepted for the sake of the callers but are not used.
function scripts.patterns.save(destination,mnemonic,name,patternsnew,hyphenationsnew,patternsold,hyphenationsold,comment,stripped,
        pusednew,husednew,pusedold,husedold,ignored)
    local nofpatternsnew, nofhyphenationsnew = #patternsnew, #hyphenationsnew
    local nofpatternsold, nofhyphenationsold = #patternsold, #hyphenationsold
    report("language %s has %s old and %s new patterns and %s old and %s new exceptions",mnemonic,nofpatternsold,nofpatternsnew,nofhyphenationsold,nofhyphenationsnew)
    if mnemonic ~= "??" then
        -- this has to happen before the destination is used in a file name
        if type(destination) ~= "string" then
            destination = "."
        end

        local puold = concat(table.sortedkeys(pusedold), " ")
        local huold = concat(table.sortedkeys(husedold), " ")

        local rmefile = file.join(destination,"lang-"..mnemonic..".rme")
        local patfile = file.join(destination,"lang-"..mnemonic..".pat")
        local hypfile = file.join(destination,"lang-"..mnemonic..".hyp")
        local luafile = file.join(destination,"lang-"..mnemonic..".lua")

        local topline = "% generated by mtxrun --script pattern --convert"
        local banner  = "% for comment and copyright, see " .. file.basename(rmefile)
        report("saving language data for %s",mnemonic)
        if not comment or comment == "" then
            comment = "% no comment"
        end

        local compression = environment.arguments.compress and "zlib" or nil

        -- in the lua file every comment line is stored as a tex comment
        local lines = string.splitlines(comment)
        for i=1,#lines do
            if not find(lines[i],"^%%") then
                lines[i] = "% " .. lines[i]
            end
        end

        local metadata = {
            texcomment = concat(lines,"\n"),
            source     = name,
            mnemonic   = mnemonic,
        }

        local patterndata, hyphenationdata
        if nofpatternsnew > 0 then
            local data = concat(patternsnew," ")
            patterndata = {
                n              = nofpatternsnew,
                compression    = compression,
                length         = #data, -- length before compression
                data           = compression and zlib.compress(data,9) or data,
                characters     = concat(table.sortedkeys(pusednew),""),
                lefthyphenmin  = 1,
                righthyphenmin = 1, -- the TeX parameter is \righthyphenmin
            }
        else
            patterndata = {
                n = 0,
            }
        end
        if nofhyphenationsnew > 0 then
            local data = concat(hyphenationsnew," ")
            hyphenationdata = {
                n           = nofhyphenationsnew,
                compression = compression,
                length      = #data, -- length before compression
                data        = compression and zlib.compress(data,9) or data,
                characters  = concat(table.sortedkeys(husednew),""),
            }
        else
            hyphenationdata = {
                n = 0,
            }
        end
        local data = {
            version    = "1.001",
            comment    = topline,
            metadata   = metadata,
            patterns   = patterndata,
            exceptions = hyphenationdata,
        }

        -- remove the previous files before the new ones are written
        os.remove(rmefile)
        os.remove(patfile)
        os.remove(hypfile)
        os.remove(luafile)

        io.savedata(rmefile,format("%s\n\n%s",topline,comment))
        io.savedata(patfile,format("%s\n\n%s\n\n%% used: %s\n\n\\patterns{\n%s}",topline,banner,puold,concat(patternsold,"\n")))
        io.savedata(hypfile,format("%s\n\n%s\n\n%% used: %s\n\n\\hyphenation{\n%s}",topline,banner,huold,concat(hyphenationsold,"\n")))
        io.savedata(luafile,table.serialize(data,true))
    end
end
506
-- Get things ready for checking or converting: char-def.lua is located and
-- executed (presumably this is what fills the character database that the
-- loader consults) and an optional --specification is put in front of the
-- language list. Such a specification must have exactly three components.
function scripts.patterns.prepare()
    local chardef = resolvers.findfile("char-def.lua")
    dofile(chardef)

    local specification = environment.argument("specification")
    if not specification then
        return
    end

    local components = utilities.parsers.settings_to_array(specification)
    if #components ~= 3 then
        report('invalid specification: %q, "xx,lang-yy,zzzz" expected',specification)
        return
    end

    table.insert(scripts.patterns.list,1,components)
    report("specification added: %s %s %s",table.unpack(components))
end
522
-- Load (and thereby check) the languages in the list. When names are given
-- on the command line only the languages with a matching mnemonic are
-- processed. The source path comes from --path and defaults to ".".
function scripts.patterns.check()
    local path  = environment.argument("path") or "."
    local files = environment.files
    local only  = false
    if #files > 0 then
        only = table.tohash(files)
    end
    -- the list is a sequence, so walk it by index: that guarantees that the
    -- languages are handled in list order (next gives no such guarantee)
    local list = scripts.patterns.list
    for i=1,#list do
        local entry = list[i]
        local mnemonic, name, ignored, merged = entry[1], entry[2], entry[4], entry[5]
        if not only or only[mnemonic] then
            report("checking language %s, file %s", mnemonic, name)
            local okay = scripts.patterns.load(path,name,mnemonic,ignored, merged)
            if not okay then
                report("there are errors that need to be fixed")
            end
            report()
        end
    end
end
542
-- Convert the languages in the list into lang-* files. Sources are read
-- from --path and results are written to --destination (both default to
-- "."); the two must differ. When names are given on the command line only
-- the languages with a matching mnemonic are converted.
function scripts.patterns.convert()
    local path = environment.argument("path") or "."
    if path == "" then
        report("provide sourcepath using --path ")
        return
    end
    local destination = environment.argument("destination") or "."
    if path == destination then
        report("source path and destination path should differ (use --path and/or --destination)")
        return
    end
    local files = environment.files
    local only  = false
    if #files > 0 then
        only = table.tohash(files)
    end
    -- the list is a sequence, so walk it by index: that guarantees that the
    -- languages are handled in list order (next gives no such guarantee)
    local list = scripts.patterns.list
    for i=1,#list do
        local entry = list[i]
        local mnemonic, name, ignored, merged = entry[1], entry[2], entry[4], entry[5]
        if not only or only[mnemonic] then
            report("converting language %s, file %s", mnemonic, name)
            local okay, patternsnew, hyphenationsnew, patternsold, hyphenationsold, comment, stripped,
                pusednew, husednew, pusedold, husedold = scripts.patterns.load(path,name,mnemonic,ignored,merged)
            if okay then
                scripts.patterns.save(destination,mnemonic,name,patternsnew,hyphenationsnew,patternsold,hyphenationsold,comment,stripped,
                    pusednew,husednew,pusedold,husedold,ignored)
            else
                report("conversion aborted due to error(s)")
            end
            report()
        end
    end
end
575
-- Load a word list specification from filename. Returns the specification
-- and its lists entry, or false when the file cannot be loaded or has no
-- lists entry.
local function valid(filename)
    local specification = table.load(filename)
    local lists = specification and specification.lists
    if lists then
        return specification, lists
    end
    return false
end
587
-- Refresh the word data in the specification files given on the command
-- line (only when --update is set). For every entry in a specification's
-- lists the file named by entry.filename is located, its content is
-- stripped and has whitespace runs collapsed to single spaces, and the
-- result is stored in entry.data (zlib compressed when --compress is set)
-- together with the uncompressed length. Entries without a (findable) file
-- get empty data. The updated specification is saved under the name that
-- was given on the command line, not under the resolved name.
function scripts.patterns.words()
    if not environment.arguments.update then
        report("provide --update")
        return
    end
    local compress = environment.arguments.compress
    local files    = environment.files
    for i=1,#files do
        local filename = files[i]
        local fullname = resolvers.findfile(filename)
        if not fullname or fullname == "" then
            -- the specification file itself could not be located
            report("no file %a",filename)
        else
            report("checking file %a",fullname)
            local specification, lists = valid(fullname)
            if specification and #lists > 0 then
                report("updating %a of language %a",filename,specification.language)
                for j=1,#lists do
                    local entry    = lists[j]
                    local wordfile = entry.filename
                    local wordfull = wordfile and resolvers.findfile(wordfile)
                    -- an empty string means "not found", just as it does for
                    -- the specification file above
                    if wordfull and wordfull ~= "" then
                        report("adding words from %a",wordfull)
                        local data = io.loaddata(wordfull) or ""
                        data = string.strip(data)
                        data = string.gsub(data,"%s+"," ")
                        entry.length = #data -- length before compression
                        if compress then
                            entry.data        = zlib.compress(data,9)
                            entry.compression = "zlib"
                        else
                            entry.data        = data
                            entry.compression = nil
                        end
                    else
                        entry.data        = ""
                        entry.compression = nil
                        entry.length      = 0
                    end
                end
                specification.version   = "1.00"
                specification.timestamp = os.localtime()
                report("updated file %a is saved",filename)
                table.save(filename,specification)
            else
                -- no usable specification or no lists in it
                report("nothing done")
            end
        end
    end
end
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
-- Report the words given on the command line with the hyphens injected by
-- the traditional hyphenator. The language comes from --language (default
-- "us") and the minima from --left and --right (default 3 each).
function scripts.patterns.hyphenate()
    require("lang-hyp")
    local hyphenator = languages.hyphenators.traditional
    local arguments  = environment.arguments
    local leftmin    = tonumber(arguments.left) or 3
    local rightmin   = tonumber(arguments.right) or 3
    local tag        = arguments.language or "us"
    local dictionary = hyphenator.loadpatterns(tag)
    local words      = environment.files
    local settings   = {
        leftcharmin  = leftmin,
        rightcharmin = rightmin,
        leftchar     = false,
        rightchar    = false,
    }
    trackers.enable("hyphenator.steps")
    local index = 1
    while index <= #words do
        local word = words[index]
        report("%s %s %s : %s : %s",tag,leftmin,rightmin,word,hyphenator.injecthyphens(dictionary,word,settings))
        index = index + 1
    end
end
687
-- Run the action that was asked for on the command line; the first flag
-- that is set wins and without a known flag the help text is shown.
local argument = environment.argument

if argument("check") then
    scripts.patterns.prepare()
    scripts.patterns.check()
elseif argument("convert") then
    scripts.patterns.prepare()
    scripts.patterns.convert()
elseif argument("words") then
    scripts.patterns.words()
elseif argument("hyphenate") then
    scripts.patterns.hyphenate()
elseif argument("exporthelp") then
    application.export(argument("exporthelp"),environment.files[1])
else
    application.help()
end
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745 |