Skip to content

Commit bde7f30

Browse files
author
scritch
committed
Improved regroupment algorithm and made it support different languages
git-svn-id: http://rhg.rubyforge.org/svn@69 2ba632a7-620d-0410-bd84-d74392fff1da
1 parent a5f5a02 commit bde7f30

File tree

2 files changed

+183
-96
lines changed

2 files changed

+183
-96
lines changed

wikigen/languages.yml

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
---
2+
ja:
3+
:iso_language: ja-JA
4+
:can_be_destination_language: false
5+
:chapter_name: chapter%02d.txt
6+
fr:
7+
:iso_language: fr-FR
8+
:can_be_destination_language: true
9+
:chapter_name: chapitre%02d.txt
10+
:translated_by_re: !ruby/regexp /^Traduction (.+)$/
11+
:not_translated: (non traduit)
12+
:footer: |
13+
<hr>
14+
15+
L'oeuvre originale est sous Copyright &copy; 2002 - 2004 Minero AOKI.<br>
16+
Traduction $tag(translated by)$<br>
17+
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/"><img alt="Creative Commons License" border="0" src="images/somerights20.fr.png"/></a><br/>Cette oeuvre est sous la <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/deed.fr">Licence Creative Commons Attribution-NonCommercial-ShareAlike2.5</a>.
18+
19+
</body>
20+
</html>
21+
22+
en:
23+
:iso_language: en-US
24+
:can_be_destination_language: true
25+
:chapter_name: chapter%02d.txt
26+
:translated_by_re: !ruby/regexp /^Translated by (.+)$/
27+
:not_translated: (not translated)
28+
:footer: |
29+
<hr>
30+
31+
The original work is Copyright &copy; 2002 - 2004 Minero AOKI.<br />
32+
Translated by $tag(translated by)$<br />
33+
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/"><img alt="Creative Commons License" border="0" src="images/somerights20.png"/></a><br/>This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/">Creative Commons Attribution-NonCommercial-ShareAlike2.5 License</a>.
34+
35+
</body>
36+
</html>
37+

wikigen/regroup.rb

+146-96
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,64 @@
11
#!/usr/bin/env ruby
22
# -*- coding: utf-8 -*- vim:set encoding=utf-8:
33
# TODO:
4-
# - cleanup (and remove dependency with rhg_html_gen)
4+
# - cleanup
55
# - images
6-
# - when generating the output data, if Japanese = English, add in the English something like "(To translate)"
6+
# - when generating the output data, if source language = destination language, add in the destination language something like "(to translate)" (and make it depend on the language)
77
$KCODE = 'u'
88

9-
ISOLanguage = 'en-US'
10-
119
$LOAD_PATH.unshift('../lib')
12-
require 'rhg_html_gen'
10+
require 'redcloth'
11+
require 'yaml'
12+
13+
Languages = YAML::load(IO.read('languages.yml'))
14+
AvailableDestinationLanguages = Languages.keys.select { |lang| Languages[lang][:can_be_destination_language] }.sort
15+
AvailableSourceLanguages = Languages.keys.sort
16+
17+
# Prints the command-line usage, including the accepted source and
# destination languages, then terminates the script with exit status 1.
def syntax
  usage = [
    "syntax: #{$0} source_language destination_language chapter_number",
    "where the source language is one of the following: #{AvailableSourceLanguages.join(', ')}",
    "and the destination language is one of the following: #{AvailableDestinationLanguages.join(', ')}"
  ]
  usage.each { |line| puts line }
  exit 1
end
23+
24+
syntax if ARGV.length != 3 or not AvailableSourceLanguages.include?(ARGV[0]) or not AvailableDestinationLanguages.include?(ARGV[1]) or ARGV[2].to_i == 0
25+
src_lang = ARGV[0]
26+
dst_lang = ARGV[1]
27+
chapter_num = ARGV[2].to_i
28+
29+
$tags = {}
30+
31+
HEADER = <<EOS
32+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
33+
<html lang="#{Languages[dst_lang][:iso_language]}">
34+
<head>
35+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
36+
<meta http-equiv="Content-Language" content="#{Languages[dst_lang][:iso_language]}">
37+
<link rel="stylesheet" type="text/css" href="rhg.css">
38+
<title>$tag(title)$</title>
39+
</head>
40+
<body>
41+
EOS
42+
FOOTER = Languages[dst_lang][:footer]
1343

1444
COMMENT_RE = /\$comment\((.+?)\)\$/
1545
AUTOLINK_RE = %r{(^|[^:])\b((?:ht|f)tp://\S+?)([^\w\/;]*?)(?=\s|<|$)}
1646
NEW_CODE_RE = /`([^<]*?)`/m
1747
TAG_RE = /\$tag\((.+?)\)\$/
48+
BLOCK_REGROUPING_RE = /^(h[1-9]\.|<pre\b|<p\b|▼)/
49+
50+
# Expands every $tag(name)$ marker found in +text+.
#
# A marker whose name has a truthy entry in $tags is replaced by that entry;
# any other marker is replaced by an empty string after a warning has been
# printed. Returns the expanded copy (gsub does not modify +text+).
def replace_tags(text)
  text.gsub(TAG_RE) do
    name = Regexp.last_match(1)
    value = $tags[name]
    next value if value

    puts "Warning: The tag #{name} is not defined"
    ''
  end
end
1862

1963
AUTO_CONV_ENDING=<<END
2064
<hr>
@@ -29,45 +73,13 @@
2973
Copyright (c) 2002-2004 Minero Aoki, All rights reserved.
3074
END
3175

32-
TranslatedByRE = /^Translated by (.+)$/
33-
34-
def rhg_redcloth_replace(text)
35-
text = text.dup
36-
if md = TranslatedByRE.match(text)
37-
$tags['translated by'] = md[1]
38-
text.sub!(TranslatedByRE, '')
39-
end
40-
text.sub!(AUTO_CONV_ENDING, '') # remove the ending in the automatically generated Japanese files
41-
text.gsub!(COMMENT_RE) { |m| '' } # remove comments
42-
text.gsub(TAG_RE) do |m| # manages tags
43-
tag_name = $~[1]
44-
if $tags[tag_name]
45-
$tags[tag_name]
46-
else
47-
puts "Warning: The tag #{tag_name} is not defined"
48-
''
49-
end
50-
end
51-
fig_counter = 0
52-
text.gsub!(RedCloth::IMAGE_RE) do |m| # must be done before the `` replacement
53-
fig_counter += 1
54-
stln,algn,atts,url,title,href,href_a1,href_a2 = $~[1..8]
55-
#puts "Warning: the images used the the RHG should be PNGs, not JPEGs" if /\.jpe?g$/i.match(url)
56-
"\n\n<p style=\"text-align:center;\">\n#{m.gsub(/`/, '')}<br />Figure #{fig_counter}: #{title}\n</p>\n\n"
57-
end
58-
text.gsub!(NEW_CODE_RE) { |m| "<code>#{$~[1]}</code>" }
59-
text.gsub!(AUTOLINK_RE) do |m|
60-
before, address, after = $~[1..3]
61-
"#{before}\"#{address}\":#{address}#{after}"
62-
end
63-
text
64-
end
65-
6676
class Blocks
67-
def initialize(filename)
68-
@data = rhg_redcloth_replace(IO.read(filename)).split(/\n/).map { |l| l.rstrip }
69-
@boundaries = []
77+
# Builds the block list for one chapter file.
#
# filename:: path of the chapter file, handed to rhg_redcloth_replace
# lang:: language key, stored in @lang
# is_destination_lang:: flag stored in @is_destination_lang
def initialize(filename, lang, is_destination_lang)
  @boundaries = []
  # both instance variables must be set before rhg_redcloth_replace runs,
  # because that method reads them
  @lang, @is_destination_lang = lang, is_destination_lang
  @data = rhg_redcloth_replace(filename)
  find_boundaries
end
7385

@@ -79,6 +91,10 @@ def [](i)
7991
@data[@boundaries[i]].join("\n")
8092
end
8193

94+
# Yields every block from index +i+ through the last block, in order.
#
# i:: index of the first block to yield
#
# Nothing is yielded when +i+ is past the last block.
def each_from(i)
  # the yielded block must be indexed by the iteration variable; indexing
  # with the start index would yield the very same block on every pass
  i.upto(length - 1) { |j| yield self[j] }
end
97+
8298
def regroup_with_following(i)
8399
@data[@boundaries[i].last] << "\n<==================================>"
84100
@boundaries[i] = @boundaries[i].first..@boundaries[i+1].last
@@ -114,85 +130,119 @@ def find_boundaries
114130
end
115131
end
116132
end
133+
134+
# Transforms the modified RHG RedCloth syntax of a chapter file into normal
# RedCloth and returns an array of lines (without end of lines, and with
# trailing whitespace stripped).
#
# filename:: path of the chapter file to read
#
# Reads @lang to pick the language settings, and @is_destination_lang to
# decide whether the translator credit is stored in $tags['translated by'].
def rhg_redcloth_replace(filename)
  source = IO.read(filename)

  # note: :translated_by_re is not defined for Japanese
  credit_re = Languages[@lang][:translated_by_re]
  credit = credit_re && credit_re.match(source)
  if credit
    $tags['translated by'] = credit[1] if @is_destination_lang
    source.sub!(credit_re, '')
  end

  # remove the ending in the automatically generated Japanese files if it's there
  source.sub!(AUTO_CONV_ENDING, '') if @lang == 'ja'
  source.gsub!(COMMENT_RE, '') # remove comments
  source = replace_tags(source)

  figure_number = 0
  # must be done before the `` replacement
  source.gsub!(RedCloth::IMAGE_RE) do |image|
    figure_number += 1
    # the title is the fifth capture of IMAGE_RE (the url is the fourth)
    caption = Regexp.last_match(5)
    #puts "Warning: the images used in the RHG should be PNGs, not JPEGs" if /\.jpe?g$/i.match(Regexp.last_match(4))
    "\n\n<p style=\"text-align:center;\">\n#{image.delete('`')}<br />Figure #{figure_number}: #{caption}\n</p>\n\n"
  end

  source.gsub!(NEW_CODE_RE) { "<code>#{Regexp.last_match(1)}</code>" }
  source.gsub!(AUTOLINK_RE) do
    prefix, url, suffix = Regexp.last_match.captures
    "#{prefix}\"#{url}\":#{url}#{suffix}"
  end

  source.split(/\n/).map { |line| line.rstrip }
end
117160
end
118161

119-
chapter_num = sprintf("%02d", ARGV[0].to_i)
162+
dst_lang_file_name = "../#{dst_lang}/#{sprintf(Languages[dst_lang][:chapter_name], chapter_num)}"
163+
src_lang_file_name = "../#{src_lang}/#{sprintf(Languages[src_lang][:chapter_name], chapter_num)}"
120164

121-
en_file_name = "../en/chapter#{chapter_num}.txt"
122-
ja_file_name = "../ja/chapter#{chapter_num}.txt"
123-
# if the English file does not exist yet, just use the Japanese one as source
124-
if File.exists?(en_file_name)
125-
blocks_en = Blocks.new(en_file_name)
165+
blocks_src_lang = Blocks.new(src_lang_file_name, src_lang, false)
166+
# if the file in the destination language does not exist yet, just use the one in the source language as source
167+
if File.exists?(dst_lang_file_name)
168+
blocks_dst_lang = Blocks.new(dst_lang_file_name, dst_lang, true)
126169
else
127-
$tags['translated by'] = '(not translated yet)'
128-
blocks_en = Blocks.new(ja_file_name)
170+
puts "warning: the translation is not available for this chapter"
171+
blocks_dst_lang = Blocks.new(src_lang_file_name, src_lang, false)
172+
$tags['translated by'] = Languages[dst_lang][:not_translated]
129173
end
130-
blocks_ja = Blocks.new(ja_file_name)
131-
132-
BLOCK_REGROUPING_RE = /^(h[1-9]\.|<pre|▼)/
133174

175+
# the following code tries to end up with the same number of text blocks in each language
176+
# it searches for anchors (defined by the BLOCK_REGROUPING_RE regexp) and tries to align the anchors in both languages
134177
i = 0
135178
regroup_pos = 0
136-
while i < blocks_ja.length and i < blocks_en.length
137-
block_ja = blocks_ja[i]
138-
block_en = blocks_en[i]
139-
if BLOCK_REGROUPING_RE.match(block_ja)
140-
if BLOCK_REGROUPING_RE.match(block_en)
141-
regroup_pos = i
142-
i += 1
179+
while i < blocks_src_lang.length and i < blocks_dst_lang.length
180+
block_src_lang = blocks_src_lang[i]
181+
block_dst_lang = blocks_dst_lang[i]
182+
if md_src = BLOCK_REGROUPING_RE.match(block_src_lang)
183+
if md_dst = BLOCK_REGROUPING_RE.match(block_dst_lang)
184+
if md_src[0] != md_dst[0]
185+
# if the anchors found at the current position are different in the two languages,
186+
# we search for the next anchor to know which side is the more likely to need a regroupment
187+
next_md_src = nil
188+
next_md_dst = nil
189+
blocks_src_lang.each_from(i+1) { |block| break if next_md_src = BLOCK_REGROUPING_RE.match(block) }
190+
blocks_dst_lang.each_from(i+1) { |block| break if next_md_dst = BLOCK_REGROUPING_RE.match(block) }
191+
if next_md_src and next_md_src[0] == md_dst[0]
192+
blocks_src_lang.regroup_with_following(regroup_pos)
193+
elsif next_md_dst and next_md_dst[0] == md_src[0]
194+
blocks_dst_lang.regroup_with_following(regroup_pos)
195+
else
196+
i += 1
197+
regroup_pos = i
198+
end
199+
else
200+
i += 1
201+
regroup_pos = i
202+
end
143203
else
144-
blocks_en.regroup_with_following(regroup_pos)
204+
blocks_dst_lang.regroup_with_following(regroup_pos)
145205
end
146-
elsif BLOCK_REGROUPING_RE.match(block_en)
147-
blocks_ja.regroup_with_following(regroup_pos)
206+
elsif md_dst = BLOCK_REGROUPING_RE.match(block_dst_lang)
207+
blocks_src_lang.regroup_with_following(regroup_pos)
148208
else
149209
i += 1
150210
end
151211
end
152212

153213
# regroup the last blocks to have the same number of blocks in both
154-
blocks_en.regroup_with_following(blocks_en.length-2) while blocks_ja.length < blocks_en.length
155-
blocks_ja.regroup_with_following(blocks_ja.length-2) while blocks_en.length < blocks_ja.length
214+
blocks_dst_lang.regroup_with_following(blocks_dst_lang.length-2) while blocks_src_lang.length < blocks_dst_lang.length
215+
blocks_src_lang.regroup_with_following(blocks_src_lang.length-2) while blocks_dst_lang.length < blocks_src_lang.length
156216

157-
blocks_en.each do |b|
217+
blocks_dst_lang.each do |b|
158218
if md = /h1\.\s*(.+)$/.match(b)
159-
$tags['title'] = md[1].gsub(/(<[^>]*>|`)/, '') # remove markup and backquotes from the title
219+
$tags['title'] = md[1].gsub(/(<[^>]*>|`)/, '') # remove markup and backquotes from the title
160220
break
161-
end
221+
end
162222
end
163223
if not $tags['title']
164-
STDERR.puts "error: no h1 section in source file"
224+
STDERR.puts "error: no h1 section in the file in the destination language"
165225
return
166226
end
167227

168-
File.open("chapter#{chapter_num}.txt", "w") do |f|
169-
f.puts "<table>"
170-
blocks_ja.length.times do |i|
171-
f.puts "<tr><td>"
172-
f.puts
173-
f.puts blocks_en[i]
174-
f.puts
175-
f.puts "</td><td>"
176-
f.puts
177-
f.puts blocks_ja[i]
178-
f.puts
179-
f.print "</td></tr>"
180-
end
181-
f.puts
182-
f.puts "</table>"
183-
end
184-
185-
FOOTER = <<EOS
186-
<hr>
228+
base_file_name = sprintf("chapter%02d_#{src_lang}_#{dst_lang}", chapter_num)
229+
html_file = "#{base_file_name}.html"
230+
redcloth_file = "#{base_file_name}.redcloth.txt"
187231

188-
The original work is Copyright &copy; 2002 - 2004 Minero AOKI.<br />
189-
Translated by #{$tags['translated by']}<br />
190-
<a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/"><img alt="Creative Commons License" border="0" src="images/somerights20.png"/></a><br/>This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/2.5/">Creative Commons Attribution-NonCommercial-ShareAlike2.5 License</a>.
232+
redcloth_text = '<table>'
233+
blocks_src_lang.length.times do |i|
234+
redcloth_text << "<tr><td>\n\n#{blocks_dst_lang[i]}\n\n</td>"
235+
redcloth_text << "<td>\n\n#{blocks_src_lang[i]}\n\n</td></tr>\n"
236+
end
237+
redcloth_text << "\n</table>\n"
191238

192-
</body>
193-
</html>
194-
EOS
239+
File.open(redcloth_file, "w") do |f| f.puts redcloth_text end
195240

196-
RedClothRules = [ :textile ]
241+
r = RedCloth.new(redcloth_text)
197242

198-
generate_html("chapter#{chapter_num}.html", "chapter#{chapter_num}.txt")
243+
File.open(html_file, 'w') do |io|
244+
puts "Generating '#{$tags['title']}' - #{html_file}..."
245+
io.write(replace_tags(HEADER))
246+
io.write(r.to_html)
247+
io.write(replace_tags(FOOTER))
248+
end

0 commit comments

Comments
 (0)