|
# This script automatically converts the HTML pages
# of the Ruby Hacking Guide to the textile-based text
# format used for the translation.
# It's far from perfect and won't be useful in any other
# case because it depends on the way these HTML pages are
# written, but it's still better than doing this by hand.
#
# Usage: ruby <this script> input_html output_txt

# $KCODE is only meaningful on Ruby 1.8; assigning it on later versions
# has no effect, so it is skipped there.
$KCODE = 'u' if RUBY_VERSION < '1.9'

# Characters that the entity names matched by the first substitution stand for.
CODE_ENTITY_CHARS = { 'amp' => '&', 'gt' => '>', 'lt' => '<' }.freeze

# Converts one HTML page (already in UTF-8) to the textile-based format.
#
# The conversion is purely regexp-driven and runs in three passes:
# 1. <code> tags become backquotes, and &amp; / &gt; / &lt; are decoded
#    while inside a <pre> or <code> element;
# 2. <li> becomes '# ' or '* ' depending on the last list tag seen;
# 3. a fixed table of substitutions strips or rewrites the remaining tags.
#
# @param html [String] the full HTML document
# @return [String] the converted text (the argument is not modified)
def html_to_textile(html)
  in_code = false
  text = html.gsub(%r{&(amp|gt|lt);|</?pre\b|</?code>}) do |match|
    # Group 1 is only set when an entity (not a tag) was matched.
    entity = Regexp.last_match(1)
    if entity
      # replaces &xxx; in code and pre blocks, leaves it alone elsewhere
      in_code ? CODE_ENTITY_CHARS[entity] : match
    else
      # An opening tag enters code mode, a closing tag ('</...') leaves it.
      in_code = (match[1, 1] != '/')
      match.include?('code') ? '`' : match
    end
  end

  # different types of list: remember the last <ul>/<ol> seen so that
  # each <li> gets the matching bullet
  list_type = nil
  text = text.gsub(/<(ul|ol|li)>/) do |tag|
    if tag == '<li>'
      list_type == '<ol>' ? '# ' : '* '
    else
      list_type = tag
      ''
    end
  end

  [
    [ /.*?<body>(.*?)<\/body>.*/m, '\1' ], # we only want the body
    [ /<\/?(table|p( class=".+?")?)>|<\/(li|h\d|ol|ul)>/, '' ], # remove useless tags
    [ /▼/, '▼ ' ], # just add a space after the arrow
    [ /<h(\d)>/, 'h\1. ' ], # headers
    [ /<a href="(.+?)">(.+?)<\/a>/m, '"\2":\1' ], # links
    # NOTE(review): the last two alternatives match opening <td> tags rather
    # than </td>; confirm this is what the source pages actually contain.
    [ /<tr><td>|<td><td>|<td><\/tr>/, '|' ], # tables
    [ /<img src="(.+?)" alt=".+?"><br>\n図\d+: (.*)/, '!\1(\2)!' ], # images and captions
    [ /[ \t]+$/, '' ], # trims line ends
    [ /\A\n+|\n+\Z/, '' ], # remove beginning and ending empty lines
    [ /\n\n+/, "\n\n" ], # succession of empty lines
  ].each { |pattern, replacement| text = text.gsub(pattern, replacement) }

  text
end

if ARGV.length != 2
  puts "syntax: #{$0} input_html output_txt"
  exit
end

require 'nkf'

# read the file, convert it to UTF-8 and transform full width characters into ASCII
html = NKF.nkf('-w -Z', File.read(ARGV[0]))

# Convert before opening the output so a failure does not leave a truncated file.
textile = html_to_textile(html)

File.open(ARGV[1], 'w') { |output| output.puts(textile) }
0 commit comments