Skip to content

Commit a6d6003

Browse files
committed
Basic diffing is complete.
1 parent 1101e2e commit a6d6003

12 files changed

+240
-19
lines changed

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ pkg
33
*.png
44
*.wmf
55
*.emf
6+
~*

Diff for: Gemfile

+1
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ gem 'htmlentities'
66
gem 'roman-numerals'
77
gem 'rmagick'
88
gem 'ruby-prof'
9+
gem 'diff-lcs'

Diff for: examples/image2.docx

188 KB
Binary file not shown.

Diff for: examples/test1.docx

13 KB
Binary file not shown.

Diff for: examples/test2.docx

13 KB
Binary file not shown.

Diff for: examples/test3.docx

13 KB
Binary file not shown.

Diff for: examples/test4.docx

13.4 KB
Binary file not shown.

Diff for: lib/ydocx/builder.rb

+11-10
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@
88
module YDocx
99
class Builder
1010
include MarkupMethod
11-
attr_accessor :contents, :files, :style, :title
11+
attr_accessor :contents, :style, :title
1212
def initialize(contents)
1313
@contents = contents
14-
@files = Pathname.new('.')
1514
@style = false
1615
@title = ''
1716
init
@@ -99,11 +98,7 @@ def build_tag(tag, content, attributes, mode=:html)
9998
unless attributes.empty?
10099
attributes.each_pair do |key, value|
101100
next if mode == :xml and key.to_s =~ /(id|style|colspan)/u
102-
if tag == :img and key == :src
103-
_attributes << " src=\"#{resolve_path(value.to_s)}\""
104-
else
105-
_attributes << " #{key.to_s}=\"#{value.to_s}\""
106-
end
101+
_attributes << " #{key.to_s}=\"#{value.to_s}\""
107102
end
108103
end
109104
if mode == :xml
@@ -134,12 +129,18 @@ def style
134129
}
135130
td {
136131
padding: 5px 10px;
132+
}
133+
td.add {
134+
background: rgba(124,252,0,0.3);
135+
}
136+
td.delete {
137+
background: rgba(255,0,0,0.1);
138+
}
139+
td.modify {
140+
background: rgba(0,0,255,0.1);
137141
}
138142
CSS
139143
style.gsub(/\s\s+|\n/, ' ')
140144
end
141-
def resolve_path(path)
142-
@files.join path
143-
end
144145
end
145146
end

Diff for: lib/ydocx/command.rb

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# encoding: utf-8
33

44
require 'ydocx'
5+
require 'ydocx/differ'
56

67
module YDocx
78
class Command
@@ -72,9 +73,8 @@ def run_diff
7273
files << path
7374
end
7475
end
75-
docs = files.map do |file|
76-
puts YDocx::Document.open(file).contents.hash
77-
end
76+
docs = files.map { |f| YDocx::Document.open(f) }
77+
puts YDocx::Differ.diff(*docs)
7878
end
7979
end
8080
def version

Diff for: lib/ydocx/differ.rb

+151
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env ruby
2+
# encoding: utf-8
3+
4+
require 'ydocx'
5+
require 'diff/lcs'
6+
7+
module YDocx
  # Builds a side-by-side HTML comparison of two parsed documents.
  #
  # Blocks of both documents are aligned with a weighted longest-common-
  # subsequence pass; unmatched blocks are grouped into "add", "delete" or
  # "modify" rows of a two-column table, which is then rendered by Builder.
  class Differ
    # Two blocks (or two groups of unmatched blocks) are treated as versions
    # of the same content only when their similarity exceeds this ratio.
    SIMILARITY_THRESHOLD = 0.5

    # When the number of block pairs exceeds this limit, pairs are compared
    # with plain equality instead of the more expensive fuzzy similarity.
    MAX_FUZZY_PAIRS = 1000

    # Similarity ratio between two lists of chunks.
    #
    # Each chunk is joined into a single string; the ratio is
    # 2 * (characters in the LCS of those strings) / (total characters).
    # Note that the LCS is taken over whole chunk strings, not characters.
    #
    # chunk1, chunk2 - Arrays of chunks; each chunk must respond to #join.
    #
    # Returns a Float in 0.0..1.0 (1.0 when both sides contain no text).
    def self.chunk_similarity(chunk1, chunk2)
      text1 = chunk1.map { |c| c.join('') }
      text2 = chunk2.map { |c| c.join('') }
      total = (text1 + text2).inject(0) { |acc, text| acc + text.length }
      return 1.0 if total == 0
      common = Diff::LCS.LCS(text1, text2).inject(0) { |acc, text| acc + text.length }
      2.0 * common / total
    end

    # Similarity ratio between two blocks (paragraphs, tables).
    #
    # Blocks of different classes are never similar. Paragraphs are compared
    # through their chunks. Two tables are always reported as fully similar;
    # their contents are not inspected here.
    #
    # Returns a Float in 0.0..1.0.
    def self.block_similarity(block1, block2)
      return 0.0 if block1.class != block2.class
      case block1
      when Paragraph
        chunk_similarity(block1.get_chunks, block2.get_chunks)
      when Table
        1.0
      else
        0.0
      end
    end

    # Concatenates the chunks of several blocks into one list, inserting a
    # single carriage-return run between consecutive blocks so that block
    # boundaries take part in the comparison.
    #
    # NOTE(review): every element must respond to #get_chunks. The grouping
    # in .diff can pass unmatched Table blocks here as well -- confirm that
    # Table implements #get_chunks.
    #
    # paragraphs - Array of blocks responding to #get_chunks.
    #
    # Returns an Array of chunks.
    def self.get_chunks(paragraphs)
      chunks = []
      paragraphs.each_with_index do |paragraph, index|
        chunks << [Run.new("\r", Style.new)] if index > 0
        chunks.concat(paragraph.get_chunks)
      end
      chunks
    end

    # Compares two documents and renders the result as an HTML page.
    #
    # doc1, doc2 - Documents responding to #contents (with #blocks),
    #              #images and #create_files.
    #
    # Returns whatever Builder#build_html returns for the diff table.
    def self.diff(doc1, doc2)
      blocks1 = doc1.contents.blocks
      blocks2 = doc2.contents.blocks

      actions = alignment_actions(blocks1, blocks2)
      groups = group_changes(blocks1, blocks2, actions)
      table = build_table(groups)

      # Image files are only written for documents that contain images.
      [doc1, doc2].each do |doc|
        doc.create_files unless doc.images.empty?
      end

      html_doc = ParsedDocument.new
      html_doc.blocks << table
      builder = Builder.new(html_doc)
      builder.title = 'Diff Results'
      builder.style = true
      builder.build_html
    end

    # Runs the O(n*m) weighted LCS over both block lists.
    #
    # Returns a (n+1) x (m+1) table of actions for walking the alignment
    # from [0][0]: 0 = take the left block alone, 1 = take the right block
    # alone, 2 = pair both blocks, -1 = border cell (never computed).
    def self.alignment_actions(blocks1, blocks2)
      n = blocks1.length
      m = blocks2.length
      exact_only = n * m > MAX_FUZZY_PAIRS
      score = Array.new(n + 1) { Array.new(m + 1, 0) }
      actions = Array.new(n + 1) { Array.new(m + 1, -1) }

      # Filled back to front so every cell only depends on finished cells.
      (n - 1).downto(0) do |i|
        (m - 1).downto(0) do |j|
          left = blocks1[i]
          right = blocks2[j]
          sim = if exact_only
                  left == right ? 1 : 0
                else
                  block_similarity(left, right)
                end

          score[i][j] = score[i + 1][j]
          actions[i][j] = 0
          if score[i][j + 1] > score[i][j]
            score[i][j] = score[i][j + 1]
            actions[i][j] = 1
          end
          if sim > SIMILARITY_THRESHOLD && score[i + 1][j + 1] + sim > score[i][j]
            score[i][j] = score[i + 1][j + 1] + sim
            actions[i][j] = 2
          end
        end
      end
      actions
    end

    # Walks the action table and collects [left_blocks, right_blocks] rows.
    # Unmatched blocks are buffered until the next matched pair (or the end
    # of both lists) and then emitted by flush_pending.
    def self.group_changes(blocks1, blocks2, actions)
      n = blocks1.length
      m = blocks2.length
      i = 0
      j = 0
      pending_left = []
      pending_right = []
      groups = []

      while i < n || j < m
        if j == m || actions[i][j] == 0
          pending_left << blocks1[i]
          i += 1
        elsif i == n || actions[i][j] == 1
          pending_right << blocks2[j]
          j += 1
        else
          flush_pending(groups, pending_left, pending_right)
          # Fresh buffers: the flushed arrays are now owned by +groups+.
          pending_left = []
          pending_right = []
          groups << [[blocks1[i]], [blocks2[j]]]
          i += 1
          j += 1
        end
      end
      flush_pending(groups, pending_left, pending_right)
      groups
    end

    # Appends the buffered unmatched blocks to +groups+.
    #
    # A one-sided buffer becomes a single row. A two-sided buffer stays one
    # row when both sides are similar enough to count as a modification;
    # otherwise it is split into a delete-only row and an add-only row.
    def self.flush_pending(groups, left, right)
      return if left.empty? && right.empty?
      if left.empty? || right.empty? ||
         chunk_similarity(get_chunks(left), get_chunks(right)) > SIMILARITY_THRESHOLD
        groups << [left, right]
      else
        groups << [left, []]
        groups << [[], right]
      end
    end

    # Turns the grouped rows into a two-column Table whose cells carry the
    # CSS class 'add', 'delete' or 'modify' (unchanged rows get no class).
    def self.build_table(groups)
      table = Table.new
      groups.each do |left, right|
        left_cell = Cell.new
        left_cell.blocks = left
        right_cell = Cell.new
        right_cell.blocks = right
        if left.empty?
          right_cell.class = 'add'
        elsif right.empty?
          left_cell.class = 'delete'
        elsif left != right
          left_cell.class = right_cell.class = 'modify'
        end
        table.cells << [left_cell, right_cell]
      end
      table
    end

    private_class_method :alignment_actions, :group_changes,
                         :flush_pending, :build_table
  end
end

Diff for: lib/ydocx/document.rb

+2-3
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ def to_html(output=false)
3737
files = output_directory
3838
@builder = Builder.new(@contents) do |builder|
3939
builder.title = @path.basename
40-
builder.files = files.relative_path_from(files.dirname)
4140
builder.style = true
4241
html = builder.build_html
4342
end
@@ -64,7 +63,6 @@ def to_xml(output=false)
6463
end
6564
xml
6665
end
67-
private
6866
def create_files
6967
files_dir = output_directory
7068
mkdir Pathname.new(files_dir) unless files_dir.exist?
@@ -76,6 +74,7 @@ def create_files
7674
organize_image(origin_path, source_path, image[:data])
7775
end
7876
end
77+
private
7978
def organize_image(origin_path, source_path, data)
8079
if source_path.extname != origin_path.extname # convert
8180
output_file = output_directory.join(source_path)
@@ -118,7 +117,7 @@ def read(file)
118117
end
119118
end
120119
rel = @zip.find_entry('word/_rels/document.xml.rels').get_input_stream
121-
@parser = Parser.new(doc, rel, rel_files) do |parser|
120+
@parser = Parser.new(doc, rel, rel_files, output_directory) do |parser|
122121
@contents = parser.parse
123122
@images = parser.images
124123
end

0 commit comments

Comments
 (0)