root/trunk/lib/diff.rb

Revision 354, 10.0 kB (checked in by gaspard, 2 years ago)

More work to test multi sites.

Line 
1 # code taken from Instiki (http://www.instiki.org)
2 module HTMLDiff
3
4   Match = Struct.new(:start_in_old, :start_in_new, :size)
5   class Match
6     def end_in_old
7       self.start_in_old + self.size
8     end
9    
10     def end_in_new
11       self.start_in_new + self.size
12     end
13   end
14  
15   Operation = Struct.new(:action, :start_in_old, :end_in_old, :start_in_new, :end_in_new)
16
17   class DiffBuilder
18
19     def initialize(old_version, new_version)
20       @old_version, @new_version = old_version, new_version
21       @content = []
22     end
23
24     def build
25       split_inputs_to_words
26       index_new_words
27       operations.each { |op| perform_operation(op) }
28       return @content.join
29     end
30
31     def split_inputs_to_words
32       @old_words = convert_html_to_list_of_words(explode(@old_version))
33       @new_words = convert_html_to_list_of_words(explode(@new_version))
34     end
35
36     def index_new_words
37       @word_indices = Hash.new { |h, word| h[word] = [] }
38       @new_words.each_with_index { |word, i| @word_indices[word] << i }
39     end
40
41     def operations
42       position_in_old = position_in_new = 0
43       operations = []
44      
45       matches = matching_blocks
46       # an empty match at the end forces the loop below to handle the unmatched tails
47       # I'm sure it can be done more gracefully, but not at 23:52
48       matches << Match.new(@old_words.length, @new_words.length, 0)
49      
50       matches.each_with_index do |match, i|
51         match_starts_at_current_position_in_old = (position_in_old == match.start_in_old)
52         match_starts_at_current_position_in_new = (position_in_new == match.start_in_new)
53        
54         action_upto_match_positions =
55           case [match_starts_at_current_position_in_old, match_starts_at_current_position_in_new]
56           when [false, false]
57             :replace
58           when [true, false]
59             :insert
60           when [false, true]
61             :delete
62           else
63             # this happens if the first few words are same in both versions
64             :none
65           end
66
67         if action_upto_match_positions != :none
68           operation_upto_match_positions =
69               Operation.new(action_upto_match_positions,
70                   position_in_old, match.start_in_old,
71                   position_in_new, match.start_in_new)
72           operations << operation_upto_match_positions
73         end
74         if match.size != 0
75           match_operation = Operation.new(:equal,
76               match.start_in_old, match.end_in_old,
77               match.start_in_new, match.end_in_new)
78           operations << match_operation
79         end
80
81         position_in_old = match.end_in_old
82         position_in_new = match.end_in_new
83       end
84      
85       operations
86     end
87
88     def matching_blocks
89       matching_blocks = []
90       recursively_find_matching_blocks(0, @old_words.size, 0, @new_words.size, matching_blocks)
91       matching_blocks
92     end
93
94     def recursively_find_matching_blocks(start_in_old, end_in_old, start_in_new, end_in_new, matching_blocks)
95       match = find_match(start_in_old, end_in_old, start_in_new, end_in_new)
96       if match
97         if start_in_old < match.start_in_old and start_in_new < match.start_in_new
98           recursively_find_matching_blocks(
99               start_in_old, match.start_in_old, start_in_new, match.start_in_new, matching_blocks)
100         end
101         matching_blocks << match
102         if match.end_in_old < end_in_old and match.end_in_new < end_in_new
103           recursively_find_matching_blocks(
104               match.end_in_old, end_in_old, match.end_in_new, end_in_new, matching_blocks)
105         end
106       end
107     end
108
109     def find_match(start_in_old, end_in_old, start_in_new, end_in_new)
110
111       best_match_in_old = start_in_old
112       best_match_in_new = start_in_new
113       best_match_size = 0
114      
115       match_length_at = Hash.new { |h, index| h[index] = 0 }
116      
117       start_in_old.upto(end_in_old - 1) do |index_in_old|
118
119         new_match_length_at = Hash.new { |h, index| h[index] = 0 }
120
121         @word_indices[@old_words[index_in_old]].each do |index_in_new|
122           next  if index_in_new < start_in_new
123           break if index_in_new >= end_in_new
124
125           new_match_length = match_length_at[index_in_new - 1] + 1
126           new_match_length_at[index_in_new] = new_match_length
127
128           if new_match_length > best_match_size
129             best_match_in_old = index_in_old - new_match_length + 1
130             best_match_in_new = index_in_new - new_match_length + 1
131             best_match_size = new_match_length
132           end
133         end
134         match_length_at = new_match_length_at
135       end
136
137 #      best_match_in_old, best_match_in_new, best_match_size = add_matching_words_left(
138 #          best_match_in_old, best_match_in_new, best_match_size, start_in_old, start_in_new)
139 #      best_match_in_old, best_match_in_new, match_size = add_matching_words_right(
140 #          best_match_in_old, best_match_in_new, best_match_size, end_in_old, end_in_new)
141
142       return (best_match_size != 0 ? Match.new(best_match_in_old, best_match_in_new, best_match_size) : nil)
143     end
144
145     def add_matching_words_left(match_in_old, match_in_new, match_size, start_in_old, start_in_new)
146       while match_in_old > start_in_old and
147             match_in_new > start_in_new and
148             @old_words[match_in_old - 1] == @new_words[match_in_new - 1]
149         match_in_old -= 1
150         match_in_new -= 1
151         match_size += 1
152       end
153       [match_in_old, match_in_new, match_size]
154     end
155
156     def add_matching_words_right(match_in_old, match_in_new, match_size, end_in_old, end_in_new)
157       while match_in_old + match_size < end_in_old and
158             match_in_new + match_size < end_in_new and
159             @old_words[match_in_old + match_size] == @new_words[match_in_new + match_size]
160         match_size += 1
161       end
162       [match_in_old, match_in_new, match_size]
163     end
164    
165     VALID_METHODS = [:replace, :insert, :delete, :equal]
166
167     def perform_operation(operation)
168       @operation = operation
169       self.send operation.action, operation
170     end
171
172     def replace(operation)
173       delete(operation, 'diffmod')
174       insert(operation, 'diffmod')
175     end
176    
177     def insert(operation, tagclass = 'diffins')
178       insert_tag('ins', tagclass, @new_words[operation.start_in_new...operation.end_in_new])
179     end
180    
181     def delete(operation, tagclass = 'diffdel')
182        insert_tag('del', tagclass, @old_words[operation.start_in_old...operation.end_in_old])
183     end
184    
185     def equal(operation)
186       # no tags to insert, simply copy the matching words from one of the versions
187       @content += @new_words[operation.start_in_new...operation.end_in_new]
188     end
189  
190     def opening_tag?(item)
191       item =~ %r!^\s*<[^>]+>\s*$!
192     end
193
194     def closing_tag?(item)
195       item =~ %r!^\s*</[^>]+>\s*$!
196     end
197
198     def tag?(item)
199       opening_tag?(item) or closing_tag?(item)
200     end
201
202     def extract_consecutive_words(words, &condition)
203       index_of_first_tag = nil
204       words.each_with_index do |word, i|
205         if !condition.call(word)
206           index_of_first_tag = i
207           break
208         end
209       end
210       if index_of_first_tag
211         return words.slice!(0...index_of_first_tag)
212       else
213         return words.slice!(0..words.length)
214       end
215     end
216
217     # This method encloses words within a specified tag (ins or del), and adds this into @content,
218     # with a twist: if there are words contain tags, it actually creates multiple ins or del,
219     # so that they don't include any ins or del. This handles cases like
220     # old: '<p>a</p>'
221     # new: '<p>ab</p><p>c</b>'
222     # diff result: '<p>a<ins>b</ins></p><p><ins>c</ins></p>'
223     # this still doesn't guarantee valid HTML (hint: think about diffing a text containing ins or
224     # del tags), but handles correctly more cases than the earlier version.
225     #
226     # P.S.: Spare a thought for people who write HTML browsers. They live in this ... every day.
227
228     def insert_tag(tagname, cssclass, words)
229       loop do
230         break if words.empty?
231         non_tags = extract_consecutive_words(words) { |word| not tag?(word) }
232         @content << wrap_text(non_tags.join, tagname, cssclass) unless non_tags.empty?
233
234         break if words.empty?
235         @content += extract_consecutive_words(words) { |word| tag?(word) }
236       end
237     end
238
239     def wrap_text(text, tagname, cssclass)
240       %(<#{tagname} class="#{cssclass}">#{text}</#{tagname}>)
241     end
242
243     def explode(sequence)
244       sequence.is_a?(String) ? sequence.split(//) : sequence
245     end
246  
247     def end_of_tag?(char)
248       char == '>'
249     end
250  
251     def start_of_tag?(char)
252       char == '<'
253     end
254    
255     def whitespace?(char)
256       char =~ /\s/
257     end
258  
259     def convert_html_to_list_of_words(x, use_brackets = false)
260       mode = :char
261       current_word  = ''
262       words = []
263      
264       explode(x).each do |char|
265         case mode
266         when :tag
267           if end_of_tag? char
268             current_word << (use_brackets ? ']' : '>')
269             words << current_word
270             current_word = ''
271             if whitespace?(char)
272               mode = :whitespace
273             else
274               mode = :char
275             end
276           else
277             current_word << char
278           end
279         when :char
280           if start_of_tag? char
281             words << current_word unless current_word.empty?
282             current_word = (use_brackets ? '[' : '<')
283             mode = :tag
284           elsif /\s/.match char
285             words << current_word unless current_word.empty?
286             current_word = char
287             mode = :whitespace
288           else
289             current_word << char
290           end
291         when :whitespace
292           if start_of_tag? char
293             words << current_word unless current_word.empty?
294             current_word = (use_brackets ? '[' : '<')
295             mode = :tag
296           elsif /\s/.match char
297             current_word << char
298           else
299             words << current_word unless current_word.empty?
300             current_word = char
301             mode = :char
302           end
303         else
304           raise "Unknown mode #{mode.inspect}"
305         end
306       end
307       words << current_word unless current_word.empty?
308       words
309     end
310
311   end # of class Diff Builder
312  
313   def self.diff(a, b)
314     DiffBuilder.new(a, b).build
315   end
316
317 end
Note: See TracBrowser for help on using the browser.