1 | /** |
2 | * A class to compare vectors of objects. The result of comparison |
3 | * is a list of <code>change</code> objects which form an |
4 | * edit script. The objects compared are traditionally lines |
5 | * of text from two files. Comparison options such as "ignore |
6 | * whitespace" are implemented by modifying the <code>equals</code> |
7 | * and <code>hashcode</code> methods for the objects compared. |
8 | * <p/> |
9 | * The basic algorithm is described in: </br> |
10 | * "An O(ND) Difference Algorithm and its Variations", Eugene Myers, |
11 | * Algorithmica Vol. 1 No. 2, 1986, p 251. |
12 | * <p/> |
13 | * This class outputs different results from GNU diff 1.15 on some |
14 | * inputs. Our results are actually better (smaller change list, smaller |
15 | * total size of changes), but it would be nice to know why. Perhaps |
16 | * there is a memory overwrite bug in GNU diff 1.15. |
17 | * |
18 | * @author Stuart D. Gathman, translated from GNU diff 1.15 |
19 | * Copyright (C) 2000 Business Management Systems, Inc. |
20 | * <p/> |
21 | * This program is free software; you can redistribute it and/or modify |
22 | * it under the terms of the GNU General Public License as published by |
23 | * the Free Software Foundation; either version 1, or (at your option) |
24 | * any later version. |
25 | * <p/> |
26 | * This program is distributed in the hope that it will be useful, |
27 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
28 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
29 | * GNU General Public License for more details. |
30 | * <p/> |
31 | * You should have received a copy of the <a href=COPYING.txt> |
32 | * GNU General Public License</a> |
33 | * along with this program; if not, write to the Free Software |
34 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
35 | */ |
36 | |
37 | static class EGDiff { |
38 | |
39 | /** |
40 | * Prepare to find differences between two arrays. Each element of |
41 | * the arrays is translated to an "equivalence number" based on |
42 | * the result of <code>equals</code>. The original Object arrays |
43 | * are no longer needed for computing the differences. They will |
44 | * be needed again later to print the results of the comparison as |
45 | * an edit script, if desired. |
46 | */ |
47 | public EGDiff(Object[] a, Object[] b) { |
48 | Hashtable h = new Hashtable(a.length + b.length); |
49 | filevec[0] = new file_data(a, h); |
50 | filevec[1] = new file_data(b, h); |
51 | } |
52 | |
53 | /** |
54 | * 1 more than the maximum equivalence value used for this or its |
55 | * sibling file. |
56 | */ |
57 | private int equiv_max = 1; |
58 | |
59 | /** |
60 | * When set to true, the comparison uses a heuristic to speed it up. |
61 | * With this heuristic, for files with a constant small density |
62 | * of changes, the algorithm is linear in the file size. |
63 | */ |
64 | public boolean heuristic = false; |
65 | |
66 | /** |
67 | * When set to true, the algorithm returns a guarranteed minimal |
68 | * set of changes. This makes things slower, sometimes much slower. |
69 | */ |
70 | public boolean no_discards = false; |
71 | |
72 | private int[] xvec, yvec; /* Vectors being compared. */ |
73 | private int[] fdiag; /* Vector, indexed by diagonal, containing |
74 | the X coordinate of the point furthest |
75 | along the given diagonal in the forward |
76 | search of the edit matrix. */ |
77 | private int[] bdiag; /* Vector, indexed by diagonal, containing |
78 | the X coordinate of the point furthest |
79 | along the given diagonal in the backward |
80 | search of the edit matrix. */ |
81 | private int fdiagoff, bdiagoff; |
82 | private final file_data[] filevec = new file_data[2]; |
83 | private int cost; |
84 | |
85 | /** |
86 | * Find the midpoint of the shortest edit script for a specified |
87 | * portion of the two files. |
88 | * <p/> |
89 | * We scan from the beginnings of the files, and simultaneously from the ends, |
90 | * doing a breadth-first search through the space of edit-sequence. |
91 | * When the two searches meet, we have found the midpoint of the shortest |
92 | * edit sequence. |
93 | * <p/> |
94 | * The value returned is the number of the diagonal on which the midpoint lies. |
95 | * The diagonal number equals the number of inserted lines minus the number |
96 | * of deleted lines (counting only lines before the midpoint). |
97 | * The edit cost is stored into COST; this is the total number of |
98 | * lines inserted or deleted (counting only lines before the midpoint). |
99 | * <p/> |
100 | * This function assumes that the first lines of the specified portions |
101 | * of the two files do not match, and likewise that the last lines do not |
102 | * match. The caller must trim matching lines from the beginning and end |
103 | * of the portions it is going to specify. |
104 | * <p/> |
105 | * Note that if we return the "wrong" diagonal value, or if |
106 | * the value of bdiag at that diagonal is "wrong", |
107 | * the worst this can do is cause suboptimal diff output. |
108 | * It cannot cause incorrect diff output. |
109 | */ |
110 | |
111 | private int diag(int xoff, int xlim, int yoff, int ylim) { |
112 | final int[] fd = fdiag; // Give the compiler a chance. |
113 | final int[] bd = bdiag; // Additional help for the compiler. |
114 | final int[] xv = xvec; // Still more help for the compiler. |
115 | final int[] yv = yvec; // And more and more . . . |
116 | final int dmin = xoff - ylim; // Minimum valid diagonal. |
117 | final int dmax = xlim - yoff; // Maximum valid diagonal. |
118 | final int fmid = xoff - yoff; // Center diagonal of top-down search. |
119 | final int bmid = xlim - ylim; // Center diagonal of bottom-up search. |
120 | int fmin = fmid, fmax = fmid; // Limits of top-down search. |
121 | int bmin = bmid, bmax = bmid; // Limits of bottom-up search. |
122 | /* True if southeast corner is on an odd |
123 | diagonal with respect to the northwest. */ |
124 | final boolean odd = (fmid - bmid & 1) != 0; |
125 | |
126 | fd[fdiagoff + fmid] = xoff; |
127 | bd[bdiagoff + bmid] = xlim; |
128 | |
129 | for (int c = 1; ; ++c) { |
130 | int d; /* Active diagonal. */ |
131 | boolean big_snake = false; |
132 | |
133 | /* Extend the top-down search by an edit step in each diagonal. */ |
134 | if (fmin > dmin) |
135 | fd[fdiagoff + --fmin - 1] = -1; |
136 | else |
137 | ++fmin; |
138 | if (fmax < dmax) |
139 | fd[fdiagoff + ++fmax + 1] = -1; |
140 | else |
141 | --fmax; |
142 | for (d = fmax; d >= fmin; d -= 2) { |
143 | int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1]; |
144 | |
145 | if (tlo >= thi) |
146 | x = tlo + 1; |
147 | else |
148 | x = thi; |
149 | oldx = x; |
150 | y = x - d; |
151 | while (x < xlim && y < ylim && xv[x] == yv[y]) { |
152 | ++x; |
153 | ++y; |
154 | } |
155 | if (x - oldx > 20) |
156 | big_snake = true; |
157 | fd[fdiagoff + d] = x; |
158 | if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { |
159 | cost = 2 * c - 1; |
160 | return d; |
161 | } |
162 | } |
163 | |
164 | /* Similar extend the bottom-up search. */ |
165 | if (bmin > dmin) |
166 | bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE; |
167 | else |
168 | ++bmin; |
169 | if (bmax < dmax) |
170 | bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE; |
171 | else |
172 | --bmax; |
173 | for (d = bmax; d >= bmin; d -= 2) { |
174 | int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1]; |
175 | |
176 | if (tlo < thi) |
177 | x = tlo; |
178 | else |
179 | x = thi - 1; |
180 | oldx = x; |
181 | y = x - d; |
182 | while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) { |
183 | --x; |
184 | --y; |
185 | } |
186 | if (oldx - x > 20) |
187 | big_snake = true; |
188 | bd[bdiagoff + d] = x; |
189 | if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) { |
190 | cost = 2 * c; |
191 | return d; |
192 | } |
193 | } |
194 | |
195 | /* Heuristic: check occasionally for a diagonal that has made |
196 | lots of progress compared with the edit distance. |
197 | If we have any such, find the one that has made the most |
198 | progress and return it as if it had succeeded. |
199 | |
200 | With this heuristic, for files with a constant small density |
201 | of changes, the algorithm is linear in the file size. */ |
202 | |
203 | if (c > 200 && big_snake && heuristic) { |
204 | int best = 0; |
205 | int bestpos = -1; |
206 | |
207 | for (d = fmax; d >= fmin; d -= 2) { |
208 | int dd = d - fmid; |
209 | if ((fd[fdiagoff + d] - xoff) * 2 - dd > 12 * (c + (dd > 0 ? dd : -dd))) { |
210 | if (fd[fdiagoff + d] * 2 - dd > best |
211 | && fd[fdiagoff + d] - xoff > 20 |
212 | && fd[fdiagoff + d] - d - yoff > 20) { |
213 | int k; |
214 | int x = fd[fdiagoff + d]; |
215 | |
216 | /* We have a good enough best diagonal; |
217 | now insist that it end with a significant snake. */ |
218 | for (k = 1; k <= 20; k++) |
219 | if (xvec[x - k] != yvec[x - d - k]) |
220 | break; |
221 | |
222 | if (k == 21) { |
223 | best = fd[fdiagoff + d] * 2 - dd; |
224 | bestpos = d; |
225 | } |
226 | } |
227 | } |
228 | } |
229 | if (best > 0) { |
230 | cost = 2 * c - 1; |
231 | return bestpos; |
232 | } |
233 | |
234 | best = 0; |
235 | for (d = bmax; d >= bmin; d -= 2) { |
236 | int dd = d - bmid; |
237 | if ((xlim - bd[bdiagoff + d]) * 2 + dd > 12 * (c + (dd > 0 ? dd : -dd))) { |
238 | if ((xlim - bd[bdiagoff + d]) * 2 + dd > best |
239 | && xlim - bd[bdiagoff + d] > 20 |
240 | && ylim - (bd[bdiagoff + d] - d) > 20) { |
241 | /* We have a good enough best diagonal; |
242 | now insist that it end with a significant snake. */ |
243 | int k; |
244 | int x = bd[bdiagoff + d]; |
245 | |
246 | for (k = 0; k < 20; k++) |
247 | if (xvec[x + k] != yvec[x - d + k]) |
248 | break; |
249 | if (k == 20) { |
250 | best = (xlim - bd[bdiagoff + d]) * 2 + dd; |
251 | bestpos = d; |
252 | } |
253 | } |
254 | } |
255 | } |
256 | if (best > 0) { |
257 | cost = 2 * c - 1; |
258 | return bestpos; |
259 | } |
260 | } |
261 | } |
262 | } |
263 | |
264 | /** |
265 | * Compare in detail contiguous subsequences of the two files |
266 | * which are known, as a whole, to match each other. |
267 | * <p/> |
268 | * The results are recorded in the vectors filevec[N].changed_flag, by |
269 | * storing a 1 in the element for each line that is an insertion or deletion. |
270 | * <p/> |
271 | * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. |
272 | * <p/> |
273 | * Note that XLIM, YLIM are exclusive bounds. |
274 | * All line numbers are origin-0 and discarded lines are not counted. |
275 | */ |
276 | |
277 | private void compareseq(int xoff, int xlim, int yoff, int ylim) { |
278 | /* Slide down the bottom initial diagonal. */ |
279 | while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) { |
280 | ++xoff; |
281 | ++yoff; |
282 | } |
283 | /* Slide up the top initial diagonal. */ |
284 | while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) { |
285 | --xlim; |
286 | --ylim; |
287 | } |
288 | |
289 | /* Handle simple cases. */ |
290 | if (xoff == xlim) |
291 | while (yoff < ylim) |
292 | filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true; |
293 | else if (yoff == ylim) |
294 | while (xoff < xlim) |
295 | filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true; |
296 | else { |
297 | /* Find a point of correspondence in the middle of the files. */ |
298 | |
299 | int d = diag(xoff, xlim, yoff, ylim); |
300 | int c = cost; |
301 | int b = bdiag[bdiagoff + d]; |
302 | |
303 | if (c == 1) { |
304 | /* This should be impossible, because it implies that |
305 | one of the two subsequences is empty, |
306 | and that case was handled above without calling `diag'. |
307 | Let's verify that this is true. */ |
308 | throw new IllegalArgumentException("Empty subsequence"); |
309 | } else { |
310 | /* Use that point to split this problem into two subproblems. */ |
311 | compareseq(xoff, b, yoff, b - d); |
312 | /* This used to use f instead of b, |
313 | but that is incorrect! |
314 | It is not necessarily the case that diagonal d |
315 | has a snake from b to f. */ |
316 | compareseq(b, xlim, b - d, ylim); |
317 | } |
318 | } |
319 | } |
320 | |
321 | /** |
322 | * Discard lines from one file that have no matches in the other file. |
323 | */ |
324 | |
325 | private void discard_confusing_lines() { |
326 | filevec[0].discard_confusing_lines(filevec[1]); |
327 | filevec[1].discard_confusing_lines(filevec[0]); |
328 | } |
329 | |
330 | private boolean inhibit = false; |
331 | |
332 | /** |
333 | * Adjust inserts/deletes of blank lines to join changes |
334 | * as much as possible. |
335 | */ |
336 | |
337 | private void shift_boundaries() { |
338 | if (inhibit) |
339 | return; |
340 | filevec[0].shift_boundaries(filevec[1]); |
341 | filevec[1].shift_boundaries(filevec[0]); |
342 | } |
343 | |
344 | public interface ScriptBuilder { |
345 | /** |
346 | * Scan the tables of which lines are inserted and deleted, |
347 | * producing an edit script. |
348 | * |
349 | * @param changed0 true for lines in first file which do not match 2nd |
350 | * @param len0 number of lines in first file |
351 | * @param changed1 true for lines in 2nd file which do not match 1st |
352 | * @param len1 number of lines in 2nd file |
353 | * @return a linked list of changes - or null |
354 | */ |
355 | public change build_script(boolean[] changed0, int len0, |
356 | boolean[] changed1, int len1); |
357 | } |
358 | |
359 | /** |
360 | * Scan the tables of which lines are inserted and deleted, |
361 | * producing an edit script in reverse order. |
362 | */ |
363 | |
364 | static class ReverseScript implements ScriptBuilder { |
365 | public change build_script(final boolean[] changed0, int len0, |
366 | final boolean[] changed1, int len1) { |
367 | change script = null; |
368 | int i0 = 0, i1 = 0; |
369 | while (i0 < len0 || i1 < len1) { |
370 | if (changed0[1 + i0] || changed1[1 + i1]) { |
371 | int line0 = i0, line1 = i1; |
372 | |
373 | /* Find # lines changed here in each file. */ |
374 | while (changed0[1 + i0]) ++i0; |
375 | while (changed1[1 + i1]) ++i1; |
376 | |
377 | /* Record this change. */ |
378 | script = new change(line0, line1, i0 - line0, i1 - line1, script); |
379 | } |
380 | |
381 | /* We have reached lines in the two files that match each other. */ |
382 | i0++; |
383 | i1++; |
384 | } |
385 | |
386 | return script; |
387 | } |
388 | } |
389 | |
390 | static class ForwardScript implements ScriptBuilder { |
391 | /** |
392 | * Scan the tables of which lines are inserted and deleted, |
393 | * producing an edit script in forward order. |
394 | */ |
395 | public change build_script(final boolean[] changed0, int len0, |
396 | final boolean[] changed1, int len1) { |
397 | change script = null; |
398 | int i0 = len0, i1 = len1; |
399 | |
400 | while (i0 >= 0 || i1 >= 0) { |
401 | if (changed0[i0] || changed1[i1]) { |
402 | int line0 = i0, line1 = i1; |
403 | |
404 | /* Find # lines changed here in each file. */ |
405 | while (changed0[i0]) --i0; |
406 | while (changed1[i1]) --i1; |
407 | |
408 | /* Record this change. */ |
409 | script = new change(i0, i1, line0 - i0, line1 - i1, script); |
410 | } |
411 | |
412 | /* We have reached lines in the two files that match each other. */ |
413 | i0--; |
414 | i1--; |
415 | } |
416 | |
417 | return script; |
418 | } |
419 | } |
420 | |
421 | /** |
422 | * Standard ScriptBuilders. |
423 | */ |
424 | public final static ScriptBuilder |
425 | forwardScript = new ForwardScript(), |
426 | reverseScript = new ReverseScript(); |
427 | |
428 | /* Report the differences of two files. DEPTH is the current directory |
429 | depth. */ |
430 | public final change diff_2(final boolean reverse) { |
431 | return diff(reverse ? reverseScript : forwardScript); |
432 | } |
433 | |
434 | /** |
435 | * Get the results of comparison as an edit script. The script |
436 | * is described by a list of changes. The standard ScriptBuilder |
437 | * implementations provide for forward and reverse edit scripts. |
438 | * Alternate implementations could, for instance, list common elements |
439 | * instead of differences. |
440 | * |
441 | * @param bld an object to build the script from change flags |
442 | * @return the head of a list of changes |
443 | */ |
444 | public change diff(final ScriptBuilder bld) { |
445 | |
446 | /* Some lines are obviously insertions or deletions |
447 | because they don't match anything. Detect them now, |
448 | and avoid even thinking about them in the main comparison algorithm. */ |
449 | |
450 | discard_confusing_lines(); |
451 | |
452 | /* Now do the main comparison algorithm, considering just the |
453 | undiscarded lines. */ |
454 | |
455 | xvec = filevec[0].undiscarded; |
456 | yvec = filevec[1].undiscarded; |
457 | |
458 | int diags = |
459 | filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3; |
460 | fdiag = new int[diags]; |
461 | fdiagoff = filevec[1].nondiscarded_lines + 1; |
462 | bdiag = new int[diags]; |
463 | bdiagoff = filevec[1].nondiscarded_lines + 1; |
464 | |
465 | compareseq(0, filevec[0].nondiscarded_lines, |
466 | 0, filevec[1].nondiscarded_lines); |
467 | fdiag = null; |
468 | bdiag = null; |
469 | |
470 | /* Modify the results slightly to make them prettier |
471 | in cases where that can validly be done. */ |
472 | |
473 | shift_boundaries(); |
474 | |
475 | /* Get the results of comparison in the form of a chain |
476 | of `struct change's -- an edit script. */ |
477 | return bld.build_script(filevec[0].changed_flag, |
478 | filevec[0].buffered_lines, |
479 | filevec[1].changed_flag, |
480 | filevec[1].buffered_lines); |
481 | |
482 | } |
483 | |
484 | /** |
485 | * The result of comparison is an "edit script": a chain of change objects. |
486 | * Each change represents one place where some lines are deleted |
487 | * and some are inserted. |
488 | * <p/> |
489 | * LINE0 and LINE1 are the first affected lines in the two files (origin 0). |
490 | * DELETED is the number of lines deleted here from file 0. |
491 | * INSERTED is the number of lines inserted here in file 1. |
492 | * <p/> |
493 | * If DELETED is 0 then LINE0 is the number of the line before |
494 | * which the insertion was done; vice versa for INSERTED and LINE1. |
495 | */ |
496 | |
497 | public static class change { |
498 | /** |
499 | * Previous or next edit command. |
500 | */ |
501 | public change link; |
502 | /** |
503 | * # lines of file 1 changed here. |
504 | */ |
505 | public final int inserted; |
506 | /** |
507 | * # lines of file 0 changed here. |
508 | */ |
509 | public final int deleted; |
510 | /** |
511 | * Line number of 1st deleted line. |
512 | */ |
513 | public final int line0; |
514 | /** |
515 | * Line number of 1st inserted line. |
516 | */ |
517 | public final int line1; |
518 | |
519 | /** |
520 | * Cons an additional entry onto the front of an edit script OLD. |
521 | * LINE0 and LINE1 are the first affected lines in the two files (origin 0). |
522 | * DELETED is the number of lines deleted here from file 0. |
523 | * INSERTED is the number of lines inserted here in file 1. |
524 | * <p/> |
525 | * If DELETED is 0 then LINE0 is the number of the line before |
526 | * which the insertion was done; vice versa for INSERTED and LINE1. |
527 | */ |
528 | public change(int line0, int line1, int deleted, int inserted, change old) { |
529 | this.line0 = line0; |
530 | this.line1 = line1; |
531 | this.inserted = inserted; |
532 | this.deleted = deleted; |
533 | this.link = old; |
534 | //System.err.println(line0+","+line1+","+inserted+","+deleted); |
535 | } |
536 | } |
537 | |
538 | /** |
539 | * Data on one input file being compared. |
540 | */ |
541 | |
542 | class file_data { |
543 | |
544 | /** |
545 | * Allocate changed array for the results of comparison. |
546 | */ |
547 | void clear() { |
548 | /* Allocate a flag for each line of each file, saying whether that line |
549 | is an insertion or deletion. |
550 | Allocate an extra element, always zero, at each end of each vector. |
551 | */ |
552 | changed_flag = new boolean[buffered_lines + 2]; |
553 | } |
554 | |
555 | /** |
556 | * Return equiv_count[I] as the number of lines in this file |
557 | * that fall in equivalence class I. |
558 | * |
559 | * @return the array of equivalence class counts. |
560 | */ |
561 | int[] equivCount() { |
562 | int[] equiv_count = new int[equiv_max]; |
563 | for (int i = 0; i < buffered_lines; ++i) |
564 | ++equiv_count[equivs[i]]; |
565 | return equiv_count; |
566 | } |
567 | |
568 | /** |
569 | * Discard lines that have no matches in another file. |
570 | * <p/> |
571 | * A line which is discarded will not be considered by the actual |
572 | * comparison algorithm; it will be as if that line were not in the file. |
573 | * The file's `realindexes' table maps virtual line numbers |
574 | * (which don't count the discarded lines) into real line numbers; |
575 | * this is how the actual comparison algorithm produces results |
576 | * that are comprehensible when the discarded lines are counted. |
577 | * <p/> |
578 | * When we discard a line, we also mark it as a deletion or insertion |
579 | * so that it will be printed in the output. |
580 | * |
581 | * @param f the other file |
582 | */ |
583 | void discard_confusing_lines(file_data f) { |
584 | clear(); |
585 | /* Set up table of which lines are going to be discarded. */ |
586 | final byte[] discarded = discardable(f.equivCount()); |
587 | |
588 | /* Don't really discard the provisional lines except when they occur |
589 | in a run of discardables, with nonprovisionals at the beginning |
590 | and end. */ |
591 | filterDiscards(discarded); |
592 | |
593 | /* Actually discard the lines. */ |
594 | discard(discarded); |
595 | } |
596 | |
597 | /** |
598 | * Mark to be discarded each line that matches no line of another file. |
599 | * If a line matches many lines, mark it as provisionally discardable. |
600 | * |
601 | * @param counts The count of each equivalence number for the other file. |
602 | * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable |
603 | * for each line |
604 | */ |
605 | |
606 | private byte[] discardable(final int[] counts) { |
607 | final int end = buffered_lines; |
608 | final byte[] discards = new byte[end]; |
609 | final int[] equivs = this.equivs; |
610 | int many = 5; |
611 | int tem = end / 64; |
612 | |
613 | /* Multiply MANY by approximate square root of number of lines. |
614 | That is the threshold for provisionally discardable lines. */ |
615 | while ((tem = tem >> 2) > 0) |
616 | many *= 2; |
617 | |
618 | for (int i = 0; i < end; i++) { |
619 | int nmatch; |
620 | if (equivs[i] == 0) |
621 | continue; |
622 | nmatch = counts[equivs[i]]; |
623 | if (nmatch == 0) |
624 | discards[i] = 1; |
625 | else if (nmatch > many) |
626 | discards[i] = 2; |
627 | } |
628 | return discards; |
629 | } |
630 | |
631 | /** |
632 | * Don't really discard the provisional lines except when they occur |
633 | * in a run of discardables, with nonprovisionals at the beginning |
634 | * and end. |
635 | */ |
636 | |
637 | private void filterDiscards(final byte[] discards) { |
638 | final int end = buffered_lines; |
639 | |
640 | for (int i = 0; i < end; i++) { |
641 | /* Cancel provisional discards not in middle of run of discards. */ |
642 | if (discards[i] == 2) |
643 | discards[i] = 0; |
644 | else if (discards[i] != 0) { |
645 | /* We have found a nonprovisional discard. */ |
646 | int j; |
647 | int length; |
648 | int provisional = 0; |
649 | |
650 | /* Find end of this run of discardable lines. |
651 | Count how many are provisionally discardable. */ |
652 | for (j = i; j < end; j++) { |
653 | if (discards[j] == 0) |
654 | break; |
655 | if (discards[j] == 2) |
656 | ++provisional; |
657 | } |
658 | |
659 | /* Cancel provisional discards at end, and shrink the run. */ |
660 | while (j > i && discards[j - 1] == 2) { |
661 | discards[--j] = 0; |
662 | --provisional; |
663 | } |
664 | |
665 | /* Now we have the length of a run of discardable lines |
666 | whose first and last are not provisional. */ |
667 | length = j - i; |
668 | |
669 | /* If 1/4 of the lines in the run are provisional, |
670 | cancel discarding of all provisional lines in the run. */ |
671 | if (provisional * 4 > length) { |
672 | while (j > i) |
673 | if (discards[--j] == 2) |
674 | discards[j] = 0; |
675 | } else { |
676 | int consec; |
677 | int minimum = 1; |
678 | int tem = length / 4; |
679 | |
680 | /* MINIMUM is approximate square root of LENGTH/4. |
681 | A subrun of two or more provisionals can stand |
682 | when LENGTH is at least 16. |
683 | A subrun of 4 or more can stand when LENGTH >= 64. */ |
684 | while ((tem = tem >> 2) > 0) |
685 | minimum *= 2; |
686 | minimum++; |
687 | |
688 | /* Cancel any subrun of MINIMUM or more provisionals |
689 | within the larger run. */ |
690 | for (j = 0, consec = 0; j < length; j++) |
691 | if (discards[i + j] != 2) |
692 | consec = 0; |
693 | else if (minimum == ++consec) |
694 | /* Back up to start of subrun, to cancel it all. */ |
695 | j -= consec; |
696 | else if (minimum < consec) |
697 | discards[i + j] = 0; |
698 | |
699 | /* Scan from beginning of run |
700 | until we find 3 or more nonprovisionals in a row |
701 | or until the first nonprovisional at least 8 lines in. |
702 | Until that point, cancel any provisionals. */ |
703 | for (j = 0, consec = 0; j < length; j++) { |
704 | if (j >= 8 && discards[i + j] == 1) |
705 | break; |
706 | if (discards[i + j] == 2) { |
707 | consec = 0; |
708 | discards[i + j] = 0; |
709 | } else if (discards[i + j] == 0) |
710 | consec = 0; |
711 | else |
712 | consec++; |
713 | if (consec == 3) |
714 | break; |
715 | } |
716 | |
717 | /* I advances to the last line of the run. */ |
718 | i += length - 1; |
719 | |
720 | /* Same thing, from end. */ |
721 | for (j = 0, consec = 0; j < length; j++) { |
722 | if (j >= 8 && discards[i - j] == 1) |
723 | break; |
724 | if (discards[i - j] == 2) { |
725 | consec = 0; |
726 | discards[i - j] = 0; |
727 | } else if (discards[i - j] == 0) |
728 | consec = 0; |
729 | else |
730 | consec++; |
731 | if (consec == 3) |
732 | break; |
733 | } |
734 | } |
735 | } |
736 | } |
737 | } |
738 | |
739 | /** |
740 | * Actually discard the lines. |
741 | * |
742 | * @param discards flags lines to be discarded |
743 | */ |
744 | private void discard(final byte[] discards) { |
745 | final int end = buffered_lines; |
746 | int j = 0; |
747 | for (int i = 0; i < end; ++i) |
748 | if (no_discards || discards[i] == 0) { |
749 | undiscarded[j] = equivs[i]; |
750 | realindexes[j++] = i; |
751 | } else |
752 | changed_flag[1 + i] = true; |
753 | nondiscarded_lines = j; |
754 | } |
755 | |
756 | file_data(Object[] data, Hashtable h) { |
757 | buffered_lines = data.length; |
758 | |
759 | equivs = new int[buffered_lines]; |
760 | undiscarded = new int[buffered_lines]; |
761 | realindexes = new int[buffered_lines]; |
762 | |
763 | for (int i = 0; i < data.length; ++i) { |
764 | Integer ir = (Integer) h.get(data[i]); |
765 | if (ir == null) |
766 | h.put(data[i], new Integer(equivs[i] = equiv_max++)); |
767 | else |
768 | equivs[i] = ir.intValue(); |
769 | } |
770 | } |
771 | |
772 | /** |
773 | * Adjust inserts/deletes of blank lines to join changes |
774 | * as much as possible. |
775 | * <p/> |
776 | * We do something when a run of changed lines include a blank |
777 | * line at one end and have an excluded blank line at the other. |
778 | * We are free to choose which blank line is included. |
779 | * `compareseq' always chooses the one at the beginning, |
780 | * but usually it is cleaner to consider the following blank line |
781 | * to be the "change". The only exception is if the preceding blank line |
782 | * would join this change to other changes. |
783 | * |
784 | * @param f the file being compared against |
785 | */ |
786 | |
787 | void shift_boundaries(file_data f) { |
788 | final boolean[] changed = changed_flag; |
789 | final boolean[] other_changed = f.changed_flag; |
790 | int i = 0; |
791 | int j = 0; |
792 | int i_end = buffered_lines; |
793 | int preceding = -1; |
794 | int other_preceding = -1; |
795 | |
796 | for (; ;) { |
797 | int start, end, other_start; |
798 | |
799 | /* Scan forwards to find beginning of another run of changes. |
800 | Also keep track of the corresponding point in the other file. */ |
801 | |
802 | while (i < i_end && !changed[1 + i]) { |
803 | while (other_changed[1 + j++]) |
804 | /* Non-corresponding lines in the other file |
805 | will count as the preceding batch of changes. */ |
806 | other_preceding = j; |
807 | i++; |
808 | } |
809 | |
810 | if (i == i_end) |
811 | break; |
812 | |
813 | start = i; |
814 | other_start = j; |
815 | |
816 | for (; ;) { |
817 | /* Now find the end of this run of changes. */ |
818 | |
819 | while (i < i_end && changed[1 + i]) i++; |
820 | end = i; |
821 | |
822 | /* If the first changed line matches the following unchanged one, |
823 | and this run does not follow right after a previous run, |
824 | and there are no lines deleted from the other file here, |
825 | then classify the first changed line as unchanged |
826 | and the following line as changed in its place. */ |
827 | |
828 | /* You might ask, how could this run follow right after another? |
829 | Only because the previous run was shifted here. */ |
830 | |
831 | if (end != i_end |
832 | && equivs[start] == equivs[end] |
833 | && !other_changed[1 + j] |
834 | && end != i_end |
835 | && !((preceding >= 0 && start == preceding) |
836 | || (other_preceding >= 0 |
837 | && other_start == other_preceding))) { |
838 | changed[1 + end] = true; |
839 | changed[1 + start++] = false; |
840 | ++i; |
841 | /* Since one line-that-matches is now before this run |
842 | instead of after, we must advance in the other file |
843 | to keep in synch. */ |
844 | ++j; |
845 | } else |
846 | break; |
847 | } |
848 | |
849 | preceding = i; |
850 | other_preceding = j; |
851 | } |
852 | } |
853 | |
854 | /** |
855 | * Number of elements (lines) in this file. |
856 | */ |
857 | final int buffered_lines; |
858 | |
859 | /** |
860 | * Vector, indexed by line number, containing an equivalence code for |
861 | * each line. It is this vector that is actually compared with that |
862 | * of another file to generate differences. |
863 | */ |
864 | private final int[] equivs; |
865 | |
866 | /** |
867 | * Vector, like the previous one except that |
868 | * the elements for discarded lines have been squeezed out. |
869 | */ |
870 | final int[] undiscarded; |
871 | |
872 | /** |
873 | * Vector mapping virtual line numbers (not counting discarded lines) |
874 | * to real ones (counting those lines). Both are origin-0. |
875 | */ |
876 | final int[] realindexes; |
877 | |
878 | /** |
879 | * Total number of nondiscarded lines. |
880 | */ |
881 | int nondiscarded_lines; |
882 | |
883 | /** |
884 | * Array, indexed by real origin-1 line number, |
885 | * containing true for a line that is an insertion or a deletion. |
886 | * The results of comparison are stored here. |
887 | */ |
888 | boolean[] changed_flag; |
889 | |
890 | } |
891 | } |
download show line numbers debug dex old transpilations
Travelled to 18 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, ekrmjmnbrukm, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, sawdedvomwva, tslmcundralx, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv
No comments. add comment
Snippet ID: | #1000882 |
Snippet name: | class EGDiff |
Eternal ID of this version: | #1000882/1 |
Text MD5: | 84f35f163e88efc555efb9ca2944468a |
Author: | stefan |
Category: | javax |
Type: | JavaX fragment (include) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2016-05-25 17:15:36 |
Source code size: | 30610 bytes / 891 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 749 / 6262 |
Referenced in: | [show references] |