Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

891
LINES

< > BotCompany Repo | #1000882 // class EGDiff

JavaX fragment (include)

1  
/**
2  
 * A class to compare vectors of objects.  The result of comparison
3  
 * is a list of <code>change</code> objects which form an
4  
 * edit script.  The objects compared are traditionally lines
5  
 * of text from two files.  Comparison options such as "ignore
6  
 * whitespace" are implemented by modifying the <code>equals</code>
7  
 * and <code>hashcode</code> methods for the objects compared.
8  
 * <p/>
9  
 * The basic algorithm is described in: </br>
10  
 * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
11  
 * Algorithmica Vol. 1 No. 2, 1986, p 251.
12  
 * <p/>
13  
 * This class outputs different results from GNU diff 1.15 on some
14  
 * inputs.  Our results are actually better (smaller change list, smaller
15  
 * total size of changes), but it would be nice to know why.  Perhaps
16  
 * there is a memory overwrite bug in GNU diff 1.15.
17  
 *
18  
 * @author Stuart D. Gathman, translated from GNU diff 1.15
19  
 *         Copyright (C) 2000  Business Management Systems, Inc.
20  
 *         <p/>
21  
 *         This program is free software; you can redistribute it and/or modify
22  
 *         it under the terms of the GNU General Public License as published by
23  
 *         the Free Software Foundation; either version 1, or (at your option)
24  
 *         any later version.
25  
 *         <p/>
26  
 *         This program is distributed in the hope that it will be useful,
27  
 *         but WITHOUT ANY WARRANTY; without even the implied warranty of
28  
 *         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
29  
 *         GNU General Public License for more details.
30  
 *         <p/>
31  
 *         You should have received a copy of the <a href=COPYING.txt>
32  
 *         GNU General Public License</a>
33  
 *         along with this program; if not, write to the Free Software
34  
 *         Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35  
 */
36  
37  
static class EGDiff {
38  
39  
  /**
40  
   * Prepare to find differences between two arrays.  Each element of
41  
   * the arrays is translated to an "equivalence number" based on
42  
   * the result of <code>equals</code>.  The original Object arrays
43  
   * are no longer needed for computing the differences.  They will
44  
   * be needed again later to print the results of the comparison as
45  
   * an edit script, if desired.
46  
   */
47  
  public EGDiff(Object[] a, Object[] b) {
48  
    Hashtable h = new Hashtable(a.length + b.length);
49  
    filevec[0] = new file_data(a, h);
50  
    filevec[1] = new file_data(b, h);
51  
  }
52  
53  
  /**
54  
   * 1 more than the maximum equivalence value used for this or its
55  
   * sibling file.
56  
   */
57  
  private int equiv_max = 1;
58  
59  
  /**
60  
   * When set to true, the comparison uses a heuristic to speed it up.
61  
   * With this heuristic, for files with a constant small density
62  
   * of changes, the algorithm is linear in the file size.
63  
   */
64  
  public boolean heuristic = false;
65  
66  
  /**
67  
   * When set to true, the algorithm returns a guarranteed minimal
68  
   * set of changes.  This makes things slower, sometimes much slower.
69  
   */
70  
  public boolean no_discards = false;
71  
72  
  private int[] xvec, yvec;	/* Vectors being compared. */
73  
  private int[] fdiag;		/* Vector, indexed by diagonal, containing
74  
				   the X coordinate of the point furthest
75  
				   along the given diagonal in the forward
76  
				   search of the edit matrix. */
77  
  private int[] bdiag;		/* Vector, indexed by diagonal, containing
78  
				   the X coordinate of the point furthest
79  
				   along the given diagonal in the backward
80  
				   search of the edit matrix. */
81  
  private int fdiagoff, bdiagoff;
82  
  private final file_data[] filevec = new file_data[2];
83  
  private int cost;
84  
85  
  /**
86  
   * Find the midpoint of the shortest edit script for a specified
87  
   * portion of the two files.
88  
   * <p/>
89  
   * We scan from the beginnings of the files, and simultaneously from the ends,
90  
   * doing a breadth-first search through the space of edit-sequence.
91  
   * When the two searches meet, we have found the midpoint of the shortest
92  
   * edit sequence.
93  
   * <p/>
94  
   * The value returned is the number of the diagonal on which the midpoint lies.
95  
   * The diagonal number equals the number of inserted lines minus the number
96  
   * of deleted lines (counting only lines before the midpoint).
97  
   * The edit cost is stored into COST; this is the total number of
98  
   * lines inserted or deleted (counting only lines before the midpoint).
99  
   * <p/>
100  
   * This function assumes that the first lines of the specified portions
101  
   * of the two files do not match, and likewise that the last lines do not
102  
   * match.  The caller must trim matching lines from the beginning and end
103  
   * of the portions it is going to specify.
104  
   * <p/>
105  
   * Note that if we return the "wrong" diagonal value, or if
106  
   * the value of bdiag at that diagonal is "wrong",
107  
   * the worst this can do is cause suboptimal diff output.
108  
   * It cannot cause incorrect diff output.
109  
   */
110  
111  
  private int diag(int xoff, int xlim, int yoff, int ylim) {
112  
    final int[] fd = fdiag;	// Give the compiler a chance.
113  
    final int[] bd = bdiag;	// Additional help for the compiler.
114  
    final int[] xv = xvec;		// Still more help for the compiler.
115  
    final int[] yv = yvec;		// And more and more . . .
116  
    final int dmin = xoff - ylim;	// Minimum valid diagonal.
117  
    final int dmax = xlim - yoff;	// Maximum valid diagonal.
118  
    final int fmid = xoff - yoff;	// Center diagonal of top-down search.
119  
    final int bmid = xlim - ylim;	// Center diagonal of bottom-up search.
120  
    int fmin = fmid, fmax = fmid;	// Limits of top-down search.
121  
    int bmin = bmid, bmax = bmid;	// Limits of bottom-up search.
122  
    /* True if southeast corner is on an odd
123  
				     diagonal with respect to the northwest. */
124  
    final boolean odd = (fmid - bmid & 1) != 0;
125  
126  
    fd[fdiagoff + fmid] = xoff;
127  
    bd[bdiagoff + bmid] = xlim;
128  
129  
    for (int c = 1; ; ++c) {
130  
      int d;			/* Active diagonal. */
131  
      boolean big_snake = false;
132  
133  
      /* Extend the top-down search by an edit step in each diagonal. */
134  
      if (fmin > dmin)
135  
        fd[fdiagoff + --fmin - 1] = -1;
136  
      else
137  
        ++fmin;
138  
      if (fmax < dmax)
139  
        fd[fdiagoff + ++fmax + 1] = -1;
140  
      else
141  
        --fmax;
142  
      for (d = fmax; d >= fmin; d -= 2) {
143  
        int x, y, oldx, tlo = fd[fdiagoff + d - 1], thi = fd[fdiagoff + d + 1];
144  
145  
        if (tlo >= thi)
146  
          x = tlo + 1;
147  
        else
148  
          x = thi;
149  
        oldx = x;
150  
        y = x - d;
151  
        while (x < xlim && y < ylim && xv[x] == yv[y]) {
152  
          ++x;
153  
          ++y;
154  
        }
155  
        if (x - oldx > 20)
156  
          big_snake = true;
157  
        fd[fdiagoff + d] = x;
158  
        if (odd && bmin <= d && d <= bmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
159  
          cost = 2 * c - 1;
160  
          return d;
161  
        }
162  
      }
163  
164  
      /* Similar extend the bottom-up search. */
165  
      if (bmin > dmin)
166  
        bd[bdiagoff + --bmin - 1] = Integer.MAX_VALUE;
167  
      else
168  
        ++bmin;
169  
      if (bmax < dmax)
170  
        bd[bdiagoff + ++bmax + 1] = Integer.MAX_VALUE;
171  
      else
172  
        --bmax;
173  
      for (d = bmax; d >= bmin; d -= 2) {
174  
        int x, y, oldx, tlo = bd[bdiagoff + d - 1], thi = bd[bdiagoff + d + 1];
175  
176  
        if (tlo < thi)
177  
          x = tlo;
178  
        else
179  
          x = thi - 1;
180  
        oldx = x;
181  
        y = x - d;
182  
        while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
183  
          --x;
184  
          --y;
185  
        }
186  
        if (oldx - x > 20)
187  
          big_snake = true;
188  
        bd[bdiagoff + d] = x;
189  
        if (!odd && fmin <= d && d <= fmax && bd[bdiagoff + d] <= fd[fdiagoff + d]) {
190  
          cost = 2 * c;
191  
          return d;
192  
        }
193  
      }
194  
195  
      /* Heuristic: check occasionally for a diagonal that has made
196  
         lots of progress compared with the edit distance.
197  
         If we have any such, find the one that has made the most
198  
         progress and return it as if it had succeeded.
199  
200  
         With this heuristic, for files with a constant small density
201  
         of changes, the algorithm is linear in the file size.  */
202  
203  
      if (c > 200 && big_snake && heuristic) {
204  
        int best = 0;
205  
        int bestpos = -1;
206  
207  
        for (d = fmax; d >= fmin; d -= 2) {
208  
          int dd = d - fmid;
209  
          if ((fd[fdiagoff + d] - xoff) * 2 - dd > 12 * (c + (dd > 0 ? dd : -dd))) {
210  
            if (fd[fdiagoff + d] * 2 - dd > best
211  
              && fd[fdiagoff + d] - xoff > 20
212  
              && fd[fdiagoff + d] - d - yoff > 20) {
213  
              int k;
214  
              int x = fd[fdiagoff + d];
215  
216  
              /* We have a good enough best diagonal;
217  
                 now insist that it end with a significant snake.  */
218  
              for (k = 1; k <= 20; k++)
219  
                if (xvec[x - k] != yvec[x - d - k])
220  
                  break;
221  
222  
              if (k == 21) {
223  
                best = fd[fdiagoff + d] * 2 - dd;
224  
                bestpos = d;
225  
              }
226  
            }
227  
          }
228  
        }
229  
        if (best > 0) {
230  
          cost = 2 * c - 1;
231  
          return bestpos;
232  
        }
233  
234  
        best = 0;
235  
        for (d = bmax; d >= bmin; d -= 2) {
236  
          int dd = d - bmid;
237  
          if ((xlim - bd[bdiagoff + d]) * 2 + dd > 12 * (c + (dd > 0 ? dd : -dd))) {
238  
            if ((xlim - bd[bdiagoff + d]) * 2 + dd > best
239  
              && xlim - bd[bdiagoff + d] > 20
240  
              && ylim - (bd[bdiagoff + d] - d) > 20) {
241  
              /* We have a good enough best diagonal;
242  
                 now insist that it end with a significant snake.  */
243  
              int k;
244  
              int x = bd[bdiagoff + d];
245  
246  
              for (k = 0; k < 20; k++)
247  
                if (xvec[x + k] != yvec[x - d + k])
248  
                  break;
249  
              if (k == 20) {
250  
                best = (xlim - bd[bdiagoff + d]) * 2 + dd;
251  
                bestpos = d;
252  
              }
253  
            }
254  
          }
255  
        }
256  
        if (best > 0) {
257  
          cost = 2 * c - 1;
258  
          return bestpos;
259  
        }
260  
      }
261  
    }
262  
  }
263  
264  
  /**
265  
   * Compare in detail contiguous subsequences of the two files
266  
   * which are known, as a whole, to match each other.
267  
   * <p/>
268  
   * The results are recorded in the vectors filevec[N].changed_flag, by
269  
   * storing a 1 in the element for each line that is an insertion or deletion.
270  
   * <p/>
271  
   * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
272  
   * <p/>
273  
   * Note that XLIM, YLIM are exclusive bounds.
274  
   * All line numbers are origin-0 and discarded lines are not counted.
275  
   */
276  
277  
  private void compareseq(int xoff, int xlim, int yoff, int ylim) {
278  
    /* Slide down the bottom initial diagonal. */
279  
    while (xoff < xlim && yoff < ylim && xvec[xoff] == yvec[yoff]) {
280  
      ++xoff;
281  
      ++yoff;
282  
    }
283  
    /* Slide up the top initial diagonal. */
284  
    while (xlim > xoff && ylim > yoff && xvec[xlim - 1] == yvec[ylim - 1]) {
285  
      --xlim;
286  
      --ylim;
287  
    }
288  
    
289  
    /* Handle simple cases. */
290  
    if (xoff == xlim)
291  
      while (yoff < ylim)
292  
        filevec[1].changed_flag[1 + filevec[1].realindexes[yoff++]] = true;
293  
    else if (yoff == ylim)
294  
      while (xoff < xlim)
295  
        filevec[0].changed_flag[1 + filevec[0].realindexes[xoff++]] = true;
296  
    else {
297  
      /* Find a point of correspondence in the middle of the files.  */
298  
299  
      int d = diag(xoff, xlim, yoff, ylim);
300  
      int c = cost;
301  
      int b = bdiag[bdiagoff + d];
302  
303  
      if (c == 1) {
304  
        /* This should be impossible, because it implies that
305  
           one of the two subsequences is empty,
306  
           and that case was handled above without calling `diag'.
307  
           Let's verify that this is true.  */
308  
        throw new IllegalArgumentException("Empty subsequence");
309  
      } else {
310  
        /* Use that point to split this problem into two subproblems.  */
311  
        compareseq(xoff, b, yoff, b - d);
312  
        /* This used to use f instead of b,
313  
           but that is incorrect!
314  
           It is not necessarily the case that diagonal d
315  
           has a snake from b to f.  */
316  
        compareseq(b, xlim, b - d, ylim);
317  
      }
318  
    }
319  
  }
320  
321  
  /**
322  
   * Discard lines from one file that have no matches in the other file.
323  
   */
324  
325  
  private void discard_confusing_lines() {
326  
    filevec[0].discard_confusing_lines(filevec[1]);
327  
    filevec[1].discard_confusing_lines(filevec[0]);
328  
  }
329  
330  
  private boolean inhibit = false;
331  
332  
  /**
333  
   * Adjust inserts/deletes of blank lines to join changes
334  
   * as much as possible.
335  
   */
336  
337  
  private void shift_boundaries() {
338  
    if (inhibit)
339  
      return;
340  
    filevec[0].shift_boundaries(filevec[1]);
341  
    filevec[1].shift_boundaries(filevec[0]);
342  
  }
343  
344  
  public interface ScriptBuilder {
345  
    /**
346  
     * Scan the tables of which lines are inserted and deleted,
347  
     * producing an edit script.
348  
     *
349  
     * @param changed0 true for lines in first file which do not match 2nd
350  
     * @param len0     number of lines in first file
351  
     * @param changed1 true for lines in 2nd file which do not match 1st
352  
     * @param len1     number of lines in 2nd file
353  
     * @return a linked list of changes - or null
354  
     */
355  
    public change build_script(boolean[] changed0, int len0,
356  
                               boolean[] changed1, int len1);
357  
  }
358  
359  
  /**
360  
   * Scan the tables of which lines are inserted and deleted,
361  
   * producing an edit script in reverse order.
362  
   */
363  
364  
  static class ReverseScript implements ScriptBuilder {
365  
    public change build_script(final boolean[] changed0, int len0,
366  
                               final boolean[] changed1, int len1) {
367  
      change script = null;
368  
      int i0 = 0, i1 = 0;
369  
      while (i0 < len0 || i1 < len1) {
370  
        if (changed0[1 + i0] || changed1[1 + i1]) {
371  
          int line0 = i0, line1 = i1;
372  
373  
          /* Find # lines changed here in each file.  */
374  
          while (changed0[1 + i0]) ++i0;
375  
          while (changed1[1 + i1]) ++i1;
376  
377  
          /* Record this change.  */
378  
          script = new change(line0, line1, i0 - line0, i1 - line1, script);
379  
        }
380  
381  
        /* We have reached lines in the two files that match each other.  */
382  
        i0++;
383  
        i1++;
384  
      }
385  
386  
      return script;
387  
    }
388  
  }
389  
390  
  static class ForwardScript implements ScriptBuilder {
391  
    /**
392  
     * Scan the tables of which lines are inserted and deleted,
393  
     * producing an edit script in forward order.
394  
     */
395  
    public change build_script(final boolean[] changed0, int len0,
396  
                               final boolean[] changed1, int len1) {
397  
      change script = null;
398  
      int i0 = len0, i1 = len1;
399  
400  
      while (i0 >= 0 || i1 >= 0) {
401  
        if (changed0[i0] || changed1[i1]) {
402  
          int line0 = i0, line1 = i1;
403  
404  
          /* Find # lines changed here in each file.  */
405  
          while (changed0[i0]) --i0;
406  
          while (changed1[i1]) --i1;
407  
408  
          /* Record this change.  */
409  
          script = new change(i0, i1, line0 - i0, line1 - i1, script);
410  
        }
411  
412  
        /* We have reached lines in the two files that match each other.  */
413  
        i0--;
414  
        i1--;
415  
      }
416  
417  
      return script;
418  
    }
419  
  }
420  
421  
  /**
422  
   * Standard ScriptBuilders.
423  
   */
424  
  public final static ScriptBuilder
425  
    forwardScript = new ForwardScript(),
426  
  reverseScript = new ReverseScript();
427  
428  
  /* Report the differences of two files.  DEPTH is the current directory
429  
     depth. */
430  
  public final change diff_2(final boolean reverse) {
431  
    return diff(reverse ? reverseScript : forwardScript);
432  
  }
433  
434  
  /**
435  
   * Get the results of comparison as an edit script.  The script
436  
   * is described by a list of changes.  The standard ScriptBuilder
437  
   * implementations provide for forward and reverse edit scripts.
438  
   * Alternate implementations could, for instance, list common elements
439  
   * instead of differences.
440  
   *
441  
   * @param bld an object to build the script from change flags
442  
   * @return the head of a list of changes
443  
   */
444  
  public change diff(final ScriptBuilder bld) {
445  
446  
    /* Some lines are obviously insertions or deletions
447  
       because they don't match anything.  Detect them now,
448  
       and avoid even thinking about them in the main comparison algorithm.  */
449  
450  
    discard_confusing_lines();
451  
452  
    /* Now do the main comparison algorithm, considering just the
453  
       undiscarded lines.  */
454  
455  
    xvec = filevec[0].undiscarded;
456  
    yvec = filevec[1].undiscarded;
457  
458  
    int diags =
459  
      filevec[0].nondiscarded_lines + filevec[1].nondiscarded_lines + 3;
460  
    fdiag = new int[diags];
461  
    fdiagoff = filevec[1].nondiscarded_lines + 1;
462  
    bdiag = new int[diags];
463  
    bdiagoff = filevec[1].nondiscarded_lines + 1;
464  
465  
    compareseq(0, filevec[0].nondiscarded_lines,
466  
      0, filevec[1].nondiscarded_lines);
467  
    fdiag = null;
468  
    bdiag = null;
469  
470  
    /* Modify the results slightly to make them prettier
471  
       in cases where that can validly be done.  */
472  
473  
    shift_boundaries();
474  
475  
    /* Get the results of comparison in the form of a chain
476  
       of `struct change's -- an edit script.  */
477  
    return bld.build_script(filevec[0].changed_flag,
478  
      filevec[0].buffered_lines,
479  
      filevec[1].changed_flag,
480  
      filevec[1].buffered_lines);
481  
482  
  }
483  
484  
  /**
485  
   * The result of comparison is an "edit script": a chain of change objects.
486  
   * Each change represents one place where some lines are deleted
487  
   * and some are inserted.
488  
   * <p/>
489  
   * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
490  
   * DELETED is the number of lines deleted here from file 0.
491  
   * INSERTED is the number of lines inserted here in file 1.
492  
   * <p/>
493  
   * If DELETED is 0 then LINE0 is the number of the line before
494  
   * which the insertion was done; vice versa for INSERTED and LINE1.
495  
   */
496  
497  
  public static class change {
498  
    /**
499  
     * Previous or next edit command.
500  
     */
501  
    public change link;
502  
    /**
503  
     * # lines of file 1 changed here.
504  
     */
505  
    public final int inserted;
506  
    /**
507  
     * # lines of file 0 changed here.
508  
     */
509  
    public final int deleted;
510  
    /**
511  
     * Line number of 1st deleted line.
512  
     */
513  
    public final int line0;
514  
    /**
515  
     * Line number of 1st inserted line.
516  
     */
517  
    public final int line1;
518  
519  
    /**
520  
     * Cons an additional entry onto the front of an edit script OLD.
521  
     * LINE0 and LINE1 are the first affected lines in the two files (origin 0).
522  
     * DELETED is the number of lines deleted here from file 0.
523  
     * INSERTED is the number of lines inserted here in file 1.
524  
     * <p/>
525  
     * If DELETED is 0 then LINE0 is the number of the line before
526  
     * which the insertion was done; vice versa for INSERTED and LINE1.
527  
     */
528  
    public change(int line0, int line1, int deleted, int inserted, change old) {
529  
      this.line0 = line0;
530  
      this.line1 = line1;
531  
      this.inserted = inserted;
532  
      this.deleted = deleted;
533  
      this.link = old;
534  
      //System.err.println(line0+","+line1+","+inserted+","+deleted);
535  
    }
536  
  }
537  
538  
  /**
539  
   * Data on one input file being compared.
540  
   */
541  
542  
  class file_data {
543  
544  
    /**
545  
     * Allocate changed array for the results of comparison.
546  
     */
547  
    void clear() {
548  
      /* Allocate a flag for each line of each file, saying whether that line
549  
	 is an insertion or deletion.
550  
	 Allocate an extra element, always zero, at each end of each vector.
551  
       */
552  
      changed_flag = new boolean[buffered_lines + 2];
553  
    }
554  
555  
    /**
556  
     * Return equiv_count[I] as the number of lines in this file
557  
     * that fall in equivalence class I.
558  
     *
559  
     * @return the array of equivalence class counts.
560  
     */
561  
    int[] equivCount() {
562  
      int[] equiv_count = new int[equiv_max];
563  
      for (int i = 0; i < buffered_lines; ++i)
564  
        ++equiv_count[equivs[i]];
565  
      return equiv_count;
566  
    }
567  
568  
    /**
569  
     * Discard lines that have no matches in another file.
570  
     * <p/>
571  
     * A line which is discarded will not be considered by the actual
572  
     * comparison algorithm; it will be as if that line were not in the file.
573  
     * The file's `realindexes' table maps virtual line numbers
574  
     * (which don't count the discarded lines) into real line numbers;
575  
     * this is how the actual comparison algorithm produces results
576  
     * that are comprehensible when the discarded lines are counted.
577  
     * <p/>
578  
     * When we discard a line, we also mark it as a deletion or insertion
579  
     * so that it will be printed in the output.
580  
     *
581  
     * @param f the other file
582  
     */
583  
    void discard_confusing_lines(file_data f) {
584  
      clear();
585  
      /* Set up table of which lines are going to be discarded. */
586  
      final byte[] discarded = discardable(f.equivCount());
587  
588  
      /* Don't really discard the provisional lines except when they occur
589  
         in a run of discardables, with nonprovisionals at the beginning
590  
         and end.  */
591  
      filterDiscards(discarded);
592  
593  
      /* Actually discard the lines. */
594  
      discard(discarded);
595  
    }
596  
597  
    /**
598  
     * Mark to be discarded each line that matches no line of another file.
599  
     * If a line matches many lines, mark it as provisionally discardable.
600  
     *
601  
     * @param counts The count of each equivalence number for the other file.
602  
     * @return 0=nondiscardable, 1=discardable or 2=provisionally discardable
603  
     *         for each line
604  
     */
605  
606  
    private byte[] discardable(final int[] counts) {
607  
      final int end = buffered_lines;
608  
      final byte[] discards = new byte[end];
609  
      final int[] equivs = this.equivs;
610  
      int many = 5;
611  
      int tem = end / 64;
612  
613  
      /* Multiply MANY by approximate square root of number of lines.
614  
	 That is the threshold for provisionally discardable lines.  */
615  
      while ((tem = tem >> 2) > 0)
616  
        many *= 2;
617  
618  
      for (int i = 0; i < end; i++) {
619  
        int nmatch;
620  
        if (equivs[i] == 0)
621  
          continue;
622  
        nmatch = counts[equivs[i]];
623  
        if (nmatch == 0)
624  
          discards[i] = 1;
625  
        else if (nmatch > many)
626  
          discards[i] = 2;
627  
      }
628  
      return discards;
629  
    }
630  
631  
    /**
632  
     * Don't really discard the provisional lines except when they occur
633  
     * in a run of discardables, with nonprovisionals at the beginning
634  
     * and end.
635  
     */
636  
637  
    private void filterDiscards(final byte[] discards) {
638  
      final int end = buffered_lines;
639  
640  
      for (int i = 0; i < end; i++) {
641  
        /* Cancel provisional discards not in middle of run of discards.  */
642  
        if (discards[i] == 2)
643  
          discards[i] = 0;
644  
        else if (discards[i] != 0) {
645  
          /* We have found a nonprovisional discard.  */
646  
          int j;
647  
          int length;
648  
          int provisional = 0;
649  
650  
          /* Find end of this run of discardable lines.
651  
             Count how many are provisionally discardable.  */
652  
          for (j = i; j < end; j++) {
653  
            if (discards[j] == 0)
654  
              break;
655  
            if (discards[j] == 2)
656  
              ++provisional;
657  
          }
658  
659  
          /* Cancel provisional discards at end, and shrink the run.  */
660  
          while (j > i && discards[j - 1] == 2) {
661  
            discards[--j] = 0;
662  
            --provisional;
663  
          }
664  
665  
          /* Now we have the length of a run of discardable lines
666  
             whose first and last are not provisional.  */
667  
          length = j - i;
668  
669  
          /* If 1/4 of the lines in the run are provisional,
670  
             cancel discarding of all provisional lines in the run.  */
671  
          if (provisional * 4 > length) {
672  
            while (j > i)
673  
              if (discards[--j] == 2)
674  
                discards[j] = 0;
675  
          } else {
676  
            int consec;
677  
            int minimum = 1;
678  
            int tem = length / 4;
679  
680  
            /* MINIMUM is approximate square root of LENGTH/4.
681  
               A subrun of two or more provisionals can stand
682  
               when LENGTH is at least 16.
683  
               A subrun of 4 or more can stand when LENGTH >= 64.  */
684  
            while ((tem = tem >> 2) > 0)
685  
              minimum *= 2;
686  
            minimum++;
687  
688  
            /* Cancel any subrun of MINIMUM or more provisionals
689  
               within the larger run.  */
690  
            for (j = 0, consec = 0; j < length; j++)
691  
              if (discards[i + j] != 2)
692  
                consec = 0;
693  
              else if (minimum == ++consec)
694  
              /* Back up to start of subrun, to cancel it all.  */
695  
                j -= consec;
696  
              else if (minimum < consec)
697  
                discards[i + j] = 0;
698  
699  
            /* Scan from beginning of run
700  
               until we find 3 or more nonprovisionals in a row
701  
               or until the first nonprovisional at least 8 lines in.
702  
               Until that point, cancel any provisionals.  */
703  
            for (j = 0, consec = 0; j < length; j++) {
704  
              if (j >= 8 && discards[i + j] == 1)
705  
                break;
706  
              if (discards[i + j] == 2) {
707  
                consec = 0;
708  
                discards[i + j] = 0;
709  
              } else if (discards[i + j] == 0)
710  
                consec = 0;
711  
              else
712  
                consec++;
713  
              if (consec == 3)
714  
                break;
715  
            }
716  
717  
            /* I advances to the last line of the run.  */
718  
            i += length - 1;
719  
720  
            /* Same thing, from end.  */
721  
            for (j = 0, consec = 0; j < length; j++) {
722  
              if (j >= 8 && discards[i - j] == 1)
723  
                break;
724  
              if (discards[i - j] == 2) {
725  
                consec = 0;
726  
                discards[i - j] = 0;
727  
              } else if (discards[i - j] == 0)
728  
                consec = 0;
729  
              else
730  
                consec++;
731  
              if (consec == 3)
732  
                break;
733  
            }
734  
          }
735  
        }
736  
      }
737  
    }
738  
739  
    /**
740  
     * Actually discard the lines.
741  
     *
742  
     * @param discards flags lines to be discarded
743  
     */
744  
    private void discard(final byte[] discards) {
745  
      final int end = buffered_lines;
746  
      int j = 0;
747  
      for (int i = 0; i < end; ++i)
748  
        if (no_discards || discards[i] == 0) {
749  
          undiscarded[j] = equivs[i];
750  
          realindexes[j++] = i;
751  
        } else
752  
          changed_flag[1 + i] = true;
753  
      nondiscarded_lines = j;
754  
    }
755  
756  
    file_data(Object[] data, Hashtable h) {
757  
      buffered_lines = data.length;
758  
759  
      equivs = new int[buffered_lines];
760  
      undiscarded = new int[buffered_lines];
761  
      realindexes = new int[buffered_lines];
762  
763  
      for (int i = 0; i < data.length; ++i) {
764  
        Integer ir = (Integer) h.get(data[i]);
765  
        if (ir == null)
766  
          h.put(data[i], new Integer(equivs[i] = equiv_max++));
767  
        else
768  
          equivs[i] = ir.intValue();
769  
      }
770  
    }
771  
772  
    /**
773  
     * Adjust inserts/deletes of blank lines to join changes
774  
     * as much as possible.
775  
     * <p/>
776  
     * We do something when a run of changed lines include a blank
777  
     * line at one end and have an excluded blank line at the other.
778  
     * We are free to choose which blank line is included.
779  
     * `compareseq' always chooses the one at the beginning,
780  
     * but usually it is cleaner to consider the following blank line
781  
     * to be the "change".  The only exception is if the preceding blank line
782  
     * would join this change to other changes.
783  
     *
784  
     * @param f the file being compared against
785  
     */
786  
787  
    void shift_boundaries(file_data f) {
788  
      final boolean[] changed = changed_flag;
789  
      final boolean[] other_changed = f.changed_flag;
790  
      int i = 0;
791  
      int j = 0;
792  
      int i_end = buffered_lines;
793  
      int preceding = -1;
794  
      int other_preceding = -1;
795  
796  
      for (; ;) {
797  
        int start, end, other_start;
798  
799  
        /* Scan forwards to find beginning of another run of changes.
800  
           Also keep track of the corresponding point in the other file.  */
801  
802  
        while (i < i_end && !changed[1 + i]) {
803  
          while (other_changed[1 + j++])
804  
            /* Non-corresponding lines in the other file
805  
               will count as the preceding batch of changes.  */
806  
            other_preceding = j;
807  
          i++;
808  
        }
809  
810  
        if (i == i_end)
811  
          break;
812  
813  
        start = i;
814  
        other_start = j;
815  
816  
        for (; ;) {
817  
          /* Now find the end of this run of changes.  */
818  
819  
          while (i < i_end && changed[1 + i]) i++;
820  
          end = i;
821  
822  
          /* If the first changed line matches the following unchanged one,
823  
       and this run does not follow right after a previous run,
824  
       and there are no lines deleted from the other file here,
825  
       then classify the first changed line as unchanged
826  
       and the following line as changed in its place.  */
827  
828  
          /* You might ask, how could this run follow right after another?
829  
       Only because the previous run was shifted here.  */
830  
831  
          if (end != i_end
832  
            && equivs[start] == equivs[end]
833  
            && !other_changed[1 + j]
834  
            && end != i_end
835  
            && !((preceding >= 0 && start == preceding)
836  
            || (other_preceding >= 0
837  
            && other_start == other_preceding))) {
838  
            changed[1 + end] = true;
839  
            changed[1 + start++] = false;
840  
            ++i;
841  
            /* Since one line-that-matches is now before this run
842  
               instead of after, we must advance in the other file
843  
               to keep in synch.  */
844  
            ++j;
845  
          } else
846  
            break;
847  
        }
848  
849  
        preceding = i;
850  
        other_preceding = j;
851  
      }
852  
    }
853  
854  
    /**
855  
     * Number of elements (lines) in this file.
856  
     */
857  
    final int buffered_lines;
858  
859  
    /**
860  
     * Vector, indexed by line number, containing an equivalence code for
861  
     * each line.  It is this vector that is actually compared with that
862  
     * of another file to generate differences.
863  
     */
864  
    private final int[] equivs;
865  
866  
    /**
867  
     * Vector, like the previous one except that
868  
     * the elements for discarded lines have been squeezed out.
869  
     */
870  
    final int[] undiscarded;
871  
872  
    /**
873  
     * Vector mapping virtual line numbers (not counting discarded lines)
874  
     * to real ones (counting those lines).  Both are origin-0.
875  
     */
876  
    final int[] realindexes;
877  
878  
    /**
879  
     * Total number of nondiscarded lines.
880  
     */
881  
    int nondiscarded_lines;
882  
883  
    /**
884  
     * Array, indexed by real origin-1 line number,
885  
     * containing true for a line that is an insertion or a deletion.
886  
     * The results of comparison are stored here.
887  
     */
888  
    boolean[] changed_flag;
889  
890  
  }
891  
}

download  show line numbers  debug dex  old transpilations   

Travelled to 18 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, ddnzoavkxhuk, ekrmjmnbrukm, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, sawdedvomwva, tslmcundralx, tvejysmllsmz, vouqrxazstgt, xrpafgyirdlv

No comments. add comment

Snippet ID: #1000882
Snippet name: class EGDiff
Eternal ID of this version: #1000882/1
Text MD5: 84f35f163e88efc555efb9ca2944468a
Author: stefan
Category: javax
Type: JavaX fragment (include)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-05-25 17:15:36
Source code size: 30610 bytes / 891 lines
Pitched / IR pitched: No / No
Views / Downloads: 684 / 6200
Referenced in: [show references]