Pages

Thursday, April 19, 2012

A Generic Text Comparison Tool Implementation with LCS Approach





Detecting and showing differences of two texts (especially having hundreds or thousands of lines) is a common problem. Using pure java.lang.String class methods may be a solution, but the most important issue for that kind of operations, "performance" will not be satisfactory. We want an efficient solution which may have a view as below: 

Text Difference Tool Example
The problem contains two parts: 
  • Detecting differences of two texts: For detecting differences, an efficient dynamic algorithm of LCS (Longest Common Subsequence) used in this solution. This solution has O(text1WordCount * text2WordCount) complexity and coded as "longestCommonSubsequence" method below.
  • Visualizing the differences: For visualizing, an HTML tag based approach is used, which marks new words of text2 with green color and old words of text1 with red color. This solution has O(changedWordsCount * (text1WordCount+text2WordCount)) complexity and coded as "markTextDifferences" method below.
Note1: For simplicity, "normalizeText" method is used for removing \n, \t and multiple space characters. 
Note2: This class was created as a Vaadin component. But "longestCommonSubsequence" is pure generic and "markTextDifferences" method is generic on HTML based visual components, so they can also be used with different frameworks.


  1. import java.util.ArrayList;
  2. import com.vaadin.ui.CustomComponent;
  3. import com.vaadin.ui.Label;
  4. import com.vaadin.ui.Layout;
  5. import com.vaadin.ui.VerticalLayout;
  6. /**
  7.  * Text comparison component which marks differences of two texts with colors.
  8.  *
  9.  * @author cb
  10.  */
  11. public class TextCompareComponent extends CustomComponent {
  12.     private Layout mainLayout = new VerticalLayout();
  13.     private ArrayList<String> longestCommonSubsequenceList;
  14.     private static final String INSERT_COLOR = "#99FFCC";
  15.     private static final String DELETE_COLOR = "#CB6D6D";
  16.     public TextCompareComponent(String text1, String text2) {
  17.        
  18.         text1 = normalizeText(text1);
  19.         text2 = normalizeText(text2);
  20.         this.longestCommonSubsequenceList = longestCommonSubsequence(text1, text2);
  21.         String result = markTextDifferences(text1, text2,
  22.             longestCommonSubsequenceList, INSERT_COLOR, DELETE_COLOR);
  23.        
  24.         Label label = new Label(result, Label.CONTENT_XHTML);
  25.         mainLayout.addComponent(label);
  26.         setCompositionRoot(mainLayout);
  27.     }
  28.     /**
  29.      * Finds a list of longest common subsequences (lcs) of given two texts.
  30.      *
  31.      * @param text1
  32.      * @param text2
  33.      * @return - longest common subsequence list
  34.      */
  35.     private ArrayList<String> longestCommonSubsequence(String text1, String text2) {
  36.         String[] text1Words = text1.split(" ");
  37.         String[] text2Words = text2.split(" ");
  38.         int text1WordCount = text1Words.length;
  39.         int text2WordCount = text2Words.length;
  40.        
  41.         int[][] solutionMatrix = new int[text1WordCount + 1][text2WordCount + 1];
  42.        
  43.         for (int i = text1WordCount - 1; i >= 0; i--) {
  44.             for (int j = text2WordCount - 1; j >= 0; j--) {
  45.                 if (text1Words[i].equals(text2Words[j])) {
  46.                     solutionMatrix[i][j] = solutionMatrix[i + 1][j + 1] + 1;
  47.                 }
  48.                 else {
  49.                     solutionMatrix[i][j] = Math.max(solutionMatrix[i + 1][j],
  50.                         solutionMatrix[i][j + 1]);
  51.                 }
  52.             }
  53.         }
  54.        
  55.         int i = 0, j = 0;
  56.         ArrayList<String> lcsResultList = new ArrayList<String>();
  57.         while (i < text1WordCount && j < text2WordCount) {
  58.             if (text1Words[i].equals(text2Words[j])) {
  59.                 lcsResultList.add(text2Words[j]);
  60.                 i++;
  61.                 j++;
  62.             }
  63.             else if (solutionMatrix[i + 1][j] >= solutionMatrix[i][j + 1]) {
  64.                 i++;
  65.             }
  66.             else {
  67.                 j++;
  68.             }
  69.         }
  70.         return lcsResultList;
  71.     }
  72.    
  73.     /**
  74.      * Normalizes given string by deleting \n, \t and extra spaces.
  75.      *
  76.      * @param text - initial string
  77.      * @return - normalized string
  78.      */
  79.     private String normalizeText(String text) {
  80.        
  81.         text = text.trim();
  82.         text = text.replace("\n", " ");
  83.         text = text.replace("\t", " ");
  84.        
  85.         while (text.contains("  ")) {
  86.             text = text.replace("  ", " ");
  87.         }
  88.         return text;
  89.     }
  90.     /**
  91.      * Returns colored inserted/deleted text representation of given two texts.
  92.      * Uses longestCommonSubsequenceList to determine colored sections.
  93.      *
  94.      * @param text1
  95.      * @param text2
  96.      * @param lcsList
  97.      * @param insertColor
  98.      * @param deleteColor
  99.      * @return - colored result text
  100.      */
  101.     private String markTextDifferences(String text1, String text2,
  102.         ArrayList<String> lcsList, String insertColor, String deleteColor) {
  103.         StringBuffer stringBuffer = new StringBuffer();
  104.         if (text1 != null && lcsList != null) {
  105.             String[] text1Words = text1.split(" ");
  106.             String[] text2Words = text2.split(" ");
  107.             int i = 0, j = 0, word1LastIndex = 0, word2LastIndex = 0;
  108.             for (int k = 0; k < lcsList.size(); k++) {
  109.                 for (i = word1LastIndex, j = word2LastIndex;
  110.                     i < text1Words.length && j < text2Words.length;) {
  111.                     if (text1Words[i].equals(lcsList.get(k)) &&
  112.                         text2Words[j].equals(lcsList.get(k))) {
  113.                         stringBuffer.append("<SPAN>" + lcsList.get(k) + " </SPAN>");
  114.                         word1LastIndex = i + 1;
  115.                         word2LastIndex = j + 1;
  116.                         i = text1Words.length;
  117.                         j = text2Words.length;
  118.                     }
  119.                     else if (!text1Words[i].equals(lcsList.get(k))) {
  120.                         for (; i < text1Words.length &&
  121.                             !text1Words[i].equals(lcsList.get(k)); i++) {
  122.                             stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  123.                                 deleteColor + "'>" + text1Words[i] + " </SPAN>");
  124.                         }
  125.                     } else if (!text2Words[j].equals(lcsList.get(k))) {
  126.                         for (; j < text2Words.length &&
  127.                             !text2Words[j].equals(lcsList.get(k)), j++) {
  128.                             stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  129.                                 insertColor + "'>" + text2Words[j] + " </SPAN>");
  130.                         }
  131.                     }
  132.                 }
  133.             }
  134.             for (; word1LastIndex < text1Words.length; word1LastIndex++) {
  135.                 stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  136.                     deleteColor + "'>" + text1Words[word1LastIndex] + " </SPAN>");
  137.             }
  138.             for (; word2LastIndex < text2Words.length; word2LastIndex++) {
  139.                 stringBuffer.append("<SPAN style='BACKGROUND-COLOR:" +
  140.                     insertColor + "'>" + text2Words[word2LastIndex] + " </SPAN>");
  141.             }
  142.         }
  143.         return stringBuffer.toString();
  144.     }
  145. }

No comments:

Post a Comment