SymSpell Spike (super-fast spelling correction algorithm) [OK] [1012161]

!7

// https://towardsdatascience.com/symspell-vs-bk-tree-100x-faster-fuzzy-string-search-spell-checking-c4f10d80a078

// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2015 Wolf Garbe
// Version: 3.0
// Author: Wolf Garbe <wolf.garbe@faroo.com>
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License, 
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program

private static int editDistanceMax=2;
private static int verbose = 0;
//0: top suggestion
//1: all suggestions of smallest edit distance 
//2: all suggestions <= editDistanceMax (slower, no early termination)

private static class dictionaryItem
{
    public List<Integer> suggestions = new ArrayList<Integer>();
    public int count = 0;
}

private static class suggestItem
{
    public String term = "";
    public int distance = 0;
    public int count = 0;

    @Override
    public boolean equals(Object obj)
    {
        return term.equals(((suggestItem)obj).term);
    }
    
    @Override
    public int hashCode()
    {
        return term.hashCode();
    }       
}

p {
  //Create the dictionary from a sample corpus
  //e.g. http://norvig.com/big.txt , or any other large text corpus
  //The dictionary may contain vocabulary from different languages. 
  //If you use mixed vocabulary use the language parameter in Correct() and CreateDictionary() accordingly.
  //You may use CreateDictionaryEntry() to update a (self learning) dictionary incrementally
  //To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry().
  File corpus = prepareProgramFile("big.txt");
  gunzipFile(loadLibrary(#1012160), corpus);
  CreateDictionary(f2s(corpus), "");
  ReadFromStdIn();
}

//Dictionary that contains both the original words and the deletes derived from them. A term might be both word and delete from another word at the same time.
//For space reduction a item might be either of type dictionaryItem or Int. 
//A dictionaryItem is used for word, word/delete, and delete with multiple suggestions. Int is used for deletes with a single suggestion (the majority of entries).
private static HashMap<String, Object> dictionary = new HashMap<String, Object>(); //initialisierung

//List of unique words. By using the suggestions (Int) as index for this list they are translated into the original String.
private static List<String> wordlist = new ArrayList<String>();

//create a non-unique wordlist from sample text
//language independent (e.g. works with Chinese characters)
private static Iterable<String> parseWords(String text)
{
    // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_" 
    // \d Digits
    // Provides identical results to Norvigs regex "[a-z]+" for latin characters, while additionally providing compatibility with non-latin characters
	List<String> allMatches = new ArrayList<String>();
	Matcher m = Pattern.compile("[\\w-[\\d_]]+").matcher(text.toLowerCase());
	while (m.find()) {
		allMatches.add(m.group());
	}
	return allMatches;
}

public static int maxlength = 0;//maximum dictionary term length

//for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary
//every delete entry has a suggestions list, which points to the original term(s) it was created from
//The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry
private static boolean CreateDictionaryEntry(String key, String language)
{
	boolean result = false;
    dictionaryItem value=null;
    Object valueo;
    valueo = dictionary.get(language+key);
    if (valueo!=null)
    {
        //int or dictionaryItem? delete existed before word!
        if (valueo instanceof Integer) { 
        	int tmp = (int)valueo; 
        	value = new dictionaryItem(); 
        	value.suggestions.add(tmp); 
        	dictionary.put(language + key,value); 
    	}

        //already exists:
        //1. word appears several times
        //2. word1==deletes(word2) 
        else
        {
            value = (dictionaryItem)valueo;
        }

        //prevent overflow
        if (value.count < Integer.MAX_VALUE) value.count++;
    }
    else if (wordlist.size() < Integer.MAX_VALUE)
    {
        value = new dictionaryItem();
        value.count++;
        dictionary.put(language + key, value);

        if (key.length() > maxlength) maxlength = key.length();
    }
    
    //edits/suggestions are created only once, no matter how often word occurs
    //edits/suggestions are created only as soon as the word occurs in the corpus, 
    //even if the same term existed before in the dictionary as an edit from another word
    //a treshold might be specifid, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
    if(value.count == 1)
    {
        //word2index
        wordlist.add(key);
        int keyint = (int)(wordlist.size() - 1);

        result = true;

        //create deletes
        for (String delete : Edits(key, 0, new HashSet<String>()))
        {
            Object value2;
            value2 = dictionary.get(language+delete);
            if (value2!=null)
            {
                //already exists:
                //1. word1==deletes(word2) 
                //2. deletes(word1)==deletes(word2) 
                //int or dictionaryItem? single delete existed before!
                if (value2 instanceof Integer) 
                {
                    //transformes int to dictionaryItem
                    int tmp = (int)value2; 
                    dictionaryItem di = new dictionaryItem(); 
                    di.suggestions.add(tmp); 
                    dictionary.put(language + delete,di);
                    if (!di.suggestions.contains(keyint)) AddLowestDistance(di, key, keyint, delete);
                }
                else if (!((dictionaryItem)value2).suggestions.contains(keyint)) AddLowestDistance((dictionaryItem) value2, key, keyint, delete);
            }
            else
            {
                dictionary.put(language + delete, keyint);         
            }

        }
    }
    return result;
}

//create a frequency dictionary from a corpus
private static void CreateDictionary(String corpus, String language)
{
	File f = new File(corpus);
    if(!(f.exists() && !f.isDirectory()))
    {
        System.out.println("File not found: " + corpus);
        return;
    }

    System.out.println("Creating dictionary ...");
    long startTime = System.currentTimeMillis();
    long wordCount = 0;
    
    BufferedReader br = null;
    try {
	br = new BufferedReader(new FileReader(corpus));
      String line;
      while ((line = br.readLine()) != null) 
      {
          for (String key : parseWords(line))
          {
             if (CreateDictionaryEntry(key, language)) wordCount++;
          }
      }
    }
    catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
    //wordlist.TrimExcess();
    long endTime = System.currentTimeMillis();
    System.out.println("\rDictionary: " + wordCount + " words, " + dictionary.size() + " entries, edit distance=" + editDistanceMax + " in " + (endTime-startTime)+"ms ");
}

//save some time and space
private static void AddLowestDistance(dictionaryItem item, String suggestion, int suggestionint, String delete)
{
    //remove all existing suggestions of higher distance, if verbose<2
    //index2word
	//TODO check
    if ((verbose < 2) && (item.suggestions.size() > 0) && (wordlist.get(item.suggestions.get(0)).length()-delete.length() > suggestion.length() - delete.length())) item.suggestions.clear();
    //do not add suggestion of higher distance than existing, if verbose<2
    if ((verbose == 2) || (item.suggestions.size() == 0) || (wordlist.get(item.suggestions.get(0)).length()-delete.length() >= suggestion.length() - delete.length())) item.suggestions.add(suggestionint); 
}

//inexpensive and language independent: only deletes, no transposes + replaces + inserts
//replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
private static HashSet<String> Edits(String word, int editDistance, HashSet<String> deletes)
{
    editDistance++;
    if (word.length() > 1)
    {
        for (int i = 0; i < word.length(); i++)
        {
        	//delete ith character
            String delete =  word.substring(0,i)+word.substring(i+1);
            if (deletes.add(delete))
            {
                //recursion, if maximum edit distance not yet reached
                if (editDistance < editDistanceMax) Edits(delete, editDistance, deletes);
            }
        }
    }
    return deletes;
}

private static List<suggestItem> Lookup(String input, String language, int editDistanceMax)
{
    //save some time
    if (input.length() - editDistanceMax > maxlength) 
    	return new ArrayList<suggestItem>();

    List<String> candidates = new ArrayList<String>();
    HashSet<String> hashset1 = new HashSet<String>();

    List<suggestItem> suggestions = new ArrayList<suggestItem>();
    HashSet<String> hashset2 = new HashSet<String>();

    Object valueo;

    //add original term
    candidates.add(input);

    while (candidates.size()>0)
    {
        String candidate = candidates.get(0);
        candidates.remove(0);

        //save some time
        //early termination
        //suggestion distance=candidate.distance... candidate.distance+editDistanceMax                
        //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
        
        //label for c# goto replacement
        nosort:{
        
        	if ((verbose < 2) && (suggestions.size() > 0) && (input.length()-candidate.length() > suggestions.get(0).distance)) 
        		break nosort;

          //read candidate entry from dictionary
        	valueo = dictionary.get(language + candidate);
          if (valueo != null)
          {
              dictionaryItem value= new dictionaryItem();
              if (valueo instanceof Integer) 
              	value.suggestions.add((int)valueo);
              else value = (dictionaryItem)valueo;

              //if count>0 then candidate entry is correct dictionary term, not only delete item
              if ((value.count > 0) && hashset2.add(candidate))
              {
                  //add correct dictionary term term to suggestion list
                  suggestItem si = new suggestItem();
                  si.term = candidate;
                  si.count = value.count;
                  si.distance = input.length() - candidate.length();
                  suggestions.add(si);
                  //early termination
                  if ((verbose < 2) && (input.length() - candidate.length() == 0)) 
                  	break nosort;
              }

              //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
              Object value2;
              for (int suggestionint : value.suggestions)
              {
                  //save some time 
                  //skipping double items early: different deletes of the input term can lead to the same suggestion
                  //index2word
              	//TODO
              	String suggestion = wordlist.get(suggestionint);
                  if (hashset2.add(suggestion))
                  {
                      //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                      //We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term. 
                      //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                      //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                      //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                      //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1)
                      //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                      int distance = 0;
                      if (suggestion != input)
                      {
                          if (suggestion.length() == candidate.length()) distance = input.length() - candidate.length();
                          else if (input.length() == candidate.length()) distance = suggestion.length() - candidate.length();
                          else
                          {
                              //common prefixes and suffixes are ignored, because this speeds up the Damerau-levenshtein-Distance calculation without changing it.
                              int ii = 0;
                              int jj = 0;
                              while ((ii < suggestion.length()) && (ii < input.length()) && (suggestion.charAt(ii) == input.charAt(ii))) ii++;
                              while ((jj < suggestion.length() - ii) && (jj < input.length() - ii) && (suggestion.charAt(suggestion.length() - jj - 1) == input.charAt(input.length() - jj - 1))) jj++;
                              if ((ii > 0) || (jj > 0)) { 
                              	distance = DamerauLevenshteinDistance(suggestion.substring(ii, suggestion.length() - jj), input.substring(ii, input.length() - jj)); 
                              } 
                              else distance = DamerauLevenshteinDistance(suggestion, input);
                          }
                      }

                      //save some time.
                      //remove all existing suggestions of higher distance, if verbose<2
                      if ((verbose < 2) && (suggestions.size() > 0) && (suggestions.get(0).distance > distance)) suggestions.clear();
                      //do not process higher distances than those already found, if verbose<2
                      if ((verbose < 2) && (suggestions.size() > 0) && (distance > suggestions.get(0).distance)) continue;

                      if (distance <= editDistanceMax)
                      {
                      	value2 = dictionary.get(language + suggestion);
                      	if (value2!=null)
                          {
                              suggestItem si = new suggestItem();
                              si.term = suggestion;
                              si.count = ((dictionaryItem)value2).count;
                              si.distance = distance;
                              suggestions.add(si);
                          }
                      }
                  }
              }//end foreach
          }//end if         
          
          //add edits 
          //derive edits (deletes) from candidate (input) and add them to candidates list
          //this is a recursive process until the maximum edit distance has been reached
          if (input.length() - candidate.length() < editDistanceMax)
          {
              //save some time
              //do not create edits with edit distance smaller than suggestions already found
              if ((verbose < 2) && (suggestions.size() > 0) && (input.length() - candidate.length() >= suggestions.get(0).distance)) continue;

              for (int i = 0; i < candidate.length(); i++)
              {
                  String delete = candidate.substring(0, i)+candidate.substring(i+1);
                  if (hashset1.add(delete)) candidates.add(delete);
              }
          }
        } //end lable nosort
    } //end while
    
    //sort by ascending edit distance, then by descending word frequency
    if (verbose < 2) 
    	//suggestions.Sort((x, y) => -x.count.CompareTo(y.count));
    	Collections.sort(suggestions, new Comparator<suggestItem>()
                {
            public int compare(suggestItem f1, suggestItem f2)
            {
                return -(f1.count-f2.count);
            }        
        });
    else 
    	//suggestions.Sort((x, y) => 2*x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
    	Collections.sort(suggestions, new Comparator<suggestItem>()
                {
            public int compare(suggestItem x, suggestItem y)
            {
                return ((2*x.distance-y.distance)>0?1:0) - ((x.count - y.count)>0?1:0);
            }        
        });
    if ((verbose == 0)&&(suggestions.size()>1)) 
    	return suggestions.subList(0, 1); 
    else return suggestions;
}

private static void Correct(String input, String language)
{
    List<suggestItem> suggestions = null;

    /*
    //Benchmark: 1000 x Lookup
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();
    for (int i = 0; i < 1000; i++)
    {
        suggestions = Lookup(input,language,editDistanceMax);
    }
    stopWatch.Stop();
    Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString());
    */
    
    
    //check in dictionary for existence and frequency; sort by ascending edit distance, then by descending word frequency
    suggestions = Lookup(input, language, editDistanceMax);

    //display term and frequency
    for (suggestItem suggestion: suggestions)
    {
        System.out.println( suggestion.term + " " + suggestion.distance + " " + suggestion.count);
    }
    if (verbose !=0) System.out.println(suggestions.size() + " suggestions");
    System.out.println("done");
}

private static void ReadFromStdIn()
{
    String word;
    BufferedReader br =  new BufferedReader(new InputStreamReader(System.in));
    try {
	while ((word = br.readLine())!=null)
	{
	    Correct(word,"");
	}
} catch (IOException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
}

// Damerau–Levenshtein distance algorithm and code 
// from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance (as retrieved in June 2012)
public static int DamerauLevenshteinDistance(String a, String b) {
	  final int inf = a.length() + b.length() + 1;
  int[][] H = new int[a.length() + 2][b.length() + 2];
  for (int i = 0; i <= a.length(); i++) {
   H[i + 1][1] = i;
   H[i + 1][0] = inf;
  }
  for (int j = 0; j <= b.length(); j++) {
   H[1][j + 1] = j;
   H[0][j + 1] = inf;
  }
  HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
  for (int d = 0; d < a.length(); d++) 
   if (!DA.containsKey(a.charAt(d)))
    DA.put(a.charAt(d), 0);
  
   
  for (int d = 0; d < b.length(); d++) 
   if (!DA.containsKey(b.charAt(d)))
    DA.put(b.charAt(d), 0);
  
  for (int i = 1; i <= a.length(); i++) {
   int DB = 0;
   for (int j = 1; j <= b.length(); j++) {
    final int i1 = DA.get(b.charAt(j - 1));
    final int j1 = DB;
    int d = 1;
    if (a.charAt(i - 1) == b.charAt(j - 1)) {
     d = 0;
     DB = j;
    }
    H[i + 1][j + 1] = min(
      H[i][j] + d, 
      H[i + 1][j] + 1,
      H[i][j + 1] + 1, 
      H[i1][j1] + ((i - i1 - 1)) 
      + 1 + ((j - j1 - 1)));
   }
   DA.put(a.charAt(i - 1), i);
  }
  return H[a.length() + 1][b.length() + 1];
 }
 
static int min(int a, int b, int c, int d) {
  return Math.min(a, Math.min(b, Math.min(c, d)));
}

static int min(int a, int b) { ret Math.min(a, b); }
static long min(long a, long b) { ret Math.min(a, b); }
static float min(float a, float b) { ret Math.min(a, b); }

Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

Snippet ID:	#1012161
Snippet name:	SymSpell Spike (super-fast spelling correction algorithm) [OK]
Eternal ID of this version:	#1012161/7
Text MD5:	19b2a088f0cf46b1fae9fa1a414b3ebf
Transpilation MD5:	1ce39b60bdb1b3a25225ee10b95bbba9
Author:	stefan
Category:	javax / nlp
Type:	JavaX source code (desktop)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2017-11-23 08:32:36
Source code size:	21115 bytes / 500 lines
Pitched / IR pitched:	No / No
Views / Downloads:	824 / 1551
Version history:	6 change(s)
Referenced in:	[show references]

< > BotCompany Repo | #1012161 // SymSpell Spike (super-fast spelling correction algorithm) [OK]

JavaX source code (desktop) [tags: use-pretranspiled] - run with: x30.jar

1	!7
2
3	// https://towardsdatascience.com/symspell-vs-bk-tree-100x-faster-fuzzy-string-search-spell-checking-c4f10d80a078
4
5	// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
6	//
7	// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup
8	// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
9	// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
10	// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
11	// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
12	//
13	// Copyright (C) 2015 Wolf Garbe
14	// Version: 3.0
15	// Author: Wolf Garbe <wolf.garbe@faroo.com>
16	// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
17	// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
18	// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
19	//
20	// License:
21	// This program is free software; you can redistribute it and/or modify
22	// it under the terms of the GNU Lesser General Public License,
23	// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
24	// http://www.opensource.org/licenses/LGPL-3.0
25	//
26	// Usage: single word + Enter: Display spelling suggestions
27	// Enter without input: Terminate the program
28
29	private static int editDistanceMax=2;
30	private static int verbose = 0;
31	//0: top suggestion
32	//1: all suggestions of smallest edit distance
33	//2: all suggestions <= editDistanceMax (slower, no early termination)
34
35	private static class dictionaryItem
36	{
37	public List<Integer> suggestions = new ArrayList<Integer>();
38	public int count = 0;
39	}
40
41	private static class suggestItem
42	{
43	public String term = "";
44	public int distance = 0;
45	public int count = 0;
46
47	@Override
48	public boolean equals(Object obj)
49	{
50	return term.equals(((suggestItem)obj).term);
51	}
52
53	@Override
54	public int hashCode()
55	{
56	return term.hashCode();
57	}
58	}
59
60	p {
61	//Create the dictionary from a sample corpus
62	//e.g. http://norvig.com/big.txt , or any other large text corpus
63	//The dictionary may contain vocabulary from different languages.
64	//If you use mixed vocabulary use the language parameter in Correct() and CreateDictionary() accordingly.
65	//You may use CreateDictionaryEntry() to update a (self learning) dictionary incrementally
66	//To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry().
67	File corpus = prepareProgramFile("big.txt");
68	gunzipFile(loadLibrary(#1012160), corpus);
69	CreateDictionary(f2s(corpus), "");
70	ReadFromStdIn();
71	}
72
73	//Dictionary that contains both the original words and the deletes derived from them. A term might be both word and delete from another word at the same time.
74	//For space reduction a item might be either of type dictionaryItem or Int.
75	//A dictionaryItem is used for word, word/delete, and delete with multiple suggestions. Int is used for deletes with a single suggestion (the majority of entries).
76	private static HashMap<String, Object> dictionary = new HashMap<String, Object>(); //initialisierung
77
78	//List of unique words. By using the suggestions (Int) as index for this list they are translated into the original String.
79	private static List<String> wordlist = new ArrayList<String>();
80
81	//create a non-unique wordlist from sample text
82	//language independent (e.g. works with Chinese characters)
83	private static Iterable<String> parseWords(String text)
84	{
85	// \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_"
86	// \d Digits
87	// Provides identical results to Norvigs regex "[a-z]+" for latin characters, while additionally providing compatibility with non-latin characters
88	List<String> allMatches = new ArrayList<String>();
89	Matcher m = Pattern.compile("[\\w-[\\d_]]+").matcher(text.toLowerCase());
90	while (m.find()) {
91	allMatches.add(m.group());
92	}
93	return allMatches;
94	}
95
96	public static int maxlength = 0;//maximum dictionary term length
97
98	//for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary
99	//every delete entry has a suggestions list, which points to the original term(s) it was created from
100	//The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry
101	private static boolean CreateDictionaryEntry(String key, String language)
102	{
103	boolean result = false;
104	dictionaryItem value=null;
105	Object valueo;
106	valueo = dictionary.get(language+key);
107	if (valueo!=null)
108	{
109	//int or dictionaryItem? delete existed before word!
110	if (valueo instanceof Integer) {
111	int tmp = (int)valueo;
112	value = new dictionaryItem();
113	value.suggestions.add(tmp);
114	dictionary.put(language + key,value);
115	}
116
117	//already exists:
118	//1. word appears several times
119	//2. word1==deletes(word2)
120	else
121	{
122	value = (dictionaryItem)valueo;
123	}
124
125	//prevent overflow
126	if (value.count < Integer.MAX_VALUE) value.count++;
127	}
128	else if (wordlist.size() < Integer.MAX_VALUE)
129	{
130	value = new dictionaryItem();
131	value.count++;
132	dictionary.put(language + key, value);
133
134	if (key.length() > maxlength) maxlength = key.length();
135	}
136
137	//edits/suggestions are created only once, no matter how often word occurs
138	//edits/suggestions are created only as soon as the word occurs in the corpus,
139	//even if the same term existed before in the dictionary as an edit from another word
140	//a treshold might be specifid, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
141	if(value.count == 1)
142	{
143	//word2index
144	wordlist.add(key);
145	int keyint = (int)(wordlist.size() - 1);
146
147	result = true;
148
149	//create deletes
150	for (String delete : Edits(key, 0, new HashSet<String>()))
151	{
152	Object value2;
153	value2 = dictionary.get(language+delete);
154	if (value2!=null)
155	{
156	//already exists:
157	//1. word1==deletes(word2)
158	//2. deletes(word1)==deletes(word2)
159	//int or dictionaryItem? single delete existed before!
160	if (value2 instanceof Integer)
161	{
162	//transformes int to dictionaryItem
163	int tmp = (int)value2;
164	dictionaryItem di = new dictionaryItem();
165	di.suggestions.add(tmp);
166	dictionary.put(language + delete,di);
167	if (!di.suggestions.contains(keyint)) AddLowestDistance(di, key, keyint, delete);
168	}
169	else if (!((dictionaryItem)value2).suggestions.contains(keyint)) AddLowestDistance((dictionaryItem) value2, key, keyint, delete);
170	}
171	else
172	{
173	dictionary.put(language + delete, keyint);
174	}
175
176	}
177	}
178	return result;
179	}
180
181	//create a frequency dictionary from a corpus
182	private static void CreateDictionary(String corpus, String language)
183	{
184	File f = new File(corpus);
185	if(!(f.exists() && !f.isDirectory()))
186	{
187	System.out.println("File not found: " + corpus);
188	return;
189	}
190
191	System.out.println("Creating dictionary ...");
192	long startTime = System.currentTimeMillis();
193	long wordCount = 0;
194
195	BufferedReader br = null;
196	try {
197	br = new BufferedReader(new FileReader(corpus));
198	String line;
199	while ((line = br.readLine()) != null)
200	{
201	for (String key : parseWords(line))
202	{
203	if (CreateDictionaryEntry(key, language)) wordCount++;
204	}
205	}
206	}
207	catch (Exception e) {
208	// TODO Auto-generated catch block
209	e.printStackTrace();
210	}
211	//wordlist.TrimExcess();
212	long endTime = System.currentTimeMillis();
213	System.out.println("\rDictionary: " + wordCount + " words, " + dictionary.size() + " entries, edit distance=" + editDistanceMax + " in " + (endTime-startTime)+"ms ");
214	}
215
216	//save some time and space
217	private static void AddLowestDistance(dictionaryItem item, String suggestion, int suggestionint, String delete)
218	{
219	//remove all existing suggestions of higher distance, if verbose<2
220	//index2word
221	//TODO check
222	if ((verbose < 2) && (item.suggestions.size() > 0) && (wordlist.get(item.suggestions.get(0)).length()-delete.length() > suggestion.length() - delete.length())) item.suggestions.clear();
223	//do not add suggestion of higher distance than existing, if verbose<2
224	if ((verbose == 2) \|\| (item.suggestions.size() == 0) \|\| (wordlist.get(item.suggestions.get(0)).length()-delete.length() >= suggestion.length() - delete.length())) item.suggestions.add(suggestionint);
225	}
226
227	//inexpensive and language independent: only deletes, no transposes + replaces + inserts
228	//replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
229	private static HashSet<String> Edits(String word, int editDistance, HashSet<String> deletes)
230	{
231	editDistance++;
232	if (word.length() > 1)
233	{
234	for (int i = 0; i < word.length(); i++)
235	{
236	//delete ith character
237	String delete = word.substring(0,i)+word.substring(i+1);
238	if (deletes.add(delete))
239	{
240	//recursion, if maximum edit distance not yet reached
241	if (editDistance < editDistanceMax) Edits(delete, editDistance, deletes);
242	}
243	}
244	}
245	return deletes;
246	}
247
248	private static List<suggestItem> Lookup(String input, String language, int editDistanceMax)
249	{
250	//save some time
251	if (input.length() - editDistanceMax > maxlength)
252	return new ArrayList<suggestItem>();
253
254	List<String> candidates = new ArrayList<String>();
255	HashSet<String> hashset1 = new HashSet<String>();
256
257	List<suggestItem> suggestions = new ArrayList<suggestItem>();
258	HashSet<String> hashset2 = new HashSet<String>();
259
260	Object valueo;
261
262	//add original term
263	candidates.add(input);
264
265	while (candidates.size()>0)
266	{
267	String candidate = candidates.get(0);
268	candidates.remove(0);
269
270	//save some time
271	//early termination
272	//suggestion distance=candidate.distance... candidate.distance+editDistanceMax
273	//if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
274
275	//label for c# goto replacement
276	nosort:{
277
278	if ((verbose < 2) && (suggestions.size() > 0) && (input.length()-candidate.length() > suggestions.get(0).distance))
279	break nosort;
280
281	//read candidate entry from dictionary
282	valueo = dictionary.get(language + candidate);
283	if (valueo != null)
284	{
285	dictionaryItem value= new dictionaryItem();
286	if (valueo instanceof Integer)
287	value.suggestions.add((int)valueo);
288	else value = (dictionaryItem)valueo;
289
290	//if count>0 then candidate entry is correct dictionary term, not only delete item
291	if ((value.count > 0) && hashset2.add(candidate))
292	{
293	//add correct dictionary term term to suggestion list
294	suggestItem si = new suggestItem();
295	si.term = candidate;
296	si.count = value.count;
297	si.distance = input.length() - candidate.length();
298	suggestions.add(si);
299	//early termination
300	if ((verbose < 2) && (input.length() - candidate.length() == 0))
301	break nosort;
302	}
303
304	//iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
305	Object value2;
306	for (int suggestionint : value.suggestions)
307	{
308	//save some time
309	//skipping double items early: different deletes of the input term can lead to the same suggestion
310	//index2word
311	//TODO
312	String suggestion = wordlist.get(suggestionint);
313	if (hashset2.add(suggestion))
314	{
315	//True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
316	//We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term.
317	//For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
318	//For inserts and deletes the resulting edit distance might exceed editDistanceMax.
319	//To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
320	//Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1)
321	//Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
322	int distance = 0;
323	if (suggestion != input)
324	{
325	if (suggestion.length() == candidate.length()) distance = input.length() - candidate.length();
326	else if (input.length() == candidate.length()) distance = suggestion.length() - candidate.length();
327	else
328	{
329	//common prefixes and suffixes are ignored, because this speeds up the Damerau-levenshtein-Distance calculation without changing it.
330	int ii = 0;
331	int jj = 0;
332	while ((ii < suggestion.length()) && (ii < input.length()) && (suggestion.charAt(ii) == input.charAt(ii))) ii++;
333	while ((jj < suggestion.length() - ii) && (jj < input.length() - ii) && (suggestion.charAt(suggestion.length() - jj - 1) == input.charAt(input.length() - jj - 1))) jj++;
334	if ((ii > 0) \|\| (jj > 0)) {
335	distance = DamerauLevenshteinDistance(suggestion.substring(ii, suggestion.length() - jj), input.substring(ii, input.length() - jj));
336	}
337	else distance = DamerauLevenshteinDistance(suggestion, input);
338	}
339	}
340
341	//save some time.
342	//remove all existing suggestions of higher distance, if verbose<2
343	if ((verbose < 2) && (suggestions.size() > 0) && (suggestions.get(0).distance > distance)) suggestions.clear();
344	//do not process higher distances than those already found, if verbose<2
345	if ((verbose < 2) && (suggestions.size() > 0) && (distance > suggestions.get(0).distance)) continue;
346
347	if (distance <= editDistanceMax)
348	{
349	value2 = dictionary.get(language + suggestion);
350	if (value2!=null)
351	{
352	suggestItem si = new suggestItem();
353	si.term = suggestion;
354	si.count = ((dictionaryItem)value2).count;
355	si.distance = distance;
356	suggestions.add(si);
357	}
358	}
359	}
360	}//end foreach
361	}//end if
362
363	//add edits
364	//derive edits (deletes) from candidate (input) and add them to candidates list
365	//this is a recursive process until the maximum edit distance has been reached
366	if (input.length() - candidate.length() < editDistanceMax)
367	{
368	//save some time
369	//do not create edits with edit distance smaller than suggestions already found
370	if ((verbose < 2) && (suggestions.size() > 0) && (input.length() - candidate.length() >= suggestions.get(0).distance)) continue;
371
372	for (int i = 0; i < candidate.length(); i++)
373	{
374	String delete = candidate.substring(0, i)+candidate.substring(i+1);
375	if (hashset1.add(delete)) candidates.add(delete);
376	}
377	}
378	} //end lable nosort
379	} //end while
380
381	//sort by ascending edit distance, then by descending word frequency
382	if (verbose < 2)
383	//suggestions.Sort((x, y) => -x.count.CompareTo(y.count));
384	Collections.sort(suggestions, new Comparator<suggestItem>()
385	{
386	public int compare(suggestItem f1, suggestItem f2)
387	{
388	return -(f1.count-f2.count);
389	}
390	});
391	else
392	//suggestions.Sort((x, y) => 2*x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
393	Collections.sort(suggestions, new Comparator<suggestItem>()
394	{
395	public int compare(suggestItem x, suggestItem y)
396	{
397	return ((2*x.distance-y.distance)>0?1:0) - ((x.count - y.count)>0?1:0);
398	}
399	});
400	if ((verbose == 0)&&(suggestions.size()>1))
401	return suggestions.subList(0, 1);
402	else return suggestions;
403	}
404
405	private static void Correct(String input, String language)
406	{
407	List<suggestItem> suggestions = null;
408
409	/*
410	//Benchmark: 1000 x Lookup
411	Stopwatch stopWatch = new Stopwatch();
412	stopWatch.Start();
413	for (int i = 0; i < 1000; i++)
414	{
415	suggestions = Lookup(input,language,editDistanceMax);
416	}
417	stopWatch.Stop();
418	Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString());
419	*/
420
421
422	//check in dictionary for existence and frequency; sort by ascending edit distance, then by descending word frequency
423	suggestions = Lookup(input, language, editDistanceMax);
424
425	//display term and frequency
426	for (suggestItem suggestion: suggestions)
427	{
428	System.out.println( suggestion.term + " " + suggestion.distance + " " + suggestion.count);
429	}
430	if (verbose !=0) System.out.println(suggestions.size() + " suggestions");
431	System.out.println("done");
432	}
433
434	private static void ReadFromStdIn()
435	{
436	String word;
437	BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
438	try {
439	while ((word = br.readLine())!=null)
440	{
441	Correct(word,"");
442	}
443	} catch (IOException e) {
444	// TODO Auto-generated catch block
445	e.printStackTrace();
446	}
447	}
448
449	// Damerau–Levenshtein distance algorithm and code
450	// from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance (as retrieved in June 2012)
451	public static int DamerauLevenshteinDistance(String a, String b) {
452	final int inf = a.length() + b.length() + 1;
453	int[][] H = new int[a.length() + 2][b.length() + 2];
454	for (int i = 0; i <= a.length(); i++) {
455	H[i + 1][1] = i;
456	H[i + 1][0] = inf;
457	}
458	for (int j = 0; j <= b.length(); j++) {
459	H[1][j + 1] = j;
460	H[0][j + 1] = inf;
461	}
462	HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
463	for (int d = 0; d < a.length(); d++)
464	if (!DA.containsKey(a.charAt(d)))
465	DA.put(a.charAt(d), 0);
466
467
468	for (int d = 0; d < b.length(); d++)
469	if (!DA.containsKey(b.charAt(d)))
470	DA.put(b.charAt(d), 0);
471
472	for (int i = 1; i <= a.length(); i++) {
473	int DB = 0;
474	for (int j = 1; j <= b.length(); j++) {
475	final int i1 = DA.get(b.charAt(j - 1));
476	final int j1 = DB;
477	int d = 1;
478	if (a.charAt(i - 1) == b.charAt(j - 1)) {
479	d = 0;
480	DB = j;
481	}
482	H[i + 1][j + 1] = min(
483	H[i][j] + d,
484	H[i + 1][j] + 1,
485	H[i][j + 1] + 1,
486	H[i1][j1] + ((i - i1 - 1))
487	+ 1 + ((j - j1 - 1)));
488	}
489	DA.put(a.charAt(i - 1), i);
490	}
491	return H[a.length() + 1][b.length() + 1];
492	}
493
494	static int min(int a, int b, int c, int d) {
495	return Math.min(a, Math.min(b, Math.min(c, d)));
496	}
497
498	static int min(int a, int b) { ret Math.min(a, b); }
499	static long min(long a, long b) { ret Math.min(a, b); }
500	static float min(float a, float b) { ret Math.min(a, b); }