Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

500
LINES

< > BotCompany Repo | #1012161 // SymSpell Spike (super-fast spelling correction algorithm) [OK]

JavaX source code (desktop) [tags: use-pretranspiled] - run with: x30.jar

Download Jar. Libraryless. Click here for Pure Java version (16773L/120K).

1  
!7
2  
3  
// https://towardsdatascience.com/symspell-vs-bk-tree-100x-faster-fuzzy-string-search-spell-checking-c4f10d80a078
4  
5  
// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
6  
//
7  
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
8  
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
9  
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
10  
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
11  
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
12  
//
13  
// Copyright (C) 2015 Wolf Garbe
14  
// Version: 3.0
15  
// Author: Wolf Garbe <wolf.garbe@faroo.com>
16  
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
17  
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
18  
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
19  
//
20  
// License:
21  
// This program is free software; you can redistribute it and/or modify
22  
// it under the terms of the GNU Lesser General Public License, 
23  
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
24  
// http://www.opensource.org/licenses/LGPL-3.0
25  
//
26  
// Usage: single word + Enter:  Display spelling suggestions
27  
//        Enter without input:  Terminate the program
28  
29  
private static int editDistanceMax=2;
30  
private static int verbose = 0;
31  
//0: top suggestion
32  
//1: all suggestions of smallest edit distance 
33  
//2: all suggestions <= editDistanceMax (slower, no early termination)
34  
35  
private static class dictionaryItem
36  
{
37  
    public List<Integer> suggestions = new ArrayList<Integer>();
38  
    public int count = 0;
39  
}
40  
41  
private static class suggestItem
42  
{
43  
    public String term = "";
44  
    public int distance = 0;
45  
    public int count = 0;
46  
47  
    @Override
48  
    public boolean equals(Object obj)
49  
    {
50  
        return term.equals(((suggestItem)obj).term);
51  
    }
52  
    
53  
    @Override
54  
    public int hashCode()
55  
    {
56  
        return term.hashCode();
57  
    }       
58  
}
59  
60  
p {
61  
  //Create the dictionary from a sample corpus
62  
  //e.g. http://norvig.com/big.txt , or any other large text corpus
63  
  //The dictionary may contain vocabulary from different languages. 
64  
  //If you use mixed vocabulary use the language parameter in Correct() and CreateDictionary() accordingly.
65  
  //You may use CreateDictionaryEntry() to update a (self learning) dictionary incrementally
66  
  //To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry().
67  
  File corpus = prepareProgramFile("big.txt");
68  
  gunzipFile(loadLibrary(#1012160), corpus);
69  
  CreateDictionary(f2s(corpus), "");
70  
  ReadFromStdIn();
71  
}
72  
73  
//Dictionary that contains both the original words and the deletes derived from them. A term might be both word and delete from another word at the same time.
74  
//For space reduction a item might be either of type dictionaryItem or Int. 
75  
//A dictionaryItem is used for word, word/delete, and delete with multiple suggestions. Int is used for deletes with a single suggestion (the majority of entries).
76  
private static HashMap<String, Object> dictionary = new HashMap<String, Object>(); //initialisierung
77  
78  
//List of unique words. By using the suggestions (Int) as index for this list they are translated into the original String.
79  
private static List<String> wordlist = new ArrayList<String>();
80  
81  
//create a non-unique wordlist from sample text
82  
//language independent (e.g. works with Chinese characters)
83  
private static Iterable<String> parseWords(String text)
84  
{
85  
    // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_" 
86  
    // \d Digits
87  
    // Provides identical results to Norvigs regex "[a-z]+" for latin characters, while additionally providing compatibility with non-latin characters
88  
	List<String> allMatches = new ArrayList<String>();
89  
	Matcher m = Pattern.compile("[\\w-[\\d_]]+").matcher(text.toLowerCase());
90  
	while (m.find()) {
91  
		allMatches.add(m.group());
92  
	}
93  
	return allMatches;
94  
}
95  
96  
public static int maxlength = 0;//maximum dictionary term length
97  
98  
//for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary
99  
//every delete entry has a suggestions list, which points to the original term(s) it was created from
100  
//The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry
101  
private static boolean CreateDictionaryEntry(String key, String language)
102  
{
103  
	boolean result = false;
104  
    dictionaryItem value=null;
105  
    Object valueo;
106  
    valueo = dictionary.get(language+key);
107  
    if (valueo!=null)
108  
    {
109  
        //int or dictionaryItem? delete existed before word!
110  
        if (valueo instanceof Integer) { 
111  
        	int tmp = (int)valueo; 
112  
        	value = new dictionaryItem(); 
113  
        	value.suggestions.add(tmp); 
114  
        	dictionary.put(language + key,value); 
115  
    	}
116  
117  
        //already exists:
118  
        //1. word appears several times
119  
        //2. word1==deletes(word2) 
120  
        else
121  
        {
122  
            value = (dictionaryItem)valueo;
123  
        }
124  
125  
        //prevent overflow
126  
        if (value.count < Integer.MAX_VALUE) value.count++;
127  
    }
128  
    else if (wordlist.size() < Integer.MAX_VALUE)
129  
    {
130  
        value = new dictionaryItem();
131  
        value.count++;
132  
        dictionary.put(language + key, value);
133  
134  
        if (key.length() > maxlength) maxlength = key.length();
135  
    }
136  
    
137  
    //edits/suggestions are created only once, no matter how often word occurs
138  
    //edits/suggestions are created only as soon as the word occurs in the corpus, 
139  
    //even if the same term existed before in the dictionary as an edit from another word
140  
    //a treshold might be specifid, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
141  
    if(value.count == 1)
142  
    {
143  
        //word2index
144  
        wordlist.add(key);
145  
        int keyint = (int)(wordlist.size() - 1);
146  
147  
        result = true;
148  
149  
        //create deletes
150  
        for (String delete : Edits(key, 0, new HashSet<String>()))
151  
        {
152  
            Object value2;
153  
            value2 = dictionary.get(language+delete);
154  
            if (value2!=null)
155  
            {
156  
                //already exists:
157  
                //1. word1==deletes(word2) 
158  
                //2. deletes(word1)==deletes(word2) 
159  
                //int or dictionaryItem? single delete existed before!
160  
                if (value2 instanceof Integer) 
161  
                {
162  
                    //transformes int to dictionaryItem
163  
                    int tmp = (int)value2; 
164  
                    dictionaryItem di = new dictionaryItem(); 
165  
                    di.suggestions.add(tmp); 
166  
                    dictionary.put(language + delete,di);
167  
                    if (!di.suggestions.contains(keyint)) AddLowestDistance(di, key, keyint, delete);
168  
                }
169  
                else if (!((dictionaryItem)value2).suggestions.contains(keyint)) AddLowestDistance((dictionaryItem) value2, key, keyint, delete);
170  
            }
171  
            else
172  
            {
173  
                dictionary.put(language + delete, keyint);         
174  
            }
175  
176  
        }
177  
    }
178  
    return result;
179  
}
180  
181  
//create a frequency dictionary from a corpus
182  
private static void CreateDictionary(String corpus, String language)
183  
{
184  
	File f = new File(corpus);
185  
    if(!(f.exists() && !f.isDirectory()))
186  
    {
187  
        System.out.println("File not found: " + corpus);
188  
        return;
189  
    }
190  
191  
    System.out.println("Creating dictionary ...");
192  
    long startTime = System.currentTimeMillis();
193  
    long wordCount = 0;
194  
    
195  
    BufferedReader br = null;
196  
    try {
197  
	br = new BufferedReader(new FileReader(corpus));
198  
      String line;
199  
      while ((line = br.readLine()) != null) 
200  
      {
201  
          for (String key : parseWords(line))
202  
          {
203  
             if (CreateDictionaryEntry(key, language)) wordCount++;
204  
          }
205  
      }
206  
    }
207  
    catch (Exception e) {
208  
	// TODO Auto-generated catch block
209  
	e.printStackTrace();
210  
}
211  
    //wordlist.TrimExcess();
212  
    long endTime = System.currentTimeMillis();
213  
    System.out.println("\rDictionary: " + wordCount + " words, " + dictionary.size() + " entries, edit distance=" + editDistanceMax + " in " + (endTime-startTime)+"ms ");
214  
}
215  
216  
//save some time and space
217  
private static void AddLowestDistance(dictionaryItem item, String suggestion, int suggestionint, String delete)
218  
{
219  
    //remove all existing suggestions of higher distance, if verbose<2
220  
    //index2word
221  
	//TODO check
222  
    if ((verbose < 2) && (item.suggestions.size() > 0) && (wordlist.get(item.suggestions.get(0)).length()-delete.length() > suggestion.length() - delete.length())) item.suggestions.clear();
223  
    //do not add suggestion of higher distance than existing, if verbose<2
224  
    if ((verbose == 2) || (item.suggestions.size() == 0) || (wordlist.get(item.suggestions.get(0)).length()-delete.length() >= suggestion.length() - delete.length())) item.suggestions.add(suggestionint); 
225  
}
226  
227  
//inexpensive and language independent: only deletes, no transposes + replaces + inserts
228  
//replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
229  
private static HashSet<String> Edits(String word, int editDistance, HashSet<String> deletes)
230  
{
231  
    editDistance++;
232  
    if (word.length() > 1)
233  
    {
234  
        for (int i = 0; i < word.length(); i++)
235  
        {
236  
        	//delete ith character
237  
            String delete =  word.substring(0,i)+word.substring(i+1);
238  
            if (deletes.add(delete))
239  
            {
240  
                //recursion, if maximum edit distance not yet reached
241  
                if (editDistance < editDistanceMax) Edits(delete, editDistance, deletes);
242  
            }
243  
        }
244  
    }
245  
    return deletes;
246  
}
247  
248  
private static List<suggestItem> Lookup(String input, String language, int editDistanceMax)
249  
{
250  
    //save some time
251  
    if (input.length() - editDistanceMax > maxlength) 
252  
    	return new ArrayList<suggestItem>();
253  
254  
    List<String> candidates = new ArrayList<String>();
255  
    HashSet<String> hashset1 = new HashSet<String>();
256  
257  
    List<suggestItem> suggestions = new ArrayList<suggestItem>();
258  
    HashSet<String> hashset2 = new HashSet<String>();
259  
260  
    Object valueo;
261  
262  
    //add original term
263  
    candidates.add(input);
264  
265  
    while (candidates.size()>0)
266  
    {
267  
        String candidate = candidates.get(0);
268  
        candidates.remove(0);
269  
270  
        //save some time
271  
        //early termination
272  
        //suggestion distance=candidate.distance... candidate.distance+editDistanceMax                
273  
        //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
274  
        
275  
        //label for c# goto replacement
276  
        nosort:{
277  
        
278  
        	if ((verbose < 2) && (suggestions.size() > 0) && (input.length()-candidate.length() > suggestions.get(0).distance)) 
279  
        		break nosort;
280  
281  
          //read candidate entry from dictionary
282  
        	valueo = dictionary.get(language + candidate);
283  
          if (valueo != null)
284  
          {
285  
              dictionaryItem value= new dictionaryItem();
286  
              if (valueo instanceof Integer) 
287  
              	value.suggestions.add((int)valueo);
288  
              else value = (dictionaryItem)valueo;
289  
290  
              //if count>0 then candidate entry is correct dictionary term, not only delete item
291  
              if ((value.count > 0) && hashset2.add(candidate))
292  
              {
293  
                  //add correct dictionary term term to suggestion list
294  
                  suggestItem si = new suggestItem();
295  
                  si.term = candidate;
296  
                  si.count = value.count;
297  
                  si.distance = input.length() - candidate.length();
298  
                  suggestions.add(si);
299  
                  //early termination
300  
                  if ((verbose < 2) && (input.length() - candidate.length() == 0)) 
301  
                  	break nosort;
302  
              }
303  
304  
              //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
305  
              Object value2;
306  
              for (int suggestionint : value.suggestions)
307  
              {
308  
                  //save some time 
309  
                  //skipping double items early: different deletes of the input term can lead to the same suggestion
310  
                  //index2word
311  
              	//TODO
312  
              	String suggestion = wordlist.get(suggestionint);
313  
                  if (hashset2.add(suggestion))
314  
                  {
315  
                      //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
316  
                      //We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term. 
317  
                      //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
318  
                      //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
319  
                      //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
320  
                      //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1)
321  
                      //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
322  
                      int distance = 0;
323  
                      if (suggestion != input)
324  
                      {
325  
                          if (suggestion.length() == candidate.length()) distance = input.length() - candidate.length();
326  
                          else if (input.length() == candidate.length()) distance = suggestion.length() - candidate.length();
327  
                          else
328  
                          {
329  
                              //common prefixes and suffixes are ignored, because this speeds up the Damerau-levenshtein-Distance calculation without changing it.
330  
                              int ii = 0;
331  
                              int jj = 0;
332  
                              while ((ii < suggestion.length()) && (ii < input.length()) && (suggestion.charAt(ii) == input.charAt(ii))) ii++;
333  
                              while ((jj < suggestion.length() - ii) && (jj < input.length() - ii) && (suggestion.charAt(suggestion.length() - jj - 1) == input.charAt(input.length() - jj - 1))) jj++;
334  
                              if ((ii > 0) || (jj > 0)) { 
335  
                              	distance = DamerauLevenshteinDistance(suggestion.substring(ii, suggestion.length() - jj), input.substring(ii, input.length() - jj)); 
336  
                              } 
337  
                              else distance = DamerauLevenshteinDistance(suggestion, input);
338  
                          }
339  
                      }
340  
341  
                      //save some time.
342  
                      //remove all existing suggestions of higher distance, if verbose<2
343  
                      if ((verbose < 2) && (suggestions.size() > 0) && (suggestions.get(0).distance > distance)) suggestions.clear();
344  
                      //do not process higher distances than those already found, if verbose<2
345  
                      if ((verbose < 2) && (suggestions.size() > 0) && (distance > suggestions.get(0).distance)) continue;
346  
347  
                      if (distance <= editDistanceMax)
348  
                      {
349  
                      	value2 = dictionary.get(language + suggestion);
350  
                      	if (value2!=null)
351  
                          {
352  
                              suggestItem si = new suggestItem();
353  
                              si.term = suggestion;
354  
                              si.count = ((dictionaryItem)value2).count;
355  
                              si.distance = distance;
356  
                              suggestions.add(si);
357  
                          }
358  
                      }
359  
                  }
360  
              }//end foreach
361  
          }//end if         
362  
          
363  
          //add edits 
364  
          //derive edits (deletes) from candidate (input) and add them to candidates list
365  
          //this is a recursive process until the maximum edit distance has been reached
366  
          if (input.length() - candidate.length() < editDistanceMax)
367  
          {
368  
              //save some time
369  
              //do not create edits with edit distance smaller than suggestions already found
370  
              if ((verbose < 2) && (suggestions.size() > 0) && (input.length() - candidate.length() >= suggestions.get(0).distance)) continue;
371  
372  
              for (int i = 0; i < candidate.length(); i++)
373  
              {
374  
                  String delete = candidate.substring(0, i)+candidate.substring(i+1);
375  
                  if (hashset1.add(delete)) candidates.add(delete);
376  
              }
377  
          }
378  
        } //end lable nosort
379  
    } //end while
380  
    
381  
    //sort by ascending edit distance, then by descending word frequency
382  
    if (verbose < 2) 
383  
    	//suggestions.Sort((x, y) => -x.count.CompareTo(y.count));
384  
    	Collections.sort(suggestions, new Comparator<suggestItem>()
385  
                {
386  
            public int compare(suggestItem f1, suggestItem f2)
387  
            {
388  
                return -(f1.count-f2.count);
389  
            }        
390  
        });
391  
    else 
392  
    	//suggestions.Sort((x, y) => 2*x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
393  
    	Collections.sort(suggestions, new Comparator<suggestItem>()
394  
                {
395  
            public int compare(suggestItem x, suggestItem y)
396  
            {
397  
                return ((2*x.distance-y.distance)>0?1:0) - ((x.count - y.count)>0?1:0);
398  
            }        
399  
        });
400  
    if ((verbose == 0)&&(suggestions.size()>1)) 
401  
    	return suggestions.subList(0, 1); 
402  
    else return suggestions;
403  
}
404  
405  
private static void Correct(String input, String language)
406  
{
407  
    List<suggestItem> suggestions = null;
408  
409  
    /*
410  
    //Benchmark: 1000 x Lookup
411  
    Stopwatch stopWatch = new Stopwatch();
412  
    stopWatch.Start();
413  
    for (int i = 0; i < 1000; i++)
414  
    {
415  
        suggestions = Lookup(input,language,editDistanceMax);
416  
    }
417  
    stopWatch.Stop();
418  
    Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString());
419  
    */
420  
    
421  
    
422  
    //check in dictionary for existence and frequency; sort by ascending edit distance, then by descending word frequency
423  
    suggestions = Lookup(input, language, editDistanceMax);
424  
425  
    //display term and frequency
426  
    for (suggestItem suggestion: suggestions)
427  
    {
428  
        System.out.println( suggestion.term + " " + suggestion.distance + " " + suggestion.count);
429  
    }
430  
    if (verbose !=0) System.out.println(suggestions.size() + " suggestions");
431  
    System.out.println("done");
432  
}
433  
434  
private static void ReadFromStdIn()
435  
{
436  
    String word;
437  
    BufferedReader br =  new BufferedReader(new InputStreamReader(System.in));
438  
    try {
439  
	while ((word = br.readLine())!=null)
440  
	{
441  
	    Correct(word,"");
442  
	}
443  
} catch (IOException e) {
444  
	// TODO Auto-generated catch block
445  
	e.printStackTrace();
446  
}
447  
}
448  
449  
// Damerau–Levenshtein distance algorithm and code 
450  
// from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance (as retrieved in June 2012)
451  
public static int DamerauLevenshteinDistance(String a, String b) {
452  
	  final int inf = a.length() + b.length() + 1;
453  
  int[][] H = new int[a.length() + 2][b.length() + 2];
454  
  for (int i = 0; i <= a.length(); i++) {
455  
   H[i + 1][1] = i;
456  
   H[i + 1][0] = inf;
457  
  }
458  
  for (int j = 0; j <= b.length(); j++) {
459  
   H[1][j + 1] = j;
460  
   H[0][j + 1] = inf;
461  
  }
462  
  HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
463  
  for (int d = 0; d < a.length(); d++) 
464  
   if (!DA.containsKey(a.charAt(d)))
465  
    DA.put(a.charAt(d), 0);
466  
  
467  
   
468  
  for (int d = 0; d < b.length(); d++) 
469  
   if (!DA.containsKey(b.charAt(d)))
470  
    DA.put(b.charAt(d), 0);
471  
  
472  
  for (int i = 1; i <= a.length(); i++) {
473  
   int DB = 0;
474  
   for (int j = 1; j <= b.length(); j++) {
475  
    final int i1 = DA.get(b.charAt(j - 1));
476  
    final int j1 = DB;
477  
    int d = 1;
478  
    if (a.charAt(i - 1) == b.charAt(j - 1)) {
479  
     d = 0;
480  
     DB = j;
481  
    }
482  
    H[i + 1][j + 1] = min(
483  
      H[i][j] + d, 
484  
      H[i + 1][j] + 1,
485  
      H[i][j + 1] + 1, 
486  
      H[i1][j1] + ((i - i1 - 1)) 
487  
      + 1 + ((j - j1 - 1)));
488  
   }
489  
   DA.put(a.charAt(i - 1), i);
490  
  }
491  
  return H[a.length() + 1][b.length() + 1];
492  
 }
493  
 
494  
static int min(int a, int b, int c, int d) {
495  
  return Math.min(a, Math.min(b, Math.min(c, d)));
496  
}
497  
498  
static int min(int a, int b) { ret Math.min(a, b); }
499  
static long min(long a, long b) { ret Math.min(a, b); }
500  
static float min(float a, float b) { ret Math.min(a, b); }

download  show line numbers  debug dex  old transpilations   

Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt

No comments. add comment

Snippet ID: #1012161
Snippet name: SymSpell Spike (super-fast spelling correction algorithm) [OK]
Eternal ID of this version: #1012161/7
Text MD5: 19b2a088f0cf46b1fae9fa1a414b3ebf
Transpilation MD5: 1ce39b60bdb1b3a25225ee10b95bbba9
Author: stefan
Category: javax / nlp
Type: JavaX source code (desktop)
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-11-23 08:32:36
Source code size: 21115 bytes / 500 lines
Pitched / IR pitched: No / No
Views / Downloads: 651 / 1121
Version history: 6 change(s)
Referenced in: [show references]