SymSpell Spike (super-fast spelling correction algorithm) [OK] [1012161]

!7

// https://towardsdatascience.com/symspell-vs-bk-tree-100x-faster-fuzzy-string-search-spell-checking-c4f10d80a078

// SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm
//
// The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup 
// for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent.
// Opposite to other algorithms only deletes are required, no transposes + replaces + inserts.
// Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term.
// Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters!
//
// Copyright (C) 2015 Wolf Garbe
// Version: 3.0
// Author: Wolf Garbe <wolf.garbe@faroo.com>
// Maintainer: Wolf Garbe <wolf.garbe@faroo.com>
// URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
// Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/
//
// License:
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License, 
// version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
// http://www.opensource.org/licenses/LGPL-3.0
//
// Usage: single word + Enter:  Display spelling suggestions
//        Enter without input:  Terminate the program

private static int editDistanceMax=2;
private static int verbose = 0;
//0: top suggestion
//1: all suggestions of smallest edit distance 
//2: all suggestions <= editDistanceMax (slower, no early termination)

private static class dictionaryItem
{
    public List<Integer> suggestions = new ArrayList<Integer>();
    public int count = 0;
}

private static class suggestItem
{
    public String term = "";
    public int distance = 0;
    public int count = 0;

    @Override
    public boolean equals(Object obj)
    {
        return term.equals(((suggestItem)obj).term);
    }
    
    @Override
    public int hashCode()
    {
        return term.hashCode();
    }       
}

p {
  //Create the dictionary from a sample corpus
  //e.g. http://norvig.com/big.txt , or any other large text corpus
  //The dictionary may contain vocabulary from different languages. 
  //If you use mixed vocabulary use the language parameter in Correct() and CreateDictionary() accordingly.
  //You may use CreateDictionaryEntry() to update a (self learning) dictionary incrementally
  //To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry().
  File corpus = prepareProgramFile("big.txt");
  gunzipFile(loadLibrary(#1012160), corpus);
  CreateDictionary(f2s(corpus), "");
  ReadFromStdIn();
}

//Dictionary that contains both the original words and the deletes derived from them. A term might be both word and delete from another word at the same time.
//For space reduction a item might be either of type dictionaryItem or Int. 
//A dictionaryItem is used for word, word/delete, and delete with multiple suggestions. Int is used for deletes with a single suggestion (the majority of entries).
private static HashMap<String, Object> dictionary = new HashMap<String, Object>(); //initialisierung

//List of unique words. By using the suggestions (Int) as index for this list they are translated into the original String.
private static List<String> wordlist = new ArrayList<String>();

//create a non-unique wordlist from sample text
//language independent (e.g. works with Chinese characters)
private static Iterable<String> parseWords(String text)
{
    // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_" 
    // \d Digits
    // Provides identical results to Norvigs regex "[a-z]+" for latin characters, while additionally providing compatibility with non-latin characters
	List<String> allMatches = new ArrayList<String>();
	Matcher m = Pattern.compile("[\\w-[\\d_]]+").matcher(text.toLowerCase());
	while (m.find()) {
		allMatches.add(m.group());
	}
	return allMatches;
}

public static int maxlength = 0;//maximum dictionary term length

//for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary
//every delete entry has a suggestions list, which points to the original term(s) it was created from
//The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry
private static boolean CreateDictionaryEntry(String key, String language)
{
	boolean result = false;
    dictionaryItem value=null;
    Object valueo;
    valueo = dictionary.get(language+key);
    if (valueo!=null)
    {
        //int or dictionaryItem? delete existed before word!
        if (valueo instanceof Integer) { 
        	int tmp = (int)valueo; 
        	value = new dictionaryItem(); 
        	value.suggestions.add(tmp); 
        	dictionary.put(language + key,value); 
    	}

        //already exists:
        //1. word appears several times
        //2. word1==deletes(word2) 
        else
        {
            value = (dictionaryItem)valueo;
        }

        //prevent overflow
        if (value.count < Integer.MAX_VALUE) value.count++;
    }
    else if (wordlist.size() < Integer.MAX_VALUE)
    {
        value = new dictionaryItem();
        value.count++;
        dictionary.put(language + key, value);

        if (key.length() > maxlength) maxlength = key.length();
    }
    
    //edits/suggestions are created only once, no matter how often word occurs
    //edits/suggestions are created only as soon as the word occurs in the corpus, 
    //even if the same term existed before in the dictionary as an edit from another word
    //a treshold might be specifid, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction
    if(value.count == 1)
    {
        //word2index
        wordlist.add(key);
        int keyint = (int)(wordlist.size() - 1);

        result = true;

        //create deletes
        for (String delete : Edits(key, 0, new HashSet<String>()))
        {
            Object value2;
            value2 = dictionary.get(language+delete);
            if (value2!=null)
            {
                //already exists:
                //1. word1==deletes(word2) 
                //2. deletes(word1)==deletes(word2) 
                //int or dictionaryItem? single delete existed before!
                if (value2 instanceof Integer) 
                {
                    //transformes int to dictionaryItem
                    int tmp = (int)value2; 
                    dictionaryItem di = new dictionaryItem(); 
                    di.suggestions.add(tmp); 
                    dictionary.put(language + delete,di);
                    if (!di.suggestions.contains(keyint)) AddLowestDistance(di, key, keyint, delete);
                }
                else if (!((dictionaryItem)value2).suggestions.contains(keyint)) AddLowestDistance((dictionaryItem) value2, key, keyint, delete);
            }
            else
            {
                dictionary.put(language + delete, keyint);         
            }

        }
    }
    return result;
}

//create a frequency dictionary from a corpus
private static void CreateDictionary(String corpus, String language)
{
	File f = new File(corpus);
    if(!(f.exists() && !f.isDirectory()))
    {
        System.out.println("File not found: " + corpus);
        return;
    }

    System.out.println("Creating dictionary ...");
    long startTime = System.currentTimeMillis();
    long wordCount = 0;
    
    BufferedReader br = null;
    try {
	br = new BufferedReader(new FileReader(corpus));
      String line;
      while ((line = br.readLine()) != null) 
      {
          for (String key : parseWords(line))
          {
             if (CreateDictionaryEntry(key, language)) wordCount++;
          }
      }
    }
    catch (Exception e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
    //wordlist.TrimExcess();
    long endTime = System.currentTimeMillis();
    System.out.println("\rDictionary: " + wordCount + " words, " + dictionary.size() + " entries, edit distance=" + editDistanceMax + " in " + (endTime-startTime)+"ms ");
}

//save some time and space
private static void AddLowestDistance(dictionaryItem item, String suggestion, int suggestionint, String delete)
{
    //remove all existing suggestions of higher distance, if verbose<2
    //index2word
	//TODO check
    if ((verbose < 2) && (item.suggestions.size() > 0) && (wordlist.get(item.suggestions.get(0)).length()-delete.length() > suggestion.length() - delete.length())) item.suggestions.clear();
    //do not add suggestion of higher distance than existing, if verbose<2
    if ((verbose == 2) || (item.suggestions.size() == 0) || (wordlist.get(item.suggestions.get(0)).length()-delete.length() >= suggestion.length() - delete.length())) item.suggestions.add(suggestionint); 
}

//inexpensive and language independent: only deletes, no transposes + replaces + inserts
//replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters)
private static HashSet<String> Edits(String word, int editDistance, HashSet<String> deletes)
{
    editDistance++;
    if (word.length() > 1)
    {
        for (int i = 0; i < word.length(); i++)
        {
        	//delete ith character
            String delete =  word.substring(0,i)+word.substring(i+1);
            if (deletes.add(delete))
            {
                //recursion, if maximum edit distance not yet reached
                if (editDistance < editDistanceMax) Edits(delete, editDistance, deletes);
            }
        }
    }
    return deletes;
}

private static List<suggestItem> Lookup(String input, String language, int editDistanceMax)
{
    //save some time
    if (input.length() - editDistanceMax > maxlength) 
    	return new ArrayList<suggestItem>();

    List<String> candidates = new ArrayList<String>();
    HashSet<String> hashset1 = new HashSet<String>();

    List<suggestItem> suggestions = new ArrayList<suggestItem>();
    HashSet<String> hashset2 = new HashSet<String>();

    Object valueo;

    //add original term
    candidates.add(input);

    while (candidates.size()>0)
    {
        String candidate = candidates.get(0);
        candidates.remove(0);

        //save some time
        //early termination
        //suggestion distance=candidate.distance... candidate.distance+editDistanceMax                
        //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected
        
        //label for c# goto replacement
        nosort:{
        
        	if ((verbose < 2) && (suggestions.size() > 0) && (input.length()-candidate.length() > suggestions.get(0).distance)) 
        		break nosort;

          //read candidate entry from dictionary
        	valueo = dictionary.get(language + candidate);
          if (valueo != null)
          {
              dictionaryItem value= new dictionaryItem();
              if (valueo instanceof Integer) 
              	value.suggestions.add((int)valueo);
              else value = (dictionaryItem)valueo;

              //if count>0 then candidate entry is correct dictionary term, not only delete item
              if ((value.count > 0) && hashset2.add(candidate))
              {
                  //add correct dictionary term term to suggestion list
                  suggestItem si = new suggestItem();
                  si.term = candidate;
                  si.count = value.count;
                  si.distance = input.length() - candidate.length();
                  suggestions.add(si);
                  //early termination
                  if ((verbose < 2) && (input.length() - candidate.length() == 0)) 
                  	break nosort;
              }

              //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list
              Object value2;
              for (int suggestionint : value.suggestions)
              {
                  //save some time 
                  //skipping double items early: different deletes of the input term can lead to the same suggestion
                  //index2word
              	//TODO
              	String suggestion = wordlist.get(suggestionint);
                  if (hashset2.add(suggestion))
                  {
                      //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0
                      //We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term. 
                      //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax.
                      //For inserts and deletes the resulting edit distance might exceed editDistanceMax.
                      //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides.
                      //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1)
                      //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2.
                      int distance = 0;
                      if (suggestion != input)
                      {
                          if (suggestion.length() == candidate.length()) distance = input.length() - candidate.length();
                          else if (input.length() == candidate.length()) distance = suggestion.length() - candidate.length();
                          else
                          {
                              //common prefixes and suffixes are ignored, because this speeds up the Damerau-levenshtein-Distance calculation without changing it.
                              int ii = 0;
                              int jj = 0;
                              while ((ii < suggestion.length()) && (ii < input.length()) && (suggestion.charAt(ii) == input.charAt(ii))) ii++;
                              while ((jj < suggestion.length() - ii) && (jj < input.length() - ii) && (suggestion.charAt(suggestion.length() - jj - 1) == input.charAt(input.length() - jj - 1))) jj++;
                              if ((ii > 0) || (jj > 0)) { 
                              	distance = DamerauLevenshteinDistance(suggestion.substring(ii, suggestion.length() - jj), input.substring(ii, input.length() - jj)); 
                              } 
                              else distance = DamerauLevenshteinDistance(suggestion, input);
                          }
                      }

                      //save some time.
                      //remove all existing suggestions of higher distance, if verbose<2
                      if ((verbose < 2) && (suggestions.size() > 0) && (suggestions.get(0).distance > distance)) suggestions.clear();
                      //do not process higher distances than those already found, if verbose<2
                      if ((verbose < 2) && (suggestions.size() > 0) && (distance > suggestions.get(0).distance)) continue;

                      if (distance <= editDistanceMax)
                      {
                      	value2 = dictionary.get(language + suggestion);
                      	if (value2!=null)
                          {
                              suggestItem si = new suggestItem();
                              si.term = suggestion;
                              si.count = ((dictionaryItem)value2).count;
                              si.distance = distance;
                              suggestions.add(si);
                          }
                      }
                  }
              }//end foreach
          }//end if         
          
          //add edits 
          //derive edits (deletes) from candidate (input) and add them to candidates list
          //this is a recursive process until the maximum edit distance has been reached
          if (input.length() - candidate.length() < editDistanceMax)
          {
              //save some time
              //do not create edits with edit distance smaller than suggestions already found
              if ((verbose < 2) && (suggestions.size() > 0) && (input.length() - candidate.length() >= suggestions.get(0).distance)) continue;

              for (int i = 0; i < candidate.length(); i++)
              {
                  String delete = candidate.substring(0, i)+candidate.substring(i+1);
                  if (hashset1.add(delete)) candidates.add(delete);
              }
          }
        } //end lable nosort
    } //end while
    
    //sort by ascending edit distance, then by descending word frequency
    if (verbose < 2) 
    	//suggestions.Sort((x, y) => -x.count.CompareTo(y.count));
    	Collections.sort(suggestions, new Comparator<suggestItem>()
                {
            public int compare(suggestItem f1, suggestItem f2)
            {
                return -(f1.count-f2.count);
            }        
        });
    else 
    	//suggestions.Sort((x, y) => 2*x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count));
    	Collections.sort(suggestions, new Comparator<suggestItem>()
                {
            public int compare(suggestItem x, suggestItem y)
            {
                return ((2*x.distance-y.distance)>0?1:0) - ((x.count - y.count)>0?1:0);
            }        
        });
    if ((verbose == 0)&&(suggestions.size()>1)) 
    	return suggestions.subList(0, 1); 
    else return suggestions;
}

private static void Correct(String input, String language)
{
    List<suggestItem> suggestions = null;

    /*
    //Benchmark: 1000 x Lookup
    Stopwatch stopWatch = new Stopwatch();
    stopWatch.Start();
    for (int i = 0; i < 1000; i++)
    {
        suggestions = Lookup(input,language,editDistanceMax);
    }
    stopWatch.Stop();
    Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString());
    */
    
    
    //check in dictionary for existence and frequency; sort by ascending edit distance, then by descending word frequency
    suggestions = Lookup(input, language, editDistanceMax);

    //display term and frequency
    for (suggestItem suggestion: suggestions)
    {
        System.out.println( suggestion.term + " " + suggestion.distance + " " + suggestion.count);
    }
    if (verbose !=0) System.out.println(suggestions.size() + " suggestions");
    System.out.println("done");
}

private static void ReadFromStdIn()
{
    String word;
    BufferedReader br =  new BufferedReader(new InputStreamReader(System.in));
    try {
	while ((word = br.readLine())!=null)
	{
	    Correct(word,"");
	}
} catch (IOException e) {
	// TODO Auto-generated catch block
	e.printStackTrace();
}
}

// Damerau–Levenshtein distance algorithm and code 
// from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance (as retrieved in June 2012)
public static int DamerauLevenshteinDistance(String a, String b) {
	  final int inf = a.length() + b.length() + 1;
  int[][] H = new int[a.length() + 2][b.length() + 2];
  for (int i = 0; i <= a.length(); i++) {
   H[i + 1][1] = i;
   H[i + 1][0] = inf;
  }
  for (int j = 0; j <= b.length(); j++) {
   H[1][j + 1] = j;
   H[0][j + 1] = inf;
  }
  HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
  for (int d = 0; d < a.length(); d++) 
   if (!DA.containsKey(a.charAt(d)))
    DA.put(a.charAt(d), 0);
  
   
  for (int d = 0; d < b.length(); d++) 
   if (!DA.containsKey(b.charAt(d)))
    DA.put(b.charAt(d), 0);
  
  for (int i = 1; i <= a.length(); i++) {
   int DB = 0;
   for (int j = 1; j <= b.length(); j++) {
    final int i1 = DA.get(b.charAt(j - 1));
    final int j1 = DB;
    int d = 1;
    if (a.charAt(i - 1) == b.charAt(j - 1)) {
     d = 0;
     DB = j;
    }
    H[i + 1][j + 1] = min(
      H[i][j] + d, 
      H[i + 1][j] + 1,
      H[i][j + 1] + 1, 
      H[i1][j1] + ((i - i1 - 1)) 
      + 1 + ((j - j1 - 1)));
   }
   DA.put(a.charAt(i - 1), i);
  }
  return H[a.length() + 1][b.length() + 1];
 }
 
static int min(int a, int b, int c, int d) {
  return Math.min(a, Math.min(b, Math.min(c, d)));
}

static int min(int a, int b) { ret Math.min(a, b); }
static long min(long a, long b) { ret Math.min(a, b); }
static float min(float a, float b) { ret Math.min(a, b); }
Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
Snippet ID:	#1012161
Snippet name:	SymSpell Spike (super-fast spelling correction algorithm) [OK]
Eternal ID of this version:	#1012161/7
Text MD5:	19b2a088f0cf46b1fae9fa1a414b3ebf
Transpilation MD5:	1ce39b60bdb1b3a25225ee10b95bbba9
Author:	stefan
Category:	javax / nlp
Type:	JavaX source code (desktop)
Public (visible to everyone):	Yes
Archived (hidden from active list):	No
Created/modified:	2017-11-23 08:32:36
Source code size:	21115 bytes / 500 lines
Pitched / IR pitched:	No / No
Views / Downloads:	825 / 1551
Version history:	6 change(s)
Referenced in:
< > BotCompany Repo | #1012161 // SymSpell Spike (super-fast spelling correction algorithm) [OK]

JavaX source code (desktop) [tags: use-pretranspiled] - run with: x30.jar