Download Jar. Libraryless. Click here for Pure Java version (16773L/120K).
1 | !7 |
2 | |
3 | // https://towardsdatascience.com/symspell-vs-bk-tree-100x-faster-fuzzy-string-search-spell-checking-c4f10d80a078 |
4 | |
5 | // SymSpell: 1 million times faster through Symmetric Delete spelling correction algorithm |
6 | // |
7 | // The Symmetric Delete spelling correction algorithm reduces the complexity of edit candidate generation and dictionary lookup |
8 | // for a given Damerau-Levenshtein distance. It is six orders of magnitude faster and language independent. |
9 | // Opposite to other algorithms only deletes are required, no transposes + replaces + inserts. |
10 | // Transposes + replaces + inserts of the input term are transformed into deletes of the dictionary term. |
11 | // Replaces and inserts are expensive and language dependent: e.g. Chinese has 70,000 Unicode Han characters! |
12 | // |
13 | // Copyright (C) 2015 Wolf Garbe |
14 | // Version: 3.0 |
15 | // Author: Wolf Garbe <wolf.garbe@faroo.com> |
16 | // Maintainer: Wolf Garbe <wolf.garbe@faroo.com> |
17 | // URL: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/ |
18 | // Description: http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/ |
19 | // |
20 | // License: |
21 | // This program is free software; you can redistribute it and/or modify |
22 | // it under the terms of the GNU Lesser General Public License, |
23 | // version 3.0 (LGPL-3.0) as published by the Free Software Foundation. |
24 | // http://www.opensource.org/licenses/LGPL-3.0 |
25 | // |
26 | // Usage: single word + Enter: Display spelling suggestions |
27 | // Enter without input: Terminate the program |
28 | |
29 | private static int editDistanceMax=2; |
30 | private static int verbose = 0; |
31 | //0: top suggestion |
32 | //1: all suggestions of smallest edit distance |
33 | //2: all suggestions <= editDistanceMax (slower, no early termination) |
34 | |
35 | private static class dictionaryItem |
36 | { |
37 | public List<Integer> suggestions = new ArrayList<Integer>(); |
38 | public int count = 0; |
39 | } |
40 | |
41 | private static class suggestItem |
42 | { |
43 | public String term = ""; |
44 | public int distance = 0; |
45 | public int count = 0; |
46 | |
47 | @Override |
48 | public boolean equals(Object obj) |
49 | { |
50 | return term.equals(((suggestItem)obj).term); |
51 | } |
52 | |
53 | @Override |
54 | public int hashCode() |
55 | { |
56 | return term.hashCode(); |
57 | } |
58 | } |
59 | |
60 | p { |
61 | //Create the dictionary from a sample corpus |
62 | //e.g. http://norvig.com/big.txt , or any other large text corpus |
63 | //The dictionary may contain vocabulary from different languages. |
64 | //If you use mixed vocabulary use the language parameter in Correct() and CreateDictionary() accordingly. |
65 | //You may use CreateDictionaryEntry() to update a (self learning) dictionary incrementally |
66 | //To extend spelling correction beyond single words to phrases (e.g. correcting "unitedkingom" to "united kingdom") simply add those phrases with CreateDictionaryEntry(). |
67 | File corpus = prepareProgramFile("big.txt"); |
68 | gunzipFile(loadLibrary(#1012160), corpus); |
69 | CreateDictionary(f2s(corpus), ""); |
70 | ReadFromStdIn(); |
71 | } |
72 | |
73 | //Dictionary that contains both the original words and the deletes derived from them. A term might be both word and delete from another word at the same time. |
74 | //For space reduction a item might be either of type dictionaryItem or Int. |
75 | //A dictionaryItem is used for word, word/delete, and delete with multiple suggestions. Int is used for deletes with a single suggestion (the majority of entries). |
76 | private static HashMap<String, Object> dictionary = new HashMap<String, Object>(); //initialisierung |
77 | |
78 | //List of unique words. By using the suggestions (Int) as index for this list they are translated into the original String. |
79 | private static List<String> wordlist = new ArrayList<String>(); |
80 | |
81 | //create a non-unique wordlist from sample text |
82 | //language independent (e.g. works with Chinese characters) |
83 | private static Iterable<String> parseWords(String text) |
84 | { |
85 | // \w Alphanumeric characters (including non-latin characters, umlaut characters and digits) plus "_" |
86 | // \d Digits |
87 | // Provides identical results to Norvigs regex "[a-z]+" for latin characters, while additionally providing compatibility with non-latin characters |
88 | List<String> allMatches = new ArrayList<String>(); |
89 | Matcher m = Pattern.compile("[\\w-[\\d_]]+").matcher(text.toLowerCase()); |
90 | while (m.find()) { |
91 | allMatches.add(m.group()); |
92 | } |
93 | return allMatches; |
94 | } |
95 | |
96 | public static int maxlength = 0;//maximum dictionary term length |
97 | |
98 | //for every word there all deletes with an edit distance of 1..editDistanceMax created and added to the dictionary |
99 | //every delete entry has a suggestions list, which points to the original term(s) it was created from |
100 | //The dictionary may be dynamically updated (word frequency and new words) at any time by calling createDictionaryEntry |
101 | private static boolean CreateDictionaryEntry(String key, String language) |
102 | { |
103 | boolean result = false; |
104 | dictionaryItem value=null; |
105 | Object valueo; |
106 | valueo = dictionary.get(language+key); |
107 | if (valueo!=null) |
108 | { |
109 | //int or dictionaryItem? delete existed before word! |
110 | if (valueo instanceof Integer) { |
111 | int tmp = (int)valueo; |
112 | value = new dictionaryItem(); |
113 | value.suggestions.add(tmp); |
114 | dictionary.put(language + key,value); |
115 | } |
116 | |
117 | //already exists: |
118 | //1. word appears several times |
119 | //2. word1==deletes(word2) |
120 | else |
121 | { |
122 | value = (dictionaryItem)valueo; |
123 | } |
124 | |
125 | //prevent overflow |
126 | if (value.count < Integer.MAX_VALUE) value.count++; |
127 | } |
128 | else if (wordlist.size() < Integer.MAX_VALUE) |
129 | { |
130 | value = new dictionaryItem(); |
131 | value.count++; |
132 | dictionary.put(language + key, value); |
133 | |
134 | if (key.length() > maxlength) maxlength = key.length(); |
135 | } |
136 | |
137 | //edits/suggestions are created only once, no matter how often word occurs |
138 | //edits/suggestions are created only as soon as the word occurs in the corpus, |
139 | //even if the same term existed before in the dictionary as an edit from another word |
140 | //a treshold might be specifid, when a term occurs so frequently in the corpus that it is considered a valid word for spelling correction |
141 | if(value.count == 1) |
142 | { |
143 | //word2index |
144 | wordlist.add(key); |
145 | int keyint = (int)(wordlist.size() - 1); |
146 | |
147 | result = true; |
148 | |
149 | //create deletes |
150 | for (String delete : Edits(key, 0, new HashSet<String>())) |
151 | { |
152 | Object value2; |
153 | value2 = dictionary.get(language+delete); |
154 | if (value2!=null) |
155 | { |
156 | //already exists: |
157 | //1. word1==deletes(word2) |
158 | //2. deletes(word1)==deletes(word2) |
159 | //int or dictionaryItem? single delete existed before! |
160 | if (value2 instanceof Integer) |
161 | { |
162 | //transformes int to dictionaryItem |
163 | int tmp = (int)value2; |
164 | dictionaryItem di = new dictionaryItem(); |
165 | di.suggestions.add(tmp); |
166 | dictionary.put(language + delete,di); |
167 | if (!di.suggestions.contains(keyint)) AddLowestDistance(di, key, keyint, delete); |
168 | } |
169 | else if (!((dictionaryItem)value2).suggestions.contains(keyint)) AddLowestDistance((dictionaryItem) value2, key, keyint, delete); |
170 | } |
171 | else |
172 | { |
173 | dictionary.put(language + delete, keyint); |
174 | } |
175 | |
176 | } |
177 | } |
178 | return result; |
179 | } |
180 | |
181 | //create a frequency dictionary from a corpus |
182 | private static void CreateDictionary(String corpus, String language) |
183 | { |
184 | File f = new File(corpus); |
185 | if(!(f.exists() && !f.isDirectory())) |
186 | { |
187 | System.out.println("File not found: " + corpus); |
188 | return; |
189 | } |
190 | |
191 | System.out.println("Creating dictionary ..."); |
192 | long startTime = System.currentTimeMillis(); |
193 | long wordCount = 0; |
194 | |
195 | BufferedReader br = null; |
196 | try { |
197 | br = new BufferedReader(new FileReader(corpus)); |
198 | String line; |
199 | while ((line = br.readLine()) != null) |
200 | { |
201 | for (String key : parseWords(line)) |
202 | { |
203 | if (CreateDictionaryEntry(key, language)) wordCount++; |
204 | } |
205 | } |
206 | } |
207 | catch (Exception e) { |
208 | // TODO Auto-generated catch block |
209 | e.printStackTrace(); |
210 | } |
211 | //wordlist.TrimExcess(); |
212 | long endTime = System.currentTimeMillis(); |
213 | System.out.println("\rDictionary: " + wordCount + " words, " + dictionary.size() + " entries, edit distance=" + editDistanceMax + " in " + (endTime-startTime)+"ms "); |
214 | } |
215 | |
216 | //save some time and space |
217 | private static void AddLowestDistance(dictionaryItem item, String suggestion, int suggestionint, String delete) |
218 | { |
219 | //remove all existing suggestions of higher distance, if verbose<2 |
220 | //index2word |
221 | //TODO check |
222 | if ((verbose < 2) && (item.suggestions.size() > 0) && (wordlist.get(item.suggestions.get(0)).length()-delete.length() > suggestion.length() - delete.length())) item.suggestions.clear(); |
223 | //do not add suggestion of higher distance than existing, if verbose<2 |
224 | if ((verbose == 2) || (item.suggestions.size() == 0) || (wordlist.get(item.suggestions.get(0)).length()-delete.length() >= suggestion.length() - delete.length())) item.suggestions.add(suggestionint); |
225 | } |
226 | |
227 | //inexpensive and language independent: only deletes, no transposes + replaces + inserts |
228 | //replaces and inserts are expensive and language dependent (Chinese has 70,000 Unicode Han characters) |
229 | private static HashSet<String> Edits(String word, int editDistance, HashSet<String> deletes) |
230 | { |
231 | editDistance++; |
232 | if (word.length() > 1) |
233 | { |
234 | for (int i = 0; i < word.length(); i++) |
235 | { |
236 | //delete ith character |
237 | String delete = word.substring(0,i)+word.substring(i+1); |
238 | if (deletes.add(delete)) |
239 | { |
240 | //recursion, if maximum edit distance not yet reached |
241 | if (editDistance < editDistanceMax) Edits(delete, editDistance, deletes); |
242 | } |
243 | } |
244 | } |
245 | return deletes; |
246 | } |
247 | |
248 | private static List<suggestItem> Lookup(String input, String language, int editDistanceMax) |
249 | { |
250 | //save some time |
251 | if (input.length() - editDistanceMax > maxlength) |
252 | return new ArrayList<suggestItem>(); |
253 | |
254 | List<String> candidates = new ArrayList<String>(); |
255 | HashSet<String> hashset1 = new HashSet<String>(); |
256 | |
257 | List<suggestItem> suggestions = new ArrayList<suggestItem>(); |
258 | HashSet<String> hashset2 = new HashSet<String>(); |
259 | |
260 | Object valueo; |
261 | |
262 | //add original term |
263 | candidates.add(input); |
264 | |
265 | while (candidates.size()>0) |
266 | { |
267 | String candidate = candidates.get(0); |
268 | candidates.remove(0); |
269 | |
270 | //save some time |
271 | //early termination |
272 | //suggestion distance=candidate.distance... candidate.distance+editDistanceMax |
273 | //if canddate distance is already higher than suggestion distance, than there are no better suggestions to be expected |
274 | |
275 | //label for c# goto replacement |
276 | nosort:{ |
277 | |
278 | if ((verbose < 2) && (suggestions.size() > 0) && (input.length()-candidate.length() > suggestions.get(0).distance)) |
279 | break nosort; |
280 | |
281 | //read candidate entry from dictionary |
282 | valueo = dictionary.get(language + candidate); |
283 | if (valueo != null) |
284 | { |
285 | dictionaryItem value= new dictionaryItem(); |
286 | if (valueo instanceof Integer) |
287 | value.suggestions.add((int)valueo); |
288 | else value = (dictionaryItem)valueo; |
289 | |
290 | //if count>0 then candidate entry is correct dictionary term, not only delete item |
291 | if ((value.count > 0) && hashset2.add(candidate)) |
292 | { |
293 | //add correct dictionary term term to suggestion list |
294 | suggestItem si = new suggestItem(); |
295 | si.term = candidate; |
296 | si.count = value.count; |
297 | si.distance = input.length() - candidate.length(); |
298 | suggestions.add(si); |
299 | //early termination |
300 | if ((verbose < 2) && (input.length() - candidate.length() == 0)) |
301 | break nosort; |
302 | } |
303 | |
304 | //iterate through suggestions (to other correct dictionary items) of delete item and add them to suggestion list |
305 | Object value2; |
306 | for (int suggestionint : value.suggestions) |
307 | { |
308 | //save some time |
309 | //skipping double items early: different deletes of the input term can lead to the same suggestion |
310 | //index2word |
311 | //TODO |
312 | String suggestion = wordlist.get(suggestionint); |
313 | if (hashset2.add(suggestion)) |
314 | { |
315 | //True Damerau-Levenshtein Edit Distance: adjust distance, if both distances>0 |
316 | //We allow simultaneous edits (deletes) of editDistanceMax on on both the dictionary and the input term. |
317 | //For replaces and adjacent transposes the resulting edit distance stays <= editDistanceMax. |
318 | //For inserts and deletes the resulting edit distance might exceed editDistanceMax. |
319 | //To prevent suggestions of a higher edit distance, we need to calculate the resulting edit distance, if there are simultaneous edits on both sides. |
320 | //Example: (bank==bnak and bank==bink, but bank!=kanb and bank!=xban and bank!=baxn for editDistanceMaxe=1) |
321 | //Two deletes on each side of a pair makes them all equal, but the first two pairs have edit distance=1, the others edit distance=2. |
322 | int distance = 0; |
323 | if (suggestion != input) |
324 | { |
325 | if (suggestion.length() == candidate.length()) distance = input.length() - candidate.length(); |
326 | else if (input.length() == candidate.length()) distance = suggestion.length() - candidate.length(); |
327 | else |
328 | { |
329 | //common prefixes and suffixes are ignored, because this speeds up the Damerau-levenshtein-Distance calculation without changing it. |
330 | int ii = 0; |
331 | int jj = 0; |
332 | while ((ii < suggestion.length()) && (ii < input.length()) && (suggestion.charAt(ii) == input.charAt(ii))) ii++; |
333 | while ((jj < suggestion.length() - ii) && (jj < input.length() - ii) && (suggestion.charAt(suggestion.length() - jj - 1) == input.charAt(input.length() - jj - 1))) jj++; |
334 | if ((ii > 0) || (jj > 0)) { |
335 | distance = DamerauLevenshteinDistance(suggestion.substring(ii, suggestion.length() - jj), input.substring(ii, input.length() - jj)); |
336 | } |
337 | else distance = DamerauLevenshteinDistance(suggestion, input); |
338 | } |
339 | } |
340 | |
341 | //save some time. |
342 | //remove all existing suggestions of higher distance, if verbose<2 |
343 | if ((verbose < 2) && (suggestions.size() > 0) && (suggestions.get(0).distance > distance)) suggestions.clear(); |
344 | //do not process higher distances than those already found, if verbose<2 |
345 | if ((verbose < 2) && (suggestions.size() > 0) && (distance > suggestions.get(0).distance)) continue; |
346 | |
347 | if (distance <= editDistanceMax) |
348 | { |
349 | value2 = dictionary.get(language + suggestion); |
350 | if (value2!=null) |
351 | { |
352 | suggestItem si = new suggestItem(); |
353 | si.term = suggestion; |
354 | si.count = ((dictionaryItem)value2).count; |
355 | si.distance = distance; |
356 | suggestions.add(si); |
357 | } |
358 | } |
359 | } |
360 | }//end foreach |
361 | }//end if |
362 | |
363 | //add edits |
364 | //derive edits (deletes) from candidate (input) and add them to candidates list |
365 | //this is a recursive process until the maximum edit distance has been reached |
366 | if (input.length() - candidate.length() < editDistanceMax) |
367 | { |
368 | //save some time |
369 | //do not create edits with edit distance smaller than suggestions already found |
370 | if ((verbose < 2) && (suggestions.size() > 0) && (input.length() - candidate.length() >= suggestions.get(0).distance)) continue; |
371 | |
372 | for (int i = 0; i < candidate.length(); i++) |
373 | { |
374 | String delete = candidate.substring(0, i)+candidate.substring(i+1); |
375 | if (hashset1.add(delete)) candidates.add(delete); |
376 | } |
377 | } |
378 | } //end lable nosort |
379 | } //end while |
380 | |
381 | //sort by ascending edit distance, then by descending word frequency |
382 | if (verbose < 2) |
383 | //suggestions.Sort((x, y) => -x.count.CompareTo(y.count)); |
384 | Collections.sort(suggestions, new Comparator<suggestItem>() |
385 | { |
386 | public int compare(suggestItem f1, suggestItem f2) |
387 | { |
388 | return -(f1.count-f2.count); |
389 | } |
390 | }); |
391 | else |
392 | //suggestions.Sort((x, y) => 2*x.distance.CompareTo(y.distance) - x.count.CompareTo(y.count)); |
393 | Collections.sort(suggestions, new Comparator<suggestItem>() |
394 | { |
395 | public int compare(suggestItem x, suggestItem y) |
396 | { |
397 | return ((2*x.distance-y.distance)>0?1:0) - ((x.count - y.count)>0?1:0); |
398 | } |
399 | }); |
400 | if ((verbose == 0)&&(suggestions.size()>1)) |
401 | return suggestions.subList(0, 1); |
402 | else return suggestions; |
403 | } |
404 | |
405 | private static void Correct(String input, String language) |
406 | { |
407 | List<suggestItem> suggestions = null; |
408 | |
409 | /* |
410 | //Benchmark: 1000 x Lookup |
411 | Stopwatch stopWatch = new Stopwatch(); |
412 | stopWatch.Start(); |
413 | for (int i = 0; i < 1000; i++) |
414 | { |
415 | suggestions = Lookup(input,language,editDistanceMax); |
416 | } |
417 | stopWatch.Stop(); |
418 | Console.WriteLine(stopWatch.ElapsedMilliseconds.ToString()); |
419 | */ |
420 | |
421 | |
422 | //check in dictionary for existence and frequency; sort by ascending edit distance, then by descending word frequency |
423 | suggestions = Lookup(input, language, editDistanceMax); |
424 | |
425 | //display term and frequency |
426 | for (suggestItem suggestion: suggestions) |
427 | { |
428 | System.out.println( suggestion.term + " " + suggestion.distance + " " + suggestion.count); |
429 | } |
430 | if (verbose !=0) System.out.println(suggestions.size() + " suggestions"); |
431 | System.out.println("done"); |
432 | } |
433 | |
434 | private static void ReadFromStdIn() |
435 | { |
436 | String word; |
437 | BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); |
438 | try { |
439 | while ((word = br.readLine())!=null) |
440 | { |
441 | Correct(word,""); |
442 | } |
443 | } catch (IOException e) { |
444 | // TODO Auto-generated catch block |
445 | e.printStackTrace(); |
446 | } |
447 | } |
448 | |
449 | // Damerau–Levenshtein distance algorithm and code |
450 | // from http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance (as retrieved in June 2012) |
451 | public static int DamerauLevenshteinDistance(String a, String b) { |
452 | final int inf = a.length() + b.length() + 1; |
453 | int[][] H = new int[a.length() + 2][b.length() + 2]; |
454 | for (int i = 0; i <= a.length(); i++) { |
455 | H[i + 1][1] = i; |
456 | H[i + 1][0] = inf; |
457 | } |
458 | for (int j = 0; j <= b.length(); j++) { |
459 | H[1][j + 1] = j; |
460 | H[0][j + 1] = inf; |
461 | } |
462 | HashMap<Character, Integer> DA = new HashMap<Character, Integer>(); |
463 | for (int d = 0; d < a.length(); d++) |
464 | if (!DA.containsKey(a.charAt(d))) |
465 | DA.put(a.charAt(d), 0); |
466 | |
467 | |
468 | for (int d = 0; d < b.length(); d++) |
469 | if (!DA.containsKey(b.charAt(d))) |
470 | DA.put(b.charAt(d), 0); |
471 | |
472 | for (int i = 1; i <= a.length(); i++) { |
473 | int DB = 0; |
474 | for (int j = 1; j <= b.length(); j++) { |
475 | final int i1 = DA.get(b.charAt(j - 1)); |
476 | final int j1 = DB; |
477 | int d = 1; |
478 | if (a.charAt(i - 1) == b.charAt(j - 1)) { |
479 | d = 0; |
480 | DB = j; |
481 | } |
482 | H[i + 1][j + 1] = min( |
483 | H[i][j] + d, |
484 | H[i + 1][j] + 1, |
485 | H[i][j + 1] + 1, |
486 | H[i1][j1] + ((i - i1 - 1)) |
487 | + 1 + ((j - j1 - 1))); |
488 | } |
489 | DA.put(a.charAt(i - 1), i); |
490 | } |
491 | return H[a.length() + 1][b.length() + 1]; |
492 | } |
493 | |
494 | static int min(int a, int b, int c, int d) { |
495 | return Math.min(a, Math.min(b, Math.min(c, d))); |
496 | } |
497 | |
498 | static int min(int a, int b) { ret Math.min(a, b); } |
499 | static long min(long a, long b) { ret Math.min(a, b); } |
500 | static float min(float a, float b) { ret Math.min(a, b); } |
download show line numbers debug dex old transpilations
Travelled to 13 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
Snippet ID: | #1012161 |
Snippet name: | SymSpell Spike (super-fast spelling correction algorithm) [OK] |
Eternal ID of this version: | #1012161/7 |
Text MD5: | 19b2a088f0cf46b1fae9fa1a414b3ebf |
Transpilation MD5: | 1ce39b60bdb1b3a25225ee10b95bbba9 |
Author: | stefan |
Category: | javax / nlp |
Type: | JavaX source code (desktop) |
Public (visible to everyone): | Yes |
Archived (hidden from active list): | No |
Created/modified: | 2017-11-23 08:32:36 |
Source code size: | 21115 bytes / 500 lines |
Pitched / IR pitched: | No / No |
Views / Downloads: | 770 / 1399 |
Version history: | 6 change(s) |
Referenced in: | [show references] |