sclass SimpleRecognizer { bool useCache1 = true, useCache2 = true; Lock lock = lock(); transient IF1 wordImagePreprocessor; // optional preprocessor for word images (e.g. auto-contast) class GlyphInfo { S meaning; bool multi; // multiple meanings seen toString { ret meaning; } } // key = md5 Map glyphInfos = synchroMap(); // optional for full similarity search - character image to MD5 Map fullSearchMap; S unknownCharacter = ocr_unknownCharacterPlaceholder(); // "\u2666" - diamond suit symbol; used for unknown characters *() {} void load(S info) { lock lock; recognizeGrouped_cache.clear(); new Matches m; for (S s : toLinesFullTrim(info)) { if (find3("the images * are the characters *", s, m)) { L md5s = splitAtSpace($1); L characters = eachCharAsString(dropSpaces($2)); saveMeanings(md5s, characters); } else if (find3("the images * are the grouped characters *", s, m)) { L md5s = splitAtSpace($1); L characters = ocr_parseGlyphs(dropSpaces($2)); saveMeanings(md5s, characters); } else if (nempty(javaTokC(s))) { print("huh? " + s); } } //print("Have " + n(l(glyphInfos), "glyph info")); //psl(glyphInfos); } void saveMeaning(S md5, S meaning) { GlyphInfo info = getGlyphInfo(md5); if (info.multi) ret; if (hasDifferent(info.meaning, meaning)) { //info.meaning = null; info.meaning = meaning; info.multi = true; //print("multi"); } else info.meaning = meaning; } // gets or creates GlyphInfo GlyphInfo getGlyphInfo(S md5) { synchronized(glyphInfos) { GlyphInfo info = glyphInfos.get(md5); if (info == null) glyphInfos.put(md5, info = new GlyphInfo); ret info; } } void saveMeanings(L md5s, L characters) { if (l(md5s) != l(characters)) { print("huh?"); ret; } for i over md5s: saveMeaning(md5s.get(i), characters.get(i)); } S recognize(BWImage img) { ret ocr_joinGroups(recognizeGrouped(img)); } Scored recognizeScored(BWImage img) { Scored> s = recognizeGrouped(img, null); ret scored(ocr_joinGroups(s!), s); } L recognizeGrouped(BWImage img) { ret getVar(recognizeGrouped(img, null)); } // md5 -> recognition result Map> recognizeGrouped_cache = synchroMap(); int cantCache, cacheHits, cacheMisses; Scored recognizeGrouped(BWImage img, L clips_out) { S md5 = null; if (clips_out != null || !useCache1) ++cantCache; else { img = callFOrKeep(wordImagePreprocessor, img); md5 = md5OfBWImage(img); Scored result = recognizeGrouped_cache.get(md5); if (result != null) { ++cacheHits; ret result; } else ++cacheMisses; } Scored result = recognizeGrouped_uncached(img, clips_out); if (md5 != null) recognizeGrouped_cache.put(md5, result); ret result; } Scored recognizeGrouped_uncached(BWImage img, L clips_out) { new LS buf; L rects = horizontalAutoSplit2ThenAutoCrop(img); if (empty(rects)) ret scored((L) emptyList(), 0.99); new L scores; iLoop: for (int i = 0; i < l(rects); i++) { Rect r = null; for (int j = i; j < l(rects); j++) { r = rectUnion(r, rects.get(j)); BWImage cImg = img.clip(r); Scored scored = recognizeGlyph(cImg, false); GlyphInfo info = getVar(scored); if (info != null && info.meaning != null) { buf.add(info.meaning); buf.addAll(rep("_", j-i)); if (clips_out != null) clips_out.addAll(rep(r, j-i+1)); scores.add(scored); i = j; continue iLoop; } } r = rects.get(i); Scored scored = recognizeGlyph(img.clip(r), true); GlyphInfo info = getVar(scored); if (info != null && info.meaning != null) buf.add(info.meaning); else buf.add(unknownCharacter); if (clips_out != null) clips_out.add(r); scores.add(scored); } ret scored(buf, averageScore(scores)); } // md5 -> recognition result Map> recognizeGlyph_cache = synchroMap(); static int cacheHits2, cacheMisses2; Scored recognizeGlyph(BWImage img) { ret recognizeGlyph(img, true); } Scored recognizeGlyph(BWImage img, bool fullSearch) { S md5 = md5OfBWImage(img); { //lock lock; GlyphInfo info = glyphInfos.get(md5); if (info != null || !fullSearch || fullSearchMap == null) ret fullScored(info); if (useCache2) { Scored result = recognizeGlyph_cache.get(md5); if (result != null) { ++cacheHits2; ret result; } cacheMisses2++; } } new Best best; for (BWImage cImg : /*concurrentlyIterateKeys*/keys(fullSearchMap)) { float sim = bwImageSimilarityResized(img, cImg, (float) best.bestScore()); best.put(fullSearchMap.get(cImg), sim); } Scored result = !best.has() ? null : scored(glyphInfos.get(best!), best.score()); if (useCache2) recognizeGlyph_cache.put(md5, result); ret result; } S cacheStats() { //ret "Cache size: " + l(recognizeGrouped_cache) + ", hits: " + cacheHits + ", misses: " + cacheMisses + ", uncachable: " + cantCache; ret "Cache size: " + l(recognizeGlyph_cache) + ", hits: " + cacheHits2 + ", misses: " + cacheMisses2 + ", full search map: " + l(fullSearchMap); } S sizeStats() { ret l(glyphInfos) + "/" + l(fullSearchMap); } }