Libraryless. Click here for Pure Java version (6241L/41K).
sclass SimpleRecognizer {
// useCache1 switches the word-level cache (recognizeGrouped_cache) on/off,
// useCache2 the glyph-level full-search cache (recognizeGlyph_cache)
bool useCache1 = true, useCache2 = true;
// acquired by load()
Lock lock = lock();
transient IF1<BWImage> wordImagePreprocessor; // optional preprocessor for word images (e.g. auto-contrast); applied in recognizeGrouped
// What is known about one glyph image (glyphs are keyed by MD5 in glyphInfos).
class GlyphInfo {
  // meaning last stored by saveMeaning; stays null until one is saved
  S meaning;

  // set once two different meanings have been seen for this glyph
  bool multi;

  // A glyph prints as its meaning (which may be null).
  public S toString() { ret meaning; }
}
// key = md5 (the value md5OfBWImage yields for a glyph image, see recognizeGlyph);
// entries are created on demand by getGlyphInfo
Map<S, GlyphInfo> glyphInfos = synchroMap();
// optional for full similarity search - character image to MD5
// (not assigned anywhere in this class; while it is null, recognizeGlyph
// only does the exact MD5 lookup)
Map<BWImage, S> fullSearchMap;
// appended by recognizeGrouped_uncached for segments without a known meaning
S unknownCharacter = ocr_unknownCharacterPlaceholder(); // "\u2666" - diamond suit symbol; used for unknown characters
// empty no-argument constructor
*() {}
// Reads glyph meanings from a text description.
//
// The word-level cache is cleared first. Then every line delivered by
// toLinesFullTrim(info) is tried against two sentence patterns:
//   "the images * are the characters *"
//       - second wildcard goes through dropSpaces + eachCharAsString
//   "the images * are the grouped characters *"
//       - second wildcard goes through dropSpaces + ocr_parseGlyphs
// In both cases the first wildcard is split at spaces into MD5s, which are
// paired position by position with the meanings (see saveMeanings).
// A line matching neither pattern is printed with a "huh? " prefix,
// provided javaTokC still finds tokens in it.
void load(S info) {
  lock lock;
  recognizeGrouped_cache.clear();

  // must be called m: the $1/$2 shorthands below refer to it
  new Matches m;

  for (S line : toLinesFullTrim(info)) {
    if (find3("the images * are the characters *", line, m)) {
      saveMeanings(splitAtSpace($1), eachCharAsString(dropSpaces($2)));
      continue;
    }

    if (find3("the images * are the grouped characters *", line, m)) {
      saveMeanings(splitAtSpace($1), ocr_parseGlyphs(dropSpaces($2)));
      continue;
    }

    if (nempty(javaTokC(line)))
      print("huh? " + line);
  }
}
// Stores a meaning for the glyph with the given MD5 (creating its GlyphInfo
// if needed).
//
// A glyph already flagged as multi is left untouched. Otherwise the new
// meaning always replaces the stored one; if hasDifferent reports that it
// conflicts with the previous meaning, the glyph is additionally flagged as
// multi, which freezes it against later calls.
void saveMeaning(S md5, S meaning) {
  GlyphInfo glyph = getGlyphInfo(md5);
  if (glyph.multi) ret;

  if (hasDifferent(glyph.meaning, meaning))
    glyph.multi = true;
  glyph.meaning = meaning;
}
// Returns the GlyphInfo registered under md5, registering a fresh (empty)
// one first if there is none yet. Lookup and insertion happen while holding
// the monitor of glyphInfos.
GlyphInfo getGlyphInfo(S md5) {
  synchronized(glyphInfos) {
    GlyphInfo existing = glyphInfos.get(md5);
    if (existing != null) ret existing;

    GlyphInfo created = new GlyphInfo();
    glyphInfos.put(md5, created);
    ret created;
  }
}
// Pairs md5s.get(i) with characters.get(i) and stores each pair through
// saveMeaning. If the two lists differ in length, nothing is stored and
// "huh?" is printed instead.
void saveMeanings(L<S> md5s, L<S> characters) {
  int count = l(md5s);
  if (count != l(characters)) {
    print("huh?");
    ret;
  }

  for (int i = 0; i < count; i++)
    saveMeaning(md5s.get(i), characters.get(i));
}
// Recognizes the text in img: runs the grouped recognition and joins the
// resulting groups into a single string with ocr_joinGroups.
S recognize(BWImage img) {
  L<S> groups = recognizeGrouped(img);
  ret ocr_joinGroups(groups);
}
// Like recognize(img), but the joined text is wrapped together with the
// score of the grouped recognition result.
Scored<S> recognizeScored(BWImage img) {
  Scored<L<S>> grouped = recognizeGrouped(img, null);
  S text = ocr_joinGroups(grouped!);
  ret scored(text, grouped);
}
// Grouped recognition without the score and without clip output:
// unwraps the result of recognizeGrouped(img, null).
L<S> recognizeGrouped(BWImage img) {
  Scored<LS> result = recognizeGrouped(img, null);
  ret getVar(result);
}
// md5 -> recognition result
// (word-level cache used by recognizeGrouped(img, clips_out); the key is the
// md5 of the word image; emptied by load())
Map<S, Scored<LS>> recognizeGrouped_cache = synchroMap();
// statistics for the word-level cache: calls that bypassed it,
// lookups that found an entry, lookups that did not
int cantCache, cacheHits, cacheMisses;
// Grouped recognition of a word image, with optional word-level caching.
//
// img       - the word image
// clips_out - if non-null, receives one Rect per entry of the returned list
//             (filled by recognizeGrouped_uncached); requesting clips
//             disables caching for the call
// returns   - the recognized groups together with their score (never null)
//
// The cache (recognizeGrouped_cache, keyed by the md5 of the image) is used
// only when useCache1 is set and no clips are requested; every other call is
// counted in cantCache.
//
// wordImagePreprocessor is applied whenever no clips are requested,
// regardless of useCache1. Previously it ran only inside the cache branch,
// so switching the cache off also switched preprocessing off and changed
// what was being recognized.
Scored<LS> recognizeGrouped(BWImage img, L<Rect> clips_out) {
  // NOTE(review): with clips_out the image is still passed on unpreprocessed,
  // exactly as before - presumably so the returned rects refer to the
  // caller's image. Confirm whether preprocessing should apply there too.
  if (clips_out == null)
    img = callFOrKeep(wordImagePreprocessor, img);

  if (clips_out != null || !useCache1) {
    ++cantCache;
    ret recognizeGrouped_uncached(img, clips_out);
  }

  // cacheable path: clips_out is null from here on
  S md5 = md5OfBWImage(img);
  Scored<LS> cached = recognizeGrouped_cache.get(md5);
  if (cached != null) {
    ++cacheHits;
    ret cached;
  }
  ++cacheMisses;

  Scored<LS> result = recognizeGrouped_uncached(img, clips_out);
  recognizeGrouped_cache.put(md5, result);
  ret result;
}
// The actual word recognition (no caching).
//
// The image is cut into segments by horizontalAutoSplit2ThenAutoCrop; if
// there are none, an empty list scored 0.99 is returned. Segments are then
// consumed in list order. At each position:
//
//  1. Exact lookup: the union of segments i..j is tried for growing j
//     (recognizeGlyph without full search). The first union with a known
//     meaning wins; it contributes its meaning followed by one "_" per
//     additional segment it covers, so the output keeps one entry per
//     segment. clips_out (if given) gets the union rect once per covered
//     segment.
//  2. Fallback: if no union matched, segment i alone is recognized with full
//     search; its meaning - or unknownCharacter if there is none - is
//     appended, and clips_out gets the segment rect.
//
// The score of the result is averageScore over the per-glyph results.
// NOTE(review): recognizeGlyph can return null in step 2 (full search without
// any candidate); that null is added to the score list as is - confirm that
// averageScore tolerates null elements.
Scored<LS> recognizeGrouped_uncached(BWImage img, L<Rect> clips_out) {
  L<Rect> rects = horizontalAutoSplit2ThenAutoCrop(img);
  if (empty(rects)) ret scored((L<S>) emptyList(), 0.99);

  new LS buf;
  new L<Scored> scores;
  int count = l(rects);
  int i = 0;

  while (i < count) {
    // step 1: exact lookup over growing unions of segments i..j
    Rect union = null;
    bool matched = false;
    for (int j = i; j < count && !matched; j++) {
      union = rectUnion(union, rects.get(j));
      Scored<GlyphInfo> hit = recognizeGlyph(img.clip(union), false);
      GlyphInfo glyph = getVar(hit);
      if (glyph == null || glyph.meaning == null) continue;

      int extra = j-i; // segments covered beyond the first
      buf.add(glyph.meaning);
      buf.addAll(rep("_", extra));
      if (clips_out != null) clips_out.addAll(rep(union, extra+1));
      scores.add(hit);
      i = j+1; // resume after the last covered segment
      matched = true;
    }
    if (matched) continue;

    // step 2: single segment, full similarity search allowed
    Rect single = rects.get(i);
    Scored<GlyphInfo> guess = recognizeGlyph(img.clip(single), true);
    GlyphInfo glyph = getVar(guess);
    bool known = glyph != null && glyph.meaning != null;
    buf.add(known ? glyph.meaning : unknownCharacter);
    if (clips_out != null) clips_out.add(single);
    scores.add(guess);
    i++;
  }

  ret scored(buf, averageScore(scores));
}
// md5 -> recognition result
// (glyph-level cache for full similarity search results, used by
// recognizeGlyph when useCache2 is set; the key is the md5 of the glyph image)
Map<S, Scored<GlyphInfo>> recognizeGlyph_cache = synchroMap();
// hit/miss statistics for recognizeGlyph_cache; note these are static,
// i.e. shared by all instances, while the cache itself is per instance
static int cacheHits2, cacheMisses2;
// Convenience overload: recognizes a single glyph image with full
// similarity search enabled.
Scored<GlyphInfo> recognizeGlyph(BWImage img) {
  bool fullSearch = true;
  ret recognizeGlyph(img, fullSearch);
}
// Recognizes a single glyph image.
//
// img        - the glyph image
// fullSearch - whether a similarity search over fullSearchMap may be used
//              when the exact lookup fails
// returns    - the exact-lookup outcome wrapped by fullScored (the wrapped
//              GlyphInfo is null if the MD5 is unknown) when the glyph is
//              known, full search is not requested, or fullSearchMap is
//              null; otherwise the most similar entry of fullSearchMap with
//              its similarity as score, or null if the search produced no
//              candidate
//
// Full-search results are cached per MD5 in recognizeGlyph_cache when
// useCache2 is set. Null results are not stored: a stored null reads back
// exactly like a missing entry, so it could never be served from the cache
// and only added a dead entry.
Scored<GlyphInfo> recognizeGlyph(BWImage img, bool fullSearch) {
  S md5 = md5OfBWImage(img);

  // exact match by MD5 - also the final answer when full search is off or unavailable
  GlyphInfo info = glyphInfos.get(md5);
  if (info != null || !fullSearch || fullSearchMap == null) ret fullScored(info);

  if (useCache2) {
    Scored<GlyphInfo> cached = recognizeGlyph_cache.get(md5);
    if (cached != null) { ++cacheHits2; ret cached; }
    cacheMisses2++;
  }

  // full search: compare against every reference image, keep the most similar one
  new Best<S> best;
  for (BWImage cImg : /*concurrentlyIterateKeys*/keys(fullSearchMap)) {
    // the best score so far is handed to the comparison
    // (presumably as an early-exit threshold - TODO confirm)
    float sim = bwImageSimilarityResized(img, cImg, (float) best.bestScore());
    best.put(fullSearchMap.get(cImg), sim);
  }

  // best holds the MD5 of the winning reference image; map it back to its GlyphInfo
  Scored<GlyphInfo> result = !best.has() ? null : scored(glyphInfos.get(best!), best.score());
  if (useCache2 && result != null)
    recognizeGlyph_cache.put(md5, result);
  ret result;
}
// One-line summary of the glyph-level cache: its size, the hit and miss
// counters, and the size of the full search map. The word-level counters
// (cacheHits, cacheMisses, cantCache) are not part of this report.
S cacheStats() {
  int cacheSize = l(recognizeGlyph_cache);
  ret "Cache size: " + cacheSize
    + ", hits: " + cacheHits2
    + ", misses: " + cacheMisses2
    + ", full search map: " + l(fullSearchMap);
}
// Compact size report: "<number of glyph infos>/<size of full search map>".
S sizeStats() {
  int known = l(glyphInfos);
  int searchable = l(fullSearchMap);
  ret known + "/" + searchable;
}
}
Began life as a copy of #1006103
download show line numbers debug dex old transpilations
Travelled to 14 computer(s): aoiabmzegqzx, bhatertpkbcr, cbybwowwnfue, cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, lpdgvwnxivlt, mqqgnosmbjvj, onxytkatvevr, pyentgdyhuwx, pzhvpgtvlbxg, tslmcundralx, tvejysmllsmz, vouqrxazstgt
No comments. add comment
| Snippet ID: | #1006108 |
| Snippet name: | SimpleRecognizer - recognizes a line of text |
| Eternal ID of this version: | #1006108/41 |
| Text MD5: | 1015f4609a0f4e17fa05ad438b036d60 |
| Transpilation MD5: | 1d2bcf46dbd08a869e56a96ee2f27137 |
| Author: | stefan |
| Category: | javax / ocr |
| Type: | JavaX fragment (include) |
| Public (visible to everyone): | Yes |
| Archived (hidden from active list): | No |
| Created/modified: | 2019-11-21 14:21:42 |
| Source code size: | 5918 bytes / 182 lines |
| Pitched / IR pitched: | No / No |
| Views / Downloads: | 1591 / 2444 |
| Version history: | 40 change(s) |
| Referenced in: | [show references] |