tesseract v5.3.3.20231005
dawg_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "include_gunit.h"
13
14#include "ratngs.h"
15#include "trie.h"
16#include "unicharset.h"
17
18#include <sys/stat.h>
19#include <cstdlib> // for system
20#include <fstream> // for ifstream
21#include <set>
22#include <string>
23#include <vector>
24
// Executable names of the two conversion tools driven by DawgTest.
// NOTE(review): builds that define SW_TESTING presumably provide these two
// macros themselves - confirm against that build configuration.
#if !defined(SW_TESTING)
#  define wordlist2dawg_prog "wordlist2dawg"
#  define dawg2wordlist_prog "dawg2wordlist"
#endif // !defined(SW_TESTING)
29
30namespace tesseract {
31
32// Test some basic functionality dealing with Dawgs (compressed dictionaries,
33// aka Directed Acyclic Word Graphs).
34class DawgTest : public testing::Test {
35protected:
36 void SetUp() override {
37 std::locale::global(std::locale(""));
39 }
40
41 void LoadWordlist(const std::string &filename, std::set<std::string> *words) const {
42 std::ifstream file(filename);
43 if (file.is_open()) {
44 std::string line;
45 while (getline(file, line)) {
46 // Remove trailing line terminators from line.
47 while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
48 line.resize(line.size() - 1);
49 }
50 // Add line to set.
51 words->insert(line.c_str());
52 }
53 file.close();
54 }
55 }
56 std::string TessBinaryPath(const std::string &name) const {
57 return file::JoinPath(TESSBIN_DIR, name);
58 }
59 std::string OutputNameToPath(const std::string &name) const {
60 return file::JoinPath(FLAGS_test_tmpdir, name);
61 }
62 int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2,
63 const std::string &arg3) const {
64 std::string cmdline = TessBinaryPath(program) + " " + arg1 + " " + arg2 + " " + arg3;
65 return system(cmdline.c_str());
66 }
67 // Test that we are able to convert a wordlist file (one "word" per line) to
68 // a dawg (a compressed format) and then extract the original wordlist back
69 // out using the tools "wordlist2dawg" and "dawg2wordlist."
70 void TestDawgRoundTrip(const std::string &unicharset_filename,
71 const std::string &wordlist_filename) const {
72 std::set<std::string> orig_words, roundtrip_words;
73 std::string unicharset = file::JoinPath(TESTING_DIR, unicharset_filename);
74 std::string orig_wordlist = file::JoinPath(TESTING_DIR, wordlist_filename);
75 std::string output_dawg = OutputNameToPath(wordlist_filename + ".dawg");
76 std::string output_wordlist = OutputNameToPath(wordlist_filename);
77 LoadWordlist(orig_wordlist, &orig_words);
78 EXPECT_EQ(RunCommand(wordlist2dawg_prog, orig_wordlist, output_dawg, unicharset), 0);
79 EXPECT_EQ(RunCommand(dawg2wordlist_prog, unicharset, output_dawg, output_wordlist), 0);
80 LoadWordlist(output_wordlist, &roundtrip_words);
81 EXPECT_EQ(orig_words, roundtrip_words);
82 }
83};
84
85TEST_F(DawgTest, TestDawgConversion) {
86 TestDawgRoundTrip("eng.unicharset", "eng.wordlist.clean.freq");
87}
88
89TEST_F(DawgTest, TestMatching) {
90 UNICHARSET unicharset;
91 unicharset.load_from_file(file::JoinPath(TESTING_DIR, "eng.unicharset").c_str());
92 tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "basic_dawg", NGRAM_PERM, unicharset.size(), 0);
93 WERD_CHOICE space_apos(" '", unicharset);
94 trie.add_word_to_dawg(space_apos);
95
96 WERD_CHOICE space(" ", unicharset);
97
98 // partial match ok - then good!
99 EXPECT_TRUE(trie.prefix_in_dawg(space, false));
100 // require complete match - not present.
101 EXPECT_FALSE(trie.word_in_dawg(space));
102 EXPECT_FALSE(trie.prefix_in_dawg(space, true));
103
104 // partial or complete match ok for full word:
105 EXPECT_TRUE(trie.prefix_in_dawg(space_apos, false));
106 EXPECT_TRUE(trie.word_in_dawg(space_apos));
107 EXPECT_TRUE(trie.prefix_in_dawg(space_apos, true));
108}
109
110} // namespace tesseract
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_FALSE(condition)
Definition: gtest.h:1986
#define dawg2wordlist_prog
Definition: dawg_test.cc:27
#define wordlist2dawg_prog
Definition: dawg_test.cc:26
@ DAWG_TYPE_WORD
Definition: dawg.h:66
@ NGRAM_PERM
Definition: ratngs.h:241
TEST_F(EuroText, FastLatinOCR)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:41
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159
int RunCommand(const std::string &program, const std::string &arg1, const std::string &arg2, const std::string &arg3) const
Definition: dawg_test.cc:62
void SetUp() override
Definition: dawg_test.cc:36
void LoadWordlist(const std::string &filename, std::set< std::string > *words) const
Definition: dawg_test.cc:41
std::string TessBinaryPath(const std::string &name) const
Definition: dawg_test.cc:56
void TestDawgRoundTrip(const std::string &unicharset_filename, const std::string &wordlist_filename) const
Definition: dawg_test.cc:70
std::string OutputNameToPath(const std::string &name) const
Definition: dawg_test.cc:59
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65