tesseract  3.05.01
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #if (defined __MINGW32__) || (defined __CYGWIN__)
26 // workaround for stdlib.h and putenv
27 #undef __STRICT_ANSI__
28 
29 #if (defined __MINGW32__)
30 #include "strcasestr.h"
31 #elif !defined(_GNU_SOURCE)
32 // needed for strcasestr in string.h
33 #define _GNU_SOURCE
34 #endif
35 
36 #elif defined(_MSC_VER)
37 #include "strcasestr.h"
38 #define strncasecmp _strnicmp
39 #define strcasecmp _stricmp
40 #endif
41 
42 #include <stdlib.h>
43 #include <stdio.h>
44 #include <string.h>
45 #ifndef _MSC_VER
46 #include <sys/param.h>
47 #endif
48 #include <algorithm>
49 
50 #include "pango_font_info.h"
51 #include "commandlineflags.h"
52 #include "fileio.h"
53 #include "normstrngs.h"
54 #include "tlog.h"
55 #include "unichar.h"
56 #include "util.h"
57 #include "pango/pango.h"
58 #include "pango/pangocairo.h"
59 #include "pango/pangofc-font.h"
60 
61 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
62  "Overrides fontconfig default temporary dir");
63 
64 #ifndef USE_STD_NAMESPACE
65 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
66 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
67  "Overrides --fonts_dir and sets the known universe of fonts to"
68  "the list in legacy_fonts.h");
69 
70 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
71  "Overrides system default font location");
72 #else
73 using std::pair;
74 STRING_PARAM_FLAG(fonts_dir, "",
75  "If empty it use system default. Otherwise it overrides"
76  " system default font location");
77 #endif
78 
79 namespace tesseract {
80 
81 // Default assumed output resolution. Required only for providing font metrics
82 // in pixels.
83 const int kDefaultResolution = 300;
84 
85 string PangoFontInfo::fonts_dir_;
86 string PangoFontInfo::cache_dir_;
87 
88 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
89  Clear();
90 }
91 
92 PangoFontInfo::PangoFontInfo(const string& desc)
93  : desc_(NULL), resolution_(kDefaultResolution) {
94  if (!ParseFontDescriptionName(desc)) {
95  tprintf("ERROR: Could not parse %s\n", desc.c_str());
96  Clear();
97  }
98 }
99 
100 void PangoFontInfo::Clear() {
101  font_size_ = 0;
102  is_bold_ = false;
103  is_italic_ = false;
104  is_smallcaps_ = false;
105  is_monospace_ = false;
106  family_name_.clear();
107  font_type_ = UNKNOWN;
108  if (desc_) {
109  pango_font_description_free(desc_);
110  desc_ = NULL;
111  }
112 }
113 
114 PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
115 
117  if (!desc_) return "";
118  char* desc_str = pango_font_description_to_string(desc_);
119  string desc_name(desc_str);
120  g_free(desc_str);
121  return desc_name;
122 }
123 
124 // If not already initialized, initializes FontConfig by setting its
125 // environment variable and creating a fonts.conf file that points to the
126 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
127 /* static */
129  if (fonts_dir_.empty()) {
130  HardInitFontConfig(FLAGS_fonts_dir.c_str(),
131  FLAGS_fontconfig_tmpdir.c_str());
132  }
133 }
134 
135 // Re-initializes font config, whether or not already initialized.
136 // If already initialized, any existing cache is deleted, just to be sure.
137 /* static */
138 void PangoFontInfo::HardInitFontConfig(const string& fonts_dir,
139  const string& cache_dir) {
140  if (!cache_dir_.empty()) {
142  File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
143  }
144  const int MAX_FONTCONF_FILESIZE = 1024;
145  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
146  cache_dir_ = cache_dir;
147  fonts_dir_ = fonts_dir;
148  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
149  "<?xml version=\"1.0\"?>\n"
150  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
151  "<fontconfig>\n"
152  "<dir>%s</dir>\n"
153  "<cachedir>%s</cachedir>\n"
154  "<config></config>\n"
155  "</fontconfig>",
156  fonts_dir.c_str(), cache_dir_.c_str());
157  string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
158  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
159 #ifdef _WIN32
160  std::string env("FONTCONFIG_PATH=");
161  env.append(cache_dir_.c_str());
162  putenv(env.c_str());
163  putenv("LANG=en_US.utf8");
164 #else
165  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
166  // Fix the locale so that the reported font names are consistent.
167  setenv("LANG", "en_US.utf8", true);
168 #endif // _WIN32
169 
170  if (FcInitReinitialize() != FcTrue) {
171  tprintf("FcInitiReinitialize failed!!\n");
172  }
174  // Clear Pango's font cache too.
175  pango_cairo_font_map_set_default(NULL);
176 }
177 
178 static void ListFontFamilies(PangoFontFamily*** families,
179  int* n_families) {
181  PangoFontMap* font_map = pango_cairo_font_map_get_default();
183  pango_font_map_list_families(font_map, families, n_families);
184 }
185 
186 // Inspects whether a given font family is monospace. If the font is not
187 // available, it cannot make a decision and returns false by default.
188 static bool IsMonospaceFontFamily(const char* family_name) {
189  PangoFontFamily** families = 0;
190  int n_families = 0;
191  bool is_monospace = false;
192  ListFontFamilies(&families, &n_families);
193  ASSERT_HOST(n_families > 0);
194  bool found = false;
195  for (int i = 0; i < n_families; ++i) {
196  if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
197  is_monospace = pango_font_family_is_monospace(families[i]);
198  found = true;
199  break;
200  }
201  }
202  if (!found) {
203  tlog(1, "Could not find monospace property of family %s\n", family_name);
204  }
205  g_free(families);
206  return is_monospace;
207 }
208 
209 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
210  Clear();
211  const char* family = pango_font_description_get_family(desc);
212  if (!family) {
213  char* desc_str = pango_font_description_to_string(desc);
214  tprintf("WARNING: Could not parse family name from description: '%s'\n",
215  desc_str);
216  g_free(desc_str);
217  return false;
218  }
219  family_name_ = string(family);
220  desc_ = pango_font_description_copy(desc);
221  is_monospace_ = IsMonospaceFontFamily(family);
222 
223  // Set font size in points
224  font_size_ = pango_font_description_get_size(desc);
225  if (!pango_font_description_get_size_is_absolute(desc)) {
226  font_size_ /= PANGO_SCALE;
227  }
228 
229  PangoStyle style = pango_font_description_get_style(desc);
230  is_italic_ = (PANGO_STYLE_ITALIC == style ||
231  PANGO_STYLE_OBLIQUE == style);
232  is_smallcaps_ = (pango_font_description_get_variant(desc)
233  == PANGO_VARIANT_SMALL_CAPS);
234 
235  is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
236  // We don't have a way to detect whether a font is of type Fraktur. The fonts
237  // we currently use all have "Fraktur" in their family name, so we do a
238  // fragile but functional check for that here.
239  is_fraktur_ = (strcasestr(family, "Fraktur") != NULL);
240  return true;
241 }
242 
243 bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
244  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
245  bool success = ParseFontDescription(desc);
246  pango_font_description_free(desc);
247  return success;
248 }
249 
250 // Returns the PangoFont structure corresponding to the closest available font
251 // in the font map. Note that if the font is wholly missing, this could
252 // correspond to a completely different font family and face.
253 PangoFont* PangoFontInfo::ToPangoFont() const {
255  PangoFontMap* font_map = pango_cairo_font_map_get_default();
256  PangoContext* context = pango_context_new();
257  pango_cairo_context_set_resolution(context, resolution_);
258  pango_context_set_font_map(context, font_map);
259  PangoFont* font = NULL;
260  {
262  font = pango_font_map_load_font(font_map, context, desc_);
263  }
264  g_object_unref(context);
265  return font;
266 }
267 
268 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
269  PangoFont* font = ToPangoFont();
270  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
271  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
272  it != UNICHAR::end(utf8_text, byte_length);
273  ++it) {
274  if (IsWhitespace(*it) || pango_is_zero_width(*it))
275  continue;
276  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
277  char tmp[5];
278  int len = it.get_utf8(tmp);
279  tmp[len] = '\0';
280  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
281  return false;
282  }
283  }
284  return true;
285 }
286 
// A variant of strncpy that permits src and dest to overlap. Bytes are copied
// front to back (the first byte first), so it is safe when dest <= src.
//
// Exactly n bytes of dest are written: bytes are copied from src until n have
// been copied or the next source byte is a nul, and any remaining bytes are
// zero-filled. When n is 0 nothing is read or written (the original do/while
// wrote one byte and then wrapped the unsigned counter). Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;
  if (n == 0) {
    return ret;
  }

  // Copy characters until n reaches zero or the next src byte is a nul.
  do {
    *dest = *src;
    --n;
    ++dest;
    ++src;
  } while (n && src[0]);

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n) {
    *dest = '\0';
    --n;
    ++dest;
  }
  return ret;
}
308 
309 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
310  PangoFont* font = ToPangoFont();
311  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
312  int num_dropped_chars = 0;
313  // Maintain two iterators that point into the string. For space efficiency, we
314  // will repeatedly copy one covered UTF8 character from one to the other, and
315  // at the end resize the string to the right length.
316  char* out = const_cast<char*>(utf8_text->c_str());
317  const UNICHAR::const_iterator it_begin =
318  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
319  const UNICHAR::const_iterator it_end =
320  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
321  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
322  // Skip bad utf-8.
323  if (!it.is_legal()) {
324  ++it; // One suitable error message will still be issued.
325  continue;
326  }
327  int unicode = *it;
328  int utf8_len = it.utf8_len();
329  const char* utf8_char = it.utf8_data();
330  // Move it forward before the data gets modified.
331  ++it;
332  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
333  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
334  if (TLOG_IS_ON(2)) {
335  UNICHAR unichar(unicode);
336  char* str = unichar.utf8_str();
337  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
338  delete[] str;
339  }
340  ++num_dropped_chars;
341  continue;
342  }
343  my_strnmove(out, utf8_char, utf8_len);
344  out += utf8_len;
345  }
346  utf8_text->resize(out - utf8_text->c_str());
347  return num_dropped_chars;
348 }
349 
350 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
351  int* x_bearing, int* x_advance) const {
352  // Convert to equivalent PangoFont structure
353  PangoFont* font = ToPangoFont();
354  // Find the glyph index in the font for the supplied utf8 character.
355  int total_advance = 0;
356  int min_bearing = 0;
357  // Handle multi-unicode strings by reporting the left-most position of the
358  // x-bearing, and right-most position of the x-advance if the string were to
359  // be rendered.
360  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
361  utf8_char.length());
362  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
363  utf8_char.length());
364  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
365  PangoGlyph glyph_index = pango_fc_font_get_glyph(
366  reinterpret_cast<PangoFcFont*>(font), *it);
367  if (!glyph_index) {
368  // Glyph for given unicode character doesn't exist in font.
369  return false;
370  }
371  // Find the ink glyph extents for the glyph
372  PangoRectangle ink_rect, logical_rect;
373  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
374  pango_extents_to_pixels(&ink_rect, NULL);
375  pango_extents_to_pixels(&logical_rect, NULL);
376 
377  int bearing = total_advance + PANGO_LBEARING(ink_rect);
378  if (it == it_begin || bearing < min_bearing) {
379  min_bearing = bearing;
380  }
381  total_advance += PANGO_RBEARING(logical_rect);
382  }
383  *x_bearing = min_bearing;
384  *x_advance = total_advance;
385  return true;
386 }
387 
388 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
389  vector<string> graphemes;
390  return CanRenderString(utf8_word, len, &graphemes);
391 }
392 
393 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
394  vector<string>* graphemes) const {
395  if (graphemes) graphemes->clear();
396  // We check for font coverage of the text first, as otherwise Pango could
397  // (undesirably) fall back to another font that does have the required
398  // coverage.
399  if (!CoversUTF8Text(utf8_word, len)) {
400  return false;
401  }
402  // U+25CC dotted circle character that often (but not always) gets rendered
403  // when there is an illegal grapheme sequence.
404  const char32 kDottedCircleGlyph = 9676;
405  bool bad_glyph = false;
406  PangoFontMap* font_map = pango_cairo_font_map_get_default();
407  PangoContext* context = pango_context_new();
408  pango_context_set_font_map(context, font_map);
409  PangoLayout* layout;
410  {
411  // Pango is not relasing the cached layout.
413  layout = pango_layout_new(context);
414  }
415  if (desc_) {
416  pango_layout_set_font_description(layout, desc_);
417  } else {
418  PangoFontDescription *desc = pango_font_description_from_string(
419  DescriptionName().c_str());
420  pango_layout_set_font_description(layout, desc);
421  pango_font_description_free(desc);
422  }
423  pango_layout_set_text(layout, utf8_word, len);
424  PangoLayoutIter* run_iter = NULL;
425  { // Fontconfig caches some information here that is not freed before exit.
427  run_iter = pango_layout_get_iter(layout);
428  }
429  do {
430  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
431  if (!run) {
432  tlog(2, "Found end of line NULL run marker\n");
433  continue;
434  }
435  PangoGlyph dotted_circle_glyph;
436  PangoFont* font = run->item->analysis.font;
437 
438 #ifdef _WIN32 // Fixme! Leaks memory and breaks unittests.
439  PangoGlyphString* glyphs = pango_glyph_string_new();
440  char s[] = "\xc2\xa7";
441  pango_shape(s, sizeof(s), &(run->item->analysis), glyphs);
442  dotted_circle_glyph = glyphs->glyphs[0].glyph;
443 #else
444  dotted_circle_glyph = pango_fc_font_get_glyph(
445  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
446 #endif
447 
448  if (TLOG_IS_ON(2)) {
449  PangoFontDescription* desc = pango_font_describe(font);
450  char* desc_str = pango_font_description_to_string(desc);
451  tlog(2, "Desc of font in run: %s\n", desc_str);
452  g_free(desc_str);
453  pango_font_description_free(desc);
454  }
455 
456  PangoGlyphItemIter cluster_iter;
457  gboolean have_cluster;
458  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
459  run, utf8_word);
460  have_cluster && !bad_glyph;
461  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
462  const int start_byte_index = cluster_iter.start_index;
463  const int end_byte_index = cluster_iter.end_index;
464  int start_glyph_index = cluster_iter.start_glyph;
465  int end_glyph_index = cluster_iter.end_glyph;
466  string cluster_text = string(utf8_word + start_byte_index,
467  end_byte_index - start_byte_index);
468  if (graphemes) graphemes->push_back(cluster_text);
469  if (IsUTF8Whitespace(cluster_text.c_str())) {
470  tlog(2, "Skipping whitespace\n");
471  continue;
472  }
473  if (TLOG_IS_ON(2)) {
474  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
475  start_byte_index, end_byte_index,
476  start_glyph_index, end_glyph_index);
477  }
478  for (int i = start_glyph_index,
479  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
480  !bad_glyph && i != end_glyph_index; i+= step) {
481  const bool unknown_glyph =
482  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
483  PANGO_GLYPH_UNKNOWN_FLAG);
484  const bool illegal_glyph =
485  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
486  dotted_circle_glyph);
487  bad_glyph = unknown_glyph || illegal_glyph;
488  if (TLOG_IS_ON(2)) {
489  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
490  bad_glyph ? 1 : 0);
491  }
492  }
493  if (TLOG_IS_ON(2)) {
494  printf(" '%s'\n", cluster_text.c_str());
495  }
496  if (bad_glyph)
497  tlog(1, "Found illegal glyph!\n");
498  }
499  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
500 
501  pango_layout_iter_free(run_iter);
502  g_object_unref(context);
503  g_object_unref(layout);
504  if (bad_glyph && graphemes) graphemes->clear();
505  return !bad_glyph;
506 }
507 
508 
509 // ------------------------ FontUtils ------------------------------------
510 vector<string> FontUtils::available_fonts_; // cache list
511 
512 // Returns whether the specified font description is available in the fonts
513 // directory.
514 //
515 // The generated list of font families and faces includes "synthesized" font
516 // faces that are not truly loadable. Pango versions >=1.18 have a
517 // pango_font_face_is_synthesized method that can be used to prune the list.
518 // Until then, we are restricted to using a hack where we try to load the font
519 // from the font_map, and then check what we loaded to see if it has the
520 // description we expected. If it is not, then the font is deemed unavailable.
521 /* static */
522 bool FontUtils::IsAvailableFont(const char* input_query_desc,
523  string* best_match) {
524  string query_desc(input_query_desc);
525 #if (PANGO_VERSION <= 12005)
526  // Strip commas and any ' Medium' substring in the name.
527  query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
528  query_desc.end());
529  const string kMediumStr = " Medium";
530  std::size_t found = query_desc.find(kMediumStr);
531  if (found != std::string::npos) {
532  query_desc.erase(found, kMediumStr.length());
533  }
534 #endif
535  PangoFontDescription *desc = pango_font_description_from_string(
536  query_desc.c_str());
537  PangoFont* selected_font = NULL;
538  {
540  PangoFontMap* font_map = pango_cairo_font_map_get_default();
541  PangoContext* context = pango_context_new();
542  pango_context_set_font_map(context, font_map);
543  {
545  selected_font = pango_font_map_load_font(font_map, context, desc);
546  }
547  g_object_unref(context);
548  }
549  if (selected_font == NULL) {
550  pango_font_description_free(desc);
551  return false;
552  }
553  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
554 
555  bool equal = pango_font_description_equal(desc, selected_desc);
556  tlog(3, "query weight = %d \t selected weight =%d\n",
557  pango_font_description_get_weight(desc),
558  pango_font_description_get_weight(selected_desc));
559 
560  char* selected_desc_str = pango_font_description_to_string(selected_desc);
561  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
562  selected_desc_str);
563  if (!equal && best_match != NULL) {
564  *best_match = selected_desc_str;
565  // Clip the ending ' 0' if there is one. It seems that, if there is no
566  // point size on the end of the fontname, then Pango always appends ' 0'.
567  int len = best_match->size();
568  if (len > 2 && best_match->at(len - 1) == '0' &&
569  best_match->at(len - 2) == ' ') {
570  *best_match = best_match->substr(0, len - 2);
571  }
572  }
573  g_free(selected_desc_str);
574  pango_font_description_free(selected_desc);
575  g_object_unref(selected_font);
576  pango_font_description_free(desc);
577  return equal;
578 }
579 
// Returns true if query is exactly (case-sensitively) one of the family names
// "Sans", "Serif" or "Monospace", which are excluded from font listings.
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kSkippedNames[] = { "Sans", "Serif", "Monospace" };
  const size_t kNumSkipped = sizeof(kSkippedNames) / sizeof(kSkippedNames[0]);
  for (size_t idx = 0; idx < kNumSkipped; ++idx) {
    if (strcmp(kSkippedNames[idx], query) == 0) {
      return true;
    }
  }
  return false;
}
590 
591 // Outputs description names of available fonts.
592 /* static */
593 const vector<string>& FontUtils::ListAvailableFonts() {
594  if (!available_fonts_.empty()) {
595  return available_fonts_;
596  }
597 #ifndef USE_STD_NAMESPACE
598  if (FLAGS_use_only_legacy_fonts) {
599  // Restrict view to list of fonts in legacy_fonts.h
600  tprintf("Using list of legacy fonts only\n");
601  const int kNumFontLists = 4;
602  for (int i = 0; i < kNumFontLists; ++i) {
603  for (int j = 0; kFontlists[i][j] != NULL; ++j) {
604  available_fonts_.push_back(kFontlists[i][j]);
605  }
606  }
607  return available_fonts_;
608  }
609 #endif
610 
611  PangoFontFamily** families = 0;
612  int n_families = 0;
613  ListFontFamilies(&families, &n_families);
614  for (int i = 0; i < n_families; ++i) {
615  const char* family_name = pango_font_family_get_name(families[i]);
616  tlog(2, "Listing family %s\n", family_name);
617  if (ShouldIgnoreFontFamilyName(family_name)) {
618  continue;
619  }
620 
621  int n_faces;
622  PangoFontFace** faces = NULL;
623  pango_font_family_list_faces(families[i], &faces, &n_faces);
624  for (int j = 0; j < n_faces; ++j) {
625  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
626  char* desc_str = pango_font_description_to_string(desc);
627  if (IsAvailableFont(desc_str)) {
628  available_fonts_.push_back(desc_str);
629  }
630  pango_font_description_free(desc);
631  g_free(desc_str);
632  }
633  g_free(faces);
634  }
635  g_free(families);
636  sort(available_fonts_.begin(), available_fonts_.end());
637  return available_fonts_;
638 }
639 
640 
641 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
642  vector<bool>* unichar_bitmap) {
643  const int kMinUnicodeValue = 33;
644  const int kMaxUnicodeValue = 0x10FFFF;
645  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
646  // Mark off characters that the font can render.
647  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
648  if (IsInterchangeValid(i)) {
649  (*unichar_bitmap)[i]
650  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
651  }
652  }
653 }
654 
655 /* static */
656 void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
657  const vector<string>& all_fonts = ListAvailableFonts();
658  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
659 }
660 
661 /* static */
662 void FontUtils::GetAllRenderableCharacters(const string& font_name,
663  vector<bool>* unichar_bitmap) {
664  PangoFontInfo font_info(font_name);
665  PangoCoverage* coverage = pango_font_get_coverage(
666  font_info.ToPangoFont(), NULL);
667  CharCoverageMapToBitmap(coverage, unichar_bitmap);
668 }
669 
670 /* static */
671 void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
672  vector<bool>* unichar_bitmap) {
673  // Form the union of coverage maps from the fonts
674  PangoCoverage* all_coverage = pango_coverage_new();
675  tlog(1, "Processing %d fonts\n", fonts.size());
676  for (int i = 0; i < fonts.size(); ++i) {
677  PangoFontInfo font_info(fonts[i]);
678  PangoCoverage* coverage = pango_font_get_coverage(
679  font_info.ToPangoFont(), NULL);
680  // Mark off characters that any font can render.
681  pango_coverage_max(all_coverage, coverage);
682  }
683  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
684  pango_coverage_unref(all_coverage);
685 }
686 
687 
688 // Utilities written to be backward compatible with StringRender
689 
690 /* static */
691 int FontUtils::FontScore(const TessHashMap<char32, inT64>& ch_map,
692  const string& fontname, int* raw_score,
693  vector<bool>* ch_flags) {
694  PangoFontInfo font_info;
695  if (!font_info.ParseFontDescriptionName(fontname)) {
696  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
697  }
698  PangoFont* font = font_info.ToPangoFont();
699  PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
700 
701  if (ch_flags) {
702  ch_flags->clear();
703  ch_flags->reserve(ch_map.size());
704  }
705  *raw_score = 0;
706  int ok_chars = 0;
707  for (TessHashMap<char32, inT64>::const_iterator it = ch_map.begin();
708  it != ch_map.end(); ++it) {
709  bool covered = (IsWhitespace(it->first) ||
710  (pango_coverage_get(coverage, it->first)
711  == PANGO_COVERAGE_EXACT));
712  if (covered) {
713  ++(*raw_score);
714  ok_chars += it->second;
715  }
716  if (ch_flags) {
717  ch_flags->push_back(covered);
718  }
719  }
720  return ok_chars;
721 }
722 
723 
724 /* static */
725 string FontUtils::BestFonts(const TessHashMap<char32, inT64>& ch_map,
726  vector<pair<const char*, vector<bool> > >* fonts) {
727  const double kMinOKFraction = 0.99;
728  // Weighted fraction of characters that must be renderable in a font to make
729  // it OK even if the raw count is not good.
730  const double kMinWeightedFraction = 0.99995;
731 
732  fonts->clear();
733  vector<vector<bool> > font_flags;
734  vector<int> font_scores;
735  vector<int> raw_scores;
736  int most_ok_chars = 0;
737  int best_raw_score = 0;
738  const vector<string>& font_names = FontUtils::ListAvailableFonts();
739  for (int i = 0; i < font_names.size(); ++i) {
740  vector<bool> ch_flags;
741  int raw_score = 0;
742  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
743  most_ok_chars = MAX(ok_chars, most_ok_chars);
744  best_raw_score = MAX(raw_score, best_raw_score);
745 
746  font_flags.push_back(ch_flags);
747  font_scores.push_back(ok_chars);
748  raw_scores.push_back(raw_score);
749  }
750 
751  // Now select the fonts with a score above a threshold fraction
752  // of both the raw and weighted best scores. To prevent bogus fonts being
753  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
754  // BOTH weighted and raw scores.
755  // In low character-count scripts, the issue is more getting enough fonts,
756  // when only 1 or 2 might have all those rare dingbats etc in them, so we
757  // allow a font with a very high weighted (coverage) score
758  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
759  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
760  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
761  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
762 
763  string font_list;
764  for (int i = 0; i < font_names.size(); ++i) {
765  int score = font_scores[i];
766  int raw_score = raw_scores[i];
767  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
768  score >= override_enough) {
769  fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
770  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
771  font_names[i].c_str(),
772  100.0 * score / most_ok_chars,
773  raw_score, 100.0 * raw_score / best_raw_score);
774  font_list += font_names[i];
775  font_list += "\n";
776  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
777  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
778  font_names[i].c_str(),
779  100.0 * score / most_ok_chars,
780  raw_score, 100.0 * raw_score / best_raw_score);
781  }
782  }
783  return font_list;
784 }
785 
786 /* static */
787 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
788  string* font_name, vector<string>* graphemes) {
789  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
790  graphemes);
791 }
792 
793 /* static */
794 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
795  const vector<string>& all_fonts,
796  string* font_name, vector<string>* graphemes) {
797  if (font_name) font_name->clear();
798  if (graphemes) graphemes->clear();
799  for (int i = 0; i < all_fonts.size(); ++i) {
800  PangoFontInfo font;
801  vector<string> found_graphemes;
802  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
803  "Could not parse font desc name %s\n",
804  all_fonts[i].c_str());
805  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
806  if (graphemes) graphemes->swap(found_graphemes);
807  if (font_name) *font_name = all_fonts[i];
808  return true;
809  }
810  }
811  return false;
812 }
813 
814 // PangoFontInfo is reinitialized, so clear the static list of fonts.
815 /* static */
816 void FontUtils::ReInit() { available_fonts_.clear(); }
817 
818 } // namespace tesseract
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:113
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
bool CanRenderString(const char *utf8_word, int len, std::vector< string > *graphemes) const
bool ParseFontDescriptionName(const string &name)
static string BestFonts(const TessHashMap< char32, inT64 > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
#define tlog(level,...)
Definition: tlog.h:33
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:182
#define TLOG_IS_ON(level)
Definition: tlog.h:39
static int FontScore(const TessHashMap< char32, inT64 > &ch_map, const string &fontname, int *raw_score, std::vector< bool > *ch_flags)
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
BOOL_PARAM_FLAG(use_only_legacy_fonts, false, "Overrides --fonts_dir and sets the known universe of fonts to" "the list in legacy_fonts.h")
int DropUncoveredChars(string *utf8_text) const
static void GetAllRenderableCharacters(std::vector< bool > *unichar_bitmap)
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
const int kDefaultResolution
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, std::vector< string > *graphemes)
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:208
#define MAX(x, y)
Definition: ndminx.h:24
#define tprintf(...)
Definition: tprintf.h:31
int utf8_len() const
Definition: unichar.cpp:186
static bool IsAvailableFont(const char *font_desc)
static string JoinPath(const string &prefix, const string &suffix)
Definition: fileio.cpp:83
static void HardInitFontConfig(const string &fonts_dir, const string &cache_dir)
char * utf8_str() const
Definition: unichar.cpp:125
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:53
signed int char32
Definition: normstrngs.h:27
static const std::vector< string > & ListAvailableFonts()
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:63
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir")