tesseract  3.05.01
stringrenderer.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: stringrenderer.cpp
3  * Description: Class for rendering UTF-8 text to an image, and retrieving
4  * bounding boxes around each grapheme cluster.
5  * Author: Ranjith Unnikrishnan
6  * Created: Mon Nov 18 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #include "stringrenderer.h"
22 
23 #include <stdio.h>
24 #include <string.h>
25 #include <algorithm>
26 #include <map>
27 #include <utility>
28 #include <vector>
29 
30 #include "allheaders.h" // from leptonica
31 #include "boxchar.h"
32 #include "ligature_table.h"
33 #include "normstrngs.h"
34 #include "pango/pango-font.h"
35 #include "pango/pango-glyph-item.h"
36 #include "tlog.h"
37 #include "unichar.h"
38 #include "unicode/uchar.h" // from libicu
39 #include "util.h"
40 
41 #ifdef USE_STD_NAMESPACE
42 using std::map;
43 using std::max;
44 using std::min;
45 using std::swap;
46 #endif
47 
48 namespace tesseract {
49 
50 static const int kDefaultOutputResolution = 300;
51 
52 // Word joiner (U+2060) inserted after letters in ngram mode, as per
53 // recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
54 // hyphens and other non-alpha characters.
55 static const char* kWordJoinerUTF8 = "\xE2\x81\xA0"; // u8"\u2060";
56 static const char32 kWordJoiner = 0x2060;
57 
58 static bool IsCombiner(int ch) {
59  const int char_type = u_charType(ch);
60  return ((char_type == U_NON_SPACING_MARK) ||
61  (char_type == U_ENCLOSING_MARK) ||
62  (char_type == U_COMBINING_SPACING_MARK));
63 }
64 
65 static string EncodeAsUTF8(const char32 ch32) {
66  UNICHAR uni_ch(ch32);
67  return string(uni_ch.utf8(), uni_ch.utf8_len());
68 }
69 
70 // Returns true with probability 'prob'.
71 static bool RandBool(const double prob, TRand* rand) {
72  if (prob == 1.0) return true;
73  if (prob == 0.0) return false;
74  return rand->UnsignedRand(1.0) < prob;
75 }
76 
77 /* static */
78 Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
79  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
80  printf("Unexpected surface format %d\n",
81  cairo_image_surface_get_format(surface));
82  return NULL;
83  }
84  const int width = cairo_image_surface_get_width(surface);
85  const int height = cairo_image_surface_get_height(surface);
86  Pix* pix = pixCreate(width, height, 32);
87  int byte_stride = cairo_image_surface_get_stride(surface);
88 
89  for (int i = 0; i < height; ++i) {
90  memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
91  cairo_image_surface_get_data(surface) + i * byte_stride,
92  byte_stride - ((i == height - 1) ? 1 : 0));
93  }
94  return pix;
95 }
96 
97 StringRenderer::StringRenderer(const string& font_desc, int page_width,
98  int page_height)
99  : page_width_(page_width),
100  page_height_(page_height),
101  h_margin_(50),
102  v_margin_(50),
103  char_spacing_(0),
104  leading_(0),
105  vertical_text_(false),
106  gravity_hint_strong_(false),
107  render_fullwidth_latin_(false),
108  underline_start_prob_(0),
109  underline_continuation_prob_(0),
110  underline_style_(PANGO_UNDERLINE_SINGLE),
111  features_(NULL),
112  drop_uncovered_chars_(true),
113  strip_unrenderable_words_(false),
114  add_ligatures_(false),
115  output_word_boxes_(false),
116  surface_(NULL),
117  cr_(NULL),
118  layout_(NULL),
119  start_box_(0),
120  page_(0),
121  box_padding_(0),
122  total_chars_(0),
123  font_index_(0),
124  last_offset_(0) {
125  pen_color_[0] = 0.0;
126  pen_color_[1] = 0.0;
127  pen_color_[2] = 0.0;
128  set_font(font_desc);
129  set_resolution(kDefaultOutputResolution);
130  page_boxes_ = NULL;
131 }
132 
133 bool StringRenderer::set_font(const string& desc) {
134  bool success = font_.ParseFontDescriptionName(desc);
136  return success;
137 }
138 
139 void StringRenderer::set_resolution(const int resolution) {
140  resolution_ = resolution;
141  font_.set_resolution(resolution);
142 }
143 
145  underline_start_prob_ = min(max(frac, 0.0), 1.0);
146 }
147 
149  underline_continuation_prob_ = min(max(frac, 0.0), 1.0);
150 }
151 
153  free(features_);
154  ClearBoxes();
155  FreePangoCairo();
156 }
157 
159  FreePangoCairo();
160  surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
161  page_height_);
162  cr_ = cairo_create(surface_);
163  {
165  layout_ = pango_cairo_create_layout(cr_);
166  }
167 
168  if (vertical_text_) {
169  PangoContext* context = pango_layout_get_context(layout_);
170  pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
171  if (gravity_hint_strong_) {
172  pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
173  }
174  pango_layout_context_changed(layout_);
175  }
176 
178 }
179 
181  string font_desc = font_.DescriptionName();
182  // Specify the font via a description name
183  PangoFontDescription *desc =
184  pango_font_description_from_string(font_desc.c_str());
185  // Assign the font description to the layout
186  pango_layout_set_font_description(layout_, desc);
187  pango_font_description_free(desc); // free the description
188  pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
189  resolution_);
190 
191  int max_width = page_width_ - 2 * h_margin_;
192  int max_height = page_height_ - 2 * v_margin_;
193  tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
194  if (vertical_text_) {
195  swap(max_width, max_height);
196  }
197  pango_layout_set_width(layout_, max_width * PANGO_SCALE);
198  pango_layout_set_wrap(layout_, PANGO_WRAP_WORD);
199 
200  // Adjust character spacing
201  PangoAttrList* attr_list = pango_attr_list_new();
202  if (char_spacing_) {
203  PangoAttribute* spacing_attr = pango_attr_letter_spacing_new(
204  static_cast<int>(char_spacing_ * PANGO_SCALE + 0.5));
205  spacing_attr->start_index = 0;
206  spacing_attr->end_index = static_cast<guint>(-1);
207  pango_attr_list_change(attr_list, spacing_attr);
208  }
209 #if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38)
210  if (add_ligatures_) {
211  set_features("liga, clig, dlig, hlig");
212  PangoAttribute* feature_attr = pango_attr_font_features_new(features_);
213  pango_attr_list_change(attr_list, feature_attr);
214  }
215 #endif
216  pango_layout_set_attributes(layout_, attr_list);
217  pango_attr_list_unref(attr_list);
218  // Adjust line spacing
219  if (leading_) {
220  pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
221  }
222 }
223 
225  if (layout_) {
226  g_object_unref(layout_);
227  layout_ = NULL;
228  }
229  if (cr_) {
230  cairo_destroy(cr_);
231  cr_ = NULL;
232  }
233  if (surface_) {
234  cairo_surface_destroy(surface_);
235  surface_ = NULL;
236  }
237 }
238 
239 void StringRenderer::SetWordUnderlineAttributes(const string& page_text) {
240  if (underline_start_prob_ == 0) return;
241  PangoAttrList* attr_list = pango_layout_get_attributes(layout_);
242 
243  const char* text = page_text.c_str();
244  int offset = 0;
245  TRand rand;
246  bool started_underline = false;
247  PangoAttribute* und_attr = NULL;
248 
249  while (offset < page_text.length()) {
250  offset += SpanUTF8Whitespace(text + offset);
251  if (offset == page_text.length()) break;
252 
253  int word_start = offset;
254  int word_len = SpanUTF8NotWhitespace(text + offset);
255  offset += word_len;
256  if (started_underline) {
257  // Should we continue the underline to the next word?
258  if (RandBool(underline_continuation_prob_, &rand)) {
259  // Continue the current underline to this word.
260  und_attr->end_index = word_start + word_len;
261  } else {
262  // Otherwise end the current underline attribute at the end of the
263  // previous word.
264  pango_attr_list_insert(attr_list, und_attr);
265  started_underline = false;
266  und_attr = NULL;
267  }
268  }
269  if (!started_underline && RandBool(underline_start_prob_, &rand)) {
270  // Start a new underline attribute
271  und_attr = pango_attr_underline_new(underline_style_);
272  und_attr->start_index = word_start;
273  und_attr->end_index = word_start + word_len;
274  started_underline = true;
275  }
276  }
277  // Finish the current underline attribute at the end of the page.
278  if (started_underline) {
279  und_attr->end_index = page_text.length();
280  pango_attr_list_insert(attr_list, und_attr);
281  }
282 }
283 
284 // Returns offset in utf8 bytes to first page.
286  int text_length) {
287  if (!text_length) return 0;
288  const int max_height = (page_height_ - 2 * v_margin_);
289  const int max_width = (page_width_ - 2 * h_margin_);
290  const int max_layout_height = vertical_text_ ? max_width : max_height;
291 
292  UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
293  const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
294  const int kMaxUnicodeBufLength = 15000;
295  for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
296  int buf_length = it.utf8_data() - text;
297  tlog(1, "len = %d buf_len = %d\n", text_length, buf_length);
298  pango_layout_set_text(layout_, text, buf_length);
299 
300  PangoLayoutIter* line_iter = NULL;
301  { // Fontconfig caches some info here that is not freed before exit.
303  line_iter = pango_layout_get_iter(layout_);
304  }
305  bool first_page = true;
306  int page_top = 0;
307  int offset = buf_length;
308  do {
309  // Get bounding box of the current line
310  PangoRectangle line_ink_rect;
311  pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL);
312  pango_extents_to_pixels(&line_ink_rect, NULL);
313  PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
314  if (first_page) {
315  page_top = line_ink_rect.y;
316  first_page = false;
317  }
318  int line_bottom = line_ink_rect.y + line_ink_rect.height;
319  if (line_bottom - page_top > max_layout_height) {
320  offset = line->start_index;
321  tlog(1, "Found offset = %d\n", offset);
322  break;
323  }
324  } while (pango_layout_iter_next_line(line_iter));
325  pango_layout_iter_free(line_iter);
326  return offset;
327 }
328 
329 const vector<BoxChar*>& StringRenderer::GetBoxes() const {
330  return boxchars_;
331 }
332 
334  return page_boxes_;
335 }
336 
337 void StringRenderer::RotatePageBoxes(float rotation) {
338  BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2,
339  start_box_, boxchars_.size(), &boxchars_);
340 }
341 
342 
344  for (int i = 0; i < boxchars_.size(); ++i)
345  delete boxchars_[i];
346  boxchars_.clear();
347  boxaDestroy(&page_boxes_);
348 }
349 
353 }
354 
358 }
359 
360 // Returns cluster strings in logical order.
361 bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) {
362  map<int, string> start_byte_to_text;
363  PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
364  const char* full_text = pango_layout_get_text(layout_);
365  do {
366  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
367  if (!run) {
368  // End of line NULL run marker
369  tlog(2, "Found end of line marker\n");
370  continue;
371  }
372  PangoGlyphItemIter cluster_iter;
373  gboolean have_cluster;
374  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
375  run, full_text);
376  have_cluster;
377  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
378  const int start_byte_index = cluster_iter.start_index;
379  const int end_byte_index = cluster_iter.end_index;
380  string text = string(full_text + start_byte_index,
381  end_byte_index - start_byte_index);
382  if (IsUTF8Whitespace(text.c_str())) {
383  tlog(2, "Found whitespace\n");
384  text = " ";
385  }
386  tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
387  end_byte_index, text.c_str());
388  if (add_ligatures_) {
389  // Make sure the output box files have ligatured text in case the font
390  // decided to use an unmapped glyph.
391  text = LigatureTable::Get()->AddLigatures(text, NULL);
392  }
393  start_byte_to_text[start_byte_index] = text;
394  }
395  } while (pango_layout_iter_next_run(run_iter));
396  pango_layout_iter_free(run_iter);
397 
398  cluster_text->clear();
399  for (map<int, string>::const_iterator it = start_byte_to_text.begin();
400  it != start_byte_to_text.end(); ++it) {
401  cluster_text->push_back(it->second);
402  }
403  return !cluster_text->empty();
404 }
405 
406 // Merges an array of BoxChars into words based on the identification of
407 // BoxChars containing the space character as inter-word separators.
408 //
409 // Sometime two adjacent characters in the sequence may be detected as lying on
410 // different lines based on their spatial positions. This may be the result of a
411 // newline character at end of the last word on a line in the source text, or of
412 // a discretionary line-break created by Pango at intra-word locations like
413 // hyphens. When this is detected the word is split at that location into
414 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
415 // its bounding box.
416 static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) {
417  vector<BoxChar*> result;
418  bool started_word = false;
419  for (int i = 0; i < boxchars->size(); ++i) {
420  if (boxchars->at(i)->ch() == " " ||
421  boxchars->at(i)->box() == NULL) {
422  result.push_back(boxchars->at(i));
423  boxchars->at(i) = NULL;
424  started_word = false;
425  continue;
426  }
427 
428  if (!started_word) {
429  // Begin new word
430  started_word = true;
431  result.push_back(boxchars->at(i));
432  boxchars->at(i) = NULL;
433  } else {
434  BoxChar* last_boxchar = result.back();
435  // Compute bounding box union
436  const Box* box = boxchars->at(i)->box();
437  Box* last_box = last_boxchar->mutable_box();
438  int left = min(last_box->x, box->x);
439  int right = max(last_box->x + last_box->w, box->x + box->w);
440  int top = min(last_box->y, box->y);
441  int bottom = max(last_box->y + last_box->h, box->y + box->h);
442  // Conclude that the word was broken to span multiple lines based on the
443  // size of the merged bounding box in relation to those of the individual
444  // characters seen so far.
445  if (right - left > last_box->w + 5 * box->w) {
446  tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
447  // Insert a fake interword space and start a new word with the current
448  // boxchar.
449  result.push_back(new BoxChar(" ", 1));
450  result.push_back(boxchars->at(i));
451  boxchars->at(i) = NULL;
452  continue;
453  }
454  // Append to last word
455  last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
456  last_box->x = left;
457  last_box->w = right - left;
458  last_box->y = top;
459  last_box->h = bottom - top;
460  delete boxchars->at(i);
461  boxchars->at(i) = NULL;
462  }
463  }
464  boxchars->swap(result);
465 }
466 
467 
469  const char* text = pango_layout_get_text(layout_);
470  PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
471 
472  // Do a first pass to store cluster start indexes.
473  vector<int> cluster_start_indices;
474  do {
475  cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
476  tlog(3, "Added %d\n", cluster_start_indices.back());
477  } while (pango_layout_iter_next_cluster(cluster_iter));
478  pango_layout_iter_free(cluster_iter);
479  cluster_start_indices.push_back(strlen(text));
480  tlog(3, "Added last index %d\n", cluster_start_indices.back());
481  // Sort the indices and create a map from start to end indices.
482  sort(cluster_start_indices.begin(), cluster_start_indices.end());
483  map<int, int> cluster_start_to_end_index;
484  for (int i = 0; i < cluster_start_indices.size() - 1; ++i) {
485  cluster_start_to_end_index[cluster_start_indices[i]]
486  = cluster_start_indices[i + 1];
487  }
488 
489  // Iterate again to compute cluster boxes and their text with the obtained
490  // cluster extent information.
491  cluster_iter = pango_layout_get_iter(layout_);
492  // Store BoxChars* sorted by their byte start positions
493  map<int, BoxChar*> start_byte_to_box;
494  do {
495  PangoRectangle cluster_rect;
496  pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
497  NULL);
498  pango_extents_to_pixels(&cluster_rect, NULL);
499  const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
500  const int end_byte_index = cluster_start_to_end_index[start_byte_index];
501  string cluster_text = string(text + start_byte_index,
502  end_byte_index - start_byte_index);
503  if (!cluster_text.empty() && cluster_text[0] == '\n') {
504  tlog(2, "Skipping newlines at start of text.\n");
505  continue;
506  }
507  if (!cluster_rect.width || !cluster_rect.height ||
508  IsUTF8Whitespace(cluster_text.c_str())) {
509  tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
510  cluster_rect.width, cluster_rect.height, cluster_text.c_str());
511  BoxChar* boxchar = new BoxChar(" ", 1);
512  boxchar->set_page(page_);
513  start_byte_to_box[start_byte_index] = boxchar;
514  continue;
515  }
516  // Prepare a boxchar for addition at this byte position.
517  tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
518  cluster_rect.x, cluster_rect.y,
519  cluster_rect.width, cluster_rect.height,
520  start_byte_index, end_byte_index,
521  cluster_text.c_str());
522  ASSERT_HOST_MSG(cluster_rect.width,
523  "cluster_text:%s start_byte_index:%d\n",
524  cluster_text.c_str(), start_byte_index);
525  ASSERT_HOST_MSG(cluster_rect.height,
526  "cluster_text:%s start_byte_index:%d\n",
527  cluster_text.c_str(), start_byte_index);
528  if (box_padding_) {
529  cluster_rect.x = max(0, cluster_rect.x - box_padding_);
530  cluster_rect.width += 2 * box_padding_;
531  cluster_rect.y = max(0, cluster_rect.y - box_padding_);
532  cluster_rect.height += 2 * box_padding_;
533  }
534  if (add_ligatures_) {
535  // Make sure the output box files have ligatured text in case the font
536  // decided to use an unmapped glyph.
537  cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL);
538  }
539  BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
540  boxchar->set_page(page_);
541  boxchar->AddBox(cluster_rect.x, cluster_rect.y,
542  cluster_rect.width, cluster_rect.height);
543  start_byte_to_box[start_byte_index] = boxchar;
544  } while (pango_layout_iter_next_cluster(cluster_iter));
545  pango_layout_iter_free(cluster_iter);
546 
547  // There is a subtle bug in the cluster text reported by the PangoLayoutIter
548  // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
549  // around this, we use text reported using the PangoGlyphIter which is
550  // accurate.
551  // TODO(ranjith): Revisit whether this is still needed in newer versions of
552  // pango.
553  vector<string> cluster_text;
554  if (GetClusterStrings(&cluster_text)) {
555  ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
556  int ind = 0;
557  for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
558  it != start_byte_to_box.end(); ++it, ++ind) {
559  it->second->mutable_ch()->swap(cluster_text[ind]);
560  }
561  }
562 
563  // Append to the boxchars list in byte order.
564  vector<BoxChar*> page_boxchars;
565  page_boxchars.reserve(start_byte_to_box.size());
566  string last_ch;
567  for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
568  it != start_byte_to_box.end(); ++it) {
569  if (it->second->ch() == kWordJoinerUTF8) {
570  // Skip zero-width joiner characters (ZWJs) here.
571  delete it->second;
572  } else {
573  page_boxchars.push_back(it->second);
574  }
575  }
576  CorrectBoxPositionsToLayout(&page_boxchars);
577 
579  for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
580  it != start_byte_to_box.end(); ++it) {
581  // Convert fullwidth Latin characters to their halfwidth forms.
582  string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
583  it->second->mutable_ch()->swap(half);
584  }
585  }
586 
587  // Merge the character boxes into word boxes if we are rendering n-grams.
588  if (output_word_boxes_) {
589  MergeBoxCharsToWords(&page_boxchars);
590  }
591 
592  boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
593 
594  // Compute the page bounding box
595  Box* page_box = NULL;
596  Boxa* all_boxes = NULL;
597  for (int i = 0; i < page_boxchars.size(); ++i) {
598  if (page_boxchars[i]->box() == NULL) continue;
599  if (all_boxes == NULL)
600  all_boxes = boxaCreate(0);
601  boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
602  }
603  if (all_boxes != NULL) {
604  boxaGetExtent(all_boxes, NULL, NULL, &page_box);
605  boxaDestroy(&all_boxes);
606  if (page_boxes_ == NULL) page_boxes_ = boxaCreate(0);
607  boxaAddBox(page_boxes_, page_box, L_INSERT);
608  }
609 }
610 
611 
612 void StringRenderer::CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars) {
613  if (vertical_text_) {
614  const double rotation = - pango_gravity_to_rotation(
615  pango_context_get_base_gravity(pango_layout_get_context(layout_)));
618  0, boxchars->size(), boxchars);
619  } else {
621  }
622 }
623 
624 int StringRenderer::StripUnrenderableWords(string* utf8_text) const {
625  string output_text;
626  const char* text = utf8_text->c_str();
627  int offset = 0;
628  int num_dropped = 0;
629  while (offset < utf8_text->length()) {
630  int space_len = SpanUTF8Whitespace(text + offset);
631  output_text.append(text + offset, space_len);
632  offset += space_len;
633  if (offset == utf8_text->length()) break;
634 
635  int word_len = SpanUTF8NotWhitespace(text + offset);
636  if (font_.CanRenderString(text + offset, word_len)) {
637  output_text.append(text + offset, word_len);
638  } else {
639  ++num_dropped;
640  }
641  offset += word_len;
642  }
643  utf8_text->swap(output_text);
644 
645  if (num_dropped > 0) {
646  tprintf("Stripped %d unrenderable words\n", num_dropped);
647  }
648  return num_dropped;
649 }
650 
651 int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length,
652  Pix** pix) {
653  Pix *orig_pix = NULL;
654  int offset = RenderToImage(text, text_length, &orig_pix);
655  if (orig_pix) {
656  *pix = pixConvertTo8(orig_pix, false);
657  pixDestroy(&orig_pix);
658  }
659  return offset;
660 }
661 
662 int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
663  int threshold, Pix** pix) {
664  Pix *orig_pix = NULL;
665  int offset = RenderToImage(text, text_length, &orig_pix);
666  if (orig_pix) {
667  Pix* gray_pix = pixConvertTo8(orig_pix, false);
668  pixDestroy(&orig_pix);
669  *pix = pixThresholdToBinary(gray_pix, threshold);
670  pixDestroy(&gray_pix);
671  } else {
672  *pix = orig_pix;
673  }
674  return offset;
675 }
676 
677 // Add word joiner (WJ) characters between adjacent non-space characters except
678 // immediately before a combiner.
679 /* static */
680 string StringRenderer::InsertWordJoiners(const string& text) {
681  string out_str;
682  const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
683  text.length());
684  for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
685  it != it_end; ++it) {
686  // Add the symbol to the output string.
687  out_str.append(it.utf8_data(), it.utf8_len());
688  // Check the next symbol.
689  UNICHAR::const_iterator next_it = it;
690  ++next_it;
691  bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
692  bool next_char_is_combiner = (next_it == it_end) ?
693  false : IsCombiner(*next_it);
694  if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
695  !next_char_is_combiner) {
696  out_str += kWordJoinerUTF8;
697  }
698  }
699  return out_str;
700 }
701 
702 // Convert halfwidth Basic Latin characters to their fullwidth forms.
704  string full_str;
705  const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
706  str.length());
707  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
708  it != it_end; ++it) {
709  // Convert printable and non-space 7-bit ASCII characters to
710  // their fullwidth forms.
711  if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
712  // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
713  char32 full_char = *it + 0xFEE0;
714  full_str.append(EncodeAsUTF8(full_char));
715  } else {
716  full_str.append(it.utf8_data(), it.utf8_len());
717  }
718  }
719  return full_str;
720 }
721 
722 // Convert fullwidth Latin characters to their halfwidth forms.
724  string half_str;
725  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
726  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
727  it != it_end; ++it) {
728  char32 half_char = FullwidthToHalfwidth(*it);
729  // Convert fullwidth Latin characters to their halfwidth forms
730  // only if halfwidth forms are printable and non-space 7-bit ASCII.
731  if (IsInterchangeValid7BitAscii(half_char) &&
732  isprint(half_char) && !isspace(half_char)) {
733  half_str.append(EncodeAsUTF8(half_char));
734  } else {
735  half_str.append(it.utf8_data(), it.utf8_len());
736  }
737  }
738  return half_str;
739 }
740 
741 // Returns offset to end of text substring rendered in this method.
742 int StringRenderer::RenderToImage(const char* text, int text_length,
743  Pix** pix) {
744  if (pix && *pix) pixDestroy(pix);
745  InitPangoCairo();
746 
747  const int page_offset = FindFirstPageBreakOffset(text, text_length);
748  if (!page_offset) {
749  return 0;
750  }
751  start_box_ = boxchars_.size();
752 
753  if (!vertical_text_) {
754  // Translate by the specified margin
755  cairo_translate(cr_, h_margin_, v_margin_);
756  } else {
757  // Vertical text rendering is achieved by a two-step process of first
758  // performing regular horizontal layout with character orientation set to
759  // EAST, and then translating and rotating the layout before rendering onto
760  // the desired image surface. The settings required for the former step are
761  // done within InitPangoCairo().
762  //
763  // Translate to the top-right margin of page
764  cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
765  // Rotate the layout
766  double rotation = - pango_gravity_to_rotation(
767  pango_context_get_base_gravity(pango_layout_get_context(layout_)));
768  tlog(2, "Rotating by %f radians\n", rotation);
769  cairo_rotate(cr_, rotation);
770  pango_cairo_update_layout(cr_, layout_);
771  }
772  string page_text(text, page_offset);
774  // Convert Basic Latin to their fullwidth forms.
775  page_text = ConvertBasicLatinToFullwidthLatin(page_text);
776  }
778  StripUnrenderableWords(&page_text);
779  }
780  if (drop_uncovered_chars_ &&
781  !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
782  int num_dropped = font_.DropUncoveredChars(&page_text);
783  if (num_dropped) {
784  tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
785  }
786  }
787  if (add_ligatures_) {
788  // Add ligatures wherever possible, including custom ligatures.
789  page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
790  }
791  if (underline_start_prob_ > 0) {
792  SetWordUnderlineAttributes(page_text);
793  }
794 
795  pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
796 
797  if (pix) {
798  // Set a white background for the target image surface.
799  cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
800  // Fill the surface with the active colour (if you don't do this, you will
801  // be given a surface with a transparent background to draw on)
802  cairo_paint(cr_);
803  // Set the ink color to black
804  cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
805  // If the target surface or transformation properties of the cairo instance
806  // have changed, update the pango layout to reflect this
807  pango_cairo_update_layout(cr_, layout_);
808  {
809  DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
810  // Draw the pango layout onto the cairo surface
811  pango_cairo_show_layout(cr_, layout_);
812  }
814  }
816  FreePangoCairo();
817  // Update internal state variables.
818  ++page_;
819  return page_offset;
820 }
821 
822 // Render a string to an image, returning it as an 8 bit pix. Behaves as
823 // RenderString, except that it ignores the font set at construction and works
824 // through all the fonts, returning 0 until they are exhausted, at which point
825 // it returns the value it should have returned all along, but no pix this time.
826 // Fonts that don't contain a given proportion of the characters in the string
827 // get skipped.
828 // Fonts that work each get rendered and the font name gets added
829 // to the image.
830 // NOTE that no boxes are produced by this function.
831 //
832 // Example usage: To render a null terminated char-array "txt"
833 //
834 // int offset = 0;
835 // do {
836 // Pix *pix;
837 // offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
838 // strlen(txt + offset), NULL, &pix);
839 // ...
840 // } while (offset < strlen(text));
841 //
843  const char* text, int text_length,
844  string* font_used, Pix** image) {
845  *image = NULL;
846  // Select a suitable font to render the title with.
847  const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
848  string title_font;
849  if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
850  &title_font, NULL)) {
851  tprintf("WARNING: Could not find a font to render image title with!\n");
852  title_font = "Arial";
853  }
854  title_font += " 8";
855  tlog(1, "Selected title font: %s\n", title_font.c_str());
856  if (font_used) font_used->clear();
857 
858  string orig_font = font_.DescriptionName();
859  if (char_map_.empty()) {
860  total_chars_ = 0;
861  // Fill the hash table and use that for computing which fonts to use.
862  for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
863  it != UNICHAR::end(text, text_length); ++it) {
864  ++total_chars_;
865  ++char_map_[*it];
866  }
867  tprintf("Total chars = %d\n", total_chars_);
868  }
869  const vector<string>& all_fonts = FontUtils::ListAvailableFonts();
870  for (int i = font_index_; i < all_fonts.size(); ++i) {
871  ++font_index_;
872  int raw_score = 0;
873  int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score,
874  NULL);
875  if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
876  set_font(all_fonts[i]);
877  int offset = RenderToBinaryImage(text, text_length, 128, image);
878  ClearBoxes(); // Get rid of them as they are garbage.
879  const int kMaxTitleLength = 1024;
880  char title[kMaxTitleLength];
881  snprintf(title, kMaxTitleLength, kTitleTemplate,
882  all_fonts[i].c_str(), ok_chars,
883  100.0 * ok_chars / total_chars_, raw_score,
884  100.0 * raw_score / char_map_.size());
885  tprintf("%s\n", title);
886  // This is a good font! Store the offset to return once we've tried all
887  // the fonts.
888  if (offset) {
889  last_offset_ = offset;
890  if (font_used) *font_used = all_fonts[i];
891  }
892  // Add the font to the image.
893  set_font(title_font);
894  v_margin_ /= 8;
895  Pix* title_image = NULL;
896  RenderToBinaryImage(title, strlen(title), 128, &title_image);
897  pixOr(*image, *image, title_image);
898  pixDestroy(&title_image);
899 
900  v_margin_ *= 8;
901  set_font(orig_font);
902  // We return the real offset only after cycling through the list of fonts.
903  return 0;
904  } else {
905  tprintf("Font %s failed with %d hits = %.2f%%\n",
906  all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
907  }
908  }
909  font_index_ = 0;
910  char_map_.clear();
911  return last_offset_ == 0 ? -1 : last_offset_;
912 }
913 
914 } // namespace tesseract
static string ConvertBasicLatinToFullwidthLatin(const string &text)
static void TranslateBoxes(int xshift, int yshift, vector< BoxChar *> *boxes)
Definition: boxchar.cpp:52
bool CanRenderString(const char *utf8_word, int len, std::vector< string > *graphemes) const
bool ParseFontDescriptionName(const string &name)
TessHashMap< char32, inT64 > char_map_
const char * utf8_data() const
Definition: unichar.h:130
std::vector< BoxChar * > boxchars_
#define tlog(level,...)
Definition: tlog.h:33
int RenderToBinaryImage(const char *text, int text_length, int threshold, Pix **pix)
void set_underline_continuation_prob(const double frac)
void AddBox(int x, int y, int width, int height)
Definition: boxchar.cpp:47
int StripUnrenderableWords(string *utf8_text) const
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
void set_resolution(const int resolution)
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:182
int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:197
static int FontScore(const TessHashMap< char32, inT64 > &ch_map, const string &fontname, int *raw_score, std::vector< bool > *ch_flags)
StringRenderer(const string &font_desc, int page_width, int page_height)
int RenderToGrayscaleImage(const char *text, int text_length, Pix **pix)
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:232
int RenderAllFontsToImage(double min_coverage, const char *text, int text_length, string *font_used, Pix **pix)
string AddLigatures(const string &str, const PangoFontInfo *font) const
int DropUncoveredChars(string *utf8_text) const
static string GetTesseractBoxStr(int height, const vector< BoxChar *> &boxes)
Definition: boxchar.cpp:300
int RenderToImage(const char *text, int text_length, Pix **pix)
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
int FindFirstPageBreakOffset(const char *text, int text_length)
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, std::vector< string > *graphemes)
static string InsertWordJoiners(const string &text)
static string ConvertFullwidthLatinToBasicLatin(const string &text)
int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:186
static void PrepareToWrite(vector< BoxChar *> *boxes)
Definition: boxchar.cpp:66
const std::vector< BoxChar * > & GetBoxes() const
#define tprintf(...)
Definition: tprintf.h:31
cairo_surface_t * surface_
void RotatePageBoxes(float rotation)
static void WriteTesseractBoxFile(const string &name, int height, const vector< BoxChar *> &boxes)
Definition: boxchar.cpp:293
void CorrectBoxPositionsToLayout(std::vector< BoxChar *> *boxchars)
bool GetClusterStrings(std::vector< string > *cluster_text)
PangoUnderline underline_style_
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:239
void set_underline_start_prob(const double frac)
void set_features(const char *features)
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, vector< BoxChar *> *boxes)
Definition: boxchar.cpp:272
void WriteAllBoxes(const string &filename)
bool set_font(const string &desc)
void set_page(int page)
Definition: boxchar.h:55
Pix * CairoARGB32ToPixFormat(cairo_surface_t *surface)
void set_resolution(const int resolution)
static LigatureTable * Get()
#define ASSERT_HOST(x)
Definition: errcode.h:84
signed int char32
Definition: normstrngs.h:27
static const std::vector< string > & ListAvailableFonts()
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:63
void SetWordUnderlineAttributes(const string &page_text)