Electroneum
language_base.h
Go to the documentation of this file.
1 // Copyrights(c) 2017-2021, The Electroneum Project
2 // Copyrights(c) 2014-2019, The Monero Project
3 //
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without modification, are
7 // permitted provided that the following conditions are met:
8 //
9 // 1. Redistributions of source code must retain the above copyright notice, this list of
10 // conditions and the following disclaimer.
11 //
12 // 2. Redistributions in binary form must reproduce the above copyright notice, this list
13 // of conditions and the following disclaimer in the documentation and/or other
14 // materials provided with the distribution.
15 //
16 // 3. Neither the name of the copyright holder nor the names of its contributors may be
17 // used to endorse or promote products derived from this software without specific
18 // prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
21 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 // THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27 // STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
28 // THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
36 #ifndef LANGUAGE_BASE_H
37 #define LANGUAGE_BASE_H
38 
39 #include <vector>
40 #include <unordered_map>
41 #include <string>
42 #include <boost/algorithm/string.hpp>
43 #include "misc_log_ex.h"
44 #include "fnv1.h"
45 
50 namespace Language
51 {
/*!
 * \brief Returns the first (at most) count UTF-8 characters of s.
 *
 * A "character" here is one byte followed by every immediately following
 * continuation byte (10xxxxxx). The input is assumed to be well formed;
 * no validation is performed.
 *
 * \param s     UTF-8 encoded string; T must be constructible from "" and
 *              provide size(), data() and operator+=(char).
 * \param count Maximum number of characters to copy.
 * \return      The prefix; the whole of s when it holds fewer than count characters.
 */
template<typename T>
inline T utf8prefix(const T &s, size_t count)
{
  T result = "";
  const char *cursor = s.data();
  const char *const end = cursor + s.size();
  for (size_t taken = 0; taken < count && cursor != end; ++taken)
  {
    // First byte of the character is copied unconditionally.
    result += *cursor++;
    // Then every continuation byte that belongs to it.
    while (cursor != end && (*cursor & 0xc0) == 0x80)
      result += *cursor++;
  }
  return result;
}
76 
/*!
 * \brief Returns a canonical (lower-cased) copy of the UTF-8 string s.
 *
 * Every code point is decoded, passed through std::towlower (whose mapping
 * depends on the current C locale) and re-encoded as UTF-8.
 *
 * \param s UTF-8 encoded input; T must be constructible from "" and from
 *          (const char*, size_t) and provide size(), data() and operator+=.
 * \return  The lower-cased string.
 * \throw std::runtime_error ("Invalid UTF-8") when a lead byte is not a valid
 *        UTF-8 lead byte, a multi-byte sequence is truncated, or a
 *        continuation byte is not of the form 10xxxxxx.
 */
template<typename T>
inline T utf8canonical(const T &s)
{
  // Consumes one continuation byte and returns its six payload bits.
  // Rejecting anything that is not 10xxxxxx stops a lead byte from swallowing
  // the following (unrelated) character and decoding to a bogus code point.
  const auto next_continuation = [](const char *&p) -> wint_t
  {
    const unsigned char byte = static_cast<unsigned char>(*p++);
    if ((byte & 0xc0) != 0x80)
      throw std::runtime_error("Invalid UTF-8");
    return byte & 0x3f;
  };

  T sc = "";
  size_t avail = s.size();
  const char *ptr = s.data();
  char wbuf[8];
  while (avail--)
  {
    const unsigned char lead = static_cast<unsigned char>(*ptr++);
    wint_t cp = 0;
    int bytes = 1;
    if ((lead & 0x80) == 0)
    {
      // 0xxxxxxx: plain ASCII.
      cp = lead;
    }
    else if ((lead & 0xe0) == 0xc0)
    {
      // 110xxxxx 10xxxxxx
      if (avail < 1)
        throw std::runtime_error("Invalid UTF-8");
      cp = (lead & 0x1f) << 6;
      cp |= next_continuation(ptr);
      avail -= 1;
      bytes = 2;
    }
    else if ((lead & 0xf0) == 0xe0)
    {
      // 1110xxxx 10xxxxxx 10xxxxxx
      if (avail < 2)
        throw std::runtime_error("Invalid UTF-8");
      cp = (lead & 0xf) << 12;
      cp |= next_continuation(ptr) << 6;
      cp |= next_continuation(ptr);
      avail -= 2;
      bytes = 3;
    }
    else if ((lead & 0xf8) == 0xf0)
    {
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if (avail < 3)
        throw std::runtime_error("Invalid UTF-8");
      cp = (lead & 0x7) << 18;
      cp |= next_continuation(ptr) << 12;
      cp |= next_continuation(ptr) << 6;
      cp |= next_continuation(ptr);
      avail -= 3;
      bytes = 4;
    }
    else
      throw std::runtime_error("Invalid UTF-8");

    const wint_t lower = std::towlower(cp);
    if (lower != cp)
    {
      // The lower-case code point may need a different number of bytes than
      // the original (e.g. U+0130 -> U+0069, U+023A -> U+2C65). Re-derive the
      // length so the output is neither overlong nor truncated. When no case
      // mapping happened the input length is kept, so such characters are
      // reproduced byte for byte.
      bytes = lower < 0x80 ? 1 : lower < 0x800 ? 2 : lower < 0x10000 ? 3 : 4;
    }

    switch (bytes)
    {
      case 1:
        wbuf[0] = static_cast<char>(lower);
        break;
      case 2:
        wbuf[0] = static_cast<char>(0xc0 | (lower >> 6));
        wbuf[1] = static_cast<char>(0x80 | (lower & 0x3f));
        break;
      case 3:
        wbuf[0] = static_cast<char>(0xe0 | (lower >> 12));
        wbuf[1] = static_cast<char>(0x80 | ((lower >> 6) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | (lower & 0x3f));
        break;
      case 4:
        wbuf[0] = static_cast<char>(0xf0 | (lower >> 18));
        wbuf[1] = static_cast<char>(0x80 | ((lower >> 12) & 0x3f));
        wbuf[2] = static_cast<char>(0x80 | ((lower >> 6) & 0x3f));
        wbuf[3] = static_cast<char>(0x80 | (lower & 0x3f));
        break;
      default:
        throw std::runtime_error("Invalid UTF-8");
    }
    wbuf[bytes] = 0;
    sc += T(wbuf, bytes);
  }
  return sc;
}
143 
144  struct WordHash
145  {
146  std::size_t operator()(const epee::wipeable_string &s) const
147  {
148  const epee::wipeable_string sc = utf8canonical(s);
149  return epee::fnv::FNV1a(sc.data(), sc.size());
150  }
151  };
152 
153  struct WordEqual
154  {
155  bool operator()(const epee::wipeable_string &s0, const epee::wipeable_string &s1) const
156  {
157  const epee::wipeable_string s0c = utf8canonical(s0);
158  const epee::wipeable_string s1c = utf8canonical(s1);
159  return s0c == s1c;
160  }
161  };
162 
// Base class holding one mnemonic word list together with two lookup tables
// (full word -> index, trimmed prefix -> index). Both tables use WordHash and
// WordEqual, so lookups go through utf8canonical().
// NOTE(review): this listing omits several source lines of the class (see the
// notes below); nothing is asserted here about the omitted text.
168  class Base
169  {
170  protected:
// NOTE(review): listing lines 172-173 are missing; ALLOW_SHORT_WORDS and
// ALLOW_DUPLICATE_PREFIXES, used by populate_maps(), are presumably declared
// in this enum - confirm against the real header.
171  enum {
174  };
// Number of words every list must contain; enforced by populate_maps().
175  enum {
176  NWORDS = 1626
177  };
// The words, in index order.
178  std::vector<std::string> word_list;
// Full word -> index in word_list.
179  std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> word_map;
// Word prefix (at most unique_prefix_length characters) -> index in word_list.
180  std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual> trimmed_word_map;
// NOTE(review): listing lines 181-186 are missing; language_name,
// english_language_name and unique_prefix_length, used below, are presumably
// declared there.
//
// Fills word_map and trimmed_word_map from word_list.
// flags is a bit mask tested against ALLOW_SHORT_WORDS and
// ALLOW_DUPLICATE_PREFIXES; when the matching bit is set the condition is
// logged with MWARNING instead of throwing.
// Throws std::runtime_error when word_list does not hold NWORDS entries, when
// a word is too short, or when two words share the same trimmed prefix.
187  void populate_maps(uint32_t flags = 0)
188  {
189  int ii;
190  std::vector<std::string>::const_iterator it;
191  if (word_list.size () != NWORDS)
192  throw std::runtime_error("Wrong word list length for " + language_name);
193  for (it = word_list.begin(), ii = 0; it != word_list.end(); it++, ii++)
194  {
195  word_map[*it] = ii;
// size() is the std::string byte length, not the UTF-8 character count.
196  if ((*it).size() < unique_prefix_length)
197  {
198  if (flags & ALLOW_SHORT_WORDS)
199  MWARNING(language_name << " word '" << *it << "' is shorter than its prefix length, " << unique_prefix_length);
200  else
201  throw std::runtime_error("Too short word in " + language_name + " word list: " + *it);
202  }
203  epee::wipeable_string trimmed;
// Trim only when the byte length exceeds the prefix length; utf8prefix()
// itself counts UTF-8 characters.
204  if (it->length() > unique_prefix_length)
205  {
206  trimmed = utf8prefix(*it, unique_prefix_length);
207  }
208  else
209  {
210  trimmed = *it;
211  }
212  if (trimmed_word_map.find(trimmed) != trimmed_word_map.end())
213  {
214  if (flags & ALLOW_DUPLICATE_PREFIXES)
215  MWARNING("Duplicate prefix in " << language_name << " word list: " << std::string(trimmed.data(), trimmed.size()));
216  else
217  throw std::runtime_error("Duplicate prefix in " + language_name + " word list: " + std::string(trimmed.data(), trimmed.size()));
218  }
// When duplicates are allowed, the later word's index replaces the earlier one.
219  trimmed_word_map[trimmed] = ii;
220  }
221  }
222  public:
// Copies words into word_list and stores prefix_length. The maps are not
// populated here.
// NOTE(review): listing lines 226-227 are missing; they presumably initialise
// language_name and english_language_name from the parameters.
223  Base(const char *language_name, const char *english_language_name, const std::vector<std::string> &words, uint32_t prefix_length):
224  word_list(words),
225  unique_prefix_length(prefix_length),
228  {
229  }
230  virtual ~Base()
231  {
232  }
// Replaces word_list with the first NWORDS entries of words.
// The array must hold at least NWORDS C strings; no bounds check is made.
233  void set_words(const char * const words[])
234  {
235  word_list.resize(NWORDS);
236  for (size_t i = 0; i < NWORDS; ++i)
237  word_list[i] = words[i];
238  }
// Returns a const reference to the word list.
243  const std::vector<std::string>& get_word_list() const
244  {
245  return word_list;
246  }
// Returns a const reference to the full-word lookup table.
251  const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_word_map() const
252  {
253  return word_map;
254  }
// Returns a const reference to the trimmed-prefix lookup table.
259  const std::unordered_map<epee::wipeable_string, uint32_t, WordHash, WordEqual>& get_trimmed_word_map() const
260  {
261  return trimmed_word_map;
262  }
// NOTE(review): the signature line (listing line 267) is missing; per the
// index below this is presumably
// `const std::string& get_language_name() const`.
268  {
269  return language_name;
270  }
// NOTE(review): signature line 275 is missing; presumably
// `const std::string& get_english_language_name() const`.
276  {
277  return english_language_name;
278  }
// NOTE(review): signature line 283 is missing; presumably
// `uint32_t get_unique_prefix_length() const`.
284  {
285  return unique_prefix_length;
286  }
287  };
288 }
289 
290 #endif
const std::vector< std::string > & get_word_list() const
Returns a const reference to the word list.
const uint32_t T[512]
const std::string & get_language_name() const
Returns the name of the language.
size_t size() const noexcept
T utf8canonical(const T &s)
Definition: language_base.h:78
::std::string string
Definition: gtest-port.h:1097
Mnemonic language related namespace.
Base(const char *language_name, const char *english_language_name, const std::vector< std::string > &words, uint32_t prefix_length)
virtual ~Base()
std::vector< std::string > word_list
A base language class which all languages have to inherit from for Polymorphism.
const std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > & get_trimmed_word_map() const
Returns a const reference to the trimmed word map.
std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > word_map
bool operator()(const epee::wipeable_string &s0, const epee::wipeable_string &s1) const
std::string language_name
mdb_size_t count(MDB_cursor *cur)
const std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > & get_word_map() const
Returns a const reference to the word map.
std::unordered_map< epee::wipeable_string, uint32_t, WordHash, WordEqual > trimmed_word_map
unsigned int uint32_t
Definition: stdint.h:126
uint64_t FNV1a(const char *ptr, size_t sz)
Definition: fnv1.h:36
std::size_t operator()(const epee::wipeable_string &s) const
std::string english_language_name
#define MWARNING(x)
Definition: misc_log_ex.h:74
const std::string & get_english_language_name() const
Returns the name of the language in English.
const char * data() const noexcept
void populate_maps(uint32_t flags=0)
Populates the word maps after the list is ready.
uint32_t get_unique_prefix_length() const
Returns the number of unique starting characters to be used for matching.
uint32_t unique_prefix_length
T utf8prefix(const T &s, size_t count)
Returns a string made of (at most) the first count characters in s. Assumes well formedness. No check is made for this.
Definition: language_base.h:60
void set_words(const char *const words[])